In [None]:
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
# Silence DEBUG/INFO log noise (e.g. chardet's debug messages) by raising the
# threshold of the root logger, which affects every library that logs through it.
import logging
root_logger = logging.getLogger()
root_logger.setLevel(logging.WARNING)

In [None]:
class FilmScraper:
    """Scrape one page of Letterboxd's per-language film listing.

    The listing is requested from the ``films/ajax/language/...`` endpoint,
    ordered by earliest release date.

    Parameters
    ----------
    country : str
        Value inserted into the ``language/<...>`` URL segment
        (e.g. ``'japanese'``). The parameter name is kept for backward
        compatibility even though the URL treats it as a language.
    page : int
        1-based page number of the listing.
    """

    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 30

    def __init__(self, country, page):
        self.country = country
        self.page = page
        self.url = self.get_url()

    def get_url(self):
        """Return the listing URL for this scraper's language and page."""
        return 'https://letterboxd.com/films/ajax/language/{}/by/release-earliest/page/{}/'.format(self.country, self.page)

    def get_rating(self, item):
        """Return the ``data-average-rating`` attribute of a film entry.

        Parameters
        ----------
        item : bs4.element.Tag
            Normally the ``<li class="listitem poster-container">`` element
            itself; a container that wraps such an ``<li>`` is also accepted.

        Returns
        -------
        str or None
            The raw attribute value, or ``None`` when the entry carries no
            rating attribute.
        """
        # The rating lives on the <li> itself. Searching *inside* the <li> for
        # another <li> (as the previous version did) never matches, which made
        # every rating come back as None.
        rating = item.get('data-average-rating')
        if rating is None:
            # Backward compatibility: allow a wrapper element to be passed in.
            nested = item.find('li', class_='listitem poster-container')
            if nested is not None:
                # .get() instead of [...] so unrated films yield None, not KeyError.
                rating = nested.get('data-average-rating')
        return rating

    def scrape_films(self):
        """Download the page and extract film links with their ratings.

        Returns
        -------
        tuple[list[str], list[str | None]]
            ``(film_lst, rate_lst)``: two lists of equal length, where
            ``rate_lst[i]`` is the rating of ``film_lst[i]``.

        Raises
        ------
        requests.RequestException
            On connection errors, timeouts, or a non-2xx HTTP status.
        """
        response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly instead of silently returning empty lists for an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        film_lst, rate_lst = [], []
        for movie in soup.find_all('li', class_='listitem poster-container'):
            poster = movie.find('div', class_='linked-film-poster')
            link = poster.get('data-target-link') if poster is not None else None
            if link is None:
                # Skip malformed entries in both lists so they stay aligned.
                continue
            film_lst.append(link)
            rate_lst.append(self.get_rating(movie))
        return film_lst, rate_lst

# Get film list

Example of a single `<li>` film entry from the scraped HTML:
```html
<li class="listitem poster-container">
<div class="really-lazy-load poster film-poster film-poster-203520 no-poster linked-film-poster" data-cache-busting-key="00000000" data-film-id="203520" data-film-slug="/film/resurrection-of-a-corpse/" data-image-height="105" data-image-width="70" data-linked="linked" data-poster-url="/film/resurrection-of-a-corpse/image-150/" data-show-menu="true" data-target-link="/film/resurrection-of-a-corpse/" data-target-link-target=""> <img alt="Resurrection of a Corpse" class="image" height="105" src="https://s.ltrbxd.com/static/img/empty-poster-70.8112b435.png" width="70"/> <span class="frame"><span class="frame-title"></span></span> </div>
</li>
```

In [None]:
# Totals per language filter: japanese 34535, korean 11632, chinese 23499, cantonese 4707
# Accumulators for the Japanese-language scrape: film links and their ratings.
jp_film_lst = []
jp_rate_lst = []

In [None]:
# Estimated number of listing pages per language (japanese, korean, chinese, cantonese):
# total count divided by 72, presumably the number of films per page — TODO confirm.
34535/72, 11632/72, 23499/72, 4707/72

(479.65277777777777, 161.55555555555554, 326.375, 65.375)

In [None]:
# Scrape every Japanese-language listing page in a single pass so the cell is
# reproducible from a fresh kernel (previously only pages 401-500 were covered
# and earlier chunks depended on manually edited re-runs).
# 34535 films / 72 per page -> 480 pages; raise this if the site's total grows.
jp_film_lst, jp_rate_lst = [], []  # reset so re-running the cell does not duplicate rows
jp_last_page = 480
pbar = tqdm(range(1, jp_last_page + 1))
for page in pbar:
    pbar.set_description(f'Scraping page {page} of {jp_last_page} pages')
    scraper = FilmScraper('japanese', page)
    temp_film_lst, temp_rate_lst = scraper.scrape_films()
    jp_film_lst.extend(temp_film_lst)
    jp_rate_lst.extend(temp_rate_lst)

Scraping page 500 of 100 pages: 100%|██████████| 100/100 [02:41<00:00,  1.62s/it]


In [None]:
# The two lists must stay aligned index-by-index; fail here with a clear message
# rather than later when they are combined into a DataFrame.
assert len(jp_film_lst) == len(jp_rate_lst), 'Japanese film/rating lists are misaligned'
len(jp_film_lst), len(jp_rate_lst)

(34535, 34535)

In [None]:
# Scrape the Korean-language listing. The upper bound is deliberately generous
# (11632 films / 72 per page is about 162 pages).
kr_film_lst, kr_rate_lst = [], []
kr_last_page = 199
pbar = tqdm(range(1, kr_last_page + 1))
for page in pbar:
    # Label uses the real last page so the progress text matches the bar.
    pbar.set_description(f'Scraping page {page} of {kr_last_page} pages')
    scraper = FilmScraper('korean', page)
    temp_film_lst, temp_rate_lst = scraper.scrape_films()
    kr_film_lst.extend(temp_film_lst)
    kr_rate_lst.extend(temp_rate_lst)

Scraping page 199 of 200 pages: 100%|██████████| 199/199 [04:36<00:00,  1.39s/it]


In [None]:
# The two lists must stay aligned index-by-index; fail here with a clear message
# rather than later when they are combined into a DataFrame.
assert len(kr_film_lst) == len(kr_rate_lst), 'Korean film/rating lists are misaligned'
len(kr_film_lst), len(kr_rate_lst)

(11632, 11632)

In [None]:
# Scrape the Chinese-language listing. The upper bound leaves a small margin
# over the estimate (23499 films / 72 per page is about 327 pages).
cn_film_lst, cn_rate_lst = [], []
cn_last_page = 329
pbar = tqdm(range(1, cn_last_page + 1))
for page in pbar:
    # Label uses the real last page so the progress text matches the bar.
    pbar.set_description(f'Scraping page {page} of {cn_last_page} pages')
    scraper = FilmScraper('chinese', page)
    temp_film_lst, temp_rate_lst = scraper.scrape_films()
    cn_film_lst.extend(temp_film_lst)
    cn_rate_lst.extend(temp_rate_lst)

Scraping page 329 of 330 pages: 100%|██████████| 329/329 [07:43<00:00,  1.41s/it]


In [None]:
# The two lists must stay aligned index-by-index; fail here with a clear message
# rather than later when they are combined into a DataFrame.
assert len(cn_film_lst) == len(cn_rate_lst), 'Chinese film/rating lists are misaligned'
len(cn_film_lst), len(cn_rate_lst)

(23508, 23508)

In [None]:
# Peek at the most recently appended entry of the Chinese-language list.
cn_film_lst[-1]

'/film/film:1017118/'

In [None]:
# Cantonese results are appended to the Chinese lists so both end up in one dataset.
# NOTE: this cell is not idempotent. Re-running it without first re-running the
# Chinese scrape cell (which resets the lists) appends the Cantonese films again.
# The upper bound leaves a small margin (4707 films / 72 per page is about 66 pages).
cantonese_last_page = 69
pbar = tqdm(range(1, cantonese_last_page + 1))
for page in pbar:
    # Label uses the real last page so the progress text matches the bar.
    pbar.set_description(f'Scraping page {page} of {cantonese_last_page} pages')
    scraper = FilmScraper('cantonese', page)
    temp_film_lst, temp_rate_lst = scraper.scrape_films()
    cn_film_lst.extend(temp_film_lst)
    cn_rate_lst.extend(temp_rate_lst)
len(cn_film_lst), len(cn_rate_lst)

Scraping page 69 of 70 pages: 100%|██████████| 69/69 [01:24<00:00,  1.22s/it]


(28215, 28215)

In [None]:
# Combined list length after the Cantonese scrape (28215) minus the Chinese-only
# length before it (23508), i.e. the number of entries the Cantonese scrape added.
28215-23508

4707

In [None]:
# Pair each film link with its rating in one two-column frame per language group
# (the Chinese frame also contains the Cantonese entries appended earlier).
jp_film = pd.DataFrame(dict(film=jp_film_lst, rate=jp_rate_lst))
kr_film = pd.DataFrame(dict(film=kr_film_lst, rate=kr_rate_lst))
cn_film = pd.DataFrame(dict(film=cn_film_lst, rate=cn_rate_lst))
# Show (rows, columns) of each frame as a quick sanity check.
tuple(frame.shape for frame in (jp_film, kr_film, cn_film))

((34535, 2), (11632, 2), (28215, 2))

In [None]:
# Persist each frame next to the notebook so the scrape does not have to be repeated.
for frame, pickle_path in (
    (jp_film, 'jp_film.pkl'),
    (kr_film, 'kr_film.pkl'),
    (cn_film, 'cn_film.pkl'),
):
    frame.to_pickle(pickle_path)

In [None]:
# `film_div` only exists as a local variable inside FilmScraper.scrape_films, so
# rebuild it here; otherwise this cell raises NameError on a fresh kernel.
# NOTE(review): page 1 of the Japanese listing is used as the sample; the
# recorded output may have come from a different page — confirm.
sample_scraper = FilmScraper('japanese', 1)
sample_response = requests.get(sample_scraper.url, timeout=30)
sample_soup = BeautifulSoup(sample_response.content, 'html.parser')
film_div = sample_soup.find_all('li', class_='listitem poster-container')
# Display the raw HTML of the first film entry to see which attributes are available.
film_div[0]

<li class="listitem poster-container">
<div class="really-lazy-load poster film-poster film-poster-203520 no-poster linked-film-poster" data-cache-busting-key="00000000" data-film-id="203520" data-film-slug="/film/resurrection-of-a-corpse/" data-image-height="105" data-image-width="70" data-linked="linked" data-poster-url="/film/resurrection-of-a-corpse/image-150/" data-show-menu="true" data-target-link="/film/resurrection-of-a-corpse/" data-target-link-target=""> <img alt="Resurrection of a Corpse" class="image" height="105" src="https://s.ltrbxd.com/static/img/empty-poster-70.8112b435.png" width="70"/> <span class="frame"><span class="frame-title"></span></span> </div>
</li>