<a href="https://colab.research.google.com/github/sarmadchandio/WebScrapper/blob/main/letterboxd_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
# @title
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import numpy as np
import csv
from google.colab import drive
import os

# Base URL of the site; prepended to the links scraped from list pages
_domain = 'https://letterboxd.com/'

def _meta_content(soup, attrs):
    """
    Returns the 'content' attribute of the first <meta> tag matching attrs,
    or None when the tag or the attribute is missing
    """
    tag = soup.find('meta', attrs=attrs)
    return None if tag is None else tag.attrs.get('content')


def scrape_list(list_link):
    """
    Takes in a Letterboxd link and outputs a list of film title, release year,
    director, cast, personal rating, average rating and letterboxd url

    The first row of the result is the header row. Every page of the list is
    followed through its 'next' link. Values that cannot be found on a page
    are stored as np.nan.

    :param list_link: URL of the first page of the list
    :return: list of rows (header first), or None when no film grid was found
             and no film had been scraped yet
    :raises requests.HTTPError: when a list page does not answer with HTTP 200
    """
    # imported locally so the function does not rely on another cell's imports
    from urllib.parse import urljoin

    # seconds; without a timeout requests can wait forever on a stalled server
    request_timeout = 30

    film_rows = []
    film_rows.append(['Film_title', 'Release_year', 'Director', 'Cast', 'Personal_rating', 'Average_rating','Letterboxd URL'])

    while True:
        list_page = requests.get(list_link, timeout=request_timeout)

        # check to see page was downloaded correctly
        if list_page.status_code != 200:
            raise requests.HTTPError(
                f'Could not download {list_link} (HTTP status {list_page.status_code})',
                response=list_page,
            )

        soup = BeautifulSoup(list_page.content, 'html.parser')

        # grab the main film grid
        table = soup.find('ul', class_='poster-list')
        if table is None:
            # None is kept as the "nothing found" result, but rows scraped
            # from earlier pages are not thrown away
            return film_rows if len(film_rows) > 1 else None

        # iterate through films
        for film in tqdm(table.find_all('li')):

            poster = film.find('div')
            film_card = poster.get('data-target-link') if poster is not None else None
            if not film_card:
                # no poster or no link to a film page: not a film entry
                continue

            # finding the film name
            image = poster.find('img')
            film_name = image.get('alt', np.nan) if image is not None else np.nan

            # personal rating of the list owner, if there is one
            rating_tag = film.find('span', class_='rating')
            if rating_tag is None:
                rating = np.nan
            else:
                rating = transform_stars(rating_tag.get_text().strip())

            # urljoin avoids the double slash that plain concatenation
            # produces when both the domain and the link carry a '/'
            film_page = urljoin(_domain, film_card)
            filmget = requests.get(film_page, timeout=request_timeout)
            film_soup = BeautifulSoup(filmget.content, 'html.parser')

            # the year is taken from the last characters of the og:title text
            # (the four characters before the final one)
            og_title = _meta_content(film_soup, {'property': 'og:title'})
            release_year = og_title[-5:-1] if og_title else np.nan

            director = _meta_content(film_soup, {'name': 'twitter:data1'})
            if director is None:
                director = np.nan

            # try to find the cast, if not found insert a nan
            cast_tab = film_soup.find('div', attrs={'id': 'tab-cast'})
            if cast_tab is None:
                cast = np.nan
            else:
                cast = [link.contents[0] for link in cast_tab.find_all('a') if link.contents]
                # remove all the 'Show All...' tags if they are present
                cast = [name for name in cast if name != 'Show All…']

            # try to find average rating, if not insert a nan
            try:
                average_rating = float(_meta_content(film_soup, {'name': 'twitter:data2'})[:4])
            except (TypeError, ValueError):
                # tag missing (None is not sliceable) or text is not a number
                average_rating = np.nan

            film_rows.append([film_name, release_year, director, cast, rating, average_rating, film_page])

        # check if there is another page of ratings
        next_button = soup.find('a', class_='next')
        if next_button is None:
            break
        list_link = urljoin(_domain, next_button['href'])

    return film_rows

# Star strings mapped to their numeric value; built once instead of on
# every call
_STAR_RATINGS = {
    "★": 1,
    "★★": 2,
    "★★★": 3,
    "★★★★": 4,
    "★★★★★": 5,
    "½": 0.5,
    "★½": 1.5,
    "★★½": 2.5,
    "★★★½": 3.5,
    "★★★★½": 4.5
}


def transform_stars(starstring):
    """
    Transforms star rating into float value

    :param starstring: rating written with '★' and '½' characters
    :return: the numeric rating, or np.nan when the string is not a known
             rating (including values that cannot be looked up at all)
    """
    try:
        return _STAR_RATINGS.get(starstring, np.nan)
    except TypeError:
        # unhashable input (e.g. a list) cannot be a dictionary key
        return np.nan

class List:
    """
    Holds the name, the link and the scraped films of one list
    """

    def __init__(self, list_name, link):
        """
        Stores the name and link, then scrapes the list right away.

        :param list_name: Name of the list, used for the data file (if applicable)
        :param link: The link of the list
        """
        self.name, self.link = list_name, link

        # announce the (potentially slow) scrape before it starts
        print("\nScraping list data...\n")
        self.films = scrape_list(link)


def list_to_csv(film_rows, list_name):
    """
    Takes in a list of film rows outputted by scrape_list()
    and writes it to the CSV file '<list_name>.csv'

    :param film_rows: iterable of rows, each row an iterable of cell values
    :param list_name: file name (or path) without the '.csv' extension
    """
    # newline='' leaves line endings to the csv module (otherwise blank rows
    # appear on Windows); utf-8 keeps non-ASCII film titles writable whatever
    # the platform's default encoding is
    with open(f'{list_name}.csv', 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows(film_rows)

In [29]:
# @title
'''
Letterboxd List scraper - main program
'''

def main():
    """
    Interactive entry point: asks for a list URL, scrapes it and writes the
    result to a CSV file in the current working directory.

    The prompt repeats until a URL could be scraped; entering 'q' or 'quit'
    returns without scraping. A URL whose second-to-last path segment is
    'list' is written to '<list name>.csv'; anything else is treated as a
    watchlist and written to '<username>_<list name>.csv'.
    """
    print('====================================================')
    print('Welcome to the Letterboxd List scraper!')
    print('Provided with an URL, this program outputs a CSV file')
    print('of movie title, release data and Letterboxd link.')
    print('Example url: https://letterboxd.com/.../list/short-films/).')
    print('The program currently only supports lists and watchlists.')
    print('Enter q or quit to exit the program.')
    print('====================================================\n')

    while True:
        list_url = input('Enter the URL of the list you wish to scrape:').strip()

        # exit option; a plain return is used because exit() would tear down
        # the notebook kernel
        if list_url == 'q' or list_url == 'quit':
            return

        # dropping trailing slashes first makes '.../list/name' and
        # '.../list/name/' parse the same way
        parts = list_url.rstrip('/').split('/')
        if len(parts) < 2 or not parts[-1]:
            print('That is not a valid URL, please try again.')
            continue

        list_name = parts[-1]
        # '<user>/list/<name>' is a list; otherwise the segment before the
        # name is taken as the username (watchlist)
        username = None if parts[-2] == 'list' else parts[-2]

        try:
            current_list = List(list_name, list_url)
        except Exception as err:
            print('That is not a valid URL, please try again.')
            print(f'Reason: {err}')
            continue

        if current_list.films is None:
            # nothing was scraped, so there is nothing to write
            print('No films were found at that URL, please try again.')
            continue

        break

    # writing to a CSV file
    csv_name = list_name if username is None else username + '_' + list_name
    print(f'Writing to {csv_name}.csv.')
    list_to_csv(current_list.films, csv_name)

    print('Done!')

In [7]:
# This step connects with your Google Drive to store the scraped data.
# The drive is mounted at /content/drive (`drive` comes from google.colab,
# imported in the first code cell).
drive.mount('/content/drive')

Mounted at /content/drive


In [33]:
# Create a letterboxd folder in your Google Drive and make it the working
# directory, so that the CSV (written to a relative path) ends up inside it.
# os.chdir is used because `!cd` only changes directory inside a throwaway
# subshell and leaves the notebook's working directory untouched.
letterboxd_dir = '/content/drive/MyDrive/letterboxd'
os.makedirs(letterboxd_dir, exist_ok=True)
os.chdir(letterboxd_dir)

In [5]:
# Running this cell will ask you for a URL and then start the scrape.
# main()'s banner states that lists and watchlists are supported.
# An example URL of a list is: https://letterboxd.com/_benferrer/list/top-50-favourite-films/
if __name__ == "__main__":
    main()

Welcome to the Letterboxd List scraper!
Provided with an URL, this program outputs a CSV file
of movie title, release data and Letterboxd link.
Example url: https://letterboxd.com/.../list/short-films/).
The program currently only supports lists and watchlists.
Enter q or quit to exit the program.

Enter the URL of the list you wish to scrape:https://letterboxd.com/_benferrer/list/top-50-favourite-films/

Scraping list data...



100%|██████████| 50/50 [00:28<00:00,  1.76it/s]

Writing to top-50-favourite-films.csv.
Done!



