In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [None]:
# Site root and the account whose diary is scraped.
base_url = 'https://letterboxd.com'
user = 'scoutog'

# Number of leading characters dropped from each diary href before it is
# appended to base_url (presumably the "/<user>" prefix -- confirm against the markup).
len_user_name = len('/' + user)

diary_urls = []

# Values substituted into the "rated/<rating>" segment of the diary URL,
# highest first; 'none' is presumably the bucket for unrated entries.
ratings = [
    5, 4.5, 4, 3.5, 3,
    2.5, 2, 1.5, 1, .5,
    'none',
]

# Accumulator for the scraped diary entries.
df = pd.DataFrame()

In [None]:
def _tag_text(tag, default=''):
    """Return the stripped text of ``tag``, or ``default`` when ``tag`` is None."""
    return tag.text.strip() if tag is not None else default


def _tag_attr(tag, name, default=None):
    """Return attribute ``name`` of ``tag``, or ``default`` when the tag or attribute is missing."""
    return tag.get(name, default) if tag is not None else default


def get_entry_details(dataframe, soup_entry, rating_num, site_url=None, user_prefix_len=None):
    """Append one record per diary row to ``dataframe`` and return the result.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Entries collected so far; it is not modified.
    soup_entry : iterable
        Diary rows (the notebook passes ``soup.select('tr.diary-entry-row')``).
    rating_num : object
        Value stored unchanged in the ``Rating_num`` column of every new row.
    site_url : str, optional
        Prefix for the film link. Defaults to the notebook-level ``base_url``.
    user_prefix_len : int, optional
        Number of leading characters removed from the row's href before it is
        appended to ``site_url``. Defaults to the notebook-level ``len_user_name``.

    Returns
    -------
    pandas.DataFrame
        ``dataframe`` followed by the new records, with a fresh 0..n-1 index.

    Notes
    -----
    The title anchor (``h3.headline-3 a[href]``) is required; a row without it
    raises ``AttributeError``. Every other field is optional and falls back to
    a default (``''``, ``None`` or ``'false'``) when its element is absent, so
    one incomplete row does not abort the scrape.
    """
    if site_url is None:
        site_url = base_url
    if user_prefix_len is None:
        user_prefix_len = len_user_name

    data = []

    for row in soup_entry:
        # The same anchor carries both the title text and the film href.
        title_anchor = row.select_one('h3.headline-3 a[href]')
        title = title_anchor.text
        film_link = site_url + title_anchor['href'][user_prefix_len:]

        # The rating and release-year cells may be missing; keep an empty string then.
        rating = _tag_text(row.select_one('td.td-rating .rating'))
        release_year = _tag_text(row.select_one('td.td-released'))

        # The edit button's data attributes carry the viewing date, review and rewatch flag.
        viewing_date = _tag_attr(
            row.select_one('span.diary-entry-edit a.edit-review-button'),
            'data-viewing-date',
        )
        review = row.select_one('a.edit-review-button')
        review_text = _tag_attr(review, 'data-review-text', '')
        rewatch = _tag_attr(review, 'data-rewatch', 'false')

        data.append({
            'Title': title,
            'Rating_txt': rating,
            'Rating_num': rating_num,
            'Viewing Date': viewing_date,
            'Release Year': release_year,
            'Review Text': review_text,
            'Rewatch': rewatch,
            'Film Link': film_link,
        })

    # Build the new rows in one go and append them behind the existing ones.
    entry_df = pd.DataFrame(data)
    return pd.concat([dataframe, entry_df], ignore_index=True)

In [None]:
# Start from an empty frame so re-running this cell never duplicates entries.
df = pd.DataFrame()

# One HTTP session for the whole crawl so connections to the site are reused.
with requests.Session() as session:
    # Loop through ratings
    for rating in tqdm(ratings):
        page = 1

        # Loop through pages until a listing comes back without diary rows.
        while True:
            r = session.get(
                f'{base_url}/{user}/films/diary/rated/{rating}/page/{page}/',
                timeout=30,  # never hang forever on a stalled connection
            )

            # A page past the end of the listing may answer 404: treat it as "done".
            if r.status_code == 404:
                break
            # Any other HTTP error (rate limiting, server fault) must not be
            # mistaken for the end of the listing, or entries are silently lost.
            r.raise_for_status()

            soup = BeautifulSoup(r.content, "html.parser")

            # Stop on the rows that are actually extracted, not on unrelated
            # heading links, so a page without rows can never loop forever.
            rows = soup.select('tr.diary-entry-row')
            if not rows:
                break

            df = get_entry_details(df, rows, rating)
            page += 1

In [None]:
def _fetch_cast(session, url, max_names=10):
    """Return the first ``max_names`` cast strings of the film page at ``url``.

    The strings are joined with ``','``. Returns ``None`` when the page has no
    ``#tab-cast p`` element or cannot be fetched; a fetch failure is reported
    through ``tqdm.write`` instead of aborting the whole run.
    """
    try:
        r = session.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException as exc:
        tqdm.write(f'Could not fetch cast from {url}: {exc}')
        return None

    soup = BeautifulSoup(r.content, "html.parser")
    cast_element = soup.select_one('#tab-cast p')
    if cast_element is None:
        return None

    # Take whole text nodes rather than splitting joined text on ',', so a
    # name that itself contains a comma still counts as a single entry.
    names = list(cast_element.stripped_strings)[:max_names]
    return ','.join(names)


# Results per film URL: a film logged several times is downloaded only once.
cast_by_url = {}
cast_column = []
film_links = df['Film Link'] if 'Film Link' in df.columns else []

with requests.Session() as session:
    for url in tqdm(film_links, total=len(film_links), desc="Processing rows"):
        if url not in cast_by_url:
            cast_by_url[url] = _fetch_cast(session, url)
        cast_column.append(cast_by_url[url])

# Assign the column once, after the loop, instead of writing into df while iterating it.
df['Cast'] = cast_column

In [None]:
# One (title, rating) tuple per diary entry, kept in a single 'pair_key' column.
pair_df = pd.DataFrame({
    'pair_key': [
        (title, rating_num)
        for title, rating_num in zip(df['Title'], df['Rating_num'])
    ],
})

In [None]:
# Distinct (title, rating) pairs among the diary entries.
pair_df['pair_key'].unique()