In [1]:
import os
import json
import requests
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup

In [2]:
# Date of this run as a compact YYYYMMDD string.
today = format(pd.Timestamp.today(), '%Y%m%d')

In [3]:
# HTTP headers sent with every request in this notebook (page scrape and GraphQL calls).
headers = dict([
    ('accept', 'application/graphql+json, application/json'),
    ('accept-language', 'en-US,en;q=0.9,es;q=0.8'),
    ('content-type', 'application/json'),
    (
        'user-agent',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
    ),
])

In [4]:
# Download the "all interests" page and pull the interest categories out of the
# JSON blob embedded in the first <script type="application/json"> tag.
cat_url = 'https://www.imdb.com/interest/all/'
# A timeout keeps the cell from hanging forever on a stalled connection.
cat_response = requests.get(cat_url, headers=headers, timeout=30)
# Fail loudly on 4xx/5xx instead of trying to parse an error page.
cat_response.raise_for_status()
cat_content = BeautifulSoup(cat_response.text, 'html.parser')
cat_script = cat_content.find('script', type="application/json")
if cat_script is None:
    # Without this check the next line fails with an opaque AttributeError on None.
    raise RuntimeError(
        f"No <script type=\"application/json\"> tag found at {cat_url}; "
        "the page layout may have changed or the request was blocked."
    )
cat_json = json.loads(cat_script.text)
# NOTE(review): this key path is taken as-is from the page's embedded state;
# confirm against a live response if it starts raising KeyError.
cat_items = cat_json['props']['pageProps']['interestCategories']

In [5]:
# Flatten the nested category -> interests structure into one record per
# interest, repeating the parent category's id and text on every record.
cat_dicts = [
    {
        "genre_id": category['id'],
        "genre": category['text'],
        "interest_id": edge['node']['id'],
        "interest_name": edge['node']['primaryText']['text'],
    }
    for category in cat_items
    for edge in category['interests']['edges']
]

In [6]:
# Tabular view of the flattened records: one row per (category, interest) pair.
cat_df = pd.DataFrame(data=cat_dicts)

In [42]:
# Lower-cased category names, one entry per interest row.
genre_list = cat_df['genre'].str.lower().to_list()
# Lower-cased interest ids in reverse row order; this is the order they are fetched in.
interest_list = cat_df['interest_id'].str.lower().to_list()[::-1]

---

In [43]:
def fetch_movies_by_genre(interest_id, progress_bar, timeout=30):
    """Collect every title the AdvancedTitleSearch query returns for one interest.

    Requests pages of 50 results from the GraphQL endpoint, passing each
    response's ``endCursor`` as the next ``after`` value until ``hasNextPage``
    is false. On a network failure, a non-200 status, an unparseable body, or
    a body without search results, a message is printed and the titles
    collected so far are returned.

    Args:
        interest_id: Interest id sent as the only entry of ``allInterestIds``.
        progress_bar: Object exposing ``update(n)`` (e.g. a tqdm bar). It is
            advanced by the number of edges on each page.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        list[dict]: One flat record per title with the keys ``id``, ``title``,
        ``type``, ``release_year``, ``end_year``, ``rating``, ``vote_count``,
        ``runtime_seconds``, ``certificate``, ``genres``, ``plot`` and
        ``image_url``. Fields missing or null in the response are ``None``
        (``genres`` is an empty list).
    """
    # Everything except the cursor is identical across pages, so build it once.
    payload = {
        'operationName': 'AdvancedTitleSearch',
        'variables': {
            "after": "",  # Start without a cursor for the first page
            "first": 50,
            "interestConstraint": {
                "allInterestIds": [interest_id],
                "excludeInterestIds": []
            },
            "locale": "en-US",
            "sortBy": "POPULARITY",
            "sortOrder": "ASC",
            "titleTypeConstraint": {
                "anyTitleTypeIds": ["movie", "tvMovie", "tvSpecial", "video", "tvSeries", "tvMiniSeries"],
                "excludeTitleTypeIds": []
            }
        },
        'extensions': {
            'persistedQuery': {
                'sha256Hash': 'f3e9d880ef5404e832446904abc3c455b762cf23c66089c3747ae96dfb3c0065',
                'version': 1
            }
        }
    }
    variables = payload['variables']
    all_movies = []
    has_next_page = True

    while has_next_page:
        try:
            response = requests.post(
                'https://caching.graphql.imdb.com/',
                json=payload,
                headers=headers,
                timeout=timeout,
            )
        except requests.RequestException as exc:
            # Same best-effort policy as the other failure paths: report and stop.
            print(f"Error: Request failed: {exc}")
            break

        if response.status_code != 200:
            print(f"Error: Received status code {response.status_code}")
            print(f"Response content: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            # ValueError is the common base of every JSON decode error requests can raise.
            print("Error: Failed to parse JSON.")
            print(f"Response content: {response.content.decode('utf-8')}")
            break

        search = _dig(data, 'data', 'advancedTitleSearch')
        if not isinstance(search, dict):
            # e.g. a GraphQL error body where "data" is null.
            print("Error: Response did not contain search results.")
            print(f"Response content: {response.text}")
            break

        movies = search.get('edges') or []
        for movie in movies:
            record = _parse_title_edge(movie)
            if record is not None:
                all_movies.append(record)

        # Update the shared progress bar
        progress_bar.update(len(movies))

        # Pagination: move to the next page
        has_next_page = bool(_dig(search, 'pageInfo', 'hasNextPage'))
        next_cursor = _dig(search, 'pageInfo', 'endCursor')
        if has_next_page and not next_cursor:
            # An empty cursor would re-request the first page forever.
            print("Error: Response reported another page but gave no cursor; stopping.")
            break
        variables['after'] = next_cursor or ""

    return all_movies


def _dig(obj, *keys):
    """Follow ``keys`` through nested dicts; None if a level is missing, null or not a dict."""
    for key in keys:
        if not isinstance(obj, dict):
            return None
        obj = obj.get(key)
    return obj


def _parse_title_edge(edge):
    """Flatten one search edge into a record dict, or return None when it has no title."""
    node = _dig(edge, 'node')
    title = _dig(node, 'title')
    if not isinstance(title, dict) or not title:
        return None

    plot = _dig(node, 'plot', 'plotText', 'plainText')
    if plot is None:
        # NOTE(review): the plot was originally read from the edge node only.
        # Fall back to the title object in case the API nests it there;
        # confirm against a live response.
        plot = _dig(title, 'plot', 'plotText', 'plainText')

    return {
        'id': title.get('id'),
        'title': _dig(title, 'titleText', 'text'),
        'type': _dig(title, 'titleType', 'text'),
        'release_year': _dig(title, 'releaseYear', 'year'),
        'end_year': _dig(title, 'releaseYear', 'endYear'),
        'rating': _dig(title, 'ratingsSummary', 'aggregateRating'),
        'vote_count': _dig(title, 'ratingsSummary', 'voteCount'),
        'runtime_seconds': _dig(title, 'runtime', 'seconds'),
        'certificate': _dig(title, 'certificate', 'rating'),
        'genres': [
            _dig(entry, 'genre', 'text')
            for entry in (_dig(title, 'titleGenres', 'genres') or [])
        ],
        'plot': plot,
        'image_url': _dig(title, 'primaryImage', 'url'),
    }

In [None]:
# Pass 1: request a single row per interest just to read the reported total,
# so the shared progress bar knows how many titles to expect overall.
total_movies = 0
dfs = []

for interest_id in interest_list:
    payload = {
        'operationName': 'AdvancedTitleSearch',
        'variables': {
            "after": "",
            "first": 1,
            "interestConstraint": {
                "allInterestIds": [interest_id],
                "excludeInterestIds": []
            },
            "locale": "en-US",
            "sortBy": "POPULARITY",
            "sortOrder": "ASC",
            "titleTypeConstraint": {
                "anyTitleTypeIds": ["movie", "tvMovie", "tvSpecial", "video", "tvSeries", "tvMiniSeries"],
                "excludeTitleTypeIds": []
            }
        },
        'extensions': {
            'persistedQuery': {
                'sha256Hash': 'f3e9d880ef5404e832446904abc3c455b762cf23c66089c3747ae96dfb3c0065',
                'version': 1
            }
        }
    }

    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.post('https://caching.graphql.imdb.com/', json=payload, headers=headers, timeout=30)
    if response.status_code != 200:
        # The count only sizes the progress bar, so report and carry on.
        print(f"Warning: could not count titles for {interest_id} (status {response.status_code})")
        continue

    try:
        data = response.json()
    except ValueError:
        print(f"Warning: could not parse count response for {interest_id}")
        continue

    # Tolerate a null "data" / "advancedTitleSearch" / "total" in the body.
    search = ((data or {}).get('data') or {}).get('advancedTitleSearch') or {}
    total_movies += search.get('total') or 0

# Pass 2: fetch every page for every interest. The context manager closes the
# bar even if one of the fetches raises.
with tqdm(total=total_movies, desc="Fetching all movies") as progress_bar:
    for interest_id in interest_list:
        movies = fetch_movies_by_genre(interest_id, progress_bar)
        df = pd.DataFrame(movies)
        # NOTE(review): this column is named genre_id but holds the interest id;
        # the name is kept so the exported JSON schema does not change.
        df['genre_id'] = interest_id
        dfs.append(df)

all_interests_combined_df = pd.concat(dfs).reset_index(drop=True)

In [None]:
# Stack the per-interest frames into a single frame with a fresh 0..n-1 index.
all_interests_combined_df = (
    pd.concat(dfs)
    .reset_index(drop=True)
)

In [None]:
# Write the combined frame as a JSON array of records.
output_path = 'data/processed/imdb_movies_by_genre.json'
# Create the target folder first so the write does not fail on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
all_interests_combined_df.to_json(output_path, indent=4, orient='records')