In [3]:
import pandas as pd
import os

## Load Merged Data (SovAI + TMDB MOVIE IDs)

In [5]:
# Directory (relative to the notebook's working directory) that the merged CSV is read from.
CLEAN_DATA_PATH = "data/cleaned"

In [9]:
# Load the merged dataset from CLEAN_DATA_PATH and preview the first rows.
# NOTE(review): the preview shows "Unnamed: 0.1" and "Unnamed: 0" columns — presumably
# index columns written by earlier to_csv calls; confirm and drop them if they are not needed.
merged_df = pd.read_csv(f'{CLEAN_DATA_PATH}/merged.csv')
merged_df.head()

Unnamed: 0.1,Unnamed: 0,ticker,date,title,distributor,gross,percent_yd,percent_lw,theaters,per_theater,...,distributor address,distributorwebsite,release_date,year,title_key,id,original_title,popularity,weekday,is_weekend
0,0,PARA,2016-06-02,10 Cloverfield Lane,Paramount Pi…,11414,0.32,-0.12,120.0,95.0,...,"1515 Broadway, New York City, New York, 10036,...",http://www.paramount.com/,2016-03-10,2016,10 cloverfield lane,333371,10 Cloverfield Lane,7.8372,3,0
1,1,Private,2025-10-16,100 Meters,GKIDS,313,-0.03,0.0,1.0,313.0,...,"225 Broadway, Lower Manhattan, New York City, U.S",http://gkids.com/,2025-10-09,2025,100 meters,911001,100 meters,0.0143,3,0
2,2,SONY,2018-06-03,102 Not Out,Sony Pictures,2806,-0.48,-0.83,17.0,165.0,...,"10202 West Washington Boulevard, Culver City, ...",https://www.sonypictures.com/,2018-05-03,2018,102 not out,460713,102 Not Out,3.4544,6,1
3,3,Private,2006-09-04,10th & Wolf,ThinkFilm,1791,0.0,0.0,6.0,299.0,...,"1335 Windsor Ridge Ln, Annapolis, Maryland",https://www.thinkfilm-inc.com/,2006-08-17,2006,10th & wolf,13197,10th & Wolf,3.6557,0,0
4,4,WBD,2007-09-03,11th Hour,Warner Indep…,64888,0.0,0.0,111.0,585.0,...,"230 Park Avenue South, New York City, New York...",www.warnerbros.com,2007-08-16,2007,11th hour,476899,11th Hour,2.3114,0,0


## Extract Movie Details from TMDB API



In [10]:
import json, requests
import rich
import time
from tqdm.notebook import tqdm

In [11]:
# The TMDB credential comes from the TMDB_API_KEY environment variable.
# When the variable is unset the lookup yields None, so the header below reads "Bearer None".
tmdb_key = os.environ.get("TMDB_API_KEY")

# Request headers shared by every TMDB call in this notebook.
headers = dict(
    accept="application/json",
    Authorization=f"Bearer {tmdb_key}",
)

Note that there are a bunch of different endpoints for different sorts of info, meaning we could query by:


*   movie_id
*   genre
*   what's popular/trending
*   keywords
*   release dates...

You get the idea.

View API Reference here: [TMDB API Reference](https://developer.themoviedb.org/reference/getting-started)

I don't know exactly what TMDB's rate limits are, but we probably shouldn't try to fetch details for ALL the IDs in the movie ID dataset.

In [None]:
# TMDB v3 "discover" endpoint. NOTE(review): not referenced by the other cells shown here.
movie_url = "https://api.themoviedb.org/3/discover/movie"

# Base URL for per-movie details; the movie id is appended directly after the trailing slash.
details_url = "https://api.themoviedb.org/3/movie/"

### Flatten movie data received from API



In [None]:
def flatten_movie_data(data: dict) -> dict:
    """Flatten a TMDB movie details JSON object into a flat dict suitable for a dataframe.

    Scalar fields are copied as-is (``None`` when the key is absent). List-valued
    fields (genres, production companies, ...) are collapsed into ", "-joined
    strings, one output column per nested attribute.

    Missing keys, explicit JSON ``null`` values and ``None``/absent nested
    attributes are all tolerated: a missing list becomes ``""`` and a missing
    element inside a list becomes an empty slot, so parallel columns such as
    ``production_company_ids`` / ``production_company_countries`` stay aligned.

    Args:
        data: Decoded JSON object for a single movie.

    Returns:
        A dict with 27 flat keys, in a fixed order.
    """

    # handle potential missing keys gracefully
    def safe_get(key, default=None):
        return data.get(key, default)

    def as_list(key):
        # dict.get's default only applies to *missing* keys; an explicit null
        # in the payload would otherwise reach str.join as None.
        value = safe_get(key)
        return value if isinstance(value, list) else []

    def joined(values):
        # None becomes an empty slot rather than raising inside str.join.
        return ", ".join("" if v is None else str(v) for v in values)

    def field(items, name):
        # Pull one attribute out of every dict in a nested list.
        return joined(item.get(name) for item in items if isinstance(item, dict))

    spoken_languages = as_list("spoken_languages")
    genres = as_list("genres")
    companies = as_list("production_companies")
    countries = as_list("production_countries")
    collection = safe_get("belongs_to_collection")

    return {
        # Core identifiers
        "id": safe_get("id"),
        "imdb_id": safe_get("imdb_id"),
        "title": safe_get("title"),
        "original_title": safe_get("original_title"),
        "original_language": safe_get("original_language"),

        # Release info
        "release_date": safe_get("release_date"),
        "status": safe_get("status"),
        "homepage": safe_get("homepage"),

        # Financials
        "budget": safe_get("budget"),
        "revenue": safe_get("revenue"),

        # Content
        "adult": safe_get("adult"),
        "overview": safe_get("overview"),
        "tagline": safe_get("tagline"),
        "runtime": safe_get("runtime"),

        # Popularity & ratings
        "popularity": safe_get("popularity"),
        "vote_average": safe_get("vote_average"),
        "vote_count": safe_get("vote_count"),

        # Country & language
        "origin_country": joined(as_list("origin_country")),
        # Prefer the English name, falling back to the native name.
        "spoken_languages": joined(
            lang.get("english_name") or lang.get("name") or ""
            for lang in spoken_languages
            if isinstance(lang, dict)
        ),

        # Genres (split IDs and names)
        "genre_ids": field(genres, "id"),
        "genre_names": field(genres, "name"),

        # Production companies
        "production_company_ids": field(companies, "id"),
        "production_company_names": field(companies, "name"),
        "production_company_countries": field(companies, "origin_country"),

        # Production countries
        "production_country_codes": field(countries, "iso_3166_1"),
        "production_country_names": field(countries, "name"),

        # Extra metadata
        "belongs_to_collection": (
            collection.get("name") if isinstance(collection, dict) else None
        ),
    }


def query_movie_details(movie_id):
    """Fetch one movie's details from TMDB and return them flattened.

    Args:
        movie_id: TMDB movie id, appended to ``details_url``.

    Returns:
        The flat dict produced by ``flatten_movie_data``.

    Raises:
        requests.exceptions.HTTPError: if TMDB answers with a 4xx/5xx status.
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    # Without a timeout, requests can block indefinitely on a stalled connection.
    response = requests.get(f"{details_url}{movie_id}", headers=headers, timeout=20)
    # Fail loudly instead of flattening an error payload into a row of Nones.
    response.raise_for_status()
    return flatten_movie_data(response.json())


def get_movie_id(df, title):
    """Look up a movie id by case-insensitive match on ``original_title``.

    Args:
        df: DataFrame with ``original_title`` and ``id`` columns.
        title: Title to search for; compared after lower-casing both sides.

    Returns:
        The ``id`` of the first matching row that has one, as an ``int``, or
        ``None`` when no row matches or every matching row has a missing id.
    """
    wanted = title.lower()
    # Drop matches with a missing id: int(NaN) would raise instead of
    # reporting "not found".
    matching_ids = df.loc[df["original_title"].str.lower() == wanted, "id"].dropna()
    return int(matching_ids.iloc[0]) if not matching_ids.empty else None



In [None]:
example_movie = "Gone Girl"
# Look the title up in merged_df (loaded above), which carries both the
# "original_title" and "id" columns that get_movie_id needs.
movie_id = get_movie_id(merged_df, example_movie)
print(f"Movie ID for '{example_movie}': {movie_id}")

# Stop here rather than requesting ".../movie/None" when the title is absent.
if movie_id is None:
    raise ValueError(f"'{example_movie}' was not found in merged_df; pick a title that is present.")

# Timeout keeps the cell from hanging on a stalled connection.
example_details = requests.get(f"{details_url}{movie_id}", headers=headers, timeout=20).text
print(example_details)
flattened_details = flatten_movie_data(json.loads(example_details))

Movie ID for 'Gone Girl': 210577
{"adult":false,"backdrop_path":"/1ufic9NSdolkgNkQSAVjlVp0uqn.jpg","belongs_to_collection":null,"budget":61000000,"genres":[{"id":9648,"name":"Mystery"},{"id":53,"name":"Thriller"},{"id":18,"name":"Drama"}],"homepage":"https://www.20thcenturystudios.com/movies/gone-girl","id":210577,"imdb_id":"tt2267998","origin_country":["US"],"original_language":"en","original_title":"Gone Girl","overview":"With his wife's disappearance having become the focus of an intense media circus, a man sees the spotlight turned on him when it's suspected that he may not be innocent.","popularity":14.5324,"poster_path":"/ts996lKsxvjkO2yiYG0ht4qAicO.jpg","production_companies":[{"id":25,"logo_path":"/qZCc1lty5FzX30aOCVRBLzaVmcp.png","name":"20th Century Fox","origin_country":"US"},{"id":508,"logo_path":"/4sGWXoboEkWPphI6es6rTmqkCBh.png","name":"Regency Enterprises","origin_country":"US"}],"production_countries":[{"iso_3166_1":"US","name":"United States of America"}],"release_date

In [None]:
# Build the label from example_movie so it always matches the title that was queried.
print(f"Details for {example_movie}: ")
print(flattened_details)

Details for Gone Girl: 
{'id': 210577, 'imdb_id': 'tt2267998', 'title': 'Gone Girl', 'original_title': 'Gone Girl', 'original_language': 'en', 'release_date': '2014-10-01', 'status': 'Released', 'homepage': 'https://www.20thcenturystudios.com/movies/gone-girl', 'budget': 61000000, 'revenue': 370890259, 'adult': False, 'overview': "With his wife's disappearance having become the focus of an intense media circus, a man sees the spotlight turned on him when it's suspected that he may not be innocent.", 'tagline': "You don't know what you've got 'til it's...", 'runtime': 149, 'popularity': 14.5324, 'vote_average': 7.89, 'vote_count': 19389, 'origin_country': 'US', 'spoken_languages': 'English', 'genre_ids': '9648, 53, 18', 'genre_names': 'Mystery, Thriller, Drama', 'production_company_ids': '25, 508', 'production_company_names': '20th Century Fox, Regency Enterprises', 'production_company_countries': 'US, US', 'production_country_codes': 'US', 'production_country_names': 'United States of 

In [None]:
# =============================
def collect_tmdb_data(movie_ids, checkpoint_every=500, sleep_time=0.35, resume=False,
                      results_path="tmdb_results.csv", errors_path="tmdb_failed.csv",
                      max_retries=3):
    """
    Collect TMDB details for a list of movie IDs.

    - movie_ids: iterable of TMDB movie ids (anything ``int()`` accepts)
    - checkpoint_every: how often to save to disk (set None to disable all
      disk writes, including the final save)
    - sleep_time: delay between API calls to avoid rate limit
    - resume: if True, load the existing results file and skip already done ones
    - results_path: CSV that checkpoints are written to / resumed from
    - errors_path: CSV that failed ids and their error text are written to
    - max_retries: attempts per id (>= 1) for network errors and for
      transient HTTP statuses (429 and 5xx)

    Returns a DataFrame with one flattened row per successfully fetched movie,
    including rows restored from the checkpoint when resuming.
    """

    results = []
    done_ids = set()
    errors = []

    # Resume support
    if resume:
        try:
            previous = pd.read_csv(results_path)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # No checkpoint yet, or one written before any row was collected.
            previous = None
        if previous is not None:
            results = previous.to_dict("records")
            done_ids = {int(value) for value in previous["id"].dropna()}
            print(f"Resuming from checkpoint: {len(done_ids)} already collected.")

    # Statuses worth retrying: rate limiting and server-side failures.
    transient_statuses = {429, 500, 502, 503, 504}

    def save_progress():
        # Persist both the collected rows and the failures recorded so far.
        pd.DataFrame(results).to_csv(results_path, index=False)
        pd.DataFrame(errors, columns=["id", "error"]).to_csv(errors_path, index=False)

    # The context manager closes pooled connections when the run ends or fails.
    with requests.Session() as session:
        for idx, mid in enumerate(tqdm(movie_ids, desc="Fetching TMDB data")):
            movie_id = int(mid)
            if movie_id in done_ids:
                continue

            url = f"{details_url}{movie_id}"
            for attempt in range(max_retries):
                last_attempt = attempt == max_retries - 1
                try:
                    r = session.get(url, headers=headers, timeout=20)
                    if r.status_code == 200:
                        results.append(flatten_movie_data(r.json()))
                        # Also guards against duplicate ids within this run.
                        done_ids.add(movie_id)
                        break
                    if r.status_code == 404:
                        errors.append((mid, "404 Not Found"))
                        break
                    if r.status_code not in transient_statuses or last_attempt:
                        errors.append((mid, f"HTTP {r.status_code}"))
                        break
                    # Transient failure: honour Retry-After when the server
                    # gives a whole number of seconds, else back off exponentially.
                    retry_after = r.headers.get("Retry-After", "")
                    delay = float(retry_after) if retry_after.isdigit() else 2 ** attempt
                except (requests.exceptions.RequestException, ValueError) as e:
                    # ValueError covers an undecodable JSON body on requests
                    # versions whose JSON error is not a RequestException.
                    if last_attempt:
                        errors.append((mid, str(e)))
                        break
                    delay = 2 ** attempt
                time.sleep(delay)

            # Rate limit safety
            time.sleep(sleep_time)

            # Save checkpoint
            if checkpoint_every and (idx + 1) % checkpoint_every == 0:
                save_progress()
                print(f"Checkpoint saved at {idx+1}/{len(movie_ids)}")

    # Final save: without it, everything fetched after the last checkpoint
    # would be missing from disk and re-fetched on the next resume.
    if checkpoint_every:
        save_progress()

    return pd.DataFrame(results)

In [None]:
# Unique movie ids from merged_df: drop rows with no id, cast to int, de-duplicate.
ids = merged_df["id"].dropna().astype(int).unique().tolist()

# Collect info for all IDs (with resume + checkpoints)


In [None]:
# Fetch details for every id, resuming from an existing checkpoint and saving every 500 ids.
# The collector sleeps between requests, so expect this cell to take a long time on the full list.
full_tmdb_df = collect_tmdb_data(ids, checkpoint_every=500, resume=True)

In [None]:
# Write the collected metadata to CSV without the index column.
# NOTE(review): movie_id_path is not defined in any cell shown here — confirm it is set
# before this cell runs, otherwise a fresh-kernel run fails with NameError.
full_tmdb_df.to_csv(f"{movie_id_path}/tmdb_movie_metadata.csv", index=False)

In [None]:
# Preview the first rows of the collected metadata.
full_tmdb_df.head()

Unnamed: 0,id,imdb_id,title,original_title,original_language,release_date,status,homepage,budget,revenue,adult,overview,tagline,runtime,popularity,vote_average,vote_count,origin_country,spoken_languages,genre_ids,genre_names,production_company_ids,production_company_names,production_company_countries,production_country_codes,production_country_names,belongs_to_collection
0,333371,tt1179933,10 Cloverfield Lane,10 Cloverfield Lane,en,2016-03-10,Released,https://www.paramountmovies.com/movies/10-clov...,15000000,110216998,False,"After a catastrophic car crash, a young woman ...",Monsters come in many forms.,104,4.343,7.0,8351,US,English,"53, 878, 18, 27","Thriller, Science Fiction, Drama, Horror",11461,Bad Robot,US,US,United States of America,
1,911001,,100 meters,100 meters,en,2012-12-01,Released,,0,0,False,,,1,0.014,0.0,0,US,,,,,,,,,
2,460713,tt6580564,102 Not Out,102 Not Out,hi,2018-05-04,Released,,0,0,False,An unusual love-hate relationship between a 75...,"Cool, Old School",107,1.018,7.5,62,"IN, US",Hindi,"35, 10751, 18","Comedy, Family, Drama","5544, 106569, 157559, 63520","Benchmark, Treetop Entertainment, Sony Picture...",", , IN, US","IN, US","India, United States of America",
3,13197,tt0360323,10th & Wolf,10th & Wolf,en,2006-02-19,Released,,8000000,143451,False,A former street thug returns to his Philadelph...,"The Intersection Where Family, Honor and Betra...",107,2.686,5.856,108,US,English,"28, 80, 18, 9648, 53","Action, Crime, Drama, Mystery, Thriller",41427,Suzanne DeLaurentiis Productions,,US,United States of America,
4,476899,tt6207498,11th Hour,11th Hour,en,2017-12-31,Released,http://undergroundfilms.ie/projects/11th-hour/,0,0,False,"Based on a true story, 11th Hour recounts how,...",,10,5.061,6.417,6,"IE, MX, US","English, Spanish",,,"160740, 2307, 71357, 49326, 11294","Canal 44, Hell's Kitchen, Hell's Kitchen Inter...",", IE, , US, MX","IE, MX","Ireland, Mexico",


In [None]:
# (rows, columns) of the collected metadata frame.
full_tmdb_df.shape

(15793, 27)