In [2]:
import pandas as pd
import os

# Film Ratings + Other Film Metadata

Sourced from TMDB (The Movie Database).

Includes aggregate data such as runtimes, popularity ratings, movie descriptions, number of raters, revenue, etc.

[Movie details for a given movie id](https://developer.themoviedb.org/reference/movie-details)


Here is the Google Drive link to the bulk movie ID data:
[download tmdb_movie_ids.csv](https://drive.google.com/file/d/1gOMNDu7MLIriftb3audXMP0UEmZAiHQt/view?usp=sharing)


In [11]:
import requests
import json

In [14]:
# Directory holding the raw input files; a relative path, so it is resolved
# against the notebook's working directory.
RAW_PATH = "raw"

In [15]:
# Load the bulk TMDB movie-id export (tmdb_movie_ids.csv) from RAW_PATH.
tmdb_df = pd.read_csv(f"{RAW_PATH}/tmdb_movie_ids.csv")

In [16]:
# Display the movies ordered from most to least popular. sort_values returns a
# new frame, so tmdb_df itself keeps its original row order.
tmdb_df.sort_values(by='popularity', ascending=False)

Unnamed: 0,adult,id,original_title,popularity,video
811727,False,1156594,Culpa nuestra,532.1524,False
1073852,False,1511789,Captain Hook - The Cursed Tides,340.5134,False
534839,False,755898,War of the Worlds,317.3022,False
907504,False,1280450,Stolen Girl,304.1407,False
926600,False,1305717,Hunting Grounds,300.4624,False
...,...,...,...,...,...
807723,False,1151169,Fade Away,0.0000,False
807749,False,1151216,Scenic National Parks: Zion and Bryce,0.0000,True
807758,False,1151233,Pompeii: The Doomed City,0.0000,True
807760,False,1151235,"When It Rayns, It Pours",0.0000,False


In [20]:
# Read the TMDB API token from the environment so it never appears in the notebook.
# NOTE: the lookup yields None when TMDB_API_KEY is unset, in which case the
# Authorization header below becomes the literal "Bearer None".
tmdb_key = os.environ.get("TMDB_API_KEY")

# Headers sent with each TMDB request: ask for JSON, authenticate with the bearer token.
headers = dict(
    accept="application/json",
    Authorization=f"Bearer {tmdb_key}",
)

Note that there are a bunch of different endpoints for different sorts of info, meaning we could query by:


*   movie_id
*   genre
*   what's popular/trending
*   keywords
*   release dates...

You get the idea.

View API Reference here: [TMDB API Reference](https://developer.themoviedb.org/reference/getting-started)

I don't know exactly what the rate limiting looks like, but I don't think we should try to extract details for ALL the IDs in the movie ID dataset.

In [21]:
# Discover endpoint: returns a paginated list of movies.
movie_url = "https://api.themoviedb.org/3/discover/movie"

# Base URL of the movie-details endpoint; a movie id is appended to it.
details_url = "https://api.themoviedb.org/3/movie/"

## 1. Discover movies endpoint



In [22]:
# Fetch the first page of the discover endpoint.
# An explicit timeout is required: requests waits indefinitely by default,
# which would hang the kernel if the API stalls.
discover_response = requests.get(movie_url, headers=headers, timeout=30)
# Fail here with an HTTPError on a 4xx/5xx response (e.g. a bad or missing
# token) instead of displaying an error payload as though it were results.
discover_response.raise_for_status()
movies = discover_response.text

json.loads(movies)

{'page': 1,
 'results': [{'adult': False,
   'backdrop_path': '/hpXBJxLD2SEf8l2CspmSeiHrBKX.jpg',
   'genre_ids': [18, 27, 14],
   'id': 1062722,
   'original_language': 'en',
   'original_title': 'Frankenstein',
   'overview': 'Dr. Victor Frankenstein, a brilliant but egotistical scientist, brings a creature to life in a monstrous experiment that ultimately leads to the undoing of both the creator and his tragic creation.',
   'popularity': 1011.0264,
   'poster_path': '/g4JtvGlQO7DByTI6frUobqvSL3R.jpg',
   'release_date': '2025-10-17',
   'title': 'Frankenstein',
   'video': False,
   'vote_average': 7.934,
   'vote_count': 940},
  {'adult': False,
   'backdrop_path': '/82lM4GJ9uuNvNDOEpxFy77uv4Ak.jpg',
   'genre_ids': [28, 878, 12],
   'id': 1242898,
   'original_language': 'en',
   'original_title': 'Predator: Badlands',
   'overview': 'Cast out from his clan, a young Predator finds an unlikely ally in a damaged android and embarks on a treacherous journey in search of the ultimate

## 2. Query by movie ID using the bulk dataset of IDs

In [23]:
def get_movie_id(df, title):
    """Return the TMDB id of the movie whose original title matches ``title``.

    Matching is case-insensitive and exact on the ``original_title`` column.
    Several movies can share a title (remakes, re-releases); when the frame
    has a ``popularity`` column the most popular match is returned, otherwise
    the first match in row order.

    Args:
        df: DataFrame with at least ``original_title`` and ``id`` columns.
        title: Title to look up.

    Returns:
        The matching id as a plain ``int``, or ``None`` if no row matches.
    """
    # Rows with a missing title compare as False and are simply not matched.
    matches = df[df["original_title"].str.lower() == title.lower()]
    if matches.empty:
        return None
    if "popularity" in matches.columns:
        # Stable sort keeps file order among equally popular rows; NaN
        # popularity sorts last, so it never beats a real score.
        matches = matches.sort_values("popularity", ascending=False, kind="mergesort")
    return int(matches["id"].iloc[0])


# Look up one known title to sanity-check get_movie_id against the bulk dataset.
example_movie = "Gone Girl"
movie_id = get_movie_id(tmdb_df, example_movie)
print(f"Movie ID for '{example_movie}': {movie_id}")

Movie ID for 'Gone Girl': 210577


In [None]:
def flatten_movie_data(data):
    """Flatten a movie-details payload into a single-level dict.

    Scalar fields are copied as-is and a missing one raises ``KeyError``.
    List-of-object fields (genres, production countries/companies) are reduced
    to a comma-separated string of their ``name`` values; a missing list field
    becomes an empty string.

    Args:
        data: Decoded JSON dict for one movie.

    Returns:
        A new dict of the selected fields, in the order listed in ``layout``.
    """
    # (output key, source key, is the value a list of {"name": ...} objects?)
    # NOTE: the output key 'imbd_id' (sic) is read from the "imdb_id" field;
    # the spelling is kept so the produced schema does not change.
    layout = (
        ("id", "id", False),
        ("title", "title", False),
        ("release_date", "release_date", False),
        ("imbd_id", "imdb_id", False),
        ("budget", "budget", False),
        ("revenue", "revenue", False),
        ("genres", "genres", True),
        ("overview", "overview", False),
        ("popularity", "popularity", False),
        ("runtime", "runtime", False),
        ("vote_average", "vote_average", False),
        ("vote_count", "vote_count", False),
        ("production_countries", "production_countries", True),
        ("production_companies", "production_companies", True),
    )

    flat = {}
    for out_key, src_key, is_named_list in layout:
        if is_named_list:
            flat[out_key] = ", ".join(entry["name"] for entry in data.get(src_key, []))
        else:
            flat[out_key] = data[src_key]
    return flat


# get_movie_id returns None for an unknown title; stop here rather than
# requesting ".../movie/None".
if movie_id is None:
    raise ValueError(f"No TMDB id found for '{example_movie}'")

# Fetch the full details record for the example movie.
# An explicit timeout is required: requests waits indefinitely by default.
details_response = requests.get(f"{details_url}{movie_id}", headers=headers, timeout=30)
# Surface HTTP errors (bad token, unknown id) directly, instead of letting
# flatten_movie_data fail with a KeyError on an error payload.
details_response.raise_for_status()
example_details = details_response.text
print(example_details)
flattened_details = flatten_movie_data(json.loads(example_details))

# Use the looked-up title so the label stays correct if example_movie changes.
print(f"Details for {example_movie}: ")
flattened_details

{"adult":false,"backdrop_path":"/1ufic9NSdolkgNkQSAVjlVp0uqn.jpg","belongs_to_collection":null,"budget":61000000,"genres":[{"id":9648,"name":"Mystery"},{"id":53,"name":"Thriller"},{"id":18,"name":"Drama"}],"homepage":"https://www.20thcenturystudios.com/movies/gone-girl","id":210577,"imdb_id":"tt2267998","origin_country":["US"],"original_language":"en","original_title":"Gone Girl","overview":"With his wife's disappearance having become the focus of an intense media circus, a man sees the spotlight turned on him when it's suspected that he may not be innocent.","popularity":15.0221,"poster_path":"/ts996lKsxvjkO2yiYG0ht4qAicO.jpg","production_companies":[{"id":25,"logo_path":"/qZCc1lty5FzX30aOCVRBLzaVmcp.png","name":"20th Century Fox","origin_country":"US"},{"id":508,"logo_path":"/4sGWXoboEkWPphI6es6rTmqkCBh.png","name":"Regency Enterprises","origin_country":"US"}],"production_countries":[{"iso_3166_1":"US","name":"United States of America"}],"release_date":"2014-10-01","revenue":37089025

{'id': 210577,
 'title': 'Gone Girl',
 'release_date': '2014-10-01',
 'imbd_id': 'tt2267998',
 'budget': 61000000,
 'revenue': 370890259,
 'genres': 'Mystery, Thriller, Drama',
 'overview': "With his wife's disappearance having become the focus of an intense media circus, a man sees the spotlight turned on him when it's suspected that he may not be innocent.",
 'popularity': 15.0221,
 'runtime': 149,
 'vote_average': 7.89,
 'vote_count': 19391,
 'production_countries': 'United States of America',
 'production_companies': '20th Century Fox, Regency Enterprises'}