In [4]:
import requests
import pandas as pd
import time


In [19]:
# TMDB API Configuration
API_KEY = '350d9555d777e2abfbc466e83e8fb0d2'  # Replace with your API key
BASE_URL = 'https://api.themoviedb.org/3/'
HEADERS = {'accept': 'application/json'}

In [7]:
def process_nested_data(data, key):
    return ', '.join([item['name'] for item in data.get(key, [])])

In [71]:
# Function 1: Fetch Movie Metadata
def fetch_movies_data(movie_ids):
    movies_data = []
    for idx, movie_id in enumerate(movie_ids, 1):
        # Fetch movie details
        movie_url = f"{BASE_URL}movie/{movie_id}?api_key={API_KEY}&append_to_response=keywords"
        movie_response = requests.get(movie_url, headers=HEADERS)
        time.sleep(0.25)  # Rate limiting
        
        if movie_response.status_code != 200:
            continue  # Skip invalid IDs
        
        movie = movie_response.json()
        movies_data.append({
            'budget': movie.get('budget'),
            'genres': process_nested_data(movie, 'genres'),
            'homepage': movie.get('homepage'),
            'id': movie.get('id'),
            'keywords': process_nested_data(movie.get('keywords', {}), 'keywords'),
            'original_language': movie.get('original_language'),
            'original_title': movie.get('original_title'),
            'overview': movie.get('overview'),
            'popularity': movie.get('popularity'),
            'production_companies': process_nested_data(movie, 'production_companies'),
            'production_countries': ', '.join([c['iso_3166_1'] for c in movie.get('production_countries', [])]),
            'release_date': movie.get('release_date'),
            'revenue': movie.get('revenue'),
            'runtime': movie.get('runtime'),
            'spoken_languages': ', '.join([lang['iso_639_1'] for lang in movie.get('spoken_languages', [])]),
            'status': movie.get('status'),
            'tagline': movie.get('tagline'),
            'title': movie.get('title'),
            'vote_average': movie.get('vote_average'),
            'vote_count': movie.get('vote_count')
        })
        
        # Progress tracking
        if idx % 100 == 0:
            print(f"Processed {idx}/{len(movie_ids)} movies for metadata")
        
        print(f"{idx} Movie: " + movies_data[-1]['title'])  # Access the last appended movie
    
    return movies_data


In [82]:
# Function 2: Fetch Credits Data
def fetch_credits_data(movie_ids):
    credits_data = []
    for idx, movie_id in enumerate(movie_ids, 1):
        # Fetch credits
        credits_url = f"{BASE_URL}movie/{movie_id}/credits?api_key={API_KEY}"
        credits_response = requests.get(credits_url, headers=HEADERS)

        movie_url = f"{BASE_URL}movie/{movie_id}?api_key={API_KEY}&append_to_response=keywords"
        movie_response = requests.get(movie_url, headers=HEADERS)
        movie = movie_response.json()
        time.sleep(0.25)  # Rate limiting
        
        if credits_response.status_code == 200:
            credits = credits_response.json()
            movie_title = movie.get('title')   # Fallback title
            credits_data.append({
                'movie_id': movie_id,
                'title': movie_title,
                'cast': str([{"name": m['name'], "role": m['character']} for m in credits.get('cast', [])]),
                'crew': str([{"name": m['name'], "department": m['department']} for m in credits.get('crew', [])])
            })
        
        # Progress tracking
        if idx % 100 == 0:
            print(f"Processed {idx}/{len(movie_ids)} movies for credits")
        
        print(f"{idx} Movie: " + credits_data[-1]['title']) 
    
    return credits_data

In [20]:
num_movies = 50

In [25]:
movie_ids = []
page = 2

In [26]:
url = f"{BASE_URL}discover/movie?api_key={API_KEY}&sort_by=popularity.desc&page={page}"

In [27]:
response = requests.get(url, headers=HEADERS)
response.status_code

200

In [None]:
data = response.json()
data

In [37]:
def fetch_movie_ids(num_movies=5000):
    movie_ids = []
    page = 1
    while len(movie_ids) < num_movies:
        # Use the Discover endpoint to get movies sorted by popularity
        url = f"{BASE_URL}discover/movie?api_key={API_KEY}&sort_by=popularity.desc&page={page}"
        response = requests.get(url, headers=HEADERS)
        if response.status_code == 200:
            data = response.json()
            movies = data['results']
            total_pages = data['total_pages']
            
            if not movies:
                break  # No more movies
            for movie in movies:
                movie_ids.append(movie['id'])
            # Stop if we've reached the maximum pages
            if page >= total_pages:
                break
            page += 1
            print(f"Page {page}: Collected {len(movie_ids)} movie IDs so far")
            time.sleep(0.25)  # Avoid rate limits
        else:
            print(f"Failed to fetch page {page}")
            break
    return movie_ids[:num_movies]

In [50]:
movies_ids = fetch_movie_ids(1000)

Page 2: Collected 20 movie IDs so far
Page 3: Collected 40 movie IDs so far
Page 4: Collected 60 movie IDs so far
Page 5: Collected 80 movie IDs so far
Page 6: Collected 100 movie IDs so far
Page 7: Collected 120 movie IDs so far
Page 8: Collected 140 movie IDs so far
Page 9: Collected 160 movie IDs so far
Page 10: Collected 180 movie IDs so far
Page 11: Collected 200 movie IDs so far
Page 12: Collected 220 movie IDs so far
Page 13: Collected 240 movie IDs so far
Page 14: Collected 260 movie IDs so far
Page 15: Collected 280 movie IDs so far
Page 16: Collected 300 movie IDs so far
Page 17: Collected 320 movie IDs so far
Page 18: Collected 340 movie IDs so far
Page 19: Collected 360 movie IDs so far
Page 20: Collected 380 movie IDs so far
Page 21: Collected 400 movie IDs so far
Page 22: Collected 420 movie IDs so far
Page 23: Collected 440 movie IDs so far
Page 24: Collected 460 movie IDs so far
Page 25: Collected 480 movie IDs so far
Page 26: Collected 500 movie IDs so far
Page 27: Col

In [51]:
len(movies_ids)

1000

In [53]:
enumerate(movies_ids, 1)

<enumerate at 0x270f3abd1c0>

In [None]:
for idx, mov_id in enumerate(movies_ids, 1):
    print("idx: ", idx)
    print("movie_id: ", mov_id)

In [None]:
movie_url = f"{BASE_URL}movie/{1097549}?api_key={API_KEY}&append_to_response=keywords"

In [55]:
movie_response = requests.get(movie_url, headers=HEADERS)
movie_response

<Response [200]>

In [None]:
movie = movie_response.json()
movie

In [57]:
movies_data = []

In [58]:
movies_data.append({
    'budget': movie.get('budget'),
    'genres': process_nested_data(movie, 'genres'),
    'homepage': movie.get('homepage'),
    'id': movie.get('id'),
    'keywords': process_nested_data(movie.get('keywords', {}), 'keywords'),  # Nested keywords
    'original_language': movie.get('original_language'),
    'original_title': movie.get('original_title'),
    'overview': movie.get('overview'),
    'popularity': movie.get('popularity'),
    'production_companies': process_nested_data(movie, 'production_companies'),
    'production_countries': ', '.join([c['iso_3166_1'] for c in movie.get('production_countries', [])]),
    'release_date': movie.get('release_date'),
    'revenue': movie.get('revenue'),
    'runtime': movie.get('runtime'),
    'spoken_languages': ', '.join([lang['iso_639_1'] for lang in movie.get('spoken_languages', [])]),
    'status': movie.get('status'),
    'tagline': movie.get('tagline'),
    'title': movie.get('title'),
    'vote_average': movie.get('vote_average'),
    'vote_count': movie.get('vote_count')
})

In [59]:
movies_data

[{'budget': 20000000,
  'genres': 'Drama',
  'homepage': 'https://a24films.com/films/babygirl',
  'id': 1097549,
  'keywords': 'infidelity, new york city, husband wife relationship, sexual frustration, sexuality, eroticism, seduction, workplace, female protagonist, married woman, ceo, intern, woman director, submission, taboo sex, masculinity, female sexuality, innuendo, erotic thriller, workplace romance, unfaithful wife, sex, young man seduces old lady, age-gap relationship, kink, power dynamics, employee boss relationship',
  'original_language': 'en',
  'original_title': 'Babygirl',
  'overview': 'A high-powered CEO puts her career and family on the line when she begins a torrid affair with her much younger intern.',
  'popularity': 351.992,
  'production_companies': 'A24, 2AM, Man Up Film',
  'production_countries': 'US, NL',
  'release_date': '2024-12-25',
  'revenue': 47756311,
  'runtime': 115,
  'spoken_languages': 'en',
  'status': 'Released',
  'tagline': 'This Christmas get

In [64]:
movies_ids = fetch_movie_ids(5000)

Page 2: Collected 20 movie IDs so far
Page 3: Collected 40 movie IDs so far
Page 4: Collected 60 movie IDs so far
Page 5: Collected 80 movie IDs so far
Page 6: Collected 100 movie IDs so far
Page 7: Collected 120 movie IDs so far
Page 8: Collected 140 movie IDs so far
Page 9: Collected 160 movie IDs so far
Page 10: Collected 180 movie IDs so far
Page 11: Collected 200 movie IDs so far
Page 12: Collected 220 movie IDs so far
Page 13: Collected 240 movie IDs so far
Page 14: Collected 260 movie IDs so far
Page 15: Collected 280 movie IDs so far
Page 16: Collected 300 movie IDs so far
Page 17: Collected 320 movie IDs so far
Page 18: Collected 340 movie IDs so far
Page 19: Collected 360 movie IDs so far
Page 20: Collected 380 movie IDs so far
Page 21: Collected 400 movie IDs so far
Page 22: Collected 420 movie IDs so far
Page 23: Collected 440 movie IDs so far
Page 24: Collected 460 movie IDs so far
Page 25: Collected 480 movie IDs so far
Page 26: Collected 500 movie IDs so far
Page 27: Col

In [66]:
movies_ids

[1241982,
 927342,
 822119,
 939243,
 1160956,
 539972,
 762509,
 1138749,
 1249289,
 558449,
 912649,
 993710,
 1410082,
 1247019,
 710295,
 811941,
 1114894,
 970450,
 1249013,
 1352774,
 1294203,
 933260,
 1255788,
 1357633,
 950396,
 426063,
 1035048,
 1272149,
 1010581,
 1081012,
 845781,
 839033,
 533535,
 1184918,
 974453,
 1222248,
 1241320,
 516729,
 519182,
 1156593,
 1156593,
 85,
 519182,
 823219,
 1222248,
 1064486,
 1043905,
 799766,
 980477,
 1022789,
 1252309,
 1064213,
 426889,
 1097549,
 1318917,
 259872,
 1215185,
 945961,
 1118031,
 402431,
 1147416,
 980477,
 1252309,
 1138194,
 1100782,
 402431,
 1097549,
 1318917,
 1215185,
 1005331,
 1278101,
 1034541,
 299536,
 1051896,
 157336,
 411,
 974576,
 1260594,
 1216191,
 1000075,
 826937,
 1299372,
 604685,
 829557,
 1260594,
 957119,
 1278263,
 1216191,
 1361622,
 929204,
 1084736,
 507086,
 271110,
 7451,
 1000075,
 1084199,
 1414272,
 1182387,
 1300607,
 1155281,
 1278263,
 278,
 1323784,
 7451,
 1011985,
 1412113,

In [None]:
import os
os.makedirs('data/api', exist_ok=True)

In [None]:
movies_data = fetch_movies_data(movies_ids)
# Save to CSV
pd.DataFrame(movies_data).to_csv('data/api/tmdb_movies.csv', index=False)
print("Data saved to data/api/tmdb_movies.csv!")

1 Movie: Moana 2
2 Movie: Amaran
3 Movie: Captain America: Brave New World
4 Movie: Sonic the Hedgehog 3
5 Movie: Panda Plan
6 Movie: Kraven the Hunter
7 Movie: Mufasa: The Lion King
8 Movie: The Island
9 Movie: Alarum
10 Movie: Gladiator II
11 Movie: Venom: The Last Dance
12 Movie: Back in Action
13 Movie: Sniper: The Last Stand
14 Movie: Death Whisperer 2
15 Movie: Wolf Man
16 Movie: Devara: Part 1
17 Movie: Star Trek: Section 31
18 Movie: Werewolves
19 Movie: Jugaremos en el bosque
20 Movie: Piglet
21 Movie: My Fault: London
22 Movie: The Substance
23 Movie: The Gardener
24 Movie: Solo Leveling -ReAwakening-
25 Movie: The Gorge
26 Movie: Nosferatu
27 Movie: Elevation
28 Movie: Bridget Jones: Mad About the Boy
29 Movie: My Fault
30 Movie: Aftermath
31 Movie: Red One
32 Movie: The Lord of the Rings: The War of the Rohirrim
33 Movie: Deadpool & Wolverine
34 Movie: The Wild Robot
35 Movie: Absolution
36 Movie: Number 24
37 Movie: Kingdom IV: Return of the Great General
38 Movie: Padding

In [None]:
credits_data = fetch_credits_data(movies_ids)

credits_data.to_csv('data/api/tmdb_credits.csv', index=False)
print("Data saved to data/api/tmdb_credits.csv!")

1 Movie: Moana 2
2 Movie: Amaran
3 Movie: Captain America: Brave New World
4 Movie: Sonic the Hedgehog 3
5 Movie: Panda Plan
6 Movie: Kraven the Hunter
7 Movie: Mufasa: The Lion King
8 Movie: The Island
9 Movie: Alarum
10 Movie: Gladiator II
11 Movie: Venom: The Last Dance
12 Movie: Back in Action
13 Movie: Sniper: The Last Stand
14 Movie: Death Whisperer 2
15 Movie: Wolf Man
16 Movie: Devara: Part 1
17 Movie: Star Trek: Section 31
18 Movie: Werewolves
19 Movie: Jugaremos en el bosque
20 Movie: Piglet
21 Movie: My Fault: London
22 Movie: The Substance
23 Movie: The Gardener
24 Movie: Solo Leveling -ReAwakening-
25 Movie: The Gorge
26 Movie: Nosferatu
27 Movie: Elevation
28 Movie: Bridget Jones: Mad About the Boy
29 Movie: My Fault
30 Movie: Aftermath
31 Movie: Red One
32 Movie: The Lord of the Rings: The War of the Rohirrim
33 Movie: Deadpool & Wolverine
34 Movie: The Wild Robot
35 Movie: Absolution
36 Movie: Number 24
37 Movie: Kingdom IV: Return of the Great General
38 Movie: Padding