In [1]:
import requests
import json
import ast
import re
import pandas as pd

## Getting the Movie Reviews from API

In [2]:
api_key = "8265bd1679663a7ea12ac168da84d2e8"
url = "https://api.themoviedb.org/3/movie/top_rated"
all_movies = []

# Get total number of pages
params = {
    "api_key": api_key,
    "language": "en-US",
    "page": 1
}
response = requests.get(url, params=params)
data = response.json()
total_pages = data.get("total_pages", 1)

# Loop through all pages
for page in range(1, total_pages + 1):
    params["page"] = page
    res = requests.get(url, params=params)
    if res.status_code == 200:
        movies = res.json().get("results", [])
        all_movies.extend(movies)
        # print(f"Saved page {page}/{total_pages}")
    else:
        print(f"Failed to fetch page {page}")

# Save to CSV
df = pd.DataFrame(all_movies)
df.to_csv("data/top_rated_movies_all_pages.csv", index=False)

print(f"✅ Saved {len(df)} movies from {total_pages} pages.")

Failed to fetch page 501
Failed to fetch page 502
Failed to fetch page 503
Failed to fetch page 504
Failed to fetch page 505
Failed to fetch page 506
Failed to fetch page 507
✅ Saved 10000 movies from 507 pages.


In [3]:
df.head()

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,False,/kXfqcdQKsToO0OUXHcrrNCHDBzO.jpg,"[18, 80]",278,en,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,32.0995,/9cqNxx0GxF0bflZmeSMuL5tnGzr.jpg,1994-09-23,The Shawshank Redemption,False,8.71,28248
1,False,/tmU7GeKVybMWFButWEGl2M4GeiP.jpg,"[18, 80]",238,en,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",56.9185,/3bhkrj58Vtu7enYsRolD1fZdja1.jpg,1972-03-14,The Godfather,False,8.687,21411
2,False,/kGzFbGhp99zva6oZODW5atUtnqi.jpg,"[18, 80]",240,en,The Godfather Part II,In the continuing saga of the Corleone crime f...,15.8349,/hek3koDUyRQk7FIhPXsa6mT2Zc3.jpg,1974-12-20,The Godfather Part II,False,8.571,12933
3,False,/zb6fM1CX41D9rF9hdgclu0peUmy.jpg,"[18, 36, 10752]",424,en,Schindler's List,The true story of how businessman Oskar Schind...,20.5698,/sF1U4EUQS8YHUYjNl3pMGNIQyr0.jpg,1993-12-15,Schindler's List,False,8.565,16411
4,False,/bxgTSUenZDHNFerQ1whRKplrMKF.jpg,[18],389,en,12 Angry Men,The defense and the prosecution have rested an...,27.1879,/ow3wq89wM8qd5X7hWKxiRfsFf9C.jpg,1957-04-10,12 Angry Men,False,8.5,9110


In [4]:
# Only select 3 columns to work with
df = df[['original_title','overview','genre_ids']]

df

Unnamed: 0,original_title,overview,genre_ids
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"[18, 80]"
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[18, 80]"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"[18, 80]"
3,Schindler's List,The true story of how businessman Oskar Schind...,"[18, 36, 10752]"
4,12 Angry Men,The defense and the prosecution have rested an...,[18]
...,...,...,...
9995,The Last Airbender,"The story follows the adventures of Aang, a yo...","[28, 12, 14]"
9996,From Dusk Till Dawn 2: Texas Blood Money,A bank-robbing gang of misfits heads to Mexico...,"[80, 28, 27, 53]"
9997,Cage Dive,Three friends from California are filming an a...,"[27, 18, 53]"
9998,Street Fighter,Col. Guile and various other martial arts hero...,"[28, 12, 35, 53]"


## Getting the Genre Data from API

In [5]:
## Fetching movie genres from other api

# Define API URL
url = "https://api.themoviedb.org/3/genre/movie/list"
params = {
    "api_key": "8265bd1679663a7ea12ac168da84d2e8",
    "language": "en-US"
}

# Make the request
response = requests.get(url, params=params)

# Check if successful
if response.status_code == 200:
    data = response.json()
    genres = data.get('genres', [])
else:
    print(f"Failed to fetch data: {response.status_code}")
    genres = []

genre_mapping = {g['id']: g['name'] for g in genres}

genre_mapping

{28: 'Action',
 12: 'Adventure',
 16: 'Animation',
 35: 'Comedy',
 80: 'Crime',
 99: 'Documentary',
 18: 'Drama',
 10751: 'Family',
 14: 'Fantasy',
 36: 'History',
 27: 'Horror',
 10402: 'Music',
 9648: 'Mystery',
 10749: 'Romance',
 878: 'Science Fiction',
 10770: 'TV Movie',
 53: 'Thriller',
 10752: 'War',
 37: 'Western'}

## Merge Both Datasets

In [6]:
# merge genre with ids
def get_genre_names(genre_ids):
    genre_names = [genre_mapping.get(int(g_id), "Unknown") for g_id in genre_ids]
    return ', '.join(genre_names)

# df['genre_ids'] = df['genre_ids'].apply(ast.literal_eval) 
df['genre_names'] = df['genre_ids'].apply(get_genre_names)

df

Unnamed: 0,original_title,overview,genre_ids,genre_names
0,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,"[18, 80]","Drama, Crime"
1,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[18, 80]","Drama, Crime"
2,The Godfather Part II,In the continuing saga of the Corleone crime f...,"[18, 80]","Drama, Crime"
3,Schindler's List,The true story of how businessman Oskar Schind...,"[18, 36, 10752]","Drama, History, War"
4,12 Angry Men,The defense and the prosecution have rested an...,[18],Drama
...,...,...,...,...
9995,The Last Airbender,"The story follows the adventures of Aang, a yo...","[28, 12, 14]","Action, Adventure, Fantasy"
9996,From Dusk Till Dawn 2: Texas Blood Money,A bank-robbing gang of misfits heads to Mexico...,"[80, 28, 27, 53]","Crime, Action, Horror, Thriller"
9997,Cage Dive,Three friends from California are filming an a...,"[27, 18, 53]","Horror, Drama, Thriller"
9998,Street Fighter,Col. Guile and various other martial arts hero...,"[28, 12, 35, 53]","Action, Adventure, Comedy, Thriller"


In [7]:
df.to_csv("data/movie_review_data_with_genre_from_api.csv")