In [4]:
import requests
import pandas as pd

api_key = "ae60927b2621ffdf665f3b47791feda8"

def fetch_data(api_key, page_limit=10):
    all_movies = []  # Initialize all_movies as an empty list
    for page in range(1, page_limit + 1):
        print(f"Fetching data from page {page}")
        url = f"https://api.themoviedb.org/3/movie/popular?api_key={api_key}&language=en-US&page={page}"
        response = requests.get(url)

        if response.status_code != 200:
            print(f"Failed to fetch page {page}: {response.status_code}")
        else:
            results = response.json().get("results", [])

    for movie in results:
        try:
            movie_id = movie["id"]
            details_url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}&append_to_response=credits"   
            details_response = requests.get(details_url).json()


            #Extract Details
            title = details_response.get("title","")
            genres = ", ".join([g['name'] for g in details_response.get("genres",[])])
            synopsis = details_response.get("overview","")
            rating = details_response.get("vote_average",0)

            credits = details_response.get("credits",{})
            cast = credits.get("cast",[])
            crew = credits.get("crew",[])

            actors = ", ".join([actor["name"] for actor in cast[:3]])
            directors = ", ".join([person["name"] for person in crew if person["job"] == "Director"])

            all_movies.append({
                'title': title,
                'genres': genres,
                'synopsis': synopsis,
                'rating': rating,
                'actors': actors,
                'directors': directors
            })

        except Exception as e:
            print(f"Failed to process movie {movie_id}: {e}")
            continue
    return pd.DataFrame(all_movies)

movie_df = fetch_data(api_key, page_limit=10)
movie_df.to_csv("popular_movies.csv", index=False)
print("Data saved to popular_movies.csv")


Fetching data from page 1
Fetching data from page 2
Fetching data from page 3
Fetching data from page 4
Fetching data from page 5
Fetching data from page 6
Fetching data from page 7
Fetching data from page 8
Fetching data from page 9
Fetching data from page 10
Data saved to popular_movies.csv


In [7]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sentence_transformers import SentenceTransformer 
import pickle



In [8]:
data = pd.read_csv("popular_movies.csv")
data.fillna("", inplace=True)
data.head()

Unnamed: 0,title,genres,synopsis,rating,actors,directors
0,Faust: Love of the Damned,"Horror, Fantasy, Action",An artist sells his soul to the mysterious M i...,4.9,"Mark Frost, Isabel Brook, Jennifer Rope",Brian Yuzna
1,Road House,"Action, Thriller",Ex-UFC fighter Dalton takes a job as a bouncer...,6.946,"Jake Gyllenhaal, Billy Magnussen, Daniela Melc...",Doug Liman
2,Parthenope,"Romance, Drama","Parthenope, born in the sea near Naples in 195...",6.9,"Celeste Dalla Porta, Stefania Sandrelli, Gary ...",Paolo Sorrentino
3,Victoria,Animation,Wordless animated fim from Robert Sahakyants,0.0,,Robert Sahakyants
4,Coraline,"Animation, Family, Fantasy",Wandering her rambling old house in her boring...,7.9,"Dakota Fanning, Teri Hatcher, Jennifer Saunders",Henry Selick


In [9]:
avg_actor_rating = data.groupby('actors')['rating'].mean().to_dict()
avg_director_rating = data.groupby('directors')['rating'].mean().to_dict()

data['avg_actor_rating'] = data['actors'].map(avg_actor_rating)
data['avg_director_rating'] = data['directors'].map(avg_director_rating)


In [10]:
 data.head()

Unnamed: 0,title,genres,synopsis,rating,actors,directors,avg_actor_rating,avg_director_rating
0,Faust: Love of the Damned,"Horror, Fantasy, Action",An artist sells his soul to the mysterious M i...,4.9,"Mark Frost, Isabel Brook, Jennifer Rope",Brian Yuzna,4.9,4.9
1,Road House,"Action, Thriller",Ex-UFC fighter Dalton takes a job as a bouncer...,6.946,"Jake Gyllenhaal, Billy Magnussen, Daniela Melc...",Doug Liman,6.946,6.946
2,Parthenope,"Romance, Drama","Parthenope, born in the sea near Naples in 195...",6.9,"Celeste Dalla Porta, Stefania Sandrelli, Gary ...",Paolo Sorrentino,6.9,6.9
3,Victoria,Animation,Wordless animated fim from Robert Sahakyants,0.0,,Robert Sahakyants,0.0,0.0
4,Coraline,"Animation, Family, Fantasy",Wandering her rambling old house in her boring...,7.9,"Dakota Fanning, Teri Hatcher, Jennifer Saunders",Henry Selick,7.9,7.9


In [12]:
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
synopsis_embeddings = sentence_model.encode(data['synopsis'].tolist())

genre_ohe = pd.get_dummies(data['genres'])

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [15]:
#combine all the features

x = np.hstack((
    synopsis_embeddings,
    genre_ohe.values,
    data[['avg_actor_rating', 'avg_director_rating']].values
))
y = data['rating']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
