In [41]:
import requests
import pandas as pd
from additional import api

api_key = api

def fetch_data(api_key, page_limit = 10):
    all_movies = []

    for page in range(1, page_limit + 1):
        print(f"Fetching data from page {page}...")
        url = f"https://api.themoviedb.org/3/movie/popular?api_key={api_key}&language=en-US&page={page}"
        response = requests.get(url)

        if response.status_code != 200:
            print(f"Failed to fetch data from page {page}. Status code: {response.status_code}")
            continue

        results = response.json().get("results",[])

        for movie in results:
            try:
                movie_id = movie["id"]
                details_url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}&append_to_response=credits"
                details_response = requests.get(details_url).json

                # Extract details
                title = details_response.get("title","")
                genres = ", ".join([g["name"] for g in details_response.get("genres",[])])
                synopsis = details_response.get("overview","")
                rating = details_response.get("vote_average",0)

                credits = details_response.get("credits",{})
                cast = credits.get("cast", [])
                crew = credits.get("crew", [])

                actors = ", ".join([person["name"] for person in cast[:3]])
                directors = ", ".join([person["name"] for person in crew if person["job"] == "Director"])

                all_movies.append({
                    "title": title,
                    "genres": genres,
                    "actors": actors,
                    "directors": directors,
                    "synopsis": synopsis,
                    "rating": rating
                })
            except Exception as e:
                print(f"Error processing movie {movie_id}: {str(e)}")
                continue
        return pd.DataFrame(all_movies)
    
    movie_df = fetch_data(api_key, page_limit=10)
    movie_df.to_csv('movies_dataset.csv', index=False)
    print("Movie Dataset is created")

In [42]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sentence_transformers import SentenceTransformer
import pickle

In [43]:
data = pd.read_csv("movies_dataset.csv")
data.fillna('', inplace=True)
data.head()

Unnamed: 0,title,genres,actors,directors,synopsis,rating
0,Sonic the Hedgehog 3,"Action, Science Fiction, Comedy, Family","Jim Carrey, Ben Schwartz, Keanu Reeves",Jeff Fowler,"Sonic, Knuckles, and Tails reunite against a p...",7.593
1,Mufasa: The Lion King,"Adventure, Family, Animation","Aaron Pierre, Kelvin Harrison Jr., Tiffany Boone",Barry Jenkins,"Mufasa, a cub lost and alone, meets a sympathe...",7.4
2,Gladiator II,"Action, Adventure, Drama","Paul Mescal, Denzel Washington, Pedro Pascal",Ridley Scott,Years after witnessing the death of the revere...,6.756
3,Venom: The Last Dance,"Action, Science Fiction, Adventure","Tom Hardy, Chiwetel Ejiofor, Juno Temple",Kelly Marcel,Eddie and Venom are on the run. Hunted by both...,6.8
4,Moana 2,"Animation, Adventure, Family, Comedy","Auliʻi Cravalho, Dwayne Johnson, Hualālai Chung","David G. Derrick Jr., Jason Hand, Dana Ledoux ...",After receiving an unexpected call from her wa...,7.0


In [44]:
avg_actor_rating = data.groupby('actors')['rating'].mean().to_dict()
avg_director_rating = data.groupby('directors')['rating'].mean().to_dict()

data['avg_actor_rating'] = data['actors'].map(avg_actor_rating)
data['avg_director_rating'] = data['directors'].map(avg_director_rating)

In [45]:
data.head()

Unnamed: 0,title,genres,actors,directors,synopsis,rating,avg_actor_rating,avg_director_rating
0,Sonic the Hedgehog 3,"Action, Science Fiction, Comedy, Family","Jim Carrey, Ben Schwartz, Keanu Reeves",Jeff Fowler,"Sonic, Knuckles, and Tails reunite against a p...",7.593,7.593,5.60075
1,Mufasa: The Lion King,"Adventure, Family, Animation","Aaron Pierre, Kelvin Harrison Jr., Tiffany Boone",Barry Jenkins,"Mufasa, a cub lost and alone, meets a sympathe...",7.4,7.4,7.4
2,Gladiator II,"Action, Adventure, Drama","Paul Mescal, Denzel Washington, Pedro Pascal",Ridley Scott,Years after witnessing the death of the revere...,6.756,6.756,7.488
3,Venom: The Last Dance,"Action, Science Fiction, Adventure","Tom Hardy, Chiwetel Ejiofor, Juno Temple",Kelly Marcel,Eddie and Venom are on the run. Hunted by both...,6.8,6.8,6.8
4,Moana 2,"Animation, Adventure, Family, Comedy","Auliʻi Cravalho, Dwayne Johnson, Hualālai Chung","David G. Derrick Jr., Jason Hand, Dana Ledoux ...",After receiving an unexpected call from her wa...,7.0,7.0,7.0


In [46]:
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
synopsis_embeddings = sentence_model.encode(data['synopsis'].tolist())

genre_ohe = pd.get_dummies(data['genres'])

In [47]:
# Combine all the features
X = np.hstack((
    synopsis_embeddings,
    genre_ohe.values,
    data[['avg_actor_rating', 'avg_director_rating']].values
))
y = data['rating']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [48]:
xgb_model = XGBRegressor(
    n_estimators = 300,
    learning_rate = 0.03,
    max_depth = 7,
    subsample = 0.8,
    colsample_bytree = 0.8,
    random_state = 42
)

In [49]:
xgb_model.fit(x_train, y_train)
y_pred = xgb_model.predict(x_test)

In [50]:
# Evaluation
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"r2 Score {r2:.4f}")
print(f"Mean squared Error: {mse:.4f}")

r2 Score 0.9354
Mean squared Error: 0.0401


In [51]:
pickle.dump(xgb_model, open('movie_rating_model.pkl', 'wb'))
pickle.dump(sentence_model, open('Sentence_model.pkl', 'wb'))
pickle.dump(genre_ohe.columns.to_list(), open('genre_columns.pkl', 'wb'))
pickle.dump(avg_actor_rating, open('avg_actor_rating.pkl', 'wb'))
pickle.dump(avg_director_rating, open('avg_director_rating.pkl', 'wb'))