#### import and load data

In [1]:
import pandas as pd
import numpy as np

In [24]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import pickle


In [3]:
# Stage 2 — read the two TMDB CSV exports and trim each to the columns used later on.
movie_columns = ['id', 'title', 'overview', 'genres', 'keywords',
                 'vote_average', 'vote_count', 'popularity']
credit_columns = ['title', 'cast', 'crew']

# .copy() detaches the trimmed frames from the full CSV frames they were sliced from.
movies = pd.read_csv("tmdb_5000_movies.csv")[movie_columns].copy()
credits = pd.read_csv("tmdb_5000_credits.csv")[credit_columns].copy()

print("movies columns:", movies.columns.tolist())
print("credits columns:", credits.columns.tolist())


movies columns: ['id', 'title', 'overview', 'genres', 'keywords', 'vote_average', 'vote_count', 'popularity']
credits columns: ['title', 'cast', 'crew']


In [4]:
# Preview the first four rows of the trimmed movies frame.
movies.head(4)

Unnamed: 0,id,title,overview,genres,keywords,vote_average,vote_count,popularity
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",7.2,11800,150.437577
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",6.9,4500,139.082615
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",6.3,4466,107.376788
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",7.6,9106,112.31295


#### merge dataset

In [5]:
# Attach the cast/crew columns to every movie row with a left join keyed on title.
# NOTE(review): title is the only shared column the loading cell keeps; if a title
# repeats in either frame the join emits extra rows — verify the row count.
movies = pd.merge(movies, credits, how='left', on='title')
print("Columns after merge:", movies.columns)


Columns after merge: Index(['id', 'title', 'overview', 'genres', 'keywords', 'vote_average',
       'vote_count', 'popularity', 'cast', 'crew'],
      dtype='object')


In [6]:
# (rows, columns) of the merged frame.
movies.shape

(4809, 10)

#### data cleaning

In [7]:
# Discard rows containing any null, keep only titles with more than 10 votes,
# and renumber the index 0..n-1.
movies = movies.dropna()
movies = movies.loc[movies['vote_count'] > 10].reset_index(drop=True)

print("Number of movies after cleaning:", len(movies))


Number of movies after cleaning: 4363


#### feature engineering

In [8]:
import ast

def safe_eval(x):
    """Parse a Python-literal string, returning [] when it cannot be parsed.

    Parameters
    ----------
    x : object
        Normally a string holding a Python literal (e.g. a list of dicts).

    Returns
    -------
    object
        The parsed literal, or an empty list when ``x`` is not a valid literal
        (including non-string input such as None or NaN).
    """
    try:
        return ast.literal_eval(x)
    # Only the failures literal_eval is documented to raise on malformed input are
    # treated as "unparseable"; KeyboardInterrupt / SystemExit must still propagate.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []

def get_names(lst):
    """Return the 'name' value of every record in ``lst`` ([] for empty/None input)."""
    if not lst:
        return []
    names = []
    for record in lst:
        names.append(record['name'])
    return names

def get_top_cast(lst, top_n=3):
    """Return the names of the first ``top_n`` cast records.

    Parameters
    ----------
    lst : list of dict or None
        Cast records, each carrying a 'name' key, in the order they should be ranked.
    top_n : int, default 3
        How many leading records to keep. Negative values are treated as 0 so that
        slicing never counts from the end of the list.

    Returns
    -------
    list
        Up to ``top_n`` names; [] for empty/None input.
    """
    if not lst:
        return []
    return [member['name'] for member in lst[:max(top_n, 0)]]

def get_director(lst):
    """Return a one-element list with the name of the first crew record whose job is
    'Director', or [] when there is none (or ``lst`` is empty/None)."""
    director_hits = (
        [member.get('name')]
        for member in (lst or [])
        if member.get('job') == 'Director'
    )
    # First hit wins; the default covers "no director found".
    return next(director_hits, [])

def _parse_once(extract):
    """Wrap ``extract`` so that values which are already lists pass through untouched."""
    def _apply(value):
        # A list means this cell has already processed the column. Feeding it to
        # safe_eval again would hit its error fallback and replace the data with [].
        if isinstance(value, list):
            return value
        return extract(safe_eval(value))
    return _apply


# Turn the literal-string columns into plain Python lists of names.
movies['genres'] = movies['genres'].apply(_parse_once(get_names))
movies['keywords'] = movies['keywords'].apply(_parse_once(get_names))
movies['cast'] = movies['cast'].apply(_parse_once(get_top_cast))
movies['crew'] = movies['crew'].apply(_parse_once(get_director))

# Split the overview on whitespace; keep an already-split overview (cell re-run) and
# fall back to [] for anything that is neither a string nor a list.
movies['overview'] = movies['overview'].apply(
    lambda text: text if isinstance(text, list)
    else (text.split() if isinstance(text, str) else [])
)


#### data labelling

In [9]:
# Features and target
# .copy() gives X its own data: the next cell adds a 'tags' column to X, and doing
# that on a slice of `movies` raises pandas' SettingWithCopyWarning.
X = movies[['overview','genres','keywords','cast','crew']].copy()
y = movies['vote_average']


#### text processing

In [10]:
# Combine all text features into a single lower-cased, space-separated 'tags' string.
# assign() returns a new frame, so nothing is written into a slice of `movies`.
X = X.assign(
    tags=X[['overview', 'genres', 'keywords', 'cast', 'crew']].apply(
        lambda row: " ".join(
            row['overview'] + row['genres'] + row['keywords'] + row['cast'] + row['crew']
        ).lower(),
        axis=1,
    )
)

# Remove empty tags
non_empty_idx = X['tags'].str.strip() != ""
X = X[non_empty_idx]
y = y[X.index]

# recommend() looks a title up in `movies` and uses that row position to index the
# vectors built from X, so all three objects must stay row-aligned after filtering.
movies = movies.loc[X.index].reset_index(drop=True)
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

print("Number of movies with non-empty tags:", X.shape[0])


Number of movies with non-empty tags: 4363


#### tf-idf vectorizer

In [14]:
# Vectorise the combined tags with TF-IDF.
# `tfidf` and `X_vectors` are read by this cell and by later ones (similarity,
# train/test split, recommend, pickling) but no cell in the notebook defines them,
# so Restart & Run All stops here with a NameError without this step.
# NOTE(review): the original vectorizer settings are not recoverable from the notebook;
# max_features / stop_words below are an assumption — confirm before relying on metrics.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X_vectors = tfidf.fit_transform(X['tags'])

# Convert to dense
X_vectors_dense = X_vectors.toarray()

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X_vectors_dense, y, test_size=0.2, random_state=42)

# Fit model
# Note: `model` and the split variables are rebound by the later train/test split and
# model-training cells, so this estimator is not the one evaluated or pickled below.
from sklearn.ensemble import HistGradientBoostingRegressor
model = HistGradientBoostingRegressor(max_iter=300, learning_rate=0.1, max_depth=10, random_state=42)
model.fit(X_train, y_train)


In [26]:
# Pairwise cosine similarity between every pair of rows of X_vectors
# (Y=None compares X_vectors against itself; the result is a dense array).
similarity = cosine_similarity(X_vectors, Y=None, dense_output=True)


#### train/test split

In [27]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
_split_parts = train_test_split(X_vectors, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = _split_parts


#### model training

In [28]:
from xgboost import XGBRegressor

# Gradient-boosted regressor that predicts vote_average from the training vectors.
xgb_params = dict(n_estimators=300, learning_rate=0.1, max_depth=6, random_state=42)
model = XGBRegressor(**xgb_params)
model.fit(X_train, y_train)  # X_train can be sparse


#### model evaluation

In [29]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Score the fitted model on the held-out split with three regression metrics.
y_pred = model.predict(X_test)

mse_value = mean_squared_error(y_test, y_pred)
mae_value = mean_absolute_error(y_test, y_pred)
r2_value = r2_score(y_test, y_pred)

print("MSE:", mse_value)
print("MAE:", mae_value)
print("R2 Score:", r2_value)


MSE: 0.6588808109350429
MAE: 0.6329795413547091
R2 Score: 0.2057263543598319


#### recommendation function

In [30]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def recommend(movie_title, top_n=5):
    """Print the model's predicted rating for a movie and return its most similar titles.

    Parameters
    ----------
    movie_title : str
        Exact title to look up in ``movies['title']``; the first matching row is used.
    top_n : int, default 5
        Maximum number of similar movies to return. Values below 1 give an empty frame.

    Returns
    -------
    pandas.DataFrame or None
        The ``title`` and ``vote_average`` columns of the most similar movies, ordered
        by descending cosine similarity. None (after printing "Movie not found") when
        the title is not present.
    """
    matches = np.flatnonzero((movies['title'] == movie_title).to_numpy())
    if matches.size == 0:
        print("Movie not found")
        return

    # Work with the row *position*: X_vectors and movies.iloc are both positional,
    # so an index label would only be correct while the index happens to be 0..n-1.
    pos = int(matches[0])
    movie_vector = X_vectors[pos]

    predicted_rating = model.predict(movie_vector)[0]
    print(f"Predicted rating for '{movie_title}': {predicted_rating}")

    sim_scores = cosine_similarity(movie_vector, X_vectors)[0]
    ranked = np.argsort(sim_scores)[::-1]
    # Drop the query row explicitly instead of assuming it is ranked first; with ties
    # (e.g. identical tag vectors) another row can take the top slot.
    ranked = ranked[ranked != pos][:max(top_n, 0)]

    return movies.iloc[ranked][['title','vote_average']]

# Example query: prints the predicted rating for "Avatar" and displays the
# similar titles returned with the default top_n.
recommend("Avatar")


Predicted rating for 'Avatar': 6.779996395111084


Unnamed: 0,title,vote_average
2384,Aliens,7.7
836,Alien³,6.2
3095,Alien,7.9
1527,Moonraker,5.9
4096,Silent Running,6.3


In [31]:
# Persist the fitted objects and the movie table next to the notebook.
movies_dict = movies.to_dict()

artifacts = {
    'model.pkl': model,
    'vectorizer.pkl': tfidf,
    'vectors.pkl': X_vectors,
    'movies.pkl': movies,
    'similarity.pkl': similarity,
    'movie_dict.pkl': movies_dict,
}

for filename, obj in artifacts.items():
    # The context manager flushes and closes each file even if pickling fails;
    # a bare open() passed straight to pickle.dump leaves the handle open.
    with open(filename, 'wb') as fh:
        pickle.dump(obj, fh)
