In [1]:
import pandas as pd
import numpy as np

movies = pd.read_csv("../data/tmdb_5000_movies.csv")
credits = pd.read_csv("../data/tmdb_5000_credits.csv")

movies.shape, credits.shape


((4803, 20), (4803, 4))

In [2]:
movies = movies.merge(credits, on="title")
movies.shape


(4809, 23)

In [3]:
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
movies.isnull().sum()



movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [4]:
movies.dropna(inplace=True)
movies.isnull().sum()



movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [5]:
import ast

def extract_names(text):
    return [i['name'] for i in ast.literal_eval(text)]

movies['genres'] = movies['genres'].apply(extract_names)
movies['keywords'] = movies['keywords'].apply(extract_names)


In [6]:
def extract_cast(text):
    return [i['name'] for i in ast.literal_eval(text)][:3]

movies['cast'] = movies['cast'].apply(extract_cast)


In [7]:
def fetch_director(text):
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            return i['name']
    return ""

movies['crew'] = movies['crew'].apply(fetch_director)


In [8]:
def remove_spaces(text):
    return [i.replace(" ", "") for i in text]

movies['genres'] = movies['genres'].apply(remove_spaces)
movies['keywords'] = movies['keywords'].apply(remove_spaces)
movies['cast'] = movies['cast'].apply(remove_spaces)
movies['crew'] = movies['crew'].apply(lambda x: x.replace(" ", ""))


In [9]:
movies['overview'] = movies['overview'].apply(lambda x: x.split())


In [10]:
movies['tags'] = (
    movies['overview'] +
    movies['genres'] +
    movies['keywords'] +
    movies['cast'] +
    movies['crew'].apply(lambda x: [x])
)

movies['tags'] = movies['tags'].apply(lambda x: " ".join(x))


In [11]:
final_df = movies[['movie_id', 'title', 'tags']]
final_df.head()


Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english'
)

vectors = tfidf.fit_transform(final_df['tags']).toarray()
vectors.shape


(4806, 5000)

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(vectors)
similarity.shape


(4806, 4806)

In [14]:
# Map movie titles to index
movie_index = {title: idx for idx, title in enumerate(final_df['title'])}


In [15]:
def recommend(movie_name, top_n=5):
    if movie_name not in movie_index:
        return "Movie not found in database"

    idx = movie_index[movie_name]

    distances = list(enumerate(similarity[idx]))
    distances = sorted(distances, key=lambda x: x[1], reverse=True)

    recommendations = []
    for i in distances[1:top_n+1]:
        recommendations.append(final_df.iloc[i[0]].title)

    return recommendations


In [16]:
recommend("Avatar")


['Aliens',
 'Falcon Rising',
 'Battle: Los Angeles',
 'Apollo 18',
 'Star Trek Into Darkness']

In [19]:
import joblib
import os

os.makedirs("../models", exist_ok=True)

joblib.dump(final_df, "../models/movies.pkl")
joblib.dump(similarity, "../models/similarity.pkl")


['../models/similarity.pkl']