#  Movie Recommendation System (Content-Based Filtering)

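### This project recommends movies similar to a given movie using content-based filtering on the TMDB 5000 Movie Dataset: each movie's overview, genres, keywords, top-billed cast, and director are combined into a single "tags" text, and movies are ranked by the cosine similarity of their tag vectors.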

## Step 1: Import Libraries

In [1]:
import ast
import json
import re
import unicodedata

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


## Step 2: Load Datasets

In [2]:
# Read the two TMDB 5000 CSV files; the relative paths resolve against the
# notebook's working directory.
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

# Report (rows, columns) of each frame before they are merged.
print("Movies shape:", movies.shape)
print("Credits shape:", credits.shape)

# Last expression of the cell: preview the first two movie rows.
movies.head(2)


Movies shape: (4803, 20)
Credits shape: (4803, 4)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


##  Step 3: Merge Datasets and Fix Title Column

In [13]:
# Align the join key: the credits file names the movie id 'movie_id'.
credits = credits.rename(columns={'movie_id': 'id'})

# NOTE(review): the credits file appears to ship its own 'title' column (its
# shape is (n, 4): movie_id, cast, crew and one more). Merging it as-is makes
# pandas suffix the clash to 'title_x' / 'title_y', so no plain 'title' column
# exists afterwards and the selection below silently fell through to
# 'original_title'. Dropping the credits copy keeps the movies' own 'title'.
# errors='ignore' makes this a no-op if the column is absent.
df = movies.merge(credits.drop(columns=['title'], errors='ignore'), on='id')

# Keep only the columns the recommender uses.
if 'title' in df.columns:
    df = df[['id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
else:
    # Fallback for a movies file that has no 'title' column at all.
    df = df[['id', 'original_title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
    df = df.rename(columns={'original_title': 'title'})

df.head(2)


Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


## Step 4: Define Helper Functions for Data Preprocessing

In [4]:
# Exceptions that json.loads / ast.literal_eval raise for malformed input.
_PARSE_ERRORS = (ValueError, TypeError, SyntaxError, MemoryError, RecursionError)


def _parse_records(text):
    """Decode one serialized cell into a list of dict records.

    The cells look like JSON arrays of objects, so JSON is tried first;
    ``ast.literal_eval`` is kept as a fallback for Python-literal syntax
    (e.g. single-quoted strings). ``literal_eval`` alone rejects the JSON
    tokens ``null`` / ``true`` / ``false``, which used to turn an otherwise
    valid cell into an empty result.

    Args:
        text: The raw cell value (normally a string; may be None/NaN).

    Returns:
        The decoded entries that are dicts. Missing values, unparseable
        text, and payloads that are not a list all yield ``[]``.
    """
    if pd.isna(text):
        return []
    try:
        data = json.loads(text)
    except _PARSE_ERRORS:
        try:
            data = ast.literal_eval(text)
        except _PARSE_ERRORS:
            return []
    if not isinstance(data, list):
        return []
    # Skip stray non-dict entries instead of failing on them later.
    return [record for record in data if isinstance(record, dict)]


def convert_to_list_of_names(text):
    """Return the 'name' of every record in a serialized list of records.

    Args:
        text: Serialized list such as '[{"id": 28, "name": "Action"}]'.

    Returns:
        List of names in their original order; ``[]`` for missing or
        unparseable input. Records without a 'name' key are skipped.
    """
    return [record['name'] for record in _parse_records(text) if 'name' in record]


def get_top_cast(text, top_n=3):
    """Return the names of the first ``top_n`` named cast records.

    Args:
        text: Serialized list of cast records.
        top_n: Maximum number of names to return.

    Returns:
        Up to ``top_n`` names in listed order; ``[]`` for missing or
        unparseable input. Records without a 'name' key are skipped rather
        than raising ``KeyError``.
    """
    names = [record['name'] for record in _parse_records(text) if 'name' in record]
    return names[:top_n]


def get_director(text):
    """Return the names of all crew records whose 'job' is 'Director'.

    Args:
        text: Serialized list of crew records.

    Returns:
        List of director names (possibly several, possibly empty); ``[]``
        for missing or unparseable input.
    """
    return [
        record['name']
        for record in _parse_records(text)
        if record.get('job') == 'Director' and 'name' in record
    ]

def clean_text(text):
    """Normalize free text to lower-case ASCII letters, digits and whitespace.

    Args:
        text: The text to clean. Non-string values (None, NaN, numbers) are
            treated as missing.

    Returns:
        The lower-cased text with accented letters folded to their base
        letter and every remaining character outside ``[a-z0-9\\s]`` replaced
        by a single space. Returns ``""`` for non-string input. Runs of
        spaces are not collapsed.
    """
    if not isinstance(text, str):
        return ""
    # NFKD splits "é" into "e" + a combining accent; dropping the combining
    # marks keeps the word intact. Without this step the ASCII-only filter
    # below cut accented words in two ("amélie" -> "am lie").
    decomposed = unicodedata.normalize('NFKD', text.lower())
    folded = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    return re.sub(r'[^a-z0-9\s]', ' ', folded)


## Step 5: Data Cleaning and Feature Engineering for Movie Tags

In [5]:
def collapse_names(x):
    """Lower-case each name and remove its spaces so it forms one token.

    Example: ["Tom Cruise"] -> ["tomcruise"].
    """
    return [i.replace(" ", "").lower() for i in x]


def _parse_unless_list(series, parser):
    """Apply `parser` to raw cells only, leaving already-parsed lists alone.

    This cell overwrites the columns of `df` in place, so on a second run
    the cells are lists rather than the raw strings the parsers expect.
    Skipping those makes the cell safe to re-run.
    """
    return series.apply(lambda value: value if isinstance(value, list) else parser(value))


# Missing overviews become empty strings before the text is normalized.
df['overview'] = df['overview'].fillna("").apply(clean_text)

# Decode the serialized list columns into lists of names, then collapse each
# name into a single token.
df['genres'] = _parse_unless_list(df['genres'], convert_to_list_of_names).apply(collapse_names)
df['keywords'] = _parse_unless_list(df['keywords'], convert_to_list_of_names).apply(collapse_names)
df['cast'] = _parse_unless_list(df['cast'], lambda x: get_top_cast(x, top_n=3)).apply(collapse_names)
df['crew'] = _parse_unless_list(df['crew'], get_director).apply(collapse_names)

# One space-separated text per movie: overview words followed by the genre,
# keyword, cast and director tokens.
df['tags'] = (
    df['overview']
    + " " + df['genres'].apply(" ".join)
    + " " + df['keywords'].apply(" ".join)
    + " " + df['cast'].apply(" ".join)
    + " " + df['crew'].apply(" ".join)
)

# reset_index guarantees a 0..n-1 index, so a row's position in df_small is
# also its label; it also returns a new frame rather than a slice of df.
df_small = df[['id', 'title', 'tags']].reset_index(drop=True)
df_small.head(2)


Unnamed: 0,id,title,tags
0,19995,Avatar,in the 22nd century a paraplegic marine is di...
1,285,Pirates of the Caribbean: At World's End,captain barbossa long believed to be dead ha...


## Step 6: Vectorization of Tags

In [6]:
# Bag-of-words term counts over the tags text: English stop words are removed
# and the vocabulary is capped at 5000 terms via max_features.
cv = CountVectorizer(max_features=5000, stop_words='english')
# fit_transform returns a sparse matrix; .toarray() converts it to a dense
# NumPy array with one row per movie and one column per vocabulary term.
vectors = cv.fit_transform(df_small['tags']).toarray()

print("Shape of Vectors:", vectors.shape)


Shape of Vectors: (4803, 5000)


## Step 7: Compute Cosine Similarity Matrix

In [7]:
# Pairwise cosine similarity between all rows of `vectors`: entry [i, j]
# scores the tags of row i against the tags of row j, giving a square matrix.
similarity = cosine_similarity(vectors)
print("Similarity matrix shape:", similarity.shape)


Similarity matrix shape: (4803, 4803)


## Step 8: Build Movie Recommendation Function

In [8]:
def recommend(movie, top_n=10):
    """Print the titles of the movies most similar to `movie`.

    Reads the notebook globals `df_small` (columns include 'title') and
    `similarity` (square matrix whose rows/columns follow the row order of
    `df_small`).

    Args:
        movie: Title to look up; matched case-insensitively against
            `df_small['title']`. If several rows match, the first is used.
        top_n: Number of recommendations to print.

    Returns:
        None. Results (or a not-found message) are printed.
    """
    # Work with row positions, not index labels: `similarity` is addressed
    # by position, so this stays correct even if df_small's index is not
    # 0..n-1.
    is_match = (df_small['title'].str.lower() == movie.lower()).to_numpy()
    positions = np.flatnonzero(is_match)
    if len(positions) == 0:
        # Echo the title as the caller typed it (it used to be lower-cased).
        print(f"Movie '{movie}' not found!")
        return
    index = positions[0]

    # Stable descending sort: ties keep their original row order, as with
    # sorted(..., reverse=True).
    ranked = np.argsort(-similarity[index], kind='stable')
    # Exclude the query movie explicitly. Skipping "the first result" is wrong
    # when another row ties with it (identical tags) or when its tag vector is
    # all zeros, because the query is then not guaranteed to be ranked first.
    ranked = ranked[ranked != index][:top_n]

    titles = df_small['title']
    print(f"\nTop {top_n} recommendations for '{titles.iloc[index]}':")
    for position in ranked:
        print(titles.iloc[position])


## Step 9: Test the Movie Recommender

### Test the recommender by passing a movie title (matching is case-insensitive).  
### It prints the top N most similar movies found by content-based filtering.

In [9]:
# Print the 5 movies whose tags are most similar to "Avatar".
recommend("Avatar", top_n=5)


Top 5 recommendations for 'Avatar':
Titan A.E.
Independence Day
Small Soldiers
Aliens vs Predator: Requiem
Ender's Game


## Step 10: Additional Test of Movie Recommender

In [10]:
# No top_n given, so the default of 10 recommendations is printed.
recommend("The Dark Knight")


Top 10 recommendations for 'The Dark Knight':
The Dark Knight Rises
Batman Begins
Batman Returns
Batman Forever
Batman & Robin
Amidst the Devil's Wings
Batman v Superman: Dawn of Justice
Batman: The Dark Knight Returns, Part 2
Batman
Jerusalema


In [14]:
# Persist df_small and the similarity matrix so they can be loaded later
# without recomputing them. Run this cell after both have been built.
import pickle
from pathlib import Path

models_dir = Path('models')
# Create the output directory if needed; writing into a missing 'models/'
# directory raised FileNotFoundError.
models_dir.mkdir(parents=True, exist_ok=True)

# Context managers flush and close each file as soon as it is written; the
# bare open(...) calls used before left the handles open.
with open(models_dir / 'movies.pkl', 'wb') as movies_file:
    pickle.dump(df_small, movies_file)
with open(models_dir / 'similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# these artifacts from a location you trust.