<a href="https://colab.research.google.com/github/semereab-merry/42-Projects/blob/main/movie_recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Load Libraries and Dataset

In [1]:
# --- Load Dataset ---
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Set the path to the file you'd like to load
file_path_movies = "tmdb_5000_movies.csv"
file_path_credits = "tmdb_5000_credits.csv"

# Load the latest version
movies = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "tmdb/tmdb-movie-metadata",
    file_path_movies,
)

credits = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "tmdb/tmdb-movie-metadata",
    file_path_credits,
)

  movies = kagglehub.load_dataset(
  credits = kagglehub.load_dataset(


In [2]:
# --- Import Necessary Libraries ---
import numpy as np
import pandas as pd
import ast
from sklearn.preprocessing import MultiLabelBinarizer


In [3]:
# First instance in movies dataset
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [4]:
# First 5 instance in credits dataset
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [5]:
# Dimenstion of movies dataset
movies.shape

(4803, 20)

In [6]:
# Dimension of credits dataset
credits.shape

(4803, 4)

In [7]:
## To simplify, we can add the movie information with the credit information based on title
movies = movies.merge(credits, on='title')

In [8]:
# Check the new dimenstions of movies dataset
movies.shape

(4809, 23)

In [9]:
# First instance of new movies dataset
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


# 2. Explanatory Data Analysis

In [10]:
# Info about each of the columns
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [11]:
# --- Check for null values ---
movies.isnull().sum()

Unnamed: 0,0
budget,0
genres,0
homepage,3096
id,0
keywords,0
original_language,0
original_title,0
overview,3
popularity,0
production_companies,0


**Note:** the columns homepage and tagline have many null values. Some of the columns like 'movie_id', 'id', 'budget', 'original_title', 'production_companies', 'production_countries', 'spoken_languages' and 'tagline' does not provide much information to the recommendation system.
On the other hand, 'release_date', 'popularity', 'revenue', 'vote_average' or 'vote_count' might be a good indicator to find similar movies, but they do not provide enough insights to one's taste/preferences.


In [12]:
# -- Check the status of movies in the list ---
movies['status'].unique()

array(['Released', 'Post Production', 'Rumored'], dtype=object)

**Note:** movies that are 'Rumored' should not be recommended to the user because they are not released yet.

In [13]:
movies = movies[movies.status != 'Rumored']

In [14]:
movies['status'].unique()

array(['Released', 'Post Production'], dtype=object)

In [15]:
movies.shape

(4804, 23)

**Note:** now that we have cleared all 'Rumored' movies, status is not needed in the movie dataset

In [16]:
# --- Reusable Function: To dissect the importatnt values from dictionaries
def convert_dict(text):
    result = []
    for i in ast.literal_eval(text):
        result.append(i['name'])
    return result

In [17]:
def fetch_director(text):
    result = ""
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            result = i['name']
    return result

In [18]:
movies['genres'] = movies['genres'].apply(convert_dict)
movies['keywords'] = movies['keywords'].apply(convert_dict)
movies['cast'] = movies['cast'].apply(convert_dict)
movies['production_companies'] = movies['production_companies'].apply(convert_dict)
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[Action, Adventure, Fantasy, Science Fiction]",http://www.avatarmovie.com/,19995,"[culture clash, future, space war, space colon...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[Ingenious Film Partners, Twentieth Century Fo...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[Sam Worthington, Zoe Saldana, Sigourney Weave...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[Adventure, Fantasy, Action]",http://disney.go.com/disneypictures/pirates/,285,"[ocean, drug abuse, exotic island, east india ...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[Walt Disney Pictures, Jerry Bruckheimer Films...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[Action, Adventure, Crime]",http://www.sonypictures.com/movies/spectre/,206647,"[spy, based on novel, secret agent, sequel, mi...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[Columbia Pictures, Danjaq, B24]",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,206647,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[Action, Crime, Drama, Thriller]",http://www.thedarkknightrises.com/,49026,"[dc comics, crime fighter, terrorist, secret i...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[Legendary Pictures, Warner Bros., DC Entertai...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,49026,"[Christian Bale, Michael Caine, Gary Oldman, A...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[Action, Adventure, Science Fiction]",http://movies.disney.com/john-carter,49529,"[based on novel, mars, medallion, space travel...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,[Walt Disney Pictures],...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,49529,"[Taylor Kitsch, Lynn Collins, Samantha Morton,...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [19]:
movies['crew'] = movies['crew'].apply(fetch_director)
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[Action, Adventure, Fantasy, Science Fiction]",http://www.avatarmovie.com/,19995,"[culture clash, future, space war, space colon...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[Ingenious Film Partners, Twentieth Century Fo...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[Sam Worthington, Zoe Saldana, Sigourney Weave...",James Cameron


In [20]:
# Limit the number of genres, and cast
movies['cast'] = movies['cast'].apply(lambda x:x[0:5]) # max of 5 cast members
movies['genres'] = movies['genres'].apply(lambda x:x[0:2]) # max of 2 genres
movies['keywords'] = movies['keywords'].apply(lambda x:x[0:8]) # max of 8 keywords
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[Action, Adventure]",http://www.avatarmovie.com/,19995,"[culture clash, future, space war, space colon...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[Ingenious Film Partners, Twentieth Century Fo...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[Sam Worthington, Zoe Saldana, Sigourney Weave...",James Cameron
1,300000000,"[Adventure, Fantasy]",http://disney.go.com/disneypictures/pirates/,285,"[ocean, drug abuse, exotic island, east india ...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[Walt Disney Pictures, Jerry Bruckheimer Films...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...",Gore Verbinski
2,245000000,"[Action, Adventure]",http://www.sonypictures.com/movies/spectre/,206647,"[spy, based on novel, secret agent, sequel, mi...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[Columbia Pictures, Danjaq, B24]",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,206647,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",Sam Mendes
3,250000000,"[Action, Crime]",http://www.thedarkknightrises.com/,49026,"[dc comics, crime fighter, terrorist, secret i...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[Legendary Pictures, Warner Bros., DC Entertai...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,49026,"[Christian Bale, Michael Caine, Gary Oldman, A...",Christopher Nolan
4,260000000,"[Action, Adventure]",http://movies.disney.com/john-carter,49529,"[based on novel, mars, medallion, space travel...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,[Walt Disney Pictures],...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,49529,"[Taylor Kitsch, Lynn Collins, Samantha Morton,...",Andrew Stanton


In [21]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [22]:
# --- Dimensionality reduction for K-means ---
movies = movies[['id', 'original_title','genres','cast','vote_average','crew', 'keywords']]
movies.rename(columns={'crew': 'director'}, inplace=True)
movies.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies.rename(columns={'crew': 'director'}, inplace=True)


Unnamed: 0,id,original_title,genres,cast,vote_average,director,keywords
0,19995,Avatar,"[Action, Adventure]","[Sam Worthington, Zoe Saldana, Sigourney Weave...",7.2,James Cameron,"[culture clash, future, space war, space colon..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy]","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",6.9,Gore Verbinski,"[ocean, drug abuse, exotic island, east india ..."
2,206647,Spectre,"[Action, Adventure]","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",6.3,Sam Mendes,"[spy, based on novel, secret agent, sequel, mi..."
3,49026,The Dark Knight Rises,"[Action, Crime]","[Christian Bale, Michael Caine, Gary Oldman, A...",7.6,Christopher Nolan,"[dc comics, crime fighter, terrorist, secret i..."
4,49529,John Carter,"[Action, Adventure]","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",6.1,Andrew Stanton,"[based on novel, mars, medallion, space travel..."


In [23]:
# --- Creating Dummies ---
def create_dummies(column_name):
    new_columns = movies[column_name]

    mlb = MultiLabelBinarizer()

    return pd.DataFrame(mlb.fit_transform(new_columns),columns=mlb.classes_, index=movies.index)

In [24]:
create_dummies('genres')

Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,Foreign,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4804,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4805,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4806,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4807,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [25]:
movies = pd.concat([movies, create_dummies('genres')], axis=1)
movies = pd.concat([movies, create_dummies('cast')], axis=1)
movies = pd.concat([movies, create_dummies('director')], axis=1)
movies = pd.concat([movies, create_dummies('keywords')], axis=1)

In [26]:
movies.drop(['cast', 'genres', 'director', 'keywords'], axis=1, inplace=True)

In [27]:
movies.drop('id', axis = 1, inplace=True)
movies.head()

Unnamed: 0,original_title,vote_average,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,...,begins with text,flipping coin,gilbert and sullivan,nightgown,north carolinam,nosferatu,Γη,卧底肥妈,绝地奶霸,超级妈妈
0,Avatar,7.2,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Pirates of the Caribbean: At World's End,6.9,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Spectre,6.3,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Dark Knight Rises,7.6,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,John Carter,6.1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# Make the dataset a Pivot Table
movies = movies.set_index('original_title')
movies.head()

Unnamed: 0_level_0,vote_average,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,begins with text,flipping coin,gilbert and sullivan,nightgown,north carolinam,nosferatu,Γη,卧底肥妈,绝地奶霸,超级妈妈
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,7.2,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Pirates of the Caribbean: At World's End,6.9,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
Spectre,6.3,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
The Dark Knight Rises,7.6,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
John Carter,6.1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Convert the pivot_table into an array matrix
from scipy.sparse import csr_matrix
features_matrix = csr_matrix(movies.values)    # All info of pivot table converted into an array
features_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 113095 stored elements and shape (4804, 16370)>

In [30]:
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(metric = "cosine", algorithm="brute")
model.fit(features_matrix)

In [35]:
# Find similar movies(nearer to the selected movie: 1 in this example) using kneighbors
distances, indices = model.kneighbors(movies.iloc[1,:].values.reshape(1, -1), n_neighbors=6)

In [36]:
for i in range(0, len(distances.flatten())):
    if(i==0):
        print("Recommendations for {0}:\n".format(movies.index[1]))
    else:
        print("{0}: {1}".format(i, movies.index[indices.flatten()[i]]))

Recommendations for Pirates of the Caribbean: At World's End:

1: Pirates of the Caribbean: Dead Man's Chest
2: Pirates of the Caribbean: The Curse of the Black Pearl
3: Chats perchés
4: Iraq for Sale: The War Profiteers
5: Sisters in Law


In [43]:
# Find similar movies(nearer to the selected movie: 3 in this example) using kneighbors
distances, indices = model.kneighbors(movies.loc['The Dark Knight Rises',:].values.reshape(1, -1), n_neighbors=6)

In [47]:
for i in range(0, len(distances.flatten())):
    if(i==0):
        print("Recommendations for The Dark Knight Rises: \n")
    else:
        print("{0}: {1}".format(i, movies.index[indices.flatten()[i]]))

Recommendations for The Dark Knight Rises: 

1: Batman Begins
2: The Dark Knight
3: Interstellar
4: Inception
5: The Prestige


In [52]:
# --- Create a reccomender function ---
def recommend(movie_name):
    distances, indices = model.kneighbors(movies.loc[movie_name,:].values.reshape(1, -1), n_neighbors=6)
    results = []
    for i in range(0, len(distances.flatten())):
        if(i==0):
            pass
        else:
            results.append(movies.index[indices.flatten()[i]])
    return results

In [53]:
print(recommend('The Dark Knight Rises'))

['Batman Begins', 'The Dark Knight', 'Interstellar', 'Inception', 'The Prestige']


In [54]:
print(recommend('Avatar'))

['Counting', 'Aliens', 'Chats perchés', 'Caravans', 'The Abyss']
