# About the project

To build a content based recommender system that recommends movies based on the genre, cast, crew and some keywords of a previously watched movie.

# Import Libraries and load datasets

In [1]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = "/content"
%cd /content
!kaggle datasets download -d rounakbanik/the-movies-dataset
!unzip \*.zip  && rm *.zip

/content
Downloading the-movies-dataset.zip to /content
 94% 213M/228M [00:03<00:00, 86.0MB/s]
100% 228M/228M [00:03<00:00, 68.3MB/s]
Archive:  the-movies-dataset.zip
replace credits.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace keywords.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace links.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace links_small.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace movies_metadata.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace ratings.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace ratings_small.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
movies = pd.read_csv('/content/movies_metadata.csv')
credits = pd.read_csv('/content/credits.csv')
keywords = pd.read_csv('/content/keywords.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


# Data Cleaning

In [3]:
movies.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [4]:
credits.head(3)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602


In [5]:
keywords.head(3)

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."


From the movies dataframe, we need only 3 columns i.e., id, title and genres.

In [6]:
movies = movies[['id','title','genres']]
movies.head()

Unnamed: 0,id,title,genres
0,862,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,8844,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,15602,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
3,31357,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4,11862,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]"


In [7]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      45466 non-null  object
 1   title   45460 non-null  object
 2   genres  45466 non-null  object
dtypes: object(3)
memory usage: 1.0+ MB


In [8]:
movies.isnull().sum()

id        0
title     6
genres    0
dtype: int64

In [9]:
#clean movie_id function
def clean_id(x):
  try:
    return int(x)
  except:
    return np.nan

In [10]:
movies['id'] = movies['id'].apply(clean_id)
movies = movies[movies['id'].notnull()]

In [11]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45463 entries, 0 to 45465
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      45463 non-null  float64
 1   title   45460 non-null  object 
 2   genres  45463 non-null  object 
dtypes: float64(1), object(2)
memory usage: 1.4+ MB


In [12]:
keywords.dtypes

id           int64
keywords    object
dtype: object

In [13]:
credits.dtypes

cast    object
crew    object
id       int64
dtype: object

In [14]:
#converting id into integer to match the data types in other 2 dataframes
movies['id'] = movies['id'].astype('int')

In [15]:
movies.dtypes

id         int64
title     object
genres    object
dtype: object

In [16]:
#merging the 3 dataframes to get all the required data on 1 datafarame movies
movies = movies.merge(credits, on='id')
movies = movies.merge(keywords, on='id')

In [17]:
movies.head()

Unnamed: 0,id,title,genres,cast,crew,keywords
0,862,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


The genres, cast, crew, and keywords columns all are of an object (or a string datatype). Let us get the required words we will be using from these columns by first using **literal_eval** to convert these strings into python objects (a list of dictionaries here) and use pandas and numpy to wrangle them.



In [18]:
# Changing the 4 columns into python objects ( list of dictionaries here)
movies['genres'] = movies['genres'].apply(literal_eval)
movies['cast'] = movies['cast'].apply(literal_eval)
movies['crew'] = movies['crew'].apply(literal_eval)
movies['keywords'] = movies['keywords'].apply(literal_eval)

In [19]:
# Extracting the names of all the genres attached to each movie
movies['genres'] = movies['genres'].apply(lambda x: [i['name'].lower() for i in x])
# Extracting the name of the director from all the crew members
# we will only extract directors from the crew column for our purpose
movies['crew'] = movies['crew'].apply(lambda x: [i['name'].lower() for i in x if i['job']=='Director'])
# Extracting the cast and keywords from the list of dictionaries of those columns
movies['cast'] = movies['cast'].apply(lambda x: [i['name'].lower() for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i['name'].lower() for i in x])

In [20]:
# Extracting maximum 3 cast/genre/keywords for each movie
movies['genres'] = movies['genres'].apply(lambda x: x[:3] if len(x)>3 else x)
movies['cast'] = movies['cast'].apply(lambda x: x[:3] if len(x)>3 else x)
movies['keywords'] = movies['keywords'].apply(lambda x: x[:3] if len(x)>3 else x)

In [21]:
movies.head()

Unnamed: 0,id,title,genres,cast,crew,keywords
0,862,Toy Story,"[animation, comedy, family]","[tom hanks, tim allen, don rickles]",[john lasseter],"[jealousy, toy, boy]"
1,8844,Jumanji,"[adventure, fantasy, family]","[robin williams, jonathan hyde, kirsten dunst]",[joe johnston],"[board game, disappearance, based on children'..."
2,15602,Grumpier Old Men,"[romance, comedy]","[walter matthau, jack lemmon, ann-margret]",[howard deutch],"[fishing, best friend, duringcreditsstinger]"
3,31357,Waiting to Exhale,"[comedy, drama, romance]","[whitney houston, angela bassett, loretta devine]",[forest whitaker],"[based on novel, interracial relationship, sin..."
4,11862,Father of the Bride Part II,[comedy],"[steve martin, diane keaton, martin short]",[charles shyer],"[baby, midlife crisis, confidence]"


Next, we need to remove spaces between the names and surnames.

In [22]:
#removing spaces
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(' ','') for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(' ','') for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(' ','') for i in x])
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(' ','') for i in x])

Due to memory constraints in Colab, we are using only the first 20000 rows here.

In [23]:
movies = movies[:20000]
movies.shape

(20000, 6)

Now, let us make 1 column of all the metadata that we have by appending the values in the genres, cast, crew, and keywords column.

In [24]:
movies['metadata'] = movies.apply(lambda x : ' '.join(x['genres']) + ' ' + ' '.join(x['cast']) + ' ' + ' '.join(x['crew']) + ' ' + ' '.join(x['keywords']), axis = 1)
movies.head()

Unnamed: 0,id,title,genres,cast,crew,keywords,metadata
0,862,Toy Story,"[animation, comedy, family]","[tomhanks, timallen, donrickles]",[johnlasseter],"[jealousy, toy, boy]",animation comedy family tomhanks timallen donr...
1,8844,Jumanji,"[adventure, fantasy, family]","[robinwilliams, jonathanhyde, kirstendunst]",[joejohnston],"[boardgame, disappearance, basedonchildren'sbook]",adventure fantasy family robinwilliams jonatha...
2,15602,Grumpier Old Men,"[romance, comedy]","[waltermatthau, jacklemmon, ann-margret]",[howarddeutch],"[fishing, bestfriend, duringcreditsstinger]",romance comedy waltermatthau jacklemmon ann-ma...
3,31357,Waiting to Exhale,"[comedy, drama, romance]","[whitneyhouston, angelabassett, lorettadevine]",[forestwhitaker],"[basedonnovel, interracialrelationship, single...",comedy drama romance whitneyhouston angelabass...
4,11862,Father of the Bride Part II,[comedy],"[stevemartin, dianekeaton, martinshort]",[charlesshyer],"[baby, midlifecrisis, confidence]",comedy stevemartin dianekeaton martinshort cha...


We will use a CountVectorizer to built numeric features from our metadata. We won’t use TfIdf here because there might be many movies with the same directors and we definitely don’t wanna penalize that director. It might be possible that a user wants to be recommended movies belonging to that director. Most of the words we have are names and genres whose counts are actually useful for recommending movies.

In [25]:
count_vec = CountVectorizer(stop_words='english')
count_vec_matrix = count_vec.fit_transform(movies['metadata'])

Next,  we use cosine similarity to find the similarity between any 2 movies. For that, let's make a cosine similarity matrix using count vectorizer values.

In [26]:
cosine_sim_matrix = cosine_similarity(count_vec_matrix, count_vec_matrix)

In [27]:
#movies index mapping
mapping = pd.Series(movies.index,index = movies['title'])

Now, let's build a recommender function

In [28]:
# Recommender function to recommend movies based on metadata
def recommend_movies_based_on_metadata(movie_input):
  movie_index = mapping[movie_input]
  # Get similarity values with other movies
  similarity_score = list(enumerate(cosine_sim_matrix[movie_index]))
  similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
  # Get the scores of the 15 most similar movies. Ignore the first movie as the first movie would be the input movie
  similarity_score = similarity_score[1:15]
  movie_indices = [i[0] for i in similarity_score]
  return (movies['title'].iloc[movie_indices])

In [29]:
recommend_movies_based_on_metadata('Kill Me Again')

12925           Conspiracy
14312                Flood
17232                 Vice
549          Trial by Jury
6140         Trouble Bound
454            The Getaway
2104                Willow
9996         Blind Horizon
18940            Last Exit
241       The Glass Shield
2724           In Too Deep
3261       Brown's Requiem
3311     Raise the Titanic
4683           Dinner Rush
Name: title, dtype: object

Great! We got the top 15 movies similar to the movie Kill Me Again.