In [1]:
import pandas as pd
import numpy as np

In [3]:
#Load the cleaned movie metadata
df = pd.read_csv('metadata_clean.csv')

In [4]:
#Preview the first five rows of the cleaned metadata
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"['Animation', 'Comedy', 'Family']",81.0,7.7,5415.0,1995
1,Jumanji,"['Adventure', 'Fantasy', 'Family']",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"['Romance', 'Comedy']",101.0,6.5,92.0,1995
3,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']",127.0,6.1,34.0,1995
4,Father of the Bride Part II,['Comedy'],106.0,5.7,173.0,1995


In [6]:
#Load the original metadata file; low_memory=False parses it in a single pass
#so that column dtypes are inferred consistently
orig_df = pd.read_csv('movies_metadata.csv', low_memory=False)

#Add the useful features into the cleaned dataframe.
#NOTE(review): column assignment aligns on the row index, so this assumes both
#files list the same movies in the same order - confirm for metadata_clean.csv
df['overview'] = orig_df['overview']
df['id'] = orig_df['id']

df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,overview,id
0,Toy Story,"['Animation', 'Comedy', 'Family']",81.0,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...",862
1,Jumanji,"['Adventure', 'Fantasy', 'Family']",104.0,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,8844
2,Grumpier Old Men,"['Romance', 'Comedy']",101.0,6.5,92.0,1995,A family wedding reignites the ancient feud be...,15602
3,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']",127.0,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom...",31357
4,Father of the Bride Part II,['Comedy'],106.0,5.7,173.0,1995,Just when George Banks has recovered from his ...,11862


### Creating the TF-IDF matrix

In [7]:
#Import TfidfVectorizer from the scikit-learn library
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
#Create the TF-IDF vectorizer; scikit-learn's built-in English stop-word list is filtered out
tfidf = TfidfVectorizer(stop_words='english')

In [10]:
#Movies without an overview hold NaN, which the vectorizer cannot process as text;
#substitute an empty string for them
df['overview'] = df['overview'].fillna('')

In [11]:
#Build the TF-IDF matrix: fit the vectorizer on the overview feature and transform it in one step
tfidf_matrix = tfidf.fit_transform(df['overview'])

In [12]:
#Output the shape of tfidf_matrix: (number of overviews, number of distinct terms)
tfidf_matrix.shape

(45466, 75827)

### Computing the cosine similarity score

In [13]:
#Import linear_kernel to compute the dot product
from sklearn.metrics.pairwise import linear_kernel

In [14]:
#Compute the cosine similarity matrix. TfidfVectorizer L2-normalises every row by default,
#so the plain dot product returned by linear_kernel already equals the cosine similarity.
#NOTE(review): the result is a dense (n_movies x n_movies) array - for the 45466 rows above
#that is roughly 16 GB if stored as float64; confirm the kernel has enough memory
cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)

### Building the recommender function

In [17]:
#Construct a reverse mapping from movie title to row index.
#Series.drop_duplicates() compares the *values* (the row indices, which are already
#unique), so it would never remove a repeated title. Filter on the index labels
#instead and keep the first occurrence, so every title maps to exactly one row.
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [16]:
#Function that takes in movie title as input and gives recommendations
def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in ``indices``.
    cosine_sim : 2-D array-like
        Pairwise similarity scores; row ``i`` holds the scores of movie ``i``
        against every movie.
    df : pandas.DataFrame
        Frame with a ``'title'`` column whose row order matches ``cosine_sim``.
    indices : pandas.Series
        Mapping from movie title to row position.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles, most similar first. The queried movie itself
        is never part of the result.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError('top_n must be non-negative')

    #Obtain the index of the movie that matches the title
    idx = indices[title]
    #A title shared by several movies yields one position per movie; use the first
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    #Similarity of every movie to the selected one
    scores = np.asarray(cosine_sim[idx])

    #Rank all movies by descending score. The stable sort keeps tied scores in
    #row order, matching what sorted(..., reverse=True) produced before
    ranked = np.argsort(-scores, kind='stable')

    #Remove the movie itself by position instead of assuming it sorted first:
    #with tied scores (e.g. an empty overview scores 0 against everything)
    #the first entry can be a different movie
    ranked = ranked[ranked != idx][:top_n]

    #Return the titles of the most similar movies
    return df['title'].iloc[ranked]

In [21]:
#Try the recommender on a sample title
content_recommender('Dracula: Dead and Loving It')

21540             Dracula 3D
181                    Nadja
3905            Dracula 2000
8689       Dracula A.D. 1972
1291                 Dracula
5512       Horror of Dracula
41124    Manson's Lost Girls
21650     A Jitney Elopement
31724             Pale Blood
451                    Fresh
Name: title, dtype: object

In [20]:
#Inspect the first 20 rows, now including the added overview and id columns
df.head(20)

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year,overview,id
0,Toy Story,"['Animation', 'Comedy', 'Family']",81.0,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ...",862
1,Jumanji,"['Adventure', 'Fantasy', 'Family']",104.0,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...,8844
2,Grumpier Old Men,"['Romance', 'Comedy']",101.0,6.5,92.0,1995,A family wedding reignites the ancient feud be...,15602
3,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']",127.0,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom...",31357
4,Father of the Bride Part II,['Comedy'],106.0,5.7,173.0,1995,Just when George Banks has recovered from his ...,11862
5,Heat,"['Action', 'Crime', 'Drama', 'Thriller']",170.0,7.7,1886.0,1995,"Obsessive master thief, Neil McCauley leads a ...",949
6,Sabrina,"['Comedy', 'Romance']",127.0,6.2,141.0,1995,An ugly duckling having undergone a remarkable...,11860
7,Tom and Huck,"['Action', 'Adventure', 'Drama', 'Family']",97.0,5.4,45.0,1995,"A mischievous young boy, Tom Sawyer, witnesses...",45325
8,Sudden Death,"['Action', 'Adventure', 'Thriller']",106.0,5.5,174.0,1995,International action superstar Jean Claude Van...,9091
9,GoldenEye,"['Adventure', 'Action', 'Thriller']",130.0,6.6,1194.0,1995,James Bond must unmask the mysterious head of ...,710
