In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the movies metadata CSV into a DataFrame.
# low_memory=False makes pandas parse the file in a single pass rather than
# in internal chunks, which avoids mixed-dtype inference within a column.
data = pd.read_csv(
    './data/movies_kaggle/movies_metadata.csv',
    low_memory=False,
)

In [3]:
# Preview the first two rows to check which columns were loaded.
data.head(n=2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [4]:
# Work with the first 20,000 rows only (presumably to keep the pairwise
# similarity matrix computed later at a manageable size).
# head() returns a slice of the original frame; .copy() makes the subset an
# independent DataFrame so that assigning to its columns afterwards does not
# write into a slice (which pandas reports as SettingWithCopyWarning).
data = data.head(20000).copy()

In [5]:
# Report how many rows have no overview text.
print('Number of null values:', data['overview'].isna().sum())

Number of null values: 135


In [6]:
# Replace missing overviews with empty strings so every entry in the column
# is a string.
data['overview'] = data['overview'].fillna(value='')

In [7]:
# Vectorize the overview text with TF-IDF, ignoring English stop words.
tfidf = TfidfVectorizer(stop_words='english')
# fit_transform learns the vocabulary from the overviews and returns one
# TF-IDF row per movie (rows follow the row order of `data`).
tfidf_matrix = tfidf.fit_transform(data['overview'])
print('TF-IDF Array(shape) :',tfidf_matrix.shape)

TF-IDF Array(shape) : (20000, 47487)


In [8]:
# Pairwise cosine similarity between every pair of movie overviews.
# With only one matrix given, scikit-learn compares its rows against
# themselves, so entry [i, j] is the similarity of movie i and movie j.
cosine_sim = cosine_similarity(tfidf_matrix)
print('Result of cosine similarity:', cosine_sim.shape)

Result of cosine similarity: (20000, 20000)


In [9]:
# Map each movie title to its row *position* in `data`.
# The values are used to index rows of the similarity matrix and with .iloc,
# both of which are positional, so positions are stored explicitly instead of
# index labels (the two only coincide while `data` has a default 0..n-1 index).
# When a title occurs more than once, the last occurrence wins (dict semantics).
title_to_index = dict(zip(data['title'], range(len(data))))

In [10]:
# Spot-check the lookup table with one known title.
idx = title_to_index['The Dark Knight']
print(idx)

12481


In [11]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in ``title_to_index``.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix whose row/column ``i`` corresponds to row
        position ``i`` of ``data``. Defaults to the notebook-level matrix.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Titles from ``data['title']``, ordered from most to least similar.
        The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not a key of ``title_to_index``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    query_pos = title_to_index[title]

    # Drop the queried movie by position rather than assuming it sorts first:
    # when other rows tie with it (identical overviews, or an all-zero vector
    # from an empty overview) it is not guaranteed to be the top entry.
    candidates = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[query_pos])
        if pos != query_pos
    ]

    # sorted() is stable, so equally similar movies keep their row order.
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)

    movie_indices = [pos for pos, _ in ranked[:top_n]]
    return data['title'].iloc[movie_indices]

In [16]:
# Example query: list the movies whose overviews are closest to "Superman".
get_recommendations(title='Superman')

16626                            All Star Superman
2527                                   Superman II
9169     The Batman Superman Movie: World's Finest
11067                             Superman Returns
2529              Superman IV: The Quest for Peace
17852                    Superman and the Mole-Men
19741                   Gable: The King Remembered
6822                                  The Freshman
12617                          The Ron Clark Story
19063              Superman/Batman: Public Enemies
Name: title, dtype: object