In [2]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel

# Show every column when a DataFrame is displayed (no column truncation).
# Use the public top-level pd.set_option rather than reaching through the
# incidental `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

### Data Loading

In [3]:
# Load the movie metadata CSV from the movie_dataset folder under the
# current working directory.
metadata_csv = os.path.join(os.getcwd(), 'movie_dataset/movies_metadata.csv')
metadata = pd.read_csv(metadata_csv, low_memory=False)

In [4]:
# 50th percentile (median) of the per-movie vote counts.
metadata['vote_count'].quantile(q=0.5)

10.0

### Content-Based Recommender using Description

In [5]:
# Number of rows (movies) in the metadata table.
metadata.shape[0]

45466

In [6]:
# Preview the first five rows of the metadata table.
metadata.head(n=5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [7]:
# Replace missing overviews with empty strings so every row handed to the
# vectorizer is text.
metadata['overview'] = metadata['overview'].fillna('')

# Fit a TF-IDF model on the overviews, ignoring English stop words, and
# report the resulting (documents, vocabulary terms) shape.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(metadata['overview'])
print(tfidf_matrix.shape)

(45466, 75827)


In [13]:
# Display the TF-IDF matrix summary (its repr, not its contents).
tfidf_matrix

<45466x75827 sparse matrix of type '<class 'numpy.float64'>'
	with 1210882 stored elements in Compressed Sparse Row format>

In [8]:
# Show the IDF weight for the first few vocabulary terms only; printing the
# whole vocabulary floods the notebook with tens of thousands of lines.
# get_feature_names_out() is ordered by feature index, so it lines up
# position-for-position with tfidf.idf_ and no vocabulary_ lookup is needed.
idf_preview_size = 25
feature_names = tfidf.get_feature_names_out()
for term, idf_score in zip(feature_names[:idf_preview_size], tfidf.idf_[:idf_preview_size]):
    print('word:',term,'    IDF Score',idf_score)

word: 00     IDF Score 9.159792709613278
word: 000     IDF Score 5.97534908116656
word: 000km     IDF Score 11.031594886514869
word: 000th     IDF Score 10.338447705954923
word: 001     IDF Score 11.031594886514869
word: 006     IDF Score 11.031594886514869
word: 007     IDF Score 9.239835417286814
word: 008     IDF Score 11.031594886514869
word: 009     IDF Score 10.626129778406703
word: 0093     IDF Score 11.031594886514869
word: 01     IDF Score 9.7788319180195
word: 0123     IDF Score 11.031594886514869
word: 02     IDF Score 10.626129778406703
word: 03     IDF Score 11.031594886514869
word: 04     IDF Score 10.338447705954923
word: 042     IDF Score 11.031594886514869
word: 05     IDF Score 10.626129778406703
word: 05pm     IDF Score 11.031594886514869
word: 06     IDF Score 10.626129778406703
word: 07     IDF Score 10.338447705954923
word: 077     IDF Score 11.031594886514869
word: 07am     IDF Score 11.031594886514869
word: 08     IDF Score 10.626129778406703
word: 088     IDF S

### Cosine Similarity

In [9]:
# Pairwise cosine similarity between every pair of overviews.
# The TfidfVectorizer above uses its default norm='l2', so each row already
# has unit length and a plain dot product (linear_kernel) equals the cosine
# similarity while skipping the re-normalisation cosine_similarity performs.
# If the vectorizer is ever built with norm=None, switch back to
# cosine_similarity.
# NOTE: the result is a dense (n_movies, n_movies) array, which is memory heavy.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print(cosine_sim.shape)

(45466, 45466)


In [10]:
# Display the similarity matrix (NumPy abbreviates large arrays with "...").
cosine_sim

array([[1.        , 0.01504121, 0.        , ..., 0.        , 0.00595453,
        0.        ],
       [0.01504121, 1.        , 0.04681953, ..., 0.        , 0.02198641,
        0.00929411],
       [0.        , 0.04681953, 1.        , ..., 0.        , 0.01402548,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.00595453, 0.02198641, 0.01402548, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.00929411, 0.        , ..., 0.        , 0.        ,
        1.        ]])

### Inference

In [11]:
def infer(title, cosine_sim=cosine_sim, top_n=10):
    """Print the movies whose overviews are most similar to ``title``.

    Prints the list of ``(row position, similarity)`` pairs for the best
    matches, followed by the matching titles, one per line.

    Parameters
    ----------
    title : str
        Exact value to look up in ``metadata['title']``. If several rows
        share the title, the first one is used.
    cosine_sim : array-like, optional
        Pairwise similarity matrix; row ``i`` is assumed to describe the
        movie at row position ``i`` of ``metadata``. Defaults to the
        notebook-level matrix bound when this function is defined.
    top_n : int, optional
        Number of recommendations to print (default 10).

    Raises
    ------
    ValueError
        If no row of ``metadata`` has the requested title.
    """
    # Resolve the title to a row position instead of using a hard-coded row,
    # so the `title` argument actually selects the query movie.
    matches = np.flatnonzero((metadata['title'] == title).to_numpy())
    if matches.size == 0:
        raise ValueError(f"Title not found in metadata: {title!r}")
    idx = int(matches[0])

    # Drop the query movie explicitly rather than assuming it sorts first:
    # ties (e.g. identical or empty overviews) can place another row ahead.
    scored = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[idx])
        if pos != idx
    ]
    recommendations = sorted(scored, key=lambda pair: pair[1], reverse=True)[:top_n]
    print(recommendations)
    for pos, _ in recommendations:
        print(metadata['title'].iloc[pos])

In [12]:
# Run the recommender for a sample title.
infer(title='The Shawshank Redemption')

[(40755, 0.21790313568011982), (43795, 0.20718312039526582), (35644, 0.20656843885639142), (42715, 0.1911604280150896), (35711, 0.19037269413773938), (44932, 0.17057757670451443), (3896, 0.1582087058515281), (39607, 0.15629876833323927), (34517, 0.15608533331968383), (12502, 0.15299665311621552)]
Parallel Courses
For the Good of Others
Dirty Mind
Timecode
Blue and Not So Pink
Grimm
Before Night Falls
Night of the Sharks
Enter Laughing
Alatriste
