In [1]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [4]:
cleaned = pd.read_csv("./imdb_top_1000.csv")


In [5]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english',ngram_range=(1,3),min_df=3,analyzer='word')

cleaned['overview'] = cleaned['Gross'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(cleaned['overview'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape



(1000, 319)

In [6]:
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [7]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(cleaned.index, index=cleaned['Series_Title']).drop_duplicates()

In [8]:
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return cleaned['Series_Title'].iloc[movie_indices]

In [9]:
get_recommendations('Psycho')

415                                   Jaws
447               A Streetcar Named Desire
466                         Marriage Story
484                           The Irishman
698    Willy Wonka & the Chocolate Factory
703                           My Fair Lady
566                              King Kong
183                       Some Like It Hot
850                       Enter the Dragon
987                       Midnight Express
Name: Series_Title, dtype: object

In [10]:
get_recommendations('Coco')

667              Night on Earth
595                Kaze tachinu
169              Dom za vesanje
591                 Vicky Donor
914             Sherlock Holmes
212               Hacksaw Ridge
469                Isle of Dogs
531           A Christmas Story
0      The Shawshank Redemption
1                 The Godfather
Name: Series_Title, dtype: object

In [11]:
get_recommendations('Close Encounters of the Third Kind')

97          Requiem for a Dream
578    Kubo and the Two Strings
625                  Apocalypto
501               Les choristes
333                      Wonder
752     Silver Linings Playbook
37                 The Departed
631           Pride & Prejudice
951             Minority Report
0      The Shawshank Redemption
Name: Series_Title, dtype: object