In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the movie table from a CSV located relative to the notebook's working directory.
data = pd.read_csv("data_4.csv")

In [3]:
# Summarise column names, dtypes and non-null counts to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6984 entries, 0 to 6983
Data columns (total 8 columns):
director_name    6984 non-null object
actor_1_name     6984 non-null object
actor_2_name     6984 non-null object
actor_3_name     6984 non-null object
genres           6984 non-null object
movie_title      6984 non-null object
comb             6984 non-null object
overview         6980 non-null object
dtypes: object(8)
memory usage: 436.6+ KB


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer used to encode the movie overviews.
tfidf = TfidfVectorizer(
    # --- tokenisation ---
    analyzer='word',
    token_pattern=r'\w{1,}',   # any run of one or more word characters is a token
    strip_accents='unicode',
    stop_words='english',
    # --- vocabulary ---
    ngram_range=(1, 3),        # unigrams, bigrams and trigrams
    min_df=3,                  # ignore terms that occur in fewer than 3 documents
    max_features=None,         # no cap on the vocabulary size
)

In [5]:
# Replace missing overviews with an empty string so that every entry of the
# column is a str (data.info() reports 6980 non-null values out of 6984 rows).
data['overview'] = data['overview'].fillna('')

In [6]:
# Construct the TF-IDF matrix: fit the vectorizer on the overviews and encode
# each overview as one row, in the same row order as `data`.
tfidf_matrix = tfidf.fit_transform(data['overview'])

In [7]:
# Output the shape of tfidf_matrix: (number of overviews, number of n-gram features kept)
tfidf_matrix.shape

(6984, 14301)

In [8]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix.
# linear_kernel returns plain dot products; they equal cosine similarities here
# because `tfidf` was built without a `norm` argument, and TfidfVectorizer's
# default (norm='l2') scales every row to unit length.
# NOTE(review): the result is a dense (n_movies x n_movies) array — confirm it
# fits in memory before running on a larger dataset.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [9]:
# Construct a reverse map from movie title to row position in `data`.
# Series.drop_duplicates() compares the *values* (the row numbers, which are
# always unique), so it never removes a repeated title. De-duplicate on the
# index instead, keeping the first occurrence, so that indices[title] always
# yields a single integer rather than a Series when a title occurs twice.
indices = pd.Series(data.index, index=data['movie_title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [13]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Key looked up in the notebook-level ``indices`` Series
        (movie title -> row position in ``data``).
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` holds the similarity of movie
        ``i`` to every movie. Defaults to the notebook-level ``cosine_sim``
        as it existed when this cell was run.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Entries of ``data['movie_title']`` ordered from most to least similar.
        The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Get the row position of the movie that matches the title
    idx = indices[title]
    # A title that appears more than once in `indices` yields a Series;
    # fall back to its first occurrence so the row lookup below stays 1-D.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pairwise similarity scores of all movies with that movie
    scores = np.asarray(cosine_sim[idx], dtype=float).ravel()

    # Rank rows from most to least similar. A stable sort keeps ties in
    # ascending row order, the same tie order that sorted(..., reverse=True)
    # produces.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the queried movie explicitly. Skipping "position 0" is wrong when
    # another movie ties for the top score (e.g. an identical overview) or
    # when the overview is empty and every score in the row is 0.
    ranked = ranked[ranked != idx]

    # Row positions of the top_n most similar movies
    movie_indices = ranked[:top_n].tolist()

    # Return the top_n most similar movies
    return data['movie_title'].iloc[movie_indices]

In [14]:
# Sanity check: recommendations from the linear-kernel (cosine) matrix for a well-known title.
get_recommendations('the dark knight rises')

315                              batman forever
71                              the dark knight
1523                                     batman
455                              batman returns
2817                                  slow burn
128                               batman begins
1327                                        jfk
9            batman v superman: dawn of justice
4320    batman: the dark knight returns, part 2
217                              batman & robin
Name: movie_title, dtype: object

In [15]:
from sklearn.metrics.pairwise import sigmoid_kernel

# Compute the sigmoid kernel between every pair of overviews, using
# scikit-learn's default gamma and coef0.
# NOTE(review): the sigmoid kernel is tanh(gamma * <x, y> + coef0), a monotonic
# transform of the dot product, so it is expected to rank movies in the same
# order as `cosine_sim` — consistent with the identical outputs further down.
# Confirm whether a second similarity matrix is actually needed.
sig_sim = sigmoid_kernel(tfidf_matrix, tfidf_matrix)

In [16]:
def give_rec(title, sig_sim=sig_sim):
    """Return the movies most similar to ``title`` according to ``sig_sim``.

    The ranking procedure is the same as in ``get_recommendations``; only the
    similarity matrix differs. Delegating to that function keeps a single
    copy of the logic so the two entry points cannot drift apart.

    Parameters
    ----------
    title : str
        Key looked up in the notebook-level ``indices`` Series.
    sig_sim : 2-D array-like, optional
        Pairwise similarity matrix. Defaults to the notebook-level ``sig_sim``
        (sigmoid kernel) as it existed when this cell was run.

    Returns
    -------
    pandas.Series
        Entries of ``data['movie_title']`` ordered from most to least similar.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Requires the cell defining get_recommendations to have been run first.
    return get_recommendations(title, cosine_sim=sig_sim)

In [19]:
# Lower-case the query before the lookup; the titles shown in the outputs are all lower case.
give_rec('The Dark Knight Rises'.lower())

315                              batman forever
71                              the dark knight
1523                                     batman
455                              batman returns
2817                                  slow burn
128                               batman begins
1327                                        jfk
9            batman v superman: dawn of justice
4320    batman: the dark knight returns, part 2
217                              batman & robin
Name: movie_title, dtype: object

In [20]:
# Second example title, using the linear-kernel (cosine) matrix.
get_recommendations('avatar')

700                       the matrix
6089                obitaemyy ostrov
4046                       apollo 18
2374                    the american
867                        supernova
573                 tears of the sun
5603          moontrap: target earth
325     the adventures of pluto nash
941                         semi-pro
6643                       first man
Name: movie_title, dtype: object

In [21]:
# Same title through the sigmoid-kernel matrix, for comparison with the previous cell.
give_rec('avatar')

700                       the matrix
6089                obitaemyy ostrov
4046                       apollo 18
2374                    the american
867                        supernova
573                 tears of the sun
5603          moontrap: target earth
325     the adventures of pluto nash
941                         semi-pro
6643                       first man
Name: movie_title, dtype: object

In [24]:
# Third example title, using the sigmoid-kernel matrix.
give_rec('the gentlemen')

3931                                deadfall
4304                           she's the one
409                           ocean's eleven
2766                  the master of disguise
4013                               clerks ii
3314    the visual bible: the gospel of john
6605                        never goin' back
655                              bad company
2458             the man who knew too little
2651                             being julia
Name: movie_title, dtype: object