In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the movie dataset from the local CSV file into a DataFrame.
data = pd.read_csv("data_4.csv")

In [3]:
# Inspect which columns the loaded frame provides.
data.columns

Index(['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name',
       'genres', 'movie_title', 'comb', 'overview'],
      dtype='object')

In [4]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    """Collapse a name into one lower-case token with its spaces removed.

    A string is returned lower-cased with every space stripped out; a list
    is processed element by element in the same way; any other value
    (for example a missing entry) yields an empty string.
    """
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    if isinstance(x, list):
        return [name.replace(" ", "").lower() for name in x]
    # Neither text nor a list of text: treat the value as absent
    return ''

In [5]:
# Normalise the cast and director name columns so each name becomes one token.
features = ['actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name']

for column_name in features:
    cleaned_column = data[column_name].apply(clean_data)
    data[column_name] = cleaned_column

In [6]:
# Preview the first two rows to confirm the name columns were cleaned.
data.head(2)

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb,overview
0,jamescameron,cchpounder,joeldavidmoore,wesstudi,Action Adventure Fantasy Sci-Fi,avatar,CCH Pounder Joel David Moore Wes Studi James C...,"In the 22nd century, a paraplegic Marine is di..."
1,goreverbinski,johnnydepp,orlandobloom,jackdavenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,Johnny Depp Orlando Bloom Jack Davenport Gore ...,"Captain Barbossa, long believed to be dead, ha..."


In [7]:
def create_soup(x):
    """Build the space-separated text blob ("soup") for one movie row.

    The three actor names, the director name and the genres are joined,
    in that order, with single spaces between them.
    """
    soup_fields = ('actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name', 'genres')
    return ' '.join(x[field] for field in soup_fields)

# Apply create_soup to every row (axis=1) and keep the result in a new 'soup' column.
data['soup'] = data.apply(create_soup, axis=1)

In [8]:
# Show the soup built for the row labelled 1, using a single label-based lookup.
data.loc[1, 'soup']

'johnnydepp orlandobloom jackdavenport goreverbinski Action Adventure Fantasy'

In [9]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

# Tokenise each soup string and count token occurrences, ignoring English stop words.
count = CountVectorizer(stop_words='english')
# One row per movie, one column per vocabulary token.
count_matrix = count.fit_transform(data['soup'])

In [10]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

# With a single argument the similarities are computed between all rows of count_matrix.
cosine_sim = cosine_similarity(count_matrix)

In [11]:
# Reset the index of the main DataFrame so row labels match the row positions
# used by the similarity matrix, then build a title -> row position lookup.
data = data.reset_index(drop=True)
indices = pd.Series(data.index, index=data['movie_title'])
# Some titles occur more than once; keep only the first row for each so that
# indices[title] always yields a single integer rather than a Series.
indices = indices[~indices.index.duplicated(keep='first')]

In [12]:
# Sanity check: look up the row position stored for a known title.
indices['avatar']

0

In [13]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title exactly as stored in ``data['movie_title']``.
    cosine_sim : array-like
        Square pairwise similarity matrix whose rows and columns follow the
        row order of ``data``.
    top_n : int, default 10
        Number of recommendations to return; must not be negative.

    Returns
    -------
    pandas.Series
        The ``movie_title`` values of the ``top_n`` most similar movies,
        most similar first. The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Get the index of the movie that matches the title
    idx = indices[title]

    # A title that occurs more than once makes the lookup return a Series;
    # fall back to the first matching row so idx is always one position.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Get the pairwise similarity scores of all other movies with that movie.
    # The query row is skipped explicitly rather than assumed to sort first,
    # which would be wrong whenever another movie ties with it.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Sort the movies based on the similarity scores (stable, highest first)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the row positions of the top_n most similar movies
    movie_indices = [position for position, _ in sim_scores[:top_n]]

    # Return the titles of the most similar movies
    return data['movie_title'].iloc[movie_indices]

In [17]:
# Example query: the ten titles most similar to 'the gentlemen'.
get_recommendations('the gentlemen', cosine_sim)

3374                                     snatch
4782        lock, stock and two smoking barrels
299                              the other guys
421                                 tower heist
576                     the man from u.n.c.l.e.
757                 austin powers in goldmember
833     miss congeniality 2: armed and fabulous
1012                             22 jump street
1014                       central intelligence
1256                             21 jump street
Name: movie_title, dtype: object

In [27]:
# Densify the sparse count matrix as a plain ndarray (todense() would return
# the discouraged np.matrix type).
doc_term_matrix = count_matrix.toarray()
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on older releases that lack it.
if hasattr(count, 'get_feature_names_out'):
    feature_names = count.get_feature_names_out()
else:
    feature_names = count.get_feature_names()
# Rows are labelled with movie titles, columns with vocabulary tokens.
df = pd.DataFrame(doc_term_matrix, columns=feature_names, index=data['movie_title'])
df

Unnamed: 0_level_0,300ml,50cent,aaliyah,aaronashmore,aaronhann,aaronhill,aaronhughes,aaronkatz,aaronleong,aaronmoten,...,élodieyung,émilegaudreault,émiliedequenne,érictessier,étiennefaure,ólafurdarriólafsson,óscarjaenada,ølgaard,úrsulacorberó,михаилпореченков
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
avatar,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
pirates of the caribbean: at world's end,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
spectre,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
the dark knight rises,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
john carter,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
top gun: maverick,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
the croods 2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
the last duel,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
news of the world,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# Label both axes of the similarity matrix with the movie titles for inspection.
movie_titles = data['movie_title']
df2 = pd.DataFrame(data=cosine_sim, index=movie_titles, columns=movie_titles)
df2

movie_title,avatar,pirates of the caribbean: at world's end,spectre,the dark knight rises,john carter,tangled,tangled,avengers: age of ultron,harry potter and the half-blood prince,batman v superman: dawn of justice,...,soul,happiest season,voyagers,free guy,coming 2 america,top gun: maverick,the croods 2,the last duel,news of the world,respect
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
avatar,1.000000,0.377964,0.251976,0.125988,0.471405,0.201008,0.201008,0.471405,0.235702,0.444444,...,0.000000,0.0,0.000000,0.316228,0.0,0.136083,0.125988,0.000000,0.000000,0.000000
pirates of the caribbean: at world's end,0.377964,1.000000,0.285714,0.142857,0.267261,0.227921,0.227921,0.267261,0.267261,0.251976,...,0.000000,0.0,0.000000,0.358569,0.0,0.154303,0.142857,0.000000,0.000000,0.000000
spectre,0.251976,0.285714,1.000000,0.285714,0.267261,0.113961,0.113961,0.267261,0.133631,0.251976,...,0.142857,0.0,0.133631,0.239046,0.0,0.154303,0.142857,0.000000,0.000000,0.000000
the dark knight rises,0.125988,0.142857,0.285714,1.000000,0.133631,0.000000,0.000000,0.133631,0.000000,0.125988,...,0.142857,0.0,0.133631,0.119523,0.0,0.154303,0.000000,0.000000,0.000000,0.000000
john carter,0.471405,0.267261,0.267261,0.133631,1.000000,0.106600,0.106600,0.500000,0.125000,0.471405,...,0.000000,0.0,0.000000,0.223607,0.0,0.144338,0.133631,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
top gun: maverick,0.136083,0.154303,0.154303,0.154303,0.144338,0.000000,0.000000,0.144338,0.000000,0.136083,...,0.000000,0.0,0.000000,0.129099,0.0,1.000000,0.000000,0.182574,0.166667,0.166667
the croods 2,0.125988,0.142857,0.142857,0.000000,0.133631,0.341882,0.341882,0.133631,0.267261,0.125988,...,0.000000,0.0,0.000000,0.239046,0.0,0.000000,1.000000,0.000000,0.000000,0.000000
the last duel,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.141421,0.0,0.182574,0.000000,1.000000,0.182574,0.182574
news of the world,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.0,0.166667,0.000000,0.182574,1.000000,0.166667


In [36]:
# Keep the recommendations for 'the gentlemen' so they can be converted below.
rec = get_recommendations('the gentlemen', cosine_sim)

In [37]:
# Convert the recommended titles from a Series to a plain Python list.
rec.tolist()

['snatch',
 'lock, stock and two smoking barrels',
 'the other guys',
 'tower heist',
 'the man from u.n.c.l.e.',
 'austin powers in goldmember',
 'miss congeniality 2: armed and fabulous',
 '22 jump street',
 'central intelligence',
 '21 jump street']