In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Read CSV file using Panda Dataframe

In [2]:
df = pd.read_csv("movie_dataset.csv")

In [27]:
df.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director,combined_features
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron,culture clash future space war space colony so...
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski,ocean drug abuse exotic island east india trad...
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes,spy based on novel secret agent sequel mi6 Dan...
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan,dc comics crime fighter terrorist secret ident...
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton,based on novel mars medallion space travel pri...


In [7]:
print("Total samples", df.shape)
print("column names: \n", df.columns)

Total samples (4803, 24)
column names: 
 Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')


We don't need all of them, we just need the columns: keywords, cast, genres, director

In [8]:
features = ['keywords','cast','genres','director']

In [9]:
# Combine all the values from the above selected columns in a single string
def combine_features(row):
    return row['keywords']+" "+row['cast']+" "+row['genres']+" "+row['director']

### Pre-processing

In [12]:
# fill all NaN values with blank string
for i in features:
    df[i] = df[i].fillna('')

In [13]:
df["combined_features"] = df.apply(combine_features,axis=1)

In [14]:
df.iloc[0].combined_features

'culture clash future space war space colony society Sam Worthington Zoe Saldana Sigourney Weaver Stephen Lang Michelle Rodriguez Action Adventure Fantasy Science Fiction James Cameron'

In [16]:
df.iloc[2].combined_features

'spy based on novel secret agent sequel mi6 Daniel Craig Christoph Waltz L\\u00e9a Seydoux Ralph Fiennes Monica Bellucci Action Adventure Crime Sam Mendes'

### Count Matrix

In [22]:
# feed these strings to a CountVectorizer() object for getting the count matrix.
cv = CountVectorizer()
count_matrix = cv.fit_transform(df["combined_features"])

### Find cosine similarity

In [23]:
cosine_sim = cosine_similarity(count_matrix)

### Get movie title from movie index 

In [24]:
def get_title_from_index(index):
    return df[df.index == index]["title"].values[0]

###  Get movie index from movie title

In [25]:
def get_index_from_title(title):
    return df[df.title == title]["index"].values[0]

In [30]:
movie_user_likes = "The Dark Knight Rises"
movie_index = get_index_from_title(movie_user_likes)

In [31]:
movie_index

3

In [32]:
similar_movies = list(enumerate(cosine_sim[movie_index]))

In [33]:
similar_movies

[(0, 0.03774256780481986),
 (1, 0.03580574370197164),
 (2, 0.16357216402190616),
 (3, 0.9999999999999997),
 (4, 0.03922322702763681),
 (5, 0.07844645405527362),
 (6, 0.0),
 (7, 0.03580574370197164),
 (8, 0.08362420100070908),
 (9, 0.12009611535381537),
 (10, 0.11766968108291043),
 (11, 0.23533936216582085),
 (12, 0.04089304100547654),
 (13, 0.03922322702763681),
 (14, 0.14824986333222023),
 (15, 0.0),
 (16, 0.03522349768381735),
 (17, 0.04181210050035454),
 (18, 0.07161148740394328),
 (19, 0.04181210050035454),
 (20, 0.04003203845127179),
 (21, 0.04003203845127179),
 (22, 0.0),
 (23, 0.0),
 (24, 0.06933752452815364),
 (25, 0.08770580193070293),
 (26, 0.03706246583305506),
 (27, 0.08006407690254358),
 (28, 0.08006407690254358),
 (29, 0.1254363015010636),
 (30, 0.07692307692307693),
 (31, 0.07844645405527362),
 (32, 0.07412493166611012),
 (33, 0.07412493166611012),
 (34, 0.0),
 (35, 0.08559209850218259),
 (36, 0.08006407690254358),
 (37, 0.0),
 (38, 0.038461538461538464),
 (39, 0.1153846

### Sort this list in descending order

In [35]:
sorted_similar_movies = sorted(similar_movies,key=lambda x:x[1],reverse=True)[1:]

In [36]:
sorted_similar_movies

[(119, 0.7307692307692306),
 (65, 0.6923076923076922),
 (4638, 0.45291081365783836),
 (2793, 0.39467610868816316),
 (1196, 0.37630890450319093),
 (428, 0.3602883460614461),
 (1359, 0.3530090432487313),
 (210, 0.3396831102433788),
 (1720, 0.3335621924974955),
 (2371, 0.3335621924974955),
 (1740, 0.32776066832815337),
 (3332, 0.3149448894660932),
 (3359, 0.29417420270727607),
 (3819, 0.2926847035024818),
 (3966, 0.2926847035024818),
 (4099, 0.29268470350248177),
 (2790, 0.29134281629169184),
 (1013, 0.28625128703833574),
 (1792, 0.28625128703833574),
 (2398, 0.28625128703833574),
 (1503, 0.2853908964926965),
 (3293, 0.28022426915890253),
 (1986, 0.27735009811261463),
 (4664, 0.27735009811261463),
 (1234, 0.27456258919345766),
 (3073, 0.2692307692307693),
 (95, 0.264197974633739),
 (813, 0.264197974633739),
 (1664, 0.264197974633739),
 (968, 0.2631174057921088),
 (1674, 0.2631174057921088),
 (1727, 0.2631174057921088),
 (1803, 0.2631174057921088),
 (3637, 0.2631174057921088),
 (4065, 0.25

### Top 10 out of them

In [37]:
i=0
print("Top 10 similar movies to "+movie_user_likes+" are:\n")
for element in sorted_similar_movies:
    print(get_title_from_index(element[0]))
    i=i+1
    if i>10:
        break

Top 10 similar movies to The Dark Knight Rises are:

Batman Begins
The Dark Knight
Amidst the Devil's Wings
The Killer Inside Me
The Prestige
Batman Returns
Batman
Batman & Robin
Kick-Ass
RockNRolla
Kick-Ass 2


### The result is very close if you search on google "similar movies like The Dark Knight Rises"