In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
###### helper functions. Use them when needed #######
def get_title_from_index(index):
	return df[df.index == index]["title"].values[0]

def get_index_from_title(title):
	return df[df.title == title]["index"].values[0]
##################################################

##Step 1: Read CSV File
df=pd.read_csv('movie_dataset.csv')
print(df.head())
print(df.columns)




   index     budget                                    genres  \
0      0  237000000  Action Adventure Fantasy Science Fiction   
1      1  300000000                  Adventure Fantasy Action   
2      2  245000000                    Action Adventure Crime   
3      3  250000000               Action Crime Drama Thriller   
4      4  260000000          Action Adventure Science Fiction   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  culture clash future space war space colony so...                en   
1  ocean drug abuse exotic island east india trad...                en   
2         spy based on novel sec

In [6]:
##Step 2: Select Features
features=['genres','keywords','cast','director']
##Step 3: Create a column in DF which combines all selected features

for feature in features:
    df[feature]=df[feature].fillna('')

def combine_features(row):
    try:
        return row['keywords']+''+row['cast']+''+row['genres']+''+row['director']
    except:
        print('Error: ',row)
df['combine_features']=df.apply(combine_features,axis=1)
df['combine_features'].head()

0    culture clash future space war space colony so...
1    ocean drug abuse exotic island east india trad...
2    spy based on novel secret agent sequel mi6Dani...
3    dc comics crime fighter terrorist secret ident...
4    based on novel mars medallion space travel pri...
Name: combine_features, dtype: object

In [13]:
##Step 4: Create count matrix from this new combined column
cv = CountVectorizer()

count_matrix = cv.fit_transform(df['combine_features'])

##Step 5: Compute the Cosine Similarity based on the count_matrix
cosine_sim=cosine_similarity(count_matrix)
movie_user_likes = "Avatar"

## Step 6: Get index of this movie from its title
movie_index=get_index_from_title(movie_user_likes)
similar_movies=list(enumerate(cosine_sim[movie_index]))
## Step 7: Get a list of similar movies in descending order of similarity score
sorted_sim_movies=sorted(similar_movies,key=lambda x:x[1],reverse=True)

## Step 8: Print titles of first 50 
i=0
for movie in sorted_sim_movies:
    print(get_title_from_index(movie[0]))
    i=i+1
    if i>50:
          break

Avatar
Guardians of the Galaxy
Alien
Aliens
Star Wars: Clone Wars: Volume 1
Space Dogs
Cargo
Star Trek Beyond
Moonraker
Jason X
Galaxy Quest
Trekkies
Silent Running
The Astronaut's Wife
Planet of the Apes
Wing Commander
Star Trek Into Darkness
Babylon A.D.
Space Chimps
Galaxina
Oblivion
The Ice Pirates
John Carter
Soldier
Damnation Alley
Gravity
Men in Black
Sheena
Men in Black II
Memoirs of an Invisible Man
Treasure Planet
The Iron Giant
Shrek Forever After
Source Code
Captain America: The Winter Soldier
Lost in Space
Event Horizon
The Right Stuff
Crossroads
Stargate: The Ark of Truth
Saving Private Ryan
Terminator 2: Judgment Day
Titan A.E.
Sunshine
The Thing
Spaceballs
Jupiter Ascending
Mad Max: Fury Road
Red Planet
AVP: Alien vs. Predator
Zathura: A Space Adventure
