In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
def get_title_from_index(index):
	return df[df.index == index]["title"].values[0]

def get_index_from_title(title):
	return df[df.title == title]["index"].values[0]

In [8]:
df = pd.read_csv("meta_and_keywords_clean_cut.csv")
df.head(5)

Unnamed: 0,id,title,Date,Language,Genre,averageRating,numVotes,Keywords,index,genres_clean
0,31186339,The Hunger Games,2012-03-12,"{""/m/02h40lc"": ""English Language""}","{""/m/03btsm8"": ""Action/Adventure"", ""/m/06n90"":...",7.2,834462,arena sponsors tributes tribute survivor suppl...,0,Action/Adventure Science Fiction Action Drama
1,2231378,The Lemon Drop Kid,1951-03-08,"{""/m/02h40lc"": ""English Language""}","{""/m/06qm3"": ""Screwball comedy"", ""/m/01z4y"": ""...",7.1,1646,home money days women scheme crew charity lice...,1,Screwball comedy Comedy
2,595909,A Cry in the Dark,1988-11-03,"{""/m/02h40lc"": ""English Language""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...",7.4,9,wife baby tent investigation sacrifice wildern...,2,Crime Fiction Drama Docudrama World cinema Cou...
3,1952976,Dark Water,2005-06-27,"{""/m/02h40lc"": ""English Language""}","{""/m/01jfsb"": ""Thriller"", ""/m/07s9rl0"": ""Drama...",5.6,59137,girl water apartment mother school daughter gi...,3,Thriller Drama Horror
4,2462689,Meet John Doe,1941-05-03,"{""/m/02h40lc"": ""English Language""}","{""/m/06cvj"": ""Romantic comedy"", ""/m/0hj3nby"": ...",7.6,11875,letter speech newspaper editor suicide sensati...,4,Romantic comedy Media Satire Americana Comedy-...


In [5]:
##Step 2: Select Features

features = ['Keywords','genres_clean']
features_nokey = ['genres_clean']

In [6]:
for feature in features:
	df[feature] = df[feature].fillna('')

def combine_features(row):
	try:
		return row['Keywords']+" "+row["genres_clean"]
	except:
		print ("Error:", row)

df["combined_features"] = df.apply(combine_features,axis=1)

print ("Combined Features:", df["combined_features"].head())

Combined Features: 0    arena sponsors tributes tribute survivor suppl...
1    home money days women scheme crew charity lice...
2    wife baby tent investigation sacrifice wildern...
3    girl water apartment mother school daughter gi...
4    letter speech newspaper editor suicide sensati...
Name: combined_features, dtype: object


In [7]:
for feature in features_nokey:
	df[feature] = df[feature].fillna('')

def combine_features_nokey(row):
	try:
		return row["genres_clean"]
	except:
		print ("Error:", row)

df["combined_features_nokey"] = df.apply(combine_features_nokey,axis=1)

print ("Combined Features_nokey:", df["combined_features_nokey"].head())

Combined Features_nokey: 0        Action/Adventure Science Fiction Action Drama
1                              Screwball comedy Comedy
2    Crime Fiction Drama Docudrama World cinema Cou...
3                                Thriller Drama Horror
4    Romantic comedy Media Satire Americana Comedy-...
Name: combined_features_nokey, dtype: object


In [8]:
##Step 4: Create count matrix from this new combined column
cv = CountVectorizer()

count_matrix = cv.fit_transform(df["combined_features"])
count_matrix_nokey = cv.fit_transform(df["combined_features_nokey"])

In [16]:
##Step 5: Compute the Cosine Similarity based on the count_matrix
cosine_sim = cosine_similarity(count_matrix) 
cosine_sim_nokey = cosine_similarity(count_matrix_nokey) 
movie_user_likes = "The Hunger Games"

In [17]:
## Step 6: Get index of this movie from its title
movie_index = get_index_from_title(movie_user_likes)
# print('index')
# print(movie_index)
similar_movies =  list(enumerate(cosine_sim[movie_index]))


In [18]:
similar_movies_nokey =  list(enumerate(cosine_sim_nokey[movie_index]))


In [19]:
## Step 7: Get a list of similar movies in descending order of similarity score
sorted_similar_movies = sorted(similar_movies,key=lambda x:x[1],reverse=True)


In [20]:
sorted_similar_movies_nokey = sorted(similar_movies_nokey,key=lambda x:x[1],reverse=True)

In [21]:
## Step 8: Print titles of first 20 movies
# this is recommendation with the keywords included
i=0
print("YOUR MOVIE: "+movie_user_likes.upper())
for element in sorted_similar_movies:
		print (get_title_from_index(element[0]))
		i=i+1
		if i>20:
			break

YOUR MOVIE: THE HUNGER GAMES
The Hunger Games
XIII
Trancers 6
Northville Cemetery Massacre
Against the Dark
The Island
Mar Mitenge
Sticky Fingers
Rangbaaz
Tekken
Two Lost Worlds
Hercules
Circadian Rhythm
Spy Kids 4: All the Time in the World
Cowboys & Aliens
The Hero: Love Story of a Spy
The Cutter
Sci-fighters
Kranti Kshetra
Raptor Island
Naaraaz


In [22]:
i=0
# without keywords
for element in sorted_similar_movies_nokey:
		print (get_title_from_index(element[0]))
		i=i+1
		if i>20:
			break

The Hunger Games
The Postman
Universal Soldier II: Brothers in Arms
Universal Soldier III: Unfinished Business
Super 8
The Island
Ra.One
Star Trek: Generations
Knights
After Earth
Women of the Prehistoric Planet
Blood Rush
Velocity Trap
Ender's Game
Mysterious Island
Replicant
Northville Cemetery Massacre
Cyborg Soldier
Independence Day
One Good Cop
Eve of Destruction
