In [None]:
#Building a Movie Rec. Engine

In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import process #have to pip install into anaconda root

In [8]:
#load data
df = pd.read_csv('movie_dataset.csv')

In [9]:
#print df
df.head(3)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes


In [10]:
#get a count of number of rows and columns (number of movies)
df.shape

(4803, 24)

In [11]:
#list of important columns to keep
features = ['overview','keywords','cast', 'genres','director']
df[features].head(3)

Unnamed: 0,overview,keywords,cast,genres,director
0,"In the 22nd century, a paraplegic Marine is di...",culture clash future space war space colony so...,Sam Worthington Zoe Saldana Sigourney Weaver S...,Action Adventure Fantasy Science Fiction,James Cameron
1,"Captain Barbossa, long believed to be dead, ha...",ocean drug abuse exotic island east india trad...,Johnny Depp Orlando Bloom Keira Knightley Stel...,Adventure Fantasy Action,Gore Verbinski
2,A cryptic message from Bond’s past sends him o...,spy based on novel secret agent sequel mi6,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,Action Adventure Crime,Sam Mendes


In [12]:
#clean and process data
for feature in features:
    df[feature]=df[feature].fillna('') #filling any missing values with empty

In [13]:
#create a function to combine the values of important columns into one string
def combined_features(row):
    return row['overview']+' '+row['keywords']+' '+row['cast']+' '+row['genres']+' '+row['director']

In [14]:
#apply the function on each row to store the combined strings into a new column
#called cobined_features
df['combined_features']=df.apply(combined_features, axis=1) #axis =1 means to do on all rows

In [15]:
#converts the title column strings to Title Font
df['title'] = df['title'].apply(lambda x: x.title())
df.head(3)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director,combined_features
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron,"In the 22nd century, a paraplegic Marine is di..."
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates Of The Caribbean: At World'S End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski,"Captain Barbossa, long believed to be dead, ha..."
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes,A cryptic message from Bond’s past sends him o...


In [16]:
#convert a collection of text to matrix of token counts
count_matrix = CountVectorizer().fit_transform(df['combined_features'])

In [17]:
#get the cosign similarity matrix from count matrix
cosine_sim = cosine_similarity(count_matrix)
print(cosine_sim)  #will show the similarity of every movie

[[1.         0.22613351 0.21374669 ... 0.20586559 0.17707162 0.10643623]
 [0.22613351 1.         0.25318484 ... 0.26552513 0.25975852 0.13728129]
 [0.21374669 0.25318484 1.         ... 0.24970006 0.22875451 0.1489609 ]
 ...
 [0.20586559 0.26552513 0.24970006 ... 1.         0.28274407 0.1636604 ]
 [0.17707162 0.25975852 0.22875451 ... 0.28274407 1.         0.19491173]
 [0.10643623 0.13728129 0.1489609  ... 0.1636604  0.19491173 1.        ]]


In [18]:
#get count for cosine_sim: rows and columns
cosine_sim.shape

(4803, 4803)

In [19]:
#get the title of movie in the index function
def get_title_from_index(index):
    return df[df.index==index]['title'].values[0]

#get index from title of movie function
def get_index_from_title(title):
    return df[df.title==title]['index'].values[0]

In [20]:
#get the title of the movie that user likes
user_likes = input('Whats youre favorite movie? ')
movie_that_user_likes = process.extractOne(user_likes,df['title'])[0]  #uses fuzzy matching to extract the input to a movie closest to whats typed

Whats youre favorite movie? hrry potter


In [21]:
#checks to see if the inputed movie is in the database and if so looks up its index number
if movie_that_user_likes in df['title'].values:
    movie_index = get_index_from_title(movie_that_user_likes)
    print(movie_that_user_likes,'is index',movie_index)
else:
    print(movie_that_user_likes,'is not in the database. Try again!')
    


Harry Potter And The Half-Blood Prince is index 8


In [48]:
#Enumerate through all similarity scores of "the amazing spiderman"
#make a tuple of movie index and similarity scores
#NOTE: we will return a list of tuples in the form (movie index,similarity score)

similar_movies = list(enumerate(cosine_sim[movie_index]))

In [49]:
similar_movies

[(0, 0.3205439351732011),
 (1, 0.2953122116093092),
 (2, 0.32898173250501855),
 (3, 0.2933657898845541),
 (4, 0.3284098814272745),
 (5, 0.16140048692757375),
 (6, 0.20185669181626792),
 (7, 0.42542326231693417),
 (8, 0.13754993359993484),
 (9, 0.19857122326661658),
 (10, 0.34633948719670543),
 (11, 0.3161030951008881),
 (12, 0.15749984619163154),
 (13, 0.30293177486839407),
 (14, 0.3497349098935194),
 (15, 0.42075824723812366),
 (16, 0.30713001378770693),
 (17, 0.2841798783331475),
 (18, 0.4002476260058419),
 (19, 0.3394611386387797),
 (20, 0.3095944581824335),
 (21, 0.2934986087147826),
 (22, 0.23701266349031497),
 (23, 0.18036829916803734),
 (24, 0.19714805534881863),
 (25, 0.3234983196103152),
 (26, 0.27712835991110385),
 (27, 0.4319264431216999),
 (28, 0.10783277320343843),
 (29, 0.3549069608751226),
 (30, 0.289395766105824),
 (31, 0.17791499872137215),
 (32, 0.38118638229608437),
 (33, 0.2588414429342565),
 (34, 0.12451456127293807),
 (35, 0.21596487416564994),
 (36, 0.33025787075

In [50]:
#sort the list of similar movies according to the similarity scores in desc order
sorted_similar_movies = sorted(similar_movies, key = lambda x:x[1], reverse=True)[1:]


In [51]:
#print sorted similar movies
sorted_similar_movies

[(1995, 0.47143714732813624),
 (59, 0.4638666728515557),
 (153, 0.4635996228141774),
 (657, 0.4576732364335252),
 (342, 0.452780222810684),
 (274, 0.4519305513163183),
 (4384, 0.45190107229755194),
 (569, 0.448252420582577),
 (870, 0.4454768371117636),
 (48, 0.4425718220321346),
 (549, 0.4403907650173666),
 (4332, 0.4393606097702238),
 (3192, 0.43912899952211837),
 (4468, 0.4367307293334348),
 (1004, 0.43654854000893195),
 (2900, 0.4364763772611031),
 (2969, 0.4342784617696135),
 (3396, 0.4321158740252361),
 (27, 0.4319264431216999),
 (222, 0.4304032811118404),
 (1985, 0.43005376008022506),
 (2127, 0.42940744973002704),
 (2696, 0.42839083295910685),
 (432, 0.42668005460395614),
 (3549, 0.4264728788512117),
 (3899, 0.42548226494102415),
 (7, 0.42542326231693417),
 (4239, 0.4239596597782146),
 (999, 0.4233167112995736),
 (2647, 0.42308516268352725),
 (420, 0.42263985063512266),
 (220, 0.4226164350362994),
 (531, 0.42157250647186933),
 (786, 0.421171139093132),
 (85, 0.4208118566975986),


In [52]:
#create a loop to print the first 5 entries from the sorted similar movies list
i=0
print('The top 5 similar movies to '+ movie_that_user_likes +' are:')

for element in sorted_similar_movies:
    print(get_title_from_index(element[0]))
    i = i + 1
    if i >=5:
        break
    

The top 5 similar movies to Independence Day are:
Pitch Black
2012
Mission: Impossible - Ghost Protocol
Resident Evil: Retribution
Men In Black
