In [264]:
# coding: utf-8
import pandas as pd
import numpy as np
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [265]:
# read the csv file
df = pd.read_csv("movie_db.csv", sep='\t',low_memory=False)

In [267]:
df.columns
df.drop_duplicates('movie_title',keep='first',inplace=True)
df['movie_title']=df['movie_title'].apply(lambda x: x.strip())
df=df.fillna('')

In [268]:
df.head(5).style

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,actor_1_name,movie_title,num_voted_users,cast_total_facebook_likes,actor_3_name,facenumber_in_poster,plot_keywords,movie_imdb_link,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,jamescameron,723.0,178.0,0,855.0,joeldavidmoore,1000,760506000.0,"['action', 'adventure', 'fantasy', 'sci-fi']",cchpounder,Avatar,886204,4834,wesstudi,0,"['avatar', 'future', 'marine', 'native', 'paraplegic']",http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1,3054.0,English,USA,PG-13,237000000.0,2009.0,936,7.9,1.78,33000
1,Color,goreverbinski,302.0,169.0,563,1000.0,orlandobloom,40000,309404000.0,"['action', 'adventure', 'fantasy']",johnnydepp,Pirates of the Caribbean: At World's End,471220,48350,jackdavenport,0,"['goddess', 'marriage ceremony', 'marriage proposal', 'pirate', 'singapore']",http://www.imdb.com/title/tt0449088/?ref_=fn_tt_tt_1,1238.0,English,USA,PG-13,300000000.0,2007.0,5000,7.1,2.35,0
2,Color,sammendes,602.0,148.0,0,161.0,rorykinnear,11000,200074000.0,"['action', 'adventure', 'thriller']",christophwaltz,Spectre,275868,11700,stephaniesigman,1,"['bomb', 'espionage', 'sequel', 'spy', 'terrorist']",http://www.imdb.com/title/tt2379713/?ref_=fn_tt_tt_1,994.0,English,UK,PG-13,245000000.0,2015.0,393,6.8,2.35,85000
3,Color,christophernolan,813.0,164.0,22000,23000.0,christianbale,27000,448131000.0,"['action', 'thriller']",tomhardy,The Dark Knight Rises,1144337,106759,josephgordon-levitt,0,"['deception', 'imprisonment', 'lawlessness', 'police officer', 'terrorist plot']",http://www.imdb.com/title/tt1345836/?ref_=fn_tt_tt_1,2701.0,English,USA,PG-13,250000000.0,2012.0,23000,8.5,2.35,164000
4,,dougwalker,,,131,,robwalker,131,,['documentary'],dougwalker,Star Wars: Episode VII - The Force Awakens,8,143,,0,,http://www.imdb.com/title/tt5289954/?ref_=fn_tt_tt_1,,,,,,,12,7.1,,0


In [269]:
df=df.fillna('')
def create_metadata(x):
    return x['plot_keywords']+ ' ' + x['genres'] + ' ' + x['actor_1_name']+ ' ' + x['actor_2_name']+ ' ' + x['actor_3_name']+ ' ' + x['director_name'] 
            
df['metadata']=df.apply(create_metadata,axis=1)

In [270]:
count = CountVectorizer()
count_matrix = count.fit_transform(df['metadata'])
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [271]:
df = df.reset_index()
indices = pd.Series(df.index, index=df['movie_title'])

### First recommendation process, using a metadata soup that takes the director, genre, plot keywords and main 3 actors into account

In [342]:
def get_recommendations1(title):
    
    index=indices[title]
    sim_scores=list(enumerate(cosine_sim1[index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]
    movie_indices = [i[0] for i in sim_scores]
    return df[['movie_title','movie_imdb_link','imdb_score']].iloc[movie_indices]

In [320]:
count_matrix1 = count.fit_transform(df['genres'])
cosine_sim1 = cosine_similarity(count_matrix1, count_matrix1)

count_matrix2 = count.fit_transform(df['plot_keywords'])
cosine_sim2 = cosine_similarity(count_matrix2, count_matrix2)

count_matrix3 = count.fit_transform(df['director_name'])
cosine_sim3 = cosine_similarity(count_matrix3, count_matrix3)

count_matrix4 = count.fit_transform(df['actor_1_name'])
cosine_sim4 = cosine_similarity(count_matrix4, count_matrix4)

count_matrix5 = count.fit_transform(df['actor_2_name'])
cosine_sim5 = cosine_similarity(count_matrix5, count_matrix5)

count_matrix6 = count.fit_transform(df['actor_3_name'])
cosine_sim6 = cosine_similarity(count_matrix6, count_matrix6)

### Second recommendation engine will use weighed similarity scores for each feature in order of importance, then sort the most similar movies by IMDB score before suggesting them

In [350]:
def get_recommendations2(title):
    
    index=indices[title]
    sim_scores1=list(enumerate(cosine_sim1[index]))
    sim_scores2=list(enumerate(cosine_sim2[index]))
    sim_scores3=list(enumerate(cosine_sim3[index]))
    sim_scores4=list(enumerate(cosine_sim4[index]))
    sim_scores5=list(enumerate(cosine_sim5[index]))
    sim_scores6=list(enumerate(cosine_sim6[index]))
    sim_scores=[score1[1]+0.75*score2[1]+0.5*score3[1]+0.1*score4[1]+0.1*score5[1]+0.1*score6[1] for score1,score2,score3,score4,score5,score6 in zip(sim_scores1,sim_scores2,sim_scores3,sim_scores4,sim_scores5,sim_scores6)]
    weighted_scores = pd.Series(sim_scores)
    weighted_scores=weighted_scores.sort_values(ascending=False)
    weighted_scores = weighted_scores[1:20]    
    movie_indices = [i for i in weighted_scores.index]
    final_scores=weighted_scores+df['imdb_score'].iloc[movie_indices]
    final_scores=final_scores.sort_values(ascending=False)
    final_scores=final_scores[0:5]
    movie_indices = [i for i in final_scores.index]
    return df[['movie_title','movie_imdb_link','imdb_score']].iloc[movie_indices]

In [352]:
print(get_recommendations1('Carrie'))

              movie_title                                    movie_imdb_link  \
1441  Queen of the Damned  http://www.imdb.com/title/tt0238546/?ref_=fn_t...   
1646               Carrie  http://www.imdb.com/title/tt1939659/?ref_=fn_t...   
136           The Wolfman  http://www.imdb.com/title/tt0780653/?ref_=fn_t...   
531           Constantine  http://www.imdb.com/title/tt3489184/?ref_=fn_t...   
1123         The Returned  http://www.imdb.com/title/tt2521668/?ref_=fn_t...   

      imdb_score  
1441         5.2  
1646         5.9  
136          5.8  
531          7.5  
1123         8.3  


In [353]:
print(get_recommendations2('Carrie'))

                                            movie_title  \
2333                                        The Shining   
1123                                       The Returned   
931   Interview with the Vampire: The Vampire Chroni...   
531                                         Constantine   
1425                                        Poltergeist   

                                        movie_imdb_link  imdb_score  
2333  http://www.imdb.com/title/tt0081505/?ref_=fn_t...         8.4  
1123  http://www.imdb.com/title/tt2521668/?ref_=fn_t...         8.3  
931   http://www.imdb.com/title/tt0110148/?ref_=fn_t...         7.6  
531   http://www.imdb.com/title/tt3489184/?ref_=fn_t...         7.5  
1425  http://www.imdb.com/title/tt0084516/?ref_=fn_t...         7.4  
