In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from surprise.model_selection import cross_validate
from surprise import Reader, Dataset, SVD, SVDpp,SlopeOne,NMF,NormalPredictor,KNNBaseline,KNNBasic,KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering

In [2]:
#movie recommendation using top movies in the genre
md=pd.read_csv('data/dataset/movies_metadata.csv')

In [3]:
#creating top movies chart in each genre

md['id']=md['id'].astype('int')
md['id']

0           862
1          8844
2         15602
3         31357
4         11862
          ...  
45458    439050
45459    111109
45460     67758
45461    227506
45462    461257
Name: id, Length: 45463, dtype: int32

In [4]:
#to check top charts using IMDB formula----weighted rating = ((v/v+m).R) + ((v/v+m).C)
#v is the number of votes for the movie
#m is the minimum votes required to be listed in the chart
#R is the average rating of the movie
#C is the mean vote across the whole report

In [5]:
from ast import literal_eval
md['genres']=md['genres'].fillna('[]').apply(literal_eval)
md['genres']=md['genres'].map(lambda x:[i['name'] for i in x] if isinstance(x,list) else [])


In [6]:
md['vote_count']=md['vote_count'].fillna(0)
#total vote counts
#total_votes=md['vote_count'].sum()
#print(total_votes)
#minimum votes require-->95%
vote_min=md['vote_count'].astype('int').quantile(0.95).astype(int)
#mean value for vote
C=md[md['vote_average'].notnull()]['vote_average'].mean()

In [7]:
#creating a column for year
md['year']=pd.to_datetime(md['release_date'],errors='coerce').map(lambda x: str(x).split('-')[0] if x!=np.nan else np.nan)

In [8]:
#modifying spoken_lagnuages column to consist an array
md['languages']=md['spoken_languages'].fillna('[]').apply(literal_eval).map(lambda x:[i['name'] for i in x] if isinstance(x,list) else [])

In [9]:
#creating a small dataset with vote_count > minimum value ,is not null and remove unnecessary columns
qual=md[(md['vote_count'] >=vote_min) & (md['vote_count'].notnull())][['adult','budget','genres','id','imdb_id','original_language','original_title','revenue','languages','vote_average','vote_count','year']]

In [10]:
qual.info()
#qual['vote_count']

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2282 entries, 0 to 45011
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   adult              2282 non-null   bool   
 1   budget             2282 non-null   int64  
 2   genres             2282 non-null   object 
 3   id                 2282 non-null   int32  
 4   imdb_id            2282 non-null   object 
 5   original_language  2282 non-null   object 
 6   original_title     2282 non-null   object 
 7   revenue            2282 non-null   float64
 8   languages          2282 non-null   object 
 9   vote_average       2282 non-null   float64
 10  vote_count         2282 non-null   float64
 11  year               2282 non-null   object 
dtypes: bool(1), float64(3), int32(1), int64(1), object(6)
memory usage: 207.3+ KB


In [11]:
#calculating weighted rating of each movie
#a=qual.apply(lambda x: (x.vote_count/(x.vote_count+vote_min))*C + (x.vote_count/(x.vote_count+vote_min))*x.vote_average)
def rating(row):
    return ((row['vote_count']/(row['vote_count']+vote_min))*row['vote_average']) + (vote_min/(row['vote_count']+vote_min))*C
    #row['vote_count']=row['vote_count']*2
qual['weighted_rating']=qual.apply(rating,axis=1)
#qual['rating']=qual.apply(lambda x: (x.vote_count/(x.vote_count+vote_min))*C + (x.vote_count/(x.vote_count+vote_min))*x.vote_average ,axis=1)


In [12]:
#top 250 movies are 
qual.sort_values('weighted_rating',ascending=False).head(20)

Unnamed: 0,adult,budget,genres,id,imdb_id,original_language,original_title,revenue,languages,vote_average,vote_count,year,weighted_rating
314,False,25000000,"[Drama, Crime]",278,tt0111161,en,The Shawshank Redemption,28341470.0,[English],8.5,8358.0,1994,8.358058
834,False,6000000,"[Drama, Crime]",238,tt0068646,en,The Godfather,245066400.0,"[English, Italiano, Latin]",8.5,6024.0,1972,8.30675
12481,False,185000000,"[Drama, Action, Crime, Thriller]",155,tt0468569,en,The Dark Knight,1004558000.0,"[English, 普通话]",8.3,12269.0,2008,8.20858
2843,False,63000000,[Drama],550,tt0137523,en,Fight Club,100853800.0,[English],8.3,9678.0,1999,8.185153
292,False,8000000,"[Thriller, Crime]",680,tt0110912,en,Pulp Fiction,213928800.0,"[English, Español, Français]",8.3,8670.0,1994,8.172436
351,False,55000000,"[Comedy, Drama, Romance]",13,tt0109830,en,Forrest Gump,677945400.0,[English],8.2,8147.0,1994,8.069707
522,False,22000000,"[Drama, History, War]",424,tt0108052,en,Schindler's List,321365600.0,"[Deutsch, Polski, עִבְרִית, English]",8.3,4436.0,1993,8.061508
23672,False,3300000,[Drama],244786,tt2582802,en,Whiplash,13092000.0,[English],8.3,4376.0,2014,8.058533
5481,False,15000000,"[Fantasy, Adventure, Animation, Family]",129,tt0245429,ja,千と千尋の神隠し,274925100.0,[日本語],8.3,3968.0,2001,8.036147
1154,False,18000000,"[Adventure, Action, Science Fiction]",1891,tt0080684,en,The Empire Strikes Back,538400000.0,[English],8.2,5998.0,1980,8.026168


In [13]:
#movie should have votes more than vot_min to be considered

In [14]:
#get all unique values for genres
genres=[]
for x in md['genres']:
    [genres.append(i) for i in x]

genres=set(genres)


In [15]:
#creating movie recommendation model as per genres (will return top 5 or what ever is paassed)
#creating function that accepts genre
#df=pd.DataFrame()

df=qual
df['gen']=0
def found(a,x,ic):
    if x in a['genres'] and (a.gen==0) :
        return ic
    else:
        return a.gen

def recommender_1(genre,val):
    ind=1
    for gen in genre:
        df['gen']=df.apply(found,args=(gen,ind ),axis='columns')
        ind=ind+1
    v=df[df['gen']!=0].groupby('gen').apply(lambda x:x.sort_values(['weighted_rating'],ascending=False)).reset_index(drop=True)
    return v.groupby('gen').head(val)
f=recommender_1(['Comedy','Animation'],5)
#df
#gen_md.head()

In [16]:
#printing top 5 movies 0for both genres selected
def get_recommendations_1(name):
    genres_list=qual[qual['original_title']==name].genres
    val=recommender_1(genres_list,3)
    return val
get_recommendations_1('The Godfather')

Unnamed: 0,adult,budget,genres,id,imdb_id,original_language,original_title,revenue,languages,vote_average,vote_count,year,weighted_rating,gen
0,False,55000000,"[Comedy, Drama, Romance]",13,tt0109830,en,Forrest Gump,677945399.0,[English],8.2,8147.0,1994,8.069707,1
1,False,20000000,"[Comedy, Drama]",637,tt0118799,it,La vita è bella,229400000.0,"[English, Deutsch, Italiano]",8.3,3643.0,1997,8.015109,1
2,False,13000000,"[Drama, Comedy]",77338,tt1675434,fr,Intouchables,426480871.0,"[English, Français]",8.2,5410.0,2011,8.008674,1
748,False,15000000,"[Fantasy, Adventure, Animation, Family]",129,tt0245429,ja,千と千尋の神隠し,274925095.0,[日本語],8.3,3968.0,2001,8.036147,2
749,False,45000000,"[Family, Animation, Drama]",8587,tt0110357,en,The Lion King,788241776.0,[English],8.0,5520.0,1994,7.826757,2
750,False,24000000,"[Fantasy, Animation, Adventure]",4935,tt0347149,ja,ハウルの動く城,234710455.0,[日本語],8.2,2049.0,2004,7.749591,2


In [17]:
#modifying genre based recommendation as comdey,drama cannot be put in similar to crim,drama just because both have genre-drama


In [18]:
#recommeder as per cast
#import data that has cast
cast_data=pd.read_csv('data/dataset/credits.csv')

md['id']=md['id'].astype('int')
#merge this with main dataset
md=md.merge(cast_data,on='id')


In [19]:
#md.head(2)
md

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,status,tagline,title,video,vote_average,vote_count,year,languages,cast,crew
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Released,,Toy Story,False,7.7,5415.0,1995,[English],"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,"[English, Français]","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995,[English],"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995,[English],"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995,[English],"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45533,False,,0,"[Drama, Family]",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,...,Released,Rising and falling between a man and woman,Subdue,False,4.0,1.0,NaT,[فارسی],"[{'cast_id': 0, 'character': '', 'credit_id': ...","[{'credit_id': '5894a97d925141426c00818c', 'de..."
45534,False,,0,[Drama],,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,...,Released,,Century of Birthing,False,9.0,3.0,2011,[],"[{'cast_id': 1002, 'character': 'Sister Angela...","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de..."
45535,False,,0,"[Action, Drama, Thriller]",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",...,Released,A deadly game of wits.,Betrayal,False,3.8,6.0,2003,[English],"[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...","[{'credit_id': '52fe4776c3a368484e0c8387', 'de..."
45536,False,,0,[],,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",...,Released,,Satan Triumphant,False,0.0,0.0,1917,[],"[{'cast_id': 2, 'character': '', 'credit_id': ...","[{'credit_id': '533bccebc3a36844cf0011a7', 'de..."


In [20]:
#checking director
#director=cast_data[['crew','id']]
#director['crew']=director['crew'].apply(literal_eval)
md['crew']=md['crew'].apply(literal_eval)
def get_director(row):
    #print(row)
    #print(1)
    for a in row:
        if a['job']=='Director':
            return a['name']
    return "NA"
md['director']=md['crew'].apply(get_director)
#md['cast']

In [21]:
#for rest of the crew lets take only the fellow whose count is more than 90%
#top 10 crew
md['cast']=md['cast'].apply(literal_eval)
md['cast']=md['cast'].apply(lambda x:[a['name'] for a in x] if isinstance(x,list) else [])



In [22]:
cast_list=[]
for x in md['cast']:
    [cast_list.append(i) for i in x]
cast_li=pd.DataFrame()
cast_li['name']=cast_list
cast_li['cn']=1
cast_li=cast_li.groupby('name').cn.agg([sum]).reset_index().sort_values(by='sum',ascending=False)
cast_li.head(200)
#involves a lot of side actors.. Hence take first 4-5 characters for each movie

Unnamed: 0,name,sum
19954,Bess Flowers,241
34454,Christopher Lee,148
94329,John Wayne,125
169114,Samuel L. Jackson,123
132443,Michael Caine,110
...,...,...
127822,Mary Steenburgen,61
114372,Liev Schreiber,61
171428,Sean Connery,61
31350,Charlotte Rampling,61


In [23]:
#get all unique values for Cast
md['cast']=md['cast'].apply(lambda x: x[:3] if len(x)>=3 else x)
md['cast']

0                      [Tom Hanks, Tim Allen, Don Rickles]
1           [Robin Williams, Jonathan Hyde, Kirsten Dunst]
2               [Walter Matthau, Jack Lemmon, Ann-Margret]
3        [Whitney Houston, Angela Bassett, Loretta Devine]
4               [Steve Martin, Diane Keaton, Martin Short]
                               ...                        
45533          [Leila Hatami, Kourosh Tahami, Elham Korda]
45534           [Angel Aquino, Perry Dizon, Hazel Orencio]
45535         [Erika Eleniak, Adam Baldwin, Julie du Page]
45536    [Iwan Mosschuchin, Nathalie Lissenko, Pavel Pa...
45537                                                   []
Name: cast, Length: 45538, dtype: object

In [24]:
#formating and using keywords

keywds=pd.read_csv('data/dataset/keywords.csv')
keywds['keywords']=keywds['keywords'].apply(literal_eval)
keywds['keywords']=keywds['keywords'].apply(lambda x:[i['name'] for i in x] if isinstance(x,list) else [])
md=md.merge(keywds,on='id')


In [25]:
md=md.drop('crew',axis=1)

In [26]:
#checking unique keywords
ke=md.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(drop=True)
ke.name='keyword'
ke.value_counts()

  


woman director             3128
independent film           1942
murder                     1314
based on novel              841
musical                     734
                           ... 
seinen                        1
gus                           1
investing                     1
shakespeare's king lear       1
star wars spoof scene         1
Name: keyword, Length: 19956, dtype: int64

In [27]:
a=[]
for x in md['keywords']:
    [a.append(i) for i in x]
a=pd.Series(a,name='keyword')
a=a.value_counts()
#keep key words that occur more than once
a=a[a>1]

In [28]:

def filter_keywords(x):
    words = []
    for i in x:
        if i in a:
            words.append(i)
    return words


In [29]:
md['keywords'] = md['keywords'].apply(filter_keywords)
#remove spaces betwwen keywords to take them as one 
md['keywords'] = md['keywords'].apply(lambda x: [str.lower(i.replace(" ","")) for i in x])
#use lemma to avoid getting different count for similar words
md['keywords']=md['keywords'].apply(lambda x:' '.join(x))

md.head(2)
#using smaller dataset for matching for now to check model

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,tagline,title,video,vote_average,vote_count,year,languages,cast,director,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,,Toy Story,False,7.7,5415.0,1995,[English],"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,jealousy toy boy friendship friends rivalry bo...
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,"[English, Français]","[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,boardgame disappearance basedonchildren'sbook ...


In [30]:
import en_core_web_lg
nlp = en_core_web_lg.load()
z=md['keywords'].apply(lambda x:" ".join([token.lemma_ for token in nlp(x)]))
z

0        jealousy toy boy friendship friend rivalry boy...
1        boardgame disappearance basedonchildren'sbook ...
2           fishing bestfriend duringcreditsstinger oldmen
3        basedonnovel interracialrelationship singlemot...
4        baby midlifecrisis confidence age daughter mot...
                               ...                        
46623                                           tragiclove
46624                                    artist play pinoy
46625                                                     
46626                                                     
46627                                                     
Name: keywords, Length: 46628, dtype: object

In [31]:
md['keywords']=z.map(lambda x:[x])
md.head(5)
#md[md['id'].isin(links_small)]
#links_small

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,tagline,title,video,vote_average,vote_count,year,languages,cast,director,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,,Toy Story,False,7.7,5415.0,1995,[English],"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,[jealousy toy boy friendship friend rivalry bo...
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,"[English, Français]","[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,[boardgame disappearance basedonchildren'sbook...
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995,[English],"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,[fishing bestfriend duringcreditsstinger oldmen]
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995,[English],"[Whitney Houston, Angela Bassett, Loretta Devine]",Forest Whitaker,[basedonnovel interracialrelationship singlemo...
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995,[English],"[Steve Martin, Diane Keaton, Martin Short]",Charles Shyer,[baby midlifecrisis confidence age daughter mo...


In [32]:
links_small=pd.read_csv('data/dataset/links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')
links_small.columns=['movieId','imdbId','id']
sd=md[md['id'].isin(links_small)]
#sd=sd.reset_index()
sd['tagline']=sd['tagline'].fillna('')
sd['description']=sd['overview']+sd['tagline']
sd['description']=sd['description'].fillna('')
sd=sd.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [33]:
#md[md['id'].isin(links_small)]
#removing spaces between names to avoid similarity between people with same first name or last name
sd['cast']=sd['cast'].apply(lambda x:[str.lower(i.replace(" ","")) for i in x])
sd['director']=sd['director'].apply(lambda x:str.lower(x.replace(" ","")))
#gving more importance to director by 

In [34]:
md[md['id'].isin(links_small)]
#links_small
sd

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,title,video,vote_average,vote_count,year,languages,cast,director,keywords,description
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Toy Story,False,7.7,5415.0,1995,[English],"[tomhanks, timallen, donrickles]",johnlasseter,[jealousy toy boy friendship friend rivalry bo...,"Led by Woody, Andy's toys live happily in his ..."
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Jumanji,False,6.9,2413.0,1995,"[English, Français]","[robinwilliams, jonathanhyde, kirstendunst]",joejohnston,[boardgame disappearance basedonchildren'sbook...,When siblings Judy and Peter discover an encha...
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,Grumpier Old Men,False,6.5,92.0,1995,[English],"[waltermatthau, jacklemmon, ann-margret]",howarddeutch,[fishing bestfriend duringcreditsstinger oldmen],A family wedding reignites the ancient feud be...
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,Waiting to Exhale,False,6.1,34.0,1995,[English],"[whitneyhouston, angelabassett, lorettadevine]",forestwhitaker,[basedonnovel interracialrelationship singlemo...,"Cheated on, mistreated and stepped on, the wom..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,Father of the Bride Part II,False,5.7,173.0,1995,[English],"[stevemartin, dianekeaton, martinshort]",charlesshyer,[baby midlifecrisis confidence age daughter mo...,Just when George Banks has recovered from his ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9214,False,,8000000,[Drama],,159550,tt0255313,en,The Last Brickmaker in America,A man must cope with the loss of his wife and ...,...,The Last Brickmaker in America,False,7.0,1.0,2001,[],"[sidneypoitier, wendycrewson, jayo.sanders]",greggchampion,[friendship],A man must cope with the loss of his wife and ...
9215,False,,1000000,"[Thriller, Romance]",,392572,tt5165344,hi,रुस्तम,"Rustom Pavri, an honourable officer of the Ind...",...,Rustom,False,7.3,25.0,2016,[हिन्दी],"[akshaykumar, ileanad'cruz, eshagupta]",tinusureshdesai,[bollywood],"Rustom Pavri, an honourable officer of the Ind..."
9216,False,,15050000,"[Adventure, Drama, History, Romance]",,402672,tt3859980,hi,Mohenjo Daro,"Village lad Sarman is drawn to big, bad Mohenj...",...,Mohenjo Daro,False,6.7,26.0,2016,[हिन्दी],"[hrithikroshan, poojahegde, kabirbedi]",ashutoshgowariker,[bollywood],"Village lad Sarman is drawn to big, bad Mohenj..."
9217,False,,15000000,"[Action, Adventure, Drama, Horror, Science Fic...",,315011,tt4262980,ja,シン・ゴジラ,From the mind behind Evangelion comes a hit la...,...,Shin Godzilla,False,6.6,152.0,2016,"[Italiano, Deutsch, English, 日本語]","[hirokihasegawa, yutakatakenouchi, satomiishih...",hideakianno,[monster godzilla giantmonster destruction kaiju],From the mind behind Evangelion comes a hit la...


In [35]:
sd['director']=sd['director'].apply(lambda x:[x,x,x,x,x])
#joining all coumns to apply cosine similarity
sd['col']=sd['cast']+ sd['director']+ sd['keywords']
sd['col']=sd['col'].apply(lambda x:' '.join(x))
sd.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,video,vote_average,vote_count,year,languages,cast,director,keywords,description,col
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,False,7.7,5415.0,1995,[English],"[tomhanks, timallen, donrickles]","[johnlasseter, johnlasseter, johnlasseter, joh...",[jealousy toy boy friendship friend rivalry bo...,"Led by Woody, Andy's toys live happily in his ...",tomhanks timallen donrickles johnlasseter john...
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,False,6.9,2413.0,1995,"[English, Français]","[robinwilliams, jonathanhyde, kirstendunst]","[joejohnston, joejohnston, joejohnston, joejoh...",[boardgame disappearance basedonchildren'sbook...,When siblings Judy and Peter discover an encha...,robinwilliams jonathanhyde kirstendunst joejoh...


In [36]:
#using count vectorizer here for keywords because no stop words and every word has equal weights and will not be biased wby stop words
cf=CountVectorizer(analyzer='word',ngram_range=(1,2),min_df=0,stop_words='english')
cf_matrix=cf.fit_transform(sd['col'])

In [37]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(sd['description'])
tfidf_matrix.shape

(9219, 268123)

In [61]:
#check cosine similarity for col
cosine_sim = cosine_similarity(cf_matrix, cf_matrix)

In [39]:
#checking cosine similarity for description
#cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
#t=[]

In [73]:
def get_recommendation(title):
    index=sd[sd['title']==title].index[0]
    scores=list(enumerate(cosine_sim[index]))
    #scores=cosine_sim[index]
    scores=sorted(scores,key=lambda x:x[1],reverse=True)
    scores=scores[1:31]
    t=scores[1:31]
    mov_id=[i[0] for i in scores]
    return sd.iloc[mov_id,8]
get_recommendation('The Dark Knight')

8031                   The Dark Knight Rises
6218                           Batman Begins
6623                            The Prestige
7648                               Inception
2085                               Following
4145                                Insomnia
3381                                 Memento
8613                            Interstellar
7659              Batman: Under the Red Hood
1134                          Batman Returns
2131                                Superman
8478    Justice League: Crisis on Two Earths
9024      Batman v Superman: Dawn of Justice
8001                        Batman: Year One
2132                             Superman II
6521                        Superman Returns
8467                              Kick-Ass 2
524                                   Batman
8419                            Man of Steel
1260                          Batman & Robin
2133                            Superman III
8872              Captain America: Civil War
6733      

In [41]:
#importing ratings data
ratings=pd.read_csv('data/dataset/ratings_small.csv')

In [42]:
reader = Reader(rating_scale=(0,5))
da= Dataset.load_from_df(ratings[['userId','movieId','rating']],reader)
#da.split(n_folds=5)


In [43]:
#trainset=da.build_full_trainset()
#Benchmarkinf the algorithms to see the best one
bm=[]
#iterating over all algorithms
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    #performing cross validatio
    results =cross_validate(algorithm,da,measures=['RMSE','MAE'],cv=3,verbose=False)
    #Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['algorithm']))
    bm.append(tmp)
pd.DataFrame(bm).set_index('algorithm').sort_values('test_rmse')    

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,test_mae,fit_time,test_time
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SVDpp,0.893409,0.685823,226.751324,9.959825
BaselineOnly,0.896598,0.693453,0.195702,0.133701
SVD,0.902727,0.695724,3.295737,0.603672
KNNBaseline,0.905773,0.693399,0.321073,2.670678
KNNWithMeans,0.926811,0.709506,0.171368,2.292376
KNNWithZScore,0.929816,0.707449,0.218723,2.596689
SlopeOne,0.945905,0.723115,3.145252,8.680123
NMF,0.959642,0.737292,4.352318,0.793733
CoClustering,0.975928,0.755673,2.576384,0.256796
KNNBasic,0.978664,0.753128,0.142699,2.010456


In [44]:
svd=SVD()
trainset=da.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1576f657a88>

In [None]:
# final recommendation
#def final_model(userid,title):
ids=pd.read_csv('data/dataset/links_small.csv')[['movieId','tmdbId']]
#map tmdb id to id for ratings
ids.columns=['movieId','id']
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan
ids['id']=ids['id'].apply(convert_int)
ids=ids.merge(sd[['title','id']], on='id').set_index('title')
indices_map = ids.set_index('id')

In [58]:
indices=pd.Series(sd.index, index=sd['title'])
title=sd['title']

In [72]:
def final_recommendor(userid,title):
    index=indices[title]
    tmdbid=ids.loc[title]['id']
    movie_id=ids.loc[title]['movieId']
    sim_scores=list(enumerate(cosine_sim[int(index)]))
    sim_scores=sorted(sim_scores,key=lambda x:x[1],reverse=True)
    sim_scores=sim_scores[1:26]
    movie_indices=[i[0] for i in sim_scores]
    movies=sd.iloc[movie_indices][['title','vote_count','vote_average','year','id']]
    movies['est']=movies['id'].apply(lambda x: svd.predict(userid, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(30)
final_recommendor(1, 'The Terminator')
#get_recommendation('The Dark knight')

IndexError: index 0 is out of bounds for axis 0 with size 0