#                            Movie Recommender System

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics.pairwise import linear_kernel,cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader,Dataset,SVD
from surprise.model_selection import cross_validate

import warnings;
warnings.simplefilter('ignore')

## Simple Recommender based on TMBD rating formula 

In [2]:
md = pd.read_csv(r"C:\Users\hp\Documents\Movielens dataset\movies_metadata.csv")
md['genres']=md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x,list) else [])
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


## Formula Parameter Calculation

Formula: (v/(v+m))*R+(m/(m+v))*C
v: number of votes for the movie
R:average rating of the movie
C:mean vote across the whole report
m:minimum votes reqd. to be listed 

In [66]:
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C=vote_averages.mean()
m=vote_counts.quantile(0.95)
m,C

(425.0, 5.238696808510638)

In [4]:
md['year']=pd.to_datetime(md['release_date'],errors='coerce').apply(lambda x: str(x).split('-')[0] if x!=np.nan else np.nan)
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995


In [5]:
qualified=md[(md['vote_count']>=m) & (md['vote_count'].notnull()) & (md['vote_average'].notnull())][['title','year','vote_count','vote_average','popularity','genres']]
qualified['vote_count']=qualified['vote_count'].astype('int')
qualified['vote_average']=qualified['vote_average'].astype('int')
qualified.shape

(2274, 6)

In [6]:
def weighted(x):
    v=x['vote_count']
    R=x['vote_average']
    return (v/(v+m))*R+(m/(v+m))*C

In [7]:
qualified['wr']=qualified.apply(weighted,axis=1)

In [8]:
qualified=qualified.sort_values('wr',ascending=False).head(250)
qualified.head()

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010,14075,8,29.1081,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588
12481,The Dark Knight,2008,12269,8,123.167,"[Drama, Action, Crime, Thriller]",7.905871
22879,Interstellar,2014,11187,8,32.2135,"[Adventure, Drama, Science Fiction]",7.897107
2843,Fight Club,1999,9678,8,63.8696,[Drama],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.0707,"[Adventure, Fantasy, Action]",7.871787


In [9]:
s = md.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_md = md.drop('genres', axis=1).join(s)

In [10]:
def build_chart(genre,percentile=0.85):
    df=gen_md[gen_md['genre']==genre]
    vote_counts=df[df['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages=df[df['vote_average'].notnull()]['vote_average'].astype('int')
    C=vote_averages.mean()
    m=vote_counts.quantile(percentile)
    
    qualified = df[(df['vote_count'] >= m) & (df['vote_count'].notnull()) & (df['vote_average'].notnull())][['title', 'year', 'vote_count', 'vote_average', 'popularity']]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    
    qualified['wr'] = qualified.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) + (m/(m+x['vote_count']) * C), axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    
    return qualified

Top 15 Romantic Movies

In [11]:
build_chart('Romance').head(15)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457,8.565285
351,Forrest Gump,1994,8147,8,48.3072,7.971357
876,Vertigo,1958,1162,8,18.2082,7.811667
40251,Your Name.,2016,1030,8,34.461252,7.789489
883,Some Like It Hot,1959,835,8,11.8451,7.745154
1132,Cinema Paradiso,1988,834,8,14.177,7.744878
19901,Paperman,2012,734,8,7.19863,7.713951
37863,Sing Street,2016,669,8,10.672862,7.689483
882,The Apartment,1960,498,8,11.9943,7.599317
38718,The Handmaiden,2016,453,8,16.727405,7.566166


# Content Based Recommender

#### 1.Based on Movie description(metadata or content) :Overview,Tagline
2.Cast,Crew,Keywords,Genre

In [12]:
links_small=pd.read_csv(r"C:\Users\hp\Documents\Movielens dataset\links_small.csv") #9125 examples of three columns
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [13]:
md = md.drop([19730, 29503, 35587])

In [14]:
md['id']=md['id'].astype('int')
smd=md[md['id'].isin(links_small)]
smd.shape

(9099, 25)

In [15]:
smd['tagline']=smd['tagline'].fillna('')
smd['description']=smd['overview']+smd['tagline']
smd['description']=smd['description'].fillna('')

In [16]:
tf=TfidfVectorizer(analyzer='word',ngram_range=(1,2),min_df=0,stop_words='english')
tfidf_matrix=tf.fit_transform(smd['description'])
tfidf_matrix.shape

(9099, 268124)

In [17]:
cosine_sim=linear_kernel(tfidf_matrix,tfidf_matrix)
cosine_sim[0]

array([1.        , 0.00680476, 0.        , ..., 0.        , 0.00344913,
       0.        ])

predicting the top 30 most similar movies based on cosine similarity 

In [18]:
smd=smd.reset_index()
titles=smd['title']
indices=pd.Series(smd.index,index=smd['title'])

In [19]:
def get_recommendation(title):
    idx=indices[title]
    sim_scores=list(enumerate(cosine_sim[idx]))
    sim_scores=sorted(sim_scores,key=lambda x:x[1],reverse=True)
    sim_scores=sim_scores[1:31]
    movie_indices=[i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [20]:
get_recommendation('The Godfather').head(10)

973      The Godfather: Part II
8387                 The Family
3509                       Made
4196         Johnny Dangerously
29               Shanghai Triad
5667                       Fury
2412             American Movie
1582    The Godfather: Part III
4221                    8 Women
2159              Summer of Sam
Name: title, dtype: object

In [21]:
get_recommendation('The Dark Knight').head(10)

7931                      The Dark Knight Rises
132                              Batman Forever
1113                             Batman Returns
8227    Batman: The Dark Knight Returns, Part 2
7565                 Batman: Under the Red Hood
524                                      Batman
7901                           Batman: Year One
2579               Batman: Mask of the Phantasm
2696                                        JFK
8165    Batman: The Dark Knight Returns, Part 1
Name: title, dtype: object

#### Based on Metadata:cast,crew,genre,director

In [22]:
credits=pd.read_csv(r"C:\Users\hp\Documents\Movielens dataset\credits.csv")
keywords=pd.read_csv(r"C:\Users\hp\Documents\Movielens dataset\keywords.csv")

In [23]:
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [24]:
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [25]:
keywords['id']=keywords['id'].astype('int')
credits['id']=credits['id'].astype('int')
md['id']=md['id'].astype('int')

In [26]:
md.shape

(45463, 25)

In [27]:
md=md.merge(credits,on='id')
md=md.merge(keywords,on='id')
md.shape

(46628, 28)

In [28]:
smd=md[md['id'].isin(links_small)]
smd.shape

(9219, 28)

In [29]:
smd.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,status,tagline,title,video,vote_average,vote_count,year,cast,crew,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Released,,Toy Story,False,7.7,5415.0,1995,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


##### Changes needed for better performance
1. Cast-Picking only the top three major actors from the movie
2. Crew-Most contribution to be done by director name,so we discard the rest of them.

In [30]:
smd['cast']=smd['cast'].apply(literal_eval)
smd['crew']=smd['crew'].apply(literal_eval)
smd['keywords']=smd['keywords'].apply(literal_eval)

smd['cast_size']=smd['cast'].apply(lambda x: len(x))
smd['crew_size']=smd['crew'].apply(lambda x: len(x))

In [31]:
def get_director(x):
    for i in x:
        if i['job']=='Director':
            return i['name']
        return np.nan

In [32]:
smd['director']=smd['crew'].apply(get_director)

In [33]:
smd['cast']=smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x,list) else [])
smd['cast']=smd['cast'].apply(lambda x: x[:3] if len(x)>=3 else x)

In [34]:
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

###### Further steps to be followed:
1. Strip the whitespaces and convert names to lowercases alone
2. Mention the name of the director 3 times to decrease bias
3. We maintain keyword counts

In [35]:
smd['cast']=smd['cast'].apply(lambda x: [str.lower(i.replace(" ","")) for i in x])

In [36]:
smd['director']=smd['director'].astype('str').apply(lambda x: [str.lower(x.replace(" ",""))])
smd['director']=smd['director'].apply(lambda x:[x,x,x])

In [37]:
s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

In [38]:
s=s.value_counts()
s[:5]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keyword, dtype: int64

In [39]:
s=s[s>1]

In [40]:
stemmer=SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [41]:
def filter_keywords(x):
    words=[]
    for i in x:
        if i in s:
            words.append(i)
    return words

In [42]:
smd['keywords']=smd['keywords'].apply(filter_keywords)
smd['keywords']=smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords']=smd['keywords'].apply(lambda x: [str.lower(i.replace(" ","")) for i in x])

In [43]:
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x:' '.join([str(i) for i in x]))

In [44]:
count=CountVectorizer(analyzer='word',ngram_range=(1,2),min_df=0,stop_words='english')
count_matrix=count.fit_transform(smd['soup'])

In [45]:
cosine_sim=cosine_similarity(count_matrix,count_matrix)
cosine_sim.shape

(9219, 9219)

In [46]:
smd=smd.reset_index()
titles=smd['title']
indices=pd.Series(smd.index,index=smd['title'])

In [47]:
get_recommendation('The Dark Knight').head(10)

8031         The Dark Knight Rises
5098                  The Enforcer
6218                 Batman Begins
5943                      Thursday
217               The Glass Shield
2272                   In Too Deep
6349           Menolippu Mombasaan
8026            Bullet to the Head
7659    Batman: Under the Red Hood
5073                 The Statement
Name: title, dtype: object

In [48]:
get_recommendation('Forrest Gump').head(10)

3990                      Used Cars
3052              What Lies Beneath
9031                       The Walk
1274                        Contact
2414        Who Framed Roger Rabbit
1933            Romancing the Stone
1591    Back to the Future Part III
1041             Back to the Future
2539                  Stealing Home
3628                The Big Picture
Name: title, dtype: object

###### Popularity and Ratings

I will take the top 25 movies based on similarity scores and calculate the vote of the 60th percentile movie. Then, using this as the value of  m , we will calculate the weighted rating of each movie using IMDB's formula like we did in the Simple Recommender section.

In [49]:
def improved_recommendation(title):
    idx=indices[title]
    sim_scores=list(enumerate(cosine_sim[idx]))
    sim_scores=sorted(sim_scores,key=lambda x:x[1],reverse=True)
    sim_scores=sim_scores[1:26]
    movie_indices=[i[0] for i in sim_scores]
    movies=smd.iloc[movie_indices][['title','vote_count','vote_average','year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C=vote_averages.mean()
    m=vote_counts.quantile(0.60)
    qualified=movies[(movies['vote_count']>=m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count']=qualified['vote_count'].astype('int')
    qualified['vote_average']=qualified['vote_average'].astype('int')
    qualified['wr']=qualified.apply(weighted,axis=1)
    qualified=qualified.sort_values('wr',ascending=False).head(10)
    return qualified


In [50]:
improved_recommendation('The Dark Knight')

Unnamed: 0,title,vote_count,vote_average,year,wr
8031,The Dark Knight Rises,9263,7,2012,6.921448
6218,Batman Begins,7511,7,2005,6.904127
7659,Batman: Under the Red Hood,459,7,2010,6.147016
7380,Bronson,756,6,2008,5.724609
7912,Takers,399,6,2010,5.606585
6551,Chaos,278,6,2005,5.539726
7673,Animal Kingdom,240,6,2010,5.513776
4871,Sleeping with the Enemy,228,6,1991,5.504962
5655,Ladder 49,213,6,2004,5.493486
8026,Bullet to the Head,490,5,2013,5.115027


In [51]:
improved_recommendation("Mean Girls")

Unnamed: 0,title,vote_count,vote_average,year,wr
7962,Chinese Take-Out,95,7,2011,5.560085
8459,Louis C.K.: Oh My God,66,7,2013,5.47657
8886,Louis C.K.: Live at The Comedy Store,58,7,2015,5.451799
3428,Nine to Five,102,6,1980,5.388592
8707,Premature,120,5,2014,5.19185
8320,Parental Guidance,212,5,2012,5.164528
8694,Tammy,508,5,2014,5.112829
5140,Welcome to Mooseport,54,4,2004,5.107142
8213,Lola Versus,55,4,2012,5.104878
8127,The Dictator,1789,5,2012,5.047812


# Collaborative Filtering
Collaborative Filtering is based on the idea that users similar to a me can be used to predict how much I will like a particular product or service those users have used/experienced but I have not.

Surprise package has a very powerful algorithm called Singular Value Decomposition(SVD) to make great recommendations by minimising the root mean square error. 

In [52]:
reader=Reader()

In [53]:
s=r'C:\Users\hp\Documents\Movielens dataset\ratings_small.csv'
ratings=pd.read_csv(s)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [54]:
#RMSE:root mean square error ,MAE:mean absolute error
data=Dataset.load_from_df(ratings[['userId','movieId','rating']],reader)
svd=SVD()
cross_validate(svd,data,measures=['RMSE','MAE'],cv=5,verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8975  0.8994  0.8986  0.8948  0.8938  0.8968  0.0022  
MAE (testset)     0.6897  0.6920  0.6963  0.6872  0.6876  0.6906  0.0033  
Fit time          15.94   11.65   10.39   8.99    9.16    11.23   2.54    
Test time         1.81    0.45    0.45    0.42    0.60    0.75    0.54    


{'test_rmse': array([0.89749145, 0.89944977, 0.89861768, 0.89475304, 0.89375194]),
 'test_mae': array([0.6896767 , 0.69201305, 0.69625827, 0.68724965, 0.68757387]),
 'fit_time': (15.940707683563232,
  11.648239850997925,
  10.39478611946106,
  8.991257667541504,
  9.163808584213257),
 'test_time': (1.8121583461761475,
  0.4499025344848633,
  0.4459850788116455,
  0.41530704498291016,
  0.6016807556152344)}

In [55]:
trainset=data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1f96503f630>

In [56]:
ratings[ratings['userId']==1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [57]:
svd.predict(1,302,3) #userid,movieid,rating

Prediction(uid=1, iid=302, r_ui=3, est=2.602736287635164, details={'was_impossible': False})

uid: userid, iid:itemid,r_ui:the true rating, est:estimated rating, details:stores additional details about the prediction that might be useful for later analysis

For movie with ID 302, we get an estimated prediction of 2.85. One startling feature of this recommender system is that it doesn't care what the movie is (or what it contains). It works purely on the basis of an assigned movie ID and tries to predict ratings based on how the other users have predicted the movie.

# Hybrid Recommender

We bring together techniques implemented in the content based and collaborative filtering methods.
Input: Userid and title
Output: Similar movies sorted on the basis of expected ratings by that particular user.

In [58]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [59]:
id_map=pd.read_csv(r'C:\Users\hp\Documents\Movielens dataset\links_small.csv')[['movieId','tmdbId']]
id_map['tmdbId']=id_map['tmdbId'].apply(convert_int)
id_map.columns=['movieId','id']
id_map=id_map.merge(smd[['title','id']],on='id').set_index('title')

In [60]:
id_map.head()

Unnamed: 0_level_0,movieId,id
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story,1,862.0
Jumanji,2,8844.0
Grumpier Old Men,3,15602.0
Waiting to Exhale,4,31357.0
Father of the Bride Part II,5,11862.0


In [61]:
indices_map=id_map.set_index('id')
indices_map.head()

Unnamed: 0_level_0,movieId
id,Unnamed: 1_level_1
862.0,1
8844.0,2
15602.0,3
31357.0,4
11862.0,5


In [62]:
def hybrid(userId, title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    #print(idx)
    movie_id = id_map.loc[title]['movieId']
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year', 'id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)

In [63]:
hybrid(1,'Toy Story')

Unnamed: 0,title,vote_count,vote_average,year,id,est
8595,The Lego Movie,3127.0,7.5,2014,137106,3.252641
3833,"Monsters, Inc.",6150.0,7.5,2001,585,3.167624
7629,Toy Story 3,4710.0,7.6,2010,10193,3.138439
2751,Creature Comforts,29.0,7.3,1989,54825,3.010791
7169,A Matter of Loaf and Death,120.0,7.2,2008,14447,2.988795
4341,"The Looney, Looney, Looney Bugs Bunny Movie",13.0,6.6,1981,41394,2.916329
7404,Cloudy with a Chance of Meatballs,1799.0,6.5,2009,22794,2.899921
2522,Toy Story 2,3914.0,7.3,1999,863,2.874368
7254,Kung Fu Panda: Secrets of the Furious Five,80.0,6.5,2008,15854,2.734238
6386,Luxo Jr.,148.0,7.1,1986,13925,2.701618


In [64]:
hybrid(500,'Avatar')

Unnamed: 0,title,vote_count,vote_average,year,id,est
7392,The Three Musketeers,2.0,3.0,1933,140887,3.278447
7705,Alice in Wonderland,8.0,5.4,1933,25694,3.259028
7091,Mutant Chronicles,142.0,5.1,2008,13256,3.199818
9006,Star Trek Beyond,2636.0,6.6,2016,188927,3.194199
4017,Hawk the Slayer,13.0,4.5,1980,25628,3.176383
3894,Spacehunter: Adventures in the Forbidden Zone,37.0,5.1,1983,26978,3.132024
7208,Replicant,93.0,5.0,2001,10596,3.036937
7088,Star Wars: The Clone Wars,434.0,5.8,2008,12180,3.022369
8555,Zatôichi on the Road,11.0,7.5,1963,18624,3.014867
6689,Epic Movie,334.0,3.2,2007,9760,2.994701


The project is hereby concluded.