In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD, evaluate

import warnings; warnings.simplefilter('ignore')

### Collaborative Filtering

The content-based recommender has a few limitations. It can only suggest movies that are close to a given
movie: recommendations are based solely on the movie queried and do not consider the user’s preferences.
It doesn’t capture users’ personal tastes or biases, or their favorite (or highly rated) movies.
So anyone querying for recommendations (based on a movie) will receive the same list of movies, irrespective of who that user is. Filtering and recommending based on information given by other users is known as collaborative filtering.


The collaborative approach is based on users with similar interests: the
recommender suggests movies to a user based on other users who highly rated similar
movies, or a subset of movies that they have in common.


To give an example: if person A likes items 1, 2, and 3, and person B likes items 2, 3, and 4,
then they have similar interests, so
A should like item 4 and B should like item 1. Figure 21 shows a visualization
explaining the user-based filtering concept.

In [2]:
# Load the movie metadata table and preview its first rows.
# NOTE(review): machine-specific absolute path; the notebook only runs where this file exists.
metadata_csv = r'C:\Users\Pravesh\Documents\Assignments\ML Team Projects\Project\Dataset\the-movies-dataset\movies_metadata.csv'
md = pd.read_csv(metadata_csv)
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
# Each raw `genres` value is the string form of a list of {'id', 'name'} dicts.
# Parse it and keep only the names; anything that does not parse to a list becomes [].
md['genres'] = (
    md['genres']
    .fillna('[]')
    .apply(literal_eval)
    .apply(lambda parsed: [genre['name'] for genre in parsed] if isinstance(parsed, list) else [])
)

In [4]:
# Release year as a string (e.g. '1995'); missing or unparseable dates become NaN.
# Missing timestamps (NaT) have to be detected with pd.notnull: a comparison such as
# `x != np.nan` is always True, which would turn every missing date into the string 'NaT'.
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(
    lambda x: str(x).split('-')[0] if pd.notnull(x) else np.nan
)

In [5]:
# Remove three rows by index label before `id` is cast to int.
# NOTE(review): presumably these are the rows whose `id` is not a valid integer; confirm against the CSV.
# errors='ignore' keeps the cell idempotent: re-running it after the rows are gone
# no longer raises KeyError.
md = md.drop([19730, 29503, 35587], errors='ignore')

In [6]:
# Store the movie id as an integer so it can be matched against the integer ids in the links table.
md['id'] = md['id'].astype(int)

In [7]:
# Read the links table, then reduce it to the integer `tmdbId` values that are present.
# Note that `links_small` ends up as a Series of ids, not a DataFrame.
links_small = pd.read_csv(r'C:\Users\Pravesh\Documents\Assignments\ML Team Projects\Project\Dataset\the-movies-dataset\links_small.csv')
has_tmdb_id = links_small['tmdbId'].notnull()
links_small = links_small.loc[has_tmdb_id, 'tmdbId'].astype('int')

In [8]:
# Keep only the movies whose id appears in the links table.
# .copy() makes `smd` an independent frame, so columns can be added to it later
# without pandas' SettingWithCopyWarning or any ambiguity about whether `md` is modified.
smd = md[md['id'].isin(links_small)].copy()
smd.shape

(9099, 25)

In [9]:
# Build the text that TF-IDF is fitted on: overview immediately followed by tagline.
# NOTE(review): the two strings are joined with no separator, so the last word of the
# overview fuses with the first word of the tagline; and a missing overview makes the
# whole description '' even when a tagline exists. Left unchanged so that results match
# the recorded outputs.
smd['tagline'] = smd['tagline'].fillna('')
smd['description'] = (smd['overview'] + smd['tagline']).fillna('')

In [10]:
# TF-IDF over unigrams and bigrams of the description text, with English stop words removed.
# min_df=1 keeps every term that occurs in at least one document. An integer min_df of 0
# selects the same vocabulary but is rejected by the parameter validation in recent
# scikit-learn releases.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

In [11]:
# (number of descriptions, number of distinct unigram/bigram terms)
tfidf_matrix.shape

(9099, 268124)

In [12]:
# Renumber the rows 0..n-1 (the previous index is kept as a column) so that row
# positions line up with the rows of the TF-IDF matrix built from `smd`.
# NOTE(review): not idempotent; each re-run of this cell inserts another index column.
smd = smd.reset_index()
titles = smd['title']
# Reverse lookup: movie title -> row position in `smd`.
indices = pd.Series(data=smd.index, index=titles)

In [13]:
# Pairwise dot products between all description vectors (a dense n x n matrix).
# NOTE(review): this equals cosine similarity only if the TF-IDF rows are unit-length,
# which is TfidfVectorizer's default (norm='l2'); confirm if the vectorizer settings change.
cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)


In [14]:
# Surprise reader with its default rating scale spelled out.
# NOTE(review): the ratings shown in this notebook include half-star values; confirm that
# 1-5 covers the full range present in the data.
reader = Reader(rating_scale=(1, 5))

In [15]:
# Load the user ratings (userId, movieId, rating, timestamp) and preview the first rows.
# NOTE(review): machine-specific absolute path; the notebook only runs where this file exists.
ratings_csv = r'C:\Users\Pravesh\Documents\Assignments\ML Team Projects\Project\Dataset\the-movies-dataset\ratings_small.csv'
ratings = pd.read_csv(ratings_csv)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [16]:
# Wrap the (userId, movieId, rating) columns in a Surprise Dataset and request 5 folds.
# NOTE(review): `Dataset.split` is part of the legacy Surprise API (believed deprecated in
# 1.0.5 and removed in 1.1); on newer releases folds are handled by
# `surprise.model_selection`. Confirm against the installed version.
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
data.split(n_folds=5)

In [17]:
# Create the SVD model and report RMSE / MAE for it on `data`.
# NOTE(review): no random seed is set, so the scores vary from run to run.
# NOTE(review): `evaluate` is part of the legacy Surprise API (believed deprecated in 1.0.5
# and removed in 1.1); `surprise.model_selection.cross_validate` is its replacement.
# Confirm against the installed version.
svd = SVD()
evaluate(svd, data, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.8887
MAE:  0.6811
------------
Fold 2
RMSE: 0.8981
MAE:  0.6895
------------
Fold 3
RMSE: 0.8978
MAE:  0.6902
------------
Fold 4
RMSE: 0.8984
MAE:  0.6945
------------
Fold 5
RMSE: 0.8991
MAE:  0.6926
------------
------------
Mean RMSE: 0.8964
Mean MAE : 0.6896
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.888732282608134,
                             0.8980922900556885,
                             0.8978347956034853,
                             0.8984319063858289,
                             0.8991246633300891],
                            'mae': [0.68106973968938,
                             0.6894902264774145,
                             0.690156638549986,
                             0.6944832324112798,
                             0.6926457709421041]})

In [18]:
# Refit the model on every available rating, not just the training folds, before predicting.
# `fit` is the supported training entry point; `train` is only a deprecated alias for it
# and is absent from newer Surprise releases. `fit` returns the fitted algorithm.
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x189a7a33550>

In [19]:
# Ratings given by user 500, for comparison with the prediction made in the next cell.
ratings.loc[ratings['userId'] == 500]

Unnamed: 0,userId,movieId,rating,timestamp
71152,500,1,2.0,1228946388
71153,500,2,1.5,1228946281
71154,500,19,3.0,1228963336
71155,500,34,2.5,1228946744
71156,500,39,2.5,1228946243
71157,500,48,2.5,1228963545
71158,500,62,3.5,1228946126
71159,500,110,3.5,1228920957
71160,500,158,2.5,1229098692
71161,500,231,3.0,1228920995


In [20]:
# Estimated rating of user 500 for movie 302; r_ui=3 is passed as the reference rating
# and is echoed back in the returned Prediction.
svd.predict(uid=500, iid=302, r_ui=3)

Prediction(uid=500, iid=302, r_ui=3, est=3.117108369251667, details={'was_impossible': False})

### The Collaborative Function

In [21]:
def convert_int(x):
    """Convert ``x`` to ``int``, or return ``np.nan`` when that is not possible.

    Parameters
    ----------
    x : object
        Value to convert (e.g. a number, a numeric string, ``None`` or NaN).

    Returns
    -------
    int or float
        ``int(x)`` when the conversion succeeds, otherwise ``np.nan``.

    Only conversion failures are absorbed: ``TypeError`` (e.g. ``None``),
    ``ValueError`` (e.g. non-numeric text, NaN) and ``OverflowError`` (infinity).
    Anything else, such as ``KeyboardInterrupt``, propagates to the caller.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        return np.nan

In [22]:
# Lookup table indexed by movie title, holding the ratings-file `movieId` and the
# metadata `id` (the links file's `tmdbId`) for every movie present in `smd`.
id_map = pd.read_csv(r'C:\Users\Pravesh\Documents\Assignments\ML Team Projects\Project\Dataset\the-movies-dataset\links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map = id_map.rename(columns={'tmdbId': 'id'})
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')

In [23]:
# Same table keyed by the metadata `id`, used to look up the `movieId` that the
# ratings model was trained on.
indices_map = id_map.set_index(keys='id')

In [27]:
def collaborative(userId, title, n_candidates=25, top_n=10):
    """Recommend movies similar to ``title``, ranked by the user's estimated rating.

    The movies whose descriptions are most similar to ``title`` (according to
    ``cosine_sim``) are taken as candidates, and each candidate is scored with
    ``svd.predict`` for ``userId``.

    Parameters
    ----------
    userId : int
        User id as it appears in the ratings data the SVD model was fitted on.
    title : str
        Title of the query movie; must be a key of ``indices``.
    n_candidates : int, default 25
        Number of most-similar movies to consider.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows with columns ``title``, ``vote_count``,
        ``vote_average``, ``year``, ``id`` and ``est`` (the estimated rating),
        sorted by ``est`` in descending order.

    Raises
    ------
    KeyError
        If ``title`` is not in ``indices`` or a candidate id is not in ``indices_map``.
    """
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # Several movies share this title; use the first one.
        idx = idx.iloc[0]
    idx = int(idx)

    # Rank every movie by similarity to the query. sorted() is stable, so ties keep row order.
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query movie by position rather than assuming it is ranked first.
    candidate_rows = [row for row, _ in ranked if row != idx][:n_candidates]

    columns = ['title', 'vote_count', 'vote_average', 'year', 'id']
    # Copy so that adding the 'est' column does not write into a slice of `smd`.
    movies = smd.iloc[candidate_rows][columns].copy()
    movies['est'] = movies['id'].apply(
        lambda tmdb_id: svd.predict(userId, indices_map.loc[tmdb_id]['movieId']).est
    )
    return movies.sort_values('est', ascending=False).head(top_n)

In [28]:
# Recommendations related to 'The Matrix', ranked for user 201.
collaborative(userId=201, title='The Matrix')

Unnamed: 0,title,vote_count,vote_average,year,id,est
5330,Gladiator 1992,28.0,5.7,1992,16219,4.445927
1518,Out of Africa,290.0,7.0,1985,606,4.297126
7941,The Girl with the Dragon Tattoo,2479.0,7.2,2011,65754,4.270499
3114,The Specials,11.0,5.5,2000,29015,4.175058
6407,23,55.0,7.1,1998,1557,4.147236
5804,Electric Dreams,37.0,6.4,1984,19596,4.118975
5911,The Animatrix,433.0,6.9,2003,55931,4.082861
7393,Avatar,12114.0,7.2,2009,19995,4.022977
4741,Commando,753.0,6.4,1985,10999,4.008199
6320,District B13,572.0,6.5,2004,10045,3.987419


In [29]:
# Same query movie for a different user (301), to compare the two rankings.
collaborative(userId=301, title='The Matrix')

Unnamed: 0,title,vote_count,vote_average,year,id,est
5330,Gladiator 1992,28.0,5.7,1992,16219,4.111763
1518,Out of Africa,290.0,7.0,1985,606,3.993932
5911,The Animatrix,433.0,6.9,2003,55931,3.810384
1510,Oliver!,106.0,7.0,1968,17917,3.760948
7941,The Girl with the Dragon Tattoo,2479.0,7.2,2011,65754,3.754115
6407,23,55.0,7.1,1998,1557,3.721967
6320,District B13,572.0,6.5,2004,10045,3.704436
6364,Pulse,154.0,5.0,2006,9682,3.67989
5804,Electric Dreams,37.0,6.4,1984,19596,3.675862
937,The English Patient,559.0,7.0,1996,409,3.669869


#### How does it work?
We use Singular Value Decomposition (SVD) in this approach to recommend movies based on similar users and their ratings (for other movies).
The essence of SVD is that it decomposes a matrix of any shape into a product of three matrices with useful mathematical
properties: $A = U S V^{T}$. We normalize the matrix and compute the SVD. We also calculate cosine similarity and sort the
results by top N.


Here, we turn the recommendation problem into an optimization problem. We view it as how good we are at
predicting the rating of a movie, given a user. Hence, RMSE and MAE are used to evaluate the performance,
and that is why we use SVD to minimize this error.
So, if I’m looking for a new movie and I’ve watched ‘The Matrix’, this method will recommend movies that have a
similar rating pattern to ‘The Matrix’ across a set of users.
As we can see in the results above, the movie recommendations changed when the same movie was queried with a
different user ID. This shows that the algorithm generates different suggestions for different users.