## Collaborative Filtering for recommendation of spotify using SVD factorization.

In [58]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from surprise import Dataset, Reader
from surprise.prediction_algorithms import SVD

import warnings
warnings.filterwarnings('ignore')

Reading the 4 datasets:

In [2]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
links = pd.read_csv('links.csv')
tags = pd.read_csv('tags.csv')

In [3]:
print(movies.info())
movies.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
print(ratings.info())
ratings.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
print(tags.info())
tags.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB
None


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [6]:
#dropping the timestamp column because we don't need it
ratings = ratings.drop('timestamp', axis = 1)

Merging the movies and ratings into one dataframe:

In [7]:
movie_ratings = pd.merge(movies, ratings)
movie_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5


Merging movie ratings with the tags dataframe:

In [8]:
tagged_ratings = pd.merge(movie_ratings,tags).drop('timestamp',axis = 1)
tagged_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,4.0,pixar
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,3.5,fun
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,magic board game


In [9]:
tagged_ratings['genres'] = tagged_ratings['genres'].apply(lambda x: ''.join(x.replace('|', ' ').lower()))
tagged_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,tag
0,1,Toy Story (1995),adventure animation children comedy fantasy,336,4.0,pixar
1,1,Toy Story (1995),adventure animation children comedy fantasy,474,4.0,pixar
2,1,Toy Story (1995),adventure animation children comedy fantasy,567,3.5,fun
3,2,Jumanji (1995),adventure children fantasy,62,4.0,fantasy
4,2,Jumanji (1995),adventure children fantasy,62,4.0,magic board game


In [10]:
def combined_features(row):
    return row['genres']+" "+row['tag']
tagged_ratings["combined_features"] = tagged_ratings.apply(combined_features, axis =1)
tagged_ratings.sample(2)

Unnamed: 0,movieId,title,genres,userId,rating,tag,combined_features
3332,135536,Suicide Squad (2016),action crime sci-fi,62,4.0,Batman,action crime sci-fi Batman
1811,4878,Donnie Darko (2001),drama mystery sci-fi thriller,567,3.5,surreal,drama mystery sci-fi thriller surreal


In [11]:
tagged_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,tag,combined_features
0,1,Toy Story (1995),adventure animation children comedy fantasy,336,4.0,pixar,adventure animation children comedy fantasy pixar
1,1,Toy Story (1995),adventure animation children comedy fantasy,474,4.0,pixar,adventure animation children comedy fantasy pixar
2,1,Toy Story (1995),adventure animation children comedy fantasy,567,3.5,fun,adventure animation children comedy fantasy fun
3,2,Jumanji (1995),adventure children fantasy,62,4.0,fantasy,adventure children fantasy fantasy
4,2,Jumanji (1995),adventure children fantasy,62,4.0,magic board game,adventure children fantasy magic board game


#### Collaborative Filtering:

In [12]:
# checking the unique number of user id and movie id in the ratings dataset
print(len(ratings['userId'].unique()), 'Number of user ids')
print(len(ratings['movieId'].unique()), 'Number of movie ids')

610 Number of user ids
9724 Number of movie ids


In [13]:
#reading in the range of ratings score
reader = Reader(rating_scale = (1,5))
#loading the dataframe into surprise
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [14]:
dataset = data.build_full_trainset()

In [15]:
svd = SVD(n_factors= 100, n_epochs = 60, reg_all=0.1, lr_all=0.01)
svd.fit(dataset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x209d1742970>

Function to get user ratings for movies to predict recommendations:

In [16]:
# movie_df is the df that contains the movie title, id and genres, num refer to the number of movie ratings required to be given by the user
def movie_rater(movie_df,num, genre=None):
#     choosing 690 to ensure we do not overwrite an already existing userid
    userID = 690
    rating_list = []
    while num > 0:
        if genre:
            movie = movie_df[movie_df['genres'].str.contains(genre)].sample(1)
        else:
            movie = movie_df.sample(1)
        print(movie)
        rating = input('How do you rate this movie on a scale of 1-5, press n if you have not seen :\n')
        if rating == 'n':
            continue
        else:
            rating_one_movie = {'userId':userID,'movieId':movie['movieId'].values[0],'rating':rating}
            rating_list.append(rating_one_movie) 
            num -= 1
    return rating_list 

In [29]:
print('List of unique genres you can search for:\n')
print('Action, Comedy, Adventure, Children, Musical, Romance, Fantasy, Mystery, Sci-Fi, War, Thriller, Animation, Western, Horror')

List of unique genres you can search for:

Action, Comedy, Adventure, Children, Musical, Romance, Fantasy, Mystery, Sci-Fi, War, Thriller, Animation, Western, Horror


In [52]:
# gets user ratings for movies the user has watched
user_rating = movie_rater(movies, 5, 'Adventure')

      movieId                           title                  genres
1488     2013  Poseidon Adventure, The (1972)  Action|Adventure|Drama
How do you rate this movie on a scale of 1-5, press n if you have not seen :
n
      movieId                                              title  \
7644    88125  Harry Potter and the Deathly Hallows: Part 2 (...   

                                           genres  
7644  Action|Adventure|Drama|Fantasy|Mystery|IMAX  
How do you rate this movie on a scale of 1-5, press n if you have not seen :
5
      movieId               title                           genres
4103     5880  Extreme Ops (2002)  Action|Adventure|Crime|Thriller
How do you rate this movie on a scale of 1-5, press n if you have not seen :
n
      movieId                                              title  \
7928    95519  Dragon Ball Z: Bojack Unbound (Doragon bôru Z ...   

                                  genres  
7928  Action|Adventure|Animation|Fantasy  
How do you rate this movi

In [53]:
# append the user_rating result to the end of the ratings df which has the userid, movieid, and ratings
new_ratings_df = ratings.append(user_rating,ignore_index=True)
# load into surprise
new_data = Dataset.load_from_df(new_ratings_df,reader)
# build the train data set
new_dataset = new_data.build_full_trainset()

In [54]:
# fit the svd model tot he new dataset
svd_ = SVD(n_factors= 100, n_epochs = 60, reg_all=0.1, lr_all=0.01)
svd_.fit(new_dataset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x209d5d6cbb0>

Getting Predicted Recommendations:

In [55]:
# this creates a list of tuples (movie_id, predicted_rating) in descending order of rating score
list_of_movies = []
for movie_id in ratings['movieId'].unique():
    list_of_movies.append( (movie_id,svd_.predict(690,movie_id)[3]))
ranked_movies = sorted(list_of_movies, key=lambda x:x[1], reverse=True) 

Generating Recommendations:

The function below takes in the ranked_movies from the result above, the movies dataframe and the number of movie recommendations desired by the user and returns the top five highly rated movie based on what out model predicted for the user:

In [56]:
def recommended_movies(predicted_ratings,movie_title_df,n):
        print('Here are 5 movies we recommend for you to watch:','\n')
        for indices, rec in enumerate(predicted_ratings):
            title = movie_title_df.loc[movie_title_df['movieId'] == int(rec[0])]['title']
            genre = movie_title_df.loc[movie_title_df['movieId'] == int(rec[0])]['genres']           
            print(f'Recommendation No:  {indices+1} :  {title, genre} \n')
            n-= 1
            if n == 0:
                break
            
recommended_movies(ranked_movies,movies,5)

Here are 5 movies we recommend for you to watch: 

Recommendation No:  1 :  (46    Usual Suspects, The (1995)
Name: title, dtype: object, 46    Crime|Mystery|Thriller
Name: genres, dtype: object) 

Recommendation No:  2 :  (224    Star Wars: Episode IV - A New Hope (1977)
Name: title, dtype: object, 224    Action|Adventure|Sci-Fi
Name: genres, dtype: object) 

Recommendation No:  3 :  (257    Pulp Fiction (1994)
Name: title, dtype: object, 257    Comedy|Crime|Drama|Thriller
Name: genres, dtype: object) 

Recommendation No:  4 :  (314    Forrest Gump (1994)
Name: title, dtype: object, 314    Comedy|Drama|Romance|War
Name: genres, dtype: object) 

Recommendation No:  5 :  (461    Schindler's List (1993)
Name: title, dtype: object, 461    Drama|War
Name: genres, dtype: object) 

