In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from surprise import KNNWithMeans, Dataset, accuracy, Reader
from surprise.model_selection import train_test_split

In [2]:
df_movies = pd.read_csv("Data/movies.csv")
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
df_ratings = pd.read_csv("Data/ratings.csv")
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
df_ratings.drop(columns='timestamp', inplace=True)

In [5]:
df_cleaned = df_movies.merge(df_ratings, on='movieId')

In [6]:
df_cleaned.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5


In [7]:
df_clean = df_cleaned[['movieId','userId','title','rating']]

In [8]:
df_clean.head()

Unnamed: 0,movieId,userId,title,rating
0,1,1,Toy Story (1995),4.0
1,1,5,Toy Story (1995),4.0
2,1,7,Toy Story (1995),4.5
3,1,15,Toy Story (1995),2.5
4,1,17,Toy Story (1995),4.5


In [9]:
num_users = len(df_clean['userId'].value_counts())
num_items = len(df_clean['title'].value_counts())
print('Unique number of users in the dataset: {}'.format(num_users))
print('Unique number of movies in the dataset: {}'.format(num_items))


Unique number of users in the dataset: 610
Unique number of movies in the dataset: 9719


In [10]:
rating_count_df = pd.DataFrame(df_clean.groupby(['rating']).size(), columns=['Num Ratings'])
rating_count_df

Unnamed: 0_level_0,Num Ratings
rating,Unnamed: 1_level_1
0.5,1370
1.0,2811
1.5,1791
2.0,7551
2.5,5550
3.0,20047
3.5,13136
4.0,26818
4.5,8551
5.0,13211


In [11]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   movieId  100836 non-null  int64  
 1   userId   100836 non-null  int64  
 2   title    100836 non-null  object 
 3   rating   100836 non-null  float64
dtypes: float64(1), int64(2), object(1)
memory usage: 3.8+ MB


In [12]:
df_clean = df_cleaned[df_cleaned['rating'] >= 3.0]
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81763 entries, 0 to 100835
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  81763 non-null  int64  
 1   title    81763 non-null  object 
 2   genres   81763 non-null  object 
 3   userId   81763 non-null  int64  
 4   rating   81763 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 3.7+ MB


# Collaborative Model

In [13]:
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import accuracy
from sklearn.preprocessing import MultiLabelBinarizer


In [14]:
reader = Reader(rating_scale=(3.0, 5.0))
data = Dataset.load_from_df(df_clean[['userId', 'movieId', 'rating']], reader)

# split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=.25)

# create an instance of the SVD algorithm and fit it on the training set
algo = SVD()
algo.fit(trainset)

# use the fitted model to predict ratings on the testing set
predictions = algo.test(testset)

# evaluate the performance of the model using different metrics
accuracy.mse(predictions)

MSE: 0.3599


0.35991166089695137

In [15]:
# create a dictionary to map movie IDs to their indices in the dataset
movie_to_idx = {movie_id: i for i, movie_id in enumerate(df_clean['movieId'].unique())}

# create an instance of the SVD algorithm and fit it on the training set
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ff181966e50>

In [16]:
def recommend_movies(user_id):
    # get all movies that the user has not rated yet
    user_movies = df_clean[df_clean['userId'] == user_id]['movieId']
    unrated_movies = [movie_id for movie_id in df_clean['movieId'].unique() if movie_id not in user_movies]

    # create a list of tuples containing the movie ID and the predicted rating
    movie_ratings = [(movie_id, algo.predict(user_id, movie_to_idx[movie_id]).est) for movie_id in unrated_movies]

    # sort the list of tuples by the predicted rating in descending order and get the top 5 movies
    top_movies = sorted(movie_ratings, key=lambda x: x[1], reverse=True)[:5]

    # map the recommended movie indices back to their titles
    recommended_movies = [(df_clean[df_clean['movieId'] == movie_id]['title'].iloc[0], rating) for movie_id, rating in top_movies]

    # create a dataframe with the recommended movie titles and predicted ratings
    recommended_movies_df = pd.DataFrame(recommended_movies, columns=['title', 'predicted_rating'])

    return recommended_movies_df

In [17]:
recommend_movies(400)

Unnamed: 0,title,predicted_rating
0,"Rainmaker, The (1997)",4.965031
1,Three Colors: White (Trzy kolory: Bialy) (1994),4.873576
2,Nine to Five (a.k.a. 9 to 5) (1980),4.866575
3,"39 Steps, The (1935)",4.861333
4,Maverick (1994),4.841895


In [18]:
user_id = 400 # replace with the ID of the user you want to look up

movies_rated_by_user = df_clean[df_clean['userId'] == user_id][['title', 'rating']].sort_values(by='rating', ascending=False)

print(movies_rated_by_user)


                                                    title  rating
501                                           Heat (1995)     5.0
17077                                        Fargo (1996)     5.0
58457                          Requiem for a Dream (2000)     5.0
2243                          Seven (a.k.a. Se7en) (1995)     5.0
25998   Star Wars: Episode VI - Return of the Jedi (1983)     5.0
82052                                   Inside Man (2006)     5.0
24790   Star Wars: Episode V - The Empire Strikes Back...     5.0
91446                                    Inception (2010)     5.0
19964                               Godfather, The (1972)     5.0
45195                                  Matrix, The (1999)     5.0
18793                                Trainspotting (1996)     5.0
16407                    Silence of the Lambs, The (1991)     5.0
8860                     Shawshank Redemption, The (1994)     5.0
8068                                  Pulp Fiction (1994)     5.0
7813    Lé

# Extra Code for Model Tuning (Not used for Overfitting)

In [38]:
import surprise
from surprise import Dataset, Reader, SVD, accuracy
from sklearn.metrics.pairwise import cosine_similarity
from surprise.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from surprise.prediction_algorithms import SVD
from surprise.model_selection import GridSearchCV
from surprise.model_selection import RandomizedSearchCV

In [39]:
train_data, test_data = train_test_split(df_clean[['userId', 'title', 'rating']], test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

In [40]:
reader = Reader(rating_scale=(1.0, 5.0))
train_set = Dataset.load_from_df(train_data[['userId', 'title', 'rating']], reader)
test_set = Dataset.load_from_df(test_data[['userId', 'title', 'rating']], reader)
val_set = Dataset.load_from_df(val_data[['userId', 'title', 'rating']], reader)

In [41]:
model = SVD()
train_set_full = train_set.build_full_trainset()
model.fit(train_set_full)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ff11003f6d0>

In [42]:
test_predictions = model.test(test_set.build_full_trainset().build_testset())
val_predictions = model.test(val_set.build_full_trainset().build_testset())

accuracy.mse(val_predictions)

MSE: 0.3600


0.36004373943737394

In [43]:
results = cross_validate(model, train_set, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.6085  0.6079  0.5963  0.6015  0.5977  0.6024  0.0051  
Fit time          2.25    2.24    2.26    2.23    2.27    2.25    0.02    
Test time         0.04    0.11    0.04    0.04    0.04    0.05    0.03    


In [44]:
param_grid = {'n_factors':[50, 100, 200],'n_epochs': [10, 20, 30], 'lr_all': [0.002, 0.005, 0.01],
               'reg_all': [0.02, 0.1, 0.4]}
gs_model = GridSearchCV(SVD,param_grid=param_grid,n_jobs = -1,joblib_verbose=5)
gs_model.fit(data)
best_rmse = gs_model.best_score['rmse']
best_params = gs_model.best_params['rmse']

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed:   36.0s
[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:  3.3min finished


In [46]:
svd = SVD(n_factors=100, n_epochs=10, lr_all=0.005, reg_all=0.4)
svd.fit(train_set_full)
test_predictions = svd.test(test_set.build_full_trainset().build_testset())
print('Test RMSE:', accuracy.rmse(test_predictions))

RMSE: 0.6020
Test RMSE: 0.6020205743962399


In [47]:
# Define the search space for hyperparameters
param_distributions = {'n_factors': [50, 100, 200],
                       'n_epochs': [10, 20, 30],
                       'lr_all': [0.002, 0.005, 0.01],
                       'reg_all': [0.02, 0.1, 0.4]}
# Create the randomized search object
rs = RandomizedSearchCV(SVD, param_distributions, n_iter=10, measures=['rmse', 'mae'], cv=5)

# Run the randomized search
rs.fit(data)

# Get the best RMSE score and the corresponding hyperparameters
best_rmse = rs.best_score['rmse']
best_params = rs.best_params['rmse']


In [48]:
print('Best RMSE: ' + str(best_rmse))

Best RMSE: 0.5917228969658445


In [49]:
print('Best Params: ' + str(best_params))

Best Params: {'n_factors': 200, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1}
