In [63]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

from surprise import KNNWithMeans, Dataset, accuracy, Reader
from surprise.model_selection import train_test_split

In [64]:
df_movies = pd.read_csv("Data/movies.csv")
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [65]:
df_ratings = pd.read_csv("Data/ratings.csv")
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [66]:
df_ratings.drop(columns='timestamp', inplace=True)

In [67]:
df_cleaned = df_movies.merge(df_ratings, on='movieId')

In [68]:
df_cleaned.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5


In [69]:
df_clean = df_cleaned[['movieId','userId','title','rating']]

In [70]:
df_clean.head()

Unnamed: 0,movieId,userId,title,rating
0,1,1,Toy Story (1995),4.0
1,1,5,Toy Story (1995),4.0
2,1,7,Toy Story (1995),4.5
3,1,15,Toy Story (1995),2.5
4,1,17,Toy Story (1995),4.5


In [71]:
num_users = len(df_clean['userId'].value_counts())
num_items = len(df_clean['title'].value_counts())
print('Unique number of users in the dataset: {}'.format(num_users))
print('Unique number of movies in the dataset: {}'.format(num_items))


Unique number of users in the dataset: 610
Unique number of movies in the dataset: 9719


In [72]:
rating_count_df = pd.DataFrame(df_clean.groupby(['rating']).size(), columns=['Num Ratings'])
rating_count_df

Unnamed: 0_level_0,Num Ratings
rating,Unnamed: 1_level_1
0.5,1370
1.0,2811
1.5,1791
2.0,7551
2.5,5550
3.0,20047
3.5,13136
4.0,26818
4.5,8551
5.0,13211


# Baseline Model

In [73]:
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import accuracy
from sklearn.preprocessing import MultiLabelBinarizer


In [75]:
reader = Reader(rating_scale=(1.0, 5.0))
data = Dataset.load_from_df(df_clean[['userId', 'movieId', 'rating']], reader)

# split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=.25)

# create an instance of the SVD algorithm and fit it on the training set
algo = SVD()
algo.fit(trainset)

# use the fitted model to predict ratings on the testing set
predictions = algo.test(testset)

# evaluate the performance of the model using different metrics
accuracy.mae(predictions)
accuracy.mse(predictions)

MAE:  0.6718
MSE: 0.7614


0.7613844205790361

In [76]:
# create a dictionary to map movie IDs to their indices in the dataset
movie_to_idx = {movie_id: i for i, movie_id in enumerate(df_clean['movieId'].unique())}

# create an instance of the SVD algorithm and fit it on the training set
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fe2fb16bca0>

In [82]:
def recommend_movies(user_id):
    # get all movies that the user has not rated yet
    user_movies = df_clean[df_clean['userId'] == user_id]['movieId']
    unrated_movies = [movie_id for movie_id in df_clean['movieId'].unique() if movie_id not in user_movies]

    # create a list of tuples containing the movie ID and the predicted rating
    movie_ratings = [(movie_id, algo.predict(user_id, movie_to_idx[movie_id]).est) for movie_id in unrated_movies]

    # sort the list of tuples by the predicted rating in descending order and get the top 5 movies
    top_movies = sorted(movie_ratings, key=lambda x: x[1], reverse=True)[:5]

    # map the recommended movie indices back to their titles
    recommended_movies = [(df_clean[df_clean['movieId'] == movie_id]['title'].iloc[0], rating) for movie_id, rating in top_movies]

    # create a dataframe with the recommended movie titles and predicted ratings
    recommended_movies_df = pd.DataFrame(recommended_movies, columns=['title', 'predicted_rating'])

    return recommended_movies_df

In [91]:
recommend_movies(400)

Unnamed: 0,title,predicted_rating
0,Cemetery Man (Dellamorte Dellamore) (1994),4.919749
1,I Love Trouble (1994),4.90867
2,"Pompatus of Love, The (1996)",4.880522
3,"Visitor, The (2007)",4.879777
4,Tommy (1975),4.876912


In [92]:
user_id = 400 # replace with the ID of the user you want to look up

movies_rated_by_user = df_clean[df_clean['userId'] == user_id][['title', 'rating']].sort_values(by='rating', ascending=False)

print(movies_rated_by_user)


                                                    title  rating
501                                           Heat (1995)     5.0
17077                                        Fargo (1996)     5.0
2243                          Seven (a.k.a. Se7en) (1995)     5.0
58457                          Requiem for a Dream (2000)     5.0
25998   Star Wars: Episode VI - Return of the Jedi (1983)     5.0
82052                                   Inside Man (2006)     5.0
24790   Star Wars: Episode V - The Empire Strikes Back...     5.0
91446                                    Inception (2010)     5.0
19964                               Godfather, The (1972)     5.0
45195                                  Matrix, The (1999)     5.0
18793                                Trainspotting (1996)     5.0
16407                    Silence of the Lambs, The (1991)     5.0
8860                     Shawshank Redemption, The (1994)     5.0
8068                                  Pulp Fiction (1994)     5.0
7813    Lé

In [51]:
#########################

PLOTS BELOW

In [52]:
#rtg_countdf = pd.DataFrame(df_clean.groupby(['rating']).size(), columns=['count'])
#
#ax = rtg_countdf.reset_index().rename(columns={'index': 'rating score'}).plot('rating','count', 'bar',
#    figsize=(12, 8),
#    title='Count for Each Rating Score',
#    fontsize=12)
#
#ax.set_xlabel("Movie Rating Score")
#ax.set_ylabel("Number of Ratings")

PLOTS ABOVE

# Extra Code for Model Tuning (Not used for Overfitting)

In [53]:
import surprise
from surprise import Dataset, Reader, SVD
from sklearn.metrics.pairwise import cosine_similarity

In [54]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(df_clean[['userId', 'title', 'rating']], test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)


In [55]:
# Load the dataset using Surprise
from surprise import Dataset, Reader

reader = Reader(rating_scale=(1.0, 5.0))
train_set = Dataset.load_from_df(train_data[['userId', 'title', 'rating']], reader)
test_set = Dataset.load_from_df(test_data[['userId', 'title', 'rating']], reader)
val_set = Dataset.load_from_df(val_data[['userId', 'title', 'rating']], reader)


In [56]:
from surprise import SVD

model = SVD()
train_set_full = train_set.build_full_trainset()
model.fit(train_set_full)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fe2fd7ef160>

In [57]:
from surprise import accuracy

test_predictions = model.test(test_set.build_full_trainset().build_testset())
val_predictions = model.test(val_set.build_full_trainset().build_testset())

accuracy.mae(test_predictions)
accuracy.mse(val_predictions)

MAE:  0.6808
MSE: 0.7754


0.7754434060962259

In [58]:
from surprise.model_selection import cross_validate

results = cross_validate(model, train_set, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8840  0.9020  0.8785  0.8847  0.8940  0.8886  0.0084  
Fit time          2.38    2.44    2.39    2.33    2.39    2.39    0.03    
Test time         0.04    0.05    0.04    0.04    0.13    0.06    0.03    


In [41]:
from surprise.prediction_algorithms import SVD
from surprise.model_selection import GridSearchCV

param_grid = {'n_factors':[50, 100, 200],'n_epochs': [10, 20, 30], 'lr_all': [0.002, 0.005, 0.01],
               'reg_all': [0.02, 0.1, 0.4]}
gs_model = GridSearchCV(SVD,param_grid=param_grid,n_jobs = -1,joblib_verbose=5)
gs_model.fit(data)
best_rmse = gs_model.best_score['rmse']
best_params = gs_model.best_params['rmse']

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed:   46.0s
[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:  4.1min finished


In [42]:
svd = SVD(n_factors=100, n_epochs=10, lr_all=0.005, reg_all=0.4)
svd.fit(train)
predictions = svd.test(test)
print(accuracy.rmse(predictions))

RMSE: 0.8867
0.8866678034770038


In [43]:
from surprise.model_selection import RandomizedSearchCV

# Define the search space for hyperparameters
param_distributions = {'n_factors': [50, 100, 200],
                       'n_epochs': [10, 20, 30],
                       'lr_all': [0.002, 0.005, 0.01],
                       'reg_all': [0.02, 0.1, 0.4]}
# Create the randomized search object
rs = RandomizedSearchCV(SVD, param_distributions, n_iter=10, measures=['rmse', 'mae'], cv=5)

# Run the randomized search
rs.fit(data)

# Get the best RMSE score and the corresponding hyperparameters
best_rmse = rs.best_score['rmse']
best_params = rs.best_params['rmse']


In [44]:
print('Best RMSE: ' + str(best_rmse))

Best RMSE: 0.8636408658440136


In [45]:
print('Best Params: ' + str(best_params))

Best Params: {'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.1}
