In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the ratings table and the movie table from CSV files in the working directory.
ratings_data, movies_data = (
    pd.read_csv(csv_name) for csv_name in ("ratings.csv", "movies.csv")
)

In [3]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3163703 sha256=8cdd42132bc069764a186dc8d4fa53b19b440efa59f2cb6c4c61eae0c6bc0cf6
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [4]:
from surprise import Dataset
from surprise import Reader

# The rating scale handed to Surprise is taken from the observed extremes of the
# rating column instead of being hard-coded.
rating_column = ratings_data["rating"]
min_rating = rating_column.min()
max_rating = rating_column.max()

# Surprise reads the three columns positionally, in this order: user, item, rating.
reader = Reader(rating_scale=(min_rating, max_rating))
interaction_columns = ['userId', 'movieId', 'rating']
data = Dataset.load_from_df(ratings_data[interaction_columns], reader)

In [5]:
from surprise import SVD
from surprise.model_selection import KFold, cross_validate

# Baseline: 10-fold cross-validation of an SVD model trained for 10 epochs.
# Both the fold shuffling and the SVD factor initialisation are random, so their
# seeds are pinned; without them the reported RMSE/MAE change on every re-run.
svd = SVD(n_epochs=10, random_state=42)
folds = KFold(n_splits=10, random_state=42, shuffle=True)
results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=folds, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.8712  0.8830  0.8692  0.8722  0.8858  0.8784  0.8796  0.8771  0.8795  0.8775  0.8774  0.0049  
MAE (testset)     0.6725  0.6848  0.6717  0.6713  0.6774  0.6787  0.6787  0.6772  0.6742  0.6731  0.6760  0.0040  
Fit time          3.20    0.95    0.95    0.94    0.97    0.95    0.96    0.94    1.11    1.40    1.24    0.67    
Test time         0.10    0.06    0.06    0.19    0.08    0.06    0.06    0.10    0.11    0.11    0.09    0.04    


In [6]:
# Mean of the per-fold test errors collected by cross_validate above
mean_mae = np.mean(results["test_mae"])
mean_rmse = np.mean(results["test_rmse"])

print("Average MAE: ", mean_mae)
print("Average RMSE: ", mean_rmse)

Average MAE:  0.6759587262378012
Average RMSE:  0.8773508516601002


In [7]:
import random

from surprise import SVD
from surprise.model_selection import GridSearchCV

# Neither the folds nor the SVD instances built by the search receive a random_state
# here, so seed the global RNGs right before fitting (the approach the Surprise FAQ
# recommends for reproducible experiments). This relies on the search running in a
# single process, which is the default (n_jobs=1).
random.seed(42)
np.random.seed(42)

# Candidate values tried for every combination (3 x 3 = 9 models, each 10-fold CV)
param_grid = {
  'n_factors': [20, 50, 100],
  'n_epochs': [5, 10, 20]
}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=10)
gs.fit(data)

# Best mean cross-validated RMSE and the parameter combination that achieved it
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

0.8665079959663734
{'n_factors': 20, 'n_epochs': 20}


In [8]:
from surprise.model_selection import train_test_split

# Hyperparameters selected by the grid search (lowest cross-validated RMSE)
best_factor = gs.best_params['rmse']['n_factors']
best_epoch = gs.best_params['rmse']['n_epochs']

# Hold out a random 20% of the ratings as the test set.
# The seed keeps the split identical across re-runs.
trainset, testset = train_test_split(data, test_size=.20, random_state=42)

# SVD with the tuned hyperparameters; seeded so the fitted factors (and therefore
# the recommendations derived from them) are reproducible.
svd = SVD(n_factors=best_factor, n_epochs=best_epoch, random_state=42)

# Train the algorithm on the trainset
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7bcd4ac9bd00>

In [9]:
def generate_recommendation(model, user_id, ratings_df, movies_df, n_items):
    """Print the top ``n_items`` movies the user has not rated, ranked by predicted rating.

    Parameters
    ----------
    model : object
        Anything exposing ``test(testset)`` that returns one prediction per
        ``[user, item, rating]`` entry, each with an ``est`` attribute
        (for example a fitted Surprise algorithm).
    user_id : hashable
        User identifier as it appears in ``ratings_df["userId"]``.
    ratings_df : pandas.DataFrame
        Must contain the columns ``"userId"`` and ``"movieId"``.
    movies_df : pandas.DataFrame
        Must contain the columns ``"movieId"`` and ``"title"``.
    n_items : int or None
        Maximum number of recommendations to print. Values <= 0 print only the
        header line; ``None`` prints every candidate.

    Returns
    -------
    None
        The recommendations are written to stdout, one ``title score`` per line.
    """
    # All movie IDs that appear in the ratings data
    movie_ids = ratings_df["movieId"].unique()

    # Movie IDs already rated by this user
    movie_ids_user = ratings_df.loc[ratings_df["userId"] == user_id, "movieId"]
    # Movie IDs not yet rated by this user (np.setdiff1d returns them sorted and unique)
    movie_ids_to_pred = np.setdiff1d(movie_ids, movie_ids_user)

    # The rating of 4 is a placeholder: model.test expects (user, item, rating)
    # triples, but only the estimated rating of each prediction is used below.
    test_set = [[user_id, movie_id, 4] for movie_id in movie_ids_to_pred]

    # Predict a rating for every candidate movie
    predictions = model.test(test_set)
    pred_ratings = np.array([pred.est for pred in predictions], dtype=float)
    print("Top {0} item recommendations for user {1}:".format(n_items, user_id))

    # A negative n_items would otherwise slice "all but the last k" candidates;
    # clamp it so non-positive requests simply print nothing.
    n_top = n_items if n_items is None else max(n_items, 0)
    # Rank by descending prediction; the stable sort keeps ties in ascending movieId order.
    index_max = (-pred_ratings).argsort(kind="stable")[:n_top]

    # Build the movieId -> title lookup once instead of filtering movies_df per item.
    # Keeping the first row per movieId mirrors taking the first match.
    titles = movies_df.drop_duplicates("movieId").set_index("movieId")["title"]
    for i in index_max:
        movie_id = movie_ids_to_pred[i]
        # A rated movie may have no row in movies_df; print a placeholder instead
        # of raising IndexError.
        title = titles.get(movie_id, "<unknown title, movieId={0}>".format(movie_id))
        print(title, pred_ratings[i])


# Recommendation request: the target user and the length of the top-n list
userID, n_items = 23, 10

# Print the recommendations produced by the fitted model for that user
generate_recommendation(
    model=svd,
    user_id=userID,
    ratings_df=ratings_data,
    movies_df=movies_data,
    n_items=n_items,
)

Top 10 item recommendations for user 23:
Lawrence of Arabia (1962) 4.1957101974476885
Shawshank Redemption, The (1994) 4.125409772253242
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) 4.032184196915992
Wallace & Gromit: The Wrong Trousers (1993) 3.9822488916204395
Star Wars: Episode IV - A New Hope (1977) 3.9810481011184424
Eternal Sunshine of the Spotless Mind (2004) 3.9791785202785035
Philadelphia Story, The (1940) 3.979141499655915
Departed, The (2006) 3.9679962809853637
Cool Hand Luke (1967) 3.9649790163399414
Little Big Man (1970) 3.9642193436476716
