# Surprise SVD

## Preprocess

### Install Surprise package

In [None]:
# install the Surprise package into the environment backing this notebook
!pip install surprise

### Imports

In [5]:
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from typing import Tuple

### Declare global parameters of data

In [6]:
# global size parameters of the data: number of users and number of movies
total_num_users, total_num_movies = 10_000, 1_000

### Data parsing helper function declarations

In [7]:
def parse_csv(csv_path: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
  """
  Read a ratings .csv file and split it into user indices, item indices and ratings

  The file must provide an ``Id`` column whose entries look like ``r<user>_c<item>``
  and a ``Prediction`` column. The numbers in ``Id`` are shifted down by one, so
  ``r1_c1`` becomes user index 0 and item index 0.

  :param csv_path: path to .csv file to read from
  :return: 3 arrays containing the user indices, the item indices and the observed ratings, row by row
  """
  table = pd.read_csv(csv_path)
  # pull the two numbers out of every Id and shift them by one
  id_pattern = r"r(?P<User>\d+)_c(?P<Item>\d+)"
  indices = table["Id"].str.extract(id_pattern).astype(int) - 1
  # attach the new User / Item columns next to the original ones
  table = table.join(indices)
  return table["User"].values, table["Item"].values, table["Prediction"].values

## Run Surprise SVD model

In [8]:
# read the training triplets and wrap them in a Surprise dataset
users, items, preds = parse_csv("../data/data_train.csv")
ratings_dict = dict(itemID=items, userID=users, rating=preds)
df = pd.DataFrame(ratings_dict)
# ratings are declared to lie on a 1 to 5 scale
reader = Reader(rating_scale=(1, 5))
# columns are handed over in user, item, rating order
column_order = ['userID', 'itemID', 'rating']
data = Dataset.load_from_df(df[column_order], reader)

In [9]:
# hyper-parameter grid explored by the search
# the float grids are derived from integer ranges: np.arange with a fractional
# step has a numerically unstable length (the stop value may or may not be
# included) and produces values with rounding noise such as 0.30000000000000004
param_grid = {
    'n_factors': list(range(2, 22, 2)),
    'n_epochs': [20],
    'lr_all': [step / 1000 for step in range(1, 10)],  # 0.001, 0.002, ..., 0.009
    'reg_all': [step / 10 for step in range(10)],  # 0.0, 0.1, ..., 0.9
}
# exhaustive search over the grid, scored by RMSE with 10-fold cross-validation
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=10, n_jobs=-1, joblib_verbose=2)

gs.fit(data)

# report and save the best parameters
best_params = gs.best_params['rmse']
print(f"The minimum RMSE score is {gs.best_score['rmse']}")
print(f"The parameters which give the best RMSE score are: {best_params}")
# keep the winning values in separate variables for the final model and the file name
n_factors = best_params['n_factors']
n_epochs = best_params['n_epochs']
lr_all = best_params['lr_all']
reg_all = best_params['reg_all']

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


1.0020673859959812
{'n_factors': 2, 'n_epochs': 10, 'lr_all': 0.001, 'reg_all': 0.0}


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  4.6min finished


In [None]:
# final model: SVD configured with the tuned parameters and a fixed random state
algo = SVD(
    n_factors=n_factors,
    n_epochs=n_epochs,
    lr_all=lr_all,
    reg_all=reg_all,
    random_state=1234,
)
# the submission model is fitted on the full dataset, without a held-out part
trainset = data.build_full_trainset()
algo.fit(trainset)

In [20]:
# user / item pairs requested by the sample submission (its ratings are ignored)
pred_users, pred_items, _ = parse_csv('../data/sampleSubmission.csv')
# rebuild the 1-based "r<user>_c<item>" ids from the 0-based indices
df_ids = [f"r{user + 1}_c{item + 1}" for user, item in zip(pred_users, pred_items)]
# estimated rating of the trained model for every requested pair
pred_ratings = [
    algo.predict(user, item, verbose=False).est
    for user, item in zip(pred_users, pred_items)
]
# the file name records the hyper-parameters used for this submission
submission_path = f"../results/surprise_svd_n_factors-{n_factors}_n_epochs-{n_epochs}_lr_all-{lr_all:.3f}_reg_all-{reg_all:.1f}_submission.csv"
df = pd.DataFrame({"Id": df_ids, "Prediction": pred_ratings})
df.to_csv(submission_path, index=False)