# Baseline method with SVD replaced with Improved SVD

In this notebook the baseline method (SVD -> ALS) is extended by replacing the initialization and SVD step with Improved SVD.

The sections "Preprocess", "Run Improved SVD + ALS model" and "Create submission file" can be run entirely and in sequence to produce the results. Run the "Download data from Kaggle" code block only if you are running on Colab.

## Download data from Kaggle

In [None]:
!pip install kaggle

!mkdir ~/.kaggle

import json

kaggle_username = "yuvalnis" #@param {type:"string"}
kaggle_api_key = "1800d5a286834f0416c338c7bd7f6dee" #@param {type:"string"}

assert len(kaggle_username) > 0 and len(kaggle_api_key) > 0

api_token = {"username": kaggle_username,"key": kaggle_api_key}

with open('kaggle.json', 'w') as file:
    json.dump(api_token, file)

!mv kaggle.json ~/.kaggle/kaggle.json

!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c cil-collaborative-filtering-2022

!unzip -n cil-collaborative-filtering-2022.zip

## Preprocess

### Install Surprise package

In [None]:
!pip install surprise

### Imports

In [None]:
from surprise import SVD
from surprise import Dataset
from surprise import Reader

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from typing import Tuple
from IPython.display import display

### Declare global parameters of data

In [None]:
# Dimensions of the dense ratings matrix: rows are users, columns are movies.
# Both values are passed to construct_ratings_matrix as n_users / n_items.
total_num_users = 10000
total_num_movies = 1000

### Data parsing helper function declarations

In [None]:
def parse_csv(csv_path: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
  """
  Read a ratings .csv file and split it into index and rating arrays.

  The file must provide an "Id" column of the form "r<user>_c<item>" (one-based)
  and a "Prediction" column holding the rating for that pair.

  :param csv_path: path to .csv file to read from
  :return: zero-based user indices, zero-based item indices and the ratings, aligned row by row
  """
  table = pd.read_csv(csv_path)
  # ids in the file are one-based; shift them to zero-based matrix indices
  zero_based = table.Id.str.extract(r"r(?P<User>\d+)_c(?P<Item>\d+)").astype(int) - 1
  table = table.join(zero_based)
  return table.User.values, table.Item.values, table.Prediction.values


def construct_ratings_matrix(
    users: np.ndarray,
    items: np.ndarray,
    ratings: np.ndarray,
    n_users: int,
    n_items: int
) -> Tuple[np.ndarray, np.ndarray]:
  """
  Constructs the ratings matrix with NaN values where no rating was observed

  :param users: array of zero-based user indices
  :param items: array of zero-based item indices
  :param ratings: array of ratings per user-item pair, aligned with users and items
  :param n_users: total number of users (number of rows)
  :param n_items: total number of items (number of columns)
  :return: the (n_users, n_items) float ratings matrix and the boolean observed ratings mask
  :raises ValueError: if users, items and ratings do not have the same shape
  """
  user_idx = np.asarray(users, dtype=np.intp)
  item_idx = np.asarray(items, dtype=np.intp)
  values = np.asarray(ratings, dtype=float)
  if not (user_idx.shape == item_idx.shape == values.shape):
    raise ValueError("users, items and ratings must have the same shape")

  # start from "nothing observed" and fill in the known entries in one vectorized step;
  # each (user, item) pair is expected to appear at most once
  ratings_matrix = np.full((n_users, n_items), np.nan)
  ratings_matrix[user_idx, item_idx] = values

  observed_mask = np.zeros((n_users, n_items), dtype=bool)
  observed_mask[user_idx, item_idx] = True

  return ratings_matrix, observed_mask

### Model function declarations

In [None]:
def RMSE(x: np.ndarray, y: np.ndarray, mask: np.ndarray) -> float:
  """
  Root mean squared error between x and y restricted to the entries selected by mask.

  NaN contributions (e.g. from unobserved entries) are skipped in the sum, while the
  normalisation is the sum of the mask.

  :param x: first matrix
  :param y: second matrix, same shape as x
  :param mask: mask of the entries that count towards the error
  :return: the masked RMSE
  """
  squared_error = (x - y) ** 2
  masked_total = np.nansum(mask * squared_error)
  n_counted = np.sum(mask)
  return np.sqrt(masked_total / n_counted)


def ALS(
    data: np.ndarray,
    mask: np.ndarray,
    U: np.ndarray,
    VT: np.ndarray,
    n_latent_factors: int,
    regularization_param: float,
    n_iterations: int
) -> np.ndarray:
  """
  Refine a low-rank factorisation U @ VT of the observed ratings with alternating least squares.

  Each iteration first re-solves the regularised least squares problem of every row of U
  (with VT fixed) and then of every column of VT (with U fixed). Only entries selected by
  the mask enter the normal equations, so data may hold NaN at unobserved positions.

  NOTE: U and VT are updated in place.

  :param data: ratings matrix of shape (n_users, n_items)
  :param mask: observed ratings mask with the same shape as data
  :param U: initial user factors of shape (n_users, n_latent_factors), modified in place
  :param VT: initial item factors of shape (n_latent_factors, n_items), modified in place
  :param n_latent_factors: number of latent factors
  :param regularization_param: weight of the ridge term added to the normal equations
  :param n_iterations: number of full sweeps over U and VT
  :return: the reconstructed matrix U @ VT
  """
  observed = np.asarray(mask, dtype=bool)
  regularizer = regularization_param * np.eye(n_latent_factors)

  # one progress tick per solved row of U and per solved column of VT
  with tqdm(total=n_iterations * (observed.shape[0] + observed.shape[1])) as pbar:
    rmse_score = RMSE(data, U @ VT, mask)
    pbar.set_description(f"Initial RMSE score is {rmse_score:.4f}")
    for iteration in range(n_iterations):
      for i, rated in enumerate(observed):
        # Select the observed items instead of multiplying by diag(mask): the product
        # would evaluate 0 * NaN = NaN for unobserved entries and build an n x n matrix.
        V_obs = VT[:, rated]
        U[i] = np.linalg.solve(
            V_obs @ V_obs.T + regularizer,
            V_obs @ data[i, rated]
        )
        pbar.update(1)

      for j, rated in enumerate(observed.T):
        U_obs = U[rated]
        VT[:, j] = np.linalg.solve(
            U_obs.T @ U_obs + regularizer,
            U_obs.T @ data[rated, j]
        )
        pbar.update(1)

      rmse_score = RMSE(data, U @ VT, mask)
      pbar.set_description(f"At iteration #{iteration + 1} the RMSE score is {rmse_score:.4f}")

  return U @ VT

## Run Improved SVD + ALS model

Declare parameters for Improved SVD and ALS. These parameters were found to be optimal on the standalone Improved SVD method and the standalone baseline method:

In [None]:
# Number of latent factors; used both for the Surprise SVD fit and for ALS
n_factors = 3
# Learning rate passed to Surprise SVD as lr_all
svd_lr_all = 0.005
# Regularization term passed to Surprise SVD as reg_all
svd_reg_all = 0.1
# Number of training epochs of Surprise SVD
svd_n_epochs = 100
# Regularization parameter of the ALS normal equations
als_reg = 0.1
# Number of ALS iterations (each one updates U and then VT)
als_n_iterations = 20

Parse the data from the data_train.csv file and construct the ratings matrix and mask:

In [None]:
# read the zero-based user indices, item indices and ratings of the training set
users, items, preds = parse_csv("data_train.csv")
# build the dense ratings matrix (NaN where no rating was observed) and the mask of observed entries
ratings, mask = construct_ratings_matrix(users, items, preds, total_num_users, total_num_movies)

Fit the Improved SVD model with the optimal parameters and the entire data set:

In [None]:
# collect the training triplets in a dataframe so they can be handed to Surprise
ratings_dict = {'itemID': items, 'userID': users, 'rating': preds}
df = pd.DataFrame(ratings_dict)
# NOTE(review): the declared scale is (0.5, 5.5) - confirm that this matches the rating range of the data
reader = Reader(rating_scale=(0.5, 5.5))
# the columns are selected in the order user, item, rating
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
# for the submission the full set is set as the trainset
trainset = data.build_full_trainset()
# init SVD with the best params found in the param tuning and with the n_factors optimal for regular SVD + ALS
# (random_state is fixed so that the fit is reproducible)
algo = SVD(n_factors=n_factors, n_epochs=svd_n_epochs, lr_all=svd_lr_all, reg_all=svd_reg_all, random_state=1234)
# Train the algorithm on the trainset
algo.fit(trainset)

Extract $U$ and $V^T$ from the fitted Improved SVD model and input them to the ALS method:

In [None]:
# Surprise stores the learned factors by its own *inner* ids, which are assigned in
# order of first appearance in the training data and therefore need not coincide with
# the raw (zero-based) user/item indices that address the rows/columns of `ratings`.
# Map every inner id back to its raw index so the factors line up with the ratings matrix.
user_rows = [trainset.to_raw_uid(inner_uid) for inner_uid in trainset.all_users()]
item_cols = [trainset.to_raw_iid(inner_iid) for inner_iid in trainset.all_items()]

# Users/items without any training rating keep all-zero factors.
U = np.zeros((total_num_users, n_factors))
U[user_rows] = algo.pu
VT = np.zeros((n_factors, total_num_movies))
VT[:, item_cols] = algo.qi.T

In [None]:
# ALS updates U and VT in place, so re-running only this cell continues from the refined factors
pred_ratings = ALS(ratings, mask, U, VT, n_factors, als_reg, als_n_iterations)
# the score is measured on the same observed ratings the model was fitted on, i.e. it is a training error
print(f"Training RMSE: {RMSE(pred_ratings, ratings, mask)}")

## Create submission file

Extract the user and item ids needed for the submission file from sampleSubmission.csv, and use the fitted model to predict their ratings. Save the ids and predictions to a .csv file for submission.

In [None]:
# read which (user, item) pairs have to be predicted; the ratings column of the sample file is ignored
pred_users, pred_items, _ = parse_csv('sampleSubmission.csv')
# rebuild the one-based ids of the agreed format and look up all predictions in a single indexing step
submission_ids = [f"r{r + 1}_c{c + 1}" for r, c in zip(pred_users, pred_items)]
df = pd.DataFrame({
    "Id": submission_ids,
    "Prediction": pred_ratings[pred_users, pred_items]
})
df.to_csv("baseline_improved_svd_submission.csv", index=False)