# Description

This notebook includes the necessary code to run the baseline solution for the collaborative filtering Kaggle competition for the CIL 2022 course.


# Preprocess

## Imports

In [1]:
import os
import zipfile
from typing import Tuple

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

## Download data

In [2]:
!pip install kaggle

!mkdir ~/.kaggle

import json

kaggle_username = "yuvalnis" #@param {type:"string"}
kaggle_api_key = "1800d5a286834f0416c338c7bd7f6dee" #@param {type:"string"}

assert len(kaggle_username) > 0 and len(kaggle_api_key) > 0

api_token = {"username": kaggle_username,"key": kaggle_api_key}

with open('kaggle.json', 'w') as file:
    json.dump(api_token, file)

!mv kaggle.json ~/.kaggle/kaggle.json

!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c cil-collaborative-filtering-2022

!unzip -n cil-collaborative-filtering-2022.zip

Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
Collecting requests
  Downloading requests-2.27.1-py2.py3-none-any.whl (63 kB)
Collecting python-slugify
  Downloading python_slugify-6.1.2-py2.py3-none-any.whl (9.4 kB)
Collecting urllib3
  Using cached urllib3-1.26.10-py2.py3-none-any.whl (139 kB)
Collecting text-unidecode>=1.3
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
Collecting idna<4,>=2.5
  Using cached idna-3.3-py3-none-any.whl (61 kB)
Collecting charset-normalizer~=2.0.0
  Downloading charset_normalizer-2.0.12-py3-none-any.whl (39 kB)
Collecting importlib-resources
  Using cached importlib_resources-5.4.0-py3-none-any.whl (28 kB)
Collecting zipp>=3.1.0
  Using cached zipp-3.6.0-py3-none-any.whl (5.3 kB)
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py): started
  Building wheel for kaggle (setup.py): finished with status 'done'
  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73051 sha256=

The syntax of the command is incorrect.
'mv' is not recognized as an internal or external command,
operable program or batch file.
'chmod' is not recognized as an internal or external command,
operable program or batch file.
Traceback (most recent call last):
  File "C:\Users\yuval\anaconda3\envs\cil-runtime-terror\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\yuval\anaconda3\envs\cil-runtime-terror\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\yuval\anaconda3\envs\cil-runtime-terror\Scripts\kaggle.exe\__main__.py", line 4, in <module>
  File "C:\Users\yuval\anaconda3\envs\cil-runtime-terror\lib\site-packages\kaggle\__init__.py", line 23, in <module>
    api.authenticate()
  File "C:\Users\yuval\anaconda3\envs\cil-runtime-terror\lib\site-packages\kaggle\api\kaggle_api_extended.py", line 166, in authenticate
    self.config_file, self.config_dir))
OSError: Could not find kaggle.json. Make sure it's located 

## Declare global parameters of data

In [3]:
# dimensions of the full ratings matrix: one row per user, one column per movie
total_num_users = 10_000
total_num_movies = 1_000

## Data parsing helper function declarations

In [4]:
def parse_csv(csv_path: str) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
  """
  Read a ratings .csv file and split it into user indices, item indices and ratings

  The file must provide an ``Id`` column whose labels contain ``r<user>_c<item>``
  and a ``Prediction`` column. The numbers in the labels are shifted down by one,
  so a label ``r1_c1`` maps to user index 0 and item index 0.

  :param csv_path: path to the .csv file to read from
  :return: 3 arrays, in file order: the user indices, the item indices and the values of the Prediction column
  """
  table = pd.read_csv(csv_path)
  # split every "r<user>_c<item>" label into its two numbers and shift them down by one
  id_pattern = r"r(?P<User>\d+)_c(?P<Item>\d+)"
  shifted_indices = table["Id"].str.extract(id_pattern).astype(int) - 1
  table = table.join(shifted_indices)
  return table["User"].values, table["Item"].values, table["Prediction"].values

def construct_ratings_matrix(
    users: np.ndarray,
    items: np.ndarray,
    ratings: np.ndarray,
    n_users: int,
    n_items: int
) -> Tuple[np.ndarray, np.ndarray]:
  """
  Constructs the ratings matrix with NaN values where no rating was observed

  :param users: array of user (row) indices
  :param items: array of item (column) indices
  :param ratings: array of ratings per user-item pair
  :param n_users: total number of users
  :param n_items: total number of items
  :return: the (n_users, n_items) ratings matrix and the boolean mask of observed ratings
  """
  # start from "nothing observed" and fill in the observed entries in one vectorized step
  ratings_matrix = np.full((n_users, n_items), np.nan)
  observed_mask = np.zeros((n_users, n_items), dtype=bool)

  rows, cols = np.asarray(users), np.asarray(items)
  # an empty index array carries no integer dtype and cannot be used for indexing,
  # so the assignment is skipped when there are no ratings at all
  if rows.size > 0:
    # each (user, item) pair is expected to appear at most once
    ratings_matrix[rows, cols] = ratings
    observed_mask[rows, cols] = True

  return ratings_matrix, observed_mask

## Model function declarations

In [5]:
def normalize(data: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
  """
  Standardizes every column (item) of the input matrix using only its non-NaN entries

  Each column is shifted by its mean and divided by its standard deviation, both
  computed while ignoring NaN values. Entries that are NaN after this step are
  replaced by zero. The input matrix itself is left unmodified.

  :param data: matrix to normalize, with NaN marking the non-observed entries
  :return: the normalized matrix, the column-wise mean and the column-wise standard deviation
           (the latter two with shape (1, number of columns))
  """
  # NaN-aware column statistics, kept 2-D so that they broadcast against the matrix
  column_mean = np.nanmean(data, axis=0, keepdims=True)
  column_std = np.nanstd(data, axis=0, keepdims=True)
  centered = data - column_mean
  standardized = centered / column_std
  # NaN entries carry no information and are replaced by 0
  is_missing = np.isnan(standardized)
  standardized[is_missing] = 0
  return standardized, column_mean, column_std


def denormalize(data: np.ndarray, mean: np.ndarray, std: np.ndarray) -> np.ndarray:
  """
  Maps a normalized matrix back to the original scale, i.e. undoes the scaling and shifting done by normalize

  :param data: matrix to denormalize
  :param mean: the column-wise mean of the unnormalized matrix
  :param std: the column-wise std of the unnormalized matrix
  :return: the denormalized matrix
  """
  rescaled = std * data
  return mean + rescaled


def SVD(A: np.ndarray, k: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
  """
  Computes the singular value decomposition of the input matrix, projected on
  the subspace defined by the k largest singular values, as a 3-tuple

  :param A: matrix to decompose
  :param k: number of the largest singular values to keep (0 <= k <= min(A.shape))
  :return: the 3-tuple (U_k, S_k, VT_k): the first k left singular vectors as columns,
           the k x k diagonal matrix of the k largest singular values and the first k
           right singular vectors as rows, so that U_k @ S_k @ VT_k is the rank-k approximation of A
  """
  # a negative k would silently slice from the end instead of failing
  assert(k >= 0), "k should be non-negative."
  assert(k <= min(A.shape)), "k should be no greater than the number of users or items."
  # np.linalg.svd returns the singular values in descending order, so the first k are the largest
  U, S, VT = np.linalg.svd(A, full_matrices=False)
  return U[:, :k], np.diag(S[:k]), VT[:k, :]


def MSE(x: np.ndarray, y: np.ndarray, mask: np.ndarray) -> float:
  """
  Mean squared error between two matrices over the entries selected by a mask

  The squared differences are weighted by the mask and summed while ignoring
  NaN terms; the sum is then divided by the sum of the mask.

  :param x: first matrix
  :param y: second matrix
  :param mask: matrix marking the entries that take part in the error
  :return: the masked mean squared error
  """
  masked_squared_error = mask * (x - y) ** 2
  n_selected = np.sum(mask)
  return np.nansum(masked_squared_error) / n_selected


def RMSE(x: np.ndarray, y: np.ndarray, mask: np.ndarray) -> float:
  """
  Root mean squared error between two matrices over the entries selected by a mask

  :param x: first matrix
  :param y: second matrix
  :param mask: matrix marking the entries that take part in the error
  :return: the square root of MSE(x, y, mask)
  """
  mean_squared_error = MSE(x, y, mask)
  return np.sqrt(mean_squared_error)


def ALS(
    data: np.ndarray,
    mask: np.ndarray,
    U: np.ndarray,
    VT: np.ndarray,
    n_latent_factors: int,
    regularization_param: float,
    n_iterations: int
) -> np.ndarray:
  """
  Refines a low-rank factorization U @ VT of the data matrix with alternating least squares

  Every iteration first re-solves each row of U (one regularized linear system
  per row of the mask, with VT held fixed) and then each column of VT (one
  system per column of the mask, with U held fixed). The entries of the data
  are weighted by the mask in these systems. U and VT are updated in place.
  The masked MSE is shown in the progress bar after every iteration and the
  list of all recorded MSE values is printed at the end.

  :param data: the matrix to approximate; its entries are multiplied by the mask
               weights, so entries outside the mask should still be finite
  :param mask: the matrix of observed entries, with the same shape as data
  :param U: initial left factor with one row per row of data and n_latent_factors columns
  :param VT: initial right factor with n_latent_factors rows and one column per column of data
  :param n_latent_factors: the number of latent factors (size of the regularization matrix)
  :param regularization_param: the regularizer (lambda) added to the diagonal of every system
  :param n_iterations: the number of ALS iterations to do
  :return: the product U @ VT of the refined factors
  """
  mse_history = list()
  regularizer = regularization_param * np.eye(n_latent_factors)

  with tqdm(total=n_iterations * (np.sum(mask.shape))) as pbar:
    mse_loss = MSE(data, U @ VT, mask)
    mse_history.append(mse_loss)
    pbar.set_description(f"Initial MSE loss is {mse_loss:.4f}")
    for iteration in range(n_iterations):
      # update the row factors while the column factors are fixed
      for i, Ri in enumerate(mask):
        # VT * Ri scales column c of VT by Ri[c]; this equals VT @ diag(Ri)
        # without materializing the dense diagonal matrix
        weighted_VT = VT * Ri
        U[i] = np.linalg.solve(
            weighted_VT @ VT.T + regularizer,
            weighted_VT @ data[i]
        )
        pbar.update(1)

      # update the column factors while the row factors are fixed
      for j, Rj in enumerate(mask.T):
        # U.T * Rj equals U.T @ diag(Rj), again without the dense diagonal matrix
        weighted_UT = U.T * Rj
        VT[:,j] = np.linalg.solve(
            weighted_UT @ U + regularizer,
            weighted_UT @ data[:, j]
        )
        pbar.update(1)

      mse_loss = MSE(data, U @ VT, mask)
      mse_history.append(mse_loss)
      pbar.set_description(f"At iteration #{iteration + 1} the MSE loss is {mse_loss:.4f}")

  print(mse_history)
  return U @ VT


def baseline(
    data: np.ndarray,
    mask: np.ndarray,
    n_latent_factors: int,
    regularization_param: float,
    n_iterations: int
) -> np.ndarray:
  """
  Runs the complete baseline pipeline: normalize, initialize factors with SVD,
  refine them with ALS and map the result back to the original scale

  :param data: the ratings matrix
  :param mask: the matrix of observed ratings
  :param n_latent_factors: the number of latent factors kept from the SVD and used in ALS
  :param regularization_param: the regularizer (lambda) to use in the ALS steps
  :param n_iterations: the number of ALS iterations to do
  :return: the predicted ratings matrix
  """
  normalized, column_mean, column_std = normalize(data)
  # the singular values are not needed: ALS only takes the two factor matrices
  initial_U, _, initial_VT = SVD(normalized, n_latent_factors)
  normalized_predictions = ALS(
      normalized, mask, initial_U, initial_VT,
      n_latent_factors, regularization_param, n_iterations
  )
  return denormalize(normalized_predictions, column_mean, column_std)

# Run baseline model

## With original parameters

Declare the baseline parameters used to set the Kaggle competition baseline score.

In [None]:
# number of latent factors used by the SVD initialization and the ALS steps
n_latent_factors = 3
# regularizer (lambda) used in the ALS steps
regularization_param = 0.1
# number of ALS iterations
n_iterations = 20

In [None]:
# load the (user, item, rating) triplets of the training set
# NOTE(review): the tuning section reads '../data/data_train.csv' instead - confirm which location is intended
users, items, preds = parse_csv('data_train.csv')
# arrange the triplets in a users x movies matrix together with the mask of observed entries
ratings, mask = construct_ratings_matrix(
    users=users,
    items=items,
    ratings=preds,
    n_users=total_num_users,
    n_items=total_num_movies
)
# predict the full ratings matrix with the baseline parameters declared above
pred_ratings = baseline(
    data=ratings,
    mask=mask,
    n_latent_factors=n_latent_factors,
    regularization_param=regularization_param,
    n_iterations=n_iterations
)

  0%|          | 0/220000 [00:00<?, ?it/s]

[0.999423166170458, 0.8762300439507353, 0.8689951764637153, 0.8671804213898717, 0.8664447326783178, 0.8660571052735219, 0.8658203474337604, 0.86566233753299, 0.865550555698655, 0.8654681178605266, 0.8654053686214906, 0.8653563920780475, 0.8653173713858464, 0.8652857424697243, 0.8652597255506597, 0.8652380511540986, 0.8652197926163971, 0.8652042598842655, 0.8651909300665487, 0.8651794007979741, 0.8651693581882476]


## Create submission file

In [None]:
# the sample submission lists the (user, item) pairs for which a prediction is requested
pred_users, pred_items, _ = parse_csv('sampleSubmission.csv')
df = pd.DataFrame({
    # the Id labels are 1-based, the parsed indices are 0-based
    "Id": [f"r{r + 1}_c{c + 1}" for r, c in zip(pred_users, pred_items)],
    # look up all requested predictions in a single vectorized indexing step
    "Prediction": pred_ratings[pred_users, pred_items]
})
# make sure the output directory exists; to_csv does not create it
os.makedirs("../results", exist_ok=True)
df.to_csv(f"../results/baseline_reg-{regularization_param:.1f}_k-{n_latent_factors}_iters-{n_iterations}_submission.csv", index=False)

In [None]:
# make sure the output directory exists; np.savetxt does not create it
os.makedirs("../results", exist_ok=True)
# store the complete predicted ratings matrix as text
np.savetxt(f"../results/baseline_reg-{regularization_param:.1f}_k-{n_latent_factors}_iters-{n_iterations}_preds.dat", pred_ratings)

# Baseline model parameter tuning

We tune the parameters of the baseline model using the grid search method over pairs of values for the number of latent factors used in SVD and ALS, and the regularization parameter used in ALS. 

In [6]:
users, items, preds = parse_csv('../data/data_train.csv')
# hold out a random 10% of the observed ratings (fixed seed) as the test set for the parameter tuning
(
    train_users, test_users,
    train_items, test_items,
    train_preds, test_preds,
) = train_test_split(users, items, preds, test_size=0.1, random_state=42)
# build the ratings matrix and the observed mask of the train set ...
train_ratings, train_mask = construct_ratings_matrix(
    users=train_users,
    items=train_items,
    ratings=train_preds,
    n_users=total_num_users,
    n_items=total_num_movies
)
# ... and of the held-out test set
test_ratings, test_mask = construct_ratings_matrix(
    users=test_users,
    items=test_items,
    ratings=test_preds,
    n_users=total_num_users,
    n_items=total_num_movies
)

In [None]:
# declare possible parameter values to test
# NOTE(review): the first regularization value is 0.0, which leaves the ALS systems
# unregularized; np.linalg.solve raises LinAlgError if one of them is singular - confirm this value is intended
reg_param_vals = np.arange(0.0, 1.0, 0.1)
latent_dim_vals = range(2, 22, 2)
n_reg_vals = len(reg_param_vals)
n_latent_dim_vals = len(latent_dim_vals)
n_iterations = 20
# iterate over all tunable parameter pairs and find their score on the test set
# the tables start out as NaN so that np.nanmin below only sees scores that were actually computed
train_rmse_scores = np.full((n_reg_vals, n_latent_dim_vals), np.nan)
test_rmse_scores = np.full((n_reg_vals, n_latent_dim_vals), np.nan)
with tqdm(total=n_reg_vals * n_latent_dim_vals) as pbar:
  for i, reg in enumerate(reg_param_vals):
    for j, latent_dim in enumerate(latent_dim_vals):
      # compute predicted ratings with parameter pair
      pred_ratings = baseline(train_ratings, train_mask, latent_dim, reg, n_iterations)
      # compute and report RMSE train and test scores
      train_rmse = RMSE(pred_ratings, train_ratings, train_mask)
      test_rmse = RMSE(pred_ratings, test_ratings, test_mask)
      print(f"Regularizing parameter: {reg:.1f}, " \
            f"number of latent factors: {latent_dim}, " \
            f"train RMSE score: {train_rmse:.4f}, " \
            f"test RMSE score: {test_rmse:.4f}")
      # save RMSE scores in tables
      train_rmse_scores[i, j] = train_rmse
      test_rmse_scores[i, j] = test_rmse
      # name the file after the pair that produced it; this writes one full
      # prediction matrix per pair, so make sure enough disk space is available
      np.savetxt(f"baseline_reg-{reg:.1f}_k-{latent_dim}_iters-{n_iterations}_validation.dat", pred_ratings)
      pbar.update(1)
      pbar.set_description(f"Minimal RMSE score on validation set is {np.nanmin(test_rmse_scores):.4f}")

# locate the pair with the lowest test RMSE: row index -> regularization value, column index -> latent dimension
min_rmse_idxs = np.unravel_index(np.argmin(test_rmse_scores, axis=None), test_rmse_scores.shape)
best_reg_idx, best_latent_dim_idx = min_rmse_idxs
print(f"The optimal regularization parameter is {reg_param_vals[best_reg_idx]:.1f}")
print(f"The optimal number of latent factors is {latent_dim_vals[best_latent_dim_idx]}")

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/220000 [00:00<?, ?it/s]