# Setting up Colab environment

In [None]:
import os
# GitHub account and repository name used to build the clone URL below
username = 'recspert'
repo = 'ITP-RecSys-2024'

# remove local directory if it already exists, so the clone below starts from a clean state
if os.path.isdir(repo):
    !rm -rf {repo}

# clone the repository into the current working directory;
# later cells `%cd` into it to import its local modules
!git clone https://github.com/{username}/{repo}.git

Installing polara

In [None]:
# install (or upgrade to) polara from the `develop` branch of its GitHub repository;
# NOTE(review): the branch is not pinned to a commit, so results may vary between runs
!pip install --no-cache-dir --upgrade git+https://github.com/evfro/polara.git@develop#egg=polara

In [None]:
import numpy as np
from scipy.sparse import diags, csr_matrix
from scipy.sparse.linalg import norm as spnorm
from scipy.sparse.linalg import svds

from polara import get_movielens_data

# navigating to cloned repo directory in Colab
%cd {repo}
from dataprep import transform_indices, leave_last_out, reindex_data
from evaluation import topn_recommendations, model_evaluate, downvote_seen_items
# restoring original location
%cd -

# Experiment description

You'll split data into 3 parts - training, validation and test.
- You will first use the training and validation parts to tune your models and find the optimal configuration.
- Once a set of optimal hyper-parameters is found, you'll need to recompute your models with it on the joint training+validation dataset and report final quality on the test data.

For the test data, you simply hold out the last item of each user. The remaining part forms the training+validation set. You then split that set once more in the same way to obtain the training and validation data used for tuning.

So the scheme is as follows:
1. Tune on the training and evaluate on the validation data. Find optimal config.
2. Retrain once on the training+validation data with the optimal config. Report final quality using the test (holdout) data.

# Preparing data

In [None]:
# load the MovieLens data through polara's helper;
# include_time=True presumably adds the timestamp needed for the "last item" splits below -- confirm
data = get_movielens_data(include_time=True)

## data splits

In [None]:
# final test data
training_validation_, holdout_ = leave_last_out(data)
# validation data
training_, validation_ = leave_last_out(training_validation_)

## reindexing

In [None]:
# the index is derived from the training part only; both held-out parts are mapped onto it
training, data_index = transform_indices(training_, 'userid', 'movieid')

# validation part: re-index against the training index, then order rows by user
# (filter_invalid=True presumably drops entries missing from the index -- confirm in dataprep)
validation = (
    reindex_data(validation_, data_index, filter_invalid=True)
    .sort_values('userid')
)

# final test part: same treatment as the validation part
holdout = (
    reindex_data(holdout_, data_index, filter_invalid=True)
    .sort_values('userid')
)

In [None]:
# summary of the dataset layout that is passed to the model and evaluation functions
data_description = {
    # column names of the user and item fields, taken from the index built on the training data
    'users': data_index['users'].name,
    'items': data_index['items'].name,
    # column that holds the feedback values
    'feedback': 'rating',
    # sizes of the user and item indices
    'n_users': len(data_index['users']),
    'n_items': len(data_index['items']),
    # unique user ids present in the validation split
    'test_users': validation[data_index['users'].name].drop_duplicates().values,
}
data_description

# PureSVD

In [None]:
def matrix_from_observations(data, data_description):
    """Build a sparse user-item matrix from observed interactions.

    Parameters
    ----------
    data : mapping of column name -> 1-D sequence (e.g. a pandas DataFrame)
        Must provide the user, item and feedback columns named in
        `data_description`. User and item values are used directly as
        row and column positions of the matrix.
    data_description : dict
        Required keys: 'users', 'items', 'feedback' (column names).
        Optional keys: 'n_users', 'n_items'. When both are present they
        fix the matrix shape to (n_users, n_items).

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of dtype float64 holding the feedback values.
    """
    useridx = data[data_description['users']]
    itemidx = data[data_description['items']]
    values = data[data_description['feedback']]

    # Without an explicit shape scipy infers it from the largest indices
    # present in `data`. For a subset of the observations (e.g. only the
    # history of test users) that can produce fewer columns than there are
    # items in the model, which breaks the product with the item factors.
    n_users = data_description.get('n_users')
    n_items = data_description.get('n_items')
    if n_users is not None and n_items is not None:
        shape = (n_users, n_items)
    else:
        # keep the old behaviour when the sizes are not provided
        shape = None
    return csr_matrix((values, (useridx, itemidx)), shape=shape, dtype='f8')

def build_svd_model(config, data, data_description):
    """Compute the PureSVD model parameters from the given interactions.

    This is an exercise stub: the factorization step is intentionally left
    as a placeholder, so `item_factors` and `singular_values` are undefined
    and calling the function as-is raises NameError on the return line.

    Parameters
    ----------
    config : dict
        Model configuration; the notebook passes {'rank': ...}.
    data : interactions passed on to `matrix_from_observations`.
    data_description : dict
        Column names (and sizes) describing `data`.

    Returns
    -------
    tuple
        (item_factors, singular_values). The scoring function multiplies
        a user-item matrix by `item_factors`, so it presumably has one row
        per item and one column per latent factor -- confirm when implementing.
    """
    source_matrix = matrix_from_observations(data, data_description)
    ... # <- your code here, mind that singular values must be sorted in decreasing order
    return item_factors, singular_values

def svd_model_scoring(params, data, data_description):
    """Score all items for the test users with a PureSVD model.

    The rows of the interaction matrix that belong to the test users are
    projected onto the item factors and then mapped back to item space.

    Parameters
    ----------
    params : tuple
        (item_factors, singular_values) as returned by `build_svd_model`;
        only the item factors are used here.
    data : interactions of the users to be scored.
    data_description : dict
        Must contain 'test_users' in addition to the column names needed
        by `matrix_from_observations`.

    Returns
    -------
    Matrix of scores with one row per entry of 'test_users'.
    """
    interactions = matrix_from_observations(data, data_description)
    user_rows = data_description['test_users']
    factors, _singular_values = params
    # project the users' interactions into the latent space ...
    latent_profiles = interactions[user_rows].dot(factors)
    # ... and map the result back onto the items
    return latent_profiles @ factors.T

In [None]:
svd_config = {'rank': 40}
userid = data_description['users']
# training history restricted to the users that appear in the validation split
seen_data = training.loc[training[userid].isin(data_description["test_users"])]

# fit on the full training part, score only the test users
svd_params = build_svd_model(svd_config, training, data_description)
svd_scores = svd_model_scoring(svd_params, seen_data, data_description)

In [None]:
# the return value is not captured, so this presumably modifies `svd_scores` in place
# to push already seen items out of the recommendations -- confirm in evaluation.py
downvote_seen_items(svd_scores, seen_data, data_description)

In [None]:
# pick the top-10 items per test user from the score matrix
svd_recs = topn_recommendations(svd_scores, topn=10)
# compare the recommendations against the validation split (tuning stage, not the final test)
model_evaluate(svd_recs, validation, data_description)