# Setting up Colab environment

In [None]:
import os
username = 'SkoltechAI'
repo = 'Recommender-Systems-Intro-Sber-2022'

# remove local directory if it already exists
if os.path.isdir(repo):
    !rm -rf {repo}

!git clone https://github.com/{username}/{repo}.git

In [None]:
!pip install --no-cache-dir --upgrade git+https://github.com/evfro/polara.git@develop#egg=polara

In [2]:
from heapq import nlargest

import numpy as np
from scipy.sparse import diags, csr_matrix
from scipy.sparse.linalg import norm as spnorm
from scipy.sparse.linalg import svds

from polara import get_movielens_data
from polara.preprocessing.dataframes import leave_one_out, reindex

%cd {repo} # navigating to cloned repo directory in Colab
from dataprep import transform_indices
from evaluation import topn_recommendations, model_evaluate, downvote_seen_items
%cd - # restoring original location

# Preparing data

In [4]:
# Load the MovieLens ratings through polara. include_time=True is expected to add
# the `timestamp` column, which the holdout split uses as its target.
# NOTE(review): the dataset version fetched depends on polara's defaults — confirm.
data = get_movielens_data(include_time=True)

In [5]:
# Split the data into a training part and a holdout part with polara's leave_one_out,
# using `timestamp` as the target column and a fixed random_state for reproducibility.
# NOTE(review): with sample_top=True this presumably holds out each user's most recent
# interaction — confirm against the polara documentation.
training_, holdout_ = leave_one_out(data, target='timestamp', sample_top=True, random_state=0)

In [6]:
# Build the internal user/item indexing from the training data; `data_index`
# keeps the resulting index objects under the 'users' and 'items' keys.
# NOTE(review): presumably maps raw ids to contiguous integer indices — confirm in dataprep.py.
training, data_index = transform_indices(training_, 'userid', 'movieid')

# Express the holdout in the same index space (entries that cannot be mapped
# are filtered out), then order it by user.
holdout = (
    reindex(holdout_, data_index.values(), filter_invalid=True)
    .sort_values('userid')
)

Filtered 2 invalid observations.


In [7]:
# Shared metadata consumed by the model-building and evaluation helpers:
# column names, matrix dimensions and the users that have a holdout entry.
data_description = {
    'users': data_index['users'].name,
    'items': data_index['items'].name,
    'feedback': 'rating',
    'n_users': len(data_index['users']),
    'n_items': len(data_index['items']),
    'test_users': holdout[data_index['users'].name].drop_duplicates().values,
}
data_description

{'users': 'userid',
 'items': 'movieid',
 'feedback': 'rating',
 'n_users': 6040,
 'n_items': 3704,
 'test_users': array([   0,    1,    2, ..., 6037, 6038, 6039], dtype=int64)}

In [8]:
# Name of the user column, as recorded in the shared description.
userid = data_description['users']
# Training interactions of the test users only; `@data_description` inside the
# query string refers to the notebook-level variable of that name.
seen_data = training.query(f'{userid} in @data_description["test_users"]')

# PureSVD

In [10]:
def matrix_from_observations(data, data_description):
    """Build a sparse user-item matrix from a table of observations.

    Args:
        data: table (e.g. a pandas DataFrame) holding the user-index,
            item-index and feedback columns named in `data_description`.
        data_description: dict with the column names under the 'users',
            'items' and 'feedback' keys. When it also provides both 'n_users'
            and 'n_items', they fix the shape of the result.

    Returns:
        scipy.sparse.csr_matrix of dtype float64 with users along rows and
        items along columns.

    Raises:
        ValueError: (from scipy) if an index in `data` does not fit into the
            declared `(n_users, n_items)` shape.
    """
    useridx = data[data_description['users']]
    itemidx = data[data_description['items']]
    values = data[data_description['feedback']]
    # Without an explicit shape scipy infers it from the largest index present,
    # so a subset of the data that lacks the highest-index users/items would
    # produce a smaller matrix that no longer aligns with the item factors.
    n_users = data_description.get('n_users')
    n_items = data_description.get('n_items')
    if n_users is not None and n_items is not None:
        shape = (n_users, n_items)
    else:
        shape = None  # fall back to inferring the shape from the data
    return csr_matrix((values, (useridx, itemidx)), shape=shape, dtype='f8')

def build_svd_model(config, data, data_description):
    """Build the PureSVD model from training observations (exercise stub).

    The body is intentionally incomplete: the `...` placeholder has to be
    replaced with code that defines `item_factors` and `singular_values`.
    Until then, calling this function raises NameError at the return statement.

    Args:
        config: model settings; this notebook passes a dict with a 'rank' key
            (presumably the number of latent factors to compute — confirm).
        data: observations forwarded to `matrix_from_observations`.
        data_description: metadata forwarded to `matrix_from_observations`.

    Returns:
        Tuple `(item_factors, singular_values)`. `svd_model_scoring` multiplies
        a user-item matrix by `item_factors` and then by its transpose, so
        `item_factors` is expected to have one row per item.
    """
    # sparse user-item matrix to be factorized
    source_matrix = matrix_from_observations(data, data_description)
    # placeholder to be filled in: compute `item_factors` and `singular_values` from `source_matrix`
    ... # <- your code here
    return item_factors, singular_values


def svd_model_scoring(params, data, data_description):
    """Score all items for the test users with a fitted PureSVD model.

    Args:
        params: pair `(item_factors, singular_values)`; only the item factors
            take part in the computation.
        data: not used by this function.
        data_description: dict providing the training matrix under
            'train_matrix' and the row indices of the test users under
            'test_users'.

    Returns:
        The test users' rows of the training matrix multiplied by
        `item_factors` and then by its transpose: one row of item scores
        per test user.
    """
    item_factors, _ = params
    # rows of the training matrix that belong to the test users
    test_rows = data_description['train_matrix'][data_description['test_users'], :]
    # project into the latent space, then map back onto the items
    latent = test_rows.dot(item_factors)
    return latent @ item_factors.T

In [11]:
# PureSVD settings
svd_config = dict(rank=40)
# keep the training matrix in the shared description: the scoring function reads it from there
data_description.update(
    train_matrix=matrix_from_observations(training, data_description)
)

# fit the model, then score the test users
# (the scoring function does not use its `data` argument, hence None)
svd_params = build_svd_model(svd_config, training, data_description)
svd_scores = svd_model_scoring(svd_params, None, data_description)

In [12]:
# Exclude items the test users have already interacted with in training from
# being recommended.
# NOTE(review): the return value is not captured, so this presumably modifies
# `svd_scores` in place — confirm in evaluation.py.
downvote_seen_items(svd_scores, seen_data, data_description)

In [13]:
# Select the 10 top-scoring items for every test user ...
svd_recs = topn_recommendations(svd_scores, topn=10)
# ... and compare them against the holdout; the result is displayed as the cell output.
# NOTE(review): the call returns a 3-tuple of metrics; which metrics they are
# is defined in evaluation.py — confirm.
model_evaluate(svd_recs, holdout, data_description)

(0.08810864524677045, 0.03138364432667182, 0.25323974082073436)

# Scaled SVD


Implement data normalization with a scaling factor that reduces the effect of item popularity.