# Setting up Colab environment

In [None]:
import os
# GitHub account and repository to clone into the Colab runtime
username = 'SkoltechAI'
repo = 'Recommender-Systems-Intro-Sber-2022'

# remove local directory if it already exists,
# so that the clone below starts from a clean state when this cell is re-run
if os.path.isdir(repo):
    !rm -rf {repo}

# clone the repository into the current working directory
!git clone https://github.com/{username}/{repo}.git

In [None]:
# install (or upgrade to) the `develop` branch of polara straight from GitHub, bypassing the pip cache
!pip install --no-cache-dir --upgrade git+https://github.com/evfro/polara.git@develop#egg=polara

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from polara.preprocessing.dataframes import reindex, leave_one_out

# navigating to cloned repo directory in Colab
# (`repo` is defined in the cloning cell above); the two imports below
# presumably resolve to modules stored in that directory
%cd {repo}
from dataprep import transform_indices, matrix_from_data
from evaluation import topn_recommendations, model_evaluate, downvote_seen_items
# switch back to the previous working directory
%cd -

# Setup Kaggle

You will need to provide a `kaggle.json` file after running the cell below! This file contains the information necessary to use the Kaggle API under your account.

If you don't have this file, navigate to https://www.kaggle.com. Then go to the `Account` tab of your user profile and select `Create API Token`. This will trigger the download of `kaggle.json` to your local machine.

Then run the cell below to upload your local `kaggle.json` to the Colab runtime.

In [None]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

In [None]:
competition_name = 'recommender-systems-course-competition-sber2022'
competition_data = f'{competition_name}.zip'
!kaggle competitions download -c {competition_name}
!unzip {competition_data}

# Preparing Data

By now you should have two files available in the Colab runtime:
- `train` - a sample of user ratings from the `MovieLens 10M` dataset to train models,
- `test` - a sample of warm-start users' ratings from the same dataset.

**Important**: You must NOT use `test` data for training. Only use it for generating recommendations!

In [5]:
# Read both competition files as they are; the trailing underscore marks the
# frames that have not gone through any index transformation yet.
training_, testset_ = map(pd.read_csv, ('train', 'test'))

# Simple example with Kaggle submission

In [8]:
from scipy.sparse.linalg import svds

In [13]:
# Transform the user and item ids of the training data with the project helper.
# A copy is passed in, presumably to keep the raw `training_` frame untouched.
# `data_index` provides the 'users' and 'items' indices used in the cells below.
training, data_index = transform_indices(training_.copy(), 'userid', 'movieid')

# we normalize warm-start users index independently of train
# (the item index, in contrast, is the one obtained from the training data)
warm_users_index = pd.Index(testset_['userid'].drop_duplicates(), name='userid') # index for warm-start users
testset = reindex(testset_, [warm_users_index, data_index['items']])

In [15]:
# Summary of the training data that is handed over to the model functions:
# column names for users and items, the name used for ordering,
# and the sizes of the user and item indices.
data_description = {
    'users': data_index['users'].name,
    'items': data_index['items'].name,
    'order': 'timestamp',
    'n_users': len(data_index['users']),
    'n_items': len(data_index['items']),
}
data_description

{'users': 'userid',
 'items': 'movieid',
 'order': 'timestamp',
 'n_users': 64680,
 'n_items': 9857}

## Generate recommendations

Using a simple PureSVD model for demonstration.

In [16]:
def build_svd_model(config, data, data_description):
    """
    Fit a truncated SVD on the interaction matrix built from `data`.

    Parameters
    ----------
    config : dict
        Model settings; `config['rank']` is the number of singular triplets to compute.
    data : object accepted by `matrix_from_data`
        Training interactions.
    data_description : dict
        Description of `data` that is forwarded to `matrix_from_data`.

    Returns
    -------
    tuple
        `(item_factors, singular_values)`: a C-contiguous array with one row per
        item and one column per component, and the singular values, both in the
        reverse of the order produced by `svds`.
    """
    interactions = matrix_from_data(data, data_description)
    # left singular vectors are not used, hence only 'vh' is requested
    _, sigma, right_vectors = svds(
        interactions,
        k=config['rank'],
        return_singular_vectors='vh',
    )
    # reverse the component order (svds is expected to return ascending
    # singular values - TODO confirm), then lay factors out as items x components
    item_factors = np.ascontiguousarray(np.flip(right_vectors, axis=0).T)
    singular_values = np.flip(sigma)
    return item_factors, singular_values

def svd_model_scoring(params, data, data_description):
    """
    Compute prediction scores for the users present in `data`.

    Parameters
    ----------
    params : tuple
        `(item_factors, singular_values)` pair; only the item factors are used.
    data : pandas.DataFrame
        Interactions of the users to score; must contain the users column
        named in `data_description['users']`.
    data_description : dict
        Data description; it is not modified, a copy with an adjusted
        `n_users` entry is passed to `matrix_from_data` instead.

    Returns
    -------
    Scores obtained by projecting the test matrix onto the item factors
    and mapping the result back to the item space.
    """
    users_field = data_description['users']
    # the description must carry the number of *test* users, otherwise the
    # scoring matrix would not get the correct shape
    scoring_description = dict(
        data_description,
        n_users=data[users_field].nunique(),
    )
    test_matrix = matrix_from_data(data, scoring_description)
    # singular values are not needed to generate the scores
    item_factors, _ = params
    latent_profiles = test_matrix.dot(item_factors)
    return latent_profiles @ item_factors.T

def get_svd_recommendations(config, train_data, test_data, data_description, topn=10):
    """
    Train the SVD model and produce top-n recommendations for the test users.

    The pipeline is: build the model on `train_data`, score `test_data`,
    pass the scores through `downvote_seen_items`, and select the `topn`
    items with `topn_recommendations`.
    """
    model_params = build_svd_model(config, train_data, data_description)
    relevance = svd_model_scoring(model_params, test_data, data_description)
    # the return value is not used, so `relevance` is presumably adjusted in place
    downvote_seen_items(relevance, test_data, data_description)
    recommendations = topn_recommendations(relevance, topn=topn)
    return recommendations
    

In [17]:
# Recommendations for the warm-start users from a rank-256 SVD model
# (default number of recommended items per user is used).
svd_recs = get_svd_recommendations(
    config={'rank': 256},
    train_data=training,
    test_data=testset,
    data_description=data_description,
)

## Submitting solution

In [20]:
def generate_solution(recs_array, test_users, itemidx):
    '''
    Function to prepare Kaggle submission based on the obtained recommendations.
    It converts internal index representation back to original index.

    Parameters
    ----------
    recs_array : array-like of shape (len(test_users), topn)
        Recommended items given as positions in `itemidx`; row `i` holds the
        recommendations for the user at position `i` of `test_users`.
    test_users : pandas.Index
        Original ids of the test users; its name is used as the user column name.
    itemidx : pandas.Index
        Original item ids; its name is used as the item column name.

    Returns
    -------
    pandas.DataFrame
        Long-format table with one (user, item) pair per row, where the
        recommendations of every user occupy consecutive rows.

    Raises
    ------
    ValueError
        If `recs_array` is not two-dimensional or its number of rows differs
        from the number of test users.
    '''
    # accept nested lists as well as arrays
    recs_array = np.asarray(recs_array)
    if recs_array.ndim != 2:
        raise ValueError(
            f'recs_array must be 2-dimensional, got {recs_array.ndim} dimension(s)'
        )
    n_rows, topn = recs_array.shape
    if n_rows != len(test_users):
        # fail early with an explicit message instead of an obscure
        # length-mismatch error at DataFrame construction time
        raise ValueError(
            f'recs_array has {n_rows} rows, but test_users has {len(test_users)} entries'
        )

    # positions in the item index -> original item ids
    rec_items = itemidx.values.take(recs_array)
    # repeat every user id once per recommended item to align with the flattened items
    user_column = np.repeat(test_users.values, topn)

    submission = pd.DataFrame({
        test_users.name: user_column,
        itemidx.name: rec_items.ravel()
    })
    return submission

In [22]:
# Map the recommendations back to the original user and item ids.
submission = generate_solution(
    recs_array=svd_recs,
    test_users=warm_users_index,
    itemidx=data_index['items'],
)

In [None]:
# store the submission as a CSV file without the DataFrame index column
submission_file = 'my_first_submission.csv'
submission.to_csv(submission_file, index=False)

# upload the file to the competition (`competition_name` is set in the download cell)
!kaggle competitions submit -c {competition_name} -f {submission_file} -m 'My First Submission'