# RePlay recommender models comparison

### Dataset
We will compare RePlay models on __MovieLens 1m__. 

### Dataset preprocessing: 
Ratings greater than or equal to 3 are considered as positive interactions.

### Data split
The dataset is split by date so that the most recent 20% of interactions are placed in the test part. Cold items and users are dropped.

### Predict:
We will predict top-10 most relevant films for each user.

### Metrics
Quality metrics used: __ndcg@k, map@k, mrr@k__ for k = 10 and __hitrate@k__ for k = 1, 5, 10.
Additional metrics used: __coverage@k__ and __surprisal@k__.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%config Completer.use_jedi = False

In [55]:
import logging
import pandas as pd
import time


from pyspark.sql import functions as sf, types as st

from replay.data_preparator import DataPreparator
from replay.experiment import Experiment
from replay.metrics import Coverage, HitRate, MRR, MAP, NDCG, Surprisal
from replay.models import (
    ALSWrap, 
    ADMMSLIM, 
    ClassifierRec, 
    KNN, 
    LightFMWrap, 
    MultVAE, 
    NeuroMF, 
    SLIM, 
    Stack,
    PopRec, 
    RandomRec, 
    Wilson, 
    Word2VecRec
)
from replay.session_handler import State
from replay.splitters import DateSplitter
from replay.utils import get_log_info

In [4]:
# Logger named "replay"; the pipeline helper below uses it for progress messages.
logger = logging.getLogger("replay")

In [5]:
# Take the Spark session held by RePlay's State and show it as the cell output.
spark = State().session
spark

In [6]:
from logging import ERROR
# Only ERROR-and-above records pass from now on. The pipeline helpers in this
# notebook deliberately log their progress at error level, so those messages
# stay visible while lower-level library output is hidden.
State().logger.setLevel(ERROR)

In [7]:
K = 10  # number of items recommended per user; also the main metric cut-off
K_list_metrics = [1, 5, 10]  # cut-offs passed to HitRate in the Experiment
BUDGET = 20  # default `budget` for model.optimize() and Stack (presumably the number of search trials)
SEED = 12345  # seed handed to the models that accept one

## 0. Preprocessing <a name='data-preparator'></a>

### 0.1 Data loading

In [8]:
from rs_datasets import MovieLens

# Load MovieLens 1m and print a short preview of its ratings/users/items tables.
data = MovieLens("1m")
data.info()

ratings


Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968



users


Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117



items


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance





In [9]:
# Build the interaction log from the raw ratings table: the left-hand names
# are the log columns, the right-hand names are the source columns.
log = DataPreparator().transform(
    columns_names=dict(
        user_id="user_id",
        item_id="item_id",
        relevance="rating",
        timestamp="timestamp",
    ),
    data=data.ratings,
)
print(get_log_info(log))

total lines: 1000209, total users: 6040, total items: 3706


In [10]:
# Run the users table through the preparator, mapping only the id column.
users = DataPreparator().transform(
    columns_names=dict(user_id="user_id"),
    data=data.users,
)

In [11]:
# Only ratings >= 3 are treated as positive feedback; everything below 3 is
# kept in a separate log with its relevance overwritten by 0.
only_positives_log = log.where(sf.col('relevance') >= 3)
only_negatives_log = (
    log
    .where(sf.col('relevance') < 3)
    .withColumn('relevance', sf.lit(0.0))
)
(only_positives_log.count(), only_negatives_log.count())

(836478, 163731)

### 0.2. Data split

In [12]:
# Split the positive interactions by date into train and test: with
# test_start=0.2 the most recent 20% of interactions go to test, and cold
# users/items are dropped.
train_spl = DateSplitter(
    test_start=0.2,
    drop_cold_items=True,
    drop_cold_users=True,
)
train, test = train_spl.split(only_positives_log)
print('train info:\n', get_log_info(train))
print('test info:\n', get_log_info(test))

train info:
 total lines: 669181, total users: 5397, total items: 3569
test info:
 total lines: 86542, total users: 1139, total items: 3279


In [13]:
# Apply the same splitter to `train` to get the train/validation pair that
# is used for hyperparameter search; the cell shows both row counts.
opt_train, opt_val = train_spl.split(train)
opt_train.count(), opt_val.count()

(535343, 24241)

In [14]:
# Earliest timestamp present in the test split.
test_start = test.agg(sf.min('timestamp')).first()[0]
test_start

datetime.datetime(2000, 12, 2, 6, 8, 19)

In [15]:
# Training log that also carries negative feedback: every train row is
# relabelled with relevance 1, then the negative interactions (relevance 0)
# that happened before the test period are appended.
wilson_train = (
    train
    .withColumn('relevance', sf.lit(1))
    .union(only_negatives_log.filter(sf.col('timestamp') < test_start))
)
wilson_train.count()

798993

# 1. Metrics definition

In [16]:
# Experiment evaluated against the test split. Each metric is mapped to its
# cut-off(s): HitRate gets the whole list, the others only K. Coverage and
# Surprisal are additionally constructed from the train log.
e = Experiment(
    test,
    {
        MAP(): K,
        NDCG(): K,
        HitRate(): K_list_metrics,
        Coverage(train): K,
        Surprisal(train): K,
        MRR(): K,
    },
)

# 2. Models training

## 2.1. non-personalized models

In [17]:
# name -> [model, search space]. The string 'no_opt' marks a model that is
# fitted as-is, without hyperparameter search.
non_personalized_models = {
    'pop_rec': [PopRec(), 'no_opt'],
    'random_rec_un': [
        RandomRec(seed=SEED, distribution='uniform'),
        'no_opt',
    ],
    'random_rec_pop_based': [
        RandomRec(seed=SEED, distribution='popular_based'),
        {"alpha": [-0.5, 100]},
    ],
    'wilson': [Wilson(), 'no_opt'],
}

In [56]:
def fit_predict_add_res(name, model, experiment, train, suffix=''):
    """
    Fit ``model``, predict top-K items for the test users and record the result.

    Relies on the notebook globals ``K``, ``test`` and ``wilson_train``.

    :param name: model name; ``name + suffix`` is the row label in
        ``experiment.results``
    :param model: recommender exposing ``fit_predict(log, k, users)``
    :param experiment: ``Experiment`` that receives the predictions through
        ``add_result``
    :param train: training log; ignored in favour of ``wilson_train`` when the
        model is a ``Wilson`` or ``ClassifierRec``
    :param suffix: optional string appended to ``name`` in the results table
    :return: None; the current NDCG-sorted summary table is printed
    """
    result_key = name + suffix
    start_time = time.time()

    # These two model types are trained on the log that also contains
    # negative feedback.
    if isinstance(model, (Wilson, ClassifierRec)):
        train = wilson_train

    pred = model.fit_predict(
        log=train,
        k=K,
        users=test.select('user_id').distinct(),
    )
    # Presumably forces Spark to materialise the predictions so that the
    # measured time covers the actual computation.
    pred.count()
    fit_predict_time = time.time() - start_time

    experiment.add_result(result_key, pred)
    experiment.results.loc[result_key, 'fit_pred_time'] = fit_predict_time

    ndcg_col = 'NDCG@{}'.format(K)
    shown_cols = [
        ndcg_col,
        'MRR@{}'.format(K),
        'Coverage@{}'.format(K),
        'fit_pred_time',
    ]
    print(experiment.results[shown_cols].sort_values(ndcg_col, ascending=False))

In [57]:
def full_pipeline(models, experiment, train, suffix='', budget=BUDGET):
    """
    Optionally tune, then fit and evaluate every model in ``models``.

    Hyperparameter search always runs on the notebook globals ``opt_train``
    and ``opt_val``; ``train`` is used only for the final fit/predict.

    :param models: dict ``{name: [model, params]}``. ``params`` is forwarded
        to ``model.optimize`` as ``param_grid``; the string ``'no_opt'``
        skips the search. ``None`` is forwarded as-is (presumably selecting
        the model's default search space)
    :param experiment: ``Experiment`` collecting the results
    :param train: log used for the final ``fit_predict``
    :param suffix: string appended to each model name in the results table
    :param budget: ``budget`` argument for ``model.optimize``
    :return: None
    """
    for name, (model, params) in models.items():
        result_key = name + suffix
        # Progress is logged at error level on purpose: the notebook raises
        # the logger threshold to ERROR to silence library output.
        logger.error(msg='{} started'.format(name))
        if params != 'no_opt':
            logger.error(msg='{} optimization started'.format(name))
            best_params = model.optimize(
                opt_train,
                opt_val,
                param_grid=params,
                k=K,
                budget=budget,
            )
            model.set_params(**best_params)
            logger.error(msg='best params for {} are: {}'.format(name, best_params))
            experiment.results.loc[result_key, 'params'] = repr(best_params)

        logger.error(msg='{} fit_predict started'.format(name))
        fit_predict_add_res(name, model, experiment, train, suffix)

In [20]:
%%time
full_pipeline(non_personalized_models, e, train)

30-Jul-21 10:19:33, replay, ERROR: pop_rec started
ERROR:replay:pop_rec started
30-Jul-21 10:19:33, replay, ERROR: pop_rec fit_predict started
ERROR:replay:pop_rec fit_predict started
30-Jul-21 10:20:03, replay, ERROR: random_rec_un started
ERROR:replay:random_rec_un started
30-Jul-21 10:20:03, replay, ERROR: random_rec_un fit_predict started
ERROR:replay:random_rec_un fit_predict started


          NDCG@10    MRR@10  Coverage@10  fit_pred_time
pop_rec  0.243614  0.390414     0.033903      13.986708


30-Jul-21 10:20:27, replay, ERROR: random_rec_pop_based started
ERROR:replay:random_rec_pop_based started
30-Jul-21 10:20:27, replay, ERROR: random_rec_pop_based optimization started
ERROR:replay:random_rec_pop_based optimization started
[32m[I 2021-07-30 10:20:27,288][0m A new study created in memory with name: no-name-bcbbfd99-3701-4d6b-995b-da28aa1a098a[0m


                NDCG@10    MRR@10  Coverage@10  fit_pred_time
pop_rec        0.243614  0.390414     0.033903      13.986708
random_rec_un  0.025557  0.067583     0.960773       7.788454


[32m[I 2021-07-30 10:20:35,922][0m Trial 0 finished with value: 0.054467680796298316 and parameters: {'alpha': 60.9506086911053}. Best is trial 0 with value: 0.054467680796298316.[0m
[32m[I 2021-07-30 10:20:41,948][0m Trial 1 finished with value: 0.05313342535160094 and parameters: {'alpha': 68.9650244079596}. Best is trial 0 with value: 0.054467680796298316.[0m
[32m[I 2021-07-30 10:20:47,707][0m Trial 2 finished with value: 0.06515071899968328 and parameters: {'alpha': 37.37639362974846}. Best is trial 2 with value: 0.06515071899968328.[0m
[32m[I 2021-07-30 10:20:53,269][0m Trial 3 finished with value: 0.05341964853448675 and parameters: {'alpha': 72.80662217389533}. Best is trial 2 with value: 0.06515071899968328.[0m
[32m[I 2021-07-30 10:20:59,081][0m Trial 4 finished with value: 0.058108857085130895 and parameters: {'alpha': 87.16259754915849}. Best is trial 2 with value: 0.06515071899968328.[0m
[32m[I 2021-07-30 10:21:09,904][0m Trial 5 finished with value: 0.06033

                       NDCG@10    MRR@10  Coverage@10  fit_pred_time
pop_rec               0.243614  0.390414     0.033903      13.986708
random_rec_pop_based  0.069572  0.141737     0.662931       6.863843
random_rec_un         0.025557  0.067583     0.960773       7.788454
                       NDCG@10    MRR@10  Coverage@10  fit_pred_time
pop_rec               0.243614  0.390414     0.033903      13.986708
wilson                0.092121  0.180976     0.017092      10.372497
random_rec_pop_based  0.069572  0.141737     0.662931       6.863843
random_rec_un         0.025557  0.067583     0.960773       7.788454
CPU times: user 18.4 s, sys: 5.3 s, total: 23.7 s
Wall time: 3min 37s


In [21]:
# Full metrics table, best NDCG first. The column name is derived from K so
# the cell keeps working if the cut-off is changed.
e.results.sort_values('NDCG@{}'.format(K), ascending=False)

Unnamed: 0,Coverage@10,HitRate@1,HitRate@5,HitRate@10,MAP@10,MRR@10,NDCG@10,Surprisal@10,fit_pred_time,params
pop_rec,0.033903,0.28446,0.53029,0.645303,0.157194,0.390414,0.243614,0.118354,13.986708,
wilson,0.017092,0.083406,0.34504,0.414399,0.045002,0.180976,0.092121,0.26219,10.372497,
random_rec_pop_based,0.662931,0.060579,0.258999,0.382792,0.028434,0.141737,0.069572,0.317897,6.863843,{'alpha': 1.1534933025992824}
random_rec_un,0.960773,0.032485,0.107112,0.183494,0.009075,0.067583,0.025557,0.53693,7.788454,


## 2.2 Classical models without features

In [22]:
# name -> [model, search space]. None is forwarded to optimize() as
# param_grid (presumably selecting the model's default search space).
common_models = {
    'admm_slim': [ADMMSLIM(seed=SEED), None],
    'als_imp': [ALSWrap(seed=SEED), None],
    'als_exp': [ALSWrap(seed=SEED, implicit_prefs=False), None],
    'knn': [KNN(), None],
    'lightfm_no_feat': [
        LightFMWrap(random_state=SEED),
        {"no_components": [8, 512]},
    ],
    'slim': [SLIM(seed=SEED), None],
}

In [23]:
%%time
full_pipeline(common_models, e, train)

30-Jul-21 10:23:10, replay, ERROR: admm_slim started
ERROR:replay:admm_slim started
30-Jul-21 10:23:10, replay, ERROR: admm_slim optimization started
ERROR:replay:admm_slim optimization started
[32m[I 2021-07-30 10:23:10,838][0m A new study created in memory with name: no-name-eaa8c6ed-ef42-4642-8f18-ca99fc567279[0m
[32m[I 2021-07-30 10:23:43,692][0m Trial 0 finished with value: 0.18695702593887867 and parameters: {'lambda_1': 2.3564346666695692e-09, 'lambda_2': 397.2324903394749}. Best is trial 0 with value: 0.18695702593887867.[0m
[32m[I 2021-07-30 10:23:59,275][0m Trial 1 finished with value: 0.14619139534335537 and parameters: {'lambda_1': 2.9133975985021195e-08, 'lambda_2': 0.02413588907447104}. Best is trial 0 with value: 0.18695702593887867.[0m
[32m[I 2021-07-30 10:24:16,040][0m Trial 2 finished with value: 0.13826118091145884 and parameters: {'lambda_1': 0.004481699685593488, 'lambda_2': 0.09104490483080274}. Best is trial 0 with value: 0.18695702593887867.[0m
[32m

                       NDCG@10    MRR@10  Coverage@10  fit_pred_time
pop_rec               0.243614  0.390414     0.033903      13.986708
admm_slim             0.206690  0.356396     0.267862      28.137156
wilson                0.092121  0.180976     0.017092      10.372497
random_rec_pop_based  0.069572  0.141737     0.662931       6.863843
random_rec_un         0.025557  0.067583     0.960773       7.788454


[32m[I 2021-07-30 10:32:51,316][0m Trial 0 finished with value: 0.17430430262834581 and parameters: {'rank': 61}. Best is trial 0 with value: 0.17430430262834581.[0m
[32m[I 2021-07-30 10:33:05,250][0m Trial 1 finished with value: 0.16825945011698357 and parameters: {'rank': 91}. Best is trial 0 with value: 0.17430430262834581.[0m
[32m[I 2021-07-30 10:33:19,962][0m Trial 2 finished with value: 0.15881423896520533 and parameters: {'rank': 101}. Best is trial 0 with value: 0.17430430262834581.[0m
[32m[I 2021-07-30 10:33:59,355][0m Trial 3 finished with value: 0.16477217484398515 and parameters: {'rank': 193}. Best is trial 0 with value: 0.17430430262834581.[0m
[32m[I 2021-07-30 10:35:02,012][0m Trial 4 finished with value: 0.16360322866629046 and parameters: {'rank': 242}. Best is trial 0 with value: 0.17430430262834581.[0m
[32m[I 2021-07-30 10:35:08,937][0m Trial 5 finished with value: 0.20317502566456266 and parameters: {'rank': 9}. Best is trial 5 with value: 0.2031750

                       NDCG@10    MRR@10  Coverage@10  fit_pred_time
als_imp               0.268565  0.424092     0.165873       9.969577
pop_rec               0.243614  0.390414     0.033903      13.986708
admm_slim             0.206690  0.356396     0.267862      28.137156
wilson                0.092121  0.180976     0.017092      10.372497
random_rec_pop_based  0.069572  0.141737     0.662931       6.863843
random_rec_un         0.025557  0.067583     0.960773       7.788454


[32m[I 2021-07-30 10:38:06,638][0m Trial 0 finished with value: 0.05056985935163427 and parameters: {'rank': 52}. Best is trial 0 with value: 0.05056985935163427.[0m
[32m[I 2021-07-30 10:38:13,515][0m Trial 1 finished with value: 0.04518882770168397 and parameters: {'rank': 13}. Best is trial 0 with value: 0.05056985935163427.[0m
[32m[I 2021-07-30 10:38:20,265][0m Trial 2 finished with value: 0.04643311561795654 and parameters: {'rank': 20}. Best is trial 0 with value: 0.05056985935163427.[0m
[32m[I 2021-07-30 10:38:27,366][0m Trial 3 finished with value: 0.04930724445560965 and parameters: {'rank': 29}. Best is trial 0 with value: 0.05056985935163427.[0m
[32m[I 2021-07-30 10:38:55,762][0m Trial 4 finished with value: 0.050940966297855744 and parameters: {'rank': 163}. Best is trial 4 with value: 0.050940966297855744.[0m
[32m[I 2021-07-30 10:39:02,051][0m Trial 5 finished with value: 0.044375034322340634 and parameters: {'rank': 9}. Best is trial 4 with value: 0.050940

                       NDCG@10    MRR@10  Coverage@10  fit_pred_time
als_imp               0.268565  0.424092     0.165873       9.969577
pop_rec               0.243614  0.390414     0.033903      13.986708
admm_slim             0.206690  0.356396     0.267862      28.137156
wilson                0.092121  0.180976     0.017092      10.372497
random_rec_pop_based  0.069572  0.141737     0.662931       6.863843
random_rec_un         0.025557  0.067583     0.960773       7.788454
als_exp               0.003875  0.011561     0.019053      24.228206


[32m[I 2021-07-30 10:45:34,838][0m Trial 0 finished with value: 0.22547371875018976 and parameters: {'num_neighbours': 68, 'shrink': 15}. Best is trial 0 with value: 0.22547371875018976.[0m
[32m[I 2021-07-30 10:45:48,765][0m Trial 1 finished with value: 0.23346095138330575 and parameters: {'num_neighbours': 91, 'shrink': 75}. Best is trial 1 with value: 0.23346095138330575.[0m
[32m[I 2021-07-30 10:46:01,864][0m Trial 2 finished with value: 0.23003101999956602 and parameters: {'num_neighbours': 53, 'shrink': 65}. Best is trial 1 with value: 0.23346095138330575.[0m
[32m[I 2021-07-30 10:46:15,166][0m Trial 3 finished with value: 0.2297988866764764 and parameters: {'num_neighbours': 78, 'shrink': 26}. Best is trial 1 with value: 0.23346095138330575.[0m
[32m[I 2021-07-30 10:46:27,464][0m Trial 4 finished with value: 0.2339548942281404 and parameters: {'num_neighbours': 69, 'shrink': 76}. Best is trial 4 with value: 0.2339548942281404.[0m
[32m[I 2021-07-30 10:46:40,435][0m T

                       NDCG@10    MRR@10  Coverage@10  fit_pred_time
als_imp               0.268565  0.424092     0.165873       9.969577
knn                   0.256649  0.409878     0.055758      16.019662
pop_rec               0.243614  0.390414     0.033903      13.986708
admm_slim             0.206690  0.356396     0.267862      28.137156
wilson                0.092121  0.180976     0.017092      10.372497
random_rec_pop_based  0.069572  0.141737     0.662931       6.863843
random_rec_un         0.025557  0.067583     0.960773       7.788454
als_exp               0.003875  0.011561     0.019053      24.228206


[32m[I 2021-07-30 10:50:22,125][0m Trial 0 finished with value: 0.21351248501517908 and parameters: {'no_components': 12}. Best is trial 0 with value: 0.21351248501517908.[0m
[32m[I 2021-07-30 10:50:29,926][0m Trial 1 finished with value: 0.21573566571261063 and parameters: {'no_components': 9}. Best is trial 1 with value: 0.21573566571261063.[0m
[32m[I 2021-07-30 10:50:36,842][0m Trial 2 finished with value: 0.21242526275124726 and parameters: {'no_components': 9}. Best is trial 1 with value: 0.21573566571261063.[0m
[32m[I 2021-07-30 10:50:44,873][0m Trial 3 finished with value: 0.2034575567127854 and parameters: {'no_components': 44}. Best is trial 1 with value: 0.21573566571261063.[0m
[32m[I 2021-07-30 10:50:54,159][0m Trial 4 finished with value: 0.16075995456436062 and parameters: {'no_components': 422}. Best is trial 1 with value: 0.21573566571261063.[0m
[32m[I 2021-07-30 10:51:01,416][0m Trial 5 finished with value: 0.2052102317652445 and parameters: {'no_compon

                       NDCG@10    MRR@10  Coverage@10  fit_pred_time
als_imp               0.268565  0.424092     0.165873       9.969577
lightfm_no_feat       0.263783  0.429017     0.148221       9.913496
knn                   0.256649  0.409878     0.055758      16.019662
pop_rec               0.243614  0.390414     0.033903      13.986708
admm_slim             0.206690  0.356396     0.267862      28.137156
wilson                0.092121  0.180976     0.017092      10.372497
random_rec_pop_based  0.069572  0.141737     0.662931       6.863843
random_rec_un         0.025557  0.067583     0.960773       7.788454
als_exp               0.003875  0.011561     0.019053      24.228206


[32m[I 2021-07-30 10:53:38,640][0m Trial 0 finished with value: 0.005801384953535827 and parameters: {'beta': 3.3766843017802727e-09, 'lambda_': 2.2752614727578165e-05}. Best is trial 0 with value: 0.005801384953535827.[0m
[32m[I 2021-07-30 10:53:52,034][0m Trial 1 finished with value: 0.05960430490038714 and parameters: {'beta': 0.11889548086245232, 'lambda_': 3.946460838475964e-07}. Best is trial 1 with value: 0.05960430490038714.[0m
[32m[I 2021-07-30 10:54:02,630][0m Trial 2 finished with value: 0.031167338395529143 and parameters: {'beta': 0.027291218171903664, 'lambda_': 0.0011656746386213893}. Best is trial 1 with value: 0.05960430490038714.[0m
[32m[I 2021-07-30 10:55:59,673][0m Trial 3 finished with value: 0.005801384953535827 and parameters: {'beta': 2.9003899398208524e-05, 'lambda_': 5.5669008004359715e-08}. Best is trial 1 with value: 0.05960430490038714.[0m
[32m[I 2021-07-30 10:56:08,898][0m Trial 4 finished with value: 0.167778819368494 and parameters: {'beta'

                       NDCG@10    MRR@10  Coverage@10  fit_pred_time
als_imp               0.268565  0.424092     0.165873       9.969577
slim                  0.265226  0.438553     0.041748       9.931337
lightfm_no_feat       0.263783  0.429017     0.148221       9.913496
knn                   0.256649  0.409878     0.055758      16.019662
pop_rec               0.243614  0.390414     0.033903      13.986708
admm_slim             0.206690  0.356396     0.267862      28.137156
wilson                0.092121  0.180976     0.017092      10.372497
random_rec_pop_based  0.069572  0.141737     0.662931       6.863843
random_rec_un         0.025557  0.067583     0.960773       7.788454
als_exp               0.003875  0.011561     0.019053      24.228206
CPU times: user 1h 52min 32s, sys: 47min 25s, total: 2h 39min 57s
Wall time: 38min 17s


In [24]:
# Full metrics table, best NDCG first. The column name is derived from K so
# the cell keeps working if the cut-off is changed.
e.results.sort_values('NDCG@{}'.format(K), ascending=False)

Unnamed: 0,Coverage@10,HitRate@1,HitRate@5,HitRate@10,MAP@10,MRR@10,NDCG@10,Surprisal@10,fit_pred_time,params
als_imp,0.165873,0.305531,0.586479,0.702371,0.172592,0.424092,0.268565,0.173147,9.969577,{'rank': 8}
slim,0.041748,0.326602,0.5777,0.693591,0.168213,0.438553,0.265226,0.136549,9.931337,"{'beta': 2.8672570949847566e-09, 'lambda_': 0...."
lightfm_no_feat,0.148221,0.312555,0.592625,0.694469,0.16859,0.429017,0.263783,0.166773,9.913496,{'no_components': 8}
knn,0.055758,0.300263,0.557507,0.646181,0.167006,0.409878,0.256649,0.140657,16.019662,"{'num_neighbours': 99, 'shrink': 56}"
pop_rec,0.033903,0.28446,0.53029,0.645303,0.157194,0.390414,0.243614,0.118354,13.986708,
admm_slim,0.267862,0.236172,0.53468,0.652327,0.119329,0.356396,0.20669,0.208583,28.137156,"{'lambda_1': 2.3564346666695692e-09, 'lambda_2..."
wilson,0.017092,0.083406,0.34504,0.414399,0.045002,0.180976,0.092121,0.26219,10.372497,
random_rec_pop_based,0.662931,0.060579,0.258999,0.382792,0.028434,0.141737,0.069572,0.317897,6.863843,{'alpha': 1.1534933025992824}
random_rec_un,0.960773,0.032485,0.107112,0.183494,0.009075,0.067583,0.025557,0.53693,7.788454,
als_exp,0.019053,0.0,0.023705,0.031607,0.001269,0.011561,0.003875,0.965855,24.228206,{'rank': 117}


## 2.3 Neural models

In [27]:
# Neural models, each listed twice: once fitted with its constructor
# defaults ('no_opt') and once with hyperparameter search (the "_opt" entries).
nets = {
    'multvae': [MultVAE(), 'no_opt'],
    'neuromf': [NeuroMF(), 'no_opt'],
    'word2vec': [Word2VecRec(seed=SEED), 'no_opt'],
    'multvae_opt': [
        MultVAE(),
        {
            "learning_rate": [0.0001, 0.5],
            "dropout": [0, 0.5],
            "l2_reg": [1e-9, 5],
        },
    ],
    'neuromf_opt': [
        NeuroMF(),
        {
            "learning_rate": [0.0001, 0.5],
            "l2_reg": [1e-4, 5],
            "count_negative_sample": [1, 20],
        },
    ],
    'word2vec_opt': [Word2VecRec(seed=SEED), None],
}

In [28]:
%%time
full_pipeline(nets, e, train, budget=10)

30-Jul-21 11:01:28, replay, ERROR: multvae started
ERROR:replay:multvae started
30-Jul-21 11:01:28, replay, ERROR: multvae fit_predict started
ERROR:replay:multvae fit_predict started
INFO:ignite.handlers.early_stopping.EarlyStopping:EarlyStopping: Stop training
30-Jul-21 11:03:21, replay, ERROR: neuromf started
ERROR:replay:neuromf started
30-Jul-21 11:03:21, replay, ERROR: neuromf fit_predict started
ERROR:replay:neuromf fit_predict started


                       NDCG@10    MRR@10  Coverage@10  fit_pred_time
als_imp               0.268565  0.424092     0.165873       9.969577
slim                  0.265226  0.438553     0.041748       9.931337
lightfm_no_feat       0.263783  0.429017     0.148221       9.913496
knn                   0.256649  0.409878     0.055758      16.019662
pop_rec               0.243614  0.390414     0.033903      13.986708
multvae               0.236122  0.379538     0.031662      29.724166
admm_slim             0.206690  0.356396     0.267862      28.137156
wilson                0.092121  0.180976     0.017092      10.372497
random_rec_pop_based  0.069572  0.141737     0.662931       6.863843
random_rec_un         0.025557  0.067583     0.960773       7.788454
als_exp               0.003875  0.011561     0.019053      24.228206


INFO:ignite.handlers.early_stopping.EarlyStopping:EarlyStopping: Stop training
30-Jul-21 11:13:43, replay, ERROR: word2vec started
ERROR:replay:word2vec started
30-Jul-21 11:13:43, replay, ERROR: word2vec fit_predict started
ERROR:replay:word2vec fit_predict started


                       NDCG@10    MRR@10  Coverage@10  fit_pred_time
als_imp               0.268565  0.424092     0.165873       9.969577
slim                  0.265226  0.438553     0.041748       9.931337
lightfm_no_feat       0.263783  0.429017     0.148221       9.913496
knn                   0.256649  0.409878     0.055758      16.019662
pop_rec               0.243614  0.390414     0.033903      13.986708
multvae               0.236122  0.379538     0.031662      29.724166
admm_slim             0.206690  0.356396     0.267862      28.137156
neuromf               0.193122  0.317911     0.257495     345.876106
wilson                0.092121  0.180976     0.017092      10.372497
random_rec_pop_based  0.069572  0.141737     0.662931       6.863843
random_rec_un         0.025557  0.067583     0.960773       7.788454
als_exp               0.003875  0.011561     0.019053      24.228206


30-Jul-21 11:15:04, replay, ERROR: multvae_opt started
ERROR:replay:multvae_opt started
30-Jul-21 11:15:04, replay, ERROR: multvae_opt optimization started
ERROR:replay:multvae_opt optimization started
[32m[I 2021-07-30 11:15:04,781][0m A new study created in memory with name: no-name-a5ce8675-eeee-4d49-8d78-2acd26d9bf97[0m


                       NDCG@10    MRR@10  Coverage@10  fit_pred_time
als_imp               0.268565  0.424092     0.165873       9.969577
slim                  0.265226  0.438553     0.041748       9.931337
lightfm_no_feat       0.263783  0.429017     0.148221       9.913496
knn                   0.256649  0.409878     0.055758      16.019662
pop_rec               0.243614  0.390414     0.033903      13.986708
multvae               0.236122  0.379538     0.031662      29.724166
admm_slim             0.206690  0.356396     0.267862      28.137156
neuromf               0.193122  0.317911     0.257495     345.876106
word2vec              0.137578  0.245373     0.145979      32.822267
wilson                0.092121  0.180976     0.017092      10.372497
random_rec_pop_based  0.069572  0.141737     0.662931       6.863843
random_rec_un         0.025557  0.067583     0.960773       7.788454
als_exp               0.003875  0.011561     0.019053      24.228206


INFO:ignite.handlers.early_stopping.EarlyStopping:EarlyStopping: Stop training
[32m[I 2021-07-30 11:15:40,150][0m Trial 0 finished with value: 0.19607659904557673 and parameters: {'learning_rate': 0.0001377346510315515, 'dropout': 0.09380599853745508, 'l2_reg': 3.431206685996096e-07}. Best is trial 0 with value: 0.19607659904557673.[0m
INFO:ignite.handlers.early_stopping.EarlyStopping:EarlyStopping: Stop training
[32m[I 2021-07-30 11:15:56,616][0m Trial 1 finished with value: 0.12371824416336138 and parameters: {'learning_rate': 0.004210661058801066, 'dropout': 0.39521406314334967, 'l2_reg': 2.3364081375627152e-05}. Best is trial 0 with value: 0.19607659904557673.[0m
INFO:ignite.handlers.early_stopping.EarlyStopping:EarlyStopping: Stop training
[32m[I 2021-07-30 11:16:14,851][0m Trial 2 finished with value: 0.1665606063918027 and parameters: {'learning_rate': 0.0012529448953705045, 'dropout': 0.30771564390958017, 'l2_reg': 1.476418746754923e-07}. Best is trial 0 with value: 0.1

                       NDCG@10    MRR@10  Coverage@10  fit_pred_time
als_imp               0.268565  0.424092     0.165873       9.969577
slim                  0.265226  0.438553     0.041748       9.931337
lightfm_no_feat       0.263783  0.429017     0.148221       9.913496
knn                   0.256649  0.409878     0.055758      16.019662
pop_rec               0.243614  0.390414     0.033903      13.986708
multvae_opt           0.242758  0.388495     0.035584      69.804866
multvae               0.236122  0.379538     0.031662      29.724166
admm_slim             0.206690  0.356396     0.267862      28.137156
neuromf               0.193122  0.317911     0.257495     345.876106
word2vec              0.137578  0.245373     0.145979      32.822267
wilson                0.092121  0.180976     0.017092      10.372497
random_rec_pop_based  0.069572  0.141737     0.662931       6.863843
random_rec_un         0.025557  0.067583     0.960773       7.788454
als_exp               0.003875  0.

[32m[I 2021-07-30 11:29:12,455][0m Trial 0 finished with value: 0.11345660156188117 and parameters: {'learning_rate': 0.00016926349010943598, 'l2_reg': 1.5857683559774738, 'count_negative_sample': 2}. Best is trial 0 with value: 0.11345660156188117.[0m
INFO:ignite.handlers.early_stopping.EarlyStopping:EarlyStopping: Stop training
[32m[I 2021-07-30 11:38:44,990][0m Trial 1 finished with value: 0.1482761900961475 and parameters: {'learning_rate': 0.2077946615547668, 'l2_reg': 0.0036535978553482538, 'count_negative_sample': 4}. Best is trial 1 with value: 0.1482761900961475.[0m
INFO:ignite.handlers.early_stopping.EarlyStopping:EarlyStopping: Stop training
[32m[I 2021-07-30 11:43:19,535][0m Trial 2 finished with value: 0.16517310924300407 and parameters: {'learning_rate': 0.0511395171024003, 'l2_reg': 0.0016673443035774976, 'count_negative_sample': 3}. Best is trial 2 with value: 0.16517310924300407.[0m
[32m[I 2021-07-30 11:54:57,742][0m Trial 3 finished with value: 0.1248819176

                       NDCG@10    MRR@10  Coverage@10  fit_pred_time
als_imp               0.268565  0.424092     0.165873       9.969577
slim                  0.265226  0.438553     0.041748       9.931337
lightfm_no_feat       0.263783  0.429017     0.148221       9.913496
knn                   0.256649  0.409878     0.055758      16.019662
neuromf_opt           0.244322  0.393361     0.032222    1714.583148
pop_rec               0.243614  0.390414     0.033903      13.986708
multvae_opt           0.242758  0.388495     0.035584      69.804866
multvae               0.236122  0.379538     0.031662      29.724166
admm_slim             0.206690  0.356396     0.267862      28.137156
neuromf               0.193122  0.317911     0.257495     345.876106
word2vec              0.137578  0.245373     0.145979      32.822267
wilson                0.092121  0.180976     0.017092      10.372497
random_rec_pop_based  0.069572  0.141737     0.662931       6.863843
random_rec_un         0.025557  0.

[32m[I 2021-07-30 14:15:04,993][0m Trial 0 finished with value: 0.049095968198577394 and parameters: {'rank': 248, 'window_size': 39, 'use_idf': False}. Best is trial 0 with value: 0.049095968198577394.[0m
[32m[I 2021-07-30 14:17:48,081][0m Trial 1 finished with value: 0.03755227689744237 and parameters: {'rank': 221, 'window_size': 42, 'use_idf': True}. Best is trial 0 with value: 0.049095968198577394.[0m
[32m[I 2021-07-30 14:22:05,851][0m Trial 2 finished with value: 0.034482762452198674 and parameters: {'rank': 231, 'window_size': 68, 'use_idf': True}. Best is trial 0 with value: 0.049095968198577394.[0m
[32m[I 2021-07-30 14:23:36,649][0m Trial 3 finished with value: 0.03488950427132232 and parameters: {'rank': 91, 'window_size': 48, 'use_idf': True}. Best is trial 0 with value: 0.049095968198577394.[0m
[32m[I 2021-07-30 14:27:49,430][0m Trial 4 finished with value: 0.04899209035072324 and parameters: {'rank': 264, 'window_size': 56, 'use_idf': False}. Best is trial 0 

                       NDCG@10    MRR@10  Coverage@10  fit_pred_time
als_imp               0.268565  0.424092     0.165873       9.969577
slim                  0.265226  0.438553     0.041748       9.931337
lightfm_no_feat       0.263783  0.429017     0.148221       9.913496
knn                   0.256649  0.409878     0.055758      16.019662
neuromf_opt           0.244322  0.393361     0.032222    1714.583148
pop_rec               0.243614  0.390414     0.033903      13.986708
multvae_opt           0.242758  0.388495     0.035584      69.804866
multvae               0.236122  0.379538     0.031662      29.724166
admm_slim             0.206690  0.356396     0.267862      28.137156
neuromf               0.193122  0.317911     0.257495     345.876106
word2vec              0.137578  0.245373     0.145979      32.822267
wilson                0.092121  0.180976     0.017092      10.372497
random_rec_pop_based  0.069572  0.141737     0.662931       6.863843
word2vec_opt          0.047363  0.

In [29]:
# Full metrics table, best NDCG first. The column name is derived from K so
# the cell keeps working if the cut-off is changed.
e.results.sort_values('NDCG@{}'.format(K), ascending=False)

Unnamed: 0,Coverage@10,HitRate@1,HitRate@5,HitRate@10,MAP@10,MRR@10,NDCG@10,Surprisal@10,fit_pred_time,params
als_imp,0.165873,0.305531,0.586479,0.702371,0.172592,0.424092,0.268565,0.173147,9.969577,{'rank': 8}
slim,0.041748,0.326602,0.5777,0.693591,0.168213,0.438553,0.265226,0.136549,9.931337,"{'beta': 2.8672570949847566e-09, 'lambda_': 0...."
lightfm_no_feat,0.148221,0.312555,0.592625,0.694469,0.16859,0.429017,0.263783,0.166773,9.913496,{'no_components': 8}
knn,0.055758,0.300263,0.557507,0.646181,0.167006,0.409878,0.256649,0.140657,16.019662,"{'num_neighbours': 99, 'shrink': 56}"
neuromf_opt,0.032222,0.291484,0.526778,0.645303,0.157181,0.393361,0.244322,0.123693,1714.583148,"{'learning_rate': 0.0006072104239042867, 'l2_r..."
pop_rec,0.033903,0.28446,0.53029,0.645303,0.157194,0.390414,0.243614,0.118354,13.986708,
multvae_opt,0.035584,0.274802,0.535558,0.657594,0.155253,0.388495,0.242758,0.119976,69.804866,"{'learning_rate': 0.07231756087837626, 'dropou..."
multvae,0.031662,0.263389,0.523266,0.640035,0.152172,0.379538,0.236122,0.122042,29.724166,
admm_slim,0.267862,0.236172,0.53468,0.652327,0.119329,0.356396,0.20669,0.208583,28.137156,"{'lambda_1': 2.3564346666695692e-09, 'lambda_2..."
neuromf,0.257495,0.187006,0.501317,0.626866,0.110592,0.317911,0.193122,0.235454,345.876106,


## 2.4 Ensembles

In [58]:
# Stack ensemble over LightFM, KNN and ALS. Each base model is re-created with
# the hyperparameter values read from the corresponding already-configured
# instance in `common_models`. The 'no_opt' marker tells `full_pipeline` to
# skip the per-model hyperparameter search for this entry.
ensembles = {
    'stack': [
        Stack(
            models=[
                LightFMWrap(
                    random_state=SEED,
                    no_components=common_models['lightfm_no_feat'][0].no_components,
                ),
                KNN(
                    num_neighbours=common_models['knn'][0].num_neighbours,
                    shrink=common_models['knn'][0].shrink,
                ),
                ALSWrap(seed=SEED, rank=common_models['als_imp'][0].rank),
            ],
            n_folds=3,
            budget=BUDGET,
            seed=SEED,
        ),
        'no_opt',
    ],
}

In [59]:
# Switch the RePlay logger to DEBUG so that the per-stage messages of the
# following run are shown in the cell output.
State().logger.setLevel(logging.DEBUG)

In [60]:
%%time
full_pipeline(ensembles, e, train)

30-Jul-21 15:28:57, replay, ERROR: stack started
ERROR:replay:stack started
30-Jul-21 15:28:57, replay, ERROR: stack fit_predict started
ERROR:replay:stack fit_predict started
30-Jul-21 15:28:57, replay, DEBUG: Начало обучения Stack
DEBUG:replay:Начало обучения Stack
30-Jul-21 15:28:57, replay, DEBUG: Предварительная стадия обучения (pre-fit)
DEBUG:replay:Предварительная стадия обучения (pre-fit)
30-Jul-21 15:28:57, replay, DEBUG: Основная стадия обучения (fit)
DEBUG:replay:Основная стадия обучения (fit)
30-Jul-21 15:28:58, replay, INFO: Processing fold #0
INFO:replay:Processing fold #0
30-Jul-21 15:29:00, replay, DEBUG: Начало обучения LightFMWrap
DEBUG:replay:Начало обучения LightFMWrap
30-Jul-21 15:29:00, replay, DEBUG: Предварительная стадия обучения (pre-fit)
DEBUG:replay:Предварительная стадия обучения (pre-fit)
30-Jul-21 15:29:01, replay, DEBUG: Основная стадия обучения (fit)
DEBUG:replay:Основная стадия обучения (fit)
30-Jul-21 15:29:05, replay, DEBUG: Начало предикта LightFMWr

                       NDCG@10    MRR@10  Coverage@10  fit_pred_time
als_imp               0.268565  0.424092     0.165873       9.969577
slim                  0.265226  0.438553     0.041748       9.931337
lightfm_no_feat       0.263783  0.429017     0.148221       9.913496
stack                 0.261318  0.417731     0.059120    1063.854774
lightfm_item_feat     0.260717  0.432302     0.170916      50.776555
knn                   0.256649  0.409878     0.055758      16.019662
neuromf_opt           0.244322  0.393361     0.032222    1714.583148
pop_rec               0.243614  0.390414     0.033903      13.986708
multvae_opt           0.242758  0.388495     0.035584      69.804866
multvae               0.236122  0.379538     0.031662      29.724166
admm_slim             0.206690  0.356396     0.267862      28.137156
neuromf               0.193122  0.317911     0.257495     345.876106
word2vec              0.137578  0.245373     0.145979      32.822267
wilson                0.092121  0.

In [61]:
e.results.sort_values('NDCG@10', ascending=False)

Unnamed: 0,Coverage@10,HitRate@1,HitRate@5,HitRate@10,MAP@10,MRR@10,NDCG@10,Surprisal@10,fit_pred_time,params
als_imp,0.165873,0.305531,0.586479,0.702371,0.172592,0.424092,0.268565,0.173147,9.969577,{'rank': 8}
slim,0.041748,0.326602,0.5777,0.693591,0.168213,0.438553,0.265226,0.136549,9.931337,"{'beta': 2.8672570949847566e-09, 'lambda_': 0...."
lightfm_no_feat,0.148221,0.312555,0.592625,0.694469,0.16859,0.429017,0.263783,0.166773,9.913496,{'no_components': 8}
stack,0.05912,0.311677,0.56892,0.65935,0.170716,0.417731,0.261318,0.139633,1063.854774,
lightfm_item_feat,0.170916,0.308165,0.605795,0.698859,0.162175,0.432302,0.260717,0.183765,50.776555,{'no_components': 16}
knn,0.055758,0.300263,0.557507,0.646181,0.167006,0.409878,0.256649,0.140657,16.019662,"{'num_neighbours': 99, 'shrink': 56}"
neuromf_opt,0.032222,0.291484,0.526778,0.645303,0.157181,0.393361,0.244322,0.123693,1714.583148,"{'learning_rate': 0.0006072104239042867, 'l2_r..."
pop_rec,0.033903,0.28446,0.53029,0.645303,0.157194,0.390414,0.243614,0.118354,13.986708,
multvae_opt,0.035584,0.274802,0.535558,0.657594,0.155253,0.388495,0.242758,0.119976,69.804866,"{'learning_rate': 0.07231756087837626, 'dropou..."
multvae,0.031662,0.263389,0.523266,0.640035,0.152172,0.379538,0.236122,0.122042,29.724166,


In [64]:
ensembles['stack'][0].params

{'LightFMWrap': 0.9461744498894694,
 'KNN': 0.4653759137084064,
 'ALSWrap': 0.4443127394321243}

## 2.5 Models considering features

### 2.5.1 item features preprocessing

In [39]:
%%time
# Convert the raw items table with DataPreparator; only the id column needs an
# explicit mapping here.
item_features = DataPreparator().transform(
    data=data.items,
    columns_names={"item_id": "item_id"},
)

CPU times: user 28.9 ms, sys: 13.1 ms, total: 42 ms
Wall time: 121 ms


In [40]:
item_features.show(2)

+-------+--------------------+----------------+
|item_id|              genres|           title|
+-------+--------------------+----------------+
|      1|Animation|Childre...|Toy Story (1995)|
|      2|Adventure|Childre...|  Jumanji (1995)|
+-------+--------------------+----------------+
only showing top 2 rows



In [41]:
# Titles end with the release year in parentheses, e.g. "Toy Story (1995)":
# take the four characters that start five positions from the end and cast
# them to an integer.
year = item_features.select(
    'item_id',
    sf.substring(sf.col('title'), -5, 4).astype(st.IntegerType()).alias('year')
)
year.show(2)

+-------+----+
|item_id|year|
+-------+----+
|      1|1995|
|      2|1995|
+-------+----+
only showing top 2 rows



In [42]:
# Build a Spark DataFrame with one row per item and its genres as an array.
# The source column stores genres as a single "|"-separated string.
genres = (
    State().session.createDataFrame(data.items[["item_id", "genres"]])
    .select(
        "item_id",
        # The split pattern is a regular expression, so "|" must be escaped.
        # A raw string is used because "\|" in a normal string literal is an
        # invalid escape sequence (a warning on current Python versions).
        sf.split("genres", r"\|").alias("genres")
    )
)

In [43]:
# Distinct genre names across all items, excluding the "(no genres listed)"
# placeholder. The order of the resulting list is whatever Spark returns.
genres_list = [
    row["genre"]
    for row in genres.select(sf.explode("genres").alias("genre"))
    .distinct()
    .filter('genre <> "(no genres listed)"')
    .collect()
]

In [44]:
genres_list

['Documentary',
 'Fantasy',
 'Adventure',
 'War',
 'Animation',
 'Comedy',
 'Thriller',
 'Film-Noir',
 'Crime',
 'Sci-Fi',
 'Musical',
 'Mystery',
 "Children's",
 'Drama',
 'Horror',
 'Western',
 'Romance',
 'Action']

In [45]:
from pyspark.sql.functions import col, lit, array_contains
from pyspark.sql.types import IntegerType

# One-hot encode genres: one 0/1 integer column per genre name, in the order
# of `genres_list`. The original array column is not carried over.
item_features = genres.select(
    "item_id",
    *[
        array_contains(col("genres"), genre_name).astype(IntegerType()).alias(genre_name)
        for genre_name in genres_list
    ]
).cache()
# Run an action on the cached frame; the count is shown as the cell output.
item_features.count()

3883

In [46]:
# Add the release year to the one-hot genre features. The row count displayed
# below lets the reader check that the inner join did not drop any items.
item_features = item_features.join(year, on=['item_id'], how='inner')
item_features.count()

3883

In [47]:
item_features.cache()

DataFrame[item_id: int, Documentary: int, Fantasy: int, Adventure: int, War: int, Animation: int, Comedy: int, Thriller: int, Film-Noir: int, Crime: int, Sci-Fi: int, Musical: int, Mystery: int, Children's: int, Drama: int, Horror: int, Western: int, Romance: int, Action: int, year: int]

In [65]:
item_features.show(3)

+-------+-----------+-------+---------+---+---------+------+--------+---------+-----+------+-------+-------+----------+-----+------+-------+-------+------+----+
|item_id|Documentary|Fantasy|Adventure|War|Animation|Comedy|Thriller|Film-Noir|Crime|Sci-Fi|Musical|Mystery|Children's|Drama|Horror|Western|Romance|Action|year|
+-------+-----------+-------+---------+---+---------+------+--------+---------+-----+------+-------+-------+----------+-----+------+-------+-------+------+----+
|      1|          0|      0|        0|  0|        1|     1|       0|        0|    0|     0|      0|      0|         1|    0|     0|      0|      0|     0|1995|
|      2|          0|      1|        1|  0|        0|     0|       0|        0|    0|     0|      0|      0|         1|    0|     0|      0|      0|     0|1995|
|      3|          0|      0|        0|  0|        0|     1|       0|        0|    0|     0|      0|      0|         0|    0|     0|      0|      1|     0|1995|
+-------+-----------+-------+-----

### 2.5.2 Models training

In [48]:
def fit_predict_add_res(name, model, experiment, train, user_features=None, item_features=None, suffix=''):
    """Fit ``model``, predict top-K items for the test users and record the result.

    The recommendations are added to ``experiment`` under ``name + suffix``,
    the elapsed wall-clock time is stored in the ``fit_pred_time`` column of
    ``experiment.results``, and a short summary table sorted by NDCG@K is
    printed.

    Relies on the notebook globals ``K``, ``test`` and ``wilson_train``.

    Args:
        name: key of the model in the results table.
        model: recommender exposing ``fit_predict``.
        experiment: object exposing ``add_result`` and a ``results`` table.
        train: training log used for every model except ``Wilson`` and
            ``ClassifierRec``.
        user_features: optional user features passed through to the model.
        item_features: optional item features passed through to the model.
        suffix: string appended to ``name`` to form the results row label.
    """
    started_at = time.time()

    # Wilson and ClassifierRec are fitted on the global `wilson_train` log
    # instead of the `train` argument.
    log = wilson_train if isinstance(model, (Wilson, ClassifierRec)) else train

    pred = model.fit_predict(
        log=log,
        k=K,
        user_features=user_features,
        item_features=item_features,
        users=test.select('user_id').distinct(),
    )
    # The count itself is discarded; presumably it is called to trigger the
    # lazy Spark computation so that it is included in the measured time.
    pred.count()
    elapsed = time.time() - started_at

    row_label = name + suffix
    experiment.add_result(row_label, pred)
    experiment.results.loc[row_label, 'fit_pred_time'] = elapsed

    ndcg_column = 'NDCG@{}'.format(K)
    summary_columns = [ndcg_column, 'MRR@{}'.format(K), 'Coverage@{}'.format(K), 'fit_pred_time']
    print(experiment.results[summary_columns].sort_values(ndcg_column, ascending=False))

In [49]:
def full_pipeline(models, experiment, train, user_features=None, item_features=None, suffix=''):
    """Optionally tune, then fit, predict and evaluate every model in ``models``.

    Relies on the notebook globals ``opt_train``, ``opt_val``, ``K``,
    ``BUDGET`` and ``logger``.

    Args:
        models: mapping ``name -> [model, params]`` where ``params`` is either
            a hyperparameter search grid or the string ``'no_opt'`` to skip
            the search.
        experiment: experiment object that collects the results.
        train: training log forwarded to ``fit_predict_add_res``.
        user_features: optional user features passed to optimisation and fit.
        item_features: optional item features passed to optimisation and fit.
        suffix: string appended to the model name in the results table.
    """
    for name, (model, search_space) in models.items():
        # Progress messages are emitted at ERROR level, which keeps them
        # visible when the logger level is raised to ERROR.
        model.logger.error(msg='{} started'.format(name))

        if search_space != 'no_opt':
            model.logger.error(msg='{} optimization started'.format(name))
            best_params = model.optimize(
                opt_train,
                opt_val,
                user_features=user_features,
                item_features=item_features,
                param_grid=search_space,
                k=K,
                budget=BUDGET,
            )
            model.set_params(**best_params)
            logger.error(msg='best params for {} are: {}'.format(name, best_params))
            # Store the chosen parameters as text in the results table.
            experiment.results.loc[name + suffix, 'params'] = repr(best_params)

        logger.error(msg='{} fit_predict started'.format(name))
        fit_predict_add_res(name, model, experiment, train, user_features, item_features, suffix)

In [50]:
# Models that are run with item features. Each value is
# [model, search grid or 'no_opt'], the format expected by `full_pipeline`.
models_with_features = {
    'class': [ClassifierRec(), 'no_opt'],
    'lightfm_item_feat': [
        LightFMWrap(random_state=SEED),
        {"no_components": [8, 512]},
    ],
}

In [51]:
%%time
full_pipeline(models_with_features, e, train, user_features=None, item_features=item_features)

30-Jul-21 15:04:38, replay, ERROR: class started
ERROR:replay:class started
30-Jul-21 15:04:38, replay, ERROR: class fit_predict started
ERROR:replay:class fit_predict started
30-Jul-21 15:04:38, replay, DEBUG: Начало обучения ClassifierRec
DEBUG:replay:Начало обучения ClassifierRec
30-Jul-21 15:04:38, replay, DEBUG: Предварительная стадия обучения (pre-fit)
DEBUG:replay:Предварительная стадия обучения (pre-fit)
30-Jul-21 15:04:39, replay, DEBUG: Основная стадия обучения (fit)
DEBUG:replay:Основная стадия обучения (fit)
30-Jul-21 15:04:45, replay, DEBUG: Начало предикта ClassifierRec
DEBUG:replay:Начало предикта ClassifierRec
30-Jul-21 15:05:08, replay, ERROR: lightfm_item_feat started
ERROR:replay:lightfm_item_feat started
30-Jul-21 15:05:08, replay, ERROR: lightfm_item_feat optimization started
ERROR:replay:lightfm_item_feat optimization started
[32m[I 2021-07-30 15:05:08,582][0m A new study created in memory with name: no-name-0e0b3673-4814-4ecb-8116-8adc8deb4a86[0m
30-Jul-21 15:

                       NDCG@10    MRR@10  Coverage@10  fit_pred_time
als_imp               0.268565  0.424092     0.165873       9.969577
slim                  0.265226  0.438553     0.041748       9.931337
lightfm_no_feat       0.263783  0.429017     0.148221       9.913496
stack                 0.260395  0.424057     0.084057    1120.912277
knn                   0.256649  0.409878     0.055758      16.019662
neuromf_opt           0.244322  0.393361     0.032222    1714.583148
pop_rec               0.243614  0.390414     0.033903      13.986708
multvae_opt           0.242758  0.388495     0.035584      69.804866
multvae               0.236122  0.379538     0.031662      29.724166
admm_slim             0.206690  0.356396     0.267862      28.137156
neuromf               0.193122  0.317911     0.257495     345.876106
word2vec              0.137578  0.245373     0.145979      32.822267
wilson                0.092121  0.180976     0.017092      10.372497
random_rec_pop_based  0.069572  0.

30-Jul-21 15:05:09, replay, DEBUG: Основная стадия обучения (fit)
DEBUG:replay:Основная стадия обучения (fit)
30-Jul-21 15:05:36, replay, DEBUG: Предикт модели в оптимизации
DEBUG:replay:Предикт модели в оптимизации
30-Jul-21 15:05:36, replay, DEBUG: Начало предикта LightFMWrap
DEBUG:replay:Начало предикта LightFMWrap
  1 / concat_features.sum(axis=1).A.ravel(),
30-Jul-21 15:05:42, replay, DEBUG: Подсчет метрики в оптимизации
DEBUG:replay:Подсчет метрики в оптимизации
30-Jul-21 15:05:44, replay, DEBUG: NDCG=0.195708
DEBUG:replay:NDCG=0.195708
[32m[I 2021-07-30 15:05:44,412][0m Trial 0 finished with value: 0.19570844909660975 and parameters: {'no_components': 16}. Best is trial 0 with value: 0.19570844909660975.[0m
30-Jul-21 15:05:44, replay, DEBUG: Фит модели в оптимизации
DEBUG:replay:Фит модели в оптимизации
30-Jul-21 15:05:44, replay, DEBUG: Начало обучения LightFMWrap
DEBUG:replay:Начало обучения LightFMWrap
30-Jul-21 15:05:44, replay, DEBUG: Основная стадия обучения (fit)
DEBUG

30-Jul-21 15:15:53, replay, DEBUG: Фит модели в оптимизации
DEBUG:replay:Фит модели в оптимизации
30-Jul-21 15:15:53, replay, DEBUG: Начало обучения LightFMWrap
DEBUG:replay:Начало обучения LightFMWrap
30-Jul-21 15:15:53, replay, DEBUG: Основная стадия обучения (fit)
DEBUG:replay:Основная стадия обучения (fit)
30-Jul-21 15:17:34, replay, DEBUG: Предикт модели в оптимизации
DEBUG:replay:Предикт модели в оптимизации
30-Jul-21 15:17:34, replay, DEBUG: Начало предикта LightFMWrap
DEBUG:replay:Начало предикта LightFMWrap
  1 / concat_features.sum(axis=1).A.ravel(),
30-Jul-21 15:17:39, replay, DEBUG: Подсчет метрики в оптимизации
DEBUG:replay:Подсчет метрики в оптимизации
30-Jul-21 15:17:43, replay, DEBUG: NDCG=0.188851
DEBUG:replay:NDCG=0.188851
[32m[I 2021-07-30 15:17:43,792][0m Trial 8 finished with value: 0.18885117578551885 and parameters: {'no_components': 311}. Best is trial 0 with value: 0.19570844909660975.[0m
30-Jul-21 15:17:43, replay, DEBUG: Фит модели в оптимизации
DEBUG:repl

30-Jul-21 15:24:53, replay, DEBUG: NDCG=0.194853
DEBUG:replay:NDCG=0.194853
[32m[I 2021-07-30 15:24:53,843][0m Trial 15 finished with value: 0.19485289904997347 and parameters: {'no_components': 127}. Best is trial 0 with value: 0.19570844909660975.[0m
30-Jul-21 15:24:53, replay, DEBUG: Фит модели в оптимизации
DEBUG:replay:Фит модели в оптимизации
30-Jul-21 15:24:53, replay, DEBUG: Начало обучения LightFMWrap
DEBUG:replay:Начало обучения LightFMWrap
30-Jul-21 15:24:53, replay, DEBUG: Основная стадия обучения (fit)
DEBUG:replay:Основная стадия обучения (fit)
30-Jul-21 15:25:35, replay, DEBUG: Предикт модели в оптимизации
DEBUG:replay:Предикт модели в оптимизации
30-Jul-21 15:25:35, replay, DEBUG: Начало предикта LightFMWrap
DEBUG:replay:Начало предикта LightFMWrap
  1 / concat_features.sum(axis=1).A.ravel(),
30-Jul-21 15:25:41, replay, DEBUG: Подсчет метрики в оптимизации
DEBUG:replay:Подсчет метрики в оптимизации
30-Jul-21 15:25:44, replay, DEBUG: NDCG=0.194144
DEBUG:replay:NDCG=0.

                       NDCG@10    MRR@10  Coverage@10  fit_pred_time
als_imp               0.268565  0.424092     0.165873       9.969577
slim                  0.265226  0.438553     0.041748       9.931337
lightfm_no_feat       0.263783  0.429017     0.148221       9.913496
lightfm_item_feat     0.260717  0.432302     0.170916      50.776555
stack                 0.260395  0.424057     0.084057    1120.912277
knn                   0.256649  0.409878     0.055758      16.019662
neuromf_opt           0.244322  0.393361     0.032222    1714.583148
pop_rec               0.243614  0.390414     0.033903      13.986708
multvae_opt           0.242758  0.388495     0.035584      69.804866
multvae               0.236122  0.379538     0.031662      29.724166
admm_slim             0.206690  0.356396     0.267862      28.137156
neuromf               0.193122  0.317911     0.257495     345.876106
word2vec              0.137578  0.245373     0.145979      32.822267
wilson                0.092121  0.

In [52]:
e.results.sort_values('NDCG@10', ascending=False)

Unnamed: 0,Coverage@10,HitRate@1,HitRate@5,HitRate@10,MAP@10,MRR@10,NDCG@10,Surprisal@10,fit_pred_time,params
als_imp,0.165873,0.305531,0.586479,0.702371,0.172592,0.424092,0.268565,0.173147,9.969577,{'rank': 8}
slim,0.041748,0.326602,0.5777,0.693591,0.168213,0.438553,0.265226,0.136549,9.931337,"{'beta': 2.8672570949847566e-09, 'lambda_': 0...."
lightfm_no_feat,0.148221,0.312555,0.592625,0.694469,0.16859,0.429017,0.263783,0.166773,9.913496,{'no_components': 8}
lightfm_item_feat,0.170916,0.308165,0.605795,0.698859,0.162175,0.432302,0.260717,0.183765,50.776555,{'no_components': 16}
stack,0.084057,0.321335,0.56892,0.654083,0.169138,0.424057,0.260395,0.144577,1120.912277,
knn,0.055758,0.300263,0.557507,0.646181,0.167006,0.409878,0.256649,0.140657,16.019662,"{'num_neighbours': 99, 'shrink': 56}"
neuromf_opt,0.032222,0.291484,0.526778,0.645303,0.157181,0.393361,0.244322,0.123693,1714.583148,"{'learning_rate': 0.0006072104239042867, 'l2_r..."
pop_rec,0.033903,0.28446,0.53029,0.645303,0.157194,0.390414,0.243614,0.118354,13.986708,
multvae_opt,0.035584,0.274802,0.535558,0.657594,0.155253,0.388495,0.242758,0.119976,69.804866,"{'learning_rate': 0.07231756087837626, 'dropou..."
multvae,0.031662,0.263389,0.523266,0.640035,0.152172,0.379538,0.236122,0.122042,29.724166,


# 3. Results

The best results in terms of both quality and running time were achieved by commonly used models such as ALS, SLIM and LightFM. 