# Сравнение моделей RePlay

### Датасеты
Сравним результаты моделей RePlay на популярных датасетах __MovieLens 1m__, __MovieLens 20m__ (explicit feedback), а также __last_fm__ и __steam__ (implicit feedback).

### Предобработка данных: 
Для датасетов MovieLens будем считать оценку >= 3 положительным взаимодействием для моделей, использующих факт взаимодействия/информацию о положительном/отрицательном взаимодействии.

### Разбиение данных:
Выберем в test последний (или случайный, если время оценки не определено) оцененный объект для 40% пользователей с удалением холодных объектов и пользователей из теста. 

### Predict:
Предскажем по 10 объектов для пользователей test.

### Метрики
Оценим качество моделей в метриках __ndcg, hit rate, map__ для k = 1, 5, 10 и разнообразие рекомендаций в метрике __coverage__.


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%config Completer.use_jedi = False

In [3]:
import logging
import time


from collections import namedtuple

from pyspark.sql import functions as sf, types as st

from replay.session_handler import State, logger_with_settings

In [4]:
# Take the Spark session held by RePlay's State object; the bare name on the
# last line makes the notebook display it.
spark = State().session
spark

In [44]:
# Raise the level of the logger held by State to ERROR so that less severe
# messages are not emitted.
from logging import ERROR
State().logger.setLevel(ERROR)

In [50]:
# Number of items recommended per user (passed as ``k`` to predict/optimize).
K = 10
# Cut-offs at which HitRate is computed in the Experiment.
K_list_metrics = [1, 5, 10]
# Passed as ``budget`` to ``model.optimize``
# (presumably the number of search trials — confirm against RePlay docs).
BUDGET=2

## 0. Подготовка данных <a name='data-preparator'></a>

### 0.1 Загрузка данных

In [7]:
from rs_datasets import MovieLens

# Load the MovieLens "1m" dataset and print a summary of its tables.
data = MovieLens("1m")
data.info()

ratings


Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968



users


Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117



items


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance





In [8]:
from replay.data_preparator import DataPreparator

# Convert the raw ratings table into the log used by the rest of the notebook.
# In ``columns_names`` the keys are the resulting column names and the values
# are the source columns (the source ``rating`` column becomes ``relevance``).
log_ml_1 = DataPreparator().transform(
    data=data.ratings,
    columns_names={
        "user_id": "user_id",
        "item_id": "item_id",
        "relevance": "rating",
        "timestamp": "timestamp"
    }
)
# Number of interactions in the prepared log.
log_ml_1.count()

1000209

In [9]:
# Prepare the users table with the same tool; only ``user_id`` is mapped here.
users = DataPreparator().transform(
    data=data.users,
    columns_names={
        "user_id": "user_id",
    }
)

In [11]:
# Treat only ratings >= 3 as positive feedback.
only_positives_log = log_ml_1.filter(sf.col('relevance') >= 3)
# Ratings below 3 are kept separately with their relevance reset to 0.
only_negatives_log = log_ml_1.filter(sf.col('relevance') < 3).withColumn('relevance', sf.lit(0.))
# Sizes of the positive and the negative parts.
only_positives_log.count(), only_negatives_log.count()

(836478, 163731)

### 0.2. Split

In [13]:
from replay.splitters import UserSplitter

# Hold out one interaction per user (item_test_size=1) for 40% of users
# (user_test_size=0.4), dropping cold users and items.
# NOTE(review): shuffle=False presumably keeps time order so the latest
# interaction is the one held out — confirm against UserSplitter docs.
train_spl = UserSplitter(
    drop_cold_items=True,
    drop_cold_users=True,
    item_test_size=1,
    user_test_size=0.4,
    seed=1234,
    shuffle=False
)
# Only positive interactions are split, so ``test`` contains positives only.
full_train, test = train_spl.split(only_positives_log)
(full_train.count(), test.count())

(834063, 2411)

In [14]:
# Split ``full_train`` once more with the same settings to obtain ``train`` and
# a validation set ``val``, which are the logs passed to ``optimize`` later on.
val_spl = UserSplitter(
    drop_cold_items=True,
    drop_cold_users=True,
    item_test_size=1,
    user_test_size=0.4,
    seed=1234,
    shuffle=False
)
train, val = val_spl.split(full_train)
# Sizes of the validation and the test sets.
(val.count(), test.count())

(2413, 2411)

In [15]:
# Training log for the Wilson model: positives from ``train`` get relevance 1
# and are combined with the negative interactions (relevance 0).
# NOTE(review): this is built from ``train``, while the other models are fitted
# on ``full_train`` in fit_predict_add_res — confirm this is intentional.
wilson_train=train.withColumn('relevance', sf.lit(1)).union(only_negatives_log)
wilson_train.count()

995379

# 1. Оценка качества моделей

In [16]:
from replay.experiment import Experiment
from replay.metrics import MAP, NDCG, HitRate, Coverage

In [34]:
# Experiment evaluated on ``test``: MAP, NDCG and Coverage at K, HitRate at
# every cut-off in K_list_metrics. Coverage is constructed with the full log.
e = Experiment(test, {MAP(): K, NDCG(): K, HitRate(): K_list_metrics, Coverage(log_ml_1): K})

# 2. Сравнение моделей

## 2.1. Неперсонализированные рекомендации

In [35]:
from replay.models import PopRec, RandomRec, Wilson

In [36]:
# Record describing one model to evaluate: display name, model instance and
# hyperparameter search space (None, 'default' or a custom grid, as
# interpreted by full_pipeline).
ModelParams = namedtuple('ModelParams', ['name', 'model', 'search_space'])

In [37]:
# Non-personalized baselines. search_space=None skips hyperparameter search in
# full_pipeline; 'default' makes it call optimize() without a param_grid.
non_personalized_models = [
    ModelParams('poprec', PopRec(), None),
    ModelParams('wilson', Wilson(), None),
    ModelParams('random_rec', RandomRec(), 'default'),
]

In [49]:
def fit_predict_add_res(name, model, experiment):
    """Fit ``model``, recommend K items to the test users and record the run.

    The model named ``'wilson'`` is fitted on ``wilson_train``; every other
    model is fitted on ``full_train``. The elapsed wall-clock time is written
    to the ``time`` column of ``experiment.results`` and the recommendations
    are then scored with ``experiment.add_result``.

    Relies on the notebook globals ``wilson_train``, ``full_train``, ``test``
    and ``K``.

    Args:
        name: row label used in ``experiment.results``.
        model: object exposing ``fit_predict(log=..., k=..., users=...)``.
        experiment: object exposing ``results`` and ``add_result``.
    """
    started_at = time.time()
    fit_log = wilson_train if name == 'wilson' else full_train
    test_users = test.select('user_id').distinct()
    recs = model.fit_predict(log=fit_log, k=K, users=test_users)
    # Presumably forces the lazily computed recommendations to be evaluated so
    # that the measured time covers the actual work — confirm.
    recs.count()
    elapsed = time.time() - started_at
    experiment.results.loc[name, 'time'] = elapsed
    experiment.add_result(name, recs)

In [45]:
def full_pipeline(models, experiment):
    """Optionally tune, then fit, predict and evaluate every model in ``models``.

    For each entry the ``search_space`` field selects the tuning mode:

    * ``None`` — no hyperparameter search;
    * ``'default'`` — ``model.optimize`` is called without a ``param_grid``;
    * anything else — the value is passed to ``model.optimize`` as
      ``param_grid``.

    When a search is run, the best parameters are applied to the model and
    their ``repr`` is stored in the ``params`` column of
    ``experiment.results``. Finally ``fit_predict_add_res`` is called for the
    model.

    Relies on the notebook globals ``train``, ``val``, ``K``, ``BUDGET`` and
    ``fit_predict_add_res``.

    Args:
        models: iterable of records with ``name``, ``model`` and
            ``search_space`` attributes (e.g. ``ModelParams``).
        experiment: object exposing ``results`` and ``add_result``.
    """
    for model_params in models:
        name = model_params.name
        model = model_params.model
        search_space = model_params.search_space

        # Progress messages use ERROR level, presumably so they stay visible
        # after the notebook raises the logger level to ERROR.
        model.logger.error(msg='{} started'.format(name))
        if search_space is not None:
            model.logger.error(msg='{} optimization started'.format(name))
            if search_space == 'default':
                best_params = model.optimize(train, val, k=K, budget=BUDGET)
            else:
                best_params = model.optimize(
                    train, val, param_grid=search_space, k=K, budget=BUDGET
                )
            model.set_params(**best_params)
            experiment.results.loc[name, 'params'] = repr(best_params)

        # Use the model's own logger: a bare ``logger`` name is not defined
        # anywhere in this notebook and would raise NameError.
        model.logger.debug(msg='{} fit_predict started'.format(name))
        fit_predict_add_res(name, model, experiment)

In [51]:
%%time
# Run the pipeline for the non-personalized baselines; results go into ``e``.
full_pipeline(non_personalized_models, e)

10-Feb-21 14:33:09, replay, ERROR: poprec started
ERROR:replay:poprec started
10-Feb-21 14:33:42, replay, ERROR: wilson started
ERROR:replay:wilson started
10-Feb-21 14:34:27, replay, ERROR: random_rec started
ERROR:replay:random_rec started
10-Feb-21 14:34:27, replay, ERROR: random_rec optimization started
ERROR:replay:random_rec optimization started
[32m[I 2021-02-10 14:34:27,693][0m A new study created in memory with name: no-name-9b7b2125-9907-452e-8ed7-0263a9cc496d[0m
[32m[I 2021-02-10 14:34:41,749][0m Trial 0 finished with value: 0.0044797038668894255 and parameters: {'distribution': 'popular_based', 'alpha': 84.34489319355359}. Best is trial 0 with value: 0.0044797038668894255.[0m
[32m[I 2021-02-10 14:34:53,196][0m Trial 1 finished with value: 0.005647592070192276 and parameters: {'distribution': 'popular_based', 'alpha': 20.819693294265026}. Best is trial 1 with value: 0.005647592070192276.[0m


In [52]:
# Show the collected results, highest NDCG@10 first.
e.results.sort_values('NDCG@10', ascending=False)

Unnamed: 0,Coverage@10,HitRate@1,HitRate@5,HitRate@10,MAP@10,NDCG@10,time,params
poprec,0.037777,0.007466,0.02696,0.048528,0.016832,0.024126,9.68646,
wilson,0.018079,0.001659,0.009125,0.015761,0.004658,0.00721,15.368459,
random_rec,0.825418,0.0,0.003733,0.00954,0.001904,0.00365,13.011448,"{'distribution': 'popular_based', 'alpha': 20...."


## 2.2 Классические коллаборативные модели

In [53]:
from replay.models import KNN, ALSWrap, SLIM, ADMMSLIM

In [55]:
# Collaborative models; 'default' makes full_pipeline call optimize() for each
# of them without an explicit param_grid.
collaborative_models = [
    ModelParams('knn', KNN(), 'default'),
    ModelParams('als', ALSWrap(), 'default'),
    ModelParams('slim', SLIM(), 'default'),
    ModelParams('admm_slim', ADMMSLIM(), 'default'),
]

In [57]:
# Run the pipeline for the collaborative models, adding rows to the same ``e``.
full_pipeline(collaborative_models, e)

10-Feb-21 14:38:21, replay, ERROR: knn started
ERROR:replay:knn started
10-Feb-21 14:38:21, replay, ERROR: knn optimization started
ERROR:replay:knn optimization started
[32m[I 2021-02-10 14:38:21,704][0m A new study created in memory with name: no-name-36f5be2e-b3c8-43be-855a-5000eb9eb976[0m
[32m[I 2021-02-10 14:40:32,782][0m Trial 0 finished with value: 0.03179201817682271 and parameters: {'num_neighbours': 97, 'shrink': 51}. Best is trial 0 with value: 0.03179201817682271.[0m
[32m[I 2021-02-10 14:42:47,469][0m Trial 1 finished with value: 0.031876034354646324 and parameters: {'num_neighbours': 43, 'shrink': 80}. Best is trial 1 with value: 0.031876034354646324.[0m
10-Feb-21 14:45:43, replay, ERROR: als started
ERROR:replay:als started
10-Feb-21 14:45:43, replay, ERROR: als optimization started
ERROR:replay:als optimization started
[32m[I 2021-02-10 14:45:43,900][0m A new study created in memory with name: no-name-fdf12257-2a20-44de-b15e-b5c475cc6ff6[0m
[32m[I 2021-02-10

Py4JJavaError: An error occurred while calling o53597.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 13972.0 failed 1 times, most recent failure: Lost task 2.0 in stage 13972.0 (TID 26505, 192.168.1.70, executor driver): java.lang.OutOfMemoryError: Java heap space

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2023)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:1972)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:1971)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1971)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:950)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:950)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:950)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2203)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2152)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2141)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:752)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2093)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2114)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2133)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2158)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1004)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1003)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:385)
	at org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:3448)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3616)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:763)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3614)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3445)
	at jdk.internal.reflect.GeneratedMethodAccessor170.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.lang.OutOfMemoryError: Java heap space


----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 50612)
Traceback (most recent call last):
  File "/Users/a18785698/.pyenv/versions/3.7.6/lib/python3.7/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/Users/a18785698/.pyenv/versions/3.7.6/lib/python3.7/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/Users/a18785698/.pyenv/versions/3.7.6/lib/python3.7/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/Users/a18785698/.pyenv/versions/3.7.6/lib/python3.7/socketserver.py", line 720, in __init__
    self.handle()
  File "/Users/a18785698/Documents/code_dir/base_replay_376/lib/python3.7/site-packages/pyspark/accumulators.py", line 268, in handle
    poll(accum_updates)
  File "/Users/a18785698/Documents/code_dir/base_replay_376/lib/python3.7/site-pack

## 1. Обучение модели 

#### SLIM
Один из простых, но эффективных алгоритмов 

In [126]:
from replay.models import SLIM

# SLIM model with explicitly chosen ``lambda_`` and ``beta`` values.
slim = SLIM(lambda_=0.01, beta=0.01)

In [None]:
# Search SLIM hyperparameters on the train/validation split. optimize() needs
# the training and validation logs (same call shape as in full_pipeline);
# calling it with no arguments cannot work. The cell displays the best
# parameters that optimize() returns.
slim.optimize(train, val, k=K, budget=BUDGET)

In [10]:
%%time

# Fit SLIM on the training log.
slim.fit(log=train)

04-Feb-21 01:04:18, replay, DEBUG: Начало обучения SLIM
DEBUG:replay:Начало обучения SLIM
04-Feb-21 01:04:18, replay, DEBUG: Предварительная стадия обучения (pre-fit)
DEBUG:replay:Предварительная стадия обучения (pre-fit)
04-Feb-21 01:04:19, replay, DEBUG: Основная стадия обучения (fit)
DEBUG:replay:Основная стадия обучения (fit)


CPU times: user 2.15 s, sys: 120 ms, total: 2.27 s
Wall time: 7.45 s


In [11]:
%%time

# Recommend K items to every user present in ``test``, choosing among the
# items present in ``test``.
# NOTE(review): filter_seen_items=True presumably excludes items the user
# already interacted with in ``log`` — confirm against RePlay docs.
recs = slim.predict(
    k=K,
    users=test.select('user_id').distinct(),
    items=test.select('item_id').distinct(),
    log=train,
    filter_seen_items=True
)

04-Feb-21 01:04:25, replay, DEBUG: Начало предикта SLIM
DEBUG:replay:Начало предикта SLIM


CPU times: user 831 ms, sys: 133 ms, total: 964 ms
Wall time: 5.92 s


## 2. Оценка качества и сравнение результатов моделей

В библиотеке реализованы различные метрики качества рекомендательных систем, встречающиеся в литературе.
Их можно использовать напрямую, либо запоминать результаты с помощью класса `Experiment`.

In [12]:
from replay.metrics import HitRate, NDCG, MAP
from replay.experiment import Experiment

# Experiment on ``test``: NDCG and MAP at K, HitRate at 1, K/2 (as int) and K.
metrics = Experiment(test, {NDCG(): K,
                            MAP() : K,
                            HitRate(): [1, int(K/2), K]})


In [13]:
%%time
# Score the SLIM recommendations and display the results table.
metrics.add_result("SLIM", recs)
metrics.results

CPU times: user 107 ms, sys: 49.2 ms, total: 156 ms
Wall time: 55.1 s


Unnamed: 0,HitRate@1,HitRate@5,HitRate@10,MAP@10,NDCG@10
SLIM,0.072,0.218,0.328,0.135137,0.18002


## 3. Конвертация в pandas

In [14]:
# Convert the recommendations to a pandas DataFrame and preview three rows.
recs_pd = recs.toPandas()
recs_pd.head(3)

Unnamed: 0,user_id,item_id,relevance
0,1018,1200,0.842803
1,1018,1221,0.790892
2,1018,10,0.714433


## 4. Примеры использования других моделей RePlay

#### ALS
Библиотека также содержит классические алгоритмы рекомендаций, например, матричную факторизацию

In [15]:
from replay.models import ALSWrap

# ALS wrapper model created with rank=100.
als = ALSWrap(rank=100)

In [16]:
%%time
# Fit ALS on the training log.
als.fit(log=train)

04-Feb-21 01:05:28, replay, DEBUG: Начало обучения ALSWrap
DEBUG:replay:Начало обучения ALSWrap
04-Feb-21 01:05:28, replay, DEBUG: Предварительная стадия обучения (pre-fit)
DEBUG:replay:Предварительная стадия обучения (pre-fit)
04-Feb-21 01:05:30, replay, DEBUG: Основная стадия обучения (fit)
DEBUG:replay:Основная стадия обучения (fit)


CPU times: user 326 ms, sys: 49.2 ms, total: 375 ms
Wall time: 49.4 s


In [17]:
%%time
# Recommend K items to the test users among the test items; this overwrites
# the previous ``recs`` variable.
recs = als.predict(
    k=K,
    users=test.select('user_id').distinct(),
    items=test.select('item_id').distinct(),
    log=train,
    filter_seen_items=True
)

04-Feb-21 01:06:18, replay, DEBUG: Начало предикта ALSWrap
DEBUG:replay:Начало предикта ALSWrap


CPU times: user 846 ms, sys: 145 ms, total: 991 ms
Wall time: 5.06 s


In [18]:
%%time
# Score the ALS recommendations and display the results table.
metrics.add_result("ALS", recs)
metrics.results

CPU times: user 89.5 ms, sys: 25.4 ms, total: 115 ms
Wall time: 11.1 s


Unnamed: 0,HitRate@1,HitRate@5,HitRate@10,MAP@10,NDCG@10
SLIM,0.072,0.218,0.328,0.135137,0.18002
ALS,0.064,0.258,0.376,0.14985,0.203


#### MultVAE 
Пример использования DL в рекомендациях

In [19]:
from replay.models import MultVAE

# MultVAE model created with epochs=100.
multvae = MultVAE(epochs=100)

In [20]:
%%time
# Fit MultVAE on the training log.
multvae.fit(log=train)

04-Feb-21 01:06:34, replay, DEBUG: Начало обучения MultVAE
DEBUG:replay:Начало обучения MultVAE
04-Feb-21 01:06:34, replay, DEBUG: Предварительная стадия обучения (pre-fit)
DEBUG:replay:Предварительная стадия обучения (pre-fit)
04-Feb-21 01:06:35, replay, DEBUG: Основная стадия обучения (fit)
DEBUG:replay:Основная стадия обучения (fit)
04-Feb-21 01:06:36, replay, DEBUG: Составление батча:
DEBUG:replay:Составление батча:
04-Feb-21 01:06:38, replay, DEBUG: Обучение модели
DEBUG:replay:Обучение модели
04-Feb-21 01:06:39, replay, DEBUG: Epoch[1] current loss: 1352.15152
DEBUG:replay:Epoch[1] current loss: 1352.15152
04-Feb-21 01:06:39, replay, DEBUG: Epoch[1] validation average loss: 1480.69092
DEBUG:replay:Epoch[1] validation average loss: 1480.69092
04-Feb-21 01:06:41, replay, DEBUG: Epoch[2] current loss: 1251.65600
DEBUG:replay:Epoch[2] current loss: 1251.65600
04-Feb-21 01:06:41, replay, DEBUG: Epoch[2] validation average loss: 1472.20325
DEBUG:replay:Epoch[2] validation average loss:

CPU times: user 1min 15s, sys: 5.52 s, total: 1min 21s
Wall time: 23.6 s


In [21]:
%%time

# Recommend K items to the test users among the test items; this overwrites
# the previous ``recs`` variable. ``k`` uses the shared constant K (as the
# SLIM and ALS predict calls do) so the list length always matches the
# cut-off the metrics are computed at.
recs = multvae.predict(
    k=K,
    users=test.select('user_id').distinct(),
    items=test.select('item_id').distinct(),
    log=train,
    filter_seen_items=True
)

04-Feb-21 01:06:58, replay, DEBUG: Начало предикта MultVAE
DEBUG:replay:Начало предикта MultVAE
04-Feb-21 01:07:02, replay, DEBUG: Предсказание модели
DEBUG:replay:Предсказание модели


CPU times: user 1.22 s, sys: 178 ms, total: 1.4 s
Wall time: 11.6 s


In [22]:
%%time
# Score the MultVAE recommendations and display the results table.
metrics.add_result("MultVAE", recs)
metrics.results

CPU times: user 95.7 ms, sys: 85 ms, total: 181 ms
Wall time: 29.7 s


Unnamed: 0,HitRate@1,HitRate@5,HitRate@10,MAP@10,NDCG@10
SLIM,0.072,0.218,0.328,0.135137,0.18002
ALS,0.064,0.258,0.376,0.14985,0.203
MultVAE,0.012,0.03,0.05,0.020785,0.027518


## 5 Сравнение результатов различных моделей
С помощью experiment можно сравнить качество моделей, построенных с использованием различных инструментов.
Предположим, мы хотим сравнить модели RePlay с некой "внешней" моделью. Для этого нужно:
* 5.1. Экспортировать обучающую выборку (в pandas/numpy/csv)
* 5.2. Обучить модель и получить рекомендации для всех пользователей в виде csv/pandas-датафрейма
* 5.3. Считать рекомендации с помощью DataPreparator
* 5.4. Посчитать метрики в experiment

#### 5.1 Экспортируем train

In [23]:
# Export the training log to train.csv via pandas, without the index column.
train.toPandas().to_csv("train.csv", index=False)

In [24]:
!head -n 5 train.csv

user_id,item_id,relevance,timestamp
1,1029,5.0,2001-01-01 01:36:45
1,2294,4.0,2001-01-07 02:38:11
1,3114,4.0,2001-01-01 01:36:14
1,783,4.0,2001-01-07 02:38:11


#### 5.2 Обучаем модель и получаем рекомендации в формате `id пользователя - id объекта - relevance`

Предположим, что это произошло, и у нас есть рекомендации в виде csv-файла. Ниже в качестве примера используем рекомендации, полученные одной из моделей, с рандомными релевантностями.

In [25]:
from pyspark.sql.functions import rand

In [26]:
# Imitate an external model: replace relevance in the latest recommendations
# with seeded random values and save them to recs.csv.
recs.withColumn('relevance', rand(seed=123)).toPandas().to_csv("recs.csv", index=False)

#### 5.3 Теперь нужно прочитать рекомендации в формате, поддерживаемом библиотекой

In [27]:
# Read the saved recommendations back from recs.csv with DataPreparator,
# mapping the csv columns one-to-one; this overwrites ``recs``.
recs = DataPreparator().transform(
    path="recs.csv",
    columns_names={
        "user_id": "user_id",
        "item_id": "item_id",
        "relevance": "relevance"
    },
    header=True,
    format_type="csv"
)

#### 5.4 Сравним качество внешней модели с предыдущими результатами

In [28]:
# Score the "external" recommendations and show all models, best NDCG@10 first.
metrics.add_result("my_model", recs)
metrics.results.sort_values("NDCG@10", ascending=False)

Unnamed: 0,HitRate@1,HitRate@5,HitRate@10,MAP@10,NDCG@10
ALS,0.064,0.258,0.376,0.14985,0.203
SLIM,0.072,0.218,0.328,0.135137,0.18002
MultVAE,0.012,0.03,0.05,0.020785,0.027518
my_model,0.002,0.022,0.05,0.011313,0.020113
