In [1]:
%load_ext autoreload
%autoreload 2

import gc
import matplotlib.pyplot as plt
import pickle
import tqdm

import numpy as np
import pandas as pd
import torch
from pyspark.sql import functions as sf

from replay.data_preparator import DataPreparator, Indexer
from replay.experiment import Experiment
from replay.metrics import Coverage, HitRate, MRR, MAP, NDCG, Surprisal
from replay.models import DDPG, ALSWrap, ItemKNN, PopRec, NeuroMF, MultVAE
from replay.session_handler import State
from replay.splitters import UserSplitter, DateSplitter
from replay.utils import convert2spark

spark = State().session
spark.sparkContext.setLogLevel('ERROR')

  "LightFM was compiled without OpenMP support. "
23/02/01 12:43:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/02/01 12:43:46 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
23/02/01 12:43:47 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/02/01 12:43:47 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [2]:
K = 10
K_list_metrics = [1, 3, 10]
SEED = 42

## DDPG

In [3]:
from rs_datasets import MovieLens

ratings = MovieLens("1m").ratings
ratings.loc[:, 'rating'] = ratings.loc[:, 'rating'].apply(lambda x: 1 if x > 3 else -1)

preparator = DataPreparator()
log = preparator.transform(
    columns_mapping={
        'user_id': 'user_id',
        'item_id': 'item_id',
        'relevance': 'rating',
        'timestamp': 'timestamp'
    },
    data=ratings
)

indexer = Indexer(user_col='user_id', item_col='item_id')
indexer.fit(users=log.select('user_id'), items=log.select('item_id'))
log_replay = indexer.transform(log)

train_spl = DateSplitter(
    test_start=0.2,
    drop_cold_items=True,
    drop_cold_users=True,

)
train, test = train_spl.split(log_replay)

opt_train, opt_val = train_spl.split(train)
opt_train.count(), opt_val.count()

01-Feb-23 12:44:42, replay, INFO: Columns with ids of users or items are present in mapping. The dataframe will be treated as an interactions log.
                                                                                

(640128, 18172)

In [4]:
ddpg = DDPG(user_num=6040, item_num=5000, log_dir='data/models/ml1m_ddpg')
ddpg.fit(opt_train)

  0%|          | 0/4463 [00:00<?, ?it/s]

	addcmul_(Number value, Tensor tensor1, Tensor tensor2)
Consider using one of the following signatures instead:
	addcmul_(Tensor tensor1, Tensor tensor2, *, Number value) (Triggered internally at  ../torch/csrc/utils/python_arg_parser.cpp:1005.)
  exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
  weights = (current_buffer_len * probs[indices]) ** (-beta)


In [34]:
def eval_single_step(step, experiment, pred_log=opt_val):
    ddpg._load_model(f'data/models/ml1m_ddpg/model_{step}0000.pt')

    recs = ddpg.predict(
        k=K,
        users=pred_log.select('user_idx').distinct(),
        log=opt_train,
        filter_seen_items=True
    )

    experiment.add_result(step, recs)
    del recs; gc.collect()

### Evaluation

In [35]:
ml1m_val = Experiment(opt_val, {Coverage(opt_train): K, NDCG(): K, HitRate(): K})
for i in tqdm.auto.tqdm(range(1, 100)):
    try:
        eval_single_step(i, ml1m_val)
    except Exception as e:
        print(e)
        break

                                                                                

  0%|          | 0/99 [00:00<?, ?it/s]

  "LightFM was compiled without OpenMP support. "
  "LightFM was compiled without OpenMP support. "
  "LightFM was compiled without OpenMP support. "
  "LightFM was compiled without OpenMP support. "
  "LightFM was compiled without OpenMP support. "
  "LightFM was compiled without OpenMP support. "
  "LightFM was compiled without OpenMP support. "
  "LightFM was compiled without OpenMP support. "
                                                                                

[Errno 2] No such file or directory: 'data/models/ml1m_ddpg/model_380000.pt'


Best metrics on validation

In [36]:
ml1m_val.results[ml1m_val.results['HitRate@10'] == ml1m_val.results['HitRate@10'].max()]

Unnamed: 0,Coverage@10,HitRate@10,NDCG@10
23,0.023094,0.540616,0.184073


Metrics on test

In [38]:
model_comparison = Experiment(test, {Coverage(opt_train): K, NDCG(): K, HitRate(): K})
eval_single_step(23, model_comparison, pred_log=test)
model_comparison.results

01-Feb-23 19:03:07, replay, INFO: DDPG model can't predict cold users, they will be ignored
                                                                                

Unnamed: 0,Coverage@10,HitRate@10,NDCG@10
23,0.026989,0.447415,0.122556


## Other models

In [39]:
%%time
knn = ItemKNN()
knn.optimize(opt_train, opt_val)
knn_recs = knn.predict(
    k=K,
    users=test.select('user_idx').distinct(),
    log=opt_train,
    filter_seen_items=True,
)

[32m[I 2023-02-01 19:06:17,089][0m A new study created in memory with name: no-name-76530dd9-4b31-4775-bcf9-684b00de6b56[0m
  self.study.enqueue_trial(self._init_args)
  create_trial(state=TrialState.WAITING, system_attrs={"fixed_params": params})
  create_trial(state=TrialState.WAITING, system_attrs={"fixed_params": params})
[32m[I 2023-02-01 19:09:27,216][0m Trial 0 finished with value: 0.177479175227549 and parameters: {'num_neighbours': 10, 'shrink': 0, 'weighting': None}. Best is trial 0 with value: 0.177479175227549.[0m
[32m[I 2023-02-01 19:10:02,712][0m Trial 1 finished with value: 0.20550660715525404 and parameters: {'num_neighbours': 45, 'shrink': 94, 'weighting': None}. Best is trial 1 with value: 0.20550660715525404.[0m
[32m[I 2023-02-01 19:10:24,399][0m Trial 2 finished with value: 0.2118225120826309 and parameters: {'num_neighbours': 89, 'shrink': 79, 'weighting': 'tf_idf'}. Best is trial 2 with value: 0.2118225120826309.[0m
[32m[I 2023-02-01 19:10:52,851][0m

CPU times: user 1.86 s, sys: 1.94 s, total: 3.81 s
Wall time: 6min 34s


In [40]:
%%time
als = ALSWrap(rank=200)
als.fit(opt_train)
als_recs = als.predict(
    k=K,
    users=test.select('user_idx').distinct(),
    log=opt_train,
    filter_seen_items=True,
)

01-Feb-23 19:15:47, replay, INFO: ALSWrap model can't predict cold users, they will be ignored

CPU times: user 309 ms, sys: 332 ms, total: 642 ms
Wall time: 3min 10s


                                                                                

In [41]:
%%time
vae = MultVAE()
vae.fit(opt_train)
vae_recs = vae.predict(
    k=K,
    users=test.select('user_idx').distinct(),
    log=opt_train,
    filter_seen_items=True,
)

01-Feb-23 19:16:05, replay, INFO: The model is neural network with non-distributed training
  "LightFM was compiled without OpenMP support. "
  "LightFM was compiled without OpenMP support. "
  "LightFM was compiled without OpenMP support. "
  "LightFM was compiled without OpenMP support. "
  "LightFM was compiled without OpenMP support. "
  "LightFM was compiled without OpenMP support. "
  "LightFM was compiled without OpenMP support. "
  "LightFM was compiled without OpenMP support. "
                                                                                

CPU times: user 2min 53s, sys: 35.4 s, total: 3min 28s
Wall time: 3min 28s


In [42]:
%%time
neuro_mf = NeuroMF()
neuro_mf.fit(opt_train)
neuro_mf_recs = neuro_mf.predict(
    k=K,
    users=test.select('user_idx').distinct(),
    log=opt_train,
    filter_seen_items=True,
)

01-Feb-23 19:19:35, replay, INFO: The model is neural network with non-distributed training
01-Feb-23 19:26:47, replay, INFO: NeuroMF model can't predict cold users, they will be ignored
  "LightFM was compiled without OpenMP support. "
  "LightFM was compiled without OpenMP support. "
  "LightFM was compiled without OpenMP support. "
  "LightFM was compiled without OpenMP support. "
  "LightFM was compiled without OpenMP support. "
  "LightFM was compiled without OpenMP support. "
  "LightFM was compiled without OpenMP support. "
  "LightFM was compiled without OpenMP support. "
                                                                                

CPU times: user 10min 38s, sys: 2min 5s, total: 12min 43s
Wall time: 7min 34s


In [43]:
%%time
poprec = PopRec()
poprec.fit(opt_train)
poprec_recs = poprec.predict(
    k=K,
    users=test.select('user_idx').distinct(),
    log=opt_train,
    filter_seen_items=True,
)



CPU times: user 87.4 ms, sys: 121 ms, total: 208 ms
Wall time: 16.1 s


                                                                                

In [44]:
model_comparison.add_result("als", als_recs)
model_comparison.add_result("knn", knn_recs)
model_comparison.add_result("vae", vae_recs)
model_comparison.add_result("neuro_mf", neuro_mf_recs)
model_comparison.add_result("pop_rec", poprec_recs)
model_comparison.results

                                                                                

Unnamed: 0,Coverage@10,HitRate@10,NDCG@10
23,0.026989,0.447415,0.122556
als,0.239566,0.491979,0.158077
knn,0.091263,0.431373,0.133824
vae,0.012243,0.242424,0.039003
neuro_mf,0.300779,0.433155,0.111933
pop_rec,0.035615,0.572193,0.173862
