In [4]:
import logging
import pickle
import re
import time
import warnings
from logging import getLogger

import numpy as np
import pandas as pd
import torch
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.quick_start import load_data_and_model
from recbole.quick_start import run_recbole
from recbole.utils import init_seed, init_logger
from tqdm import tqdm

In [5]:
# Silence every Python warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')

In [6]:
# Load the pre-processed tables. Only interactions_df is consumed by the cells visible in this
# notebook; users_df and items_df are loaded but not referenced afterwards.
interactions_df = pd.read_csv('../data/interactions_processed.csv')
users_df = pd.read_csv('../data/users_processed.csv')
items_df = pd.read_csv('../data/items_processed.csv')

In [7]:
# Parse the last-watch date and derive a Unix timestamp in whole seconds from it.
interactions_df['t_dat'] = pd.to_datetime(interactions_df['last_watch_dt'], format="%Y-%m-%d")
# Subtract the epoch and floor-divide by one second rather than casting to int64 and dividing
# by 10**9: the cast-and-divide form is only correct when the column has nanosecond resolution.
interactions_df['timestamp'] = (
    (interactions_df['t_dat'] - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1)
)

In [8]:
# Keep only the three interaction columns and give them `name:type` headers for the
# tab-separated .inter file written in a later cell.
df = (
    interactions_df
    .loc[:, ['user_id', 'item_id', 'timestamp']]
    .rename(columns={
        'user_id': 'user_id:token',
        'item_id': 'item_id:token',
        'timestamp': 'timestamp:float',
    })
)

In [9]:
!mkdir recbox_data

mkdir: cannot create directory 'recbox_data': File exists


In [10]:
# Write the interactions as a tab-separated file without the index column.
df.to_csv('recbox_data/recbox_data.inter', index=False, sep='\t')

In [11]:
# RecBole settings shared by the Config below and by every run_recbole() call in this notebook.
parameter_dict = {
    'data_path': '',
    'USER_ID_FIELD': 'user_id',
    'ITEM_ID_FIELD': 'item_id',
    'TIME_FIELD': 'timestamp',
    'device': 'GPU',
    # Interaction-count filters for users and items (lower bound 40, no upper bound).
    'user_inter_num_interval': "[40,inf)",
    'item_inter_num_interval': "[40,inf)",
    'load_col': {'inter': ['user_id', 'item_id', 'timestamp']},
    # NOTE(review): the run logs still report uniform train_neg_sample_args, so this key may
    # not be taking effect with the installed RecBole version - confirm.
    'neg_sampling': None,
    'epochs': 10,
    'eval_args': {
        # Ratio split 9 : 0 : 1, i.e. no validation share (runs report best_valid_result None).
        'split': {'RS': [9, 0, 1]},
        'group_by': 'user',
        'order': 'TO',
        'mode': 'full'}
}
config = Config(model='MultiVAE', dataset='recbox_data', config_dict=parameter_dict)

# init random seed
init_seed(config['seed'], config['reproducibility'])

# logger initialization
init_logger(config)
logger = getLogger()
# Create a console handler, but attach it only when the root logger has no plain stream
# handler yet. Adding it unconditionally made every record print twice (once formatted, once
# bare) and would add one more copy each time this cell is re-run.
c_handler = logging.StreamHandler()
c_handler.setLevel(logging.INFO)
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(c_handler)

# write config info into log
# logger.info(config)

In [12]:
# Build the dataset described by config and log its summary
# (user/item/interaction counts, sparsity, remaining fields).
dataset = create_dataset(config)
logger.info(dataset)

13 Dec 16:19    INFO  recbox_data
The number of users: 13355
Average actions of users: 63.815710648494836
The number of items: 3294
Average actions of items: 258.78985727300335
The number of inters: 852195
The sparsity of the dataset: 98.06281322904924%
Remain Fields: ['user_id', 'item_id', 'timestamp']
recbox_data
The number of users: 13355
Average actions of users: 63.815710648494836
The number of items: 3294
Average actions of items: 258.78985727300335
The number of inters: 852195
The sparsity of the dataset: 98.06281322904924%
Remain Fields: ['user_id', 'item_id', 'timestamp']


In [13]:
# Dataset splitting into train / validation / test loaders according to config['eval_args'].
# The configured ratio split is [9, 0, 1], so the validation share is 0.
train_data, valid_data, test_data = data_preparation(config, dataset)

13 Dec 16:19    INFO  [Training]: train_batch_size = [2048] train_neg_sample_args: [{'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
[Training]: train_batch_size = [2048] train_neg_sample_args: [{'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
13 Dec 16:19    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [9, 0, 1]}, 'order': 'TO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}]
[Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [9, 0, 1]}, 'order': 'TO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}]


In [15]:
%%time
# Run every model in model_list through run_recbole with the shared parameter_dict and
# collect one result dict per model. The recorded wall time for this cell is close to two
# hours, most of it spent on RepeatNet.
model_list = ['MultiVAE', 'MultiDAE', 'MacridVAE',
              "NeuMF", "RecVAE", 'RepeatNet']
results = []
for model_name in model_list:
    print(f"running {model_name}...")
    start = time.time()
    result = run_recbole(model=model_name, dataset='recbox_data', config_dict=parameter_dict)
    t = time.time() - start
    print(f"It took {t / 60:.2f} mins")
    # Printed before the 'model' key is added, so the output shows the raw run_recbole result.
    print(result)

    # Tag the result with its model name so the rows can be told apart in the summary table.
    result.update({'model': model_name})
    results.append(result)

running MultiVAE...


13 Dec 11:56    INFO  ['/home/slfdstrctd/.local/lib/python3.10/site-packages/ipykernel_launcher.py', '-f', '/home/slfdstrctd/.local/share/jupyter/runtime/kernel-2383744d-33c5-45a5-a000-6abb170b4173.json']
['/home/slfdstrctd/.local/lib/python3.10/site-packages/ipykernel_launcher.py', '-f', '/home/slfdstrctd/.local/share/jupyter/runtime/kernel-2383744d-33c5-45a5-a000-6abb170b4173.json']
13 Dec 11:56    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = recbox_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 10
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_de

It took 2.83 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0837), ('mrr@10', 0.1671), ('ndcg@10', 0.0817), ('hit@10', 0.3486), ('precision@10', 0.0464)])}
running MultiDAE...


13 Dec 11:59    INFO  ['/home/slfdstrctd/.local/lib/python3.10/site-packages/ipykernel_launcher.py', '-f', '/home/slfdstrctd/.local/share/jupyter/runtime/kernel-2383744d-33c5-45a5-a000-6abb170b4173.json']
['/home/slfdstrctd/.local/lib/python3.10/site-packages/ipykernel_launcher.py', '-f', '/home/slfdstrctd/.local/share/jupyter/runtime/kernel-2383744d-33c5-45a5-a000-6abb170b4173.json']
13 Dec 11:59    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = recbox_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 10
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_de

It took 3.25 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0839), ('mrr@10', 0.1658), ('ndcg@10', 0.0815), ('hit@10', 0.3473), ('precision@10', 0.0465)])}
running MacridVAE...


13 Dec 12:02    INFO  ['/home/slfdstrctd/.local/lib/python3.10/site-packages/ipykernel_launcher.py', '-f', '/home/slfdstrctd/.local/share/jupyter/runtime/kernel-2383744d-33c5-45a5-a000-6abb170b4173.json']
['/home/slfdstrctd/.local/lib/python3.10/site-packages/ipykernel_launcher.py', '-f', '/home/slfdstrctd/.local/share/jupyter/runtime/kernel-2383744d-33c5-45a5-a000-6abb170b4173.json']
13 Dec 12:02    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = recbox_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 10
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_de

It took 4.87 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0842), ('mrr@10', 0.1612), ('ndcg@10', 0.08), ('hit@10', 0.3515), ('precision@10', 0.0463)])}
running NeuMF...


13 Dec 12:07    INFO  ['/home/slfdstrctd/.local/lib/python3.10/site-packages/ipykernel_launcher.py', '-f', '/home/slfdstrctd/.local/share/jupyter/runtime/kernel-2383744d-33c5-45a5-a000-6abb170b4173.json']
['/home/slfdstrctd/.local/lib/python3.10/site-packages/ipykernel_launcher.py', '-f', '/home/slfdstrctd/.local/share/jupyter/runtime/kernel-2383744d-33c5-45a5-a000-6abb170b4173.json']
13 Dec 12:07    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = recbox_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 10
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_de

It took 6.51 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.0689), ('mrr@10', 0.1174), ('ndcg@10', 0.0606), ('hit@10', 0.3014), ('precision@10', 0.038)])}
running RecVAE...


13 Dec 12:14    INFO  ['/home/slfdstrctd/.local/lib/python3.10/site-packages/ipykernel_launcher.py', '-f', '/home/slfdstrctd/.local/share/jupyter/runtime/kernel-2383744d-33c5-45a5-a000-6abb170b4173.json']
['/home/slfdstrctd/.local/lib/python3.10/site-packages/ipykernel_launcher.py', '-f', '/home/slfdstrctd/.local/share/jupyter/runtime/kernel-2383744d-33c5-45a5-a000-6abb170b4173.json']
13 Dec 12:14    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = recbox_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 10
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_de

It took 4.46 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.085), ('mrr@10', 0.1681), ('ndcg@10', 0.0825), ('hit@10', 0.354), ('precision@10', 0.0471)])}
running RepeatNet...


13 Dec 12:18    INFO  ['/home/slfdstrctd/.local/lib/python3.10/site-packages/ipykernel_launcher.py', '-f', '/home/slfdstrctd/.local/share/jupyter/runtime/kernel-2383744d-33c5-45a5-a000-6abb170b4173.json']
['/home/slfdstrctd/.local/lib/python3.10/site-packages/ipykernel_launcher.py', '-f', '/home/slfdstrctd/.local/share/jupyter/runtime/kernel-2383744d-33c5-45a5-a000-6abb170b4173.json']
13 Dec 12:18    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = recbox_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 10
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_de

It took 90.56 mins
{'best_valid_score': -inf, 'valid_score_bigger': True, 'best_valid_result': None, 'test_result': OrderedDict([('recall@10', 0.1362), ('mrr@10', 0.0586), ('ndcg@10', 0.0767), ('hit@10', 0.1362), ('precision@10', 0.0136)])}
CPU times: user 2h 47min 14s, sys: 2h 5min 34s, total: 4h 52min 48s
Wall time: 1h 52min 28s


In [12]:
# One row per model run. Note: this rebinds `df`, which earlier held the interaction export frame.
df = pd.DataFrame(results)

In [13]:
# Raw per-model results; test_result is still a nested dict at this point.
df

Unnamed: 0,model,best_valid_score,valid_score_bigger,best_valid_result,test_result
0,MultiVAE,-inf,True,,"{'recall@10': 0.0837, 'mrr@10': 0.1671, 'ndcg@..."
1,MultiDAE,-inf,True,,"{'recall@10': 0.0839, 'mrr@10': 0.1658, 'ndcg@..."
2,MacridVAE,-inf,True,,"{'recall@10': 0.0842, 'mrr@10': 0.1612, 'ndcg@..."
3,NeuMF,-inf,True,,"{'recall@10': 0.0689, 'mrr@10': 0.1174, 'ndcg@..."
4,RecVAE,-inf,True,,"{'recall@10': 0.085, 'mrr@10': 0.1681, 'ndcg@1..."
5,RepeatNet,-inf,True,,"{'recall@10': 0.1362, 'mrr@10': 0.0586, 'ndcg@..."


In [14]:
# Expand each test_result dict into one column per metric.
df_test_results = pd.json_normalize(df['test_result'])

In [15]:
# Metric columns only; rows are in the same order as model_list.
df_test_results

Unnamed: 0,recall@10,mrr@10,ndcg@10,hit@10,precision@10
0,0.0837,0.1671,0.0817,0.3486,0.0464
1,0.0839,0.1658,0.0815,0.3473,0.0465
2,0.0842,0.1612,0.08,0.3515,0.0463
3,0.0689,0.1174,0.0606,0.3014,0.038
4,0.085,0.1681,0.0825,0.354,0.0471
5,0.1362,0.0586,0.0767,0.1362,0.0136


In [16]:
# Put the model name next to its metric columns. This rebinds `df` again and drops the
# test_result column, so the json_normalize cell above cannot be re-run after this one.
df = pd.concat([df['model'], df_test_results], axis=1)

In [17]:
# Final comparison table: one row per model, one column per test metric.
df

Unnamed: 0,model,recall@10,mrr@10,ndcg@10,hit@10,precision@10
0,MultiVAE,0.0837,0.1671,0.0817,0.3486,0.0464
1,MultiDAE,0.0839,0.1658,0.0815,0.3473,0.0465
2,MacridVAE,0.0842,0.1612,0.08,0.3515,0.0463
3,NeuMF,0.0689,0.1174,0.0606,0.3014,0.038
4,RecVAE,0.085,0.1681,0.0825,0.354,0.0471
5,RepeatNet,0.1362,0.0586,0.0767,0.1362,0.0136


In [18]:
def show_pivot(results, group=False):
    """Display per-model mean metrics with the best and worst value of each column highlighted.

    Args:
        results: DataFrame with a ``model`` column plus numeric metric columns
            (for example ``recall@10``).
        group: When True, split ``metric@k`` column names into a two-level
            (Metric, Value) column index ordered by metric name, then cutoff.

    Returns:
        None. The styled table is rendered through ``display``.
    """
    pivot_results = results.groupby(["model"]).mean()

    if group:
        def split_metric(col):
            # "recall@10" -> ("recall", 10); names without "@" keep an empty cutoff.
            if "@" in col:
                parts = col.split("@")
                return parts[0], int(parts[1])
            return col, ""

        labels = [split_metric(col) for col in pivot_results.columns]
        # Reorder the data columns together with their labels. Sorting only the labels
        # attached metric names to the wrong columns. The explicit key also avoids comparing
        # an int cutoff with the "" placeholder.
        order = sorted(
            range(len(labels)),
            key=lambda pos: (labels[pos][0], labels[pos][1] != "", labels[pos][1] or 0),
        )
        pivot_results = pivot_results.iloc[:, order]
        pivot_results.columns = pd.MultiIndex.from_tuples(
            [labels[pos] for pos in order], names=["Metric", "Value"])

    display(
        pivot_results.style
        .highlight_min(color='lightcoral', axis=0)
        .highlight_max(color='lightgreen', axis=0)
    )

In [19]:
# Highlight the best (green) and worst (red) model per metric column.
show_pivot(df)

Unnamed: 0_level_0,recall@10,mrr@10,ndcg@10,hit@10,precision@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MacridVAE,0.0842,0.1612,0.08,0.3515,0.0463
MultiDAE,0.0839,0.1658,0.0815,0.3473,0.0465
MultiVAE,0.0837,0.1671,0.0817,0.3486,0.0464
NeuMF,0.0689,0.1174,0.0606,0.3014,0.038
RecVAE,0.085,0.1681,0.0825,0.354,0.0471
RepeatNet,0.1362,0.0586,0.0767,0.1362,0.0136


По большинству метрик (MRR@10, NDCG@10, Hit@10, Precision@10) лучшая модель — RecVAE; по Recall@10 лидирует RepeatNet.

In [16]:
# Restore the RecVAE checkpoint saved by the training run. This rebinds config, dataset and
# the train/valid/test loaders created earlier in the notebook. The file name carries the
# timestamp of that specific run, so it must be updated after retraining.
config, model, dataset, train_data, valid_data, test_data = load_data_and_model( 
         model_file='./saved/RecVAE-Dec-13-2023_12-14-31.pth', 
     )

13 Dec 16:19    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = recbox_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 10
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [9, 0, 1]}, 'order': 'TO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']
topk = [10]
valid_metric = MRR@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separa

In [17]:
def recommend(user_id, topn=10):
    """Return the ``topn`` highest-scored item tokens for one user.

    Uses the notebook-level ``dataset``, ``model``, ``config`` and ``test_data``.

    Args:
        user_id: External user token, as stored in the dataset's user-id field.
        topn: Number of items to return.

    Returns:
        A nested list ``[[item_token, ...]]`` holding one inner list of item tokens,
        or None when ``user_id`` is unknown to the dataset or is the "[PAD]" token.
    """
    known_users = dataset.field2token_id[dataset.uid_field]
    if user_id not in known_users or user_id == "[PAD]":
        return None

    model.eval()
    with torch.no_grad():
        internal_uid = dataset.token2id(dataset.uid_field, [user_id])
        # Select every interaction row that belongs to this user.
        row_mask = np.isin(dataset[dataset.uid_field].numpy(), internal_uid)
        user_rows = dataset[row_mask].to(config["device"])
        scores = model.full_sort_predict(user_rows).view(-1, test_data.dataset.item_num)
        # Never recommend item index 0 (presumably the padding item - confirm).
        scores[:, 0] = -np.inf
        # Only the first score row is used for the ranking.
        best_item_ids = torch.topk(scores, topn).indices[0].tolist()
        recos = dataset.id2token(dataset.iid_field, [best_item_ids]).tolist()
    return recos

In [44]:
# Build {user_token: recommendations} for every user token known to the dataset.
recs = {}
users = dataset.field2token_id[dataset.uid_field]
for user_id in tqdm(users):
    # recommend() takes (user_id, topn=10) and reads dataset/model from the notebook scope;
    # passing them positionally raised a TypeError.
    reco = recommend(user_id)
    if reco:
        recs[user_id] = reco
    else:
        # recommend() returns None for the "[PAD]" token, so one such line is expected.
        print("ERROR", user_id)

  0%|          | 0/13355 [00:00<?, ?it/s]

ERROR [PAD]


100%|██████████| 13355/13355 [52:26<00:00,  4.24it/s]


In [18]:
# Spot-check a single user; the result is a nested list holding one list of item tokens.
print(recommend(str(899105), 10))

[['10440', '3734', '15297', '13865', '4880', '9728', '4151', '142', '2657', '7571']]


Convert recs to integers

In [None]:
# Cast user tokens and item tokens to int, and unwrap the single inner list that
# recommend() returns for each user.
int_recs = {}

for raw_user, nested_items in recs.items():
    user_key = int(raw_user)
    items_as_int = [list(map(int, inner)) for inner in nested_items]
    int_recs[user_key] = items_as_int[0]

In [85]:
# Persist the {user_id: [item_id, ...]} mapping.
# Only unpickle this file from a trusted location: pickle.load can execute arbitrary code.
with open('../saved_models/RecVAE_offline.pkl', 'wb') as f:
    pickle.dump(int_recs, f)