In [1]:
pip install recbole

Collecting recbole
  Downloading recbole-1.0.1-py3-none-any.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 3.3 MB/s eta 0:00:01
[?25hCollecting tensorboard>=2.5.0
  Using cached tensorboard-2.8.0-py3-none-any.whl (5.8 MB)
Collecting scipy==1.6.0
  Downloading scipy-1.6.0-cp39-cp39-macosx_10_9_x86_64.whl (30.9 MB)
[K     |████████████████████████████████| 30.9 MB 4.6 MB/s eta 0:00:011
Collecting colorama==0.4.4
  Using cached colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting scikit-learn>=0.23.2
  Using cached scikit_learn-1.0.2-cp39-cp39-macosx_10_13_x86_64.whl (8.0 MB)
Collecting pyyaml>=5.1.0
  Downloading PyYAML-6.0-cp39-cp39-macosx_10_9_x86_64.whl (197 kB)
[K     |████████████████████████████████| 197 kB 2.2 MB/s eta 0:00:01
Collecting colorlog==4.7.2
  Downloading colorlog-4.7.2-py2.py3-none-any.whl (10 kB)
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting markdown>=2.6.8
  Using cached Markdown-3.3.6-py3

In [2]:
import os
from pathlib import Path

os.chdir('/content/drive/MyDrive/hnm')
DATA_PATH = Path.cwd() / 'data'
RAW = DATA_PATH / 'raw'
PROCESSED = DATA_PATH / 'processed'
SUBMISSION = DATA_PATH / 'submission'

import pandas as pd
import numpy as np

import logging
from logging import getLogger
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.sequential_recommender import GRU4Rec
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger

In [6]:
os.chdir('/content/drive/MyDrive/hnm/gru4rec')

In [7]:
parameter_dict = {
    'data_path': PROCESSED,
    'USER_ID_FIELD': 'user_id',
    'ITEM_ID_FIELD': 'item_id',
    'TIME_FIELD': 'timestamp',
    'user_inter_num_interval': "[40,inf)",
    'item_inter_num_interval': "[40,inf)",
    'load_col': {'inter': ['user_id', 'item_id', 'timestamp']},
    'neg_sampling': None,
    'epochs': 50,
    'metrics': ['MAP'],
    'valid_metric': 'MAP@12',
    'topk': [12],
    'eval_args': {
        'split': {'RS': [9, 0, 1]},
        'group_by': 'user',
        'order': 'TO',
        'mode': 'full'}
}

config = Config(model='GRU4Rec', dataset='recbox_data', config_dict=parameter_dict)

# init random seed
init_seed(config['seed'], config['reproducibility'])

# logger initialization
init_logger(config)
logger = getLogger()
# Create handlers
c_handler = logging.StreamHandler()
c_handler.setLevel(logging.INFO)
logger.addHandler(c_handler)

# write config info into log
logger.info(config)

29 Mar 12:23    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = /content/drive/MyDrive/hnm/data/processed/recbox_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 50
train_batch_size = 2048
learner = adam
learning_rate = 0.001
neg_sampling = None
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [9, 0, 1]}, 'group_by': 'user', 'order': 'TO', 'mode': 'full'}
repeatable = True
metrics = ['MAP']
topk = [12]
valid_metric = MAP@12
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
seq_separator =  
USER_ID_FIELD = user_id
ITEM_ID_FIELD = item_id
RATING_FIELD = rating
TIME_FIELD = timestamp
seq_le

In [8]:
dataset = create_dataset(config)
logger.info(dataset)

29 Mar 12:28    INFO  recbox_data
The number of users: 223128
Average actions of users: 85.38935673405729
The number of items: 51558
Average actions of items: 369.5457648815874
The number of inters: 19052671
The sparsity of the dataset: 99.8343826873777%
Remain Fields: ['user_id', 'item_id', 'timestamp']
recbox_data
The number of users: 223128
Average actions of users: 85.38935673405729
The number of items: 51558
Average actions of items: 369.5457648815874
The number of inters: 19052671
The sparsity of the dataset: 99.8343826873777%
Remain Fields: ['user_id', 'item_id', 'timestamp']
recbox_data
The number of users: 223128
Average actions of users: 85.38935673405729
The number of items: 51558
Average actions of items: 369.5457648815874
The number of inters: 19052671
The sparsity of the dataset: 99.8343826873777%
Remain Fields: ['user_id', 'item_id', 'timestamp']


In [9]:
train_data, valid_data, test_data = data_preparation(config, dataset)


29 Mar 12:33    INFO  [Training]: train_batch_size = [2048] negative sampling: [None]
[Training]: train_batch_size = [2048] negative sampling: [None]
[Training]: train_batch_size = [2048] negative sampling: [None]
29 Mar 12:33    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [9, 0, 1]}, 'group_by': 'user', 'order': 'TO', 'mode': 'full'}]
[Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [9, 0, 1]}, 'group_by': 'user', 'order': 'TO', 'mode': 'full'}]
[Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [9, 0, 1]}, 'group_by': 'user', 'order': 'TO', 'mode': 'full'}]


In [10]:
model = GRU4Rec(config, train_data.dataset).to(config['device'])
logger.info(model)

29 Mar 12:34    INFO  GRU4Rec(
  (item_embedding): Embedding(51558, 64, padding_idx=0)
  (emb_dropout): Dropout(p=0.3, inplace=False)
  (gru_layers): GRU(64, 128, bias=False, batch_first=True)
  (dense): Linear(in_features=128, out_features=64, bias=True)
  (loss_fct): CrossEntropyLoss()
)
Trainable parameters: 3381696
GRU4Rec(
  (item_embedding): Embedding(51558, 64, padding_idx=0)
  (emb_dropout): Dropout(p=0.3, inplace=False)
  (gru_layers): GRU(64, 128, bias=False, batch_first=True)
  (dense): Linear(in_features=128, out_features=64, bias=True)
  (loss_fct): CrossEntropyLoss()
)
Trainable parameters: 3381696
GRU4Rec(
  (item_embedding): Embedding(51558, 64, padding_idx=0)
  (emb_dropout): Dropout(p=0.3, inplace=False)
  (gru_layers): GRU(64, 128, bias=False, batch_first=True)
  (dense): Linear(in_features=128, out_features=64, bias=True)
  (loss_fct): CrossEntropyLoss()
)
Trainable parameters: 3381696


In [11]:
trainer = Trainer(config, model)

In [12]:
best_valid_score, best_valid_result = trainer.fit(train_data, valid_data=valid_data, show_progress=True)

Train     0: 100%|█████████████████████| 8323/8323 [05:39<00:00, 24.53it/s, GPU RAM: 2.20 G/15.78 G]
29 Mar 12:40    INFO  epoch 0 training [time: 339.31s, train loss: 71936.9081]
epoch 0 training [time: 339.31s, train loss: 71936.9081]
epoch 0 training [time: 339.31s, train loss: 71936.9081]
29 Mar 12:40    INFO  Saving current: saved/GRU4Rec-Mar-29-2022_12-34-37.pth
Saving current: saved/GRU4Rec-Mar-29-2022_12-34-37.pth
Saving current: saved/GRU4Rec-Mar-29-2022_12-34-37.pth
Train     1: 100%|█████████████████████| 8323/8323 [05:37<00:00, 24.68it/s, GPU RAM: 2.20 G/15.78 G]
29 Mar 12:46    INFO  epoch 1 training [time: 337.19s, train loss: 66461.1367]
epoch 1 training [time: 337.19s, train loss: 66461.1367]
epoch 1 training [time: 337.19s, train loss: 66461.1367]
29 Mar 12:46    INFO  Saving current: saved/GRU4Rec-Mar-29-2022_12-34-37.pth
Saving current: saved/GRU4Rec-Mar-29-2022_12-34-37.pth
Saving current: saved/GRU4Rec-Mar-29-2022_12-34-37.pth
Train     2: 100%|████████████████████