In [None]:
# !pip install recbole

In [18]:
import os
from pathlib import Path

os.chdir('/content/drive/MyDrive/hnm')
DATA_PATH = Path.cwd() / 'data'
RAW = DATA_PATH / 'raw'
PROCESSED = DATA_PATH / 'processed'
SUBMISSION = DATA_PATH / 'submission'

import numpy as np
import pandas as pd

import logging
from logging import getLogger
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.sequential_recommender import SASRec
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger

from tqdm import tqdm

In [4]:
os.chdir('/content/drive/MyDrive/hnm/sasrec')

In [8]:
parameter_dict = {
    'data_path': PROCESSED,
    'USER_ID_FIELD': 'user_id',
    'ITEM_ID_FIELD': 'item_id',
    'TIME_FIELD': 'timestamp',
    'user_inter_num_interval': "[30,inf)",
    'item_inter_num_interval': "[30,inf)",
    'load_col': {'inter': ['user_id', 'item_id', 'timestamp']},
    'neg_sampling': None,
    'epochs': 50,
    'metrics': ['MAP'],
    'valid_metric': 'MAP@12',
    'topk': [12],
    'eval_args': {
        'split': {'RS': [9, 0, 1]},
        'group_by': 'user',
        'order': 'TO',
        'mode': 'full'},
    'learning_rate': 0.01,
    'train_batch_size': 4096,
    'eval_batch_size': 4096
}

config = Config(model='SASRec', dataset='recbox_data', config_dict=parameter_dict)

# init random seed
init_seed(config['seed'], config['reproducibility'])

# logger initialization
init_logger(config)
logger = getLogger()
# Create handlers
c_handler = logging.StreamHandler()
c_handler.setLevel(logging.INFO)
logger.addHandler(c_handler)

# write config info into log
logger.info(config)

30 Mar 13:29    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = /content/drive/MyDrive/hnm/data/processed/recbox_data
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 50
train_batch_size = 4096
learner = adam
learning_rate = 0.01
neg_sampling = None
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [9, 0, 1]}, 'group_by': 'user', 'order': 'TO', 'mode': 'full'}
repeatable = True
metrics = ['MAP']
topk = [12]
valid_metric = MAP@12
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
seq_separator =  
USER_ID_FIELD = user_id
ITEM_ID_FIELD = item_id
RATING_FIELD = rating
TIME_FIELD = timestamp
seq_len

In [9]:
dataset = create_dataset(config)
logger.info(dataset)

30 Mar 13:34    INFO  recbox_data
The number of users: 306733
Average actions of users: 72.26853735508521
The number of items: 59157
Average actions of items: 374.72231050104807
The number of inters: 22167073
The sparsity of the dataset: 99.8778364322941%
Remain Fields: ['user_id', 'item_id', 'timestamp']
recbox_data
The number of users: 306733
Average actions of users: 72.26853735508521
The number of items: 59157
Average actions of items: 374.72231050104807
The number of inters: 22167073
The sparsity of the dataset: 99.8778364322941%
Remain Fields: ['user_id', 'item_id', 'timestamp']
recbox_data
The number of users: 306733
Average actions of users: 72.26853735508521
The number of items: 59157
Average actions of items: 374.72231050104807
The number of inters: 22167073
The sparsity of the dataset: 99.8778364322941%
Remain Fields: ['user_id', 'item_id', 'timestamp']


In [10]:
train_data, valid_data, test_data = data_preparation(config, dataset)


30 Mar 13:39    INFO  [Training]: train_batch_size = [4096] negative sampling: [None]
[Training]: train_batch_size = [4096] negative sampling: [None]
[Training]: train_batch_size = [4096] negative sampling: [None]
30 Mar 13:39    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [9, 0, 1]}, 'group_by': 'user', 'order': 'TO', 'mode': 'full'}]
[Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [9, 0, 1]}, 'group_by': 'user', 'order': 'TO', 'mode': 'full'}]
[Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [9, 0, 1]}, 'group_by': 'user', 'order': 'TO', 'mode': 'full'}]


In [11]:
model = SASRec(config, train_data.dataset).to(config['device'])
logger.info(model)

30 Mar 13:39    INFO  SASRec(
  (item_embedding): Embedding(59157, 64, padding_idx=0)
  (position_embedding): Embedding(50, 64)
  (trm_encoder): TransformerEncoder(
    (layer): ModuleList(
      (0): TransformerLayer(
        (multi_head_attention): MultiHeadAttention(
          (query): Linear(in_features=64, out_features=64, bias=True)
          (key): Linear(in_features=64, out_features=64, bias=True)
          (value): Linear(in_features=64, out_features=64, bias=True)
          (softmax): Softmax(dim=-1)
          (attn_dropout): Dropout(p=0.5, inplace=False)
          (dense): Linear(in_features=64, out_features=64, bias=True)
          (LayerNorm): LayerNorm((64,), eps=1e-12, elementwise_affine=True)
          (out_dropout): Dropout(p=0.5, inplace=False)
        )
        (feed_forward): FeedForward(
          (dense_1): Linear(in_features=64, out_features=256, bias=True)
          (dense_2): Linear(in_features=256, out_features=64, bias=True)
          (LayerNorm): LayerNorm((

In [12]:
trainer = Trainer(config, model)

best_valid_score, best_valid_result = trainer.fit(train_data, valid_data=valid_data, show_progress=True)

Train     0: 100%|█████████████████████| 4837/4837 [15:59<00:00,  5.04it/s, GPU RAM: 7.33 G/15.78 G]
30 Mar 13:55    INFO  epoch 0 training [time: 959.80s, train loss: 41748.3684]
epoch 0 training [time: 959.80s, train loss: 41748.3684]
epoch 0 training [time: 959.80s, train loss: 41748.3684]
30 Mar 13:55    INFO  Saving current: saved/SASRec-Mar-30-2022_13-39-54.pth
Saving current: saved/SASRec-Mar-30-2022_13-39-54.pth
Saving current: saved/SASRec-Mar-30-2022_13-39-54.pth
Train     1: 100%|█████████████████████| 4837/4837 [16:07<00:00,  5.00it/s, GPU RAM: 7.33 G/15.78 G]
30 Mar 14:12    INFO  epoch 1 training [time: 967.07s, train loss: 40313.5844]
epoch 1 training [time: 967.07s, train loss: 40313.5844]
epoch 1 training [time: 967.07s, train loss: 40313.5844]
30 Mar 14:12    INFO  Saving current: saved/SASRec-Mar-30-2022_13-39-54.pth
Saving current: saved/SASRec-Mar-30-2022_13-39-54.pth
Saving current: saved/SASRec-Mar-30-2022_13-39-54.pth
Train     2: 100%|█████████████████████| 483

In [19]:
from recbole.utils.case_study import full_sort_topk

external_user_ids = dataset.id2token(dataset.uid_field, list(range(dataset.user_num)))[1:]

topk_items = []
for internal_user_id in tqdm(list(range(dataset.user_num))[1:]):
    _, topk_iid_list = full_sort_topk([internal_user_id], model, test_data, k=12, device=config['device'])
    last_topk_iid_list = topk_iid_list[-1]
    external_item_list = dataset.id2token(dataset.iid_field, last_topk_iid_list.cpu()).tolist()
    topk_items.append(external_item_list)



100%|██████████| 306732/306732 [27:54<00:00, 183.16it/s]


In [20]:
external_item_str = [' '.join(x) for x in topk_items]
result = pd.DataFrame(external_user_ids, columns=['customer_id'])
result['prediction'] = external_item_str
result.head()

Unnamed: 0,customer_id,prediction
0,00007d2de826758b65a93dd24ce629ed66842531df6699...,554757003 816914001 629551003 509210020 509210...
1,0008968c0d451dbc5a9968da03196fe20051965edde741...,857163001 751664001 868134001 863595006 768912...
2,000aa7f0dc06cd7174389e76c9e132a67860c5f65f9706...,851996001 861036001 851996002 871957001 861036...
3,001127bffdda108579e6cb16080440e89bf1250a776c6e...,879965001 879965002 803772002 863937003 817472...
4,001ea4e9c54f7e9c88811260d954edc059d596147e1cf8...,739590032 860949002 838055002 846279001 351484...


In [25]:
submission = pd.read_csv(PROCESSED / 'lstm_ensemble.csv')
submission = pd.merge(submission, result, on='customer_id', how='outer')
submission = submission.fillna(-1)
submission['prediction'] = submission.apply(
    lambda x: x['prediction_y'] if x['prediction_y'] != -1 else x['prediction_x'], axis=1)

In [None]:
external_item_str = [' '.join(x) for x in topk_items]
result = pd.DataFrame(external_user_ids, columns=['customer_id'])
result['prediction'] = external_item_str
result.head()

Unnamed: 0,customer_id,prediction
0,00007d2de826758b65a93dd24ce629ed66842531df6699...,554757003 484398001 768912001 179123001 720125...
1,0008968c0d451dbc5a9968da03196fe20051965edde741...,863595006 904545002 883307004 921906003 868134...
2,000aa7f0dc06cd7174389e76c9e132a67860c5f65f9706...,851996001 827968004 714790020 706016001 803468...
3,001127bffdda108579e6cb16080440e89bf1250a776c6e...,706016001 749699002 762846001 762846008 706016...
4,001ea4e9c54f7e9c88811260d954edc059d596147e1cf8...,739590032 860949002 860949003 846279001 811835...


In [26]:
submission = submission.drop(columns=['prediction_y', 'prediction_x'])
submission.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0568601006 0656719005 0745232001 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,351484002 838055003 723529001 699080001 599580...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0852643001 0852643003 0858883002 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0448509014 0924243001 0573085028 0751471001 09...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0730683050 0924243001 0791587015 0896152002 09...


In [28]:
submission.to_csv(SUBMISSION / 'sasrec.csv', index=False)