# 1. Install & Import

In [None]:
!pip install recbole

In [None]:
import os
import numpy as np
import pandas as pd
import gc

In [None]:
import logging
from logging import getLogger
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.sequential_recommender import GRU4RecF, FDSA, BERT4Rec, GRU4Rec#, SASRecF
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger

import random

import torch
from torch import nn

from recbole.model.abstract_recommender import SequentialRecommender
from recbole.model.layers import FeedForward
# from recbole.model.layers import FeatureSeqEmbLayer

from recbole.utils import FeatureType
import copy
import math

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as fn
from torch.nn.init import normal_

from recbole.utils import FeatureType, FeatureSource
import torch.nn.functional as F
from recbole.data.interaction import Interaction

# 2. Create atomic files for Recbole training

These datasets are all publicly available on Kaggle.

In [None]:
# Stage the interaction table under <data_path>/<dataset>/<dataset>.inter.
# os.makedirs(..., exist_ok=True) replaces the `!mkdir` shell magic so the cell
# is plain Python and can be re-run without a "File exists" error.
os.makedirs('/kaggle/working/hm_atomic_interation_with_item_feature', exist_ok=True)
# inter = pd.read_csv('../input/hm-atomic-interation-with-item-feature/hm_atomic_interation_with_item_feature.inter', sep='\t')

inter = pd.read_csv('../input/reduced-inter/recbox_data_post2020.inter', sep='\t')
# inter = inter[inter['timestamp:float'] > 1589620000 ]# 1595620000
inter.to_csv('/kaggle/working/hm_atomic_interation_with_item_feature/hm_atomic_interation_with_item_feature.inter', index=False, sep='\t')
# Drop the frame before the next large load to keep peak memory down.
del inter
gc.collect()

In [None]:
# Build the item feature file: load the per-article embeddings, rename the
# columns to typed "name:type" headers, and write them as a tab-separated
# .item file next to the .inter file.
# item = pd.read_csv('../input/bertembedding/out_bert_embed.csv')
# item = pd.read_csv('../input/tfidf-embedding/out_2.csv')
item = pd.read_csv('../input/feature-bert-embed/bert_embed_feature.csv').rename(
    columns={
        'article_id': 'item_id:token',
        'embed': 'item_emb:float_seq',
    }
)
print(item.head())
print(item.shape)
item.to_csv(
    '/kaggle/working/hm_atomic_interation_with_item_feature/hm_atomic_interation_with_item_feature.item',
    index=False,
    sep='\t',
)
# Release the frame once it is on disk.
del item
gc.collect()

# Create and train Recbole model

## Model training

This part trains the model.

In [None]:
# Settings handed to RecBole's Config: where the data lives, which columns to
# load, the model hyper-parameters, and how to split for evaluation.
parameter_dict = dict(
    # --- data location and column mapping ---
    data_path='/kaggle/working',
    USER_ID_FIELD='user_id',
    ITEM_ID_FIELD='item_id',
    TIME_FIELD='timestamp',
    user_inter_num_interval="[40,inf)",
    item_inter_num_interval="[40,inf)",
    load_col=dict(
        inter=['user_id', 'item_id', 'timestamp'],
        item=['item_id', 'item_emb'],
    ),
    selected_features=['item_emb'],
    # --- training ---
    neg_sampling=None,
    epochs=1,
    # train_batch_size=256,
    # --- model hyper-parameters ---
    n_layers=2,
    n_heads=2,
    hidden_size=64,
    inner_size=256,
    hidden_dropout_prob=0.5,
    attn_dropout_prob=0.5,
    hidden_act='gelu',
    layer_norm_eps=1e-12,
    initializer_range=0.02,
    mask_ratio=0.2,
    loss_type='CE',
    learning_rate=0.002,
    pooling_mode='sum',
    # --- evaluation / splitting ---
    # NOTE(review): RS [10, 0, 0] presumably assigns every interaction to the
    # training split - confirm against the RecBole eval_args documentation.
    eval_args=dict(
        split=dict(RS=[10, 0, 0]),
        group_by='user',
        order='TO',
        mode='full',
    ),
)

config = Config(
    model="BERT4Rec",
    dataset='hm_atomic_interation_with_item_feature',
    config_dict=parameter_dict,
)

# Seed the RNGs from the resolved config.
init_seed(config['seed'], config['reproducibility'])

# Set up RecBole logging, then attach an extra INFO-level stream handler to
# the root logger.
init_logger(config)
logger = getLogger()
c_handler = logging.StreamHandler()
c_handler.setLevel(logging.INFO)
logger.addHandler(c_handler)



In [None]:
# Build the dataset described by `config` and log its summary.
dataset = create_dataset(config)
logger.info(dataset)

In [None]:
# Split the dataset into train / validation / test data according to `config`.
train_data, valid_data, test_data = data_preparation(config, dataset)

In [None]:
# # model loading and initialization
model = BERT4Rec(config, train_data.dataset).to(config['device'])
logger.info(model)

# trainer loading and initialization
trainer = Trainer(config, model)

# model training
best_valid_score, best_valid_result = trainer.fit(train_data)

The following commented code chunk was used for loading trained models.

In [None]:
# model_file = "../input/onehot-bert-m/BERT4RecF-Apr-18-2022_02-18-35.pth"
# checkpoint = torch.load(model_file)
# config = checkpoint['config']
# init_seed(config['seed'], config['reproducibility'])
# init_logger(config)
# logger = getLogger()
# logger.info(config)
# model = BERT4RecF(config, train_data.dataset).to(config['device'])
# model.load_state_dict(checkpoint['state_dict'])
# model.load_other_parameter(checkpoint.get('other_parameter'))


## Combine models

This part makes predictions and fills in the "cold-start" customers with the 12 most frequent items.

In [None]:
from recbole.utils.case_study import full_sort_topk
from recbole.quick_start.quick_start import load_data_and_model
# config, model, dataset, train_data, valid_data, test_data = load_data_and_model(
#     model_file='/kaggle/working/saved/SASRecF-Apr-05-2022_20-56-46.pth',
# )
# Map every internal user id back to its external token.
all_internal_uids = list(range(dataset.user_num))
all_user_tokens = dataset.id2token(dataset.uid_field, all_internal_uids)
# The first element in the array is 'PAD' (RecBole's default) -> remove it.
external_user_ids = all_user_tokens[1:]

In [None]:
import torch
from recbole.data.interaction import Interaction

def add_last_item(old_interaction, last_item_id, max_len=50):
    """Return the last item sequence of ``old_interaction`` with one item appended.

    Takes the final row of ``old_interaction['item_id_list']`` and appends
    ``last_item_id``. If the row's recorded length is below ``max_len`` the
    item is written at position ``length``; otherwise the row is shifted left
    by one and the item replaces the last slot.

    The input is never modified: the row is cloned first. Previously the
    short-sequence branch wrote into a view of the caller's tensor.

    Args:
        old_interaction: Mapping with ``'item_id_list'`` (2-D tensor, one
            sequence per row) and ``'item_length'`` (1-D tensor of lengths).
        last_item_id: Item id to append.
        max_len: Maximum sequence length.

    Returns:
        A tensor of shape ``(1, sequence_width)`` holding the updated sequence.
    """
    # Clone so the caller's interaction tensor is left untouched.
    new_seq_items = old_interaction['item_id_list'][-1].clone()
    seq_len = old_interaction['item_length'][-1].item()
    if seq_len < max_len:
        # Room left: write into the first unused slot.
        new_seq_items[seq_len] = last_item_id
    else:
        # Full: drop the oldest item and put the new one at the end.
        new_seq_items = torch.roll(new_seq_items, -1)
        new_seq_items[-1] = last_item_id
    return new_seq_items.view(1, len(new_seq_items))

def predict_for_all_item(external_user_id, dataset, model):
    """Score every item for one user and return the 12 best item ids.

    Selects the user's rows from ``dataset.inter_feat``, extends the most
    recent item sequence with that row's target item (via ``add_last_item``),
    and runs ``model.full_sort_predict`` on the resulting one-row interaction.

    Relies on the module-level ``config`` (for the device) and ``test_data``
    (for the item count).

    Args:
        external_user_id: External user token, as stored in the dataset.
        dataset: RecBole dataset providing ``token2id``, ``inter_feat`` and
            boolean-mask indexing.
        model: Sequential recommender with ``max_seq_length`` and
            ``full_sort_predict``.

    Returns:
        ``(topk_ids, attention)``: ``topk_ids`` holds the internal ids of the
        12 highest-scoring items per row. ``attention`` is the second element
        returned by the model when ``full_sort_predict`` returns a tuple, and
        ``None`` when it returns only the scores.
    """
    model.eval()
    with torch.no_grad():
        uid_series = dataset.token2id(dataset.uid_field, [external_user_id])
        # Boolean mask over all interaction rows belonging to this user.
        index = np.isin(dataset.inter_feat[dataset.uid_field].numpy(), uid_series)
        input_interaction = dataset[index]
        last_len = input_interaction['item_length'][-1].item()
        test = {
            'item_id_list': add_last_item(input_interaction,
                                          input_interaction['item_id'][-1].item(), model.max_seq_length),
            # One item was appended, but the length is capped at max_seq_length.
            'item_length': torch.tensor([min(last_len + 1, model.max_seq_length)]),
        }
        new_inter = Interaction(test)
        new_inter = new_inter.to(config['device'])
        output = model.full_sort_predict(new_inter)
        # Some models return (scores, attention); others return only a score
        # tensor. Unpacking a one-row score tensor into two names would raise.
        if isinstance(output, (tuple, list)):
            new_scores, attention = output
        else:
            new_scores, attention = output, None
        new_scores = new_scores.view(-1, test_data.dataset.item_num)
        new_scores[:, 0] = -np.inf  # set scores of [pad] to -inf
    return torch.topk(new_scores, 12)[1], attention

In [None]:
# Collect the top-12 external item tokens for each user.
# NOTE(review): the first 112 users are skipped here; the reason is not
# visible in this notebook - confirm it is intentional.
topk_items = []
for user_token in external_user_ids[112:]:
    ranked_iids, _attention = predict_for_all_item(user_token, dataset, model)
    # Keep the last row of the top-k result and convert ids back to tokens.
    best_iids = ranked_iids[-1].cpu()
    topk_items.append(dataset.id2token(dataset.iid_field, best_iids).tolist())
print(len(topk_items))

In [None]:
# One space-separated prediction string per predicted user.
external_item_str = [' '.join(x) for x in topk_items]
# The prediction loop covers only a trailing slice of external_user_ids (it
# starts at an offset), so topk_items can be shorter than external_user_ids.
# Pair the predictions with the user ids they were computed for; using the
# full id array would fail with a length mismatch on the assignment.
result = pd.DataFrame(
    external_user_ids[len(external_user_ids) - len(topk_items):],
    columns=['customer_id'],
)
result['prediction'] = external_item_str
result.head()

In [None]:
# Release everything that is no longer needed before the merge steps.
del external_item_str
del topk_items
del external_user_ids
del train_data
del valid_data
del test_data
del model
# Delete the trainer *instance* (it holds a reference to the model), not the
# imported Trainer class.
del trainer
del logger
del dataset
gc.collect()

In [None]:
# Check that `result` is still available after the cleanup.
result.head()

In [None]:
# Lookup table used by the merge that follows, via its `new_id` and
# `customer_id` columns.
# NOTE(review): presumably maps the re-indexed ids back to the original
# customer ids - confirm against the file.
reference = pd.read_csv('../input/uid-reference/reference.csv')
reference.head()

In [None]:
# Cast the user tokens to int64 so they can be joined on the reference ids.
result['customer_id'] = result['customer_id'].astype('int64')
result.dtypes

In [None]:
# Left-join on the id columns. The "_x" suffix marks result's own customer_id,
# which is dropped together with new_id, leaving reference's customer_id.
new_result = result.merge(
    reference,
    how='left',
    left_on='customer_id',
    right_on='new_id',
    indicator=False,
    suffixes=("_x", ""),
)
new_result = new_result.drop(columns=['customer_id_x', 'new_id'])

In [None]:
# Keep only the two columns needed downstream, in this order.
new_result = new_result.loc[:, ['customer_id', 'prediction']]
new_result.head()

In [None]:
# Outer-join the cold-start submission with the model predictions. The shared
# `prediction` column becomes prediction_x (cold start) and prediction_y (model).
submit_df = pd.read_csv('../input/cold-start/submission.csv').merge(
    new_result,
    on='customer_id',
    how='outer',
)
submit_df.head()

In [None]:
# Mark missing values with -1, then prefer the model prediction
# (prediction_y) and fall back to the cold-start one (prediction_x).
# Series.where does this in one vectorised step instead of a Python-level
# call per row.
submit_df = submit_df.fillna(-1)
submit_df['prediction'] = submit_df['prediction_y'].where(
    submit_df['prediction_y'] != -1,
    submit_df['prediction_x'],
)
submit_df.head()

In [None]:
# Remove the two source columns now that `prediction` has been assembled.
submit_df = submit_df.drop(['prediction_y', 'prediction_x'], axis=1)
submit_df.head()

In [None]:
# Write the final submission to the working directory, without the index column.
submit_df.to_csv('submission.csv', index=False)