In [None]:
# Install RecBole with %pip so the package lands in the environment of the
# running kernel (a bare `!pip` may resolve to a different interpreter).
# NOTE(review): pinned to 1.0.1 because the configuration cell below uses the
# `neg_sampling` key, which appears to belong to the 1.0.x config schema --
# confirm against the version this notebook was originally run with.
%pip install -q recbole==1.0.1

In [None]:
import os

import numpy as np
import pandas as pd
import torch
from google.colab import drive
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.data.interaction import Interaction
from recbole.model.sequential_recommender import GRU4Rec
from recbole.trainer import Trainer
from recbole.utils import init_seed

In [None]:
# drive.mount("/content/gdrive")
# pd.set_option('display.max_columns', None)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
### Load the raw transaction log.
### article_id is read as a string (presumably to keep leading zeros in the IDs).
transactions = pd.read_csv(
    "transactions_train.csv",
    dtype={"article_id": "str"},
)
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [None]:
### Parse t_dat ("YYYY-MM-DD" strings) into datetimes and derive a Unix
### timestamp in whole seconds.
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'], format="%Y-%m-%d")
# NOTE(review): the integer view is divided by 10**9, i.e. it assumes the
# datetime values are stored with nanosecond resolution -- confirm on the
# pandas version in use.
transactions['timestamp'] = transactions.t_dat.values.astype(np.int64) // 10 ** 9

### Keep only transactions dated 2020-04-01 or later (not the whole of 2020).
### The previous cutoff `timestamp > 1585620000` is 2020-03-31 02:00 UTC; since
### t_dat carries dates only (midnight timestamps), it selects exactly the rows
### with t_dat >= 2020-04-01, which is what the explicit comparison below states.
### Columns are renamed to the "name:type" headers used in the .inter file.
df_2020 = (
    transactions.loc[
        transactions['t_dat'] >= pd.Timestamp('2020-04-01'),
        ['customer_id', 'article_id', 'timestamp'],
    ]
    .rename(columns={
        'customer_id': 'user_id:token',
        'article_id': 'item_id:token',
        'timestamp': 'timestamp:float',
    })
)
df_2020.head()

Unnamed: 0,user_id:token,item_id:token,timestamp:float
23934157,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,727808001,1585699200
23934158,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,727808007,1585699200
23934159,000563485cbb7850b0a93c6606f89c5b961c6647d1bd48...,567532015,1585699200
23934160,000563485cbb7850b0a93c6606f89c5b961c6647d1bd48...,706104009,1585699200
23934161,00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4...,783504004,1585699200


In [None]:
### Save the interactions as a tab-separated file at
### <dataset>/<dataset>.inter, the location the RecBole config below
### (data_path='./', dataset='recbox_data') points at.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the dataset folder exists before writing (needed on a fresh runtime).
os.makedirs('recbox_data', exist_ok=True)
df_2020.to_csv('recbox_data/recbox_data.inter', index=False, sep='\t')

249

### Create the dataset and train the model with RecBole

In [None]:
### RecBole settings for this run, passed to Config as overrides.
parameter_dict = {
    # Directory that contains the dataset folder.
    "data_path": "./",
    # Column names (without the ":type" suffix) for user, item and time.
    "USER_ID_FIELD": "user_id",
    "ITEM_ID_FIELD": "item_id",
    "TIME_FIELD": "timestamp",
    # Keep only users / items whose interaction count lies in [40, inf),
    # i.e. at least 40 interactions.
    "user_inter_num_interval": "[40,inf)",
    "item_inter_num_interval": "[40,inf)",
    # Columns to load from the .inter file.
    "load_col": {"inter": ["user_id", "item_id", "timestamp"]},
    # No negative sampling.
    "neg_sampling": None,
    # Number of training epochs.
    "epochs": 50,
    "eval_args": {
        # Use all data for training: no validation part, no test part.
        "split": {"RS": [10, 0, 0]},
        # Group interactions per user.
        "group_by": "user",
        # NOTE(review): 'TO' is presumably RecBole's time-ordered option and
        # 'full' its full-ranking evaluation mode -- confirm in the RecBole docs.
        "order": "TO",
        "mode": "full",
    },
}

### Build the run configuration for GRU4Rec on the 'recbox_data' dataset.
config = Config(
    model="GRU4Rec",
    dataset="recbox_data",
    config_dict=parameter_dict,
)

# Seed the random generators with the values held in the configuration.
init_seed(config["seed"], config["reproducibility"])

In [None]:
### Build the RecBole dataset object from the configuration created above.
dataset = create_dataset(config)

In [None]:
### Prepare the train / validation / test data objects from the dataset.
### NOTE(review): parameter_dict sets the split to RS [10, 0, 0] ("only train"),
### so valid_data and test_data presumably hold no interactions -- verify before
### relying on them.
train_data, valid_data, test_data = data_preparation(config, dataset)

11 May 19:33    INFO  [Training]: train_batch_size = [2048] negative sampling: [None]
[Training]: train_batch_size = [2048] negative sampling: [None]
11 May 19:33    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [10, 0, 0]}, 'group_by': 'user', 'order': 'TO', 'mode': 'full'}]
[Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [10, 0, 0]}, 'group_by': 'user', 'order': 'TO', 'mode': 'full'}]


In [None]:
# Instantiate GRU4Rec on the training dataset and move it to the configured device.
model = GRU4Rec(config, train_data.dataset).to(config['device'])
# logger.info(model)

# trainer loading and initialization
trainer = Trainer(config, model)

# model training -- only train_data is passed to fit(), no validation data.
# NOTE(review): without validation data the returned best_valid_* values are
# presumably not meaningful -- confirm against the RecBole Trainer docs.
best_valid_score, best_valid_result = trainer.fit(train_data)

11 May 19:34    INFO  epoch 0 training [time: 15.51s, train loss: 3720.7215]
epoch 0 training [time: 15.51s, train loss: 3720.7215]
11 May 19:34    INFO  Saving current: saved/GRU4Rec-May-11-2022_19-33-48.pth
Saving current: saved/GRU4Rec-May-11-2022_19-33-48.pth
11 May 19:34    INFO  epoch 1 training [time: 15.78s, train loss: 3532.7930]
epoch 1 training [time: 15.78s, train loss: 3532.7930]
11 May 19:34    INFO  Saving current: saved/GRU4Rec-May-11-2022_19-33-48.pth
Saving current: saved/GRU4Rec-May-11-2022_19-33-48.pth
11 May 19:34    INFO  epoch 2 training [time: 15.82s, train loss: 3386.4065]
epoch 2 training [time: 15.82s, train loss: 3386.4065]
11 May 19:34    INFO  Saving current: saved/GRU4Rec-May-11-2022_19-33-48.pth
Saving current: saved/GRU4Rec-May-11-2022_19-33-48.pth
11 May 19:34    INFO  epoch 3 training [time: 15.92s, train loss: 3294.3650]
epoch 3 training [time: 15.92s, train loss: 3294.3650]
11 May 19:34    INFO  Saving current: saved/GRU4Rec-May-11-2022_19-33-48.pth

Create recommendation results from the trained model

In [None]:
### External (original) customer IDs for every internal user index of the dataset.
### The [1:] slice drops index 0 -- presumably RecBole's padding entry, like the
### "[pad]" item index 0 handled in predict_for_all_item; TODO confirm.
external_user_ids = dataset.id2token(dataset.uid_field, list(range(dataset.user_num)))[1:]

In [None]:
def add_last_item(old_interaction, last_item_id, max_len=50):
    """Return the most recent item sequence with ``last_item_id`` appended.

    The last row of ``old_interaction['item_id_list']`` is taken as the current
    sequence and the last entry of ``old_interaction['item_length']`` as its
    number of used slots.

    * If the sequence is shorter than ``max_len``, ``last_item_id`` is written
      into the first unused slot.
    * Otherwise the sequence is shifted left by one position (the oldest item
      is dropped) and ``last_item_id`` is placed in the final slot.

    The input is never modified: the result is always a new tensor.

    Args:
        old_interaction: Mapping-like object indexable with ``'item_id_list'``
            and ``'item_length'``, both returning tensors whose last entry
            belongs to the most recent interaction.
        last_item_id: Item ID of the last transaction, to be appended.
        max_len: Maximum number of items a sequence may hold.

    Returns:
        Tensor of shape ``(1, sequence_width)`` holding the updated sequence.
    """
    latest_seq = old_interaction['item_id_list'][-1]
    used_slots = old_interaction['item_length'][-1].item()
    if used_slots < max_len:
        # Indexing returns a view into the caller's tensor; clone before
        # writing so the caller's interaction data is not silently altered.
        new_seq_items = latest_seq.clone()
        new_seq_items[used_slots] = last_item_id
    else:
        # torch.roll already returns a new tensor, so writing into it is safe.
        new_seq_items = torch.roll(latest_seq, -1)
        new_seq_items[-1] = last_item_id
    return new_seq_items.view(1, len(new_seq_items))

def predict_for_all_item(external_user_id, dataset, model, k=10):
    """Score every item for one customer and return the ``k`` best.

    The customer's rows are looked up in ``dataset``; the item of the last row
    is appended to that row's item sequence (see ``add_last_item``) and the
    resulting sequence is scored with ``model.full_sort_predict``.

    Only the ``dataset`` and ``model`` arguments are used -- the function does
    not read notebook-global objects, so it works with whatever pair is passed.

    Args:
        external_user_id: A single external (original) customer ID.
        dataset: RecBole dataset the model was trained on; it must expose the
            ``item_id_list`` / ``item_length`` / ``item_id`` fields.
        model: Trained RecBole sequential model.
        k: Number of top-scoring items to return (default 10).

    Returns:
        The result of ``torch.topk``: a ``(values, indices)`` pair, where the
        indices are internal item IDs.

    Raises:
        ValueError: If ``dataset`` holds no interaction row for the user.
    """
    model.eval()
    # Run on the device the model's weights live on.
    device = next(model.parameters()).device
    with torch.no_grad():
        uid_series = dataset.token2id(dataset.uid_field, [external_user_id])
        # NOTE: this scans the full user-id column on every call.
        index = np.isin(dataset[dataset.uid_field].numpy(), uid_series)
        if not index.any():
            raise ValueError(
                'no interactions found in dataset for user {!r}'.format(external_user_id))
        input_interaction = dataset[index]
        last_length = input_interaction['item_length'][-1].item()
        test = {
            'item_id_list': add_last_item(input_interaction,
                                          input_interaction['item_id'][-1].item(),
                                          model.max_seq_length),
            # One more item than before, capped at the model's maximum length.
            'item_length': torch.tensor([min(last_length + 1, model.max_seq_length)]),
        }
        new_inter = Interaction(test).to(device)
        new_scores = model.full_sort_predict(new_inter)
        new_scores = new_scores.view(-1, dataset.item_num)
        new_scores[:, 0] = -np.inf  # set scores of [pad] to -inf
    return torch.topk(new_scores, k)


In [None]:

### For every customer, score all items with the trained model and collect the
### external article IDs of the top-ranked ones.
topk_items = []
for external_user_id in external_user_ids:
    topk_result = predict_for_all_item(external_user_id, dataset, model)
    # topk_result is (values, indices); take the indices of the last row.
    best_internal_ids = topk_result[1][-1].cpu()
    topk_items.append(dataset.id2token(dataset.iid_field, best_internal_ids).tolist())

15458


In [None]:
# One space-separated string of article IDs per customer.
external_item_str = [' '.join(article_ids) for article_ids in topk_items]
# Pair every customer ID with its prediction string.
result = pd.DataFrame(external_user_ids, columns=['customer_id']).assign(
    prediction=external_item_str
)
result.head()

Unnamed: 0,customer_id,prediction
0,00d7ebd46f6a6d53630d41386b6ef6a505cdc4c80011ff...,0918522001 0850917001 0893133001 0915526001 09...
1,0109ad0b5a76924a1b58be677409bb601cc8bead9a87b8...,0901955001 0861477001 0600886001 0833530002 07...
2,013f00f9e218549246a3aa82b3f3a0c22a693bc25fa735...,0839402002 0839402001 0770336001 0865172002 08...
3,01bada2a453b09c70ea57bdda5a9ef0fb04062718d3a3d...,0724906006 0253448003 0554598001 0700701002 08...
4,01dd96059a11759518f10969d0a528f03c8501dc4c628b...,0817353003 0895002002 0842004003 0859118001 08...


Combine the most-bought-items results with the GRU model results

In [None]:
### Load the submission file produced by the ensemble model and outer-merge the
### GRU4Rec predictions onto it by customer. After the merge, prediction_x holds
### the value from submission.csv and prediction_y the value from `result`.
submit_df = pd.read_csv('submission.csv').merge(
    result, on='customer_id', how='outer'
)
submit_df.head()

(1371980, 2)

In [None]:
# Customers missing from one side of the outer merge have NaN there;
# replace it with the sentinel -1.
submit_df = submit_df.fillna(-1)
# Prefer the GRU4Rec prediction (prediction_y) and fall back to the prediction
# from submission.csv (prediction_x) where the sentinel marks it as absent.
# Series.where keeps values where the condition holds and takes `other`
# elsewhere -- same result as a row-wise apply, without a Python call per row.
submit_df['prediction'] = submit_df['prediction_y'].where(
    submit_df['prediction_y'] != -1, submit_df['prediction_x'])
submit_df.head()

Unnamed: 0,customer_id,prediction_x,prediction_y,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0568601006 0751471001 0924243001 04...,-1,0568601043 0568601006 0751471001 0924243001 04...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0826211002 0924243001 0924243002 0739590027 08...,-1,0826211002 0924243001 0924243002 0739590027 08...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0852643001 0924243001 0924243002 07...,-1,0794321007 0852643001 0924243001 0924243002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0448509014 0924243002 0924243001 0751471001 05...,-1,0448509014 0924243002 0924243001 0751471001 05...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0730683050 0791587015 0927530004 0896152002 09...,-1,0730683050 0791587015 0927530004 0896152002 09...


In [None]:
# The per-source prediction columns are no longer needed once the final
# `prediction` column exists.
submit_df = submit_df.drop(['prediction_y', 'prediction_x'], axis=1)
submit_df.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0568601006 0751471001 0924243001 04...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0826211002 0924243001 0924243002 0739590027 08...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0852643001 0924243001 0924243002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0448509014 0924243002 0924243001 0751471001 05...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0730683050 0791587015 0927530004 0896152002 09...


In [None]:
# Write the combined predictions to disk, without the DataFrame index column.
submit_df.to_csv('submission_regbole.csv', index=False)