### This notebook presents a general idea of how we can train models to predict the next transactions based on the previous ones.

This is just my experimental model and one more idea of how the data can be used. According to my experiments, this model can't extract a signal from the data and performs poorly, but maybe someone can improve it and get a better score. There are plenty of ways to improve it: hyperparameters, the size of the dataset, a transformer-decoder model, or training the model as a masked language model and extracting embeddings.

In [None]:
!pip install -q python-box

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import cv2
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from pytorch_lightning import LightningModule
from pytorch_lightning.callbacks import ModelCheckpoint
import pytorch_lightning as pl
import seaborn as sns
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from typing import List, Dict
import torch
import random
import torch.optim as optim
from box import Box

In [None]:
# Root folder of the competition data and the sub-folder that holds the images.
data_path = Path('../input') / 'h-and-m-personalized-fashion-recommendations'
images_path = data_path.joinpath('images')

General constants for the dataset. `COLUMNS` lists only the columns we need to keep.

In [None]:
# Scalar settings.
N_SAMPLES = 5_000_000  # number of transactions
MAX_LEN = 20           # upper bound on the sequence length used when batching
PAD_IDX = 0            # index reserved for padding

# Columns kept after joining the transactions with the article metadata.
COLUMNS = [
    't_dat', 'customer_id', 'article_id',
    'product_code', 'product_type_no', 'section_no', 'index_group_no',
]

In [None]:
# Read the raw competition tables.
sample_sub = pd.read_csv(data_path / 'sample_submission.csv')
cutomers = pd.read_csv(data_path / 'customers.csv')
articles = pd.read_csv(data_path / 'articles.csv')
transactions_train = pd.read_csv(
    data_path / 'transactions_train.csv',
    parse_dates=['t_dat'],
)

# Take the first N_SAMPLES transactions, attach the article metadata and keep only COLUMNS.
transactions = (
    transactions_train
    .iloc[:N_SAMPLES]
    .merge(articles, on='article_id')[COLUMNS]
)

# The full transaction table is not needed after this point.
del transactions_train

## Mappings
Useful mappings that convert article ids, product codes, etc. to the indices used by the embedding layers, and back.

In [None]:
def _build_index_maps(values):
    """Build the two lookup tables for one categorical column.

    Returns ``(index_to_value, value_to_index)``. Index 0 is reserved for
    padding and maps to ``PAD_IDX``; the given values get indices
    1..len(values) in the order they are passed in.
    """
    index_to_value = {0: PAD_IDX}  # add padding to dictionary
    index_to_value.update(enumerate(values, start=1))
    value_to_index = {value: index for index, value in index_to_value.items()}
    return index_to_value, value_to_index


articles_uniq = articles['article_id'].unique()
product_type_uniq = articles['product_type_no'].unique()
product_code_uniq = articles['product_code'].unique()
section_name_uniq = articles['section_no'].unique()
index_group_name_uniq = articles['index_group_no'].unique()

# Size of the article index space INCLUDING the padding slot at index 0.
# The model's output layer and the loss use n_articles as the number of classes,
# while label indices run from 0 (padding) to len(articles_uniq). Counting only
# the real articles would leave the index of the last article out of range.
n_articles = len(articles_uniq) + 1

articles_map, inv_articles_map = _build_index_maps(articles_uniq)  # indx <-> article_id
product_code_map, inv_product_code_map = _build_index_maps(product_code_uniq)  # indx <-> product_code
product_type_map, inv_product_type_map = _build_index_maps(product_type_uniq)  # indx <-> product_type
section_name_map, inv_section_name_map = _build_index_maps(section_name_uniq)  # indx <-> section_no
index_group_map, inv_index_group_map = _build_index_maps(index_group_name_uniq)  # indx <-> index_group

# another mapping
# article_id - {product_type_no, product_code, section_no, index_group_no}
articles_info = (
    articles[[
        'article_id',
        'product_type_no',
        'product_code',
        'section_no',
        'index_group_no'
    ]]
    .set_index('article_id')
    .to_dict('index')
)
# add values for padding: the pseudo article 0 has PAD_IDX for every attribute
articles_info[0] = {
    'index_group_no': PAD_IDX,
    'product_code': PAD_IDX,
    'product_type_no': PAD_IDX,
    'section_no': PAD_IDX
}

## Model
This is a baseline model. I used only 4 types of embeddings: product_code, product_type, section_name, index_group_name.
#### Training
In the training phase, at each timestep the model receives an article_id and converts it to (product_code, product_type, section_name, index_group_name); each value is then translated into an embedding and the embeddings are concatenated. The LSTM makes a forward step and outputs its hidden states together with the output values. For the next step we can feed either the model's own output or the true value (teacher forcing). This continues until the model reaches the end of the sequence.
#### Inference
In the inference phase the model receives the train articles and then generates the next articles step by step.

In [None]:
class RnnModel(nn.Module):
    """LSTM that scores the next article from the attributes of the previous ones.

    Every article is described by four categorical attributes (product code,
    product type, section, index group). Each attribute is embedded, the four
    embeddings are concatenated and fed to an LSTM, and a linear layer maps the
    final hidden state of all LSTM layers to one logit per article index.
    """

    # Keys returned by CustomDataset._retrieve_info, in the order forward() expects them.
    _INFO_KEYS = ('product_codes', 'product_types_no', 'sections_no', 'index_groups_no')

    def __init__(self, num_layers=2, hidden_size=80, device = torch.device('cuda:0')):
        """
        Args:
            num_layers: number of stacked LSTM layers.
            hidden_size: hidden size of every LSTM layer.
            device: device on which states and output buffers are allocated.
        """
        super().__init__()
        self.device = device
        self.n_layers = num_layers
        self.hidden_size = hidden_size
        # add +1 for padding; every table is sized from its own vocabulary
        self.product_code = nn.Embedding(len(product_code_uniq)+1, 30, padding_idx = PAD_IDX)
        self.product_type = nn.Embedding(len(product_type_uniq)+1, 10, padding_idx = PAD_IDX)
        self.section_name = nn.Embedding(len(section_name_uniq)+1, 10, padding_idx = PAD_IDX)
        self.index_group_name = nn.Embedding(len(index_group_name_uniq)+1, 5, padding_idx = PAD_IDX)
        # the LSTM consumes the concatenation of the four embeddings
        emb_size = sum(
            table.embedding_dim
            for table in (self.product_code, self.product_type, self.section_name, self.index_group_name)
        )
        self.rnn = nn.LSTM(input_size=emb_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        # The input is the final hidden state of all layers flattened per sample, so its
        # size is known up front and no lazy layer is needed. n_articles must cover every
        # label index the loss can see (including the padding index 0).
        self.fc = nn.Linear(num_layers * hidden_size, n_articles)

    def forward(
        self,
        product_code = None,
        product_type_no = None,
        section_no = None,
        index_group_no = None,
        hidden = None,
        cell = None,
    ):
        """Run the LSTM over the given attribute sequences.

        Args:
            product_code, product_type_no, section_no, index_group_no:
                long tensors of attribute indices, shape (batch, steps).
            hidden, cell: LSTM state; a zero state is used for whichever is None.

        Returns:
            (logits, hidden, cell) where logits has shape (batch, number of classes)
            and is computed from the hidden state after the last step.
        """
        if hidden is None or cell is None:
            zero_hidden, zero_cell = self.init_state(product_code.shape[0])
            hidden = zero_hidden if hidden is None else hidden
            cell = zero_cell if cell is None else cell

        pr_code_emb = self.product_code(product_code)
        pr_type_emb = self.product_type(product_type_no)
        sec_name_emb = self.section_name(section_no)
        indx_group_emb = self.index_group_name(index_group_no)

        emb = torch.cat([pr_code_emb, pr_type_emb, sec_name_emb, indx_group_emb], dim = -1)
        out, (h, c) = self.rnn(emb, (hidden, cell))
        # (layers, batch, hidden) -> (batch, layers * hidden)
        h_ = h.permute(1, 0, 2).reshape(out.shape[0], -1)
        # no squeeze here: it would drop the batch dimension for a batch of one
        next_ = self.fc(h_)
        return next_, h, c

    def _lookup_inputs(self, logits):
        """Convert logits (batch, classes) into the attribute indices of the top-1 article.

        Returns four long tensors of shape (batch,), ordered as forward() expects.
        """
        top1 = logits.argmax(dim=1).cpu().numpy()
        predicted_ids = [articles_map[art] for art in top1]
        info = CustomDataset._retrieve_info(predicted_ids)
        return tuple(
            torch.tensor(info[key], dtype = torch.long).to(self.device)
            for key in self._INFO_KEYS
        )

    def _forward(
        self,
        product_code = None,
        product_type_no = None,
        section_no = None,
        index_group_no = None,
        t = 1
        ):
        """Step-by-step forward pass used for training and validation.

        At step s the model is fed one item and its logits are stored at
        position s of the output; position 0 is never predicted and stays zero.

        Args:
            product_code, product_type_no, section_no, index_group_no:
                long tensors of shape (batch, seq_len).
            t: teacher-forcing probability. After each step the ground-truth
               item is fed next with probability t, otherwise the model's own
               top-1 prediction is fed.

        Returns:
            Tensor of shape (batch, seq_len, number of classes).
        """
        batch_size, seq_len = product_code.shape[0], product_code.shape[1]

        # Plain buffer: gradients flow through the values written into it, so it must
        # not itself be a leaf that requires grad (in-place writes would then fail).
        outputs = torch.zeros(batch_size, seq_len, self.fc.out_features, device=self.device)
        inputs = (
            product_code[:, 0],
            product_type_no[:, 0],
            section_no[:, 0],
            index_group_no[:, 0],
        )  # each (batch_size,)

        hidden, cell = self.init_state(batch_size)

        # run up to and including the last position, which the loss also scores
        for step in range(1, seq_len):
            in_code, in_type, in_section, in_group = (x.unsqueeze(1) for x in inputs)
            output, hidden, cell = self.forward(
                in_code, in_type, in_section, in_group, hidden, cell
            )
            outputs[:, step] = output

            if step == seq_len - 1:
                break  # nothing left to feed

            # decide whether to take the true next value or the model output
            teacher_force = random.random() < t
            if teacher_force:
                inputs = (
                    product_code[:, step],
                    product_type_no[:, step],
                    section_no[:, step],
                    index_group_no[:, step],
                )
            else:
                inputs = self._lookup_inputs(output)
        return outputs

    def generate(
        self,
        product_code = None,
        product_type_no = None,
        section_no = None,
        index_group_no = None,
        n_steps = 12,
        ):
        """Generate logits for the next ``n_steps`` articles after the given prefix.

        The prefix is consumed once; the logits computed from it are the first
        prediction, and each following step feeds the previous top-1 article.

        Args:
            product_code, product_type_no, section_no, index_group_no:
                long tensors of shape (batch, prefix_len).
            n_steps: number of articles to generate.

        Returns:
            Tensor of shape (batch, n_steps, number of classes).
        """
        batch_size = product_code.shape[0]
        outputs = torch.zeros(batch_size, n_steps, self.fc.out_features, device=self.device)
        hidden, cell = self.init_state(batch_size)

        with torch.no_grad():
            output, hidden, cell = self.forward(
                product_code,
                product_type_no,
                section_no,
                index_group_no,
                hidden,
                cell
            )
            for step in range(n_steps):
                outputs[:, step] = output
                if step == n_steps - 1:
                    break
                in_code, in_type, in_section, in_group = (
                    x.unsqueeze(1) for x in self._lookup_inputs(output)
                )
                output, hidden, cell = self.forward(
                    in_code, in_type, in_section, in_group, hidden, cell
                )
        return outputs

    def init_state(self, batch_size):
        """Return zero (hidden, cell) states of shape (n_layers, batch_size, hidden_size)."""
        shape = (self.n_layers, batch_size, self.hidden_size)
        hidden = torch.zeros(shape, device=self.device)
        cell = torch.zeros(shape, device=self.device)
        return hidden, cell

I used pytorch lightning as a wrapper. Basically, in LightningModule you need to define training_step, validation_step, loaders and optimizers.

In [None]:
class RnnLightningModel(LightningModule):
  """Lightning wrapper around the RNN model: steps, data loaders and optimizer setup."""

  def __init__(self, cfg, model, train_df = None, val_df = None, test_df = None):
    """
    Args:
        cfg: config object exposing ``batch_size``, ``optimizer`` and ``scheduler``
            (each of the latter two with ``name`` and ``params``).
        model: the wrapped network; must provide ``_forward`` and ``generate``.
        train_df, val_df: frames handed to ``CustomDataset`` by the loaders.
        test_df: accepted but not used by this class.
    """
    super().__init__()
    self.cfg = cfg
    self.train_df = train_df
    self.val_df = val_df
    self.model = model
    # label 0 is padding and must not contribute to the loss
    self.criterion = nn.CrossEntropyLoss(ignore_index = 0)
    self.save_hyperparameters(cfg, ignore = ['train_df', 'val_df', 'test_df', 'model', 'criterion'])

  def forward(
      self,
      product_codes = None,
      product_types_no = None,
      sections_no = None,
      index_groups_no = None,
      t = 1
      ):
    """Delegate to ``model._forward``; ``t`` is passed through unchanged."""
    out = self.model._forward(
        product_code = product_codes,
        product_type_no = product_types_no,
        section_no = sections_no,
        index_group_no = index_groups_no,
        t = t
        )
    return out

  def generate(
      self,
      product_codes = None,
      product_types_no = None,
      sections_no = None,
      index_groups_no = None,
      ):
    """Return, per sample, the list of top-1 article ids produced by ``model.generate``."""
    out = self.model.generate(
        product_code = product_codes,
        product_type_no = product_types_no,
        section_no = sections_no,
        index_group_no = index_groups_no
    )
    top1 = out.max(-1)[1].cpu().numpy()
    articles = [[articles_map[art] for art in client] for client in top1]
    return articles

  def _sequence_loss(self, output, y):
    """Cross-entropy over positions 1.. of the sequence (position 0 is dropped on both sides)."""
    n_classes = output.shape[-1]
    return self.criterion(output[:, 1:].reshape(-1, n_classes), y[:, 1:].reshape(-1))

  def training_step(self, batch, batch_idx):
    y = batch.pop('labels')
    output = self(**batch)

    loss = self._sequence_loss(output, y)
    self.log('train_loss', loss)
    return {'loss': loss}

  def validation_step(self, batch, batch_idx):
    y = batch.pop('labels')
    output = self(**batch, t = 0)

    loss = self._sequence_loss(output, y)
    self.log('val_loss', loss)
    return {'loss': loss}

  def train_dataloader(self):
    dataset = CustomDataset(self.train_df)
    loader = DataLoader(dataset, batch_size = self.cfg.batch_size, collate_fn=collate_fn, shuffle = True)
    return loader

  def val_dataloader(self):
    dataset = CustomDataset(self.val_df)
    loader = DataLoader(dataset, batch_size = self.cfg.batch_size, collate_fn=collate_fn, shuffle = False)
    return loader

  def __apply_weight_decay(self):
    """Split the parameters into a no-decay group (biases, LayerNorm) and a decayed group."""
    no_decay = []
    decay = []
    for n, p in self.named_parameters():
        # a parameter is exempt if it is a bias OR belongs to a LayerNorm;
        # requiring both at once would leave the no-decay group empty
        if 'bias' in n or 'LayerNorm' in n:
            no_decay.append(p)
        else:
            decay.append(p)
    return [{'params': no_decay, 'weight_decay': 0}, {'params': decay}]

  def configure_optimizers(self):
    # NOTE(review): eval() executes whatever string the config holds; keep cfg trusted
    # or replace this with an explicit name -> class lookup.
    optimizer = eval(self.cfg.optimizer.name)(
        self.__apply_weight_decay(), **self.cfg.optimizer.params
        )
    scheduler = eval(self.cfg.scheduler.name)(
        optimizer,
        **self.cfg.scheduler.params
        )
    return [optimizer], [scheduler]

  def validation_epoch_end(self, outputs):
    avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
    self.log('val_loss', avg_loss)

  def train_epoch_end(self, outputs):
    # NOTE(review): nothing in this file calls this method, and the name does not look
    # like a Lightning hook (compare validation_epoch_end above) - confirm whether
    # training_epoch_end was intended.
    avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
    self.log('train_loss', avg_loss)

## Dataset

In [None]:
class CustomDataset(Dataset):
    """Dataset over a frame whose 'article_id' cells hold a sequence of article ids."""

    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, indx):
        """Return (attribute index lists, article index list) for row ``indx``."""
        article_id = self.df.iloc[indx]['article_id']
        labels = [inv_articles_map[art] for art in article_id]
        return self._retrieve_info(article_id), labels

    @staticmethod
    def _retrieve_info(articles_id):
        """Map a sequence of article ids to the embedding indices of their attributes.

        Returns a dict with one list per attribute, aligned with ``articles_id``.
        """
        records = [articles_info[art] for art in articles_id]
        # (output key, field in articles_info, value -> index table)
        lookups = (
            ('product_codes', 'product_code', inv_product_code_map),
            ('product_types_no', 'product_type_no', inv_product_type_map),
            ('sections_no', 'section_no', inv_section_name_map),
            ('index_groups_no', 'index_group_no', inv_index_group_map),
        )
        return {
            out_key: [table[record[field]] for record in records]
            for out_key, field, table in lookups
        }

def collate_fn(batch: List[Dict[int, List]]):
    """Pad a list of ``(features, labels)`` samples into fixed-size long tensors.

    Every sample is a pair as returned by ``CustomDataset.__getitem__``: a dict
    of equally long index lists and the list of label indices. Sequences are
    cut to at most ``MAX_LEN`` items (the first items are kept) and padded on
    the right with zeros up to the longest sequence in the batch.

    Returns:
        Dict with one (batch, max_len) long tensor per feature key plus
        ``'labels'`` of the same shape.
    """
    max_len = min(max(len(labels) for _, labels in batch), MAX_LEN)
    output_dict = {k: torch.zeros((len(batch), max_len), dtype = torch.long) for k in batch[0][0].keys()}
    y = torch.zeros((len(batch), max_len), dtype = torch.long)

    for i, (features, labels) in enumerate(batch):
        for k, v in features.items():
            len_ = min(len(v), max_len)
            # explicit dtype so that an empty sequence does not produce a float tensor
            output_dict[k][i, :len_] = torch.tensor(v[:len_], dtype = torch.long)
        # the label length is computed from the labels themselves instead of
        # reusing a variable left over from the feature loop
        label_len = min(len(labels), max_len)
        y[i, :label_len] = torch.tensor(labels[:label_len], dtype = torch.long)

    output_dict['labels'] = y
    return output_dict

In [None]:
def split_data(df, validation_days=7):
    """Split transactions chronologically into train and validation parts.

    Args:
        df: frame with a datetime column 't_dat'.
        validation_days: length in days of the validation window, counted back
            from the latest date in ``df``.

    Returns:
        ``(df_train, df_val)``: rows strictly before the cut-off date and rows
        on or after it.
    """
    # A bare integer passed to pd.Timedelta is read as nanoseconds, so the unit
    # has to be given explicitly.
    validation_cut = df['t_dat'].max() - pd.Timedelta(days=validation_days)

    is_validation = df['t_dat'] >= validation_cut
    df_train = df[~is_validation]
    df_val = df[is_validation]
    return df_train, df_val

def prepare_data(df):
    """Collapse transactions into one row per customer.

    Rows are ordered by 't_dat' (ascending) and then grouped by 'customer_id';
    every other column becomes a list of that customer's values.
    """
    chronological = df.sort_values(by='t_dat')
    per_customer = chronological.groupby('customer_id').agg(list)
    return per_customer.reset_index()

In [None]:
# Split chronologically, then collapse each part into one row per customer.
df_train, df_val = (prepare_data(part) for part in split_data(transactions))
# train on the number of transactions > 10
df_train = df_train[df_train['article_id'].map(len) > 10]

In [None]:
# Training configuration; the optimizer and scheduler names are resolved by the Lightning module.
cfg = Box(
    {
        'batch_size': 64,
        'optimizer': {
            'name': 'optim.AdamW',
            'params': {'lr': 6e-3},
        },
        'scheduler': {
            'name': 'optim.lr_scheduler.StepLR',
            'params': {'step_size': 1, 'gamma': 0.5},
        },
    }
)

In [None]:
# Wrap a freshly built RNN in the Lightning module together with the train/validation frames.
model = RnnLightningModel(cfg, RnnModel(), df_train, df_val)

In [None]:
# Save the checkpoint with the lowest logged 'train_loss' as ./rnn_model.
checkpoint_callback = ModelCheckpoint(dirpath='./', filename='rnn_model', monitor='train_loss', mode='min')

# Train for three epochs on a single GPU.
trainer = pl.Trainer(gpus=1, max_epochs=3, callbacks=[checkpoint_callback])
trainer.fit(model)

## Check outputs
Let's take one batch and see what model outputs.

In [None]:
model.cuda()
model.eval()
batch = next(iter(model.train_dataloader()))

# Ground truth: everything after the first 4 items of each sequence, mapped back to article ids.
true = [
    [articles_map[art] for art in client]
    for client in batch['labels'][:, 4:].numpy().tolist()
]
# Predictions: the model is conditioned on the first 4 items of each sequence.
pred = model.generate(**{
    key: batch[key][:, :4].cuda()
    for key in ('product_codes', 'product_types_no', 'sections_no', 'index_groups_no')
})

In [None]:
# article_id -> description, built once. A per-id lookup keeps the order (and any
# repeats) of each sequence, which an isin() filter on the articles table does not.
desc_by_article = dict(zip(articles['article_id'], articles['detail_desc']))

for i, (truth, predictions) in enumerate(zip(true, pred)):
    print('*'*130)
    print(f'Customer {i}')

    truth = [art for art in truth if art != PAD_IDX]
    max_len = max(len(truth), len(predictions))

    # ids without a description (e.g. the padding id) are shown as an empty cell
    truth_articles = [desc_by_article.get(art, '') for art in truth]
    predictions_articles = [desc_by_article.get(art, '') for art in predictions]

    # pad the shorter column with empty strings so both have max_len rows
    display_data = pd.DataFrame({
        'true articles': truth_articles + [''] * (max_len - len(truth_articles)),
        'pred articles': predictions_articles + [''] * (max_len - len(predictions_articles)),
    })
    display(HTML(display_data.to_html()))
    print('\n')