In [None]:
import os
import pickle
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import torch
import torch.nn as nn


from sklearn.preprocessing import LabelEncoder

In [None]:
# Pick the compute device once: the GPU when PyTorch can see one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

In [None]:
def load_obj(filepath):
    """Read the file at ``filepath`` in binary mode and return the unpickled object.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.
    """
    with open(filepath, 'rb') as handle:
        return pickle.load(handle)

def save_obj(obj, filepath):
    """Pickle ``obj`` into the file at ``filepath`` (opened in binary write mode, so an existing file is overwritten)."""
    with open(filepath, 'wb') as handle:
        pickle.Pickler(handle).dump(obj)

In [None]:
# Pickled lookup objects from the prepared dataset. Later cells index
# `article2id` with raw article ids and read
# `article_info[article_id]['product_group_name']`.
# NOTE: these are unpickled with pickle.load -- only read files from a trusted source.
article2id=load_obj('../input/hm-recformer-dataset/article2id.pkl')
id2article=load_obj('../input/hm-recformer-dataset/id2article.pkl')
article_info = load_obj('../input/hm-recformer-dataset/article_info.pkl')
train_articles = load_obj('../input/hm-recformer-dataset/train_articles.pkl')

# Pickled DataFrames. `transaction_df` carries a `train_val_split` column (used below);
# `article_df` carries `section_name` and `product_group_name`.
transaction_df = pd.read_pickle("../input/hm-recformer-dataset/train_transaction_df.pkl")
article_df = pd.read_pickle("../input/hm-recformer-dataset/train_article_df.pkl")

In [None]:
%%time
# has_val: True when the first validation `week_relative` entry is greater than 46.
# NOTE(review): indexing [0] assumes every row has at least one validation entry -- confirm.
transaction_df['has_val'] = transaction_df.train_val_split.apply( lambda data: data['val_data']['week_relative'][0] > 46 )
# num_weeks: number of distinct `week_relative` values in the training part of each row.
transaction_df['num_weeks'] = transaction_df.train_val_split.apply( lambda data: len(np.unique(data['train_data']['week_relative'] )) )

# How many rows do / do not satisfy the has_val condition.
transaction_df.has_val.value_counts()

In [None]:
# Distribution of the number of distinct training weeks per row.
transaction_df['num_weeks'].value_counts()

In [None]:
# Size summary of the loaded artifacts (one line per collection).
for _label, _collection in (
    ("number of articles:", article2id),
    ("number of customers:", transaction_df),
    ("number of train articles:", train_articles),
):
    print(_label, len(_collection))

In [None]:
# Preview the first rows of the article metadata table.
article_df.head()

In [None]:
def label_encoder(lst):
    """Build index mappings for the labels in ``lst``.

    Returns a ``(label2id, id2label)`` tuple where ids are the positions of
    the labels in ``lst``. If a label occurs more than once, ``label2id``
    keeps its last position while ``id2label`` keeps every position.
    """
    # Single pass over the input; position -> label.
    id2label = dict(enumerate(lst))
    # Invert in position order so a repeated label ends up with its last position.
    label2id = {name: position for position, name in id2label.items()}
    return (label2id, id2label)

In [None]:
# Build label <-> index mappings for the two categorical article columns.
(section2id, id2section) = label_encoder(article_df.section_name.unique())
(prodgroup2id, id2prodgroup) = label_encoder(article_df.product_group_name.unique())

# Persist the section mappings as pickles in the working directory.
save_obj(section2id, 'section2id.pkl')
save_obj(id2section, 'id2section.pkl')

# Persist the product-group mappings as well.
save_obj(prodgroup2id, 'prodgroup2id.pkl')
save_obj(id2prodgroup, 'id2prodgroup.pkl')

# Vocabulary sizes: (sections, product groups).
print(len(section2id), len(prodgroup2id))

In [None]:
# Central hyper-parameters and padding ids. All values are evaluated once, when this
# class body runs, from notebook globals (`article2id`, `article_df`).
class config:
    NUM_EPOCHS=25  # passes over the training dataloader
    NUM_NEGATIVE_SAMPLES = 100  # negatives drawn per row (training and validation)
    MAX_SEQ_LEN = 64  # history length after truncation / left-padding
    BATCH_SIZE=1024
    
    MAX_TARGETS = 6  # maximum number of target articles kept per row
    MAX_TIME_ID = 51  # week values are clipped to [0, MAX_TIME_ID]
    PAD_TIME_ID = 52  # week id used for padded positions
    
    # Padding ids sit one past the largest real index of each vocabulary.
    PAD_ARTICLE_ID = len(article2id)
    # NOTE(review): nunique() ignores NaN while the vocabularies above are built from
    # unique(), which keeps NaN. If either column contains NaN these pad ids would
    # collide with a real id -- confirm the columns have no missing values.
    PAD_SECTION_ID = article_df.section_name.nunique()
    PAD_PRODUCT_GROUP_ID = article_df.product_group_name.nunique()

# Dataset

In [None]:
class HMDataset(torch.utils.data.Dataset):
    """Training dataset built from one purchase history per DataFrame row.

    Each row must expose ``has_val`` and ``train_val_split`` (a dict with
    ``'train_data'`` and ``'val_data'`` entries, each holding parallel
    ``'week_relative'`` and ``'article_ids'`` sequences). The articles of the
    most recent week become the targets; everything before that week becomes
    the (left-padded) input sequence.

    Relies on the notebook globals ``config``, ``article_info``,
    ``article2id`` and ``prodgroup2id``.
    """

    def __init__(self, df):
        self.df = df

    def get_targetids(self, article_ids, week_relative):
        """Return up to ``config.MAX_TARGETS`` distinct articles bought in the latest week.

        The distinct articles are shuffled with NumPy's global RNG before
        truncation, so the selection is random when there are too many.
        """
        max_week = np.max(week_relative)
        targets = list({
            article
            for article, week in zip(article_ids, week_relative)
            if week == max_week
        })
        np.random.shuffle(targets)
        return targets[:config.MAX_TARGETS]

    def trim_target_time_ranges(self, article_ids, week_relative):
        """Drop every purchase made in the latest week (those are the targets).

        Returns ``(articles, weeks)`` as two new parallel lists.
        """
        max_week = np.max(week_relative)
        x_articles = []
        x_week = []
        for article, week in zip(article_ids, week_relative):
            if week != max_week:
                x_articles.append(article)
                x_week.append(week)
        return (x_articles, x_week)

    def pad_sequence(self, x, max_seqlen, padid):
        """Keep the last ``max_seqlen`` items of ``x`` and left-pad with ``padid`` to that length."""
        x = x[-max_seqlen:]
        missing = max_seqlen - len(x)
        if missing == 0:
            return x
        return ([padid] * missing) + x

    def encode_sequence(self, lst, article2id):
        """Map every element of ``lst`` through the ``article2id`` lookup (raises ``KeyError`` on unknown items)."""
        return [article2id[x] for x in lst]

    def augment_weeks(self, num_weeks, week_relative, article_ids):
        """Truncate a history to a randomly chosen number of leading weeks.

        The number of weeks to keep is drawn uniformly from
        ``[4, num_weeks)``; ``np.random.choice`` raises ``ValueError`` when
        that range is empty (``num_weeks <= 4``). Weeks are counted as runs of
        equal consecutive ``week_relative`` values.

        Returns ``(week_relative, article_ids)`` cut at the same position.
        """
        num_train_weeks = np.random.choice(np.arange(4, num_weeks))

        seqlen = len(week_relative)
        # Default to the full sequence: if the history holds no more week-runs
        # than we sampled, nothing has to be cut. (The previous default of 0
        # silently returned empty sequences in that case.)
        cur_seqlen = seqlen
        prv_week = -1
        cnt = 0
        for i in range(seqlen):
            if week_relative[i] != prv_week:
                prv_week = week_relative[i]
                cnt += 1
            if cnt > num_train_weeks:
                cur_seqlen = i
                break

        return (week_relative[:cur_seqlen], article_ids[:cur_seqlen])

    def __getitem__(self, idx):
        """Build one training example.

        Returns ``(inputs, targets)`` where ``inputs`` is a dict of tensors
        (``article_ids``, ``product_group_ids``, ``week_relative``,
        ``seqlen``, ``attn_mask``) and ``targets`` is a long tensor of
        ``config.MAX_TARGETS`` article indices, left-padded with
        ``config.PAD_ARTICLE_ID``.
        """
        row = self.df.iloc[idx]
        split = row.train_val_split
        train_data = split['train_data']

        # Work on copies. The stored sequences live inside the DataFrame and are
        # handed out again on the next epoch; extending them in place would
        # append the validation purchases to the training data over and over.
        week_relative = list(train_data['week_relative'])
        article_ids = list(train_data['article_ids'])

        if not row.has_val:
            # Rows that fail the has_val check are not used by the validation
            # set, so their held-out purchases are folded into the history.
            val_data = split['val_data']
            week_relative.extend(val_data['week_relative'])
            article_ids.extend(val_data['article_ids'])

        # Targets: distinct articles of the most recent week.
        targets = self.get_targetids(article_ids, week_relative)

        # Inputs: everything before the most recent week.
        (article_ids, week_relative) = self.trim_target_time_ranges(article_ids, week_relative)
        product_group_ids = [article_info[articleid]['product_group_name'] for articleid in article_ids]
        week_relative = np.clip(week_relative, 0, config.MAX_TIME_ID).tolist()

        # Encode raw ids / names to integer indices.
        article_ids = self.encode_sequence(article_ids, article2id)
        product_group_ids = self.encode_sequence(product_group_ids, prodgroup2id)
        targets = self.encode_sequence(targets, article2id)

        # Un-truncated history length; it may exceed MAX_SEQ_LEN and may be 0.
        seqlen = len(article_ids)
        article_ids = self.pad_sequence(article_ids, config.MAX_SEQ_LEN, config.PAD_ARTICLE_ID)
        product_group_ids = self.pad_sequence(product_group_ids, config.MAX_SEQ_LEN, config.PAD_PRODUCT_GROUP_ID)
        week_relative = self.pad_sequence(week_relative, config.MAX_SEQ_LEN, config.PAD_TIME_ID)
        targets = self.pad_sequence(targets, config.MAX_TARGETS, config.PAD_ARTICLE_ID)

        # attn_mask: True ==> ignore that position; False ==> attend to it.
        # Padding is on the left, so everything except the last `seqlen` slots is
        # masked. When seqlen == 0 the slice is empty and nothing is masked.
        attn_mask = torch.zeros(config.MAX_SEQ_LEN, dtype=torch.bool)
        attn_mask[:-seqlen] = True

        inputs = {
            'article_ids': torch.tensor(article_ids, dtype=torch.long),
            'product_group_ids': torch.tensor(product_group_ids, dtype=torch.long),
            'week_relative': torch.tensor(week_relative, dtype=torch.long),
            'seqlen': torch.tensor(seqlen, dtype=torch.long),
            'attn_mask': attn_mask
        }

        targets = torch.tensor(targets, dtype=torch.long)
        return inputs, targets

    def __len__(self):
        return len(self.df)

In [None]:
# Seed every RNG the notebook draws from so a fresh Restart & Run All is repeatable.
# NumPy drives target shuffling and negative sampling.
np.random.seed(22)
# PyTorch's generator was previously left unseeded; it drives weight initialisation
# and DataLoader shuffling.
torch.manual_seed(22)

In [None]:
class ValDataset(torch.utils.data.Dataset):
    """Validation dataset: training history as input, held-out purchases as targets.

    A fixed matrix of negative article indices (one row per DataFrame row) is
    drawn once at construction time with NumPy's global RNG. Relies on the
    notebook globals ``config``, ``article_info``, ``article2id`` and
    ``prodgroup2id``.
    """

    def __init__(self, df):
        self.df = df
        candidate_ids = np.arange(len(article2id))
        sample_shape = (len(self.df), config.NUM_NEGATIVE_SAMPLES)
        self.negsamples = np.random.choice(candidate_ids, sample_shape)
        print("Numeb negsamples:", self.negsamples.shape)

    def get_targetids(self, article_ids, week_relative):
        """Return up to ``config.MAX_TARGETS`` distinct articles from the latest week, in shuffled order."""
        latest_week = np.max(week_relative)
        latest_articles = set()

        for position, article in enumerate(article_ids):
            if week_relative[position] == latest_week:
                latest_articles.add(article)

        shuffled = list(latest_articles)
        np.random.shuffle(shuffled)
        return shuffled[:config.MAX_TARGETS]

    def pad_sequence(self, x, max_seqlen, padid):
        """Keep the last ``max_seqlen`` items of ``x`` and left-pad with ``padid`` to that length."""
        tail = x[-max_seqlen:]
        missing = max_seqlen - len(tail)
        return tail if missing == 0 else ([padid] * missing) + tail

    def encode_sequence(self, lst, article2id):
        """Map every element of ``lst`` through the ``article2id`` lookup."""
        encoded = []
        for item in lst:
            encoded.append(article2id[item])
        return encoded

    def get_train_sequence(self, train_data):
        """Turn one row's training history into the dict of model input tensors."""
        raw_articles = train_data['article_ids']
        groups = [article_info[articleid]['product_group_name'] for articleid in raw_articles]
        weeks = list(np.clip(train_data['week_relative'], 0, config.MAX_TIME_ID))

        # Raw ids / names -> integer indices.
        article_idx = self.encode_sequence(raw_articles, article2id)
        group_idx = self.encode_sequence(groups, prodgroup2id)
        seqlen = len(article_idx)

        # attn_mask: True ==> ignore that position; False ==> attend to it.
        attn_mask = torch.zeros(config.MAX_SEQ_LEN, dtype=torch.bool)
        attn_mask[:-seqlen] = True

        # Left-pad (or truncate to the most recent items) every sequence.
        padded_articles = self.pad_sequence(article_idx, config.MAX_SEQ_LEN, config.PAD_ARTICLE_ID)
        padded_groups = self.pad_sequence(group_idx, config.MAX_SEQ_LEN, config.PAD_PRODUCT_GROUP_ID)
        padded_weeks = self.pad_sequence(weeks, config.MAX_SEQ_LEN, config.PAD_TIME_ID)

        return {
            'article_ids': torch.tensor(padded_articles, dtype=torch.long),
            'product_group_ids': torch.tensor(padded_groups, dtype=torch.long),
            'week_relative': torch.tensor(padded_weeks, dtype=torch.long),
            'seqlen': torch.tensor(seqlen, dtype=torch.long),
            'attn_mask': attn_mask
        }

    def __getitem__(self, idx):
        """Return ``(inputs, targets, negsample)`` for row ``idx``."""
        split = self.df.iloc[idx].train_val_split
        inputs = self.get_train_sequence(split['train_data'])

        # Distinct held-out articles, encoded and padded to MAX_TARGETS slots.
        distinct_targets = list(set(split['val_data']['article_ids']))
        target_idx = self.encode_sequence(distinct_targets, article2id)
        target_idx = self.pad_sequence(target_idx, config.MAX_TARGETS, config.PAD_ARTICLE_ID)

        targets = torch.tensor(target_idx, dtype=torch.long)
        negsample = torch.tensor(self.negsamples[idx], dtype=torch.long)
        return inputs, targets, negsample

    def __len__(self):
        return len(self.df)

# Model

In [None]:
class Embedding(nn.Module):
    """Input embeddings for a purchase sequence.

    Article (128-d) and product-group (32-d) embeddings are concatenated into
    a 160-d vector and a 160-d week embedding is added on top. Each table has
    one extra row used as its padding index. Table sizes come from the
    notebook globals ``article2id``, ``prodgroup2id`` and ``config``.
    """

    def __init__(self):
        super().__init__()

        self.article_embeddings = nn.Embedding(
            len(article2id) + 1, 128, padding_idx=config.PAD_ARTICLE_ID
        )
        self.prodgroup_embeddings = nn.Embedding(
            len(prodgroup2id) + 1, 32, padding_idx=config.PAD_PRODUCT_GROUP_ID
        )
        self.week_embeddings = nn.Embedding(
            config.PAD_TIME_ID + 1, 160, padding_idx=config.PAD_TIME_ID
        )

    def get_article_embeddings(self, x_article):
        """Look up only the article embeddings (used for positive / negative candidates)."""
        return self.article_embeddings(x_article)

    def forward(self, x_week, x_article, x_prodgroup):
        """Return ``concat(article, product_group) + week`` for every position."""
        item_part = torch.cat(
            [self.article_embeddings(x_article), self.prodgroup_embeddings(x_prodgroup)],
            dim=-1,
        )
        return item_part + self.week_embeddings(x_week)
    
class Encoder(nn.Module):
    """Two stacked transformer encoder layers followed by mean pooling over the sequence."""

    def __init__(self):
        super().__init__()
        # Both layers share the same hyper-parameters: d_model=160, 4 heads.
        layer_options = dict(
            dim_feedforward=512,
            dropout=0.15,
            batch_first=True,
            activation='gelu',
        )
        self.transformer_layer1 = nn.TransformerEncoderLayer(160, 4, **layer_options)
        self.transformer_layer2 = nn.TransformerEncoderLayer(160, 4, **layer_options)

    def forward(self, x, attn_mask):
        """Encode ``x`` and average over dim 1.

        ``attn_mask`` is passed to both layers as ``src_key_padding_mask``.
        The final mean runs over every position, masked ones included.
        """
        for layer in (self.transformer_layer1, self.transformer_layer2):
            x = layer(x, src_key_padding_mask=attn_mask)
        return x.mean(dim=1)
    
class MLP(nn.Module):
    """Projection head: 160 -> 128 hidden block, then a linear 128 -> 128 output layer."""

    def __init__(self):
        super().__init__()
        hidden_block = [
            nn.Linear(160, 128),
            nn.LeakyReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.1),
        ]
        self.fc = nn.Sequential(*hidden_block)
        self.out = nn.Linear(128, 128)

    def forward(self, x):
        """Apply the hidden block and the output layer to ``x``."""
        return self.out(self.fc(x))

In [None]:
class Model(nn.Module):
    """Full user-history encoder: embeddings -> transformer encoder -> MLP head."""

    def __init__(self):
        super().__init__()
        self.embedding_layer = Embedding()
        self.encoder = Encoder()
        self.mlp = MLP()

    def forward(self, x_week, x_article, x_prodgroup, attn_mask):
        """Return one vector per row of the batch summarising its purchase history."""
        embedded = self.embedding_layer(x_week, x_article, x_prodgroup)
        pooled = self.encoder(embedded, attn_mask)
        return self.mlp(pooled)

# Negative Sampling

In [None]:
def get_train_negativesamples(articles):
    """Draw in-batch negatives.

    Collects the distinct article indices present in ``articles`` (padding
    excluded) and samples ``config.NUM_NEGATIVE_SAMPLES`` of them, with
    replacement, for each of the ``articles.shape[0]`` rows. The result is a
    long tensor on ``device``.
    """
    num_rows = articles.shape[0]
    seen = set(articles.view(-1).tolist())
    pool = list(seen.difference({config.PAD_ARTICLE_ID}))

    drawn = np.random.choice(pool, (num_rows, config.NUM_NEGATIVE_SAMPLES))
    return torch.tensor(drawn, dtype=torch.long, device=device)

# Loss

In [None]:
def compute_loss(ylabels, hist_embedds, hpos, hneg):
    """Pairwise ranking loss between each positive target and every negative.

    Scores are dot products between the history embedding and the candidate
    embeddings. For every target slot the loss is
    ``-log(sigmoid(pos_score - neg_score) + 1e-9)`` averaged over the rows
    whose label in that slot is not ``config.PAD_ARTICLE_ID`` and over all
    negatives. The per-slot means are summed and divided by 4.

    Args:
        ylabels: target article indices, indexed as ``ylabels[:, slot]``.
        hist_embedds: history embeddings, first dim is the batch.
        hpos: embeddings of the targets (one per slot).
        hneg: embeddings of the negative samples.

    Returns:
        A scalar tensor. Slots that are padding for every row contribute
        nothing; if that holds for all slots the result is a constant 0.
    """
    loss = torch.tensor(0.0, device=device)
    bsize = hist_embedds.shape[0]
    dim = hist_embedds.shape[-1]

    query = hist_embedds.view(bsize, -1, dim)
    pos_scores = (query * hpos).sum(dim=-1)
    neg_scores = (query * hneg).sum(dim=-1)

    for i in range(config.MAX_TARGETS):
        mask = (ylabels[:, i] != config.PAD_ARTICLE_ID)
        # Skip slots without a single real target: the mean of an empty tensor
        # is NaN and would turn the whole loss into NaN.
        if not mask.any():
            continue

        score_diff = pos_scores[mask, i].view(-1, 1) - neg_scores[mask, :]
        cur_loss = -torch.log(score_diff.sigmoid() + 1e-9)
        loss = loss + cur_loss.mean()

    # NOTE(review): the divisor is a fixed 4 rather than config.MAX_TARGETS; it is
    # kept as-is so the reported loss scale does not change -- confirm the intent.
    loss = loss / 4
    return loss

In [None]:
def rank_metrics(ylabels, hist_embedds, hpos, hneg):
    """Count how often a real target is out-scored by at most 5 / at most 10 negatives.

    Scores are dot products between the history embedding and the candidate
    embeddings. For every non-padding target the number of negatives with a
    strictly higher score is counted (ties do not count against the target).

    Returns:
        ``(rank_5, rank_10, num_pos)``: the number of targets beaten by
        ``<= 5`` negatives, by ``<= 10`` negatives, and the number of
        non-padding targets seen.
    """
    batch, width = hist_embedds.shape[0], hist_embedds.shape[-1]
    user_vec = hist_embedds.view(batch, -1, width)
    positive_scores = (user_vec * hpos).sum(dim=-1)
    negative_scores = (user_vec * hneg).sum(dim=-1)

    hits_at_5 = 0
    hits_at_10 = 0
    total = 0
    for slot in range(config.MAX_TARGETS):
        valid = ylabels[:, slot] != config.PAD_ARTICLE_ID
        margins = positive_scores[:, slot].view(batch, -1) - negative_scores
        # Number of negatives scoring strictly above the target, per valid row.
        beaten_by = (margins[valid, :] < 0).sum(dim=-1)

        total += valid.sum().item()
        hits_at_5 += (beaten_by <= 5).sum().item()
        hits_at_10 += (beaten_by <= 10).sum().item()

    return hits_at_5, hits_at_10, total

# Train epoch

In [None]:
def train_epoch(train_dataloader, model, optimizer, schedular):
    """Run one pass over ``train_dataloader`` and return the mean batch loss.

    Args:
        train_dataloader: yields ``(inputs, targets)`` batches; ``inputs`` is a
            dict with ``week_relative``, ``article_ids``, ``product_group_ids``,
            ``attn_mask`` and ``seqlen`` tensors.
        model: called as ``model(x_week, x_article, x_prodgroup, attn_mask)``;
            must expose ``embedding_layer.get_article_embeddings``.
        optimizer: stepped once per batch.
        schedular: learning-rate scheduler, stepped once per batch.

    Returns:
        The mean of the per-batch loss values.
    """
    epoch_losses = []
    model.train()
    for it, (inputs, targets) in enumerate(train_dataloader):
        # Keep only the trailing columns needed by the longest history in the batch.
        batch_max_seqlen = inputs['seqlen'].max()

        x_week  = inputs['week_relative'][:, -batch_max_seqlen:].to(device)
        x_article = inputs['article_ids'][:, -batch_max_seqlen:].to(device)
        x_prodgroup = inputs['product_group_ids'][:, -batch_max_seqlen:].to(device)
        attn_mask = inputs['attn_mask'][:, -batch_max_seqlen:].to(device)
        targets = targets.to(device)

        # Negatives are drawn from the articles that occur in this batch.
        negsamples = get_train_negativesamples(inputs['article_ids'])
        hist_embedds = model(x_week, x_article, x_prodgroup, attn_mask)
        pos_embedds = model.embedding_layer.get_article_embeddings(targets)
        neg_embedds = model.embedding_layer.get_article_embeddings(negsamples)

        model.zero_grad(set_to_none=True)
        loss = compute_loss(targets, hist_embedds, pos_embedds, neg_embedds)
        loss.backward()
        # Clip only after backward(): before it the gradients have just been
        # reset to None, so clipping there has nothing to act on.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 10)
        optimizer.step()
        schedular.step()

        epoch_losses.append(loss.item())
        if it%100 == 0 :
            print("iteration:{} | loss:{:.4f}".format(it, np.mean(epoch_losses)))
    return np.mean(epoch_losses)

In [None]:
def evaluate(val_dataloader, model):
    """Score ``model`` on ``val_dataloader``.

    Puts the model in eval mode, then for every ``(inputs, targets,
    negsamples)`` batch computes the loss and the rank counters without
    tracking gradients.

    Returns:
        ``(mean_loss, rank_5, rank_10)`` where the two rank values are the
        accumulated counters from ``rank_metrics`` divided by the total number
        of non-padding targets.
    """
    model.eval()
    batch_losses = []
    hits_5 = 0.0
    hits_10 = 0.0
    seen_targets = 0.0
    sequence_keys = ('week_relative', 'article_ids', 'product_group_ids', 'attn_mask')

    for inputs, targets, negsamples in val_dataloader:
        # Keep only the trailing columns needed by the longest history in the batch.
        width = inputs['seqlen'].max()
        x_week, x_article, x_prodgroup, attn_mask = (
            inputs[key][:, -width:].to(device) for key in sequence_keys
        )
        targets = targets.to(device)
        negsamples = negsamples.to(device)

        with torch.no_grad():
            hist_embedds = model(x_week, x_article, x_prodgroup, attn_mask)
            article_lookup = model.embedding_layer.get_article_embeddings
            pos_embedds = article_lookup(targets)
            neg_embedds = article_lookup(negsamples)

            batch_loss = compute_loss(targets, hist_embedds, pos_embedds, neg_embedds)
            batch_5, batch_10, batch_targets = rank_metrics(targets, hist_embedds, pos_embedds, neg_embedds)

            seen_targets += batch_targets
            hits_5 += batch_5
            hits_10 += batch_10
            batch_losses.append(batch_loss.item())

    return np.mean(batch_losses), hits_5 / seen_targets, hits_10 / seen_targets

In [None]:
# Training uses every row; validation uses only the rows flagged has_val.
train_dataset = HMDataset(transaction_df)
val_dataset = ValDataset( transaction_df[transaction_df.has_val == True] )

# Training batches are shuffled and the last incomplete batch is dropped;
# validation keeps a fixed order and every row.
train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, pin_memory=True, batch_size=config.BATCH_SIZE, drop_last=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, shuffle=False, pin_memory=True, batch_size=config.BATCH_SIZE, drop_last=False)

print("Number of training iterations:", len(train_dataloader))
print("Number of validation iterations:", len(val_dataloader))

In [None]:
# Model, optimizer and learning-rate schedule.
model = Model().to(device)
optimizer=torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=0.001)
# One-cycle schedule sized for NUM_EPOCHS * len(train_dataloader) steps; it is
# stepped once per batch inside train_epoch. The second positional argument is max_lr.
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    1e-3,
    epochs=config.NUM_EPOCHS,
    steps_per_epoch=len(train_dataloader),
    pct_start=0.05,
    div_factor=100,
    final_div_factor=10
)

In [None]:
# Best validation results seen so far (None until the first epoch finishes).
best_loss = None
best_rank5 = None
best_rank10  =None

for e in range(config.NUM_EPOCHS):
    epoch_loss = train_epoch(train_dataloader, model, optimizer, scheduler)
    print("epoch: {} | train loss:{:.4f}".format(e+1, epoch_loss))
    
    # Evaluate after every epoch and checkpoint on each metric that improved
    # (lower is better for the loss, higher is better for the rank metrics).
    val_loss, rank_5, rank_10 = evaluate(val_dataloader, model)
    if best_loss is None or val_loss < best_loss:
        best_loss = val_loss
        torch.save(model, "model_best_loss.pt")
    if best_rank5 is None or rank_5 > best_rank5:
        best_rank5 = rank_5
        torch.save(model, "model_best_rank_5.pt")
    
    if best_rank10 is None or rank_10 > best_rank10:
        best_rank10 = rank_10
        torch.save(model, "model_best_rank_10.pt")
    print("eval loss:{:.4f} | precision_5:{:.4f} | precision_10:{:.4f}".format(val_loss, rank_5, rank_10))
    print("best loss:{:.4f} | best rank5:{:.4f} | best rank10:{:.4f}".format(best_loss, best_rank5, best_rank10))
    
    # Always keep the latest model too. torch.save(model, ...) pickles the whole
    # module object, not just its state_dict.
    torch.save(model, "model.pt")
    print("==="*10)