In [470]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder


In [471]:
df = pd.read_csv("transactions_train.csv", dtype={"article_id": str})
df_article = pd.read_csv("articles.csv", dtype={"article_id": str})

df = df.tail(2000000).reset_index(drop=True)
df["t_dat"] = pd.to_datetime(df["t_dat"])
df["week"] = (df["t_dat"].max() - df["t_dat"]).dt.days // 7

article_count = df["article_id"].value_counts()
df = df[df["article_id"].map(article_count) >= 50]
top_articles_df = df["article_id"].value_counts().head(100).reset_index()
top_articles_df.columns = ["article_id", "counts"]
df_article = df_article.merge(top_articles_df, on="article_id", how="right")
df_article = df_article.set_index("article_id")

top_articles = top_articles_df["article_id"]
df_article = df_article.loc[top_articles]
df_article[["prod_name", "product_code", "product_type_name", "counts", "perceived_colour_value_name"]]
df_article["product_type_name"].value_counts()


product_type_name
Trousers            20
T-shirt             18
Sweater             10
Leggings/Tights      6
Cardigan             6
Vest top             6
Hoodie               5
Blouse               4
Underwear bottom     4
Blazer               4
Shirt                4
Socks                3
Unknown              2
Top                  2
Bra                  2
Bikini top           2
Dress                1
Sarong               1
Name: count, dtype: int64

In [472]:
WEEK_HIST_MAX = 5
df["article_id"] = pd.factorize(df['article_id'])[0]
article_ids = np.unique(df["article_id"].values)

def create_dataset(df, week):
    hist_df = df[(df["week"] > week) & (df["week"] <= week + WEEK_HIST_MAX)]
    hist_df = hist_df.groupby("customer_id").agg({"article_id": list}).reset_index()
    
    target_df = df[df["week"] == week]           
    target_df = target_df.groupby("customer_id").agg({"article_id": list}).reset_index()
    target_df.rename(columns={"article_id": "target"}, inplace=True)
    target_df["week"] = week
    target_df = target_df.merge(hist_df, on="customer_id", how="left")
    target_df = target_df[target_df["article_id"].notna()].reset_index(drop=True)
    target_df = target_df[target_df["target"].notna()].reset_index(drop=True)

    return target_df

test_weeks = [0]
train_weeks = [1, 2]


In [473]:
a = create_dataset(df, 2)
a

Unnamed: 0,customer_id,target,week,article_id
0,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,"[6535, 6724, 1887, 7046, 2593]",2,"[2057, 215, 2592, 1400, 2593, 200, 1510, 108, ..."
1,0001f8cef6b9702d54abf66fd89eb21014bf98567065a9...,[3633],2,"[2505, 637, 1108]"
2,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,"[7076, 6605, 6734]",2,"[2082, 248, 248, 1581, 6217, 752]"
3,0006d3ff0caf0cb4d4e0615ee5cb7d268622364d483335...,"[1576, 4795]",2,"[553, 5605, 1790, 1739, 3216, 1576, 5355, 4436..."
4,00075ef36696a7b4ed8c83e22a4bf7ea7c90ee110991ec...,"[2788, 7018]",2,"[939, 2707, 1674, 1421, 3681, 1131, 3119, 1752..."
...,...,...,...,...
32441,fff5506ea8a342e778e4f2fbc2c9575e20b71cf24b75e6...,[3794],2,[3527]
32442,fff5a8e958488dc8a6f24f65bd1b40fb733068eb2cb54f...,[1270],2,"[2574, 3113]"
32443,fff60c2d6ade407465a875a5640a1e6e2b5ed5f6ec21f5...,"[16, 15, 6313]",2,"[244, 245, 248]"
32444,fff7e7674509592818bf453391af43a85eaaac9a52d858...,[4607],2,"[2078, 116, 3836, 2293, 3313, 5808, 4607]"


In [474]:
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from tqdm import tqdm

In [475]:
class Dataset(Dataset):
    def __init__(self, df, seq_len):
        self.df = df
        self.seq_len = seq_len
    
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, index):
        row = self.df.iloc[index]
        article_hist = torch.zeros(self.seq_len).long()
        target = torch.zeros(len(article_ids)).float()

        for t in row.target:
            target[t] = 1.0
                    
        if len(row.article_id) >= self.seq_len:
            article_hist = torch.LongTensor(row.article_id[-self.seq_len:])
        else:
            article_hist[-len(row.article_id):] = torch.LongTensor(row.article_id)
                
        return article_hist, target

In [476]:
class NeuralAttentiveRecMachine(nn.Module):
    def __init__(self, article_dim):
        super(NeuralAttentiveRecMachine, self).__init__()
        
        self.article_emb = nn.Embedding(len(article_ids), article_dim)
        self.encoder = nn.GRU(article_dim, article_dim, batch_first=True)
        self.attention_layer = nn.Linear(article_dim, article_dim)
        self.softmax = nn.Softmax(dim=-1)
        self.context_layer = nn.Linear(2*article_dim, article_dim)
        
    def forward(self, article_hist):
        tensor = self.article_emb(article_hist)
        encoder_output, hn = self.encoder(tensor)
        hn = hn.squeeze(0)
        attention_logits = self.attention_layer(hn).unsqueeze(1) @ encoder_output.transpose(1, 2)
        attention_weights = self.softmax(attention_logits)
        context_vector = torch.concat([hn, (attention_weights @ encoder_output).squeeze(1)], -1)
        context_vector = self.context_layer(context_vector)
        article_embs = self.article_emb(torch.tensor(article_ids))
        score = context_vector @ article_embs.transpose(0, 1)
        
        return score
        
        

In [477]:
class GruforRec(nn.Module):
    def __init__(self, article_dim):
        super(GruforRec, self).__init__()
        self.article_emb = nn.Embedding(len(article_ids), article_dim)
        self.encoder = nn.GRU(article_dim, article_dim, batch_first=True)
        
    def forward(self, article_hist):
        tensor = self.article_emb(article_hist)
        _, hn = self.encoder(tensor)
        hn = hn.squeeze(0)
        article_embs = self.article_emb(torch.tensor(article_ids))
        score = hn @ article_embs.transpose(0, 1)
        
        return score

In [478]:
def eval_recall(output, target):
    sigmoid = nn.Sigmoid()
    output = sigmoid(output)
    recall = output > 0.4
    recall = ((recall == target.bool()) & recall).float().sum(dim=1) / (target.sum(dim=1) + 1e-10)
    recall = recall.sum(dim=0) / recall.shape[0]
    
    return recall

In [479]:
def eval_precision(output, target):
    sigmoid = nn.Sigmoid()
    output = sigmoid(output)
    precision = output > 0.4
    precision = ((precision == target.bool()) & precision).float().sum(dim=1) / (precision.float().sum(dim=1) + 1e-10)
    precision = precision.sum(dim=0) / precision.shape[0]
    
    return precision

In [480]:
SEQ_LEN = 16
BATCH_SIZE = 256
LEARNING_RATE = 0.001
EPOCHS = 30

test_df = pd.concat([create_dataset(df, w) for w in test_weeks]).reset_index(drop=True)
train_df = pd.concat([create_dataset(df, w) for w in train_weeks]).reset_index(drop=True)
train_dataset = Dataset(train_df, SEQ_LEN)
test_dataset = Dataset(test_df, SEQ_LEN)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

model1 = NeuralAttentiveRecMachine(50)
model2 = GruforRec(50)


optimizer1 = torch.optim.SGD(model1.parameters(), lr=LEARNING_RATE)
optimizer2 = torch.optim.SGD(model2.parameters(), lr=LEARNING_RATE)

criterion = nn.BCEWithLogitsLoss()


In [481]:

for i in range(EPOCHS):
    print("="*30)
    print(f"Current Epoch {i+1}")
    print("-"*30)
    print("Training..")
    
    _loss = .0
    train_recall = []
    train_precision = []
    model1.train()
    for article_hist, target in tqdm(train_loader):
        loss = criterion(model1(article_hist), target)
        _loss += loss.item()
        train_recall.append(eval_recall(model1(article_hist), target))
        train_precision.append(eval_precision(model1(article_hist), target))

        optimizer1.zero_grad()
        loss.backward()
        optimizer1.step()
    
    print("Evaluating..")
    _eval_loss = .0
    test_recall = []
    test_precision = []
    model1.eval()
    for article_hist, target in tqdm(test_loader):
        eval_loss = criterion(model1(article_hist), target)
        test_recall.append(eval_recall(model1(article_hist), target))
        test_precision.append(eval_precision(model1(article_hist), target))

        _eval_loss += eval_loss.item()
    
    train_loss = _loss/len(train_loader.dataset) * 100
    eval_loss = _eval_loss/len(test_loader.dataset) * 100
    train_recall = torch.mean(torch.tensor(train_recall))
    test_recall = torch.mean(torch.tensor(test_recall))
    train_precision = torch.mean(torch.tensor(train_precision))
    test_precision = torch.mean(torch.tensor(test_precision))
    print(f"Epoch {i+1}\nTrain Loss: {train_loss}\nTest Loss: {eval_loss}\nTrain Recall: {train_recall}\nTest Recall: {test_recall}\nTrain Precision: {train_precision}\nTest Precision: {test_precision}")

Current Epoch 1
------------------------------
Training..


100%|████████████████████████████████████████████████████████████████████████████████| 258/258 [00:36<00:00,  7.02it/s]


Evaluating..


100%|████████████████████████████████████████████████████████████████████████████████| 124/124 [00:14<00:00,  8.53it/s]


Epoch 1
Train Loss: 0.33987345914627
Test Loss: 0.32601783988645855
Train Recall: 0.6316839456558228
Test Recall: 0.638315737247467
Train Precision: 0.0003657522611320019
Test Precision: 0.0003516659198794514
Current Epoch 2
------------------------------
Training..


100%|████████████████████████████████████████████████████████████████████████████████| 258/258 [00:37<00:00,  6.87it/s]


Evaluating..


100%|████████████████████████████████████████████████████████████████████████████████| 124/124 [00:14<00:00,  8.51it/s]


Epoch 2
Train Loss: 0.31914361438686023
Test Loss: 0.3134282427628182
Train Recall: 0.6500526666641235
Test Recall: 0.6564481258392334
Train Precision: 0.00036366633139550686
Test Precision: 0.00035167610622011125
Current Epoch 3
------------------------------
Training..


100%|████████████████████████████████████████████████████████████████████████████████| 258/258 [00:38<00:00,  6.74it/s]


Evaluating..


100%|████████████████████████████████████████████████████████████████████████████████| 124/124 [00:14<00:00,  8.60it/s]


Epoch 3
Train Loss: 0.31048621408757354
Test Loss: 0.30778643117790555
Train Recall: 0.6626895070075989
Test Recall: 0.6657165288925171
Train Precision: 0.00036280008498579264
Test Precision: 0.0003509551752358675
Current Epoch 4
------------------------------
Training..


100%|████████████████████████████████████████████████████████████████████████████████| 258/258 [00:37<00:00,  6.95it/s]


Evaluating..


100%|████████████████████████████████████████████████████████████████████████████████| 124/124 [00:14<00:00,  8.27it/s]


Epoch 4
Train Loss: 0.30635916249542877
Test Loss: 0.30485942778098424
Train Recall: 0.6695542335510254
Test Recall: 0.6733287572860718
Train Precision: 0.00036190691753290594
Test Precision: 0.0003511634422466159
Current Epoch 5
------------------------------
Training..


100%|████████████████████████████████████████████████████████████████████████████████| 258/258 [00:38<00:00,  6.72it/s]


Evaluating..


100%|████████████████████████████████████████████████████████████████████████████████| 124/124 [00:15<00:00,  8.19it/s]


Epoch 5
Train Loss: 0.3040814986484294
Test Loss: 0.30308920541545564
Train Recall: 0.6733088493347168
Test Recall: 0.677336573600769
Train Precision: 0.0003616934409365058
Test Precision: 0.0003508933004923165
Current Epoch 6
------------------------------
Training..


100%|████████████████████████████████████████████████████████████████████████████████| 258/258 [00:38<00:00,  6.62it/s]


Evaluating..


100%|████████████████████████████████████████████████████████████████████████████████| 124/124 [00:15<00:00,  7.86it/s]


Epoch 6
Train Loss: 0.30258610091913896
Test Loss: 0.30185355865403324
Train Recall: 0.6757940053939819
Test Recall: 0.6797837615013123
Train Precision: 0.00036127035855315626
Test Precision: 0.00035071143065579236
Current Epoch 7
------------------------------
Training..


100%|████████████████████████████████████████████████████████████████████████████████| 258/258 [00:42<00:00,  6.14it/s]


Evaluating..


100%|████████████████████████████████████████████████████████████████████████████████| 124/124 [00:16<00:00,  7.36it/s]


Epoch 7
Train Loss: 0.3015028140049582
Test Loss: 0.30088414967538313
Train Recall: 0.6783797144889832
Test Recall: 0.6819550395011902
Train Precision: 0.0003616877074819058
Test Precision: 0.00035054254112765193
Current Epoch 8
------------------------------
Training..


 86%|████████████████████████████████████████████████████████████████████▌           | 221/258 [00:38<00:06,  5.68it/s]


KeyboardInterrupt: 

In [None]:
for i in range(EPOCHS):
    print("="*30)
    print(f"Current Epoch {i+1}")
    print("-"*30)
    print("Training..")
    
    _loss = .0
    train_recall = []
    train_precision = []
    model2.train()
    for article_hist, target in tqdm(train_loader):
        loss = criterion(model2(article_hist), target)
        _loss += loss.item()
        train_recall.append(eval_recall(model2(article_hist), target))
        train_precision.append(eval_precision(model2(article_hist), target))

        optimizer2.zero_grad()
        loss.backward()
        optimizer2.step()
    
    print("Evaluating..")
    _eval_loss = .0
    test_recall = []
    test_precision = []
    model2.eval()
    for article_hist, target in tqdm(test_loader):
        eval_loss = criterion(model2(article_hist), target)
        test_recall.append(eval_recall(model2(article_hist), target))
        test_precision.append(eval_precision(model2(article_hist), target))

        _eval_loss += eval_loss.item()
    
    train_loss = _loss/len(train_loader.dataset) * 100
    eval_loss = _eval_loss/len(test_loader.dataset) * 100
    train_recall = torch.mean(torch.tensor(train_recall))
    test_recall = torch.mean(torch.tensor(test_recall))
    train_precision = torch.mean(torch.tensor(train_precision))
    test_precision = torch.mean(torch.tensor(test_precision))
    print(f"Epoch {i+1}\nTrain Loss: {train_loss}\nTest Loss: {eval_loss}\nTrain Recall: {train_recall}\nTest Recall: {test_recall}\nTrain Precision: {train_precision}\nTest Precision: {test_precision}")