In [None]:
# Mount Google Drive when running on Colab. On any other kernel the
# `google.colab` package is not installed, so skip the mount instead of
# aborting "Run All" with an ImportError.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive', force_remount=True)

![title](https://raw.githubusercontent.com/google/eng-edu/main/ml/recommendation-systems/images/softmax-model.png)

In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm

In [2]:
# Move up to the project root so the relative `data/...` paths used below resolve.
# Guarded so that re-running this cell does not climb one more directory each time.
if not os.path.exists('data/articles.csv'):
    os.chdir('..')

In [3]:
from fashion_recommendations.metrics.average_precision import mapk

### Datasets

In [4]:
from itertools import cycle, islice

import torch
import torch.nn.functional as F
from torch.utils.data import IterableDataset, DataLoader

In [5]:
class FashionDatasetSingleLabel(IterableDataset):
    """Stream ``(input_sequence, label)`` pairs from a tab-separated file.

    Every data line holds three tab-separated fields, ``customer_id``,
    ``label`` and ``input``; ``input`` is a comma-separated list of integer
    indices. The first line of the file is treated as a header and skipped.

    The file is opened on each ``__iter__`` call and closed when that pass
    ends, so a single instance can be iterated repeatedly (e.g. once per
    epoch) and no file handle is kept open in between.

    NOTE(review): nothing here shards the file between DataLoader workers, so
    with ``num_workers > 0`` every worker would yield the full file.

    Args:
        dataset_filepath: Path of the TSV file to read.
        max_length: Length of every returned input sequence.
        padding_value: Value appended to sequences shorter than ``max_length``.

    Raises:
        OSError: At construction time if the file cannot be opened.
    """

    def __init__(self, dataset_filepath, max_length, padding_value):
        # Fail fast on a bad path, as before, but do not keep the handle open.
        with open(dataset_filepath, 'r'):
            pass

        self.dataset_filepath = dataset_filepath
        self.max_length = max_length
        self.padding_value = padding_value

    def process_label(self, label: str):
        """Convert the label field into a 0-dim integer tensor."""
        return torch.tensor(int(label))

    def process_input(self, input_str: str, max_length, padding_value):
        """Turn a comma-separated index string into a tensor of length ``max_length``.

        Sequences with at least ``max_length`` entries keep only their last
        ``max_length`` entries; shorter ones are right-padded with
        ``padding_value``. A blank field yields a sequence made only of padding.
        """
        if not input_str.strip():
            # No history at all: int('') would raise, so return pure padding.
            return torch.full((max_length,), padding_value, dtype=torch.long)

        input_tensor = torch.tensor([int(v) for v in input_str.split(',')])
        len_orig = len(input_tensor)

        if len_orig >= max_length:
            return input_tensor[-max_length:]  # Take latest items

        return F.pad(input_tensor, (0, max_length - len_orig), value=padding_value)

    def parse_itr(self, dataset_itr):
        """Yield ``(input_seq, label)`` for each non-blank line of ``dataset_itr``."""
        for line in dataset_itr:
            if not line.strip():
                # e.g. a trailing empty line at the end of the file
                continue

            line_items = line.rstrip('\n').split('\t')  # [customer_id, label, input]

            label = self.process_label(line_items[1])
            input_seq = self.process_input(line_items[2], self.max_length, self.padding_value)

            yield input_seq, label

    def get_stream(self, dataset_itr):
        """Return the example generator for an iterable of raw text lines."""
        return self.parse_itr(dataset_itr)

    def __iter__(self):
        """Open the file, skip its header and yield every example in order."""
        with open(self.dataset_filepath, 'r') as dataset_file:
            next(dataset_file, None)  # skip header (tolerates an empty file)
            yield from self.get_stream(dataset_file)

### fashrec-v1

- Use order history only
- One embedding per item
- If an item was ordered X times, its embedding is included X times
- Bag-of-words (BoW) representation: average of all item-history embeddings
- For each customer, take a random transaction and roll the data back to that transaction

### Train model

In [6]:
# Number of rows in articles.csv; used below as the size of the article vocabulary.
total_num_articles = len(pd.read_csv('data/articles.csv').index)
total_num_articles

105542

In [7]:
class FashionRecV1(nn.Module):
    """Mean-of-embeddings model that scores every article from a padded history.

    The input is a ``(batch, seq_len)`` tensor of article indices. Positions
    equal to ``mask_value`` are treated as padding and excluded from the
    average. The mean embedding goes through ``Linear -> ReLU -> Linear ->
    Tanh`` and the result has one score per article.

    Args:
        mask_value: Index marking padded positions.
        embedding_dim: Size of each article embedding.
        num_articles: Number of distinct articles (embedding rows and output
            scores). Defaults to the notebook-level ``total_num_articles``.
        hidden_dim: Width of the hidden layer.
    """

    def __init__(self, mask_value, embedding_dim, num_articles=None, hidden_dim=11):
        super().__init__()

        if num_articles is None:
            # Backward-compatible default: fall back to the notebook global.
            num_articles = total_num_articles

        self.embedding_dim = embedding_dim
        self.num_articles = num_articles
        self.mask_value = mask_value

        # Layers are created in the same order as before so that a seeded
        # initialisation produces the same weights.
        self.embedding = nn.Embedding(num_embeddings=num_articles, embedding_dim=embedding_dim)
        self.fc_1 = nn.Linear(in_features=embedding_dim, out_features=hidden_dim)
        self.fc_2 = nn.Linear(in_features=hidden_dim, out_features=num_articles)

        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

    def forward(self, padded_sequences):
        """Return a ``(batch, num_articles)`` tensor of scores.

        Args:
            padded_sequences: Integer tensor of shape ``(batch, seq_len)``.
        """
        x = self.embedding(padded_sequences)  # (batch, seq_len, embedding_dim)

        # (batch, seq_len, 1); broadcasting covers the embedding axis.
        mask = (padded_sequences != self.mask_value).unsqueeze(-1)

        # Sum of the non-padding embeddings.
        x = (mask * x).sum(dim=1)

        # Divide by the number of non-padding items. The clamp keeps an
        # all-padding row at a zero vector instead of 0/0 = NaN.
        num_embeddings = mask.sum(dim=1).clamp(min=1)  # (batch, 1)
        x = torch.div(x, num_embeddings)

        x = self.relu(self.fc_1(x))

        # NOTE(review): tanh bounds every score to [-1, 1]. The notebook trains
        # with nn.CrossEntropyLoss, which treats its input as unnormalised
        # logits, so this limits how peaked the softmax can become. Consider
        # returning fc_2's output directly. Kept to leave outputs unchanged.
        x = self.tanh(self.fc_2(x))

        return x

In [8]:
# Mini-batch size of the training DataLoader.
BATCH_SIZE = 4

In [9]:
# Row count of the training split and the resulting number of mini-batches per
# epoch. The batch count is cast to a plain int (np.ceil returns a float) so it
# can be used directly as a progress-bar total.
total_training_examples = len(
    pd.read_csv('data/splits/train_single_purchase_label_sample.tsv', sep='\t', low_memory=False).index
)
total_batches = int(np.ceil(total_training_examples / BATCH_SIZE))
total_batches

3.0

In [52]:
# Row count of the single-label dev split.
total_dev_examples = len(
    pd.read_csv('data/splits/dev_single_purchase_label.tsv', sep='\t', low_memory=False).index
)
total_dev_examples

72019

In [76]:
# One list of integer article indices per dev row, parsed from the
# comma-separated `article_id_idx_historical` column.
# NOTE(review): these lists are used as the ground truth for MAP@12 although
# the column is named "historical" - confirm it holds the target purchases.
dev_actuals = (
    pd.read_csv('data/splits/dev_all_purchase_label.tsv', sep='\t')['article_id_idx_historical']
    .str.split(',')
    .map(lambda article_ids: list(map(int, article_ids)))
    .tolist()
)

In [38]:
# Passed to the model as `mask_value`: positions equal to this index are
# excluded from the mean embedding.
# NOTE(review): assumes no real article is mapped to index 0 - confirm.
PADDING_VALUE = 0

In [53]:
# Instantiate the model with 10-dimensional article embeddings; PADDING_VALUE
# positions are masked out of the average.
# NOTE(review): no random seed is set in the visible cells, so the initial
# weights differ between runs.
fashion_rec_v1 = FashionRecV1(mask_value=PADDING_VALUE, embedding_dim=10)

In [54]:
# Multi-class cross-entropy between the model's per-article scores and the
# integer article index of the label.
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters with a fixed learning rate of 0.01.
optimizer = torch.optim.Adam(params=fashion_rec_v1.parameters(), lr=0.01)

In [None]:
# Batch size for the gradient-free loss/metric passes over the train and dev files.
BATCH_SIZE_INFERENCE = 512

In [None]:
# Number of full passes over the training file.
MAX_EPOCHS = 20

In [None]:
TRAIN_FILEPATH = 'data/splits/train_single_purchase_label_sample.tsv'
DEV_FILEPATH = 'data/splits/dev_single_purchase_label.tsv'
MAX_SEQ_LENGTH = 50  # length every input sequence is truncated/padded to
TOP_K = 12           # number of recommendations scored by MAP@12


def evaluate(model, dataset_filepath, top_k=None):
    """Compute the mean cross-entropy of ``model`` over one dataset file.

    Args:
        model: Module mapping a batch of padded sequences to per-article scores.
        dataset_filepath: TSV file readable by ``FashionDatasetSingleLabel``.
        top_k: If given, also collect the ``top_k`` highest-scoring article
            indices for every example.

    Returns:
        ``(mean_loss, top_k_predictions)``. ``top_k_predictions`` is a list
        with one list of indices per example; it is empty when ``top_k`` is
        ``None``.
    """
    # Build the dataset here so every call starts reading from the top of the file.
    dataset = FashionDatasetSingleLabel(
        dataset_filepath=dataset_filepath,
        max_length=MAX_SEQ_LENGTH,
        padding_value=PADDING_VALUE,
    )
    summed_loss_fn = nn.CrossEntropyLoss(reduction='sum')

    total_loss = 0.0
    num_examples = 0
    top_k_predictions = []

    was_training = model.training
    model.eval()
    with torch.no_grad():
        for X, y in DataLoader(dataset, batch_size=BATCH_SIZE_INFERENCE):
            y_pred = model(X.long())
            total_loss += summed_loss_fn(y_pred, y).item()
            num_examples += y.shape[0]

            if top_k is not None:
                # topk avoids fully sorting the scores of every article.
                top_k_predictions += torch.topk(y_pred, k=top_k, dim=1).indices.tolist()
    model.train(was_training)

    # Count the examples actually seen; guard against an empty file.
    mean_loss = total_loss / max(num_examples, 1)

    return mean_loss, top_k_predictions


training_losses = []
dev_losses = []
dev_maps = []

for epoch in range(MAX_EPOCHS):

    # Re-instantiate the dataset every epoch so the pass starts at the top of the file.
    train_dataset = FashionDatasetSingleLabel(
        dataset_filepath=TRAIN_FILEPATH,
        max_length=MAX_SEQ_LENGTH,
        padding_value=PADDING_VALUE,
    )
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE)

    fashion_rec_v1.train()

    for X, y in tqdm(train_loader, total=int(total_batches), desc=f"Epoch {epoch + 1}/{MAX_EPOCHS}"):

        X = X.long()

        optimizer.zero_grad()  # Set gradients to 0 otherwise will accumulate

        y_pred = fashion_rec_v1(X)

        loss = criterion(y_pred, y)  # Targets are class indices

        loss.backward()

        optimizer.step()

    # Evaluate once per epoch. The former `if idx % 100 == 0` guard sat outside
    # the batch loop, so it only tested the index of the last batch and, with
    # three batches per epoch, never let the evaluation run.

    # Compute train loss
    mean_train_loss, _ = evaluate(fashion_rec_v1, TRAIN_FILEPATH)
    print(f"Training loss: {mean_train_loss}")
    training_losses.append(mean_train_loss)

    # Compute dev loss and the top-12 predictions for every dev example
    mean_dev_loss, top_12_predictions = evaluate(fashion_rec_v1, DEV_FILEPATH, top_k=TOP_K)
    print(f"Dev loss: {mean_dev_loss}")
    dev_losses.append(mean_dev_loss)

    # Compute dev MAP@12
    dev_mapk12 = mapk(dev_actuals, top_12_predictions, k=TOP_K)
    print(f"Dev MAP@12: {dev_mapk12}")
    dev_maps.append(dev_mapk12)

**TODO**
- Shuffle the datasets (or pre-shuffle the files)
- Move the model and tensors to a device (`.to(device)`) and check that it is the GPU
- Add code for generating predictions for submission