In [None]:
import os
import json
import glob
import torch
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from datetime import datetime
from collections import Counter
from sklearn import preprocessing
from tqdm import tqdm
                       
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint
import torch
import torch.nn as nn
from torch.nn import Linear
from torch.nn import functional as F


import gensim
from gensim.models import Word2Vec

In [None]:
def read_and_concatenate_parquet_files(pattern):
    """Read every parquet file matching ``pattern`` and stack them row-wise.

    Parameters
    ----------
    pattern : str
        Glob pattern (e.g. ``"dir/train_part*.parquet"``).

    Returns
    -------
    pandas.DataFrame
        The matched files concatenated with ``pd.concat``; each file keeps
        its own index values.

    Raises
    ------
    ValueError
        If no file matches ``pattern``.
    """
    # glob returns paths in arbitrary filesystem order; sorting makes the row
    # order of the result reproducible between runs.
    paths = sorted(glob.glob(pattern))
    if not paths:
        # pd.concat would also raise ValueError here, but without saying
        # which pattern failed to match.
        raise ValueError(f"No parquet files match pattern: {pattern!r}")
    return pd.concat([pd.read_parquet(path) for path in paths])

# Training split: session events for the first week.
# Variable names are 1-based (w1) while the files on disk are 0-based (w0).
train_w1 = read_and_concatenate_parquet_files("/kaggle/input/otto-prep-4-weeks/train_w0_part*.parquet")

# Training split: label events for the first week.
label_w1 = read_and_concatenate_parquet_files("/kaggle/input/otto-prep-4-weeks/label_w0_part*.parquet")

# Validation split: session events for the fourth week (files suffixed w3).
# NOTE(review): this is the only frame read from "otto-training-wo-split"; the
# other three come from "otto-prep-4-weeks" — confirm the directory is intended.
train_w4 = read_and_concatenate_parquet_files("/kaggle/input/otto-training-wo-split/train_w3_part*.parquet")

# Validation split: label events for the fourth week (files suffixed w3).
label_w4 = read_and_concatenate_parquet_files("/kaggle/input/otto-prep-4-weeks/label_w3_part*.parquet")


In [None]:
# # Read and concatenate training data from 2nd week
# train_w2 = read_and_concatenate_parquet_files("/kaggle/input/otto-prep-4-weeks/train_w1_part*.parquet")

# # Read and concatenate label data from 2nd week
# label_w2 = read_and_concatenate_parquet_files("/kaggle/input/otto-prep-4-weeks/label_w1_part*.parquet")


# # Read and concatenate training data from 3rd week
# train_w3 = read_and_concatenate_parquet_files("/kaggle/input/otto-prep-4-weeks/train_w2_part*.parquet")

# # Read and concatenate label data from 3rd week
# label_w3 = read_and_concatenate_parquet_files("/kaggle/input/otto-prep-4-weeks/label_w2_part*.parquet")


In [None]:
def create_column_mapping(df, col_name):
    """Build value <-> integer-id lookup tables for one column of the full train set.

    The vocabulary is taken from the complete training parquet file rather
    than from ``df``, so the ids do not depend on which subset is loaded.

    Parameters
    ----------
    df : pandas.DataFrame
        Unused; kept so existing callers keep working.
    col_name : str
        Column of the training parquet file whose unique values are enumerated.

    Returns
    -------
    tuple[dict, dict]
        ``(mapping, inverse_mapping)``: ``mapping`` sends each unique value (in
        ascending order) to consecutive integers starting at 2, and
        ``inverse_mapping`` is its inverse. Ids 0 and 1 are left free; the
        Dataset class in this notebook uses them as padding and mask tokens.
    """
    # Only the requested column is needed, so avoid loading the rest of the file.
    all_aid = pd.read_parquet(
        '/kaggle/input/otto-full-optimized-memory-footprint/train.parquet',
        columns=[col_name],
    )

    aid_sorted = sorted(all_aid[col_name].unique())

    # Release the frame as soon as the vocabulary has been extracted.
    del all_aid

    mapping = {value: idx for idx, value in enumerate(aid_sorted, start=2)}
    inverse_mapping = {idx: value for value, idx in mapping.items()}

    return mapping, inverse_mapping

def map_column(df, col_name, mapping):
    """Re-encode ``df[col_name]`` through ``mapping`` and return the same frame.

    The column is overwritten in place on ``df``. Values without an entry in
    ``mapping`` become NaN (``Series.map`` semantics).
    """
    encoded = df[col_name].map(mapping)
    df[col_name] = encoded
    return df

# Build the aid <-> integer-id lookup tables (ids start at 2).
mapping, inverse_mapping = create_column_mapping(train_w1, 'aid')

# Re-encode the 'aid' column of the session frames.
train_w1 = map_column(train_w1, 'aid', mapping)
train_w4 = map_column(train_w4, 'aid', mapping)

# The label frames must live in the same id space as the inputs: the model's
# output layer is sized from `mapping`, and id 0 is treated as padding when
# targets are built, so a raw (unmapped) aid of 0 would be silently dropped.
label_w1 = map_column(label_w1, 'aid', mapping)
label_w4 = map_column(label_w4, 'aid', mapping)

# Discard validation sessions containing an aid that has no entry in the mapping.
unmapped_sessions = train_w4.loc[train_w4['aid'].isna(), 'session'].unique()
train_w4 = train_w4[~train_w4['session'].isin(unmapped_sessions)]

# A NaN anywhere in the column turns it into floats; restore integer ids so
# they can be used as embedding keys (no-op when nothing was unmapped).
train_w4 = train_w4.astype({'aid': 'int64'})


In [None]:
def get_merged_sessions(session_df, label_session_df):
    """Collapse events to one row per session and pair inputs with labels.

    Parameters
    ----------
    session_df : pandas.DataFrame
        Input events with at least ``session`` and ``aid`` columns.
    label_session_df : pandas.DataFrame
        Label events with at least ``session`` and ``aid`` columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``session`` with two columns, ``input`` and ``label``, each
        holding a Python list of the session's unique aids in order of first
        appearance. Sessions missing from either frame are dropped (inner join).
    """
    def unique_aids_per_session(events, column_name):
        # `.unique()` yields one array per session. `.apply(list)` converts each
        # array explicitly; `Series.agg(list)` only did so through an
        # element-wise fallback that newer pandas versions deprecate.
        per_session = events.groupby('session')['aid'].unique().apply(list)
        return per_session.rename(column_name).to_frame()

    input_df = unique_aids_per_session(session_df, 'input')
    label_df = unique_aids_per_session(label_session_df, 'label')

    # Both frames are indexed by session id, so join on the index.
    return input_df.merge(label_df, left_index=True, right_index=True)

train_w1 = get_merged_sessions(train_w1, label_w1)
train_w4 = get_merged_sessions(train_w4, label_w4)

# Delete the label dataframes to save memory
del label_w1, label_w4


In [None]:
# Load the pre-trained gensim Word2Vec model from disk; it is used as a
# module-level global by the Dataset and Recommender classes.
# NOTE(review): Dataset.__getitem__ indexes `w2v.wv` with remapped aid values plus
# the ids 0 and 1 — confirm the model's vocabulary was built on that id space.
w2v = gensim.models.Word2Vec.load("/kaggle/input/otto-w2vec/word2vec.model")

In [None]:
EMBEDDING_LENGTH = 32
TARGET_LENGTH = 20
SEQUENCE_LENGTH = 100

class Dataset(torch.utils.data.Dataset):
    """Per-session samples for masked item prediction.

    Each sample is the session's input items (padded/truncated to
    ``input_length``) followed by ``target_length`` slots. In the model input
    those slots hold the mask id wherever a label exists and the padding id
    elsewhere; in the target they hold the label ids themselves.

    Parameters
    ----------
    sessions : pandas.DataFrame
        One row per session with list-valued ``input`` and ``label`` columns.
    input_length : int
        Number of input positions per sample.
    target_length : int
        Number of label positions per sample.
    word2vec : optional
        Object exposing ``.wv[list_of_ids]`` that returns one embedding row per
        id. When omitted, the module-level ``w2v`` is looked up at access time.
    """

    # Id used to fill unused positions.
    PAD_ID = 0
    # Id placed in the model input at positions whose label must be predicted.
    MASK_ID = 1

    def __init__(self, sessions, input_length = SEQUENCE_LENGTH, target_length = TARGET_LENGTH, word2vec = None):
        self.sessions = sessions
        self.input_length = input_length
        self.target_length = target_length
        self.word2vec = word2vec

    def __len__(self):
        return len(self.sessions)

    def pad_items(self, session, length, input_item = True):
        """Return ``session`` as a list of exactly ``length`` items.

        Short lists are right-padded with 0. Long lists keep their last
        ``length`` items when ``input_item`` is true (most recent history) and
        their first ``length`` items otherwise.
        """
        missing = length - len(session)
        if missing > 0:
            return session + [0] * missing
        if input_item:
            return session[-length:]
        return session[:length]

    def __getitem__(self, idx):
        """Build one sample.

        Returns
        -------
        tuple
            ``(input_tokens, target, mask, input_tokens_orig)`` where
            ``input_tokens`` is the float embedding matrix of the model input,
            ``target`` the long tensor of expected ids, ``mask`` a boolean
            column tensor that is True at padding positions, and
            ``input_tokens_orig`` the model-input ids before embedding.
        """
        columns = self.sessions.columns

        history = self.sessions.iloc[idx, columns.get_loc("input")]
        history = self.pad_items(history, self.input_length)

        labels = self.sessions.iloc[idx, columns.get_loc("label")]
        labels = self.pad_items(labels, self.target_length, input_item = False)

        # Label slots show the mask id to the model; padded slots stay as they are.
        mask_slots = [self.MASK_ID if item != self.PAD_ID else item for item in labels]

        target = history + labels
        model_tokens = history + mask_slots

        input_tokens_orig = torch.tensor(model_tokens)

        # Fall back to the notebook-level model only when none was injected.
        embedder = self.word2vec if self.word2vec is not None else w2v
        input_tokens = torch.tensor(embedder.wv[model_tokens], dtype = torch.float)
        target = torch.tensor(target, dtype = torch.long)

        # Shape (sequence, 1); True marks padding positions.
        mask = input_tokens_orig.eq(self.PAD_ID).unsqueeze(-1)

        return input_tokens, target, mask, input_tokens_orig
                                     

In [None]:
def masked_ce(y_pred, y_true, mask):
    """Cross-entropy averaged over the positions selected by ``mask``.

    The per-position loss is multiplied by ``mask`` (so unselected positions
    contribute zero), summed, and divided by the number of selected positions.
    """
    per_position = F.cross_entropy(y_pred, y_true, reduction="none")
    selected_total = (per_position * mask).sum()
    # The epsilon keeps the division finite when the mask selects nothing.
    return selected_total / (mask.sum() + 1e-8)


def masked_accuracy(y_pred: torch.Tensor, y_true: torch.Tensor, mask: torch.Tensor):
    """Fraction of mask-selected positions whose arg-max prediction equals the target.

    The predicted class is the index of the largest score along dimension 1 of
    ``y_pred``. The result is a double-precision scalar tensor.
    """
    predicted = torch.max(y_pred, 1).indices
    hits = torch.masked_select(y_true, mask) == torch.masked_select(predicted, mask)
    return hits.double().mean()



class Recommender(pl.LightningModule):
    """Transformer encoder that predicts item ids at masked sequence positions.

    The module receives already-embedded item sequences, runs them through a
    stack of transformer encoder layers and projects every position onto
    ``out`` classes. The loss is computed only where the un-embedded input
    token equals the mask id.

    Parameters
    ----------
    out : int
        Number of output classes.
    channels : int
        Embedding width expected on the last axis of the input.
    dropout : float
        Dropout probability used inside the encoder layers.
    lr : float
        Learning rate for Adam.
    word2vec :
        Word2Vec model stored on the module as ``item_embeddings``.
    """

    # Token id that marks the positions to be predicted.
    MASK_TOKEN_ID = 1

    def __init__(
        self,
        out = len(mapping)+2,
        channels=EMBEDDING_LENGTH,
        dropout=0.2,
        lr=1e-4,
        word2vec = w2v
    ):
        super().__init__()

        self.lr = lr
        self.dropout = dropout
        self.out = out

        # Store the model that was actually passed in; previously the argument
        # was ignored and the global was used unconditionally.
        self.item_embeddings = word2vec

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=channels, nhead=4, dropout=self.dropout, batch_first = True
        )

        self.encoder = torch.nn.TransformerEncoder(encoder_layer, num_layers=10)

        self.linear_out = Linear(channels, self.out)

        # Not used in forward(); kept so the attribute remains available.
        self.do = nn.Dropout(p=self.dropout)

    def forward(self, input_items, mask):
        """Return per-position class scores.

        ``mask`` carries a trailing singleton axis that is squeezed away before
        it is used as the encoder's key-padding mask.
        """
        mask = mask.squeeze(-1)

        # Follow whatever device the module lives on instead of assuming
        # "cuda:0"; this also works for CPU runs and other GPU indices.
        input_items = input_items.to(self.device)
        mask = mask.to(self.device)

        encoded = self.encoder(input_items, src_key_padding_mask=mask)

        return self.linear_out(encoded)

    def _masked_loss(self, batch):
        """Compute the masked cross-entropy for one batch (shared by train/val)."""
        input_items, y_true, mask, input_tokens_orig = batch

        y_pred = self(input_items, mask)

        # Flatten batch and sequence axes so every position is one sample.
        y_pred = y_pred.reshape(-1, y_pred.size(2))
        y_true = y_true.to(y_pred.device).reshape(-1)

        # Only positions showing the mask token contribute to the loss.
        flat_tokens = input_tokens_orig.to(y_pred.device).reshape(-1)
        loss_mask = flat_tokens == self.MASK_TOKEN_ID

        return masked_ce(y_pred=y_pred, y_true=y_true, mask=loss_mask)

    def training_step(self, batch, batch_idx):
        """Log and return the masked training loss for ``batch``."""
        loss = self._masked_loss(batch)
        self.log("train_loss", loss, prog_bar=True, on_step=False, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log and return the masked validation loss for ``batch``."""
        loss = self._masked_loss(batch)
        self.log('valid_loss', loss, prog_bar=True, on_step=False, on_epoch=True)
        return loss

    def configure_optimizers(self):
        """Adam plus a plateau scheduler driven by the logged ``valid_loss``."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, patience=10, factor=0.1
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": scheduler,
            "monitor": "valid_loss",
        }


In [None]:
# Mini-batch size shared by the training and validation loaders.
batch_size = 3

# Wrap the merged session frames; sequence/target lengths use the Dataset defaults.
train_data = Dataset(sessions=train_w1)
val_data = Dataset(sessions=train_w4)

print("len(train_data)", len(train_data))
print("len(val_data)", len(val_data))

# Options common to both loaders; only the training loader shuffles.
loader_options = dict(batch_size=batch_size, num_workers=2)
train_loader = DataLoader(train_data, shuffle=True, **loader_options)
val_loader = DataLoader(val_data, shuffle=False, **loader_options)


In [None]:
# Train the recommender, logging to TensorBoard and keeping the checkpoint
# with the lowest validation loss.

epochs = 10

model = Recommender()

logger = TensorBoardLogger(
    save_dir='/kaggle/working/',
    )

checkpoint_callback = ModelCheckpoint(
    monitor="valid_loss",
    mode="min",
    dirpath='/kaggle/working/',
    filename="recommender",
    )

# The logger and the checkpoint callback only take effect when they are handed
# to the Trainer, so build them first and pass them in explicitly.
trainer = pl.Trainer(
    max_epochs=epochs,
    # `accelerator`/`devices` replace the deprecated `gpus=` argument and let
    # the cell fall back to CPU when no GPU is attached.
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
    logger=logger,
    callbacks=[checkpoint_callback],
)

trainer.fit(model, train_loader, val_loader)


