In [1]:
import os
import ast
import json
import glob
import torch
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import gc
#import cudf
import joblib

from tqdm import tqdm
from datetime import datetime
from collections import Counter
from sklearn import preprocessing
from tqdm import tqdm

                        
from torch.utils.data import DataLoader
import pytorch_lightning as pl
import torch
import torch.nn as nn
from torch.nn import Linear
from torch.nn import functional as F


import gensim
#from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [2]:
train_w1 = dict() #train data, other weeks will be added later
label_w1 = dict()

#train_w2 = dict()
#label_w2 = dict()

# train_w3 = dict() 
# label_w3 = dict()

train_w4 = dict() #val_dataset
label_w4= dict()  #val_dataset

for file in glob.glob("/kaggle/input/otto-prep-4-weeks/train_w0_part*.parquet"):
   train_w1[file] = pd.read_parquet(file)
train_w1 = pd.concat(train_w1.values())

for file in glob.glob("/kaggle/input/otto-prep-4-weeks/label_w0_part*.parquet"):
   label_w1[file] = pd.read_parquet(file)
label_w1 = pd.concat(label_w1.values())

# for file in glob.glob("/kaggle/input/otto-training-wo-split/train_w1_part*.parquet"):
#    train_w2[file] = pd.read_parquet(file)
# train_w2 = pd.concat(train_w2.values())

# for file in glob.glob("/kaggle/input/otto-prep-4-weeks/label_w1_part*.parquet"):
#    label_w2[file] = pd.read_parquet(file)
# label_w2 = pd.concat(label_w2.values())

# for file in glob.glob("/kaggle/input/otto-training-wo-split/train_w2_part*.parquet"):
#    train_w3[file] = pd.read_parquet(file)
# train_w3 = pd.concat(train_w3.values())

# for file in glob.glob("/kaggle/input/otto-prep-4-weeks/label_w2_part*.parquet"):
#    label_w3[file] = pd.read_parquet(file)
# label_w3 = pd.concat(label_w3.values())

for file in glob.glob("/kaggle/input/otto-training-wo-split/train_w3_part*.parquet"):
   train_w4[file] = pd.read_parquet(file)
train_w4 = pd.concat(train_w4.values())

for file in glob.glob("/kaggle/input/otto-prep-4-weeks/label_w3_part*.parquet"):
   label_w4[file] = pd.read_parquet(file)
label_w4 = pd.concat(label_w4.values())


#max aid unique 486

In [3]:
def label_encoding(df):

    #set start = 2 to reserve 0 for mask and 1 for pad
    df['aid+2'] = df['aid']+2
    df['aid'] = df['aid+2']

    df = df.drop(['aid+2'], axis = 1)
    return df

In [4]:
train_w1 = label_encoding(train_w1)
train_w4 = label_encoding(train_w4)

In [5]:
def get_merged(df, label_df):

    df = pd.DataFrame(df.groupby('session')['aid'].unique().agg(list))
    label_df = pd.DataFrame(label_df.groupby('session')['aid'].unique().agg(list))
    
    df.rename(columns = {'aid': 'input'}, inplace = True)
    label_df.rename(columns = {'aid': 'label'}, inplace = True)

    df = df.reset_index()
    label_df = label_df.reset_index()

    df = pd.merge(df, label_df, on = "session", how = "inner")  
    
    return df

In [6]:
train_w1 = get_merged(train_w1, label_w1)
train_w4 = get_merged(train_w4, label_w4)
del label_w1, label_w4

In [7]:
# Load pre-trained Word2Vec model.
w2v = gensim.models.Word2Vec.load("/kaggle/input/otto-w2vec/word2vec.model")



In [8]:
INPUT_LENGTH = 100
OUTPUT_LENGTH = 20

In [9]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, sessions, input_length = INPUT_LENGTH, output_length = OUTPUT_LENGTH):
        self.sessions = sessions
        self.input_length = input_length
        self.output_length = output_length

    def __len__(self):
        return len(self.sessions)
    
    def pad_items(self, session, length):
        if len(session)< length:
            session = session + list((length - len(session)) * [0])
        else: session = session[-length:]
        return session
      
    def __getitem__(self, idx):
        input_tokens = self.sessions.iloc[idx, self.sessions.columns.get_loc("input")]
        
        length_input = len(input_tokens)
        input_tokens = self.pad_items(input_tokens, self.input_length) 
        input_tokens = w2v.wv[input_tokens]
        
        target = self.sessions.iloc[idx, self.sessions.columns.get_loc("label")]
        target = self.pad_items(target, self.output_length) 
        
        input_tokens = torch.tensor(input_tokens, dtype=torch.long)
        target = torch.tensor(target, dtype=torch.long)
        
        mask = torch.tensor(list([1]*length_input + list(self.input_length * [0])))
        
        return input_tokens, target, mask
                                     

In [18]:
class Recommender(pl.LightningModule):
    def __init__(
        self,
        out = OUTPUT_LENGTH,
        channels=INPUT_LENGTH,
        dropout=0.2,
        lr=1e-4,
        word2vec = w2v
    ):
        super().__init__()
        
        self.lr = lr
        self.dropout = dropout
        self.out = out
        
        self.item_embeddings = w2v

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=channels, nhead=4, dropout=self.dropout
        )

        self.encoder = torch.nn.TransformerEncoder(encoder_layer, num_layers=10)

        self.linear_out = Linear(channels, self.out)

        self.do = nn.Dropout(p=self.dropout)


    def forward(self, input_items, mask):

        input_items = self.encoder(input_items, mask)

        out = self.linear_out(input_items)

        return out

    def training_step(self, batch, batch_idx):
        input_items, y_true, mask = batch

        y_pred = self(input_items, mask)

        loss = F.cross_entropy(y_pred, y_true, reduction="none")
        accuracy = (y_true == y_pred).double().mean()

        self.log("train_loss", loss)
        self.log("train_accuracy", accuracy)

        return loss

    def validation_step(self, batch, batch_idx):
        input_items, y_true, mask = batch

        y_pred = self(input_items, mask)

        loss = F.cross_entropy(y_pred, y_true, reduction="none")
        accuracy = (y_true == y_pred).double().mean()

        self.log("valid_loss", loss)
        self.log("valid_accuracy", accuracy)

        return loss


    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, patience=10, factor=0.1
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": scheduler,
            "monitor": "valid_loss",
        }

In [16]:
batch_size = 1


train_data = Dataset(
    sessions=train_w1
)

val_data = Dataset(
sessions = train_w4
)

print("len(train_data)", len(train_data))
print("len(val_data)", len(val_data))


train_loader = DataLoader(
    train_data,
    batch_size=batch_size,
    num_workers=2,
    shuffle=True,
)
val_loader = DataLoader(
    val_data,
    batch_size=batch_size,
    num_workers=2,
    shuffle=False,
)


len(train_data) 4441142
len(val_data) 4356191


In [19]:
epochs = 10

model = Recommender()
trainer = pl.Trainer(
    max_epochs=epochs,
    gpus=1
)

trainer.fit(model, train_loader, val_loader)

Sanity Checking: 0it [00:00, ?it/s]

AssertionError: was expecting embedding dimension of 100, but got 32