In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import ast
import random
import matplotlib.pyplot as plt
from functools import partial

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [2]:
DATASET = 'helpdesk'

train_data = pd.read_csv(f'data_flattened/{DATASET}/train.csv')
test_data = pd.read_csv(f'data_flattened/{DATASET}/test.csv')
val_data = pd.read_csv(f'data_flattened/{DATASET}/val.csv')

## Process timestamps

In [3]:
from utils import date2ts, ParseDatelist, GenTimeFeatures

train_data['cts_next'] = train_data['cts_next'].apply(date2ts)
test_data['cts_next'] = test_data['cts_next'].apply(date2ts)
val_data['cts_next'] = val_data['cts_next'].apply(date2ts)

mu = np.mean(train_data['cts_next'])
sigma = np.std(train_data['cts_next'])
scaler = lambda x: (x - mu) / sigma

parser = lambda x: ParseDatelist(x, applyer=partial(GenTimeFeatures, ts_applyer=lambda x: scaler(x // 1000000000)))
train_data['cts_prefix'] = train_data['cts_prefix'].apply(parser)
test_data['cts_prefix'] = test_data['cts_prefix'].apply(parser)
val_data['cts_prefix'] = val_data['cts_prefix'].apply(parser)

train_data['cts_next'] = train_data['cts_next'].apply(scaler)
test_data['cts_next'] = test_data['cts_next'].apply(scaler)
val_data['cts_next'] = val_data['cts_next'].apply(scaler)

N_TIMEFEATURES = len(train_data.loc[0, 'cts_prefix'][0])

train_data.head()

Unnamed: 0,act_prefix,cts_prefix,act_next,cts_next
0,[1],"[[1.0858516260300441, 0, 1, 0, 0, 0, 0, 0, 0, ...",8,1.085852
1,[1],"[[0.47310774649970466, 1, 0, 0, 0, 0, 0, 0, 0,...",8,0.473109
2,[1],"[[1.0831950534807708, 0, 1, 0, 0, 0, 0, 0, 0, ...",8,1.093193
3,[1],"[[1.0377570572662635, 0, 0, 1, 0, 0, 0, 0, 0, ...",8,1.041314
4,[1],"[[0.11367686982801574, 0, 0, 0, 1, 0, 0, 0, 0,...",8,0.113677


## Process activities

In [4]:
from utils import ParseActivitylist, OneHotEncode

train_data['act_prefix'] = train_data['act_prefix'].apply(ParseActivitylist)
test_data['act_prefix'] = test_data['act_prefix'].apply(ParseActivitylist)
val_data['act_prefix'] = val_data['act_prefix'].apply(ParseActivitylist)

train_data['act_next'] = train_data['act_next'].apply(int)
test_data['act_next'] = test_data['act_next'].apply(int)
val_data['act_next'] = val_data['act_next'].apply(int)

uniq_tokens = np.union1d(
    train_data['act_prefix'].apply(lambda x: x[0]).unique(),
    train_data['act_next'].unique()
)

N_TOKENS = len(uniq_tokens)
assert np.min(uniq_tokens) == 1, "Minimum label is not 1!"
assert np.max(uniq_tokens) == N_TOKENS, "Maximum label is not N_TOKENS!"

encoder = partial(OneHotEncode, num_classes=N_TOKENS)

train_data['act_prefix'] = train_data['act_prefix'].apply(encoder)
test_data['act_prefix'] = test_data['act_prefix'].apply(encoder)
val_data['act_prefix'] = val_data['act_prefix'].apply(encoder)

train_data.head()

Unnamed: 0,act_prefix,cts_prefix,act_next,cts_next
0,"[[1, 0, 0, 0, 0, 0, 0, 0, 0]]","[[1.0858516260300441, 0, 1, 0, 0, 0, 0, 0, 0, ...",8,1.085852
1,"[[1, 0, 0, 0, 0, 0, 0, 0, 0]]","[[0.47310774649970466, 1, 0, 0, 0, 0, 0, 0, 0,...",8,0.473109
2,"[[1, 0, 0, 0, 0, 0, 0, 0, 0]]","[[1.0831950534807708, 0, 1, 0, 0, 0, 0, 0, 0, ...",8,1.093193
3,"[[1, 0, 0, 0, 0, 0, 0, 0, 0]]","[[1.0377570572662635, 0, 0, 1, 0, 0, 0, 0, 0, ...",8,1.041314
4,"[[1, 0, 0, 0, 0, 0, 0, 0, 0]]","[[0.11367686982801574, 0, 0, 0, 1, 0, 0, 0, 0,...",8,0.113677


In [5]:
from utils import LogLoader

BATCH_SIZE = 128

train_loader = LogLoader(
    data=train_data,
    batch_size=BATCH_SIZE,
    shuffle=False
)

test_loader = LogLoader(
    data=test_data,
    batch_size=BATCH_SIZE,
    shuffle=False
)

val_loader = LogLoader(
    data=val_data,
    batch_size=BATCH_SIZE,
    shuffle=False
)


# Model

In [14]:
class LstmModel(nn.Module):
    def __init__(self, vocab_size, n_features, emb_size=128, hid_size=64, embed_features=True):
        super(LstmModel, self).__init__()
        self.vocab_size = vocab_size
        self.n_features = n_features
        self.emb_size = emb_size
        self.hid_size = hid_size
        self.embed_features = embed_features
        self.emb_layer = None

        if self.embed_features:
            self.emb_layer = nn.Linear(
                in_features=n_features,
                out_features=emb_size
            )

        self.vector_size = emb_size if embed_features else n_features

        self.lstm = nn.LSTM(
            input_size=self.vector_size,
            hidden_size=hid_size,
            batch_first=True
        )

        self.head_layers = nn.Sequential(
            nn.Linear(in_features=hid_size, out_features=hid_size),
            nn.Dropout(p=0.2),
            nn.ELU(),
            nn.Linear(in_features=hid_size, out_features=hid_size),
            nn.Dropout(p=0.2),
            nn.ELU(),
            nn.Linear(in_features=hid_size, out_features=vocab_size),
        )


    def forward(self, prefix_batch):
        model_device = next(self.parameters()).device
        batch_size = prefix_batch.shape[0]

        input_batch = None
        if self.embed_features:
            input_batch = self.emb_layer(prefix_batch)
        else:
            input_batch = prefix_batch

        h0 = torch.randn((1, batch_size, self.hid_size), device=model_device)
        c0 = torch.randn((1, batch_size, self.hid_size), device=model_device)

        _, (hn, _) = self.lstm(input_batch, (h0, c0))

        logits = self.head_layers(torch.squeeze(hn))
        return logits


In [16]:
#!g1.1
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score
from utils import FocalLoss

DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
N_EPOCHS = 50

model = LstmModel(
    vocab_size=10,
    n_features=N_TOKENS + N_TIMEFEATURES,
    embed_features=True
).float().to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.8)
# criterion = nn.CrossEntropyLoss()
criterion = partial(FocalLoss, gamma=1.5)


for epoch in tqdm(range(N_EPOCHS)):
    model.train()
    for batch in train_loader:
        logits = model.forward(batch['prefix'].float().to(DEVICE))
        loss = criterion(logits, batch['act_next'].to(DEVICE))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    scheduler.step()
    
    if epoch % 5 == 0:
        model.eval()
        total_predicts = []
        for batch in val_loader:
            with torch.no_grad():
                logits = model.forward(batch['prefix'].float().to(DEVICE)).cpu()
            
            predict = torch.argmax(logits, dim=1).tolist()
            total_predicts += predict
        print(f"Epoch {epoch} has {accuracy_score(val_data['act_next'].tolist(), total_predicts)} accuracy.")


model.eval()
total_predicts = []
for batch in test_loader:            
    with torch.no_grad():
        logits = model.forward(batch['prefix'].float().to(DEVICE)).cpu()
            
    predict = torch.argmax(logits, dim=1).tolist()
    total_predicts += predict

print(f"Result test accuracy is {accuracy_score(test_data['act_next'].tolist(), total_predicts)}.")


  0%|          | 0/50 [00:00<?, ?it/s]

Epoch 0 has 0.4240583232077764 accuracy.
Epoch 5 has 0.7667071688942891 accuracy.
Epoch 10 has 0.7897934386391251 accuracy.
Epoch 15 has 0.7970838396111786 accuracy.
Epoch 20 has 0.8007290400972054 accuracy.
Epoch 25 has 0.8019441069258809 accuracy.
Epoch 30 has 0.7982989064398542 accuracy.
Epoch 35 has 0.7995139732685298 accuracy.
Epoch 40 has 0.7995139732685298 accuracy.
Epoch 45 has 0.7995139732685298 accuracy.
Result test accuracy is 0.8027541514783313.
