In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import ast
import random
import matplotlib.pyplot as plt
from functools import partial

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [27]:
DATASET = 'helpdesk'

train_data = pd.read_csv(f'data_flattened/{DATASET}/train.csv')
test_data = pd.read_csv(f'data_flattened/{DATASET}/test.csv')
val_data = pd.read_csv(f'data_flattened/{DATASET}/val.csv')
train_data.head()

Unnamed: 0,act_prefix,cts_prefix,act_next,cts_next
0,[1],['2012-04-03 16:55:38'],8,2012-04-03 16:55:53
1,[1],['2011-10-24 17:55:40'],8,2011-10-24 17:56:20
2,[1],['2012-04-03 00:04:30'],8,2012-04-05 15:30:00
3,[1],['2012-03-21 23:50:05'],8,2012-03-22 22:24:03
4,[1],['2011-07-21 17:50:37'],8,2011-07-21 17:50:49


## Process timestamps

In [29]:
from utils import date2ts, ParseDatelist, GenTimeFeatures

train_data['cts_next'] = train_data['cts_next'].apply(date2ts)
test_data['cts_next'] = test_data['cts_next'].apply(date2ts)
val_data['cts_next'] = val_data['cts_next'].apply(date2ts)

mu = np.mean(train_data['cts_next'])
sigma = np.std(train_data['cts_next'])
scaler = lambda x: (x - mu) / sigma

parser = lambda x: ParseDatelist(x, applyer=partial(GenTimeFeatures, ts_applyer=lambda x: scaler(x // 1000000000)))
train_data['cts_prefix'] = train_data['cts_prefix'].apply(parser)
test_data['cts_prefix'] = test_data['cts_prefix'].apply(parser)
val_data['cts_prefix'] = val_data['cts_prefix'].apply(parser)

train_data['cts_next'] = train_data['cts_next'].apply(scaler)
test_data['cts_next'] = test_data['cts_next'].apply(scaler)
val_data['cts_next'] = val_data['cts_next'].apply(scaler)

N_TIMEFEATURES = len(train_data.loc[0, 'cts_prefix'][0])

train_data.head()

Unnamed: 0,act_prefix,cts_prefix,act_next,cts_next
0,[1],"[[1.0858516260300441, 0, 1, 0, 0, 0, 0, 0, 0, ...",8,1.085852
1,[1],"[[0.47310774649970466, 1, 0, 0, 0, 0, 0, 0, 0,...",8,0.473109
2,[1],"[[1.0831950534807708, 0, 1, 0, 0, 0, 0, 0, 0, ...",8,1.093193
3,[1],"[[1.0377570572662635, 0, 0, 1, 0, 0, 0, 0, 0, ...",8,1.041314
4,[1],"[[0.11367686982801574, 0, 0, 0, 1, 0, 0, 0, 0,...",8,0.113677


## Process activities

In [30]:
from utils import ParseActivitylist, OneHotEncode

train_data['act_prefix'] = train_data['act_prefix'].apply(ParseActivitylist)
test_data['act_prefix'] = test_data['act_prefix'].apply(ParseActivitylist)
val_data['act_prefix'] = val_data['act_prefix'].apply(ParseActivitylist)

train_data['act_next'] = train_data['act_next'].apply(int)
test_data['act_next'] = test_data['act_next'].apply(int)
val_data['act_next'] = val_data['act_next'].apply(int)

uniq_tokens = np.union1d(
    train_data['act_prefix'].apply(lambda x: x[0]).unique(),
    train_data['act_next'].unique()
)

N_TOKENS = len(uniq_tokens)
assert np.min(uniq_tokens) == 1, "Minimum label is not 1!"
assert np.max(uniq_tokens) == N_TOKENS, "Maximum label is not N_TOKENS!"

encoder = partial(OneHotEncode, num_classes=N_TOKENS)

train_data['act_prefix'] = train_data['act_prefix'].apply(encoder)
test_data['act_prefix'] = test_data['act_prefix'].apply(encoder)
val_data['act_prefix'] = val_data['act_prefix'].apply(encoder)

train_data.head()

Unnamed: 0,act_prefix,cts_prefix,act_next,cts_next
0,"[[1, 0, 0, 0, 0, 0, 0, 0, 0]]","[[1.0858516260300441, 0, 1, 0, 0, 0, 0, 0, 0, ...",8,1.085852
1,"[[1, 0, 0, 0, 0, 0, 0, 0, 0]]","[[0.47310774649970466, 1, 0, 0, 0, 0, 0, 0, 0,...",8,0.473109
2,"[[1, 0, 0, 0, 0, 0, 0, 0, 0]]","[[1.0831950534807708, 0, 1, 0, 0, 0, 0, 0, 0, ...",8,1.093193
3,"[[1, 0, 0, 0, 0, 0, 0, 0, 0]]","[[1.0377570572662635, 0, 0, 1, 0, 0, 0, 0, 0, ...",8,1.041314
4,"[[1, 0, 0, 0, 0, 0, 0, 0, 0]]","[[0.11367686982801574, 0, 0, 0, 1, 0, 0, 0, 0,...",8,0.113677


In [5]:
from utils import LogLoader

BATCH_SIZE = 128

train_loader = LogLoader(
    data=train_data,
    batch_size=BATCH_SIZE,
    shuffle=False
)

test_loader = LogLoader(
    data=test_data,
    batch_size=BATCH_SIZE,
    shuffle=False
)

val_loader = LogLoader(
    data=val_data,
    batch_size=BATCH_SIZE,
    shuffle=False
)


# Model

## Activity prediction

In [12]:
#!g1.1
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score
from utils import FocalLoss
from models import GruLstmModel, LstmAttentionModel, LstmModel

DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
N_EPOCHS = 50

model = GruLstmModel(
    vocab_size=N_TOKENS + 1,
    output_size=N_TOKENS + 1,
    n_features=N_TOKENS + N_TIMEFEATURES,
    hid_size=128,
    num_layers=1,
    bidirectional=False,
    embed_features=True
).float().to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.8)
criterion = nn.CrossEntropyLoss()
# criterion = partial(FocalLoss, gamma=1.5)


for epoch in tqdm(range(N_EPOCHS)):
    model.train()
    for batch in train_loader:
        logits = model.forward(batch['prefix'].float().to(DEVICE))
        loss = criterion(logits, batch['act_next'].to(DEVICE))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    scheduler.step()
    
    if epoch % 5 == 0:
        model.eval()
        total_predicts = []
        for batch in val_loader:
            with torch.no_grad():
                logits = model.forward(batch['prefix'].float().to(DEVICE)).cpu()
            
            predict = torch.argmax(logits, dim=1).tolist()
            total_predicts += predict
        print(f"Epoch {epoch} has {accuracy_score(val_data['act_next'].tolist(), total_predicts)} accuracy.")


model.eval()
total_predicts = []
for batch in test_loader:            
    with torch.no_grad():
        logits = model.forward(batch['prefix'].float().to(DEVICE)).cpu()
            
    predict = torch.argmax(logits, dim=1).tolist()
    total_predicts += predict

print(f"Result test accuracy is {accuracy_score(test_data['act_next'].tolist(), total_predicts)}.")


  0%|          | 0/50 [00:00<?, ?it/s]

Epoch 0 has 0.5528554070473876 accuracy.
Epoch 5 has 0.6537059538274606 accuracy.
Epoch 10 has 0.7825030376670717 accuracy.
Epoch 15 has 0.7897934386391251 accuracy.
Epoch 20 has 0.7897934386391251 accuracy.
Epoch 25 has 0.7910085054678008 accuracy.
Epoch 30 has 0.7946537059538274 accuracy.
Epoch 35 has 0.7995139732685298 accuracy.
Epoch 40 has 0.8019441069258809 accuracy.
Epoch 45 has 0.7970838396111786 accuracy.
Result test accuracy is 0.8019441069258809.


## Timestamp prediction

In [33]:
#!g1.1
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error
from utils import FocalLoss
from models import GruLstmModel, LstmAttentionModel, LstmModel

DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
N_EPOCHS = 100

model = GruLstmModel(
    vocab_size=N_TOKENS + 1,
    output_size=1,
    n_features=N_TOKENS + N_TIMEFEATURES,
    hid_size=64,
    num_layers=1,
    bidirectional=False,
    embed_features=True
).float().to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.8)
criterion = nn.MSELoss()


for epoch in tqdm(range(N_EPOCHS)):
    model.train()
    for batch in train_loader:
        logits = model.forward(batch['prefix'].float().to(DEVICE)).squeeze()
        loss = criterion(logits, batch['cts_next'].to(DEVICE))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    scheduler.step()
    
    if epoch % 10 == 0:
        model.eval()
        total_predicts = []
        for batch in val_loader:
            with torch.no_grad():
                predict = model.forward(batch['prefix'].float().to(DEVICE)).cpu().squeeze().tolist()

            total_predicts += predict
        print(f"Epoch {epoch} has {mean_absolute_error(val_data['cts_next'].tolist(), total_predicts)} MAE.")


model.eval()
total_predicts = []
for batch in test_loader:            
    with torch.no_grad():
        predict = model.forward(batch['prefix'].float().to(DEVICE)).cpu().squeeze().tolist()

    total_predicts += predict

test_mae = mean_absolute_error(test_data['cts_next'].tolist(), total_predicts)
test_mae_days = (test_mae * sigma + mu) / 86400
print(f"Result test MAE is {test_mae}.")
print(f"Result test MAE (days) is {test_mae_days}.")


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch 0 has 0.5470096798768007 MAE.
Epoch 5 has 0.15603680014513963 MAE.
Epoch 10 has 0.12658346840405973 MAE.
Epoch 15 has 0.06629205255624498 MAE.
Epoch 20 has 0.08880101159084537 MAE.
Epoch 25 has 0.09212989713216199 MAE.
Epoch 30 has 0.07815449817970535 MAE.
Epoch 35 has 0.09711598416243783 MAE.
Epoch 40 has 0.08906492205383439 MAE.
Epoch 45 has 0.0949381165064536 MAE.
Epoch 50 has 0.09839507054577733 MAE.
Epoch 55 has 0.1006583793255819 MAE.
Epoch 60 has 0.11482738384562959 MAE.
Epoch 65 has 0.09939385332793499 MAE.
Epoch 70 has 0.09159804387824391 MAE.
Epoch 75 has 0.10458166315780076 MAE.
Epoch 80 has 0.10375312763128113 MAE.
Epoch 85 has 0.09559130100211223 MAE.
Epoch 90 has 0.10128935399732142 MAE.
Epoch 95 has 0.11490961373550357 MAE.
Result test MAE is 0.08072829478373895.
Result test MAE (days) is 15168.034632161138.
