In [None]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell runs so edits to local packages are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
#!g1.1
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import pm4py
import tqdm

import warnings

# Suppress every warning of category UserWarning for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)

# One shared seed for all random number generators used in this notebook.
SEED = 3407
for seed_fn in (random.seed, torch.manual_seed, torch.cuda.manual_seed, np.random.seed):
    seed_fn(SEED)

# Ask cuDNN for deterministic algorithms and turn off its benchmark mode.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

import wandb
# Authenticate with Weights & Biases up front; the training cell later opens
# a run with wandb.init and logs metrics to it.
wandb.login()

# Baseline data processing

In [None]:
#!g1.1
# Name of the event log to use; it selects the XES file and is reused
# later for filtering and for tagging the experiment run.
LOG_TYPE = 'bpi12'

# Read the XES file for LOG_TYPE from the local ./data folder.
xes_importer = pm4py.objects.log.importer.xes.importer
event_log = xes_importer.apply(f'./data/{LOG_TYPE}.xes')

In [None]:
#!g1.1
from logmentations.datasets import filter_log

# Filter the raw log with the project helper. LOG_TYPE is passed along, so the
# rules presumably depend on the dataset -- see filter_log's implementation
# for what is actually removed (not visible in this notebook).
event_log_filtered = filter_log(event_log, LOG_TYPE)

In [None]:
#!g1.1
# Activity vocabulary. Ids 0-2 are reserved for the special tokens; every
# activity name found in the log gets the next free id in order of first
# appearance.
SPECIAL_TOKENS = ('<PAD>', '<BOS>', '<EOS>')
act2id = {token: idx for idx, token in enumerate(SPECIAL_TOKENS)}
id2act = {idx: token for token, idx in act2id.items()}

# Occurrence count of every activity id over the whole filtered log.
freqs = {}

current_id = len(act2id)
for trace in event_log_filtered:
    for event in trace:
        name = event['concept:name']
        if name not in act2id:
            act2id[name] = current_id
            id2act[current_id] = name
            current_id += 1

        act_id = act2id[name]
        freqs[act_id] = freqs.get(act_id, 0) + 1

# Inverse-frequency weights: the rarer an activity, the larger its weight.
events_cnt = sum(freqs.values())
weights = {}
for act_id, cnt in freqs.items():
    weights[act_id] = events_cnt / (2 * cnt)
print(weights)

In [None]:
#!g1.1
from logmentations.utils import time_aware_data_split

# Fractions handed to the split helper, in the order train / validation / test.
SPLIT_FRACTIONS = (0.7, 0.1, 0.2)

train_log, val_log, test_log = time_aware_data_split(
    event_log_filtered, SPLIT_FRACTIONS
)

In [None]:
#!g1.1
from logmentations.datasets import LogsDataset

# Largest gap (in seconds) between consecutive events of each trace.
# Traces with fewer than two events have no gaps at all: np.diff would return
# an empty array for them and np.max would raise ValueError, so they are skipped.
# NOTE(review): the statistic is computed on the full filtered log, i.e. it
# also sees validation/test traces -- confirm this is intended.
max_gaps = [
    np.max(np.diff([event['time:timestamp'].timestamp() for event in trace]))
    for trace in event_log_filtered
    if len(trace) > 1
]
if not max_gaps:
    raise ValueError("Cannot compute the time normalizer: no trace has two or more events")

# Time scale used for normalization: 90th percentile of the per-trace maximum gaps.
normalizer_value = np.percentile(max_gaps, q=90)


def time_scaling(time: float) -> float:
    """Convert a duration in seconds to normalized units.

    Args:
        time: duration in seconds.

    Returns:
        The duration divided by ``normalizer_value``.
    """
    return time / normalizer_value


def invert_scaling(scaled_time: float) -> float:
    """Convert a normalized duration back to seconds (inverse of ``time_scaling``).

    Args:
        scaled_time: duration in normalized units.

    Returns:
        The duration multiplied by ``normalizer_value``.
    """
    return scaled_time * normalizer_value


train_ds = LogsDataset(train_log, act2id, time_applyer=time_scaling)
val_ds = LogsDataset(val_log, act2id, time_applyer=time_scaling)
test_ds = LogsDataset(test_log, act2id, time_applyer=time_scaling)

# The value is a 90th percentile of maximum gaps, not a median duration.
print(f'Time normalizer (90th percentile of max inter-event gap, seconds): {normalizer_value}')

In [None]:
#!g1.1
from logmentations.utils import generation_collate_fn
from logmentations.datasets import LengthAwareSampler

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 64
NUM_WORKERS = 4

# Options shared by the train, validation and test loaders.
loader_kwargs = dict(
    collate_fn=generation_collate_fn,
    pin_memory=True,
    num_workers=NUM_WORKERS,
)

# Training batches come from the project's LengthAwareSampler
# (presumably batches traces of similar length -- see its implementation).
train_batch_sampler = LengthAwareSampler(
    data_len=len(train_ds),
    batch_size=BATCH_SIZE,
    group_size=BATCH_SIZE * 16,
)
train_loader = torch.utils.data.DataLoader(
    dataset=train_ds, batch_sampler=train_batch_sampler, **loader_kwargs
)

# Validation and test data are read in dataset order with fixed-size batches.
val_loader, test_loader = (
    torch.utils.data.DataLoader(
        dataset=eval_ds, batch_size=BATCH_SIZE, shuffle=False, **loader_kwargs
    )
    for eval_ds in (val_ds, test_ds)
)

# Model training

In [None]:
#!g1.1
import typing as tp
from logmentations.models import LogVAE, LogAE
from logmentations.training import BaseConfig, train_generative_epoch, eval_generative_model
from logmentations.utils import kld_weight_annealing

# Per-class loss weights: the first three entries (reserved token ids 0-2)
# get the mean activity weight, the remaining entries follow ascending activity id.
avg_weight = np.mean(list(weights.values()))
activity_weights = [weights[act_id] for act_id in sorted(weights)]
loss_weights = torch.tensor(
    [avg_weight] * 3 + activity_weights, device=DEVICE
).float()

model = LogVAE(
    n_features=27,
    latent_dim=256,
    num_classes=26,
    emb_dim=64,
    hid_dim=128,
    num_layers=3,
    bidirectional=True,
)
model = model.to(DEVICE)

N_EPOCHS = 50
SAVE_PERIOD = 10

# Adam with a cosine learning-rate schedule spanning the whole training run.
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=N_EPOCHS)

# Settings handed to the train/eval helpers. kld_weight starts at zero and
# is updated by the training loop.
CONFIG = BaseConfig(dict(
    n_epochs=N_EPOCHS,
    save_period=SAVE_PERIOD,
    optimizer=optimizer,
    scheduler=scheduler,
    grad_clip_value=5.,
    act_weight=1.,
    time_weight=0.8,
    kld_weight=0.,
    device=DEVICE,
    loss_weights=loss_weights,
))

In [None]:
#!g1.1
# `import tqdm` alone does not load the `tqdm.notebook` submodule, so import it
# explicitly instead of relying on another library having loaded it.
import tqdm.notebook

run = wandb.init(
    project="GenModels4PBPM-Generation",
    entity="serp404",
    tags=["generation", "VAE", LOG_TYPE]
)

# Always close the W&B run, even when training fails or is interrupted.
try:
    # Checkpoints for this run go to ./checkpoints/<run name>; makedirs also
    # creates the parent folder and tolerates an already existing directory.
    save_path = os.path.join("./checkpoints", run.name)
    os.makedirs(save_path, exist_ok=True)

    best_cer = None
    for epoch in tqdm.notebook.tqdm(range(N_EPOCHS), "Training"):
        # Train step
        train_loss, train_act, train_times, train_kld, grad_norm = train_generative_epoch(
            model, train_loader, CONFIG
        )

        scheduler.step()
        # Updated after the train step, so the new KLD weight is first used
        # for training in the following epoch.
        CONFIG.kld_weight = kld_weight_annealing(epoch, N_EPOCHS, max_value=0.0001)

        # Validation step
        val_min_accuracy, val_cer, val_mae, val_length_mae = eval_generative_model(
            model, val_loader, CONFIG
        )

        metrics = {
            "epoch": epoch,
            "train_loss": train_loss,
            "train_act_loss": train_act,
            "train_time_loss": train_times,
            "grad_norm": grad_norm,
            "val_min_accuracy": val_min_accuracy,
            "val_cer": val_cer,
            "val_mae": val_mae,
            "val_length_mae": val_length_mae,
            "lr": optimizer.param_groups[0]['lr']
        }

        # The KLD term is only logged for the variational model.
        if isinstance(model, LogVAE):
            metrics["train_kld_loss"] = train_kld
        wandb.log(metrics)

        # Keep the checkpoint with the lowest validation CER seen so far.
        if best_cer is None or val_cer < best_cer:
            torch.save(model.state_dict(), os.path.join(save_path, "model_best.pth"))
            best_cer = val_cer

        # Periodic snapshot (epochs 0, SAVE_PERIOD, 2 * SAVE_PERIOD, ...).
        if epoch % SAVE_PERIOD == 0:
            torch.save(model.state_dict(), os.path.join(save_path, f"model_e{epoch}.pth"))
finally:
    run.finish()

## Model evaluation

In [None]:
#!g1.1
# The architecture must match the model that produced the checkpoint in the
# training cell (n_features=27, num_classes=26); with other sizes
# load_state_dict fails on a parameter shape mismatch.
model_best = LogVAE(
    n_features=27, latent_dim=256, num_classes=26,
    emb_dim=64, hid_dim=128, num_layers=3, bidirectional=True
).to(DEVICE)

model_best.load_state_dict(torch.load(os.path.join(save_path, 'model_best.pth'), map_location=DEVICE))

# Test step: evaluate N_RUNS times on the test set and report the mean of each metric.
accuracies = []
cers = []
maes = []
length_maes = []

N_RUNS = 20
for _ in tqdm.tqdm(range(N_RUNS)):
    min_accuracy, cer, mae, length_mae = eval_generative_model(
        model_best, test_loader, CONFIG
    )

    accuracies.append(min_accuracy)
    cers.append(cer)
    maes.append(mae)
    length_maes.append(length_mae)

print(f'Min-accuracy: {np.mean(accuracies)}')
print(f'CER: {np.mean(cers)}')
print(f'MAE: {np.mean(maes)}')
# invert_scaling maps the normalized MAE back to seconds; 3600 * 24 seconds make a day.
print(f'Inverted MAE (days): {invert_scaling(np.mean(maes)) / 3600 / 24}')
print(f'Length MAE: {np.mean(length_maes)}')

In [None]:
#!g1.1