## A use case of conditioned generative modeling for business process data

In [31]:
import os
import pm4py
import torch 
import pandas as pd
import torch.nn.functional as F
from torch import nn
from argparse import Namespace
from cosmo.models import MTCondDG
from cosmo.utils import ensure_dir, get_runs, load_checkpoint, get_vocabs, read_data
from cosmo.meld import prepare_log, vectorize_log
from simulation import best_to_dict

# Notebook-wide configuration: which dataset / model / condition to demo.
params = Namespace(
    dataset="bpi17",
    model="MTCondDG",
    condition="resource_usage",
    batch_size=1,
)

In [32]:
def _prepare_model_input(sim_trace, resource_is_cat):
    na_prefix = torch.tensor(sim_trace["activity"][-5:], dtype=torch.long).unsqueeze(0)
    res_type = torch.long if resource_is_cat else torch.float
    res_prefix = torch.tensor(sim_trace["resource"][-5:], dtype=res_type).unsqueeze(0)
    rt_prefix = torch.tensor(
        sim_trace["remaining_time"][-5:], dtype=torch.float
    ).unsqueeze(0)

    x = torch.cat((na_prefix, res_prefix, rt_prefix))
    x = x.transpose(0, 1).unsqueeze(0)
    return x


def _simulate_trace(
    model, sim_trace, cond, max_len, device="cuda", states=None, generator=None
):
    """Autoregressively extend ``sim_trace`` one event at a time.

    At every step the last events of the trace are fed to ``model`` together
    with the condition; the predicted next activity, resource and remaining
    time are appended to the corresponding lists of ``sim_trace`` (which is
    mutated in place and also returned).

    Args:
        model: callable invoked as ``model([x, cond], states)`` and returning
            ``(ac, res, rt, states)``. It must expose ``vocabs``, with
            ``vocabs["activity"]["stoi"]["<eos>"]`` defined.
        sim_trace: dict with ``activity``, ``resource`` and ``remaining_time``
            lists holding the prefix to continue.
        cond: condition tensor passed to the model unchanged (apart from the
            device move).
        max_len: maximum number of events to generate.
        device: device the inputs are moved to before calling the model.
        states: initial recurrent state handed to the model on the first step.
        generator: optional ``torch.Generator`` used to sample the next
            activity. When omitted, a fresh CPU generator seeded with 13 is
            created on every call, so repeated calls with the same inputs
            sample identically; pass a shared generator to get different
            traces across calls.

    Returns:
        The same ``sim_trace`` dict, extended with the generated events.
    """
    if generator is None:
        generator = torch.Generator().manual_seed(13)  # for reproducibility

    # Loop invariants: resource kind, end-of-sequence index, condition on device.
    resource_is_cat = "resource" in model.vocabs
    eos_ix = model.vocabs["activity"]["stoi"]["<eos>"]
    cond = cond.to(device)

    # Pure inference: without no_grad the autograd graph would be kept alive
    # through `states` and grow with every generated event.
    with torch.no_grad():
        for _ in range(max_len):  # max trace len
            x = _prepare_model_input(sim_trace, resource_is_cat)
            ac, res, rt, states = model([x.to(device), cond], states)

            # next activity: sampled from the predicted distribution
            probs = F.softmax(ac, dim=1)
            ix = torch.multinomial(
                probs.cpu(), num_samples=1, generator=generator
            ).item()
            sim_trace["activity"].append(ix)

            # next resource: most likely class if categorical, raw value otherwise
            if resource_is_cat:
                res_ix = torch.argmax(torch.softmax(res.cpu(), dim=1), dim=1).item()
                sim_trace["resource"].append(res_ix)
            else:
                sim_trace["resource"].append(res.item())

            # next remaining time
            next_rt = rt.item()
            sim_trace["remaining_time"].append(next_rt)

            # stop on <eos> or once no remaining time is predicted
            if ix == eos_ix or next_rt <= 0:
                break
    return sim_trace


def simulate_from_scratch(
    model, n_traces=100, max_len=100, conditions=(0, 1), device="cuda"
):
    """Generate traces from an all-zero prefix for each requested condition.

    Args:
        model: model forwarded to ``_simulate_trace``; it must expose
            ``vocabs["activity"]["stoi"]["<pad>"]``.
        n_traces: number of traces generated per condition.
        max_len: maximum number of generated events per trace.
        conditions: iterable of condition values; each is wrapped in a
            one-element tensor and passed to the model.
        device: device on which the model is evaluated. It is forwarded to
            ``_simulate_trace`` (previously it was accepted but ignored).

    Returns:
        A DataFrame with columns ``activity``, ``resource``,
        ``remaining_time``, ``condition`` and ``case_id``. Rows whose activity
        equals the ``<pad>`` index are removed. Each trace keeps its own
        0-based row index, so surviving rows normally start at index 5.
        An empty frame with these columns is returned when nothing was
        simulated.
    """
    columns = ["activity", "resource", "remaining_time", "condition", "case_id"]
    prefix_len = 5  # matches the five-event window read by _prepare_model_input

    frames = []
    case_id = 0
    for c in conditions:
        cond = torch.tensor([c])
        for _ in range(n_traces):
            # NOTE(review): the zero prefix presumably stands for <pad> (index 0)
            # in both vocabularies -- confirm against get_vocabs.
            sim_trace = dict(
                activity=[0] * prefix_len,
                resource=[0] * prefix_len,
                remaining_time=[0] * prefix_len,
            )
            sim_trace = _simulate_trace(
                model, sim_trace, cond, max_len, device=device
            )
            _sim = pd.DataFrame(sim_trace)
            _sim["condition"] = c
            _sim["case_id"] = case_id
            frames.append(_sim)
            case_id += 1

    if not frames:
        return pd.DataFrame(columns=columns)

    # Concatenate once instead of growing a DataFrame inside the loop.
    simulations = pd.concat(frames)

    # removing <pad> tokens
    pad_ix = model.vocabs["activity"]["stoi"]["<pad>"]
    return simulations[simulations.activity != pad_ix]

### Event log
- A dataset containing events
- An event can be represented as a tuple $e_j=(a_j, t_j, z_j)$, where: 
    - $a_j \in A$ is an activity label 
    - $t_j \in T$ is a timestamp denoting that the event $e_j$ was executed at time $t_j$
    - and $z_j \in Z^n$ might be any attribute (e.g. resource to execute the activity, cost of executing the activity, etc.).
- A **sequence of events** composes a case $c_i \in C$, a.k.a. process instance
- Thus, an event log $E$ is composed of a set of cases $C$.

In [49]:
# Read the event log of the configured dataset and mine a Petri net from it.
log = read_data(os.path.join("data", params.dataset, "log.csv"))
pm = pm4py.discover_petri_net_inductive(
    log,
    activity_key="activity",
    timestamp_key="time",
    case_id_key="case_id",
)

log_columns = ["case_id", "activity", "resource", "time"]

# First / last activity observed in every case.
activities_by_case = log.groupby("case_id").activity
initial_activities = activities_by_case.nth(0).unique()
final_activities = activities_by_case.nth(-1).unique()

# Among the two most frequent resources, keep the one with the lower count.
resource_counts = log["resource"].value_counts()
most_freq_res = resource_counts.nlargest(2).idxmin()

log[log_columns].head(10)

Unnamed: 0,case_id,activity,resource,time
0,Application_652823628,A_Create Application,User_1,2016-01-01 09:51:15.304
1,Application_652823628,A_Submitted,User_1,2016-01-01 09:51:15.352
2,Application_652823628,W_Handle leads,User_1,2016-01-01 09:51:15.774
3,Application_652823628,W_Handle leads,User_1,2016-01-01 09:52:36.392
4,Application_652823628,W_Complete application,User_1,2016-01-01 09:52:36.403
5,Application_652823628,A_Concept,User_1,2016-01-01 09:52:36.413
6,Application_652823628,W_Complete application,User_17,2016-01-02 10:45:22.429
7,Application_652823628,W_Complete application,User_17,2016-01-02 10:49:28.816
8,Application_652823628,A_Accepted,User_52,2016-01-02 11:23:04.299
9,Application_652823628,O_Create Offer,User_52,2016-01-02 11:29:03.994


In [None]:
# `pm4py` is already imported in the first cell, so no extra import is needed.
# `pm` is the 3-tuple returned by discover_petri_net_inductive.
net, initial_marking, final_marking = pm
pm4py.view_petri_net(net, initial_marking, final_marking)

### Generative models for (business) event data
- Unlike text, events are multidimensional
    - Text: "Artificial intelligence can generate value for companies because..."
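    - Events: $[(e_1, e_2, e_3), (e_4, e_5, e_6, e_7, e_8), (e_9, e_{10})]$, where each $e_j$ carries several attributes at once (an activity, a timestamp, and possibly further attributes such as the resource)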

### Preparing data and loading model checkpoint

In [43]:
# Look up the best run for this dataset/condition and rebind `params` to it.
# From here on `params` is the namespace built from best_to_dict's result,
# not the one defined in the first cell.
bpm_results = pd.read_csv(os.path.join("results", "best_runs.csv"))
params = best_to_dict(bpm_results, params.dataset, params.condition)
params = Namespace(**params)

# NOTE(review): this cell mutates `log` in place and then rebinds it. The
# `resource_usage` column is dropped below, so re-running this cell without
# first re-running the cell that loads `log` will presumably fail on the
# `log[params.condition]` lookup when the condition is "resource_usage".
# Uncommenting the next line reloads the raw log at the cost of another read.
# log = read_data(os.path.join("data", params.dataset, "log.csv"))
log["target"] = log[params.condition]  # copy the conditioning column to "target"
log.drop(["trace_time", "resource_usage", "variant"], axis=1, inplace=True)
log = prepare_log(log)
vocabs = get_vocabs(log=log)
# Inverse lookups (index -> label) used later to decode simulated traces.
itos = {value: key for key, value in vocabs["activity"]["stoi"].items()}
ritos = {value: key for key, value in vocabs["resource"]["stoi"].items()}
# Replace every vocabulary-backed column by its integer index.
for f in vocabs:
    log.loc[:, f] = log.loc[:, f].transform(lambda x: vocabs[f]["stoi"][x])

# Build the model and restore the weights stored under the "net" key of the
# checkpoint of the selected run.
model = MTCondDG(vocabs=vocabs, batch_size=params.batch_size)
checkpoint = load_checkpoint(
    ckpt_dir_or_file=f"models/{params.dataset}/{params.condition}/{params.run_name}/best_model.ckpt"
)
model.load_state_dict(checkpoint["net"])
model.cuda()  # requires a CUDA device
# Switch to evaluation mode; as the last expression of the cell, the returned
# value is what the notebook displays.
model.eval()

 [*] Loading checkpoint from models/bpi17/resource_usage/usual-dust-45/best_model.ckpt succeed!


MTCondDG(
  (embeddings): ModuleDict(
    (activity): Embedding(28, 5)
    (resource): Embedding(151, 12)
  )
  (lstm): LSTM(18, 256, batch_first=True)
  (activity_block): SpecializedBlock(
    (lstm): LSTM(256, 256, batch_first=True)
    (linear): Linear(in_features=1281, out_features=28, bias=True)
  )
  (res_block): SpecializedBlock(
    (lstm): LSTM(256, 256, batch_first=True)
    (linear): Linear(in_features=1281, out_features=151, bias=True)
  )
  (time_block): SpecializedBlock(
    (lstm): LSTM(256, 256, batch_first=True)
    (linear): Linear(in_features=1281, out_features=1, bias=True)
  )
)

### Simulation from scratch based on constraints
- resource usage: 
    - Not using *User_3* resource
    - Using *User_3* resource


Simulation conditioned on not using the resource:

In [None]:
# Draw one trace per condition, then decode the integer ids back to labels.
simulated = simulate_from_scratch(model, n_traces=1)
simulated["activity"] = simulated["activity"].map(lambda idx: itos[idx])
simulated["resource"] = simulated["resource"].map(lambda idx: ritos[idx])

# Blank out every cell equal to "<eos>" (it becomes NaN) and drop those rows.
eos_masked = simulated.where(simulated != "<eos>")
simulated = eos_masked.dropna()

# Human-readable condition labels.
using_label = f"Using {most_freq_res}"
simulated["condition"] = simulated["condition"].map(
    lambda flag: using_label if flag == 1 else "Not using"
)
simulated[simulated["condition"] == "Not using"]

In [55]:
# `simulated` was generated and decoded in the previous cell. Calling
# simulate_from_scratch again here would overwrite it with raw integer ids and
# 0/1 conditions, and the string filter below would then match no rows.
simulated[simulated.condition == "Not using"]

Unnamed: 0,activity,resource,remaining_time,condition,case_id
5,A_Create Application,User_1,14.686987,Not using,0
6,A_Create Application,User_1,13.855551,Not using,0
7,W_Handle leads,User_1,10.835886,Not using,0
8,W_Complete application,User_1,10.178379,Not using,0
9,A_Concept,User_1,11.03873,Not using,0
10,A_Accepted,User_1,10.270922,Not using,0
11,O_Refused,User_1,10.043033,Not using,0
12,O_Refused,User_1,9.530025,Not using,0
13,W_Validate application,User_1,9.451303,Not using,0
14,W_Validate application,User_1,7.920919,Not using,0


Simulation conditioned on using the desired resource:

In [56]:
# Reuse the decoded `simulated` frame instead of re-simulating (a fresh call
# returns raw 0/1 conditions, which never equal the string label). The label is
# built from `most_freq_res`, exactly as in the decoding cell, rather than
# hard-coding the resource name.
simulated[simulated.condition == f"Using {most_freq_res}"]

Unnamed: 0,activity,resource,remaining_time,condition,case_id
5,A_Create Application,User_1,14.741652,Using User_3,1
6,A_Create Application,User_1,13.924622,Using User_3,1
7,W_Handle leads,User_1,10.915035,Using User_3,1
8,W_Complete application,User_1,10.267927,Using User_3,1
9,A_Concept,User_1,11.16165,Using User_3,1
10,W_Complete application,User_1,10.428379,Using User_3,1
11,W_Complete application,User_1,10.103631,Using User_3,1
12,W_Complete application,User_3,9.141265,Using User_3,1
13,O_Cancelled,User_113,8.895084,Using User_3,1
14,A_Cancelled,User_113,8.196096,Using User_3,1
