In [2]:
import pandas as pd
import os
from os.path import dirname

# The project root is the parent of the directory the notebook is run from.
root_path = dirname(os.getcwd())

# Never truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Dataset folders, all relative to the project root (trailing slash included
# because file names are appended by plain string concatenation below).
data_dir = f"{root_path}/data/datasets/original/"
data_dir_processed = f"{root_path}/data/datasets/processed/"
data_dir_graphs = f"{root_path}/data/datasets/graphs/"

print(root_path, data_dir, data_dir_processed, data_dir_graphs, sep="\n")

/home/sebdis/HGNN/HGNN_NA
/home/sebdis/HGNN/HGNN_NA/data/datasets/original/
/home/sebdis/HGNN/HGNN_NA/data/datasets/processed/
/home/sebdis/HGNN/HGNN_NA/data/datasets/graphs/


In [3]:
# Name of the event log; also used as the prefix of every file written later.
dataset = "BPI_Challenge_2012_A"
filename = f"{data_dir}{dataset}.csv"

# Load the CSV and discard the "Unnamed: 0" column it contains.
raw_data = pd.read_csv(filename, index_col=False).drop(columns="Unnamed: 0")

In [4]:
# Preview the first rows of the loaded data.
raw_data.head()

Unnamed: 0,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ
0,112,COMPLETE,A_SUBMITTED,2011-09-30 22:38:44.546000+00:00,2011-10-01 00:38:44.546000+02:00,173688,20000
1,112,COMPLETE,A_PARTLYSUBMITTED,2011-09-30 22:38:44.880000+00:00,2011-10-01 00:38:44.546000+02:00,173688,20000
2,112,COMPLETE,A_PREACCEPTED,2011-09-30 22:39:37.906000+00:00,2011-10-01 00:38:44.546000+02:00,173688,20000
3,10862,COMPLETE,A_ACCEPTED,2011-10-01 09:42:43.308000+00:00,2011-10-01 00:38:44.546000+02:00,173688,20000
4,10862,COMPLETE,A_FINALIZED,2011-10-01 09:45:09.243000+00:00,2011-10-01 00:38:44.546000+02:00,173688,20000


In [5]:
# Shorter column names for the case identifier and the activity label;
# all other columns keep their original names.
column_renames = {"case:concept:name": "CaseID", "concept:name": "Activity"}
tab_all = raw_data.rename(columns=column_renames)

In [6]:
# Normalise both date columns to "YYYY/MM/DD HH:MM:SS" strings:
#   1. cut everything from the first "." on (fractional seconds),
#   2. turn "-" date separators into "/",
#   3. cut everything from the first "+" on (UTC offset, if still present).
# NOTE: the UTC offset is discarded, not applied, so values that were written
# with different offsets are no longer directly comparable.
for time_col in ("time:timestamp", "case:REG_DATE"):
    tab_all[time_col] = [stamp.split(".")[0] for stamp in tab_all[time_col]]
    slashed = tab_all[time_col].str.replace("-", "/")
    tab_all[time_col] = slashed.str.split("+", expand=True)[0]

In [6]:
# tab_all.to_csv(data_dir_processed + f"{dataset}_processed_all.csv", index=False)

In [7]:
# Fraction of cases (ordered by their first timestamp) kept for train + validation;
# the remaining cases form the test set.
split_ratio = 2 / 3

# One row per case (first value of every column), ordered by the timestamp
# string, with duplicate (CaseID, Activity) pairs removed.
first_act_tab = (
    tab_all.groupby("CaseID")
    .first()
    .sort_values("time:timestamp")
    .reset_index()
    .drop_duplicates(subset=["CaseID", "Activity"], keep="first")
    .reset_index(drop=True)
)

# Position in the ordered case table where train + validation ends and test begins.
n_train_valid_rows = int(split_ratio * len(first_act_tab))
ordered_case_ids = first_act_tab["CaseID"]

list_train_valid_cases = list(ordered_case_ids.iloc[:n_train_valid_rows].unique())
list_test_cases = list(ordered_case_ids.iloc[n_train_valid_rows:].unique())

# The first 80% of the train + validation cases are used for training,
# the remaining 20% for validation.
n_train_cases = int(len(list_train_valid_cases) * 0.8)
list_train_cases = list_train_valid_cases[:n_train_cases]
list_valid_cases = list_train_valid_cases[n_train_cases:]

# Event tables of each split (export of the splits to CSV is currently disabled).
tab_train = tab_all[tab_all["CaseID"].isin(list_train_cases)].reset_index(drop=True)
tab_valid = tab_all[tab_all["CaseID"].isin(list_valid_cases)].reset_index(drop=True)
tab_test = tab_all[tab_all["CaseID"].isin(list_test_cases)].reset_index(drop=True)

### Prepare the graphs

In [8]:
from utils import get_case_ids
import utils
from torch import tensor, max, int64, float32
from torch_geometric.data import HeteroData

In [39]:
from pprint import pprint as print

from torch import stack


def build_prefixes_graph_from_trace(dataset, trace):
    """Build one heterogeneous graph and one label triple per prefix of a trace.

    For every prefix length ``p`` in ``range(2, len(trace))`` a ``HeteroData``
    graph is created from the first ``p`` entries of each node-feature
    collection, and the label is read at position ``p`` (the entry that
    follows the prefix). Traces with fewer than three rows yield no samples.

    Args:
        dataset: Forwarded unchanged to ``utils.get_node_features``.
        trace: The trace to expand; forwarded to ``utils.get_node_features``
            and used here only for its length.

    Returns:
        A tuple ``(X, Y)`` of two equally long lists: ``X`` holds the
        ``HeteroData`` graphs and ``Y`` holds, for each graph, the list
        ``[activity, timestamp, resource]`` taken from the node features.
    """
    graphs = []
    labels = []  # next activity, timestamp and resource for each graph

    node_features = utils.get_node_features(dataset, trace)
    # Edge types starting at these node types have their attributes combined
    # with ``stack``; every other edge type goes through ``tensor``.
    stacked_sources = ("Activity", "org:resource", "time:timestamp")

    for prefix_len in range(2, len(trace)):
        graph = HeteroData()

        # Node stores: keep only the first ``prefix_len`` entries per node type.
        for node_type in node_features:
            graph[node_type].x = node_features[node_type][:prefix_len]

        edges_indexes = utils.compute_edges_indexs(node_features, prefix_len)
        # Must be computed before the index pairs are transposed below.
        edge_features = utils.compute_edges_features(node_features, edges_indexes)

        # Edge attributes are assigned first, for all edge types that have them.
        for edge_type in edge_features:
            raw_attr = edge_features[edge_type]
            if edge_type[0] in stacked_sources:
                graph[edge_type].edge_attr = stack(raw_attr)
            else:
                graph[edge_type].edge_attr = tensor(raw_attr, dtype=float32)

        # Turn each list of (source, target) entries into the two-row layout
        # [[sources...], [targets...]] and store it as the edge index.
        for edge_type in edges_indexes:
            pairs = edges_indexes[edge_type]
            edges_indexes[edge_type] = [
                [pair[0] for pair in pairs],
                [pair[1] for pair in pairs],
            ]
            graph[edge_type].edge_index = tensor(edges_indexes[edge_type], dtype=int64)

        graphs.append(graph)

        # TODO add get label function to make things more modular
        activity_label = node_features["Activity"][prefix_len]
        timestamp_label = node_features["time:timestamp"][prefix_len]
        resources = node_features["org:resource"]
        # With a single resource entry that entry is the label for every prefix.
        if len(resources) == 1:
            resource_label = resources[0]
        else:
            resource_label = resources[prefix_len]
        labels.append([activity_label, timestamp_label, resource_label])

    return (graphs, labels)

## Create the datasets

In [10]:
# Case identifiers of each split, in train / validation / test order.
case_train_ids, case_valid_ids, case_test_ids = (
    get_case_ids(split_table) for split_table in (tab_train, tab_valid, tab_test)
)

In [11]:
# Number of cases in the train, validation and test splits (in that order).
for split_case_ids in (case_train_ids, case_valid_ids, case_test_ids):
    print(len(split_case_ids))

6979
1745
4363


In [40]:
from tqdm.notebook import tqdm


print("Preparing training dataset...")

X_train = []
Y_train = []

for case_pos in tqdm(range(len(case_train_ids))):
    case_id = case_train_ids[case_pos]
    # Rows of this case only, renumbered from 0 and without the CaseID column.
    trace = (
        tab_train.query(f"CaseID == {case_id}")
        .reset_index()
        .drop(columns=["index", "CaseID"])
    )
    graphs, labels = build_prefixes_graph_from_trace(dataset=tab_all, trace=trace)

    # One (graph, label) sample per prefix; labels stay a plain Python list.
    X_train.extend(graphs)
    Y_train.extend(labels)

print("Done!\n\n")

'Preparing training dataset...'


  0%|          | 0/6979 [00:00<?, ?it/s]

'Done!\n\n'


In [41]:
# Display the first training graph (node and edge stores with their shapes).
X_train[0]

HeteroData(
  org:resource={ x=[2, 1] },
  lifecycle:transition={ x=[1, 1] },
  Activity={ x=[2, 10] },
  time:timestamp={ x=[2, 1] },
  case:REG_DATE={ x=[1, 1] },
  case:AMOUNT_REQ={ x=[1, 1] },
  (org:resource, related_to, org:resource)={
    edge_attr=[1, 2],
    edge_index=[2, 1],
  },
  (Activity, followed_by, Activity)={
    edge_attr=[1, 2],
    edge_index=[2, 1],
  },
  (time:timestamp, related_to, time:timestamp)={
    edge_attr=[1, 2],
    edge_index=[2, 1],
  },
  (Activity, related_to, org:resource)={ edge_index=[2, 2] },
  (Activity, related_to, lifecycle:transition)={ edge_index=[2, 2] },
  (Activity, related_to, time:timestamp)={ edge_index=[2, 2] },
  (Activity, related_to, case:REG_DATE)={ edge_index=[2, 2] },
  (Activity, related_to, case:AMOUNT_REQ)={ edge_index=[2, 2] }
)

In [21]:
# Node features stored under 'time:timestamp' in the second training graph.
X_train[1].x_dict['time:timestamp']

tensor([[1.3174e+09],
        [1.3174e+09],
        [1.3174e+09]])

In [42]:
# List every edge type of the second training graph together with its stored tensors.
X_train[1].edge_items()

[(('org:resource', 'related_to', 'org:resource'),
  {'edge_attr': tensor([[0., 1.],
          [0., 2.],
          [0., 1.]]), 'edge_index': tensor([[0, 0, 1],
          [1, 2, 2]])}),
 (('Activity', 'followed_by', 'Activity'),
  {'edge_attr': tensor([[0.2000, 1.0000],
          [0.2000, 2.0000],
          [0.2000, 1.0000]]), 'edge_index': tensor([[0, 0, 1],
          [1, 2, 2]])}),
 (('time:timestamp', 'related_to', 'time:timestamp'),
  {'edge_attr': tensor([[0., 1.],
          [0., 2.],
          [0., 1.]]), 'edge_index': tensor([[0, 0, 1],
          [1, 2, 2]])}),
 (('Activity', 'related_to', 'org:resource'),
  {'edge_index': tensor([[0, 1, 2],
          [0, 1, 2]])}),
 (('Activity', 'related_to', 'lifecycle:transition'),
  {'edge_index': tensor([[0, 1, 2],
          [0, 0, 0]])}),
 (('Activity', 'related_to', 'time:timestamp'),
  {'edge_index': tensor([[0, 1, 2],
          [0, 1, 2]])}),
 (('Activity', 'related_to', 'case:REG_DATE'),
  {'edge_index': tensor([[0, 1, 2],
          [0,

In [43]:
print("Preparing validation dataset...")

X_valid = []
Y_valid = []

for case_pos in tqdm(range(len(case_valid_ids))):
    case_id = case_valid_ids[case_pos]
    # Rows of this case only, renumbered from 0 and without the CaseID column.
    trace = (
        tab_valid.query(f"CaseID == {case_id}")
        .reset_index()
        .drop(columns=["index", "CaseID"])
    )
    graphs, labels = build_prefixes_graph_from_trace(dataset=tab_all, trace=trace)

    # One (graph, label) sample per prefix; labels stay a plain Python list.
    X_valid.extend(graphs)
    Y_valid.extend(labels)

print("Done!\n\n")

'Preparing validation dataset...'


  0%|          | 0/1745 [00:00<?, ?it/s]

'Done!\n\n'


In [None]:
print("Preparing test dataset...")

X_test = []
Y_test = []

for case_pos in tqdm(range(len(case_test_ids))):
    case_id = case_test_ids[case_pos]
    # Rows of this case only, renumbered from 0 and without the CaseID column.
    trace = (
        tab_test.query(f"CaseID == {case_id}")
        .reset_index()
        .drop(columns=["index", "CaseID"])
    )
    graphs, labels = build_prefixes_graph_from_trace(dataset=tab_all, trace=trace)

    # One (graph, label) sample per prefix; labels stay a plain Python list.
    X_test.extend(graphs)
    Y_test.extend(labels)

print("Done!\n\n")

'Preparing test dataset...'


  0%|          | 0/4363 [00:00<?, ?it/s]

### Save the graph datasets

In [14]:
import pickle

# Each split is written to its own pickle file as a two-element list [graphs, labels].
split_payloads = (
    ("_TRAIN_event_prediction.pkl", [X_train, Y_train]),
    ("_VALID_event_prediction.pkl", [X_valid, Y_valid]),
    ("_TEST_event_prediction.pkl", [X_test, Y_test]),
)
for file_suffix, payload in split_payloads:
    with open(data_dir_graphs + dataset + file_suffix, "wb") as f:
        pickle.dump(payload, f)