In [None]:
import pandas as pd
import os
import json
import numpy as np
from os.path import dirname

# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# The project root is taken to be the parent of the current working directory.
root_path = os.path.dirname(os.getcwd())

# Dataset folders, all derived from the project root (each keeps its trailing "/").
data_dir = f"{root_path}/data/datasets/original/"
data_dir_processed = f"{root_path}/data/datasets/processed/"
data_dir_graphs = f"{root_path}/data/datasets/graphs_repair/"

# Echo the resolved locations, one per line.
for resolved_location in (root_path, data_dir, data_dir_processed, data_dir_graphs):
    print(resolved_location)

In [None]:
# Parse the per-dataset feature description (JSON file in the working directory).
with open("dataset_features.json", 'r') as features_file:
    datasets_info = json.loads(features_file.read())


In [None]:
# Names of the datasets described in the feature file.
[*datasets_info]

In [None]:
# Dataset to process: used as the key into dataset_features.json and as the
# file-name prefix of the processed CSVs and of the pickled graph files.
dataset = "BPI12_DECLINED_COMPLETE"

In [None]:
# Full processed table of the selected dataset (path is relative to the working directory).
# NOTE(review): data_dir_processed is defined above but not used here; confirm
# both locations are meant to point at the same folder.
all_rows_csv = f"datasets/processed/{dataset}_processed_all.csv"
tab_all = pd.read_csv(all_rows_csv)
tab_all.head()

In [None]:
# Load the three pre-split tables (train / validation / test) of the selected dataset.
tab_train, tab_valid, tab_test = (
    pd.read_csv(f"datasets/processed/{dataset}_processed_{split_name}.csv")
    for split_name in ("train", "valid", "test")
)

In [None]:

# Feature description of the selected dataset. The JSON file has already been
# parsed into `datasets_info`, so reuse it instead of reading the file again.
dataset_info = datasets_info[dataset]


In [None]:
# Display the feature description loaded for the selected dataset.
dataset_info

In [None]:
# Column names by feature kind, as declared in the dataset description.
categorical_columns, real_value_columns = (
    dataset_info[feature_kind] for feature_kind in ("categorical", "numerical")
)

In [None]:
# Cast every categorical column to the generic "object" dtype in all four tables.
for table in (tab_all, tab_train, tab_valid, tab_test):
    for column in categorical_columns:
        table[column] = table[column].astype("object")

### Prepare the graphs

In [None]:
import sklearn.preprocessing

from typing import List

In [None]:
def get_case_ids(tab):
    """Return the distinct values of the ``CaseID`` column as a list.

    Values are listed in the order produced by ``Series.unique()``, i.e. the
    order in which each case first appears in ``tab``.
    """
    distinct_ids = tab["CaseID"].unique()
    return [case_id for case_id in distinct_ids]

In [None]:
from torch import tensor, max, int64, float32
from torch_geometric.data import HeteroData

In [None]:
def get_one_hot_encoder(dataset: pd.DataFrame, key: str):
    """Fit a scikit-learn ``OneHotEncoder`` on the distinct values of one column.

    Args:
        dataset: Table whose column ``key`` provides the categories.
        key: Name of the column to encode.

    Returns:
        The fitted ``OneHotEncoder``.
    """
    categories = dataset[key].unique()
    # The encoder expects a 2-D input: one row per category, a single column.
    category_column = categories.reshape((len(categories), 1))
    encoder = sklearn.preprocessing.OneHotEncoder()
    return encoder.fit(category_column)

In [None]:
def get_one_hot_encodings(
    onehot, datas: "pd.Series | np.ndarray"
):
    """One-hot encode a 1-D sequence of categorical values.

    Args:
        onehot: Fitted encoder exposing ``transform``; the value it returns
            must provide ``toarray()``.
        datas: The values to encode. Both a pandas Series and a NumPy array
            are accepted; a Series has no ``reshape`` method, so the input is
            converted with ``np.asarray`` first.

    Returns:
        The dense array produced by ``onehot.transform(...).toarray()``, with
        one row per input value.
    """
    # transform() is given a single-column 2-D array: one row per value.
    column = np.asarray(datas).reshape(-1, 1)
    return onehot.transform(column).toarray()

In [None]:
def get_node_features(dataset: pd.DataFrame, trace: pd.DataFrame, cat_features, real_features) -> dict:
    """Turn the columns of one trace into float32 feature tensors.

    Args:
        dataset: Table used to fit the one-hot encoder of each categorical column.
        trace: Rows of a single case; every column is inspected.
        cat_features: Names of the columns to one-hot encode.
        real_features: Names of the columns to keep as numeric values.

    Returns:
        Dict mapping column name to a tensor created with ``requires_grad=True``.
        Numeric columns are reshaped to ``(len(trace), 1)``. Columns listed in
        neither collection are left out. A column listed in both ends up with
        its numeric tensor, because that assignment happens last.
    """
    features = {}

    for column in trace:
        column_values = trace[column].values

        if column in cat_features:
            # A fresh encoder is fitted on the whole dataset for every call.
            encoder = get_one_hot_encoder(dataset, column)
            try:
                encoded = get_one_hot_encodings(encoder, column_values)
                features[column] = tensor(encoded, dtype=float32, requires_grad=True)
            except ValueError:
                # Encoding failures are only reported; the column is then
                # absent from the result.
                print(column)
                print(column_values)

        if column in real_features:
            numeric = tensor(column_values, dtype=float32, requires_grad=True)
            features[column] = numeric.reshape(numeric.shape[0], 1)

    return features


In [None]:


def compute_edges_indexs(node_features: dict, prefix_len):
    """List the edges of a prefix graph as ``[source, target]`` index pairs.

    Args:
        node_features: Mapping from node-type name to its feature sequence;
            only the names and ``len()`` of the values are used.
        prefix_len: Number of positions to connect.

    Returns:
        Dict keyed by ``(source_type, relation, target_type)``:

        * every type whose feature length is not 1 gets a chain
          ``[i, i + 1]`` over consecutive positions, named ``"followed_by"``
          for ``"Activity"`` and ``"related_to"`` for any other type;
        * ``"Activity"`` is additionally linked by ``"related_to"`` to every
          other type: position ``i`` to position ``i``, or to position ``0``
          when that type has a single feature row.

        Types with exactly one feature row are never used as a source.
    """
    consecutive = [[pos, pos + 1] for pos in range(prefix_len - 1)]
    edges = {}

    for name, values in node_features.items():
        if len(values) == 1:
            continue
        if name != "Activity":
            edges[(name, "related_to", name)] = consecutive
            continue

        edges[(name, "followed_by", name)] = consecutive
        for other, other_values in node_features.items():
            if other == name:
                continue
            single_row = len(other_values) == 1
            edges[(name, "related_to", other)] = [
                [pos, 0 if single_row else pos] for pos in range(prefix_len)
            ]

    return edges

In [None]:



def build_prefixes_graph_from_trace(dataset, trace, cat_features, real_features, prefix_length):
    """Build the heterogeneous graph of the first ``prefix_length`` events of a trace.

    Args:
        dataset: Table passed on to ``get_node_features`` (used there to fit
            the one-hot encoders).
        trace: Rows of a single case.
        cat_features: Names of the categorical columns.
        real_features: Names of the numeric columns.
        prefix_length: Maximum number of events to keep.

    Returns:
        A one-element list holding the ``HeteroData`` graph. Each feature
        column becomes a node type whose ``x`` holds the kept rows, edges come
        from ``compute_edges_indexs``, and ``y`` is an empty dict.
    """
    node_features = get_node_features(dataset, trace, cat_features, real_features)

    # "case:label" is never stored as a node type, so it must not take part in
    # edge construction either: an edge type pointing at it would reference a
    # node type that does not exist in the graph.
    node_features = {
        name: feats for name, feats in node_features.items() if name != "case:label"
    }

    # A trace shorter than the requested prefix only provides len(trace) nodes.
    # Clamp the length so edge indices never point past the last stored node.
    n_events = min(prefix_length, len(trace))

    G = HeteroData()

    for name, feats in node_features.items():
        G[name].x = feats[:n_events]

    edges_indexes = compute_edges_indexs(node_features, n_events)

    for edge_type, pairs in edges_indexes.items():
        # Convert the list of [source, target] pairs into the two-row
        # [sources, targets] layout stored in edge_index.
        sources = [pair[0] for pair in pairs]
        targets = [pair[1] for pair in pairs]
        G[edge_type].edge_index = tensor([sources, targets], dtype=int64)

    # No label is attached to the graph.
    G.y = {}

    return [G]

## Create the datasets

In [None]:
# Distinct case identifiers of each split.
case_train_ids, case_valid_ids, case_test_ids = map(
    get_case_ids, (tab_train, tab_valid, tab_test)
)

In [None]:
# Number of cases per split: train, validation, test (one per line).
for split_case_ids in (case_train_ids, case_valid_ids, case_test_ids):
    print(len(split_case_ids))

In [None]:
# Store CaseID as strings in every split, so the quoted comparisons used when
# selecting a case by id match.
for split_table in (tab_train, tab_valid, tab_test):
    split_table["CaseID"] = split_table["CaseID"].astype(np.str_)

In [None]:
# Preview: rows of the first training case, renumbered from 0 and without the id column.
first_case_rows = tab_train.query(f"CaseID == '{case_train_ids[0]}'")
trace = first_case_rows.reset_index().drop(columns=["index", "CaseID"])
trace

In [None]:
import pickle
from tqdm.notebook import tqdm

In [None]:
# Number of leading events kept per trace; passed as `prefix_length` to
# build_prefixes_graph_from_trace for every split.
PREFIX_LENGTH = 4

In [None]:
print("Preparing training dataset...")

X_train = []

# Split the table into cases in a single pass. groupby(sort=False) yields the
# cases in order of first appearance, the same order as `case_train_ids`,
# without rescanning the whole table once per case.
train_cases = tab_train.groupby("CaseID", sort=False)

for _, case_rows in tqdm(train_cases, total=train_cases.ngroups):
    trace = case_rows.reset_index(drop=True).drop(columns="CaseID")

    # Only traces with more than two events are turned into graphs.
    # NOTE(review): this threshold is lower than PREFIX_LENGTH; confirm that
    # traces shorter than the prefix are meant to be kept.
    if len(trace) > 2:
        graphs = build_prefixes_graph_from_trace(
            dataset=tab_all,
            trace=trace,
            cat_features=categorical_columns,
            real_features=real_value_columns,
            prefix_length=PREFIX_LENGTH,
        )
        X_train.extend(graphs)

In [None]:
# Create the output folder if needed, so the graphs built above are not lost
# to a missing-directory error at write time.
os.makedirs(data_dir_graphs, exist_ok=True)

# Serialize the training graphs.
with open(data_dir_graphs + dataset + "_TRAIN_repair.pkl", "wb") as f:
    pickle.dump(X_train, f)

In [None]:
print("Preparing validation dataset...")

X_valid = []

# Split the table into cases in a single pass. groupby(sort=False) yields the
# cases in order of first appearance, the same order as `case_valid_ids`,
# without rescanning the whole table once per case.
valid_cases = tab_valid.groupby("CaseID", sort=False)

for _, case_rows in tqdm(valid_cases, total=valid_cases.ngroups):
    trace = case_rows.reset_index(drop=True).drop(columns="CaseID")

    # Only traces with more than two events are turned into graphs.
    # NOTE(review): this threshold is lower than PREFIX_LENGTH; confirm that
    # traces shorter than the prefix are meant to be kept.
    if len(trace) > 2:
        graphs = build_prefixes_graph_from_trace(
            dataset=tab_all,
            trace=trace,
            cat_features=categorical_columns,
            real_features=real_value_columns,
            prefix_length=PREFIX_LENGTH
        )
        X_valid.extend(graphs)

In [None]:
# Create the output folder if needed, so the graphs built above are not lost
# to a missing-directory error at write time.
os.makedirs(data_dir_graphs, exist_ok=True)

# Serialize the validation graphs.
with open(data_dir_graphs + dataset + "_VALID_repair.pkl", "wb") as f:
    pickle.dump(X_valid, f)

In [None]:
print("Preparing test dataset...")

X_test = []

# Split the table into cases in a single pass. groupby(sort=False) yields the
# cases in order of first appearance, the same order as `case_test_ids`,
# without rescanning the whole table once per case.
test_cases = tab_test.groupby("CaseID", sort=False)

for _, case_rows in tqdm(test_cases, total=test_cases.ngroups):
    trace = case_rows.reset_index(drop=True).drop(columns="CaseID")

    # Only traces with more than two events are turned into graphs.
    # NOTE(review): this threshold is lower than PREFIX_LENGTH; confirm that
    # traces shorter than the prefix are meant to be kept.
    if len(trace) > 2:
        graphs = build_prefixes_graph_from_trace(
            dataset=tab_all,
            trace=trace,
            cat_features=categorical_columns,
            real_features=real_value_columns,
            prefix_length=PREFIX_LENGTH
        )
        X_test.extend(graphs)

In [None]:
# Create the output folder if needed, so the graphs built above are not lost
# to a missing-directory error at write time.
os.makedirs(data_dir_graphs, exist_ok=True)

# Serialize the test graphs.
with open(data_dir_graphs + dataset + "_TEST_repair.pkl", "wb") as f:
    pickle.dump(X_test, f)