In [6]:
import pandas as pd
import os
from os.path import dirname

# The project root is the parent of the notebook's working directory.
root_path = dirname(os.getcwd())

# Never truncate columns when a frame is displayed.
pd.set_option("display.max_columns", None)

# Dataset folders. The trailing slash is kept on purpose: later cells build
# file paths by plain string concatenation.
data_dir = f"{root_path}/data/datasets/original/"
data_dir_processed = f"{root_path}/data/datasets/processed/"
data_dir_graphs = f"{root_path}/data/datasets/graphs/"

print(root_path, data_dir, data_dir_processed, data_dir_graphs, sep="\n")

/home/sebdis/ProcessMining/HGNN/HGNN_NA
/home/sebdis/ProcessMining/HGNN/HGNN_NA/data/datasets/original/
/home/sebdis/ProcessMining/HGNN/HGNN_NA/data/datasets/processed/
/home/sebdis/ProcessMining/HGNN/HGNN_NA/data/datasets/graphs/


In [7]:
# Name of the event log to process; later cells use it to build the input and output file names.
dataset = "Helpdesk"

In [8]:
# Read the original event log of the selected dataset and preview its first rows.
filename = f"{data_dir}{dataset}.csv"
raw_data = pd.read_csv(filename, index_col=False)
raw_data.head()

Unnamed: 0,CaseID,Activity,time:timestamp
0,2,1,2012-04-03 16:55:38
1,2,8,2012-04-03 16:55:53
2,2,6,2012-04-05 17:15:52
3,3,1,2010-10-29 18:14:06
4,3,8,2010-11-04 01:16:11


In [9]:
# Work on an independent copy: later cells overwrite columns of `tab_all`, and
# with a plain alias those writes would also change `raw_data`, so re-running
# the notebook from this cell would start from already-transformed values.
tab_all = raw_data.copy()
tab_all.head()

Unnamed: 0,CaseID,Activity,time:timestamp
0,2,1,2012-04-03 16:55:38
1,2,8,2012-04-03 16:55:53
2,2,6,2012-04-05 17:15:52
3,3,1,2010-10-29 18:14:06
4,3,8,2010-11-04 01:16:11


In [10]:
# Distinct activity codes that occur in the log.
tab_all["Activity"].unique()

array([1, 8, 6, 3, 9, 2, 4, 5, 7])

In [11]:
# Number of events per activity code.
tab_all.value_counts("Activity")

Activity
8    4278
6    4150
1    4144
9     962
3     108
2      45
4      14
5       5
7       4
Name: count, dtype: int64

In [12]:
# Normalise the timestamp strings: use "/" as the date separator and keep only
# the part before the first "+" (presumably to drop a UTC-offset suffix such
# as "+00:00" before parsing — confirm against utils.translate_time).
tab_all["time:timestamp"] = (
    tab_all["time:timestamp"]
    .str.replace("-", "/")
    .str.split("+", expand=True)[0]
)

tab_all.head()

Unnamed: 0,CaseID,Activity,time:timestamp
0,2,1,2012/04/03 16:55:38
1,2,8,2012/04/03 16:55:53
2,2,6,2012/04/05 17:15:52
3,3,1,2010/10/29 18:14:06
4,3,8,2010/11/04 01:16:11


In [13]:
from utils import translate_time

# Replace every timestamp string with the value returned by utils.translate_time.
# NOTE(review): the preview shows floats around 1.3e9, presumably Unix epoch
# seconds — confirm in utils.translate_time.
tab_all['time:timestamp'] = tab_all['time:timestamp'].apply(translate_time)
tab_all.head()

Unnamed: 0,CaseID,Activity,time:timestamp
0,2,1,1333465000.0
1,2,8,1333465000.0
2,2,6,1333639000.0
3,3,1,1288369000.0
4,3,8,1288830000.0


In [14]:
# Event attributes grouped by how the graph-building functions treat them:
# categorical columns are one-hot encoded, real-valued columns become
# single-column float features.
categorical_columns = ['Activity']
real_value_columns = ["time:timestamp"]

In [15]:
# True if at least one cell of the table is missing.
tab_all.isnull().values.any()

False

In [None]:
# Fill missing values column by column: the "NAN" placeholder for categorical
# columns first, then -1 for real-valued columns.
tab_all = tab_all.fillna(dict.fromkeys(categorical_columns, "NAN"))
tab_all = tab_all.fillna(dict.fromkeys(real_value_columns, -1))

In [16]:
from math import log


# Express each timestamp as the natural log of its offset from the earliest
# event in the log; non-positive offsets (the earliest event itself) map to 0.
# NOTE: this overwrites the column in place, so running the cell twice applies
# the transform twice.
min_time = tab_all['time:timestamp'].min()
elapsed = tab_all['time:timestamp'] - min_time
tab_all['time:timestamp'] = [0. if not x > 0 else log(x) for x in elapsed.values]

tab_all.head()

Unnamed: 0,CaseID,Activity,time:timestamp
0,2,1,18.064921
1,2,8,18.064921
2,2,6,18.067402
3,3,1,17.033106
4,3,8,17.051399


In [17]:
# Save the fully preprocessed log (all cases) before it is split.
tab_all.to_csv(data_dir_processed + f"{dataset}_processed_all.csv", index=False)

# Split the dataset

In [18]:
split_ratio = 2 / 3

# One row per case (the first value of every column within the case), ordered
# by timestamp so that the split below is chronological.
first_act_tab = (
    tab_all.groupby("CaseID")
    .first()
    .sort_values("time:timestamp")
    .reset_index()
)
# Keep the first row of every (CaseID, Activity) pair and renumber the rows.
is_repeated = first_act_tab.duplicated(subset=["CaseID", "Activity"], keep="first")
first_act_tab = first_act_tab[~is_repeated].reset_index(drop=True)

# The earliest `split_ratio` share of the cases feeds train + validation,
# the remaining cases form the test set.
n_train_valid = int(split_ratio * len(first_act_tab))
list_train_valid_cases = list(first_act_tab[:n_train_valid]["CaseID"].unique())

# Inside train + validation, the first 80% of the cases are used for training.
n_train = int(len(list_train_valid_cases) * 0.8)

list_train_cases = list_train_valid_cases[:n_train]
tab_train = tab_all[tab_all["CaseID"].isin(list_train_cases)].reset_index(drop=True)
tab_train.to_csv(data_dir_processed + f"{dataset}_processed_train.csv", index=False)

list_valid_cases = list_train_valid_cases[n_train:]
tab_valid = tab_all[tab_all["CaseID"].isin(list_valid_cases)].reset_index(drop=True)
tab_valid.to_csv(data_dir_processed + f"{dataset}_processed_valid.csv", index=False)

list_test_cases = list(first_act_tab[n_train_valid:]["CaseID"].unique())
tab_test = tab_all[tab_all["CaseID"].isin(list_test_cases)].reset_index(drop=True)
tab_test.to_csv(data_dir_processed + f"{dataset}_processed_test.csv", index=False)

In [19]:
# Columns available at this point of the pipeline.
tab_all.columns

Index(['CaseID', 'Activity', 'time:timestamp'], dtype='object')

In [None]:
# Inspect the training split.
tab_train

### Prepare the graphs

In [20]:
from utils import get_case_ids, is_static, get_one_hot_encoder, get_one_hot_encodings

from torch import tensor, max, int64, float32
from torch_geometric.data import HeteroData
import numpy as np

In [21]:
def get_node_features(dataset: pd.DataFrame, trace: pd.DataFrame, cat_features, real_features) -> dict:
    """Turn the columns of one trace into per-column feature tensors.

    Columns for which ``is_static`` returns a truthy value contribute a single
    row built from their first value; every other column contributes one row
    per event of the trace.

    Args:
        dataset: frame handed to ``get_one_hot_encoder`` together with the
            column name to obtain the encoder of a categorical column.
        trace: events of a single case.
        cat_features: names of the columns to one-hot encode.
        real_features: names of the columns used as real values; their tensor
            is reshaped to one column.

    Returns:
        dict mapping column name to a float32 tensor created with
        ``requires_grad=True``. Columns listed in neither feature collection
        are left out.
    """
    static_columns = [col for col in trace if is_static(trace[col])]

    features = {}

    for column in trace:
        column_values = trace[column].values
        is_constant = column in static_columns

        if column in cat_features:
            encoder = get_one_hot_encoder(dataset, column)
            # A static column is encoded from its first value only.
            to_encode = np.array([column_values[0]]) if is_constant else column_values
            features[column] = tensor(
                get_one_hot_encodings(encoder, to_encode),
                dtype=float32,
                requires_grad=True,
            )

        # Deliberately a separate `if`: a column listed in both collections
        # ends up with its real-valued representation.
        if column in real_features:
            raw_values = [column_values[0]] if is_constant else column_values
            real_tensor = tensor(raw_values, dtype=float32, requires_grad=True)
            # Column vector: one row per node, a single feature.
            features[column] = real_tensor.reshape(real_tensor.shape[0], 1)

    return features


In [22]:


def compute_edges_indexs(node_features: dict, prefix_len):
    """List the edges of a prefix graph as ``[source, target]`` index pairs.

    Node types whose feature container has length 1 get no outgoing edges.
    Every other node type is chained position by position (``i -> i + 1``);
    the chain relation is ``"followed_by"`` for ``"Activity"`` and
    ``"related_to"`` for any other type. ``"Activity"`` nodes are additionally
    linked to every other node type: to its single node (index 0) when that
    type has length 1, otherwise to the node at the same position.

    Args:
        node_features: mapping from node type to a sized container of features.
        prefix_len: number of events in the prefix.

    Returns:
        dict mapping ``(source_type, relation, target_type)`` to a list of
        ``[source_index, target_index]`` pairs.
    """
    # Consecutive positions of the prefix; the same list object is stored under
    # every chain relation.
    chain = [[pos, pos + 1] for pos in range(prefix_len - 1)]

    edges = {}
    for name, feats in node_features.items():
        if len(feats) == 1:
            continue

        if name != "Activity":
            edges[(name, "related_to", name)] = chain
            continue

        edges[(name, "followed_by", name)] = chain
        for other, other_feats in node_features.items():
            if other == name:
                continue
            target_is_single = len(other_feats) == 1
            edges[(name, "related_to", other)] = [
                [pos, 0 if target_is_single else pos] for pos in range(prefix_len)
            ]

    return edges

In [23]:
import torch

from torch import cat

def compute_edges_features(node_features, edges_indexes, cat_features, real_features):
    """Compute a two-value attribute for every edge joining nodes of one type.

    Only edge types whose source and target node type coincide are processed.
    For a categorical node type the attribute is
    ``[1.0 if both feature rows are equal else 0.0, target_index - source_index]``;
    for a real-valued node type it is
    ``[target_value - source_value, target_index - source_index]``.

    Args:
        node_features: mapping from node type to its feature tensor.
        edges_indexes: mapping from ``(source_type, relation, target_type)`` to
            a list of ``[source_index, target_index]`` pairs.
        cat_features: names of the categorical node types.
        real_features: names of the real-valued node types.

    Returns:
        dict mapping each processed edge type to a list with one tensor per
        edge. The list stays empty for a node type found in neither collection.
    """
    features = {}

    for edge_type, pairs in edges_indexes.items():
        node_type = edge_type[0]
        if node_type != edge_type[2]:
            continue

        if node_type in cat_features:
            features[edge_type] = [
                cat(
                    (
                        tensor(
                            [torch.equal(node_features[node_type][src], node_features[node_type][dst])],
                            dtype=torch.float32,
                        ),
                        tensor([dst - src], dtype=torch.float32),
                    )
                )
                for src, dst in pairs
            ]
        elif node_type in real_features:
            features[edge_type] = [
                tensor(
                    [
                        node_features[node_type][dst] - node_features[node_type][src],
                        dst - src,
                    ]
                )
                for src, dst in pairs
            ]
        else:
            features[edge_type] = []

    return features

In [24]:
# from pprint import pprint as print

from torch import stack

def build_prefixes_graph_from_trace(dataset, trace, cat_features, real_features):
    """Build one heterogeneous graph per prefix of ``trace``.

    For every prefix length ``p`` in ``range(2, len(trace))`` a ``HeteroData``
    graph is created from the first ``p`` rows of each node feature, and
    ``G.y`` holds the targets taken from the event at position ``p``. Traces
    with fewer than three events therefore produce no graph.

    Args:
        dataset: frame forwarded to ``get_node_features``.
        trace: events of a single case.
        cat_features: names of the categorical columns; their target is the
            index of the largest entry of the encoded row.
        real_features: names of the real-valued columns; their target is a
            one-element tensor holding the value.

    Returns:
        list of ``HeteroData`` graphs ordered by increasing prefix length.
    """
    X = []  # graphs

    node_features = get_node_features(dataset, trace, cat_features, real_features)

    for prefix in range(2, len(trace)):
        # Node types and their features, truncated to the prefix.
        G = HeteroData()
        for k in node_features:
            G[k].x = node_features[k][:prefix]

        edges_indexes = compute_edges_indexs(node_features, prefix)
        edge_features = compute_edges_features(
            node_features, edges_indexes, cat_features, real_features
        )

        for k in edge_features:
            G[k].edge_attr = stack(edge_features[k])

        for k, pairs in edges_indexes.items():
            # [[src, dst], ...] -> [[src, ...], [dst, ...]]
            G[k].edge_index = tensor(
                [[pair[0] for pair in pairs], [pair[1] for pair in pairs]],
                dtype=int64,
            )

        G.y = {}
        for k in node_features:
            # A single-row (static) feature has no row at position `prefix`, so
            # its only row is the target. Real-valued features used to index
            # `prefix` unconditionally and raised IndexError in that case.
            target_row = 0 if len(node_features[k]) == 1 else prefix
            if k in cat_features:
                G.y[k] = torch.max(node_features[k][target_row], 0)[1]
            else:
                G.y[k] = torch.tensor([node_features[k][target_row][0]])

        X.append(G)

    return X

## Create the datasets

In [25]:
# Case identifiers of each split, as returned by utils.get_case_ids.
case_train_ids = get_case_ids(tab_train)
case_valid_ids = get_case_ids(tab_valid)
case_test_ids = get_case_ids(tab_test)

In [26]:
# Number of cases in the train / validation / test splits, one per line.
print(len(case_train_ids), len(case_valid_ids), len(case_test_ids), sep="\n")

2028
508
1268


In [28]:
# Events of the first training case, re-indexed from 0 and without the case
# identifier column.
trace = (
    tab_train.query(f"CaseID == {case_train_ids[0]}")
    .reset_index(drop=True)
    .drop(columns="CaseID")
)
trace

Unnamed: 0,Activity,time:timestamp
0,1,17.033106
1,8,17.051399
2,6,17.051411


In [29]:
# Smoke test: build the prefix graphs of the sample trace.
graphs = build_prefixes_graph_from_trace(tab_all, trace, categorical_columns, real_value_columns)

In [31]:
# Prediction targets stored on the first prefix graph.
graphs[0].y

{'Activity': tensor(5), 'time:timestamp': tensor([17.0514])}

In [32]:
# Node features of the first prefix graph, keyed by node type.
graphs[0].x_dict

{'Activity': tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 1., 0.]], grad_fn=<SliceBackward0>),
 'time:timestamp': tensor([[17.0331],
         [17.0514]], grad_fn=<SliceBackward0>)}

In [None]:
# Edge types and their stored tensors for the first prefix graph. The sample
# trace has three events, so only one prefix graph exists and index 1 would
# raise IndexError.
graphs[0].edge_items()

In [33]:
from tqdm.notebook import tqdm


print("Preparing training dataset...")

X_train = []

for case_idx in tqdm(range(len(case_train_ids))):
    # Events of one case, re-indexed from 0 and without the case identifier column.
    trace = (
        tab_train.query(f"CaseID == {case_train_ids[case_idx]}")
        .reset_index()
        .drop(columns="index")
        .drop(columns="CaseID")
    )
    graphs = build_prefixes_graph_from_trace(
        dataset=tab_all,
        trace=trace,
        cat_features=categorical_columns,
        real_features=real_value_columns,
    )
    # One graph per prefix. extend() replaces the inner index loop, which
    # re-used the outer loop variable `i`.
    X_train.extend(graphs)

print("Done!\n\n")

Preparing training dataset...


  0%|          | 0/2028 [00:00<?, ?it/s]

Done!




In [34]:
print("Preparing validation dataset...")

X_valid = []

for case_idx in tqdm(range(len(case_valid_ids))):
    # Events of one case, re-indexed from 0 and without the case identifier column.
    trace = (
        tab_valid.query(f"CaseID == {case_valid_ids[case_idx]}")
        .reset_index()
        .drop(columns="index")
        .drop(columns="CaseID")
    )
    graphs = build_prefixes_graph_from_trace(
        dataset=tab_all,
        trace=trace,
        cat_features=categorical_columns,
        real_features=real_value_columns,
    )
    # One graph per prefix. extend() replaces the inner index loop, which
    # re-used the outer loop variable `i`.
    X_valid.extend(graphs)

print("Done!\n\n")

Preparing validation dataset...


  0%|          | 0/508 [00:00<?, ?it/s]

Done!




In [35]:
print("Preparing test dataset...")

X_test = []

for case_idx in tqdm(range(len(case_test_ids))):
    # Events of one case, re-indexed from 0 and without the case identifier column.
    trace = (
        tab_test.query(f"CaseID == {case_test_ids[case_idx]}")
        .reset_index()
        .drop(columns="index")
        .drop(columns="CaseID")
    )
    graphs = build_prefixes_graph_from_trace(
        dataset=tab_all,
        trace=trace,
        cat_features=categorical_columns,
        real_features=real_value_columns,
    )
    # One graph per prefix. extend() replaces the inner index loop, which
    # re-used the outer loop variable `i`.
    X_test.extend(graphs)

print("Done!\n\n")

Preparing test dataset...


  0%|          | 0/1268 [00:00<?, ?it/s]

Done!




### Save the graph datasets

In [36]:
import pickle

# Persist the list of graphs of every split in its own pickle file.
for split_tag, split_graphs in (
    ("_TRAIN", X_train),
    ("_VALID", X_valid),
    ("_TEST", X_test),
):
    with open(data_dir_graphs + dataset + split_tag + "_event_prediction_FINAL.pkl", "wb") as f:
        pickle.dump(split_graphs, f)