In [None]:
import pandas as pd
import os
from os.path import dirname

# Data folders are resolved against the parent of the current working directory.
root_path = dirname(os.getcwd())
data_dir, data_dir_processed, data_dir_graphs = (
    root_path + "/data/datasets/original/",
    root_path + "/data/datasets/processed/",
    root_path + "/data/datasets/graphs/",
)

# Lift pandas' cap on the number of columns shown when a frame is displayed.
pd.set_option("display.max_columns", None)

# Echo the resolved locations, one per line.
print("\n".join([root_path, data_dir, data_dir_processed, data_dir_graphs]))

In [None]:
# Name of the event log to process; it is reused as the file stem of every
# input and output path built in this notebook.
dataset = "BPI_Challenge_2013_open_problems"

In [None]:
import pm4py

# Convert the compressed XES log to CSV so the following cells can work from the CSV.
# Both paths are built from `data_dir`, the folder the CSV is read back from,
# rather than from a second, working-directory-relative spelling of that folder.
# The frame is called `event_log` because the bare name `log` is rebound to
# `math.log` further down in this notebook.
event_log = pm4py.read_xes(data_dir + f"{dataset}.xes.gz")
event_log = pm4py.convert_to_dataframe(event_log)
# The index is written on purpose: the loading cell drops the resulting "Unnamed: 0" column.
event_log.to_csv(data_dir + f"{dataset}.csv")

In [None]:
# Load the CSV export of the log. `index_col=False` stops pandas from using the
# first column as the index, so it stays a regular column ("Unnamed: 0") that
# is dropped right away.
filename = data_dir + dataset + ".csv"
raw_data = pd.read_csv(filename, index_col=False).drop(columns="Unnamed: 0")
raw_data.head()

In [None]:
# Give the case identifier and the activity label short, friendlier column names.
column_aliases = {"case:concept:name": "CaseID", "concept:name": "Activity"}
tab_all = raw_data.rename(columns=column_aliases)
tab_all.head()

In [6]:
# Fold the lifecycle state into the activity label: "<Activity>_<lifecycle:transition>".
activity_with_state = tab_all["Activity"] + "_" + tab_all["lifecycle:transition"]
tab_all["Activity"] = activity_with_state
tab_all.head()

Unnamed: 0,org:group,resource country,org:resource,oranization country,org:role,Activity,impact,product,time:timestamp,lifecycle:transition,CaseID
0,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_In Progress,Medium,PROD753,2006-11-07 10:00:36+00:00,In Progress,1-147898401
1,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_In Progress,Medium,PROD753,2006-11-07 13:05:44+00:00,In Progress,1-147898401
2,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_Wait,Medium,PROD753,2009-12-02 14:24:32+00:00,Wait,1-147898401
3,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_In Progress,Medium,PROD753,2011-09-03 07:09:09+00:00,In Progress,1-147898401
4,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_In Progress,Medium,PROD753,2007-03-20 09:06:25+00:00,In Progress,1-165554831


In [7]:
# The lifecycle state now lives inside "Activity", so its own column is removed.
tab_all = tab_all.drop("lifecycle:transition", axis=1)

In [8]:
# Distinct activity labels present in the log.
tab_all["Activity"].unique()

array(['Accepted_In Progress', 'Accepted_Wait',
       'Queued_Awaiting Assignment', 'Accepted_Assigned',
       'Completed_Closed'], dtype=object)

In [9]:
# Reduce every timestamp string to "YYYY/MM/DD HH:MM:SS":
#   1. keep the text before the first "." (drops fractional seconds and whatever follows),
#   2. replace "-" with "/",
#   3. keep the text before the first "+" (drops the UTC offset when step 1 left it in).
# The `.str` accessor is used for every step so that a missing timestamp stays
# missing instead of raising AttributeError (a float NaN has no `.split`), and
# `regex=False` makes the "-" replacement a literal one on every pandas version.
timestamps = tab_all["time:timestamp"]
timestamps = timestamps.str.split(".").str[0]
timestamps = timestamps.str.replace("-", "/", regex=False)
timestamps = timestamps.str.split("+").str[0]
tab_all["time:timestamp"] = timestamps

tab_all.head()

Unnamed: 0,org:group,resource country,org:resource,oranization country,org:role,Activity,impact,product,time:timestamp,CaseID
0,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_In Progress,Medium,PROD753,2006/11/07 10:00:36,1-147898401
1,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_In Progress,Medium,PROD753,2006/11/07 13:05:44,1-147898401
2,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_Wait,Medium,PROD753,2009/12/02 14:24:32,1-147898401
3,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_In Progress,Medium,PROD753,2011/09/03 07:09:09,1-147898401
4,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_In Progress,Medium,PROD753,2007/03/20 09:06:25,1-165554831


In [10]:
from utils import translate_time

# Convert each cleaned timestamp string with the project helper `translate_time`
# (presumably to epoch seconds, judging by the values it yields; confirm in utils).
timestamp_col = 'time:timestamp'
tab_all[timestamp_col] = tab_all[timestamp_col].apply(translate_time)
tab_all.head()

Unnamed: 0,org:group,resource country,org:resource,oranization country,org:role,Activity,impact,product,time:timestamp,CaseID
0,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_In Progress,Medium,PROD753,1162890000.0,1-147898401
1,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_In Progress,Medium,PROD753,1162901000.0,1-147898401
2,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_Wait,Medium,PROD753,1259760000.0,1-147898401
3,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_In Progress,Medium,PROD753,1315027000.0,1-147898401
4,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_In Progress,Medium,PROD753,1174378000.0,1-165554831


In [11]:
# Columns handled as categorical features. 'oranization country' is spelled
# exactly like the column header in the data.
categorical_columns = [
    'org:group',
    'resource country',
    'org:resource',
    'oranization country',
    'org:role',
    'Activity',
    'impact',
    'product',
]
# Columns handled as real-valued features.
real_value_columns = ["time:timestamp"]

In [12]:
# Does any cell of the table hold a missing value?
tab_all.isna().to_numpy().any()

True

In [13]:
# Missing categorical values become the literal label "NAN"; missing real values become -1.
# The categorical entries are merged last so they take precedence should a
# column ever be listed in both groups.
fill_values = {
    **dict.fromkeys(real_value_columns, -1),
    **dict.fromkeys(categorical_columns, "NAN"),
}
tab_all = tab_all.fillna(fill_values)

In [14]:
from math import log


# Express every timestamp as an offset from the smallest one, then compress the
# offsets with a natural logarithm; offsets that are not strictly positive map to 0.
min_time = tab_all['time:timestamp'].min()
elapsed = tab_all['time:timestamp'] - min_time
tab_all['time:timestamp'] = elapsed.map(
    lambda offset: log(offset) if offset > 0 else 0.
)

tab_all.head()

Unnamed: 0,org:group,resource country,org:resource,oranization country,org:role,Activity,impact,product,time:timestamp,CaseID
0,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_In Progress,Medium,PROD753,0.0,1-147898401
1,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_In Progress,Medium,PROD753,9.315421,1-147898401
2,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_Wait,Medium,PROD753,18.388883,1-147898401
3,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_In Progress,Medium,PROD753,18.840289,1-147898401
4,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_In Progress,Medium,PROD753,16.256809,1-165554831


In [15]:
from utils import get_resource_role_map
# Ask the project helper for the resource/role assignment of the log. The next
# cell reads a "resource" and a "role" entry from each item of the result.
roles_map = get_resource_role_map(tab_all)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data['ac_rl'] = self.data.apply(associations, axis=1)


20.0 Analysing resource pool 
40.0 Analysing resource pool 
60.0 Analysing resource pool 
80.0 Analysing resource pool 
100.0 Analysing resource pool 
('ROLES \n'
 '\n'
 "([{'role': 'Role 1', 'quantity': 239, 'members': ['Tomas', 'Niklas', 'Ewa', "
 "'Pawel', 'Panigrahy', 'Jerker', 'Srinivasan', 'Aneesh V', 'Rijin', 'Celine', "
 "'Anna', 'Sumesh', 'Prasad', 'Peter', 'Craig', 'Stefan', 'David', 'Jonas', "
 "'Joakim', 'Per', 'Stephen', 'Christer', 'Ing-Marie', 'Nicolas', 'Kenneth', "
 "'Fredrik', 'Viktoria', 'Roland', 'Marco', 'Katarina', 'Ian', 'Martin', "
 "'Olivier', 'Britt', 'Michal', 'Timothy', 'Andrew', 'Mikael', 'Erik', 'Arun', "
 "'Els', 'Jo', 'Lars', 'Daniel', 'Rickard', 'Thiago', 'Cyril', 'Mattias', "
 "'Bharath', 'Praveen', 'Steve', 'Murali', 'Rohan', 'Wim', 'Inger', "
 "'Frederic', 'Miroslaw', 'Lena', 'Radoslaw', 'Robert', 'Anup', 'Lars-Ove', "
 "'Jörgen', 'Hineesh', 'Carlos', 'Adam', 'Ann-Charlotte', 'Mats', 'Olle', "
 "'Richard', 'Samira', 'Vikrant', 'Kymaria', 'Agneta', 'R

In [16]:
# Flatten the helper's records into a plain {resource: role} lookup.
# `roles_map` is rebound from the record collection to the lookup itself, so the
# guard keeps a second execution of this cell from iterating the lookup's string
# keys, which would raise a TypeError on `record["resource"]`.
if not isinstance(roles_map, dict):
    roles_map = {record["resource"]: record["role"] for record in roles_map}
roles_map

{'Tomas': 'Role 1',
 'Niklas': 'Role 1',
 'Ewa': 'Role 1',
 'Pawel': 'Role 1',
 'Panigrahy': 'Role 1',
 'Jerker': 'Role 1',
 'Srinivasan': 'Role 1',
 'Aneesh V': 'Role 1',
 'Rijin': 'Role 1',
 'Celine': 'Role 1',
 'Anna': 'Role 1',
 'Sumesh': 'Role 1',
 'Prasad': 'Role 1',
 'Peter': 'Role 1',
 'Craig': 'Role 1',
 'Stefan': 'Role 1',
 'David': 'Role 1',
 'Jonas': 'Role 1',
 'Joakim': 'Role 1',
 'Per': 'Role 1',
 'Stephen': 'Role 1',
 'Christer': 'Role 1',
 'Ing-Marie': 'Role 1',
 'Nicolas': 'Role 1',
 'Kenneth': 'Role 1',
 'Fredrik': 'Role 1',
 'Viktoria': 'Role 1',
 'Roland': 'Role 1',
 'Marco': 'Role 1',
 'Katarina': 'Role 1',
 'Ian': 'Role 1',
 'Martin': 'Role 1',
 'Olivier': 'Role 1',
 'Britt': 'Role 1',
 'Michal': 'Role 1',
 'Timothy': 'Role 1',
 'Andrew': 'Role 1',
 'Mikael': 'Role 1',
 'Erik': 'Role 1',
 'Arun': 'Role 1',
 'Els': 'Role 1',
 'Jo': 'Role 1',
 'Lars': 'Role 1',
 'Daniel': 'Role 1',
 'Rickard': 'Role 1',
 'Thiago': 'Role 1',
 'Cyril': 'Role 1',
 'Mattias': 'Role 1',


In [17]:
# Look up the role of the resource behind each event; a resource that is absent
# from `roles_map` raises KeyError.
event_resources = tab_all["org:resource"].values
tab_all["org:resource:role"] = [roles_map[resource] for resource in event_resources]

In [18]:
# Register the derived role column as categorical. The membership check keeps the
# list free of duplicates when this cell is executed more than once.
if "org:resource:role" not in categorical_columns:
    categorical_columns.append("org:resource:role")

In [20]:
# Persist the fully pre-processed log, without the row index.
processed_all_path = data_dir_processed + f"{dataset}_processed_all.csv"
tab_all.to_csv(processed_all_path, index=False)

# Split the dataset

In [21]:
# Fraction of the cases (ordered by their first timestamp) kept for training + validation.
split_ratio = 2 / 3

# One row per case, built from `groupby(...).first()` and ordered by "time:timestamp".
first_act_tab = (
    tab_all.groupby("CaseID").first().sort_values("time:timestamp").reset_index()
)
repeated_case_activity = first_act_tab.duplicated(
    subset=["CaseID", "Activity"], keep="first"
)
first_act_tab = first_act_tab[~repeated_case_activity].reset_index(drop=True)

# Cases before the cut go to training + validation, the remaining ones to test.
test_start = int(split_ratio * len(first_act_tab))
list_train_valid_cases = list(first_act_tab[:test_start]["CaseID"].unique())
list_test_cases = list(first_act_tab[test_start:]["CaseID"].unique())

# Of the training + validation cases, the first 80% are used for training.
valid_start = int(len(list_train_valid_cases) * 0.8)
list_train_cases = list_train_valid_cases[:valid_start]
list_valid_cases = list_train_valid_cases[valid_start:]

# Each split keeps all events of its cases, re-indexed from 0.
tab_train = tab_all[tab_all["CaseID"].isin(list_train_cases)].reset_index(drop=True)
tab_valid = tab_all[tab_all["CaseID"].isin(list_valid_cases)].reset_index(drop=True)
tab_test = tab_all[tab_all["CaseID"].isin(list_test_cases)].reset_index(drop=True)

tab_train.to_csv(data_dir_processed+ f"{dataset}_processed_train.csv", index = False)
tab_valid.to_csv(data_dir_processed+f"{dataset}_processed_valid.csv", index = False)
tab_test.to_csv(data_dir_processed+ f"{dataset}_processed_test.csv", index = False)

In [22]:
# Column layout of the table after pre-processing.
tab_all.columns

Index(['org:group', 'resource country', 'org:resource', 'oranization country',
       'org:role', 'Activity', 'impact', 'product', 'time:timestamp', 'CaseID',
       'org:resource:role'],
      dtype='object')

In [23]:
# Display the training split.
tab_train

Unnamed: 0,org:group,resource country,org:resource,oranization country,org:role,Activity,impact,product,time:timestamp,CaseID,org:resource:role
0,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_In Progress,Medium,PROD753,0.000000,1-147898401,Role 1
1,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_In Progress,Medium,PROD753,9.315421,1-147898401,Role 1
2,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_Wait,Medium,PROD753,18.388883,1-147898401,Role 1
3,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_In Progress,Medium,PROD753,18.840289,1-147898401,Role 1
4,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_In Progress,Medium,PROD753,16.256809,1-165554831,Role 1
...,...,...,...,...,...,...,...,...,...,...,...
1446,Org line C,Sweden,Vesa,in,C_6,Queued_Awaiting Assignment,Major,PROD805,18.964929,1-730084841,Role 1
1447,Org line C,Sweden,Himanshu,in,E_10,Accepted_In Progress,Major,PROD805,18.964936,1-730084841,Role 1
1448,Org line C,Sweden,Himanshu,in,E_10,Completed_Closed,Major,PROD805,18.964936,1-730084841,Role 1
1449,Org line C,Sweden,Vesa,se,C_6,Accepted_In Progress,Major,PROD805,18.964935,1-730084875,Role 1


### Prepare the graphs

In [24]:
from utils import get_case_ids, is_static, get_one_hot_encoder, get_one_hot_encodings

from torch import tensor, max, int64, float32
from torch_geometric.data import HeteroData
import numpy as np

In [25]:
def get_node_features(dataset: pd.DataFrame, trace: pd.DataFrame, cat_features, real_features) -> dict:
    """Turn the columns of one trace into per-column feature tensors.

    A column for which ``is_static(trace[column])`` is truthy contributes a
    single row built from its first value; every other column contributes one
    row per event of the trace.

    Args:
        dataset: Frame handed to ``get_one_hot_encoder`` together with the
            column name to obtain that column's encoder.
        trace: Events of a single case, one row per event.
        cat_features: Names of the columns to one-hot encode.
        real_features: Names of the columns to keep as real values.

    Returns:
        Dict mapping column name to a ``float32`` tensor created with
        ``requires_grad=True``. Real-valued columns are reshaped to
        ``(rows, 1)``; the shape of categorical columns is whatever
        ``get_one_hot_encodings`` returns. Columns listed in neither group are
        left out.
    """
    static_columns = [name for name in trace if is_static(trace[name])]
    features = {}

    for name in trace:
        column_values = trace[name].values
        keep_single_row = name in static_columns

        if name in cat_features:
            encoder = get_one_hot_encoder(dataset, name)
            to_encode = np.array([column_values[0]]) if keep_single_row else column_values
            features[name] = tensor(
                get_one_hot_encodings(encoder, to_encode),
                dtype=float32,
                requires_grad=True,
            )

        # Deliberately a second `if`, not `elif`: a column listed in both groups
        # ends up with its real-valued tensor.
        if name in real_features:
            raw_values = [column_values[0]] if keep_single_row else column_values
            column = tensor(raw_values, dtype=float32, requires_grad=True)
            features[name] = column.reshape(column.shape[0], 1)

    return features


In [26]:


def compute_edges_indexs(node_features: dict, prefix_len):
    """List the edges of a prefix graph as ``[source, target]`` index pairs.

    Node types whose feature collection has exactly one row are treated as
    single-node types and never act as an edge source.

    * ``"Activity"`` gets ``followed_by`` edges between consecutive positions
      and a ``related_to`` edge from every position to each other node type:
      to node 0 of a single-node type, otherwise to the node at the same
      position.
    * Every other multi-row node type gets ``related_to`` edges between its own
      consecutive positions.

    Args:
        node_features: Dict mapping node type to a sized collection of rows.
        prefix_len: Number of events in the prefix.

    Returns:
        Dict mapping ``(source_type, relation, target_type)`` to a list of
        ``[source_index, target_index]`` pairs. The consecutive-position list
        is one object shared by every entry that uses it.
    """
    consecutive = [[pos, pos + 1] for pos in range(prefix_len - 1)]
    edges = {}

    for node_type, rows in node_features.items():
        if len(rows) == 1:
            continue

        if node_type != "Activity":
            edges[(node_type, "related_to", node_type)] = consecutive
            continue

        edges[(node_type, "followed_by", node_type)] = consecutive
        for other_type, other_rows in node_features.items():
            if other_type == node_type:
                continue
            single_node = len(other_rows) == 1
            edges[(node_type, "related_to", other_type)] = [
                [pos, 0 if single_node else pos] for pos in range(prefix_len)
            ]

    return edges

In [27]:
import torch

from torch import cat

def compute_edges_features(node_features, edges_indexes, cat_features, real_features):
    """Compute a two-value attribute for every edge that links a node type to itself.

    Edge types whose source and target types differ are skipped. For the
    remaining ones each ``[source, target]`` pair yields:

    * categorical type: ``[1.0 if both feature rows are equal else 0.0,
      target - source]`` as ``float32``;
    * real-valued type: ``[feature[target] - feature[source], target - source]``.

    Args:
        node_features: Dict mapping node type to its feature tensor.
        edges_indexes: Dict mapping ``(source_type, relation, target_type)`` to
            a list of ``[source, target]`` index pairs.
        cat_features: Names of the categorical node types.
        real_features: Names of the real-valued node types.

    Returns:
        Dict mapping each self-linking edge type to a list with one tensor per
        pair; the list is empty when the type is in neither feature group.
    """
    edge_attrs = {}

    for edge_type, pairs in edges_indexes.items():
        node_type = edge_type[0]
        if node_type != edge_type[2]:
            continue

        attrs = []
        edge_attrs[edge_type] = attrs

        if node_type in cat_features:
            attrs.extend(
                cat(
                    (
                        tensor(
                            [torch.equal(node_features[node_type][pair[0]], node_features[node_type][pair[1]])],
                            dtype=torch.float32,
                        ),
                        tensor([pair[1] - pair[0]], dtype=torch.float32),
                    )
                )
                for pair in pairs
            )
        elif node_type in real_features:
            attrs.extend(
                tensor(
                    [
                        node_features[node_type][pair[1]] - node_features[node_type][pair[0]],
                        pair[1] - pair[0],
                    ]
                )
                for pair in pairs
            )

    return edge_attrs

In [28]:
# from pprint import pprint as print

from torch import stack

def build_prefixes_graph_from_trace(dataset, trace, cat_features, real_features):
    """Build one heterogeneous graph per prefix of ``trace``.

    For every prefix length ``p`` in ``range(2, len(trace))`` a ``HeteroData``
    graph is created:

    * node features are the first ``p`` rows of each feature tensor returned by
      ``get_node_features`` (a single-row feature contributes its one row);
    * edges come from ``compute_edges_indexs`` and edge attributes from
      ``compute_edges_features``;
    * ``G.y`` maps every feature to its value at event ``p`` (the event right
      after the prefix): the arg-max index of the row for categorical features,
      a one-element tensor for the others. Single-row features have no row
      ``p``, so their only row is used as the target for both kinds. Previously
      only the categorical branch did this and a single-row real-valued
      feature raised ``IndexError``.

    Args:
        dataset: Frame forwarded to ``get_node_features``.
        trace: Events of a single case, one row per event.
        cat_features: Names of the categorical columns.
        real_features: Names of the real-valued columns.

    Returns:
        List of ``HeteroData`` graphs, one per prefix length; no graph is
        produced for a trace with fewer than three events.
    """
    X = []  # graphs

    node_features = get_node_features(dataset, trace, cat_features, real_features)

    for prefix in range(2, len(trace)):
        # Node types and their features, truncated to the prefix.
        G = HeteroData()
        for k in node_features:
            G[k].x = node_features[k][:prefix]

        edges_indexes = compute_edges_indexs(node_features, prefix)
        edge_features = compute_edges_features(node_features, edges_indexes, cat_features, real_features)

        for k in edge_features:
            G[k].edge_attr = stack(edge_features[k])

        for k, pairs in edges_indexes.items():
            # Transpose the list of [source, target] pairs into
            # [[sources...], [targets...]] for `edge_index`.
            sources = [pair[0] for pair in pairs]
            targets = [pair[1] for pair in pairs]
            G[k].edge_index = tensor([sources, targets], dtype=int64)

        G.y = {}
        for k in node_features:
            feats = node_features[k]
            # A single-row (static) feature has no row at `prefix`.
            target_row = feats[0] if len(feats) == 1 else feats[prefix]
            if k in cat_features:
                G.y[k] = torch.max(target_row, 0)[1]
            else:
                G.y[k] = torch.tensor([target_row[0]])

        X.append(G)

    return X

## Create the datasets

In [29]:
# Collect the case identifiers of each split with the project helper.
case_train_ids, case_valid_ids, case_test_ids = (
    get_case_ids(split_frame) for split_frame in (tab_train, tab_valid, tab_test)
)

In [30]:
# Number of cases in the train, validation and test splits, one per line.
print(len(case_train_ids), len(case_valid_ids), len(case_test_ids), sep="\n")

436
110
273


In [31]:
# Pull out the events of the first training case as a worked example:
# rows re-indexed from 0, identifier column removed.
example_case_id = case_train_ids[0]
trace = (
    tab_train.query(f"CaseID == '{example_case_id}'")
    .reset_index(drop=True)
    .drop(columns="CaseID")
)
trace

Unnamed: 0,org:group,resource country,org:resource,oranization country,org:role,Activity,impact,product,time:timestamp,org:resource:role
0,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_In Progress,Medium,PROD753,0.0,Role 1
1,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_In Progress,Medium,PROD753,9.315421,Role 1
2,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_Wait,Medium,PROD753,18.388883,Role 1
3,Org line A2,Sweden,Tomas,cn,A2_2,Accepted_In Progress,Medium,PROD753,18.840289,Role 1


In [32]:
# Build the prefix graphs of the example trace.
graphs = build_prefixes_graph_from_trace(
    dataset=tab_all,
    trace=trace,
    cat_features=categorical_columns,
    real_features=real_value_columns,
)

In [33]:
# Prediction targets stored on the second prefix graph of the example trace.
graphs[1].y

{'org:group': tensor(0),
 'resource country': tensor(11),
 'org:resource': tensor(223),
 'oranization country': tensor(5),
 'org:role': tensor(1),
 'Activity': tensor(1),
 'impact': tensor(3),
 'product': tensor(121),
 'time:timestamp': tensor([18.8403]),
 'org:resource:role': tensor(0)}

In [34]:
# Node feature tensors of the second prefix graph, keyed by node type.
graphs[1].x_dict

{'org:group': tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
        grad_fn=<SliceBackward0>),
 'resource country': tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]],
        grad_fn=<SliceBackward0>),
 'org:resource': tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0

In [None]:
# Edge types of the second prefix graph together with what is stored for each.
graphs[1].edge_items()

In [35]:
from tqdm.notebook import tqdm


print("Preparing training dataset...")

# Prefix graphs of every training case, in case order.
X_train = []

# Iterate over the case ids directly: the previous version indexed with `i` and
# then reused `i` for an inner copy loop, shadowing the outer loop variable.
for case_id in tqdm(case_train_ids):
    # Events of this case, re-indexed from 0 and without the identifier column.
    trace = (
        tab_train.query(f"CaseID == '{case_id}'")
        .reset_index(drop=True)
        .drop(columns="CaseID")
    )
    # The full log is passed as `dataset`, the split only provides the trace.
    graphs = build_prefixes_graph_from_trace(
        dataset=tab_all,
        trace=trace,
        cat_features=categorical_columns,
        real_features=real_value_columns,
    )
    X_train.extend(graphs)

print("Done!\n\n")

Preparing training dataset...


  0%|          | 0/436 [00:00<?, ?it/s]

Done!




In [36]:
print("Preparing validation dataset...")

# Prefix graphs of every validation case, in case order.
X_valid = []

# Iterate over the case ids directly: the previous version indexed with `i` and
# then reused `i` for an inner copy loop, shadowing the outer loop variable.
for case_id in tqdm(case_valid_ids):
    # Events of this case, re-indexed from 0 and without the identifier column.
    trace = (
        tab_valid.query(f"CaseID == '{case_id}'")
        .reset_index(drop=True)
        .drop(columns="CaseID")
    )
    # The full log is passed as `dataset`, the split only provides the trace.
    graphs = build_prefixes_graph_from_trace(
        dataset=tab_all,
        trace=trace,
        cat_features=categorical_columns,
        real_features=real_value_columns,
    )
    X_valid.extend(graphs)

print("Done!\n\n")

Preparing validation dataset...


  0%|          | 0/110 [00:00<?, ?it/s]

Done!




In [37]:
print("Preparing test dataset...")

# Prefix graphs of every test case, in case order.
X_test = []

# Iterate over the case ids directly: the previous version indexed with `i` and
# then reused `i` for an inner copy loop, shadowing the outer loop variable.
for case_id in tqdm(case_test_ids):
    # Events of this case, re-indexed from 0 and without the identifier column.
    trace = (
        tab_test.query(f"CaseID == '{case_id}'")
        .reset_index(drop=True)
        .drop(columns="CaseID")
    )
    # The full log is passed as `dataset`, the split only provides the trace.
    graphs = build_prefixes_graph_from_trace(
        dataset=tab_all,
        trace=trace,
        cat_features=categorical_columns,
        real_features=real_value_columns,
    )
    X_test.extend(graphs)

print("Done!\n\n")

Preparing test dataset...


  0%|          | 0/273 [00:00<?, ?it/s]

Done!




### Save the graph datasets

In [38]:
import pickle

# Pickle each split's list of graphs into the graphs folder, one file per split.
for file_suffix, graph_list in (
    ("_TRAIN_event_prediction_FINAL.pkl", X_train),
    ("_VALID_event_prediction_FINAL.pkl", X_valid),
    ("_TEST_event_prediction_FINAL.pkl", X_test),
):
    with open(data_dir_graphs + dataset + file_suffix, "wb") as f:
        pickle.dump(graph_list, f)