In [1]:
import pandas as pd
import os
from os.path import dirname


# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# The project root is the parent of the current working directory.
root_path = dirname(os.getcwd())

# Directory prefixes derived from the project root (each ends with "/").
data_dir = f"{root_path}/data/datasets/comuzzi/"
data_dir_processed = f"{root_path}/data/datasets/_processed/"
data_dir_graphs = f"{root_path}/data/datasets/graphs_repair/"

# Echo the resolved locations, one per line.
print("\n".join([root_path, data_dir, data_dir_processed, data_dir_graphs]))

/home/sebdis/ProcessMining/HGNN/HGNN_NA
/home/sebdis/ProcessMining/HGNN/HGNN_NA/data/datasets/comuzzi/
/home/sebdis/ProcessMining/HGNN/HGNN_NA/data/datasets/comuzzi/_processed/
/home/sebdis/ProcessMining/HGNN/HGNN_NA/data/datasets/graphs_repair/


In [2]:
# Event log to preprocess.
# Other names that were toggled here: "bpi_2013", "bpi_2012", "small_log",
# "large_log", "sp2020".
dataset = "BPI20_RequestForPayment"

In [3]:
# Masking strategies; each one names a `missing_df_full_<method>.csv` file.
nan_methods = [
    "odd",
    "even",
    "window",
    "random",
]

In [4]:
# Load the complete event log of the selected dataset and preview the first rows.
full_log_path = f"{data_dir}/{dataset}/complete_df_full_even.csv"
raw_data = pd.read_csv(full_log_path)
raw_data.head()

Unnamed: 0,CaseID,Activity,CompleteTimestamp,org:resource,org:role,case:Project,case:Task,case:OrganizationalEntity,case:Cost Type,case:RequestedAmount,case:Activity,case:RfpNumber,CumTimeInterval
0,1,Request For Payment SUBMITTED by EMPLOYEE,2017-01-09 09:17:18+00:00,STAFF MEMBER,EMPLOYEE,project 148216,UNKNOWN,organizational unit 65463,0,34.336343,UNKNOWN,request for payment number 148215,0.0
1,1,Request For Payment FINAL_APPROVED by SUPERVISOR,2017-01-09 09:18:00+00:00,STAFF MEMBER,SUPERVISOR,project 148216,UNKNOWN,organizational unit 65463,0,34.336343,UNKNOWN,request for payment number 148215,42.0
2,1,Request For Payment REJECTED by MISSING,2017-01-10 12:42:32+00:00,STAFF MEMBER,MISSING,project 148216,UNKNOWN,organizational unit 65463,0,34.336343,UNKNOWN,request for payment number 148215,98714.0
3,1,Request For Payment SUBMITTED by EMPLOYEE,2017-03-03 09:51:13+00:00,STAFF MEMBER,EMPLOYEE,project 148216,UNKNOWN,organizational unit 65463,0,34.336343,UNKNOWN,request for payment number 148215,4581235.0
4,1,Request For Payment APPROVED by PRE_APPROVER,2017-03-03 09:51:42+00:00,STAFF MEMBER,PRE_APPROVER,project 148216,UNKNOWN,organizational unit 65463,0,34.336343,UNKNOWN,request for payment number 148215,4581264.0


In [5]:
# For the "sp2020" log only, replace missing resources with the "EMPTY" placeholder.
# `DataFrame.fillna` returns a new frame rather than modifying in place, so the
# result must be assigned back; otherwise `raw_data` is left unchanged.
if dataset == "sp2020":
    raw_data = raw_data.fillna({"org:resource": "EMPTY"})

In [9]:
# Inspect the column names of the loaded event log.
raw_data.columns

Index(['CaseID', 'Activity', 'CompleteTimestamp', 'org:resource', 'org:role',
       'case:Project', 'case:Task', 'case:OrganizationalEntity',
       'case:Cost Type', 'case:RequestedAmount', 'case:Activity',
       'case:RfpNumber', 'CumTimeInterval'],
      dtype='object')

In [5]:
# strptime layout of the timestamp column for each known event log.
_known_date_formats = {
    "bpi_2013": '%Y-%m-%d %H:%M:%S',
    "small_log": '%Y-%m-%d %H:%M:%S',
    "large_log": '%Y-%m-%d %H:%M:%S',
    "sp2020": '%Y-%m-%d %H:%M:%S',
    "bpi_2012": '%Y-%m-%d %H:%M:%S.%f',
    "BPI20_RequestForPayment": "%Y-%m-%d %H:%M:%S%z",
}

# Any dataset not listed above falls back to the slash-separated layout.
date_format = _known_date_formats.get(dataset, '%Y/%m/%d %H:%M:%S.%f')
date_format

'%Y-%m-%d %H:%M:%S%z'

In [6]:
from datetime import datetime

def translate_time(time_str):
    """Parse a timestamp string and return it as POSIX seconds (float).

    The string is parsed with the notebook-level ``date_format``. When that
    layout carries no UTC offset the parsed datetime is naive, and
    ``datetime.timestamp()`` then interprets it in the machine's local time.
    """
    parsed = datetime.strptime(time_str, date_format)
    return parsed.timestamp()

In [7]:
# Load the complete train / validation / test splits of the selected dataset.
train_dataset, valid_dataset, test_dataset = (
    pd.read_csv(f"{data_dir}/{dataset}/complete_df_{split}_even.csv")
    for split in ("train", "val", "test")
)

In [8]:
# One masked copy of the full log per masking method, keyed by method name.
masked_datasets = {}
for key in nan_methods:
    masked_datasets[key] = pd.read_csv(f"{data_dir}/{dataset}/missing_df_full_{key}.csv")

In [10]:
# Normalise column names to the ones the rest of the notebook uses.
# Every log gets its timestamp column renamed; this also covers datasets that are
# not explicitly listed, so `tab_all` is always defined for the following cells
# (the original if/elif left it undefined for any other dataset name).
column_renames = {"CompleteTimestamp": "time:timestamp"}
if dataset in ("bpi_2013", "bpi_2012"):
    # For these two logs the "Resource" column is additionally renamed.
    column_renames["Resource"] = "org:resource"

# `rename` returns a new frame, so `raw_data` itself is left untouched.
tab_all = raw_data.rename(columns=column_renames)
train_dataset = train_dataset.rename(columns=column_renames)
valid_dataset = valid_dataset.rename(columns=column_renames)
test_dataset = test_dataset.rename(columns=column_renames)
for k in masked_datasets:
    masked_datasets[k] = masked_datasets[k].rename(columns=column_renames)


In [11]:
# Convert the timestamp strings of the complete logs via `translate_time`.
for complete_frame in (tab_all, train_dataset, valid_dataset, test_dataset):
    complete_frame["time:timestamp"] = complete_frame["time:timestamp"].apply(translate_time)

# Masked logs: only string cells are parsed; any non-string value is kept as it is.
for masked_frame in masked_datasets.values():
    masked_frame["time:timestamp"] = [
        translate_time(cell) if type(cell) is str else cell
        for cell in masked_frame["time:timestamp"].values
    ]

In [12]:
from math import isnan, log

# `numpy.NaN` was removed in NumPy 2.0; `numpy.nan` exists in every version.
# The name `NaN` is kept because other cells refer to it.
from numpy import nan as NaN


def _log_amount(x):
    """Return log(x) for positive x, NaN for NaN, and 0.0 for anything else.

    NaN is detected with `isnan` rather than `x is NaN`: values taken from a
    NumPy array are never the same object as `numpy.nan`, so the identity test
    never matched and missing amounts were silently turned into 0.0.
    """
    if isnan(x):
        return x
    return log(x) if x > 0 else 0.


if dataset == "bpi_2012":
    # Apply the same transform to the complete logs and to every masked log.
    for frame in (tab_all, train_dataset, valid_dataset, test_dataset, *masked_datasets.values()):
        frame["(case) AMOUNT_REQ"] = [_log_amount(x) for x in frame["(case) AMOUNT_REQ"].values]

In [13]:
# Remove the CumTimeInterval column from every frame.
unused_columns = ["CumTimeInterval"]

tab_all = tab_all.drop(columns=unused_columns)
train_dataset = train_dataset.drop(columns=unused_columns)
valid_dataset = valid_dataset.drop(columns=unused_columns)
test_dataset = test_dataset.drop(columns=unused_columns)

for method, masked_frame in list(masked_datasets.items()):
    masked_datasets[method] = masked_frame.drop(columns=unused_columns)

In [14]:
# Earliest timestamp in the complete log; the next cell subtracts it from every frame.
# NOTE(review): this name shadows the builtin `min` for the rest of the notebook.
# It is kept because later cells read it under this name.
min = tab_all["time:timestamp"].min()
min

1483953438.0

In [15]:
# `numpy.NaN` was removed in NumPy 2.0; `numpy.nan` exists in every version.
# The name `NaN` is kept because other cells refer to it.
from numpy import nan as NaN


# Make all timestamps relative to the earliest event of the complete log
# (`min` is the value computed in the previous cell, not the builtin).
# Missing timestamps in the masked logs need no special handling: NaN minus a
# number is still NaN. The former `x is not NaN` guard was an identity test that
# is always true for NumPy scalars, so it never protected anything.
for frame in (tab_all, train_dataset, valid_dataset, test_dataset, *masked_datasets.values()):
    frame["time:timestamp"] -= min

In [None]:
# Inspect the "even" masked log after the timestamp shift.
masked_datasets["even"]

In [16]:
from math import isnan, log


def _log_or_zero(x):
    """Return log(x) for positive x and 0.0 for every other value."""
    return log(x) if x > 0 else 0.


# Complete logs: log-scale the relative timestamps; non-positive values map to 0.0.
for frame in (tab_all, train_dataset, valid_dataset, test_dataset):
    frame["time:timestamp"] = [_log_or_zero(x) for x in frame["time:timestamp"].values]

# Masked logs: same transform, but missing timestamps stay NaN.
# NaN is tested with `isnan` because `x is not NaN` is always true for values
# taken from a NumPy array. Clamping with `x > 0` (instead of `x != 0`) matches
# the complete logs and avoids a ValueError from `log` on a negative value.
for k in masked_datasets:
    masked_datasets[k]["time:timestamp"] = [
        x if isnan(x) else _log_or_zero(x)
        for x in masked_datasets[k]["time:timestamp"].values
    ]


In [None]:
# Inspect the "even" masked log after the log transform.
masked_datasets["even"]

In [None]:
# Preview the processed complete log.
tab_all.head()

In [None]:
# Preview the processed training split.
train_dataset.head()

In [None]:
# Preview the processed validation split.
valid_dataset.head()

In [None]:
# Preview the processed test split.
test_dataset.head()

In [17]:
# Tag the output name with the "_CZ" suffix.
# The guard keeps the cell idempotent: re-running it no longer produces
# "..._CZ_CZ" and a second output directory.
if not dataset.endswith("_CZ"):
    dataset = f"{dataset}_CZ"
dataset

'BPI20_RequestForPayment_CZ'

In [None]:
# Show the current output directory prefix.
data_dir_processed

In [18]:
# Point `data_dir_processed` at the per-dataset output folder and make sure it exists.
# The guard keeps the cell idempotent: once the prefix already ends with the
# dataset folder, re-running does not nest another "<dataset>/" level inside it.
if not data_dir_processed.rstrip("/").endswith(f"/{dataset}"):
    data_dir_processed = f"{data_dir_processed}/{dataset}/"

# `makedirs(..., exist_ok=True)` also creates missing parent directories and does
# not fail when the folder is already there (`os.mkdir` raises in the first case).
os.makedirs(data_dir_processed, exist_ok=True)

In [19]:
# Write the processed complete log, without the index column.
tab_all.to_csv(f"{data_dir_processed}{dataset}_processed_all.csv", index=False)

In [20]:
# Write the processed training split, without the index column.
train_dataset.to_csv(f"{data_dir_processed}{dataset}_processed_train.csv", index=False)

In [21]:
# Write the processed validation split, without the index column.
valid_dataset.to_csv(f"{data_dir_processed}{dataset}_processed_valid.csv", index=False)

In [22]:
# Write the processed test split, without the index column.
test_dataset.to_csv(f"{data_dir_processed}{dataset}_processed_test.csv", index=False)

In [23]:
# Write one CSV per masking method, without the index column.
for method, masked_frame in masked_datasets.items():
    masked_path = f"{data_dir_processed}{dataset}_masked_{method}_all.csv"
    masked_frame.to_csv(masked_path, index=False)