In [1]:
import torch
import pickle
import numpy as np
import pandas as pd
import os
import json 
from os.path import dirname



# Project root: the "HGNN_NA" folder inside the parent of the current working directory.
root_path = f"{dirname(os.getcwd())}/HGNN_NA"

# Never truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)
# data_dir = root_path + "/data/datasets/original/"
data_dir_processed = f"{root_path}/data/datasets/_processed/"
data_dir_graphs = f"{root_path}/data/datasets/graphs_repair/"

# Echo the resolved locations, one per line.
print("\n".join([root_path, data_dir_processed, data_dir_graphs]))

# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
# device = "cpu"

/home/sebdis/ProcessMining/HGNN/HGNN_NA
/home/sebdis/ProcessMining/HGNN/HGNN_NA/data/datasets/comuzzi/_processed/
/home/sebdis/ProcessMining/HGNN/HGNN_NA/data/datasets/comuzzi/graphs_repair/


In [None]:
# When True, only the "Activity" and "time:timestamp" columns are modelled
# (see the column-selection cell further down).
# NOTE(review): the next cell sets this flag to False, so on a top-to-bottom run
# this value is overridden.
ACT_TIME_ONLY = True

In [2]:
# When False, the categorical/numerical columns are taken from the dataset's
# entry in data/dataset_features.json.
ACT_TIME_ONLY = False

In [3]:
# Per-dataset feature description; the path is relative to the working directory.
with open("data/dataset_features.json", "r") as features_file:
    datasets_info = json.load(features_file)

In [4]:
# Names of all datasets described in the feature file.
list(datasets_info)

['BPI_Challenge_2013_open_problems',
 'sp2020',
 'Helpdesk',
 'BPI20_RequestForPayment',
 'BPI Challenge 2017 - Offer log',
 'BPI_Challenge_2012_W_Complete',
 'BPI_Challenge_2012_A',
 'bpi_2012_CZ',
 'bpi_2013_CZ',
 'large_log_CZ',
 'small_log_CZ',
 'sp2020_CZ',
 'BPI20_RequestForPayment_CZ']

In [22]:
# Event log to work on; used as a key into data/dataset_features.json and to build file names.
dataset = "small_log_CZ"

In [19]:
# Keep only the feature description of the selected dataset.
with open("data/dataset_features.json", "r") as features_file:
    all_datasets_info = json.load(features_file)
dataset_info = all_datasets_info[dataset]

In [23]:
# Processed event log of the selected dataset.
processed_csv_path = f"{data_dir_processed}/{dataset}/{dataset}_processed_all.csv"
tab_all = pd.read_csv(processed_csv_path)
tab_all.head()

Unnamed: 0,CaseID,Activity,time:timestamp
0,1,Activity A,0.0
1,1,Activity B,8.188689
2,1,Activity C,8.881836
3,1,Activity D,9.287301
4,1,Activity E,9.574983


In [24]:
# Event attributes are all columns except the case identifier and any
# "Unnamed: N" column that pandas creates when the CSV carries a saved index.
# Subtracting a fixed 1 from the column count would count such an index column as an attribute.
attribute_columns = [
    column
    for column in tab_all.columns
    if column != "CaseID" and not str(column).startswith("Unnamed:")
]

print(f"number of events {len(tab_all)}")
print(f"number of traces {len(tab_all['CaseID'].unique())}")
print(f"number of attributes {len(attribute_columns)}")

number of events 28000
number of traces 2000
number of attributes 2


In [None]:
# Placeholder category appended to the value list of every categorical column (see list_unique).
MISSING_VALUE = "MISSING_VALUE"

In [None]:
if ACT_TIME_ONLY:
    # Activity/timestamp-only variant of the task.
    categorical_columns = ["Activity"]
    real_value_columns = ["time:timestamp"]
    # `dataset` is used below to build file names, so this variant gets its own suffix.
    # Append it only once: re-running this cell must not yield "..._AT_only_AT_only".
    if not dataset.endswith("_AT_only"):
        dataset = f"{dataset}_AT_only"
else:
    # Use the columns declared for this dataset in the feature description.
    categorical_columns = dataset_info["categorical"]
    real_value_columns = dataset_info["numerical"]

In [None]:
# Value list of every categorical column: the values found in the log followed by
# the MISSING_VALUE placeholder.
list_unique = {}
for column in categorical_columns:
    list_unique[column] = [*tab_all[column].unique(), MISSING_VALUE]
#list_unique

In [None]:
import random

# Fix the seed of every random number generator used in the notebook:
# Python's `random`, NumPy, and PyTorch (CPU and CUDA).
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)

In [None]:
# Load the pre-built graphs of the three splits.
# NOTE: unpickling can execute arbitrary code; only open graph files you produced yourself.
with open(f"{data_dir_graphs}{dataset}_TRAIN_repair.pkl", "rb") as train_file:
    X_train = pickle.load(train_file)
with open(f"{data_dir_graphs}{dataset}_VALID_repair.pkl", "rb") as valid_file:
    X_valid = pickle.load(valid_file)
with open(f"{data_dir_graphs}{dataset}_TEST_repair.pkl", "rb") as test_file:
    X_test = pickle.load(test_file)

In [None]:
from torch_geometric.loader import DataLoader
from torch_geometric.transforms import ToUndirected

transform = ToUndirected()

# Replace every graph of every split, in place, by its transformed version.
# NOTE(review): the lists are overwritten, so re-running this cell transforms graphs that
# were already transformed — confirm this is harmless before re-executing.
with torch.no_grad():
    for graphs in (X_train, X_valid, X_test):
        for index, graph in enumerate(graphs):
            graphs[index] = transform(graph)

In [None]:
# Collect the node and edge types that occur in any graph of any split.
node_type_set = set()
edge_type_set = set()
for graphs in (X_train, X_valid, X_test):
    for graph in graphs:
        graph_node_types, graph_edge_types = graph.metadata()
        node_type_set.update(graph_node_types)
        edge_type_set.update(graph_edge_types)

# Sort instead of relying on set iteration order: the order of a set of strings can
# differ between kernel starts, which would change the order in which the model's
# per-relation layers are created and make seeded runs non-reproducible.
node_types = sorted(node_type_set)
edge_types = sorted(edge_type_set)

## Hyperopt

In [None]:
from ax.service.managed_loop import optimize
from torch_geometric.nn import (
    HeteroConv,
    SAGEConv
)
from torch.nn import (
    ModuleList,
    Module,
    Linear,
    ModuleDict
  )
from typing_extensions import Self

In [None]:
class HGNN(Module):
    """Heterogeneous GNN: stacked HeteroConv layers followed by one linear head per output attribute.

    Each layer is a ``HeteroConv`` holding one ``SAGEConv`` per relation; a ReLU is applied
    after every layer. The heads read the hidden features of the node type that carries the
    same name as the attribute they predict.
    """

    def __init__(self, output_cat, output_real, nodes_relations, parameters) -> None:
        """Build the convolution stack and the output heads.

        Args:
            output_cat: mapping from categorical attribute name to its number of classes.
            output_real: iterable of numerical attribute names (one scalar output each).
            nodes_relations: iterable of edge types; one SAGEConv per edge type and layer.
            parameters: dict providing "hid" (hidden size), "layers" (number of
                convolution layers) and "aggregation" (passed as ``aggr`` to both
                SAGEConv and HeteroConv).
        """
        super().__init__()

        hid = parameters["hid"]
        layers = parameters["layers"]
        aggregation = parameters["aggregation"]

        self.output_cat = output_cat
        self.output_real = output_real

        # List of convolutional layers.
        # (TransformerConv and GATv2Conv were tried as alternatives to SAGEConv.)
        self.convs = ModuleList()
        for _ in range(layers):
            conv = HeteroConv(
                {
                    relation: SAGEConv((-1, -1), aggr=aggregation, out_channels=hid, normalize=False)
                    for relation in nodes_relations
                },
                aggr=aggregation,
            )
            self.convs.append(conv)

        # One linear head per predicted attribute, keyed by attribute name.
        self.FC = ModuleDict()
        for k in output_cat:
            self.FC[k] = Linear(hid, output_cat[k], device=device)
        for k in output_real:
            self.FC[k] = Linear(hid, 1, device=device)

    def forward(self, batch):
        """Compute the predictions for the nodes selected by ``batch.mask``.

        Args:
            batch: object exposing ``x_dict``, ``edge_index_dict`` and ``mask``.
                NOTE(review): ``mask`` is used to index the features of every predicted
                node type — presumably all of them have the same number of nodes; confirm.

        Returns:
            dict mapping each attribute name to its head output: the raw class scores for
            categorical attributes and a flattened 1-D tensor for numerical attributes.
        """
        # Work on a local dict so the caller's batch is left unmodified.
        x_dict = batch.x_dict
        for conv in self.convs:
            x_dict = conv(x_dict, batch.edge_index_dict)
            x_dict = {key: x.relu() for key, x in x_dict.items()}

        output = {}

        for k in self.output_cat:
            output[k] = self.FC[k](x_dict[k][batch.mask])
        for k in self.output_real:
            # reshape(-1) flattens the head output and, unlike reshape(1, -1)[0],
            # also works when no node is selected.
            output[k] = self.FC[k](x_dict[k][batch.mask]).reshape(-1)

        return output

In [None]:
from torcheval.metrics.functional import multiclass_accuracy, multiclass_f1_score
import torch.nn as nn
from copy import deepcopy
from tqdm.notebook import tqdm


def train_hgnn(config, output_cat, output_real, epochs=20):
    """Train an HGNN on X_train with early stopping on the X_valid loss.

    Args:
        config: hyper-parameter dict. "batch_size", "lr" and "weight_decay" are read
            here; the whole dict is also handed to HGNN as its ``parameters``.
        output_cat: mapping from categorical attribute name to number of classes;
            each gets a cross-entropy loss.
        output_real: iterable of numerical attribute names; each gets an L1 loss.
        epochs: maximum number of training epochs.

    Returns:
        A deep copy of the network taken at the epoch with the lowest validation loss,
        or None when ``epochs`` is 0.

    Raises:
        ValueError: if the validation loader yields no batch.
    """
    print(config)

    net = HGNN(
        parameters=config,
        output_cat=output_cat,
        output_real=output_real,
        nodes_relations=edge_types,
    )
    net = net.to(device)

    losses = {}
    for k in output_cat:
        losses[k] = nn.CrossEntropyLoss()
    for k in output_real:
        losses[k] = nn.L1Loss()

    train_loader = DataLoader(X_train, batch_size=config["batch_size"], shuffle=True)
    # No shuffling for validation: the loss below is a mean of per-batch means, so a
    # changing batch composition would add noise to the model-selection criterion.
    valid_loader = DataLoader(X_valid, batch_size=config["batch_size"], shuffle=False)
    if len(valid_loader) == 0:
        raise ValueError("X_valid is empty: cannot compute a validation loss for early stopping")

    optimizer = torch.optim.Adam(net.parameters(), lr=config["lr"], weight_decay=config["weight_decay"])

    best_model = None
    best_loss = float("inf")
    patience = 5  # consecutive non-improving epochs tolerated before stopping
    epochs_without_improvement = 0

    torch.cuda.empty_cache()

    for _ in tqdm(range(epochs)):

        net.train()
        for x in train_loader:
            x = x.to(device)

            # Targets restricted to the nodes selected by the batch mask.
            labels = {k: target[x.mask] for k, target in x.y.items()}

            optimizer.zero_grad()
            outputs = net(x)

            # The training objective is the unweighted sum of all per-attribute losses.
            total_loss = sum(losses[k](outputs[k], labels[k]) for k in losses)

            total_loss.backward()
            optimizer.step()

        batch_losses = []

        net.eval()
        with torch.no_grad():
            for x in valid_loader:
                x = x.to(device)

                labels = {k: target[x.mask] for k, target in x.y.items()}
                outputs = net(x)

                batch_loss = sum(losses[k](outputs[k], labels[k]) for k in losses)
                # Keep plain floats so no device tensors are held across epochs.
                batch_losses.append(batch_loss.item())

        val_loss = sum(batch_losses) / len(batch_losses)

        # The first epoch always provides the initial best model (even if its loss is NaN).
        if best_model is None or val_loss < best_loss:
            best_loss = val_loss
            best_model = deepcopy(net)
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                break

    return best_model

In [None]:
from torch.nn.functional import l1_loss
 
def test_hgnn(net, output_cat, output_real, test_graphs=None):
    """Evaluate a trained network on a list of graphs.

    Args:
        net: trained model; called as ``net(batch)`` and expected to return a dict
            keyed by attribute name.
        output_cat: mapping from categorical attribute name to number of classes.
        output_real: iterable of numerical attribute names.
        test_graphs: graphs to evaluate on. Defaults to the notebook-level ``X_test``,
            looked up when the function is called (not when it is defined), so a
            reloaded ``X_test`` is picked up.

    Returns:
        dict with, in this order:
            "<attr>_acc"     accuracy of each categorical attribute,
            "<attr>_mae"     mean of the per-batch L1 losses of each numerical attribute,
            "AVG_total_loss" mean over batches of the summed per-attribute losses,
            "MAE_<attr>"     L1 loss of each numerical attribute over all selected nodes.
    """
    if test_graphs is None:
        test_graphs = X_test

    test_loader = DataLoader(test_graphs, batch_size=128, shuffle=False)

    losses = {}
    for k in output_cat:
        losses[k] = nn.CrossEntropyLoss()
    for k in output_real:
        losses[k] = nn.L1Loss()

    predictions_categorical = {k: [] for k in output_cat}
    target_categorical = {k: [] for k in output_cat}

    avg_MAE = {k: [] for k in output_real}
    prediction_numerical = {k: [] for k in output_real}
    target_numerical = {k: [] for k in output_real}

    total_loss = []

    net.eval()
    with torch.no_grad():
        for x in test_loader:
            x = x.to(device)

            # Targets restricted to the nodes selected by the batch mask.
            labels = {k: target[x.mask] for k, target in x.y.items()}

            outputs = net(x)

            losses_step = {k: losses[k](outputs[k], labels[k]).item() for k in losses}
            total_loss.append(sum(losses_step.values()))

            for k in output_cat:
                # Softmax is monotonic, so the predicted class is the argmax of the raw scores.
                predictions_categorical[k].append(torch.argmax(outputs[k], dim=1))
                target_categorical[k].append(labels[k])

            for k in output_real:
                prediction_numerical[k].append(outputs[k])
                target_numerical[k].append(labels[k])
                avg_MAE[k].append(losses_step[k])

    # Concatenate the per-batch pieces into one tensor per attribute.
    for k in predictions_categorical:
        predictions_categorical[k] = torch.cat(predictions_categorical[k])
        target_categorical[k] = torch.cat(target_categorical[k])

    for k in prediction_numerical:
        prediction_numerical[k] = torch.cat(prediction_numerical[k])
        target_numerical[k] = torch.cat(target_numerical[k])

    # L1 loss over all selected nodes at once (independent of the batch split).
    MAE = {
        k: l1_loss(prediction_numerical[k], target_numerical[k]).item()
        for k in output_real
    }

    accuracy = {
        k: multiclass_accuracy(
            predictions_categorical[k],
            target_categorical[k],
            num_classes=output_cat[k],
        )
        for k in output_cat
    }

    # Mean of the per-batch L1 losses.
    avg_MAE = {k: sum(avg_MAE[k]) / len(avg_MAE[k]) for k in avg_MAE}

    Average_total_loss = sum(total_loss) / len(total_loss)

    res = (
        {f"{k}_acc": accuracy[k].item() for k in accuracy}
        | {f"{k}_mae": avg_MAE[k] for k in avg_MAE}
        | {"AVG_total_loss": Average_total_loss}
        | {f"MAE_{k}": MAE[k] for k in MAE}
    )

    print(res)

    return res

In [None]:
# Number of classes per categorical attribute (length of its value list).
outputcat = {column: len(values) for column, values in list_unique.items()}
# Names of the numerical attributes to predict.
outputreal = real_value_columns
print(outputcat, outputreal, sep="\n")

In [None]:
def train_evaluate(config):
    """Train one model with ``config`` (at most 50 epochs) and return its test_hgnn metrics.

    NOTE(review): test_hgnn is called without ``test_graphs``, so the reported metrics
    are computed on its default graphs (X_test) — confirm that using this split as the
    search objective is intended.
    """
    return test_hgnn(
        train_hgnn(config, output_cat=outputcat, output_real=outputreal, epochs=50),
        output_cat=outputcat,
        output_real=outputreal,
    )

In [None]:
import logging

logging.getLogger("root").setLevel(logging.ERROR)

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
#outputreal = ['time:timestamp']
#outputcat = {"Activity" : outputcat["Activity"]}
#print(outputcat)

In [None]:
# Hyper-parameter search with Ax: every trial calls train_evaluate and the search
# minimises the "AVG_total_loss" entry of its result.
# hid, layers and aggregation are single-value choices; lr, batch_size and weight_decay vary.
# Alternative grids that are currently switched off:
#   layers: [2, 3, 4, 5] | batch_size: [16, 64, 128] | heads: [1, 2] or [1]
#   aggregation: ["sum", "mean", "max"]
best_parameters, values, experiment, model = optimize(
    parameters=[
        {
            "name": "hid",
            "type": "choice",
            "values": [128],
            "value_type": "int",
            "is_ordered": True,
            "sort_values": False,
        },
        {
            "name": "layers",
            "type": "choice",
            "values": [2],
            "value_type": "int",
            "is_ordered": True,
            "sort_values": False,
        },
        {
            "name": "lr",
            "type": "range",
            "bounds": [1e-4, 1e-1],
            "value_type": "float",
            "log_scale": True,
        },
        {
            "name": "batch_size",
            "type": "choice",
            "values": [128, 256, 512],
            "value_type": "int",
            "is_ordered": True,
            "sort_values": False,
        },
        {
            "name": "weight_decay",
            "type": "range",
            "bounds": [1e-2, 1e-1],
            "value_type": "float",
            "log_scale": True,
        },
        {
            "name": "aggregation",
            "type": "choice",
            "values": ["max"],
            "value_type": "str",
        },
    ],
    evaluation_function=train_evaluate,
    objective_name="AVG_total_loss",
    arms_per_trial=1,
    minimize=True,
    random_seed=123,
    total_trials=20,
)

print(best_parameters)
# `values` unpacks into the objective means and their covariances.
means, covariances = values
print(means)
print(experiment)

In [None]:
from ax.service.utils.report_utils import exp_to_df

# Tabulate the trials of the Ax experiment.
results = exp_to_df(experiment)

In [None]:
# Display the trials ordered by objective value, lowest AVG_total_loss first.
results.sort_values(by="AVG_total_loss")

In [None]:
# Keep the sorted order for the export below.
results = results.sort_values(by="AVG_total_loss")

In [None]:
# Create the output folder first so the finished search is not lost to a
# missing-directory error when writing the CSV.
os.makedirs("repair_results", exist_ok=True)
results.to_csv(f"repair_results/{dataset}_CONFIGS.csv", sep=",")

## Test on different test sets

In [None]:
# Test-set variants; each names a file "<dataset>_TEST_repair_<type>.pkl" loaded by test_multi.
test_types = ["even", "odd", "window", "random"]

In [None]:
import pandas


def create_df(results):
    """Turn a list of per-run metric dicts into a table plus column-wise statistics.

    Args:
        results: list of dicts that map metric name to value, one dict per run.

    Returns:
        A tuple ``(table, means, stds)``: a DataFrame with one row per run and one
        column per metric, and the column-wise mean and standard deviation Series.
        An empty ``results`` list gives an empty table and empty statistics.
    """
    # The DataFrame constructor builds one row per dict directly and also accepts an
    # empty list, which the manual column-by-column construction could not handle.
    table = pd.DataFrame(results)

    return table, table.mean(), table.std()
        
    

In [None]:
def test_multi(config, outputcat, outputreal, num_runs=10, num_epochs=20):
    """Train ``num_runs`` models and evaluate each one on every test-set variant.

    Args:
        config: hyper-parameter dict forwarded to train_hgnn.
        outputcat: mapping from categorical attribute name to number of classes.
        outputreal: iterable of numerical attribute names.
        num_runs: number of trainings to perform.
        num_epochs: maximum number of epochs per training.

    Returns:
        dict mapping each entry of ``test_types`` to the list of test_hgnn result
        dicts, one per run.

    Side effects:
        Writes "<type>_RESULTS.csv" and "<type>_MEAN_STD.csv" for every test type into
        "repair_results/<dataset>/" and prints the mean/std table.
    """
    save_path = f"repair_results/{dataset}/"
    os.makedirs(save_path, exist_ok=True)

    # The test sets do not depend on the run, so load and transform each of them once
    # up front. A missing file is then reported before any training time is spent.
    # NOTE: unpickling can execute arbitrary code; only open files you produced yourself.
    test_sets = {}
    for test_type in test_types:
        with open(data_dir_graphs + dataset + f"_TEST_repair_{test_type}.pkl", "rb") as f:
            graphs = pickle.load(f)
        test_sets[test_type] = [transform(graph) for graph in graphs]

    res = {test_type: [] for test_type in test_types}

    for run in range(num_runs):

        print(f"Run {run}")

        net = train_hgnn(
            config,
            outputcat,
            outputreal,
            num_epochs,
        )

        for test_type in test_types:

            print(f"Test type {test_type}")

            res[test_type].append(
                test_hgnn(
                    net,
                    outputcat,
                    outputreal,
                    test_graphs=test_sets[test_type],
                )
            )

    for test_type in test_types:
        results_table, means, stds = create_df(res[test_type])

        results_table.to_csv(f"{save_path}{test_type}_RESULTS.csv", sep=",", index=False)

        summary = pd.DataFrame(data={"mean": means, "std": stds})
        summary.to_csv(f"{save_path}{test_type}_MEAN_STD.csv", sep=",")

        print(test_type)
        print(summary)

    return res

In [None]:
# Final evaluation with the best configuration from the search: 10 runs, at most 50 epochs each.
res = test_multi(best_parameters, outputcat, outputreal, 10, 50)