In [None]:
import torch
import pickle
import numpy as np
import pandas as pd
import os

from os.path import dirname



# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# The project root is resolved as "<parent of the current working directory>/HGNN_NA";
# all dataset folders hang off it.
root_path = f"{dirname(os.getcwd())}/HGNN_NA"
data_dir = f"{root_path}/data/datasets/original/"
data_dir_processed = f"{root_path}/data/datasets/processed/"
data_dir_graphs = f"{root_path}/data/datasets/graphs/"

for location in (root_path, data_dir, data_dir_processed, data_dir_graphs):
    print(location)

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# device = "cpu"

In [None]:
# Name of the event log to work on; it is used as the file-name prefix when the
# processed CSV and the pickled graph splits are loaded below.
dataset = "BPI_Challenge_2012_A"

In [None]:
# Load the processed event log of the selected dataset.
processed_csv_path = f"{data_dir_processed}{dataset}_processed_all.csv"
tab_all = pd.read_csv(processed_csv_path)
print(tab_all.head())

# Distinct values of each categorical column; their counts are used later as the
# output sizes of the classification heads.
list_activities, list_org_resource, list_org_resource_roles = (
    list(tab_all[column].unique())
    for column in ("Activity", "org:resource", "org:resource:role")
)

In [None]:
# Number of rows per resource role (shows how balanced the role classes are).
tab_all['org:resource:role'].value_counts()

In [None]:
from data import utils

# Encoders returned by the project helper for the resource and resource-role columns.
one_hot_resource = utils.get_one_hot_encoder(tab_all, "org:resource")
one_hot_resource_role = utils.get_one_hot_encoder(tab_all, "org:resource:role")

resources = tab_all["org:resource"]
resources_role = tab_all["org:resource:role"]

# Raw resource value -> raw role value. Rows are paired by position (not by index
# label), so this no longer relies on `tab_all` having a default RangeIndex and
# avoids one Series lookup per row. When a resource appears with several roles the
# last row wins, exactly as with the previous row-by-row assignment.
map_resource_to_role = dict(zip(resources.to_numpy(), resources_role.to_numpy()))

map_resource_to_role

In [None]:
# Raw resource value -> integer class index, taken as the argmax of the encoding
# that `utils.get_one_hot_encodings` returns for that single value.
map_resource_index = {
    resource: torch.argmax(
        torch.tensor(utils.get_one_hot_encodings(one_hot_resource, np.array([resource])))
    ).item()
    for resource in tab_all["org:resource"].unique()
}
map_resource_index

In [None]:
# Raw role value -> integer class index, taken as the argmax of the encoding
# that `utils.get_one_hot_encodings` returns for that single value.
map_resource_role_index = {
    role: torch.argmax(
        torch.tensor(utils.get_one_hot_encodings(one_hot_resource_role, np.array([role])))
    ).item()
    for role in tab_all["org:resource:role"].unique()
}
map_resource_role_index

In [None]:
# Re-key the resource -> role mapping from raw values to integer class indices.
# NOTE: this rebinds `map_resource_to_role`, so the cell must not be run twice
# without first re-running the cell that builds the value-keyed mapping.
resource_idx_to_role_idx = {}
for resource_value, role_value in map_resource_to_role.items():
    resource_idx_to_role_idx[map_resource_index[resource_value]] = map_resource_role_index[role_value]
map_resource_to_role = resource_idx_to_role_idx
map_resource_to_role

In [None]:
import random

# Seed every random number generator used in the notebook with the same value:
# PyTorch (CPU and current CUDA device), Python's `random`, and NumPy.
SEED = 0
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, random.seed, np.random.seed):
    seed_fn(SEED)

In [None]:
def _load_graph_split(split):
    """Unpickle the graphs stored for one split ("TRAIN", "VALID" or "TEST")."""
    graphs_path = f"{data_dir_graphs}{dataset}_{split}_event_prediction_newF_onehot_lessEdgesWclass_wRole.pkl"
    # pickle can execute arbitrary code while loading: only open files you produced yourself.
    with open(graphs_path, "rb") as handle:
        return pickle.load(handle)


X_train = _load_graph_split("TRAIN")
X_valid = _load_graph_split("VALID")
X_test = _load_graph_split("TEST")

In [None]:

from torch_geometric.data import Dataset
from torch_geometric.loader import DataLoader
from torch_geometric.transforms import ToUndirected, NormalizeFeatures

transform = ToUndirected()

# Replace every graph of every split, in place, by its transformed version.
# No gradients are needed while preprocessing.
with torch.no_grad():
    for graphs in (X_train, X_valid, X_test):
        for position in range(len(graphs)):
            graphs[position] = transform(graphs[position])
    






In [None]:
# Collect the union of node types and edge types that occur in any graph of any split.
node_type_set = set()
edge_type_set = set()
for graphs in (X_train, X_valid, X_test):
    for graph in graphs:
        graph_node_types, graph_edge_types = graph.metadata()
        node_type_set.update(graph_node_types)
        edge_type_set.update(graph_edge_types)

# Sort instead of calling list() on the sets: set iteration order for strings changes
# between interpreter sessions (hash randomization), which would change the order in
# which the per-relation convolutions are created from `edge_types` and therefore make
# runs non-reproducible even with fixed seeds.
node_types = sorted(node_type_set)
edge_types = sorted(edge_type_set)

In [None]:
#node_types, edge_types = X_train[0].metadata()

In [None]:
def get_weights(load, key):
    """Compute inverse-frequency class weights for one prediction target.

    Parameters
    ----------
    load : iterable
        Yields batches exposing ``batch.y[key]``; this is assumed to be an integer
        tensor of class indices (it is flattened before counting).
    key : str
        Target name. It is used both as a column of ``tab_all`` (to size the weight
        vector) and as a key of ``batch.y``.

    Returns
    -------
    torch.Tensor
        One weight per distinct value of ``tab_all[key]``, placed on ``device``:
        ``total_labels / class_count`` for classes that occur, ``0`` for classes
        that never occur in ``load``.

    Raises
    ------
    IndexError
        If a label is outside ``[0, number_of_classes)``. Negative labels used to be
        counted silently towards the last classes through Python's negative indexing.
    """
    n_classes = len(tab_all[key].unique())
    class_counts = [0] * n_classes

    print(class_counts)

    for batch in load:
        for label in batch.y[key].reshape(-1).tolist():
            if not 0 <= label < n_classes:
                raise IndexError(
                    f"label {label} for {key!r} is outside the valid range [0, {n_classes})"
                )
            class_counts[label] += 1

    total = sum(class_counts)

    print(class_counts)

    # 0.0 (not 0) keeps the tensor floating point even if every class is unseen.
    weights = torch.tensor(
        [total / count if count > 0 else 0.0 for count in class_counts],
        device=device,
    )
    print(weights)
    return weights
    

In [None]:
# Edge type -> number of edge features (second dimension of its edge_attr tensor),
# gathered over every graph of every split.
# The validation and test passes previously indexed X_train[i], so those splits were
# never inspected (and an IndexError was possible if one of them was longer than train).
edge_features_dims = {}
for graphs in (X_train, X_valid, X_test):
    for graph in graphs:
        for relation, edge_attr in graph.edge_attr_dict.items():
            edge_features_dims[relation] = edge_attr.shape[1]
edge_features_dims

In [None]:
# Class weights are estimated on the training split only; one unshuffled loader
# serves both targets.
weights_loader = DataLoader(X_train, batch_size=1024, shuffle=False)
act_weights = get_weights(weights_loader, "Activity")
res_roles_weights = get_weights(weights_loader, "org:resource:role")


## Hyperopt

In [None]:
from ax.service.managed_loop import optimize

In [None]:
from torch_geometric.nn import (
    HeteroConv,
    global_mean_pool,
    GATv2Conv,
)
from torch.nn import (
    ModuleList,
    Module,
    Sequential,
    Linear,
  )
from typing_extensions import Self

In [None]:
class HGNN(Module):
    """Heterogeneous GNN with four prediction heads.

    A stack of ``HeteroConv`` layers (one ``GATv2Conv`` per relation) updates the node
    features; the features of four node types are then mean-pooled per graph and fed
    to one linear head each: activity, timestamp, resource and resource role.
    """

    def __init__(self, out_act, out_org_resource, out_org_resource_role, nodes_relations, relations_with_features, parameters) -> None:
        """Build the convolution stack and the output heads.

        Parameters
        ----------
        out_act, out_org_resource, out_org_resource_role : int
            Output sizes of the activity, resource and resource-role heads.
        nodes_relations : iterable
            Edge types; one ``GATv2Conv`` is created per edge type and per layer.
        relations_with_features : dict
            Edge type -> edge feature dimension. Relations found here get a
            convolution with ``edge_dim`` set; the others get one without it.
        parameters : dict
            Must provide ``"hid"`` (hidden size), ``"layers"`` (number of
            convolution layers), ``"aggregation"`` (``HeteroConv`` ``aggr``) and
            ``"heads"`` (attention heads).
        """
        super().__init__()

        hid = parameters["hid"]
        layers = parameters["layers"]
        aggregation = parameters["aggregation"]
        n_heads = parameters["heads"]

        # List of convolutional layers
        self.convs = ModuleList()
        for _ in range(layers):
            relation_convs = {}
            for relation in nodes_relations:
                conv_kwargs = dict(add_self_loops=False, out_channels=hid, heads=n_heads, concat=False)
                if relation in relations_with_features:
                    conv_kwargs["edge_dim"] = relations_with_features[relation]
                # (-1, -1): input sizes are inferred lazily on the first forward pass.
                relation_convs[relation] = GATv2Conv((-1, -1), **conv_kwargs)
            self.convs.append(HeteroConv(relation_convs, aggr=aggregation))

        # concat=False makes every convolution output `hid` features regardless of
        # the number of heads, so each head reads `hid` inputs.
        self.fc_NA = Linear(hid, out_act)
        self.fc_timestamp = Linear(hid, 1)
        self.fc_org_resource = Linear(hid, out_org_resource)
        self.fc_org_resource_role = Linear(hid, out_org_resource_role)

    def forward(self, batch):
        """Run the network on a batch of heterogeneous graphs.

        Returns
        -------
        list
            ``[activities, timestamps, org_resources, org_resources_role]``: the
            outputs of the three classification heads and, for the timestamp, the
            flattened output of its single-unit head.
        """
        # Work on local dictionaries: the features stored on `batch` are left
        # untouched, so the same batch object can safely be passed through again.
        x_dict = batch.x_dict
        edge_index_dict = batch.edge_index_dict
        edge_attr_dict = batch.edge_attr_dict

        for conv in self.convs:
            x_dict = conv(x_dict, edge_index_dict, edge_attr_dict)
            x_dict = {key: x.relu() for key, x in x_dict.items()}

        # One pooled vector per graph and per target node type.
        act = global_mean_pool(x_dict["Activity"], batch["Activity"].batch)
        tm = global_mean_pool(x_dict["time:timestamp"], batch["time:timestamp"].batch)
        ores = global_mean_pool(x_dict["org:resource"], batch["org:resource"].batch)
        ores_roles = global_mean_pool(x_dict["org:resource:role"], batch["org:resource:role"].batch)

        activities = self.fc_NA(act)
        timestamps = self.fc_timestamp(tm).reshape(1, -1)[0]
        org_resources = self.fc_org_resource(ores)
        org_resources_role = self.fc_org_resource_role(ores_roles)

        return [activities, timestamps, org_resources, org_resources_role]

In [None]:
from torcheval.metrics.functional import multiclass_accuracy, multiclass_f1_score

In [None]:
import torch.nn as nn

In [None]:
import time

In [None]:
from copy import deepcopy


def train_hgnn(config):
    """Train an ``HGNN`` on ``X_train`` with early stopping on ``X_valid``.

    The training objective is the unweighted sum of four losses: class-weighted
    cross-entropy for the activity, L1 for the timestamp, and plain cross-entropy
    for the resource and the resource role.

    Parameters
    ----------
    config : dict
        Hyperparameters. ``"hid"``, ``"layers"``, ``"aggregation"`` and ``"heads"``
        are consumed by ``HGNN``; ``"batch_size"`` and ``"lr"`` are used here.
        Optional: ``"epochs"`` (default 20) and ``"patience"`` (default 5).

    Returns
    -------
    HGNN
        A deep copy of the network taken at the epoch with the lowest validation
        loss. Training stops early once ``patience`` consecutive epochs bring no
        improvement.
    """
    print(config)

    max_epochs = config.get("epochs", 20)
    patience = config.get("patience", 5)

    net = HGNN(
        parameters=config,
        out_act=len(list_activities),
        out_org_resource=len(list_org_resource),
        out_org_resource_role=len(list_org_resource_roles),
        nodes_relations=edge_types,
        relations_with_features=edge_features_dims,
    )
    net.to(device)

    act_loss = nn.CrossEntropyLoss(act_weights)
    timestamp_loss = nn.L1Loss()
    org_resource_loss = nn.CrossEntropyLoss()
    org_resource_roles_loss = nn.CrossEntropyLoss()

    train_loader = DataLoader(X_train, batch_size=config["batch_size"], shuffle=True)
    # Validation is not shuffled: the reported loss is a mean of per-batch means, so a
    # fixed batch composition keeps it comparable across epochs and independent of
    # the random number generator.
    valid_loader = DataLoader(X_valid, batch_size=config["batch_size"], shuffle=False)

    optimizer = torch.optim.Adam(net.parameters(), lr=config["lr"])

    best_model = None
    best_loss = float("inf")
    epochs_without_improvement = 0

    torch.cuda.empty_cache()

    for epoch in range(max_epochs):
        start_time = time.time()

        print(f"Epoch: {epoch}\n")

        # ---- training pass ----
        net.train()
        for x in train_loader:
            x = x.to(device)

            activities_labels = x.y["Activity"]
            timestamp_labels = x.y["time:timestamp"]
            org_resource_labels = x.y["org:resource"]
            org_resource_roles_labels = x.y["org:resource:role"]

            optimizer.zero_grad()
            outputs_act, outputs_timestamp, outputs_org_resource, outputs_org_resource_roles = net(x)

            act_loss_step = act_loss(outputs_act, activities_labels)
            org_resource_loss_step = org_resource_loss(outputs_org_resource, org_resource_labels)
            org_resource_roles_loss_step = org_resource_roles_loss(outputs_org_resource_roles, org_resource_roles_labels)
            timestamp_loss_step = timestamp_loss(outputs_timestamp, timestamp_labels)

            total_loss = (act_loss_step + timestamp_loss_step + org_resource_loss_step + org_resource_roles_loss_step)
            total_loss.backward()
            optimizer.step()

        # ---- validation pass ----
        prediction_activity = []
        target_activity = []

        prediction_resource = []
        target_resource = []

        prediction_resource_roles = []
        target_resource_roles = []

        timestamp_mae_per_batch = []
        running_total_loss = []

        net.eval()
        with torch.no_grad():
            for x in valid_loader:
                x = x.to(device)

                activities_labels = x.y["Activity"]
                timestamp_labels = x.y["time:timestamp"]
                org_resource_labels = x.y["org:resource"]
                org_resource_roles_labels = x.y["org:resource:role"]

                outputs_act, outputs_timestamp, outputs_org_resource, outputs_org_resource_roles = net(x)

                act_loss_step = act_loss(outputs_act, activities_labels)
                org_resource_loss_step = org_resource_loss(outputs_org_resource, org_resource_labels)
                org_resource_roles_loss_step = org_resource_roles_loss(outputs_org_resource_roles, org_resource_roles_labels)
                timestamp_loss_step = timestamp_loss(outputs_timestamp, timestamp_labels)

                running_total_loss.append(act_loss_step.item() + org_resource_loss_step.item() + timestamp_loss_step.item() + org_resource_roles_loss_step.item())

                # argmax of the logits directly: softmax is monotonic, so applying it
                # first cannot change which class is the largest.
                prediction_activity.append(torch.argmax(outputs_act, dim=1))
                target_activity.append(activities_labels)

                prediction_resource.append(torch.argmax(outputs_org_resource, dim=1))
                target_resource.append(org_resource_labels)

                prediction_resource_roles.append(torch.argmax(outputs_org_resource_roles, dim=1))
                target_resource_roles.append(org_resource_roles_labels)

                timestamp_mae_per_batch.append(timestamp_loss_step.item())

        prediction_activity = torch.cat(prediction_activity)
        target_activity = torch.cat(target_activity)

        prediction_resource = torch.cat(prediction_resource)
        target_resource = torch.cat(target_resource)

        prediction_resource_roles = torch.cat(prediction_resource_roles)
        target_resource_roles = torch.cat(target_resource_roles)

        macro_f1_activity = multiclass_f1_score(prediction_activity, target_activity, num_classes=len(list_activities), average="macro")

        act_accuracy = multiclass_accuracy(prediction_activity, target_activity, num_classes=len(list_activities))
        resource_accuracy = multiclass_accuracy(prediction_resource, target_resource, num_classes=len(list_org_resource))
        resource_roles_accuracy = multiclass_accuracy(prediction_resource_roles, target_resource_roles, num_classes=len(list_org_resource_roles))

        # Both figures are means of per-batch values.
        MAE_timestamp = sum(timestamp_mae_per_batch) / len(timestamp_mae_per_batch)
        val_loss = sum(running_total_loss) / len(running_total_loss)

        print("\nVALIDATION")
        print("ACTIVITY: acc {:.4f} MacroF1 {:.4f}".format(act_accuracy, macro_f1_activity))
        print("TIMESTAMP: MAE {:.4f}".format(MAE_timestamp))
        print("RESOURCE: acc {:.4f}".format(resource_accuracy))
        print("RESOURCE_ROLE: acc {:.4f}".format(resource_roles_accuracy))
        print("TOTAL_LOSS: {:.4f}".format(val_loss))
        print("epoch time {}s\n".format(time.time() - start_time))

        # ---- early stopping ----
        # The first epoch always becomes the reference model (even if its loss is NaN),
        # so a model is returned whenever at least one epoch ran.
        if best_model is None or val_loss < best_loss:
            best_loss = val_loss
            best_model = deepcopy(net)
            epochs_without_improvement = 0
            if epoch > 0:
                print("new best model found")
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement == patience:
                print("Validation performance didn\'t improve for {} epochs. Training stops.".format(epochs_without_improvement))
                return best_model

    return best_model
            


    

In [None]:
def test_hgnn(net):
    """Evaluate a trained network on ``X_test``.

    Parameters
    ----------
    net : HGNN
        Network to evaluate; it is switched to eval mode and run without gradients.

    Returns
    -------
    dict
        Plain Python floats under the keys ``"MacroF1Activity"``,
        ``"activity_accuracy"``, ``"resource_accuracy"``,
        ``"resource_roles_accuracy"``, ``"resource_to_role_acc"`` (accuracy of the
        role implied by the predicted resource), ``"MAE_timestamp"`` and
        ``"AVG_total_loss"``. The last two are means of per-batch values.
    """
    test_loader = DataLoader(X_test, batch_size=128, shuffle=False)

    act_loss = nn.CrossEntropyLoss(act_weights)
    timestamp_loss = nn.L1Loss()
    resource_loss = nn.CrossEntropyLoss()
    resource_roles_loss = nn.CrossEntropyLoss()

    prediction_activity = []
    target_activity = []

    prediction_resource = []
    target_resource = []

    prediction_resource_roles = []
    target_resource_roles = []

    timestamp_mae_per_batch = []
    total_loss = []

    net.eval()
    with torch.no_grad():
        for x in test_loader:
            x = x.to(device)
            activities_labels = x.y["Activity"]
            timestamp_labels = x.y["time:timestamp"]
            org_resource_labels = x.y["org:resource"]
            org_resource_roles_labels = x.y["org:resource:role"]

            outputs_act, outputs_timestamp, outputs_org_resource, outputs_org_resource_roles = net(x)

            timestamp_loss_step = timestamp_loss(outputs_timestamp, timestamp_labels)
            act_loss_step = act_loss(outputs_act, activities_labels)
            resource_loss_step = resource_loss(outputs_org_resource, org_resource_labels)
            org_resource_roles_loss_step = resource_roles_loss(outputs_org_resource_roles, org_resource_roles_labels)

            # argmax of the logits directly: softmax is monotonic, so applying it
            # first cannot change which class is the largest.
            prediction_activity.append(torch.argmax(outputs_act, dim=1))
            target_activity.append(activities_labels)

            prediction_resource.append(torch.argmax(outputs_org_resource, dim=1))
            target_resource.append(org_resource_labels)

            prediction_resource_roles.append(torch.argmax(outputs_org_resource_roles, dim=1))
            target_resource_roles.append(org_resource_roles_labels)

            timestamp_mae_per_batch.append(timestamp_loss_step.item())

            total_loss.append(timestamp_loss_step.item() + act_loss_step.item() + resource_loss_step.item() + org_resource_roles_loss_step.item())

    prediction_activity = torch.cat(prediction_activity)
    target_activity = torch.cat(target_activity)

    prediction_resource = torch.cat(prediction_resource)
    target_resource = torch.cat(target_resource)

    prediction_resource_roles = torch.cat(prediction_resource_roles)
    target_resource_roles = torch.cat(target_resource_roles)

    # Role implied by each predicted resource, compared with the true role label.
    # The targets are already role indices, so they must NOT be passed through
    # `map_resource_to_role` (which is keyed by resource index).
    roles_from_predicted_resource = torch.tensor(
        [map_resource_to_role[resource_idx] for resource_idx in prediction_resource.tolist()],
        dtype=target_resource_roles.dtype,
        device=target_resource_roles.device,
    )
    resource_to_role_acc = multiclass_accuracy(
        roles_from_predicted_resource,
        target_resource_roles,
        num_classes=len(list_org_resource_roles),
    )

    act_accuracy = multiclass_accuracy(prediction_activity, target_activity, num_classes=len(list_activities))
    resource_accuracy = multiclass_accuracy(prediction_resource, target_resource, num_classes=len(list_org_resource))
    resource_roles_accuracy = multiclass_accuracy(prediction_resource_roles, target_resource_roles, num_classes=len(list_org_resource_roles))
    MAE_timestamp = sum(timestamp_mae_per_batch) / len(timestamp_mae_per_batch)

    Average_total_loss = sum(total_loss) / len(total_loss)

    macro_f1_activity = multiclass_f1_score(prediction_activity, target_activity, num_classes=len(list_activities), average="macro")

    return {
        "MacroF1Activity": macro_f1_activity.item(),
        "activity_accuracy": act_accuracy.item(),
        "resource_accuracy": resource_accuracy.item(),
        "resource_roles_accuracy": resource_roles_accuracy.item(),
        "resource_to_role_acc": resource_to_role_acc.item(),
        "MAE_timestamp": MAE_timestamp,
        "AVG_total_loss": Average_total_loss,
    }

In [None]:
def train_evaluate(config):
    """Train a network with ``config`` and return the metrics ``test_hgnn`` computes for it."""
    return test_hgnn(train_hgnn(config))

In [None]:
# Search space explored by Ax. Hidden size, number of layers and aggregation are
# pinned to a single value; learning rate, batch size and attention heads are tuned.
# Wider alternatives tried earlier:
#   layers: [2, 3, 4, 5]    heads: [1]    aggregation: ["sum", "mean", "max"]
search_space = [
    {"name": "hid", "type": "choice", "values": [128], "value_type": "int", "is_ordered": True, "sort_values": False},
    {"name": "layers", "type": "choice", "values": [2], "value_type": "int", "is_ordered": True, "sort_values": False},
    {"name": "lr", "type": "range", "bounds": [1e-4, 1e-1], "value_type": "float", "log_scale": True},
    {"name": "batch_size", "type": "choice", "values": [256, 1024], "value_type": "int", "is_ordered": True, "sort_values": False},
    {"name": "heads", "type": "choice", "values": [1, 4], "value_type": "int", "is_ordered": True, "sort_values": False},
    {"name": "aggregation", "type": "choice", "values": ["max"], "value_type": "str"},
]

# NOTE(review): the objective "AVG_total_loss" is produced by test_hgnn, i.e. it is
# measured on X_test, so the hyperparameters are selected on the test split.
best_parameters, values, experiment, model = optimize(
    parameters=search_space,
    evaluation_function=train_evaluate,
    objective_name="AVG_total_loss",
    minimize=True,
    total_trials=20,
    arms_per_trial=1,
    random_seed=123,
)

print(best_parameters)
means, covariances = values
print(means)
print(experiment)

In [None]:
from ax.service.utils.report_utils import exp_to_df

# Convert the Ax experiment into a DataFrame via Ax's `exp_to_df` helper.
results = exp_to_df(experiment)

In [None]:
# Display only: trials ordered by ascending activity accuracy (`results` itself is not modified).
results.sort_values(by="activity_accuracy")

In [None]:
# Keep the trials ordered by ascending average total loss (the optimisation objective).
results = results.sort_values(by="AVG_total_loss")

In [None]:
# Create the output folder first: to_csv does not create missing directories and
# would otherwise fail on a fresh checkout.
os.makedirs("results", exist_ok=True)
results.to_csv("results/BPI12A.csv", sep=",")