In [None]:
import torch
import pickle
import numpy as np
import pandas as pd
import os
import json 
from os.path import dirname



# Project root: the "SEPH_outcome" directory inside the parent of the current working directory.
root_path = f"{dirname(os.getcwd())}/SEPH_outcome"

# Do not truncate columns when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Dataset folders; each keeps its trailing slash because later cells append file names directly.
data_dir = f"{root_path}/data/datasets/original/"
data_dir_processed = f"{root_path}/data/datasets/processed/"
data_dir_graphs = f"{root_path}/data/datasets/graphs_repair/"

print("\n".join([root_path, data_dir, data_dir_processed, data_dir_graphs]))

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Feature metadata of every dataset; the path is resolved against the current working directory.
with open("data/dataset_features.json", mode="r") as features_file:
    datasets_info = json.load(features_file)

In [None]:
# Names of the datasets described in the feature file.
list(datasets_info)

In [None]:
# Dataset to run the search on. The name is used as the key into the feature
# JSON and as the file-name prefix of the processed CSV, the graph pickles and
# the exported results.
dataset = "BPI12_DECLINED_COMPLETE"

In [None]:
# The feature file was already parsed into `datasets_info` above, so select the
# entry for the chosen dataset instead of opening and parsing the file again.
dataset_info = datasets_info[dataset]

In [None]:
# Column-name lists of the selected dataset, split by feature kind.
categorical_columns, real_value_columns = (
    dataset_info[kind] for kind in ("categorical", "numerical")
)

In [None]:
# Load the processed CSV of the selected dataset and preview its first rows.
tab_all = pd.read_csv(f"{data_dir_processed}{dataset}_processed_all.csv")
tab_all.head()

In [None]:
import random

# Seed every random number generator used by the notebook with the same value (0):
# PyTorch (CPU and CUDA), Python's `random` module and NumPy's legacy global generator.
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, random.seed, np.random.seed):
    seed_fn(0)

In [None]:
def _load_graph_split(split):
    """Unpickle the object stored for one split name ("TRAIN", "VALID" or "TEST")."""
    with open(f"{data_dir_graphs}{dataset}_{split}_repair.pkl", "rb") as handle:
        return pickle.load(handle)


# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
X_train = _load_graph_split("TRAIN")
X_valid = _load_graph_split("VALID")
X_test = _load_graph_split("TEST")

In [None]:

from torch_geometric.data import Dataset
from torch_geometric.loader import DataLoader
from torch_geometric.transforms import ToUndirected, NormalizeFeatures

# Replace every graph of every split, in place, by its ToUndirected() version.
transform = ToUndirected()

with torch.no_grad():
    for split in (X_train, X_valid, X_test):
        for idx in range(len(split)):
            split[idx] = transform(split[idx])
    






In [None]:
# Union of the node types and edge types reported by `metadata()` over all
# graphs of the three splits.
edge_types = set()
node_types = set()
for split in (X_train, X_valid, X_test):
    for graph in split:
        graph_node_types, graph_edge_types = graph.metadata()
        node_types.update(graph_node_types)
        edge_types.update(graph_edge_types)



In [None]:
# Sort instead of calling list(): the iteration order of a set of strings/tuples
# depends on hash randomisation and can differ between kernel sessions.
# `edge_types` is later handed to the model as its relation list, so a stable
# order is needed for the fixed seeds to give reproducible runs.
# Sorting also keeps this cell safe to re-run.
node_types = sorted(node_types)
edge_types = sorted(edge_types)

In [None]:
# Display the node types collected from the three splits.
node_types

In [None]:
# Display the edge types collected from the three splits.
edge_types

## Hyperopt

In [None]:
from ax.service.managed_loop import optimize

In [None]:
from torch_geometric.nn import (
    HeteroConv,
    global_mean_pool,
    GATv2Conv,
    SAGEConv,
    TransformerConv
)
from torch.nn import (
    ModuleList,
    Module,
    Linear
  )
from typing_extensions import Self

In [None]:
class HGNN(Module):
    """Heterogeneous GraphSAGE network with one linear output head per target.

    A stack of ``HeteroConv`` layers (one ``SAGEConv`` per relation) is followed
    by a ``Linear`` head for every categorical and every real-valued target.

    Args:
        output_cat: Mapping from categorical target name to its number of
            classes (used as the head's output size).
        output_real: Iterable of real-valued target names (one output each).
        nodes_relations: Iterable of relations used as keys of each
            ``HeteroConv`` layer.
        parameters: Mapping with the keys ``"hid"`` (hidden size), ``"layers"``
            (number of convolution layers) and ``"aggregation"`` (passed as
            ``aggr`` to both ``SAGEConv`` and ``HeteroConv``).

    Attributes:
        convs: ``ModuleList`` with the convolution layers.
        heads: ``ModuleList`` holding every output head so that the heads are
            registered sub-modules.
        FC: Plain dict ``target name -> head`` used for lookup in ``forward``.
    """

    def __init__(self, output_cat, output_real, nodes_relations, parameters) -> None:
        super().__init__()

        hid = parameters["hid"]
        layers = parameters["layers"]
        aggregation = parameters["aggregation"]

        self.output_cat = output_cat
        self.output_real = output_real

        # Convolutional layers
        self.convs = ModuleList()
        for _ in range(layers):
            conv = HeteroConv(
                {
                    relation: SAGEConv(
                        (-1, -1), aggr=aggregation, out_channels=hid, normalize=False
                    )
                    for relation in nodes_relations
                },
                aggr=aggregation,
            )
            self.convs.append(conv)

        # Fully connected heads.
        # A plain dict alone would hide the heads from `parameters()`, `.to()`
        # and `state_dict()`, so the optimizer would never update them.  Every
        # head is therefore registered in a ModuleList; the dict is kept only
        # as a name -> module lookup (a ModuleDict is avoided because it
        # restricts which strings may be used as keys).
        self.heads = ModuleList()
        self.FC = {}
        for k in output_cat:
            self._add_head(k, hid, output_cat[k])
        for k in output_real:
            self._add_head(k, hid, 1)

    def _add_head(self, name, in_features, out_features):
        """Create a linear head, register it and index it under ``name``."""
        head = Linear(in_features, out_features)
        self.heads.append(head)
        self.FC[name] = head

    def forward(self, batch):
        """Return a dict ``target name -> prediction`` for the rows selected by ``batch.mask``.

        Reads ``batch.x_dict``, ``batch.edge_index_dict`` and ``batch.mask``;
        the batch itself is not modified.  Categorical targets get the raw
        head output (one column per class); real-valued targets are flattened
        to one dimension.
        """
        # Forward pass for convolutional layers (ReLU after each layer).
        x_dict = batch.x_dict
        for conv in self.convs:
            x_dict = conv(x_dict, batch.edge_index_dict)
            x_dict = {key: x.relu() for key, x in x_dict.items()}

        # Output predictions
        output = {}
        for k in self.output_cat:
            output[k] = self.FC[k](x_dict[k][batch.mask])
        for k in self.output_real:
            # reshape(-1) also works when the mask selects no rows.
            output[k] = self.FC[k](x_dict[k][batch.mask]).reshape(-1)

        return output

In [None]:
from torcheval.metrics.functional import multiclass_accuracy, multiclass_f1_score

In [None]:
import torch.nn as nn

In [None]:
import time

In [None]:
from copy import deepcopy
from tqdm.notebook import tqdm

def train_hgnn(config, output_cat, output_real, epochs=20):
    """Train an ``HGNN`` and return the copy with the lowest validation loss.

    Uses the notebook globals ``X_train``, ``X_valid``, ``edge_types`` and
    ``device``.

    Args:
        config: Hyper-parameter mapping.  ``"batch_size"`` and ``"lr"`` are read
            here; the whole mapping is also passed to ``HGNN`` as ``parameters``.
        output_cat: Mapping from categorical target name to its number of
            classes; these targets are trained with cross-entropy.
        output_real: Iterable of real-valued target names; these targets are
            trained with L1 loss.
        epochs: Maximum number of epochs.

    Returns:
        A deep copy of the network taken after the epoch with the lowest mean
        validation loss, or ``None`` when ``epochs`` is 0.  Training stops early
        after 5 consecutive epochs without improvement.
    """
    print(config)

    net = HGNN(
        parameters=config,
        output_cat=output_cat,
        output_real=output_real,
        nodes_relations=edge_types,
    ).to(device)

    # One criterion per target; the training objective is their plain sum.
    losses = {k: nn.CrossEntropyLoss() for k in output_cat}
    losses.update({k: nn.L1Loss() for k in output_real})

    train_loader = DataLoader(X_train, batch_size=config["batch_size"], shuffle=True)
    # Validation only aggregates a loss: a fixed order keeps the batches (and so
    # the mean of per-batch losses) identical from one epoch to the next and does
    # not consume random numbers.
    valid_loader = DataLoader(X_valid, batch_size=config["batch_size"], shuffle=False)

    optimizer = torch.optim.Adam(net.parameters(), lr=config["lr"])

    best_model = None
    best_loss = float("inf")
    patience = 5
    epochs_without_improvement = 0

    torch.cuda.empty_cache()

    for _ in tqdm(range(epochs)):
        # ---- training pass ----
        net.train()
        for batch in train_loader:
            batch = batch.to(device)
            labels = {k: batch.y[k][batch.mask] for k in batch.y}

            optimizer.zero_grad()
            outputs = net(batch)
            total_loss = sum(losses[k](outputs[k], labels[k]) for k in losses)
            total_loss.backward()
            optimizer.step()

        # ---- validation pass ----
        # Only the summed loss is needed for model selection.
        batch_losses = []
        net.eval()
        with torch.no_grad():
            for batch in valid_loader:
                batch = batch.to(device)
                labels = {k: batch.y[k][batch.mask] for k in batch.y}
                outputs = net(batch)
                batch_losses.append(
                    sum(losses[k](outputs[k], labels[k]).item() for k in losses)
                )
        val_loss = sum(batch_losses) / len(batch_losses)

        # ---- early stopping ----
        # The first epoch always becomes the initial best model.
        if best_model is None or val_loss < best_loss:
            best_loss = val_loss
            best_model = deepcopy(net)
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                break

    return best_model

In [None]:
def test_hgnn(net, output_cat, output_real, data=None, batch_size=128):
    """Evaluate a trained network and return its metrics as a flat dict.

    Uses the notebook global ``device`` and, when ``data`` is omitted, ``X_test``.

    Args:
        net: Trained model; called as ``net(batch)`` and expected to return a
            dict ``target name -> prediction``.
        output_cat: Mapping from categorical target name to its number of classes.
        output_real: Iterable of real-valued target names.
        data: Graphs to evaluate on.  Defaults to ``X_test``.
        batch_size: Evaluation batch size.

    Returns:
        Dict with ``"<name>_acc"`` for every categorical target, ``"<name>_mae"``
        for every real-valued target and ``"AVG_total_loss"``.  The MAE and the
        total loss are means of per-batch values, so a smaller final batch
        weighs as much as a full one.
    """
    if data is None:
        data = X_test
    loader = DataLoader(data, batch_size=batch_size, shuffle=False)

    losses = {k: nn.CrossEntropyLoss() for k in output_cat}
    losses.update({k: nn.L1Loss() for k in output_real})

    predictions_categorical = {k: [] for k in output_cat}
    target_categorical = {k: [] for k in output_cat}
    mae_per_batch = {k: [] for k in output_real}
    total_loss = []

    net.eval()
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            labels = {k: batch.y[k][batch.mask] for k in batch.y}

            outputs = net(batch)

            losses_step = {k: losses[k](outputs[k], labels[k]).item() for k in losses}
            total_loss.append(sum(losses_step.values()))

            for k in output_cat:
                # Softmax is monotonic, so the arg-max of the raw scores is the
                # predicted class.
                predictions_categorical[k].append(torch.argmax(outputs[k], dim=1))
                target_categorical[k].append(labels[k])

            for k in output_real:
                mae_per_batch[k].append(losses_step[k])

    accuracy = {
        k: multiclass_accuracy(
            torch.cat(predictions_categorical[k]),
            torch.cat(target_categorical[k]),
            num_classes=output_cat[k],
        )
        for k in output_cat
    }
    avg_MAE = {k: sum(values) / len(values) for k, values in mae_per_batch.items()}
    average_total_loss = sum(total_loss) / len(total_loss)

    res = (
        {f"{k}_acc": accuracy[k].item() for k in accuracy}
        | {f"{k}_mae": avg_MAE[k] for k in avg_MAE}
        | {"AVG_total_loss": average_total_loss}
    )

    print(res)

    return res

In [None]:
# `list_unique` is not defined anywhere in this notebook, so on a fresh kernel
# this cell would stop with a NameError.  When no earlier session provided it,
# fall back to the values found in the processed table.
# NOTE(review): assumes the graph labels of each categorical target index into
# the unique values of the matching `tab_all` column -- confirm against the
# preprocessing code that builds the graphs.
if "list_unique" not in globals():
    list_unique = {col: tab_all[col].unique().tolist() for col in categorical_columns}

# Number of classes per categorical target, and the names of the real-valued targets.
outputcat = {k: len(list_unique[k]) for k in list_unique}
outputreal = real_value_columns
print(outputcat)
print(outputreal)

In [None]:
def train_evaluate(config):
    """Evaluation function for Ax: train one configuration and return its metrics.

    The model is trained with ``train_hgnn`` for at most 50 epochs and then
    scored with ``test_hgnn``; the resulting metric dict is returned unchanged.

    NOTE(review): ``test_hgnn`` scores on ``X_test``, so the hyper-parameter
    search that uses this function optimises a test-set metric -- confirm that
    this is intended.
    """
    model = train_hgnn(config, output_cat=outputcat, output_real=outputreal, epochs=50)
    metrics = test_hgnn(model, output_cat=outputcat, output_real=outputreal)
    return metrics

In [None]:
import logging
import warnings

# Only let errors through on the root logger.  `getLogger()` without a name is
# the root logger on every Python version, whereas `getLogger("root")` resolves
# to it only on Python 3.9 and later.
logging.getLogger().setLevel(logging.ERROR)

# Hide UserWarning messages.
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
# Search space: hidden size and depth are pinned to one value each, while the
# learning rate (log scale), the batch size and the aggregation are tuned.
# Depth was previously searched over [2, 3, 4, 5].
search_space = [
    {
        "name": "hid",
        "type": "choice",
        "values": [128],
        "value_type": "int",
        "is_ordered": True,
        "sort_values": False,
    },
    {
        "name": "layers",
        "type": "choice",
        "values": [2],
        "value_type": "int",
        "is_ordered": True,
        "sort_values": False,
    },
    {
        "name": "lr",
        "type": "range",
        "bounds": [1e-4, 1e-1],
        "value_type": "float",
        "log_scale": True,
    },
    {
        "name": "batch_size",
        "type": "choice",
        "values": [128, 256, 512],
        "value_type": "int",
        "is_ordered": True,
        "sort_values": False,
    },
    {
        "name": "aggregation",
        "type": "choice",
        "values": ["sum", "mean", "max"],
        "value_type": "str",
    },
]

# Run 30 single-arm trials that minimise the "AVG_total_loss" entry returned by
# `train_evaluate`.
best_parameters, values, experiment, model = optimize(
    parameters=search_space,
    evaluation_function=train_evaluate,
    objective_name="AVG_total_loss",
    arms_per_trial=1,
    minimize=True,
    random_seed=123,
    total_trials=30,
)

print(best_parameters)
means, covariances = values
print(means)
print(experiment)

In [None]:
from ax.service.utils.report_utils import exp_to_df

# Tabulate the trials of the finished experiment.
results = exp_to_df(experiment)

In [None]:
# Preview the trials ordered by the objective, lowest loss first.
results.sort_values(by="AVG_total_loss")

In [None]:
# Keep the trials in ascending order of the objective.
results = results.sort_values(by="AVG_total_loss")

In [None]:
# Create the output folder first: without it `to_csv` raises and the outcome of
# the whole search would not be written to disk.
os.makedirs("results", exist_ok=True)
results.to_csv(f"results/{dataset}.csv", sep=",")