In [1]:
import torch
import pickle
import numpy as np
import pandas as pd
import os

from os.path import dirname



root_path = dirname(os.getcwd()) + "/HGNN_NA"

pd.set_option("display.max_columns", None)
data_dir = root_path + "/data/datasets/original/"
data_dir_processed = root_path + "/data/datasets/processed/"
data_dir_graphs = root_path + "/data/datasets/graphs/"

print(root_path, data_dir, data_dir_processed, data_dir_graphs, sep="\n")

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

/home/sebdis/HGNN/HGNN_NA
/home/sebdis/HGNN/HGNN_NA/data/datasets/original/
/home/sebdis/HGNN/HGNN_NA/data/datasets/processed/
/home/sebdis/HGNN/HGNN_NA/data/datasets/graphs/


device(type='cuda', index=0)

In [2]:
dataset = "BPI_Challenge_2012_A"

In [3]:
tab_all = pd.read_csv(data_dir_processed+dataset+"_processed_all.csv")
print(tab_all.head())
list_activities = list(tab_all["Activity"].unique())

   org:resource lifecycle:transition           Activity       time:timestamp  \
0           112             COMPLETE        A_SUBMITTED  2011/09/30 22:38:44   
1           112             COMPLETE  A_PARTLYSUBMITTED  2011/09/30 22:38:44   
2           112             COMPLETE      A_PREACCEPTED  2011/09/30 22:39:37   
3         10862             COMPLETE         A_ACCEPTED  2011/10/01 09:42:43   
4         10862             COMPLETE        A_FINALIZED  2011/10/01 09:45:09   

         case:REG_DATE  CaseID  case:AMOUNT_REQ  
0  2011/10/01 00:38:44  173688            20000  
1  2011/10/01 00:38:44  173688            20000  
2  2011/10/01 00:38:44  173688            20000  
3  2011/10/01 00:38:44  173688            20000  
4  2011/10/01 00:38:44  173688            20000  


In [4]:
import random

torch.manual_seed(0)
torch.cuda.manual_seed(0)
random.seed(0)
np.random.seed(0)

In [22]:
with open(data_dir_graphs + dataset + "_TRAIN.pkl", "rb") as f:
    X_train, Y_train = pickle.load(f)
with open(data_dir_graphs + dataset + "_VALID.pkl", "rb") as f:
    X_valid, Y_valid = pickle.load(f)
with open(data_dir_graphs + dataset + "_TEST.pkl", "rb") as f:
    X_test, Y_test = pickle.load(f)

In [23]:
from typing_extensions import Self
from torch_geometric.data import Dataset
from torch.utils.data import DataLoader
from torch_geometric.transforms import ToUndirected, NormalizeFeatures

transform = ToUndirected()
t2 = NormalizeFeatures()

class Het_graph_data(Dataset):
    def __init__(self, prefix_graphs, labels) -> Self:
        self.X = prefix_graphs
        self.Y = labels

    # get the number of rows in the dataset
    def __len__(self):
        return len(self.Y)

    # get a row at a particular index in the dataset
    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]

    @staticmethod
    def collate(batch):
        # print(batch)
        data = [t2(transform(item[0])) for item in batch]
        Y = [item[1] for item in batch]
        return [data, Y]

In [64]:
train_loader = DataLoader(
    Het_graph_data(X_train, Y_train),
    batch_size=256,
    shuffle=True,
    collate_fn=Het_graph_data.collate,
)


valid_loader = DataLoader(
    Het_graph_data(X_valid, Y_valid),
    batch_size=128,
    shuffle=False,
    collate_fn=Het_graph_data.collate,
)

test_loader = DataLoader(
    Het_graph_data(X_test, Y_test),
    batch_size=1,
    shuffle=False,
    collate_fn=Het_graph_data.collate,
)

In [65]:
# A Class to keep track of the metrics of the classification process
class ClassificationMetrics:

  # Constructor takes the number of classes, in our case 20
  def __init__(self, num_classes=20):
    self.num_classes = num_classes
    # Initialize a confusion matrix
    self.C = torch.zeros(num_classes, num_classes)

  # Update the confusion matrix with the new scores
  def add(self, yp, yt):
    # yp: 1D tensor with predictions
    # yt: 1D tensor with ground-truth targets
    yp = yp.to("cpu")
    yt = yt.to("cpu")
    with torch.no_grad(): # We require no computation graph
      self.C+=(yt*self.C.shape[1]+yp).bincount(minlength=self.C.numel()).view(self.C.shape).float()

  def clear(self):
    # We set the confusion matrix to zero
    self.C.zero_()

  # Computes the global accuracy
  def acc(self):
    return self.C.diag().sum().item()/self.C.sum()

  # Computes the class-averaged accuracy
  def mAcc(self):
    return (self.C.diag()/self.C.sum(-1)).mean().item()

  # Computers the class-averaged Intersection over Union
  def mIoU(self):
    return (self.C.diag()/(self.C.sum(0)+self.C.sum(1)-self.C.diag())).mean().item()

  # Returns the confusion matrix
  def confusion_matrix(self):
    return self.C

In [66]:
loaders = {"train": train_loader, "validation" : valid_loader, "test" : test_loader}

In [61]:
node_types, edge_types = X_train[0].metadata()

In [None]:
X_train[0].x_dict

In [11]:
node_types

['org:resource',
 'lifecycle:transition',
 'Activity',
 'time:timestamp',
 'case:REG_DATE',
 'case:AMOUNT_REQ']

In [27]:
edge_types

[('org:resource', 'related_to', 'org:resource'),
 ('Activity', 'followed_by', 'Activity'),
 ('time:timestamp', 'related_to', 'time:timestamp'),
 ('Activity', 'related_to', 'org:resource'),
 ('Activity', 'related_to', 'lifecycle:transition'),
 ('Activity', 'related_to', 'time:timestamp'),
 ('Activity', 'related_to', 'case:REG_DATE'),
 ('Activity', 'related_to', 'case:AMOUNT_REQ')]

In [67]:
# from models.models import HGNN
import datetime

from torch import nn
from tqdm.notebook import tqdm


In [83]:
from copy import deepcopy
from typing_extensions import Self
from torch_geometric.nn import SAGEConv, HeteroConv, GATConv # Linear, GCNConv
from torch.nn import ModuleList, Module, Sequential, Softmax, Dropout, Linear, ReLU
from torch import mean, stack, sum, concat


class HGNN(Module):

    def __init__(self, hid, out, layers, node_types, nodes_relations) -> Self:  # type: ignore
        super().__init__()

        # List of convolutional layers
        self.convs = ModuleList()
        for _ in range(layers):
            conv = HeteroConv(
                    {relation:SAGEConv((-1, -1), hid) for relation in nodes_relations}
                    # {('Activity', 'followed_by', 'Activity') : SAGEConv((-1,-1), hid),
                    #  ("org:resource", "related_to", 'org:resource') : SAGEConv((-1,-1), hid),
                    #  ('time:timestamp', "related_to", "time:timestamp") : SAGEConv((-1,-1), hid)}
                    ,
                    aggr="mean",
                )
            self.convs.append(conv)
            
        #print(nodes_relations)
        # Take each node hid representation and apply a linear layer
        # self.linear_nodes = Sequential(Linear(hid, hid),ReLU(), Dropout(p=0.5), Linear(hid, hid), ReLU(), Dropout(p=0.5), Linear(hid, int(hid / 2)), ReLU())

        # Return the softmax with the class probabilities
        # self.fc = Sequential(Linear(int(hid/2)*(len(node_types)), out), ReLU(), Softmax(dim=0))
        self.fc = Sequential(Linear(hid*(len(node_types)), 64), ReLU(), Dropout(p=0.4), Linear(64, 64), ReLU(), nn.BatchNorm1d(64), Linear(64, out))

    def forward(self, x, edge_index):
        outs = []
        for i in range(len(x)):
            for conv in self.convs:
                
                x[i] = conv(x[i], edge_index[i])
                
                x[i] = {key: x.relu() for key, x in x[i].items()}

        
            # Node features of each node in the graph
            nodes_features = [
                x[i][key] for key in x[i].keys()
            ]
        
            # Global mean of each node type
            for i in range(len(nodes_features)):
                nodes_features[i] = mean(nodes_features[i], dim=0)

        
            # Global mean pooling
            #nodes_features = mean(stack(nodes_features), dim=0)
            nodes_features = concat(nodes_features)
            outs.append(nodes_features)
        
        out = self.fc(stack(outs))
        

        return out  # {key : self.linear(x_dict[key]) for key in x_dict.keys()}, nodes_features


org:resource, related_to, org:resource)={
    edge_attr=[1, 1],
    edge_index=[2, 1],
  },
  (Activity, followed_by, Activity)={
    edge_attr=[1, 1],
    edge_index=[2, 1],
  },
  (time:timestamp, related_to, time:timestamp)

In [47]:
model = HGNN(hid=128, out=len(list_activities), layers=4, node_types=node_types, nodes_relations=edge_types)
model

HGNN(
  (convs): ModuleList(
    (0-1): 2 x HeteroConv(num_relations=8)
  )
  (linear_nodes): Sequential(
    (0): Linear(in_features=32, out_features=32, bias=True)
    (1): Dropout(p=0.3, inplace=False)
    (2): Linear(in_features=32, out_features=16, bias=True)
  )
  (fc): Sequential(
    (0): Linear(in_features=96, out_features=10, bias=True)
    (1): Softmax(dim=0)
  )
)

In [None]:
X_train[0].x_dict.items()

In [None]:
X_train[0].edge_items()

In [None]:
from torch_geometric.transforms import ToUndirected

X_train[0] = ToUndirected()(X_train[0])

In [None]:
X_train[0].x_dict

In [None]:
X_train[0].edge_items()

In [None]:
X_train[0].is_directed()

In [None]:
with torch.no_grad():
    print(model(X_train[0].x_dict,X_train[0].edge_index_dict))
    print(Y_train[0])

In [None]:
k_t = {}
for i,(x,y) in enumerate(loaders['test']):
    _,pr = torch.max(y[0], dim=0)
    try:
        k_t[pr.item()] += 1
    except KeyError:
        k_t[pr.item()] = 1
k_t

In [None]:
k_t = []
for i,(x,y) in enumerate(loaders['validation']):
    k_t.append("__".join(list(x[0].x_dict.keys())))
    

In [None]:
k_t[:5]

In [None]:
k_t_s = set(k_t)
k_t_s

In [None]:
X_train[2]

In [55]:
from torch import tensor
m  =  ClassificationMetrics(num_classes=5)
m.add(tensor([1,1,1]),tensor([2,2,2]))
print(m.confusion_matrix())

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 3., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])


In [41]:
for i,(x,y) in enumerate(valid_loader):
    print(x[0].edge_index_dict)
    break

{('Activity', 'followed_by', 'Activity'): tensor([[0, 1],
        [1, 0]]), ('time:timestamp', 'related_to', 'time:timestamp'): tensor([[0, 1],
        [1, 0]]), ('Activity', 'related_to', 'org:resource'): tensor([[0, 1],
        [0, 0]]), ('Activity', 'related_to', 'lifecycle:transition'): tensor([[0, 1],
        [0, 0]]), ('Activity', 'related_to', 'time:timestamp'): tensor([[0, 1],
        [0, 1]]), ('Activity', 'related_to', 'case:REG_DATE'): tensor([[0, 1],
        [0, 0]]), ('Activity', 'related_to', 'case:AMOUNT_REQ'): tensor([[0, 1],
        [0, 0]]), ('org:resource', 'rev_related_to', 'Activity'): tensor([[0, 0],
        [0, 1]]), ('lifecycle:transition', 'rev_related_to', 'Activity'): tensor([[0, 0],
        [0, 1]]), ('time:timestamp', 'rev_related_to', 'Activity'): tensor([[0, 1],
        [0, 1]]), ('case:REG_DATE', 'rev_related_to', 'Activity'): tensor([[0, 0],
        [0, 1]]), ('case:AMOUNT_REQ', 'rev_related_to', 'Activity'): tensor([[0, 0],
        [0, 1]])}


In [84]:
from torch import int32

model = HGNN(hid=128, out=len(list_activities), layers=8, node_types=node_types, nodes_relations=edge_types)


num_epochs = 10
best_accuracy = 0
early_stop_patience = 10
lr_value = 0.1

best_model = None

num_runs = 1
running_time = []

metric_tracker = ClassificationMetrics(num_classes=len(list_activities))

for run in range(num_runs):
    
    start = datetime.datetime.now()
    print("Run: {}".format(run + 1))
    
    
    
    model = model.to(device)
    
    #print("compiling...")
    #model = torch.compile(model, dynamic=True)
    #print("Compiled model")
    
    criterion = nn.CrossEntropyLoss()
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr_value) # momentum=0.9, weight_decay=1e-1)

    
    not_improved_count = 0
    
    for epoch in range(num_epochs):
        print(
            "\n-- EPOCH {}/{} -------------------------\n".format(epoch + 1, num_epochs)
        )
        torch.cuda.empty_cache()
        count_train = [0 for _ in range(len(list_activities))]
        count_true_train = [0 for _ in range(len(list_activities))]
        count_val = [0 for _ in range(len(list_activities))]
        count_true_val = [0 for _ in range(len(list_activities))]
        for state in ["train", "validation"]:
            if state == "train":
                model.train()
                metric_tracker.clear()
                # true_label = []
                # predictions = []
                # scores = []

            else:
                
                # print(metric_tracker.confusion_matrix())
                #print(count_train)
                #print(count_true_train)
                
                print("\tTRAIN | acc: {:.4f} | mAcc: {:.4f} | mIoU: {:.4f}".format(metric_tracker.acc(),
                                                                                   #metric_tracker.mAcc(),
                                                                                   #metric_tracker.mIoU()
                                                                                   0,0))
                
                # y_true = np.concatenate(true_label)
                # y_pred = np.concatenate(predictions)
                # scores = np.concatenate(scores)
                
                # print("TRAIN")
                # print(conta(y_true, 0), conta(y_true, 1))
                # print(conta(y_pred, 0), conta(y_pred, 1))
                
                # print_stats(y_pred, y_true, scores)
                metric_tracker.clear()
                model.eval()
                # true_label = []
                # predictions = []
                # scores = []
            
            running_loss = 0.0
            running_corrects = 0

            for i,(x,y) in tqdm(enumerate(loaders[state])):
                
                if i % 1 == 0:
                    print(metric_tracker.acc())
                    # print(running_loss/i)
                    print(metric_tracker.confusion_matrix())
                    #print(count_train)
                    #print(count_true_train)
                    print("//"*14)
                # print("X")
                # print("//"*50)
                # print(x)
                # print("//"*50)
                # print(y)
                x = [xx.to(device) for xx in x]
                x_edge_index_dicts = [xx.edge_index_dict for xx in x]
                x_dicts = [xx.x_dict for xx in x]
                # print("//"*50)
                #print(x)
                #print(x.edge_index_dict)
                #print(x.x_dict)
                # print("//"*50)
                
                y = [yy.to(device) for yy in y]
                y = stack(y)
                
                #x = [[sub_item.to(device=device) for sub_item in item] for item in x]


                #y = torch.tensor([torch.max(yi,0)[1] for yi in y])

                #y = y.to(device)
                
                outputs = model(x_dicts, x_edge_index_dicts)
                
                # print(outputs)
                # print("//"*30)
                
                
                # outputs = outputs.to(device)
                
                #print(outputs)
                #print(y)
                
                loss = criterion(outputs, y)

                if state == "train":
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    
                _, preds = torch.max(outputs, 1)
                
                # print(y)
                _, true_preds = torch.max(y, 1)                  
                # print(true_preds)
                preds = preds.to(device)
                true_preds = true_preds.to(device)
                # if state == "train": # For now batch is set to one
                #     #for i in preds:
                #     count_train[preds] += 1
                #     count_true_train[true_preds] += 1
                #     
                # else:
                #     #for i in preds:
                #     count_val[preds] += 1      
                #     count_true_val[true_preds] += 1      
                      
                      
                running_loss += loss.item()
                
                # print(outputs, y, sep="\n")
                # print(preds, true_preds, sep="\n")
                # print(preds)
                # print(true_preds)
                metric_tracker.add(torch.tensor(preds.to(int32)), torch.tensor(true_preds.to(int32)))
                # print(metric_tracker.confusion_matrix())
                
                
        print(count_val)
        print("\tEVAL  | acc: {:.4f} | mAcc: {:.4f} | mIoU: {:.4f}\n".format(metric_tracker.acc(),
                                                                             # metric_tracker.mAcc(),
                                                                             # metric_tracker.mIoU()
                                                                             # )
                                                                            0,0)   )     
                   

        if epoch == 0:# HERE WE KEEP BEST AUC VALUE
            best_accuracy = metric_tracker.acc()
            
        else:
            if metric_tracker.acc() > best_accuracy:
                print("SAVING MODEL..............\n")
                best_accuracy = metric_tracker.acc()
                not_improved_count = 0
            else:
                not_improved_count += 1

        if not_improved_count == early_stop_patience:
            print(
                "Validation performance didn't improve for {} epochs. "
                "Training stops.".format(early_stop_patience)
            )
            break

    running_time.append((datetime.datetime.now() - start).total_seconds())

Run: 1

-- EPOCH 1/10 -------------------------



0it [00:00, ?it/s]

tensor(nan)
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
////////////////////////////


  metric_tracker.add(torch.tensor(preds.to(int32)), torch.tensor(true_preds.to(int32)))


tensor(0.0977)
tensor([[ 2.,  7.,  5.,  7.,  3.,  2.,  0.,  1.,  7.,  5.],
        [ 1.,  1.,  1.,  3.,  1.,  1.,  1.,  0.,  3.,  1.],
        [ 3.,  0.,  1.,  3.,  3.,  1.,  1.,  1.,  4.,  1.],
        [ 4.,  2.,  2.,  4.,  3.,  3.,  0.,  1.,  1.,  2.],
        [ 3.,  7.,  4.,  5.,  9.,  2.,  2.,  8., 11.,  6.],
        [ 3.,  7.,  3.,  6.,  3.,  3.,  6.,  1.,  7.,  2.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 6.,  9.,  6.,  3.,  5.,  3.,  2.,  4.,  6.,  7.],
        [ 0.,  5.,  1.,  3.,  0.,  1.,  1.,  2.,  1.,  1.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])
////////////////////////////
tensor(0.1426)
tensor([[ 5., 12.,  5.,  7., 18., 11.,  2.,  2.,  8.,  9.],
        [ 3.,  1.,  1.,  3.,  8.,  7.,  1.,  2.,  5.,  3.],
        [ 6.,  1.,  2.,  3.,  8.,  3.,  1.,  1.,  4.,  3.],
        [11.,  3.,  2.,  4.,  7.,  6.,  0.,  4.,  2.,  2.],
        [10.,  8.,  6.,  9., 41.,  8.,  2., 14., 12., 10.],
        [11.,  9.,  7.,  6., 14.,  8.,  

KeyboardInterrupt: 