In [1]:
import os
from os.path import dirname
root_path = dirname(dirname(os.getcwd()))
print(root_path)
import sys
sys.path.append(root_path + '/RemainingCycleTimePrediction/2_Scripts/')
import pandas as pd
import numpy as np
import time, datetime
import pickle as pkl
import copy

import torch 
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import GatedGraphConv, global_mean_pool
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import Subset
from sklearn.model_selection import train_test_split

from ax.plot.contour import plot_contour
from ax.plot.trace import optimization_trace_single_method
from ax.service.managed_loop import optimize
from ax.utils.notebook.plotting import render

from Event_log_processing_utils import Extract_trace_and_temporal_features, Extract_prefix
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings("ignore")

data_dir = root_path + '/RemainingCycleTimePrediction/1_Data/'
project_dir = root_path + '/RemainingCycleTimePrediction/'

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

/home/sebdis/ProcessMining/RemainingTime




/home/sebdis/ProcessMining/RemainingTime
cuda:0


In [2]:
import random

torch.manual_seed(0)
torch.cuda.manual_seed(0)
random.seed(0)
np.random.seed(0)

## 1. Load data

In [3]:
torch.cuda.device_count()

1

In [4]:
# data_name = 'Helpdesk'
# data_name = "env_permit"
# data_name = "BPI_Challenge_2012"
# data_name = "BPI_Challenge_2012_A"


#TODO remaining
# troppo grande data_name = "BPI_Challenge_2012_Complete"

# data_name = "BPI_Challenge_2012_O"

# data_name = "BPI_Challenge_2012_W"


# data_name = "BPI_Challenge_2012_W_Complete"

# data_name = "BPI_Challenge_2013_closed_problems"
data_name = "BPI_Challenge_2013_incidents"

In [5]:
tab_all = pd.read_csv(data_dir+data_name+"_processed_all.csv")
tab_train = pd.read_csv(data_dir+data_name+"_processed_train.csv")
tab_valid = pd.read_csv(data_dir+data_name+"_processed_valid.csv")
tab_test = pd.read_csv(data_dir+data_name+"_processed_test.csv")
tab_all.head()

Unnamed: 0,Activity,timestamp,Case_ID
0,Accepted+In Progress,2010/03/31 14:59:42,1-364285768
1,Accepted+In Progress,2010/03/31 15:00:56,1-364285768
2,Queued+Awaiting Assignment,2010/03/31 15:45:48,1-364285768
3,Accepted+In Progress,2010/04/06 14:44:07,1-364285768
4,Queued+Awaiting Assignment,2010/04/06 14:44:38,1-364285768


## 2. Prepare inputs and outputs for model training

In [6]:
list_activities = list(tab_all["Activity"].unique())
#creating instance of one-hot-encoder and fit on the whole dataset
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(np.array(list_activities).reshape((len(list_activities), 1)))

lines, lines_t, lines_t2, lines_t3, lines_t4 = Extract_trace_and_temporal_features(tab_all)
maxlen = max([len(x) for x in lines]) #find maximum line size
lines, lines_t, lines_t2, lines_t3, lines_t4 = Extract_trace_and_temporal_features(tab_train)
divisor = np.mean([item for sublist in lines_t for item in sublist]) #average time between events
print('divisor: {}'.format(divisor))
divisor2 = np.mean([item for sublist in lines_t2 for item in sublist]) #average time between current and first events
print('divisor2: {}'.format(divisor2))
prefixes, outputs = Extract_prefix(lines, lines_t, lines_t2, lines_t3, lines_t4)
divisor_rt = np.mean(outputs[2])
print('divisor_rt: {}'.format(divisor_rt))

divisor: 148617.25690295128
divisor2: 710873.5891046532
divisor_rt: 1879217.2841009742


In [7]:
with open(data_dir+"GGNN_"+data_name+"_NextActivity_train.pkl", "rb") as f:
    X_train, Y_train =  pkl.load(f)
with open(data_dir+"GGNN_"+data_name+"_NextActivity_valid.pkl", "rb") as f:
    X_valid, Y_valid =  pkl.load(f)
with open(data_dir+"GGNN_"+data_name+"_NextActivity_test.pkl", "rb") as f:
    X_test, Y_test =  pkl.load(f)

In [8]:
class EventLogData(Dataset):
    def __init__ (self, input_x, output):
        self.X = input_x[0]
        self.A = input_x[1]
        self.V = input_x[2]
        self.y = output
        self.y = self.y.to(torch.float32)
        # self.y = self.y.reshape((len(self.y),1))

    #get the number of rows in the dataset
    def __len__(self):
        return len(self.X)

    #get a row at a particular index in the dataset
    def __getitem__ (self,idx):
        return [[self.X[idx], self.A[idx], self.V[idx]],self.y[idx]]
    
     # get the indices for the train and test rows
    def get_splits(self, n_valid = 0.2):
        train_idx,valid_idx = train_test_split(list(range(len(self.X))),test_size = n_valid, shuffle = True)
        train = Subset(self, train_idx)
        valid = Subset(self, valid_idx)
        return train, valid
    
def my_collate(batch):
    data = [item[0] for item in batch]
    Y = [item[1] for item in batch]
    return [data, Y]

In [9]:
valid_loader = DataLoader(EventLogData(X_valid, Y_valid),
                                batch_size=len(X_valid[0]),
                                shuffle=False, collate_fn=my_collate)
test_loader = DataLoader(EventLogData(X_test, Y_test),
                                batch_size=1,
                                shuffle=False, collate_fn=my_collate)


In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

## 4. Re-Train model with tuned hyperparameters

In [11]:
# A Class to keep track of the metrics of the classification process
class ClassificationMetrics:

  # Constructor takes the number of classes, in our case 20
  def __init__(self, num_classes=20):
    self.num_classes = num_classes
    # Initialize a confusion matrix
    self.C = torch.zeros(num_classes, num_classes)

  # Update the confusion matrix with the new scores
  def add(self, yp, yt):
    # yp: 1D tensor with predictions
    # yt: 1D tensor with ground-truth targets
    yp = yp.to("cpu")
    yt = yt.to("cpu")
    with torch.no_grad(): # We require no computation graph
      self.C+=(yt*self.C.shape[1]+yp).bincount(minlength=self.C.numel()).view(self.C.shape).float()

  def clear(self):
    # We set the confusion matrix to zero
    self.C.zero_()

  # Computes the global accuracy
  def acc(self):
    return self.C.diag().sum().item()/self.C.sum()

  # Computes the class-averaged accuracy
  def mAcc(self):
    return (self.C.diag()/self.C.sum(-1)).mean().item()

  # Computers the class-averaged Intersection over Union
  def mIoU(self):
    return (self.C.diag()/(self.C.sum(0)+self.C.sum(1)-self.C.diag())).mean().item()

  # Returns the confusion matrix
  def confusion_matrix(self):
    return self.C

In [12]:
# Creating the model class
class GGNN_model(nn.Module):
    def __init__(self, ggnn_dim, num_layers, droppout_prob, list_activities):
        super(GGNN_model, self).__init__()        
        self.ggnn_dim = ggnn_dim
        self.num_layers = num_layers
        self.droppout_prob = droppout_prob
        
        self.ggnn = GatedGraphConv(self.ggnn_dim, num_layers=self.num_layers)
        self.fc = nn.Sequential(
            nn.Dropout(p = self.droppout_prob),
            nn.Linear(self.ggnn_dim,128),
            nn.ReLU(),
            nn.Linear(128,64),
            nn.ReLU(),
            nn.Dropout(p = self.droppout_prob),
            nn.Linear(64,len(list_activities))
        )
        
    
    # Progresses data across layers    
    def forward(self, x):
        x = [self.ggnn(X, A.to(torch.long), V) for i, (X, A, V ) in enumerate(x)]
        x = torch.stack([global_mean_pool(single_x, batch = None) for single_x in x])
        x = x.squeeze(1)
        out = self.fc(x)
        return out

In [13]:
batch_size = 256
ggnn_dim = 256
num_layers = 4
lr_value = 0.01
droppout_prob = 0.3
hidden_channels = 4

train_loader = DataLoader(EventLogData(X_train,Y_train), batch_size=batch_size, 
                          shuffle=True, collate_fn=my_collate)

In [14]:
model_name = "_model_Gated_GNN"

In [40]:
from tqdm import tqdm_notebook as tqdm

save_folder = project_dir + "/5_Output_files/Next_Activity/" + data_name + model_name

if not os.path.exists(save_folder):
    os.mkdir(save_folder)

In [15]:
loaders = {"train": train_loader, "validation" : valid_loader, "test" : test_loader}

In [16]:
# save_folder = project_dir + '5_Output_files/Remaining_time_prediction/'+data_name+'_model_Gated_GNN'


num_epochs = 40
best_accuracy = 0
early_stop_patience = 10

best_model = None

num_runs = 1
running_time = []

metric_tracker = ClassificationMetrics(num_classes=len(list_activities))

for run in range(num_runs):
    
    start = datetime.datetime.now()
    print("Run: {}".format(run + 1))
    
    model = GGNN_model(ggnn_dim, num_layers, droppout_prob,list_activities)
    
    model = model.to(device)
    print("compiling...")
    model = torch.compile(model, dynamic=True)
    print("Compiled model")
    
    criterion = nn.CrossEntropyLoss()
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr_value) # momentum=0.9, weight_decay=1e-1)

    
    not_improved_count = 0
    
    for epoch in range(num_epochs):
        print(
            "\n-- EPOCH {}/{} -------------------------\n".format(epoch + 1, num_epochs)
        )
        torch.cuda.empty_cache()
        count_train = [0 for _ in range(len(list_activities))]
        count_val = [0 for _ in range(len(list_activities))]
        for state in ["train", "validation"]:
            if state == "train":
                model.train()
                metric_tracker.clear()
                # true_label = []
                # predictions = []
                # scores = []

            else:
                
                # print(metric_tracker.confusion_matrix())
                print(count_train)
                
                print("\tTRAIN | acc: {:.4f} | mAcc: {:.4f} | mIoU: {:.4f}".format(metric_tracker.acc(),
                                                                                   #metric_tracker.mAcc(),
                                                                                   #metric_tracker.mIoU()
                                                                                   0,0))
                
                # y_true = np.concatenate(true_label)
                # y_pred = np.concatenate(predictions)
                # scores = np.concatenate(scores)
                
                # print("TRAIN")
                # print(conta(y_true, 0), conta(y_true, 1))
                # print(conta(y_pred, 0), conta(y_pred, 1))
                
                # print_stats(y_pred, y_true, scores)
                metric_tracker.clear()
                model.eval()
                # true_label = []
                # predictions = []
                # scores = []
            
            running_loss = 0.0
            running_corrects = 0

            for i,(x,y) in tqdm(enumerate(loaders[state])):
                x = [[sub_item.to(device=device) for sub_item in item] for item in x]

                
                
    
                # true_label.append(y.numpy())
                y = torch.tensor([torch.max(yi,0)[1] for yi in y])
                # y = torch.stack(y, dim=0)
                # print(y)
                # print(y.shape)
                y = y.to(device)
                
                outputs = model(x)
                
                # print(outputs)
                
                outputs = outputs.to(device)
                
                loss = criterion(outputs, y)

                if state == "train":
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    
                _, preds = torch.max(outputs, 1)                
                preds = preds.to(device)
                if state == "train":
                    for i in preds:
                        count_train[i] += 1
                else:
                    for i in preds:
                        count_val[i] += 1
                # print(preds)
                
                
                metric_tracker.add(preds, y)
                
                # running_loss += loss.item() * x.size(0)
                # running_corrects += torch.sum(preds == y.data)
                            
            # epoch_loss = running_loss / len(dataset[state])
            # epoch_acc = running_corrects.double() / len(dataset[state])
            
        # print(metric_tracker.confusion_matrix())
        print(count_val)
        print("\tEVAL  | acc: {:.4f} | mAcc: {:.4f} | mIoU: {:.4f}\n".format(metric_tracker.acc(),
                                                                             # metric_tracker.mAcc(),
                                                                             # metric_tracker.mIoU()
                                                                             # )
                                                                            0,0)   )     
                   

        if epoch == 0:# HERE WE KEEP BEST AUC VALUE
            best_accuracy = metric_tracker.acc()
            torch.save(
                model.state_dict(),
                "{}/best_model_run_{}.pt".format(save_folder, run + 1),
            )
        else:
            if metric_tracker.acc() > best_accuracy:
                print("SAVING MODEL..............\n")
                torch.save(
                    model.state_dict(),
                    "{}/best_model_run_{}.pt".format(save_folder, run + 1),
                )
                best_accuracy = metric_tracker.acc()
                not_improved_count = 0
            else:
                not_improved_count += 1

        if not_improved_count == early_stop_patience:
            print(
                "Validation performance didn't improve for {} epochs. "
                "Training stops.".format(early_stop_patience)
            )
            break

    running_time.append((datetime.datetime.now() - start).total_seconds())

Run: 1



-- EPOCH 1/40 -------------------------



KeyboardInterrupt: 

## 5. Evaluation

In [43]:
def evaluate_model(model):
    err_dict = {}
    with torch.no_grad():
        model.eval()
        
        metric_tracker = ClassificationMetrics(num_classes=len(list_activities))
        for i,(inputs,targets) in enumerate(test_loader):
            metric_tracker.clear()
            prefix_len = inputs[0][0].size(0)
            inputs = [[sub_item.to(dtype=torch.float32, device=device) for sub_item in item] for item in inputs]
            # targets = torch.tensor(targets).to(device=device)
            targets = torch.tensor([torch.max(yi,0)[1] for yi in targets])
            outputs = model(inputs)
            
            # loss_mape = torch.abs((targets - yhat)/targets)*100
            # criterion = nn.CrossEntropyLoss()
            # loss_mae = criterion(yhat,targets).item()
            
            metric_tracker.add(torch.max(outputs,1)[1], targets)
            
            if prefix_len not in err_dict.keys():
                err_dict[prefix_len] = [metric_tracker.acc()]
            else:
                err_dict[prefix_len].append(metric_tracker.acc())
    return err_dict

In [44]:
err_total_dict = {}
print(save_folder)
for run in range(num_runs):
    print("Run: {}".format(run+1))
    trained_model = GGNN_model(ggnn_dim, num_layers, droppout_prob, list_activities)
    trained_model = trained_model.to(device)
    trained_model.load_state_dict(torch.load('{}/best_model_run_{}.pt'.format(save_folder,run+1),
                                         map_location=torch.device(device)))
    err_dict = evaluate_model(trained_model)
    print(err_dict)
    for key in err_dict.keys():
        err = torch.mean(torch.tensor(err_dict[key]), axis = 0)
        if key in err_total_dict.keys():
            err_total_dict[key].append(torch.tensor(err))
        else:
            err_total_dict[key] = [torch.tensor(err)]

/home/sebdis/ProcessMining/RemainingTime/RemainingCycleTimePrediction//5_Output_files/Next_Activity/BPI_Challenge_2013_incidents_model_Gated_GNN
Run: 1
{2: [tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(1.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(1.), tensor(1.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(1.), tensor(0.), tensor(1.), tensor(0.), tensor(1.), tensor(0.), tensor(0.), tensor(0.), tensor(1.), tensor(0.), tensor(1.), tensor(1.), tensor(1.), tensor(0.), tensor(0.), tensor(1.), tensor(0.), tensor(0.), tensor(0.), tensor(1.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(1.), tensor(0.), tensor(0.), tensor(1.), tensor(0.), tensor(0.), tensor(1.), tensor(1.), tensor(0.), tensor(0.), tensor(1.), tensor(1.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(1.), tensor(0.), tensor(0.), tensor(0.), tensor(0.), tensor(1.), tensor(0.), ten

In [45]:
print(err_total_dict)

{2: [tensor(0.3167)], 3: [tensor(0.8124)], 4: [tensor(0.4129)], 5: [tensor(0.6562)], 6: [tensor(0.5000)], 7: [tensor(0.6373)], 8: [tensor(0.5446)], 9: [tensor(0.5740)], 10: [tensor(0.5368)], 11: [tensor(0.6239)], 12: [tensor(0.5257)], 13: [tensor(0.6042)], 14: [tensor(0.4464)], 15: [tensor(0.5670)], 16: [tensor(0.4878)], 17: [tensor(0.5429)], 18: [tensor(0.5283)], 19: [tensor(0.6087)], 20: [tensor(0.4000)], 21: [tensor(0.4000)], 22: [tensor(0.4242)], 23: [tensor(0.3448)], 24: [tensor(0.2800)], 25: [tensor(0.4706)], 26: [tensor(0.2143)], 27: [tensor(0.2500)], 28: [tensor(0.1111)], 29: [tensor(0.)], 30: [tensor(0.4286)], 31: [tensor(0.)], 32: [tensor(0.5000)], 33: [tensor(0.)], 34: [tensor(0.3333)], 35: [tensor(0.3333)], 36: [tensor(0.3333)], 37: [tensor(0.3333)], 38: [tensor(0.5000)], 39: [tensor(0.)], 40: [tensor(1.)], 41: [tensor(0.)], 42: [tensor(1.)], 43: [tensor(0.)], 44: [tensor(1.)]}


In [46]:
num_samples_dict = {}
for i,(inputs,targets) in enumerate(test_loader):
    key = inputs[0][0].size(0)
    if key in num_samples_dict.keys():
        num_samples_dict[key] += 1
    else:
        num_samples_dict[key] = 1

In [47]:
print(num_samples_dict)

{2: 2510, 3: 1599, 4: 1194, 5: 1050, 6: 780, 7: 601, 8: 437, 9: 338, 10: 272, 11: 218, 12: 175, 13: 144, 14: 112, 15: 97, 16: 82, 17: 70, 18: 53, 19: 46, 20: 40, 21: 40, 22: 33, 23: 29, 24: 25, 25: 17, 26: 14, 27: 12, 28: 9, 29: 9, 30: 7, 31: 5, 32: 4, 33: 3, 34: 3, 35: 3, 36: 3, 37: 3, 38: 2, 39: 2, 40: 1, 41: 1, 42: 1, 43: 1, 44: 1}


In [48]:
list_prefix_len = []
list_num_samples = []
list_accuracy = []

for key, value in err_total_dict.items():
    list_prefix_len.append(key)
    list_num_samples.append(num_samples_dict[key])
    list_accuracy.append(value[0].item())
   
tab_result = pd.DataFrame({"Prefix length":list_prefix_len, "Num samples": list_num_samples, 
                           "Accuracy(%)":list_accuracy})
tab_result

Unnamed: 0,Prefix length,Num samples,Accuracy(%)
0,2,2510,0.316733
1,3,1599,0.812383
2,4,1194,0.412898
3,5,1050,0.65619
4,6,780,0.5
5,7,601,0.637271
6,8,437,0.544622
7,9,338,0.573964
8,10,272,0.536765
9,11,218,0.623853


In [49]:
tab = tab_result[tab_result["Num samples"] >= 20]
general_acc = round(tab["Accuracy(%)"] * tab["Num samples"])
# print(general_acc)
print(sum(general_acc) / sum(tab["Num samples"]))

0.5273001508295626


In [50]:
tab.to_csv(project_dir+"4_Outputs/Evaluation/"+data_name+"_GGNN_eval.csv", index = False)
torch.cuda.empty_cache()