In [None]:
%reload_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import optuna 
import torch
from torch_geometric.loader import DataLoader
from sklearn.model_selection import train_test_split
from DataEncoder import encode_pad_event, encode_pad_sequence, scale_time_differences, label_encode_y
from TwoLevelGCNIm import prepare_data, CustomDataset, train, evaluate, EarlyStopping, EventSequenceGCNModel, objective, load_model, f1_eva
from utils import plot_training_history_im, print_best_hp_gcn_im, best_trial_path_im
import os
import shutil

In [None]:
# Folder holding the pre-processed feature CSVs. The default is the location the
# notebook was originally written against; set the SEQUENCE_DATA_DIR environment
# variable to run on another machine without editing this cell.
DATA_DIR = os.environ.get("SEQUENCE_DATA_DIR", "D:/Research in UAE/sequence/output")

# Alternative dataset, kept for reference. There the sequence-level table is
# built from the first event row of every case:
#event = pd.read_csv(f"{DATA_DIR}/BPI12af_Combin_Feature.csv")
#sequence = event[['case:concept:name','case:AMOUNT_REQ','result']].groupby(['case:concept:name']).first()
#sequence = sequence.reset_index()

# `event` feeds the event-level encoders below, `sequence` the sequence-level ones.
event = pd.read_csv(f"{DATA_DIR}/Event_Feature_pro.csv")
sequence = pd.read_csv(f"{DATA_DIR}/Sequence_Feature_pro.csv")

In [None]:
# Run the `result` column of the sequence table through label_encode_y; the
# output is used further down as the classification target.
# NOTE(review): presumably rows of `sequence` are in the same case order as the
# encoded event data -- confirm against DataEncoder.
y_col = sequence["result"]
y_encode = label_encode_y(y_col)

In [None]:
# Column configuration for the event-level table.
# Settings used with the alternative dataset, kept for reference:
#   cat_col_event = ['activity_verb', 'activity_dec', 'StartRes', 'CompleteRes']
#   num_col_event = ['Duration']
#   case_index = 'case:concept:name'
case_index = 'Case ID'
cat_col_event = [
    'Activity_verb',
    'Activity_Dec',
    'Resource',
    'outcome',
    'stopcode',
]
num_col_event = [
    'net_promotor_score',
    'creditscore',
    'rate_charged',
    'duration',
]

# Encode the event table with masking switched on for both categorical and
# numerical columns and the `eos` option switched off.
event_encode = encode_pad_event(
    event,
    cat_col_event,
    num_col_event,
    case_index,
    cat_mask=True,
    num_mask=True,
    eos=False,
)

In [None]:
# Column configuration for the sequence-level table.
# Settings used with the alternative dataset, kept for reference:
#   cat_col_seq = []
#   num_col_seq = ['case:AMOUNT_REQ']
cat_col_seq = ['plan']
num_col_seq = [
    'age',
    'coverage_numeric',
    'length_of_stay',
]

sequence_encode = encode_pad_sequence(
    sequence,
    cat_col_seq,
    num_col_seq,
)

In [None]:
# Name of the event start-time column (the alternative dataset uses 'StartTime').
start_time_col = 'Start Timestamp'

# Time-difference features computed from the event table, keyed by `case_index`.
scaled_time_diffs = scale_time_differences(
    event,
    sequence,
    start_time_col,
    case_index,
)

In [None]:
# Sizes read off the encoded arrays: the first three axes of the event encoding
# and the second axis of the sequence encoding.
# NOTE(review): axis meanings are taken from the variable names -- confirm
# against the encoders in DataEncoder.
num_sequences, max_num_events, num_event_features = event_encode.shape[:3]
num_sequence_features = sequence_encode.shape[1]

In [None]:
# Turn the encoded arrays into the objects consumed by the dataset below.
# prepare_data receives the event encoding, the time differences and the target
# dtype; its result is later indexed one sample at a time.
event_feature_list = prepare_data(event_encode, scaled_time_diffs, torch.float)
sequence_features = torch.tensor(sequence_encode, dtype=torch.float)

# `y_encode` is rebound to its own tensor version, so this cell may be executed
# more than once in the same kernel. torch.as_tensor accepts both the original
# array-like and an existing tensor, whereas torch.tensor(tensor) warns about
# copy-constructing from a tensor on every re-run.
y_encode = torch.as_tensor(y_encode, dtype=torch.long)

In [None]:
# Stratified 80/20 split over sample positions; the seed is fixed so the
# partition is repeatable.
n_samples = len(event_feature_list)
train_indices, test_indices = train_test_split(
    range(n_samples),
    test_size=0.2,
    stratify=y_encode.numpy(),
    random_state=42,
)

# event_feature_list is picked element by element; the tensors are indexed with
# the whole index collection at once.
train_event_features = [event_feature_list[idx] for idx in train_indices]
test_event_features = [event_feature_list[idx] for idx in test_indices]

train_sequence_features, test_sequence_features = (
    sequence_features[train_indices],
    sequence_features[test_indices],
)
train_y, test_y = y_encode[train_indices], y_encode[test_indices]

# Bundle each partition into a dataset object.
train_dataset = CustomDataset(train_event_features, train_sequence_features, train_y)
test_dataset = CustomDataset(test_event_features, test_sequence_features, test_y)

# The DataLoaders are created further down, once the batch size of the best
# model is known.


In [None]:
# Hyperparameter search with Optuna.
patience = 30  # early-stopping patience handed to `objective`
output_dim = len(np.unique(y_encode))  # number of distinct target labels
epochs = 300  # generous upper bound; `patience` is passed alongside it
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_save_folder = '../output/model_hp/2levelGCNIm/'

# Create the checkpoint folder up front. This also creates ../output, which the
# SQLite storage URL below needs in order to create its database file.
os.makedirs(model_save_folder, exist_ok=True)

# `load_if_exists=True` only takes effect together with an explicit
# `study_name`: without a name Optuna generates a fresh unique one on every
# call, so each run would start a new empty study instead of resuming.
# Studies stored earlier under auto-generated names stay in the database but
# are not picked up by this name.
study = optuna.create_study(
    study_name='2levelGCNIm',
    direction='maximize',
    storage='sqlite:///../output/2levelGCNIm.db',
    load_if_exists=True,
)

# NOTE(review): the objective receives `test_dataset` and the maximised value is
# reported below as "test f1". If hyperparameters are selected on the test split,
# the final test metrics are optimistic -- consider a separate validation split.
# Each execution of this cell adds another 200 trials to the (resumed) study.
study.optimize(
    lambda trial: objective(
        trial,
        model_save_folder,
        train_dataset,
        test_dataset,
        num_event_features,
        num_sequence_features,
        output_dim,
        patience,
        epochs,
        device,
    ),
    n_trials=200,
)

# Report the best trial found so far.
print("Best Hyperparameters:", study.best_params)
print("Best test f1: ", study.best_value)

In [None]:
# Checkpoint file belonging to the best trial of the study.
model_save_path = best_trial_path_im(study, model_save_folder)

# Keep a copy of that checkpoint next to the original, prefixed with
# "best_model_". The path is taken apart with os.path instead of slicing on
# '/': slicing produced a wrong target when the path contained no '/' (rfind
# returns -1) or used backslash separators.
model_dir, model_file = os.path.split(model_save_path)
best_model_save_path = os.path.join(model_dir, f"best_model_{model_file}")
shutil.copy(model_save_path, best_model_save_path)

# Restore the model together with the training objects and the metrics that
# were stored with the checkpoint.
(
    best_model,
    optimizer,
    criterion,
    batch_size,
    l1_lambda,
    best_epoch,
    best_accuracy,
    best_loss,
    best_std_dev,
    best_f1,
) = load_model(EventSequenceGCNModel, model_save_path, device)

In [None]:
# Build the loaders with the batch size restored from the best checkpoint.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Extra epochs to train the restored model; 0 means "evaluate only".
final_epochs = 0

print(f"Best epoch from best model:{best_epoch+1}")

if final_epochs == 0:
    # Evaluation-only path: score the restored model once on the test loader.
    test_loss, test_accuracy, test_f1 = evaluate(best_model, test_loader, criterion, device)
    print(f"Final Test Loss: {test_loss:.4f}, Final Test F1: {test_f1:.4f}, Final Test Accuracy : {test_accuracy:.4f}" )
else:
    # Continued-training path: `history` collects one value per epoch and metric.
    history = {metric: [] for metric in ('train_loss', 'train_f1', 'test_loss', 'test_f1')}
    start_epoch = 0

    for epoch in range(start_epoch, start_epoch + final_epochs):
        train_loss, train_accuracy, train_f1 = train(best_model, train_loader, optimizer, criterion, device, l1_lambda)
        test_loss, test_accuracy, test_f1 = evaluate(best_model, test_loader, criterion, device)
        print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Train F1: {train_f1:.4f}, Test Loss: {test_loss:.4f}, Test F1: {test_f1:.4f}')

        # Record this epoch's metrics.
        for metric, value in (
            ('train_loss', train_loss),
            ('train_f1', train_f1),
            ('test_loss', test_loss),
            ('test_f1', test_f1),
        ):
            history[metric].append(value)

        # Track the best test F1 seen so far; nothing is written to disk here.
        if test_f1 > best_f1:
            best_f1 = test_f1
            print(f"New best F1 with continue training: {best_f1:.4f}")
            print(f"New best epoch with continue training: {epoch+1}" )

In [None]:
# `history` is only created when continued training actually ran
# (final_epochs != 0). With the default final_epochs = 0 the unguarded call
# raised NameError on a fresh kernel, so skip the plot in that case.
if final_epochs != 0:
    print(plot_training_history_im(history))
else:
    print("No continued training was run (final_epochs == 0); there is no training history to plot.")

In [None]:
# Show the hyperparameters stored with the best checkpoint.
# NOTE(review): the model is passed as a string here, while load_model receives
# the class itself -- confirm this is what print_best_hp_gcn_im expects.
print_best_hp_gcn_im(
    "EventSequenceGCNModel",
    model_save_path,
    device,
)

In [None]:
# Score the model once more, this time also collecting the confusion matrix and
# the per-class report returned by f1_eva.
test_loss, test_accuracy, conf_matrix, class_report = f1_eva(
    best_model,
    test_loader,
    criterion,
    device,
)

# Headline numbers first, then the detailed breakdowns.
print(f"Final Test Loss: {test_loss:.4f}, Final Test Accuracy: {test_accuracy:.4f}")
print("Confusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report (with F1 scores for each class):", class_report, sep="\n")