In [None]:
%reload_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import optuna 
import torch
from torch_geometric.loader import DataLoader
from sklearn.model_selection import train_test_split
from DataEncoder import encode_pad_event, encode_pad_sequence, scale_time_differences, label_encode_y, encode_textual_event
from DurationEmbedding import duration_embedding_layer
from TwoLevelEmbeddingGraphConv import prepare_data, CustomDataset, train, evaluate, EarlyStopping, EventSequenceEmbeddingGraphConvModel, objective, load_model
from utils import plot_training_history, print_best_hp_graphconv, best_trial_path
import os 
import shutil

In [None]:
# Folder holding the pre-processed feature tables. The default is the path
# this notebook was originally written against; set the SEQUENCE_DATA_DIR
# environment variable to run on another machine without editing the cell.
data_dir = os.environ.get("SEQUENCE_DATA_DIR", "D:/Research in UAE/sequence/output")

# Event-level and sequence-level feature tables
# (presumably one row per event / one row per case -- confirm against the CSVs).
event = pd.read_csv(os.path.join(data_dir, "Event_Feature_pro.csv"))
sequence = pd.read_csv(os.path.join(data_dir, "Sequence_Feature_pro.csv"))

# Alternative dataset (BPI12af), kept for reference: the sequence table is
# derived from the first row of each case in the event table.
#event = pd.read_csv("D:/Research in UAE/sequence/output/BPI12af_Combin_Feature.csv")
#sequence = event[['case:concept:name','case:AMOUNT_REQ','result']].groupby(['case:concept:name']).first()
#sequence = sequence.reset_index()

In [None]:
# Prediction target: the 'result' column of the sequence table,
# passed through the project's label encoder.
y_col = sequence['result']
y_encode = label_encode_y(y_col)

In [None]:
# Column configuration for the event table
case_index = 'Case ID'  # column identifying the case each event belongs to
cat_col_event = [
    'Resource',
    'outcome',
    "stopcode",
]
num_col_event = [
    'net_promotor_score',
    'creditscore',
    'rate_charged',
    'duration',
]

# Alternative column configuration, kept for reference:
#cat_col_event = ['StartRes', 'CompleteRes']
#num_col_event = ['Duration']
#case_index = 'case:concept:name'

# Encode (and, per the helper's name, pad) the event features with the
# categorical and numerical masks enabled and no end-of-sequence marker.
event_encode = encode_pad_event(
    event,
    cat_col_event,
    num_col_event,
    case_index,
    cat_mask=True,
    num_mask=True,
    eos=False,
)

In [None]:
# Column configuration for the sequence (case-level) table
cat_col_seq = [
    'plan',
]
num_col_seq = [
    'age',
    'coverage_numeric',
    'length_of_stay',
]

# Alternative column configuration, kept for reference:
#cat_col_seq = []
#num_col_seq = ['case:AMOUNT_REQ']

# Encode the case-level features with the project helper
sequence_encode = encode_pad_sequence(
    sequence,
    cat_col_seq,
    num_col_seq,
)

In [None]:
# Name of the event-table column that holds each event's start timestamp
start_time_col = 'Start Timestamp'
# Alternative column name, kept for reference:
#start_time_col = 'StartTime'
# Project helper; judging by its name it returns scaled time differences
# derived from the timestamp column per case -- confirm the exact definition
# in DataEncoder. The result is reused by both prepare_data calls further down.
scaled_time_diffs = scale_time_differences(event, sequence, start_time_col, case_index)

In [None]:
# Encode the activity column of the event table for the embedding branch.
# The helper returns the encoded events plus a size value, which is later
# used as the number of embedding features (presumably the vocabulary size --
# confirm in DataEncoder). No end-of-sequence marker; padding value is -1.
# Alternative source column, kept for reference:
#text_encode, text_size = encode_textual_event(event, 'activity', case_index, eos = False, pad_value = -1)
text_encode, text_size = encode_textual_event(event, 'Activity_code', case_index, eos = False, pad_value = -1)

In [None]:
# Sizes of the first three axes of the encoded event array
# (presumably sequences x events-per-sequence x features-per-event).
num_sequences, max_num_events, num_event_features = (
    event_encode.shape[0],
    event_encode.shape[1],
    event_encode.shape[2],
)

# Width of the encoded sequence-level features
num_sequence_features = sequence_encode.shape[1]

# Size reported by the textual encoder, used as the embedding input size
num_embedding_features = text_size

In [None]:
# Convert the encoded inputs into the objects the model consumes.
# prepare_data is a project helper; its results are indexed per case when the
# data is split, so each is treated as a per-case list.
event_feature_list = prepare_data(event_encode, scaled_time_diffs, torch.float)
embedding_feature_list = prepare_data(text_encode, scaled_time_diffs, torch.long)

# Case-level features as a single float tensor
sequence_features = torch.tensor(sequence_encode, dtype=torch.float)

# This line rebinds y_encode, so on a second execution of the cell the input is
# already a LongTensor. torch.as_tensor returns such a tensor unchanged, while
# torch.tensor(<tensor>) would emit the copy-construct UserWarning; using
# as_tensor keeps the cell safe to re-run.
y_encode = torch.as_tensor(y_encode, dtype=torch.long)

In [None]:
# Stratified 80/20 split over case positions, with a fixed seed so the
# partition is the same on every run.
train_indices, test_indices = train_test_split(
    range(len(event_feature_list)),
    test_size=0.2,
    stratify=y_encode.numpy(),
    random_state=42,
)

# Training partition
train_event_features = [event_feature_list[idx] for idx in train_indices]
train_embedding_features = [embedding_feature_list[idx] for idx in train_indices]
train_sequence_features = sequence_features[train_indices]
train_y = y_encode[train_indices]

# Held-out partition
test_event_features = [event_feature_list[idx] for idx in test_indices]
test_embedding_features = [embedding_feature_list[idx] for idx in test_indices]
test_sequence_features = sequence_features[test_indices]
test_y = y_encode[test_indices]

# Wrap each partition in the project's dataset class
train_dataset = CustomDataset(
    train_event_features,
    train_embedding_features,
    train_sequence_features,
    train_y,
)
test_dataset = CustomDataset(
    test_event_features,
    test_embedding_features,
    test_sequence_features,
    test_y,
)

In [None]:
# Hyper-parameter search configuration
output_dim = len(np.unique(y_encode))  # number of distinct class labels
epochs = 300  # upper bound per trial; `patience` is passed alongside it
patience = 30
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_save_folder = '../output/model_hp/2levelEmbeddingGraphConv/'
# Create the checkpoint folder up front. This also creates ../output, where
# the SQLite storage file below lives. Harmless if the folder already exists.
os.makedirs(model_save_folder, exist_ok=True)

# load_if_exists only has an effect when the study has a fixed name: with
# study_name omitted Optuna generates a new unique name on every run, so the
# study stored in the database was never resumed. With a fixed name, running
# this cell again continues the same study and adds n_trials more trials.
study_name = '2levelEmbeddingGraphConv'
study = optuna.create_study(study_name=study_name,
                            direction='maximize',
                            storage='sqlite:///../output/2levelEmbeddingGraphConv.db',
                            load_if_exists=True)

# NOTE(review): test_dataset is handed to the objective and the value printed
# below is labelled test accuracy; if the objective selects on that set, the
# reported score is optimistic -- consider a separate validation split.
study.optimize(lambda trial: objective(trial, model_save_folder,
                                       train_dataset, test_dataset,
                                       num_event_features, num_embedding_features, num_sequence_features,
                                       output_dim,
                                       patience, epochs, device), n_trials=200)

# Report the best trial of the study
print("Best Hyperparameters:", study.best_params)
print("Best test accuracy: ", study.best_value)


In [None]:
# Checkpoint path of the study's best trial, resolved by the project helper
model_save_path = best_trial_path(study, model_save_folder)

# Keep a copy in the same folder named "best_model_<original file name>".
# os.path.split/join replace the earlier rfind('/') slicing, which dropped the
# last character of the path when it contained no '/' and did not recognise
# the platform's own path separator.
checkpoint_dir, checkpoint_name = os.path.split(model_save_path)
best_model_save_path = os.path.join(checkpoint_dir, f"best_model_{checkpoint_name}")
shutil.copy(model_save_path, best_model_save_path)

# Restore the model together with the training state stored in the checkpoint
best_model, optimizer, criterion, batch_size, l1_lambda, start_epoch, best_accuracy, best_loss, best_std_dev = load_model(EventSequenceEmbeddingGraphConvModel, model_save_path, device)

In [None]:
# Build the loaders with the batch size restored from the checkpoint
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Number of additional training epochs; set to 0 to only evaluate
final_epochs = 300

# Per-epoch metrics collected for plotting
history = {
    'train_loss': [],
    'train_accuracy': [],
    'test_loss': [],
    'test_accuracy': [],
}
print(f"Best epoch from best model:{start_epoch + 1}")

# Epoch numbering of this continued run starts at 1. A separate counter is used
# so the start_epoch restored from the checkpoint is not overwritten; otherwise
# re-running this cell would report the wrong best epoch above.
first_epoch = 0

for epoch in range(first_epoch, first_epoch + final_epochs):
    train_loss, train_accuracy = train(best_model, train_loader, optimizer, criterion, device, l1_lambda)
    test_loss, test_accuracy = evaluate(best_model, test_loader, criterion, device)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

    # Store in history
    history['train_loss'].append(train_loss)
    history['train_accuracy'].append(train_accuracy)
    history['test_loss'].append(test_loss)
    history['test_accuracy'].append(test_accuracy)

    # Track the best test accuracy seen so far. Nothing is written to disk
    # here; add a save call at this point if improved weights should be kept.
    if test_accuracy > best_accuracy:
        best_accuracy = test_accuracy
        print(f"New best accuracy with continue training: {best_accuracy:.4f}")

# Evaluation-only mode: with no extra epochs, report the restored model's metrics
if final_epochs == 0:
    test_loss, test_accuracy = evaluate(best_model, test_loader, criterion, device)
    print(f"Final Test Loss: {test_loss:.4f}, Final Test Accuracy: {test_accuracy:.4f}")

In [None]:
# Plot the per-epoch metrics collected in `history` during continued training.
# NOTE(review): the helper's return value is printed; if it only draws the
# figure and returns nothing, this prints a stray "None" -- confirm in utils.
print(plot_training_history(history))

In [None]:
# Print the hyper-parameters stored with a saved checkpoint.
# NOTE(review): this rebinds model_save_path to a fixed checkpoint (trial 13)
# in the "2levelEmbeddingGraphConv_bpi12o" folder, which is not the
# model_save_folder used for the Optuna search earlier in the notebook --
# confirm this is the checkpoint you mean to inspect.
model_save_path = '../output/model_hp/2levelEmbeddingGraphConv_bpi12o/best_model_trial_13.pth'
# Re-derive the device so this cell can run without the search cell
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The model is identified by its class name as a string here
print_best_hp_graphconv("EventSequenceEmbeddingGraphConvModel", model_save_path,device)