In [1]:
%reload_ext autoreload
%autoreload 2 
import pandas as pd
import numpy as np
import keras_tuner as kt
from keras_tuner import RandomSearch, HyperModel, Hyperband, HyperParameters
from tensorflow.keras.callbacks import EarlyStopping

from DataEncoder import encode_pad_event, encode_pad_sequence, encode_y
from DurationEmbedding import duration_embedding_layer
from FeatureEmbedding import feature_embedding_layer
from DurationEmbeddingLSTMIm import DurationEmbeddingLSTMModel, print_best_hp_duration
from FeatureDurationEmbeddingLSTMIm import FeatureDurationEmbeddingLSTMModel, print_best_hp_duration_feature
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Load the pre-processed event and sequence feature tables together with the
# pre-computed per-event embedding tables. Paths are relative to the working
# directory: inputs are read from ../input and embeddings from ../output.
# The input paths need the separator after "..": "..input/..." would refer to a
# folder literally named "..input" inside the working directory.
event = pd.read_csv("../input/Event_Feature_pro.csv")
sequence = pd.read_csv("../input/Sequence_Feature_pro.csv")
event_duration_embedding = pd.read_csv("../output/Event_Duration_Embedding_pro.csv")
event_feature_embedding = pd.read_csv("../output/Event_feature_Embedding_pro.csv")

In [3]:
# Target labels: pull the `result` column out of the sequence table and pass it
# through the project helper `encode_y`. Later cells read `y_encode.shape[1]`
# as the number of classes and take an argmax over axis 1 of the encoded rows.
y_col = sequence["result"]
y_encode = encode_y(y_col)

In [4]:
# Event-level columns handed to the event encoder, grouped by kind.
cat_col_event = ["Activity_verb", "Activity_Dec", "Resource", "outcome", "stopcode"]
num_col_event = ["net_promotor_score", "creditscore", "rate_charged"]
# Presumably the column that groups event rows into cases — confirm against
# the implementation of encode_pad_event.
case_index = "Case ID"

# Encode the event table with masking enabled for both categorical and numeric
# columns and without an end-of-sequence marker (eos=False).
event_encode = encode_pad_event(
    event,
    cat_col_event,
    num_col_event,
    case_index,
    cat_mask=True,
    num_mask=True,
    eos=False,
)

In [5]:
# Sequence-level (one row per case, presumably) columns, grouped by kind.
cat_col_seq = ["plan"]
num_col_seq = ["age", "coverage_numeric", "length_of_stay"]

# Encode the sequence table; masking is switched off for both column groups.
sequence_encode = encode_pad_sequence(
    sequence,
    cat_col_seq,
    num_col_seq,
    cat_mask=False,
    num_mask=False,
)

In [6]:
# Every column from position 7 onward is passed on as a duration-embedding column.
# NOTE(review): the offset 7 depends on the column layout of
# Event_Duration_Embedding_pro.csv — confirm it whenever that file changes.
event_duration_embedding_names = list(event_duration_embedding.columns)
embedding_duration_cols_names = event_duration_embedding_names[7:]

duration_embedding_encode = duration_embedding_layer(
    event_duration_embedding,
    embedding_duration_cols_names,
    case_index,
    eos=False,
)

Event feature embedding process

In [7]:
# Every column from position 5 onward is passed on as a feature-embedding column.
# NOTE(review): the offset 5 depends on the column layout of
# Event_feature_Embedding_pro.csv — confirm it whenever that file changes.
event_feature_embedding_names = list(event_feature_embedding.columns)
embedding_feature_cols_names = event_feature_embedding_names[5:]

feature_embedding_encode = feature_embedding_layer(
    event_feature_embedding,
    embedding_feature_cols_names,
    case_index,
    eos=False,
)

In [8]:
# Model input shape for the encoded events: axes 1 and 2 of the encoded array,
# i.e. (sequence_length, number_of_features).
event_input_shape = tuple(event_encode.shape[1:3])

# Width of the encoded label array, used as the number of output classes.
num_classes = y_encode.shape[1]

# Number of sequence-level features (axis 1 of the encoded sequence array).
num_sequence_features = sequence_encode.shape[1]

# The two embedding arrays contribute their axes 1 and 2 in the same way.
duration_embedding_shape = tuple(duration_embedding_encode.shape[1:3])
feature_embedding_shape = tuple(feature_embedding_encode.shape[1:3])

In [9]:
# Register batch size as a tunable choice up front. This HyperParameters object
# is handed to the tuners through `hyperparameters=hp_b`, and the refit cells
# read the chosen value back with best_hps_band.get('batch_size').
# NOTE(review): whether the search itself trains with this batch size depends on
# the hypermodel classes' fit logic, which is not visible here — confirm.
hp_b = HyperParameters()
batch_size = hp_b.Choice(name="batch_size", values=[16, 32, 64, 128])

In [10]:
# Partition row positions rather than the arrays themselves, so that one
# stratified 80/20 split can be applied identically to every array below.
train_indices, test_indices = train_test_split(
    np.arange(len(y_encode)),
    test_size=0.2,
    stratify=y_encode,
    random_state=42,
)

# Encoded event features.
train_event_features, test_event_features = (
    event_encode[train_indices],
    event_encode[test_indices],
)

# Encoded sequence-level features.
train_sequence_features, test_sequence_features = (
    sequence_encode[train_indices],
    sequence_encode[test_indices],
)

# Encoded labels.
train_y, test_y = y_encode[train_indices], y_encode[test_indices]

# Duration embeddings.
train_duration_embedding, test_duration_embedding = (
    duration_embedding_encode[train_indices],
    duration_embedding_encode[test_indices],
)

# Feature embeddings.
train_feature_embedding, test_feature_embedding = (
    feature_embedding_encode[train_indices],
    feature_embedding_encode[test_indices],
)

Duration Model

In [18]:
# Stop a run once validation F1 has gone 5 epochs without improving, and
# restore the best weights seen during that run.
early_stopping = EarlyStopping(
    monitor="val_f1_score",
    mode="max",
    patience=5,
    restore_best_weights=True,
)

# Hypermodel taking event features, duration embeddings and sequence features.
hypermodel = DurationEmbeddingLSTMModel(
    event_input_shape=event_input_shape,
    num_sequence_features=num_sequence_features,
    duration_embedding_shape=duration_embedding_shape,
    num_classes=num_classes,
)

# Hyperband search that maximises validation F1. Trial state is stored under
# hparam_tuning/<project_name>.
tuner_band = Hyperband(
    hypermodel,
    objective=kt.Objective("val_f1_score", direction="max"),
    max_epochs=200,
    factor=3,
    hyperparameters=hp_b,
    directory="hparam_tuning",
    project_name="classfication_duration_embedding_im",
    overwrite=False,  # keep False so an existing search can be resumed
)

# Search on the manual split. The test_* arrays double as validation data here,
# so both the tuner objective and early stopping select on that split.
tuner_band.search(
    x=[
        train_event_features,
        train_duration_embedding,
        train_sequence_features,
    ],
    y=train_y,
    validation_data=(
        [
            test_event_features,
            test_duration_embedding,
            test_sequence_features,
        ],
        test_y,
    ),
    epochs=200,
    callbacks=[early_stopping],
)

# Best hyperparameter set found by the search.
best_hps_band = tuner_band.get_best_hyperparameters(num_trials=1)[0]

# Best model found by the search.
best_model_band = tuner_band.get_best_models(num_models=1)[0]

Trial 254 Complete [00h 06m 18s]
val_f1_score: 0.701680600643158

Best val_f1_score So Far: 0.8589022159576416
Total elapsed time: 04h 47m 18s
Epoch 1/300
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 98ms/step - f1_score: 0.8178 - loss: 0.6551 - val_f1_score: 0.8496 - val_loss: 0.5866
Epoch 2/300
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 91ms/step - f1_score: 0.8211 - loss: 0.6242 - val_f1_score: 0.8333 - val_loss: 0.6217
Epoch 3/300
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 90ms/step - f1_score: 0.8298 - loss: 0.6039 - val_f1_score: 0.8520 - val_loss: 0.5639
Epoch 4/300
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 91ms/step - f1_score: 0.8296 - loss: 0.6128 - val_f1_score: 0.8525 - val_loss: 0.5560
Epoch 5/300
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 92ms/step - f1_score: 0.8060 - loss: 0.5961 - val_f1_score: 0.8493 - val_loss: 0.5297
Epoch 6/300
[1m107/107[0m [32

In [None]:
# Predict class scores for the held-out split, then reduce both predictions and
# encoded labels to class indices with an argmax over axis 1.
y_pred_probs = best_model_band.predict(
    [test_event_features, test_duration_embedding, test_sequence_features]
)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(test_y, axis=1)

# Confusion matrix (rows: true class, columns: predicted class).
conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

# Per-class precision / recall / F1, reported to four decimals.
class_report = classification_report(y_true, y_pred, digits=4)
print("Classification Report:", class_report, sep="\n")

In [None]:
# Keep training the model returned by the tuner, using the tuned batch size.
# Alternative: rebuild a model from the best hyperparameters instead, e.g.
#   best_model_band = tuner_band.hypermodel.build(best_hps_band)
history_band = best_model_band.fit(
    x=[
        train_event_features,
        train_duration_embedding,
        train_sequence_features,
    ],
    y=train_y,
    validation_data=(
        [
            test_event_features,
            test_duration_embedding,
            test_sequence_features,
        ],
        test_y,
    ),
    epochs=200,
    batch_size=best_hps_band.get("batch_size"),
    callbacks=[early_stopping],
)

# Despite the "acc" in its name, this list holds the per-epoch validation F1.
val_acc_per_epoch_band = history_band.history["val_f1_score"]
best_f1_band = max(val_acc_per_epoch_band)
# +1 turns the 0-based list position into a 1-based epoch number.
best_epoch_band = val_acc_per_epoch_band.index(best_f1_band) + 1
print(f"Best epoch: {best_epoch_band:d}")
print("Best F1 Score: ", best_f1_band)

# Print the selected hyperparameters via the project helper.
print_best_hp_duration(best_hps_band)

Duration Feature model

In [12]:
# Stop a run once validation F1 has gone 5 epochs without improving, and
# restore the best weights seen during that run.
early_stopping = EarlyStopping(
    monitor="val_f1_score",
    mode="max",
    patience=5,
    restore_best_weights=True,
)

# Hypermodel taking event features, feature embeddings, duration embeddings
# and sequence features.
hypermodel = FeatureDurationEmbeddingLSTMModel(
    event_input_shape=event_input_shape,
    num_sequence_features=num_sequence_features,
    feature_embedding_shape=feature_embedding_shape,
    duration_embedding_shape=duration_embedding_shape,
    num_classes=num_classes,
)

# Hyperband search that maximises validation F1. Trial state is stored under
# hparam_tuning/<project_name>.
# NOTE(review): hp_b is the same HyperParameters object that is passed to the
# duration-only tuner in this notebook — confirm that sharing it does not carry
# model-specific entries from one search space into the other.
tuner_band = Hyperband(
    hypermodel,
    objective=kt.Objective("val_f1_score", direction="max"),
    max_epochs=200,
    factor=3,
    hyperparameters=hp_b,
    directory="hparam_tuning",
    project_name="classfication_feature_embedding_im",
    overwrite=False,  # keep False so an existing search can be resumed
)

# Search on the manual split. The test_* arrays double as validation data here,
# so both the tuner objective and early stopping select on that split.
tuner_band.search(
    x=[
        train_event_features,
        train_feature_embedding,
        train_duration_embedding,
        train_sequence_features,
    ],
    y=train_y,
    validation_data=(
        [
            test_event_features,
            test_feature_embedding,
            test_duration_embedding,
            test_sequence_features,
        ],
        test_y,
    ),
    epochs=200,
    callbacks=[early_stopping],
)

# Best hyperparameter set found by the search.
best_hps_band = tuner_band.get_best_hyperparameters(num_trials=1)[0]

# Best model found by the search.
best_model_band = tuner_band.get_best_models(num_models=1)[0]


Trial 247 Complete [00h 03m 38s]
val_f1_score: 0.8300619125366211

Best val_f1_score So Far: 0.8615099191665649
Total elapsed time: 06h 05m 20s


In [13]:
# Predict class scores for the held-out split, then reduce both predictions and
# encoded labels to class indices with an argmax over axis 1.
y_pred_probs = best_model_band.predict(
    [
        test_event_features,
        test_feature_embedding,
        test_duration_embedding,
        test_sequence_features,
    ]
)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(test_y, axis=1)

# Confusion matrix (rows: true class, columns: predicted class).
conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

# Per-class precision / recall / F1, reported to four decimals.
class_report = classification_report(y_true, y_pred, digits=4)
print("Classification Report:", class_report, sep="\n")

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 87ms/step
Confusion Matrix:
[[ 92   0   0   0   0   0]
 [  0 170   0   0   4   0]
 [  0   0   5   0   0   0]
 [  0   0   2  19   0   0]
 [  0   0   0   0  32   0]
 [  0  40   0   0   9  55]]
Classification Report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        92
           1     0.8095    0.9770    0.8854       174
           2     0.7143    1.0000    0.8333         5
           3     1.0000    0.9048    0.9500        21
           4     0.7111    1.0000    0.8312        32
           5     1.0000    0.5288    0.6918       104

    accuracy                         0.8715       428
   macro avg     0.8725    0.9018    0.8653       428
weighted avg     0.8976    0.8715    0.8615       428



In [None]:
# Keep training the model returned by the tuner, using the tuned batch size.
# Alternative: rebuild a model from the best hyperparameters instead, e.g.
#   best_model_band = tuner_band.hypermodel.build(best_hps_band)
history_band = best_model_band.fit(
    x=[
        train_event_features,
        train_feature_embedding,
        train_duration_embedding,
        train_sequence_features,
    ],
    y=train_y,
    validation_data=(
        [
            test_event_features,
            test_feature_embedding,
            test_duration_embedding,
            test_sequence_features,
        ],
        test_y,
    ),
    epochs=200,
    batch_size=best_hps_band.get("batch_size"),
    callbacks=[early_stopping],
)

# Despite the "acc" in its name, this list holds the per-epoch validation F1.
val_acc_per_epoch_band = history_band.history["val_f1_score"]
best_f1_band = max(val_acc_per_epoch_band)
# +1 turns the 0-based list position into a 1-based epoch number.
best_epoch_band = val_acc_per_epoch_band.index(best_f1_band) + 1
print(f"Best epoch: {best_epoch_band:d}")
print("Best F1 Score: ", best_f1_band)

# Print the selected hyperparameters via the project helper.
print_best_hp_duration_feature(best_hps_band)