In [None]:
%reload_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from collections import Counter
import torch
from torch_geometric.loader import DataLoader
from sklearn.model_selection import train_test_split
from DataEncoder import encode_event_prefix_label, encode_pad_sequence, scale_time_differences, scale_time_differences_fast_fixed
from PrefixEmbeddingGCN import prepare_data, CustomDataset,  train, evaluate, EarlyStopping, custom_collate, PrefixGCNClassifier, f1_eva, get_misclassified_samples, cluster_errors
import os 
import shutil
import torch.nn as nn

In [None]:
# Event log to work on. Exactly one file is active; the other logs this
# notebook has been pointed at are:
#   "../output/BPI12.csv", "../output/BPI13i.csv",
#   "../output/BPI13c.csv", "../output/helpdesk.csv"
event_log_path = "../output/BPI12w.csv"
event = pd.read_csv(event_log_path)

In [None]:
# Number of rows per 'sequence' group, computed once and reused for both extremes.
rows_per_sequence = event.groupby('sequence').size()
shortest_sequence = rows_per_sequence.min()
longest_sequence = rows_per_sequence.max()

print('shortest_sequence:', shortest_sequence)
print('longest_sequence:', longest_sequence)

In [None]:
# Number of leading events of each sequence that form the prefix.
prefix_size = 10

# Every row is tagged with the row count of its own sequence, then rows that
# belong to sequences shorter than `prefix_size` are discarded.
rows_in_own_sequence = event.groupby('sequence')['sequence'].transform('size')
event = event.loc[rows_in_own_sequence >= prefix_size].reset_index(drop=True)

In [None]:
# Event-level attribute columns handed to the encoder. Settings that have been
# used with the other logs:
#   BPI13i:        cat_col_event = ['ec1', 'ec4']
#   helpdesk:      cat_col_event = ['ec1']
#   no attributes: cat_col_event = [], num_col_event = []
# Active setting (BPI12): 'ec1' is cast to str and used as the only categorical column.
event['ec1'] = event['ec1'].astype(str)
cat_col_event, num_col_event = ['ec1'], []

# Column with the activity label and column identifying the case.
core_event, case_index = 'event', 'sequence'

(text_encode,
 event_encode,
 y_encode,
 text_size,
 output_dim) = encode_event_prefix_label(
    event, core_event, cat_col_event, num_col_event, case_index, prefix_size,
    cat_mask=False, num_mask=False,
)

In [None]:
def _rows_from_last_prefix_event(case_rows):
    """Return the rows of one case starting at position `prefix_size - 1`."""
    return case_rows.iloc[prefix_size - 1:]


# Per case, keep the last prefix event and everything after it, then flatten
# the per-case pieces back into a single frame with a fresh index.
sequence = event.groupby('sequence').apply(_rows_from_last_prefix_event).reset_index(drop=True)

In [None]:
# Sequence-level attribute columns. Settings that have been used with other logs:
#   BPI13c:   cat_col_seq = ['sc2', 'sc3'], num_col_seq = []
#   helpdesk: cat_col_seq = ['sc1'],        num_col_seq = []
# Active setting: no categorical columns, one numerical column.
cat_col_seq, num_col_seq = [], ['sn1']

sequence_encode = encode_pad_sequence(sequence, cat_col_seq, num_col_seq)

In [None]:
# Name of the timestamp column passed to the time-difference scaler; the
# commented alternative is the column name used by a different log.
start_time_col = 'time'
#start_time_col = 'StartTime'
# NOTE(review): the helper's exact output (shape, scaling method) is defined in
# DataEncoder and not visible here; it is consumed by prepare_data() below.
scaled_time_diffs = scale_time_differences_fast_fixed(event, sequence, start_time_col, case_index)

In [None]:
# Sizes read off the encoded arrays; they parameterise the model further down.
event_shape = event_encode.shape
sequence_shape = sequence_encode.shape

num_sequences = event_shape[0]           # first axis of the event encoding
max_num_events = prefix_size             # every prefix has exactly `prefix_size` events
num_event_features = event_shape[2]      # third axis of the event encoding
num_sequence_features = sequence_shape[1]
num_embedding_features = output_dim      # taken from encode_event_prefix_label's last return value

In [None]:
# Combine the encoded events, the text encoding and the scaled time differences
# into one list (indexed per sample by later cells).
event_feature_list = prepare_data(event_encode, text_encode, scaled_time_diffs)
# Sequence-level features as a float tensor.
sequence_features = torch.tensor(sequence_encode, dtype=torch.float)
# Labels as a long tensor. NOTE(review): this rebinds `y_encode` from the
# encoder output to a tensor, so re-running this cell feeds a tensor back into
# torch.tensor(); later cells rely on the tensor form (`y_encode.numpy()`).
y_encode = torch.tensor(y_encode, dtype=torch.long)

In [None]:
# Tally how many samples carry each target label.
labels = y_encode.numpy()
class_counts = Counter(labels)
print("Class distribution:", class_counts)

# Labels that occur exactly once cannot be represented on both sides of a
# stratified split, so they are identified here and dropped below.
single_sample_classes = [cls for cls in class_counts if class_counts[cls] == 1]
print("Classes with only 1 sample:", single_sample_classes)

# True for every sample whose label occurs at least twice.
mask = np.array([class_counts[label] != 1 for label in labels])

# Restrict all three parallel containers to the surviving samples.
filtered_indices = np.where(mask)[0]
filtered_event_features = [event_feature_list[idx] for idx in filtered_indices]
filtered_sequence_features = sequence_features[filtered_indices]
filtered_y = y_encode[filtered_indices]

print(f"Original samples: {len(event_feature_list)}")
print(f"Filtered samples: {len(filtered_event_features)}")
print(f"Removed {len(event_feature_list) - len(filtered_event_features)} samples")

# Stratified 80/20 split over sample positions; the seed is fixed so the
# partition is the same on every run.
sample_positions = range(len(filtered_event_features))
train_indices, test_indices = train_test_split(
    sample_positions,
    test_size=0.2,
    stratify=filtered_y.numpy(),
    random_state=42,
)

# Materialise both partitions for each of the three containers.
train_event_features = [filtered_event_features[pos] for pos in train_indices]
train_sequence_features = filtered_sequence_features[train_indices]
train_y = filtered_y[train_indices]

test_event_features = [filtered_event_features[pos] for pos in test_indices]
test_sequence_features = filtered_sequence_features[test_indices]
test_y = filtered_y[test_indices]

In [None]:
# Create datasets: each one bundles the per-sample event features, the
# sequence-level feature tensor and the labels of its partition.
train_dataset = CustomDataset(train_event_features, train_sequence_features, train_y)
test_dataset = CustomDataset(test_event_features, test_sequence_features, test_y)

In [None]:
# Seed torch's global RNG before the model is built so that weight
# initialisation is repeatable between runs. DataLoaders created afterwards
# draw their shuffle order from the same generator. The value matches the
# random_state used for the train/test split.
SEED = 42
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters. The input/output sizes (num_event_features,
# num_embedding_features, num_sequence_features, output_dim) were computed by
# earlier cells from the encoded data and are used as-is; only the hidden
# sizes are chosen here.
gcn_hidden_dims = 32
embedding_dims = 16
gcn_hidden_dims_embedding = 32
gcn_hidden_dims_concat = 64
fc_hidden_dims = 32
fc_hidden_dims_concat = 64

model = PrefixGCNClassifier(num_event_features,
                            gcn_hidden_dims,
                            num_embedding_features,
                            embedding_dims,
                            gcn_hidden_dims_embedding,
                            gcn_hidden_dims_concat,
                            num_sequence_features,
                            fc_hidden_dims,
                            fc_hidden_dims_concat,
                            output_dim).to(device)

# Adam with a small L2 penalty; cross-entropy with label smoothing.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

In [None]:
# Loaders for training (shuffled) and for the per-epoch evaluation (fixed order).
batch_size = 512
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=custom_collate)

early_stopping = EarlyStopping(patience=3, delta=0.0)

# Filepath to save model
# NOTE(review): the file name mentions BPI13i / prefix15 while this notebook
# currently loads BPI12w with prefix_size = 10 -- confirm the name so a
# checkpoint from another experiment is not overwritten.
model_path = "../output/models/1BPI13i_prefix15_es.pt"
# torch.save does not create missing parent directories.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Constructor arguments of the model, stored with every checkpoint so the
# architecture can be rebuilt when the file is loaded.
config = {
    'num_event_features': num_event_features,
    'gcn_hidden_dims': gcn_hidden_dims,
    'num_embedding_features': num_embedding_features,
    'embedding_dims': embedding_dims,
    'gcn_hidden_dims_embedding': gcn_hidden_dims_embedding,
    'gcn_hidden_dims_concat': gcn_hidden_dims_concat,
    'num_sequence_features': num_sequence_features,
    'fc_hidden_dims': fc_hidden_dims,
    'fc_hidden_dims_concat': fc_hidden_dims_concat,
    'output_dim': output_dim
}


num_epochs = 10


def _save_checkpoint(epoch, train_loss, train_acc, test_loss, test_acc):
    """Write the current model/optimizer state, the given metrics and `config` to `model_path`."""
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'train_loss': train_loss,
        'train_acc': train_acc,
        'test_loss': test_loss,
        'test_acc': test_acc,
        'config': config
    }, model_path)


# Must exist before the loop: it is read after the loop even when no epoch
# ever improved the loss.
best_model_saved = False

# NOTE(review): early stopping and checkpoint selection use the test loader,
# so the reported test metrics are not from fully unseen data.
for epoch in range(num_epochs):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    test_loss, test_acc = evaluate(model, test_loader, criterion, device)

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} | Test Loss: {test_loss:.4f} Acc: {test_acc:.4f}")

    if early_stopping(test_loss):
        print("Early stopping triggered.")
        break

    if early_stopping.best_loss_updated:
        print(f"New best model at epoch {epoch+1}, saving to {model_path}")
        _save_checkpoint(epoch, train_loss, train_acc, test_loss, test_acc)
        best_model_saved = True

if best_model_saved:
    # `model` kept training after the best epoch, so its current weights are
    # those of the last epoch. Restore the weights stored at the best epoch so
    # that `best_model` really is the best model.
    best_checkpoint = torch.load(model_path, map_location=device)
    model.load_state_dict(best_checkpoint['model_state_dict'])
elif not early_stopping.early_stop:
    print("Training completed without early stopping. Saving final model.")
    _save_checkpoint(epoch, train_loss, train_acc, test_loss, test_acc)

best_model = model

In [None]:
# Final evaluation: score the model kept by the training cell on the test loader.
model = best_model

# The classification report comes from the k=3 call; the k=5 call is only
# needed for its accuracy (second element of the returned pair).
class_report, top3_acc = f1_eva(model, test_loader, device, k=3)
top5_acc = f1_eva(model, test_loader, device, k=5)[1]

print(f"Top-3 Accuracy: {top3_acc:.4f}")
print(f"Top-5 Accuracy: {top5_acc:.4f}")

print("\nClassification Report (with F1 scores for each class):")
print(class_report)

In [None]:
# Collect the test samples the model gets wrong. Later cells treat each entry
# as a mapping with 'label', 'pred' and 'event_feats' keys.
errors = get_misclassified_samples(model, test_loader, device)

In [None]:
# Try different feature types and methods
# Group the misclassified samples into `num_clusters` clusters based on their
# 'event_feats'; the result is iterated in parallel with `errors` below.
num_clusters = 10
cluster_ids = cluster_errors(errors, num_clusters, use='event_feats')

In [None]:
# After running cluster_errors(): print a short profile of every error cluster.
num_clusters = 10
for cluster_id in range(num_clusters):
    cluster_samples = [e for e, c in zip(errors, cluster_ids) if c == cluster_id]
    print(f"\nCluster {cluster_id} ({len(cluster_samples)} samples):")

    if not cluster_samples:
        # Nothing to summarise; averaging an empty list would only yield NaN
        # plus a RuntimeWarning and a meaningless "salient feature" index.
        continue

    # Top error patterns: the three most frequent (true label, predicted label) pairs.
    print("Common mistakes:", Counter((e['label'], e['pred']) for e in cluster_samples).most_common(3))

    # Average feature vector of the cluster.
    # NOTE(review): assumes each e['event_feats'] is a 1-D vector of equal
    # length -- confirm against get_misclassified_samples().
    avg_features = np.mean([e['event_feats'] for e in cluster_samples], axis=0)
    # Indices of the five largest mean feature values (ascending order).
    print("Most salient features:", np.argsort(avg_features)[-5:])

In [None]:
# Read the checkpoint written by the training cell onto whichever device is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
checkpoint = torch.load(model_path, map_location=device)
config = checkpoint['config']

# Rebuild the model from the stored config: every constructor argument is
# looked up under its own name.
model_arg_names = (
    'num_event_features',
    'gcn_hidden_dims',
    'num_embedding_features',
    'embedding_dims',
    'gcn_hidden_dims_embedding',
    'gcn_hidden_dims_concat',
    'num_sequence_features',
    'fc_hidden_dims',
    'fc_hidden_dims_concat',
    'output_dim',
)
model = PrefixGCNClassifier(**{name: config[name] for name in model_arg_names})

model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)

# Fresh Adam instance; the saved optimizer state (including its param_groups)
# then replaces the constructor defaults.
optimizer = torch.optim.Adam(model.parameters())
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# Loaders for the resumed run. The batch size is set here, it is not read
# from the checkpoint.
batch_size = 256
loader_options = dict(batch_size=batch_size, collate_fn=custom_collate)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

early_stopping = EarlyStopping(patience=3, delta=0.0)


In [None]:
# Continue from the epoch after the one stored in the checkpoint.
start_epoch = checkpoint['epoch'] + 1

# Reset the bookkeeping for this run; a value left over from the first
# training cell would otherwise suppress the fallback save below.
best_model_saved = False
ran_any_epoch = False

# NOTE(review): `early_stopping` was re-created for the resumed run, so
# presumably it does not know the loss stored in the checkpoint and the first
# resumed epoch can overwrite a better checkpoint -- confirm against EarlyStopping.
# The upper bound is `num_epochs`, the same value the progress line prints.
for epoch in range(start_epoch, num_epochs):
    ran_any_epoch = True
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    test_loss, test_acc = evaluate(model, test_loader, criterion, device)

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} | Test Loss: {test_loss:.4f} Acc: {test_acc:.4f}")

    if early_stopping(test_loss):
        print("Early stopping triggered.")
        break

    if early_stopping.best_loss_updated:
        print(f"New best model at epoch {epoch+1}, saving to {model_path}")
        best_model_saved = True
        # Save state dict and config
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': train_loss,
            'train_acc': train_acc,
            'test_loss': test_loss,
            'test_acc': test_acc,
            'config': config
        }, model_path)

# Fallback save only when at least one epoch actually ran; otherwise `epoch`
# and the metrics would be stale values from an earlier cell.
if ran_any_epoch and not early_stopping.early_stop and not best_model_saved:
    print("Training completed without early stopping. Saving final model.")
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'train_loss': train_loss,
        'train_acc': train_acc,
        'test_loss': test_loss,
        'test_acc': test_acc,
        'config': config
    }, model_path)