# Helper Functions

In [1]:
import sys
import os

import os
cwd = os.getcwd()
# Build the relative path from os.pardir so the separator matches the host OS.
# The previous literal '..\..' only resolved to the grandparent directory on
# Windows (and '\.' is an invalid escape sequence in a normal string literal).
root_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
# Put that directory first on sys.path; presumably it is the repository root
# that holds the `utils` package imported by the following cells.
sys.path.insert(0, root_path)

print(sys.path)

['c:\\Users\\ronal\\OneDrive\\Documents\\GitHub\\BPAD', 'c:\\Users\\ronal\\OneDrive\\Documents\\GitHub\\BPAD\\analysis\\raw', 'c:\\Users\\ronal\\miniconda3\\envs\\rcvdb-thesis-bpad\\python39.zip', 'c:\\Users\\ronal\\miniconda3\\envs\\rcvdb-thesis-bpad\\DLLs', 'c:\\Users\\ronal\\miniconda3\\envs\\rcvdb-thesis-bpad\\lib', 'c:\\Users\\ronal\\miniconda3\\envs\\rcvdb-thesis-bpad', '', 'C:\\Users\\ronal\\AppData\\Roaming\\Python\\Python39\\site-packages', 'C:\\Users\\ronal\\AppData\\Roaming\\Python\\Python39\\site-packages\\win32', 'C:\\Users\\ronal\\AppData\\Roaming\\Python\\Python39\\site-packages\\win32\\lib', 'C:\\Users\\ronal\\AppData\\Roaming\\Python\\Python39\\site-packages\\Pythonwin', 'c:\\Users\\ronal\\miniconda3\\envs\\rcvdb-thesis-bpad\\lib\\site-packages', 'c:\\Users\\ronal\\miniconda3\\envs\\rcvdb-thesis-bpad\\lib\\site-packages\\win32', 'c:\\Users\\ronal\\miniconda3\\envs\\rcvdb-thesis-bpad\\lib\\site-packages\\win32\\lib', 'c:\\Users\\ronal\\miniconda3\\envs\\rcvdb-thesis-bpa

In [2]:
import re
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm

from utils.fs import RESULTS_RAW_DIR

import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.inset_locator import inset_axes, mark_inset
from utils.enums import Perspective

In [3]:
import os

def list_subfolders(experiment_name):
    """Return the names of all sub-directories of an experiment folder.

    Args:
        experiment_name: Folder name below ``RESULTS_RAW_DIR``.

    Returns:
        list[str]: Directory names (not full paths), in ``os.listdir`` order.
    """
    experiment_path = os.path.join(RESULTS_RAW_DIR, experiment_name)

    run_names = []
    for entry in os.listdir(experiment_path):
        # Plain files are skipped; only folders are reported.
        if os.path.isdir(os.path.join(experiment_path, entry)):
            run_names.append(entry)
    return run_names

In [4]:
import json

def load_results(run_name, verbose=False, directory=None):
    """Load every ``.npy`` file of a run folder into a dictionary.

    Args:
        run_name: Name of the run folder.
        verbose: When true, print the shape and key of each loaded array.
        directory: Optional experiment folder that contains the run folder.

    Returns:
        dict: Maps each file name (without the ``.npy`` extension) to the
        array returned by ``np.load``.
    """
    if directory:
        path_parts = [RESULTS_RAW_DIR, directory, run_name]
    else:
        path_parts = [RESULTS_RAW_DIR, run_name]
    run_path = os.path.join(*path_parts)

    loaded_data = {}
    for file_name in os.listdir(run_path):
        if not file_name.endswith('.npy'):
            continue

        # The file name without its extension becomes the dictionary key.
        key, _extension = os.path.splitext(file_name)
        loaded_data[key] = np.load(os.path.join(run_path, file_name))

        if verbose:
            print(f"{loaded_data[key].shape} \t {key}")

    return loaded_data

def load_config(run_name, directory=None):
    """Read the ``config.json`` stored in a run folder.

    Args:
        run_name: Name of the run folder.
        directory: Optional experiment folder that contains the run folder.

    Returns:
        The parsed JSON content of ``config.json``.
    """
    location = [RESULTS_RAW_DIR, directory, run_name] if directory else [RESULTS_RAW_DIR, run_name]
    config_path = os.path.join(*location, "config.json")

    with open(config_path, "r") as config_file:
        return json.load(config_file)

def get_buckets(keys):
    """Collect every integer that appears in any of the given key strings.

    Args:
        keys: Iterable of strings (e.g. the keys of a loaded results dict).

    Returns:
        A sorted list of the distinct integers found, or ``None`` when no key
        contains a digit sequence.
    """
    found = {int(token) for key in keys for token in re.findall(r'\d+', key)}
    return sorted(found) if found else None

In [5]:
def plot_losses(results, labels, directory, run_name, perspective, level, bucket=None, zoom=[[11000,13000],[-0.05, 0.2]], show_plots=True):
    """Scatter-plot the normalized errors of one perspective and save the figure.

    Args:
        results: 1-D sequence of error values; min-max scaled to [0, 1] before plotting.
        labels: 2-D array; column ``perspective`` holds 0 (normal) / 1 (anomalous)
            for every entry of ``results``.
        directory: Experiment folder name, used in the subtitle and output path.
        run_name: Run name, used in the subtitle and output path.
        perspective: Column index into ``labels`` and index into ``Perspective.values()``.
        level: Level name (e.g. ``'trace'``) used in the title and file name.
        bucket: Optional bucket size mentioned in the title and file name.
        zoom: ``[xlim, ylim]`` of an inset zoom, or a falsy value for no inset.
            The default list is never mutated here.
        show_plots: Whether to call ``plt.show()`` after saving.

    Returns:
        None. The figure is written to ``plots/<directory>/<run_name>/``.
    """
    def scatter_plot(axis, values, flags):
        positions = np.arange(len(values))
        axis.scatter(positions[flags == 0], values[flags == 0], c='grey', s=3, label='Normal Prefixes', zorder=1)
        axis.scatter(positions[flags == 1], values[flags == 1], c='red', s=3, label='Anomalous Prefixes', zorder=2)
        axis.grid(True)

    subtitle = f'{directory}     {run_name}'

    # Check for empty input BEFORE normalizing: min()/max() of an empty array
    # raise, which previously made the error message below unreachable.
    results = np.asarray(results)
    if results.size == 0:
        print(f'ERROR no results found for {subtitle}')
        return

    # Normalize results
    results = np.interp(results, (results.min(), results.max()), (0, 1))

    fig, ax = plt.subplots(figsize=(15, 6))
    try:
        labels = labels[:, perspective]
        scatter_plot(ax, results, labels)

        perspective_name = Perspective.values()[perspective]

        bucket_string = ''
        if bucket is not None:
            bucket_string = f'with bucket size {str(bucket)}'

        title = f'Error per Prefix on the {perspective_name} perspective at {level} level {bucket_string}'

        plt.title(f'{title}\n{subtitle}')
        plt.xlabel('Prefix Index')
        plt.ylabel('Loss')

        if zoom:
            axins = inset_axes(ax, width="60%", height="60%", loc='upper right')

            scatter_plot(axins, results, labels)
            axins.set_xlim(zoom[0])
            axins.set_ylim(zoom[1])
            # The return value is not needed, so it is no longer unpacked.
            ax.indicate_inset_zoom(axins, edgecolor="black", linewidth=3)

        # NOTE(review): these pyplot calls act on the *current* axes, which may
        # be the inset when `zoom` is given; without an inset they overwrite the
        # 'Prefix Index' / 'Loss' labels set above. Confirm which is intended.
        plt.xlabel('Case Index')
        plt.ylabel('Error')
        plt.legend(loc='upper right')

        # os.path.join keeps the output path portable; the previous literal used
        # hard-coded backslashes and left a trailing space in the folder name.
        plot_dir = os.path.join("plots", str(directory), str(run_name))
        os.makedirs(plot_dir, exist_ok=True)
        fig.savefig(os.path.join(plot_dir, f"{perspective_name}_{level}_{bucket_string}.png"), format='png', dpi=300)

        if show_plots:
            plt.show()
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)

def bucket_plot_losses(results_name, labels_name, run_name, directory, bucket_lengths, results, perspective, level, zoom=[[11000,13000],[-0.05, 0.2]], show_plots=True, pbar=None):
    """Call ``plot_losses`` once per bucket (or once when there are no buckets).

    Args:
        results_name: Key (prefix) of the error array inside ``results``.
        labels_name: Key (prefix) of the label array inside ``results``.
        run_name: Run name forwarded to ``plot_losses``.
        directory: Experiment folder name forwarded to ``plot_losses``.
        bucket_lengths: Iterable of bucket sizes; each one selects the keys
            ``f'{results_name}_{bucket}'`` and ``f'{labels_name}_{bucket}'``.
            ``None`` means the un-suffixed keys are plotted once.
        results: Mapping from key to array.
        perspective: Forwarded to ``plot_losses``.
        level: Forwarded to ``plot_losses``.
        zoom: Forwarded to ``plot_losses``.
        show_plots: Forwarded to ``plot_losses``.
        pbar: Optional progress bar; ``update(1)`` is called after every plot.
    """
    if bucket_lengths is None:
        plot_targets = [(results_name, labels_name, None)]
    else:
        plot_targets = [
            (f'{results_name}_{bucket}', f'{labels_name}_{bucket}', bucket)
            for bucket in bucket_lengths
        ]

    for results_key, labels_key, bucket in plot_targets:
        plot_losses(
            results=results[results_key],
            labels=results[labels_key],
            directory=directory,
            run_name=run_name, perspective=perspective, level=level, bucket=bucket, zoom=zoom, show_plots=show_plots)
        # Compare against None explicitly: the truthiness of a progress-bar
        # object can depend on its state (tqdm defines __bool__), so `if pbar:`
        # could silently skip updates or raise.
        if pbar is not None:
            pbar.update(1)


# Loading Runs

In [6]:
# TODO: Some plots seem to be empty, check if this is due to a data saving or plotting error
# TODO: Rerun DAE_Finetuned_Embedding with latest modifications to easily show config in plot
# TODO: Run on the event level

# TODO: BPIC2015 workload seems to be bugged on the datagen level (also present with the synthetic dataset)
# Seems to be a problem with division errors during AD

# TODO: Seems to be an error when using no buckets with bpic2015 and ATC
# ValueError: operands could not be broadcast together with shapes (35483,1065) (35483,8060)

# TODO: Run bpic without bucketing
# TODO: Run DAE with a more restrictive hidden layer config

In [7]:
# Experiment folders that have been analysed with this notebook; kept for reference.
# directories = [
#     "Trace2Vec_Synthetic", # Contains synthetic Trace2Vec encoding tests
#     'DAE_bpic2015_prefixes', # Contains real world no prefixes all encoding methods (no trace2vec)
#     'DAE_bpic2015_no_prefixes_v2', # Contains real world with prefixes all encoding methods (no trace2vec)
#     'DAE_Finetuned_Embedding', # Contains synthetic with prefixes all encoding methods (no trace2vec)
#     'DAE_bpic2015_no_buckets',
#     'DAE_bpic2015_no_buckets_real_world',
#     'DAE_finetuned_embedding_batch_size_1'] 
directories = [
    'DAE_debug'] 
# Only the last entry of `directories` is processed by the cells below.
directory = directories[-1]
print(directory)

# Switches for the scoring and plotting sections.
# NOTE(review): a later cell defines `def score_results(...)`, which rebinds this
# name to a function; the `if score_results:` check further down therefore no
# longer sees this flag. Consider renaming one of the two.
score_results = True
plot_results = True

DAE_debug


In [8]:
# Names of the run folders found inside the selected experiment directory.
run_list = list_subfolders(directory)
print(run_list)

# Uncomment to restrict the analysis to the first run only.
# run_list = [run_list[0]]
# print(run_list)

['24-11-18-13-54_DAE_EncodingCategorical.WORD_2_VEC_ATC_EncodingNumerical.MIN_MAX_SCALING']


In [9]:
# Collect, for every run folder, its loaded arrays, its configuration and the
# bucket sizes parsed from the array names.
runs = []
for run_name in run_list:
    results = load_results(directory=directory, run_name=run_name)
    config = load_config(directory=directory, run_name=run_name)
    buckets = get_buckets(results.keys())

    runs.append(dict(name=run_name, results=results, config=config, buckets=buckets))

# Plot Results

In [10]:
# One entry per plot series, read positionally by the plotting cell as
# (results key prefix, labels key prefix, perspective, level).
results_config = [
    ('result_DAE_trace_Order', 'labels_DAE_trace', Perspective.ORDER, 'trace'),
    ('result_DAE_trace_Attribute', 'labels_DAE_trace', Perspective.ATTRIBUTE, 'trace'),
    ('result_DAE_trace_Arrival Time', 'labels_DAE_trace', Perspective.ARRIVAL_TIME, 'trace'),
    ('result_DAE_trace_Workload', 'labels_DAE_trace', Perspective.WORKLOAD, 'trace'),
]

In [11]:
if plot_results:
    show_plots = False

    # Number of plots per results_config entry. A run whose `buckets` is None
    # (get_buckets found no numbers) is still plotted once by bucket_plot_losses,
    # so it counts as one instead of crashing on len(None).
    nr_buckets = sum(
        len(run["buckets"]) if run["buckets"] is not None else 1
        for run in runs
    )

    total_iterations = nr_buckets * len(results_config)
    with tqdm(total=total_iterations, desc="Generating Plots") as pbar:
        for run in runs:
            # Unpack the tuple into named fields; this also avoids rebinding the
            # global `config` that the run-loading cell assigned.
            for results_name, labels_name, perspective, level in results_config:
                bucket_plot_losses(
                    results_name=results_name,
                    labels_name=labels_name,
                    directory=directory,
                    run_name=run["name"],
                    bucket_lengths=run["buckets"],
                    results=run["results"],
                    perspective=perspective,
                    level=level,
                    zoom=None,
                    show_plots=show_plots,
                    pbar=pbar)


Generating Plots: 100%|██████████| 32/32 [00:14<00:00,  2.26it/s]


# Score Results

In [12]:
def get_indexes_by_value(arr):
    """Group the positions of a sequence by the value found at each position.

    Args:
        arr: Iterable of hashable values.

    Returns:
        dict: Maps each distinct value to the ascending list of indexes at
        which it occurs, in order of first appearance.
    """
    value_to_indexes = {}
    for position, item in enumerate(arr):
        value_to_indexes.setdefault(item, []).append(position)
    return value_to_indexes

def normalize(array):
    """Linearly rescale the values of ``array`` from [min, max] to [0, 1].

    Args:
        array: Array-like of numbers; it is copied into a NumPy array first.

    Returns:
        numpy.ndarray with the same shape as the input.
    """
    data = np.array(array)
    lowest, highest = data.min(), data.max()
    return np.interp(data, (lowest, highest), (0, 1))

def extract_number(key):
    """Return the integer that follows the last underscore of ``key``.

    Raises:
        ValueError: If the trailing segment is not an integer literal.
    """
    _head, _separator, tail = key.rpartition('_')
    return int(tail)

In [13]:
def process_attribute_labels(output, values, case_length, perspective, perspective_label_indices):
    """Append the flattened labels of one perspective to ``output``.

    Args:
        output: List that receives one flattened array per call.
        values: 4-D array whose first axis is indexed by ``perspective``.
        case_length: Number of leading positions kept along the third axis of ``values``.
        perspective: Index into the first axis of ``values`` and key into
            ``perspective_label_indices``.
        perspective_label_indices: Maps a perspective to the indexes selected
            along the last axis.

    Returns:
        None; the result is appended to ``output``.
    """
    # Select the perspective and cut the third axis down to the case length.
    truncated = values[perspective, :, :, :][:, :case_length, :]

    # Keep only the last-axis entries that belong to this perspective.
    selected = truncated[:, :, perspective_label_indices[perspective]]

    output.append(selected.reshape(-1))

In [14]:
def reshape_data_for_scoring(results, perspective_label_indices):
    """Group per-bucket label and score arrays by level and perspective.

    Every key of ``results`` is expected to end in ``_<perspective>_<length>``
    (the last two ``_``-separated tokens). Keys containing ``'losses'`` are
    ignored, keys containing ``'labels'`` are treated as ground truth and keys
    containing ``'result'`` as anomaly scores. The level is taken from the
    substring ``'attribute'``, ``'event'`` or ``'trace'`` in the key.

    Args:
        results: Mapping from key to array; iteration order determines the
            order in which the arrays are concatenated.
        perspective_label_indices: Passed on to ``process_attribute_labels``.

    Returns:
        tuple: ``(labels_attribute, labels_event, labels_trace,
        result_attribute, result_event, result_trace)``. ``labels_event`` and
        ``labels_trace`` are arrays concatenated along axis 1; the other four
        are lists holding one concatenated array per perspective in the order
        Order, Attribute, Arrival Time, Workload.
        NOTE(review): that order presumably matches the integer values of
        ``Perspective`` used by the caller for indexing - confirm.

    Raises:
        ValueError: If a key does not end in an integer, or if any of the
            groups is empty (``np.concatenate`` of an empty list).
    """
    # Order in which perspective names are matched against the key token
    # (same precedence as the former if/elif chains).
    match_order = ('Arrival Time', 'Order', 'Workload', 'Attribute')
    # Order of the per-perspective entries in the returned lists.
    output_order = ('Order', 'Attribute', 'Arrival Time', 'Workload')
    attribute_label_perspectives = (
        ('Arrival Time', Perspective.ARRIVAL_TIME),
        ('Attribute', Perspective.ATTRIBUTE),
        ('Order', Perspective.ORDER),
        ('Workload', Perspective.WORKLOAD),
    )

    attribute_labels = {name: [] for name in output_order}
    event_labels = []
    trace_labels = []
    scores = {
        level: {name: [] for name in output_order}
        for level in ('attribute', 'event', 'trace')
    }

    for key, value in results.items():
        # Skip loss arrays before parsing, so a loss key without a trailing
        # bucket length cannot abort the whole run.
        if 'losses' in key:
            continue

        key_parts = key.split('_')
        length = int(key_parts[-1])
        perspective = key_parts[-2]

        if 'labels' in key:
            if 'attribute' in key:
                # Move the last axis to the front so it can be indexed by perspective.
                transposed_value = np.transpose(value, (3, 0, 1, 2))
                for name, perspective_id in attribute_label_perspectives:
                    process_attribute_labels(
                        output=attribute_labels[name],
                        values=transposed_value,
                        case_length=length,
                        perspective=perspective_id,
                        perspective_label_indices=perspective_label_indices)
            elif 'event' in key:
                perspective_value = np.transpose(value, (2, 0, 1))[:, :, :length]
                event_labels.append(perspective_value.reshape(perspective_value.shape[0], -1))
            elif 'trace' in key:
                trace_labels.append(np.transpose(value, (1, 0)))
        elif 'result' in key:
            # The three levels are mutually exclusive (the second test used to
            # be a separate `if`, unlike the parallel chain for the labels).
            if 'attribute' in key:
                level = 'attribute'
                value = normalize(value.reshape(-1))
            elif 'event' in key:
                level = 'event'
                value = normalize(value.reshape(-1))
            elif 'trace' in key:
                level = 'trace'
                value = normalize(value)
            else:
                continue

            for name in match_order:
                if name in perspective:
                    scores[level][name].append(value)
                    break

    labels_DAE_event = np.concatenate(event_labels, axis=1)
    labels_DAE_trace = np.concatenate(trace_labels, axis=1)

    labels_DAE_attribute = [np.concatenate(attribute_labels[name], axis=0) for name in output_order]
    result_DAE_attribute = [np.concatenate(scores['attribute'][name], axis=0) for name in output_order]
    result_DAE_event = [np.concatenate(scores['event'][name], axis=0) for name in output_order]
    result_DAE_trace = [np.concatenate(scores['trace'][name], axis=0) for name in output_order]

    return labels_DAE_attribute, labels_DAE_event, labels_DAE_trace, result_DAE_attribute, result_DAE_event, result_DAE_trace

In [15]:
from sklearn.metrics import roc_auc_score, average_precision_score

def score_results(y_trues, pred_probs, perspective):
    """Compute ROC-AUC and PR-AUC for a single perspective.

    Args:
        y_trues: Indexable collection of ground-truth label arrays.
        pred_probs: Indexable collection of score arrays, aligned with ``y_trues``.
        perspective: Index selecting the entry to score in both collections.

    Returns:
        tuple: ``(roc_auc, pr_auc)`` where ``pr_auc`` is the average precision.
    """
    targets = y_trues[perspective][:]
    predictions = pred_probs[perspective][:]

    area_under_roc = roc_auc_score(targets, predictions)
    area_under_pr = average_precision_score(targets, predictions)
    return area_under_roc, area_under_pr

In [16]:
import itertools
import pandas as pd

def score(run):
    """Score one run on every level and perspective.

    Args:
        run: Dict with the keys ``'results'`` (mapping from array name to
            array) and ``'config'`` (the run's parsed configuration).

    Returns:
        pandas.DataFrame with one row per (level, perspective) combination
        that could be scored. Combinations that fail are reported on stdout
        and skipped.
    """
    config = run['config']

    # Process the arrays in ascending order of the number that ends their key.
    sorted_results = dict(sorted(run['results'].items(), key=lambda item: extract_number(item[0])))
    perspective_label_indices = get_indexes_by_value(config['attribute_perspectives_original'])

    (
        labels_DAE_attribute,
        labels_DAE_event,
        labels_DAE_trace,
        result_DAE_attribute,
        result_DAE_event,
        result_DAE_trace
    ) = reshape_data_for_scoring(results=sorted_results, perspective_label_indices=perspective_label_indices)

    levels = (
        ('trace', labels_DAE_trace, result_DAE_trace),
        ('event', labels_DAE_event, result_DAE_event),
        ('attribute', labels_DAE_attribute, result_DAE_attribute),
    )
    perspectives = Perspective.keys()

    scores = []
    for (level, labels, predictions), perspective in itertools.product(levels, perspectives):
        try:
            roc_auc, pr_auc = score_results(labels, predictions, perspective)

            scores.append({
                # High level differentiators
                'run_name':config['run_name'],
                'model':config['model'],
                'dataset':config['dataset'],
                'repeat':config['repeat'],
                # Level/Perspectives
                'level': level,
                'perspective': Perspective.values()[perspective],
                # Scores
                'roc_auc': roc_auc,
                'pr_auc': pr_auc,
                'run_time': config['run_time'],
                # Config
                'batch_size':config['batch_size'],
                'prefix':config['prefix'],
                'buckets':config['bucket_boundaries'],
                'categorical_encoding':config['categorical_encoding'],
                'numerical_encoding':config['numerical_encoding'],
                'vector_size':config['vector_size'],
                'window_size':config['window_size']
            })
        except Exception as error:
            # Stay best-effort, but say why the combination was skipped and do
            # not trap KeyboardInterrupt/SystemExit like a bare `except:` would.
            print(level, perspective, f'{type(error).__name__}: {error}')

    return pd.DataFrame(scores)

In [17]:
# Score only the last loaded run.
# NOTE(review): at this point `score_results` refers to the function defined in
# an earlier cell, not to the boolean flag from the configuration cell, so this
# condition is always true. Rename the flag or the function to restore the switch.
# for run in runs:
if score_results:
    scores_df = score(run=runs[-1])

In [18]:
# Display the score table (one row per level and perspective).
scores_df

Unnamed: 0,run_name,model,dataset,repeat,level,perspective,roc_auc,pr_auc,run_time,batch_size,prefix,buckets,categorical_encoding,numerical_encoding,vector_size,window_size
0,DAE_debug,DAE,medium_debug_v2-0.15-4_1.json.gz,,trace,Order,0.613941,0.056915,71.211199,8,True,"[3, 4, 5, 6, 7, 8, 9, 13]",Word2Vec Average Then Concatinate,Min Max Scaling,200,10
1,DAE_debug,DAE,medium_debug_v2-0.15-4_1.json.gz,,trace,Attribute,0.537034,0.051473,71.211199,8,True,"[3, 4, 5, 6, 7, 8, 9, 13]",Word2Vec Average Then Concatinate,Min Max Scaling,200,10
2,DAE_debug,DAE,medium_debug_v2-0.15-4_1.json.gz,,trace,Arrival Time,0.516534,0.045274,71.211199,8,True,"[3, 4, 5, 6, 7, 8, 9, 13]",Word2Vec Average Then Concatinate,Min Max Scaling,200,10
3,DAE_debug,DAE,medium_debug_v2-0.15-4_1.json.gz,,trace,Workload,0.815386,0.076581,71.211199,8,True,"[3, 4, 5, 6, 7, 8, 9, 13]",Word2Vec Average Then Concatinate,Min Max Scaling,200,10
4,DAE_debug,DAE,medium_debug_v2-0.15-4_1.json.gz,,event,Order,0.564894,0.012094,71.211199,8,True,"[3, 4, 5, 6, 7, 8, 9, 13]",Word2Vec Average Then Concatinate,Min Max Scaling,200,10
5,DAE_debug,DAE,medium_debug_v2-0.15-4_1.json.gz,,event,Attribute,0.528422,0.014549,71.211199,8,True,"[3, 4, 5, 6, 7, 8, 9, 13]",Word2Vec Average Then Concatinate,Min Max Scaling,200,10
6,DAE_debug,DAE,medium_debug_v2-0.15-4_1.json.gz,,event,Arrival Time,0.506849,0.009656,71.211199,8,True,"[3, 4, 5, 6, 7, 8, 9, 13]",Word2Vec Average Then Concatinate,Min Max Scaling,200,10
7,DAE_debug,DAE,medium_debug_v2-0.15-4_1.json.gz,,event,Workload,0.814324,0.011861,71.211199,8,True,"[3, 4, 5, 6, 7, 8, 9, 13]",Word2Vec Average Then Concatinate,Min Max Scaling,200,10
8,DAE_debug,DAE,medium_debug_v2-0.15-4_1.json.gz,,attribute,Order,0.564894,0.012094,71.211199,8,True,"[3, 4, 5, 6, 7, 8, 9, 13]",Word2Vec Average Then Concatinate,Min Max Scaling,200,10
9,DAE_debug,DAE,medium_debug_v2-0.15-4_1.json.gz,,attribute,Attribute,0.526387,0.005618,71.211199,8,True,"[3, 4, 5, 6, 7, 8, 9, 13]",Word2Vec Average Then Concatinate,Min Max Scaling,200,10
