# Helper Functions

In [1]:
import sys
import os

import os
cwd = os.getcwd()
# Make the repository root (two levels above this notebook's working directory)
# importable. The relative path is assembled with os.path.join/os.pardir instead
# of the literal '..\..': that literal contains the invalid escape sequence '\.'
# and only resolves to the grandparent directory on Windows.
root_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
sys.path.insert(0, root_path)

print(sys.path)

['c:\\Users\\ronal\\OneDrive\\Documents\\GitHub\\BPAD', 'c:\\Users\\ronal\\OneDrive\\Documents\\GitHub\\BPAD\\analysis\\raw', 'c:\\Users\\ronal\\miniconda3\\envs\\rcvdb-thesis-bpad\\python39.zip', 'c:\\Users\\ronal\\miniconda3\\envs\\rcvdb-thesis-bpad\\DLLs', 'c:\\Users\\ronal\\miniconda3\\envs\\rcvdb-thesis-bpad\\lib', 'c:\\Users\\ronal\\miniconda3\\envs\\rcvdb-thesis-bpad', '', 'C:\\Users\\ronal\\AppData\\Roaming\\Python\\Python39\\site-packages', 'C:\\Users\\ronal\\AppData\\Roaming\\Python\\Python39\\site-packages\\win32', 'C:\\Users\\ronal\\AppData\\Roaming\\Python\\Python39\\site-packages\\win32\\lib', 'C:\\Users\\ronal\\AppData\\Roaming\\Python\\Python39\\site-packages\\Pythonwin', 'c:\\Users\\ronal\\miniconda3\\envs\\rcvdb-thesis-bpad\\lib\\site-packages', 'c:\\Users\\ronal\\miniconda3\\envs\\rcvdb-thesis-bpad\\lib\\site-packages\\win32', 'c:\\Users\\ronal\\miniconda3\\envs\\rcvdb-thesis-bpad\\lib\\site-packages\\win32\\lib', 'c:\\Users\\ronal\\miniconda3\\envs\\rcvdb-thesis-bpa

In [2]:
import re
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm

from utils.fs import RESULTS_RAW_DIR

import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.inset_locator import inset_axes, mark_inset
from utils.enums import Perspective

In [3]:
import os

def list_subfolders(experiment_name):
    """Return the names of the immediate sub-directories of an experiment folder.

    The experiment folder is looked up as ``RESULTS_RAW_DIR/<experiment_name>``.
    Plain files are skipped; names are returned in the order ``os.listdir``
    yields them.
    """
    experiment_path = os.path.join(RESULTS_RAW_DIR, experiment_name)
    subfolders = []
    for entry in os.listdir(experiment_path):
        # Keep directories only; anything else in the folder is not a run.
        if os.path.isdir(os.path.join(experiment_path, entry)):
            subfolders.append(entry)
    return subfolders

In [4]:
def load_results(run_name, verbose=False,directory=None):
    """Load every ``.npy`` file of one run into a dictionary.

    Args:
        run_name: Name of the run folder.
        verbose: When true, print the shape and key of each array as it is loaded.
        directory: Optional experiment folder containing the run. When falsy the
            run folder is looked up directly under ``RESULTS_RAW_DIR``.

    Returns:
        dict mapping each file name (without the ``.npy`` extension) to the
        array returned by ``np.load``.
    """
    if directory:
        path_parts = [RESULTS_RAW_DIR, directory, run_name]
    else:
        path_parts = [RESULTS_RAW_DIR, run_name]
    run_path = os.path.join(*path_parts)

    loaded_data = {}
    for file_name in os.listdir(run_path):
        if not file_name.endswith('.npy'):
            continue

        # The dictionary key is the file name stripped of its extension.
        key = os.path.splitext(file_name)[0]
        array = np.load(os.path.join(run_path, file_name))
        loaded_data[key] = array

        if verbose:
            print(f"{array.shape} \t {key}")

    return loaded_data

In [5]:
def get_buckets(keys):
    """Collect every integer that appears in any of the given key strings.

    Each maximal run of digits in a key counts as one number. Returns the
    distinct numbers as an ascending list, or ``None`` when no key contains
    a digit (including when ``keys`` is empty).
    """
    found = {int(token) for key in keys for token in re.findall(r'\d+', key)}
    return sorted(found) if found else None

In [6]:
def plot_losses(results, labels, directory, run_name, perspective, level, bucket=None, zoom=[[11000,13000],[-0.05, 0.2]], show_plots=True):
    """Scatter-plot one error value per index, coloured by label, and save it as a PNG.

    Args:
        results: Sequence of error values, one per plotted point. It is indexed
            with boolean masks, so it is presumably a 1-D NumPy array (confirm
            against the saved result files).
        labels: Array indexed as ``labels[:, perspective]``; entries equal to 0
            are drawn grey ("Normal Prefixes"), entries equal to 1 red
            ("Anomalous Prefixes").
        directory: Experiment folder name; used in the subtitle and output path.
        run_name: Run folder name; used in the subtitle and output path.
        perspective: Column index into ``labels`` and index into
            ``Perspective.values()`` for the display name.
        level: Text inserted into the title and the output file name.
        bucket: Optional bucket size added to the title and the file name.
        zoom: ``[[x_min, x_max], [y_min, y_max]]`` region shown in an inset;
            any falsy value disables the inset. The default list is only read,
            never mutated.
        show_plots: Call ``plt.show()`` before closing the figure when true.

    Side effects:
        Prints the title (or an error line when ``results`` is empty) and writes
        ``plots/<directory>/<run_name>/<perspective>_<level>_<bucket text>.png``
        relative to the current working directory.
    """
    def scatter_plot(target_ax, values, point_labels):
        # Normal points first (zorder 1) so the anomalous ones stay visible on top.
        x_values = np.arange(len(values))
        target_ax.scatter(x_values[point_labels == 0], values[point_labels == 0], c='grey', s=3, label='Normal Prefixes', zorder=1)
        target_ax.scatter(x_values[point_labels == 1], values[point_labels == 1], c='red', s=3, label='Anomalous Prefixes', zorder=2)
        target_ax.grid(True)

    subtitle = f'{directory}     {run_name}'
    if len(results) == 0:
        print(f'ERROR no results found for {subtitle}')
        return

    fig, ax = plt.subplots(figsize=(15, 6))

    labels = labels[:, perspective]
    scatter_plot(ax, results, labels)

    perspective_name = Perspective.values()[perspective]

    bucket_string = ''
    if bucket is not None:
        bucket_string = f'with bucket size {str(bucket)}'

    title = f'Error per Prefix on the {perspective_name} perspective at {level} level {bucket_string}'

    # Print to keep track of plotting
    print(f'\t {title}')

    # Decorate the main axes explicitly. The pyplot helpers act on the *current*
    # axes, which becomes the inset once inset_axes() has been called, so the
    # labels used to end up on the inset whenever zoom was enabled.
    # 'Case Index' / 'Error' are the labels that took effect without zoom.
    ax.set_title(f'{title}\n{subtitle}')
    ax.set_xlabel('Case Index')
    ax.set_ylabel('Error')

    legend_ax = ax
    if zoom:
        axins = inset_axes(ax, width="60%", height="60%", loc='upper right')

        scatter_plot(axins, results, labels)
        axins.set_xlim(zoom[0])
        axins.set_ylim(zoom[1])
        # The return value is deliberately ignored: its type differs between
        # Matplotlib versions (tuple vs. single artist), so unpacking it is fragile.
        ax.indicate_inset_zoom(axins, edgecolor="black", linewidth=3)
        # The inset covers the upper-right corner of the main axes, so the
        # legend is drawn inside the inset (where it appeared before as well).
        legend_ax = axins
    legend_ax.legend(loc='upper right')

    # Build the path with os.path.join: portable, no invalid '\{' escapes and
    # no stray trailing space in the folder name.
    plot_path = os.path.join("plots", directory, run_name)
    os.makedirs(plot_path, exist_ok=True)
    # File name kept unchanged so previously generated plots are overwritten, not duplicated.
    file_name = f"{perspective_name}_{level}_{bucket_string}.png"
    fig.savefig(os.path.join(plot_path, file_name), format='png', dpi=300)

    if show_plots:
        plt.show()
    # Close this specific figure to free memory when many plots are generated.
    plt.close(fig)

def bucket_plot_losses(results_name, labels_name, run_name, directory, bucket_lengths, results, perspective, level, zoom=[[11000,13000],[-0.05, 0.2]], show_plots=True):
    """Call ``plot_losses`` once per bucket, or once overall when there are no buckets.

    Args:
        results_name: Base key of the error values inside ``results``.
        labels_name: Base key of the labels inside ``results``.
        run_name: Forwarded to ``plot_losses``.
        directory: Forwarded to ``plot_losses``.
        bucket_lengths: Iterable of bucket sizes, or ``None``. With ``None`` the
            base keys are used as-is; otherwise ``_<bucket>`` is appended to
            both keys for each bucket.
        results: Mapping from key to loaded data.
        perspective: Forwarded to ``plot_losses``.
        level: Forwarded to ``plot_losses``.
        zoom: Forwarded to ``plot_losses``.
        show_plots: Forwarded to ``plot_losses``.
    """
    # Pair every bucket (or None) with the suffix that selects its entries.
    if bucket_lengths is None:
        targets = [(None, '')]
    else:
        targets = [(size, f'_{size}') for size in bucket_lengths]

    for bucket, suffix in targets:
        plot_losses(
            results=results[f'{results_name}{suffix}'],
            labels=results[f'{labels_name}{suffix}'],
            directory=directory,
            run_name=run_name,
            perspective=perspective,
            level=level,
            bucket=bucket,
            zoom=zoom,
            show_plots=show_plots,
        )


# Loading Runs

In [7]:
# TODO: Some plots seem to be empty, check if this is due to a data saving or plotting error
# TODO: Rerun DAE_Finetuned_Embedding with latest modifications to easily show config in plot
# TODO: Run on the event level

# TODO: BPIC2015 workload seems to be bugged on the datagen level (also present with the synthetic dataset)
# Seems to be a problem with division errors during AD

# TODO: Seems to be an error when using no buckets with bpic2015 and ATC
# ValueError: operands could not be broadcast together with shapes (35483,1065) (35483,8060)

# TODO: Run bpic without bucketing
# TODO: Run DAE in a more restrictive hidden layer config

In [8]:
# Experiment folders that can be analysed with this notebook.
directories = [
    'Trace2Vec_Synthetic',  # synthetic data, Trace2Vec encoding tests
    # NOTE(review): the previous comments on the next two entries contradicted
    # the folder names ("no prefixes" on the *_prefixes folder and "with
    # prefixes" on the *_no_prefixes_v2 folder) - confirm which is correct.
    'DAE_bpic2015_prefixes',  # real-world data, all encoding methods (no Trace2Vec)
    'DAE_bpic2015_no_prefixes_v2',  # real-world data, all encoding methods (no Trace2Vec)
    'DAE_Finetuned_Embedding',  # synthetic data with prefixes, all encoding methods (no Trace2Vec)
    'DAE_bpic2015_no_buckets',
    'DAE_bpic2015_no_buckets_real_world',
    'DAE_finetuned_embedding_batch_size_1',
]

# The last entry of the list is the experiment analysed below.
directory = directories[-1]
print(directory)

DAE_finetuned_embedding_batch_size_1


In [9]:
# Names of all run folders found inside the selected experiment directory.
run_list = list_subfolders(directory)
print(run_list)

# Debug toggle: uncomment the two lines below to process only the first run.
# run_list = [run_list[0]]
# print(run_list)

['24-11-06-13-13_DAE_EncodingCategorical.WORD_2_VEC_ATC_EncodingNumerical.MIN_MAX_SCALING']


In [10]:
# Parallel lists: position i in each list refers to the same run.
run_names = []
run_results = []
run_buckets = []
for run_name in run_list:
    # All .npy arrays of the run, keyed by file name without extension.
    results = load_results(run_name=run_name, directory=directory)
    # Numbers found in the result keys (None when the keys contain no digits).
    buckets = get_buckets(results.keys())

    run_names.append(run_name)
    run_results.append(results)
    run_buckets.append(buckets)

# Plot Results

In [11]:
# One entry per plot: (results key, labels key, perspective, level).
# Every entry shares the same labels key and the 'trace' level; only the
# results key and the perspective differ.
results_config = [
    (result_key, 'labels_DAE_trace', perspective_id, 'trace')
    for result_key, perspective_id in (
        ('result_DAE_trace_Order', Perspective.ORDER),
        ('result_DAE_trace_Attribute', Perspective.ATTRIBUTE),
        ('result_DAE_trace_Arrival Time', Perspective.ARRIVAL_TIME),
        ('result_DAE_trace_Workload', Perspective.WORKLOAD),
    )
]

In [12]:
# Generate the plots for every loaded run and every configured perspective.
# With show_plots disabled the figures are not displayed inline.
show_plots = False
for run_name, run_result, run_bucket in zip(run_names, run_results, run_buckets):
    print(f"Generating: {directory}\t{run_name}")
    for config in results_config:
        results_name, labels_name, perspective, level = config
        # No try/except here: a failing plot raises, so the error is not hidden.
        bucket_plot_losses(
            results_name=results_name,
            labels_name=labels_name,
            directory=directory,
            run_name=run_name,
            bucket_lengths=run_bucket,
            results=run_result,
            perspective=perspective,
            level=level,
            zoom=None,
            show_plots=show_plots,
        )

Generating: DAE_finetuned_embedding_batch_size_1	24-11-06-13-13_DAE_EncodingCategorical.WORD_2_VEC_ATC_EncodingNumerical.MIN_MAX_SCALING
	 Error per Prefix on the Order perspective at trace level with bucket size 3
	 Error per Prefix on the Order perspective at trace level with bucket size 4
	 Error per Prefix on the Order perspective at trace level with bucket size 5
	 Error per Prefix on the Order perspective at trace level with bucket size 6
	 Error per Prefix on the Order perspective at trace level with bucket size 7
	 Error per Prefix on the Order perspective at trace level with bucket size 8
	 Error per Prefix on the Order perspective at trace level with bucket size 9
	 Error per Prefix on the Order perspective at trace level with bucket size 13
	 Error per Prefix on the Attribute perspective at trace level with bucket size 3
	 Error per Prefix on the Attribute perspective at trace level with bucket size 4
	 Error per Prefix on the Attribute perspective at trace level with bucket