In [None]:
import sys
import os

import os
# Remember where the notebook was started and put the parent directory at the
# front of sys.path, so project packages located there (e.g. `utils`) can be imported.
cwd = os.getcwd()
root_path = os.path.abspath(os.pardir)
sys.path.insert(0, root_path)


In [None]:
import re
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm

from utils.fs import RESULTS_RAW_DIR

import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.inset_locator import inset_axes, mark_inset
from utils.enums import Perspective

In [None]:
def get_latest_run(results_dir=None):
    """Return the name of the most recent run folder, or None when there is none.

    A run folder is a sub-directory whose name parses as a number; the folder
    with the largest numeric value is considered the latest run.

    Args:
        results_dir: Directory to scan. Defaults to RESULTS_RAW_DIR.

    Returns:
        The folder name exactly as it appears on disk (so it can be joined back
        onto the results directory), or None if no numeric folder exists.
    """
    if results_dir is None:
        results_dir = RESULTS_RAW_DIR

    def is_numeric(text):
        try:
            float(text)
        except ValueError:
            return False
        return True

    numeric_folders = [
        folder for folder in os.listdir(results_dir)
        if is_numeric(folder) and os.path.isdir(os.path.join(results_dir, folder))
    ]

    if not numeric_folders:
        print("No runs found.")
        return None

    # Compare numerically but return the original name: str(float(name)) does not
    # always reproduce the folder name (e.g. '10' would become '10.0').
    latest = max(numeric_folders, key=float)
    print(f"The latest run is: {latest}")
    return latest

In [None]:
def load_results(run_name):
    """Load every `.npy` file of a run into a dictionary.

    Args:
        run_name: Name of the run folder inside RESULTS_RAW_DIR.

    Returns:
        dict mapping each file name (without the `.npy` extension) to the
        array returned by `np.load`. The shape and key of every loaded array
        are printed as they are read.
    """
    run_dir = os.path.join(RESULTS_RAW_DIR, run_name)
    npy_names = [entry for entry in os.listdir(run_dir) if entry.endswith('.npy')]

    arrays = {}
    for file_name in npy_names:
        # The dictionary key is the file name with its extension stripped.
        stem, _extension = os.path.splitext(file_name)
        array = np.load(os.path.join(run_dir, file_name))
        arrays[stem] = array
        print(f"{array.shape} \t {stem}")

    return arrays

In [None]:
def get_buckets(keys):
    """Collect every integer that occurs in any of the given key strings.

    Args:
        keys: Iterable of strings (e.g. the keys of the results dictionary).

    Returns:
        The distinct integers in ascending order, or None when no key
        contains a run of digits.
    """
    found = {int(digits) for key in keys for digits in re.findall(r'\d+', key)}
    return sorted(found) if found else None

In [None]:
# Pick the most recent run, load its arrays and derive the bucket sizes from the keys.
# To analyse a specific run instead, assign one of the run ids listed below to `latest_run`.
latest_run = get_latest_run()
if latest_run is None:
    # Fail fast with a readable message; passing None on to load_results would
    # only surface as a TypeError from os.path.join.
    raise FileNotFoundError(f"No run folders found in {RESULTS_RAW_DIR}")
results = load_results(run_name=latest_run)
bucket_lengths = get_buckets(results.keys())

# '1726688270.265151'   0.05 with 0 noise, batch size 8
# '1726683337.723001'   0.25 with 0 noise, batch size 8
# '1726688845.1876202'  0.05 with 0.25 noise, batch size 8
# '1726688947.9127235'  0.25 with 0.25 noise, batch size 8
# '1726689265.3127506'  0.05 with 1 noise, batch size 8
# '1726689364.887093'   0.25 with 1 noise, batch size 8
# '1726746216.6401465'  0.05 with 1 noise, batch size 1
# '1726746783.290876'   0.25 with 1 noise, batch size 1
# '1727098666.0037918'  first attempt multi task

# '1727360416.767745'  0.25 with 1 noise, batch size 2, bucketing, one-hot and min-max
# '1727363555.4167004' 0.25 with 1 noise, batch size 2, bucketing, only min-max
# '1727365180.6552784' 0.25 with 1 noise, batch size 2, bucketing, no scaling

# '1727435814.7056985' 0.15 with 0.25 noise, batch size 2, no arrival-time and workload anomalies
# '1727436294.3738086' 0.15 with 0.25 noise, batch size 2, with arrival-time and workload anomalies

# '1727444617.510686' 0.15 with 0.25 noise, batch size 2, with arrival-time and workload anomalies, fixed casting bug
# '1727363555.4167004' 0.15 with 0.25 noise, batch size 2, with arrival-time and workload anomalies, only min-max
# '1727445103.0633998'


# '1727977210.4493766' Word2Vec attempt only categorical, batch = 32, no bucketing, sum
# '1727977426.604367' Word2Vec attempt only categorical, batch = 4, no bucketing, sum
# '1727977637.8275452' Word2Vec attempt only categorical, batch = 4, with bucketing, sum

# '1728490622.7945442' Word2Vec test
# 

In [None]:
# Count how many entries are flagged (== 1) in each of the first four label columns
# of the bucket-8 trace labels (presumably one column per anomaly type), followed by
# the total number of rows.
if bucket_lengths is not None:
    trace_labels = results['labels_DAE_trace_8']
    for column in range(4):
        print(np.sum(trace_labels[:, column] == 1))

    print(len(trace_labels))

# Plotting Losses

In [None]:
# Plot the recorded loss curve of every bucket (or the single unbucketed curve) in one figure.
plt.figure(figsize=(10, 6))

# Pair each legend label with the results key that holds the corresponding curve.
if bucket_lengths is None:
    loss_curves = [('Single Bucket', 'losses_DAE')]
else:
    loss_curves = [(f'Bucket {bucket_nr}', f'losses_DAE_{bucket_nr}') for bucket_nr in bucket_lengths]

has_curve = False
for curve_label, key in loss_curves:
    losses = results.get(key)
    # A run may lack some loss arrays; skip them in both modes instead of
    # handing None to plt.plot (previously only the bucketed branch checked this).
    if losses is None:
        continue
    plt.plot(losses, label=curve_label)
    has_curve = True

# Add labels and title
plt.xlabel('Batch Index')
plt.ylabel('Loss')
plt.title('Model Loss for Different Data Buckets')
# Only request a legend when at least one labelled curve was drawn.
if has_curve:
    plt.legend()

# Show plot
plt.show()

# Plotting Reconstruction Errors

In [None]:
def plot_losses(results, labels, perspective, level, bucket=None, zoom=[[11000,13000],[-0.05, 0.2]]):
    """Scatter-plot the error of every prefix, colouring anomalous prefixes red.

    Args:
        results: 1-D array-like with one error value per prefix.
        labels: 2-D array-like; column `perspective` holds the label of each
            prefix (0 is drawn as normal, 1 as anomalous).
        perspective: Column index into `labels`, also used to look up the
            perspective name in `Perspective.values()`.
        level: Text describing the level (e.g. 'trace' or 'event'); used in the title.
        bucket: Optional bucket size mentioned in the title.
        zoom: `[[x_min, x_max], [y_min, y_max]]` region shown in an inset, or a
            falsy value (e.g. None) to draw no inset. The default is never mutated.
    """
    def scatter_plot(target_ax, errors, flags):
        positions = np.arange(len(errors))
        normal = flags == 0
        anomalous = flags == 1
        target_ax.scatter(positions[normal], errors[normal], c='grey', s=3, label='Normal Prefixes', zorder=1)
        target_ax.scatter(positions[anomalous], errors[anomalous], c='red', s=3, label='Anomalous Prefixes', zorder=2)
        target_ax.grid(True)

    results = np.asarray(results)
    labels = np.asarray(labels)[:, perspective]

    fig, ax = plt.subplots(figsize=(15, 6))
    scatter_plot(ax, results, labels)

    perspective_name = Perspective.values()[perspective]

    bucket_string = ''
    if bucket is not None:
        bucket_string = f'with bucket size {str(bucket)}'

    # Label the main axes explicitly. The pyplot-level calls used before were issued
    # twice with conflicting texts, and after the inset was created they acted on the
    # current axes instead of the main plot. The texts below follow the title
    # ("Error per Prefix").
    ax.set_title(f'Error per Prefix on the {perspective_name} perspective at {level} level {bucket_string}')
    ax.set_xlabel('Prefix Index')
    ax.set_ylabel('Error')

    if zoom:
        axins = inset_axes(ax, width="60%", height="60%", loc='upper right')

        scatter_plot(axins, results, labels)
        axins.set_xlim(zoom[0])
        axins.set_ylim(zoom[1])
        # The return value is not needed; not unpacking it keeps this call independent
        # of whether matplotlib returns a tuple or a single indicator artist.
        ax.indicate_inset_zoom(axins, edgecolor="black", linewidth=3)

    # The inset occupies the upper-right corner, so move the legend out of its way.
    ax.legend(loc='upper left' if zoom else 'upper right')
    plt.show()

def bucket_plot_losses(results_name, labels_name, perspective, level, zoom=[[11000,13000],[-0.05, 0.2]]):
    """Call `plot_losses` once per bucket, or once in total when there are no buckets.

    Reads the module-level `results` dictionary and `bucket_lengths`. With
    buckets, the arrays are looked up under `<name>_<bucket>`; without buckets,
    under the plain names.

    Args:
        results_name: Base key of the error array in `results`.
        labels_name: Base key of the label array in `results`.
        perspective, level, zoom: Forwarded unchanged to `plot_losses`.
    """
    # A single None entry stands for the unbucketed result set.
    buckets = [None] if bucket_lengths is None else bucket_lengths
    for bucket in buckets:
        suffix = '' if bucket is None else f'_{bucket}'
        plot_losses(
            results=results[f'{results_name}{suffix}'],
            labels=results[f'{labels_name}{suffix}'],
            perspective=perspective, level=level, bucket=bucket, zoom=zoom)


In [None]:
# Trace-level errors for the Order perspective (one figure per bucket, no zoom inset).
# For a zoomed inset pass e.g. zoom=[[2000,3000],[-0.01, 0.05]] instead of None.
bucket_plot_losses(
    results_name='result_DAE_trace_Order', labels_name='labels_DAE_trace',
    perspective=Perspective.ORDER, level='trace',
    zoom=None,
)

In [None]:
# Trace-level errors for the Attribute perspective (one figure per bucket, no zoom inset).
# For a zoomed inset pass e.g. zoom=[[2000,5000],[-0.01, 0.2]] instead of None.
bucket_plot_losses(
    results_name='result_DAE_trace_Attribute', labels_name='labels_DAE_trace',
    perspective=Perspective.ATTRIBUTE, level='trace',
    zoom=None,
)

In [None]:
# Trace-level errors for the Arrival Time perspective. This is best effort: the
# cell reports the problem and carries on when the plot cannot be produced.
try:
    bucket_plot_losses(
        results_name='result_DAE_trace_Arrival Time',
        labels_name='labels_DAE_trace',
        perspective=Perspective.ARRIVAL_TIME,
        level='trace',
        zoom=None)
        #zoom=[[2000,5000],[-0.01, 0.2]])
except Exception as exc:
    # Catch Exception rather than using a bare except, so KeyboardInterrupt and
    # SystemExit still propagate, and show what actually went wrong.
    print(f'Data does not exist ({type(exc).__name__}: {exc})')

In [None]:
# Trace-level errors for the Workload perspective. This is best effort: the
# cell reports the problem and carries on when the plot cannot be produced.
try:
    bucket_plot_losses(
        results_name='result_DAE_trace_Workload',
        labels_name='labels_DAE_trace',
        perspective=Perspective.WORKLOAD,
        level='trace',
        zoom=None)
        #zoom=[[2000,5000],[-0.01, 0.2]])
except Exception as exc:
    # Catch Exception rather than using a bare except, so KeyboardInterrupt and
    # SystemExit still propagate, and show what actually went wrong.
    print(f'Data does not exist ({type(exc).__name__}: {exc})')

In [None]:
def bucket_plot_losses_event(results_name, labels_name, perspective, level, zoom=[[11000,13000],[-0.05, 0.2]], event_index=0):
    """Like `bucket_plot_losses`, but plots a single slice along axis 1 of the arrays.

    Reads the module-level `results` dictionary and `bucket_lengths`. With
    buckets, the arrays are looked up under `<name>_<bucket>`; without buckets,
    under the plain names. Both arrays are sliced with `[:, event_index]`
    before being handed to `plot_losses`.

    Args:
        results_name: Base key of the error array in `results`.
        labels_name: Base key of the label array in `results`.
        perspective, level, zoom: Forwarded unchanged to `plot_losses`.
        event_index: Index selected along axis 1 (presumably the event
            position within a prefix; confirm against the producer of the arrays).
    """
    # A single None entry stands for the unbucketed result set.
    buckets = [None] if bucket_lengths is None else bucket_lengths
    for bucket in buckets:
        suffix = '' if bucket is None else f'_{bucket}'
        event_results = results[f'{results_name}{suffix}'][:, event_index]
        event_labels = results[f'{labels_name}{suffix}'][:, event_index]
        plot_losses(
            results=event_results,
            labels=event_labels,
            perspective=perspective, level=level, bucket=bucket, zoom=zoom)

In [None]:
# Event-level errors for the Order perspective at event_index 0 (no zoom inset).
# For a zoomed inset pass e.g. zoom=[[2000,3000],[-0.01, 0.05]] instead of None.
bucket_plot_losses_event(
    results_name='result_DAE_event_Order', labels_name='labels_DAE_event',
    perspective=Perspective.ORDER, level='event',
    zoom=None, event_index=0,
)

In [None]:
# Event-level errors for the Order perspective at event_index 6 (no zoom inset).
# For a zoomed inset pass e.g. zoom=[[2000,3000],[-0.01, 0.05]] instead of None.
bucket_plot_losses_event(
    results_name='result_DAE_event_Order', labels_name='labels_DAE_event',
    perspective=Perspective.ORDER, level='event',
    zoom=None, event_index=6,
)

In [None]:
# Event-level errors for the Attribute perspective at event_index 1 (no zoom inset).
# For a zoomed inset pass e.g. zoom=[[2000,3000],[-0.01, 0.05]] instead of None.
bucket_plot_losses_event(
    results_name='result_DAE_event_Attribute', labels_name='labels_DAE_event',
    perspective=Perspective.ATTRIBUTE, level='event',
    zoom=None, event_index=1,
)

In [None]:
# Event-level errors for the Arrival Time perspective at event_index 2, with a zoomed
# inset. This is best effort: the cell reports the problem and carries on on failure.
try:
    bucket_plot_losses_event(
        results_name='result_DAE_event_Arrival Time',
        labels_name='labels_DAE_event',
        perspective=Perspective.ARRIVAL_TIME,
        level='event',
        zoom=[[2000,3000],[-0.01, 0.05]],
        event_index=2)
except Exception as exc:
    # Catch Exception rather than using a bare except, so KeyboardInterrupt and
    # SystemExit still propagate, and show what actually went wrong.
    print(f'Error, skipped ({type(exc).__name__}: {exc})')

In [None]:
# Event-level errors for the Workload perspective at event_index 0, with a zoomed
# inset. This is best effort: the cell reports the problem and carries on on failure.
try:
    bucket_plot_losses_event(
        results_name='result_DAE_event_Workload',
        labels_name='labels_DAE_event',
        perspective=Perspective.WORKLOAD,
        level='event',
        zoom=[[2000,3000],[-0.01, 0.05]],
        event_index=0)
except Exception as exc:
    # Catch Exception rather than using a bare except, so KeyboardInterrupt and
    # SystemExit still propagate, and show what actually went wrong.
    print(f'Error: Skipped ({type(exc).__name__}: {exc})')