# Information

**Author:**<br>Pascal Munaretto (<a href="mailto:pascal.munaretto@outlook.com">Mail</a>)

**Date:**<br>30.09.2022

**Type:**<br>Master's Thesis

**Topic:**<br>Design, Implementation and Performance Analysis of an AI-Based Insider Threat Detection Platform in Splunk To Counteract Data Exfiltration

**Study Program:**<br>Enterprise and IT Security

**Institution:**<br><a href="https://www.hs-offenburg.de">Offenburg University of Applied Sciences</a>

**GitHub:**<br>https://github.com/pmunaretto/Master-Thesis

# Setup

## Requirements

In [None]:
# Install the third-party packages used by this notebook (pyod supplies the detectors imported below)
!pip install pyod suod

## Patches

In [None]:
# Add callbacks to Auto Encoder, VAE and Deep SVDD by overwriting the installed
# PyOD modules with the patched copies stored on Google Drive
# NOTE(review): the destination hard-codes the python3.7 dist-packages directory;
# confirm it matches the interpreter of the runtime before executing this cell
!cp /content/drive/MyDrive/CERT/patches/patched_auto_encoder.py /usr/local/lib/python3.7/dist-packages/pyod/models/auto_encoder.py
!cp /content/drive/MyDrive/CERT/patches/patched_vae.py /usr/local/lib/python3.7/dist-packages/pyod/models/vae.py
!cp /content/drive/MyDrive/CERT/patches/patched_deep_svdd.py /usr/local/lib/python3.7/dist-packages/pyod/models/deep_svdd.py

## Imports

In [None]:
import os
import math
import sys
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import tensorflow as tf
from pickle import PicklingError
from joblib import dump, load
from timeit import default_timer as timer
from random import seed, randint
from sklearn.base import TransformerMixin, BaseEstimator, clone
from sklearn.metrics import make_scorer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV, ParameterGrid
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score, recall_score, classification_report, f1_score, accuracy_score, precision_score, confusion_matrix, matthews_corrcoef, roc_curve, RocCurveDisplay, ConfusionMatrixDisplay
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector as selector
from datetime import datetime
from tensorflow import keras
from matplotlib.backends.backend_pgf import FigureCanvasPgf
from matplotlib.ticker import PercentFormatter
from pyod.utils.data import evaluate_print
from IPython.display import display, Markdown
from pyod.models.iforest import IForest
from pyod.models.ecod import ECOD
from pyod.models.copod import COPOD
from pyod.models.loda import LODA
from pyod.models.cblof import CBLOF
from pyod.models.pca import PCA
from pyod.models.auto_encoder import AutoEncoder
from pyod.models.vae import VAE
from pyod.models.deep_svdd import DeepSVDD

## Configuration

In [None]:
# Register the PGF canvas so that figures can be saved with format="pgf"
matplotlib.backend_bases.register_backend("pgf", FigureCanvasPgf)

# Plot defaults shared by all figures of this notebook (LaTeX text rendering, serif font)
plt.rcParams.update({
    "figure.dpi": 100,
    "savefig.dpi": 300,
    "font.size": 12,
    "image.cmap": "plasma",
    "axes.prop_cycle": plt.cycler("color", "bgrcmyk"), 
    "pgf.texsystem": "pdflatex",
    "font.family": "serif",
    "text.usetex": True,
    "pgf.rcfonts": False
})

# Only show TensorFlow log messages of level WARN and above
tf.get_logger().setLevel("WARN")

# Global Settings
# BASE_PATH:     root directory for the preprocessed data, figures and result files
# N_JOBS:        passed as n_jobs to the detectors that accept it
# N_ITER:        number of differently seeded runs for detectors with a random_state
# CONTAMINATION: passed as contamination to every detector
# RETRAIN:       when True, previously saved grid search results are ignored
# DATASET_NAME:  name of the sub-directory below "results_summary"
# TIME_BUCKETS:  pandas frequency strings used to group the sessions per user
BASE_PATH     = "/content/drive/MyDrive/CERT/r4.2"
N_JOBS        = -1
N_ITER        = 5
CONTAMINATION = 0.01
RETRAIN       = False
DATASET_NAME  = "device_sessions_organization"
TIME_BUCKETS  = [
    "7d", 
    "2d", 
    "1d", 
    "12h",
    "6h", 
    "1h"
]

## Helper Functions

In [None]:
class Debugger(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that prints the data it receives."""

    def fit(self, data, y=None, **fit_params):
        # Nothing is learned; the step only exists for its printing side effect
        return self

    def transform(self, data):
        # Report the shape first, then a preview of the first rows
        print("Shape of Preprocessed data:", data.shape)
        preview = pd.DataFrame(data).head()
        print(preview)

        # Hand the data on unchanged
        return data


def plot_anomaly_scores(series, identifier, min, max, save=True):
    """Plot a relative-frequency histogram of anomaly scores.

    The x-axis covers ``min`` to ``max`` with bins of width 0.02 and ticks
    every 0.1. With ``save`` set, the figure is additionally written to
    ``<BASE_PATH>/figures/<identifier>.pgf``.
    """
    plt.figure(figsize=(10,3))

    # Every observation contributes 1/len(series), so the bar heights sum up to 1
    n_scores = len(series)
    relative_weights = np.ones(n_scores) / n_scores
    bin_edges = np.arange(min, max, 0.02)
    plt.hist(series, weights=relative_weights, bins=bin_edges, rwidth=0.8)

    # Axis layout
    plt.xlim(xmin=min, xmax=max)
    tick_positions = np.arange(min, max+0.1, 0.1)
    plt.xticks(tick_positions)
    plt.xlabel("Anomaly Score")
    plt.ylabel("Percentage")
    y_axis = plt.gca().yaxis
    y_axis.set_major_formatter(PercentFormatter(1))

    if save:
        target_file = os.path.join(BASE_PATH, "figures", f"{identifier}.pgf")
        plt.savefig(target_file, format="pgf")
    plt.show()


def plot_confusion_matrix(y_true, y_pred, identifier, save=True):
    """Draw the confusion matrix of the benign (0) / malicious (1) predictions.

    With ``save`` set, the figure is additionally written to
    ``<BASE_PATH>/figures/<identifier>.pgf``.
    """
    display_options = {
        "labels": [0, 1],
        "display_labels": ["Benign", "Malicious"],
        "values_format": "d",
        "colorbar": True,
        "cmap": "plasma_r",
    }
    ConfusionMatrixDisplay.from_predictions(y_true, y_pred, **display_options)
    plt.grid(False)

    if save:
        target_file = os.path.join(BASE_PATH, "figures", f"{identifier}.pgf")
        plt.savefig(target_file, format="pgf")
    plt.show()


def _format_count(value):
    """Return *value* as an ``int`` if it is a whole number, otherwise unchanged."""
    try:
        as_float = float(value)
    except (TypeError, ValueError):
        # Not numeric at all: print it exactly as it was passed in
        return value
    # NaN and infinity are not whole numbers and are therefore passed through
    return int(as_float) if as_float.is_integer() else value


def print_training_result(metrics):
    """Print a coloured one-line summary of a hyperparameter combination.

    ``metrics`` must expose the attributes ``training_time_avg``,
    ``inference_time_avg``, ``p_auc_10_avg``, ``p_auc_10_std``, ``recall_avg``,
    ``recall_std``, ``best_classifier_TN``, ``best_classifier_FP``,
    ``best_classifier_FN``, ``best_classifier_TP`` and ``name``.
    """
    # The confusion matrix counts can arrive as floats (e.g. 18712.0), which
    # would exceed the column widths below; print whole numbers as integers
    tn = _format_count(metrics.best_classifier_TN)
    fp = _format_count(metrics.best_classifier_FP)
    fn = _format_count(metrics.best_classifier_FN)
    tp = _format_count(metrics.best_classifier_TP)

    print(
        "  ".join(
            [
                f"\033[1;33m Training Time: {metrics.training_time_avg:<7.4f}\033[0m",
                f"\033[1;33m Inference Time: {metrics.inference_time_avg:<7.4f}\033[0m",
                f"\033[1;35m pAUC: {metrics.p_auc_10_avg:02.4f} \u00B1 {metrics.p_auc_10_std:02.4f}\033[0m",
                f"\033[1;35m Recall: {metrics.recall_avg:02.4f} \u00B1 {metrics.recall_std:02.4f}\033[0m",
                f"\033[1;32m TN: {tn:<6}\033[0m",
                f"\033[1;31m FP: {fp:<5}\033[0m",
                f"\033[1;31m FN: {fn:<3}\033[0m",
                f"\033[1;32m TP: {tp:<3}\033[0m",
                f"\033[1;37m Params: {metrics.name}\033[0m"
            ]
        )
    )


def print_gridsearch_result(metrics):
    """Print the winning hyperparameters of a grid search with pAUC and recall.

    ``metrics`` must expose ``name``, ``p_auc_10_avg``, ``p_auc_10_std``,
    ``recall_avg`` and ``recall_std``.
    """
    headline = "\n\033[4mBest hyperparameters:\033[0m"
    params_line = f"Params: {metrics.name}"
    p_auc_line = f"pAUC:   {metrics.p_auc_10_avg:02.4f} \u00B1 {metrics.p_auc_10_std:02.4f}"
    recall_line = f"Recall: {metrics.recall_avg:02.4f} \u00B1 {metrics.recall_std:02.4f}"

    # One entry per output line
    print(headline, params_line, p_auc_line, recall_line, sep="\n")


def calculate_dispersion_metrics_for_columns(source_df, destination_df, columns):
    """Store the mean and standard deviation of selected columns.

    For every name in ``columns`` the entries ``<name>_avg`` and ``<name>_std``
    are written into ``destination_df``, which is modified in place and also
    returned. A standard deviation that evaluates to NaN is stored as 0.
    """
    for name in columns:
        values = source_df[name]
        mean_value = np.average(values)
        spread = np.std(values)

        # Replace an undefined standard deviation by 0
        if math.isnan(spread):
            spread = 0

        destination_df[f"{name}_avg"] = mean_value
        destination_df[f"{name}_std"] = spread

    return destination_df


def add_count_and_threat(x):
    """Summarise a group of sessions.

    Returns a series with the number of rows (``count``), the mean of
    ``session_duration`` (``avg_session_duration``) and ``threat``, which is 1
    if any row of the group has ``threat == 1`` and 0 otherwise.
    """
    mean_duration = x.session_duration.mean()
    contains_threat = 1 in x.threat.values

    summary = {
        "count": len(x),
        "avg_session_duration": mean_duration,
        "threat": int(contains_threat),
    }
    return pd.Series(summary)


def train_classifier_on_single_users(df, classifier, time_bucket, params, n_iter=10):
    """Fit and evaluate a detector ``n_iter`` times with different random seeds.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the feature columns ``count`` and ``avg_session_duration``
        as well as the ground-truth column ``threat``.
    classifier : estimator
        Detector offering ``set_params``, ``fit`` and ``predict`` and, once
        fitted, the attributes ``labels_`` and ``decision_scores_``.
    time_bucket : str
        Not used inside this function; kept so that existing callers keep working.
    params : dict
        Hyperparameters that are applied through ``classifier.set_params``.
    n_iter : int, default 10
        Number of training runs.

    Returns
    -------
    results : pandas.Series
        Named ``str(params)``. Holds the metrics of the run with the highest
        ``p_auc_10`` (prefixed with ``best_classifier_``), the average training
        and inference time and ``<metric>_avg`` / ``<metric>_std`` over all runs.
    predictions : pandas.Series
        ``y_true``, ``y_pred`` and ``scores`` of that best run.

    Notes
    -----
    Errors raised while setting ``params`` or while fitting are not handled
    here and propagate to the caller.
    """
    feature_columns = ["count", "avg_session_duration"]

    # One metrics record per seed; turned into a dataframe after the loop
    iteration_records = []

    # Reset the PRNG seed so that every call draws the same sequence of seeds
    seed(1)

    # Perform the training process multiple times with random seeds
    for _ in range(n_iter):

        # Create a clone of the classifier. This is best effort: if cloning is
        # not possible, the instance from the previous iteration is reused
        try:
            classifier = clone(classifier)
        except Exception:
            pass

        # Update the parameters of the classifier according to the grid search
        classifier.set_params(**params)

        # Set the random state attribute of the classifier (if it has one).
        # Seeds have to lie in [0, 2**32 - 1] and randint() includes its upper bound
        try:
            classifier.set_params(random_state=randint(0, 2**32 - 1))
        except Exception:
            pass

        # Define the transformers that do the rest of the preprocessing (scaling, encoding)
        numeric_transformer = Pipeline(steps=[
            ("scaler", RobustScaler())
        ])
        categorical_transformer = Pipeline(steps=[
            ("ohe", OneHotEncoder())
        ])

        # Create a pipeline that performs the feature selection and scaling
        pipe = Pipeline([
            ("column_transformer", ColumnTransformer(
                transformers=[
                    ("num", numeric_transformer, selector(dtype_exclude=["category", "object"])),
                    ("cat", categorical_transformer, selector(dtype_include=["category", "object"]))
                ]
            )),
            ("classifier", classifier)
        ])

        # Benchmark the training
        start_training = timer()
        pipe.fit(df[feature_columns])
        end_training = timer()

        # Benchmark the inference (the predictions themselves are not needed)
        start_inference = timer()
        pipe.predict(df[feature_columns])
        end_inference = timer()

        # Calculate different evaluation metrics from the attributes of the fitted classifier
        fitted_classifier = pipe.named_steps["classifier"]
        y_true = df["threat"]
        y_pred = fitted_classifier.labels_
        scores = fitted_classifier.decision_scores_
        recall = recall_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

        # Try to calculate the AUC, this could fail if one of the anomaly scores is infinite.
        # In that case ALL AUC variants get the sentinel -1 so that none of them is
        # undefined or carried over from the previous iteration
        try:
            auc = roc_auc_score(y_true, scores)
            p_auc_10 = roc_auc_score(y_true, scores, max_fpr=0.1)
            p_auc_20 = roc_auc_score(y_true, scores, max_fpr=0.2)
            p_auc_30 = roc_auc_score(y_true, scores, max_fpr=0.3)
        except ValueError:
            auc = p_auc_10 = p_auc_20 = p_auc_30 = -1

        # Collect all the information about the iteration
        iteration_records.append(
            {
                "training_time": end_training - start_training,
                "inference_time": end_inference - start_inference,
                "recall": recall,
                "precision": precision,
                "f1": f1,
                "TN": cm[0][0],
                "FP": cm[0][1],
                "FN": cm[1][0],
                "TP": cm[1][1],
                "auc": auc,
                "p_auc_10": p_auc_10,
                "p_auc_20": p_auc_20,
                "p_auc_30": p_auc_30,
                "y_true": y_true,
                "y_pred": y_pred,
                "scores": scores
            }
        )

    # Build the summary in one go (DataFrame.append no longer exists in pandas 2)
    random_state_summary = pd.DataFrame(iteration_records)

    # Convert the confusion matrix to integers
    random_state_summary = random_state_summary.astype({"TN": "int32", "FP": "int32", "FN": "int32", "TP": "int32"})

    # Locate the best classifier and separate the predictions from it
    best_position = random_state_summary.p_auc_10.argmax()
    results = random_state_summary.iloc[best_position].rename(str(params))
    prediction_columns = ["y_true", "y_pred", "scores"]
    predictions = results.loc[prediction_columns]

    # Remove the columns that should not be part of the results series
    results = results.drop(prediction_columns)

    # Add the prefix
    results = results.add_prefix("best_classifier_")

    # Add the average training and inference time to the series
    results["training_time_avg"]  = np.average(random_state_summary["training_time"])
    results["inference_time_avg"] = np.average(random_state_summary["inference_time"])

    # Calculate averages and different dispersion metrics over all iterations
    results = calculate_dispersion_metrics_for_columns(
        source_df=random_state_summary,
        destination_df=results,
        columns=["auc", "p_auc_10", "p_auc_20", "p_auc_30", "recall"]
    )

    return results, predictions


class GridSearch:
    """Exhaustive, resumable hyperparameter search for one detector and dataset.

    Every combination of ``parameters`` is trained through
    ``train_classifier_on_single_users``. The per-combination results, the best
    result and the predictions of the best result are written to CSV files
    below ``gridsearch_path``. Unless the global ``RETRAIN`` is set, results
    found in those files are reused instead of being trained again.
    """

    def __init__(self, df, classifier, time_bucket, parameters, gridsearch_path):

        # Instance variables
        self.df = df
        self.classifier = classifier
        self.time_bucket = time_bucket
        self.parameters = parameters
        self.gridsearch_path = gridsearch_path

        # Main paths
        self.summary_path = os.path.join(self.gridsearch_path, "gridsearch_summary.csv")
        self.best_results_path = os.path.join(self.gridsearch_path, "best_results.csv")
        self.best_preds_path = os.path.join(self.gridsearch_path, "best_preds.csv")

        # Create the output directory for the gridsearch
        os.makedirs(gridsearch_path, exist_ok=True)

        # Read existing files
        if os.path.exists(self.summary_path) and not RETRAIN:
            self.gridsearch_summary = pd.read_csv(self.summary_path, index_col=0)
        else:
            self.gridsearch_summary = pd.DataFrame()
        if os.path.exists(self.best_results_path) and not RETRAIN:
            # The file holds a single value column; squeeze it into a series
            # (the squeeze argument of read_csv no longer exists in pandas 2)
            self.best_results = pd.read_csv(self.best_results_path, index_col=0).squeeze("columns")
        else:
            self.best_results = None

        # Predictions of the best combination; only set once a combination
        # trained in this session becomes the new best
        self.best_preds = None


    def start_training(self):
        """Train all missing combinations and return the best result.

        Returns the series of the combination with the highest
        ``p_auc_10_avg``, or ``None`` if no result is available at all.
        """

        # Create an iterable parameter grid from the parameters dictionary
        grid = ParameterGrid(self.parameters)

        # Debug output
        print(f"\033[4mTesting {len(grid)} different hyperparameter combinations\033[0m")

        # Repeat the training with several seeds only if the classifier of THIS
        # search exposes a random state; otherwise a single run is performed
        n_iter = N_ITER if hasattr(self.classifier, "random_state") else 1

        # Iterate over all possible parameter combinations
        for params in grid:
            params_key = str(params)

            # Skip the parameters if they are already part of the gridsearch summary
            if not RETRAIN and not self.gridsearch_summary.empty and params_key in self.gridsearch_summary.index:
                print_training_result(self.gridsearch_summary.loc[params_key])
                continue

            # Start the training process; combinations that are rejected with a ValueError are skipped
            try:
                results, predictions = train_classifier_on_single_users(
                    df=self.df,
                    classifier=self.classifier,
                    time_bucket=self.time_bucket,
                    params=params,
                    n_iter=n_iter
                )
            except ValueError as e:
                print(f"Skipping {params}: {e}")
                continue

            # Print the metrics of the best classifier
            print_training_result(results)

            # Add the results to the gridsearch summary as a row labelled with the series name
            new_row = results.to_frame().T.infer_objects()
            if self.gridsearch_summary.empty:
                self.gridsearch_summary = new_row
            else:
                self.gridsearch_summary = pd.concat([self.gridsearch_summary, new_row])

            # Update the best classifier if this iteration performs better than the current best
            if self.best_results is None or results.p_auc_10_avg > self.best_results.p_auc_10_avg:
                self.best_results = results
                self.best_preds = predictions
                self.save_best_results()

            # Save the progress
            self.save_gridsearch_summary()

        # Print the results of the gridsearch (parameters with the best average)
        if self.best_results is None:
            print("No hyperparameter combination could be trained successfully")
        else:
            print_gridsearch_result(self.best_results)

        return self.best_results


    def save_gridsearch_summary(self):
        """Write the results of all combinations to the summary CSV file."""
        self.gridsearch_summary.to_csv(self.summary_path)


    def save_best_results(self):
        """Write the best result and the predictions belonging to it to CSV files."""
        self.best_results.to_csv(self.best_results_path)

        # Turn the three array-like entries into columns with one row per sample
        predictions_frame = (
            self.best_preds.to_frame()
            .transpose()
            .apply(pd.Series.explode)
            .reset_index(drop=True)
        )
        predictions_frame.to_csv(self.best_preds_path, index=False)


    def get_summary(self):
        """Return the dataframe with the results of all combinations."""
        return self.gridsearch_summary


def initiate_training_run(classifier_name, classifier, parameters, data=None):
    """Run a grid search for every time bucket and record the best results.

    Parameters
    ----------
    classifier_name : str
        Used for the result directory and for the row labels of the summary.
    classifier : estimator
        Detector that is handed to ``GridSearch``.
    parameters : dict
        Hyperparameter grid that is handed to ``GridSearch``.
    data : pandas.DataFrame, optional
        Sessions with the columns ``user`` and ``date`` plus the columns read
        by ``add_count_and_threat``. Defaults to the module-level ``df``.

    For each entry of ``TIME_BUCKETS`` the best result is stored as row
    ``<classifier_name>_dataset<i>`` in
    ``<BASE_PATH>/results_summary/<DATASET_NAME>/summary.csv``.
    """
    # Fall back to the globally loaded dataset
    if data is None:
        data = df

    # Define output paths
    summary_path = os.path.join(BASE_PATH, "results_summary", DATASET_NAME)
    summary_file = os.path.join(summary_path, "summary.csv")

    # Create the output directory for the classifier
    os.makedirs(summary_path, exist_ok=True)

    # Iterate through the time buckets
    for i, time_bucket in enumerate(TIME_BUCKETS, start=1):

        display(Markdown(f"# {i}/{len(TIME_BUCKETS)} - Bucket: {time_bucket}"))

        # Group the dataframe by users and time buckets
        tmp = data.groupby([data["user"], pd.Grouper(freq=time_bucket, key="date")]).apply(add_count_and_threat)

        # Add a row for every user / bucket combination without sessions and fill it with zeros
        tmp = tmp.reindex(pd.MultiIndex.from_product(tmp.index.levels))
        tmp = tmp.fillna(0)

        # Perform a grid search to find the best parameters for the classifier
        gridsearch = GridSearch(
            df=tmp,
            classifier=classifier,
            time_bucket=time_bucket,
            parameters=parameters,
            gridsearch_path=os.path.join(summary_path, classifier_name, f"gridsearch{i}")
        )

        best_parameter_series = gridsearch.start_training()

        # Nothing to record if the grid search did not produce any result
        if best_parameter_series is None:
            continue

        # Read the summary file if it already exists, otherwise create a new one
        if os.path.exists(summary_file):
            summary = pd.read_csv(summary_file, index_col=0)
        else:
            summary = pd.DataFrame()

        # Set the name of the pandas series and update / append it to the summary
        index_name = f"{classifier_name}_dataset{i}"
        best_parameter_series = best_parameter_series.rename(index_name)
        if index_name in summary.index:
            summary.loc[index_name] = best_parameter_series
        else:
            # DataFrame.append no longer exists in pandas 2; add the series as a labelled row
            new_row = best_parameter_series.to_frame().T.infer_objects()
            summary = new_row if summary.empty else pd.concat([summary, new_row])

        # Save the summary to a file
        summary = summary.sort_index()
        summary.to_csv(summary_file)

## Loading the Data

In [None]:
# Read the dataset
# The module-level name `df` is what initiate_training_run groups by user and time bucket
df = pd.read_parquet(os.path.join(BASE_PATH, "preprocessed", "device_sessions"))

# Training - Isolation Forest

In [None]:
# Local configuration
# (the name is used for the result directory and the row labels of the summary)
classifier_name = "isolation_forest"

# Define the classifier that will be used for training
classifier = IForest(
    behaviour="new",
    max_features=1.0,
    contamination=CONTAMINATION,
    n_jobs=N_JOBS
)

# Define the hyperparameters grid that will be tested for best results
# (4 x 6 = 24 combinations per time bucket)
parameters = {
    "n_estimators": [1, 10, 50, 100],
    "max_samples": [128, 256, 512, 1024, 2048, 4096],
}

# Start the training
initiate_training_run(classifier_name, classifier, parameters)

# Training - LODA

In [None]:
# Local configuration
# (the name is used for the result directory and the row labels of the summary)
classifier_name = "loda"

# Define the classifier that will be used for training
classifier = LODA(
    contamination=CONTAMINATION
)

# Define the hyperparameters grid that will be tested for best results
# (7 x 4 = 28 combinations per time bucket)
parameters = {
    "n_bins": [6, 8, 10, 12, 14, 16, 20],
    "n_random_cuts": [25, 50, 75, 100]
}

# Start the training
initiate_training_run(classifier_name, classifier, parameters)

# Training - COPOD

In [None]:
# Local configuration
# (the name is used for the result directory and the row labels of the summary)
classifier_name = "copod"

# Define the classifier that will be used for training
classifier = COPOD(
    contamination=CONTAMINATION
)

# Define the hyperparameters grid that will be tested for best results
# (empty: no hyperparameters are varied, the classifier is used as configured above)
parameters = {}

# Start the training
initiate_training_run(classifier_name, classifier, parameters)

# Training - ECOD

In [None]:
# Local configuration
# (the name is used for the result directory and the row labels of the summary)
classifier_name = "ecod"

# Define the classifier that will be used for training
classifier = ECOD(
    contamination=CONTAMINATION
)

# Define the hyperparameters grid that will be tested for best results
# (empty: no hyperparameters are varied, the classifier is used as configured above)
parameters = {}

# Start the training
initiate_training_run(classifier_name, classifier, parameters)

# Training - CBLOF

In [None]:
# Local configuration
# (the name is used for the result directory and the row labels of the summary)
classifier_name = "cblof"

# Define the classifier that will be used for training
classifier = CBLOF(
    contamination=CONTAMINATION,
    n_jobs=N_JOBS
)

# Define the hyperparameters grid that will be tested for best results
# (3 x 5 x 4 x 2 = 120 combinations per time bucket)
# NOTE(review): the recorded output shows the n_clusters=1 combinations being
# skipped with "Could not form valid cluster separation" - consider removing them
parameters = {
    "n_clusters": [1, 2, 4],
    "alpha": [0.2, 0.4, 0.6, 0.8, 0.9],
    "beta": [2, 4, 8, 16],
    "use_weights": [True, False]
}

# Start the training
initiate_training_run(classifier_name, classifier, parameters)

# 1/6 - Bucket: 7d

[4mTesting 120 different hyperparameter combinations[0m
Skipping {'alpha': 0.2, 'beta': 2, 'n_clusters': 1, 'use_weights': True}: Could not form valid cluster separation. Please change n_clusters or change clustering method
Skipping {'alpha': 0.2, 'beta': 2, 'n_clusters': 1, 'use_weights': False}: Could not form valid cluster separation. Please change n_clusters or change clustering method
[1;33m Training Time: 1.4588 [0m  [1;33m Inference Time: 0.0453 [0m  [1;35m pAUC: 0.4841 ± 0.0000[0m  [1;35m Recall: 0.0112 ± 0.0000[0m  [1;32m TN: 18712.0[0m  [1;31m FP: 189.0[0m  [1;31m FN: 177.0[0m  [1;32m TP: 2.0[0m  [1;37m Params: {'alpha': 0.2, 'beta': 2, 'n_clusters': 2, 'use_weights': True}[0m
[1;33m Training Time: 0.4979 [0m  [1;33m Inference Time: 0.0305 [0m  [1;35m pAUC: 0.4840 ± 0.0000[0m  [1;35m Recall: 0.0112 ± 0.0000[0m  [1;32m TN: 18712.0[0m  [1;31m FP: 189.0[0m  [1;31m FN: 177.0[0m  [1;32m TP: 2.0[0m  [1;37m Params: {'alpha': 0.2, 'beta': 2, 'n_cl

# 2/6 - Bucket: 2d

[4mTesting 120 different hyperparameter combinations[0m
Skipping {'alpha': 0.2, 'beta': 2, 'n_clusters': 1, 'use_weights': True}: Could not form valid cluster separation. Please change n_clusters or change clustering method
Skipping {'alpha': 0.2, 'beta': 2, 'n_clusters': 1, 'use_weights': False}: Could not form valid cluster separation. Please change n_clusters or change clustering method
[1;33m Training Time: 0.4261 [0m  [1;33m Inference Time: 0.0176 [0m  [1;35m pAUC: 0.4842 ± 0.0000[0m  [1;35m Recall: 0.0077 ± 0.0000[0m  [1;32m TN: 65329.0[0m  [1;31m FP: 661.0[0m  [1;31m FN: 258.0[0m  [1;32m TP: 2.0[0m  [1;37m Params: {'alpha': 0.2, 'beta': 2, 'n_clusters': 2, 'use_weights': True}[0m
[1;33m Training Time: 0.4741 [0m  [1;33m Inference Time: 0.0251 [0m  [1;35m pAUC: 0.4840 ± 0.0000[0m  [1;35m Recall: 0.0077 ± 0.0000[0m  [1;32m TN: 65329.0[0m  [1;31m FP: 661.0[0m  [1;31m FN: 258.0[0m  [1;32m TP: 2.0[0m  [1;37m Params: {'alpha': 0.2, 'beta': 2, 'n_cl

# 3/6 - Bucket: 1d

[4mTesting 120 different hyperparameter combinations[0m
Skipping {'alpha': 0.2, 'beta': 2, 'n_clusters': 1, 'use_weights': True}: Could not form valid cluster separation. Please change n_clusters or change clustering method
Skipping {'alpha': 0.2, 'beta': 2, 'n_clusters': 1, 'use_weights': False}: Could not form valid cluster separation. Please change n_clusters or change clustering method
[1;33m Training Time: 0.7850 [0m  [1;33m Inference Time: 0.0287 [0m  [1;35m pAUC: 0.5000 ± 0.0000[0m  [1;35m Recall: 0.0070 ± 0.0000[0m  [1;32m TN: 130912.0[0m  [1;31m FP: 1303.0[0m  [1;31m FN: 283.0[0m  [1;32m TP: 2.0[0m  [1;37m Params: {'alpha': 0.2, 'beta': 2, 'n_clusters': 2, 'use_weights': True}[0m
[1;33m Training Time: 0.7255 [0m  [1;33m Inference Time: 0.0364 [0m  [1;35m pAUC: 0.4997 ± 0.0000[0m  [1;35m Recall: 0.0070 ± 0.0000[0m  [1;32m TN: 130897.0[0m  [1;31m FP: 1318.0[0m  [1;31m FN: 283.0[0m  [1;32m TP: 2.0[0m  [1;37m Params: {'alpha': 0.2, 'beta': 2, '

# 4/6 - Bucket: 12h

[4mTesting 120 different hyperparameter combinations[0m
Skipping {'alpha': 0.2, 'beta': 2, 'n_clusters': 1, 'use_weights': True}: Could not form valid cluster separation. Please change n_clusters or change clustering method
Skipping {'alpha': 0.2, 'beta': 2, 'n_clusters': 1, 'use_weights': False}: Could not form valid cluster separation. Please change n_clusters or change clustering method
[1;33m Training Time: 1.1864 [0m  [1;33m Inference Time: 0.0459 [0m  [1;35m pAUC: 0.5164 ± 0.0000[0m  [1;35m Recall: 0.0063 ± 0.0000[0m  [1;32m TN: 261899.0[0m  [1;31m FP: 2624.0[0m  [1;31m FN: 474.0[0m  [1;32m TP: 3.0[0m  [1;37m Params: {'alpha': 0.2, 'beta': 2, 'n_clusters': 2, 'use_weights': True}[0m
[1;33m Training Time: 1.1584 [0m  [1;33m Inference Time: 0.0453 [0m  [1;35m pAUC: 0.5171 ± 0.0000[0m  [1;35m Recall: 0.0084 ± 0.0000[0m  [1;32m TN: 261900.0[0m  [1;31m FP: 2623.0[0m  [1;31m FN: 473.0[0m  [1;32m TP: 4.0[0m  [1;37m Params: {'alpha': 0.2, 'beta': 2, '

# 5/6 - Bucket: 6h

[4mTesting 120 different hyperparameter combinations[0m
Skipping {'alpha': 0.2, 'beta': 2, 'n_clusters': 1, 'use_weights': True}: Could not form valid cluster separation. Please change n_clusters or change clustering method
Skipping {'alpha': 0.2, 'beta': 2, 'n_clusters': 1, 'use_weights': False}: Could not form valid cluster separation. Please change n_clusters or change clustering method
[1;33m Training Time: 1.6610 [0m  [1;33m Inference Time: 0.0892 [0m  [1;35m pAUC: 0.5871 ± 0.0000[0m  [1;35m Recall: 0.0207 ± 0.0000[0m  [1;32m TN: 523960.0[0m  [1;31m FP: 5244.0[0m  [1;31m FN: 520.0[0m  [1;32m TP: 11.0[0m  [1;37m Params: {'alpha': 0.2, 'beta': 2, 'n_clusters': 2, 'use_weights': True}[0m
[1;33m Training Time: 1.6100 [0m  [1;33m Inference Time: 0.0798 [0m  [1;35m pAUC: 0.5875 ± 0.0000[0m  [1;35m Recall: 0.0226 ± 0.0000[0m  [1;32m TN: 524003.0[0m  [1;31m FP: 5201.0[0m  [1;31m FN: 519.0[0m  [1;32m TP: 12.0[0m  [1;37m Params: {'alpha': 0.2, 'beta': 2,

# 6/6 - Bucket: 1h

[4mTesting 120 different hyperparameter combinations[0m
Skipping {'alpha': 0.2, 'beta': 2, 'n_clusters': 1, 'use_weights': True}: Could not form valid cluster separation. Please change n_clusters or change clustering method
Skipping {'alpha': 0.2, 'beta': 2, 'n_clusters': 1, 'use_weights': False}: Could not form valid cluster separation. Please change n_clusters or change clustering method
[1;33m Training Time: 7.0924 [0m  [1;33m Inference Time: 0.4439 [0m  [1;35m pAUC: 0.8359 ± 0.0000[0m  [1;35m Recall: 0.1529 ± 0.0000[0m  [1;32m TN: 3146363.0[0m  [1;31m FP: 30657.0[0m  [1;31m FN: 953.0[0m  [1;32m TP: 172.0[0m  [1;37m Params: {'alpha': 0.2, 'beta': 2, 'n_clusters': 2, 'use_weights': True}[0m
[1;33m Training Time: 7.2284 [0m  [1;33m Inference Time: 0.4576 [0m  [1;35m pAUC: 0.8362 ± 0.0000[0m  [1;35m Recall: 0.1538 ± 0.0000[0m  [1;32m TN: 3146206.0[0m  [1;31m FP: 30814.0[0m  [1;31m FN: 952.0[0m  [1;32m TP: 173.0[0m  [1;37m Params: {'alpha': 0.2, 'bet

# Training - PCA

In [None]:
# Local configuration
# (the name is used for the result directory and the row labels of the summary)
classifier_name = "pca"

# Define the classifier that will be used for training
classifier = PCA(
    contamination=CONTAMINATION
)

# Define the hyperparameters grid that will be tested for best results
# (7 x 2 x 3 x 2 x 2 = 168 combinations per time bucket)
# NOTE(review): the training only uses the two features "count" and
# "avg_session_duration"; n_components values above 2 are presumably rejected
# and skipped - confirm and trim the list if so
parameters = {
    "n_components": [1, 2, 3, 4, 5, 6, 7],
    "whiten": [True, False],
    "svd_solver": ["full", "arpack", "randomized"],
    "weighted": [True, False],
    "standardization": [True, False]
}

# Start the training
initiate_training_run(classifier_name, classifier, parameters)

# 1/6 - Bucket: 7d

[4mTesting 168 different hyperparameter combinations[0m
[1;33m Training Time: 0.0181 [0m  [1;33m Inference Time: 0.0049 [0m  [1;35m pAUC: 0.4820 ± 0.0000[0m  [1;35m Recall: 0.0168 ± 0.0000[0m  [1;32m TN: 18713 [0m  [1;31m FP: 188  [0m  [1;31m FN: 176[0m  [1;32m TP: 3  [0m  [1;37m Params: {'n_components': 1, 'standardization': True, 'svd_solver': 'full', 'weighted': True, 'whiten': True}[0m
[1;33m Training Time: 0.0140 [0m  [1;33m Inference Time: 0.0060 [0m  [1;35m pAUC: 0.4820 ± 0.0000[0m  [1;35m Recall: 0.0168 ± 0.0000[0m  [1;32m TN: 18713 [0m  [1;31m FP: 188  [0m  [1;31m FN: 176[0m  [1;32m TP: 3  [0m  [1;37m Params: {'n_components': 1, 'standardization': True, 'svd_solver': 'full', 'weighted': True, 'whiten': False}[0m
[1;33m Training Time: 0.0149 [0m  [1;33m Inference Time: 0.0053 [0m  [1;35m pAUC: 0.4820 ± 0.0000[0m  [1;35m Recall: 0.0168 ± 0.0000[0m  [1;32m TN: 18713 [0m  [1;31m FP: 188  [0m  [1;31m FN: 176[0m  [1;32m TP: 3  [0

# 2/6 - Bucket: 2d

[4mTesting 168 different hyperparameter combinations[0m
[1;33m Training Time: 0.0274 [0m  [1;33m Inference Time: 0.0115 [0m  [1;35m pAUC: 0.4827 ± 0.0000[0m  [1;35m Recall: 0.0115 ± 0.0000[0m  [1;32m TN: 65330 [0m  [1;31m FP: 660  [0m  [1;31m FN: 257[0m  [1;32m TP: 3  [0m  [1;37m Params: {'n_components': 1, 'standardization': True, 'svd_solver': 'full', 'weighted': True, 'whiten': True}[0m
[1;33m Training Time: 0.0255 [0m  [1;33m Inference Time: 0.0113 [0m  [1;35m pAUC: 0.4827 ± 0.0000[0m  [1;35m Recall: 0.0115 ± 0.0000[0m  [1;32m TN: 65330 [0m  [1;31m FP: 660  [0m  [1;31m FN: 257[0m  [1;32m TP: 3  [0m  [1;37m Params: {'n_components': 1, 'standardization': True, 'svd_solver': 'full', 'weighted': True, 'whiten': False}[0m
[1;33m Training Time: 0.0281 [0m  [1;33m Inference Time: 0.0113 [0m  [1;35m pAUC: 0.4827 ± 0.0000[0m  [1;35m Recall: 0.0115 ± 0.0000[0m  [1;32m TN: 65330 [0m  [1;31m FP: 660  [0m  [1;31m FN: 257[0m  [1;32m TP: 3  [0

# 3/6 - Bucket: 1d

[4mTesting 168 different hyperparameter combinations[0m
[1;33m Training Time: 0.0387 [0m  [1;33m Inference Time: 0.0170 [0m  [1;35m pAUC: 0.5049 ± 0.0000[0m  [1;35m Recall: 0.0246 ± 0.0000[0m  [1;32m TN: 130899[0m  [1;31m FP: 1316 [0m  [1;31m FN: 278[0m  [1;32m TP: 7  [0m  [1;37m Params: {'n_components': 1, 'standardization': True, 'svd_solver': 'full', 'weighted': True, 'whiten': True}[0m
[1;33m Training Time: 0.0382 [0m  [1;33m Inference Time: 0.0159 [0m  [1;35m pAUC: 0.5049 ± 0.0000[0m  [1;35m Recall: 0.0246 ± 0.0000[0m  [1;32m TN: 130899[0m  [1;31m FP: 1316 [0m  [1;31m FN: 278[0m  [1;32m TP: 7  [0m  [1;37m Params: {'n_components': 1, 'standardization': True, 'svd_solver': 'full', 'weighted': True, 'whiten': False}[0m
[1;33m Training Time: 0.0398 [0m  [1;33m Inference Time: 0.0152 [0m  [1;35m pAUC: 0.5049 ± 0.0000[0m  [1;35m Recall: 0.0246 ± 0.0000[0m  [1;32m TN: 130899[0m  [1;31m FP: 1316 [0m  [1;31m FN: 278[0m  [1;32m TP: 7  [0

# 4/6 - Bucket: 12h

[4mTesting 168 different hyperparameter combinations[0m
[1;33m Training Time: 0.0653 [0m  [1;33m Inference Time: 0.0257 [0m  [1;35m pAUC: 0.5631 ± 0.0000[0m  [1;35m Recall: 0.0231 ± 0.0000[0m  [1;32m TN: 261906[0m  [1;31m FP: 2617 [0m  [1;31m FN: 466[0m  [1;32m TP: 11 [0m  [1;37m Params: {'n_components': 1, 'standardization': True, 'svd_solver': 'full', 'weighted': True, 'whiten': True}[0m
[1;33m Training Time: 0.0624 [0m  [1;33m Inference Time: 0.0251 [0m  [1;35m pAUC: 0.5631 ± 0.0000[0m  [1;35m Recall: 0.0231 ± 0.0000[0m  [1;32m TN: 261906[0m  [1;31m FP: 2617 [0m  [1;31m FN: 466[0m  [1;32m TP: 11 [0m  [1;37m Params: {'n_components': 1, 'standardization': True, 'svd_solver': 'full', 'weighted': True, 'whiten': False}[0m
[1;33m Training Time: 0.0685 [0m  [1;33m Inference Time: 0.0261 [0m  [1;35m pAUC: 0.5631 ± 0.0000[0m  [1;35m Recall: 0.0231 ± 0.0000[0m  [1;32m TN: 261906[0m  [1;31m FP: 2617 [0m  [1;31m FN: 466[0m  [1;32m TP: 11 [0

# 5/6 - Bucket: 6h

[4mTesting 168 different hyperparameter combinations[0m
[1;33m Training Time: 0.1347 [0m  [1;33m Inference Time: 0.0553 [0m  [1;35m pAUC: 0.6507 ± 0.0000[0m  [1;35m Recall: 0.0339 ± 0.0000[0m  [1;32m TN: 523927[0m  [1;31m FP: 5277 [0m  [1;31m FN: 513[0m  [1;32m TP: 18 [0m  [1;37m Params: {'n_components': 1, 'standardization': True, 'svd_solver': 'full', 'weighted': True, 'whiten': True}[0m
[1;33m Training Time: 0.1291 [0m  [1;33m Inference Time: 0.0566 [0m  [1;35m pAUC: 0.6507 ± 0.0000[0m  [1;35m Recall: 0.0339 ± 0.0000[0m  [1;32m TN: 523927[0m  [1;31m FP: 5277 [0m  [1;31m FN: 513[0m  [1;32m TP: 18 [0m  [1;37m Params: {'n_components': 1, 'standardization': True, 'svd_solver': 'full', 'weighted': True, 'whiten': False}[0m
[1;33m Training Time: 0.1642 [0m  [1;33m Inference Time: 0.0696 [0m  [1;35m pAUC: 0.6507 ± 0.0000[0m  [1;35m Recall: 0.0339 ± 0.0000[0m  [1;32m TN: 523927[0m  [1;31m FP: 5277 [0m  [1;31m FN: 513[0m  [1;32m TP: 18 [0

# 6/6 - Bucket: 1h

[4mTesting 168 different hyperparameter combinations[0m
[1;33m Training Time: 0.6740 [0m  [1;33m Inference Time: 0.2052 [0m  [1;35m pAUC: 0.8666 ± 0.0000[0m  [1;35m Recall: 0.1813 ± 0.0000[0m  [1;32m TN: 3145471[0m  [1;31m FP: 31549[0m  [1;31m FN: 921[0m  [1;32m TP: 204[0m  [1;37m Params: {'n_components': 1, 'standardization': True, 'svd_solver': 'full', 'weighted': True, 'whiten': True}[0m
[1;33m Training Time: 0.6494 [0m  [1;33m Inference Time: 0.2111 [0m  [1;35m pAUC: 0.8666 ± 0.0000[0m  [1;35m Recall: 0.1813 ± 0.0000[0m  [1;32m TN: 3145471[0m  [1;31m FP: 31549[0m  [1;31m FN: 921[0m  [1;32m TP: 204[0m  [1;37m Params: {'n_components': 1, 'standardization': True, 'svd_solver': 'full', 'weighted': True, 'whiten': False}[0m
[1;33m Training Time: 0.6578 [0m  [1;33m Inference Time: 0.2009 [0m  [1;35m pAUC: 0.8666 ± 0.0000[0m  [1;35m Recall: 0.1813 ± 0.0000[0m  [1;32m TN: 3145471[0m  [1;31m FP: 31549[0m  [1;31m FN: 921[0m  [1;32m TP: 204

# Training - AE

In [None]:
# Local configuration: name passed to initiate_training_run for this classifier
classifier_name = "auto_encoder"

# Base estimator for the search. Every argument set here stays the same for
# all runs; only the entries of `parameters` further down are varied.
classifier = AutoEncoder(
    # --- Training schedule ---
    epochs=100,
    batch_size=16384,
    validation_size=0.1,
    optimizer=keras.optimizers.Adam(),
    # `callbacks` is available through the patched pyod model copied in the
    # "Patches" setup cell. Both callbacks watch the validation loss.
    callbacks=[
        # Scale the learning rate by 0.2 after 3 stagnating epochs (floor 1e-6)
        keras.callbacks.ReduceLROnPlateau(
            monitor="val_loss",
            factor=0.2,
            patience=3,
            min_lr=1e-6,
        ),
        # Abort training after 6 epochs without improvement
        keras.callbacks.EarlyStopping(monitor="val_loss", patience=6),
    ],
    # --- Regularization ---
    dropout_rate=0.2,
    l2_regularizer=0.1,
    # --- Model output / detector behaviour ---
    output_activation="sigmoid",
    # NOTE(review): presumably scaling is handled outside the detector - confirm
    preprocessing=False,
    verbose=0,
    contamination=CONTAMINATION,
)

# Hyperparameter grid that will be tested for best results
parameters = {
    # Symmetric layouts: [8, 4, 4, 8], [4, 2, 2, 4] and [2, 1, 1, 2].
    # In the runs recorded in this notebook the 8-wide layout is skipped with
    # "The number of neurons should not exceed the number of features".
    "hidden_neurons": [
        [width, width // 2, width // 2, width] for width in (8, 4, 2)
    ],
    "hidden_activation": ["relu", "sigmoid", "tanh"],
}

# Start the training
initiate_training_run(classifier_name, classifier, parameters)

# 1/6 - Bucket: 7d

[4mTesting 9 different hyperparameter combinations[0m
Skipping {'hidden_activation': 'relu', 'hidden_neurons': [8, 4, 4, 8]}: The number of neurons should not exceed the number of features
[1;33m Training Time: 8.7137 [0m  [1;33m Inference Time: 0.9405 [0m  [1;35m pAUC: 0.4841 ± 0.0002[0m  [1;35m Recall: 0.0112 ± 0.0000[0m  [1;32m TN: 18712.0[0m  [1;31m FP: 189.0[0m  [1;31m FN: 177.0[0m  [1;32m TP: 2.0[0m  [1;37m Params: {'hidden_activation': 'relu', 'hidden_neurons': [4, 2, 2, 4]}[0m
[1;33m Training Time: 9.4098 [0m  [1;33m Inference Time: 0.8022 [0m  [1;35m pAUC: 0.4841 ± 0.0003[0m  [1;35m Recall: 0.0112 ± 0.0000[0m  [1;32m TN: 18712.0[0m  [1;31m FP: 189.0[0m  [1;31m FN: 177.0[0m  [1;32m TP: 2.0[0m  [1;37m Params: {'hidden_activation': 'relu', 'hidden_neurons': [2, 1, 1, 2]}[0m
Skipping {'hidden_activation': 'sigmoid', 'hidden_neurons': [8, 4, 4, 8]}: The number of neurons should not exceed the number of features
[1;33m Training Time: 10.7861[0

# 2/6 - Bucket: 2d

[4mTesting 9 different hyperparameter combinations[0m
Skipping {'hidden_activation': 'relu', 'hidden_neurons': [8, 4, 4, 8]}: The number of neurons should not exceed the number of features
[1;33m Training Time: 19.3443[0m  [1;33m Inference Time: 2.4586 [0m  [1;35m pAUC: 0.4835 ± 0.0002[0m  [1;35m Recall: 0.0077 ± 0.0000[0m  [1;32m TN: 65329.0[0m  [1;31m FP: 661.0[0m  [1;31m FN: 258.0[0m  [1;32m TP: 2.0[0m  [1;37m Params: {'hidden_activation': 'relu', 'hidden_neurons': [4, 2, 2, 4]}[0m
[1;33m Training Time: 16.5715[0m  [1;33m Inference Time: 2.4617 [0m  [1;35m pAUC: 0.4837 ± 0.0002[0m  [1;35m Recall: 0.0077 ± 0.0000[0m  [1;32m TN: 65329.0[0m  [1;31m FP: 661.0[0m  [1;31m FN: 258.0[0m  [1;32m TP: 2.0[0m  [1;37m Params: {'hidden_activation': 'relu', 'hidden_neurons': [2, 1, 1, 2]}[0m
Skipping {'hidden_activation': 'sigmoid', 'hidden_neurons': [8, 4, 4, 8]}: The number of neurons should not exceed the number of features
[1;33m Training Time: 17.4577[0

# 3/6 - Bucket: 1d

[4mTesting 9 different hyperparameter combinations[0m
Skipping {'hidden_activation': 'relu', 'hidden_neurons': [8, 4, 4, 8]}: The number of neurons should not exceed the number of features
[1;33m Training Time: 34.5196[0m  [1;33m Inference Time: 4.9706 [0m  [1;35m pAUC: 0.4997 ± 0.0003[0m  [1;35m Recall: 0.0070 ± 0.0000[0m  [1;32m TN: 130893.0[0m  [1;31m FP: 1322.0[0m  [1;31m FN: 283.0[0m  [1;32m TP: 2.0[0m  [1;37m Params: {'hidden_activation': 'relu', 'hidden_neurons': [4, 2, 2, 4]}[0m
[1;33m Training Time: 31.3635[0m  [1;33m Inference Time: 4.8494 [0m  [1;35m pAUC: 0.4999 ± 0.0004[0m  [1;35m Recall: 0.0070 ± 0.0000[0m  [1;32m TN: 130893.0[0m  [1;31m FP: 1322.0[0m  [1;31m FN: 283.0[0m  [1;32m TP: 2.0[0m  [1;37m Params: {'hidden_activation': 'relu', 'hidden_neurons': [2, 1, 1, 2]}[0m
Skipping {'hidden_activation': 'sigmoid', 'hidden_neurons': [8, 4, 4, 8]}: The number of neurons should not exceed the number of features
[1;33m Training Time: 30.199

# 4/6 - Bucket: 12h

[4mTesting 9 different hyperparameter combinations[0m
Skipping {'hidden_activation': 'relu', 'hidden_neurons': [8, 4, 4, 8]}: The number of neurons should not exceed the number of features
[1;33m Training Time: 60.4888[0m  [1;33m Inference Time: 9.5182 [0m  [1;35m pAUC: 0.5163 ± 0.0008[0m  [1;35m Recall: 0.0084 ± 0.0000[0m  [1;32m TN: 261906.0[0m  [1;31m FP: 2617.0[0m  [1;31m FN: 473.0[0m  [1;32m TP: 4.0[0m  [1;37m Params: {'hidden_activation': 'relu', 'hidden_neurons': [4, 2, 2, 4]}[0m
[1;33m Training Time: 51.1064[0m  [1;33m Inference Time: 9.0625 [0m  [1;35m pAUC: 0.5164 ± 0.0006[0m  [1;35m Recall: 0.0084 ± 0.0000[0m  [1;32m TN: 261906.0[0m  [1;31m FP: 2617.0[0m  [1;31m FN: 473.0[0m  [1;32m TP: 4.0[0m  [1;37m Params: {'hidden_activation': 'relu', 'hidden_neurons': [2, 1, 1, 2]}[0m
Skipping {'hidden_activation': 'sigmoid', 'hidden_neurons': [8, 4, 4, 8]}: The number of neurons should not exceed the number of features
[1;33m Training Time: 65.179

# 5/6 - Bucket: 6h

[4mTesting 9 different hyperparameter combinations[0m
Skipping {'hidden_activation': 'relu', 'hidden_neurons': [8, 4, 4, 8]}: The number of neurons should not exceed the number of features
[1;33m Training Time: 84.8039[0m  [1;33m Inference Time: 19.9372[0m  [1;35m pAUC: 0.5876 ± 0.0000[0m  [1;35m Recall: 0.0226 ± 0.0000[0m  [1;32m TN: 524003.0[0m  [1;31m FP: 5201.0[0m  [1;31m FN: 519.0[0m  [1;32m TP: 12.0[0m  [1;37m Params: {'hidden_activation': 'relu', 'hidden_neurons': [4, 2, 2, 4]}[0m
[1;33m Training Time: 96.1869[0m  [1;33m Inference Time: 17.6562[0m  [1;35m pAUC: 0.5876 ± 0.0000[0m  [1;35m Recall: 0.0226 ± 0.0000[0m  [1;32m TN: 524003.0[0m  [1;31m FP: 5201.0[0m  [1;31m FN: 519.0[0m  [1;32m TP: 12.0[0m  [1;37m Params: {'hidden_activation': 'relu', 'hidden_neurons': [2, 1, 1, 2]}[0m
Skipping {'hidden_activation': 'sigmoid', 'hidden_neurons': [8, 4, 4, 8]}: The number of neurons should not exceed the number of features
[1;33m Training Time: 110.

# 6/6 - Bucket: 1h

[4mTesting 9 different hyperparameter combinations[0m
Skipping {'hidden_activation': 'relu', 'hidden_neurons': [8, 4, 4, 8]}: The number of neurons should not exceed the number of features
[1;33m Training Time: 466.2859[0m  [1;33m Inference Time: 119.5500[0m  [1;35m pAUC: 0.8520 ± 0.0055[0m  [1;35m Recall: 0.1538 ± 0.0000[0m  [1;32m TN: 3146206.0[0m  [1;31m FP: 30814.0[0m  [1;31m FN: 952.0[0m  [1;32m TP: 173.0[0m  [1;37m Params: {'hidden_activation': 'relu', 'hidden_neurons': [4, 2, 2, 4]}[0m
[1;33m Training Time: 333.7006[0m  [1;33m Inference Time: 115.6667[0m  [1;35m pAUC: 0.8588 ± 0.0000[0m  [1;35m Recall: 0.1538 ± 0.0000[0m  [1;32m TN: 3146267[0m  [1;31m FP: 30753[0m  [1;31m FN: 952[0m  [1;32m TP: 173[0m  [1;37m Params: {'hidden_activation': 'relu', 'hidden_neurons': [2, 1, 1, 2]}[0m
Skipping {'hidden_activation': 'sigmoid', 'hidden_neurons': [8, 4, 4, 8]}: The number of neurons should not exceed the number of features
[1;33m Training Time: 53

# Training - Deep SVDD

In [None]:
# Local configuration: name passed to initiate_training_run for this classifier
classifier_name = "deep_svdd"

# Base estimator for the search. Every argument set here stays the same for
# all runs; only the entries of `parameters` further down are varied.
classifier = DeepSVDD(
    # --- Training schedule ---
    epochs=100,
    batch_size=16384,
    validation_size=0.1,
    optimizer=keras.optimizers.Adam(),
    # `callbacks` is available through the patched pyod model copied in the
    # "Patches" setup cell. Both callbacks watch the validation loss.
    callbacks=[
        # Scale the learning rate by 0.2 after 3 stagnating epochs (floor 1e-6)
        keras.callbacks.ReduceLROnPlateau(
            monitor="val_loss",
            factor=0.2,
            patience=3,
            min_lr=1e-6,
        ),
        # Abort training after 6 epochs without improvement
        keras.callbacks.EarlyStopping(monitor="val_loss", patience=6),
    ],
    # --- Regularization ---
    dropout_rate=0.2,
    l2_regularizer=0.1,
    # --- Model output / detector behaviour ---
    output_activation="sigmoid",
    # NOTE(review): presumably scaling is handled outside the detector - confirm
    preprocessing=False,
    verbose=0,
    contamination=CONTAMINATION,
)

# Hyperparameter grid that will be tested for best results
parameters = {
    # Two-layer layouts, each halving in width: [64, 32] down to [2, 1].
    # In the recorded 1h run, some of the wide layouts are skipped
    # (neuron count above feature count, or a dimension mismatch).
    "hidden_neurons": [
        [width, width // 2] for width in (64, 32, 16, 8, 4, 2)
    ],
    "hidden_activation": ["relu", "sigmoid", "tanh"],
    "use_ae": [True, False],
}

# Start the training
initiate_training_run(classifier_name, classifier, parameters)

# 6/1 - Bucket: 1h

[4mTesting 36 different hyperparameter combinations[0m
Skipping {'hidden_activation': 'relu', 'hidden_neurons': [64, 32], 'use_ae': True}: The number of neurons should not exceed the number of features
[1;33m Training Time: 568.0645[0m  [1;33m Inference Time: 123.7460[0m  [1;35m pAUC: 0.8153 ± 0.0169[0m  [1;35m Recall: 0.1685 ± 0.0300[0m  [1;32m TN: 3146268.0[0m  [1;31m FP: 30752.0[0m  [1;31m FN: 952.0[0m  [1;32m TP: 173.0[0m  [1;37m Params: {'hidden_activation': 'relu', 'hidden_neurons': [64, 32], 'use_ae': False}[0m
Skipping {'hidden_activation': 'relu', 'hidden_neurons': [32, 16], 'use_ae': True}: The number of neurons should not exceed the number of features
Skipping {'hidden_activation': 'relu', 'hidden_neurons': [32, 16], 'use_ae': False}: Exception encountered when calling layer "tf.math.subtract_2" (type TFOpLambda).

Dimensions must be equal, but are 16 and 32 for '{{node tf.math.subtract_2/Sub}} = Sub[T=DT_FLOAT](Placeholder, tf.math.subtract_2/Sub/y)' wit