In [1]:
import pandas as pd
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import fbeta_score, roc_auc_score, average_precision_score, matthews_corrcoef
import numpy as np
from collections import defaultdict

In [2]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df


def import_data(file):
    """create a dataframe and optimize its memory usage"""
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True)
    df = reduce_mem_usage(df)
    return df

# Shuttle

In [3]:
# Load the training dataset
train_data = pd.read_csv("shuttle_train.csv")

# Load the test dataset
test_data = pd.read_csv("shuttle_test.csv")

# Parameters
num_runs = 5  # Number of iterations

# Initialize accumulators
accuracy_scores = []
f2_scores = []
auc_roc_scores = []
auc_pr_scores = []
mcc_scores = []

# Initialize metrics accumulators for classification report
class_metrics = defaultdict(lambda: defaultdict(float))  # Stores precision/recall/F1 for each class

verbose_run = 1
for run in range(num_runs):
    print(f"Run {run + 1}/{num_runs}")
    
    # Shuffle data
    train_data = train_data.sample(frac=1).reset_index(drop=True)
    test_data = test_data.sample(frac=1).reset_index(drop=True)
    
    # Split features and labels
    train_y = train_data['label'].values
    train_X = train_data.drop(['label'], axis=1).values
    test_y = test_data['label'].values
    test_X = test_data.drop(['label'], axis=1).values
    
    # Train the model
    clf = TabNetClassifier(verbose=verbose_run)
    clf.fit(train_X, train_y, eval_set=[(train_X, train_y)], eval_metric=['logloss'])
    verbose_run = 0
    
    # Make predictions
    test_probabilities = clf.predict_proba(test_X)  # Outputs probabilities
    test_predictions = clf.predict(test_X)  # Convert to discrete class predictions
    
    # Calculate accuracy
    accuracy = accuracy_score(test_y, test_predictions)
    accuracy_scores.append(accuracy)
    
    # Calculate F2-Score
    f2_score = fbeta_score(test_y, test_predictions, beta=2, average='weighted')
    f2_scores.append(f2_score)
    
    # Calculate AUC-ROC (One-vs-Rest)
    auc_roc = roc_auc_score(test_y, test_probabilities, multi_class='ovr')
    auc_roc_scores.append(auc_roc)
    
    # Calculate AUC-PR (One-vs-Rest)
    auc_pr = average_precision_score(test_y, test_probabilities, average='weighted')
    auc_pr_scores.append(auc_pr)
    
    # Calculate MCC
    mcc = matthews_corrcoef(test_y, test_predictions)
    mcc_scores.append(mcc)
    
    # Classification Report
    report = classification_report(test_y, test_predictions, output_dict=True, zero_division=0)
    
    for class_label, metrics in report.items():
        if isinstance(metrics, dict):  # Skip non-class metrics
            for metric_name, metric_value in metrics.items():
                class_metrics[class_label][metric_name] += metric_value

# Calculate averages
average_accuracy = np.mean(accuracy_scores)
average_f2_score = np.mean(f2_scores)
average_auc_roc = np.mean(auc_roc_scores)
average_auc_pr = np.mean(auc_pr_scores)
average_mcc = np.mean(mcc_scores)

# Average the classification report metrics
average_class_metrics = {
    class_label: {metric_name: metric_value / num_runs for metric_name, metric_value in metrics.items()}
    for class_label, metrics in class_metrics.items()
}

# Output results
print(f"Average Test Accuracy: {average_accuracy:.4f}")
print(f"Average F2-Score (Weighted): {average_f2_score:.4f}")
print(f"Average AUC-ROC (One-vs-Rest): {average_auc_roc:.4f}")
print(f"Average AUC-PR: {average_auc_pr:.4f}")
print(f"Average Matthews Correlation Coefficient: {average_mcc:.4f}")

print("\nAverage Classification Report:")
for class_label, metrics in average_class_metrics.items():
    print(f"Class {class_label}:")
    for metric_name, metric_value in metrics.items():
        print(f"  {metric_name}: {metric_value:.4f}")


Run 1/5




epoch 0  | loss: 0.51339 | val_0_logloss: 0.08558 |  0:00:03s
epoch 1  | loss: 0.08531 | val_0_logloss: 0.04791 |  0:00:07s
epoch 2  | loss: 0.04925 | val_0_logloss: 0.03666 |  0:00:10s
epoch 3  | loss: 0.04239 | val_0_logloss: 0.03158 |  0:00:13s
epoch 4  | loss: 0.03665 | val_0_logloss: 0.02854 |  0:00:17s
epoch 5  | loss: 0.0344  | val_0_logloss: 0.02722 |  0:00:20s
epoch 6  | loss: 0.02502 | val_0_logloss: 0.03536 |  0:00:23s
epoch 7  | loss: 0.02482 | val_0_logloss: 0.0221  |  0:00:26s
epoch 8  | loss: 0.02872 | val_0_logloss: 0.02489 |  0:00:29s
epoch 9  | loss: 0.02246 | val_0_logloss: 0.02069 |  0:00:32s
epoch 10 | loss: 0.02315 | val_0_logloss: 0.02044 |  0:00:35s
epoch 11 | loss: 0.02013 | val_0_logloss: 0.02481 |  0:00:38s
epoch 12 | loss: 0.02217 | val_0_logloss: 0.02095 |  0:00:41s
epoch 13 | loss: 0.01815 | val_0_logloss: 0.01953 |  0:00:44s
epoch 14 | loss: 0.01674 | val_0_logloss: 0.01762 |  0:00:47s
epoch 15 | loss: 0.01833 | val_0_logloss: 0.01955 |  0:00:50s
epoch 16



Run 2/5

Early stopping occurred at epoch 27 with best_epoch = 17 and best_val_0_logloss = 0.01332




Run 3/5

Early stopping occurred at epoch 49 with best_epoch = 39 and best_val_0_logloss = 0.00999




Run 4/5

Early stopping occurred at epoch 18 with best_epoch = 8 and best_val_0_logloss = 0.01431




Run 5/5

Early stopping occurred at epoch 44 with best_epoch = 34 and best_val_0_logloss = 0.01114




Average Test Accuracy: 0.9973
Average F2-Score (Weighted): 0.9971
Average AUC-ROC (One-vs-Rest): 0.9938
Average AUC-PR: 0.9982
Average Matthews Correlation Coefficient: 0.9924

Average Classification Report:
Class 1:
  precision: 0.9985
  recall: 0.9998
  f1-score: 0.9991
  support: 9117.0000
Class 2:
  precision: 0.7309
  recall: 0.6600
  f1-score: 0.6735
  support: 10.0000
Class 3:
  precision: 0.8209
  recall: 0.4000
  f1-score: 0.5237
  support: 34.0000
Class 4:
  precision: 0.9969
  recall: 1.0000
  f1-score: 0.9984
  support: 1781.0000
Class 5:
  precision: 0.9918
  recall: 0.9982
  f1-score: 0.9950
  support: 653.0000
Class 6:
  precision: 0.3333
  recall: 0.3000
  f1-score: 0.2933
  support: 2.0000
Class 7:
  precision: 0.4000
  recall: 0.1333
  f1-score: 0.2000
  support: 3.0000
Class macro avg:
  precision: 0.7532
  recall: 0.6416
  f1-score: 0.6690
  support: 11600.0000
Class weighted avg:
  precision: 0.9969
  recall: 0.9973
  f1-score: 0.9968
  support: 11600.0000


# Covertype

In [4]:
# Load the training dataset
train_data = pd.read_csv("covtype_train.csv")

# Load the test dataset
test_data = pd.read_csv("covtype_test.csv")

# Parameters
num_runs = 5  # Number of iterations

# Initialize accumulators
accuracy_scores = []
f2_scores = []
auc_roc_scores = []
auc_pr_scores = []
mcc_scores = []

# Initialize metrics accumulators for classification report
class_metrics = defaultdict(lambda: defaultdict(float))  # Stores precision/recall/F1 for each class

verbose_run = 1
for run in range(num_runs):
    print(f"Run {run + 1}/{num_runs}")
    
    # Shuffle data
    train_data = train_data.sample(frac=1).reset_index(drop=True)
    test_data = test_data.sample(frac=1).reset_index(drop=True)
    
    # Split features and labels
    train_y = train_data['label'].values
    train_X = train_data.drop(['label'], axis=1).values
    test_y = test_data['label'].values
    test_X = test_data.drop(['label'], axis=1).values
    
    # Train the model
    clf = TabNetClassifier(verbose=verbose_run)
    clf.fit(train_X, train_y, eval_set=[(train_X, train_y)], eval_metric=['logloss'])
    verbose_run = 0
    
    # Make predictions
    test_probabilities = clf.predict_proba(test_X)  # Outputs probabilities
    test_predictions = clf.predict(test_X)  # Convert to discrete class predictions
    
    # Calculate accuracy
    accuracy = accuracy_score(test_y, test_predictions)
    accuracy_scores.append(accuracy)
    
    # Calculate F2-Score
    f2_score = fbeta_score(test_y, test_predictions, beta=2, average='weighted')
    f2_scores.append(f2_score)
    
    # Calculate AUC-ROC (One-vs-Rest)
    auc_roc = roc_auc_score(test_y, test_probabilities, multi_class='ovr')
    auc_roc_scores.append(auc_roc)
    
    # Calculate AUC-PR (One-vs-Rest)
    auc_pr = average_precision_score(test_y, test_probabilities, average='weighted')
    auc_pr_scores.append(auc_pr)
    
    # Calculate MCC
    mcc = matthews_corrcoef(test_y, test_predictions)
    mcc_scores.append(mcc)
    
    # Classification Report
    report = classification_report(test_y, test_predictions, output_dict=True, zero_division=0)
    
    for class_label, metrics in report.items():
        if isinstance(metrics, dict):  # Skip non-class metrics
            for metric_name, metric_value in metrics.items():
                class_metrics[class_label][metric_name] += metric_value

# Calculate averages
average_accuracy = np.mean(accuracy_scores)
average_f2_score = np.mean(f2_scores)
average_auc_roc = np.mean(auc_roc_scores)
average_auc_pr = np.mean(auc_pr_scores)
average_mcc = np.mean(mcc_scores)

# Average the classification report metrics
average_class_metrics = {
    class_label: {metric_name: metric_value / num_runs for metric_name, metric_value in metrics.items()}
    for class_label, metrics in class_metrics.items()
}

# Output results
print(f"Average Test Accuracy: {average_accuracy:.4f}")
print(f"Average F2-Score (Weighted): {average_f2_score:.4f}")
print(f"Average AUC-ROC (One-vs-Rest): {average_auc_roc:.4f}")
print(f"Average AUC-PR: {average_auc_pr:.4f}")
print(f"Average Matthews Correlation Coefficient: {average_mcc:.4f}")

print("\nAverage Classification Report:")
for class_label, metrics in average_class_metrics.items():
    print(f"Class {class_label}:")
    for metric_name, metric_value in metrics.items():
        print(f"  {metric_name}: {metric_value:.4f}")


Run 1/5




epoch 0  | loss: 0.76758 | val_0_logloss: 0.63712 |  0:00:34s
epoch 1  | loss: 0.62552 | val_0_logloss: 0.58473 |  0:01:08s
epoch 2  | loss: 0.57607 | val_0_logloss: 0.53496 |  0:01:43s
epoch 3  | loss: 0.53513 | val_0_logloss: 0.48667 |  0:02:17s
epoch 4  | loss: 0.50821 | val_0_logloss: 0.46888 |  0:02:51s
epoch 5  | loss: 0.49313 | val_0_logloss: 0.45239 |  0:03:25s
epoch 6  | loss: 0.48088 | val_0_logloss: 0.44536 |  0:03:59s
epoch 7  | loss: 0.47163 | val_0_logloss: 0.42602 |  0:04:32s
epoch 8  | loss: 0.47321 | val_0_logloss: 0.47076 |  0:05:06s
epoch 9  | loss: 0.45963 | val_0_logloss: 0.41327 |  0:05:40s
epoch 10 | loss: 0.45489 | val_0_logloss: 0.41826 |  0:06:15s
epoch 11 | loss: 0.45088 | val_0_logloss: 0.40017 |  0:06:48s
epoch 12 | loss: 0.44245 | val_0_logloss: 0.38964 |  0:07:21s
epoch 13 | loss: 0.43394 | val_0_logloss: 0.40391 |  0:07:55s
epoch 14 | loss: 0.4418  | val_0_logloss: 0.42774 |  0:08:28s
epoch 15 | loss: 0.45772 | val_0_logloss: 0.39742 |  0:09:02s
epoch 16



Run 2/5

Early stopping occurred at epoch 61 with best_epoch = 51 and best_val_0_logloss = 0.3368




Run 3/5

Early stopping occurred at epoch 66 with best_epoch = 56 and best_val_0_logloss = 0.35105




Run 4/5

Early stopping occurred at epoch 64 with best_epoch = 54 and best_val_0_logloss = 0.35125




Run 5/5

Early stopping occurred at epoch 62 with best_epoch = 52 and best_val_0_logloss = 0.32996




Average Test Accuracy: 0.8603
Average F2-Score (Weighted): 0.8593
Average AUC-ROC (One-vs-Rest): 0.9824
Average AUC-PR: 0.9324
Average Matthews Correlation Coefficient: 0.7743

Average Classification Report:
Class 1:
  precision: 0.8819
  recall: 0.8328
  f1-score: 0.8566
  support: 42368.0000
Class 2:
  precision: 0.8604
  recall: 0.9126
  f1-score: 0.8857
  support: 56661.0000
Class 3:
  precision: 0.8319
  recall: 0.8379
  f1-score: 0.8339
  support: 7151.0000
Class 4:
  precision: 0.8310
  recall: 0.5792
  f1-score: 0.6797
  support: 549.0000
Class 5:
  precision: 0.7646
  recall: 0.4343
  f1-score: 0.5522
  support: 1899.0000
Class 6:
  precision: 0.6865
  recall: 0.6813
  f1-score: 0.6797
  support: 3473.0000
Class 7:
  precision: 0.8879
  recall: 0.8480
  f1-score: 0.8669
  support: 4102.0000
Class macro avg:
  precision: 0.8206
  recall: 0.7323
  f1-score: 0.7650
  support: 116203.0000
Class weighted avg:
  precision: 0.8606
  recall: 0.8603
  f1-score: 0.8587
  support: 116203

# KDD

In [5]:
# Load the training dataset
train_data = import_data("kdd_train.csv")

# Load the test dataset
test_data = import_data("kdd_test.csv")

# Parameters
num_runs = 5  # Number of iterations

# Initialize accumulators
accuracy_scores = []
f2_scores = []
auc_roc_scores = []
auc_pr_scores = []
mcc_scores = []

# Initialize metrics accumulators for classification report
class_metrics = defaultdict(lambda: defaultdict(float))  # Stores precision/recall/F1 for each class

verbose_run = 1
for run in range(num_runs):
    print(f"Run {run + 1}/{num_runs}")
    
    # Shuffle data
    train_data = train_data.sample(frac=1).reset_index(drop=True)
    test_data = test_data.sample(frac=1).reset_index(drop=True)
    
    # Split features and labels
    train_y = train_data['label'].values
    train_X = train_data.drop(['label'], axis=1).values
    test_y = test_data['label'].values
    test_X = test_data.drop(['label'], axis=1).values
    
    # Train the model
    clf = TabNetClassifier(verbose=verbose_run)
    clf.fit(train_X, train_y, eval_set=[(train_X, train_y)], eval_metric=['logloss'])
    verbose_run = 0
    
    # Make predictions
    test_probabilities = clf.predict_proba(test_X)  # Outputs probabilities
    test_predictions = clf.predict(test_X)  # Convert to discrete class predictions
    
    # Calculate accuracy
    accuracy = accuracy_score(test_y, test_predictions)
    accuracy_scores.append(accuracy)
    
    # Calculate F2-Score
    f2_score = fbeta_score(test_y, test_predictions, beta=2, average='weighted')
    f2_scores.append(f2_score)
    
    # Calculate AUC-ROC (One-vs-Rest)
    auc_roc = roc_auc_score(test_y, test_probabilities, multi_class='ovr')
    auc_roc_scores.append(auc_roc)
    
    # Calculate AUC-PR (One-vs-Rest)
    auc_pr = average_precision_score(test_y, test_probabilities, average='weighted')
    auc_pr_scores.append(auc_pr)
    
    # Calculate MCC
    mcc = matthews_corrcoef(test_y, test_predictions)
    mcc_scores.append(mcc)
    
    # Classification Report
    report = classification_report(test_y, test_predictions, output_dict=True, zero_division=0)
    
    for class_label, metrics in report.items():
        if isinstance(metrics, dict):  # Skip non-class metrics
            for metric_name, metric_value in metrics.items():
                class_metrics[class_label][metric_name] += metric_value

# Calculate averages
average_accuracy = np.mean(accuracy_scores)
average_f2_score = np.mean(f2_scores)
average_auc_roc = np.mean(auc_roc_scores)
average_auc_pr = np.mean(auc_pr_scores)
average_mcc = np.mean(mcc_scores)

# Average the classification report metrics
average_class_metrics = {
    class_label: {metric_name: metric_value / num_runs for metric_name, metric_value in metrics.items()}
    for class_label, metrics in class_metrics.items()
}

# Output results
print(f"Average Test Accuracy: {average_accuracy:.4f}")
print(f"Average F2-Score (Weighted): {average_f2_score:.4f}")
print(f"Average AUC-ROC (One-vs-Rest): {average_auc_roc:.4f}")
print(f"Average AUC-PR: {average_auc_pr:.4f}")
print(f"Average Matthews Correlation Coefficient: {average_mcc:.4f}")

print("\nAverage Classification Report:")
for class_label, metrics in average_class_metrics.items():
    print(f"Class {class_label}:")
    for metric_name, metric_value in metrics.items():
        print(f"  {metric_name}: {metric_value:.4f}")


  df = pd.read_csv(file, parse_dates=True, keep_date_col=True)


Memory usage of dataframe is 789.51 MB
Memory usage after optimization is: 196.60 MB
Decreased by 75.1%


  df = pd.read_csv(file, parse_dates=True, keep_date_col=True)


Memory usage of dataframe is 197.38 MB
Memory usage after optimization is: 49.15 MB
Decreased by 75.1%
Run 1/5




epoch 0  | loss: 0.09035 | val_0_logloss: 0.02316 |  0:01:09s
epoch 1  | loss: 0.01344 | val_0_logloss: 0.03259 |  0:02:20s
epoch 2  | loss: 0.01285 | val_0_logloss: 0.02049 |  0:03:30s
epoch 3  | loss: 0.01015 | val_0_logloss: 0.01118 |  0:04:44s
epoch 4  | loss: 0.00892 | val_0_logloss: 0.01542 |  0:05:54s
epoch 5  | loss: 0.00845 | val_0_logloss: 0.03439 |  0:07:04s
epoch 6  | loss: 0.00822 | val_0_logloss: 0.03624 |  0:08:19s
epoch 7  | loss: 0.00848 | val_0_logloss: 0.02444 |  0:09:30s
epoch 8  | loss: 0.00887 | val_0_logloss: 0.02814 |  0:10:41s
epoch 9  | loss: 0.00707 | val_0_logloss: 0.02183 |  0:11:51s
epoch 10 | loss: 0.00682 | val_0_logloss: 0.04009 |  0:13:01s
epoch 11 | loss: 0.0071  | val_0_logloss: 0.01269 |  0:14:10s
epoch 12 | loss: 0.00626 | val_0_logloss: 0.02165 |  0:15:20s
epoch 13 | loss: 0.00647 | val_0_logloss: 0.03427 |  0:16:30s

Early stopping occurred at epoch 13 with best_epoch = 3 and best_val_0_logloss = 0.01118




Run 2/5

Early stopping occurred at epoch 27 with best_epoch = 17 and best_val_0_logloss = 0.00949




Run 3/5

Early stopping occurred at epoch 16 with best_epoch = 6 and best_val_0_logloss = 0.02009




Run 4/5

Early stopping occurred at epoch 17 with best_epoch = 7 and best_val_0_logloss = 0.01522




Run 5/5

Early stopping occurred at epoch 38 with best_epoch = 28 and best_val_0_logloss = 0.00646




Average Test Accuracy: 0.9971
Average F2-Score (Weighted): 0.9969
Average AUC-ROC (One-vs-Rest): 0.9837
Average AUC-PR: 0.9987
Average Matthews Correlation Coefficient: 0.9665

Average Classification Report:
Class back.:
  precision: 0.9681
  recall: 0.4916
  f1-score: 0.5951
  support: 441.0000
Class buffer_overflow.:
  precision: 0.0000
  recall: 0.0000
  f1-score: 0.0000
  support: 6.0000
Class ftp_write.:
  precision: 0.0000
  recall: 0.0000
  f1-score: 0.0000
  support: 2.0000
Class guess_passwd.:
  precision: 0.6119
  recall: 0.7273
  f1-score: 0.6623
  support: 11.0000
Class imap.:
  precision: 0.0000
  recall: 0.0000
  f1-score: 0.0000
  support: 2.0000
Class ipsweep.:
  precision: 0.9844
  recall: 0.9812
  f1-score: 0.9828
  support: 2496.0000
Class land.:
  precision: 0.0000
  recall: 0.0000
  f1-score: 0.0000
  support: 4.0000
Class loadmodule.:
  precision: 0.0000
  recall: 0.0000
  f1-score: 0.0000
  support: 2.0000
Class multihop.:
  precision: 0.0000
  recall: 0.0000
  f

# Darknet

In [6]:
# Load the training dataset
train_data = pd.read_csv("darknet_train.csv")

# Load the test dataset
test_data = pd.read_csv("darknet_test.csv")

# Parameters
num_runs = 5  # Number of iterations

# Initialize accumulators
accuracy_scores = []
f2_scores = []
auc_roc_scores = []
auc_pr_scores = []
mcc_scores = []

# Initialize metrics accumulators for classification report
class_metrics = defaultdict(lambda: defaultdict(float))  # Stores precision/recall/F1 for each class

verbose_run = 1
for run in range(num_runs):
    print(f"Run {run + 1}/{num_runs}")
    
    # Shuffle data
    train_data = train_data.sample(frac=1).reset_index(drop=True)
    test_data = test_data.sample(frac=1).reset_index(drop=True)
    
    # Split features and labels
    train_y = train_data['label'].values
    train_X = train_data.drop(['label'], axis=1).values
    test_y = test_data['label'].values
    test_X = test_data.drop(['label'], axis=1).values
    
    # Train the model
    clf = TabNetClassifier(verbose=verbose_run)
    clf.fit(train_X, train_y, eval_set=[(train_X, train_y)], eval_metric=['logloss'])
    verbose_run = 0
    
    # Make predictions
    test_probabilities = clf.predict_proba(test_X)  # Outputs probabilities
    test_predictions = clf.predict(test_X)  # Convert to discrete class predictions
    
    # Calculate accuracy
    accuracy = accuracy_score(test_y, test_predictions)
    accuracy_scores.append(accuracy)
    
    # Calculate F2-Score
    f2_score = fbeta_score(test_y, test_predictions, beta=2, average='weighted')
    f2_scores.append(f2_score)
    
    # Calculate AUC-ROC (One-vs-Rest)
    auc_roc = roc_auc_score(test_y, test_probabilities, multi_class='ovr')
    auc_roc_scores.append(auc_roc)
    
    # Calculate AUC-PR (One-vs-Rest)
    auc_pr = average_precision_score(test_y, test_probabilities, average='weighted')
    auc_pr_scores.append(auc_pr)
    
    # Calculate MCC
    mcc = matthews_corrcoef(test_y, test_predictions)
    mcc_scores.append(mcc)
    
    # Classification Report
    report = classification_report(test_y, test_predictions, output_dict=True, zero_division=0)
    
    for class_label, metrics in report.items():
        if isinstance(metrics, dict):  # Skip non-class metrics
            for metric_name, metric_value in metrics.items():
                class_metrics[class_label][metric_name] += metric_value

# Calculate averages
average_accuracy = np.mean(accuracy_scores)
average_f2_score = np.mean(f2_scores)
average_auc_roc = np.mean(auc_roc_scores)
average_auc_pr = np.mean(auc_pr_scores)
average_mcc = np.mean(mcc_scores)

# Average the classification report metrics
average_class_metrics = {
    class_label: {metric_name: metric_value / num_runs for metric_name, metric_value in metrics.items()}
    for class_label, metrics in class_metrics.items()
}

# Output results
print(f"Average Test Accuracy: {average_accuracy:.4f}")
print(f"Average F2-Score (Weighted): {average_f2_score:.4f}")
print(f"Average AUC-ROC (One-vs-Rest): {average_auc_roc:.4f}")
print(f"Average AUC-PR: {average_auc_pr:.4f}")
print(f"Average Matthews Correlation Coefficient: {average_mcc:.4f}")

print("\nAverage Classification Report:")
for class_label, metrics in average_class_metrics.items():
    print(f"Class {class_label}:")
    for metric_name, metric_value in metrics.items():
        print(f"  {metric_name}: {metric_value:.4f}")


Run 1/5




epoch 0  | loss: 0.35467 | val_0_logloss: 0.25503 |  0:00:09s
epoch 1  | loss: 0.11494 | val_0_logloss: 0.09692 |  0:00:19s
epoch 2  | loss: 0.08185 | val_0_logloss: 0.06624 |  0:00:29s
epoch 3  | loss: 0.06231 | val_0_logloss: 0.05281 |  0:00:39s
epoch 4  | loss: 0.04829 | val_0_logloss: 0.03999 |  0:00:49s
epoch 5  | loss: 0.04389 | val_0_logloss: 0.03575 |  0:00:58s
epoch 6  | loss: 0.05056 | val_0_logloss: 0.05347 |  0:01:08s
epoch 7  | loss: 0.04927 | val_0_logloss: 0.03098 |  0:01:18s
epoch 8  | loss: 0.03203 | val_0_logloss: 0.04191 |  0:01:28s
epoch 9  | loss: 0.0295  | val_0_logloss: 0.02437 |  0:01:38s
epoch 10 | loss: 0.02792 | val_0_logloss: 0.02296 |  0:01:48s
epoch 11 | loss: 0.02542 | val_0_logloss: 0.02088 |  0:01:58s
epoch 12 | loss: 0.0238  | val_0_logloss: 0.01965 |  0:02:08s
epoch 13 | loss: 0.02198 | val_0_logloss: 0.01808 |  0:02:18s
epoch 14 | loss: 0.01941 | val_0_logloss: 0.01739 |  0:02:27s
epoch 15 | loss: 0.02    | val_0_logloss: 0.01809 |  0:02:37s
epoch 16



Run 2/5

Early stopping occurred at epoch 58 with best_epoch = 48 and best_val_0_logloss = 0.0035




Run 3/5

Early stopping occurred at epoch 53 with best_epoch = 43 and best_val_0_logloss = 0.00921




Run 4/5

Early stopping occurred at epoch 67 with best_epoch = 57 and best_val_0_logloss = 0.00434




Run 5/5

Early stopping occurred at epoch 19 with best_epoch = 9 and best_val_0_logloss = 0.03094




Average Test Accuracy: 0.9969
Average F2-Score (Weighted): 0.9969
Average AUC-ROC (One-vs-Rest): 0.9995
Average AUC-PR: 0.9989
Average Matthews Correlation Coefficient: 0.9886

Average Classification Report:
Class Darknet_Audio-Streaming:
  precision: 0.9960
  recall: 0.9856
  f1-score: 0.9908
  support: 2657.0000
Class Darknet_Browsing:
  precision: 0.7807
  recall: 0.7849
  f1-score: 0.7813
  support: 53.0000
Class Darknet_Chat:
  precision: 0.9788
  recall: 0.9866
  f1-score: 0.9826
  support: 908.0000
Class Darknet_Email:
  precision: 0.9948
  recall: 0.9914
  f1-score: 0.9931
  support: 116.0000
Class Darknet_File-Transfer:
  precision: 0.9829
  recall: 0.9885
  f1-score: 0.9856
  support: 522.0000
Class Darknet_P2P:
  precision: 0.9357
  recall: 0.9227
  f1-score: 0.9254
  support: 44.0000
Class Darknet_VOIP:
  precision: 0.9867
  recall: 0.9631
  f1-score: 0.9747
  support: 293.0000
Class Darknet_Video-Streaming:
  precision: 0.9408
  recall: 0.9502
  f1-score: 0.9450
  support: