In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import (
    roc_curve, auc,
    precision_recall_curve, average_precision_score,
    confusion_matrix, classification_report
)
from sklearn.ensemble import IsolationForest

import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

# Seed every random source the notebook relies on with a single call.
# `set_random_seed` seeds Python's `random` module, NumPy and TensorFlow
# together, whereas seeding only NumPy and TensorFlow leaves Python's `random`
# module unseeded.
SEED = 42
tf.keras.utils.set_random_seed(SEED)


In [5]:
# Colab upload helper; the recorded output of this cell shows each uploaded
# file being saved into the runtime.
from google.colab import files

# NOTE(review): in the recorded run the uploads were saved under names with a
# " (1)" suffix, so files with the original names presumably already existed
# in the runtime — confirm that later reads pick up the intended copies.
uploaded = files.upload()

Saving malicious_flows.csv to malicious_flows (1).csv
Saving normal_flows.csv to normal_flows (1).csv


In [7]:
# Load the benign and malicious flow tables from the working directory.
# NOTE(review): the upload log above shows the fresh uploads were saved with a
# " (1)" suffix, so these un-suffixed paths may point at older copies — confirm.
benign, malicious = (
    pd.read_csv(csv_path)
    for csv_path in ("normal_flows.csv", "malicious_flows.csv")
)

for tag, frame in (("Benign:", benign), ("Malicious:", malicious)):
    print(tag, frame.shape)

Benign: (159420, 24)
Malicious: (14375, 24)


In [8]:
# Columns removed before modelling: the label plus the IS_* flag columns.
DROP_COLS = ["LABEL", "IS_COMPLETE", "IS_INCOMPLETE", "IS_RESET", "IS_REJECTED"]

# Build the feature frames as new objects so the raw `benign` / `malicious`
# tables stay intact.
X_benign, X_malicious = (
    raw_frame.drop(columns=DROP_COLS) for raw_frame in (benign, malicious)
)

print("Initial feature count:", len(X_benign.columns))


Initial feature count: 19


In [9]:
# Candidate columns for a log1p transform.
HEAVY_TAIL = [
    "PPS_FWD",
    "PPS_REV",
    "BPS_FWD",
    "BPS_REV",
    "FWD_BWD_PKT_RATIO",
    "FWD_BWD_BYTE_RATIO",
    "BYTES_TOTAL",
    "BYTES_FWD",
    "BYTES_REV",
]

# Only transform the candidates that are actually present in the benign frame.
# The columns are overwritten in place, so re-running this cell without first
# re-running the one that creates the frames applies log1p a second time.
present_heavy_tail = [name for name in HEAVY_TAIL if name in X_benign.columns]
for frame in (X_benign, X_malicious):
    for name in present_heavy_tail:
        frame[name] = np.log1p(frame[name])

print("Log transform complete.")


Log transform complete.


In [10]:
# Fit the variance filter on benign flows only, then keep the same surviving
# columns in both frames.
selector = VarianceThreshold(threshold=1e-6).fit(X_benign)
kept_columns = X_benign.columns[selector.get_support()]

X_benign = X_benign[kept_columns]
X_malicious = X_malicious[kept_columns]

print("After variance filtering:", len(kept_columns))


After variance filtering: 19


In [11]:
# Absolute pairwise correlations between benign features.
corr = X_benign.corr().abs()

# Keep only the entries strictly above the diagonal so each pair is looked at
# once; for a correlated pair this flags the column that comes later.
above_diagonal = np.triu(np.ones(corr.shape, dtype=bool), k=1)
upper = corr.where(above_diagonal)
highly_correlated = (upper > 0.95).any(axis=0)
to_drop = upper.columns[highly_correlated.to_numpy()].tolist()

X_benign, X_malicious = (
    frame.drop(columns=to_drop) for frame in (X_benign, X_malicious)
)

print("Dropped correlated:", to_drop)
print("Final feature count:", len(X_benign.columns))


Dropped correlated: ['PACKETS_REV', 'PACKETS_TOTAL', 'BYTES_TOTAL', 'PPS_REV', 'BPS_REV', 'BIDIRECTIONAL_PAIRS']
Final feature count: 13


In [12]:
# Substrings that assign a column to each ablation group. Matching is by
# substring, so a column may fall into several groups or into none.
GROUP_KEYWORDS = {
    "counts": ("PACKETS", "BYTES"),
    "rates": ("PPS", "BPS"),
    "ratios": ("RATIO", "ASYMMETRY"),
    "bidirectional": ("BIDIRECTIONAL", "ROUNDTRIPS"),
    "timing": ("TIME", "DURATION"),
}

feature_groups = {
    group_name: [
        column
        for column in X_benign.columns
        if any(keyword in column for keyword in keywords)
    ]
    for group_name, keywords in GROUP_KEYWORDS.items()
}

for group_name, members in feature_groups.items():
    print(group_name, ":", len(members))


counts : 3
rates : 2
ratios : 4
bidirectional : 1
timing : 2


In [13]:
def build_autoencoder(input_dim):
    """Build an uncompiled dense autoencoder for ``input_dim`` features.

    The network is input -> 128 -> 64 -> bottleneck -> 64 -> 128 -> output.
    All hidden layers, including the bottleneck, use ReLU; the output layer is
    linear and has ``input_dim`` units. The bottleneck width is
    ``max(4, input_dim // 4)``.

    Parameters
    ----------
    input_dim : int
        Number of input (and reconstructed output) features.

    Returns
    -------
    A Keras ``Model`` mapping the input tensor to its reconstruction.
    """
    code_size = max(4, input_dim // 4)
    hidden_widths = (128, 64, code_size, 64, 128)

    inputs = layers.Input(shape=(input_dim,))

    # Stack the encoder, bottleneck and decoder layers in order.
    hidden = inputs
    for width in hidden_widths:
        hidden = layers.Dense(width, activation="relu")(hidden)

    outputs = layers.Dense(input_dim, activation="linear")(hidden)
    return models.Model(inputs, outputs)


In [14]:
def train_and_evaluate(X_benign, X_malicious, sf_mask=None, threshold_percentile=95):
    """Train an autoencoder on benign flows and score it against malicious flows.

    The benign frame is split 60/20/20 into train/validation/test parts, a
    ``RobustScaler`` is fitted on the training part only, and the model from
    ``build_autoencoder`` is trained to reconstruct the scaled training rows
    with early stopping on the validation loss. The per-row mean squared
    reconstruction error serves as the anomaly score.

    Parameters
    ----------
    X_benign : pandas.DataFrame
        Benign feature rows.
    X_malicious : pandas.DataFrame
        Malicious feature rows with the same columns as ``X_benign``.
    sf_mask : array-like of bool, optional
        Positional mask with one entry per row of ``X_malicious`` selecting
        the malicious subset whose detection rate is reported. When omitted,
        the mask is taken from the notebook-level ``malicious`` frame as
        ``IS_COMPLETE == 1``, looked up by ``X_malicious.index``.
    threshold_percentile : float, default 95
        Percentile of the benign test errors used as the detection threshold.

    Returns
    -------
    roc_auc : float
        ROC AUC of the reconstruction error over the benign test rows
        (label 0) and all malicious rows (label 1).
    mal_sf_detection : float
        Fraction of the selected malicious rows whose error exceeds the
        threshold; NaN when the mask selects no rows.

    Raises
    ------
    ValueError
        If ``sf_mask`` does not have exactly one entry per row of
        ``X_malicious``.
    """
    # 60% train, then the remaining 40% is halved into validation and test.
    X_train, X_temp = train_test_split(X_benign, test_size=0.4, random_state=42)
    X_val, X_test_benign = train_test_split(X_temp, test_size=0.5, random_state=42)

    # Scaling statistics come from the benign training rows only.
    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_benign_scaled = scaler.transform(X_test_benign)
    X_mal_scaled = scaler.transform(X_malicious)

    model = build_autoencoder(X_train_scaled.shape[1])
    model.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss="mse")

    early_stop = callbacks.EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

    model.fit(
        X_train_scaled, X_train_scaled,
        validation_data=(X_val_scaled, X_val_scaled),
        epochs=100,
        batch_size=512,
        shuffle=True,
        callbacks=[early_stop],
        verbose=0
    )

    def reconstruction_error(model, X):
        """Return the per-row mean squared reconstruction error of ``X``."""
        X_hat = model.predict(X, verbose=0)
        return np.mean((X - X_hat)**2, axis=1)

    err_benign = reconstruction_error(model, X_test_benign_scaled)
    err_mal = reconstruction_error(model, X_mal_scaled)

    X_scores = np.concatenate([err_benign, err_mal])
    y_test = np.concatenate([
        np.zeros(len(err_benign)),
        np.ones(len(err_mal))
    ])

    fpr, tpr, _ = roc_curve(y_test, X_scores)
    roc_auc = auc(fpr, tpr)

    threshold = np.percentile(err_benign, threshold_percentile)

    if sf_mask is None:
        # Backward-compatible default: read the flag from the notebook-level
        # `malicious` frame, aligned explicitly on the rows being scored.
        sf_mask = malicious.loc[X_malicious.index, "IS_COMPLETE"] == 1
    sf_mask = np.asarray(sf_mask, dtype=bool)
    if sf_mask.shape != (len(X_malicious),):
        raise ValueError(
            "sf_mask must contain exactly one boolean per row of X_malicious"
        )

    # The errors of every malicious row are already in err_mal (same row order
    # as X_malicious), so the subset is selected from them instead of scaling
    # and predicting the subset a second time.
    err_mal_sf = err_mal[sf_mask]

    # An empty subset has no meaningful detection rate; report NaN explicitly.
    if err_mal_sf.size == 0:
        mal_sf_detection = float("nan")
    else:
        mal_sf_detection = np.mean(err_mal_sf > threshold)

    return roc_auc, mal_sf_detection


In [15]:
# Baseline run: train and score the autoencoder on the full feature set.
full_model_metrics = train_and_evaluate(X_benign, X_malicious)
roc_auc_full, mal_sf_rate_full = full_model_metrics

for label, value in (
    ("Full Model AUC:", roc_auc_full),
    ("Malicious SF Detection Rate:", mal_sf_rate_full),
):
    print(label, value)


Full Model AUC: 0.9998233531333692
Malicious SF Detection Rate: 1.0


In [16]:
# Isolation Forest baseline on the same 60/20/20 benign split used by
# train_and_evaluate (same test sizes and random_state).
X_train, X_temp = train_test_split(X_benign, test_size=0.4, random_state=42)
X_val, X_test_benign = train_test_split(X_temp, test_size=0.5, random_state=42)

# Fit the scaler on the benign training rows, then scale every part with it.
scaler = RobustScaler().fit(X_train)
X_train_scaled, X_test_benign_scaled, X_mal_scaled = (
    scaler.transform(part) for part in (X_train, X_test_benign, X_malicious)
)

iso = IsolationForest(n_estimators=200, contamination=0.05, random_state=42).fit(X_train_scaled)

# Negate decision_function so that larger scores mean "more anomalous".
score_benign, score_mal = (
    -iso.decision_function(part) for part in (X_test_benign_scaled, X_mal_scaled)
)

# Benign test rows are labelled 0 and malicious rows 1.
scores = np.concatenate([score_benign, score_mal])
y_test = np.repeat([0.0, 1.0], [len(score_benign), len(score_mal)])

fpr, tpr, _ = roc_curve(y_test, scores)
iso_auc = auc(fpr, tpr)

print("Isolation Forest AUC:", iso_auc)


Isolation Forest AUC: 0.9970825917865305


In [17]:
print("Threshold | FPR | TPR")
print("--------------------------")

# The values tabulated here are the Isolation Forest anomaly scores from the
# previous cell, not autoencoder reconstruction errors. The err_* aliases are
# kept so that any code reading them keeps working.
err_benign = score_benign
err_mal = score_mal

# Each row uses a percentile of the benign test scores as the threshold, so
# the first column is the percentile, not the raw threshold value.
for p in [90, 95, 97, 99]:
    th = np.percentile(err_benign, p)
    # Dedicated names keep the `fpr` / `tpr` ROC arrays returned by roc_curve
    # from being overwritten with scalars.
    fpr_at_p = np.mean(err_benign > th)
    tpr_at_p = np.mean(err_mal > th)
    print(f"{p:9d} | {fpr_at_p:.3f} | {tpr_at_p:.3f}")


Threshold | FPR | TPR
--------------------------
       90 | 0.100 | 1.000
       95 | 0.050 | 1.000
       97 | 0.030 | 1.000
       99 | 0.010 | 0.963


In [18]:
# Leave-one-group-out ablation: retrain the autoencoder with each feature
# group removed and record the resulting metrics.
ablation_results = {}

for group_name, group_cols in feature_groups.items():
    # Both frames carry the same columns, so dropping the group from each keeps
    # them aligned.
    auc_without, sf_rate_without = train_and_evaluate(
        X_benign.drop(columns=group_cols),
        X_malicious.drop(columns=group_cols),
    )

    ablation_results[group_name] = dict(
        AUC=auc_without,
        Mal_SF_Detection=sf_rate_without,
    )

    print(f"\nRemoved group: {group_name}")
    print("AUC:", auc_without)
    print("Malicious SF Detection:", sf_rate_without)



Removed group: counts
AUC: 0.9996920270764129
Malicious SF Detection: 1.0

Removed group: rates
AUC: 0.9996121571130129
Malicious SF Detection: 1.0

Removed group: ratios
AUC: 0.999697924541681
Malicious SF Detection: 1.0

Removed group: bidirectional
AUC: 0.9998945918083488
Malicious SF Detection: 1.0

Removed group: timing
AUC: 0.9995944472626315
Malicious SF Detection: 1.0


In [19]:
# Recap: full-model AUC followed by the metrics recorded for each ablation.
print("\n=== Ablation Summary ===")
print("Full Model AUC:", roc_auc_full)

SUMMARY_FIELDS = (
    ("AUC:", "AUC"),
    ("Malicious SF Detection:", "Mal_SF_Detection"),
)

for removed_group, metrics in ablation_results.items():
    print(f"\nWithout {removed_group}:")
    for label, key in SUMMARY_FIELDS:
        print(label, metrics[key])



=== Ablation Summary ===
Full Model AUC: 0.9998233531333692

Without counts:
AUC: 0.9996920270764129
Malicious SF Detection: 1.0

Without rates:
AUC: 0.9996121571130129
Malicious SF Detection: 1.0

Without ratios:
AUC: 0.999697924541681
Malicious SF Detection: 1.0

Without bidirectional:
AUC: 0.9998945918083488
Malicious SF Detection: 1.0

Without timing:
AUC: 0.9995944472626315
Malicious SF Detection: 1.0
