In [1]:
# =========================================================
# UNSW-NB15 Binary Anomaly Detection
# EO + SAE + SMOTE + ExtraTrees + CatBoost
# FAST, REALISTIC, PAPER-SAFE (SEEDED)
# =========================================================

!pip install -q catboost tensorflow imbalanced-learn

# =======================
# 0. RANDOM SEEDS
# =======================

import os
import random
import numpy as np
import tensorflow as tf

# Single seed shared by every randomized step in this notebook.
SEED = 42

# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting
# it here presumably only reaches child processes, not this kernel — confirm.
os.environ["PYTHONHASHSEED"] = str(SEED)

# Seed the Python, NumPy and TensorFlow global generators, in that order.
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(SEED)

# =======================
# 1. IMPORTS
# =======================

import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier
from imblearn.over_sampling import SMOTE

from catboost import CatBoostClassifier

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# =========================================================
# 2. LOAD DATA
# =========================================================

# Read the two pre-partitioned CSV files from the working directory.
# NOTE(review): the Keras log in this cell's output reports 290 steps/epoch at
# batch_size=256 with a 10% validation split, i.e. roughly 82k rows in
# train_df — confirm this file is the intended training partition.
train_df = pd.read_csv("UNSW_NB15_training-set.csv")
test_df = pd.read_csv("UNSW_NB15_testing-set.csv")

# Both frames receive exactly the same in-place clean-up.
for frame in (train_df, test_df):
    # Missing values become 0 in every column.
    frame.fillna(0, inplace=True)
    # Numeric binary label -> readable class name (any other value maps to NaN).
    frame['label'] = frame['label'].map({0: 'Normal', 1: 'Attack'})
    # Row identifier and the multi-class attack category are not used as features.
    frame.drop(columns=['id', 'attack_cat'], inplace=True)

# =========================================================
# 3. ENCODING
# =========================================================

# Columns that hold string categories and need integer codes.
cat_cols = ['proto', 'service', 'state']

for col in cat_cols:
    # The encoder is fitted on the union of both splits so that transform()
    # never encounters a category it has not seen.
    all_vals = pd.concat([train_df[col], test_df[col]]).astype(str)
    le = LabelEncoder()
    le.fit(all_vals)

    train_codes = le.transform(train_df[col].astype(str))
    test_codes = le.transform(test_df[col].astype(str))
    train_df[col] = train_codes
    test_df[col] = test_codes

# Class names -> integer codes; the mapping is learned from the training labels
# and reused unchanged for the test labels.
target_le = LabelEncoder()
train_df['label'] = target_le.fit_transform(train_df['label'])
test_df['label'] = target_le.transform(test_df['label'])

# Split each frame into a feature matrix and a label vector (NumPy arrays).
X_train, y_train = train_df.drop(columns='label').values, train_df['label'].values
X_test, y_test = test_df.drop(columns='label').values, test_df['label'].values

# =========================================================
# 4. SCALING
# =========================================================

# Standardize features: statistics are learned from the training matrix only
# and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# =========================================================
# 5. EQUILIBRIUM OPTIMIZER (FEATURE SELECTION)
# =========================================================

def fitness(sol, X, y):
    """Score a binary feature mask; lower is better.

    Args:
        sol: 1-D 0/1 array with one entry per column of ``X``.
        X: 2-D feature matrix.
        y: label vector aligned with the rows of ``X``.

    Returns:
        ``1`` when no feature is selected, otherwise ``1 - mean balanced
        accuracy`` of an ExtraTrees classifier over 3 stratified folds,
        trained on the selected columns only.
    """
    # An empty mask cannot be trained on, so it gets the worst possible score.
    if sol.sum() == 0:
        return 1

    selected = X[:, sol == 1]
    splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)

    def fold_score(train_idx, val_idx):
        # A fresh, identically seeded forest per fold.
        model = ExtraTreesClassifier(
            n_estimators=50,
            class_weight='balanced',
            n_jobs=-1,
            random_state=SEED
        )
        model.fit(selected[train_idx], y[train_idx])
        predicted = model.predict(selected[val_idx])
        return balanced_accuracy_score(y[val_idx], predicted)

    fold_scores = [
        fold_score(train_idx, val_idx)
        for train_idx, val_idx in splitter.split(selected, y)
    ]
    return 1 - np.mean(fold_scores)

def EO(X, y, pop=10, iters=10, fitness_fn=None):
    """Search for a binary feature mask with a simplified equilibrium-style optimizer.

    A population of random 0/1 masks is scored.  On every iteration the three
    best-scoring masks are averaged into a per-feature probability vector and
    the whole population is resampled from it.  Lower fitness is better.

    Because every mask is replaced on each iteration, the best mask can be
    discarded by resampling; the best mask seen at any point is therefore
    remembered and returned, rather than the best of the final generation.
    Random numbers are drawn from the global NumPy generator in the same
    order as before, so seeding behaves identically.

    Args:
        X: 2-D feature matrix; only ``X.shape[1]`` is used here, the matrix
            itself is forwarded to the fitness function.
        y: labels, forwarded unchanged to the fitness function.
        pop: number of masks in the population.
        iters: number of resampling iterations.
        fitness_fn: optional callable ``(mask, X, y) -> number`` to minimize.
            Defaults to the module-level ``fitness`` function.

    Returns:
        1-D integer array of 0/1 values of length ``X.shape[1]``: the
        lowest-fitness mask evaluated during the whole search.
    """
    if fitness_fn is None:
        fitness_fn = fitness

    n_feat = X.shape[1]
    P = np.random.randint(0, 2, (pop, n_feat))
    # Explicit float dtype: if every initial score were the int 1, NumPy would
    # infer an integer array and later float scores would be truncated.
    F = np.array([fitness_fn(p, X, y) for p in P], dtype=float)

    # Best-so-far bookkeeping; copy because rows of P are overwritten below.
    best_idx = int(np.argmin(F))
    best_sol = P[best_idx].copy()
    best_fit = F[best_idx]

    for _ in range(iters):
        # Fancy indexing copies the elite rows, so eq stays fixed while the
        # population is rewritten in the inner loop.
        elite = P[np.argsort(F)[:3]]
        eq = elite.mean(axis=0)

        for i in range(pop):
            # Each feature is switched on with probability eq[feature].
            P[i] = (np.random.rand(n_feat) < eq).astype(int)
            F[i] = fitness_fn(P[i], X, y)
            if F[i] < best_fit:
                best_fit = F[i]
                best_sol = P[i].copy()

    return best_sol

# Run the optimizer on the training split only, then keep the columns whose
# mask entry is 1 in both splits.
best_features = EO(X_train, y_train)
keep_mask = best_features == 1
X_train, X_test = X_train[:, keep_mask], X_test[:, keep_mask]

print("Selected features:", X_train.shape[1])

# =========================================================
# 6. STACKED AUTOENCODER
# =========================================================

# Symmetric dense autoencoder: n_features -> 64 -> 32 -> 16 -> 32 -> 64 -> n_features.
# Layers are created in a fixed order; with the global TensorFlow seed set
# earlier, keep this order if reproducible weights matter.
inp = Input(shape=(X_train.shape[1],))
x = Dense(64, activation='relu')(inp)
x = Dense(32, activation='relu')(x)
# 16-unit bottleneck; its activations are the features passed on downstream.
latent = Dense(16, activation='relu')(x)
x = Dense(32, activation='relu')(latent)
x = Dense(64, activation='relu')(x)
# No activation on the output layer: it reconstructs the (standardized) inputs.
out = Dense(X_train.shape[1])(x)

# Two models over the same layers: training `autoencoder` also trains `encoder`.
autoencoder = Model(inp, out)
encoder = Model(inp, latent)

# Reconstruction objective: mean squared error between input and output.
autoencoder.compile(optimizer=Adam(0.001), loss='mse')
autoencoder.fit(
    X_train, X_train,
    epochs=20,
    batch_size=256,
    # NOTE(review): Keras takes the validation rows from the end of the array
    # before shuffling — confirm the training rows are not ordered in a way
    # that makes this slice unrepresentative.
    validation_split=0.1,
    # Stop after 4 epochs without improvement (monitors val_loss by default)
    # and roll back to the best weights seen.
    callbacks=[EarlyStopping(patience=4, restore_best_weights=True)],
    verbose=1
)

# Project both splits into the 16-dimensional latent space; the autoencoder
# was fitted on the training split only.
X_train_enc = encoder.predict(X_train)
X_test_enc  = encoder.predict(X_test)

# =========================================================
# 7. SMOTE (CLASS IMBALANCE HANDLING)
# =========================================================

# Oversample the encoded training split only; the test split is left untouched.
# Note that y_train is replaced by the resampled label vector from here on.
smote = SMOTE(random_state=SEED)
X_train_enc, y_train = smote.fit_resample(X_train_enc, y_train)

# =========================================================
# 8. EVALUATION FUNCTION
# =========================================================

def evaluate(name, y_true, y_pred):
    """Print a titled metric summary for one model.

    Args:
        name: label shown in the banner line.
        y_true: ground-truth encoded labels.
        y_pred: predicted encoded labels.

    Prints accuracy, balanced accuracy and the per-class classification
    report, with class names taken from the module-level ``target_le``.
    Returns nothing.
    """
    banner = f"\n===== {name} ====="
    print(banner)

    # Scalar metrics are printed in a fixed order, one per line.
    scalar_metrics = (
        ("Accuracy:", accuracy_score),
        ("Balanced Accuracy:", balanced_accuracy_score),
    )
    for caption, metric in scalar_metrics:
        print(caption, metric(y_true, y_pred))

    class_names = target_le.classes_
    print(classification_report(y_true, y_pred, target_names=class_names))

# =========================================================
# 9. FINAL MODELS
# =========================================================

# Extra Trees
# Extra Trees: 200 trees with balanced class weighting, all cores, fixed seed.
et_params = dict(
    n_estimators=200,
    class_weight='balanced',
    n_jobs=-1,
    random_state=SEED,
)
et = ExtraTreesClassifier(**et_params)

# CatBoost: 300 boosting rounds of depth-7 trees on log-loss, fixed seed.
# NOTE(review): the label encoder sorts class names, so 'Attack' is 0 and
# 'Normal' is 1; if class_weights is indexed by class label, the 1.2 weight
# lands on 'Normal' — confirm that is the intended class.
cb_params = dict(
    iterations=300,
    depth=7,
    learning_rate=0.05,
    loss_function='Logloss',
    class_weights=[1, 1.2],
    random_seed=SEED,
    verbose=False,
)
cb = CatBoostClassifier(**cb_params)

# Fit on the resampled latent training features, then report on the encoded
# test split — Extra Trees first, CatBoost second.
for model_name, model in (("Extra Trees", et), ("CatBoost", cb)):
    model.fit(X_train_enc, y_train)
    evaluate(model_name, y_test, model.predict(X_test_enc))


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hSelected features: 25
Epoch 1/20
290/290 ━━━━━━━━━━━━━━━━━━━━ 6s 9ms/step - loss: 0.6290 - val_loss: 0.2126
Epoch 2/20
290/290 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 0.1556 - val_loss: 0.0968
Epoch 3/20
290/290 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 0.1039 - val_loss: 0.0563
Epoch 4/20
290/290 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 0.0813 - val_loss: 0.0446
Epoch 5/20
290/290 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 0.0665 - val_loss: 0.0412
Epoch 6/20
290/290 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 0.0563 - val_loss: 0.0368
Epoch 7/20
290/290 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 0.0464 - val_loss: 0.0330
Epoch 8/20
[1