In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns

2025-07-04 05:48:40.980389: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751608121.236863      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751608121.311444      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Load the UNSW-NB15 training and testing parquet files, merge them into one
# frame, and build the scaled feature matrix X and the binary target y.
train = pd.read_parquet("/kaggle/input/unsw-nb15/UNSW_NB15_training-set.parquet")
test = pd.read_parquet("/kaggle/input/unsw-nb15/UNSW_NB15_testing-set.parquet")

# ignore_index=True gives the merged frame a unique 0..N-1 index. Without it the
# indexes of the two files are stacked as-is, so `y` could carry repeated index
# labels while the re-built `X` below gets a fresh RangeIndex; any index-aligned
# operation between the two (boolean masks, concat, assignment) would then pair
# up the wrong rows.
df = pd.concat([train, test], ignore_index=True)
df = df.drop(columns=['id', 'attack_cat'], errors='ignore')

# Integer-encode the categorical columns; pd.factorize assigns codes in order
# of first appearance.
cat_cols = ['proto', 'service', 'state']
for col in cat_cols:
    df[col] = pd.factorize(df[col])[0]

X = df.drop('label', axis=1)
y = df['label']

# NOTE(review): the scaler is fitted on every row before any train/test split
# happens later in the notebook, so statistics of rows that end up in a test
# split influence the scaled training features. Fitting inside each split would
# avoid that, but changes what `X` means for the cells below -- confirm before
# changing.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Keep the same index as `y` so the two stay aligned row-for-row.
X = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

In [3]:
# MoCo-style encoder: two swish hidden layers, a linear projection head and a
# prediction head stacked on top of the projection.
def build_encoder(input_dim):
    """Build the MLP used for both the query and the key encoder.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A Keras ``Model`` that maps inputs of shape ``(batch, input_dim)`` to
        the 64-unit output of the prediction head.
    """
    encoder_input = layers.Input(shape=(input_dim,))

    # Backbone: 256 -> 128 units, swish activation on both layers.
    hidden = encoder_input
    for units in (256, 128):
        hidden = layers.Dense(units, activation='swish')(hidden)

    # Linear projection head.
    projection = layers.Dense(64)(hidden)

    # Prediction head: swish on the projection followed by another linear layer.
    prediction_head = layers.Dense(64)
    activated = layers.Activation('swish')(projection)
    return models.Model(encoder_input, prediction_head(activated))

In [4]:
# Contrastive (InfoNCE-style) loss
def contrastive_loss(query, key, queue, temperature=0.07):
    """Cross-entropy over one positive and ``queue_size`` negative similarities.

    All three inputs are L2-normalised along axis 1, so every logit is a cosine
    similarity divided by ``temperature``.

    Args:
        query: Query embeddings, shape ``(batch, dim)``.
        key: Key embeddings paired row-by-row with ``query``, same shape.
        queue: Negative key embeddings, shape ``(queue_size, dim)``.
        temperature: Positive scalar that divides the logits.

    Returns:
        Scalar tensor: the batch mean of the cross-entropy in which the
        positive pair (logit column 0) is the target class.
    """
    query = tf.math.l2_normalize(query, axis=1)
    key = tf.math.l2_normalize(key, axis=1)
    queue = tf.math.l2_normalize(queue, axis=1)

    # Positive logit: similarity of each query with its own key -> (batch, 1).
    l_pos = tf.reduce_sum(query * key, axis=1, keepdims=True)
    # Negative logits: similarity of each query with every queued key.
    l_neg = tf.matmul(query, queue, transpose_b=True)
    logits = tf.concat([l_pos, l_neg], axis=1) / temperature

    # Size the label vector from the dynamic shape: the static `logits.shape[0]`
    # is None when this runs inside a traced graph with an unknown batch size.
    labels = tf.zeros(tf.shape(logits)[:1], dtype=tf.int32)
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
    return tf.reduce_mean(loss)

In [5]:
# Augmentations
def augment_batch(X, drop_prob=0.15, noise_std=0.05):
    """Return a randomly corrupted copy of a feature batch.

    Every entry is independently zeroed with probability ``drop_prob``; then
    zero-mean Gaussian noise with standard deviation ``noise_std`` is added to
    every entry (zeroed ones included). Random numbers come from NumPy's global
    ``np.random`` state, so ``np.random.seed`` makes the result repeatable.

    Args:
        X: Array exposing ``.shape`` and supporting element-wise arithmetic
            with NumPy arrays.
        drop_prob: Probability in ``[0, 1]`` of zeroing an entry.
        noise_std: Standard deviation of the additive noise (``>= 0``).

    Returns:
        ``X * mask + noise``, where ``mask`` and ``noise`` are float32 arrays
        with the shape of ``X``.

    Raises:
        ValueError: If ``drop_prob`` is outside ``[0, 1]``.
    """
    if not 0.0 <= drop_prob <= 1.0:
        raise ValueError(f"drop_prob must be in [0, 1], got {drop_prob}")

    # Draw the mask before the noise so a given seed reproduces the same
    # corruption for the default arguments.
    mask = (np.random.rand(*X.shape) > drop_prob).astype(np.float32)
    noise = np.random.normal(0, noise_std, size=X.shape).astype(np.float32)
    return X * mask + noise

In [6]:
# FIFO queue update
def update_queue(queue, new_keys):
    """Return the queue contents after pushing ``new_keys`` at the front.

    The newest keys occupy the first rows and the oldest rows fall off the end,
    so the result has as many rows as ``queue``.

    Args:
        queue: Current queue of keys; its first dimension must be statically
            known (it is read from ``queue.shape[0]``).
        new_keys: Keys to insert, with the same trailing dimension as ``queue``.

    Returns:
        A tensor with the new contents, wrapped in ``tf.stop_gradient``.
    """
    queue_len = queue.shape[0]

    # A batch larger than the whole queue would give a negative slice bound and
    # a result of the wrong size; in that case only the first `queue_len` keys
    # are kept.
    new_keys = new_keys[:queue_len]

    batch_size = tf.shape(new_keys)[0]
    remaining = queue_len - batch_size
    new_queue = tf.concat([new_keys, queue[:remaining]], axis=0)
    return tf.stop_gradient(new_queue)

In [7]:
# Momentum (exponential moving average) update of the key encoder
@tf.function
def momentum_update(query_encoder, key_encoder, m=0.999):
    """Blend the query encoder's trainable weights into the key encoder.

    Each trainable variable of ``key_encoder`` is overwritten in place with
    ``m * key + (1 - m) * query``; variables are paired by their position in
    ``trainable_variables``.

    Args:
        query_encoder: Model whose trainable variables are the source.
        key_encoder: Model whose trainable variables are updated.
        m: Weight given to the key encoder's current value.
    """
    source_weights = query_encoder.trainable_variables
    target_weights = key_encoder.trainable_variables
    for source, target in zip(source_weights, target_weights):
        blended = m * target + (1 - m) * source
        target.assign(blended)

In [None]:
# Training setup: for each train:test ratio, pre-train a MoCo encoder pair and,
# after every epoch, fit a random forest on the query encoder's embeddings and
# score it on the held-out split.
#
# NOTE: this cell calls build_encoder(input_dim) and augment_batch as defined
# above it; later cells of the notebook redefine both names, so run the
# notebook top to bottom.
SEED = 42
embedding_dim = 64
queue_size = 65536
batch_size = 512
epochs = 20

ratios = [(10,90), (20,80), (30,70), (40,60), (50,50), (60,40), (70,30), (80,20), (90,10)]
all_results = []

for train_pct, test_pct in ratios:
    print(f"\n--- Train:Test = {train_pct}:{test_pct} ---")

    # Seed Python, NumPy and TensorFlow so every ratio starts from the same
    # random state and the runs are repeatable.
    tf.keras.utils.set_random_seed(SEED)

    X_train, X_test, y_train, y_test = train_test_split(
        X.values.astype(np.float32), y.values,
        test_size=test_pct/100,
        stratify=y,
        random_state=SEED
    )

    # Fresh queue of random unit-norm negatives for every ratio. Creating it
    # once outside the loop would hand the keys produced by the previous
    # ratio's encoder to the newly initialised encoders.
    queue = tf.Variable(
        tf.math.l2_normalize(tf.random.normal([queue_size, embedding_dim]), axis=1),
        trainable=False
    )

    query_encoder = build_encoder(X_train.shape[1])
    key_encoder = build_encoder(X_train.shape[1])
    # Start the key encoder as an exact copy of the query encoder.
    for q_var, k_var in zip(query_encoder.variables, key_encoder.variables):
        k_var.assign(q_var)

    optimizer = tf.keras.optimizers.Adam(3e-4)
    num_samples = X_train.shape[0]

    for epoch in range(epochs):
        idx = np.random.permutation(num_samples)
        X_shuffled = X_train[idx]
        batch_losses = []

        for i in range(0, num_samples, batch_size):
            batch = X_shuffled[i:i+batch_size]
            if batch.shape[0] < 2:
                continue
            # Two independently corrupted views of the same rows.
            x_q = augment_batch(batch)
            x_k = augment_batch(batch)
            with tf.GradientTape() as tape:
                z_q = query_encoder(x_q, training=True)
                # Keys are targets only: no gradient flows into the key encoder.
                z_k = tf.stop_gradient(key_encoder(x_k, training=True))
                loss = contrastive_loss(z_q, z_k, queue)
            grads = tape.gradient(loss, query_encoder.trainable_variables)
            optimizer.apply_gradients(zip(grads, query_encoder.trainable_variables))
            momentum_update(query_encoder, key_encoder)
            queue.assign(update_queue(queue, z_k))
            batch_losses.append(loss.numpy())

        # Report the mean over the epoch; the loss of the last mini-batch alone
        # is a noisy stand-in for how the epoch went.
        epoch_loss = float(np.mean(batch_losses))
        print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")

        # Probe: embed both splits with the query encoder and fit a classifier.
        X_train_ssl = query_encoder.predict(X_train)
        X_test_ssl = query_encoder.predict(X_test)

        rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=SEED)
        rf.fit(X_train_ssl, y_train)
        y_pred = rf.predict(X_test_ssl)
        report = classification_report(y_test, y_pred, output_dict=True)

        acc = report['accuracy']
        f1_0 = report['0']['f1-score']
        f1_1 = report['1']['f1-score']

        all_results.append({
            'train_pct': train_pct,
            'test_pct': test_pct,
            'epoch': epoch + 1,
            'loss': epoch_loss,
            'accuracy': acc,
            'f1_class_0': f1_0,
            'f1_class_1': f1_1
        })

        print(f"Accuracy: {acc:.4f}, F1 Class 0: {f1_0:.4f}, F1 Class 1: {f1_1:.4f}")

results_df = pd.DataFrame(all_results)
results_df.to_csv('moco_results_all_epochs.csv', index=False)
print("\nSaved results to 'moco_results_all_epochs.csv'")

In [None]:
# Visualization: accuracy per epoch, one line per train:test ratio
sns.set(style="whitegrid")
ratios_unique = results_df[['train_pct', 'test_pct']].drop_duplicates()
plt.figure(figsize=(12, 7))
for pair in ratios_unique.itertuples(index=False):
    # Rows of results_df that belong to this train:test ratio.
    same_train = results_df['train_pct'] == pair.train_pct
    same_test = results_df['test_pct'] == pair.test_pct
    curve = results_df[same_train & same_test]
    plt.plot(
        curve['epoch'],
        curve['accuracy'],
        marker='o',
        label=f"{int(pair.train_pct)}:{int(pair.test_pct)}",
    )
plt.title("Accuracy vs Epoch per Train:Test Ratio")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend(title="Train:Test")
plt.tight_layout()
plt.show()

In [None]:
# Contrastive loss per epoch, one line per train:test ratio
plt.figure(figsize=(12, 7))
for train_share, test_share in zip(ratios_unique['train_pct'], ratios_unique['test_pct']):
    # Select the epochs recorded for this ratio.
    in_ratio = (results_df['train_pct'] == train_share) & (results_df['test_pct'] == test_share)
    curve = results_df[in_ratio]
    plt.plot(
        curve['epoch'],
        curve['loss'],
        marker='o',
        label=f"{int(train_share)}:{int(test_share)}",
    )
plt.title("Contrastive Loss vs Epoch per Train:Test Ratio")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend(title="Train:Test")
plt.tight_layout()
plt.show()

Hyperparameter tuning (70:30 train:test ratio only)

In [8]:
from itertools import product
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
# Hyperparameter grid for the 70:30 tuning run. Each list holds two candidate
# values, so the Cartesian product of the six lists has 2**6 = 64 combinations.
learning_rates = [0.0001, 0.0005]  # optimizer learning rates
temperatures = [0.05, 0.10]        # contrastive-loss temperatures
momentums = [0.990, 0.995]         # key-encoder momentum coefficients
dropouts = [0.3, 0.4]              # dropout rates passed to build_encoder
batch_sizes = [128, 256]           # mini-batch sizes
queue_sizes = [16384, 32768]       # number of negative keys kept in the queue

In [10]:
def augment_batch(X, drop_prob=0.1, noise_std=0.03):
    """Return a randomly corrupted copy of a feature batch.

    NOTE: this redefines the ``augment_batch`` declared earlier in the notebook
    (which masks with probability 0.15 and uses noise scale 0.05); the two
    differ only in those constants, which are exposed here as parameters.

    Every entry is independently zeroed with probability ``drop_prob``; then
    zero-mean Gaussian noise with standard deviation ``noise_std`` is added to
    every entry. Random numbers come from NumPy's global ``np.random`` state.

    Args:
        X: Array exposing ``.shape`` and supporting element-wise arithmetic
            with NumPy arrays.
        drop_prob: Probability in ``[0, 1]`` of zeroing an entry.
        noise_std: Standard deviation of the additive noise (``>= 0``).

    Returns:
        ``X * mask + noise``, where ``mask`` and ``noise`` are float32 arrays
        with the shape of ``X``.

    Raises:
        ValueError: If ``drop_prob`` is outside ``[0, 1]``.
    """
    if not 0.0 <= drop_prob <= 1.0:
        raise ValueError(f"drop_prob must be in [0, 1], got {drop_prob}")

    # Mask first, noise second: keeps the draw order (and therefore seeded
    # results) unchanged for the default arguments.
    mask = (np.random.rand(*X.shape) > drop_prob).astype(np.float32)
    noise = np.random.normal(0, noise_std, size=X.shape).astype(np.float32)
    return X * mask + noise

In [11]:
def build_encoder(input_dim, dropout):
    """Build the batch-normalised MLP encoder used in the tuning run.

    The backbone is three Dense -> BatchNormalization -> swish stages with
    512, 256 and 128 units; the first two are followed by dropout. A linear
    64-unit projection and a swish + linear 64-unit prediction head sit on top.

    Args:
        input_dim: Number of input features per sample.
        dropout: Dropout rate applied after the 512- and 256-unit stages.

    Returns:
        A Keras ``Model`` mapping ``(batch, input_dim)`` inputs to the
        64-unit prediction-head output.
    """
    encoder_input = layers.Input(shape=(input_dim,))

    hidden = encoder_input
    # (units, apply dropout afterwards) for each backbone stage, in order.
    for units, with_dropout in ((512, True), (256, True), (128, False)):
        hidden = layers.Dense(units)(hidden)
        hidden = layers.BatchNormalization()(hidden)
        hidden = layers.Activation('swish')(hidden)
        if with_dropout:
            hidden = layers.Dropout(dropout)(hidden)

    projection = layers.Dense(64)(hidden)

    # Prediction head: swish on the projection, then a linear layer.
    prediction_head = layers.Dense(64)
    activated = layers.Activation('swish')(projection)
    return models.Model(encoder_input, prediction_head(activated))

In [None]:
import time
from itertools import product
from sklearn.metrics import classification_report

# Grid search over MoCo pre-training hyperparameters on a fixed 70:30 split.
# For each combination: pre-train a query/key encoder pair, fine-tune the query
# encoder with a small classification head, and score it on the held-out split.
#
# NOTE(review): combinations are ranked by their score on the same split that
# is reported as the final result; a separate validation split would give an
# unbiased estimate -- confirm whether that matters for this study.

SEED = 42
RESULTS_CSV = 'moco_hyperparam_live_results.csv'

all_results = []
combo_id = 0

# Define your hyperparameter grid
learning_rates = [1e-4, 5e-4]
temperatures = [0.05, 0.1]
momentums = [0.99, 0.995]
dropouts = [0.3, 0.4]
batch_sizes = [128, 256]
queue_sizes = [16384, 32768]

# Split data (70:30). The arguments are the same for every combination (fixed
# test_size and random_state), so the split is computed once, outside the loop.
X_train, X_test, y_train, y_test = train_test_split(
    X.values.astype(np.float32), y.values,
    test_size=0.3,
    stratify=y,
    random_state=SEED
)
num_samples = X_train.shape[0]
epochs = 20

# Start tuning
for lr, temp, m, drop, batch_size, queue_size in product(
    learning_rates, temperatures, momentums, dropouts, batch_sizes, queue_sizes
):
    combo_id += 1
    print(f"\n🔧 Combo {combo_id}: LR={lr}, Temp={temp}, Momentum={m}, Dropout={drop}, "
          f"Batch={batch_size}, Queue={queue_size}")

    combo_start = time.time()  # Track time for each combo

    # Release the Keras state left behind by the previous combination's models,
    # then seed Python, NumPy and TensorFlow so all combinations start from the
    # same random state and differ only in their hyperparameters.
    tf.keras.backend.clear_session()
    tf.keras.utils.set_random_seed(SEED)

    # Build queue of random unit-norm negatives
    queue = tf.Variable(
        tf.math.l2_normalize(tf.random.normal([queue_size, 64]), axis=1), trainable=False
    )

    # Build encoders; the key encoder starts as a copy of the query encoder
    query_encoder = build_encoder(X_train.shape[1], drop)
    key_encoder = build_encoder(X_train.shape[1], drop)
    for q_var, k_var in zip(query_encoder.variables, key_encoder.variables):
        k_var.assign(q_var)

    optimizer = tf.keras.optimizers.Adam(lr)

    # MoCo Pretraining
    for epoch in range(epochs):
        idx = np.random.permutation(num_samples)
        X_shuffled = X_train[idx]
        total_loss = []

        for i in range(0, num_samples, batch_size):
            batch = X_shuffled[i:i+batch_size]
            if batch.shape[0] < 2:
                continue

            # Two independently corrupted views of the same rows.
            x_q = augment_batch(batch)
            x_k = augment_batch(batch)

            with tf.GradientTape() as tape:
                z_q = query_encoder(x_q, training=True)
                z_k = tf.stop_gradient(key_encoder(x_k, training=True))
                # Uses the notebook-level contrastive_loss defined earlier
                # rather than re-defining it on every mini-batch, which also
                # rebound the global name to a version without a default
                # temperature.
                loss = contrastive_loss(z_q, z_k, queue, temperature=temp)

            grads = tape.gradient(loss, query_encoder.trainable_variables)
            optimizer.apply_gradients(zip(grads, query_encoder.trainable_variables))

            # Momentum update of the key encoder's trainable weights.
            for q_var, k_var in zip(query_encoder.trainable_variables, key_encoder.trainable_variables):
                k_var.assign(m * k_var + (1 - m) * q_var)

            # Uses the notebook-level update_queue defined earlier.
            queue.assign(update_queue(queue, z_k))
            total_loss.append(loss.numpy())

        avg_loss = np.mean(total_loss)
        print(f" Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

    # Fine-tuning: the pre-trained query encoder plus a small classification head
    query_encoder.trainable = True
    finetune_model = tf.keras.Sequential([
        query_encoder,
        layers.Dense(128, activation='swish'),
        layers.BatchNormalization(),
        layers.Dropout(drop),
        layers.Dense(1, activation='sigmoid')
    ])

    finetune_model.compile(
        optimizer=tf.keras.optimizers.Adam(lr),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    # No validation_data here: the per-epoch validation metrics were never read
    # (verbose=0, history discarded), so evaluating the test split after every
    # epoch only cost time.
    finetune_model.fit(X_train, y_train,
                       epochs=5, batch_size=batch_size, verbose=0)
    ft_loss, acc = finetune_model.evaluate(X_test, y_test, verbose=0)
    # predict returns shape (n, 1); flatten to match y_test.
    y_pred = (finetune_model.predict(X_test) > 0.5).astype(int).ravel()
    report = classification_report(y_test, y_pred, output_dict=True)

    # Store results ('loss' is the fine-tuned model's loss on the held-out split)
    combo_result = {
        'combo_id': combo_id,
        'learning_rate': lr,
        'temperature': temp,
        'momentum': m,
        'dropout': drop,
        'batch_size': batch_size,
        'queue_size': queue_size,
        'loss': float(ft_loss),
        'accuracy': acc,
        'f1_class_0': report['0']['f1-score'],
        'f1_class_1': report['1']['f1-score']
    }
    all_results.append(combo_result)

    # Write everything collected so far after each combination, so an
    # interrupted session does not lose the finished combinations.
    pd.DataFrame(all_results).to_csv(RESULTS_CSV, index=False)

    # ✅ Print live progress
    print(f"✅ Finished Combo {combo_id}: "
          f"Accuracy={acc:.4f}, "
          f"Loss={ft_loss:.4f}, "
          f"F1 Class 0={combo_result['f1_class_0']:.4f}, "
          f"F1 Class 1={combo_result['f1_class_1']:.4f}")

    # ⏱️ Print time taken for combo
    elapsed = time.time() - combo_start
    print(f"⏱️ Combo Time: {elapsed:.2f} seconds")

# Save final results
results_df = pd.DataFrame(all_results)
results_df.to_csv(RESULTS_CSV, index=False)
print("📁 Saved all results to 'moco_hyperparam_live_results.csv'")



🔧 Combo 1: LR=0.0001, Temp=0.05, Momentum=0.99, Dropout=0.3, Batch=128, Queue=16384
 Epoch 1/20, Loss: 0.6462
 Epoch 2/20, Loss: 0.1385
 Epoch 3/20, Loss: 0.0588
 Epoch 4/20, Loss: 0.0351
 Epoch 5/20, Loss: 0.0231
 Epoch 6/20, Loss: 0.0165
 Epoch 7/20, Loss: 0.0126
 Epoch 8/20, Loss: 0.0107
 Epoch 9/20, Loss: 0.0076
 Epoch 10/20, Loss: 0.0073
 Epoch 11/20, Loss: 0.0057
 Epoch 12/20, Loss: 0.0048
 Epoch 13/20, Loss: 0.0049
 Epoch 14/20, Loss: 0.0040
 Epoch 15/20, Loss: 0.0040
 Epoch 16/20, Loss: 0.0040
 Epoch 17/20, Loss: 0.0033
 Epoch 18/20, Loss: 0.0032
 Epoch 19/20, Loss: 0.0033
 Epoch 20/20, Loss: 0.0030


I0000 00:00:1751615123.355552     109 service.cc:148] XLA service 0x7f0624009bc0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1751615123.357479     109 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1751615123.357500     109 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1751615123.995489     109 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1751615126.996876     109 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m2416/2416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step
✅ Finished Combo 1: Accuracy=0.8939, Loss=0.1987, F1 Class 0=0.8324, F1 Class 1=0.9223
⏱️ Combo Time: 4405.03 seconds

🔧 Combo 2: LR=0.0001, Temp=0.05, Momentum=0.99, Dropout=0.3, Batch=128, Queue=32768
 Epoch 1/20, Loss: 0.7577
 Epoch 2/20, Loss: 0.1782
 Epoch 3/20, Loss: 0.0776
 Epoch 4/20, Loss: 0.0430
 Epoch 5/20, Loss: 0.0286
 Epoch 6/20, Loss: 0.0210
 Epoch 7/20, Loss: 0.0163
 Epoch 8/20, Loss: 0.0128
 Epoch 9/20, Loss: 0.0102
 Epoch 10/20, Loss: 0.0091
 Epoch 11/20, Loss: 0.0076
 Epoch 12/20, Loss: 0.0072
 Epoch 13/20, Loss: 0.0066
 Epoch 14/20, Loss: 0.0062
 Epoch 15/20, Loss: 0.0056
 Epoch 16/20, Loss: 0.0051
 Epoch 17/20, Loss: 0.0049
 Epoch 18/20, Loss: 0.0049
 Epoch 19/20, Loss: 0.0044
 Epoch 20/20, Loss: 0.0044
[1m2416/2416[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step
✅ Finished Combo 2: Accuracy=0.8931, Loss=0.1988, F1 Class 0=0.8305, F1 Class 1=0.9219
⏱️ Combo Time: 4498.30 se

In [None]:
# Persist the tuning results, then draw one bar chart per metric with one bar
# per hyperparameter combination.
results_df = pd.DataFrame(all_results)
results_df.to_csv('moco_hyperparam_extended_results.csv', index=False)

sns.set(style="whitegrid")

# (metric column, column used for bar colour, chart title), drawn in this order.
chart_specs = (
    ('accuracy', 'temperature', "Accuracy by Hyperparameter Combination"),
    ('f1_class_1', 'learning_rate', "F1 Score (Class 1) by Combination"),
)
for metric_col, hue_col, chart_title in chart_specs:
    plt.figure(figsize=(14, 6))
    sns.barplot(data=results_df, x='combo_id', y=metric_col, hue=hue_col)
    plt.title(chart_title)
    plt.tight_layout()
    plt.show()