In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Load and preprocess the data
# Both official UNSW-NB15 partitions are pooled here; the experiment loop
# re-splits the pooled data at several train:test ratios.
# The training partition must come from its own file: reading the testing
# file twice would duplicate every row, and the duplicates would end up on
# both sides of every later split.
# NOTE(review): the training filename is assumed to mirror the testing-set
# naming in the same dataset folder - confirm against the dataset listing.
train = pd.read_parquet("/kaggle/input/unsb-nb15/UNSW_NB15_training-set.parquet")
test = pd.read_parquet("/kaggle/input/unsb-nb15/UNSW_NB15_testing-set.parquet")
# ignore_index gives the pooled frame a clean 0..n-1 index instead of two
# overlapping per-file indices.
df = pd.concat([train, test], ignore_index=True)
# Drop the row identifier and the multi-class attack category; only the
# binary 'label' column is used as the target. errors='ignore' tolerates
# files that lack either column.
df = df.drop(columns=['id', 'attack_cat'], errors='ignore')
cat_cols = ['proto', 'service', 'state']
for col in cat_cols:
    # Replace each categorical value with an integer code.
    df[col] = pd.factorize(df[col])[0]
X = df.drop('label', axis=1)
y = df['label']
# NOTE(review): the scaler is fitted on the pooled data before any
# train/test split, so test-set statistics leak into the features.
# Consider fitting it on the training portion of each split instead.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

# Enhanced MoCo encoder with deeper layers, BatchNorm, Dropout
def build_encoderT(input_dim):
    """Build the MLP encoder used for both the query and the key network.

    The network is three Dense -> BatchNorm -> swish stages of width
    256, 128 and 64 (the first two followed by Dropout(0.3)), then a
    32-unit projection layer and a swish + 32-unit prediction layer.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A Keras ``Model`` mapping ``(batch, input_dim)`` inputs to the
        32-dimensional output of the prediction layer.
    """
    inputs = layers.Input(shape=(input_dim,))

    # (units, apply_dropout) for each hidden stage, in order.
    stage_specs = ((256, True), (128, True), (64, False))
    hidden = inputs
    for units, apply_dropout in stage_specs:
        hidden = layers.Dense(units)(hidden)
        hidden = layers.BatchNormalization()(hidden)
        hidden = layers.Activation('swish')(hidden)
        if apply_dropout:
            hidden = layers.Dropout(0.3)(hidden)

    projection = layers.Dense(32)(hidden)
    prediction_head = layers.Dense(32)
    prediction = prediction_head(layers.Activation('swish')(projection))
    return models.Model(inputs, prediction)

# Contrastive loss
def contrastive_loss(query, key, queue, temperature=0.07):
    """Compute an InfoNCE-style contrastive loss against a queue of negatives.

    All three inputs are L2-normalised along axis 1, so the logits are
    cosine similarities divided by ``temperature``. For every query row the
    positive logit (similarity to the matching ``key`` row) is placed in
    column 0 and the similarities to every ``queue`` row follow, so the
    correct class index is always 0.

    Args:
        query: 2-D tensor of query embeddings, one row per sample.
        key: 2-D tensor of key embeddings, row-aligned with ``query``.
        queue: 2-D tensor (or variable) of negative key embeddings with the
            same embedding width as ``query``.
        temperature: Softmax temperature dividing the logits.

    Returns:
        Scalar tensor: mean cross-entropy over the batch.
    """
    query = tf.math.l2_normalize(query, axis=1)
    key = tf.math.l2_normalize(key, axis=1)
    queue = tf.math.l2_normalize(queue, axis=1)
    # Row-wise dot product, kept as a column so it can be concatenated.
    l_pos = tf.reduce_sum(query * key, axis=1, keepdims=True)
    l_neg = tf.matmul(query, queue, transpose_b=True)
    logits = tf.concat([l_pos, l_neg], axis=1) / temperature
    # Use the dynamic shape: the static `logits.shape[0]` is None when the
    # batch dimension is unknown (e.g. inside a traced tf.function), which
    # would make tf.zeros fail.
    labels = tf.zeros(tf.shape(logits)[:1], dtype=tf.int32)
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
    return tf.reduce_mean(loss)

# Augmentations
def augment_batch(X):
    """Return a randomly corrupted copy of ``X`` for contrastive training.

    Each entry is independently zeroed with probability 0.15, then
    zero-mean Gaussian noise with standard deviation 0.05 is added to
    every entry. Randomness comes from NumPy's global random state.

    Args:
        X: Array of features; only its ``shape`` and element-wise
            arithmetic are used.

    Returns:
        Array of the same shape as ``X``.
    """
    dims = X.shape
    # Draw the keep-mask first and the noise second.
    keep = np.random.rand(*dims) > 0.15
    keep_mask = keep.astype(np.float32)
    jitter = np.random.normal(0, 0.05, size=dims)
    jitter = jitter.astype(np.float32)
    masked = X * keep_mask
    return masked + jitter

# FIFO queue update
def update_queue(queue, new_keys):
    """Return the queue contents after pushing ``new_keys`` at the front.

    The newest keys occupy the first rows and the oldest rows fall off the
    end, so the result always has the same number of rows as ``queue``.
    If ``new_keys`` has more rows than the queue can hold, only its first
    ``len(queue)`` rows are kept; the result is never the wrong length.

    Args:
        queue: 2-D tensor or variable holding the current keys.
        new_keys: 2-D tensor of freshly computed keys with the same width.

    Returns:
        A tensor with the same number of rows as ``queue``, detached from
        the gradient graph.
    """
    capacity = queue.shape[0]
    if capacity is None:
        # Static length unknown: fall back to the runtime shape.
        capacity = tf.shape(queue)[0]
    # Concatenate first and truncate afterwards, so the result length is
    # `capacity` even when the batch is as large as or larger than the
    # queue. Computing `queue[:capacity - batch]` would use a negative
    # bound in that case.
    merged = tf.concat([new_keys, tf.convert_to_tensor(queue)], axis=0)
    return tf.stop_gradient(merged[:capacity])

# Momentum encoder update
@tf.function
def momentum_update(query_encoder, key_encoder, m=0.999):
    """Move the key encoder's trainable weights toward the query encoder's.

    Each trainable variable of ``key_encoder`` is overwritten in place
    with ``m * key + (1 - m) * query``, pairing variables by position in
    the two models' ``trainable_variables`` lists.

    Args:
        query_encoder: Model whose trainable variables are the source.
        key_encoder: Model whose trainable variables are updated in place.
        m: Momentum coefficient; the weight given to the key encoder's
            current value.
    """
    paired_weights = zip(query_encoder.trainable_variables,
                         key_encoder.trainable_variables)
    for source_var, target_var in paired_weights:
        blended = m * target_var + (1 - m) * source_var
        target_var.assign(blended)

# Training setup
embedding_dim = 32   # width of the encoder output stored in the queue
queue_size = 65536   # number of negative keys held in the queue
# The queue starts as random unit-length rows and is not trainable.
queue = tf.Variable(
    tf.math.l2_normalize(
        tf.random.normal([queue_size, embedding_dim]),
        axis=1,
    ),
    trainable=False,
)

# (train %, test %) pairs from 10:90 to 90:10 in steps of ten.
ratios = [(train_share, 100 - train_share) for train_share in range(10, 100, 10)]
all_results = []

for train_pct, test_pct in ratios:
    # One full experiment per train:test ratio: contrastive pretraining of a
    # fresh encoder pair, then supervised fine-tuning and evaluation.
    print(f"\n--- Train:Test = {train_pct}:{test_pct} ---")
    X_train, X_test, y_train, y_test = train_test_split(
        X.values.astype(np.float32), y.values,
        test_size=test_pct/100,
        stratify=y,
        random_state=42
    )

    query_encoder = build_encoderT(X_train.shape[1])
    key_encoder = build_encoderT(X_train.shape[1])
    # Start the key encoder as an exact copy of the query encoder.
    for q_var, k_var in zip(query_encoder.variables, key_encoder.variables):
        k_var.assign(q_var)

    # Reset the negative queue for this ratio. Without the reset, the first
    # steps would contrast against keys produced by the previous ratio's
    # encoder, so the runs would not be independent.
    queue.assign(tf.math.l2_normalize(tf.random.normal([queue_size, embedding_dim]), axis=1))

    optimizer = tf.keras.optimizers.Adam(3e-4)
    batch_size = 512
    epochs = 20
    num_samples = X_train.shape[0]

    # Contrastive pretraining phase (labels are not used here)
    for epoch in range(epochs):
        idx = np.random.permutation(num_samples)
        X_shuffled = X_train[idx]

        for i in range(0, num_samples, batch_size):
            batch = X_shuffled[i:i+batch_size]
            if batch.shape[0] < 2:
                # Skip a trailing batch with fewer than two samples.
                continue
            # Two independently augmented views of the same batch.
            x_q = augment_batch(batch)
            x_k = augment_batch(batch)
            with tf.GradientTape() as tape:
                z_q = query_encoder(x_q, training=True)
                z_k = tf.stop_gradient(key_encoder(x_k, training=True))
                pretrain_loss = contrastive_loss(z_q, z_k, queue)
            grads = tape.gradient(pretrain_loss, query_encoder.trainable_variables)
            optimizer.apply_gradients(zip(grads, query_encoder.trainable_variables))
            # Only the query encoder receives gradients; the key encoder
            # follows it through the momentum update.
            momentum_update(query_encoder, key_encoder)
            queue.assign(update_queue(queue, z_k))

        # Reports the loss of the last batch processed in this epoch.
        print(f"Epoch {epoch+1}/{epochs}, Loss: {pretrain_loss.numpy():.4f}")

    # Fine-tuning phase
    query_encoder.trainable = True
    finetune_model = tf.keras.Sequential([
        query_encoder,
        layers.Dense(64, activation='swish'),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid')
    ])

    finetune_model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-4),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    # NOTE(review): the held-out test split doubles as validation data. No
    # early stopping or model selection depends on it here; keep it that way
    # or introduce a separate validation split.
    history = finetune_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=256)
    eval_loss, acc = finetune_model.evaluate(X_test, y_test)
    print(f"Fine-tuned Accuracy: {acc:.4f}")

    # Flatten the (n, 1) sigmoid output so it matches the 1-D label array.
    y_pred = (finetune_model.predict(X_test) > 0.5).astype(int).ravel()
    report = classification_report(y_test, y_pred, output_dict=True)

    # Record one row per fine-tuning epoch, so an accuracy-vs-epoch curve can
    # be drawn for each ratio. 'loss' and 'accuracy' are that epoch's metrics
    # on the held-out split; the F1 scores come from the final model and are
    # repeated on every row of the ratio.
    val_losses = history.history['val_loss']
    val_accuracies = history.history['val_accuracy']
    for ft_epoch, (val_loss, val_acc) in enumerate(zip(val_losses, val_accuracies), start=1):
        all_results.append({
            'train_pct': train_pct,
            'test_pct': test_pct,
            'epoch': ft_epoch,
            'loss': float(val_loss),
            'accuracy': float(val_acc),
            'f1_class_0': report['0']['f1-score'],
            'f1_class_1': report['1']['f1-score']
        })

# Persist the collected metrics, one row per entry in all_results.
results_df = pd.DataFrame(all_results)
results_df.to_csv('moco_finetune_results.csv', index=False)
print("\nSaved results to 'moco_finetune_results.csv'")

# Visualization
sns.set(style="whitegrid")
ratios_unique = results_df[['train_pct', 'test_pct']].drop_duplicates()
plt.figure(figsize=(12, 7))
# Draw one accuracy curve per distinct train:test ratio, in order of
# first appearance in the results.
for train_share, test_share in ratios_unique.itertuples(index=False):
    same_train = results_df['train_pct'] == train_share
    same_test = results_df['test_pct'] == test_share
    curve = results_df.loc[same_train & same_test]
    plt.plot(
        curve['epoch'],
        curve['accuracy'],
        marker='o',
        label=f"{int(train_share)}:{int(test_share)}",
    )
plt.title("Accuracy vs Epoch per Train:Test Ratio (Fine-Tuned)")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend(title="Train:Test")
plt.tight_layout()
plt.show()


2025-07-03 18:30:51.286617: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751567451.469954      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751567451.523612      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
I0000 00:00:1751567465.476497      35 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1751567465.477199      35 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability:


--- Train:Test = 10:90 ---
Epoch 1/20, Loss: 5.4173
Epoch 2/20, Loss: 4.7954
Epoch 3/20, Loss: 5.6169
Epoch 4/20, Loss: 6.3851
Epoch 5/20, Loss: 5.7556
Epoch 6/20, Loss: 5.5960
Epoch 7/20, Loss: 6.3493
Epoch 8/20, Loss: 5.6544
Epoch 9/20, Loss: 5.4031
Epoch 10/20, Loss: 6.6148
Epoch 11/20, Loss: 5.2557
Epoch 12/20, Loss: 5.4638
Epoch 13/20, Loss: 5.4266
Epoch 14/20, Loss: 5.8204
Epoch 15/20, Loss: 6.5851
Epoch 16/20, Loss: 5.1892
Epoch 17/20, Loss: 5.0568
Epoch 18/20, Loss: 4.8865
Epoch 19/20, Loss: 4.9842
Epoch 20/20, Loss: 5.1413
Epoch 1/10


I0000 00:00:1751567582.741681     101 service.cc:148] XLA service 0x7afa480048d0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1751567582.742156     101 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1751567582.742178     101 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1751567583.317167     101 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m56/65[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 3ms/step - accuracy: 0.5941 - loss: 0.7373

I0000 00:00:1751567586.495973     101 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 92ms/step - accuracy: 0.6050 - loss: 0.7255 - val_accuracy: 0.7734 - val_loss: 0.6249
Epoch 2/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.7480 - loss: 0.5378 - val_accuracy: 0.7770 - val_loss: 0.5636
Epoch 3/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.7776 - loss: 0.4688 - val_accuracy: 0.7818 - val_loss: 0.4983
Epoch 4/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.7788 - loss: 0.4382 - val_accuracy: 0.7947 - val_loss: 0.4447
Epoch 5/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.7761 - loss: 0.4301 - val_accuracy: 0.8074 - val_loss: 0.4025
Epoch 6/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.7864 - loss: 0.4051 - val_accuracy: 0.8159 - val_loss: 0.3741
Epoch 7/10
[1m65/65[0m [32m━━━━━━━━━━━━━━