In [None]:
# 🔁 SimCLR + SVM on UNSW-NB15 with Progress Output

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Load both UNSW-NB15 parquet partitions and pool them into one frame so the
# experiment loop can re-split the data at arbitrary train/test ratios.
train = pd.read_parquet("/kaggle/input/unsw-nb15/UNSW_NB15_training-set.parquet")
test = pd.read_parquet("/kaggle/input/unsw-nb15/UNSW_NB15_testing-set.parquet")
df = pd.concat([train, test]).drop(columns=['id', 'attack_cat'], errors='ignore')

# Replace the three non-numeric columns with integer codes (order of first
# appearance, as produced by pd.factorize).
for col in ('proto', 'service', 'state'):
    df[col] = pd.factorize(df[col])[0]

# Separate the binary target from the features, then standardise the features.
# NOTE(review): the scaler is fit on the pooled data before any train/test
# split happens, so statistics of rows that later land in a test split
# influence the scaling -- confirm this is acceptable for the experiment.
y = df['label']
X = df.drop(columns='label')
X = StandardScaler().fit_transform(X)

def build_encoder(input_dim):
    """Build the dense encoder network.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A Keras ``Model`` that maps an ``(input_dim,)`` vector through two
        ReLU ``Dense`` layers of 512 and 256 units, yielding a 256-d output.
    """
    encoder_in = layers.Input(shape=(input_dim,))
    hidden = encoder_in
    # Stack the hidden layers in order: 512 units first, then 256.
    for units in (512, 256):
        hidden = layers.Dense(units, activation='relu')(hidden)
    return models.Model(inputs=encoder_in, outputs=hidden)

def build_projection_head(input_dim):
    """Build the projection head applied on top of the encoder output.

    Args:
        input_dim: Width of the vectors fed into the head.

    Returns:
        A Keras ``Model`` consisting of a 128-unit ReLU ``Dense`` layer
        followed by a 64-unit ``Dense`` layer with no activation.
    """
    head_in = layers.Input(shape=(input_dim,))
    hidden = layers.Dense(128, activation='relu')(head_in)
    projected = layers.Dense(64)(hidden)
    return models.Model(inputs=head_in, outputs=projected)

def nt_xent_loss(z_i, z_j, temperature=0.5):
    """Compute the normalised temperature-scaled cross-entropy loss.

    Row ``k`` of ``z_i`` and row ``k`` of ``z_j`` are treated as a positive
    pair; every other row in the concatenated batch acts as a negative.

    Args:
        z_i: Projections of the first view; normalised along axis 1 here
            (assumes a 2-D ``(batch, dim)`` tensor -- confirm with callers).
        z_j: Projections of the second view, same layout as ``z_i``.
        temperature: Divisor applied to the cosine similarities.

    Returns:
        Scalar tensor: the mean cross-entropy over all ``2 * batch`` rows.
    """
    n = tf.shape(z_i)[0]

    # Unit-normalise both views so the dot products are cosine similarities.
    embeddings = tf.concat(
        [tf.math.l2_normalize(z_i, axis=1), tf.math.l2_normalize(z_j, axis=1)],
        axis=0,
    )
    scaled_sim = tf.matmul(embeddings, embeddings, transpose_b=True) / temperature

    # Push each row's similarity with itself to a large negative value so it
    # contributes (effectively) nothing to the softmax.
    self_mask = tf.eye(2 * n)
    scaled_sim = scaled_sim * (1 - self_mask) - 1e9 * self_mask

    # Row k (k < n) has its positive at column k + n, and vice versa.
    positives = tf.concat([tf.range(n, 2 * n), tf.range(0, n)], axis=0)
    per_row = tf.keras.losses.sparse_categorical_crossentropy(
        positives, scaled_sim, from_logits=True
    )
    return tf.reduce_mean(per_row)

def augment(X):
    """Return a randomly perturbed copy of ``X``.

    Each entry is independently kept with probability 0.85 (otherwise
    zeroed), and Gaussian noise with mean 0 and standard deviation 0.1 is
    added to every entry. Randomness comes from NumPy's global RNG.

    Args:
        X: Array-like exposing ``.shape`` and supporting element-wise
            arithmetic with NumPy arrays.

    Returns:
        The perturbed data, same shape as ``X``.
    """
    shape = X.shape
    # Draw order matters for reproducibility under a fixed seed:
    # the noise is sampled first, then the keep/drop mask.
    jitter = np.random.normal(loc=0, scale=0.1, size=shape)
    keep = np.random.binomial(n=1, p=0.85, size=shape)
    return X * keep + jitter

def train_simclr(X_train, epochs=20, batch_size=512):
    """Train an encoder with a SimCLR-style contrastive objective.

    Every mini-batch is augmented twice with ``augment``; both views are
    passed through encoder + projection head and the pair is scored with
    ``nt_xent_loss``. Only the encoder is returned; the projection head is
    used during training only.

    Args:
        X_train: Training features, indexed with an integer permutation
            array and ``.shape[1]`` (assumes a 2-D NumPy array of shape
            ``(n_samples, n_features)`` -- confirm with callers).
        epochs: Number of passes over ``X_train``.
        batch_size: Samples per mini-batch; must be at least 2 because the
            loss needs negatives within the batch.

    Returns:
        The trained encoder ``Model``.

    Raises:
        ValueError: If ``X_train`` has fewer than 2 rows or ``batch_size``
            is below 2, since no batch could then be trained on.
    """
    n_samples = len(X_train)
    # Without this check every batch would be skipped below and the progress
    # print would fail on an unbound ``loss``.
    if n_samples < 2:
        raise ValueError("train_simclr needs at least 2 training samples")
    if batch_size < 2:
        raise ValueError("train_simclr needs batch_size >= 2")

    encoder = build_encoder(X_train.shape[1])
    projector = build_projection_head(encoder.output_shape[1])
    model_input = tf.keras.Input(shape=(X_train.shape[1],))
    features = encoder(model_input)
    projections = projector(features)
    simclr_model = tf.keras.Model(model_input, projections)
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

    for epoch in range(epochs):
        # Reshuffle every epoch so batches (and thus negatives) differ.
        idx = np.random.permutation(n_samples)
        X_train_shuffled = X_train[idx]
        for i in range(0, n_samples, batch_size):
            batch = X_train_shuffled[i:i+batch_size]
            # A trailing batch of one sample has no negatives; skip it.
            if batch.shape[0] < 2:
                continue
            x1 = augment(batch)
            x2 = augment(batch)
            with tf.GradientTape() as tape:
                z1 = simclr_model(x1, training=True)
                z2 = simclr_model(x2, training=True)
                loss = nt_xent_loss(z1, z2)
            grads = tape.gradient(loss, simclr_model.trainable_variables)
            optimizer.apply_gradients(zip(grads, simclr_model.trainable_variables))
        # The reported value is the loss of the last processed batch of the
        # epoch (not an epoch average). The total is taken from ``epochs``
        # rather than being hard-coded.
        print(f" Epoch {epoch+1}/{epochs}, Loss: {loss.numpy():.4f}")
    return encoder

def evaluate_with_svm(encoder, X_train, X_test, y_train, y_test):
    """Fit an RBF SVM on encoder embeddings and score it on the test split.

    Args:
        encoder: Model exposing ``predict``; used to embed both splits.
        X_train: Training features passed to ``encoder.predict``.
        X_test: Test features passed to ``encoder.predict``.
        y_train: Training labels for the SVM.
        y_test: Ground-truth test labels. The report is read under the keys
            ``'0'`` and ``'1'``, so labels are expected to render as those
            strings.

    Returns:
        Dict with ``'accuracy'``, ``'f1_class_0'`` and ``'f1_class_1'``.
    """
    train_emb = encoder.predict(X_train)
    test_emb = encoder.predict(X_test)

    # class_weight='balanced' reweights classes inversely to their frequency.
    svm = SVC(kernel='rbf', class_weight='balanced').fit(train_emb, y_train)
    report = classification_report(y_test, svm.predict(test_emb), output_dict=True)

    accuracy = report['accuracy']
    f1_negative = report['0']['f1-score']
    f1_positive = report['1']['f1-score']
    return {
        'accuracy': accuracy,
        'f1_class_0': f1_negative,
        'f1_class_1': f1_positive,
    }

# Sweep the train share from 10% to 90% in steps of 10; for each share train a
# fresh SimCLR encoder on the train split and score an SVM on its embeddings.
results = []
ratios = list(range(10, 100, 10))

for idx, ratio in enumerate(ratios):
    print(f"\n🔁 Combo {idx + 1} → Train:Test = {ratio}:{100 - ratio}")

    # Stratified split with a fixed seed so every ratio is reproducible.
    X_tr, X_te, y_tr, y_te = train_test_split(
        X,
        y,
        test_size=(100 - ratio) / 100,
        stratify=y,
        random_state=42,
    )

    # Only the contrastive training receives the float32 cast; the SVM
    # evaluation embeds the splits exactly as returned by the split.
    encoder = train_simclr(X_tr.astype(np.float32), epochs=20, batch_size=1024)
    metrics = evaluate_with_svm(encoder, X_tr, X_te, y_tr, y_te)
    metrics['train_ratio'] = ratio
    results.append(metrics)

    print(f"✅ Finished Combo {idx + 1}: Accuracy = {metrics['accuracy']:.4f}, "
          f"F1 Class 0 = {metrics['f1_class_0']:.4f}, F1 Class 1 = {metrics['f1_class_1']:.4f}")

# Persist one row per ratio for the plotting step.
pd.DataFrame(results).to_csv("simclr_svm_unsw_results.csv", index=False)

import pandas as pd
import matplotlib.pyplot as plt

# Read back the per-ratio metrics written by the experiment loop.
results_df = pd.read_csv("simclr_svm_unsw_results.csv")

# Draw accuracy and both per-class F1 curves against the train ratio.
plt.figure(figsize=(12, 6))
train_pct = results_df['train_ratio']
for column, marker, label in (
    ('accuracy', 'o', 'Accuracy'),
    ('f1_class_0', 's', 'F1 Score (Class 0)'),
    ('f1_class_1', '^', 'F1 Score (Class 1)'),
):
    plt.plot(train_pct, results_df[column], marker=marker, label=label)

plt.title('SimCLR + SVM Performance on UNSW-NB15')
plt.xlabel('Train Ratio (%)')
plt.ylabel('Score')
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()


2025-07-14 20:04:44.508720: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752523484.876805      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752523484.984510      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered



🔁 Combo 1 → Train:Test = 10:90


I0000 00:00:1752523503.810266      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1752523503.810998      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


 Epoch 1/20, Loss: 4.2951
 Epoch 2/20, Loss: 4.2755
 Epoch 3/20, Loss: 4.2187
 Epoch 4/20, Loss: 4.2061
 Epoch 5/20, Loss: 4.1676
 Epoch 6/20, Loss: 4.1807
 Epoch 7/20, Loss: 4.1867
 Epoch 8/20, Loss: 4.1803
 Epoch 9/20, Loss: 4.1950
 Epoch 10/20, Loss: 4.1211
 Epoch 11/20, Loss: 4.1519
 Epoch 12/20, Loss: 4.1489
 Epoch 13/20, Loss: 4.1165
 Epoch 14/20, Loss: 4.1299
 Epoch 15/20, Loss: 4.1226
 Epoch 16/20, Loss: 4.1225
 Epoch 17/20, Loss: 4.1946
 Epoch 18/20, Loss: 4.1302
 Epoch 19/20, Loss: 4.1419
 Epoch 20/20, Loss: 4.0988


I0000 00:00:1752523540.991025     106 service.cc:148] XLA service 0x7d342c001d90 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1752523540.992292     106 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1752523540.992312     106 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1752523541.108045     106 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m120/806[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 1ms/step

I0000 00:00:1752523541.390637     106 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m806/806[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
[1m7248/7248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step
✅ Finished Combo 1: Accuracy = 0.8816, F1 Class 0 = 0.8436, F1 Class 1 = 0.9048

🔁 Combo 2 → Train:Test = 20:80
 Epoch 1/20, Loss: 4.9547
 Epoch 2/20, Loss: 4.9080
 Epoch 3/20, Loss: 4.8634
 Epoch 4/20, Loss: 4.8796
 Epoch 5/20, Loss: 4.8401
 Epoch 6/20, Loss: 4.8391
 Epoch 7/20, Loss: 4.8511
 Epoch 8/20, Loss: 4.8041
 Epoch 9/20, Loss: 4.8164
 Epoch 10/20, Loss: 4.8078
 Epoch 11/20, Loss: 4.8307
 Epoch 12/20, Loss: 4.8139
 Epoch 13/20, Loss: 4.7850
 Epoch 14/20, Loss: 4.8138
 Epoch 15/20, Loss: 4.8081
 Epoch 16/20, Loss: 4.7773
 Epoch 17/20, Loss: 4.7720
 Epoch 18/20, Loss: 4.8064
 Epoch 19/20, Loss: 4.8014
 Epoch 20/20, Loss: 4.8193
[1m1611/1611[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
[1m6442/6442[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step
✅ Finished Combo 2: Accuracy = 0.8823, F1 Class 0 =