In [1]:
import pandas as pd

# Load the full KDD Cup 1999 dataset
df = pd.read_csv('/kaggle/input/kdd-cup-1999-data/kddcup.data.corrected', header=None)

# Preview the first few rows
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,0,tcp,http,SF,215,45076,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,162,4528,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,236,1228,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,233,2032,0,0,0,0,...,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,239,486,0,0,0,0,...,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal.


In [2]:
column_names = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", 
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", 
    "logged_in", "num_compromised", "root_shell", "su_attempted", 
    "num_root", "num_file_creations", "num_shells", "num_access_files", 
    "num_outbound_cmds", "is_host_login", "is_guest_login", "count", 
    "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate", 
    "srv_rerror_rate", "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", 
    "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate", 
    "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", 
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate", 
    "dst_host_srv_serror_rate", "dst_host_rerror_rate", 
    "dst_host_srv_rerror_rate", "label"
]

# Assign column names to the dataframe
df.columns = column_names

In [3]:
# Show dataset shape
print("Dataset shape:", df.shape)

# Show label distribution
print(df['label'].value_counts())

Dataset shape: (4898431, 42)
label
smurf.              2807886
neptune.            1072017
normal.              972781
satan.                15892
ipsweep.              12481
portsweep.            10413
nmap.                  2316
back.                  2203
warezclient.           1020
teardrop.               979
pod.                    264
guess_passwd.            53
buffer_overflow.         30
land.                    21
warezmaster.             20
imap.                    12
rootkit.                 10
loadmodule.               9
ftp_write.                8
multihop.                 7
phf.                      4
perl.                     3
spy.                      2
Name: count, dtype: int64


In [4]:
df['label'] = df['label'].apply(lambda x: 'normal' if x == 'normal.' else 'attack')

In [5]:
for col in ['protocol_type', 'service', 'flag']:
    df[col], _ = pd.factorize(df[col])

In [6]:
df['label'], _ = pd.factorize(df['label'])  # 0: attack, 1: normal (order may vary)

In [7]:
df['label'].value_counts()

label
1    3925650
0     972781
Name: count, dtype: int64

In [8]:
from sklearn.preprocessing import MinMaxScaler

X = df.drop('label', axis=1)
y = df['label']

scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [9]:
df_processed = X_scaled.copy()
df_processed['label'] = y

In [10]:
import tensorflow as tf
import numpy as np
from tensorflow.keras import layers, models

2025-08-07 13:54:03.492152: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754574843.668322      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754574843.721757      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [11]:
def get_encoder(input_dim, projection_dim):
    inputs = layers.Input(shape=(input_dim,))
    x = layers.Dense(256, activation='relu')(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(128, activation='relu')(x)
    outputs = layers.Dense(projection_dim)(x)
    return models.Model(inputs, outputs, name="encoder")

In [12]:
def contrastive_loss(z_i, z_j, temperature=0.5):
    z_i = tf.math.l2_normalize(z_i, axis=1)
    z_j = tf.math.l2_normalize(z_j, axis=1)

    batch_size = tf.shape(z_i)[0]

    representations = tf.concat([z_i, z_j], axis=0)
    similarity_matrix = tf.matmul(representations, representations, transpose_b=True)
    logits = similarity_matrix / temperature

    labels = tf.range(batch_size)
    labels = tf.concat([labels, labels], axis=0)

    mask = tf.eye(2 * batch_size)
    logits = logits * (1 - mask) - 1e9 * mask  # mask out self-similarity

    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

In [13]:
def augment(X, noise_factor=0.01):
    noise = noise_factor * tf.random.normal(shape=tf.shape(X))
    return X + noise

In [14]:
def train_moco(X, projection_dim=64, batch_size=256, epochs=2):
    input_dim = X.shape[1]
    encoder = get_encoder(input_dim, projection_dim)
    optimizer = tf.keras.optimizers.Adam()

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        indices = np.random.permutation(len(X))
        X_shuffled = tf.gather(X, indices)

        for i in range(0, len(X), batch_size):
            batch = X_shuffled[i:i + batch_size]
            if tf.shape(batch)[0] < batch_size:
                continue

            with tf.GradientTape() as tape:
                x1 = augment(batch)
                x2 = augment(batch)

                z1 = encoder(x1, training=True)
                z2 = encoder(x2, training=True)

                loss = contrastive_loss(z1, z2)

            grads = tape.gradient(loss, encoder.trainable_variables)
            optimizer.apply_gradients(zip(grads, encoder.trainable_variables))

        print(f"Loss: {tf.reduce_mean(loss).numpy():.4f}")
    
    return encoder

In [15]:
encoder = train_moco(tf.convert_to_tensor(X_scaled.values, dtype=tf.float32))

# Extract learned features
features = encoder.predict(X_scaled.values)

I0000 00:00:1754574855.845274      19 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Epoch 1/2
Loss: 500000000.0000
Epoch 2/2
Loss: 500000000.0000


I0000 00:00:1754577388.741702      60 service.cc:148] XLA service 0x7a742c03d010 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1754577388.742337      60 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1754577388.818156      60 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m   130/153076[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:00[0m 1ms/step

I0000 00:00:1754577389.001431      60 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m153076/153076[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m181s[0m 1ms/step
