In [3]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, Dense

# Load dataset
column_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land',
    'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
    'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count',
    'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label'
]
df = pd.read_csv("kddcup.data_10_percent_corrected", names=column_names)

# Encode categorical and create binary labels
for col in ['protocol_type', 'service', 'flag']:
    df[col] = LabelEncoder().fit_transform(df[col])
df['label_binary'] = df['label'].apply(lambda x: 0 if x == 'normal.' else 1)

# Prepare feature and scale
X = df.drop(['label', 'label_binary'], axis=1)
y = df['label_binary']
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [4]:
print("\n🔹 Isolation Forest:")
iso = IsolationForest(contamination=0.1, random_state=42)
y_pred_iso = iso.fit_predict(X_scaled)
y_pred_iso = np.where(y_pred_iso == -1, 1, 0)
print(classification_report(y, y_pred_iso))

#  Autoencoder
print("\n🔹 Autoencoder:")

# Split for training 
X_normal = X_scaled[y == 0]
X_train, X_test = train_test_split(X_normal, test_size=0.2, random_state=42)

# Define model
input_dim = X_train.shape[1]
input_layer = Input(shape=(input_dim,))
encoded = Dense(32, activation='relu')(input_layer)
encoded = Dense(16, activation='relu')(encoded)
decoded = Dense(32, activation='relu')(encoded)
decoded = Dense(input_dim, activation='sigmoid')(decoded)

autoencoder = Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_train, X_train, epochs=10, batch_size=256, shuffle=True, validation_split=0.1, verbose=1)

# Reconstruction error on test set
reconstructions = autoencoder.predict(X_scaled)
mse = np.mean(np.power(X_scaled - reconstructions, 2), axis=1)
threshold = np.percentile(mse, 90)
y_pred_ae = (mse > threshold).astype(int)

print(classification_report(y, y_pred_ae))


🔹 Isolation Forest:
              precision    recall  f1-score   support

           0       0.17      0.80      0.29     97278
           1       0.61      0.08      0.13    396743

    accuracy                           0.22    494021
   macro avg       0.39      0.44      0.21    494021
weighted avg       0.52      0.22      0.16    494021


🔹 Autoencoder:
Epoch 1/10
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.1100 - val_loss: 0.0091
Epoch 2/10
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0063 - val_loss: 0.0030
Epoch 3/10
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0027 - val_loss: 0.0020
Epoch 4/10
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0018 - val_loss: 0.0015
Epoch 5/10
[1m274/274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0014 - val_loss: 0.0012
Epoch 6/10
[1m274/274[0m [32m━━━━