In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

# Step 1: Load the dataset
# The path is relative to the current working directory; pd.read_csv raises
# FileNotFoundError when the file is not found there.
df = pd.read_csv("kddcup_converted.csv")  # Replace with your filename

# Step 2: Preprocessing
# Collapse the raw label into a binary anomaly flag: 0 for rows labelled
# exactly 'normal.', 1 for every other value. The column is kept (not
# dropped) because it becomes the evaluation target y below.
# Vectorized comparison instead of a per-row Python lambda; int64 matches the
# dtype the original .apply() produced.
df['label'] = (df['label'] != 'normal.').astype('int64')

# Encode categorical columns
# Every remaining object-dtype column is replaced by integer codes, with an
# independent encoder per column. 'label' is already numeric at this point,
# so it is not re-encoded.
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

# Feature matrix (everything except the target) and binary target
X = df.drop("label", axis=1)
y = df["label"]

# Standardize
# The scaler is fitted on the full feature matrix and applied to it.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 3a: Isolation Forest
print("🧪 Isolation Forest")
iso_forest = IsolationForest(contamination=0.1, random_state=42)
# Rows the forest marks with -1 are re-coded as anomaly (1) and every other
# row as normal (0), so the predictions use the same 0/1 coding as y.
y_pred_iforest = (iso_forest.fit_predict(X_scaled) == -1).astype(int).tolist()

print("\n📊 Isolation Forest Report:")
print(
    classification_report(
        y,
        y_pred_iforest,
        target_names=["Normal", "Anomaly"],
    )
)

# Step 3b: Autoencoder
print("\n🧪 Autoencoder")

# Only rows labelled normal (y == 0) are used for fitting; 20% of them are
# held out as a validation set.
X_normal = X_scaled[y == 0]
X_train, X_val = train_test_split(X_normal, random_state=42, test_size=0.2)

input_dim = X.shape[1]

# Build Autoencoder
# Symmetric dense stack: input -> 32 -> 16 -> 32 -> input, ReLU on the hidden
# layers and a linear output layer of the same width as the input.
autoencoder = Sequential(
    [Input(shape=(input_dim,))]
    + [Dense(units, activation='relu') for units in (32, 16, 32)]
    + [Dense(input_dim, activation='linear')]
)
autoencoder.compile(loss='mse', optimizer='adam')

# Train autoencoder
# Input and target are the same array: the network learns to reproduce
# the normal rows it is given.
history = autoencoder.fit(
    X_train,
    X_train,
    validation_data=(X_val, X_val),
    batch_size=256,
    epochs=20,
    verbose=1,
)

# Compute reconstruction error
# Per-row mean squared error between each scaled row and its reconstruction,
# computed for every row in the dataset.
reconstructions = autoencoder.predict(X_scaled)
mse = np.mean(np.square(X_scaled - reconstructions), axis=1)

# Threshold for anomaly detection
# Calibrate on the held-out normal rows only: the 95th percentile of their
# reconstruction error. Taking the percentile of the pooled `mse` instead
# would flag exactly 5% of all rows regardless of how many are really
# anomalous, because the cut-off would move with the data being scored.
val_reconstructions = autoencoder.predict(X_val)
val_mse = np.mean(np.square(X_val - val_reconstructions), axis=1)
threshold = np.percentile(val_mse, 95)

# 1 = anomaly (error above threshold), 0 = normal; vectorized comparison.
y_pred_ae = (mse > threshold).astype(int)

print("\n📊 Autoencoder Report:")
print(classification_report(y, y_pred_ae, target_names=["Normal", "Anomaly"]))

# Plot MSE distribution
# Histogram (with KDE overlay) of the per-row reconstruction error, with the
# anomaly threshold drawn as a dashed red vertical line.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(mse, bins=50, kde=True, ax=ax)
ax.axvline(threshold, color='r', linestyle='--', label='Threshold')
ax.set_title('Reconstruction Error')
ax.set_xlabel('MSE')
ax.set_ylabel('Frequency')
ax.legend()
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: 'kddcup_converted.csv'