In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

# --- 1. Load -----------------------------------------------------------------
df = pd.read_csv("/content/kddcup_converted.csv")      # change path as needed
print("Columns in file →", list(df.columns))   # <-- see what you actually have

# --- 2. Identify the label column (if any) -----------------------------------
possible_labels = ["label", "class", "attack", "target"]
label_col = next((c for c in possible_labels if c in df.columns), None)

if label_col:
    print(f"👉 Using '{label_col}' as the ground‑truth label column")
    y = df[label_col].copy()
    # example binarization: 0 = normal, 1 = anomaly
    if y.dtype == "object":
        y = y.apply(lambda x: 0 if x.lower().startswith("normal") else 1)
    df = df.drop(columns=[label_col])
else:
    print("⚠️  No label column found – running in **unsupervised** mode")
    y = None  # we’ll only predict, not score

# --- 3. Encode categoricals ---------------------------------------------------
cat_cols = df.select_dtypes(include="object").columns
encoders = {col: LabelEncoder().fit(df[col]) for col in cat_cols}
for col, enc in encoders.items():
    df[col] = enc.transform(df[col])

# --- 4. Scale -----------------------------------------------------------------
X = df.values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- 5. Isolation Forest ------------------------------------------------------
iso = IsolationForest(contamination=0.1, random_state=42)
pred_if = (iso.fit_predict(X_scaled) == -1).astype(int)

if y is not None:
    print("\nIsolation Forest results")
    print(classification_report(y, pred_if, target_names=["Normal", "Anomaly"]))
else:
    # just show how many points were flagged
    print(f"Isolation Forest flagged {pred_if.sum()} / {len(pred_if)} records as anomalies")

# --- 6. Autoencoder -----------------------------------------------------------
input_dim = X.shape[1]
auto = Sequential([
    Input(shape=(input_dim,)),
    Dense(32, activation="relu"),
    Dense(16, activation="relu"),
    Dense(32, activation="relu"),
    Dense(input_dim, activation="linear"),
])
auto.compile(optimizer="adam", loss="mse")

# train only on data assumed normal
train_mask = (y == 0) if y is not None else np.ones(len(X_scaled), dtype=bool)
X_train, X_val = train_test_split(X_scaled[train_mask], test_size=0.2, random_state=42)

auto.fit(X_train, X_train,
         epochs=20, batch_size=256,
         validation_data=(X_val, X_val), verbose=1)

recons = auto.predict(X_scaled)
mse = np.mean(np.square(X_scaled - recons), axis=1)
thresh = np.percentile(mse, 95)
pred_ae = (mse > thresh).astype(int)

if y is not None:
    print("\nAutoencoder results")
    print(classification_report(y, pred_ae, target_names=["Normal", "Anomaly"]))
else:
    print(f"Autoencoder flagged {pred_ae.sum()} / {len(pred_ae)} records as anomalies "
          f"(threshold={thresh:.4f})")


Columns in file → ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41']
⚠️  No label column found – running in **unsupervised** mode
Isolation Forest flagged 31084 / 311029 records as anomalies
Epoch 1/20
[1m972/972[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 0.5926 - val_loss: 0.3111
Epoch 2/20
[1m972/972[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.3860 - val_loss: 0.2430
Epoch 3/20
[1m972/972[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.1963 - val_loss: 0.1832
Epoch 4/20
[1m972/972[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 0.1542 - val_loss: 0.1554
Epoch 5/20
[1m972/972[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.0996 - val_loss: 0.1375
Epoch 6/20
[1m97