In [13]:
import pandas as pd

# First look at the raw file: it has no header row, so pandas numbers the columns itself
raw_csv_path = r"C:\Users\rcish\Downloads\archive (2)\kddcup.data_10_percent_corrected.csv"
df = pd.read_csv(raw_csv_path, header=None)
print(df.head())

   0    1     2   3    4     5   6   7   8   9   ...  32   33   34    35   36  \
0   0  tcp  http  SF  181  5450   0   0   0   0  ...   9  1.0  0.0  0.11  0.0   
1   0  tcp  http  SF  239   486   0   0   0   0  ...  19  1.0  0.0  0.05  0.0   
2   0  tcp  http  SF  235  1337   0   0   0   0  ...  29  1.0  0.0  0.03  0.0   
3   0  tcp  http  SF  219  1337   0   0   0   0  ...  39  1.0  0.0  0.03  0.0   
4   0  tcp  http  SF  217  2032   0   0   0   0  ...  49  1.0  0.0  0.02  0.0   

    37   38   39   40       41  
0  0.0  0.0  0.0  0.0  normal.  
1  0.0  0.0  0.0  0.0  normal.  
2  0.0  0.0  0.0  0.0  normal.  
3  0.0  0.0  0.0  0.0  normal.  
4  0.0  0.0  0.0  0.0  normal.  

[5 rows x 42 columns]


In [15]:
import os
# Show the directory the kernel is running in
working_dir = os.getcwd()
print(working_dir)

C:\Users\rcish


In [30]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

#Column names (from kddcup.names), listed in file order; the last one is the class label
col_names = """
    duration protocol_type service flag src_bytes dst_bytes
    land wrong_fragment urgent hot num_failed_logins
    logged_in num_compromised root_shell su_attempted num_root
    num_file_creations num_shells num_access_files num_outbound_cmds
    is_host_login is_guest_login count srv_count serror_rate
    srv_serror_rate rerror_rate srv_rerror_rate same_srv_rate
    diff_srv_rate srv_diff_host_rate dst_host_count dst_host_srv_count
    dst_host_same_srv_rate dst_host_diff_srv_rate
    dst_host_same_src_port_rate dst_host_srv_diff_host_rate
    dst_host_serror_rate dst_host_srv_serror_rate dst_host_rerror_rate
    dst_host_srv_rerror_rate label
""".split()

#Load the header-less dataset using its FULL PATH (update if needed), then name the columns
df = pd.read_csv(
    r"C:\Users\rcish\Downloads\archive (2)\kddcup.data_10_percent_corrected.csv",
    header=None,
)
df.columns = col_names

#Binarise the label: 'normal.' -> 0, every other value -> 1 (anomaly)
df["label"] = df["label"].ne("normal.").astype("int64")

#One-hot encode the three categorical features
categorical_cols = ["protocol_type", "service", "flag"]
df = pd.get_dummies(df, columns=categorical_cols)

#Remove the constant column
df = df.drop(columns="num_outbound_cmds")

#Separate target from features
y = df["label"]
X = df.drop(columns="label")

#Rescale every feature with MinMaxScaler (fit on the full feature matrix)
scaler = MinMaxScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

#Train Isolation Forest (unsupervised: fitted on the features only, labels are not used)
#NOTE(review): contamination=0.2 tells the model to flag ~20% of samples — confirm this matches y.mean()
iso_model = IsolationForest(n_estimators=100, contamination=0.2, random_state=42)
iso_model.fit(X_scaled)

#Predict anomalies: predict() marks outliers as -1; remap so that 1 = anomaly, 0 = normal
y_pred_iso = np.where(iso_model.predict(X_scaled) == -1, 1, 0)

#Continuous anomaly score for ranking metrics.
#score_samples() is lower for more abnormal points, so negate it to get "higher = more anomalous".
iso_scores = -iso_model.score_samples(X_scaled)

#Evaluate
print("Isolation Forest Evaluation")
print(confusion_matrix(y, y_pred_iso))
print(classification_report(y, y_pred_iso))
#ROC AUC must be computed from the continuous scores; passing the hard 0/1 predictions
#collapses the ROC curve to a single operating point and misreports the ranking quality.
print("ROC AUC:", roc_auc_score(y, iso_scores))

In [None]:
from keras.models import Model
from keras.layers import Input, Dense
from keras import regularizers

#Keep only the rows labelled normal (label 0) as training data
X_normal = X_scaled[y == 0]

#Build autoencoder: input -> 32 -> 16 -> 8 -> 16 -> 32 -> input
input_dim = X_normal.shape[1]
input_layer = Input(shape=(input_dim,))

#Encoder: the first hidden layer carries an L1 activity penalty, the rest are plain ReLU layers
encoded = Dense(
    32,
    activation='relu',
    activity_regularizer=regularizers.l1(1e-5),
)(input_layer)
for units in (16, 8):
    encoded = Dense(units, activation='relu')(encoded)

#Decoder: mirror of the encoder, starting from the 8-unit bottleneck
decoded = encoded
for units in (16, 32):
    decoded = Dense(units, activation='relu')(decoded)

#Sigmoid output layer with one unit per input feature
output_layer = Dense(input_dim, activation='sigmoid')(decoded)

autoencoder = Model(inputs=input_layer, outputs=output_layer)
autoencoder.compile(optimizer='adam', loss='mse')

#Train autoencoder to reproduce its own input (input and target are both X_normal)
fit_options = dict(
    epochs=20,
    batch_size=256,
    shuffle=True,
    validation_split=0.1,
    verbose=1,
)
autoencoder.fit(X_normal, X_normal, **fit_options)

In [None]:
#Reconstruct every sample and compute its per-row mean squared reconstruction error
reconstructions = autoencoder.predict(X_scaled)
mse = np.mean(np.square(X_scaled - reconstructions), axis=1)

#Boolean mask of normal rows as a plain ndarray, so indexing the NumPy array `mse`
#does not depend on how a pandas Series is coerced
normal_mask = np.asarray(y) == 0

#Set threshold using 95th percentile of normal MSE
#(by construction about 5% of normal rows fall above it)
threshold = np.percentile(mse[normal_mask], 95)
print("Reconstruction error threshold:", threshold)

#Predict anomalies: 1 where the error exceeds the threshold, else 0 (vectorised)
y_pred_ae = (mse > threshold).astype(int)

#Evaluate
print("📊 Autoencoder Evaluation")
print(confusion_matrix(y, y_pred_ae))
print(classification_report(y, y_pred_ae))
#ROC AUC is a ranking metric: score it with the continuous reconstruction error,
#not the thresholded labels, which would reduce the curve to a single point.
print("ROC AUC:", roc_auc_score(y, mse))

In [None]:
# Reconstruction-error histograms for both classes, with the decision threshold marked
fig, ax = plt.subplots(figsize=(10, 5))
for class_value, colour, class_name in ((0, 'green', 'Normal'), (1, 'red', 'Anomaly')):
    sns.histplot(mse[y == class_value], bins=50, color=colour, label=class_name, ax=ax)
ax.axvline(threshold, color='blue', linestyle='--', label='Threshold')
ax.legend()
ax.set_title("Reconstruction Error Histogram")
ax.set_xlabel("MSE")
ax.set_ylabel("Frequency")
plt.show()