In [29]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import Huber
from tensorflow.keras.callbacks import EarlyStopping
import joblib

In [31]:
# Load data
df = pd.read_csv("RequestAllGood.csv")
if "class" in df.columns:
    df = df.drop(columns=["class"])

In [33]:
# Define features for anomaly detection
FEATURES = [
    "method", "path","body", "url_length", "body_length", "path_entropy",
    "body_entropy", "header_count", "sql_injection_count", "xss_attack_count",
    "command_injection_count", "directory_traversal_count", "csrf_count"
]

In [35]:
# Select features
df = df[FEATURES]

In [37]:
# Handle categorical features, including "body" with empty values
le_method = LabelEncoder()
le_path = LabelEncoder()
le_body = LabelEncoder()

In [39]:
# Encode "method" and "path"
df["method"] = le_method.fit_transform(df["method"].fillna("GET"))  # Default to "GET" for missing
df["path"] = le_path.fit_transform(df["path"].fillna("/"))  # Default to "/" for missing

In [41]:
# Handle "body" (categorical, with empty strings)
df["body"] = df["body"].fillna("")  # Replace NaN or None with empty string
df["body"] = le_body.fit_transform(df["body"])  # Encode empty strings as a valid category

In [43]:
# Combine features (no TF-IDF needed)
X = df[FEATURES].values  # Use only the selected numerical features

In [45]:
# Normalize data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [47]:
# Train-test split
X_train, X_val = train_test_split(X_scaled, test_size=0.2, shuffle=True, random_state=42)

In [49]:
# Build autoencoder
input_dim = X_train.shape[1]  # 12 features
input_layer = Input(shape=(input_dim,))
encoded = Dense(128, activation="swish")(input_layer)
encoded = Dropout(0.3)(encoded)
encoded = Dense(64, activation="swish")(encoded)
decoded = Dense(128, activation="swish")(encoded)
decoded = Dense(input_dim, activation="sigmoid")(decoded)

autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer=Adam(learning_rate=0.00005), loss=Huber(delta=1.0))

In [51]:
# Train
early_stopping = EarlyStopping(monitor="val_loss", patience=15, restore_best_weights=True)
autoencoder.fit(X_train, X_train, validation_data=(X_val, X_val), epochs=500, batch_size=128, callbacks=[early_stopping])

Epoch 1/500
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - loss: 0.3336 - val_loss: 0.3223
Epoch 2/500
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.3288 - val_loss: 0.3187
Epoch 3/500
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.3232 - val_loss: 0.3148
Epoch 4/500
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.3220 - val_loss: 0.3100
Epoch 5/500
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.3139 - val_loss: 0.3043
Epoch 6/500
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.3102 - val_loss: 0.2973
Epoch 7/500
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.3063 - val_loss: 0.2886
Epoch 8/500
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.2913 - val_loss: 0.2782
Epoch 9/500
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x1b561aee600>

In [53]:
# Save model and encoders
autoencoder.save("my_model.keras")
joblib.dump(scaler, "autoencoder_scaler.pkl")
joblib.dump(le_method, "AL_method_encoder.pkl")
joblib.dump(le_path, "AL_path_encoder.pkl")
joblib.dump(le_body, "AL_body_encoder.pkl")  # Save body encoder for

['AL_body_encoder.pkl']

In [55]:
# Find anomaly threshold
reconstructed = autoencoder.predict(X_val)
errors = np.mean(np.abs(X_val - reconstructed), axis=1)
ANOMALY_THRESHOLD = np.percentile(errors, 95)
print("🔥 Best Anomaly Threshold:", ANOMALY_THRESHOLD)
joblib.dump(ANOMALY_THRESHOLD, "anomaly_threshold.pkl")

print("✅ Training Complete! Model & encoders saved.")

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step  
🔥 Best Anomaly Threshold: 0.5676260789062934
✅ Training Complete! Model & encoders saved.
