In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import joblib
from sklearn.preprocessing import StandardScaler

# --- 1. Load and Prepare Data ---
# Load both datasets
good_df = pd.read_csv('/content/2good_reqff.csv')
bad_df = pd.read_csv('/content/2bad_reqff.csv')

# Define the features to be used
feature_cols = ['path_length', 'body_length', 'badwords_count']

# Create the training data using ONLY good requests
X_train_good = good_df[feature_cols].values

# Combine all data for testing later
combined_df = pd.concat([good_df, bad_df], ignore_index=True)
X_all = combined_df[feature_cols].values

# Scale the features. It's crucial to fit the scaler ONLY on the good data.
scaler = StandardScaler()
X_train_good_scaled = scaler.fit_transform(X_train_good)
X_all_scaled = scaler.transform(X_all) # Apply the same scaling to all data

# --- 2. Build and Train the Autoencoder ---
autoencoder = tf.keras.models.Sequential([
  tf.keras.layers.Dense(units=2, activation='relu', input_shape=(X_all_scaled.shape[1],)),
  tf.keras.layers.Dense(units=X_all_scaled.shape[1], activation='sigmoid')
])
autoencoder.compile(optimizer='adam', loss=tf.keras.losses.MeanAbsoluteError())


# Train the model on the scaled GOOD data
autoencoder.fit(X_train_good_scaled, X_train_good_scaled, epochs=20, batch_size=32, shuffle=True, verbose=0)

autoencoder.save("model.h5")
print("Model saved to model.h5")

joblib.dump(scaler, "scaler.pkl")

# --- 3. Generate Anomaly Scores ---
# Get the model's reconstructions of ALL data
reconstructions = autoencoder.predict(X_all_scaled)

# Calculate the Mean Absolute Error between the original and reconstructed data
# This error is our anomaly score
anomaly_scores_ae = np.mean(np.abs(X_all_scaled - reconstructions), axis=1)

# Add the scores to our combined dataframe
combined_df['anomaly_score_autoencoder'] = anomaly_scores_ae

print("Data with Autoencoder Anomaly Scores:")
print(combined_df.head())
print("\nSample of scores from the end of the file (bad requests):")
print(combined_df.tail())

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Model saved to model.h5
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Data with Autoencoder Anomaly Scores:
  method                    path body  single_q  double_q  dashes  braces  \
0    GET                       /  NaN         0         0       0       0   
1    GET             /robots.txt  NaN         0         0       0       0   
2    GET              /style.css  NaN         0         0       0       0   
3    GET        /images/logo.gif  NaN         0         0       0       0   
4    GET  /images/header_pic.jpg  NaN         0         0       0       0   

   spaces  percentages  semicolons  angle_brackets  special_chars  \
0       0            0           0               0              0   
1       0            0           0               0              0   
2       0            0           0               0              0   
3       0            0           0               0              0   
4       0            0           0               0     

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# --- 1. Choose your threshold based on the plot ---
threshold = 1.2

# --- 2. Apply the threshold to get final predictions ---
# If score > threshold, it's an anomaly (1), otherwise it's normal (0)
combined_df['prediction'] = (combined_df['anomaly_score_autoencoder'] > threshold).astype(int)

# Create the true labels for comparison (bad=1, good=0)
combined_df['true_label'] = (combined_df['class'] == 'bad').astype(int)

# --- 3. Calculate Final Metrics ---
# Generate the confusion matrix
tn, fp, fn, tp = confusion_matrix(combined_df['true_label'], combined_df['prediction']).ravel()

print(f"Threshold set to: {threshold}")
print(f"True Negatives (Good logs identified as Good): {tn}")
print(f"False Positives (Good logs flagged as Bad): {fp}")
print(f"False Negatives (Bad logs missed): {fn}")
print(f"True Positives (Bad logs caught): {tp}")

# False Positive Rate (FPR) -> Your project goal is <5%
fpr = fp / (fp + tn)
print(f"\nFalse Positive Rate: {fpr:.2%}")

# Accuracy -> Overall correctness
accuracy = accuracy_score(combined_df['true_label'], combined_df['prediction'])
print(f"Accuracy: {accuracy:.2%}")

# Detection Rate (Recall) -> Percentage of bad logs you successfully caught
detection_rate = tp / (tp + fn)
print(f"Detection Rate (Recall): {detection_rate:.2%}")

Threshold set to: 1.2
True Negatives (Good logs identified as Good): 278
False Positives (Good logs flagged as Bad): 9
False Negatives (Bad logs missed): 1419
True Positives (Bad logs caught): 4091

False Positive Rate: 3.14%
Accuracy: 75.37%
Detection Rate (Recall): 74.25%


In [22]:
AUTOENCODER_PATH = "model.h5"
GOOD_CSV = "2good_reqff.csv"
BAD_CSV = "2bad_reqff.csv"
SCALER_PATH = "scaler.pkl"
scaler = joblib.load(SCALER_PATH)

# Load data

df_good = pd.read_csv(GOOD_CSV)
df_bad = pd.read_csv(BAD_CSV)

df_combined = pd.concat([df_good, df_bad], axis=0).reset_index(drop=True)
feature_cols = ['path_length', 'body_length', 'badwords_count']
df_features = df_combined[feature_cols].copy()
df_features = df_features.dropna()
data = df_features.astype(np.float32).values


# Load trained autoencoder

print("[INFO] Loading autoencoder...")
autoencoder = tf.keras.models.load_model("model.h5")

# Compute reconstruction error per feature

print("[INFO] Computing reconstruction errors...")
reconstructed = autoencoder.predict(data)
reconstruction_errors = np.square(data - reconstructed)  # shape = (n_samples, n_features)
total_error = reconstruction_errors.mean(axis=1)

# === 1. Train surrogate model ===
print("[INFO] Training surrogate model (RandomForest)...")
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(data, total_error)

# Save model
joblib.dump(rf, "surrogate_rf_model.pkl")
print("[✅] Saved model as surrogate_rf_model.pkl")

# === 2. Use only top 200 anomalies ===
top_n = 200
top_indices = np.argsort(total_error)[-top_n:]
X_top = data[top_indices]
errors_top = total_error[top_indices]

# === 3. Run SHAP with TreeExplainer using approximate method ===
print("[INFO] Running fast SHAP explainability...")
explainer = shap.TreeExplainer(rf, data=X_top, feature_perturbation="interventional")
shap_values = explainer.shap_values(X_top, approximate=True)

# === 4. SHAP dataframe ===
shap_df = pd.DataFrame(shap_values, columns=feature_cols)
shap_df["anomaly_score"] = errors_top

# Optional: original data slice for joining
original_top = df_combined.iloc[top_indices].reset_index(drop=True)
final_df = pd.concat([original_top.reset_index(drop=True), shap_df.add_prefix("shap_")], axis=1)

# === 5. Save to CSV ===
final_df.to_csv("shap_explanations_goodbad.csv", index=False)




[INFO] Loading autoencoder...
[INFO] Computing reconstruction errors...
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
[INFO] Training surrogate model (RandomForest)...
[✅] Saved model as surrogate_rf_model.pkl
[INFO] Running fast SHAP explainability...


In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from sklearn.ensemble import RandomForestRegressor
import shap


# Load your CSV (replace path as needed)
df = pd.read_csv("/content/wls_day-02.csv", encoding="ISO-8859-1")

# Select numeric + useful columns
cols = ["EventID", "LogonID", "ParentProcessID", "ProcessID"]
df_selected = df[cols].copy()

# Convert hex-like strings to integers
def hex_to_int(val):
    try:
        if isinstance(val, str) and val.startswith("0x"):
            return int(val, 16)
        return int(val)
    except:
        return np.nan

for col in ["LogonID", "ParentProcessID", "ProcessID"]:
    df_selected[col] = df_selected[col].apply(hex_to_int)

# Drop missing values
df_selected.dropna(inplace=True)

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_selected)

print("[✅] Cleaned + Scaled Shape:", X_scaled.shape)


# Build autoencoder model
autoencoder = tf.keras.models.Sequential([
    tf.keras.layers.Dense(4, activation='relu', input_shape=(X_scaled.shape[1],)),
    tf.keras.layers.Dense(X_scaled.shape[1], activation='linear')
])
autoencoder.compile(optimizer='adam', loss='mae')

# Train the model
autoencoder.fit(X_scaled, X_scaled, epochs=20, batch_size=32, shuffle=True, verbose=0)

# Get reconstructions
reconstructions = autoencoder.predict(X_scaled)
anomaly_scores = np.mean(np.abs(X_scaled - reconstructions), axis=1)

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_scaled, anomaly_scores)



# === 1. TreeExplainer with fast approximation ===
explainer = shap.TreeExplainer(
    rf_model,
    data=X_scaled,
    feature_perturbation="interventional",
    model_output="raw"
)

# === 2. Focus on top 200 anomalies to save time ===
top_anomalies = np.argsort(anomaly_scores)[-200:]  # take worst 200 logs
X_top = X_scaled[top_anomalies]

# Approximate SHAP values (faster)
shap_values = explainer.shap_values(X_top, approximate=True)

# === 3. Prepare DataFrames ===
# SHAP DataFrame
shap_df = pd.DataFrame(shap_values, columns=cols)
shap_df["anomaly_score"] = anomaly_scores[top_anomalies]

# Original data subset
df_top = df_selected.iloc[top_anomalies].reset_index(drop=True)
df_top["anomaly_score"] = anomaly_scores[top_anomalies]

# Final output with original + SHAP
final_df = pd.concat([df_top, shap_df.add_prefix("shap_")], axis=1)

# === 4. Save CSV ===
final_df.to_csv("shap_explanations_top200.csv", index=False)
print("✅ Saved shap_explanations_top200.csv ✅")


[✅] Cleaned + Scaled Shape: (2604, 4)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
✅ Saved shap_explanations_top200.csv ✅


In [23]:
from pymongo import MongoClient

# List of CSV files to insert
csv_files = [
    "/content/shap_explanations_top200.csv",
    "/content/shap_explanations_goodbad.csv"
]

# MongoDB URI
uri = "ENTER YOUR KEYclient = MongoClient(uri)

# Target DB and collection
db = client["log_analysis"]
collection = db["shap_explanations"]

# Insert all files
total_inserted = 0
for csv in csv_files:
    df = pd.read_csv(csv)
    records = df.to_dict(orient="records")
    if records:  # only insert if not empty
        collection.insert_many(records)
        total_inserted += len(records)
        print(f"[✅] Inserted {len(records)} records from {csv}")
    else:
        print(f"[⚠️] Skipped {csv} — empty or invalid")

print(f"\n[🏁 DONE] Inserted total {total_inserted} SHAP logs into MongoDB Atlas!")


[✅] Inserted 200 records from /content/shap_explanations_top200.csv
[✅] Inserted 200 records from /content/shap_explanations_goodbad.csv

[🏁 DONE] Inserted total 400 SHAP logs into MongoDB Atlas!
