In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import shap
import warnings
import joblib
import pickle
warnings.filterwarnings('ignore', category=UserWarning, module='tqdm')

def hex_to_int(val):
    try:
        if isinstance(val, str) and val.startswith("0x"):
            return int(val, 16)
        return int(val)
    except:
        return np.nan

def load_and_preprocess(csv_path, feature_cols, max_rows=5000):
    try:
        df = pd.read_csv(csv_path)
        print(f"  📄 Loaded {len(df)} rows from {csv_path}")
        
        if len(df) > max_rows:
            df = df.sample(n=max_rows, random_state=42)
            print(f"  📉 Sampled down to {max_rows} rows")

        initial_rows = len(df)
        df = df.dropna(subset=feature_cols)
        print(f"  🧹 Removed {initial_rows - len(df)} rows with missing values")

        for col in feature_cols:
            df[col] = df[col].apply(hex_to_int)

        df = df.dropna(subset=feature_cols)
        df[feature_cols] = df[feature_cols].astype(np.float32)
        
        print(f"  ✅ Final dataset: {len(df)} rows, {len(feature_cols)} features")
        return df, df[feature_cols].values
        
    except FileNotFoundError:
        print(f"  ❌ File not found: {csv_path}")
        return None, None
    except Exception as e:
        print(f"  ❌ Error processing {csv_path}: {str(e)}")
        return None, None

def train_autoencoder(X_train):
    print("  🤖 Training autoencoder...")
    model = Sequential([
        Dense(2, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(X_train.shape[1], activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='mae')
    model.fit(X_train, X_train, epochs=20, batch_size=32, verbose=0)
    print("  ✅ Autoencoder training complete")
    return model

def compute_anomaly_scores(autoencoder, X):
    print("  📊 Computing anomaly scores...")
    recon = autoencoder.predict(X, verbose=0)
    scores = np.mean(np.abs(X - recon), axis=1)
    print(f"  📈 Anomaly scores - Min: {scores.min():.4f}, Max: {scores.max():.4f}, Mean: {scores.mean():.4f}")
    return scores

def explain_with_shap(model, X, feature_names, scores, top_n=200):
    print(f"  🔍 Generating SHAP explanations for top {top_n} anomalies...")
    top_idx = np.argsort(scores)[-top_n:]
    X_top = X[top_idx]
    scores_top = scores[top_idx]
    
    explainer = shap.TreeExplainer(model, data=X_top, feature_perturbation="interventional")
    shap_values = explainer.shap_values(X_top, approximate=True)
    shap_df = pd.DataFrame(shap_values, columns=feature_names)
    shap_df["anomaly_score"] = scores_top
    
    print("  ✅ SHAP explanations generated")
    return shap_df, top_idx, explainer

def save_models(dataset_name, autoencoder, scaler, rf_model, explainer, feature_names):
    """Save all models for a specific dataset"""
    model_dir = f"saved_models/{dataset_name}"
    os.makedirs(model_dir, exist_ok=True)
    
    # Save autoencoder (Keras model)
    autoencoder.save(f"{model_dir}/autoencoder.keras")
    
    # Save scaler and random forest (sklearn models)
    joblib.dump(scaler, f"{model_dir}/scaler.pkl")
    joblib.dump(rf_model, f"{model_dir}/random_forest.pkl")
    
    # Save SHAP explainer and feature names
    with open(f"{model_dir}/shap_explainer.pkl", 'wb') as f:
        pickle.dump(explainer, f)
    
    with open(f"{model_dir}/feature_names.pkl", 'wb') as f:
        pickle.dump(feature_names, f)
    
    print(f"  💾 All models saved to {model_dir}")

# === CONFIG ===
csv_list = [
    ("../datasets/2good_reqff.csv", ["path_length", "body_length", "badwords_count"], "shap_explanations_goodbad.csv", "goodbad"),
    ("../datasets/wls_day-02.csv", ["ProcessID", "ParentProcessID", "EventID"], "shap_explanations_network.csv", "network"),
    ("../datasets/netflow_day-02.csv", ["Duration", "SrcPackets", "DstPackets", "SrcBytes", "DstBytes"], "shap_explanations_host.csv", "host")
]

# Create main models directory
os.makedirs("saved_models", exist_ok=True)

# === PIPELINE ===
print("🚀 Starting Anomaly Detection Pipeline\n")

for i, (path, features, out_csv, model_name) in enumerate(csv_list, 1):
    print(f"[⚙️] Processing Dataset {i}/3: {path}")
    print(f"     Features: {features}")
    
    df, X = load_and_preprocess(path, features)
    if df is None:
        print("     ⏭️  Skipping to next dataset\n")
        continue
    
    print("  📏 Scaling features...")
    scaler = StandardScaler().fit(X)
    X_scaled = scaler.transform(X)
    
    ae = train_autoencoder(X_scaled)
    scores = compute_anomaly_scores(ae, X_scaled)
    
    print("  🌲 Training Random Forest...")
    rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_scaled, scores)
    
    shap_df, top_idx, explainer = explain_with_shap(rf, X_scaled, features, scores)
    
    # Save all models
    save_models(model_name, ae, scaler, rf, explainer, features)
    
    final_df = pd.concat([
        df.iloc[top_idx].reset_index(drop=True), 
        shap_df.add_prefix("shap_")
    ], axis=1)
    
    final_df.to_csv(out_csv, index=False)
    print(f"  💾 Saved results to {out_csv}")
    
    print(f"  📋 Sample Results (Top 5 Anomalies):")
    print(final_df.head().to_string(index=False))
    print(f"\n  📊 Summary Statistics:")
    print(f"     - Total anomalies analyzed: {len(final_df)}")
    print(f"     - Highest anomaly score: {final_df['shap_anomaly_score'].max():.4f}")
    print(f"     - Average anomaly score: {final_df['shap_anomaly_score'].mean():.4f}")
    
    print(f"\n✅ Completed processing {path}\n" + "="*50 + "\n")

print("🎉 Pipeline completed successfully!")
print("📁 All models saved in 'saved_models' directory")


🚀 Starting Anomaly Detection Pipeline

[⚙️] Processing Dataset 1/3: ../datasets/2good_reqff.csv
     Features: ['path_length', 'body_length', 'badwords_count']
  📄 Loaded 287 rows from ../datasets/2good_reqff.csv
  🧹 Removed 0 rows with missing values
  ✅ Final dataset: 287 rows, 3 features
  📏 Scaling features...
  🤖 Training autoencoder...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  ✅ Autoencoder training complete
  📊 Computing anomaly scores...
  📈 Anomaly scores - Min: 0.3580, Max: 7.0644, Mean: 0.6181
  🌲 Training Random Forest...
  🔍 Generating SHAP explanations for top 200 anomalies...
  ✅ SHAP explanations generated
  💾 All models saved to saved_models/goodbad
  💾 Saved results to shap_explanations_goodbad.csv
  📋 Sample Results (Top 5 Anomalies):
method                                path body  single_q  double_q  dashes  braces  spaces  percentages  semicolons  angle_brackets  special_chars  path_length  body_length  badwords_count class  shap_path_length  shap_body_length  shap_badwords_count  shap_anomaly_score
   GET /index.jsp?content=inside_press.htm  NaN         0         0       0       0       0            0           0               0              0         35.0          0.0             0.0  good         -0.268456         -0.016153                  0.0            0.410151
   GET /index.jsp?content=inside_about.htm  NaN         0         0       

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  ✅ Autoencoder training complete
  📊 Computing anomaly scores...
  📈 Anomaly scores - Min: 0.0539, Max: 6.6316, Mean: 0.3011
  🌲 Training Random Forest...
  🔍 Generating SHAP explanations for top 200 anomalies...
  ✅ SHAP explanations generated
  💾 All models saved to saved_models/network
  💾 Saved results to shap_explanations_network.csv
  📋 Sample Results (Top 5 Anomalies):
   UserName  EventID    LogHost   LogonID DomainName ParentProcessName  ParentProcessID    ProcessName  Time  ProcessID LogonTypeDescription Source AuthenticationPackage  LogonType Destination SubjectUserName SubjectLogonID SubjectDomainName Status ServiceName FailureReason  shap_ProcessID  shap_ParentProcessID  shap_EventID  shap_anomaly_score
Comp655648$   4688.0 Comp655648     0x3e7  Domain001          services            704.0   rundll32.exe 86400    21324.0                  NaN    NaN                   NaN        NaN         NaN             NaN            NaN               NaN    NaN         NaN           Na

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


  ✅ Autoencoder training complete
  📊 Computing anomaly scores...
  📈 Anomaly scores - Min: 0.0198, Max: 31.6918, Mean: 0.1710
  🌲 Training Random Forest...
  🔍 Generating SHAP explanations for top 200 anomalies...
  ✅ SHAP explanations generated
  💾 All models saved to saved_models/host
  💾 Saved results to shap_explanations_host.csv
  📋 Sample Results (Top 5 Anomalies):
  Time  Duration  SrcDevice  DstDevice  Protocol   SrcPort   DstPort  SrcPackets  DstPackets  SrcBytes  DstBytes  shap_Duration  shap_SrcPackets  shap_DstPackets  shap_SrcBytes  shap_DstBytes  shap_anomaly_score
121173  689622.0 Comp989948 Comp730289        17       123       123      5224.0         0.0  397024.0       0.0      -0.368661        -0.918774        -0.072914      -0.077712      -0.061812            0.363811
121053  752556.0 Comp124494 Comp253429         6 Port94511       445         0.0      6616.0       0.0  595575.0      -0.337807        -0.868480        -0.179600      -0.048423      -0.063130          

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load CSV
df = pd.read_csv("wls_day-02.csv")  # update with your path if needed
feature_cols = ["ProcessID", "ParentProcessID", "LogonID", "EventID"]

# Convert hex to int
def hex_to_int(val):
    try:
        if isinstance(val, str) and val.startswith("0x"):
            return int(val, 16)
        return int(val)
    except:
        return np.nan

for col in feature_cols:
    df[col] = df[col].apply(hex_to_int)

df.dropna(subset=feature_cols, inplace=True)

# Scale
scaler = StandardScaler()
X = df[feature_cols].astype(np.float32)
X_scaled = scaler.fit_transform(X)

# Autoencoder
model = Sequential([
    Dense(2, activation='relu', input_shape=(X_scaled.shape[1],)),
    Dense(X_scaled.shape[1], activation='sigmoid')
])
model.compile(optimizer='adam', loss='mae')
history = model.fit(X_scaled, X_scaled, epochs=20, batch_size=32, verbose=0)

# Reconstruction error
recon = model.predict(X_scaled, verbose=0)
errors = np.mean(np.abs(X_scaled - recon), axis=1)

# Assume top 5% anomalies
threshold = np.percentile(errors, 45)
preds = (errors > threshold).astype(int)
labels = np.zeros(len(errors))
labels[errors > threshold] = 1

# Metrics
print("Precision:", precision_score(labels, preds))
print("Recall:", recall_score(labels, preds))
print("F1:", f1_score(labels, preds))
print("ROC-AUC:", roc_auc_score(labels, errors))
print("Confusion Matrix:\n", confusion_matrix(labels, preds))

# Loss plot
plt.plot(history.history['loss'])
plt.title("Autoencoder Training Loss")
plt.xlabel("Epoch")
plt.ylabel("MAE Loss")
plt.grid()
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: 'wls_day-02.csv'

In [None]:
from pymongo import MongoClient

uri = "ENTER YOUR KEYclient = MongoClient(uri)

db = client["log_analysis"]
collection = db["shap_explanations"]

# Print 5 documents
for doc in collection.find().limit(5):
    print(doc)


{'_id': ObjectId('6894ffdd874daee6c347819d'), 'path_length': -4829.610679602709, 'body_length': -5071.62602397515, 'badwords_count': -746.8810707360627, 'anomaly_score': 0.07648983, 'explanation': 'Log flagged due to low body_length and low path_length contributing to anomaly score.'}
{'_id': ObjectId('6894ffdd874daee6c347819e'), 'path_length': -4799.939513546723, 'body_length': -5069.499950897789, 'badwords_count': -745.4170540314017, 'anomaly_score': 33.337746, 'explanation': 'Log flagged due to low body_length and low path_length contributing to anomaly score.'}
{'_id': ObjectId('6894ffdd874daee6c347819f'), 'path_length': -4810.467175360215, 'body_length': -5064.038138792648, 'badwords_count': -746.6811775439102, 'anomaly_score': 27.007772, 'explanation': 'Log flagged due to low body_length and low path_length contributing to anomaly score.'}
{'_id': ObjectId('6894ffdd874daee6c34781a0'), 'path_length': -4718.863101373702, 'body_length': -5109.013793914081, 'badwords_count': -745.317