In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import shap

def hex_to_int(val):
    try:
        if isinstance(val, str) and val.startswith("0x"):
            return int(val, 16)
        return int(val)
    except:
        return np.nan

def load_and_preprocess(csv_path, feature_cols, max_rows=5000):
    df = pd.read_csv(csv_path)
    if len(df) > max_rows:
        df = df.sample(n=max_rows, random_state=42)

    df = df.dropna(subset=feature_cols)

    # Convert hex strings to integers
    for col in feature_cols:
        df[col] = df[col].apply(hex_to_int)

    df = df.dropna(subset=feature_cols)
    df[feature_cols] = df[feature_cols].astype(np.float32)

    return df, df[feature_cols].values

def train_autoencoder(X_train):
    model = Sequential([
        Dense(2, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(X_train.shape[1], activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='mae')
    model.fit(X_train, X_train, epochs=20, batch_size=32, verbose=0)
    return model

def compute_anomaly_scores(autoencoder, X):
    recon = autoencoder.predict(X, verbose=0)
    return np.mean(np.abs(X - recon), axis=1)

def explain_with_shap(model, X, feature_names, scores, top_n=200):
    top_idx = np.argsort(scores)[-top_n:]
    X_top = X[top_idx]
    scores_top = scores[top_idx]

    explainer = shap.TreeExplainer(model, data=X_top, feature_perturbation="interventional")
    shap_values = explainer.shap_values(X_top, approximate=True)

    explanations = []
    for i, row in enumerate(shap_values):
        # Get top 2 contributing features
        top_features_idx = np.argsort(np.abs(row))[-2:]
        top_features = []
        for idx in reversed(top_features_idx):
            feature = feature_names[idx]
            direction = "high" if row[idx] > 0 else "low"
            top_features.append(f"{direction} {feature}")

        explanation = f"Log flagged due to {top_features[0]} and {top_features[1]} contributing to anomaly score."
        explanations.append(explanation)

    # Build final DataFrame
    explanation_df = pd.DataFrame({
        "explanation": explanations,
        "anomaly_score": scores_top
    })

    return explanation_df, top_idx

# === CONFIG ===
csv_list = [
    ("/content/2good_reqff.csv", ["path_length", "body_length", "badwords_count"], "shap_explanations_goodbad.csv"),
    ("/content/wls_day-02.csv", ["ProcessID", "ParentProcessID", "EventID"], "shap_explanations_network.csv"),
    ("/content/netflow_day-02.csv", ["Duration", "SrcPackets", "DstPackets", "SrcBytes", "DstBytes"], "shap_explanations_host.csv")
]

# === PIPELINE ===
for path, features, out_csv in csv_list:
    print(f"[⚙️] Processing {path}...")
    df, X = load_and_preprocess(path, features)
    scaler = StandardScaler().fit(X)
    X_scaled = scaler.transform(X)

    ae = train_autoencoder(X_scaled)
    scores = compute_anomaly_scores(ae, X_scaled)

    rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_scaled, scores)
    shap_df, top_idx = explain_with_shap(rf, X_scaled, features, scores)

    final_df = pd.concat([df.iloc[top_idx].reset_index(drop=True), shap_df.add_prefix("shap_")], axis=1)
    final_df.to_csv(out_csv, index=False)
    print(f"[✅] Saved {out_csv}")


[⚙️] Processing /content/2good_reqff.csv...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[✅] Saved shap_explanations_goodbad.csv
[⚙️] Processing /content/wls_day-02.csv...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[✅] Saved shap_explanations_network.csv
[⚙️] Processing /content/netflow_day-02.csv...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[✅] Saved shap_explanations_host.csv
