<a href="https://colab.research.google.com/github/sankeawthong/Project-1-Lita-Chatbot/blob/main/%5B20251219%5D%20Regen_figures_from_artifacts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#!/usr/bin/env python3
"""
regen_figures_from_artifacts.py

Regenerate publication figures (Reliability diagram, ROC overlay, PR overlay) from the latest
trained artifacts/results.

Supports:
  (A) CIC-IoMT Option A (uncalibrated vs temperature-scaled vs isotonic) on HELD-OUT TEST
  (B) NF-ToN-IoT in-domain (optional) ROC/PR/reliability from trained artifacts
  (C) Robustness curves (optional): AUROC/AUPR/FPR@95%DR vs epsilon from metrics JSON files

Key design:
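- Calibration never touches Test: isotonic regression is fit here ONLY on the validation-calibration
  subset (Val-cal), and the temperature T is read from the --cic-temp-meta JSON rather than refit.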
- For CIC Option A, this script matches the split sizes in your summary by default:
    Train = provided CIC train file
    Holdout = provided CIC test file -> split 50/50 into Validation and Test
    Validation -> split 50/50 into Val-selection and Val-calibration

Outputs:
  outdir/
    roc_overlay_CIC_OptionA.png
    pr_overlay_CIC_OptionA.png
    reliability_overlay_CIC_OptionA.png
    optionA_split_audit.json
    optionA_plot_metrics.json

Optional NF outputs (if NF inputs provided):
    roc_NF_in_domain.png
    pr_NF_in_domain.png
    reliability_NF_in_domain.png
    nf_plot_metrics.json

Optional robustness outputs (if --metrics-dir provided):
    robust_NF_auroc_vs_eps.png
    robust_NF_aupr_vs_eps.png
    robust_NF_fpr95_vs_eps.png
    robust_parse_summary.json

Usage examples:

1) CIC Option A figures (recommended)
python regen_figures_from_artifacts.py \
  --cic-train-csv CIC_train.csv \
  --cic-test-csv  CIC_test.csv \
  --cic-label-col Label \
  --cic-pipe-joblib artifacts/CIC_OptionA_pipe.joblib \
  --cic-mlp-joblib  artifacts/CIC_OptionA_mlp.joblib \
  --cic-temp-meta   CIC_IoMT__OptionA__Calibrated(temperature)__meta.json \
  --random-state 42 \
  --outdir paper_exports/optionA_figures

2) Add NF in-domain plots
... plus:
  --nf-test-csv NF_test.csv --nf-label-col Label \
  --nf-pipe-joblib artifacts/NF_in_domain_pipe.joblib \
  --nf-mlp-joblib artifacts/NF_in_domain_mlp.joblib

3) Add robustness curves from JSON metrics
... plus:
  --metrics-dir ./results_dir_containing_metrics_jsons
"""

In [None]:
import argparse
import json
import re
from pathlib import Path

import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import (
    roc_curve, roc_auc_score,
    precision_recall_curve, average_precision_score,
    brier_score_loss
)
from sklearn.isotonic import IsotonicRegression

In [None]:
# -----------------------------------------------------------------
# Custom transformer stub for joblib loading (must be in __main__)
# -----------------------------------------------------------------
class SafeNaNDropper(BaseEstimator, TransformerMixin):
    """Drop feature columns that contain no finite value at fit time.

    Non-finite entries (NaN, +inf, -inf) are treated as missing. ``fit``
    records a boolean column mask in ``keep_mask_``; ``transform`` applies the
    same cleaning and then selects the kept columns *by position*.
    """

    @staticmethod
    def _to_clean_float(X):
        """Return ``X`` as a float array with every non-finite entry set to NaN."""
        arr = np.asarray(X, dtype=float)
        return np.where(np.isfinite(arr), arr, np.nan)

    def fit(self, X, y=None):
        """Learn which columns hold at least one finite value.

        Raises:
            ValueError: if every column is entirely NaN after cleaning.
        """
        cleaned = self._to_clean_float(X)
        entirely_missing = np.all(np.isnan(cleaned), axis=0)
        self.keep_mask_ = ~entirely_missing
        if not np.any(self.keep_mask_):
            raise ValueError("All features are NaN after cleaning.")
        return self

    def transform(self, X):
        """Clean ``X`` and return only the columns selected during ``fit``."""
        cleaned = self._to_clean_float(X)
        return cleaned[:, self.keep_mask_]

In [None]:
# -----------------
# Label utilities
# -----------------
def ensure_binary_labels(y):
    """
    Map labels to {0,1} where 1=attack, 0=benign.

    Supports:
      - numeric {0,1}: returned as integers unchanged
      - other numeric: smallest value -> benign, others -> attack (heuristic)
      - strings: an exact benign token ("benign", "normal", "0", "false", ...)
        or any label containing the word "benign"/"normal" -> benign
      - fallback: most frequent label -> benign

    The word match requires that "benign"/"normal" is NOT preceded by another
    letter, so labels such as "abnormal" or "nonbenign" count as attacks,
    while "benign_traffic" or "normal-flow" are still benign.

    Args:
        y: array-like of labels (numeric, boolean or string).

    Returns:
        1-D integer numpy array of 0/1 values with the same length as ``y``.
        Empty input yields an empty integer array.
    """
    y = pd.Series(y)

    # Nothing to map; also avoids idxmax() failing on an empty object Series.
    if y.empty:
        return np.zeros(0, dtype=int)

    if pd.api.types.is_numeric_dtype(y):
        uniq = sorted(y.dropna().unique().tolist())
        if set(uniq) <= {0, 1}:
            return y.astype(int).to_numpy()
        benign_val = min(uniq)
        return (y != benign_val).astype(int).to_numpy()

    y_str = y.astype(str).str.lower().str.strip()
    benign_tokens = {"benign", "normal", "0", "false", "legit", "legitimate", "background"}
    # Negative look-behind: a plain substring test would treat "abnormal" as benign.
    benign_word = r"(?<![a-z])(?:benign|normal)"
    is_benign = y_str.isin(benign_tokens) | y_str.str.contains(benign_word, regex=True, na=False)
    if is_benign.sum() == 0:
        # No recognisable benign label: assume the majority class is benign.
        benign_label = y_str.value_counts().idxmax()
        is_benign = (y_str == benign_label)
    return (~is_benign).astype(int).to_numpy()

In [None]:
# -----------------------------------------------------------
# Split utilities (CIC Option A to match your summary counts)
# -----------------------------------------------------------
def cic_optionA_indices_from_provided_files(y_holdout, random_state=42):
    """
    Derive the Option A index sets from the labels of the provided CIC test
    file, which is treated as the HOLDOUT pool.

    Two stratified 50/50 shuffles are applied, both seeded with ``random_state``:
      1. HOLDOUT    -> Validation / Test
      2. Validation -> Val-selection / Val-calibration

    Returns: val_sel_idx, val_cal_idx, test_idx (all indices relative to holdout array)
    """
    labels = np.asarray(y_holdout).astype(int)

    def _stratified_halves(y_part):
        # A fresh splitter per call so both levels start from the same seed.
        splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=random_state)
        first_half, second_half = next(splitter.split(np.zeros_like(y_part), y_part))
        return first_half, second_half

    val_idx, test_idx = _stratified_halves(labels)

    # The second split returns positions inside the validation subset, so map
    # them back to positions in the holdout array.
    sel_rel, cal_rel = _stratified_halves(labels[val_idx])
    return val_idx[sel_rel], val_idx[cal_rel], test_idx

In [None]:
# --------------------------
# Model inference utilities
# --------------------------
def predict_hybrid(pipe, mlp, X_df):
    """
    Hybrid inference:
      - pipe produces a linear score (decision_function); if the pipeline does
        not expose decision_function, fall back to logit(p_lr) computed from
        predict_proba
      - mlp consumes that score as 1D input and returns p(attack)

    The fallback is taken only when ``decision_function`` is unavailable.
    Errors raised *inside* ``decision_function`` (for example a feature-count
    mismatch) propagate to the caller instead of being silently replaced by
    the predict_proba path.

    Args:
        pipe: fitted model exposing ``decision_function`` or ``predict_proba``.
        mlp: fitted model exposing ``predict_proba`` on an (n, 1) score column.
        X_df: feature table passed to ``pipe`` unchanged.

    Returns:
        1-D float numpy array holding column 1 of ``mlp.predict_proba``.
    """
    if hasattr(pipe, "decision_function"):
        scores = pipe.decision_function(X_df)
    else:
        # Clip so the logit stays finite for saturated probabilities.
        p_lr = np.clip(pipe.predict_proba(X_df)[:, 1], 1e-7, 1 - 1e-7)
        scores = np.log(p_lr / (1.0 - p_lr))

    z = np.asarray(scores).reshape(-1, 1)
    p = mlp.predict_proba(z)[:, 1]
    return np.asarray(p, dtype=float)


def _logit(p):
    p = np.clip(p, 1e-7, 1 - 1e-7)
    return np.log(p / (1.0 - p))


def _sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))


def apply_temperature_scaling(p_uncal, T):
    """Rescale probabilities as sigmoid(logit(p) / T).

    Args:
        p_uncal: uncalibrated probabilities (scalar or array-like).
        T: temperature; must be a finite number greater than zero.

    Returns:
        Temperature-scaled probabilities with the same shape as ``p_uncal``.

    Raises:
        ValueError: if ``T`` is zero, negative or not finite. Such a value
            would otherwise yield NaN or inverted probabilities without any
            error being reported.
    """
    temperature = float(T)
    if not np.isfinite(temperature) or temperature <= 0.0:
        raise ValueError(f"Temperature T must be a finite positive number, got {T!r}.")
    z = _logit(p_uncal)
    return _sigmoid(z / temperature)


def fit_isotonic(p_val, y_val):
    """Fit and return an isotonic calibrator mapping scores ``p_val`` to ``y_val``.

    Inputs outside the fitted range are clipped to it at transform time
    (``out_of_bounds="clip"``).
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    return IsotonicRegression(out_of_bounds="clip").fit(p_val, y_val)

In [None]:
# -------------------------------
# Reliability diagram utilities
# -------------------------------
def reliability_curve(p, y, n_bins=15):
    """Bin predictions into equal-width bins on [0, 1] and summarise each bin.

    Args:
        p: predicted probabilities.
        y: labels, cast to int (0/1 expected).
        n_bins: number of equal-width bins.

    Returns:
        (bin_centers, acc, conf, ece) where ``acc[b]`` is the mean label and
        ``conf[b]`` the mean prediction in bin ``b`` (NaN for empty bins), and
        ``ece`` is the count-weighted mean of ``|acc - conf|`` over non-empty bins.
    """
    probs = np.asarray(p, dtype=float)
    labels = np.asarray(y, dtype=int)

    edges = np.linspace(0.0, 1.0, n_bins + 1)
    # digitize() is 1-based; the clip puts p == 1.0 into the last bin.
    which_bin = np.clip(np.digitize(probs, edges) - 1, 0, n_bins - 1)

    acc = np.full(n_bins, np.nan, dtype=float)
    conf = np.full(n_bins, np.nan, dtype=float)

    total = len(probs)
    ece = 0.0
    for b in range(n_bins):
        members = (which_bin == b)
        size = int(members.sum())
        if size == 0:
            continue
        acc[b] = float(labels[members].mean())
        conf[b] = float(probs[members].mean())
        ece += (size / total) * abs(acc[b] - conf[b])

    centers = (edges[:-1] + edges[1:]) / 2.0
    return centers, acc, conf, float(ece)

In [None]:
# -----------------
#     Plotting
# -----------------
def plot_roc_overlay(y_true, variants, out_png, title):
    """Draw one ROC curve per variant on shared axes and save the figure.

    Args:
        y_true: binary ground-truth labels.
        variants: mapping of legend name -> predicted scores for ``y_true``.
        out_png: destination image path (saved at 300 dpi).
        title: figure title.

    Returns:
        dict mapping each variant name to ``{"auroc": float}``.
    """
    fig, ax = plt.subplots(figsize=(6.2, 5.2))
    summary = {}
    for label, scores in variants.items():
        false_pos, true_pos, _thresholds = roc_curve(y_true, scores)
        area = roc_auc_score(y_true, scores)
        ax.plot(false_pos, true_pos, label=f"{label} (AUROC={area:.6f})", linewidth=1.8)
        summary[label] = {"auroc": float(area)}
    # Chance diagonal for reference.
    ax.plot([0, 1], [0, 1], linestyle="--", linewidth=1.0)
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title(title)
    ax.legend(fontsize=8, frameon=False)
    fig.tight_layout()
    fig.savefig(out_png, dpi=300)
    plt.close(fig)
    return summary


def plot_pr_overlay(y_true, variants, out_png, title):
    """Draw one precision-recall curve per variant and save the figure.

    Args:
        y_true: binary ground-truth labels.
        variants: mapping of legend name -> predicted scores for ``y_true``.
        out_png: destination image path (saved at 300 dpi).
        title: figure title.

    Returns:
        dict mapping each variant name to ``{"aupr": float}`` (average precision).
    """
    fig, ax = plt.subplots(figsize=(6.2, 5.2))
    summary = {}
    for label, scores in variants.items():
        precision, recall, _thresholds = precision_recall_curve(y_true, scores)
        avg_precision = average_precision_score(y_true, scores)
        ax.plot(recall, precision, label=f"{label} (AUPR={avg_precision:.6f})", linewidth=1.8)
        summary[label] = {"aupr": float(avg_precision)}
    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision")
    ax.set_title(title)
    ax.legend(fontsize=8, frameon=False)
    fig.tight_layout()
    fig.savefig(out_png, dpi=300)
    plt.close(fig)
    return summary


def plot_reliability_overlay(y_true, variants, out_png, title, n_bins=15):
    """Draw one reliability curve per variant and save the figure.

    Each curve plots, per non-empty bin, the mean predicted probability
    against the mean label, over a dashed diagonal.

    Args:
        y_true: binary ground-truth labels.
        variants: mapping of legend name -> predicted probabilities.
        out_png: destination image path (saved at 300 dpi).
        title: figure title.
        n_bins: number of equal-width probability bins.

    Returns:
        dict mapping each variant name to ``{"ece": float, "brier": float}``.
    """
    fig, ax = plt.subplots(figsize=(6.2, 5.2))
    # Diagonal reference line (mean prediction equals mean label).
    ax.plot([0, 1], [0, 1], linestyle="--", linewidth=1.0)
    summary = {}
    for label, probs in variants.items():
        _centers, frac_pos, mean_conf, ece = reliability_curve(probs, y_true, n_bins=n_bins)
        # Empty bins are NaN in both arrays; leave them out of the line.
        occupied = ~(np.isnan(frac_pos) | np.isnan(mean_conf))
        ax.plot(mean_conf[occupied], frac_pos[occupied], marker="o", linewidth=1.8,
                label=f"{label} (ECE={ece:.4g})")
        summary[label] = {
            "ece": float(ece),
            "brier": float(brier_score_loss(y_true, probs)),
        }
    ax.set_xlabel("Mean predicted probability")
    ax.set_ylabel("Empirical accuracy")
    ax.set_title(title)
    ax.legend(fontsize=8, frameon=False)
    fig.tight_layout()
    fig.savefig(out_png, dpi=300)
    plt.close(fig)
    return summary

In [None]:
# ----------------------------------------------------
# Robustness curves from metrics JSON files (optional)
# ----------------------------------------------------
def _first_metric(record, *keys):
    """Return the first non-null value among ``keys`` in ``record`` as a float, else None."""
    for key in keys:
        value = record.get(key)
        if value is not None:
            return float(value)
    return None


def parse_robustness_metrics(metrics_dir: Path):
    """Collect per-epsilon robustness metrics from JSON files in ``metrics_dir``.

    Files must be named
    ``NF_ToN_IoT__in_domain__<FGSM|PGD>_eps=<number>__binary_metrics.json``.
    Other files, and files whose epsilon is not a valid number, are skipped.

    Metric aliases, tried in order (a key holding null falls through to the next):
      - auroc: "auroc", "AUROC"
      - aupr:  "aupr", "AUPR"
      - fpr95: "fpr_at_dr95", "FPR@DR=0.95", "fpr95"

    Args:
        metrics_dir: directory to scan, as a ``Path`` or a path string.

    Returns:
        ``{"FGSM": [...], "PGD": [...]}`` where each row is a dict with keys
        ``eps``, ``auroc``, ``aupr``, ``fpr95`` (float or None) and ``file``.
        Rows are sorted by epsilon, then file name, so the order is deterministic.
    """
    pat = re.compile(r"NF_ToN_IoT__in_domain__(FGSM|PGD)_eps=([0-9.]+)__binary_metrics\.json$")
    out = {"FGSM": [], "PGD": []}

    for p in Path(metrics_dir).glob("NF_ToN_IoT__in_domain__*__binary_metrics.json"):
        m = pat.search(p.name)
        if not m:
            continue
        try:
            eps = float(m.group(2))
        except ValueError:
            # The pattern admits strings such as "1.2.3"; skip them rather
            # than abort the whole scan.
            continue
        with open(p, "r", encoding="utf-8") as f:
            d = json.load(f)

        out[m.group(1)].append({
            "eps": eps,
            "auroc": _first_metric(d, "auroc", "AUROC"),
            "aupr": _first_metric(d, "aupr", "AUPR"),
            "fpr95": _first_metric(d, "fpr_at_dr95", "FPR@DR=0.95", "fpr95"),
            "file": p.name,
        })

    for rows in out.values():
        rows.sort(key=lambda row: (row["eps"], row["file"]))
    return out


def plot_robustness_curve(curves, key, out_png, title):
    """Plot metric ``key`` against epsilon, one line per attack, and save it.

    Args:
        curves: mapping attack name -> list of row dicts, each holding "eps"
            and metric values (possibly None).
        key: metric to plot; rows where it is missing or None are skipped.
        out_png: destination image path (saved at 300 dpi).
        title: figure title.

    The figure is saved even when no attack has data; the legend is omitted then.
    """
    fig, ax = plt.subplots(figsize=(6.2, 5.2))
    drew_something = False
    for attack, rows in curves.items():
        usable = [row for row in rows if row.get(key) is not None]
        if not usable:
            continue
        eps_values = [row["eps"] for row in usable]
        metric_values = [row[key] for row in usable]
        ax.plot(eps_values, metric_values, marker="o", linewidth=1.8, label=attack)
        drew_something = True
    ax.set_xlabel("epsilon")
    ax.set_ylabel(key)
    ax.set_title(title)
    if drew_something:
        ax.legend(frameon=False)
    fig.tight_layout()
    fig.savefig(out_png, dpi=300)
    plt.close(fig)


# Arguments used when no explicit command line is available: the script is
# running inside a notebook kernel (e.g. Colab), or was started with no flags.
_NOTEBOOK_DEFAULT_ARGS = (
    "--cic-train-csv", "CIC_IoMT_2024_WiFi_MQTT_train.csv",
    "--cic-test-csv", "CIC_IoMT_2024_WiFi_MQTT_test.csv",
    "--cic-label-col", "label",
    "--cic-pipe-joblib", "CIC_OptionA_pipe.joblib",
    "--cic-mlp-joblib", "CIC_OptionA_mlp.joblib",
    "--cic-temp-meta", "CIC_IoMT__OptionA__Calibrated(temperature)__meta.json",
    "--outdir", "paper_exports/regen_figures",
)

# Input width assumed for the CIC pipeline when it cannot be read from the
# fitted pipeline itself.
_FALLBACK_CIC_INPUT_WIDTH = 53


def _build_arg_parser():
    """Create the command-line parser for the figure regeneration script."""
    ap = argparse.ArgumentParser()

    # CIC inputs
    ap.add_argument("--cic-train-csv", type=str, required=True)
    ap.add_argument("--cic-test-csv", type=str, required=True)
    ap.add_argument("--cic-label-col", type=str, default="Label")
    ap.add_argument("--cic-drop-cols", type=str, nargs="*", default=[])
    ap.add_argument("--cic-pipe-joblib", type=str, required=True)
    ap.add_argument("--cic-mlp-joblib", type=str, required=True)
    ap.add_argument("--cic-temp-meta", type=str, required=True)

    # Optional NF inputs
    ap.add_argument("--nf-test-csv", type=str, default=None)
    ap.add_argument("--nf-label-col", type=str, default="Label")
    ap.add_argument("--nf-drop-cols", type=str, nargs="*", default=[])
    ap.add_argument("--nf-pipe-joblib", type=str, default=None)
    ap.add_argument("--nf-mlp-joblib", type=str, default=None)

    # Optional metrics directory (robustness curves)
    ap.add_argument("--metrics-dir", type=str, default=None)

    ap.add_argument("--random-state", type=int, default=42)
    ap.add_argument("--outdir", type=str, default="paper_exports/regen_figures")
    return ap


def _first_step_keep_mask(pipe):
    """Return the first pipeline step's ``keep_mask_`` as a 1-D bool array, or None."""
    steps = getattr(pipe, "steps", None)
    if not steps:
        return None
    mask = getattr(steps[0][1], "keep_mask_", None)
    if mask is None:
        return None
    return np.asarray(mask, dtype=bool).ravel()


def _pad_leading_columns(X, missing):
    """Prepend ``missing`` all-zero ``dummy_<i>`` columns to DataFrame ``X``."""
    dummy_df = pd.DataFrame(
        np.zeros((len(X), missing)),
        columns=[f"dummy_{i}" for i in range(missing)],
        index=X.index,
    )
    return pd.concat([dummy_df, X], axis=1)


def main(argv=None):
    """Regenerate the CIC Option A figures, plus the optional NF and robustness plots.

    Args:
        argv: optional list of command-line arguments. When omitted, the real
            command line (``sys.argv[1:]``) is used. If the script runs inside
            a notebook kernel, or no command-line arguments were given at all,
            the built-in notebook defaults are used instead.

    Side effects:
        Reads the CSV / joblib / JSON inputs named by the arguments and writes
        PNG figures and JSON summaries into ``--outdir``.

    Raises:
        ValueError: if a configured label column is missing from an input file.
    """
    import sys  # local import: needed only here to read the command line

    ap = _build_arg_parser()

    if argv is None:
        cli_args = sys.argv[1:]
        # In a notebook kernel sys.argv holds the kernel launcher's flags
        # (e.g. "-f kernel.json"), not arguments meant for this script.
        running_in_kernel = "ipykernel" in sys.modules
        if running_in_kernel or not cli_args:
            argv = list(_NOTEBOOK_DEFAULT_ARGS)
        else:
            argv = cli_args
    argv = list(argv)
    print(f"Running with arguments: {argv}")
    args = ap.parse_args(argv)

    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    # -----------------
    # CIC Option A: load and split
    # -----------------
    df_train = pd.read_csv(args.cic_train_csv)
    df_holdout = pd.read_csv(args.cic_test_csv)

    if args.cic_label_col not in df_train.columns or args.cic_label_col not in df_holdout.columns:
        raise ValueError(f"CIC label column '{args.cic_label_col}' must exist in both provided files.")

    y_holdout = ensure_binary_labels(df_holdout[args.cic_label_col].values)

    drop_cic = set(args.cic_drop_cols + [args.cic_label_col])
    X_holdout = df_holdout.drop(columns=[c for c in drop_cic if c in df_holdout.columns])

    # NOTE: joblib.load unpickles arbitrary Python objects; only load
    # artifacts from a trusted source.
    pipe_cic = joblib.load(args.cic_pipe_joblib)
    mlp_cic = joblib.load(args.cic_mlp_joblib)

    # ---------------------------------------------------------------------
    # Handle feature mismatch (likely missing metadata columns).
    # The expected input width is read from the keep mask of the pipeline's
    # first step when it has one, otherwise the historical value is assumed.
    # If the test data is narrower, zero-filled dummy columns are prepended
    # (assuming the missing columns were at the start).
    # ---------------------------------------------------------------------
    keep_mask = _first_step_keep_mask(pipe_cic)
    target_dims = int(keep_mask.shape[0]) if keep_mask is not None else _FALLBACK_CIC_INPUT_WIDTH
    missing = target_dims - X_holdout.shape[1]
    if missing > 0:
        print(f"DEBUG: Padding input with {missing} dummy columns to match pipeline expectation ({target_dims}).")
        if keep_mask is not None and keep_mask[:missing].any():
            # The mask selects columns by position, so these zero columns are
            # passed on to the model as if they were real features.
            print("WARNING: some padded dummy columns are kept by the pipeline's keep mask; "
                  "predictions may not be meaningful. Verify the input column layout.")
        X_holdout = _pad_leading_columns(X_holdout, missing)

    val_sel_idx, val_cal_idx, test_idx = cic_optionA_indices_from_provided_files(
        y_holdout, random_state=args.random_state
    )

    p_test_uncal = predict_hybrid(pipe_cic, mlp_cic, X_holdout.iloc[test_idx])
    y_test = y_holdout[test_idx]

    # Temperature scaling: T comes from the stored meta file, it is not refit here.
    with open(args.cic_temp_meta, "r", encoding="utf-8") as f:
        meta = json.load(f)
    if "T" not in meta:
        print("WARNING: no 'T' entry in the temperature meta file; using T=1.0 (no scaling).")
    T = float(meta.get("T", 1.0))
    p_test_temp = apply_temperature_scaling(p_test_uncal, T=T)

    # Isotonic calibration is fit on the Val-calibration subset only.
    p_val_cal_uncal = predict_hybrid(pipe_cic, mlp_cic, X_holdout.iloc[val_cal_idx])
    y_val_cal = y_holdout[val_cal_idx]
    ir = fit_isotonic(p_val_cal_uncal, y_val_cal)
    p_test_iso = np.asarray(ir.transform(p_test_uncal), dtype=float)

    variants_cic = {
        "Uncalibrated": p_test_uncal,
        "Temp-scaled": p_test_temp,
        "Isotonic": p_test_iso,
    }

    audit = {
        "cic_train_file_n": int(len(df_train)),
        "cic_holdout_file_n": int(len(df_holdout)),
        "n_val_sel": int(len(val_sel_idx)),
        "n_val_cal": int(len(val_cal_idx)),
        "n_test": int(len(test_idx)),
        "random_state": int(args.random_state),
        "temperature_T": float(T),
    }
    with open(outdir / "optionA_split_audit.json", "w", encoding="utf-8") as f:
        json.dump(audit, f, indent=2)

    roc_m = plot_roc_overlay(y_test, variants_cic, outdir / "roc_overlay_CIC_OptionA.png",
                             "CIC-IoMT Option A: ROC Overlay (Held-out Test)")
    pr_m = plot_pr_overlay(y_test, variants_cic, outdir / "pr_overlay_CIC_OptionA.png",
                           "CIC-IoMT Option A: PR Overlay (Held-out Test)")
    rel_m = plot_reliability_overlay(y_test, variants_cic, outdir / "reliability_overlay_CIC_OptionA.png",
                                     "CIC-IoMT Option A: Reliability Diagram (Held-out Test)", n_bins=15)

    # One merged metrics record per variant.
    plot_metrics = {
        k: {**roc_m.get(k, {}), **pr_m.get(k, {}), **rel_m.get(k, {})}
        for k in variants_cic
    }
    with open(outdir / "optionA_plot_metrics.json", "w", encoding="utf-8") as f:
        json.dump(plot_metrics, f, indent=2)

    # -----------------
    # Optional NF in-domain plots
    # -----------------
    nf_inputs = (args.nf_test_csv, args.nf_pipe_joblib, args.nf_mlp_joblib)
    if all(nf_inputs):
        df_nf = pd.read_csv(args.nf_test_csv)
        if args.nf_label_col not in df_nf.columns:
            raise ValueError(f"NF label column '{args.nf_label_col}' not found in NF file.")
        y_nf = ensure_binary_labels(df_nf[args.nf_label_col].values)
        drop_nf = set(args.nf_drop_cols + [args.nf_label_col])
        X_nf = df_nf.drop(columns=[c for c in drop_nf if c in df_nf.columns])

        pipe_nf = joblib.load(args.nf_pipe_joblib)
        mlp_nf = joblib.load(args.nf_mlp_joblib)
        p_nf = predict_hybrid(pipe_nf, mlp_nf, X_nf)

        variants_nf = {"NF in-domain": p_nf}
        nf_roc = plot_roc_overlay(y_nf, variants_nf, outdir / "roc_NF_in_domain.png",
                                  "NF-ToN-IoT In-domain: ROC (Test)")
        nf_pr = plot_pr_overlay(y_nf, variants_nf, outdir / "pr_NF_in_domain.png",
                                "NF-ToN-IoT In-domain: PR (Test)")
        nf_rel = plot_reliability_overlay(y_nf, variants_nf, outdir / "reliability_NF_in_domain.png",
                                          "NF-ToN-IoT In-domain: Reliability (Test)", n_bins=15)

        nf_metrics = {"NF in-domain": {}}
        nf_metrics["NF in-domain"].update(nf_roc["NF in-domain"])
        nf_metrics["NF in-domain"].update(nf_pr["NF in-domain"])
        nf_metrics["NF in-domain"].update(nf_rel["NF in-domain"])
        with open(outdir / "nf_plot_metrics.json", "w", encoding="utf-8") as f:
            json.dump(nf_metrics, f, indent=2)
    elif any(nf_inputs):
        # Previously a partial set of NF inputs was ignored without notice.
        print("WARNING: NF plots skipped; --nf-test-csv, --nf-pipe-joblib and "
              "--nf-mlp-joblib must all be provided.")

    # -----------------
    # Optional robustness curves from JSON metrics
    # -----------------
    if args.metrics_dir:
        curves = parse_robustness_metrics(Path(args.metrics_dir))
        with open(outdir / "robust_parse_summary.json", "w", encoding="utf-8") as f:
            json.dump(curves, f, indent=2)

        plot_robustness_curve(curves, "auroc", outdir / "robust_NF_auroc_vs_eps.png",
                              "NF-ToN-IoT Robustness: AUROC vs epsilon")
        plot_robustness_curve(curves, "aupr", outdir / "robust_NF_aupr_vs_eps.png",
                              "NF-ToN-IoT Robustness: AUPR vs epsilon")
        plot_robustness_curve(curves, "fpr95", outdir / "robust_NF_fpr95_vs_eps.png",
                              "NF-ToN-IoT Robustness: FPR@95%DR vs epsilon")

    print("Done. Outputs written to:", str(outdir))


if __name__ == "__main__":
    main()

In [None]:
import joblib
import pandas as pd
import numpy as np

# Diagnostic cell: compare the input layout recorded in the saved CIC pipeline
# with the columns actually present in the CIC test CSV.
# NOTE(review): joblib.load unpickles arbitrary objects; only load trusted
# artifacts. Presumably SafeNaNDropper must already be defined in this
# session for the load to succeed; confirm.
pipe_cic = joblib.load('CIC_OptionA_pipe.joblib')

# Take the first (name, estimator) pair of the pipeline and report its type.
first_step_name, first_step_obj = pipe_cic.steps[0]
print(f"First step: {first_step_name} -> {type(first_step_obj)}")

# If the first step carries a keep_mask_ attribute, print its shape, how many
# entries are set, and the mask itself as 0/1 values.
if hasattr(first_step_obj, 'keep_mask_'):
    mask = first_step_obj.keep_mask_
    print(f"Keep mask shape: {mask.shape}")
    print(f"Keep mask sum (features kept): {mask.sum()}")
    print(f"Keep mask values: {mask.astype(int)}")
else:
    print("First step does not have keep_mask_ attribute.")

# Read a single row of the test CSV; only the header is needed to list and
# count the columns.
df_check = pd.read_csv('CIC_IoMT_2024_WiFi_MQTT_test.csv', nrows=1)
print(f"Current columns ({len(df_check.columns)}): {df_check.columns.tolist()}")