In [107]:
import numpy as np
import pandas as pd
import time, os, pickle

from scipy.stats import wilcoxon

from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, cross_validate, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import matthews_corrcoef, make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, HistGradientBoostingClassifier)
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import optuna

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import gridspec
import seaborn as sns
%matplotlib inline

import warnings as w
w.simplefilter(action='ignore',category=FutureWarning)

In [108]:
# Nested cross-validation settings shared by every model in this notebook.
N_OUTER_SPLITS = 10  # outer folds: used only to evaluate the tuned pipeline
N_REPEATS = 10  # how many times the outer stratified k-fold is repeated
N_INNER_SPLITS = 5   # inner folds: used only for hyper-parameter tuning
N_TRIALS  = 10  # Optuna trials run per outer split
N_JOBS = -1  # passed as n_jobs to cross_val_score and to the estimators that accept it
pos_label = 1  # label counted as the positive class when deriving scale_pos_weight

In [109]:
# Read the preprocessed dataset once; 'class' is the target, every other column is a feature.
df = pd.read_csv("./Dataset/preprocessed dataset.csv")

y = df['class']
X = df.drop(columns=['class'])

# **1. Pipeline Builders**
---

In [112]:
def build_dt_pipeline(k, trial=None, fixed_params=None, scale_pos_weight=None):
    """Assemble the scaler -> SelectKBest -> DecisionTreeClassifier pipeline.

    When ``trial`` is given, ``criterion`` and ``max_depth`` are sampled from it;
    otherwise they are read from ``fixed_params`` under the same ``clf__*`` keys.
    ``k`` is the number of features SelectKBest keeps. ``scale_pos_weight`` is
    accepted but not used by this builder.
    """
    if trial is None:
        tree_kwargs = {"criterion": fixed_params["clf__criterion"],
                       "max_depth": fixed_params["clf__max_depth"]}
    else:
        tree_kwargs = {"criterion": trial.suggest_categorical("clf__criterion", ["gini", "entropy"]),
                       "max_depth": trial.suggest_int("clf__max_depth", 2, 8)}

    steps = [('scaler', StandardScaler()),
             ('select', SelectKBest(mutual_info_classif, k=k)),
             ('clf', DecisionTreeClassifier(class_weight="balanced", **tree_kwargs))]
    return Pipeline(steps)
#--------------------------------------
def build_rf_pipeline(k, trial=None, fixed_params=None, scale_pos_weight=None):
    """Assemble the scaler -> SelectKBest -> RandomForestClassifier pipeline.

    ``n_estimators`` and ``max_depth`` are sampled from ``trial`` when it is given,
    otherwise looked up in ``fixed_params`` (keys prefixed with ``clf__``).
    ``k`` is the number of features SelectKBest keeps. ``scale_pos_weight`` is
    accepted but not used by this builder.
    """
    if trial is None:
        forest_kwargs = {"n_estimators": fixed_params["clf__n_estimators"],
                         "max_depth": fixed_params["clf__max_depth"]}
    else:
        forest_kwargs = {"n_estimators": trial.suggest_int("clf__n_estimators", 50, 100),
                         "max_depth": trial.suggest_int("clf__max_depth", 2, 8)}

    forest = RandomForestClassifier(class_weight="balanced", n_jobs=N_JOBS, **forest_kwargs)

    steps = [('scaler', StandardScaler()),
             ('select', SelectKBest(mutual_info_classif, k=k)),
             ('clf', forest)]
    return Pipeline(steps)
#--------------------------------------
def build_et_pipeline(k, trial=None, fixed_params=None, scale_pos_weight=None):
    """Assemble the scaler -> SelectKBest -> ExtraTreesClassifier pipeline.

    ``n_estimators`` and ``max_depth`` are sampled from ``trial`` when it is given,
    otherwise looked up in ``fixed_params`` (keys prefixed with ``clf__``).
    ``k`` is the number of features SelectKBest keeps. ``scale_pos_weight`` is
    accepted but not used by this builder.
    """
    if trial is None:
        trees_kwargs = {"n_estimators": fixed_params["clf__n_estimators"],
                        "max_depth": fixed_params["clf__max_depth"]}
    else:
        trees_kwargs = {"n_estimators": trial.suggest_int("clf__n_estimators", 50, 100),
                        "max_depth": trial.suggest_int("clf__max_depth", 2, 8)}

    extra_trees = ExtraTreesClassifier(class_weight="balanced", n_jobs=N_JOBS, **trees_kwargs)

    steps = [('scaler', StandardScaler()),
             ('select', SelectKBest(mutual_info_classif, k=k)),
             ('clf', extra_trees)]
    return Pipeline(steps)
#--------------------------------------
def build_xgb_pipeline(k, trial=None, fixed_params=None, scale_pos_weight=None):
    """Assemble the scaler -> SelectKBest -> XGBClassifier pipeline.

    ``n_estimators``, ``learning_rate`` and ``max_depth`` are sampled from ``trial``
    when it is given, otherwise looked up in ``fixed_params`` (``clf__*`` keys).
    ``k`` is the number of features SelectKBest keeps and ``scale_pos_weight`` is
    forwarded unchanged to the classifier.
    """
    if trial is None:
        booster_kwargs = {"n_estimators": fixed_params["clf__n_estimators"],
                          "learning_rate": fixed_params["clf__learning_rate"],
                          "max_depth": fixed_params["clf__max_depth"]}
    else:
        booster_kwargs = {"n_estimators": trial.suggest_categorical("clf__n_estimators", [100, 300, 500, 700, 900]),
                          "learning_rate": trial.suggest_float("clf__learning_rate", 0.1, 0.7),
                          "max_depth": trial.suggest_int("clf__max_depth", 2, 8)}

    booster = XGBClassifier(n_jobs=N_JOBS, scale_pos_weight=scale_pos_weight, **booster_kwargs)

    steps = [('scaler', StandardScaler()),
             ('select', SelectKBest(mutual_info_classif, k=k)),
             ('clf', booster)]
    return Pipeline(steps)
#--------------------------------------
def build_ada_pipeline(k, trial=None, fixed_params=None, scale_pos_weight=None):
    """Assemble the scaler -> SelectKBest -> AdaBoostClassifier pipeline.

    ``n_estimators`` and ``learning_rate`` are sampled from ``trial`` when it is
    given, otherwise looked up in ``fixed_params`` (``clf__*`` keys). The boosted
    base learner is a depth-1 decision tree with balanced class weights.
    ``k`` is the number of features SelectKBest keeps. ``scale_pos_weight`` is
    accepted but not used by this builder.
    """
    if trial is None:
        boost_kwargs = {"n_estimators": fixed_params["clf__n_estimators"],
                        "learning_rate": fixed_params["clf__learning_rate"]}
    else:
        boost_kwargs = {"n_estimators": trial.suggest_categorical("clf__n_estimators", [100, 300, 500, 700, 900]),
                        "learning_rate": trial.suggest_float("clf__learning_rate", 0.01, 1.0)}

    stump = DecisionTreeClassifier(max_depth=1, class_weight="balanced")
    booster = AdaBoostClassifier(estimator=stump, **boost_kwargs)

    steps = [('scaler', StandardScaler()),
             ('select', SelectKBest(mutual_info_classif, k=k)),
             ('clf', booster)]
    return Pipeline(steps)
#--------------------------------------
def build_gb_pipeline(k, trial=None, fixed_params=None, scale_pos_weight=None):
    """Assemble the scaler -> SelectKBest -> GradientBoostingClassifier pipeline.

    The five hyper-parameters are sampled from ``trial`` when it is given, otherwise
    looked up in ``fixed_params`` (``clf__*`` keys). The search space still uses the
    loss name "deviance", which is translated to "log_loss" before the estimator is
    built. ``k`` is the number of features SelectKBest keeps. ``scale_pos_weight``
    is accepted but not used by this builder.
    """
    if trial is None:
        gb_kwargs = {"n_estimators": fixed_params["clf__n_estimators"],
                     "learning_rate": fixed_params["clf__learning_rate"],
                     "min_samples_leaf": fixed_params["clf__min_samples_leaf"],
                     "max_depth": fixed_params["clf__max_depth"]}
        loss = fixed_params["clf__loss"]
    else:
        gb_kwargs = {"n_estimators": trial.suggest_categorical("clf__n_estimators", [50, 100, 500, 1000]),
                     "learning_rate": trial.suggest_categorical("clf__learning_rate", [0.001, 0.01, 0.1]),
                     "min_samples_leaf": trial.suggest_categorical("clf__min_samples_leaf", [1, 5, 10]),
                     "max_depth": trial.suggest_int("clf__max_depth", 2, 8)}
        loss = trial.suggest_categorical("clf__loss", ["deviance", "exponential"])

    # sklearn renamed the "deviance" loss to "log_loss"; any other value passes through as-is.
    if loss == "deviance":
        loss = "log_loss"

    booster = GradientBoostingClassifier(loss=loss, **gb_kwargs)

    steps = [('scaler', StandardScaler()),
             ('select', SelectKBest(mutual_info_classif, k=k)),
             ('clf', booster)]
    return Pipeline(steps)
#--------------------------------------
def build_hgb_pipeline(k, trial=None, fixed_params=None, scale_pos_weight=None):
    """Assemble the scaler -> SelectKBest -> HistGradientBoostingClassifier pipeline.

    ``max_iter``, ``learning_rate``, ``min_samples_leaf`` and ``max_depth`` are
    sampled from ``trial`` when it is given, otherwise looked up in ``fixed_params``
    (``clf__*`` keys). ``k`` is the number of features SelectKBest keeps.
    ``scale_pos_weight`` is accepted but not used by this builder.
    """
    if trial is None:
        hgb_kwargs = {"max_iter": fixed_params["clf__max_iter"],
                      "learning_rate": fixed_params["clf__learning_rate"],
                      "min_samples_leaf": fixed_params["clf__min_samples_leaf"],
                      "max_depth": fixed_params["clf__max_depth"]}
    else:
        hgb_kwargs = {"max_iter": trial.suggest_categorical("clf__max_iter", [100, 300, 500, 700, 900]),
                      "learning_rate": trial.suggest_float("clf__learning_rate", 0.1, 0.7),
                      "min_samples_leaf": trial.suggest_int("clf__min_samples_leaf", 5, 25),
                      "max_depth": trial.suggest_int("clf__max_depth", 2, 8)}

    steps = [('scaler', StandardScaler()),
             ('select', SelectKBest(mutual_info_classif, k=k)),
             ('clf', HistGradientBoostingClassifier(**hgb_kwargs))]
    return Pipeline(steps)
#--------------------------------------
def build_cat_pipeline(k, trial=None, fixed_params=None, scale_pos_weight=None):
    """Assemble the scaler -> SelectKBest -> CatBoostClassifier pipeline.

    The hyper-parameters are sampled from ``trial`` when it is given, otherwise
    looked up in ``fixed_params`` (``clf__*`` keys). The value stored under
    ``clf__n_estimators`` is handed to CatBoost as ``iterations``.
    ``k`` is the number of features SelectKBest keeps. ``scale_pos_weight`` is
    accepted but not used by this builder; class balancing is requested through
    ``auto_class_weights="Balanced"`` instead.
    """
    if trial is None:
        cat_kwargs = {"iterations": fixed_params["clf__n_estimators"],
                      "learning_rate": fixed_params["clf__learning_rate"],
                      "depth": fixed_params["clf__depth"],
                      "min_data_in_leaf": fixed_params["clf__min_data_in_leaf"]}
    else:
        # The trial key stays "clf__n_estimators" even though CatBoost calls it iterations.
        cat_kwargs = {"iterations": trial.suggest_categorical("clf__n_estimators", [100, 300, 500, 700, 900]),
                      "learning_rate": trial.suggest_float("clf__learning_rate", 0.1, 0.7),
                      "depth": trial.suggest_int("clf__depth", 2, 8),
                      "min_data_in_leaf": trial.suggest_int("clf__min_data_in_leaf", 1, 10)}

    booster = CatBoostClassifier(auto_class_weights="Balanced", verbose=False, **cat_kwargs)

    steps = [('scaler', StandardScaler()),
             ('select', SelectKBest(mutual_info_classif, k=k)),
             ('clf', booster)]
    return Pipeline(steps)

In [113]:
def choose_k_from_mi(X_train, y_train, rule="mean", random_state=None):
    """Count the features whose mutual information with the target reaches a threshold.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed straight to ``mutual_info_classif``.
    rule : {"mean", "median"}
        Statistic of the per-feature MI scores used as the threshold.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``mutual_info_classif`` so the estimate can be made repeatable.

    Returns
    -------
    int
        Number of features with MI greater than or equal to the threshold.

    Raises
    ------
    ValueError
        If ``rule`` is neither "mean" nor "median". Previously any unknown value
        was silently treated as "median", which hid typos.
    """
    if rule not in ("mean", "median"):
        raise ValueError(f"rule must be 'mean' or 'median', got {rule!r}")

    mi = mutual_info_classif(X_train, y_train, random_state=random_state)
    thr = mi.mean() if rule == "mean" else np.median(mi)
    return int((mi >= thr).sum())

# **2. Nested CV**
---

In [114]:
#  - Outer: RepeatedStratifiedKFold (10 folds x 10 repeats)
#  - Inner: StratifiedKFold for Optuna objective via cross_val_score
# -------------------------
def nested_cv_model(X, y, model_name, build_pipeline_fn, n_outer_splits=N_OUTER_SPLITS, n_repeats=N_REPEATS,
                    n_inner_splits=N_INNER_SPLITS, n_trials=N_TRIALS, random_state=42):
    """Run repeated nested cross-validation for one model family.

    For every outer split the function derives ``scale_pos_weight`` and the number
    of features ``k`` from the outer training part, tunes the pipeline with Optuna
    (objective: mean inner-CV MCC), refits the best configuration on the whole
    outer training part and scores it on the outer test part.

    Parameters
    ----------
    X, y : pandas objects indexed positionally with ``.iloc``.
    model_name : str
        Label used in log lines and stored in the result.
    build_pipeline_fn : callable
        Called as ``fn(k, trial=..., scale_pos_weight=...)`` while tuning and as
        ``fn(k, fixed_params=..., scale_pos_weight=...)`` for the final refit.
    n_outer_splits, n_repeats, n_inner_splits, n_trials : int
        Sizes of the outer CV, the inner CV and the Optuna study.
    random_state : int or None, default 42
        Seed for the outer splitter, the inner splitter and the Optuna sampler.
        With an int, every model evaluated through this function sees the same
        outer partition, so per-split score lists of different models line up
        row by row (the Wilcoxon cell further down pairs them that way).
        ``None`` restores the previous unseeded behaviour.

    Returns
    -------
    dict
        ``model``, the per-split lists ``*_scores`` and their aggregated
        ``*_mean`` values (plus ``mcc_std``).
    """
    outer_cv = RepeatedStratifiedKFold(n_splits=n_outer_splits, n_repeats=n_repeats, random_state=random_state)
    mcc_scorer = make_scorer(matthews_corrcoef)

    outer_mcc = []
    outer_acc = []
    outer_precision = []
    outer_recall = []
    outer_f1 = []
    outer_auc = []
    outer_inf_time = []

    for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X, y), start=1):
        print(f"[{model_name}] Outer split {fold_idx}/{outer_cv.get_n_splits()}")

        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        #-----------------------------------------------------
        # Class-imbalance ratio of this outer training part (consumed by builders that use it).
        pos = (y_train == pos_label).sum()
        neg = (y_train != pos_label).sum()
        fold_scale_pos_weight = neg / pos
        #-----------------------------------------------------
        k = choose_k_from_mi(X_train, y_train, rule="mean")
        #-----------------------------------------------------
        # Distinct but repeatable seed per outer split.
        fold_seed = None if random_state is None else random_state + fold_idx

        # Built once per outer split so that every trial is scored on the same inner
        # folds; re-shuffling per trial would add split noise to the trial comparison.
        inner_cv = StratifiedKFold(n_splits=n_inner_splits, shuffle=True, random_state=fold_seed)

        def objective(trial):
            pipe = build_pipeline_fn(k, trial=trial, scale_pos_weight=fold_scale_pos_weight)
            cv_scores = cross_val_score(pipe, X_train, y_train, cv=inner_cv, scoring=mcc_scorer, n_jobs=N_JOBS)
            return float(cv_scores.mean())

        study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=fold_seed))
        study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
        best_params = study.best_params
        print(f"Best params: {best_params}")
        #-----------------------------------------------------
        # Refit best pipeline on full outer training data
        best_pipe = build_pipeline_fn(k, fixed_params=best_params, scale_pos_weight=fold_scale_pos_weight)
        best_pipe.fit(X_train, y_train)

        # Only the predict call is timed; the result is the average per test row.
        start_time = time.perf_counter()
        y_pred = best_pipe.predict(X_test)
        end_time = time.perf_counter()
        #-----------------------------------------------------
        avg_inf_time = (end_time - start_time) / len(X_test)
        outer_inf_time.append(avg_inf_time)

        # Scores for AUC: probabilities if available, else min-max scaled decision values.
        if hasattr(best_pipe, "predict_proba"):
            y_proba = best_pipe.predict_proba(X_test)[:, 1]
        else:
            try:
                scores_dec = best_pipe.decision_function(X_test)
                scores_dec = (scores_dec - scores_dec.min()) / (scores_dec.max() - scores_dec.min() + 1e-9)
                y_proba = scores_dec
            except Exception:
                # Best effort: without any score output the AUC of this split is NaN.
                y_proba = None
        #-----------------------------------------------------
        # Metrics
        mcc = matthews_corrcoef(y_test, y_pred)
        acc = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)
        auc = roc_auc_score(y_test, y_proba) if y_proba is not None else np.nan

        outer_mcc.append(mcc)
        outer_acc.append(acc)
        outer_precision.append(precision)
        outer_recall.append(recall)
        outer_f1.append(f1)
        outer_auc.append(auc)

        print(f"Split {fold_idx}: MCC={mcc:.4f}, ACC={acc:.4f}, "
              f"Precision={precision:.4f}, Recall={recall:.4f}, F1={f1:.4f}, AUC={auc:.4f}, "
              f"AvgInfTime={avg_inf_time:.6f}s")

    results = {
        "model": model_name,
        "mcc_scores": outer_mcc,
        "acc_scores": outer_acc,
        "precision_scores": outer_precision,
        "recall_scores": outer_recall,
        "f1_scores": outer_f1,
        "auc_scores": outer_auc,
        "inf_time_scores": outer_inf_time,

        "mcc_mean": float(np.mean(outer_mcc)),
        "mcc_std": float(np.std(outer_mcc)),
        "acc_mean": float(np.mean(outer_acc)),
        "precision_mean": float(np.mean(outer_precision)),
        "recall_mean": float(np.mean(outer_recall)),
        "f1_mean": float(np.mean(outer_f1)),
        # nanmean: splits without a usable score output contribute NaN.
        "auc_mean": float(np.nanmean(outer_auc)),
        "inf_time_mean": float(np.mean(outer_inf_time))
        }
    return results

# **3. Main**
---

In [None]:
# Model label -> pipeline builder, evaluated in this order.
models_config = {
    "DT":  build_dt_pipeline,
    "RF":  build_rf_pipeline,
    "ET":  build_et_pipeline,
    "XGB": build_xgb_pipeline,
    "Ada": build_ada_pipeline,
    "GB":  build_gb_pipeline,
    "HGB": build_hgb_pipeline,
    "CAT": build_cat_pipeline,
}

for out_dir in ("./Results", "./Results/all", "./Results/per_model"):
    os.makedirs(out_dir, exist_ok=True)

# Column label used in the CSV files -> key of the per-split list in a result dict.
score_keys = {"MCC": "mcc_scores", "ACC": "acc_scores", "Precision": "precision_scores", "Recall": "recall_scores",
              "F1": "f1_scores", "AUC": "auc_scores", "INF_TIME": "inf_time_scores"}

# Summary column -> key of the aggregated value in a result dict.
summary_keys = {"MCC_mean": "mcc_mean", "MCC_std": "mcc_std", "ACC_mean": "acc_mean", "PREC_mean": "precision_mean",
                "REC_mean": "recall_mean", "F1_mean": "f1_mean", "AUC_mean": "auc_mean", "INF_TIME_mean": "inf_time_mean"}

results_all = []
summary_rows = []
metric_scores = {label: {} for label in score_keys}

for model_name, builder_fn in models_config.items():

    pkl_path = f"./Results/per_model/{model_name}_nested_cv_result.pkl"
    csv_scores_path = f"./Results/per_model/{model_name}_fold_scores.csv"

    # Resume behaviour: run nested CV only when no pickled result exists yet.
    if not os.path.exists(pkl_path):
        print(f"[{model_name}] Running nested CV ...")
        res = nested_cv_model(X=X, y=y, model_name=model_name, build_pipeline_fn=builder_fn,
                              n_outer_splits=N_OUTER_SPLITS, n_repeats=N_REPEATS,
                              n_inner_splits=N_INNER_SPLITS, n_trials=N_TRIALS)

        with open(pkl_path, "wb") as f:
            pickle.dump(res, f)
    else:
        print(f"[{model_name}] Found saved result -> loading: {pkl_path}")
        # The pickle is one this notebook wrote itself; do not point this at untrusted files.
        with open(pkl_path, "rb") as f:
            res = pickle.load(f)

    # Per-model split scores
    per_model_scores = pd.DataFrame({label: res[key] for label, key in score_keys.items()})
    per_model_scores.to_csv(csv_scores_path, index=False)

    # Update in-memory aggregations
    results_all.append(res)
    summary_rows.append({"Model": model_name, **{column: res[key] for column, key in summary_keys.items()}})
    for label, key in score_keys.items():
        metric_scores[label][model_name] = res[key]

    # The summary and the combined files are rewritten after every model, so a
    # partial run already leaves usable CSVs behind.
    summary_df = pd.DataFrame(summary_rows)
    summary_df.to_csv("./Results/summary_performance_metrics.csv", index=False)
    print(summary_df)

    # "All metrics" files: one CSV per metric with one column per model
    for label in ("ACC", "MCC", "Precision", "Recall", "F1", "AUC", "INF_TIME"):
        pd.DataFrame(metric_scores[label]).to_csv(f"./Results/all/{label}_Scores.csv", index=False)

print("Done. Per-model results saved in ./Results/per_model/ and combined scores in ./Results/all/")

In [None]:
import pandas as pd

CSV_PATH = "./Results/summary_performance_metrics.csv"

# Order must match your paper
MODEL_ORDER = ["DT", "RF", "ET", "XGB", "Ada", "GB", "HGB", "CAT"]

# Row label in the LaTeX table -> column of the summary CSV
COL_MAP = {
    "Accuracy": "ACC_mean",
    "Precision": "PREC_mean",
    "Recall": "REC_mean",
    "AUC": "AUC_mean",
    "MCC": "MCC_mean",
    # stored in seconds in the CSV -> shown in microseconds
    "Detection Time (in microseconds)": "INF_TIME_mean",
}

# Which direction is better? These sets decide which cell of a row is set in bold.
HIGHER_BETTER = {"Accuracy", "Precision", "Recall", "AUC", "MCC"}
LOWER_BETTER = {"Detection Time (in microseconds)"}

# Kept under its own name so the dataset frame `df` loaded at the top of the
# notebook is not overwritten by the summary table.
summary_table = pd.read_csv(CSV_PATH)

# Ensure we have expected models and ordering
summary_table["Model"] = summary_table["Model"].astype(str)
summary_table = summary_table.set_index("Model").reindex(MODEL_ORDER)

# reindex() inserts an all-NaN row for every model that is absent from the CSV.
missing = summary_table.index[summary_table.isna().all(axis=1)].tolist()
if missing:
    raise ValueError(f"Missing models in CSV (or all-NaN rows): {missing}")

# Convert time to microseconds
summary_table["INF_TIME_us"] = summary_table["INF_TIME_mean"] * 1_000_000.0

def fmt4(x: float) -> str:
    """Format a score with four decimal places."""
    return f"{x:.4f}"

# Build rows (with bold best)
rows = []
for metric_name, col in COL_MAP.items():
    values = summary_table["INF_TIME_us"] if col == "INF_TIME_mean" else summary_table[col]

    if metric_name in LOWER_BETTER:
        best_model = values.idxmin()
    elif metric_name in HIGHER_BETTER:
        best_model = values.idxmax()
    else:
        raise ValueError(f"No better-direction declared for metric: {metric_name}")

    formatted = []
    for m in MODEL_ORDER:
        cell = fmt4(values.loc[m])
        if m == best_model:
            cell = r"\textbf{" + cell + "}"
        formatted.append(cell)

    rows.append((metric_name, formatted))

# Create LaTeX; the column spec and header follow MODEL_ORDER so they cannot drift apart.
header_cells = " & ".join(r"\textbf{" + m + "}" for m in MODEL_ORDER)

latex = []
latex.append(r"\begin{table}[hbt!]")
latex.append(r"\caption{Tree-based ensembles performance scores.}")
latex.append(r"\label{tab:tree_based_ensemble_performance}")
latex.append(r"\scriptsize")
latex.append(r"\centering")
latex.append(r"\begin{tabular}{l" + "c" * len(MODEL_ORDER) + "}")
latex.append(r"\toprule")
latex.append(r"\textbf{Classifier} & " + header_cells + r" \\")
latex.append(r"\midrule")

for metric_name, vals in rows:
    latex.append(metric_name + " & " + " & ".join(vals) + r" \\")

latex.append(r"\bottomrule")
latex.append(r"\end{tabular}")
latex.append(r"\end{table}")

print("\n".join(latex))


# **4. Statistical Test**
---

In [None]:
# Pairwise Wilcoxon comparison of the per-split MCC scores of all models.
# Output: a win/loss decision matrix and the matrix of p-values, both written to ./Results/StatTest/.
# NOTE(review): the test pairs the scores row by row, so row r of every column must come
# from the same outer split for all models -- confirm the splits were shared across models.
MCC_Scores = pd.read_csv("./Results/all/MCC_Scores.csv")
models = ['DT', 'RF', 'ET', 'XGB', 'Ada', 'GB', 'HGB', 'CAT']

# Force numeric columns; anything unparsable becomes NaN ...
for m in models:
    MCC_Scores[m] = pd.to_numeric(MCC_Scores[m], errors="coerce")

# ... and a row is dropped as soon as one model has no score for it.
MCC_Scores = MCC_Scores.dropna(subset=models, how="any")

alpha = 0.05  # significance level
peers = len(models) - 1  # number of opponents of each model

# decision_matrix[i, j] == 1 means "row model i is significantly better than column model j".
decision_matrix = pd.DataFrame(0, index=models, columns=models, dtype="Int64")
pvals = pd.DataFrame(np.nan, index=models, columns=models)

for i in models:
    xi = MCC_Scores[i].to_numpy()
    mean_i = float(np.mean(xi))

    for j in models:
        if i == j:
            # A model is not compared with itself.
            decision_matrix.loc[i, j] = pd.NA
            continue

        xj = MCC_Scores[j].to_numpy()
        mean_j = float(np.mean(xj))

        diff = xi - xj
        if np.all(diff == 0):
            # Identical score vectors: skip the test and record "no difference".
            stat = 0.0
            p = 1.0
        else:
            stat, p = wilcoxon(xi, xj) #zero_method="zsplit"

        pvals.loc[i, j] = p

        print(f"Statistics={stat:.8f}, p={p:.8f}")
        print(f"The test between {i} and {j}")
        if p > alpha:
            print("Same prediction performance (Fail to reject H0)")
        else:
            print("Different prediction performance (Reject H0)")
        print("-" * 50)

        # 1 only if i significantly better than j (mean-based direction)
        decision_matrix.loc[i, j] = 1 if (p <= alpha and mean_i > mean_j) else 0

# wins/losses
#-----------------------
# Row sums count the opponents a model beats, column sums the opponents that beat it.
wins = decision_matrix[models].sum(axis=1, skipna=True).astype(int)
losses = decision_matrix[models].sum(axis=0, skipna=True).astype(int)

win_pct = (wins / peers * 100).round().astype(int).astype(str) + "%"
loss_pct = (losses / peers * 100).round().astype(int).astype(str) + "%"

# Object dtype so the presentation table can mix integers, "-" and empty strings.
decision_matrix_object = decision_matrix.astype("object")

for m in models:
    decision_matrix_object.loc[m, m] = "-"

# Blank out the zeros so only the wins remain visible.
for i in models:
    for j in models:
        if i != j and decision_matrix_object.loc[i, j] == 0:
            decision_matrix_object.loc[i, j] = ""

decision_matrix_object["Win"] = wins.astype(object)
decision_matrix_object["Win %"] = win_pct.astype(object)

# Append the "Loss" and "Loss %" rows; their Win / Win % cells stay empty.
decision_matrix_object.loc["Loss", models] = losses.values
decision_matrix_object.loc["Loss", "Win"] = ""
decision_matrix_object.loc["Loss", "Win %"] = ""

decision_matrix_object.loc["Loss %", models] = loss_pct.values
decision_matrix_object.loc["Loss %", "Win"] = ""
decision_matrix_object.loc["Loss %", "Win %"] = ""

os.makedirs("./Results/StatTest", exist_ok=True)
decision_matrix_object.to_csv("./Results/StatTest/Wilcoxon_Test_Result.csv", index=True)
pvals.to_csv("./Results/StatTest/Wilcoxon_PValues.csv", index=True)

decision_matrix_object

# **5. Plots**
---

## **MI feature ranking**
---

In [None]:
# Summarise, over repeated stratified CV, how often each feature passes the
# "MI >= mean MI" rule and how large its mutual information is on average.

# savefig does not create missing directories.
os.makedirs("./Results/Figures", exist_ok=True)

# Same split layout as the nested CV, taken from the config constants.
cv = RepeatedStratifiedKFold(n_splits=N_OUTER_SPLITS, n_repeats=N_REPEATS)
n_folds = cv.get_n_splits()
feature_names = np.array(X.columns)
selected_counts = pd.Series(0, index=feature_names, dtype=int)
mi_sum = pd.Series(0.0, index=feature_names, dtype=float)
mi_sq_sum = pd.Series(0.0, index=feature_names, dtype=float)

# -------------------------
# Compute MI per fold on TRAIN split
# -------------------------
for train_idx, _ in cv.split(X, y):
    X_train = X.iloc[train_idx]
    y_train = y.iloc[train_idx]

    mi = mutual_info_classif(X_train, y_train)
    mi_s = pd.Series(mi, index=feature_names)

    # Threshold and ranking use the SAME MI estimate. Estimating MI a second time
    # just to obtain k doubles the cost and, because the estimator is stochastic,
    # would derive k from different scores than the ones being ranked.
    k = int((mi >= mi.mean()).sum())
    selected = mi_s.sort_values(ascending=False).iloc[:k].index

    selected_counts.loc[selected] += 1
    mi_sum += mi_s
    mi_sq_sum += (mi_s ** 2)

# -------------------------
# Aggregate fold-wise stats
# -------------------------
mi_mean = mi_sum / n_folds
# E[x^2] - E[x]^2 can come out marginally negative through rounding; clip so sqrt never yields NaN.
mi_var = ((mi_sq_sum / n_folds) - (mi_mean ** 2)).clip(lower=0.0)
mi_std = np.sqrt(mi_var)

feature_cv_summary = pd.DataFrame({"selected_freq": selected_counts, "selected_pct": (selected_counts / n_folds) * 100.0, "mi_mean": mi_mean,
                                   "mi_std": mi_std}).sort_values(["selected_freq", "mi_mean"], ascending=[False, False])

feature_cv_summary.to_csv("./Results/CV_Selected_Features_Summary.csv", index=True)

# -------------------------
# Plot 1: Selection frequency (%)
# -------------------------
top_n = 30
top_freq = feature_cv_summary.head(top_n)

fig1 = plt.figure(figsize=(12, 6))
plt.xlabel("Features")
plt.ylabel("Selection Frequency (%)")
plt.title("Most Important Transaction Features")

bars1 = plt.bar(top_freq.index, top_freq["selected_pct"].values, width=0.5, color=(0.2, 0.4, 0.6, 0.6))
for bar, val in zip(bars1, top_freq["selected_pct"].values):
    # Label inside the bar, or just above it when the bar is too short to hold text.
    y_pos = bar.get_height() / 2 if bar.get_height() >= 0.05 else bar.get_height() + 0.02
    plt.text(bar.get_x() + bar.get_width() / 2, y_pos, f"{val:.0f}", ha="center", va="center", rotation=90,
             fontsize=9, color="black")

plt.xticks(rotation=45, ha="right")
plt.grid(True, color="grey", linewidth=0.3, linestyle="-.")
plt.tight_layout()

fig1.savefig("./Results/Figures/CV_Feature_Selection_Frequency.pdf", format="pdf", bbox_inches="tight")
plt.show()

# -------------------------
# Plot 2: CV-averaged MI ranking
# -------------------------
top_mi = feature_cv_summary.sort_values("mi_mean", ascending=False).head(top_n)

fig2 = plt.figure(figsize=(12, 6))
plt.xlabel("Features")
plt.ylabel("Averaged MI")
plt.title("Features Ranking Based on MI Value")

bars2 = plt.bar(top_mi.index, top_mi["mi_mean"].values, width=0.5, color=(0.2, 0.4, 0.6, 0.6))

for bar, val in zip(bars2, top_mi["mi_mean"].values):
    y_pos = bar.get_height() / 2 if bar.get_height() >= 0.05 else bar.get_height() + 0.1
    plt.text(bar.get_x() + bar.get_width() / 2, y_pos, f"{val:.4f}", ha="center", va="center", rotation=90,
             fontsize=9, color="black")

plt.xticks(rotation=45, ha="right")
plt.grid(True, color="grey", linewidth=0.3, linestyle="-.")
plt.tight_layout()

fig2.savefig("./Results/Figures/CV_MI_Ranking.pdf", format="pdf", bbox_inches="tight")
plt.show()

feature_cv_summary

## **Precision & Recall bar chart (per model)**
---

In [None]:
# Grouped bar chart of the mean precision and mean recall of every classifier.

# savefig does not create missing directories.
os.makedirs("./Results/Figures", exist_ok=True)

# Coerce every column to numeric (unparsable cells become NaN) without going through a
# loop variable named `df`, which would overwrite the notebook-level dataset frame.
PREC_Scores = pd.read_csv("./Results/all/Precision_Scores.csv").apply(pd.to_numeric, errors="coerce")
REC_Scores  = pd.read_csv("./Results/all/Recall_Scores.csv").apply(pd.to_numeric, errors="coerce")

models = ['DT', 'RF', 'ET', 'XGB', 'Ada', 'GB', 'HGB', 'CAT']

mean_Precision_Scores = [float(PREC_Scores[m].mean()) for m in models]
mean_Recall_Scores    = [float(REC_Scores[m].mean()) for m in models]
Precision_And_Recall_scores = mean_Precision_Scores + mean_Recall_Scores

# Long format: one row per (classifier, metric) pair, precision rows first.
Precision_And_Recall_scores_DF = pd.DataFrame({'Classifier': models * 2, 'Score': Precision_And_Recall_scores,
                                               'Evaluation Metrics': (['Precision'] * len(models)) + (['Recall'] * len(models))})

sns.set(style='ticks')
g = sns.catplot(x='Classifier', y='Score', hue='Evaluation Metrics', data=Precision_And_Recall_scores_DF, kind='bar', height=4,
                aspect=2.5, palette="PuBu")

# Write each bar's value vertically inside the bar; empty or NaN bars are skipped.
ax = g.facet_axis(0, 0)
for p in ax.patches:
    h = p.get_height()
    if (h is None) or np.isnan(h) or h <= 0:
        continue
    ax.text(p.get_x() + p.get_width() / 2, h / 2, f"{h:.4f}", color='black', ha="center", va="center", rotation='vertical', size='small')

plt.title('Precision and Recall for Each Tree-based Classifier', fontsize=12)
plt.grid(True, color="grey", which='major', linewidth=0.3, linestyle="-.")
plt.grid(True, color="grey", which='minor', linestyle=':', linewidth=0.5)
plt.minorticks_on()

g.savefig('./Results/Figures/Precision_and_Recall_barplot.pdf', format="pdf", bbox_inches="tight")
plt.show()

## **AUC & MCC bar chart**
---

In [None]:
# Grouped bar chart of the mean AUC and mean MCC of every classifier.

# savefig does not create missing directories.
os.makedirs("./Results/Figures", exist_ok=True)

# Coerce every column to numeric (unparsable cells become NaN) without going through a
# loop variable named `df`, which would overwrite the notebook-level dataset frame.
AUC_Scores = pd.read_csv("./Results/all/AUC_Scores.csv").apply(pd.to_numeric, errors="coerce")
MCC_Scores = pd.read_csv("./Results/all/MCC_Scores.csv").apply(pd.to_numeric, errors="coerce")

models = ['DT', 'RF', 'ET', 'XGB', 'Ada', 'GB', 'HGB', 'CAT']

mean_AUC_Scores = [float(AUC_Scores[m].mean()) for m in models]
mean_MCC_Scores = [float(MCC_Scores[m].mean()) for m in models]
AUC_And_MCC_scores = mean_AUC_Scores + mean_MCC_Scores

# Long format: one row per (classifier, metric) pair, AUC rows first.
AUC_And_MCC_scores_DF = pd.DataFrame({'Classifier': models * 2, 'Score': AUC_And_MCC_scores,
                                      'Evaluation Metrics': (['AUC'] * len(models)) + (['MCC'] * len(models))})

sns.set(style='ticks')
g = sns.catplot(x='Classifier', y='Score', hue='Evaluation Metrics', data=AUC_And_MCC_scores_DF, kind='bar', height=4,
                aspect=2.5, palette="PuBu")

# Write each bar's value vertically inside the bar; empty, NaN or non-positive bars are skipped.
ax = g.facet_axis(0, 0)
for p in ax.patches:
    h = p.get_height()
    if (h is None) or np.isnan(h) or h <= 0:
        continue
    ax.text(p.get_x() + p.get_width() / 2, h / 2, f"{h:.4f}", color='black', ha="center", va="center", rotation='vertical', size='small')

plt.title('AUC and MCC for Each Tree-based Classifier', fontsize=12)
plt.grid(True, color="grey", which='major', linewidth=0.3, linestyle="-.")
plt.grid(True, color="grey", which='minor', linestyle=':', linewidth=0.5)
plt.minorticks_on()

g.savefig('./Results/Figures/AUC_and_MCC_barplot.pdf', format="pdf", bbox_inches="tight")
plt.show()

## **Boxplots for Precision, Recall, AUC, MCC**
---

In [None]:
# 2x2 grid of horizontal boxplots: per-split Precision, Recall, AUC and MCC of every classifier.

# savefig does not create missing directories.
os.makedirs("./Results/Figures", exist_ok=True)

base_path = "./Results/all/"

# Coerce every column to numeric (unparsable cells become NaN) without going through a
# loop variable named `df`, which would overwrite the notebook-level dataset frame.
Precision_Scores = pd.read_csv(base_path + "Precision_Scores.csv").apply(pd.to_numeric, errors="coerce")
Recall_Scores    = pd.read_csv(base_path + "Recall_Scores.csv").apply(pd.to_numeric, errors="coerce")
AUC_Scores       = pd.read_csv(base_path + "AUC_Scores.csv").apply(pd.to_numeric, errors="coerce")
MCC_Scores       = pd.read_csv(base_path + "MCC_Scores.csv").apply(pd.to_numeric, errors="coerce")

models = ['DT', 'RF', 'ET', 'XGB', 'Ada', 'GB', 'HGB', 'CAT']

n_rows = 2
n_cols = 2

metrics = ['Precision', 'Recall', 'AUC', 'MCC']
metricsData = [Precision_Scores, Recall_Scores, AUC_Scores, MCC_Scores]
n_plots = len(metrics)

# Only one figure is created here. An extra plt.figure() before plt.subplots() is never
# drawn on and shows up as an empty canvas when plt.show() is called.
scale = max(n_cols, n_rows)
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(7 * scale, 5 * scale))

for i, ax in enumerate(axes.flatten()):
    if i < n_plots:
        # Wide-format frame: one box per model column, drawn in the order given by `models`.
        g = sns.boxplot(data=metricsData[i], orient="h", ax=ax, palette="Pastel2", order=models)
        g.set(ylabel='Classifier')
        g.set(xlabel=metrics[i])

        ax.minorticks_on()
        ax.grid(True, color="grey", which='major', linewidth=0.3, linestyle=":")
        ax.grid(True, color="grey", which='minor', linestyle=":", linewidth=0.5)

fig.tight_layout()
fig.savefig('./Results/Figures/Precision_Recall_AUC_MCC_Boxplots.pdf', format="pdf", bbox_inches="tight")
plt.show()

## **Detection time bar chart**
---

In [None]:
# Bar chart of the mean per-sample scoring time of every classifier, in microseconds.

# savefig does not create missing directories.
os.makedirs("./Results/Figures", exist_ok=True)

# Coerce every column to numeric; unparsable cells become NaN.
Time_Scores = pd.read_csv("./Results/all/INF_TIME_Scores.csv").apply(pd.to_numeric, errors="coerce")
models = ['DT', 'RF', 'ET', 'XGB', 'Ada', 'GB', 'HGB', 'CAT']

# The CSV stores seconds per sample; 1e6 converts the mean to microseconds.
Models_Time_Score = pd.DataFrame({"Classifier": models, "Scoring Time (µs)": [Time_Scores[m].mean() * 1e6 for m in models]})

sns.set(style='ticks')
g = sns.catplot(x='Classifier', y='Scoring Time (µs)', data=Models_Time_Score, kind='bar', height=5, aspect=1.2, palette="PuBu")

ax = g.facet_axis(0, 0)

# Disable scientific notation completely
ax.ticklabel_format(style='plain', axis='y', useOffset=False)

# Write each bar's value vertically inside the bar; empty bars are skipped.
for p in ax.patches:
    h = p.get_height()
    if h <= 0:
        continue
    ax.text(p.get_x() + p.get_width() / 2, h / 2, f"{h:.2f}", ha="center", va="center", rotation="vertical", color="black", fontsize=10)

plt.title('Scoring Time for Each Tree-based Classifier', fontsize=12)
plt.ylabel('Scoring Time (µs)')
plt.grid(True, color="grey", which='major', linewidth=0.3, linestyle='-.')
plt.grid(True, color="grey", which='minor', linestyle=':', linewidth=0.5)
plt.minorticks_on()

g.savefig('./Results/Figures/Scoring_Time_Tree_Based_Ensembles.pdf', format="pdf", bbox_inches="tight")
plt.show()

In [None]:
# Bar chart of the mean per-sample scoring time of every classifier, in seconds
# (the unit in which INF_TIME_Scores.csv is written).

# savefig does not create missing directories.
os.makedirs("./Results/Figures", exist_ok=True)

# Coerce every column to numeric; unparsable cells become NaN.
Time_Scores = pd.read_csv("./Results/all/INF_TIME_Scores.csv").apply(pd.to_numeric, errors="coerce")

models = ['DT', 'RF', 'ET', 'XGB', 'Ada', 'GB', 'HGB', 'CAT']

Models_Time_Score = pd.DataFrame({"Classifier": models, "Scoring Time (s)": [Time_Scores[m].mean() for m in models]})

sns.set(style='ticks')
g = sns.catplot(x='Classifier', y='Scoring Time (s)', data=Models_Time_Score, kind='bar', height=5, aspect=1.2, palette="PuBu")
ax = g.facet_axis(0, 0)
ax.ticklabel_format(style='plain', axis='y', useOffset=False)

# Write each bar's value vertically inside the bar; empty or NaN bars are skipped.
for p in ax.patches:
    h = p.get_height()
    if (h is None) or np.isnan(h) or h <= 0:
        continue
    ax.text(p.get_x() + p.get_width() / 2, h * 0.5, f"{h:.6f}", ha="center", va="center", rotation="vertical",
            color="black", size="medium")

plt.title('Scoring Time for Each Tree-based Classifier', fontsize=12)
plt.grid(True, color="grey", which='major', linewidth=0.3, linestyle="-.")
plt.grid(True, color="grey", which='minor', linestyle=':', linewidth=0.5)
plt.minorticks_on()

# Saved under its own file name: the microsecond chart produced by the preceding cell
# uses Scoring_Time_Tree_Based_Ensembles.pdf, and writing there would silently replace it.
g.savefig('./Results/Figures/Scoring_Time_Tree_Based_Ensembles_seconds.pdf', format="pdf", bbox_inches="tight")

plt.show()