In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import drive
import sys
import os
from itertools import combinations

# Base directory used by later cells to read the input CSV and to write the exported figure.
# NOTE(review): this points at /content/ itself, not at a folder inside the mounted
# drive below -- confirm the CSV is really expected there.
PATH = '/content/'
# Mount Google Drive under /content/drive (force_remount=True is passed so the mount
# is presumably refreshed even if one already exists -- see the google.colab docs).
drive.mount("/content/drive", force_remount=True)

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor  # keep if you'll use it later

# --- Load the comparison table (one row per dataset) ---
df = pd.read_csv(f"{PATH}comparison_numeric_small.csv")

print("Original shape:", df.shape)
display(df.head())

# --- Step 1: exclude specific datasets by name ---
# The name column is assumed to be literally called 'Dataset'.
bad_datasets = ["breast_cancer_wisconsin"]
df = df.loc[~df["Dataset"].isin(bad_datasets)]

print("\nAfter removing specific datasets:", bad_datasets)
print("Shape:", df.shape)

# --- Step 2: metric columns are the ones carrying a _DPG / _DiCE suffix ---
suffix_dpg = "_DPG"
suffix_baseline = "_DiCE"

metric_cols = [c for c in df.columns if c.endswith((suffix_dpg, suffix_baseline))]
print("\nMetric columns used to check NaNs:", metric_cols, sep="\n")

# --- Step 3: per-row count of missing metric values (inspection only) ---
nan_counts = df.loc[:, metric_cols].isnull().sum(axis="columns")
print("\nNaN counts per dataset row (before dropping):")
display(df[["Dataset"]].assign(nan_count_metrics=nan_counts))

# --- Step 4: keep only rows with a complete set of metric values ---
df_clean = df.dropna(axis="index", how="any", subset=metric_cols)

print("\nAfter dropping rows with NaNs in metric columns:")
print("Shape:", df_clean.shape)

# Datasets that survived the NaN filter (listed before the second exclusion below).
print("\nRemaining datasets:", df_clean["Dataset"].tolist(), sep="\n")

# --- Step 5: second round of name-based exclusions ---
bad_datasets = ["abalone_19", "wheat-seeds", "heart_disease_uci"]
df_clean = df_clean.loc[~df_clean["Dataset"].isin(bad_datasets)]

# Later cells (heatmap, statistics) read the cleaned table through `df`.
df = df_clean


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- 0) Use dataset name as index (adapt this to your actual column name) ---
# Try some common possibilities; if none exist, we keep the current index.
possible_name_cols = ["dataset", "Dataset", "data", "Data", "name", "Name"]
name_col = next((c for c in possible_name_cols if c in df.columns), None)

if name_col is not None:
    df = df.set_index(name_col)
    print(f"Using '{name_col}' as dataset index.")
elif df.index.name in possible_name_cols:
    # The cell was already executed once: the name column has become the index,
    # so it is no longer among the columns. Do not misreport it as missing.
    name_col = df.index.name
    print(f"'{name_col}' is already the dataset index.")
else:
    print("No dataset-name column found; using numeric index as dataset ID.")

print("Index (datasets):", df.index.tolist())

# --- 1) Detect metrics (base names) present for both DPG and DiCE ---
suffix_dpg = "_DPG"
suffix_baseline = "_DiCE"  # use the exact suffix in your CSV

cols_dpg = [c for c in df.columns if c.endswith(suffix_dpg)]
cols_base = [c for c in df.columns if c.endswith(suffix_baseline)]

# Strip only the trailing suffix. str.replace() would also remove the suffix text
# from the middle of a column name, producing a base name whose
# `base + suffix` column does not exist.
metrics = sorted(
    {c[: -len(suffix_dpg)] for c in cols_dpg}
    & {c[: -len(suffix_baseline)] for c in cols_base}
)

print("Metrics used in heatmap:", metrics)

# --- 2) Build matrix of differences (DPG - DiCE), datasets x metrics ---
diff_mat = pd.DataFrame(index=df.index)

for metric in metrics:
    col_dpg = metric + suffix_dpg
    col_base = metric + suffix_baseline
    diff_mat[metric] = df[col_dpg] - df[col_base]

display(diff_mat)

# --- 3) Column-wise normalization to [-1, 1] by max absolute value per metric ---
diff_mat_norm = diff_mat.copy()

for col in diff_mat_norm.columns:
    # Series.max() skips NaN and returns NaN for an all-NaN column without the
    # RuntimeWarning that np.nanmax raises in that case.
    max_abs = diff_mat_norm[col].abs().max()
    if max_abs > 0:
        diff_mat_norm[col] = diff_mat_norm[col] / max_abs
    else:
        # No usable scale (all zeros or all NaN): the whole column is set to 0.
        diff_mat_norm[col] = 0.0

display(diff_mat_norm)



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# diff_mat and diff_mat_norm must already be defined:
#   diff_mat:  original differences (DPG-CF - DiCE) per dataset x metric
#   diff_mat_norm: same shape, column-wise normalised to [-1, 1]

# --- 1) Internal metric -> pretty label ---
metric_name_map = {
    "plausibility_nbr_cf":   "Implausibility",
    "count_diversity_all":   "Diversity",
    "avg_nbr_changes":       "Sparsity",
    "accuracy_knn_sklearn":  "Discriminative Power",
    "runtime":               "Runtime",
    "distance_mh":           "Distance",
    "perc_valid_cf_all":     "Validity",
    "perc_actionable_cf_all":"Actionability",
}

# --- 2) Internal metric -> goal arrow (↑ = higher is better, ↓ = lower is better) ---
metric_goal_map = {
    "plausibility_nbr_cf":   "↓",  # shown as "Implausibility", so lower is better
    "count_diversity_all":   "↑",
    "avg_nbr_changes":       "↓",
    "accuracy_knn_sklearn":  "↑",
    "runtime":               "↓",
    "distance_mh":           "↓",
    "perc_valid_cf_all":     "↑",
    "perc_actionable_cf_all":"↑",
}

# --- 3) Keep only metrics present in diff_mat and sort alphabetically by pretty label ---
metrics_present = [m for m in metric_name_map if m in diff_mat.columns]

# sort by pretty label (Actionability, Distance, Diversity, ... )
base_metrics_sorted = sorted(metrics_present, key=lambda m: metric_name_map[m])

# --- 4) Reorder matrices ---
diff_mat_plot = diff_mat[base_metrics_sorted]
diff_mat_norm_plot = diff_mat_norm[base_metrics_sorted]

# --- 5) Build x-axis labels with arrows, e.g. "Distance ↓" ---
x_labels = [
    f"{metric_name_map[m]} {metric_goal_map[m]}"
    for m in base_metrics_sorted
]

# --- 6) Plot heatmap ---
# Colours come from the normalised matrix; the cell annotations show the raw differences.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    diff_mat_norm_plot,
    annot=diff_mat_plot.round(2),  # show original DPG-CF - DiCE diffs
    fmt=".2f",
    cmap="coolwarm",
    vmin=-1, vmax=1,
    center=0,
    linewidths=0.5,
    linecolor="gray",
    cbar_kws={"label": "Column-wise scaled difference"},
    # Hand the labels to seaborn directly: with the default xticklabels="auto"
    # seaborn may thin out the tick labels, and overwriting them afterwards with a
    # full-length list would then not match the number of ticks.
    xticklabels=x_labels,
    ax=ax,
)

ax.set_xlabel("Metric (goal)", fontsize=12)
ax.set_ylabel("Dataset", fontsize=12)

# Restyle the existing tick labels in place instead of replacing them.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", fontsize=10)
plt.setp(ax.get_yticklabels(), fontsize=10)

fig.tight_layout()

# --- 7) Save as PDF for LaTeX ---
fig.savefig(PATH+"heatmap_dpgcf_dice_metrics.pdf", format="pdf", bbox_inches="tight")

plt.show()


In [None]:
# Quick sanity check of the working table: column names, then the first rows.
print("Columns:", list(df.columns))
df.head(5)


In [None]:
!pip install scipy pandas

import pandas as pd
import numpy as np
from scipy.stats import wilcoxon

In [None]:
# === 2. Detect metric pairs with suffixes ===
suffix_dpg = "_DPG"
suffix_baseline = "_DiCE"   # NOTE: exact capitalization from your CSV

cols_dpg = [c for c in df.columns if c.endswith(suffix_dpg)]
cols_base = [c for c in df.columns if c.endswith(suffix_baseline)]

# Base metric names (before suffix), restricted to metrics available for both methods.
# Only the trailing suffix is cut off: str.replace() would also delete the suffix
# text from the middle of a column name, so `base + suffix` would no longer match
# an existing column.
metrics = sorted(
    {c[: -len(suffix_dpg)] for c in cols_dpg}
    & {c[: -len(suffix_baseline)] for c in cols_base}
)

print("Detected metrics:", metrics)

# === 3. Holm–Bonferroni correction ===
def holm_bonferroni(p_vals):
    """
    Holm step-down adjustment of a family of p-values.

    p_vals: dict {metric_name: raw_p}
    returns: dict {metric_name: corrected_p}, with keys in the same order as p_vals

    The p-values are ranked in ascending order; the i-th smallest (1-based) is
    multiplied by (m - i + 1), capped at 1.0, and forced to be at least as large
    as the previous adjusted value so the result is monotone.

    NaN p-values (tests that could not be computed) are excluded from the
    family: they do not count towards m, cannot disturb the sort order, and are
    returned as NaN.
    """
    import math

    testable = sorted(
        ((name, p) for name, p in p_vals.items() if not math.isnan(p)),
        key=lambda item: item[1],
    )
    m = len(testable)

    adjusted = {}
    running_max = 0.0
    for rank, (name, p) in enumerate(testable):
        # rank is 0-based, so the Holm multiplier (m - i + 1) becomes (m - rank).
        step = min(1.0, (m - rank) * p)
        running_max = max(running_max, step)
        adjusted[name] = running_max

    # Restore the caller's key order; untestable entries stay NaN.
    return {name: adjusted.get(name, float("nan")) for name in p_vals}

# === 4. Run Wilcoxon per metric ===
# For every metric, compare the paired per-dataset values (DPG vs DiCE) and collect
# one summary record per metric in `results`; testable raw p-values go to `raw_pvals`.
results = []
raw_pvals = {}

for metric in metrics:
    col_dpg = metric + suffix_dpg
    col_base = metric + suffix_baseline

    x = df[col_dpg]
    y = df[col_base]

    # Drop NaN pairs (e.g., when DiCE fails on some dataset)
    mask = ~(x.isna() | y.isna())
    x_clean = x[mask].values
    y_clean = y[mask].values

    if len(x_clean) < 1:
        print(f"Skipping metric {metric}: no valid pairs (all NaN).")
        continue

    diff = x_clean - y_clean
    median_diff = np.median(diff)

    # Simple sign-based effect size: (n_pos - n_neg) / (n_pos + n_neg)
    non_zero = diff[diff != 0]
    n_pos = np.sum(non_zero > 0)
    n_neg = np.sum(non_zero < 0)

    if (n_pos + n_neg) > 0:
        # Wilcoxon signed-rank on paired differences (DPG - baseline).
        # zero_method='wilcox' discards zero differences before ranking.
        stat, p = wilcoxon(
            x_clean, y_clean,
            zero_method='wilcox',
            alternative='two-sided'
        )
        effect_sign = (n_pos - n_neg) / (n_pos + n_neg)
        raw_pvals[metric] = p
    else:
        # Every paired difference is zero, so nothing is left to rank once zeros are
        # discarded. Depending on the SciPy version the call would raise or yield
        # NaN; report the metric as untestable instead and keep it out of the
        # multiple-testing family.
        print(f"Metric {metric}: all paired differences are zero; Wilcoxon test not applicable.")
        stat, p = np.nan, np.nan
        effect_sign = 0.0

    results.append({
        "metric": metric,
        "n_datasets_used": len(x_clean),
        "median_diff(DPG - DiCE)": median_diff,
        "wilcoxon_stat": stat,
        "p_value_raw": p,
        "effect_size_sign": effect_sign
    })

# If no metrics survived, stop gracefully
if not results:
    print("No metrics detected or no valid pairs to test.")
else:
    # === 5. Apply Holm–Bonferroni correction across metrics ===
    corrected = holm_bonferroni(raw_pvals)
    for r in results:
        # Untestable metrics are absent from `corrected` and stay NaN.
        r["p_value_holm"] = corrected.get(r["metric"], np.nan)

    # === 6. Show results as a table ===
    # `res_df` keeps full precision because later cells apply significance
    # thresholds to it; rounding is applied only to the copy that is displayed.
    res_df = pd.DataFrame(results).sort_values("p_value_raw")

    rounded_cols = ["median_diff(DPG - DiCE)", "p_value_raw", "p_value_holm", "effect_size_sign"]
    res_df_display = res_df.round({col: 4 for col in rounded_cols})

    display(res_df_display)

    # Optional: get LaTeX for the paper
    print(res_df_display.to_latex(index=False))



In [None]:
# === Generate LaTeX table for paper ===
# Turns `res_df` (one row per metric, from the Wilcoxon cell) into a booktabs table.

# Metric name mapping (internal -> pretty)
metric_name_map = {
    "plausibility_nbr_cf":   "Implausibility",
    "count_diversity_all":   "Diversity",
    "avg_nbr_changes":       "Sparsity",
    "accuracy_knn_sklearn":  "Discriminative Power",
    "runtime":               "Runtime",
    "distance_mh":           "Distance",
    "perc_valid_cf_all":     "Validity",
    "perc_actionable_cf_all":"Actionability",
}

# Metric goal mapping (internal -> (LaTeX arrow command, desirable direction))
metric_goal_map = {
    "plausibility_nbr_cf":   ("\\downarrow", "lower"),
    "count_diversity_all":   ("\\uparrow", "higher"),
    "avg_nbr_changes":       ("\\downarrow", "lower"),
    "accuracy_knn_sklearn":  ("\\uparrow", "higher"),
    "runtime":               ("\\downarrow", "lower"),
    "distance_mh":           ("\\downarrow", "lower"),
    "perc_valid_cf_all":     ("\\uparrow", "higher"),
    "perc_actionable_cf_all":("\\uparrow", "higher"),
}

# Sort by the name that is actually printed (pretty label, or the raw metric name
# when the metric has no entry in metric_name_map).
res_df_sorted = res_df.copy()
res_df_sorted["pretty_name"] = (
    res_df_sorted["metric"].map(metric_name_map).fillna(res_df_sorted["metric"])
)
res_df_sorted = res_df_sorted.sort_values("pretty_name")

# Get the number of datasets used (should be same for all metrics)
n_datasets = int(res_df_sorted.iloc[0]["n_datasets_used"])

# Build LaTeX table rows
latex_rows = []

for _, row in res_df_sorted.iterrows():
    metric = row["metric"]
    # Raw metric names contain underscores, which must be escaped in LaTeX text.
    pretty_name = metric_name_map.get(metric, metric).replace("_", "\\_")
    goal_symbol, goal_direction = metric_goal_map.get(metric, ("", ""))

    median_diff = row["median_diff(DPG - DiCE)"]
    p_value = row["p_value_raw"]

    # NOTE(review): significance here is judged on the raw p-value, while the bar
    # chart cell uses the Holm-corrected one -- confirm which is intended for the paper.
    is_significant = p_value < 0.05

    # Determine which method is better based on goal and median difference
    if goal_direction not in ("higher", "lower"):
        # Metric without a declared goal: do not guess a winner.
        best_method = "--"
    elif abs(median_diff) < 1e-6:  # essentially zero
        best_method = "Both"
    elif goal_direction == "higher":
        # Higher is better, so a positive median difference favours DPG-CF
        best_method = "DPG-CF" if median_diff > 0 else "DiCE"
    else:
        # Lower is better, so a negative median difference favours DPG-CF
        best_method = "DPG-CF" if median_diff < 0 else "DiCE"

    # Bold only a real winner that is statistically significant
    if is_significant and best_method not in ("Both", "--"):
        best_method = f"\\textbf{{{best_method}}}"

    # An empty arrow would render as "$$" (display-math delimiter) and break the table.
    goal_cell = f"${goal_symbol}$" if goal_symbol else "--"

    # Format the row exactly as in the example (without n column)
    latex_row = f"    \\texttt{{{pretty_name}}}  & {goal_cell} & {median_diff:6.2f}  & {p_value:.2f} & {best_method}            \\\\"
    latex_rows.append(latex_row)

# Assemble full LaTeX table
latex_table = f"""\\begin{{table}}[b!]
  \\centering
  \\caption{{Paired Wilcoxon signed-rank comparison between DPG-CF and DiCE across
  {n_datasets} datasets. $\\Delta = \\text{{DPG-CF}} - \\text{{DiCE}}$. The ``Goal'' column indicates
  whether higher ($\\uparrow$) or lower ($\\downarrow$) values are desirable. The
  ``Best'' column reports the method with better median performance according to
  this goal; method names in bold denote statistically significant differences
  (Wilcoxon test, $p < 0.05$). All metrics computed using $n = {n_datasets}$ datasets.}}
  \\label{{tab:wilcoxon_results}}
  \\begin{{tabular}}{{lcccl}}
    \\toprule
    Metric & Goal & median $\\Delta$ & $p$-value & Best \\\\
    \\midrule
"""

latex_table += "".join(row + "\n" for row in latex_rows)

latex_table += r"""    \bottomrule
  \end{tabular}
\end{table}"""

print(latex_table)

In [None]:
import matplotlib.pyplot as plt

# Long-form table of differences: one record per (metric, dataset) pair,
# holding the DPG value minus the DiCE value.
rows = []
for metric in metrics:
    col_dpg, col_base = metric + suffix_dpg, metric + suffix_baseline
    if not {col_dpg, col_base}.issubset(df.columns):
        continue  # metric is not available for both methods

    diffs = df[col_dpg] - df[col_base]
    rows.extend(
        {"metric": metric, "dataset": dataset_name, "diff": dval}
        for dataset_name, dval in zip(df.index, diffs)
    )

diff_df = pd.DataFrame(rows)

fig, ax = plt.subplots(figsize=(8, 4))

# Metrics are laid out along the x-axis in the same order as the stats table;
# each metric gets one vertical column of points.
metrics_order = res_df["metric"].tolist()
positions = {m: i for i, m in enumerate(metrics_order)}

for m in metrics_order:
    subset = diff_df[diff_df["metric"] == m]
    x = np.full(len(subset), positions[m])
    ax.scatter(x, subset["diff"], alpha=0.7)

# Zero line: points above it mean DPG > DiCE on that dataset.
ax.axhline(0, color="black", linewidth=1)
ax.set_xticks(range(len(metrics_order)))
ax.set_xticklabels(metrics_order, rotation=45, ha="right")
ax.set_ylabel("Difference per dataset (DPG - DiCE)")
ax.set_title("Per-dataset metric differences grouped by metric")

fig.tight_layout()
plt.show()


In [None]:
# Metric shown in this figure; must be the exact base name used in the CSV.
metric_to_plot = "perc_valid_cf_all"

col_dpg = f"{metric_to_plot}{suffix_dpg}"
col_base = f"{metric_to_plot}{suffix_baseline}"

# Per-dataset scores of the two methods (x-axis: DiCE, y-axis: DPG-CF).
x_base = df[col_base].values
x_dpg = df[col_dpg].values

datasets_labels = df.index.astype(str)

fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(x_base, x_dpg)

# Dashed diagonal marks equal performance; it spans the joint value range.
min_val = min(np.nanmin(x_base), np.nanmin(x_dpg))
max_val = max(np.nanmax(x_base), np.nanmax(x_dpg))
ax.plot([min_val, max_val], [min_val, max_val], linestyle="--")

# Faint connector from each point's spot on the diagonal to the point itself.
# `label` is only needed if the points are annotated with dataset names, e.g.
# ax.text(xb, xd, label, fontsize=8).
for xb, xd, label in zip(x_base, x_dpg, datasets_labels):
    ax.plot([xb, xb, xd], [xb, xd, xd], alpha=0.3)

ax.set_xlabel("DiCE")
ax.set_ylabel("DPG-CF")
ax.set_title(f"{metric_to_plot}: per-dataset scores (DPG vs DiCE)")
fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Bars are ordered by raw p-value (any other order works as well).
res_df_plot = res_df.sort_values("p_value_raw")
x = np.arange(len(res_df_plot))
vals = res_df_plot["median_diff(DPG - DiCE)"].values
pvals = res_df_plot["p_value_holm"].values

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(x, vals)
ax.axhline(0, color="black", linewidth=1)

# Significance stars from the Holm-corrected p-value, strictest cut-off first.
star_levels = ((0.001, "***"), (0.01, "**"), (0.05, "*"))

for i, (v, p) in enumerate(zip(vals, pvals)):
    mark = next((stars for cutoff, stars in star_levels if p < cutoff), "")
    if not mark:
        continue
    # Place the stars just beyond the tip of the bar, on the side it points to.
    ax.text(i, v + np.sign(v)*0.01, mark, ha="center", va="bottom" if v >= 0 else "top")

ax.set_xticks(x)
ax.set_xticklabels(res_df_plot["metric"], rotation=45, ha="right")
ax.set_ylabel("Median difference (DPG - DiCE)")
ax.set_title("Per-metric median difference with Holm-corrected significance")

fig.tight_layout()
plt.show()
