# Figures & Tables (MN vs RF)

Purpose:
1. Read MN label files (from MN_01_build_MN_labels.ipynb)
2. Compute country-level precision/recall/F1 for MN vs RF across k-thresholds (k>3, k>5) and majority thresholds τ (0.1, 0.2, 0.3)
3. Create 2×3 multipanel boxplot figure:
    rows: k>3, k>5
    columns: precision, recall, F1
4. Create summary CSVs:
    - mn_rf_summary_segments_population.csv
    - mn_rf_summary_segments_population_GLOBAL_k_tau_table_millions.csv

# Requirements:
- MN_Comparison_Files/{country}/{country}_segments_mnlabels_k{K}_maj{TAG}.gpkg produced by 01_MN_Data_and_Labels, for K in {3,5} and TAG in {10,20,30}.

- RF GPKGs with rf_label and POP_SEG: {country}_rf_preds_filtered80.gpkg


# 1️⃣ Imports & Path Setup

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import precision_score, recall_score, f1_score

try:
    from tqdm import tqdm
except ImportError:

    def tqdm(x, *args, **kwargs):
        """Stand-in used when tqdm is not installed: return ``x`` untouched.

        Any extra positional or keyword arguments are accepted and ignored,
        so loops written as ``for item in tqdm(items, ...)`` still iterate
        over ``items`` (just without a progress bar).
        """
        return x

# --- ROOTS (EDIT TO MATCH YOUR MACHINE) ---
# Folder holding one sub-folder per country with the MN label GeoPackages.
MN_COMP_ROOT = Path("../MN/Outputs/MN_Comparison_Files")

# Folder holding the RF prediction GeoPackages ({country}_rf_preds_filtered80.gpkg).
RF_DIR = Path("../2_modelling/02_application/Filtered_80pct_allattributes")

# All tables are written next to MN_Comparison_Files, i.e. into ../MN/Outputs
OUT_DIR = MN_COMP_ROOT.parent
OUT_DIR.mkdir(parents=True, exist_ok=True)

FIG_DIR = OUT_DIR / "Figures"
FIG_DIR.mkdir(parents=True, exist_ok=True)

# --- Parameters ---
K_VALUES   = [3, 5]                   # k-complexity cutoffs
TAU_VALUES = [0.1, 0.2, 0.3]          # majority thresholds
# Maps each majority threshold to the tag used in the MN file names (..._maj{TAG}.gpkg)
TAU_TAGS   = {0.1: "10", 0.2: "20", 0.3: "30"}

MN_LABEL_COL = "mn_label"            # change to "label_final" if your files use that name

# Metric names; these are also the column names produced by compute_metrics
METRICS = ["precision", "recall", "f1"]

# 2️⃣ Helpers

In [None]:
def compute_metrics(y_true, y_pred):
    """Return precision, recall and F1 for one pair of binary label vectors.

    Parameters
    ----------
    y_true : array-like
        Reference labels (the callers in this notebook pass the RF labels).
    y_pred : array-like
        Labels being evaluated (the callers pass the MN labels).

    Returns
    -------
    dict
        Keys ``"precision"``, ``"recall"`` and ``"f1"``, in that order.
        Every scorer is called with ``zero_division=0``.
    """
    scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {
        name: scorer(y_true, y_pred, zero_division=0)
        for name, scorer in scorers
    }


def load_rf_mn_pair(country: str, k_thr: int, tau: float):
    """
    Load matching RF and MN label data for a given country, k-threshold and τ.

    Parameters
    ----------
    country : str
        Country name as used in the folder and file names.
    k_thr : int
        k-complexity cutoff; selects the ``_k{k_thr}_`` MN file.
    tau : float
        Majority threshold; must be a key of ``TAU_TAGS`` (KeyError otherwise).

    Returns
    -------
    (rf_df, mn_df) : tuple of DataFrame, or (None, None)
        ``rf_df`` has columns ``rf_label`` and ``POP_SEG``; ``mn_df`` has
        columns ``MN_LABEL_COL`` and ``POP_SEG``. Both share the same index and
        contain only rows where both labels are present.
        ``(None, None)`` is returned when a file is missing or unreadable, a
        required column is absent, or no row has both labels.

    Notes
    -----
    RF and MN rows are paired by position, not by a segment id.
    NOTE(review): this assumes both files list the segments in the same order;
    confirm against the notebook that writes the MN label files.
    """
    tag = TAU_TAGS[tau]

    rf_path = RF_DIR / f"{country}_rf_preds_filtered80.gpkg"
    mn_path = MN_COMP_ROOT / country / f"{country}_segments_mnlabels_k{k_thr}_maj{tag}.gpkg"

    # A missing file is treated as "this combination was not produced": skip quietly.
    if not rf_path.exists() or not mn_path.exists():
        return None, None

    try:
        rf = gpd.read_file(rf_path)
        mn = gpd.read_file(mn_path)
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole batch.
        print(f"❌ Error reading {country}, k={k_thr}, τ={tau}: {e}")
        return None, None

    # Check every column used below, so a file without POP_SEG is skipped
    # with a message instead of raising KeyError in the middle of a batch.
    missing = [f"RF:{col}" for col in ("rf_label", "POP_SEG") if col not in rf.columns]
    missing += [f"MN:{col}" for col in (MN_LABEL_COL, "POP_SEG") if col not in mn.columns]
    if missing:
        print(f"⚠️ Skipping {country} (k={k_thr}, τ={tau}); missing column(s): {', '.join(missing)}")
        return None, None

    # We only need labels + population for the tables. Resetting the index makes
    # the positional pairing explicit rather than dependent on the reader's index.
    rf = rf[["rf_label", "POP_SEG"]].reset_index(drop=True)
    mn = mn[[MN_LABEL_COL, "POP_SEG"]].reset_index(drop=True)

    # Sanity: ensure same length
    if len(rf) != len(mn):
        print(f"⚠️ Row-count mismatch for {country} (k={k_thr}, τ={tau}); truncating to min length.")
        n = min(len(rf), len(mn))
        rf, mn = rf.iloc[:n], mn.iloc[:n]

    # Pair the rows side by side and drop those where either label is missing
    sub = pd.concat(
        [rf["rf_label"], mn[MN_LABEL_COL], rf["POP_SEG"], mn["POP_SEG"]],
        axis=1,
        keys=["rf_label", MN_LABEL_COL, "rf_POP_SEG", "mn_POP_SEG"],
    ).dropna(subset=["rf_label", MN_LABEL_COL])

    if sub.empty:
        return None, None

    # Rebuild the two frames from the paired table so their indices stay consistent
    rf_clean = sub[["rf_label", "rf_POP_SEG"]].rename(columns={"rf_POP_SEG": "POP_SEG"})
    mn_clean = sub[[MN_LABEL_COL, "mn_POP_SEG"]].rename(columns={"mn_POP_SEG": "POP_SEG"})

    return rf_clean, mn_clean

# 3️⃣ Country-level MN–RF Metrics for Figure

In [None]:
rows = []

# Countries are the sub-folder names of MN_Comparison_Files. Sorted so that the
# order of the result rows does not depend on the filesystem's listing order.
countries = sorted(p.name for p in MN_COMP_ROOT.iterdir() if p.is_dir())
print(f"Detected {len(countries)} countries in MN_Comparison_Files")

for k_thr in K_VALUES:
    for tau in TAU_VALUES:
        for country in countries:
            rf, mn = load_rf_mn_pair(country, k_thr, tau)
            if rf is None or mn is None:
                # combination not available for this country
                continue

            # RF labels act as the reference, MN labels as the prediction
            y_true = rf["rf_label"].astype(int)
            y_pred = mn[MN_LABEL_COL].astype(int)

            # one row per (country, k, tau): the three scores plus their identifiers
            rows.append({
                **compute_metrics(y_true, y_pred),
                "country": country,
                "k_thr": k_thr,
                "tau": tau,
            })

metrics_df = pd.DataFrame(rows)
metrics_df.head()

# 4️⃣ Create 2×3 multipanel layout (k>3 vs k>5, 3 metrics)

In [None]:
if metrics_df.empty:
    raise RuntimeError("No metrics computed; check paths and MN label column name.")

# Styling similar to SSI figure
sns.set_theme(context="paper", style="white", rc={
    "axes.edgecolor": "0.4",
    "axes.linewidth": 0.8,
    "axes.labelsize": 10,
    "font.size": 9.5,
    "xtick.labelsize": 9,
    "ytick.labelsize": 9,
})
plt.rcParams["figure.dpi"] = 300
plt.rcParams["font.family"] = "DejaVu Sans"

# One grey shade per majority threshold; keys are the string form of tau
palette   = {"0.1": "#f0f0f0", "0.2": "#bdbdbd", "0.3": "#636363"}
tau_order = ["0.1", "0.2", "0.3"]
tau_tick_labels = [f"τ={t}" for t in tau_order]
k_labels  = {k: f"k > {k}" for k in K_VALUES}

# Fixed seed so the jittered point overlay is identical on every run
jitter_rng = np.random.default_rng(42)

# Rows = k-thresholds, columns = metrics. squeeze=False keeps `axes` 2-D even
# if K_VALUES or METRICS is reduced to a single entry.
fig, axes = plt.subplots(
    nrows=len(K_VALUES), ncols=len(METRICS), figsize=(8.8, 4.8),
    sharey=True, sharex=False, squeeze=False,
    gridspec_kw={"hspace": 0.35, "wspace": 0.25}
)

for r, k_thr in enumerate(K_VALUES):
    sub_k = metrics_df[metrics_df["k_thr"] == k_thr].copy()
    # String version of tau: used as the categorical x variable and palette key
    sub_k["tau_str"] = sub_k["tau"].astype(str)

    for c, metric in enumerate(METRICS):
        ax = axes[r, c]
        ax.axhline(0.5, lw=0.6, ls="--", color="#bdbdbd", zorder=0)

        # `palette` without `hue` is deprecated in seaborn 0.13, so the x
        # variable is also passed as hue. dodge=False keeps one centred box
        # per category on older seaborn versions as well.
        sns.boxplot(
            data=sub_k,
            x="tau_str", y=metric,
            order=tau_order,
            hue="tau_str", hue_order=tau_order, dodge=False,
            palette=palette, width=0.5,
            whis=(5, 95), showfliers=False,
            boxprops    = dict(linewidth=0.9, edgecolor="0.4"),
            whiskerprops= dict(linewidth=0.8, color="0.4"),
            medianprops = dict(linewidth=2.0, color="black"),
            capprops    = dict(linewidth=0.8, color="0.4"),
            ax=ax,
        )
        # Older seaborn versions add a legend whenever hue is set; it would
        # only repeat the x tick labels, so drop it if present.
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()

        # Scatter overlay (one point per country) + median labels
        for i, tau_str in enumerate(tau_order):
            vals = sub_k.loc[sub_k["tau_str"] == tau_str, metric].dropna()
            # small horizontal jitter around the box centre to separate points
            x_vals = jitter_rng.normal(i, 0.05, size=len(vals))
            ax.scatter(
                x_vals, vals,
                s=14, color="black", alpha=0.6,
                edgecolor="white", linewidth=0.3, zorder=3
            )
            if not vals.empty:
                med = vals.median()
                # median value printed just to the right of the box
                ax.text(i + 0.29, med + 0.01, f"{med:.2f}",
                        va="center", ha="left",
                        fontsize=9, color="#222222", fontweight="bold")

        # Axes labels and layout. Ticks are fixed explicitly before labelling
        # so the labels cannot drift away from the category positions.
        ax.set_xticks(range(len(tau_order)))
        ax.set_xticklabels(tau_tick_labels)
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_ylim(0, 1.02)
        sns.despine(ax=ax)

        # Row labels (left)
        if c == 0:
            ax.text(-0.30, 0.5, k_labels[k_thr],
                    fontsize=10.5, fontweight="bold",
                    rotation=90, va="center", ha="center",
                    transform=ax.transAxes)

        # Column titles (top row only)
        if r == 0:
            ax.set_title(metric.capitalize(), fontsize=10, pad=4)

fig.suptitle(
    "Country-level alignment of MN with CSD — Threshold effects across k-levels",
    fontsize=12.5, fontweight="bold", y=0.95
)
# leave room at the top for the suptitle
fig.tight_layout(rect=[0, 0, 1, 0.96])

fig_out = FIG_DIR / "multipanel_mn_rf_boxplots_gray_compact.png"
fig.savefig(fig_out, bbox_inches="tight")
plt.show()

print(f"✅ Figure saved to: {fig_out}")

# 5️⃣ MN vs RF Summary: Segments & Population (per country + Overall)

In [None]:
# Column layout of the per-country summary. Passing it to the DataFrame
# constructor keeps the schema intact even when no record could be computed.
record_columns = [
    "country", "k_thr", "tau",
    "total_segments", "rf_deprived_segments", "mn_deprived_segments",
    "rf_deprived_pop", "mn_deprived_pop", "total_pop",
]

records = []

for k_thr in K_VALUES:
    for tau in TAU_VALUES:
        print(f"\n🔹 Processing k > {k_thr}, τ = {tau}")
        for country in tqdm(countries):
            rf, mn = load_rf_mn_pair(country, k_thr, tau)
            if rf is None or mn is None:
                continue

            # Ensure numeric (unparseable values become NaN), then drop rows
            # lacking a label or a population value. RF and MN are filtered
            # independently of each other.
            rf = rf.assign(
                rf_label=pd.to_numeric(rf["rf_label"], errors="coerce"),
                POP_SEG=pd.to_numeric(rf["POP_SEG"], errors="coerce"),
            ).dropna(subset=["rf_label", "POP_SEG"])
            mn = mn.assign(**{
                MN_LABEL_COL: pd.to_numeric(mn[MN_LABEL_COL], errors="coerce"),
                "POP_SEG": pd.to_numeric(mn["POP_SEG"], errors="coerce"),
            }).dropna(subset=[MN_LABEL_COL, "POP_SEG"])
            if rf.empty or mn.empty:
                continue

            # Label value 1 marks a deprived segment
            rf_deprived = rf["rf_label"] == 1
            mn_deprived = mn[MN_LABEL_COL] == 1

            records.append({
                "country": country,
                "k_thr": k_thr,
                "tau": tau,
                # totals use the RF frame (and its POP_SEG) as the reference
                "total_segments": len(rf),
                "rf_deprived_segments": int(rf_deprived.sum()),
                "mn_deprived_segments": int(mn_deprived.sum()),
                "rf_deprived_pop": rf.loc[rf_deprived, "POP_SEG"].sum(),
                # MN deprived population is summed from the MN file's own POP_SEG
                "mn_deprived_pop": mn.loc[mn_deprived, "POP_SEG"].sum(),
                "total_pop": rf["POP_SEG"].sum(),
            })

df = pd.DataFrame(records, columns=record_columns)
df.head()

# 6️⃣ Add Overall (Global Totals) and Save CSV

In [None]:
# Fail early with a clear message instead of an opaque KeyError from groupby
if df.empty:
    raise RuntimeError("No summary records computed; check RF/MN paths and label column names.")

# Columns that are added up across countries
sum_cols = [
    "total_segments", "rf_deprived_segments", "mn_deprived_segments",
    "rf_deprived_pop", "mn_deprived_pop", "total_pop",
]

# One "Overall" row per (k-threshold, tau): the sum over all countries.
# Columns are summed one by one so integer counts stay integers.
overall_rows = [
    {
        "country": "Overall",
        "k_thr": k_thr,
        "tau": tau,
        **{col: grp[col].sum() for col in sum_cols},
    }
    for (k_thr, tau), grp in df.groupby(["k_thr", "tau"])
]

overall_df = pd.DataFrame(overall_rows)
# Country rows first, then the Overall rows
final_df = pd.concat([df, overall_df], ignore_index=True)

out_csv = OUT_DIR / "mn_rf_summary_segments_population.csv"
final_df.to_csv(out_csv, index=False)

print(f"\n✅ Summary (segments & population) saved to:\n{out_csv}")

# Rich table preview inside a notebook; plain-text preview when IPython is absent.
# Only the import is guarded, so a genuine display error is no longer hidden.
try:
    from IPython.display import display
except ImportError:
    print(final_df.head(12))
else:
    display(final_df.head(12))

# 7️⃣ Global Summary Table (populations in millions)

In [None]:
IN_CSV  = OUT_DIR / "mn_rf_summary_segments_population.csv"
OUT_CSV = OUT_DIR / "mn_rf_summary_segments_population_GLOBAL_k_tau_table_millions.csv"

df_global = pd.read_csv(IN_CSV)

# The input CSV already holds one pre-summed "Overall" row per (k_thr, tau).
# Keep only the country rows, otherwise every total below is counted twice.
df_global = df_global[df_global["country"] != "Overall"].copy()

value_cols = [
    "total_segments", "rf_deprived_segments", "mn_deprived_segments",
    "total_pop", "rf_deprived_pop", "mn_deprived_pop",
]

# Ensure numeric types (unparseable entries become NaN and are skipped by sum)
for col in value_cols:
    df_global[col] = pd.to_numeric(df_global[col], errors="coerce")

# Aggregate globally for each k-threshold × tau by re-summing the country rows
# (a cross-check of the "Overall" rows stored in the input CSV)
global_summary = df_global.groupby(["k_thr", "tau"], as_index=False)[value_cols].sum()

# Convert populations to millions (rounded for presentation)
global_summary["Total_Pop_M"]       = (global_summary["total_pop"]       / 1e6).round(2)
global_summary["RF_Deprived_Pop_M"] = (global_summary["rf_deprived_pop"] / 1e6).round(2)
global_summary["MN_Deprived_Pop_M"] = (global_summary["mn_deprived_pop"] / 1e6).round(2)

# Shares are computed from the unrounded populations: dividing the rounded
# millions loses precision and divides by zero when the total rounds to 0.00.
# A zero total gives NaN rather than inf.
total_pop = global_summary["total_pop"].where(global_summary["total_pop"] > 0)
global_summary["RF_Deprived_Pop_%"] = (
    global_summary["rf_deprived_pop"] / total_pop * 100
).round(2)
global_summary["MN_Deprived_Pop_%"] = (
    global_summary["mn_deprived_pop"] / total_pop * 100
).round(2)

out = global_summary[[
    "k_thr", "tau",
    "total_segments", "rf_deprived_segments", "mn_deprived_segments",
    "Total_Pop_M", "RF_Deprived_Pop_M", "MN_Deprived_Pop_M",
    "RF_Deprived_Pop_%", "MN_Deprived_Pop_%",
]].sort_values(["k_thr", "tau"]).reset_index(drop=True)

# Print the whole table without row truncation
with pd.option_context("display.max_rows", None):
    print(out)

out.to_csv(OUT_CSV, index=False)
print(f"\n✅ Saved global MN summary (millions) to:\n{OUT_CSV}")