# Analyse Random Forest Outputs (Thresholds, Curves, Confusion Matrices)

This notebook:

1. Reconstructs the training and test split using the saved holdout indices.  
2. Loads the trained Random Forest model and computes predicted probabilities.  
3. Explores classification thresholds (precision/recall/F1 vs threshold).  
4. Plots PR and ROC curves on the holdout set.  
5. Computes confusion matrices and detailed metrics at a chosen threshold (e.g., τ = 0.40).  
6. Computes per-region metrics and balanced (downsampled) evaluations.  
7. Computes Matthews correlation coefficient (MCC).

**Inputs:**

- Labeled RF training data (8 IDEABench cities):  
  `../../1_preprocessing/LabelledData_For_RF/*.csv`
- RF training outputs:
  - `region_mapping.json`
  - `tables/file_list.csv`
  - `tables/holdout_indices.csv`
  - `rf_best_model.joblib`  
  located in: `../01_training/rf_outputs/`

**Outputs:**

- Plots and tables under:  
  `analysis_outputs/plots/`  
  `analysis_outputs/tables/`


# Imports and paths

In [None]:
import json
import os
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import (
    precision_recall_curve, average_precision_score, roc_curve, roc_auc_score,
    precision_score, recall_score, f1_score, confusion_matrix, accuracy_score,
    classification_report, matthews_corrcoef
)
from sklearn.utils import check_random_state


In [None]:
# Cap the common native thread pools at a single thread each, to avoid
# oversubscription of threads on some systems.
# NOTE(review): numpy is imported in an earlier cell; these variables are commonly
# read when the native libraries load, so confirm the caps still take effect here.
for _thread_var in (
    "OMP_NUM_THREADS",
    "MKL_NUM_THREADS",
    "OPENBLAS_NUM_THREADS",
    "NUMEXPR_NUM_THREADS",
):
    os.environ[_thread_var] = "1"


In [None]:
# Input: folder holding the per-city labeled training CSVs (from the preprocessing step)
CSV_FOLDER = Path("../../1_preprocessing/LabelledData_For_RF")

# Input: RF training outputs (model, region mapping, file_list, holdout indices)
RF_OUT_DIR = Path("../01_training/rf_outputs")

# Output: everything this notebook saves goes under ANALYSIS_OUT
ANALYSIS_OUT = Path("analysis_outputs")
PLOTS = ANALYSIS_OUT.joinpath("plots")
TABLES = ANALYSIS_OUT.joinpath("tables")

# Create the output folders up front so later savefig/to_csv calls cannot fail on a missing directory
for _out_dir in (PLOTS, TABLES):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong working directory is easy to spot
print("CSV_FOLDER: ", CSV_FOLDER.resolve())
print("RF_OUT_DIR:", RF_OUT_DIR.resolve())
print("ANALYSIS_OUT:", ANALYSIS_OUT.resolve())


# Region mapping and file order

In [None]:
# Region mapping created during training (region name -> numeric code)
REGION_MAP_PATH = RF_OUT_DIR / "region_mapping.json"
REGION_MAP = json.loads(REGION_MAP_PATH.read_text(encoding="utf-8"))

# Reverse lookup (numeric code -> region name), used when reporting per-region results
INVERSE_REGION = dict((code, name) for name, code in REGION_MAP.items())

# Exact file order used for training; row positions in the concatenated data depend on it
file_list_path = RF_OUT_DIR / "tables" / "file_list.csv"
file_list = pd.read_csv(file_list_path)["file"].tolist()
print(f"Files to load (ordered): {len(file_list)}")
file_list


# Build full dataset

In [None]:
# Target column and raw region column as they appear in the labeled CSVs
target_col = "slum_label1"
REGION_COL = "REG1_GHSL"

# Model inputs, in the order they are handed to the classifier
predictor_cols = [
    "i5_par_area",
    "i1_pop_area",
    "i6_paru_area",
    "i8_paru_par",
    "B_AVG_SEG",
    "i9_roads_par",
    "PARU_A_SEG",
    "B_AREA_SEG",
    "B_CV_SEG",
    "REGION_CODE",  # numeric code mapped from REG1_GHSL
]

def map_region(val):
    """Translate a raw region value into its numeric code from REGION_MAP.

    Missing values, and values whose string form is not a key of REGION_MAP,
    both resolve to the code stored under the "Unknown" key.
    """
    unknown_code = REGION_MAP["Unknown"]
    return unknown_code if pd.isna(val) else REGION_MAP.get(str(val), unknown_code)

# Columns every labeled CSV must provide: the raw predictors, the target, and the
# raw region column that REGION_CODE is derived from.
raw_predictors = [c for c in predictor_cols if c != "REGION_CODE"]
required_cols = raw_predictors + [target_col, REGION_COL]
keep = raw_predictors + [target_col, "REGION_CODE"]

dfs = []
skipped_files = []
for fname in file_list:
    csv_path = CSV_FOLDER / fname
    if not csv_path.exists():
        print(f"[WARN] Missing labeled CSV: {csv_path}")
        skipped_files.append(fname)
        continue
    df = pd.read_csv(csv_path)

    # Run the schema check BEFORE touching REGION_COL, so a file without the region
    # column is warned about and skipped instead of raising KeyError.
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        print(f"[WARN] Skipping {fname} due to missing cols: {missing}")
        skipped_files.append(fname)
        continue

    # Add REGION_CODE (numeric code mapped from the raw region column)
    df["REGION_CODE"] = df[REGION_COL].map(map_region)

    dfs.append(df[keep])

if not dfs:
    raise RuntimeError("No valid files after schema check.")

if skipped_files:
    # The saved holdout indices are applied to row positions of the cleaned,
    # concatenated data, so any skipped file shifts those positions.
    print(
        f"[WARN] {len(skipped_files)} file(s) skipped: {skipped_files}. "
        "Row positions may no longer match the saved holdout indices."
    )

full_data = pd.concat(dfs, ignore_index=True)
# Drop rows with any missing predictor/target and renumber rows from 0
clean_data = full_data.dropna(subset=predictor_cols + [target_col]).reset_index(drop=True)

X_full = clean_data[predictor_cols].to_numpy(dtype=float)
y_full = clean_data[target_col].to_numpy()

print("Data shape after cleaning:", clean_data.shape)
clean_data.head()


# Rebuild train/test split from saved indices

In [None]:
holdout_idx_df = pd.read_csv(RF_OUT_DIR / "tables" / "holdout_indices.csv")

n = len(clean_data)
test_indices = holdout_idx_df.loc[holdout_idx_df["is_test"] == 1, "global_index"].to_numpy()

# The indices are applied as row positions of clean_data. Validate them explicitly:
# negative values would silently wrap around, and values >= n mean the dataset
# rebuilt here is not the one the indices were saved for.
# NOTE(review): if holdout_indices.csv holds one row per sample, also checking
# len(holdout_idx_df) == n would catch misalignment — confirm against the training notebook.
if test_indices.size and (test_indices.min() < 0 or test_indices.max() >= n):
    raise IndexError(
        f"Holdout indices fall outside the rebuilt dataset (n={n}, "
        f"min={test_indices.min()}, max={test_indices.max()}); "
        "the data loaded here does not match the training run."
    )

test_mask = np.zeros(n, dtype=bool)
test_mask[test_indices] = True
train_mask = ~test_mask  # every row not flagged as test is treated as training data

print("Train/Test sizes:", train_mask.sum(), test_mask.sum())

X_train, y_train = X_full[train_mask], y_full[train_mask]
X_test,  y_test  = X_full[test_mask],  y_full[test_mask]

# Region codes aligned row-for-row with the train/test arrays
region_test  = clean_data.loc[test_mask, "REGION_CODE"].to_numpy()
region_train = clean_data.loc[train_mask, "REGION_CODE"].to_numpy()

# Holdout sample count per region name
pd.Series(region_test).map(INVERSE_REGION).value_counts()


# Load trained model and inspect parameters

In [None]:
# Trained model saved by the training step
model_path = RF_OUT_DIR / "rf_best_model.joblib"
best = joblib.load(model_path)

# The best-parameter table is optional; show it only when the file is present
best_params_json = RF_OUT_DIR / "tables" / "best_params.json"
if best_params_json.exists():
    print("Best params from training:", json.loads(best_params_json.read_text()))

# Falls back to None when the loaded object has no n_estimators attribute
print("Model n_estimators (refit):", getattr(best, "n_estimators", None))


# Score holdout and threshold sweep

In [None]:
# Predicted probabilities on holdout
# NOTE(review): column 1 is taken as the positive class — confirm best.classes_ is [0, 1].
y_score = best.predict_proba(X_test)[:, 1]

# Evaluate 101 evenly spaced thresholds from 0 to 1 (step 0.01)
grid = np.linspace(0, 1, 101)
rows = []
for thr in grid:
    y_pred = (y_score >= thr).astype(int)
    # labels=[0, 1] keeps the matrix 2x2 even if only one class shows up,
    # so the four-way unpack below cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec  = recall_score(y_test, y_pred, zero_division=0)
    f1   = f1_score(y_test, y_pred, zero_division=0)
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    bal_acc = 0.5 * (rec + spec)
    rows.append([thr, prec, rec, f1, spec, bal_acc, tn, fp, fn, tp])

thr_df = pd.DataFrame(
    rows,
    columns=["threshold","precision","recall","f1",
             "specificity","balanced_acc","tn","fp","fn","tp"]
)
thr_df.to_csv(TABLES / "threshold_sweep.csv", index=False)

# Row with the highest F1 (argmax returns the first one on ties, i.e. the lowest threshold)
best_row = thr_df.iloc[thr_df["f1"].values.argmax()]
best_thr = float(best_row["threshold"])
best_row.to_frame().T.to_csv(TABLES / "best_threshold_f1.csv", index=False)

print("Threshold maximizing F1:", best_thr)
best_row


# Plot threshold curves

In [None]:
# Precision, recall and F1 against the decision threshold on the holdout set
fig, ax = plt.subplots(figsize=(8, 5))
for column, label in (("precision", "Precision"), ("recall", "Recall"), ("f1", "F1")):
    ax.plot(thr_df["threshold"], thr_df[column], label=label)
ax.set_xlabel("Threshold")
ax.set_ylabel("Score")
ax.set_title("Threshold Sweep (Holdout)")
ax.legend()
fig.tight_layout()
fig.savefig(PLOTS / "threshold_sweep.png", dpi=150)
plt.show()


In [None]:
# PR curve (points saved to CSV, figure saved to PNG)
prec, rec, _ = precision_recall_curve(y_test, y_score)
pr_auc = average_precision_score(y_test, y_score)
pd.DataFrame({"recall": rec, "precision": prec}).to_csv(
    TABLES / "pr_curve_points.csv", index=False
)

plt.figure(figsize=(6,5))
plt.plot(rec, prec)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title(f"Precision–Recall (AP = {pr_auc:.3f})")
plt.tight_layout()
plt.savefig(PLOTS / "pr_curve.png", dpi=150)
plt.show()

# ROC curve (points saved to CSV, figure saved to PNG)
fpr, tpr, _ = roc_curve(y_test, y_score)
auc = roc_auc_score(y_test, y_score)
pd.DataFrame({"fpr": fpr, "tpr": tpr}).to_csv(
    TABLES / "roc_curve_points.csv", index=False
)

plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, label=f"AUC={auc:.3f}")
plt.plot([0,1],[0,1],'--',alpha=0.4)  # chance-level diagonal for reference
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC (Holdout)")
plt.legend()
plt.tight_layout()
plt.savefig(PLOTS / "roc_curve.png", dpi=300)
plt.show()

# ROC-AUC and average precision summarize the whole curve, so they are reported
# separately from the F1-maximizing threshold rather than "at" it.
print(
    f"AUC={auc:.4f}, PR-AUC={pr_auc:.4f} (threshold-independent); "
    f"threshold maximizing F1 = {best_thr:.2f}"
)


# Helper: metrics at threshold

In [None]:
def metrics_at_threshold(y_true, y_proba, thr: float):
    """Compute binary classification metrics after thresholding probabilities.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_proba : array-like of scores for the positive class, same length as y_true.
    thr : float
        Samples with ``y_proba >= thr`` are predicted as class 1.

    Returns
    -------
    dict
        Keys: threshold, precision, recall, f1, accuracy, specificity,
        balanced_acc, tn, fp, fn, tp. Precision/recall/F1 are 0 when undefined
        (``zero_division=0``); specificity is 0.0 when there are no negatives.
    """
    # Accept plain lists as well as arrays
    y_true = np.asarray(y_true)
    y_pred = (np.asarray(y_proba) >= thr).astype(int)

    # labels=[0, 1] forces a 2x2 matrix. Without it a subset holding a single class
    # (e.g. one region) yields a 1x1 matrix and the four-way unpack raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec  = recall_score(y_true, y_pred, zero_division=0)
    f1   = f1_score(y_true, y_pred, zero_division=0)
    acc  = accuracy_score(y_true, y_pred)
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    bal_acc = 0.5 * (rec + spec)  # mean of recall (TPR) and specificity (TNR)
    return {
        "threshold": thr,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "accuracy": acc,
        "specificity": spec,
        "balanced_acc": bal_acc,
        "tn": tn, "fp": fp, "fn": fn, "tp": tp
    }


# Helper: confusion matrix plotting

In [None]:
def plot_confusion_matrix(y_true, y_proba, thr: float, savepath=None):
    """Print a classification report and draw the 2x2 confusion matrix at a threshold.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_proba : array-like of scores for the positive class.
    thr : float
        Samples with ``y_proba >= thr`` are predicted as class 1.
    savepath : path-like, optional
        When given, the figure is also written there at 300 dpi.
    """
    y_true = np.asarray(y_true)
    y_pred = (np.asarray(y_proba) >= thr).astype(int)

    # labels=[0, 1] guarantees a 2x2 matrix; otherwise single-class input gives a
    # 1x1 matrix and the annotation loop below raises IndexError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    # zero_division=0 matches the other metric calls in this notebook
    report = classification_report(y_true, y_pred, digits=3, zero_division=0)
    print(f"--- Classification Report @ threshold = {thr:.2f} ---")
    print(report)

    fig, ax = plt.subplots(figsize=(6, 5))
    im = ax.imshow(cm, interpolation='nearest', cmap="Blues")
    fig.colorbar(im, ax=ax)

    ax.set(
        xticks=np.arange(2),
        yticks=np.arange(2),
        xticklabels=["Pred 0", "Pred 1"],
        yticklabels=["Actual 0", "Actual 1"],
        xlabel="Predicted",
        ylabel="Actual",
        title=f"Confusion Matrix @ threshold = {thr:.2f}"
    )

    # White text on dark cells, black text on light cells, so counts stay readable
    vmax = cm.max()
    for i in range(2):
        for j in range(2):
            val = cm[i, j]
            color = "white" if val > vmax / 2 else "black"
            ax.text(j, i, f"{val}", ha="center", va="center",
                    color=color, fontsize=12, fontweight="bold")

    fig.tight_layout()
    if savepath:
        fig.savefig(savepath, dpi=300)
    plt.show()


# Helper: curves with threshold marked

In [None]:
def plot_curves_with_threshold(y_true, y_proba, thr: float, save_prefix: str, sweep_df=None):
    """Plot PR, ROC and threshold-sweep figures with the operating threshold marked.

    Three PNGs are written under PLOTS, named ``{save_prefix}_pr_curve_marked.png``,
    ``{save_prefix}_roc_curve_marked.png`` and ``{save_prefix}_threshold_sweep_marked.png``.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_proba : array-like of scores for the positive class.
    thr : float
        Operating threshold; samples with ``y_proba >= thr`` count as predicted positive.
    save_prefix : str
        Prefix for the output file names.
    sweep_df : DataFrame, optional
        Table with columns threshold, precision, recall, f1 for the sweep panel.
        Defaults to the notebook-level ``thr_df``, which was the only behavior before.
    """
    y_true = np.asarray(y_true)
    y_proba = np.asarray(y_proba)
    if sweep_df is None:
        sweep_df = thr_df

    # Operating point computed directly from the predictions at `thr`. Picking the
    # nearest curve threshold instead can land on a threshold below `thr`, i.e. on a
    # different classification than `y_proba >= thr`.
    is_pos = y_true == 1
    pred_pos = y_proba >= thr
    tp = int(np.sum(pred_pos & is_pos))
    fp = int(np.sum(pred_pos & ~is_pos))
    n_pos = int(is_pos.sum())
    n_neg = int((~is_pos).sum())
    rec_thr = tp / n_pos if n_pos else 0.0
    # With no predicted positives, use precision 1 so the marker sits on the
    # PR curve's final point (recall 0, precision 1).
    prec_thr = tp / (tp + fp) if (tp + fp) else 1.0
    tpr_thr = rec_thr
    fpr_thr = fp / n_neg if n_neg else 0.0

    # PR curve with marked threshold
    prec, rec, _ = precision_recall_curve(y_true, y_proba)
    pr_auc = average_precision_score(y_true, y_proba)

    plt.figure(figsize=(6,5))
    plt.plot(rec, prec, label=f"AP={pr_auc:.3f}")
    plt.scatter([rec_thr], [prec_thr])
    plt.xlabel("Recall"); plt.ylabel("Precision")
    plt.title("Precision–Recall (Holdout)")
    plt.legend(); plt.tight_layout()
    plt.savefig(PLOTS / f"{save_prefix}_pr_curve_marked.png", dpi=150)
    plt.show()

    # ROC with threshold point
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    auc = roc_auc_score(y_true, y_proba)

    plt.figure(figsize=(6,5))
    plt.plot(fpr, tpr, label=f"AUC={auc:.3f}")
    plt.plot([0,1],[0,1],'--',alpha=0.4)  # chance-level diagonal for reference
    plt.scatter([fpr_thr], [tpr_thr])
    plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
    plt.title("ROC (Holdout)")
    plt.legend(); plt.tight_layout()
    plt.savefig(PLOTS / f"{save_prefix}_roc_curve_marked.png", dpi=150)
    plt.show()

    # Threshold sweep with vertical line
    plt.figure(figsize=(8,5))
    plt.plot(sweep_df["threshold"], sweep_df["precision"], label="Precision")
    plt.plot(sweep_df["threshold"], sweep_df["recall"],    label="Recall")
    plt.plot(sweep_df["threshold"], sweep_df["f1"],        label="F1")
    plt.axvline(thr, ls="--", alpha=0.7)
    plt.xlabel("Threshold"); plt.ylabel("Score")
    plt.title("Threshold Sweep (Holdout)")
    plt.legend(); plt.tight_layout()
    plt.savefig(PLOTS / f"{save_prefix}_threshold_sweep_marked.png", dpi=150)
    plt.show()


# Choose operating threshold and evaluate

In [None]:
# Final operating threshold used in manuscript (can be changed)
THR = 0.40

# Holdout metrics at THR: one-row table, saved to CSV and displayed as the cell output
m = metrics_at_threshold(y_test, y_score, THR)
metrics_table = pd.DataFrame([m])
metrics_table.to_csv(TABLES / f"metrics_at_thr_{THR:.2f}.csv", index=False)
metrics_table


In [None]:
# Print the classification report and plot the holdout confusion matrix at THR;
# the figure is saved under PLOTS with the threshold in its file name.
plot_confusion_matrix(
    y_test, y_score,
    THR,
    savepath=PLOTS / f"confusion_matrix_thr_{THR:.2f}.png"
)


In [None]:
# Plot the PR, ROC and threshold-sweep figures for the holdout set with THR marked;
# output file names start with the given prefix.
plot_curves_with_threshold(
    y_test, y_score,
    THR,
    save_prefix=f"thr_{THR:.2f}"
)


In [None]:
# Holdout metrics at THR, computed separately for every region code present in the test set
rows = []
for code in sorted(np.unique(region_test)):
    in_region = region_test == code
    if not in_region.any():
        continue
    region_metrics = metrics_at_threshold(y_test[in_region], y_score[in_region], THR)
    region_metrics.update(
        region_code=int(code),
        region_name=INVERSE_REGION.get(int(code), "Unknown"),
    )
    rows.append(region_metrics)

per_region_df = pd.DataFrame(rows)
per_region_path = TABLES / f"per_region_metrics_thr_{THR:.2f}.csv"
per_region_df.to_csv(per_region_path, index=False)
per_region_df


# Balanced evaluation on test set (downsample negatives)

In [None]:
# Number of random balanced subsets to draw, and the seed that makes the draws repeatable
BAL_N_REPEATS = 50
BAL_RANDOM_SEED = 42

def balanced_indices(y_true, rng):
    """Return shuffled indices of a class-balanced subset of ``y_true``.

    The smaller class is kept in full and the larger class is randomly downsampled
    (without replacement) to the same count. In the usual case of fewer positives
    than negatives this means all positives plus an equal number of sampled
    negatives, using the same random draws as before. Previously, more positives
    than negatives raised inside ``rng.choice``.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    rng : random generator exposing ``choice`` and ``shuffle``
        (e.g. ``numpy.random.RandomState``); its state is advanced.

    Raises
    ------
    ValueError
        If either class is absent from ``y_true``.
    """
    y_true = np.asarray(y_true)
    pos_idx = np.flatnonzero(y_true == 1)
    neg_idx = np.flatnonzero(y_true == 0)
    if len(pos_idx) == 0 or len(neg_idx) == 0:
        raise ValueError("Cannot build balanced subset: one of the classes is missing in y_true.")

    # Keep the smaller class whole; ties keep the positives, as the original did
    if len(pos_idx) <= len(neg_idx):
        kept, pool = pos_idx, neg_idx
    else:
        kept, pool = neg_idx, pos_idx

    sampled = rng.choice(pool, size=len(kept), replace=False)
    sel = np.concatenate([kept, sampled])
    rng.shuffle(sel)  # in-place shuffle so the classes are interleaved
    return sel

# Repeat the balanced evaluation on BAL_N_REPEATS random subsets of the holdout set,
# collecting the metrics and the confusion matrix of every draw.
rng = check_random_state(BAL_RANDOM_SEED)
rows_bal = []
cms = []

for r in range(BAL_N_REPEATS):
    sel = balanced_indices(y_test, rng)
    y_sel, score_sel = y_test[sel], y_score[sel]

    m_bal = metrics_at_threshold(y_sel, score_sel, THR)
    m_bal["repeat"] = r
    rows_bal.append(m_bal)

    cms.append(confusion_matrix(y_sel, (score_sel >= THR).astype(int)))

# One row per draw
bal_df = pd.DataFrame(rows_bal)
bal_df.to_csv(TABLES / f"balanced_metrics_thr_{THR:.2f}.csv", index=False)

metric_cols = ["precision","recall","f1","accuracy","specificity",
               "balanced_acc","tn","fp","fn","tp"]

# Mean and sample standard deviation (ddof=1) of every metric across the draws
means = bal_df[metric_cols].mean()
stds  = bal_df[metric_cols].std(ddof=1)

summary = {f"{k}_mean": float(v) for k, v in means.items()}
summary.update({f"{k}_std": float(v) for k, v in stds.items()})
summary["n_repeats"] = int(len(bal_df))
bal_summary_row = pd.DataFrame([summary])

bal_summary_row.to_csv(
    TABLES / f"balanced_metrics_summary_thr_{THR:.2f}.csv", index=False
)
bal_summary_row


In [None]:
# Element-wise mean of the per-draw confusion matrices; the CSV stores it rounded to integers
cm_avg = np.stack(cms, axis=0).mean(axis=0)
cm_avg_rounded = np.rint(cm_avg).astype(int)
cm_avg_table = pd.DataFrame(
    cm_avg_rounded,
    index=["True 0","True 1"],
    columns=["Pred 0","Pred 1"]
)
cm_avg_table.to_csv(TABLES / f"confusion_matrix_balanced_avg_thr_{THR:.2f}.csv")

# The heatmap shows the unrounded averages, annotated to one decimal place
fig, ax = plt.subplots(figsize=(6,5))
im = ax.imshow(cm_avg, interpolation='nearest', cmap="Blues")
fig.colorbar(im, ax=ax)
ax.set_xticks(np.arange(2))
ax.set_yticks(np.arange(2))
ax.set_xticklabels(["Pred 0","Pred 1"])
ax.set_yticklabels(["Actual 0","Actual 1"])
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title(f"Averaged Confusion Matrix (Balanced, {BAL_N_REPEATS} draws) @ τ={THR:.2f}")

# White text on dark cells, black on light ones
half_max = cm_avg.max() / 2
for (i, j), val in np.ndenumerate(cm_avg):
    ax.text(j, i, f"{val:.1f}", ha="center", va="center",
            color="white" if val > half_max else "black",
            fontsize=12, fontweight="bold")
plt.tight_layout()
plt.savefig(PLOTS / f"confusion_matrix_balanced_avg_thr_{THR:.2f}.png", dpi=150)
plt.show()


In [None]:
# Matthews correlation coefficient of the holdout predictions at THR
# (matthews_corrcoef is imported with the other sklearn metrics in the imports cell).
mcc = matthews_corrcoef(y_test, (y_score >= THR).astype(int))
print(f"Matthews correlation coefficient (MCC) at τ={THR:.2f}: {mcc:.4f}")
