Logistic regression per ecoregion. 

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Per-ecoregion Logistic Regression — neg:pos sweep (1:1..10:1) + AUCs

For each ecoregion parquet file:
  - Convert `fraction` to binary burned label:
        burned = 1 if fraction > 0.50 else 0
  - Create a fixed 10% global test set (stratified by burned)
  - Use remaining 90% as TrainVal pool

  - For each negative:positive ratio R in [1, 2, ..., 10]:
        * Keep ALL positives from TrainVal
        * Sample min(R * n_pos, n_neg) negatives from TrainVal
        * Shuffle to create a TrainVal subset
        * Stratified Train vs Val split inside that subset
        * Train LogisticRegression (L2) on training subset
        * On validation subset:
            - Compute precision–recall curve
            - Choose threshold that maximizes F1
        * On fixed test set:
            - Compute IoU, Precision, Recall, F1 at that threshold
            - Compute ROC AUC + ROC curve
            - Compute PR AUC (Average Precision) + PR curve
        * Save:
            - ROC PNG
            - PR PNG
            - Model .pkl
            - Row in per-ecoregion summary CSV

You can then select the best neg:pos ratio per ecoregion using ROC AUC / PR AUC.
"""

import os
import glob
import numpy as np
import pandas as pd
from tqdm import tqdm

import joblib
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    jaccard_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    average_precision_score,
    roc_curve,
    precision_recall_curve,
)
from sklearn.utils import shuffle as sklearn_shuffle

# ============================================================
# CONFIG
# ============================================================
# Where your per-ecoregion parquet files live
ECO_DIR = "/explore/nobackup/people/spotter5/clelland_fire_ml/parquet_cems_by_ecoregion"

# Where to save outputs
OUT_ROOT = "/explore/nobackup/people/spotter5/clelland_fire_ml/ecoregion_logreg_negpos_auc"
os.makedirs(OUT_ROOT, exist_ok=True)

# Global TrainVal / Test split
TEST_FRAC        = 0.10   # 10% fixed test set
VAL_SIZE_OVERALL = 0.20   # overall fraction of full data used as validation

RANDOM_STATE = 42

# Negative:positive ratios to sweep (negatives per positive in TrainVal subset)
NEG_POS_RATIOS = list(range(1, 11))  # 1:1 up to 10:1

# Columns that should NOT be used as predictors
DROP_COLS = {
    "fraction",
    "burned",
    "year",
    "month",
    "latitude",
    "longitude",
    "bin",      # often present and should not be a predictor
}

# ============================================================
# HELPER: basic cleaning & type coercion
# ============================================================

def prepare_xy(df: pd.DataFrame):
    """
    Return X (predictors) and y (burned) after:
      - fraction -> binary burned (1 if >0.50 else 0)
      - dropping NaN/inf rows
      - coercing predictors to numeric
    """
    if "fraction" not in df.columns:
        raise ValueError("Expected column 'fraction' in ecoregion parquet.")

    df = df.copy()

    # Make fraction 0/1 and set burned target
    df["fraction"] = df["fraction"].astype("float32").clip(0.0, 1.0)
    df["burned"]   = (df["fraction"] > 0.50).astype("uint8")

    # Drop rows with any NaNs / ±inf
    df = df.replace([np.inf, -np.inf], np.nan).dropna(axis=0, how="any")
    if df.empty:
        return None, None, None

    # Predictors = all columns except reserved/drop columns
    predictors = [c for c in df.columns if c not in DROP_COLS]
    if not predictors:
        raise ValueError("No predictor columns left after dropping reserved columns.")

    X = df[predictors].copy()
    y = df["burned"].astype("uint8")

    # Coerce non-numeric predictors to numeric
    coerced = 0
    for c in predictors:
        if not np.issubdtype(X[c].dtype, np.number):
            X[c] = pd.to_numeric(X[c], errors="coerce")
            coerced += 1

    if coerced:
        mask = X.notna().all(axis=1)
        X = X.loc[mask].copy()
        y = y.loc[mask].copy()
        if len(X) == 0:
            return None, None, None

    return X, y, predictors


# ============================================================
# MAIN LOOP OVER ECOREGION PARQUET FILES
# ============================================================

eco_files = sorted(glob.glob(os.path.join(ECO_DIR, "*.parquet")))
if not eco_files:
    raise RuntimeError(f"No parquet files found in {ECO_DIR}")

print(f"Found {len(eco_files)} ecoregion parquet files.")

for eco_file in tqdm(eco_files, desc="Ecoregions"):
    eco_basename = os.path.basename(eco_file)
    eco_name = os.path.splitext(eco_basename)[0]

    eco_out_dir = os.path.join(OUT_ROOT, eco_name)
    eco_models_dir = os.path.join(eco_out_dir, "models")
    eco_figs_dir = os.path.join(eco_out_dir, "figures")
    os.makedirs(eco_out_dir, exist_ok=True)
    os.makedirs(eco_models_dir, exist_ok=True)
    os.makedirs(eco_figs_dir, exist_ok=True)

    print(f"\n=== Ecoregion: {eco_name} ===")
    df_eco = pd.read_parquet(eco_file)

    # Prepare X, y, predictors
    X_full, y_full, predictors = prepare_xy(df_eco)
    if X_full is None or y_full is None:
        print(f"[SKIP] {eco_name}: no valid rows after cleaning/coercion.")
        continue

    # Need both classes present
    cls_counts = y_full.value_counts()
    if set(cls_counts.index.tolist()) != {0, 1}:
        print(f"[SKIP] {eco_name}: only one class present (counts={cls_counts.to_dict()}).")
        continue

    data = X_full.copy()
    data["burned"] = y_full

    # =======================================================
    # GLOBAL TRAINVAL vs TEST SPLIT (FIXED TEST SET)
    # =======================================================
    try:
        idx_trainval, idx_test = train_test_split(
            data.index,
            test_size=TEST_FRAC,
            random_state=RANDOM_STATE,
            stratify=data["burned"],
        )
    except ValueError as e:
        print(f"[SKIP] {eco_name}: could not stratify global split (likely too few positives). Error: {e}")
        continue

    trainval = data.loc[idx_trainval].copy()
    test     = data.loc[idx_test].copy()

    print("Global split sizes:")
    print(f"  Train/Val pool: {len(trainval)}")
    print(f"  Test          : {len(test)}")
    print("Test class counts:", test["burned"].value_counts().to_dict())

    X_test = test[predictors].copy()
    y_test = test["burned"].astype("uint8")

    test_pos = int((y_test == 1).sum())
    test_neg = int((y_test == 0).sum())
    print(f"Test set positives (1): {test_pos}")
    print(f"Test set negatives (0): {test_neg}")

    # Split TrainVal pool into pos/neg
    pos_tv = trainval[trainval["burned"] == 1]
    neg_tv = trainval[trainval["burned"] == 0]
    n_pos_tv, n_neg_tv = len(pos_tv), len(neg_tv)
    print("TrainVal pool class counts:", trainval["burned"].value_counts().to_dict())
    if n_pos_tv == 0 or n_neg_tv == 0:
        print(f"[SKIP] {eco_name}: TrainVal pool has only one class.")
        continue

    summary_rows = []

    # Precompute inner val size (relative to TrainVal subset for each ratio)
    # VAL_SIZE_OVERALL is fraction of full data; convert to fraction of TrainVal
    val_size_inner = VAL_SIZE_OVERALL / (1.0 - TEST_FRAC)

    # =======================================================
    # SWEEP OVER NEG:POS RATIOS (1:1 .. 10:1)
    # =======================================================
    for ratio in NEG_POS_RATIOS:
        # Target negatives for this ratio
        neg_target = int(ratio * n_pos_tv)
        neg_target = max(1, min(neg_target, n_neg_tv))

        eff_ratio = neg_target / max(n_pos_tv, 1)
        print(f"\n--- {eco_name}: neg:pos target ratio = {ratio}:1 "
              f"(effective ~{eff_ratio:.2f}:1, target_neg={neg_target}) ---")

        # Sample negatives for this sweep
        try:
            neg_tv_sample = neg_tv.sample(
                n=neg_target,
                random_state=RANDOM_STATE + ratio,
                replace=False,
            )
        except ValueError as e:
            print(f"[SKIP ratio] {eco_name}, ratio={ratio}: sampling error: {e}")
            continue

        # Combine all positives + sampled negatives
        tv_subset = pd.concat([pos_tv, neg_tv_sample], axis=0)
        tv_subset = sklearn_shuffle(
            tv_subset,
            random_state=RANDOM_STATE + 100 + ratio,
        ).reset_index(drop=True)

        # Check both classes present in subset
        if tv_subset["burned"].nunique() < 2:
            print(f"[SKIP ratio] {eco_name}, ratio={ratio}: only one class in TrainVal subset.")
            continue

        print("TrainVal subset class counts:", tv_subset["burned"].value_counts().to_dict())

        # Train vs Val split within this TrainVal subset
        try:
            train_sub, val_sub = train_test_split(
                tv_subset,
                test_size=val_size_inner,
                random_state=RANDOM_STATE,
                stratify=tv_subset["burned"],
            )
        except ValueError as e:
            print(f"[SKIP ratio] {eco_name}, ratio={ratio}: could not stratify Train/Val split: {e}")
            continue

        X_train = train_sub[predictors].copy()
        y_train = train_sub["burned"].astype("uint8")
        X_val   = val_sub[predictors].copy()
        y_val   = val_sub["burned"].astype("uint8")

        n_pos_train = int((y_train == 1).sum())
        n_neg_train = int((y_train == 0).sum())
        n_pos_val   = int((y_val   == 1).sum())
        n_neg_val   = int((y_val   == 0).sum())
        print(f"Train subset class counts: {{0: {n_neg_train}, 1: {n_pos_train}}}")
        print(f"Val subset class counts  : {{0: {n_neg_val},   1: {n_pos_val}}}")

        # If Train or Val ends up single-class, skip this ratio
        if len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2:
            print(f"[SKIP ratio] {eco_name}, ratio={ratio}: single-class train or val.")
            continue

        # ----------------- TRAIN LOGISTIC REGRESSION MODEL -----------------
        model = make_pipeline(
            StandardScaler(with_mean=True, with_std=True),
            LogisticRegression(
                penalty="l2",
                solver="lbfgs",
                max_iter=1000,
                n_jobs=-1,
                class_weight=None,  # we are controlling class balance via sampling
            )
        )

        model.fit(X_train, y_train)

        # ----------------- VALIDATION PROBABILITIES -----------------
        y_val_proba = model.predict_proba(X_val)[:, 1]

        # Threshold selection on VAL — still useful for reporting,
        # but model comparison should rely on AUCs (threshold-free).
        prec_val, rec_val, thr_val = precision_recall_curve(y_val, y_val_proba)
        prec_ = prec_val[:-1]
        rec_  = rec_val[:-1]
        f1_vals = 2 * prec_ * rec_ / (prec_ + rec_ + 1e-12)
        best_idx = int(np.argmax(f1_vals))
        best_thr = float(thr_val[best_idx])
        best_f1_val = float(f1_vals[best_idx])

        print(f"  Best threshold on val (max F1): {best_thr:.3f}, val F1={best_f1_val:.3f}")

        # ----------------- TEST PROBABILITIES & METRICS -----------------
        y_test_proba = model.predict_proba(X_test)[:, 1]

        # Thresholded metrics at best_thr
        y_test_hat = (y_test_proba >= best_thr).astype("uint8")
        test_iou   = jaccard_score(y_test, y_test_hat, average="binary", zero_division=0)
        test_prec  = precision_score(y_test, y_test_hat, zero_division=0)
        test_rec   = recall_score(y_test, y_test_hat, zero_division=0)
        test_f1    = f1_score(y_test, y_test_hat, zero_division=0)

        # Threshold-free metrics (AUCs) on fixed test set
        fpr, tpr, _ = roc_curve(y_test, y_test_proba)
        test_auc_roc = roc_auc_score(y_test, y_test_proba)

        prec_curve_test, rec_curve_test, _ = precision_recall_curve(y_test, y_test_proba)
        test_auc_pr = average_precision_score(y_test, y_test_proba)

        print("  Test metrics (fixed test set):")
        print(f"    IoU (Jaccard)          : {test_iou:.3f}")
        print(f"    Precision              : {test_prec:.3f}")
        print(f"    Recall                 : {test_rec:.3f}")
        print(f"    F1 Score               : {test_f1:.3f}")
        print(f"    ROC AUC                : {test_auc_roc:.3f}")
        print(f"    PR AUC (Avg Precision) : {test_auc_pr:.3f}")

        # ---- Save ROC curve plot ----
        plt.figure(figsize=(6, 5))
        plt.plot(fpr, tpr, label=f"ROC (AUC = {test_auc_roc:.3f})")
        plt.plot([0, 1], [0, 1], linestyle="--")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title(
            f"{eco_name} — Logistic ROC\n"
            f"neg:pos ≈ {eff_ratio:.2f}:1 (target {ratio}:1)"
        )
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        roc_fig_out = os.path.join(
            eco_figs_dir,
            f"{eco_name}_roc_negpos{ratio:02d}.png"
        )
        plt.savefig(roc_fig_out, dpi=150)
        plt.close()
        print(f"  Saved ROC curve: {roc_fig_out}")

        # ---- Save Precision–Recall curve plot ----
        plt.figure(figsize=(6, 5))
        plt.plot(rec_curve_test, prec_curve_test, label=f"PR (AUC = {test_auc_pr:.3f})")
        plt.xlabel("Recall")
        plt.ylabel("Precision")
        plt.title(
            f"{eco_name} — Logistic PR Curve\n"
            f"neg:pos ≈ {eff_ratio:.2f}:1 (target {ratio}:1)"
        )
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        pr_fig_out = os.path.join(
            eco_figs_dir,
            f"{eco_name}_pr_negpos{ratio:02d}.png"
        )
        plt.savefig(pr_fig_out, dpi=150)
        plt.close()
        print(f"  Saved PR curve: {pr_fig_out}")

        # ----------------- SAVE MODEL -----------------
        model_path = os.path.join(
            eco_models_dir,
            f"{eco_name}_logreg_negpos{ratio:02d}.pkl"
        )
        joblib.dump(model, model_path)
        print(f"  Saved model for neg:pos={ratio}:1 to: {model_path}")

        # ----------------- APPEND SUMMARY ROW -----------------
        summary_rows.append(
            dict(
                ecoregion           = eco_name,
                neg_pos_ratio       = ratio,                 # target ratio
                eff_neg_pos_ratio   = round(eff_ratio, 3),   # realized in TrainVal subset
                n_pos_tv_pool       = int(n_pos_tv),
                n_neg_tv_pool       = int(n_neg_tv),
                n_pos_train         = n_pos_train,
                n_neg_train         = n_neg_train,
                n_pos_val           = n_pos_val,
                n_neg_val           = n_neg_val,
                n_pos_test          = test_pos,
                n_neg_test          = test_neg,
                threshold_val_bestf1= round(best_thr, 4),
                val_f1              = round(best_f1_val, 4),
                test_iou            = round(test_iou, 4),
                test_precision      = round(test_prec, 4),
                test_recall         = round(test_rec, 4),
                test_f1             = round(test_f1, 4),
                test_auc_roc        = round(test_auc_roc, 4),
                test_auc_pr         = round(test_auc_pr, 4),
            )
        )

    # =======================================================
    # SAVE PER-ECOREGION SUMMARY CSV
    # =======================================================
    if summary_rows:
        summary_df = pd.DataFrame(summary_rows)
        summary_csv = os.path.join(
            eco_out_dir,
            f"{eco_name}_logreg_negpos_sweep_auc.csv"
        )
        summary_df.to_csv(summary_csv, index=False)
        print(f"\nSaved summary metrics for {eco_name} to:\n  {summary_csv}")
    else:
        print(f"\nNo valid neg:pos sweeps for {eco_name}; no CSV written.")


In [3]:
't'

't'

Print number of 1's per ecoregion and percent of total. 

In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Read per-ecoregion parquet files and report number of fire pixels (1's)
per ecoregion, plus percent of the total across all ecoregions.

Definition:
    burned = 1 if fraction > 0.50 else 0
"""

import os
import glob
import numpy as np
import pandas as pd
from tqdm import tqdm

# ============================================================
# CONFIG
# ============================================================
# Where your per-ecoregion parquet files live
ECO_DIR = "/explore/nobackup/people/spotter5/clelland_fire_ml/parquet_cems_by_ecoregion"

# Threshold for "fire pixel"
FRACTION_THR = 0.50

# ============================================================
# MAIN
# ============================================================

eco_files = sorted(glob.glob(os.path.join(ECO_DIR, "*.parquet")))
if not eco_files:
    raise RuntimeError(f"No parquet files found in {ECO_DIR}")

print(f"Found {len(eco_files)} ecoregion parquet files.\n")

results = []

for eco_file in tqdm(eco_files, desc="Ecoregions"):
    eco_basename = os.path.basename(eco_file)
    eco_name = os.path.splitext(eco_basename)[0]

    df = pd.read_parquet(eco_file)

    if "fraction" not in df.columns:
        print(f"[WARN] {eco_name}: no 'fraction' column, skipping.")
        continue

    # Clean fraction column a bit
    frac = df["fraction"].astype("float32")
    frac = frac.replace([np.inf, -np.inf], np.nan)

    # burned = 1 if fraction > FRACTION_THR
    burned_mask = frac > FRACTION_THR
    n_burned = int(burned_mask.sum())

    results.append(
        dict(
            ecoregion=eco_name,
            n_fire_pixels=n_burned,
        )
    )

# Convert to DataFrame
if not results:
    raise RuntimeError("No valid ecoregions processed (no results).")

res_df = pd.DataFrame(results)

# Total fire pixels across all ecoregions
total_fire = int(res_df["n_fire_pixels"].sum())

print("\n==============================================")
print(f"Total number of burned pixels (fraction > {FRACTION_THR:.2f}) "
      f"across ALL ecoregions: {total_fire:,}")
print("==============================================\n")

if total_fire == 0:
    print("Total fire pixels across all ecoregions is 0. Nothing more to report.")
else:
    res_df["pct_of_total"] = res_df["n_fire_pixels"] / total_fire * 100.0

    # Sort descending by number of fire pixels
    res_df = res_df.sort_values("n_fire_pixels", ascending=False).reset_index(drop=True)

    print("Per-ecoregion fire pixel counts (fraction > {:.2f}):".format(FRACTION_THR))
    print()

    # Pretty print
    for _, row in res_df.iterrows():
        eco = row["ecoregion"]
        n   = int(row["n_fire_pixels"])
        pct = float(row["pct_of_total"])
        print(f"{eco:40s}  fire_pixels = {n:10,d}  ({pct:6.2f}% of total)")


Found 23 ecoregion parquet files.



Ecoregions: 100%|██████████| 23/23 [00:14<00:00,  1.54it/s]


Total number of burned pixels (fraction > 0.50) across ALL ecoregions: 63,521

Per-ecoregion fire pixel counts (fraction > 0.50):

ecoregion_Central_Taiga                   fire_pixels =     13,400  ( 21.10% of total)
ecoregion_Montane_Boreal                  fire_pixels =     11,981  ( 18.86% of total)
ecoregion_Montane_Sub-Boreal              fire_pixels =      8,049  ( 12.67% of total)
ecoregion_Montane_Sub-Arctic              fire_pixels =      5,181  (  8.16% of total)
ecoregion_SOFTWOOD_SHIELD                 fire_pixels =      4,450  (  7.01% of total)
ecoregion_Northern_Taiga                  fire_pixels =      3,457  (  5.44% of total)
ecoregion_Southern_Taiga                  fire_pixels =      3,114  (  4.90% of total)
ecoregion_Wetlands                        fire_pixels =      2,619  (  4.12% of total)
ecoregion_ALASKA_BOREAL_INTERIOR          fire_pixels =      2,590  (  4.08% of total)
ecoregion_TAIGA_SHIELD                    fire_pixels =      2,275  (  3.58% of total




Per ecoregion xgboost

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Per-ecoregion focal-GBM/XGB — negative-fraction sweep + AUCs,
with IoU-at-best-threshold learning curves.

For each ecoregion parquet file:
  - Convert `fraction` to binary burned label:
        burned = 1 if fraction > 0.50 else 0
  - Create a fixed 10% global test set (stratified by burned)
  - Use remaining 90% as TrainVal pool

  - For each negative fraction f in [1.0, 0.9, ..., 0.1]:
        * Keep ALL positives from TrainVal
        * Sample f * (all negatives in TrainVal)
        * Stratified Train vs Val split inside that subset
        * Train focal-loss model for a fixed NUM_BOOST_ROUNDS trees
        * For each iteration:
            - Predict on val
            - Find best F1 threshold for that iteration
            - Compute IoU on val + train at that threshold
        * Define best_iteration as the iteration with max val IoU
        * Use that iteration & threshold to:
            - Compute final test metrics
            - Plot ROC and PR curves
        * Save, in an ecoregion-specific folder:
            - IoU-at-best-threshold learning curve (train & val)
            - Feature importance PNG
            - ROC curve PNG (with AUC)
            - Precision–Recall curve PNG (with AUPRC)
            - Model file
            - Row in per-ecoregion summary CSV with IoU/F1 + ROC AUC + PR AUC + best_iteration
"""

import os
import glob
import inspect
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

import lightgbm as lgb
import joblib

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    precision_recall_curve,
    jaccard_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    average_precision_score,
    roc_curve,
)

# ============================================================
# CONFIG
# ============================================================

# Where your per-ecoregion parquet files live
ECO_DIR = "/explore/nobackup/people/spotter5/clelland_fire_ml/parquet_cems_by_ecoregion"

# Where to save outputs (per-ecoregion subfolders)
OUT_ROOT = "/explore/nobackup/people/spotter5/clelland_fire_ml/ecoregion_focal_negfrac_auc_thresh"
os.makedirs(OUT_ROOT, exist_ok=True)

RANDOM_STATE = 42

# Global TrainVal / Test split
TEST_FRAC        = 0.10   # 10% fixed test set
VAL_SIZE_OVERALL = 0.20   # overall fraction of full data used as validation

# Fractions of negatives in TrainVal pool to keep (all positives kept)
NEG_FRAC_STEPS = [
    1.0, 0.9, 0.8, 0.7, 0.6,
    0.5, 0.4, 0.3, 0.2, 0.1
]

# Fixed number of boosting rounds (trees)
NUM_BOOST_ROUNDS = 600

TOP_N_IMPORT  = 30
FOCAL_ALPHA   = 0.25
FOCAL_GAMMA   = 2.0

# Columns that should NOT be used as predictors
DROP_COLS = {
    "fraction",
    "burned",
    "year",
    "month",
    "latitude",
    "longitude",
    "bin",
}

LGB_PARAMS = dict(
    boosting_type="gbdt",
    learning_rate=0.05,
    num_leaves=48,
    min_data_in_leaf=100,
    feature_fraction=0.75,
    bagging_fraction=0.75,
    bagging_freq=5,
    lambda_l2=2.0,
    n_jobs=-1,
    metric="aucpr",
)

# ============================================================
# Helpers
# ============================================================

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lgb_has_fobj():
    try:
        sig = inspect.signature(lgb.train)
        return "fobj" in sig.parameters
    except Exception:
        return False

def prepare_xy(df: pd.DataFrame):
    """
    Prepare X (predictors) and y (burned) from an ecoregion parquet:
      - fraction -> binary burned (1 if >0.50 else 0)
      - drop NaN/inf rows
      - coerce predictors to numeric
      - treat 'b1' as categorical if present
    """
    if "fraction" not in df.columns:
        raise ValueError("Expected column 'fraction' in ecoregion parquet.")

    df = df.copy()
    df["fraction"] = df["fraction"].astype("float32").clip(0.0, 1.0)
    df["burned"]   = (df["fraction"] > 0.50).astype("uint8")

    # Drop rows with any NaNs / ±inf
    df = df.replace([np.inf, -np.inf], np.nan).dropna(axis=0, how="any")
    if df.empty:
        return None, None, None

    predictors = [c for c in df.columns if c not in DROP_COLS]
    if not predictors:
        raise ValueError("No predictor columns left after dropping reserved columns.")

    X = df[predictors].copy()
    y = df["burned"].astype("uint8")

    # Handle land cover as categorical if present
    if "b1" in X.columns and not pd.api.types.is_categorical_dtype(X["b1"]):
        X["b1"] = X["b1"].astype("category")

    # Coerce other non-numeric to numeric
    coerced = 0
    for c in predictors:
        if c == "b1" and pd.api.types.is_categorical_dtype(X["b1"]):
            continue
        if not np.issubdtype(X[c].dtype, np.number):
            X[c] = pd.to_numeric(X[c], errors="coerce")
            coerced += 1

    if coerced:
        num_cols = [
            c for c in X.columns
            if not (c == "b1" and pd.api.types.is_categorical_dtype(X["b1"]))
        ]
        mask = X[num_cols].notna().all(axis=1)
        if "b1" in X.columns and pd.api.types.is_categorical_dtype(X["b1"]):
            mask &= X["b1"].notna()
        X = X.loc[mask].copy()
        y = y.loc[mask].copy()
        if len(X) == 0:
            return None, None, None

    return X, y, predictors

# --- FOCAL LOSS for LightGBM (margin -> grad/hess) ---
def focal_loss_lgb(y_pred, dataset):
    y_true = dataset.get_label()
    p = sigmoid(y_pred)
    p = np.clip(p, 1e-7, 1 - 1e-7)
    a, g = FOCAL_ALPHA, FOCAL_GAMMA

    # Stable approximate focal gradients
    grad_pos = a * ((1 - p) ** g) * (g * (-np.log(p)) * (1 - p) - 1) * (p * (1 - p))
    grad_neg = (1 - a) * (p ** g) * (g * (-np.log(1 - p)) * p + 1) * (p * (1 - p))
    grad = np.where(y_true > 0.5, grad_pos, grad_neg)

    # Approximate hessian with logistic hessian
    hess = p * (1 - p)
    return grad, hess

# ============================================================
# MAIN
# ============================================================

eco_files = sorted(glob.glob(os.path.join(ECO_DIR, "*.parquet")))
if not eco_files:
    raise RuntimeError(f"No parquet files found in {ECO_DIR}")

print(f"Found {len(eco_files)} ecoregion parquet files.\n")

USE_LGB_FOBJ = lgb_has_fobj()
if not USE_LGB_FOBJ:
    print("\n[INFO] LightGBM build lacks `fobj` support on train(); "
          "falling back to XGBoost for focal loss where needed.")
    import xgboost as xgb

# VAL_SIZE_OVERALL is fraction of full data; convert to fraction of TrainVal
val_size_inner = VAL_SIZE_OVERALL / (1.0 - TEST_FRAC)

for eco_file in tqdm(eco_files, desc="Ecoregions"):
    eco_basename = os.path.basename(eco_file)
    eco_name = os.path.splitext(eco_basename)[0]

    eco_out_dir   = os.path.join(OUT_ROOT, eco_name)
    eco_models_dir = os.path.join(eco_out_dir, "models")
    eco_figs_dir   = os.path.join(eco_out_dir, "figures")
    os.makedirs(eco_out_dir, exist_ok=True)
    os.makedirs(eco_models_dir, exist_ok=True)
    os.makedirs(eco_figs_dir, exist_ok=True)

    print(f"\n=== Ecoregion: {eco_name} ===")

    df_eco = pd.read_parquet(eco_file)

    # Prepare X, y, predictors
    X_full, y_full, predictors = prepare_xy(df_eco)
    if X_full is None or y_full is None:
        print(f"[SKIP] {eco_name}: no valid rows after cleaning/coercion.")
        continue

    cls_counts = y_full.value_counts()
    if set(cls_counts.index.tolist()) != {0, 1}:
        print(f"[SKIP] {eco_name}: only one class present (counts={cls_counts.to_dict()}).")
        continue

    data = X_full.copy()
    data["burned"] = y_full

    # ----------------- FIXED GLOBAL TEST SPLIT (per ecoregion) -----------------
    try:
        idx_trainval, idx_test = train_test_split(
            data.index,
            test_size=TEST_FRAC,
            random_state=RANDOM_STATE,
            stratify=data["burned"],
        )
    except ValueError as e:
        print(f"[SKIP] {eco_name}: could not stratify global split "
              f"(likely too few positives). Error: {e}")
        continue

    trainval = data.loc[idx_trainval].copy()
    test     = data.loc[idx_test].copy()

    print("Global split sizes:")
    print(f"  Train/Val pool: {len(trainval)}")
    print(f"  Test          : {len(test)}")
    print("Test class counts:", test["burned"].value_counts().to_dict())

    X_test = test[predictors].copy()
    y_test = test["burned"].astype("uint8")

    test_pos = int((y_test == 1).sum())
    test_neg = int((y_test == 0).sum())
    print(f"Test set positives (1): {test_pos}")
    print(f"Test set negatives (0): {test_neg}")

    # Split TrainVal into pos / neg
    pos_tv = trainval[trainval["burned"] == 1]
    neg_tv = trainval[trainval["burned"] == 0]
    n_pos_tv, n_neg_tv = len(pos_tv), len(neg_tv)
    print("TrainVal pool class counts:", trainval["burned"].value_counts().to_dict())
    if n_pos_tv == 0 or n_neg_tv == 0:
        print(f"[SKIP] {eco_name}: TrainVal pool has only one class.")
        continue

    neg_per_pos_initial = n_neg_tv / max(n_pos_tv, 1)
    print(f"Initial neg:pos ratio in TrainVal ~ {neg_per_pos_initial:.2f}:1")

    summary_rows = []

    # ----------------- NEGATIVE FRACTION SWEEP (per ecoregion) -----------------
    for step_idx, frac in enumerate(NEG_FRAC_STEPS):
        frac_pct = int(round(frac * 100))
        print("\n" + "=" * 80)
        print(f"=== {eco_name} — Negative Fraction Step: {frac:.1f} "
              f"(approx {frac_pct}% of TrainVal negatives kept) ===")

        # Number of negatives to sample in TrainVal
        neg_target = int(round(frac * n_neg_tv))
        neg_target = max(1, neg_target)
        eff_ratio = neg_target / max(n_pos_tv, 1)
        print(f"Target negatives in TrainVal subset: {neg_target}")
        print(f"Effective neg:pos ratio in TrainVal subset ~ {eff_ratio:.2f}:1")

        # Sample negatives and combine with ALL positives from TrainVal
        try:
            neg_tv_sample = neg_tv.sample(
                neg_target,
                random_state=RANDOM_STATE + step_idx,
                replace=False,
            )
        except ValueError as e:
            print(f"[SKIP frac] {eco_name}, frac={frac}: sampling error: {e}")
            continue

        tv_subset = (
            pd.concat([pos_tv, neg_tv_sample], axis=0)
            .sample(frac=1.0, random_state=RANDOM_STATE + 100 + step_idx)
            .reset_index(drop=True)
        )

        print("TrainVal subset class counts (for this fraction):",
              tv_subset["burned"].value_counts().to_dict())

        if tv_subset["burned"].nunique() < 2:
            print(f"[SKIP frac] {eco_name}, frac={frac}: only one class in TrainVal subset.")
            continue

        # Train vs Val split within this TrainVal subset
        try:
            train_sub, val_sub = train_test_split(
                tv_subset,
                test_size=val_size_inner,
                random_state=RANDOM_STATE,
                stratify=tv_subset["burned"],
            )
        except ValueError as e:
            print(f"[SKIP frac] {eco_name}, frac={frac}: could not stratify Train/Val split: {e}")
            continue

        X_train = train_sub[predictors].copy()
        y_train = train_sub["burned"].astype("uint8")
        X_val   = val_sub[predictors].copy()
        y_val   = val_sub["burned"].astype("uint8")

        n_pos_train = int((y_train == 1).sum())
        n_neg_train = int((y_train == 0).sum())
        n_pos_val   = int((y_val   == 1).sum())
        n_neg_val   = int((y_val   == 0).sum())
        print(f"Train subset class counts: {{0: {n_neg_train}, 1: {n_pos_train}}}")
        print(f"Val subset class counts  : {{0: {n_neg_val},   1: {n_pos_val}}}")

        if len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2:
            print(f"[SKIP frac] {eco_name}, frac={frac}: single-class train or val.")
            continue

        evals_result = {}
        backend = "lightgbm"
        best_iter = NUM_BOOST_ROUNDS  # will be updated

        # ----------------- TRAIN SINGLE MODEL FOR THIS FRACTION -----------------
        if USE_LGB_FOBJ:
            train_set = lgb.Dataset(X_train, label=y_train)
            val_set   = lgb.Dataset(X_val,   label=y_val, reference=train_set)
            params = LGB_PARAMS.copy()
            params["seed"] = RANDOM_STATE
            params["objective"] = "binary"  # overridden by fobj

            booster = lgb.train(
                params,
                train_set,
                num_boost_round=NUM_BOOST_ROUNDS,
                valid_sets=[train_set, val_set],
                valid_names=["train", "validation"],
                fobj=focal_loss_lgb,
                callbacks=[
                    lgb.log_evaluation(period=50),
                    lgb.record_evaluation(evals_result),
                ]
            )

        else:
            backend = "xgboost"

            def focal_loss_xgb(preds, dtrain):
                y = dtrain.get_label()
                p = sigmoid(preds)
                p = np.clip(p, 1e-7, 1 - 1e-7)
                a, g = FOCAL_ALPHA, FOCAL_GAMMA
                grad_pos = a * ((1 - p) ** g) * (g * (-np.log(p)) * (1 - p) - 1) * (p * (1 - p))
                grad_neg = (1 - a) * (p ** g) * (g * (-np.log(1 - p)) * p + 1) * (p * (1 - p))
                grad = np.where(y > 0.5, grad_pos, grad_neg)
                hess = p * (1 - p)
                return grad, hess

            dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
            dval   = xgb.DMatrix(X_val,   label=y_val,   enable_categorical=True)

            params_xgb = dict(
                booster="gbtree",
                eta=0.05,
                max_depth=0,        # use `max_leaves` with tree_method=hist
                max_leaves=48,
                subsample=0.75,
                colsample_bytree=0.75,
                reg_lambda=2.0,
                tree_method="hist",   # or "gpu_hist" if you want GPU
                objective="reg:logistic",  # overridden by custom obj
                eval_metric="aucpr",
                seed=RANDOM_STATE,
                nthread=-1,
            )

            evals = [(dtrain, "train"), (dval, "validation")]
            booster = xgb.train(
                params_xgb,
                dtrain,
                num_boost_round=NUM_BOOST_ROUNDS,
                evals=evals,
                obj=focal_loss_xgb,
                verbose_eval=50,
                evals_result=evals_result,
            )

        # ----------------- IoU-at-best-threshold LEARNING CURVES -----------------
        train_iou_curve = []
        val_iou_curve   = []
        thr_curve       = []

        print("\nComputing IoU-at-best-threshold curves across iterations...")
        for it in range(1, NUM_BOOST_ROUNDS + 1):
            if USE_LGB_FOBJ:
                val_margin   = booster.predict(X_val,   num_iteration=it)
                train_margin = booster.predict(X_train, num_iteration=it)
            else:
                dval_it   = xgb.DMatrix(X_val,   enable_categorical=True)
                dtrain_it = xgb.DMatrix(X_train, enable_categorical=True)
                val_margin   = booster.predict(dval_it,   iteration_range=(0, it))
                train_margin = booster.predict(dtrain_it, iteration_range=(0, it))

            val_proba   = sigmoid(val_margin)
            train_proba = sigmoid(train_margin)

            # Find best threshold on VAL for this iteration (max F1)
            prec_val, rec_val, thr_val = precision_recall_curve(y_val, val_proba)
            prec_ = prec_val[:-1]
            rec_  = rec_val[:-1]
            f1_vals = 2 * prec_ * rec_ / (prec_ + rec_ + 1e-12)
            best_idx_it = int(np.argmax(f1_vals))
            best_thr_it = float(thr_val[best_idx_it])

            thr_curve.append(best_thr_it)

            # IoU on VAL at its best threshold for this iteration
            y_val_hat_it = (val_proba >= best_thr_it).astype("uint8")
            val_iou_it = jaccard_score(y_val, y_val_hat_it, average="binary", zero_division=0)
            val_iou_curve.append(val_iou_it)

            # IoU on TRAIN at same threshold
            y_train_hat_it = (train_proba >= best_thr_it).astype("uint8")
            train_iou_it = jaccard_score(y_train, y_train_hat_it, average="binary", zero_division=0)
            train_iou_curve.append(train_iou_it)

        train_iou_curve = np.array(train_iou_curve)
        val_iou_curve   = np.array(val_iou_curve)
        thr_curve       = np.array(thr_curve)

        # Best iteration = argmax val IoU curve (1-based)
        best_iter_idx = int(np.argmax(val_iou_curve))
        best_iter     = int(best_iter_idx + 1)
        best_thr      = float(thr_curve[best_iter_idx])
        best_val_iou  = float(val_iou_curve[best_iter_idx])

        print(f"\nBest iteration based on val IoU-at-best-threshold: {best_iter}")
        print(f"  Val IoU at best_iter : {best_val_iou:.3f}")
        print(f"  Threshold at best_iter (val F1-optimal): {best_thr:.3f}")

        # Plot IoU learning curves
        plt.figure(figsize=(10, 6))
        plt.plot(train_iou_curve, label="Train IoU (best thr per iter)")
        plt.plot(val_iou_curve,   label="Validation IoU (best thr per iter)")
        plt.axvline(best_iter_idx, linestyle="--", label=f"best_iter={best_iter}")
        plt.xlabel("Boosting Rounds")
        plt.ylabel("IoU (Jaccard)")
        plt.title(
            f"{eco_name} — Focal-{backend.upper()} IoU-at-best-threshold\n"
            f"Neg fraction={frac:.1f} (≈{eff_ratio:.2f}:1 in TrainVal subset)"
        )
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        iou_fig_out = os.path.join(
            eco_figs_dir,
            f"{eco_name}_iou_curve_bestthr_focal_negfrac{frac_pct:03d}.png"
        )
        plt.savefig(iou_fig_out, dpi=150)
        plt.close()
        print(f"Saved IoU-at-best-threshold learning curve: {iou_fig_out}")

        # ----------------- FINAL TEST METRICS USING best_iter & best_thr -----------------
        if USE_LGB_FOBJ:
            test_margin = booster.predict(X_test, num_iteration=best_iter)
        else:
            dtest = xgb.DMatrix(X_test, enable_categorical=True)
            test_margin = booster.predict(dtest, iteration_range=(0, best_iter))

        y_test_proba = sigmoid(test_margin)
        y_test_hat   = (y_test_proba >= best_thr).astype("uint8")

        test_iou  = jaccard_score(y_test, y_test_hat, average="binary", zero_division=0)
        test_prec = precision_score(y_test, y_test_hat, zero_division=0)
        test_rec  = recall_score(y_test, y_test_hat, zero_division=0)
        test_f1   = f1_score(y_test, y_test_hat, zero_division=0)

        # Threshold-free metrics (AUCs) on fixed test set
        fpr, tpr, roc_thr = roc_curve(y_test, y_test_proba)
        test_auc_roc = roc_auc_score(y_test, y_test_proba)

        prec_curve_test, rec_curve_test, pr_thr_test = precision_recall_curve(y_test, y_test_proba)
        test_auc_pr = average_precision_score(y_test, y_test_proba)

        print("\n==== FINAL TEST METRICS (focal, fixed global test set) ====")
        print(f"Ecoregion                : {eco_name}")
        print(f"Neg fraction (TrainVal)  : {frac:.1f} (≈{eff_ratio:.2f}:1)")
        print(f"Best iteration (val IoU) : {best_iter}")
        print(f"Threshold (val best F1)  : {best_thr:.3f}")
        print(f"IoU (Jaccard)            : {test_iou:.3f}")
        print(f"Precision                : {test_prec:.3f}")
        print(f"Recall                   : {test_rec:.3f}")
        print(f"F1 Score                 : {test_f1:.3f}")
        print(f"ROC AUC                  : {test_auc_roc:.3f}")
        print(f"PR AUC (Avg Precision)   : {test_auc_pr:.3f}")

        # ---- Save ROC curve plot ----
        plt.figure(figsize=(6, 5))
        plt.plot(fpr, tpr, label=f"ROC (AUC = {test_auc_roc:.3f})")
        plt.plot([0, 1], [0, 1], linestyle="--")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title(
            f"{eco_name} — Focal-{backend.upper()} ROC\n"
            f"Neg fraction={frac:.1f} (≈{eff_ratio:.2f}:1 in TrainVal subset)"
        )
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        roc_fig_out = os.path.join(
            eco_figs_dir,
            f"{eco_name}_roc_curve_focal_negfrac{frac_pct:03d}.png"
        )
        plt.savefig(roc_fig_out, dpi=150)
        plt.close()
        print(f"Saved ROC curve: {roc_fig_out}")

        # ---- Save Precision–Recall curve plot ----
        plt.figure(figsize=(6, 5))
        plt.plot(rec_curve_test, prec_curve_test, label=f"PR (AUC = {test_auc_pr:.3f})")
        plt.xlabel("Recall")
        plt.ylabel("Precision")
        plt.title(
            f"{eco_name} — Focal-{backend.upper()} PR Curve\n"
            f"Neg fraction={frac:.1f} (≈{eff_ratio:.2f}:1 in TrainVal subset)"
        )
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        pr_fig_out = os.path.join(
            eco_figs_dir,
            f"{eco_name}_pr_curve_focal_negfrac{frac_pct:03d}.png"
        )
        plt.savefig(pr_fig_out, dpi=150)
        plt.close()
        print(f"Saved Precision–Recall curve: {pr_fig_out}")

        # ----------------- FEATURE IMPORTANCE -----------------
        if USE_LGB_FOBJ:
            gain_imp = booster.feature_importance(importance_type="gain")
            feat_names = np.array(X_train.columns)
        else:
            fmap = booster.get_score(importance_type="gain")
            feat_names = np.array(X_train.columns)
            gain_imp = np.array(
                [fmap.get(f"f{i}", 0.0) for i in range(len(feat_names))],
                dtype=float,
            )

        gain_imp = gain_imp / (gain_imp.sum() + 1e-12)
        order = np.argsort(gain_imp)[::-1][:TOP_N_IMPORT]

        plt.figure(figsize=(9, max(5, 0.28 * len(order))))
        plt.barh(feat_names[order][::-1], gain_imp[order][::-1])
        plt.xlabel("Relative Gain Importance")
        plt.title(
            f"{eco_name} — Focal-{backend.upper()} Feature Importance (Top {len(order)})\n"
            f"Neg fraction={frac:.1f} (≈{eff_ratio:.2f}:1 in TrainVal subset)"
        )
        plt.tight_layout()
        fi_fig_out = os.path.join(
            eco_figs_dir,
            f"{eco_name}_feature_importance_focal_negfrac{frac_pct:03d}.png"
        )
        plt.savefig(fi_fig_out, dpi=150)
        plt.close()
        print(f"Saved focal feature importance plot: {fi_fig_out}")

        # ----------------- SAVE MODEL -----------------
        model_path = os.path.join(
            eco_models_dir,
            f"{eco_name}_focal_model_negfrac{frac_pct:03d}_{backend}.pkl"
        )
        joblib.dump(booster, model_path)
        print(f"Saved model for {eco_name}, neg fraction {frac:.1f} to: {model_path}")

        # ----------------- APPEND SUMMARY ROW -----------------
        summary_rows.append(
            dict(
                ecoregion        = eco_name,
                neg_fraction     = frac,
                neg_fraction_pct = frac_pct,
                eff_neg_pos_ratio= round(eff_ratio, 3),
                n_pos_train      = int((train_sub["burned"] == 1).sum()),
                n_neg_train      = int((train_sub["burned"] == 0).sum()),
                n_pos_val        = n_pos_val,
                n_neg_val        = n_neg_val,
                n_pos_test       = test_pos,
                n_neg_test       = test_neg,
                focal_alpha      = FOCAL_ALPHA,
                focal_gamma      = FOCAL_GAMMA,
                threshold        = round(best_thr, 3),
                val_iou_best     = round(best_val_iou, 3),
                test_iou         = round(test_iou, 3),
                test_precision   = round(test_prec, 3),
                test_recall      = round(test_rec, 3),
                test_f1          = round(test_f1, 3),
                test_auc_roc     = round(test_auc_roc, 3),
                test_auc_pr      = round(test_auc_pr, 3),
                best_iteration   = int(best_iter),
                backend          = backend,
            )
        )

    # ----------------- SAVE PER-ECOREGION SUMMARY CSV -----------------
    if summary_rows:
        summary_df = pd.DataFrame(summary_rows)
        summary_csv = os.path.join(
            eco_out_dir,
            f"{eco_name}_focal_neg_fraction_sweep_metrics_auc_thresh.csv"
        )
        summary_df.to_csv(summary_csv, index=False)
        print(f"\nSaved focal neg-fraction sweep summary for {eco_name} to:\n  {summary_csv}")
    else:
        print(f"\nNo valid focal neg-fraction sweeps for {eco_name}; no CSV written.")

print("\n✅ Done. Per-ecoregion focal neg-fraction sweeps complete.")


  from .autonotebook import tqdm as notebook_tqdm


Found 23 ecoregion parquet files.


[INFO] LightGBM build lacks `fobj` support on train(); falling back to XGBoost for focal loss where needed.


Ecoregions:   0%|          | 0/23 [00:00<?, ?it/s]


=== Ecoregion: ecoregion_ALASKA_BOREAL_INTERIOR ===
Global split sizes:
  Train/Val pool: 5460372
  Test          : 606708
Test class counts: {0: 606449, 1: 259}
Test set positives (1): 259
Test set negatives (0): 606449
TrainVal pool class counts: {0: 5458041, 1: 2331}
Initial neg:pos ratio in TrainVal ~ 2341.50:1

=== ecoregion_ALASKA_BOREAL_INTERIOR — Negative Fraction Step: 1.0 (approx 100% of TrainVal negatives kept) ===
Target negatives in TrainVal subset: 5458041
Effective neg:pos ratio in TrainVal subset ~ 2341.50:1
TrainVal subset class counts (for this fraction): {0: 5458041, 1: 2331}
Train subset class counts: {0: 4245143, 1: 1813}
Val subset class counts  : {0: 1212898,   1: 518}
[0]	train-aucpr:0.00043	validation-aucpr:0.00043
[50]	train-aucpr:0.00043	validation-aucpr:0.00043
[100]	train-aucpr:0.00043	validation-aucpr:0.00043
[150]	train-aucpr:0.00043	validation-aucpr:0.00043
[200]	train-aucpr:0.00023	validation-aucpr:0.00022
[250]	train-aucpr:0.00022	validation-aucpr:0.0

Ecoregions:   4%|▍         | 1/23 [2:09:02<47:18:58, 7742.66s/it]

Saved focal feature importance plot: /explore/nobackup/people/spotter5/clelland_fire_ml/ecoregion_focal_negfrac_auc_thresh/ecoregion_ALASKA_BOREAL_INTERIOR/figures/ecoregion_ALASKA_BOREAL_INTERIOR_feature_importance_focal_negfrac010.png
Saved model for ecoregion_ALASKA_BOREAL_INTERIOR, neg fraction 0.1 to: /explore/nobackup/people/spotter5/clelland_fire_ml/ecoregion_focal_negfrac_auc_thresh/ecoregion_ALASKA_BOREAL_INTERIOR/models/ecoregion_ALASKA_BOREAL_INTERIOR_focal_model_negfrac010_xgboost.pkl

Saved focal neg-fraction sweep summary for ecoregion_ALASKA_BOREAL_INTERIOR to:
  /explore/nobackup/people/spotter5/clelland_fire_ml/ecoregion_focal_negfrac_auc_thresh/ecoregion_ALASKA_BOREAL_INTERIOR/ecoregion_ALASKA_BOREAL_INTERIOR_focal_neg_fraction_sweep_metrics_auc_thresh.csv

=== Ecoregion: ecoregion_ALASKA_TUNDRA ===
Global split sizes:
  Train/Val pool: 4567546
  Test          : 507506
Test class counts: {0: 507497, 1: 9}
Test set positives (1): 9
Test set negatives (0): 507497
TrainV

Ecoregions:   9%|▊         | 2/23 [3:36:59<36:42:18, 6292.31s/it]

Saved focal feature importance plot: /explore/nobackup/people/spotter5/clelland_fire_ml/ecoregion_focal_negfrac_auc_thresh/ecoregion_ALASKA_TUNDRA/figures/ecoregion_ALASKA_TUNDRA_feature_importance_focal_negfrac010.png
Saved model for ecoregion_ALASKA_TUNDRA, neg fraction 0.1 to: /explore/nobackup/people/spotter5/clelland_fire_ml/ecoregion_focal_negfrac_auc_thresh/ecoregion_ALASKA_TUNDRA/models/ecoregion_ALASKA_TUNDRA_focal_model_negfrac010_xgboost.pkl

Saved focal neg-fraction sweep summary for ecoregion_ALASKA_TUNDRA to:
  /explore/nobackup/people/spotter5/clelland_fire_ml/ecoregion_focal_negfrac_auc_thresh/ecoregion_ALASKA_TUNDRA/ecoregion_ALASKA_TUNDRA_focal_neg_fraction_sweep_metrics_auc_thresh.csv

=== Ecoregion: ecoregion_ARCTIC_CORDILLERA ===


Ecoregions:  13%|█▎        | 3/23 [3:37:01<19:00:02, 3420.12s/it]

[SKIP] ecoregion_ARCTIC_CORDILLERA: only one class present (counts={0: 2145480}).

=== Ecoregion: ecoregion_Arctic_Deserts_and_Tundra ===
Global split sizes:
  Train/Val pool: 13338615
  Test          : 1482069
Test class counts: {0: 1482043, 1: 26}
Test set positives (1): 26
Test set negatives (0): 1482043
TrainVal pool class counts: {0: 13338382, 1: 233}
Initial neg:pos ratio in TrainVal ~ 57246.27:1

=== ecoregion_Arctic_Deserts_and_Tundra — Negative Fraction Step: 1.0 (approx 100% of TrainVal negatives kept) ===
Target negatives in TrainVal subset: 13338382
Effective neg:pos ratio in TrainVal subset ~ 57246.27:1
TrainVal subset class counts (for this fraction): {0: 13338382, 1: 233}
Train subset class counts: {0: 10374297, 1: 181}
Val subset class counts  : {0: 2964085,   1: 52}
[0]	train-aucpr:0.00002	validation-aucpr:0.00002
[50]	train-aucpr:0.00002	validation-aucpr:0.00002
[100]	train-aucpr:0.00002	validation-aucpr:0.00002


In [None]:
't'