In [None]:
Tune light gbm

In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Tune XGBoost hyperparameters with K-fold CV (focal loss, PR-AUC):

- Loads the same balanced 10x parquet:
    cems_with_fraction_balanced_10x.parquet

- Creates burned label (fraction > 0.5).

- Reserves fixed 10% global test set (NOT used for tuning).
  Tuning is done only on the remaining 90% TrainVal data.

- Runs K-fold CV on the TrainVal set for a small grid of XGBoost params.
- Uses focal loss via custom obj and PR-AUC (average_precision_score) as metric.
- Saves best params to JSON so they can be reused in the neg-fraction sweep.
"""

import os
import json
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import average_precision_score

# ----------------- CONFIG -----------------
PARQUET_IN   = "/explore/nobackup/people/spotter5/clelland_fire_ml/ml_training/cems_with_fraction_balanced_10x.parquet"
RANDOM_STATE = 42
N_FOLDS      = 10
NUM_BOOST_ROUNDS = 600   # should match your main training script

FOCAL_ALPHA  = 0.25
FOCAL_GAMMA  = 2.0

OUT_ROOT = "/explore/nobackup/people/spotter5/clelland_fire_ml/ml_training/neg_ratio_experiments_globaltest"
OUT_DIR  = os.path.join(OUT_ROOT, "option4_focal_loss_10x_negative_auc_thresh")
os.makedirs(OUT_DIR, exist_ok=True)

BEST_PARAMS_JSON = os.path.join(OUT_DIR, "tuned_xgb_focal_params.json")

# ----------------- Helpers -----------------
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


def focal_loss_xgb(preds, dtrain):
    """
    Custom focal loss for XGBoost using raw logits.

    preds: raw scores (logits) from the model
    dtrain: xgb.DMatrix
    """
    y = dtrain.get_label()
    p = sigmoid(preds)
    p = np.clip(p, 1e-7, 1 - 1e-7)
    a, g = FOCAL_ALPHA, FOCAL_GAMMA

    # Focal loss grad for positives and negatives
    grad_pos = a * ((1 - p) ** g) * (g * (-np.log(p)) * (1 - p) - 1) * (p * (1 - p))
    grad_neg = (1 - a) * (p ** g) * (g * (-np.log(1 - p)) * p + 1) * (p * (1 - p))
    grad = np.where(y > 0.5, grad_pos, grad_neg)

    # Approximate Hessian by logistic Hessian
    hess = p * (1 - p)
    return grad, hess


# ----------------- LOAD & PREP -----------------
print(f"Loading parquet: {PARQUET_IN}")
df = pd.read_parquet(PARQUET_IN)
if "fraction" not in df.columns:
    raise ValueError("Expected column 'fraction' in dataset.")

df["fraction"] = df["fraction"].astype("float32").clip(0, 1)
before = len(df)
df = df.replace([np.inf, -np.inf], np.nan).dropna(axis=0, how="any").copy()
print(f"Dropped {before - len(df):,} rows with NaNs/±inf; {len(df):,} remain.")

# Label: burned = 1 if fraction > 0.5, else 0
df["burned"] = (df["fraction"] > 0.5).astype(np.uint8)

print("\nClass counts (burned label):")
print(df["burned"].value_counts(dropna=False))
print(df["burned"].value_counts(normalize=True).mul(100))

# Same predictor selection as your main script
drop_cols = {"fraction", "burned", "bin", "year", "month", "latitude", "longitude"}
predictors = [c for c in df.columns if c not in drop_cols]

X_full = df[predictors].copy()
y_full = df["burned"].astype(np.uint8)

# Treat land cover as categorical if present
if "b1" in X_full.columns and not pd.api.types.is_categorical_dtype(X_full["b1"]):
    X_full["b1"] = X_full["b1"].astype("category")
    print("\nTreating 'b1' as pandas 'category'.")

# Coerce any non-numeric predictors (except categorical b1) to numeric
coerced = 0
for c in X_full.columns:
    if c == "b1" and pd.api.types.is_categorical_dtype(X_full[c]):
        continue
    if not np.issubdtype(X_full[c].dtype, np.number):
        X_full[c] = pd.to_numeric(X_full[c], errors="coerce")
        coerced += 1

if coerced:
    pre = len(X_full)
    num_cols = [
        c for c in X_full.columns
        if not (c == "b1" and pd.api.types.is_categorical_dtype(X_full["b1"]))
    ]
    mask = X_full[num_cols].notna().all(axis=1)
    if "b1" in X_full.columns and pd.api.types.is_categorical_dtype(X_full["b1"]):
        mask &= X_full["b1"].notna()
    X_full = X_full.loc[mask].copy()
    y_full = y_full.loc[X_full.index]
    print(f"Coerced {coerced} column(s); dropped {pre - len(X_full):,} rows after coercion.")

print(f"\nFinal tuning dataset size: {len(X_full):,} rows")
print(f"Number of predictors: {len(X_full.columns)}")

# ----------------- GLOBAL TEST SPLIT (reserved, not used for tuning) -----------------
idx_trainval, idx_test = train_test_split(
    X_full.index,
    test_size=0.10,
    random_state=RANDOM_STATE,
    stratify=y_full
)
X_tv, y_tv = X_full.loc[idx_trainval], y_full.loc[idx_trainval]
X_test_holdout = X_full.loc[idx_test]
y_test_holdout = y_full.loc[idx_test]

print(f"\nTrainVal size for tuning: {len(X_tv):,} rows")
print(f"Global test (held out, unused here): {len(X_test_holdout):,} rows")
print("Global test class distribution:")
print(y_test_holdout.value_counts())
print(y_test_holdout.value_counts(normalize=True).mul(100))

# ----------------- PARAM GRID -----------------
# You can expand this list for a broader search.
param_grid = [
    dict(
        eta=0.05,
        max_depth=0,        # use max_leaves with hist/gpu_hist
        max_leaves=32,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        scale_pos_weight=5.0,
    ),
    dict(
        eta=0.05,
        max_depth=0,
        max_leaves=48,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=2.0,
        scale_pos_weight=10.0,
    ),
    dict(
        eta=0.05,
        max_depth=0,
        max_leaves=64,
        subsample=0.7,
        colsample_bytree=0.7,
        reg_lambda=2.0,
        scale_pos_weight=10.0,
    ),
]

base_params = dict(
    booster="gbtree",
    tree_method="gpu_hist",     # change to "hist" if GPU not available
    predictor="gpu_predictor",  # or "auto" / "cpu_predictor"
    objective="binary:logitraw",  # raw logits, focal_loss_xgb will apply sigmoid
    eval_metric="aucpr",          # PR-AUC as built-in eval metric
    seed=RANDOM_STATE,
    nthread=-1,
)

skf = StratifiedKFold(
    n_splits=N_FOLDS,
    shuffle=True,
    random_state=RANDOM_STATE
)

best_params = None
best_score = -np.inf

print(f"\nStarting {N_FOLDS}-fold CV hyperparam search over {len(param_grid)} configs...")
for i, cfg in enumerate(param_grid, start=1):
    print("\n" + "=" * 80)
    print(f"Config {i}/{len(param_grid)}: {cfg}")
    fold_scores = []

    for fold_idx, (tr_idx, va_idx) in enumerate(skf.split(X_tv, y_tv), start=1):
        X_tr, X_va = X_tv.iloc[tr_idx], X_tv.iloc[va_idx]
        y_tr, y_va = y_tv.iloc[tr_idx], y_tv.iloc[va_idx]

        dtrain = xgb.DMatrix(X_tr, label=y_tr, enable_categorical=True)
        dval   = xgb.DMatrix(X_va, label=y_va, enable_categorical=True)

        params = base_params.copy()
        params.update(cfg)

        evals = [(dval, "validation")]

        booster = xgb.train(
            params,
            dtrain,
            num_boost_round=NUM_BOOST_ROUNDS,
            evals=evals,
            obj=focal_loss_xgb,
            verbose_eval=False,
        )

        va_margin = booster.predict(dval)  # raw logits
        va_proba  = sigmoid(va_margin)
        ap = average_precision_score(y_va, va_proba)
        fold_scores.append(ap)
        print(f"  Fold {fold_idx}: AUPRC={ap:.4f}")

    mean_ap = float(np.mean(fold_scores))
    print(f"Mean AUPRC for config {i}: {mean_ap:.4f}")

    if mean_ap > best_score:
        best_score = mean_ap
        best_params = cfg.copy()

print("\nBest config based on CV AUPRC:")
print(best_params)
print(f"Best mean AUPRC: {best_score:.4f}")

# Merge with base params and save
final_params = base_params.copy()
final_params.update(best_params)
final_params["num_boost_rounds"] = NUM_BOOST_ROUNDS
final_params["focal_alpha"] = FOCAL_ALPHA
final_params["focal_gamma"] = FOCAL_GAMMA

with open(BEST_PARAMS_JSON, "w") as f:
    json.dump(dict(final_params), f, indent=2)

print(f"\nSaved tuned XGBoost focal params to: {BEST_PARAMS_JSON}")


Loading parquet: /explore/nobackup/people/spotter5/clelland_fire_ml/ml_training/cems_with_fraction_balanced_10x.parquet
Dropped 1,781,773 rows with NaNs/±inf; 2,569,673 remain.

Class counts (burned label):
0    2354966
1     214707
Name: burned, dtype: int64
0    91.644579
1     8.355421
Name: burned, dtype: float64

Treating 'b1' as pandas 'category'.

Final tuning dataset size: 2,569,673 rows
Number of predictors: 15

TrainVal size for tuning: 2,312,705 rows
Global test (held out, unused here): 256,968 rows
Global test class distribution:
0    235497
1     21471
Name: burned, dtype: int64
0    91.644485
1     8.355515
Name: burned, dtype: float64

Starting 10-fold CV hyperparam search over 3 configs...

Config 1/3: {'eta': 0.05, 'max_depth': 0, 'max_leaves': 32, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_lambda': 1.0, 'scale_pos_weight': 5.0}



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 1: AUPRC=0.2286



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 2: AUPRC=0.2321



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 3: AUPRC=0.2278



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 4: AUPRC=0.2221



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 5: AUPRC=0.2304



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 6: AUPRC=0.2285



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 7: AUPRC=0.2230



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 8: AUPRC=0.2287



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 9: AUPRC=0.2337



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 10: AUPRC=0.2225
Mean AUPRC for config 1: 0.2277

Config 2/3: {'eta': 0.05, 'max_depth': 0, 'max_leaves': 48, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_lambda': 2.0, 'scale_pos_weight': 10.0}



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 1: AUPRC=0.2373



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 2: AUPRC=0.2384



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 3: AUPRC=0.2392



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 4: AUPRC=0.2330



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 5: AUPRC=0.2388



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 6: AUPRC=0.2402



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 7: AUPRC=0.2309



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 8: AUPRC=0.2389



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 9: AUPRC=0.2482



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 10: AUPRC=0.2336
Mean AUPRC for config 2: 0.2379

Config 3/3: {'eta': 0.05, 'max_depth': 0, 'max_leaves': 64, 'subsample': 0.7, 'colsample_bytree': 0.7, 'reg_lambda': 2.0, 'scale_pos_weight': 10.0}



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 1: AUPRC=0.2307



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 2: AUPRC=0.2314



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 3: AUPRC=0.2252



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 4: AUPRC=0.2271



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 5: AUPRC=0.2312



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 6: AUPRC=0.2324



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 7: AUPRC=0.2236



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 8: AUPRC=0.2232



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



  Fold 9: AUPRC=0.2361



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.



  Fold 10: AUPRC=0.2299
Mean AUPRC for config 3: 0.2291

Best config based on CV AUPRC:
{'eta': 0.05, 'max_depth': 0, 'max_leaves': 48, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_lambda': 2.0, 'scale_pos_weight': 10.0}
Best mean AUPRC: 0.2379

Saved tuned XGBoost focal params to: /explore/nobackup/people/spotter5/clelland_fire_ml/ml_training/neg_ratio_experiments_globaltest/option4_focal_loss_10x_negative_auc_thresh/tuned_xgb_focal_params.json



    E.g. tree_method = "hist", device = "cuda"



In [3]:
't'

't'