In [3]:
#!/usr/bin/env python
# coding: utf-8

import os
import pandas as pd
import geopandas as gpd

# Read the three input tables and report their shapes.
#   df1: combustion-model predictor table
#   df2: LCC field burn-depth table (its 'burnDepthField' column is renamed)
#   df3: LCC predictor table
df1 = pd.read_csv("/explore/nobackup/people/spotter5/new_combustion/2025-10-03_CombustionModelPredictors.csv")
df2 = pd.read_csv("/explore/nobackup/people/spotter5/new_combustion/2025-11-18_LCC_burnDepthField.csv")
# The original line closed the dict literal with ')' instead of '}', which is
# a SyntaxError and prevented the whole cell from running.
df2 = df2.rename(columns={'burnDepthField': 'burn_depth'})
df3 = pd.read_csv("/explore/nobackup/people/spotter5/new_combustion/2025-10-28_LCC_CombustionModelPredictors.csv")
print(df1.shape)
print(df2.shape)
print(df3.shape)


(1281, 60)
(100, 2)
(98, 57)


In [15]:
import os
import pandas as pd
import geopandas as gpd  # not used yet, but kept since you imported it

# ----------------- Input tables -----------------
# Main predictor table; rows without a burn_depth value are discarded up front.
df1 = pd.read_csv("/explore/nobackup/people/spotter5/new_combustion/2025-10-03_CombustionModelPredictors.csv")
df1 = df1.dropna(subset='burn_depth')

# LCC field burn depth, with the depth column renamed to 'burn_depth'.
df2 = pd.read_csv(
    "/explore/nobackup/people/spotter5/new_combustion/2025-11-18_LCC_burnDepthField.csv"
).rename(columns={'burnDepthField': 'burn_depth'})

# LCC predictor table.
df3 = pd.read_csv("/explore/nobackup/people/spotter5/new_combustion/2025-10-28_LCC_CombustionModelPredictors.csv")

# ----------------- Attach burn depth to the LCC predictors -----------------
# Inner join on ID: only IDs present in BOTH df3 and df2 survive.
# Only the ID and burn_depth columns of df2 take part in the join.
df_lcc = df3.merge(df2.loc[:, ['ID', 'burn_depth']], how='inner', on='ID')

# ----------------- Stack with the main table, then drop rows lacking burn_depth -----------------
df_all = pd.concat([df1, df_lcc], ignore_index=True, sort=False)
df_all = df_all.dropna(subset=['burn_depth'])

# ----------------- Report shapes -----------------
print("Original df1 shape:", df1.shape)
print("New combined dataframe shape (after join/concat/dropna):", df_all.shape)


#!/usr/bin/env python
# coding: utf-8

import os
import json
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from collections import defaultdict, Counter
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import LeaveOneOut, KFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from scipy.stats import randint, uniform
import joblib

# ============================================================
# CONFIG
# ============================================================
# Assume df_all already exists in memory from your previous join/concat.
# If not, you could recreate it here and then:
# df = df_all.copy()

# Output locations; both directories are created if they do not exist yet.
OUT_DIR   = "/explore/nobackup/people/spotter5/new_combustion/LCC"
MODEL_DIR = os.path.join(OUT_DIR, "models")
os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

# Appended to the name of every output file and saved model.
SUFFIX = "_LCC_11_18"

# ----------------- Hyper-parameter search settings -----------------
RANDOM_STATE   = 42
N_JOBS         = -1
INNER_FOLDS    = 5     # upper bound on K for the K-fold CV inside the search
N_ITER_SEARCH  = 40    # number of sampled hyper-parameter candidates
# Negated MSE, so that "greater is better" holds for the search.
SCORER         = make_scorer(mean_squared_error, greater_is_better=False)

# Sampling distributions for the random-forest hyper-parameters.
RF_PARAM_DIST = dict(
    n_estimators=randint(low=200, high=1000),
    max_depth=randint(low=3, high=40),
    # uniform(loc, scale) samples from [loc, loc + scale] = [0.2, 1.0]:
    # the fraction of features considered at each split.
    max_features=uniform(loc=0.2, scale=0.8),
    min_samples_split=randint(low=2, high=20),
    min_samples_leaf=randint(low=1, high=10),
    bootstrap=[True, False],
)

# ============================================================
# LOAD / PREP DATAFRAME
# ============================================================
# Use df_all from previous step
# Work on a copy so df_all from the previous step is left untouched.
df = df_all.copy()

# ----------------- Column-name normalisation -----------------
# Trim surrounding whitespace from every column name.
df.columns = [name.strip() for name in df.columns]

# 'ID' / 'Id' are always mapped to 'id'.
rename_map = {alias: 'id' for alias in ('ID', 'Id') if alias in df.columns}

# The remaining aliases are mapped only when the canonical name is not
# already present in the frame.
rename_map.update({
    old: new
    for old, new in (
        ('project_name', 'project.name'),
        ('Date', 'date'),
        ('latitude', 'lat'),
        ('longitude', 'lon'),
        ('fireYr', 'burn_year'),
    )
    if old in df.columns and new not in df.columns
})
df = df.rename(columns=rename_map)

# Per-column summary (name, dtype, null count, distinct non-null values),
# written next to the other outputs.
schema = pd.DataFrame(
    {
        "column": df.columns,
        "dtype": df.dtypes.astype(str),
        "n_null": df.isnull().sum(),
        "n_unique": df.nunique(dropna=True),
    }
)
schema.to_csv(os.path.join(OUT_DIR, "schema_summary" + SUFFIX + ".csv"), index=False)

# ----------------- CATEGORICAL: LandCover -> one-hot -----------------
# The first category is dropped (drop_first=True) and acts as the baseline.
# dtype=float is required: since pandas 2.0 get_dummies emits boolean columns
# by default, and build_xy keeps only columns that select_dtypes regards as
# np.number. Bool is not one of them, so every LC_* dummy would be discarded
# without warning before training.
if 'LandCover' in df.columns:
    df = pd.get_dummies(
        df,
        columns=['LandCover'],
        prefix='LC',
        drop_first=True,
        dummy_na=False,
        dtype=float,
    )

# ----------------- Columns never used as predictors -----------------
EXCLUDE_PRED_COLS = {
    # names as they appear after the rename step
    'id',
    'project.name',
    'project',
    'lat',
    'lon',
    'burn_year',
    'date',
    # alternative spellings of the same columns
    'ID',
    'Id',
    'project_name',
    'latitude',
    'longitude',
    'fireYr',
    'Date',
    # remaining excluded column
    'landcover_name',
}

# ----------------- TARGET PICKER -----------------
def pick_col(candidates):
    """Return the first entry of `candidates` that is a column of the
    module-level `df`, or None when none of them is present."""
    return next((name for name in candidates if name in df.columns), None)

# Resolve each target to the column name actually present (None if absent).
COL_ABOVE = pick_col(('combusted_above', 'above.carbon.combusted'))
COL_BELOW = pick_col(('combusted_below',))
COL_DEPTH = pick_col(('burn_depth',))

# >>> Only burn_depth is trained for now <<<
# Each entry is (target column, units label).
targets = [(COL_DEPTH, "units")] if COL_DEPTH else []
if len(targets) == 0:
    raise ValueError("None of the expected target columns were found in the dataset.")

# IMPORTANT: every target column that exists, trained or not, so that all of
# them can be removed from the predictor matrix.
ALL_TARGET_COLS = list(filter(None, (COL_ABOVE, COL_BELOW, COL_DEPTH)))

# ============================================================
# Helper: build X, y, ids (keep ID for plotting LCC-only later)
# ============================================================
def build_xy(df_in: pd.DataFrame, target_col: str):
    """Split a prepared frame into predictors, target and row identifiers.

    Parameters
    ----------
    df_in : pd.DataFrame
        Frame holding the target, the predictors and (optionally) an id column
        named 'id', 'ID' or 'Id'.
    target_col : str
        Name of the target column. Rows where it is missing are dropped.

    Returns
    -------
    (X, y, ids)
        X   : numeric predictor columns only; excludes every column listed in
              EXCLUDE_PRED_COLS, every column in ALL_TARGET_COLS, the target
              and the id column.
        y   : the target cast to float.
        ids : the id column, or a 0..n-1 counter named 'id' when the frame
              has no id column.
        All three share the same (un-reset) index.

    Raises
    ------
    ValueError
        If no numeric predictor column is left.
    """
    # First id-like column present, if any; it is kept long enough to copy it.
    id_col = next((c for c in ('id', 'ID', 'Id') if c in df_in.columns), None)

    # Drop everything in EXCLUDE_PRED_COLS EXCEPT the id column we still need.
    drop_cols = [c for c in EXCLUDE_PRED_COLS if (c in df_in.columns and c != id_col)]
    work = df_in.drop(columns=drop_cols, errors='ignore').copy()
    work = work.dropna(subset=[target_col])

    y = work[target_col].astype(float).copy()

    # Capture the ids before the id column is removed from the predictors.
    if id_col is not None:
        ids = work[id_col].copy()
    else:
        ids = pd.Series(range(len(work)), index=work.index, name='id')

    # Remove the target, every other known target column (leakage) and the id
    # column. Previously the id column only disappeared if its dtype happened
    # to be non-numeric; a numeric id would have been used as a predictor.
    not_predictors = set(ALL_TARGET_COLS) | {target_col}
    if id_col is not None:
        not_predictors.add(id_col)
    X = work.drop(columns=list(not_predictors), errors='ignore')

    # Guard against accidental leakage
    assert target_col not in X.columns, f"Target leakage: {target_col} present in predictors!"

    # Drop non-numeric predictors
    non_numeric = X.select_dtypes(exclude=[np.number]).columns.tolist()
    if non_numeric:
        X = X.drop(columns=non_numeric)

    # Sanity: no empty feature set
    if X.shape[1] == 0:
        raise ValueError(f"No numeric predictors left after preprocessing for target '{target_col}'.")

    return X, y, ids

# ============================================================
# RandomizedSearch + LOOCV with per-fold R²
# ============================================================
def run_target_randomsearch_loocv(target_col: str, units_label: str = "units"):
    """Tune, cross-validate and persist a random-forest model for one target.

    Steps, in order:
      1. Build X / y / ids from the module-level ``df`` with ``build_xy``.
      2. Run one RandomizedSearchCV over RF_PARAM_DIST on ALL rows
         (N_ITER_SEARCH candidates, K-fold CV) and save its cv_results_.
      3. Run leave-one-out CV with the best hyper-parameters held fixed,
         recording the prediction, the training-set mean of y and a
         per-sample score 1 - (y - yhat)^2 / (y - train_mean)^2 (clamped at 0,
         NaN when the denominator is 0).
      4. Write predictions, summary metrics and plots (observed vs predicted
         for all samples and for ids starting with "LCC_", plus a violin plot
         of the per-sample scores).
      5. Refit on all rows, dump the model with joblib, and write feature
         importances (CSV + plot) and a JSON metadata file.

    Every file is written under OUT_DIR (models under MODEL_DIR) and carries
    SUFFIX in its name.

    Note: the hyper-parameters are chosen using all rows before the LOOCV
    starts, so the LOOCV figures are not a fully nested estimate.

    Parameters
    ----------
    target_col : str
        Target column to model.
    units_label : str
        Label printed after the random-search RMSE.

    Returns
    -------
    None. Prints an error and returns early when there are fewer than three
    samples or no predictors.

    Raises
    ------
    RuntimeError
        If the tuned model lists the target among its input features.
    """
    X, y, ids = build_xy(df, target_col)
    if X.shape[1] == 0 or len(y) < 3:
        print(f"[ERROR] Not enough predictors or samples for '{target_col}'.")
        return

    print(f"\nTarget: {target_col} | X: {X.shape} | y: {y.shape}")
    # Reset indices so X, y and ids can all be addressed by position
    y = y.reset_index(drop=True)
    X = X.reset_index(drop=True)
    ids = ids.reset_index(drop=True)
    out_prefix = target_col.replace('.', '_')

    # 1. RandomizedSearchCV (global)
    print(f"\n[{target_col}] Starting global RandomizedSearchCV with {N_ITER_SEARCH} iterations...")
    kfold = KFold(n_splits=min(INNER_FOLDS, len(y)), shuffle=True, random_state=RANDOM_STATE)
    base = RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=N_JOBS)
    tuner = RandomizedSearchCV(
        estimator=base,
        param_distributions=RF_PARAM_DIST,
        n_iter=N_ITER_SEARCH,
        scoring=SCORER,
        cv=kfold,
        random_state=RANDOM_STATE,
        n_jobs=N_JOBS,
        verbose=1,
        refit=True,
    )
    tuner.fit(X, y)

    # Leakage sanity check: the tuned model must NOT expect the target as a feature
    tuned_features = list(getattr(tuner.best_estimator_, "feature_names_in_", []))
    if target_col in tuned_features:
        raise RuntimeError(
            f"Leakage detected: tuned model expects target '{target_col}' as a feature. "
            f"Check preprocessing."
        )

    best_params = tuner.best_params_
    # SCORER is negated MSE, so flip the sign before taking the root.
    best_neg_mse = float(tuner.best_score_)
    best_rmse = float(np.sqrt(-best_neg_mse))
    print(f"[{target_col}] Best params: {best_params}")
    print(f"[{target_col}] RandomizedSearch best CV RMSE: {best_rmse:.4f} {units_label}")
    rs_results = pd.DataFrame(tuner.cv_results_)
    rs_results.to_csv(
        os.path.join(OUT_DIR, f"{out_prefix}_random_search_results{SUFFIX}.csv"),
        index=False
    )

    # 2. LOOCV using best params
    print(f"\n[{target_col}] Starting LOOCV with fixed best hyperparameters...")
    loo = LeaveOneOut()
    n = len(y)
    y_pred = np.zeros(n, dtype=float)
    r2_folds = np.full(n, np.nan, dtype=float)  # one score per left-out observation
    train_means = np.full(n, np.nan, dtype=float)

    for i, (train_idx, test_idx) in enumerate(loo.split(X), start=1):
        Xtr, Xte = X.iloc[train_idx], X.iloc[test_idx]
        ytr = y.iloc[train_idx]
        yte = y.iloc[test_idx].values[0]

        model = RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=N_JOBS, **best_params)
        model.fit(Xtr, ytr)
        yhat = model.predict(Xte)[0]

        # store prediction
        test_i = test_idx[0]
        y_pred[test_i] = yhat

        # per-fold score for this left-out sample, relative to predicting
        # the training-set mean
        mu_train = float(ytr.mean())
        train_means[test_i] = mu_train
        denom = (yte - mu_train)**2
        num = (yte - yhat)**2

        # if denom == 0 the score is undefined -> keep NaN
        if denom != 0.0:
            r2_val = 1.0 - (num / denom)
            # clamp negative values to 0
            if r2_val < 0.0:
                r2_val = 0.0
            r2_folds[test_i] = r2_val

        if i % 25 == 0 or i == n:
            print(f"  LOOCV progress: {i}/{n}")

    # Global LOOCV metrics over all predictions.
    # RMSE is computed as sqrt(MSE): the `squared=False` argument of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse_global = float(np.sqrt(mean_squared_error(y, y_pred)))
    r2_raw_global = r2_score(y, y_pred)
    # clamp negative global R² to 0 as well
    r2_global = max(0.0, r2_raw_global)
    print(f"[{target_col}] LOOCV RMSE (global): {rmse_global:.4f} | "
          f"R² (global, clamped): {r2_global:.4f} (raw: {r2_raw_global:.4f})")

    # Save per-sample predictions + per-fold R² + id
    preds_df = pd.DataFrame({
        "index": np.arange(n),
        "id": ids.astype(str),
        "y_obs": y.values,
        "y_pred": y_pred,
        "train_mean_y": train_means,
        "r2_loocv_fold": r2_folds
    })

    # CSV for violin plot
    violin_csv_path = os.path.join(OUT_DIR, f"{out_prefix}_violin{SUFFIX}.csv")
    preds_df.to_csv(violin_csv_path, index=False)
    print(f"[{target_col}] Saved per-fold R² CSV: {violin_csv_path}")

    # Also keep the LOOCV predictions CSV (same table, second file name)
    preds_path = os.path.join(OUT_DIR, f"{out_prefix}_loocv_predictions{SUFFIX}.csv")
    preds_df.to_csv(preds_path, index=False)
    print(f"[{target_col}] Saved LOOCV predictions (with fold R²) to: {preds_path}")

    # Save summary metrics
    pd.DataFrame({
        "target": [target_col],
        "n": [n],
        "n_predictors": [X.shape[1]],
        "loocv_rmse_global": [rmse_global],
        "loocv_r2_global_clamped": [r2_global],
        "loocv_r2_global_raw": [r2_raw_global],
        "random_search_best_rmse": [best_rmse]
    }).to_csv(
        os.path.join(OUT_DIR, f"{out_prefix}_loocv_metrics{SUFFIX}.csv"),
        index=False
    )

    # ----------------- GLOBAL OBS vs PRED PLOT (ALL SAMPLES) -----------------
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=preds_df["y_obs"], y=preds_df["y_pred"], s=18, edgecolor=None)
    # Common axis range, used to draw the 1:1 line
    lo = float(np.nanmin([preds_df["y_obs"].min(), preds_df["y_pred"].min()]))
    hi = float(np.nanmax([preds_df["y_obs"].max(), preds_df["y_pred"].max()]))
    plt.plot([lo, hi], [lo, hi], 'k--', lw=2)
    plt.xlabel(f"Observed {target_col}")
    plt.ylabel(f"Predicted {target_col}")
    plt.title(f"{target_col}: LOOCV Obs vs Pred (All Samples)\n"
              f"RMSE={rmse_global:.3f}, R²={r2_global:.3f} (clamped)")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, f"{out_prefix}_loocv_obs_pred{SUFFIX}.png"), dpi=150)
    plt.close()

    # ----------------- LCC-ONLY OBS vs PRED PLOT -----------------
    # Filter to IDs starting with "LCC_"
    lcc_mask = preds_df["id"].astype(str).str.startswith("LCC_")
    preds_lcc = preds_df[lcc_mask].copy()

    if len(preds_lcc) >= 3:
        rmse_lcc = float(np.sqrt(mean_squared_error(preds_lcc["y_obs"], preds_lcc["y_pred"])))
        r2_raw_lcc = r2_score(preds_lcc["y_obs"], preds_lcc["y_pred"])
        r2_lcc = max(0.0, r2_raw_lcc)

        plt.figure(figsize=(8, 6))
        sns.scatterplot(x=preds_lcc["y_obs"], y=preds_lcc["y_pred"], s=18, edgecolor=None)
        lo_lcc = float(np.nanmin([preds_lcc["y_obs"].min(), preds_lcc["y_pred"].min()]))
        hi_lcc = float(np.nanmax([preds_lcc["y_obs"].max(), preds_lcc["y_pred"].max()]))

        plt.plot([lo_lcc, hi_lcc], [lo_lcc, hi_lcc], 'k--', lw=2)
        plt.xlabel(f"Observed {target_col} (LCC only)")
        plt.ylabel(f"Predicted {target_col} (LCC only)")
        plt.title(f"{target_col}: LOOCV Obs vs Pred (LCC Samples)\n"
                  f"n={len(preds_lcc)}")

        # Put RMSE and R² in lower-right corner
        dx = hi_lcc - lo_lcc
        text_x = lo_lcc + 0.98 * dx
        text_y = lo_lcc + 0.02 * dx
        plt.text(
            text_x, text_y,
            f"RMSE={rmse_lcc:.3f}\nR²={r2_lcc:.3f}",
            ha='right', va='bottom',
            fontsize=10,
            bbox=dict(boxstyle='round', facecolor='white', alpha=0.8)
        )

        plt.grid(True)
        plt.tight_layout()
        lcc_png_path = os.path.join(OUT_DIR, f"{out_prefix}_loocv_obs_pred_LCC_only{SUFFIX}.png")
        plt.savefig(lcc_png_path, dpi=150)
        plt.close()
        print(f"[{target_col}] Saved LCC-only Obs vs Pred plot: {lcc_png_path}")
    else:
        print(f"[{target_col}] Not enough LCC samples (found {len(preds_lcc)}) for LCC-only plot.")

    # ----------------- VIOLIN PLOT OF PER-FOLD R² -----------------
    valid_r2 = preds_df["r2_loocv_fold"].dropna()
    if len(valid_r2) > 0:
        plt.figure(figsize=(6, 6))
        sns.violinplot(y=valid_r2, cut=0)
        plt.ylabel("Per-fold LOOCV R²")
        plt.title(f"Distribution of LOOCV R² per left-out sample\nTarget: {target_col}")
        plt.grid(axis='y', alpha=0.3)
        plt.tight_layout()
        violin_png_path = os.path.join(OUT_DIR, f"{out_prefix}_violin{SUFFIX}.png")
        plt.savefig(violin_png_path, dpi=150)
        plt.close()
        print(f"[{target_col}] Saved violin plot of per-fold R²: {violin_png_path}")
    else:
        print(f"[{target_col}] No valid per-fold R² values (all denominators zero). Skipping violin plot.")

    # ----------------- FINAL MODEL + FEATURE IMPORTANCE -----------------
    # Refit on every row with the selected hyper-parameters and persist it.
    final_model = RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=N_JOBS, **best_params)
    final_model.fit(X, y)
    model_path = os.path.join(MODEL_DIR, f"rf_final_{out_prefix}{SUFFIX}.joblib")
    joblib.dump(final_model, model_path)

    feature_names = list(getattr(final_model, "feature_names_in_", X.columns))

    # Feature importance CSV + plot
    importances = final_model.feature_importances_
    fi_df = pd.DataFrame({
        "feature": feature_names,
        "importance": importances
    }).sort_values("importance", ascending=False)

    fi_csv_path = os.path.join(OUT_DIR, f"{out_prefix}_feature_importance{SUFFIX}.csv")
    fi_df.to_csv(fi_csv_path, index=False)
    print(f"[{target_col}] Saved feature importance CSV: {fi_csv_path}")

    top_n = min(30, len(fi_df))
    fi_top = fi_df.head(top_n)

    plt.figure(figsize=(10, max(6, 0.3 * top_n)))
    sns.barplot(data=fi_top, x="importance", y="feature", orient="h")
    plt.xlabel("Random Forest feature importance")
    plt.ylabel("Feature")
    plt.title(f"Feature importance (top {top_n})\nTarget: {target_col}")
    plt.tight_layout()
    fi_png_path = os.path.join(OUT_DIR, f"{out_prefix}_feature_importance{SUFFIX}.png")
    plt.savefig(fi_png_path, dpi=150)
    plt.close()
    print(f"[{target_col}] Saved feature importance plot: {fi_png_path}")

    # Metadata JSON
    meta = {
        "target": target_col,
        "loocv_rmse_global": rmse_global,
        "loocv_r2_global_clamped": r2_global,
        "loocv_r2_global_raw": r2_raw_global,
        "best_params": best_params,
        "model_path": model_path,
        "n_samples": n,
        "feature_names": feature_names
    }
    with open(os.path.join(OUT_DIR, f"{out_prefix}_final_model_metadata{SUFFIX}.json"), "w") as f:
        json.dump(meta, f, indent=2)

    # Extra sanity message
    print(f"[{target_col}] Final model trained with {len(meta['feature_names'])} features; "
          f"'{target_col}' in features? {target_col in meta['feature_names']}")

# ----------------- Train / evaluate every configured target -----------------
# `targets` holds (column name, units label) pairs.
for tcol, units in targets:
    run_target_randomsearch_loocv(target_col=tcol, units_label=units)

print("\nDone.")



Original df1 shape: (1174, 60)
New combined dataframe shape (after join/concat/dropna): (1225, 61)

Target: burn_depth | X: (1225, 50) | y: (1225,)

[burn_depth] Starting global RandomizedSearchCV with 40 iterations...
Fitting 5 folds for each of 40 candidates, totalling 200 fits
[burn_depth] Best params: {'bootstrap': True, 'max_depth': 14, 'max_features': 0.40142583666029136, 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 312}
[burn_depth] RandomizedSearch best CV RMSE: 4.2950 units

[burn_depth] Starting LOOCV with fixed best hyperparameters...
  LOOCV progress: 25/1225
  LOOCV progress: 50/1225
  LOOCV progress: 75/1225
  LOOCV progress: 100/1225
  LOOCV progress: 125/1225
  LOOCV progress: 150/1225
  LOOCV progress: 175/1225
  LOOCV progress: 200/1225
  LOOCV progress: 225/1225
  LOOCV progress: 250/1225
  LOOCV progress: 275/1225
  LOOCV progress: 300/1225
  LOOCV progress: 325/1225
  LOOCV progress: 350/1225
  LOOCV progress: 375/1225
  LOOCV progress: 400/1225
 

Predict model

In [16]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import json
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import joblib
from typing import List, Tuple

# ----------------- Locations -----------------
ROOT_DIR   = "/explore/nobackup/people/spotter5/new_combustion"
OUT_DIR    = os.path.join(ROOT_DIR, "LCC")
MODEL_DIR  = os.path.join(OUT_DIR, "models")
INPUT_CSV  = os.path.join(ROOT_DIR, "2025-10-28_LCC_CombustionModelPredictors.csv")
os.makedirs(OUT_DIR, exist_ok=True)

# ----------------- Model file naming (must match the training cell) -----------------
# Models are stored as rf_final_<target><SUFFIX>.joblib,
# e.g. rf_final_burn_depth_LCC_11_18.joblib
MODEL_PREFIX  = "rf_final_"
SUFFIX        = "_LCC_11_18"
MODEL_SUFFIX  = SUFFIX + ".joblib"

# ----------------- Columns never fed to a model -----------------
EXCLUDE_PRED_COLS = {
    # names as they appear after the rename step
    'id',
    'project.name',
    'project',
    'lat',
    'lon',
    'burn_year',
    'date',
    # alternative spellings of the same columns
    'ID',
    'Id',
    'project_name',
    'latitude',
    'longitude',
    'fireYr',
    'Date',
    # remaining excluded column
    'landcover_name',
}

# ----------------- PREPROCESS (mirror training) -----------------
def load_and_clean_dataframe(path: str) -> pd.DataFrame:
    """Read a predictor CSV and apply the training-time column clean-up.

    The clean-up strips whitespace from column names, maps alternative
    spellings to the canonical names ('ID'/'Id' -> 'id', 'project_name' ->
    'project.name', 'Date' -> 'date', 'latitude' -> 'lat', 'longitude' ->
    'lon', 'fireYr' -> 'burn_year') and one-hot encodes 'LandCover' into
    float LC_* columns.

    Parameters
    ----------
    path : str
        CSV file to read.

    Returns
    -------
    pd.DataFrame
        The cleaned frame. It contains one LC_* column for EVERY land-cover
        category present in the file; the caller is expected to select the
        columns the model was trained on.
    """
    print(f"Reading features: {path}")
    df = pd.read_csv(path)
    df.columns = [c.strip() for c in df.columns]

    # Standardize column names like in training
    rename_map = {}
    if 'ID' in df.columns: rename_map['ID'] = 'id'
    if 'Id' in df.columns: rename_map['Id'] = 'id'
    if 'project_name' in df.columns and 'project.name' not in df.columns:
        rename_map['project_name'] = 'project.name'
    if 'Date' in df.columns and 'date' not in df.columns:
        rename_map['Date'] = 'date'
    if 'latitude' in df.columns and 'lat' not in df.columns:
        rename_map['latitude'] = 'lat'
    if 'longitude' in df.columns and 'lon' not in df.columns:
        rename_map['longitude'] = 'lon'
    if 'fireYr' in df.columns and 'burn_year' not in df.columns:
        rename_map['fireYr'] = 'burn_year'
    df = df.rename(columns=rename_map)

    # One-hot LandCover.
    # drop_first=False: with drop_first=True pandas drops whichever category
    #   sorts first IN THIS FILE, which need not be the baseline dropped during
    #   training. The wrongly dropped dummy would then be re-created as all
    #   zeros during feature alignment. Emitting every category and letting the
    #   alignment step keep only the model's columns avoids that.
    # dtype=float: pandas >= 2.0 defaults to bool dummies, which the
    #   numeric-only filter applied later treats as non-numeric and discards.
    if 'LandCover' in df.columns:
        df = pd.get_dummies(df, columns=['LandCover'],
                            prefix='LC', drop_first=False, dummy_na=False,
                            dtype=float)

    return df

def build_predict_matrix(df_in: pd.DataFrame,
                         model_feature_names: List[str],
                         target_cols_to_drop: List[str]) -> pd.DataFrame:
    """Build the feature matrix for one model from a cleaned frame.

    Parameters
    ----------
    df_in : pd.DataFrame
        Cleaned input frame.
    model_feature_names : List[str]
        Feature names the model expects, in the order it expects them.
    target_cols_to_drop : List[str]
        Target columns to remove if they are present.

    Returns
    -------
    pd.DataFrame
        One column per entry of `model_feature_names`, in that order. Features
        absent from the input are filled with 0.0 and reported on stdout.
    """
    # Drop non-predictor columns
    drop_cols = [c for c in EXCLUDE_PRED_COLS if c in df_in.columns]
    X = df_in.drop(columns=drop_cols, errors='ignore').copy()

    # Drop any known target cols (if they happen to be present)
    X = X.drop(columns=[c for c in target_cols_to_drop if c in X.columns], errors='ignore')

    # Boolean columns (e.g. one-hot dummies produced by pandas >= 2.0) are not
    # np.number for select_dtypes; cast them so they are not thrown away below
    # and then silently re-created as zeros.
    bool_cols = X.select_dtypes(include=['bool']).columns.tolist()
    if bool_cols:
        X = X.astype({c: float for c in bool_cols})

    # Keep only numeric columns
    non_numeric = X.select_dtypes(exclude=[np.number]).columns.tolist()
    if non_numeric:
        X = X.drop(columns=non_numeric)

    # Align to model features:
    # 1) report any feature the model needs but the input lacks. Zero-filling
    #    is kept as a best effort, but it is no longer silent.
    missing = [c for c in model_feature_names if c not in X.columns]
    if missing:
        print(f"[WARN] {len(missing)} model feature(s) missing from input; "
              f"filling with 0.0: {missing}")

    # 2) add the missing ones as 0.0, drop any extras and order exactly like model
    X = X.reindex(columns=list(model_feature_names), fill_value=0.0)

    return X

def find_models(model_dir: str) -> List[Tuple[str, str]]:
    """
    List the saved models in `model_dir` whose file name starts with
    MODEL_PREFIX and ends with MODEL_SUFFIX.

    Returns a sorted list of (target_name, full_path). The target name is the
    file name without the prefix and without ".joblib", so it still carries
    the suffix: rf_final_burn_depth_LCC_11_18.joblib -> "burn_depth_LCC_11_18".
    A missing directory yields a warning and an empty list.
    """
    if not os.path.isdir(model_dir):
        print(f"[WARN] Model directory not found: {model_dir}")
        return []

    ext_len = len(".joblib")
    found = [
        (name[len(MODEL_PREFIX):-ext_len], os.path.join(model_dir, name))
        for name in os.listdir(model_dir)
        if name.startswith(MODEL_PREFIX) and name.endswith(MODEL_SUFFIX)
    ]
    return sorted(found)

# ----------------- MAIN -----------------
def main():
    """Apply every matching saved model to INPUT_CSV and write one CSV of
    predictions (identifier columns plus one pred_<target> column per model)
    into OUT_DIR.

    Raises FileNotFoundError when no model file matches, and RuntimeError when
    a loaded model does not expose feature_names_in_.
    """
    df = load_and_clean_dataframe(INPUT_CSV)

    # Target columns used in training; any that are present in the input are
    # removed from the predictors.
    candidate_targets = ['combusted_above', 'above.carbon.combusted',
                         'combusted_below', 'burn_depth']
    present_targets = [name for name in candidate_targets if name in df.columns]

    models = find_models(MODEL_DIR)
    if len(models) == 0:
        raise FileNotFoundError(
            f"No models found in {MODEL_DIR} matching {MODEL_PREFIX}*{MODEL_SUFFIX}"
        )

    model_names = [name for name, _ in models]
    print(f"Found {len(models)} model(s): {model_names}")

    # output column name -> prediction vector
    preds = {}

    for target_name, model_path in models:
        print(f"\nLoading model: {model_path}")
        model = joblib.load(model_path)

        # The model's own feature list drives the column alignment.
        if not hasattr(model, "feature_names_in_"):
            raise RuntimeError(
                f"Model {model_path} is missing feature_names_in_. "
                f"Retrain with scikit-learn >=1.0 so this attribute is saved."
            )
        feature_names = list(model.feature_names_in_)

        X = build_predict_matrix(df, feature_names, present_targets)

        print(f"  Applying model '{target_name}' to {len(X)} rows, {X.shape[1]} features")
        preds[f"pred_{target_name}"] = model.predict(X)

    # Start the output table from whichever identifier columns exist.
    keep_cols = [
        name
        for name in ("id", "project.name", "lat", "lon", "burn_year", "date")
        if name in df.columns
    ]
    if keep_cols:
        out_df = df[keep_cols].copy()
    else:
        out_df = pd.DataFrame(index=df.index)

    for col, arr in preds.items():
        out_df[col] = arr

    # Output file: <input file stem>_predictions<SUFFIX>.csv in OUT_DIR
    base, _ext = os.path.splitext(os.path.basename(INPUT_CSV))
    out_csv = os.path.join(OUT_DIR, f"{base}_predictions{SUFFIX}.csv")
    out_df.to_csv(out_csv, index=False)
    print(f"\nWrote predictions → {out_csv}")

# Entry point: run main() only when this code executes as the top-level
# module (__name__ == "__main__"), not when it is imported.
if __name__ == "__main__":
    main()


Reading features: /explore/nobackup/people/spotter5/new_combustion/2025-10-28_LCC_CombustionModelPredictors.csv
Found 1 model(s): ['burn_depth_LCC_11_18']

Loading model: /explore/nobackup/people/spotter5/new_combustion/LCC/models/rf_final_burn_depth_LCC_11_18.joblib
  Applying model 'burn_depth_LCC_11_18' to 98 rows, 50 features

Wrote predictions → /explore/nobackup/people/spotter5/new_combustion/LCC/2025-10-28_LCC_CombustionModelPredictors_predictions_LCC_11_18.csv
