RFE and importance

In [10]:
import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, make_scorer

# ----------------- PATHS -----------------
# Single input table; every artefact produced by this cell lands in OUT_DIR.
INPUT_CSV = "/explore/nobackup/people/spotter5/new_combustion/2025-10-03_CombustionModelPredictors.csv"
OUT_DIR = "/explore/nobackup/people/spotter5/new_combustion/LCC"
os.makedirs(OUT_DIR, exist_ok=True)

print(f"Reading: {INPUT_CSV}")
df = pd.read_csv(INPUT_CSV)

# ----------------- BASIC CLEANUP -----------------
# Strip stray whitespace from headers so the name checks below are reliable.
df.columns = [header.strip() for header in df.columns]

# Fold known spelling variants onto one canonical name to simplify exclusion checks.
rename_map = {}
for _variant in ('ID', 'Id'):
    # Identifier variants are always mapped to 'id'.
    if _variant in df.columns:
        rename_map[_variant] = 'id'
for _variant, _canonical in (
    ('project_name', 'project.name'),
    ('Date', 'date'),
    ('latitude', 'lat'),
    ('longitude', 'lon'),
    ('fireYr', 'burn_year'),
):
    # Only rename when the canonical column does not already exist.
    if _variant in df.columns and _canonical not in df.columns:
        rename_map[_variant] = _canonical
df = df.rename(columns=rename_map)

# Quick schema snapshot (optional): one row per column with dtype, null count
# and number of distinct non-null values.
schema = pd.DataFrame({
    "column": df.columns,
    "dtype": df.dtypes.astype(str),
    "n_null": df.isna().sum(),
    "n_unique": [df[name].nunique(dropna=True) for name in df.columns],
})
schema.to_csv(os.path.join(OUT_DIR, "schema_summary.csv"), index=False)

# ----------------- CATEGORICAL: LandCover -> one-hot -----------------
# NOTE(review): pandas >= 2.0 returns bool dummy columns by default, and the
# `select_dtypes(exclude=[np.number])` filter in run_target would then discard
# them — confirm the LC_* columns actually reach the model in this environment.
if 'LandCover' in df.columns:
    df = pd.get_dummies(
        df,
        columns=['LandCover'],
        prefix='LC',
        drop_first=True,
        dummy_na=False,
    )

# ----------------- EXCLUDED PREDICTOR COLUMNS -----------------
# Identifier / location / date columns that must never be used as features.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated by Python ('project' 'ID' -> 'projectID').
EXCLUDE_PRED_COLS = {
    'id', 'project.name', 'lat', 'lon', 'burn_year', 'date', 'project',
    # allow for variants if they slipped through
    'ID', 'Id', 'project_name', 'latitude', 'longitude', 'fireYr', 'Date', 'landcover_name',
}


# ----------------- TARGET PICKER (handles abov/above spelling) -----------------
def pick_col(candidates):
    """Return the first name in ``candidates`` that is a column of ``df``, else ``None``."""
    return next((name for name in candidates if name in df.columns), None)

COL_ABOVE = pick_col(['combusted_above', 'above.carbon.combusted'])
COL_BELOW = pick_col(['combusted_below'])
COL_DEPTH = pick_col(['burn_depth'])

# One (column, units label) pair per target column that exists in the data.
targets = [(c, "units") for c in [COL_ABOVE, COL_BELOW, COL_DEPTH] if c]
if not targets:
    raise ValueError("None of the expected target columns were found in the dataset.")

# IMPORTANT: for every model, drop ALL dependent variables from X
ALL_TARGET_COLS = [c for c in [COL_ABOVE, COL_BELOW, COL_DEPTH] if c]

# ----------------- MODEL SETUP -----------------
def _rmse(y_true, y_pred):
    """Root-mean-squared error computed without the ``squared=`` keyword."""
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

# `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
# and removed in 1.6; taking the square root explicitly works on every version.
# greater_is_better=False makes the scorer return -RMSE (run_target flips the sign back).
rmse_scorer = make_scorer(_rmse, greater_is_better=False)  # lower is better
cv = KFold(n_splits=5, shuffle=True, random_state=42)

def run_target(target_col: str, units_label: str = "units"):
    """Run RF-based recursive feature elimination for one target and save the artefacts.

    Builds the predictor matrix from the module-level ``df`` (excluded columns,
    all target columns and non-numeric columns removed; rows with a missing
    target dropped), fits ``RFECV`` with a random forest, then refits a forest
    on the selected features to rank them.

    Files written to ``OUT_DIR`` (``<t>`` is ``target_col`` with '.' replaced by '_'):
    ``<t>_rfe.png``, ``<t>_selected_features.csv``, ``<t>_top20_importances.csv``
    and ``<t>_rf_importance.png``.

    Args:
        target_col: Name of the response column in ``df``.
        units_label: Text shown in parentheses on the RMSE axis label.

    Returns:
        None. Prints an error and returns early when no numeric predictor remains.
    """
    file_stem = target_col.replace('.', '_')

    # 1) Modelling frame: remove banned columns, then rows without a target value.
    banned = [name for name in EXCLUDE_PRED_COLS if name in df.columns]
    frame = df.drop(columns=banned, errors='ignore').copy().dropna(subset=[target_col])
    y = frame[target_col].copy()

    # No dependent variable (current target included) may stay among the predictors.
    X = frame.drop(columns=ALL_TARGET_COLS, errors='ignore')

    # The forest needs numeric input, so everything else is discarded.
    non_numeric = X.select_dtypes(exclude=[np.number]).columns.tolist()
    if non_numeric:
        X = X.drop(columns=non_numeric)

    if X.shape[1] == 0:
        print(f"[ERROR] No numeric predictors left for '{target_col}'.")
        return

    print(f"\nTarget: {target_col} | X: {X.shape} | y: {y.shape}")

    # 2) Recursive feature elimination, one feature per step, scored by CV RMSE.
    rfecv = RFECV(
        estimator=RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1),
        step=1,
        cv=cv,
        scoring=rmse_scorer,
        n_jobs=-1,
        min_features_to_select=1,
    )
    rfecv.fit(X, y)

    # The scorer reports negated RMSE; flip the sign for plotting.
    mean_rmse = -np.asarray(rfecv.cv_results_['mean_test_score'])

    # 3) RMSE as a function of the number of retained features.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(np.arange(1, mean_rmse.size + 1), mean_rmse, marker='o')
    ax.set_xlabel("Number of Features Selected")
    ax.set_ylabel(f"Cross-Validated RMSE ({units_label})")
    ax.set_title(f"{target_col}: RMSE vs Number of Features (RF + RFECV)")
    ax.grid(True)
    fig.tight_layout()
    rfe_png = os.path.join(OUT_DIR, f"{file_stem}_rfe.png")
    fig.savefig(rfe_png, dpi=150)
    plt.close(fig)
    print(f"Saved RFE curve → {rfe_png}")

    # 4) Names of the features kept by the selector (boolean support mask).
    selected_features = X.columns[rfecv.support_].tolist()
    pd.Series(selected_features).to_csv(
        os.path.join(OUT_DIR, f"{file_stem}_selected_features.csv"),
        index=False,
        header=False,
    )
    print(f"Optimal #features: {rfecv.n_features_}")
    print(f"First few selected: {selected_features[:10]}")

    # 5) Fresh forest on the selected subset, used only for impurity importances.
    X_sel = X[selected_features].copy()
    full_rf = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1)
    full_rf.fit(X_sel, y)
    importances = pd.Series(
        full_rf.feature_importances_, index=X_sel.columns
    ).sort_values(ascending=False)

    # Keep and persist the 20 largest importances.
    top_k = importances.iloc[:20]
    top_k.to_csv(os.path.join(OUT_DIR, f"{file_stem}_top20_importances.csv"))

    # 6) Horizontal bar chart of those importances.
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.barplot(x=top_k.values, y=top_k.index, ax=ax)
    ax.set_xlabel("Mean Decrease in Impurity (Feature Importance)")
    ax.set_title(f"{target_col}: Top 20 Random Forest Feature Importances")
    fig.tight_layout()
    imp_png = os.path.join(OUT_DIR, f"{file_stem}_rf_importance.png")
    fig.savefig(imp_png, dpi=150)
    plt.close(fig)
    print(f"Saved importances → {imp_png}")

# ----------------- RUN FOR EACH TARGET -----------------
# Each entry of `targets` is a (column name, units label) pair.
for tcol, units in targets:
    run_target(tcol, units)

print("\nDone.")


Reading: /explore/nobackup/people/spotter5/new_combustion/2025-10-03_CombustionModelPredictors.csv

Target: combusted_above | X: (885, 50) | y: (885,)




Saved RFE curve → /explore/nobackup/people/spotter5/new_combustion/LCC/combusted_above_rfe.png
Optimal #features: 43
First few selected: ['BD_30', 'pH_30', 'Sand_30', 'Silt_30', 'Clay_30', 'SOC_30', 'PFI', 'HLI', 'TRASP', 'aspect_rad']
Saved importances → /explore/nobackup/people/spotter5/new_combustion/LCC/combusted_above_rf_importance.png

Target: combusted_below | X: (1167, 50) | y: (1167,)




Saved RFE curve → /explore/nobackup/people/spotter5/new_combustion/LCC/combusted_below_rfe.png
Optimal #features: 45
First few selected: ['BD_30', 'pH_30', 'Sand_30', 'Silt_30', 'Clay_30', 'SOC_30', 'PFI', 'HLI', 'TRASP', 'elevation']
Saved importances → /explore/nobackup/people/spotter5/new_combustion/LCC/combusted_below_rf_importance.png

Target: burn_depth | X: (1174, 50) | y: (1174,)




Saved RFE curve → /explore/nobackup/people/spotter5/new_combustion/LCC/burn_depth_rfe.png
Optimal #features: 49
First few selected: ['BD_30', 'pH_30', 'Sand_30', 'Silt_30', 'Clay_30', 'SOC_30', 'PFI', 'HLI', 'TRASP', 'aspect_rad']
Saved importances → /explore/nobackup/people/spotter5/new_combustion/LCC/burn_depth_rf_importance.png

Done.


LOOCV

In [1]:
import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import LeaveOneOut, cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score

# ----------------- PATHS (uploaded file only) -----------------
# Single input table; every artefact produced by this cell lands in OUT_DIR.
INPUT_CSV = "/explore/nobackup/people/spotter5/new_combustion/2025-10-03_CombustionModelPredictors.csv"
OUT_DIR = "/explore/nobackup/people/spotter5/new_combustion/LCC"
os.makedirs(OUT_DIR, exist_ok=True)

print(f"Reading: {INPUT_CSV}")
df = pd.read_csv(INPUT_CSV)

# ----------------- BASIC CLEANUP -----------------
# Strip stray whitespace from headers so the name checks below are reliable.
df.columns = [header.strip() for header in df.columns]

# Standardize common variants onto one canonical column name.
rename_map = {}
for _variant in ('ID', 'Id'):
    # Identifier variants are always mapped to 'id'.
    if _variant in df.columns:
        rename_map[_variant] = 'id'
for _variant, _canonical in (
    ('project_name', 'project.name'),
    ('Date', 'date'),
    ('latitude', 'lat'),
    ('longitude', 'lon'),
    ('fireYr', 'burn_year'),
):
    # Only rename when the canonical column does not already exist.
    if _variant in df.columns and _canonical not in df.columns:
        rename_map[_variant] = _canonical
df = df.rename(columns=rename_map)

# Schema snapshot (optional): one row per column with dtype, null count
# and number of distinct non-null values.
schema = pd.DataFrame({
    "column": df.columns,
    "dtype": df.dtypes.astype(str),
    "n_null": df.isna().sum(),
    "n_unique": [df[name].nunique(dropna=True) for name in df.columns],
})
schema.to_csv(os.path.join(OUT_DIR, "schema_summary.csv"), index=False)

# ----------------- CATEGORICAL: LandCover -> one-hot -----------------
# NOTE(review): pandas >= 2.0 returns bool dummy columns by default, and the
# `select_dtypes(exclude=[np.number])` filter in run_target_loocv would then
# discard them — confirm the LC_* columns actually reach the model.
if 'LandCover' in df.columns:
    df = pd.get_dummies(
        df,
        columns=['LandCover'],
        prefix='LC',
        drop_first=True,
        dummy_na=False,
    )

# ----------------- EXCLUDED PREDICTOR COLUMNS -----------------
# Identifier / location / date columns that must never be used as features.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated by Python ('project' 'ID' -> 'projectID').
EXCLUDE_PRED_COLS = {
    'id', 'project.name', 'lat', 'lon', 'burn_year', 'date', 'project',
    # allow for variants if they slipped through
    'ID', 'Id', 'project_name', 'latitude', 'longitude', 'fireYr', 'Date', 'landcover_name',
}


# ----------------- TARGET PICKER (robust to abov/above) -----------------
def pick_col(candidates):
    """Return the first name in ``candidates`` that is a column of ``df``, else ``None``."""
    return next((name for name in candidates if name in df.columns), None)

COL_ABOVE = pick_col(['combusted_above', 'above.carbon.combusted'])
COL_BELOW = pick_col(['combusted_below'])
COL_DEPTH = pick_col(['burn_depth'])

# Only the below-ground targets are evaluated in this cell; add COL_ABOVE to
# the list below to run LOOCV for it as well.
targets = [(c, "units") for c in [COL_BELOW, COL_DEPTH] if c]
if not targets:
    raise ValueError("None of the expected target columns were found in the dataset.")

# IMPORTANT: drop ALL dependent variables from X for every model. This list
# must contain every response column present in the data — including ones
# that are not modelled here (COL_ABOVE) — otherwise that column stays in X
# and leaks into the predictors.
ALL_TARGET_COLS = [c for c in [COL_ABOVE, COL_BELOW, COL_DEPTH] if c]

# ----------------- LOOCV ONLY -----------------
def run_target_loocv(target_col: str, units_label: str = "units"):
    """Evaluate a random forest for one target with leave-one-out cross-validation.

    Builds the predictor matrix from the module-level ``df`` (excluded columns,
    every column in ``ALL_TARGET_COLS`` and non-numeric columns removed; rows
    with a missing target dropped), collects one out-of-sample prediction per
    row, and writes ``<t>_loocv_metrics.csv`` and ``<t>_loocv_obs_pred.png``
    to ``OUT_DIR`` (``<t>`` is ``target_col`` with '.' replaced by '_').

    Args:
        target_col: Name of the response column in ``df``.
        units_label: Text printed after the RMSE value.

    Returns:
        None. Prints an error and returns early when there are no numeric
        predictors or fewer than two samples.
    """
    file_stem = target_col.replace('.', '_')

    # Build modeling frame (drop excluded predictors)
    drop_cols = [c for c in EXCLUDE_PRED_COLS if c in df.columns]
    work = df.drop(columns=drop_cols, errors='ignore').copy()
    work = work.dropna(subset=[target_col])

    y = work[target_col].copy()
    # Remove ALL dependent variables (including the current target) from predictors
    X = work.drop(columns=ALL_TARGET_COLS, errors='ignore')

    # Numeric predictors only
    non_numeric = X.select_dtypes(exclude=[np.number]).columns.tolist()
    if non_numeric:
        X = X.drop(columns=non_numeric)

    if X.shape[1] == 0 or len(y) < 2:
        print(f"[ERROR] Not enough numeric predictors or samples for '{target_col}'.")
        return

    print(f"\nTarget: {target_col} | X: {X.shape} | y: {y.shape}")

    # LOOCV predictions. The len(y) fits are already spread over all cores by
    # cross_val_predict(n_jobs=-1); keeping the forest itself single-threaded
    # avoids nesting a second all-core pool inside every worker. With a fixed
    # random_state the fitted forest does not depend on n_jobs.
    loo = LeaveOneOut()
    model = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=1)
    y_pred = cross_val_predict(model, X, y, cv=loo, n_jobs=-1, method='predict')

    # Metrics. RMSE is taken as sqrt(MSE) explicitly: the `squared=False`
    # keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = float(np.sqrt(mean_squared_error(y, y_pred)))
    r2   = r2_score(y, y_pred)
    print(f"[{target_col}] LOOCV RMSE: {rmse:.4f} {units_label} | R²: {r2:.4f}")

    # Save metrics to CSV (single-row table)
    metrics_path = os.path.join(OUT_DIR, f"{file_stem}_loocv_metrics.csv")
    pd.DataFrame({
        "target": [target_col],
        "n": [len(y)],
        "n_predictors": [X.shape[1]],
        "loocv_rmse": [rmse],
        "loocv_r2": [r2]
    }).to_csv(metrics_path, index=False)

    # Observed vs Predicted plot with a 1:1 reference line spanning both ranges
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=y, y=y_pred, s=18, edgecolor=None)
    lo = min(np.min(y), np.min(y_pred))
    hi = max(np.max(y), np.max(y_pred))
    plt.plot([lo, hi], [lo, hi], 'k--', lw=2, label='1:1 Line')
    plt.xlabel(f"Observed {target_col}")
    plt.ylabel(f"Predicted {target_col}")
    plt.title(f"{target_col}: LOOCV Obs vs Pred (RF)\nRMSE={rmse:.3f} {units_label}, R²={r2:.3f}")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plot_path = os.path.join(OUT_DIR, f"{file_stem}_loocv_obs_pred.png")
    plt.savefig(plot_path, dpi=150)
    plt.close()

    print(f"Saved metrics → {metrics_path}")
    print(f"Saved plot    → {plot_path}")

# ----------------- RUN FOR EACH TARGET -----------------
# Each entry of `targets` is a (column name, units label) pair.
for tcol, units in targets:
    run_target_loocv(target_col=tcol, units_label=units)

print("\nDone.")


Reading: /explore/nobackup/people/spotter5/new_combustion/2025-10-03_CombustionModelPredictors.csv

Target: combusted_below | X: (1167, 50) | y: (1167,)
[combusted_below] LOOCV RMSE: 1529.9760 units | R²: 0.2784
Saved metrics → /explore/nobackup/people/spotter5/new_combustion/LCC/combusted_below_loocv_metrics.csv
Saved plot    → /explore/nobackup/people/spotter5/new_combustion/LCC/combusted_below_loocv_obs_pred.png

Target: burn_depth | X: (1174, 50) | y: (1174,)


Process LokyProcess-16:
Process LokyProcess-17:
Traceback (most recent call last):
  File "/home/spotter5/.conda/envs/xgboost_gpu/lib/python3.9/site-packages/joblib/externals/loky/process_executor.py", line 463, in _process_worker
    r = call_item()
  File "/home/spotter5/.conda/envs/xgboost_gpu/lib/python3.9/site-packages/joblib/externals/loky/process_executor.py", line 291, in __call__
    return self.fn(*self.args, **self.kwargs)
  File "/home/spotter5/.conda/envs/xgboost_gpu/lib/python3.9/site-packages/joblib/parallel.py", line 598, in __call__
    return [func(*args, **kwargs)
  File "/home/spotter5/.conda/envs/xgboost_gpu/lib/python3.9/site-packages/joblib/parallel.py", line 598, in <listcomp>
    return [func(*args, **kwargs)
  File "/home/spotter5/.conda/envs/xgboost_gpu/lib/python3.9/site-packages/sklearn/utils/parallel.py", line 136, in __call__
    return self.function(*args, **kwargs)
  File "/home/spotter5/.conda/envs/xgboost_gpu/lib/python3.9/site-packages/sklearn/model_

Traceback (most recent call last):
  File "/home/spotter5/.conda/envs/xgboost_gpu/lib/python3.9/site-packages/joblib/externals/loky/process_executor.py", line 426, in _process_worker
    call_item = call_queue.get(block=True, timeout=timeout)
  File "/home/spotter5/.conda/envs/xgboost_gpu/lib/python3.9/multiprocessing/queues.py", line 108, in get
    if not self._rlock.acquire(block, timeout):
KeyboardInterrupt

Traceback (most recent call last):
  File "/home/spotter5/.conda/envs/xgboost_gpu/lib/python3.9/site-packages/joblib/externals/loky/process_executor.py", line 426, in _process_worker
    call_item = call_queue.get(block=True, timeout=timeout)
  File "/home/spotter5/.conda/envs/xgboost_gpu/lib/python3.9/multiprocessing/queues.py", line 108, in get
    if not self._rlock.acquire(block, timeout):
KeyboardInterrupt

Traceback (most recent call last):
  File "/home/spotter5/.conda/envs/xgboost_gpu/lib/python3.9/site-packages/joblib/externals/loky/process_executor.py", line 426, in _


KeyboardInterrupt



Predict model

In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import json
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import joblib
from typing import List, Tuple

# ----------------- PATHS (edit if needed) -----------------
# Everything hangs off ROOT_DIR: the feature table to score, the output
# folder, and the folder holding the fitted models.
ROOT_DIR = "/explore/nobackup/people/spotter5/new_combustion"
OUT_DIR = os.path.join(ROOT_DIR, "LCC")
MODEL_DIR = os.path.join(ROOT_DIR, "LCC", "models")
INPUT_CSV = os.path.join(ROOT_DIR, "2025-10-28_LCC_CombustionModelPredictors.csv")
os.makedirs(OUT_DIR, exist_ok=True)

# ----------------- EXCLUDED PREDICTOR COLUMNS -----------------
# Identifier / location / date columns that are never fed to a model.
EXCLUDE_PRED_COLS = {
    # canonical names
    'id',
    'project.name',
    'lat',
    'lon',
    'burn_year',
    'date',
    'project',
    # raw spelling variants
    'ID',
    'Id',
    'project_name',
    'latitude',
    'longitude',
    'fireYr',
    'Date',
    'landcover_name',
}

# Model files are discovered by name: <MODEL_PREFIX><target><MODEL_SUFFIX>.
# Whatever matches inside MODEL_DIR is applied, so a training run that only
# produced one target (e.g., burn_depth) is fine.
MODEL_PREFIX = "rf_final_"
MODEL_SUFFIX = ".joblib"

# ----------------- PREPROCESS (mirror training) -----------------
def load_and_clean_dataframe(path: str) -> pd.DataFrame:
    """Read the feature table and apply the same column clean-up as training.

    Headers are whitespace-stripped, known spelling variants are renamed to
    their canonical form, and ``LandCover`` (when present) is expanded into
    ``LC_<level>`` indicator columns.

    Unlike training, no level is dropped here. With ``drop_first=True`` the
    omitted level is whichever sorts first *in this table*, which need not be
    the reference level of the training data; a land-cover class could then
    lose its indicator column and be scored as the reference class. Emitting
    every level is safe because ``build_predict_matrix`` keeps only the
    columns the model was fitted on.

    Args:
        path: CSV file to read.

    Returns:
        The cleaned DataFrame; indicator columns are float 0.0/1.0.
    """
    print(f"Reading features: {path}")
    df = pd.read_csv(path)
    df.columns = [c.strip() for c in df.columns]

    # Standardize column names like in training
    rename_map = {}
    if 'ID' in df.columns: rename_map['ID'] = 'id'
    if 'Id' in df.columns: rename_map['Id'] = 'id'
    if 'project_name' in df.columns and 'project.name' not in df.columns:
        rename_map['project_name'] = 'project.name'
    if 'Date' in df.columns and 'date' not in df.columns:
        rename_map['Date'] = 'date'
    if 'latitude' in df.columns and 'lat' not in df.columns:
        rename_map['latitude'] = 'lat'
    if 'longitude' in df.columns and 'lon' not in df.columns:
        rename_map['longitude'] = 'lon'
    if 'fireYr' in df.columns and 'burn_year' not in df.columns:
        rename_map['fireYr'] = 'burn_year'
    df = df.rename(columns=rename_map)

    # One-hot LandCover. dtype=float keeps the indicators numeric on every
    # pandas version (pandas >= 2.0 defaults to bool, which the numeric-only
    # filter downstream would discard).
    if 'LandCover' in df.columns:
        df = pd.get_dummies(
            df,
            columns=['LandCover'],
            prefix='LC',
            drop_first=False,
            dummy_na=False,
            dtype=float,
        )

    return df

def build_predict_matrix(df_in: pd.DataFrame, model_feature_names: List[str], target_cols_to_drop: List[str]) -> pd.DataFrame:
    """Assemble the design matrix a fitted model expects from a cleaned feature table.

    Args:
        df_in: Cleaned feature table; it is not modified.
        model_feature_names: Feature names, in the order the model was fitted with.
        target_cols_to_drop: Response columns to remove if they are present.

    Returns:
        A DataFrame with exactly ``model_feature_names`` as columns, in that
        order. Features absent from ``df_in`` are filled with 0.0 and reported
        on stdout.
    """
    # Drop non-predictor columns
    drop_cols = [c for c in EXCLUDE_PRED_COLS if c in df_in.columns]
    X = df_in.drop(columns=drop_cols, errors='ignore').copy()

    # Drop any known target cols (if they happen to be present)
    X = X.drop(columns=[c for c in target_cols_to_drop if c in X.columns], errors='ignore')

    # Boolean indicator columns (the pandas >= 2.0 default for get_dummies) are
    # not np.number; cast them so the numeric filter below does not discard
    # them and have them silently re-created as all-zero columns.
    bool_cols = X.select_dtypes(include=['bool']).columns.tolist()
    if bool_cols:
        X = X.astype({c: float for c in bool_cols})

    # Keep only numeric columns
    non_numeric = X.select_dtypes(exclude=[np.number]).columns.tolist()
    if non_numeric:
        X = X.drop(columns=non_numeric)

    # Align to model features. Zero is the right fill for an indicator of a
    # category that does not occur in this table, but it is wrong for a
    # continuous predictor — so always say which columns were synthesised.
    missing = [c for c in model_feature_names if c not in X.columns]
    if missing:
        print(f"[WARN] {len(missing)} model feature(s) not found in input; filled with 0.0: {missing}")

    # reindex adds the missing columns as 0.0, drops extras and fixes the order.
    return X.reindex(columns=list(model_feature_names), fill_value=0.0)

def find_models(model_dir: str) -> List[Tuple[str, str]]:
    """List the saved models in ``model_dir`` as sorted ``(target, path)`` pairs.

    A file counts as a model when its name starts with ``MODEL_PREFIX`` and
    ends with ``MODEL_SUFFIX``; the target name is whatever sits between the
    two (e.g., ``rf_final_burn_depth.joblib`` -> ``burn_depth``). A missing
    directory yields an empty list and a warning on stdout.
    """
    if not os.path.isdir(model_dir):
        print(f"[WARN] Model directory not found: {model_dir}")
        return []

    head, tail = len(MODEL_PREFIX), len(MODEL_SUFFIX)
    return sorted(
        (fname[head:-tail], os.path.join(model_dir, fname))
        for fname in os.listdir(model_dir)
        if fname.startswith(MODEL_PREFIX) and fname.endswith(MODEL_SUFFIX)
    )

def maybe_load_training_feature_list(meta_json_path: str):
    """Best-effort read of the ``feature_names`` entry from a metadata JSON file.

    Optional helper for models that do not carry ``feature_names_in_``.

    Args:
        meta_json_path: Path of the metadata file.

    Returns:
        The value stored under ``"feature_names"``, or ``None`` when the file
        does not exist, cannot be read or parsed, is not a JSON object, or has
        no such key. Failures are reported on stdout instead of being raised.
    """
    if not os.path.exists(meta_json_path):
        return None

    try:
        with open(meta_json_path, "r") as f:
            meta = json.load(f)
    except Exception as exc:
        # Still best-effort, but say why the metadata was ignored so a corrupt
        # file is not mistaken for a missing one.
        print(f"[WARN] Could not read feature metadata {meta_json_path}: {exc}")
        return None

    if not isinstance(meta, dict):
        print(f"[WARN] Feature metadata {meta_json_path} is not a JSON object; ignoring it.")
        return None

    return meta.get("feature_names", None)

# ----------------- MAIN -----------------
def main():
    """Score INPUT_CSV with every saved model and write one predictions CSV.

    For each ``(target, path)`` returned by ``find_models(MODEL_DIR)`` the
    model is loaded, the feature table is aligned to the model's feature
    names, and the predictions are stored in a ``pred_<target>`` column.
    Identifier columns present in the input are carried over to the output.

    Raises:
        FileNotFoundError: No model file was found in ``MODEL_DIR``.
        RuntimeError: A model has no ``feature_names_in_`` and no usable
            feature list in its companion metadata file.
    """
    df = load_and_clean_dataframe(INPUT_CSV)

    # Response columns that may be present in the table; they are removed from
    # the predictors (over-listing is harmless, absent names are skipped).
    candidate_targets = ['combusted_above', 'above.carbon.combusted', 'combusted_below', 'burn_depth']
    present_targets = [name for name in candidate_targets if name in df.columns]

    models = find_models(MODEL_DIR)
    if len(models) == 0:
        raise FileNotFoundError(f"No models found in {MODEL_DIR} matching {MODEL_PREFIX}*{MODEL_SUFFIX}")

    print(f"Found {len(models)} model(s): {[name for name, _path in models]}")

    preds = {}  # output column name -> prediction vector

    for target_name, model_path in models:
        print(f"\nLoading model: {model_path}")
        model = joblib.load(model_path)

        if not hasattr(model, "feature_names_in_"):
            # Fall back to a companion metadata file carrying the feature list.
            meta_json = os.path.join(OUT_DIR, f"{target_name}_final_model_metadata.json")
            feature_names = maybe_load_training_feature_list(meta_json)
            if not feature_names:
                raise RuntimeError(
                    f"Model {model_path} is missing feature_names_in_. "
                    f"Add feature names to saved metadata or retrain with scikit-learn >=1.0."
                )
        else:
            # Preferred source: the names recorded on the fitted estimator.
            feature_names = list(model.feature_names_in_)

        X = build_predict_matrix(df, feature_names, present_targets)

        print(f"  Applying model to {len(X)} rows, {X.shape[1]} features")
        preds[f"pred_{target_name}"] = model.predict(X)

    # Output table: identifier columns (when present) followed by predictions.
    id_like = ["id", "project.name", "lat", "lon", "burn_year", "date"]
    keep_cols = [name for name in id_like if name in df.columns]
    if keep_cols:
        out_df = df[keep_cols].copy()
    else:
        out_df = pd.DataFrame(index=df.index)

    for pred_col, values in preds.items():
        out_df[pred_col] = values

    # File name is derived from the input file name.
    base, _ext = os.path.splitext(os.path.basename(INPUT_CSV))
    out_csv = os.path.join(OUT_DIR, f"{base}_predictions.csv")
    out_df.to_csv(out_csv, index=False)
    print(f"\nWrote predictions → {out_csv}")


if __name__ == "__main__":
    main()


Reading features: /explore/nobackup/people/spotter5/new_combustion/2025-10-28_LCC_CombustionModelPredictors.csv
Found 3 model(s): ['burn_depth', 'combusted_above', 'combusted_below']

Loading model: /explore/nobackup/people/spotter5/new_combustion/LCC/models/rf_final_burn_depth.joblib
  Applying model to 98 rows, 52 features

Loading model: /explore/nobackup/people/spotter5/new_combustion/LCC/models/rf_final_combusted_above.joblib
  Applying model to 98 rows, 51 features

Loading model: /explore/nobackup/people/spotter5/new_combustion/LCC/models/rf_final_combusted_below.joblib
  Applying model to 98 rows, 52 features

Wrote predictions → /explore/nobackup/people/spotter5/new_combustion/LCC/2025-10-28_LCC_CombustionModelPredictors_predictions.csv


With tuning

In [2]:
#!/usr/bin/env python
# coding: utf-8

import os
import json
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from collections import defaultdict, Counter

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import LeaveOneOut, KFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.utils.validation import check_is_fitted
from scipy.stats import randint, uniform
import joblib

# ----------------- PATHS -----------------
INPUT_CSV = "/explore/nobackup/people/spotter5/new_combustion/2025-10-03_CombustionModelPredictors.csv"
OUT_DIR = "/explore/nobackup/people/spotter5/new_combustion/LCC"
MODEL_DIR = os.path.join(OUT_DIR, "models")
# Make sure both output locations exist before anything is written.
for _out_path in (OUT_DIR, MODEL_DIR):
    os.makedirs(_out_path, exist_ok=True)

# ----------------- SEARCH CONFIG -----------------
RANDOM_STATE = 42
N_JOBS = -1
INNER_FOLDS = 5  # inner CV folds for hyperparameter tuning
N_ITER_SEARCH = 40  # RandomizedSearch iterations per LOOCV split
# greater_is_better=False negates the metric, so the scorer yields neg MSE.
SCORER = make_scorer(mean_squared_error, greater_is_better=False)

# Reasonable, broad-ish RF search spaces
RF_PARAM_DIST = dict(
    n_estimators=randint(200, 1000),
    max_depth=randint(3, 40),
    # uniform(loc, scale) spans [loc, loc + scale]: fraction of features 0.2..1.0
    max_features=uniform(0.2, 0.8),
    min_samples_split=randint(2, 20),
    min_samples_leaf=randint(1, 10),
    bootstrap=[True, False],
)

print(f"Reading: {INPUT_CSV}")
df = pd.read_csv(INPUT_CSV)

# ----------------- BASIC CLEANUP -----------------
# Strip stray whitespace from headers so the name checks below are reliable.
df.columns = [header.strip() for header in df.columns]

# Fold known spelling variants onto one canonical column name.
rename_map = {}
for _variant in ('ID', 'Id'):
    # Identifier variants are always mapped to 'id'.
    if _variant in df.columns:
        rename_map[_variant] = 'id'
for _variant, _canonical in (
    ('project_name', 'project.name'),
    ('Date', 'date'),
    ('latitude', 'lat'),
    ('longitude', 'lon'),
    ('fireYr', 'burn_year'),
):
    # Only rename when the canonical column does not already exist.
    if _variant in df.columns and _canonical not in df.columns:
        rename_map[_variant] = _canonical
df = df.rename(columns=rename_map)

# Schema snapshot: one row per column with dtype, null count and number of
# distinct non-null values.
schema = pd.DataFrame({
    "column": df.columns,
    "dtype": df.dtypes.astype(str),
    "n_null": df.isna().sum(),
    "n_unique": [df[name].nunique(dropna=True) for name in df.columns],
})
schema.to_csv(os.path.join(OUT_DIR, "schema_summary.csv"), index=False)

# ----------------- CATEGORICAL: LandCover -> one-hot -----------------
# NOTE(review): pandas >= 2.0 returns bool dummy columns by default, and the
# `select_dtypes(exclude=[np.number])` filter in build_xy would then discard
# them — confirm the LC_* columns actually reach the model.
if 'LandCover' in df.columns:
    df = pd.get_dummies(
        df,
        columns=['LandCover'],
        prefix='LC',
        drop_first=True,
        dummy_na=False,
    )

# ----------------- EXCLUDED PREDICTOR COLUMNS -----------------
# Identifier / location / date columns that are never used as features.
EXCLUDE_PRED_COLS = {
    # canonical names
    'id', 'project.name', 'lat', 'lon', 'burn_year', 'date', 'project',
    # allow for variants if they slipped through
    'ID', 'Id', 'project_name', 'latitude', 'longitude', 'fireYr', 'Date', 'landcover_name',
}

# ----------------- TARGET PICKER -----------------
def pick_col(candidates):
    """Return the first name in ``candidates`` that is a column of ``df``, else ``None``."""
    return next((name for name in candidates if name in df.columns), None)

COL_ABOVE = pick_col(['combusted_above', 'above.carbon.combusted'])
COL_BELOW = pick_col(['combusted_below'])
COL_DEPTH = pick_col(['burn_depth'])

# Targets modelled in this cell, as (column, units label) pairs. To model all
# three, add COL_DEPTH to the tuple below.
targets = [(name, "units") for name in (COL_ABOVE, COL_BELOW) if name]
if len(targets) == 0:
    raise ValueError("None of the expected target columns were found in the dataset.")

# IMPORTANT: every response column found in the data is listed here — modelled
# or not — so that none of them can leak into the predictors of another target.
ALL_TARGET_COLS = list(filter(None, [COL_ABOVE, COL_BELOW, COL_DEPTH]))

# ------------- GLOBAL METRICS STORAGE FOR VIOLIN PLOT -------------
# Filled with per-target LOOCV metrics (including R²).
GLOBAL_METRICS = []

# ----------------- Helper: build X, y -----------------
def build_xy(df_in: pd.DataFrame, target_col: str):
    """Split a cleaned table into numeric predictors ``X`` and float response ``y``.

    Rows with a missing ``target_col`` are dropped. Columns listed in
    ``EXCLUDE_PRED_COLS``, every column in ``ALL_TARGET_COLS``, the target
    itself and all non-numeric columns are removed from the predictors.

    Args:
        df_in: Cleaned input table; it is not modified.
        target_col: Name of the response column.

    Returns:
        Tuple ``(X, y)`` sharing the index of the retained rows.

    Raises:
        ValueError: No numeric predictor is left.
    """
    banned = [name for name in EXCLUDE_PRED_COLS if name in df_in.columns]
    frame = df_in.drop(columns=banned, errors='ignore').copy().dropna(subset=[target_col])

    y = frame[target_col].astype(float).copy()

    # Remove the current target together with every other known response
    # column so that no dependent variable leaks into the predictors.
    responses = {*ALL_TARGET_COLS, target_col}
    X = frame.drop(columns=list(responses), errors='ignore')

    # Guard against accidental leakage
    assert target_col not in X.columns, f"Target leakage: {target_col} present in predictors!"

    # Only numeric predictors are kept.
    text_like = X.select_dtypes(exclude=[np.number]).columns.tolist()
    if len(text_like) > 0:
        X = X.drop(columns=text_like)

    if X.columns.empty:
        raise ValueError(f"No numeric predictors left after preprocessing for target '{target_col}'.")
    return X, y

# ----------------- Nested LOOCV with inner tuning -----------------
def run_target_nested_loocv(target_col: str, units_label: str = "units"):
    """Run nested leave-one-out CV for one target, then fit and save a final forest.

    For every leave-one-out split, a ``RandomizedSearchCV`` (inner shuffled
    K-fold, scored with ``SCORER``) tunes a ``RandomForestRegressor`` on the
    training rows, and the refit winner predicts the single held-out row.
    Afterwards the function:

    * computes RMSE and R² over all held-out predictions;
    * writes predictions, per-split tuning records, metrics, a parameter
      summary and an observed-vs-predicted plot into ``OUT_DIR``;
    * appends the metrics to the module-level ``GLOBAL_METRICS`` list;
    * refits a forest on all rows with the consensus hyperparameters and saves
      it with joblib in ``MODEL_DIR``, plus a JSON metadata file in ``OUT_DIR``.

    RMSE is computed as ``sqrt(MSE)`` rather than with the ``squared=False``
    keyword, which newer scikit-learn releases no longer accept.

    Parameters
    ----------
    target_col : str
        Response column in the module-level ``df``.
    units_label : str, default "units"
        Text appended to RMSE values in logs, the plot title and the metadata.

    Returns
    -------
    None
        Returns early (after printing an error) when there are no predictors
        or fewer than 3 samples.

    Raises
    ------
    RuntimeError
        If a tuned estimator lists ``target_col`` among its input features.
    """
    X, y = build_xy(df, target_col)
    if X.shape[1] == 0 or len(y) < 3:
        print(f"[ERROR] Not enough predictors or samples for '{target_col}'.")
        return

    print(f"\nTarget: {target_col} | X: {X.shape} | y: {y.shape}")
    # Use a clean 0..n-1 index so the LOOCV positions map directly onto y_pred.
    y = y.reset_index(drop=True)
    X = X.reset_index(drop=True)
    n = len(y)

    loo = LeaveOneOut()
    y_pred = np.zeros(n, dtype=float)

    split_records = []  # per-split metadata: best params, inner best score

    # Iterate LOOCV splits
    for i, (train_idx, test_idx) in enumerate(loo.split(X), start=1):
        Xtr, Xte = X.iloc[train_idx], X.iloc[test_idx]
        ytr = y.iloc[train_idx]

        # inner CV tuner (only ever sees the training rows of this split)
        inner = KFold(n_splits=min(INNER_FOLDS, len(ytr)), shuffle=True, random_state=RANDOM_STATE)
        base = RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=N_JOBS)

        tuner = RandomizedSearchCV(
            estimator=base,
            param_distributions=RF_PARAM_DIST,
            n_iter=N_ITER_SEARCH,
            scoring=SCORER,
            cv=inner,
            random_state=RANDOM_STATE,
            n_jobs=N_JOBS,
            verbose=0,
            refit=True,  # refit on full train with best params
        )
        tuner.fit(Xtr, ytr)

        best_est = tuner.best_estimator_

        # Sanity: tuned model must NOT expect the target as a feature
        tuned_features = list(getattr(best_est, "feature_names_in_", []))
        if target_col in tuned_features:
            raise RuntimeError(
                f"Leakage detected: tuned model expects target '{target_col}' as a feature. "
                f"Check preprocessing."
            )

        # predict held-out
        y_pred[test_idx] = best_est.predict(Xte)

        split_records.append({
            "split": i,
            "test_index": int(test_idx[0]),
            "best_params": tuner.best_params_,
            "inner_cv_neg_mse": float(tuner.best_score_),  # neg MSE
            "inner_cv_rmse": float(np.sqrt(-tuner.best_score_))
        })

        if i % 25 == 0 or i == n:
            print(f"  LOOCV progress: {i}/{n}")

    # Metrics (sqrt of MSE works on every scikit-learn version)
    rmse = float(np.sqrt(mean_squared_error(y, y_pred)))
    r2   = r2_score(y, y_pred)
    print(f"[{target_col}] LOOCV tuned RMSE: {rmse:.4f} {units_label} | R²: {r2:.4f}")

    # Save per-split results
    preds_df = pd.DataFrame({
        "index": np.arange(n),
        "y_obs": y.values,
        "y_pred": y_pred
    })
    splits_df = pd.DataFrame(split_records)
    splits_df["inner_cv_rmse"] = splits_df["inner_cv_rmse"].astype(float)

    out_prefix = target_col.replace('.', '_')
    preds_path = os.path.join(OUT_DIR, f"{out_prefix}_loocv_predictions.csv")
    splits_path = os.path.join(OUT_DIR, f"{out_prefix}_loocv_split_tuning.csv")
    preds_df.to_csv(preds_path, index=False)
    splits_df.to_csv(splits_path, index=False)

    # Save LOOCV metrics (per-target)
    metrics_df = pd.DataFrame({
        "target": [target_col],
        "n": [n],
        "n_predictors": [X.shape[1]],
        "loocv_rmse": [rmse],
        "loocv_r2": [r2]
    })
    metrics_path = os.path.join(OUT_DIR, f"{out_prefix}_loocv_metrics.csv")
    metrics_df.to_csv(metrics_path, index=False)

    # ---- Add to global metrics for violin plot later ----
    GLOBAL_METRICS.append({
        "target": target_col,
        "n": n,
        "n_predictors": X.shape[1],
        "loocv_rmse": rmse,
        "loocv_r2": r2
    })

    # Plot 1:1
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=preds_df["y_obs"], y=preds_df["y_pred"], s=18, edgecolor=None)
    lo = float(np.nanmin([preds_df["y_obs"].min(), preds_df["y_pred"].min()]))
    hi = float(np.nanmax([preds_df["y_obs"].max(), preds_df["y_pred"].max()]))
    plt.plot([lo, hi], [lo, hi], 'k--', lw=2, label='1:1 Line')
    plt.xlabel(f"Observed {target_col}")
    plt.ylabel(f"Predicted {target_col}")
    plt.title(f"{target_col}: Nested-LOOCV Obs vs Pred (RF)\nRMSE={rmse:.3f} {units_label}, R²={r2:.3f}")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plot_path = os.path.join(OUT_DIR, f"{out_prefix}_loocv_obs_pred.png")
    plt.savefig(plot_path, dpi=150)
    plt.close()

    # ----------------- Choose consensus hyperparameters -----------------
    # Strategy: group identical param dicts, compute median inner-CV RMSE per group, pick best (lowest)
    def normalize_params(d):
        # Dicts are unhashable; a key-sorted tuple of items is a stable group key.
        return tuple(sorted(d.items(), key=lambda x: x[0]))

    grp = defaultdict(list)
    for _, row in splits_df.iterrows():
        grp[normalize_params(row["best_params"])].append(float(row["inner_cv_rmse"]))

    summary_rows = []
    for params_key, rmses in grp.items():
        summary_rows.append({
            "params_key": params_key,
            "median_inner_rmse": float(np.median(rmses)),
            "count": len(rmses)
        })
    # Lowest median RMSE first; ties go to the parameter set chosen most often.
    params_summary = pd.DataFrame(summary_rows).sort_values(
        ["median_inner_rmse", "count"], ascending=[True, False]
    ).reset_index(drop=True)

    params_path = os.path.join(OUT_DIR, f"{out_prefix}_param_summary.csv")
    params_summary.to_csv(params_path, index=False)

    # Best overall param set
    best_key = params_summary.loc[0, "params_key"]
    best_params = dict(best_key)

    # ----------------- Fit final model on ALL data with best params -----------------
    final_model = RandomForestRegressor(
        random_state=RANDOM_STATE, n_jobs=N_JOBS, **best_params
    )
    final_model.fit(X, y)

    # Save model + metadata (include feature names for safe inference)
    model_path = os.path.join(MODEL_DIR, f"rf_final_{out_prefix}.joblib")
    joblib.dump(final_model, model_path)

    # json cannot encode every numpy scalar type; unwrap them to plain Python values.
    json_safe_params = {
        k: (v.item() if isinstance(v, np.generic) else v) for k, v in best_params.items()
    }

    meta = {
        "target": target_col,
        "n_samples": int(n),
        "n_predictors": int(X.shape[1]),
        "units": units_label,
        "loocv_rmse": float(rmse),
        "loocv_r2": float(r2),
        "final_params": json_safe_params,
        "feature_names": list(getattr(final_model, "feature_names_in_", [])),
        "search_config": {
            "inner_folds": INNER_FOLDS,
            "n_iter_search": N_ITER_SEARCH,
            "random_state": RANDOM_STATE
        },
        "files": {
            "predictions_csv": os.path.relpath(preds_path, OUT_DIR),
            "split_tuning_csv": os.path.relpath(splits_path, OUT_DIR),
            "param_summary_csv": os.path.relpath(params_path, OUT_DIR),
            "plot_png": os.path.relpath(plot_path, OUT_DIR),
            "model_joblib": os.path.relpath(model_path, OUT_DIR)
        }
    }
    meta_path = os.path.join(OUT_DIR, f"{out_prefix}_final_model_metadata.json")
    with open(meta_path, "w") as f:
        json.dump(meta, f, indent=2)

    print(f"Saved tuned LOOCV products for [{target_col}]")
    print(f"  Model  → {model_path}")
    print(f"  Meta   → {meta_path}")
    print(f"  Plots/CSVs in {OUT_DIR}")

# ----------------- RUN FOR EACH TARGET -----------------
for tcol, units in targets:
    run_target_nested_loocv(target_col=tcol, units_label=units)

# ----------------- GLOBAL VIOLIN PLOT OF LOOCV R² -----------------
# NOTE(review): GLOBAL_METRICS gets one row per target, so each violin below is
# drawn from a single R² value — confirm a violin is the intended chart.
if not GLOBAL_METRICS:
    print("\nNo global metrics collected; skipping violin plot and global CSV.")
else:
    global_df = pd.DataFrame(GLOBAL_METRICS)
    global_csv_path = os.path.join(OUT_DIR, "all_targets_loocv_metrics.csv")
    global_df.to_csv(global_csv_path, index=False)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.violinplot(x="target", y="loocv_r2", data=global_df, inner="point", cut=0, ax=ax)
    ax.set_title("Distribution of LOOCV R² by Target")
    ax.set_xlabel("Target")
    ax.set_ylabel("LOOCV R²")
    ax.grid(axis='y', alpha=0.3)
    fig.tight_layout()
    violin_path = os.path.join(OUT_DIR, "all_targets_loocv_r2_violin.png")
    fig.savefig(violin_path, dpi=150)
    plt.close(fig)

    print(f"\nSaved global LOOCV metrics CSV → {global_csv_path}")
    print(f"Saved LOOCV R² violin plot   → {violin_path}")

print("\nDone.")


Reading: /explore/nobackup/people/spotter5/new_combustion/2025-10-03_CombustionModelPredictors.csv

Target: combusted_above | X: (885, 50) | y: (885,)


KeyboardInterrupt: 

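Just depth: random forest for `burn_depth` only (one global RandomizedSearchCV, then LOOCV with the selected hyperparameters)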

In [None]:
't'

In [1]:
#!/usr/bin/env python
# coding: utf-8

import os
import json
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from collections import defaultdict, Counter
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import LeaveOneOut, KFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from scipy.stats import randint, uniform
import joblib

# ----------------- PATHS -----------------
INPUT_CSV = "/explore/nobackup/people/spotter5/new_combustion/2025-10-03_CombustionModelPredictors.csv"
OUT_DIR   = "/explore/nobackup/people/spotter5/new_combustion/LCC"
MODEL_DIR = os.path.join(OUT_DIR, "models")
os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

# ----------------- SEARCH CONFIG -----------------
RANDOM_STATE, N_JOBS = 42, -1
INNER_FOLDS, N_ITER_SEARCH = 5, 40
# Negated MSE, so that "greater is better" holds for the search.
SCORER = make_scorer(mean_squared_error, greater_is_better=False)

RF_PARAM_DIST = dict(
    n_estimators=randint(200, 1000),
    max_depth=randint(3, 40),
    max_features=uniform(0.2, 0.8),  # loc=0.2, scale=0.8 -> fraction of features in [0.2, 1.0]
    min_samples_split=randint(2, 20),
    min_samples_leaf=randint(1, 10),
    bootstrap=[True, False],
)

print(f"Reading: {INPUT_CSV}")
df = pd.read_csv(INPUT_CSV)

# ----------------- BASIC CLEANUP -----------------
# Strip stray whitespace from headers, then map known name variants to one spelling.
df.columns = list(map(str.strip, df.columns))
rename_map = {old: 'id' for old in ('ID', 'Id') if old in df.columns}
rename_map.update({
    old: new
    for old, new in (
        ('project_name', 'project.name'),
        ('Date', 'date'),
        ('latitude', 'lat'),
        ('longitude', 'lon'),
        ('fireYr', 'burn_year'),
    )
    # only rename when the canonical name is not already taken
    if old in df.columns and new not in df.columns
})
df = df.rename(columns=rename_map)

# Schema snapshot: one row per column (dtype, null count, distinct non-null values)
schema = pd.DataFrame({
    "column": df.columns,
    "dtype": df.dtypes.astype(str),
    "n_null": df.isna().sum(),
    "n_unique": [df[name].nunique(dropna=True) for name in df.columns],
})
schema.to_csv(os.path.join(OUT_DIR, "schema_summary.csv"), index=False)

# ----------------- CATEGORICAL: LandCover -> one-hot -----------------
if 'LandCover' in df.columns:
    df = pd.get_dummies(df, columns=['LandCover'], prefix='LC', dummy_na=False, drop_first=True)

# ----------------- EXCLUDED PREDICTOR COLUMNS -----------------
# Identifier / location / date columns (canonical names plus raw variants)
# that must never be used as predictors.
EXCLUDE_PRED_COLS = {
    'id', 'project.name', 'lat', 'lon', 'burn_year', 'date', 'project',
} | {
    'ID', 'Id', 'project_name', 'latitude', 'longitude', 'fireYr', 'Date',
    'landcover_name',
}

# ----------------- TARGET PICKER -----------------
def pick_col(candidates, columns=None):
    """Return the first candidate name that exists among the available columns.

    Parameters
    ----------
    candidates : iterable of str
        Column names to try, in priority order.
    columns : container of str, optional
        Names to search in. When omitted, the columns of the module-level
        ``df`` are used (the original behaviour).

    Returns
    -------
    str or None
        The first entry of ``candidates`` found in ``columns``; ``None`` when
        nothing matches (including an empty ``candidates``).
    """
    # Only touch the global frame when the caller did not supply the names.
    available = df.columns if columns is None else columns
    for name in candidates:
        if name in available:
            return name
    return None

# Resolve each response column once; an entry is None when the dataset has no match.
COL_ABOVE, COL_BELOW, COL_DEPTH = (
    pick_col(aliases)
    for aliases in (
        ['combusted_above', 'above.carbon.combusted'],
        ['combusted_below'],
        ['burn_depth'],
    )
)

# >>> Train burn_depth only for now <<<
targets = [(col, "units") for col in (COL_DEPTH,) if col]
if len(targets) == 0:
    raise ValueError("None of the expected target columns were found in the dataset.")

# IMPORTANT: list every resolved target (modelled or not) so build_xy can
# strip all of them from the predictors.
ALL_TARGET_COLS = list(filter(None, [COL_ABOVE, COL_BELOW, COL_DEPTH]))

# ----------------- Helper: build X, y -----------------
def build_xy(df_in: pd.DataFrame, target_col: str, exclude_cols=None, all_target_cols=None):
    """Build the predictor matrix ``X`` and response ``y`` for one target.

    Steps: drop identifier/metadata columns, drop rows whose target is missing,
    remove every known target column from the predictors, and keep only
    numeric predictors.

    Boolean columns are cast to ``uint8`` before the numeric filter. pandas >= 2.0
    makes ``get_dummies`` return ``bool`` columns, which ``np.number`` does not
    cover, so without the cast the one-hot ``LC_*`` predictors would be
    silently discarded.

    Parameters
    ----------
    df_in : pd.DataFrame
        Source table; it is not modified.
    target_col : str
        Name of the response column.
    exclude_cols : iterable of str, optional
        Columns never used as predictors. Defaults to ``EXCLUDE_PRED_COLS``.
    all_target_cols : iterable of str, optional
        All response columns to strip from ``X``. Defaults to ``ALL_TARGET_COLS``.

    Returns
    -------
    (pd.DataFrame, pd.Series)
        ``X`` (numeric predictors) and ``y`` (target as float), row-aligned.

    Raises
    ------
    ValueError
        If no numeric predictor is left.
    """
    if exclude_cols is None:
        exclude_cols = EXCLUDE_PRED_COLS
    if all_target_cols is None:
        all_target_cols = ALL_TARGET_COLS

    drop_cols = [c for c in exclude_cols if c in df_in.columns]
    work = df_in.drop(columns=drop_cols, errors='ignore').copy()
    work = work.dropna(subset=[target_col])

    y = work[target_col].astype(float).copy()

    # Drop the explicit target AND any other known target columns to prevent leakage
    X = work.drop(columns=list(set(all_target_cols) | {target_col}), errors='ignore')

    # Guard against accidental leakage
    assert target_col not in X.columns, f"Target leakage: {target_col} present in predictors!"

    # Keep boolean indicators (e.g. one-hot dummies) by making them numeric.
    bool_cols = X.select_dtypes(include=['bool']).columns.tolist()
    if bool_cols:
        X = X.astype({c: np.uint8 for c in bool_cols})

    non_numeric = X.select_dtypes(exclude=[np.number]).columns.tolist()
    if non_numeric:
        X = X.drop(columns=non_numeric)

    # Sanity: no empty feature set
    if X.shape[1] == 0:
        raise ValueError(f"No numeric predictors left after preprocessing for target '{target_col}'.")
    return X, y

# ----------------- RandomizedSearch + LOOCV with per-fold R² -----------------
def run_target_randomsearch_loocv(target_col: str, units_label: str = "units"):
    """Tune a random forest once on all rows, then evaluate it with leave-one-out CV.

    Workflow:

    1. ``RandomizedSearchCV`` (shuffled K-fold, ``SCORER``) over ``RF_PARAM_DIST``
       using every available row; the full ``cv_results_`` table is saved.
    2. LOOCV with those fixed hyperparameters: one forest per left-out row,
       storing the prediction, the training-set mean and a per-row "R²"
       ``1 - (y - ŷ)² / (y - mean_train)²`` (negative values clamped to 0,
       left as NaN when the denominator is 0).
    3. Global RMSE / R² over all held-out predictions (R² is also reported
       clamped at 0), CSV outputs, a 1:1 scatter plot and a violin plot of
       the per-row R² values, all written to ``OUT_DIR``.
    4. A final forest fit on all rows, saved with joblib in ``MODEL_DIR``
       together with JSON metadata in ``OUT_DIR``.

    NOTE(review): the hyperparameters in step 1 are selected using every
    sample, including each row that is later held out in step 2, so the LOOCV
    metrics are not from a fully nested procedure and may be optimistic.

    RMSE is computed as ``sqrt(MSE)`` rather than with the ``squared=False``
    keyword, which newer scikit-learn releases no longer accept.

    Parameters
    ----------
    target_col : str
        Response column in the module-level ``df``.
    units_label : str, default "units"
        Text appended to the random-search RMSE log line.

    Returns
    -------
    None
        Returns early (after printing an error) when there are no predictors
        or fewer than 3 samples.

    Raises
    ------
    RuntimeError
        If the tuned estimator lists ``target_col`` among its input features.
    """
    X, y = build_xy(df, target_col)
    if X.shape[1] == 0 or len(y) < 3:
        print(f"[ERROR] Not enough predictors or samples for '{target_col}'.")
        return

    print(f"\nTarget: {target_col} | X: {X.shape} | y: {y.shape}")
    # Use a clean 0..n-1 index so the LOOCV positions map directly onto the result arrays.
    y = y.reset_index(drop=True)
    X = X.reset_index(drop=True)
    out_prefix = target_col.replace('.', '_')

    # 1. RandomizedSearchCV (global)
    print(f"\n[{target_col}] Starting global RandomizedSearchCV with {N_ITER_SEARCH} iterations...")
    kfold = KFold(n_splits=min(INNER_FOLDS, len(y)), shuffle=True, random_state=RANDOM_STATE)
    base = RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=N_JOBS)
    tuner = RandomizedSearchCV(
        estimator=base,
        param_distributions=RF_PARAM_DIST,
        n_iter=N_ITER_SEARCH,
        scoring=SCORER,
        cv=kfold,
        random_state=RANDOM_STATE,
        n_jobs=N_JOBS,
        verbose=1,
        refit=True,
    )
    tuner.fit(X, y)

    # Leakage sanity check: the tuned model must NOT expect the target as a feature
    tuned_features = list(getattr(tuner.best_estimator_, "feature_names_in_", []))
    if target_col in tuned_features:
        raise RuntimeError(
            f"Leakage detected: tuned model expects target '{target_col}' as a feature. "
            f"Check preprocessing."
        )

    best_params = tuner.best_params_
    best_neg_mse = float(tuner.best_score_)  # SCORER is negated MSE
    best_rmse = float(np.sqrt(-best_neg_mse))
    print(f"[{target_col}] Best params: {best_params}")
    print(f"[{target_col}] RandomizedSearch best CV RMSE: {best_rmse:.4f} {units_label}")
    rs_results = pd.DataFrame(tuner.cv_results_)
    rs_results.to_csv(os.path.join(OUT_DIR, f"{out_prefix}_random_search_results.csv"), index=False)

    # 2. LOOCV using best params
    print(f"\n[{target_col}] Starting LOOCV with fixed best hyperparameters...")
    loo = LeaveOneOut()
    n = len(y)
    y_pred = np.zeros(n, dtype=float)
    r2_folds = np.full(n, np.nan, dtype=float)  # one R² per left-out observation
    train_means = np.full(n, np.nan, dtype=float)

    for i, (train_idx, test_idx) in enumerate(loo.split(X), start=1):
        Xtr, Xte = X.iloc[train_idx], X.iloc[test_idx]
        ytr = y.iloc[train_idx]
        yte = y.iloc[test_idx].values[0]

        model = RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=N_JOBS, **best_params)
        model.fit(Xtr, ytr)
        yhat = model.predict(Xte)[0]

        # store prediction
        test_i = test_idx[0]
        y_pred[test_i] = yhat

        # per-fold R² for this left-out sample (baseline = mean of the training targets)
        mu_train = float(ytr.mean())
        train_means[test_i] = mu_train
        denom = (yte - mu_train)**2
        num = (yte - yhat)**2

        # if denom == 0, R² is undefined -> keep NaN
        if denom != 0.0:
            r2_val = 1.0 - (num / denom)
            # clamp negative R² to 0
            if r2_val < 0.0:
                r2_val = 0.0
            r2_folds[test_i] = r2_val

        if i % 25 == 0 or i == n:
            print(f"  LOOCV progress: {i}/{n}")

    # Global LOOCV metrics over all predictions (sqrt of MSE works on every scikit-learn version)
    rmse_global = float(np.sqrt(mean_squared_error(y, y_pred)))
    r2_raw_global = float(r2_score(y, y_pred))
    # clamp negative global R² to 0 as well
    r2_global = max(0.0, r2_raw_global)
    print(f"[{target_col}] LOOCV RMSE (global): {rmse_global:.4f} | R² (global, clamped): {r2_global:.4f} (raw: {r2_raw_global:.4f})")

    # Save per-sample predictions + per-fold R²
    preds_df = pd.DataFrame({
        "index": np.arange(n),
        "y_obs": y.values,
        "y_pred": y_pred,
        "train_mean_y": train_means,
        "r2_loocv_fold": r2_folds
    })

    # CSV for violin plot
    violin_csv_path = os.path.join(OUT_DIR, f"{out_prefix}_violin.csv")
    preds_df.to_csv(violin_csv_path, index=False)
    print(f"[{target_col}] Saved per-fold R² CSV: {violin_csv_path}")

    # Also keep the LOOCV predictions CSV (same table, second file name)
    preds_path = os.path.join(OUT_DIR, f"{out_prefix}_loocv_predictions.csv")
    preds_df.to_csv(preds_path, index=False)
    print(f"[{target_col}] Saved LOOCV predictions (with fold R²) to: {preds_path}")

    # Save summary metrics
    pd.DataFrame({
        "target": [target_col],
        "n": [n],
        "n_predictors": [X.shape[1]],
        "loocv_rmse_global": [rmse_global],
        "loocv_r2_global_clamped": [r2_global],
        "loocv_r2_global_raw": [r2_raw_global],
        "random_search_best_rmse": [best_rmse]
    }).to_csv(os.path.join(OUT_DIR, f"{out_prefix}_loocv_metrics.csv"), index=False)

    # Plot 1:1 scatter (global)
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=preds_df["y_obs"], y=preds_df["y_pred"], s=18, edgecolor=None)
    lo = float(np.nanmin([preds_df["y_obs"].min(), preds_df["y_pred"].min()]))
    hi = float(np.nanmax([preds_df["y_obs"].max(), preds_df["y_pred"].max()]))
    plt.plot([lo, hi], [lo, hi], 'k--', lw=2)
    plt.xlabel(f"Observed {target_col}")
    plt.ylabel(f"Predicted {target_col}")
    plt.title(f"{target_col}: LOOCV Obs vs Pred\nRMSE={rmse_global:.3f}, R²={r2_global:.3f} (clamped)")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, f"{out_prefix}_loocv_obs_pred.png"), dpi=150)
    plt.close()

    # ----------------- VIOLIN PLOT OF PER-FOLD R² -----------------
    valid_r2 = preds_df["r2_loocv_fold"].dropna()
    if len(valid_r2) > 0:
        plt.figure(figsize=(6, 6))
        sns.violinplot(y=valid_r2, cut=0)
        plt.ylabel("Per-fold LOOCV R²")
        plt.title(f"Distribution of LOOCV R² per left-out sample\nTarget: {target_col}")
        plt.grid(axis='y', alpha=0.3)
        plt.tight_layout()
        violin_png_path = os.path.join(OUT_DIR, f"{out_prefix}_violin.png")
        plt.savefig(violin_png_path, dpi=150)
        plt.close()
        print(f"[{target_col}] Saved violin plot of per-fold R²: {violin_png_path}")
    else:
        print(f"[{target_col}] No valid per-fold R² values (all denominators zero). Skipping violin plot.")

    # Save model + metadata (with feature names!)
    final_model = RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=N_JOBS, **best_params)
    final_model.fit(X, y)
    model_path = os.path.join(MODEL_DIR, f"rf_final_{out_prefix}.joblib")
    joblib.dump(final_model, model_path)
    meta = {
        "target": target_col,
        "loocv_rmse_global": rmse_global,
        "loocv_r2_global_clamped": float(r2_global),
        "loocv_r2_global_raw": r2_raw_global,
        # json cannot encode every numpy scalar type; unwrap them to plain Python values.
        "best_params": {
            k: (v.item() if isinstance(v, np.generic) else v) for k, v in best_params.items()
        },
        "model_path": model_path,
        "n_samples": int(n),
        "feature_names": list(getattr(final_model, "feature_names_in_", []))
    }
    with open(os.path.join(OUT_DIR, f"{out_prefix}_final_model_metadata.json"), "w") as f:
        json.dump(meta, f, indent=2)

    # Extra sanity message
    print(f"[{target_col}] Final model trained with {len(meta['feature_names'])} features; "
          f"'{target_col}' in features? {target_col in meta['feature_names']}")

# ----------------- RUN FOR EACH TARGET -----------------
# Each entry of `targets` is a (column name, units label) pair.
for tcol, units in targets:
    run_target_randomsearch_loocv(target_col=tcol, units_label=units)

print("\nDone.")


Reading: /explore/nobackup/people/spotter5/new_combustion/2025-10-03_CombustionModelPredictors.csv

Target: burn_depth | X: (1174, 50) | y: (1174,)

[burn_depth] Starting global RandomizedSearchCV with 40 iterations...
Fitting 5 folds for each of 40 candidates, totalling 200 fits
[burn_depth] Best params: {'bootstrap': True, 'max_depth': 14, 'max_features': 0.40142583666029136, 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 312}
[burn_depth] RandomizedSearch best CV RMSE: 4.2863 units

[burn_depth] Starting LOOCV with fixed best hyperparameters...
  LOOCV progress: 25/1174
  LOOCV progress: 50/1174
  LOOCV progress: 75/1174
  LOOCV progress: 100/1174
  LOOCV progress: 125/1174
  LOOCV progress: 150/1174
  LOOCV progress: 175/1174
  LOOCV progress: 200/1174
  LOOCV progress: 225/1174
  LOOCV progress: 250/1174
  LOOCV progress: 275/1174
  LOOCV progress: 300/1174
  LOOCV progress: 325/1174
  LOOCV progress: 350/1174
  LOOCV progress: 375/1174
  LOOCV progress: 400/1174
 

In [5]:
't'

't'

XGBoost with tuning

In [None]:
#!/usr/bin/env python
# coding: utf-8

import os
import json
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from collections import defaultdict

from sklearn.model_selection import LeaveOneOut, KFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.utils.validation import check_is_fitted
from scipy.stats import randint, uniform, loguniform
import joblib

import xgboost as xgb

# ----------------- PATHS -----------------
INPUT_CSV = "/explore/nobackup/people/spotter5/new_combustion/2025-10-03_CombustionModelPredictors.csv"
OUT_DIR   = "/explore/nobackup/people/spotter5/new_combustion/LCC"
MODEL_DIR = os.path.join(OUT_DIR, "models")
os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

# ----------------- SEARCH CONFIG -----------------
RANDOM_STATE   = 42
N_JOBS         = -1
INNER_FOLDS    = 5        # inner CV folds for hyperparameter tuning
N_ITER_SEARCH  = 60       # RandomizedSearch iterations per LOOCV split
SCORER         = make_scorer(mean_squared_error, greater_is_better=False)  # neg MSE

# Toggle GPU if desired/available.
# For XGBoost >= 2.0 use device='cuda'; for older versions use tree_method='gpu_hist'.
USE_GPU = True

# Decide which GPU switch applies from the installed version. (The previous
# check looked for the string "gpu_hist" in xgb.__all__, which lists exported
# API names and never contains tree-method strings, so it always fell back to
# CPU "hist" on xgboost < 2.0.)
try:
    from packaging import version
    _XGB_IS_V2_PLUS = version.parse(xgb.__version__) >= version.parse("2.0.0")
except Exception:
    # packaging missing or version string not parseable: compare the leading
    # integer instead of silently disabling the GPU.
    _xgb_major = str(xgb.__version__).split(".")[0]
    _XGB_IS_V2_PLUS = _xgb_major.isdigit() and int(_xgb_major) >= 2

if not USE_GPU:
    TREE_METHOD, XGB_DEVICE_KW = "hist", {}
elif _XGB_IS_V2_PLUS:
    # xgboost>=2.0 supports 'device'
    TREE_METHOD, XGB_DEVICE_KW = "hist", {"device": "cuda"}
else:
    # legacy (<2.0) GPU switch
    TREE_METHOD, XGB_DEVICE_KW = "gpu_hist", {}

# Broad, sensible XGB search space
XGB_PARAM_DIST = {
    "n_estimators":      randint(300, 1500),
    "max_depth":         randint(3, 12),
    "learning_rate":     loguniform(1e-3, 3e-1),
    "subsample":         uniform(0.5, 0.5),        # 0.5..1.0
    "colsample_bytree":  uniform(0.5, 0.5),        # 0.5..1.0
    "min_child_weight":  loguniform(1e-1, 1e2),
    "reg_alpha":         loguniform(1e-6, 1e1),
    "reg_lambda":        loguniform(1e-6, 1e1),
    "gamma":             loguniform(1e-4, 1e1),
}

print(f"Reading: {INPUT_CSV}")
df = pd.read_csv(INPUT_CSV)

# ----------------- BASIC CLEANUP -----------------
# Strip stray whitespace from headers, then map known name variants to one spelling.
df.columns = list(map(str.strip, df.columns))

rename_map = {old: 'id' for old in ('ID', 'Id') if old in df.columns}
rename_map.update({
    old: new
    for old, new in (
        ('project_name', 'project.name'),
        ('Date', 'date'),
        ('latitude', 'lat'),
        ('longitude', 'lon'),
        ('fireYr', 'burn_year'),
    )
    # only rename when the canonical name is not already taken
    if old in df.columns and new not in df.columns
})
df = df.rename(columns=rename_map)

# Schema snapshot: one row per column (dtype, null count, distinct non-null values)
schema = pd.DataFrame({
    "column": df.columns,
    "dtype": df.dtypes.astype(str),
    "n_null": df.isna().sum(),
    "n_unique": [df[name].nunique(dropna=True) for name in df.columns],
})
schema.to_csv(os.path.join(OUT_DIR, "schema_summary.csv"), index=False)

# ----------------- CATEGORICAL: LandCover -> one-hot -----------------
if 'LandCover' in df.columns:
    df = pd.get_dummies(df, columns=['LandCover'], prefix='LC', dummy_na=False, drop_first=True)

# ----------------- EXCLUDED PREDICTOR COLUMNS -----------------
# Identifier / location / date columns that must never be used as predictors.
EXCLUDE_PRED_COLS = {
    'id', 'project.name', 'lat', 'lon', 'burn_year', 'date', 'project',
} | {
    # variants
    'ID', 'Id', 'project_name', 'latitude', 'longitude', 'fireYr', 'Date', 'landcover_name',
}

# ----------------- TARGET PICKER -----------------
def pick_col(candidates, columns=None):
    """Return the first candidate name that exists among the available columns.

    Parameters
    ----------
    candidates : iterable of str
        Column names to try, in priority order.
    columns : container of str, optional
        Names to search in. When omitted, the columns of the module-level
        ``df`` are used (the original behaviour).

    Returns
    -------
    str or None
        The first entry of ``candidates`` found in ``columns``; ``None`` when
        nothing matches (including an empty ``candidates``).
    """
    # Only touch the global frame when the caller did not supply the names.
    available = df.columns if columns is None else columns
    matches = (name for name in candidates if name in available)
    return next(matches, None)

# Resolve each response column once; an entry is None when the dataset has no match.
COL_ABOVE, COL_BELOW, COL_DEPTH = (
    pick_col(aliases)
    for aliases in (
        ['combusted_above', 'above.carbon.combusted'],
        ['combusted_below'],
        ['burn_depth'],
    )
)

# Every resolved column is both a modelling target and a column to strip from X.
targets = [(col, "units") for col in (COL_ABOVE, COL_BELOW, COL_DEPTH) if col]
if len(targets) == 0:
    raise ValueError("None of the expected target columns were found in the dataset.")

ALL_TARGET_COLS = list(filter(None, [COL_ABOVE, COL_BELOW, COL_DEPTH]))

# ----------------- Helper: build X, y -----------------
def build_xy(df_in: pd.DataFrame, target_col: str, exclude_cols=None, all_target_cols=None):
    """Build the predictor matrix ``X`` and response ``y`` for one target.

    Steps: drop identifier/metadata columns, drop rows whose target is missing,
    remove the target and every other known target column from the predictors,
    keep numeric predictors only, and drop constant (zero-variance) columns.

    Two fixes relative to the earlier version:

    * ``target_col`` itself is always removed from ``X``, even when it is not
      listed in ``all_target_cols`` (previously it would have leaked through).
    * Boolean columns are cast to ``uint8`` before the numeric filter.
      pandas >= 2.0 makes ``get_dummies`` return ``bool`` columns, which
      ``np.number`` does not cover, so the one-hot ``LC_*`` predictors would
      otherwise be silently discarded.

    Parameters
    ----------
    df_in : pd.DataFrame
        Source table; it is not modified.
    target_col : str
        Name of the response column.
    exclude_cols : iterable of str, optional
        Columns never used as predictors. Defaults to ``EXCLUDE_PRED_COLS``.
    all_target_cols : iterable of str, optional
        All response columns to strip from ``X``. Defaults to ``ALL_TARGET_COLS``.

    Returns
    -------
    (pd.DataFrame, pd.Series)
        ``X`` (numeric, non-constant predictors; may have zero columns) and
        ``y`` (target as float), row-aligned.
    """
    if exclude_cols is None:
        exclude_cols = EXCLUDE_PRED_COLS
    if all_target_cols is None:
        all_target_cols = ALL_TARGET_COLS

    drop_cols = [c for c in exclude_cols if c in df_in.columns]
    work = df_in.drop(columns=drop_cols, errors='ignore').copy()
    work = work.dropna(subset=[target_col])
    y = work[target_col].astype(float).copy()

    # Drop the explicit target AND any other known target columns (prevents leakage)
    X = work.drop(columns=list(set(all_target_cols) | {target_col}), errors='ignore')
    assert target_col not in X.columns, f"Target leakage: {target_col} present in predictors!"

    # Keep boolean indicators (e.g. one-hot dummies) by making them numeric.
    bool_cols = X.select_dtypes(include=['bool']).columns.tolist()
    if bool_cols:
        X = X.astype({c: np.uint8 for c in bool_cols})

    # numeric only (XGB handles NaN fine)
    non_numeric = X.select_dtypes(exclude=[np.number]).columns.tolist()
    if non_numeric:
        X = X.drop(columns=non_numeric)

    # drop zero-variance columns if any (NaN counts as a value here)
    nunique = X.nunique(dropna=False)
    const_cols = nunique[nunique <= 1].index.tolist()
    if const_cols:
        X = X.drop(columns=const_cols)
    return X, y

def xgb_base():
    """Return the fixed keyword arguments shared by every ``XGBRegressor`` built here.

    The dict holds the objective, seed, parallelism and tree method, plus
    whatever device keywords were placed in ``XGB_DEVICE_KW``. Searched
    hyperparameters are layered on top of these by the tuner.
    """
    # sensible defaults; many will be overridden by search
    return {
        "objective": "reg:squarederror",
        "random_state": RANDOM_STATE,
        "n_jobs": N_JOBS,
        "tree_method": TREE_METHOD,
        **XGB_DEVICE_KW,
    }

# ----------------- Nested LOOCV with inner tuning -----------------
def run_target_nested_loocv(target_col: str, units_label: str = "units"):
    """Run nested leave-one-out CV with inner tuning for one target, then fit and save a final model.

    Outer loop: LeaveOneOut over all rows returned by ``build_xy(df, target_col)``.
    Inner loop: for every outer split, a RandomizedSearchCV (K-fold, scored with
    SCORER) tunes an XGBRegressor on the training rows; the refit best estimator
    predicts the single held-out row.

    After the outer loop the function:
      * computes RMSE and R² from the held-out predictions,
      * writes predictions, per-split tuning results, metrics and a parameter
        summary as CSV files, plus an observed-vs-predicted PNG, into OUT_DIR,
      * picks the parameter set with the lowest median inner RMSE (ties broken
        by how often it was selected),
      * fits a final model on all rows with those parameters, dumps it with
        joblib into MODEL_DIR and writes feature importances and a JSON
        metadata file into OUT_DIR.

    Parameters
    ----------
    target_col : str
        Name of the response column in the global ``df``.
    units_label : str, default "units"
        Text appended to RMSE values in the console message and plot title.

    Returns
    -------
    None
        Results are written to disk. Returns early (after printing an error)
        when there are no predictors or fewer than 3 samples.
    """
    X, y = build_xy(df, target_col)
    if X.shape[1] == 0 or len(y) < 3:
        print(f"[ERROR] Not enough predictors or samples for '{target_col}'.")
        return

    print(f"\nTarget: {target_col} | X: {X.shape} | y: {y.shape}")
    # Positional 0..n-1 index so the LOO test indices can address y_pred directly.
    y = y.reset_index(drop=True)
    X = X.reset_index(drop=True)

    loo = LeaveOneOut()
    y_pred = np.zeros(len(y), dtype=float)

    split_records = []  # per-split metadata: best params, inner best score

    for i, (train_idx, test_idx) in enumerate(loo.split(X), start=1):
        Xtr, Xte = X.iloc[train_idx], X.iloc[test_idx]
        ytr = y.iloc[train_idx]

        inner = KFold(n_splits=min(INNER_FOLDS, len(ytr)), shuffle=True, random_state=RANDOM_STATE)

        base_est = xgb.XGBRegressor(**xgb_base())

        # NOTE(review): N_JOBS is given to both the estimator (via xgb_base) and the
        # search, so worker counts multiply — confirm this is intended.
        tuner = RandomizedSearchCV(
            estimator=base_est,
            param_distributions=XGB_PARAM_DIST,
            n_iter=N_ITER_SEARCH,
            scoring=SCORER,
            cv=inner,
            random_state=RANDOM_STATE,
            n_jobs=N_JOBS,
            verbose=0,
            refit=True,  # refit on full train fold with best params
        )
        tuner.fit(Xtr, ytr)

        # NOTE(review): the saved output of this cell is flooded with XGBoost
        # device-mismatch messages ("Set the device for booster before call to
        # inplace_predict") — presumably the booster is on a GPU while Xte is a
        # CPU DataFrame; confirm whether XGB_DEVICE_KW should target the CPU here.
        best_est = tuner.best_estimator_
        y_pred[test_idx] = best_est.predict(Xte)

        split_records.append({
            "split": i,
            "test_index": int(test_idx[0]),
            "best_params": tuner.best_params_,
            "inner_cv_neg_mse": float(tuner.best_score_),             # neg MSE
            "inner_cv_rmse": float(np.sqrt(-tuner.best_score_))
        })

        if i % 25 == 0 or i == len(y):
            print(f"  LOOCV progress: {i}/{len(y)}")

    # Metrics
    # Take the square root explicitly: the `squared=False` keyword of
    # mean_squared_error is not accepted by newer scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(y, y_pred)))
    r2   = r2_score(y, y_pred)
    print(f"[{target_col}] LOOCV tuned RMSE: {rmse:.4f} {units_label} | R²: {r2:.4f}")

    # Save per-split results
    out_prefix = target_col.replace('.', '_')
    preds_path = os.path.join(OUT_DIR, f"{out_prefix}_loocv_predictions.csv")
    splits_path = os.path.join(OUT_DIR, f"{out_prefix}_loocv_split_tuning.csv")
    preds_df = pd.DataFrame({"index": np.arange(len(y)), "y_obs": y.values, "y_pred": y_pred})
    splits_df = pd.DataFrame(split_records)
    preds_df.to_csv(preds_path, index=False)
    splits_df.to_csv(splits_path, index=False)

    # Save LOOCV metrics
    pd.DataFrame({
        "target": [target_col],
        "n": [len(y)],
        "n_predictors": [X.shape[1]],
        "loocv_rmse": [rmse],
        "loocv_r2": [r2]
    }).to_csv(os.path.join(OUT_DIR, f"{out_prefix}_loocv_metrics.csv"), index=False)

    # Plot 1:1
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=preds_df["y_obs"], y=preds_df["y_pred"], s=18, edgecolor=None)
    lo = float(np.nanmin([preds_df["y_obs"].min(), preds_df["y_pred"].min()]))
    hi = float(np.nanmax([preds_df["y_obs"].max(), preds_df["y_pred"].max()]))
    plt.plot([lo, hi], [lo, hi], 'k--', lw=2, label='1:1 Line')
    plt.xlabel(f"Observed {target_col}")
    plt.ylabel(f"Predicted {target_col}")
    plt.title(f"{target_col}: Nested-LOOCV Obs vs Pred (XGBoost)\nRMSE={rmse:.3f} {units_label}, R²={r2:.3f}")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plot_path = os.path.join(OUT_DIR, f"{out_prefix}_loocv_obs_pred.png")
    plt.savefig(plot_path, dpi=150)
    plt.close()

    # ----------------- Choose consensus hyperparameters -----------------
    def normalize_params(d: dict):
        # Hashable, order-independent key for a parameter dict.
        return tuple(sorted(d.items(), key=lambda x: x[0]))

    # Group the inner-CV RMSEs by the parameter set each outer split selected.
    grp = defaultdict(list)
    for rec in split_records:
        grp[normalize_params(rec["best_params"])].append(rec["inner_cv_rmse"])

    summary_rows = [
        {
            "params_key": params_key,
            "median_inner_rmse": float(np.median(rmses)),
            "count": len(rmses)
        }
        for params_key, rmses in grp.items()
    ]
    # Lowest median inner RMSE first; among equals, the most frequently chosen set.
    params_summary = pd.DataFrame(summary_rows).sort_values(
        ["median_inner_rmse", "count"], ascending=[True, False]
    ).reset_index(drop=True)
    param_summary_path = os.path.join(OUT_DIR, f"{out_prefix}_param_summary.csv")
    params_summary.to_csv(param_summary_path, index=False)

    # Best overall param set
    best_key = params_summary.loc[0, "params_key"]
    best_params = dict(best_key)

    # ----------------- Fit final model on ALL data with best params -----------------
    # Merge into one dict so a tuned value overrides a base value of the same
    # name instead of raising a duplicate-keyword TypeError.
    final_model = xgb.XGBRegressor(**{**xgb_base(), **best_params})
    final_model.fit(X, y)

    # Save model + metadata
    model_path = os.path.join(MODEL_DIR, f"xgb_final_{out_prefix}.joblib")
    joblib.dump(final_model, model_path)

    # Feature importances (best effort: a failure is reported, not fatal)
    fi_path = os.path.join(OUT_DIR, f"{out_prefix}_feature_importances.csv")
    try:
        fi = pd.DataFrame({
            "feature": X.columns,
            "importance_gain": getattr(final_model, "feature_importances_", None)
        }).sort_values("importance_gain", ascending=False)
        fi.to_csv(fi_path, index=False)
    except Exception as exc:
        print(f"[WARN] Could not save feature importances for '{target_col}': {exc}")

    def _json_default(obj):
        # Sampled hyperparameters may be NumPy scalars, which the json module
        # cannot encode on its own; convert them to plain Python values.
        if isinstance(obj, np.generic):
            return obj.item()
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    meta = {
        "target": target_col,
        "n_samples": int(len(y)),
        "n_predictors": int(X.shape[1]),
        "units": units_label,
        "loocv_rmse": float(rmse),
        "loocv_r2": float(r2),
        "final_params": best_params,
        "xgboost_version": xgb.__version__,
        "device": XGB_DEVICE_KW.get("device", "cpu" if TREE_METHOD == "hist" else "gpu_hist"),
        "search_config": {
            "inner_folds": INNER_FOLDS,
            "n_iter_search": N_ITER_SEARCH,
            "random_state": RANDOM_STATE
        },
        # Paths are stored relative to OUT_DIR.
        "files": {
            "predictions_csv": os.path.relpath(preds_path, OUT_DIR),
            "split_tuning_csv": os.path.relpath(splits_path, OUT_DIR),
            "param_summary_csv": os.path.relpath(param_summary_path, OUT_DIR),
            "plot_png": os.path.relpath(plot_path, OUT_DIR),
            "model_joblib": os.path.relpath(model_path, OUT_DIR),
            "feature_importances_csv": os.path.relpath(fi_path, OUT_DIR)
        }
    }
    meta_path = os.path.join(OUT_DIR, f"{out_prefix}_final_model_metadata.json")
    with open(meta_path, "w") as f:
        json.dump(meta, f, indent=2, default=_json_default)

    print(f"Saved tuned LOOCV products for [{target_col}]")
    print(f"  Model  → {model_path}")
    print(f"  Meta   → {meta_path}")
    print(f"  Plots/CSVs in {OUT_DIR}")

# ----------------- RUN FOR EACH TARGET -----------------
# One full nested-LOOCV run (tuning, evaluation, final fit, saved artefacts)
# per target column that was found in the dataset.
for tcol, units in targets:
    run_target_nested_loocv(target_col=tcol, units_label=units)

print("\nDone.")


Reading: /explore/nobackup/people/spotter5/new_combustion/2025-10-03_CombustionModelPredictors.csv

Target: combusted_above | X: (885, 50) | y: (885,)


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device or

  LOOCV progress: 25/885


In [None]:
# NOTE(review): this cell only evaluates a one-character string literal and has
# no execution count; it looks like leftover scratch input — confirm it can be removed.
't'