RFE and importance

In [1]:
import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, make_scorer

# ----------------- PATHS -----------------
# Use ONLY the file you specified
INPUT_CSV = "/explore/nobackup/people/spotter5/new_combustion/2025-09-01_ORG_LC_FISL_predictors.csv"
OUT_DIR   = "/explore/nobackup/people/spotter5/new_combustion/LCC"
os.makedirs(OUT_DIR, exist_ok=True)

print(f"Reading: {INPUT_CSV}")
df = pd.read_csv(INPUT_CSV)

# ----------------- BASIC CLEANUP -----------------
# Strip surrounding whitespace from column names so the name checks below match.
df.columns = [str(c).strip() for c in df.columns]

# Standardize a few variant names to simplify exclusion checks.
# A variant is renamed only when its canonical name is neither already a
# column nor already claimed by an earlier variant; otherwise the rename
# would create two columns with the same name (e.g. both 'ID' and 'Id'
# becoming 'id'). Variants left un-renamed are still listed in
# EXCLUDE_PRED_COLS below, so they never become predictors.
COLUMN_VARIANTS = (
    ('ID', 'id'),
    ('Id', 'id'),
    ('project_name', 'project.name'),
    ('Date', 'date'),
    ('latitude', 'lat'),
    ('longitude', 'lon'),
    ('fireYr', 'burn_year'),
)
rename_map = {}
for variant, canonical in COLUMN_VARIANTS:
    if variant not in df.columns:
        continue
    if canonical in df.columns or canonical in rename_map.values():
        continue
    rename_map[variant] = canonical
df = df.rename(columns=rename_map)

# Quick schema snapshot (optional)
schema = pd.DataFrame({
    "column": df.columns,
    "dtype": df.dtypes.astype(str),
    "n_null": df.isna().sum(),
    "n_unique": [df[c].nunique(dropna=True) for c in df.columns]
})
schema.to_csv(os.path.join(OUT_DIR, "schema_summary.csv"), index=False)

# ----------------- CATEGORICAL: LandCover -> one-hot -----------------
# dtype=int keeps the dummies numeric. pandas >= 2.0 emits bool dummies by
# default, and a numeric-only filter such as
# select_dtypes(exclude=[np.number]) would then silently discard them.
if 'LandCover' in df.columns:
    df = pd.get_dummies(
        df,
        columns=['LandCover'],
        prefix='LC',
        drop_first=True,
        dummy_na=False,
        dtype=int,
    )

# ----------------- EXCLUDED PREDICTOR COLUMNS -----------------
# (Do not use these as features)
EXCLUDE_PRED_COLS = {
    'id', 'project.name', 'lat', 'lon', 'burn_year', 'date',
    # allow for variants if they slipped through
    'ID', 'Id', 'project_name', 'latitude', 'longitude', 'fireYr', 'Date'
}

# ----------------- TARGET PICKER (handles abov/above spelling) -----------------
def pick_col(candidates):
    """Return the first entry of ``candidates`` that is a column of ``df``.

    Candidates are tried in the order given, so earlier spellings take
    priority. ``None`` is returned when none of them is present.
    """
    return next((name for name in candidates if name in df.columns), None)

# Resolve each response column to the spelling actually present (or None).
COL_ABOVE = pick_col(['abov.carbon.combusted', 'above.carbon.combusted'])
COL_BELOW = pick_col(['below.ground.carbon.combusted'])
COL_DEPTH = pick_col(['burn.depth'])

# (column name, units label) pairs for every target found in the data.
targets = [(c, "units") for c in [COL_ABOVE, COL_BELOW, COL_DEPTH] if c]
if not targets:
    raise ValueError("None of the expected target columns were found in the dataset.")

# IMPORTANT: for every model, drop ALL dependent variables from X
ALL_TARGET_COLS = [c for c in [COL_ABOVE, COL_BELOW, COL_DEPTH] if c]

# ----------------- MODEL SETUP -----------------
def _rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    Computed as sqrt(MSE) rather than ``mean_squared_error(squared=False)``,
    because the ``squared`` argument was deprecated in scikit-learn 1.4 and
    removed in 1.6.
    """
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))


# greater_is_better=False makes the scorer return negated RMSE, so that
# "higher score is better" still holds for model selection (lower RMSE wins).
rmse_scorer = make_scorer(_rmse, greater_is_better=False)
cv = KFold(n_splits=5, shuffle=True, random_state=42)

def run_target(target_col: str, units_label: str = "units"):
    """Run RF-based RFECV for one target and save the curve, selection and importances.

    Uses the module-level ``df``, ``EXCLUDE_PRED_COLS``, ``ALL_TARGET_COLS``,
    ``cv``, ``rmse_scorer`` and ``OUT_DIR``. Rows with a missing target are
    dropped; excluded columns, all target columns and non-numeric columns are
    removed from the predictors. Four files are written to ``OUT_DIR``, all
    prefixed with the target name (dots replaced by underscores):

    * ``<prefix>_rfe.png``               - cross-validated RMSE vs. feature count
    * ``<prefix>_selected_features.csv`` - one selected feature per line
    * ``<prefix>_top20_importances.csv`` - top-20 importances of a forest
      refit on the selected features
    * ``<prefix>_rf_importance.png``     - bar plot of those importances

    Args:
        target_col: Name of the response column in ``df``.
        units_label: Unit string shown in the RMSE axis label.

    Returns:
        None. An ``[ERROR]`` line is printed and the function returns early
        when no numeric predictor remains or there are fewer samples than
        cross-validation folds.
    """
    file_stem = target_col.replace('.', '_')

    # 1) Build modeling frame:
    banned = [c for c in EXCLUDE_PRED_COLS if c in df.columns]  # banned predictors
    work = df.drop(columns=banned, errors='ignore').dropna(subset=[target_col])
    y = work[target_col].copy()

    # Remove ALL dependent variables (including the current target) from predictors
    X = work.drop(columns=ALL_TARGET_COLS, errors='ignore')

    # Boolean columns (e.g. one-hot dummies from pandas >= 2.0) are not
    # np.number, so the numeric filter below would drop them. Cast to 0/1.
    bool_cols = X.select_dtypes(include='bool').columns.tolist()
    if bool_cols:
        X = X.astype({c: int for c in bool_cols})

    # Keep only numeric predictors (RF needs numeric)
    X = X.select_dtypes(include=[np.number])

    if X.shape[1] == 0:
        print(f"[ERROR] No numeric predictors left for '{target_col}'.")
        return

    # KFold cannot split fewer samples than folds; fail with a clear message.
    n_splits = cv.get_n_splits()
    if len(y) < n_splits:
        print(f"[ERROR] Only {len(y)} samples for '{target_col}'; need at least {n_splits} for CV.")
        return

    print(f"\nTarget: {target_col} | X: {X.shape} | y: {y.shape}")

    # 2) RFECV with RandomForest
    rf = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1)
    rfecv = RFECV(
        estimator=rf,
        step=1,
        cv=cv,
        scoring=rmse_scorer,
        n_jobs=-1,
        min_features_to_select=1
    )
    rfecv.fit(X, y)

    # The scorer returns negated RMSE; flip the sign for readability.
    mean_rmse = -1.0 * np.array(rfecv.cv_results_['mean_test_score'])

    # 3) Plot RFE curve (step=1 and min_features_to_select=1, so the i-th
    #    score corresponds to i features)
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, len(mean_rmse) + 1), mean_rmse, marker='o')
    plt.xlabel("Number of Features Selected")
    plt.ylabel(f"Cross-Validated RMSE ({units_label})")
    plt.title(f"{target_col}: RMSE vs Number of Features (RF + RFECV)")
    plt.grid(True)
    plt.tight_layout()
    rfe_png = os.path.join(OUT_DIR, f"{file_stem}_rfe.png")
    plt.savefig(rfe_png, dpi=150)
    plt.close()
    print(f"Saved RFE curve → {rfe_png}")

    # 4) Selected features (mask)
    selected_features = list(X.columns[rfecv.support_])
    pd.Series(selected_features).to_csv(
        os.path.join(OUT_DIR, f"{file_stem}_selected_features.csv"),
        index=False, header=False
    )
    print(f"Optimal #features: {rfecv.n_features_}")
    print(f"First few selected: {selected_features[:10]}")

    # 5) Train RF on selected features to get importances
    X_sel = X[selected_features].copy()
    full_rf = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1)
    full_rf.fit(X_sel, y)
    importances = pd.Series(
        full_rf.feature_importances_, index=X_sel.columns
    ).sort_values(ascending=False)

    # Save top-20 importances
    top_k = importances.head(20)
    top_k.to_csv(os.path.join(OUT_DIR, f"{file_stem}_top20_importances.csv"))

    # 6) Plot importances
    plt.figure(figsize=(10, 7))
    sns.barplot(x=top_k.values, y=top_k.index)
    plt.xlabel("Mean Decrease in Impurity (Feature Importance)")
    plt.title(f"{target_col}: Top 20 Random Forest Feature Importances")
    plt.tight_layout()
    imp_png = os.path.join(OUT_DIR, f"{file_stem}_rf_importance.png")
    plt.savefig(imp_png, dpi=150)
    plt.close()
    print(f"Saved importances → {imp_png}")

# ----------------- RUN FOR EACH TARGET -----------------
# Every entry of `targets` is a (column name, units label) pair.
for tcol, units in targets:
    run_target(target_col=tcol, units_label=units)

print("\nDone.")


Reading: /explore/nobackup/people/spotter5/new_combustion/2025-09-01_ORG_LC_FISL_predictors.csv

Target: above.carbon.combusted | X: (1504, 60) | y: (1504,)




Saved RFE curve → /explore/nobackup/people/spotter5/new_combustion/LCC/above_carbon_combusted_rfe.png
Optimal #features: 18
First few selected: ['pH_30', 'Silt_30', 'SOC_30', 'HLI', 'ruggedness', 'twi', 'EMT', 'FFP', 'MSP', 'BUI']
Saved importances → /explore/nobackup/people/spotter5/new_combustion/LCC/above_carbon_combusted_rf_importance.png

Target: below.ground.carbon.combusted | X: (1803, 60) | y: (1803,)




Saved RFE curve → /explore/nobackup/people/spotter5/new_combustion/LCC/below_ground_carbon_combusted_rfe.png
Optimal #features: 37
First few selected: ['BD_30', 'pH_30', 'Sand_30', 'Silt_30', 'Clay_30', 'SOC_30', 'PFI', 'HLI', 'ruggedness', 'twi']
Saved importances → /explore/nobackup/people/spotter5/new_combustion/LCC/below_ground_carbon_combusted_rf_importance.png

Target: burn.depth | X: (1812, 60) | y: (1812,)




Saved RFE curve → /explore/nobackup/people/spotter5/new_combustion/LCC/burn_depth_rfe.png
Optimal #features: 51
First few selected: ['BD_30', 'pH_30', 'Sand_30', 'Silt_30', 'Clay_30', 'SOC_30', 'PFI', 'HLI', 'TRASP', 'aspect_rad']
Saved importances → /explore/nobackup/people/spotter5/new_combustion/LCC/burn_depth_rf_importance.png

Done.


LOOCV

In [None]:
import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import LeaveOneOut, cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score

# ----------------- PATHS (uploaded file only) -----------------
INPUT_CSV = "/explore/nobackup/people/spotter5/new_combustion/2025-09-01_ORG_LC_FISL_predictors.csv"
OUT_DIR   = "/explore/nobackup/people/spotter5/new_combustion/LCC"
os.makedirs(OUT_DIR, exist_ok=True)

print(f"Reading: {INPUT_CSV}")
df = pd.read_csv(INPUT_CSV)

# ----------------- BASIC CLEANUP -----------------
# Strip surrounding whitespace from column names so the name checks below match.
df.columns = [str(c).strip() for c in df.columns]

# Standardize common variants.
# A variant is renamed only when its canonical name is neither already a
# column nor already claimed by an earlier variant; otherwise the rename
# would create two columns with the same name (e.g. both 'ID' and 'Id'
# becoming 'id'). Variants left un-renamed are still listed in
# EXCLUDE_PRED_COLS below, so they never become predictors.
COLUMN_VARIANTS = (
    ('ID', 'id'),
    ('Id', 'id'),
    ('project_name', 'project.name'),
    ('Date', 'date'),
    ('latitude', 'lat'),
    ('longitude', 'lon'),
    ('fireYr', 'burn_year'),
)
rename_map = {}
for variant, canonical in COLUMN_VARIANTS:
    if variant not in df.columns:
        continue
    if canonical in df.columns or canonical in rename_map.values():
        continue
    rename_map[variant] = canonical
df = df.rename(columns=rename_map)

# Schema snapshot (optional)
schema = pd.DataFrame({
    "column": df.columns,
    "dtype": df.dtypes.astype(str),
    "n_null": df.isna().sum(),
    "n_unique": [df[c].nunique(dropna=True) for c in df.columns]
})
schema.to_csv(os.path.join(OUT_DIR, "schema_summary.csv"), index=False)

# ----------------- CATEGORICAL: LandCover -> one-hot -----------------
# dtype=int keeps the dummies numeric. pandas >= 2.0 emits bool dummies by
# default, and a numeric-only filter such as
# select_dtypes(exclude=[np.number]) would then silently discard them.
if 'LandCover' in df.columns:
    df = pd.get_dummies(
        df,
        columns=['LandCover'],
        prefix='LC',
        drop_first=True,
        dummy_na=False,
        dtype=int,
    )

# ----------------- EXCLUDED PREDICTOR COLUMNS -----------------
EXCLUDE_PRED_COLS = {
    'id', 'project.name', 'lat', 'lon', 'burn_year', 'date',
    'ID', 'Id', 'project_name', 'latitude', 'longitude', 'fireYr', 'Date'
}

# ----------------- TARGET PICKER (robust to abov/above) -----------------
def pick_col(candidates):
    """Return the first entry of ``candidates`` that is a column of ``df``.

    Candidates are tried in the order given, so earlier spellings take
    priority. ``None`` is returned when none of them is present.
    """
    return next((name for name in candidates if name in df.columns), None)

# Resolve each response column to the spelling actually present (or None).
COL_ABOVE = pick_col(['abov.carbon.combusted', 'above.carbon.combusted'])
COL_BELOW = pick_col(['below.ground.carbon.combusted'])
COL_DEPTH = pick_col(['burn.depth'])

# Pair every target that was found with its (placeholder) units label.
targets = [
    (found, "units")
    for found in filter(None, (COL_ABOVE, COL_BELOW, COL_DEPTH))
]
if len(targets) == 0:
    raise ValueError("None of the expected target columns were found in the dataset.")

# IMPORTANT: every model must exclude ALL dependent variables from X,
# not just the one currently being predicted.
ALL_TARGET_COLS = [name for name, _ in targets]

# ----------------- LOOCV ONLY -----------------
def run_target_loocv(target_col: str, units_label: str = "units"):
    """Evaluate a random forest for one target with leave-one-out CV.

    Uses the module-level ``df``, ``EXCLUDE_PRED_COLS``, ``ALL_TARGET_COLS``
    and ``OUT_DIR``. Rows with a missing target are dropped; excluded columns,
    all target columns and non-numeric columns are removed from the
    predictors. Two files are written to ``OUT_DIR``, prefixed with the target
    name (dots replaced by underscores):

    * ``<prefix>_loocv_metrics.csv``  - one row with n, predictor count,
      RMSE and R²
    * ``<prefix>_loocv_obs_pred.png`` - observed vs. predicted scatter with
      a 1:1 line

    Args:
        target_col: Name of the response column in ``df``.
        units_label: Unit string appended to the RMSE in the log and title.

    Returns:
        None. An ``[ERROR]`` line is printed and the function returns early
        when no numeric predictor remains or fewer than two samples exist.
    """
    file_stem = target_col.replace('.', '_')

    # Build modeling frame (drop excluded predictors)
    banned = [c for c in EXCLUDE_PRED_COLS if c in df.columns]
    work = df.drop(columns=banned, errors='ignore').dropna(subset=[target_col])

    y = work[target_col].copy()
    # Remove ALL dependent variables (including the current target) from predictors
    X = work.drop(columns=ALL_TARGET_COLS, errors='ignore')

    # Boolean columns (e.g. one-hot dummies from pandas >= 2.0) are not
    # np.number, so the numeric filter below would drop them. Cast to 0/1.
    bool_cols = X.select_dtypes(include='bool').columns.tolist()
    if bool_cols:
        X = X.astype({c: int for c in bool_cols})

    # Numeric predictors only
    X = X.select_dtypes(include=[np.number])

    if X.shape[1] == 0 or len(y) < 2:
        print(f"[ERROR] Not enough numeric predictors or samples for '{target_col}'.")
        return

    print(f"\nTarget: {target_col} | X: {X.shape} | y: {y.shape}")

    # LOOCV predictions: one model fit per sample, each predicting the
    # single held-out row.
    loo = LeaveOneOut()
    model = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1)
    y_pred = cross_val_predict(model, X, y, cv=loo, n_jobs=-1, method='predict')

    # Metrics. RMSE is sqrt(MSE); the `squared=False` shortcut of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = float(np.sqrt(mean_squared_error(y, y_pred)))
    r2   = r2_score(y, y_pred)
    print(f"[{target_col}] LOOCV RMSE: {rmse:.4f} {units_label} | R²: {r2:.4f}")

    # Save metrics to CSV
    metrics_path = os.path.join(OUT_DIR, f"{file_stem}_loocv_metrics.csv")
    pd.DataFrame({
        "target": [target_col],
        "n": [len(y)],
        "n_predictors": [X.shape[1]],
        "loocv_rmse": [rmse],
        "loocv_r2": [r2]
    }).to_csv(metrics_path, index=False)

    # Observed vs Predicted plot
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=y, y=y_pred, s=18, edgecolor=None)
    # Shared axis range so the dashed 1:1 line spans both observed and predicted values.
    lo = min(np.min(y), np.min(y_pred))
    hi = max(np.max(y), np.max(y_pred))
    plt.plot([lo, hi], [lo, hi], 'k--', lw=2, label='1:1 Line')
    plt.xlabel(f"Observed {target_col}")
    plt.ylabel(f"Predicted {target_col}")
    plt.title(f"{target_col}: LOOCV Obs vs Pred (RF)\nRMSE={rmse:.3f} {units_label}, R²={r2:.3f}")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plot_path = os.path.join(OUT_DIR, f"{file_stem}_loocv_obs_pred.png")
    plt.savefig(plot_path, dpi=150)
    plt.close()

    print(f"Saved metrics → {metrics_path}")
    print(f"Saved plot    → {plot_path}")

# ----------------- RUN FOR EACH TARGET -----------------
# Every entry of `targets` is a (column name, units label) pair.
for tcol, units in targets:
    run_target_loocv(target_col=tcol, units_label=units)

print("\nDone.")


Reading: /explore/nobackup/people/spotter5/new_combustion/2025-09-01_ORG_LC_FISL_predictors.csv

Target: above.carbon.combusted | X: (1504, 60) | y: (1504,)
[above.carbon.combusted] LOOCV RMSE: 332.2594 units | R²: 0.4533
Saved metrics → /explore/nobackup/people/spotter5/new_combustion/LCC/above_carbon_combusted_loocv_metrics.csv
Saved plot    → /explore/nobackup/people/spotter5/new_combustion/LCC/above_carbon_combusted_loocv_obs_pred.png

Target: below.ground.carbon.combusted | X: (1803, 60) | y: (1803,)
[below.ground.carbon.combusted] LOOCV RMSE: 1327.5244 units | R²: 0.4756
Saved metrics → /explore/nobackup/people/spotter5/new_combustion/LCC/below_ground_carbon_combusted_loocv_metrics.csv
Saved plot    → /explore/nobackup/people/spotter5/new_combustion/LCC/below_ground_carbon_combusted_loocv_obs_pred.png

Target: burn.depth | X: (1812, 60) | y: (1812,)


In [None]:
# Scratch cell: a bare string expression; it does not feed into the analysis above.
't'