RFE NEE

In [4]:
#!/usr/bin/env python
# coding: utf-8

"""
CatBoost with native categoricals:
- RFECV feature selection (no one-hot needed)
- Final model importances: PredictionValuesChange, LossFunctionChange, SHAP
- Saves RFE curve, importance plots, and a reduced CSV with top-K predictors.

Outputs: /explore/nobackup/people/spotter5/anna_v/v2/loocv/<TARGET>
"""

import os
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer, root_mean_squared_error
from catboost import CatBoostRegressor, Pool

warnings.filterwarnings("ignore", category=FutureWarning)
sns.set()

# -------------------- Config --------------------
# Everything a reader may want to tune for this cell lives here.
# DATA_CSV : table loaded with pd.read_csv further down in this cell.
# TARGET   : name of the response column; also used as the output sub-folder name.
# OUT_DIR  : folder that receives the RFECV curve, importance plots and reduced CSV.
DATA_CSV = "/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_final.csv"
TARGET   = "nee"
OUT_DIR  = os.path.join("/explore/nobackup/people/spotter5/anna_v/v2/loocv", TARGET)
# Side effect at config time: the output folder is created if it does not exist.
os.makedirs(OUT_DIR, exist_ok=True)

# Candidate predictor columns. Names that are absent from the loaded table are
# dropped later with a printed warning rather than raising.
PREDICTORS = [
    'EVI', 'NDVI', 'sur_refl_b01', 'sur_refl_b02', 'sur_refl_b03',
    'sur_refl_b07', 'NDWI', 'pdsi', 'srad', 'tmean_C', 'vap', 'vs', 'swe',
    'bdod_0_100cm', 'cec_0_100cm', 'cfvo_0_100cm', 'clay_0_100cm',
    'nitrogen_0_100cm', 'ocd_0_100cm', 'phh2o_0_100cm', 'sand_0_100cm',
    'silt_0_100cm', 'soc_0_100cm', 'co2_cont', 'ALT',
    'land_cover', 'month',
    'lai', 'fpar', 'Percent_NonTree_Vegetation',
    'Percent_NonVegetated', 'Percent_Tree_Cover', 'sm_surface', 'sm_rootzone',
    'snow_cover', 'snow_depth'
]
# Columns cast to str during data prep and passed to the wrapper as categorical features.
CATEGORICAL = ['land_cover', 'month']   # handled natively by CatBoost
# Number of top features (ranked by mean |SHAP|) written to the reduced CSV.
TOP_K = 6

# For maximum robustness with loky/pickling; you can try -1 if stable on your node.
# Passed unchanged as RFECV(n_jobs=...).
RFECV_N_JOBS = 1

def default_cb_params():
    """Build the baseline CatBoost keyword arguments.

    A new dict is created on every call, so callers may modify the result
    without affecting later calls.
    """
    cb_kwargs = dict(
        loss_function='RMSE',
        eval_metric='RMSE',
        learning_rate=0.05,
        depth=8,
        n_estimators=800,
        random_seed=42,
        verbose=False,
    )
    # cb_kwargs['task_type'] = 'GPU'  # uncomment if supported in your env
    return cb_kwargs

class CatBoostWrapper(RegressorMixin, BaseEstimator):
    """
    Sklearn-compatible wrapper around ``CatBoostRegressor`` for use with RFECV.

    Parameters
    ----------
    params : dict or None
        Keyword arguments for ``CatBoostRegressor``. ``None`` means
        ``default_cb_params()`` is called at fit time.
    categorical_features : sequence of str or int, or None
        Column names and/or positional indices to pass to CatBoost as
        categorical features. Entries not found in X are skipped.
    importance_type : str
        Value passed as ``type`` to ``get_feature_importance`` when filling
        ``feature_importances_``.

    Attributes (created by ``fit`` only)
    ------------------------------------
    model_ : fitted ``CatBoostRegressor``
    feature_names_in_ : list of the column labels seen in ``fit``
    _last_train_pool_ : the ``Pool`` the model was trained on
    feature_importances_ : importances of type ``importance_type``

    Notes
    -----
    * ``__init__`` stores its arguments untouched (no copy, no validation), as
      sklearn's ``clone`` expects.
    * The mixin is listed before ``BaseEstimator``, as the scikit-learn
      developer guide asks.
    * Fitted attributes are not pre-created in ``__init__``: attributes with a
      trailing underscore mark an estimator as fitted for sklearn's
      ``check_is_fitted``.
    * Categorical features given by *name* can only be matched when X carries
      column names. NOTE(review): sklearn's RFE/RFECV appear to validate X into
      a plain numeric array before calling ``fit``; if so, name-based
      categoricals are treated as numeric inside RFECV - confirm.
    """
    def __init__(self, params=None, categorical_features=None, importance_type='PredictionValuesChange'):
        self.params = params       # keep exact object (may be None)
        self.categorical_features = categorical_features  # keep exact object (may be None)
        self.importance_type = importance_type

    def _cat_idx_for(self, X):
        """Return positional indices of the configured categorical columns found in X.

        String entries are looked up among X's column labels; other entries are
        treated as positions and kept when they fall inside X's column range.
        Entries that cannot be matched are skipped.
        """
        if self.categorical_features is None:
            return []
        cols = list(X.columns) if hasattr(X, "columns") else list(range(X.shape[1]))
        idx = []
        for c in self.categorical_features:
            if isinstance(c, str):
                if c in cols:
                    idx.append(cols.index(c))
            else:
                # already an index
                if 0 <= int(c) < len(cols):
                    idx.append(int(c))
        return idx

    def fit(self, X, y):
        """Train a fresh ``CatBoostRegressor`` on (X, y) and cache its importances.

        Returns ``self``.
        """
        if not hasattr(X, "columns"):
            # Without column labels, name-based categoricals cannot be matched.
            # Say so instead of silently training them as numeric features.
            named = self.categorical_features is not None and any(
                isinstance(c, str) for c in self.categorical_features
            )
            if named:
                warnings.warn(
                    "CatBoostWrapper.fit received X without column names; "
                    "categorical_features given by name cannot be matched and "
                    "those columns will be treated as numeric.",
                    RuntimeWarning,
                    stacklevel=2,
                )
            X = pd.DataFrame(X)
        self.feature_names_in_ = list(X.columns)
        cat_idx = self._cat_idx_for(X)

        # Build params at fit time; self.params itself is never modified.
        params_to_use = self.params if self.params is not None else default_cb_params()
        self.model_ = CatBoostRegressor(**params_to_use)

        train_pool = Pool(X, y, cat_features=cat_idx)
        self.model_.fit(train_pool)
        self._last_train_pool_ = train_pool

        # RFE reads this attribute to rank features.
        self.feature_importances_ = self.model_.get_feature_importance(
            self._last_train_pool_, type=self.importance_type
        )
        return self

    def predict(self, X):
        """Predict with the fitted model.

        Raises
        ------
        AttributeError
            If ``fit`` has not been called yet.
        """
        if getattr(self, "model_", None) is None:
            raise AttributeError("CatBoostWrapper is not fitted yet; call fit() first.")
        if not hasattr(X, "columns"):
            # Re-attach the training labels so categorical names resolve as in fit().
            X = pd.DataFrame(X, columns=self.feature_names_in_[:X.shape[1]])
        cat_idx = self._cat_idx_for(X)
        pool = Pool(X, cat_features=cat_idx)
        return self.model_.predict(pool)

# -------------------- Load & prep data --------------------
df = pd.read_csv(DATA_CSV)

# Keep only rows whose flux_method is 'EC'. When the column is absent every row
# is kept: the earlier `df.get('flux_method', 'EC') == 'EC'` collapsed to the
# scalar True in that case and `df[True]` raised a KeyError.
if 'flux_method' in df.columns:
    df = df[df['flux_method'] == 'EC'].copy()

# Create tmean_C if needed (row-wise mean of tmmn and tmmx)
if 'tmean_C' not in df.columns and {'tmmn', 'tmmx'}.issubset(df.columns):
    df['tmean_C'] = df[['tmmn', 'tmmx']].mean(axis=1)

# Ensure categoricals are strings.
# NOTE(review): astype(str) turns missing values into the string 'nan', so the
# dropna() below no longer removes rows with a missing categorical - confirm intended.
for c in CATEGORICAL:
    if c in df.columns:
        df[c] = df[c].astype(str)

# Keep only available predictors
use_cols = [c for c in PREDICTORS if c in df.columns]
missing = sorted(set(PREDICTORS) - set(use_cols))
if missing:
    print(f"Warning: missing columns ignored: {missing}")

# Fail early with a readable message instead of a bare column-lookup error.
if TARGET not in df.columns:
    raise KeyError(f"Target column '{TARGET}' not found in {DATA_CSV}")

df_model = df[use_cols + [TARGET]].dropna().reset_index(drop=True)
if df_model.empty:
    raise ValueError(f"No complete rows left for target '{TARGET}' after dropna()")
X = df_model[use_cols].copy()
y = df_model[TARGET].copy()

# -------------------- RFECV with CatBoost --------------------
# NOTE(review): sklearn's RFECV appears to convert X to a numeric array before
# handing it to the estimator; if so, the string-typed categoricals reach
# CatBoost as plain numbers during selection - confirm.
print("Running RFECV with CatBoost (native categoricals)...")
# greater_is_better=False makes the scorer return negative RMSE (higher is better).
rmse_scorer = make_scorer(root_mean_squared_error, greater_is_better=False)
cv = KFold(n_splits=5, shuffle=True, random_state=42)

est = CatBoostWrapper(
    params=None,                        # None -> use default_cb_params() at fit time
    categorical_features=CATEGORICAL,   # keep identity (no copying here)
    importance_type='PredictionValuesChange'
)

rfecv = RFECV(
    estimator=est,
    step=1,
    cv=cv,
    scoring=rmse_scorer,
    n_jobs=RFECV_N_JOBS
)
rfecv.fit(X, y)
print("RFECV complete.")

# Plot RFECV curve (scores are negated back to positive RMSE)
plt.figure(figsize=(10, 6))
mean_test = -rfecv.cv_results_['mean_test_score']
plt.plot(range(1, len(mean_test) + 1), mean_test, marker='o')
plt.xlabel("Number of Features Selected")
# Label follows TARGET so a copied cell cannot carry another target's name.
plt.ylabel(f"Cross-Validated RMSE ({TARGET.upper()})")
plt.title(f"CatBoost RFECV: {TARGET} RMSE vs Number of Features")
plt.grid(True)
plt.tight_layout()
rfe_plot = os.path.join(OUT_DIR, "catboost_rfe.png")
plt.savefig(rfe_plot)
plt.close()
print(f"Saved RFECV curve: {rfe_plot}")

selected_mask = rfecv.support_
selected_features = X.columns[selected_mask].tolist()
print("\n--- RFECV Results ---")
print("Optimal # of features:", rfecv.n_features_)
print("Selected features:", selected_features)
print("---------------------\n")

# -------------------- Final CatBoost fit (all features) --------------------
# This model is trained on every available predictor, not on the RFECV subset.
final_est = CatBoostWrapper(
    params=None,
    categorical_features=CATEGORICAL,
    importance_type='PredictionValuesChange'
)
final_est.fit(X, y)

# -------------------- Importances --------------------
# 1) PredictionValuesChange (sums roughly to 100)
imp_pred = pd.Series(final_est.feature_importances_, index=X.columns).sort_values(ascending=False)

# 2) LossFunctionChange
imp_loss = pd.Series(
    final_est.model_.get_feature_importance(final_est._last_train_pool_, type='LossFunctionChange'),
    index=X.columns
).sort_values(ascending=False)

# 3) SHAP (mean |SHAP| across samples)
# The last column of the ShapValues output is dropped before averaging
# (per the CatBoost docs it holds the expected value, not a feature).
shap_vals = final_est.model_.get_feature_importance(final_est._last_train_pool_, type='ShapValues')
imp_shap = pd.Series(
    np.mean(np.abs(shap_vals[:, :-1]), axis=0),
    index=X.columns
).sort_values(ascending=False)

def _plot(series, title, out_png):
    """Save a horizontal bar chart of feature importances.

    Parameters
    ----------
    series : pandas.Series
        Importance values indexed by feature name; bars follow the series order.
    title : str
        Figure title.
    out_png : str
        Destination path handed to ``savefig``.
    """
    labels = list(series.index)
    # Explicit figure/axes so the function does not depend on pyplot's current figure.
    fig, ax = plt.subplots(figsize=(12, 10))
    # seaborn >= 0.13 deprecates `palette` without `hue`. Colouring by the
    # (redundant) feature label keeps one viridis colour per bar; dodge=False
    # keeps full-width bars on older seaborn versions.
    sns.barplot(x=series.values, y=labels, hue=labels, palette="viridis", dodge=False, ax=ax)
    legend = ax.get_legend()
    if legend is not None:
        # The legend would only repeat the y tick labels.
        legend.remove()
    ax.set_xlabel("Importance")
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(out_png)
    plt.close(fig)
    print(f"Saved: {out_png}")

# One bar chart per importance measure: (values, figure title, file name).
importance_charts = (
    (imp_pred, f"CatBoost Importance (PredictionValuesChange) — {TARGET}",
     "cb_importance_prediction_values_change.png"),
    (imp_loss, f"CatBoost Importance (LossFunctionChange) — {TARGET}",
     "cb_importance_loss_function_change.png"),
    (imp_shap, f"CatBoost Mean |SHAP| Importance — {TARGET}",
     "cb_importance_shap_mean_abs.png"),
)
for chart_values, chart_title, chart_file in importance_charts:
    _plot(chart_values, chart_title, os.path.join(OUT_DIR, chart_file))

# -------------------- Top-K by SHAP & save reduced CSV --------------------
# imp_shap is already sorted in descending order, so the first TOP_K labels are the top-K.
top_k = imp_shap.index[:TOP_K].tolist()
print(f"Top {TOP_K} predictors (by mean |SHAP|): {top_k}")

# Reduced table: the top-K predictors followed by the target column.
df_topk = df_model.loc[:, top_k + [TARGET]].copy()
out_csv = os.path.join(OUT_DIR, f"training_data_{TARGET}_top{TOP_K}_catboost.csv")
df_topk.to_csv(out_csv, index=False)
print(f"Saved reduced training CSV: {out_csv}")
print(f"Shape: {df_topk.shape}")


Running RFECV with CatBoost (native categoricals)...
RFECV complete.
Saved RFECV curve: /explore/nobackup/people/spotter5/anna_v/v2/loocv/nee/catboost_rfe.png

--- RFECV Results ---
Optimal # of features: 19
Selected features: ['EVI', 'NDVI', 'srad', 'tmean_C', 'vap', 'cfvo_0_100cm', 'nitrogen_0_100cm', 'ocd_0_100cm', 'phh2o_0_100cm', 'sand_0_100cm', 'silt_0_100cm', 'co2_cont', 'ALT', 'land_cover', 'month', 'lai', 'fpar', 'Percent_Tree_Cover', 'sm_surface']
---------------------

Saved: /explore/nobackup/people/spotter5/anna_v/v2/loocv/nee/cb_importance_prediction_values_change.png
Saved: /explore/nobackup/people/spotter5/anna_v/v2/loocv/nee/cb_importance_loss_function_change.png
Saved: /explore/nobackup/people/spotter5/anna_v/v2/loocv/nee/cb_importance_shap_mean_abs.png
Top 6 predictors (by mean |SHAP|): ['month', 'srad', 'lai', 'fpar', 'tmean_C', 'cfvo_0_100cm']
Saved reduced training CSV: /explore/nobackup/people/spotter5/anna_v/v2/loocv/nee/training_data_nee_top6_catboost.csv
Shape

RFE GPP

In [5]:
#!/usr/bin/env python
# coding: utf-8

"""
CatBoost with native categoricals:
- RFECV feature selection (no one-hot needed)
- Final model importances: PredictionValuesChange, LossFunctionChange, SHAP
- Saves RFE curve, importance plots, and a reduced CSV with top-K predictors.

Outputs: /explore/nobackup/people/spotter5/anna_v/v2/loocv/<TARGET>
"""

import os
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer, root_mean_squared_error
from catboost import CatBoostRegressor, Pool

warnings.filterwarnings("ignore", category=FutureWarning)
sns.set()

# -------------------- Config --------------------
# Everything a reader may want to tune for this cell lives here.
# DATA_CSV : table loaded with pd.read_csv further down in this cell.
# TARGET   : name of the response column; also used as the output sub-folder name.
# OUT_DIR  : folder that receives the RFECV curve, importance plots and reduced CSV.
DATA_CSV = "/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_final.csv"
TARGET   = "gpp"
OUT_DIR  = os.path.join("/explore/nobackup/people/spotter5/anna_v/v2/loocv", TARGET)
# Side effect at config time: the output folder is created if it does not exist.
os.makedirs(OUT_DIR, exist_ok=True)

# Candidate predictor columns. Names that are absent from the loaded table are
# dropped later with a printed warning rather than raising.
PREDICTORS = [
    'EVI', 'NDVI', 'sur_refl_b01', 'sur_refl_b02', 'sur_refl_b03',
    'sur_refl_b07', 'NDWI', 'pdsi', 'srad', 'tmean_C', 'vap', 'vs', 'swe',
    'bdod_0_100cm', 'cec_0_100cm', 'cfvo_0_100cm', 'clay_0_100cm',
    'nitrogen_0_100cm', 'ocd_0_100cm', 'phh2o_0_100cm', 'sand_0_100cm',
    'silt_0_100cm', 'soc_0_100cm', 'co2_cont', 'ALT',
    'land_cover', 'month',
    'lai', 'fpar', 'Percent_NonTree_Vegetation',
    'Percent_NonVegetated', 'Percent_Tree_Cover', 'sm_surface', 'sm_rootzone',
    'snow_cover', 'snow_depth'
]
# Columns cast to str during data prep and passed to the wrapper as categorical features.
CATEGORICAL = ['land_cover', 'month']   # handled natively by CatBoost
# Number of top features (ranked by mean |SHAP|) written to the reduced CSV.
TOP_K = 6

# For maximum robustness with loky/pickling; you can try -1 if stable on your node.
# Passed unchanged as RFECV(n_jobs=...).
RFECV_N_JOBS = 1

def default_cb_params():
    """Build the baseline CatBoost keyword arguments.

    A new dict is created on every call, so callers may modify the result
    without affecting later calls.
    """
    cb_kwargs = dict(
        loss_function='RMSE',
        eval_metric='RMSE',
        learning_rate=0.05,
        depth=8,
        n_estimators=800,
        random_seed=42,
        verbose=False,
    )
    # cb_kwargs['task_type'] = 'GPU'  # uncomment if supported in your env
    return cb_kwargs

class CatBoostWrapper(RegressorMixin, BaseEstimator):
    """
    Sklearn-compatible wrapper around ``CatBoostRegressor`` for use with RFECV.

    Parameters
    ----------
    params : dict or None
        Keyword arguments for ``CatBoostRegressor``. ``None`` means
        ``default_cb_params()`` is called at fit time.
    categorical_features : sequence of str or int, or None
        Column names and/or positional indices to pass to CatBoost as
        categorical features. Entries not found in X are skipped.
    importance_type : str
        Value passed as ``type`` to ``get_feature_importance`` when filling
        ``feature_importances_``.

    Attributes (created by ``fit`` only)
    ------------------------------------
    model_ : fitted ``CatBoostRegressor``
    feature_names_in_ : list of the column labels seen in ``fit``
    _last_train_pool_ : the ``Pool`` the model was trained on
    feature_importances_ : importances of type ``importance_type``

    Notes
    -----
    * ``__init__`` stores its arguments untouched (no copy, no validation), as
      sklearn's ``clone`` expects.
    * The mixin is listed before ``BaseEstimator``, as the scikit-learn
      developer guide asks.
    * Fitted attributes are not pre-created in ``__init__``: attributes with a
      trailing underscore mark an estimator as fitted for sklearn's
      ``check_is_fitted``.
    * Categorical features given by *name* can only be matched when X carries
      column names. NOTE(review): sklearn's RFE/RFECV appear to validate X into
      a plain numeric array before calling ``fit``; if so, name-based
      categoricals are treated as numeric inside RFECV - confirm.
    """
    def __init__(self, params=None, categorical_features=None, importance_type='PredictionValuesChange'):
        self.params = params       # keep exact object (may be None)
        self.categorical_features = categorical_features  # keep exact object (may be None)
        self.importance_type = importance_type

    def _cat_idx_for(self, X):
        """Return positional indices of the configured categorical columns found in X.

        String entries are looked up among X's column labels; other entries are
        treated as positions and kept when they fall inside X's column range.
        Entries that cannot be matched are skipped.
        """
        if self.categorical_features is None:
            return []
        cols = list(X.columns) if hasattr(X, "columns") else list(range(X.shape[1]))
        idx = []
        for c in self.categorical_features:
            if isinstance(c, str):
                if c in cols:
                    idx.append(cols.index(c))
            else:
                # already an index
                if 0 <= int(c) < len(cols):
                    idx.append(int(c))
        return idx

    def fit(self, X, y):
        """Train a fresh ``CatBoostRegressor`` on (X, y) and cache its importances.

        Returns ``self``.
        """
        if not hasattr(X, "columns"):
            # Without column labels, name-based categoricals cannot be matched.
            # Say so instead of silently training them as numeric features.
            named = self.categorical_features is not None and any(
                isinstance(c, str) for c in self.categorical_features
            )
            if named:
                warnings.warn(
                    "CatBoostWrapper.fit received X without column names; "
                    "categorical_features given by name cannot be matched and "
                    "those columns will be treated as numeric.",
                    RuntimeWarning,
                    stacklevel=2,
                )
            X = pd.DataFrame(X)
        self.feature_names_in_ = list(X.columns)
        cat_idx = self._cat_idx_for(X)

        # Build params at fit time; self.params itself is never modified.
        params_to_use = self.params if self.params is not None else default_cb_params()
        self.model_ = CatBoostRegressor(**params_to_use)

        train_pool = Pool(X, y, cat_features=cat_idx)
        self.model_.fit(train_pool)
        self._last_train_pool_ = train_pool

        # RFE reads this attribute to rank features.
        self.feature_importances_ = self.model_.get_feature_importance(
            self._last_train_pool_, type=self.importance_type
        )
        return self

    def predict(self, X):
        """Predict with the fitted model.

        Raises
        ------
        AttributeError
            If ``fit`` has not been called yet.
        """
        if getattr(self, "model_", None) is None:
            raise AttributeError("CatBoostWrapper is not fitted yet; call fit() first.")
        if not hasattr(X, "columns"):
            # Re-attach the training labels so categorical names resolve as in fit().
            X = pd.DataFrame(X, columns=self.feature_names_in_[:X.shape[1]])
        cat_idx = self._cat_idx_for(X)
        pool = Pool(X, cat_features=cat_idx)
        return self.model_.predict(pool)

# -------------------- Load & prep data --------------------
df = pd.read_csv(DATA_CSV)

# Keep only rows whose flux_method is 'EC'. When the column is absent every row
# is kept: the earlier `df.get('flux_method', 'EC') == 'EC'` collapsed to the
# scalar True in that case and `df[True]` raised a KeyError.
if 'flux_method' in df.columns:
    df = df[df['flux_method'] == 'EC'].copy()

# Create tmean_C if needed (row-wise mean of tmmn and tmmx)
if 'tmean_C' not in df.columns and {'tmmn', 'tmmx'}.issubset(df.columns):
    df['tmean_C'] = df[['tmmn', 'tmmx']].mean(axis=1)

# Ensure categoricals are strings.
# NOTE(review): astype(str) turns missing values into the string 'nan', so the
# dropna() below no longer removes rows with a missing categorical - confirm intended.
for c in CATEGORICAL:
    if c in df.columns:
        df[c] = df[c].astype(str)

# Keep only available predictors
use_cols = [c for c in PREDICTORS if c in df.columns]
missing = sorted(set(PREDICTORS) - set(use_cols))
if missing:
    print(f"Warning: missing columns ignored: {missing}")

# Fail early with a readable message instead of a bare column-lookup error.
if TARGET not in df.columns:
    raise KeyError(f"Target column '{TARGET}' not found in {DATA_CSV}")

df_model = df[use_cols + [TARGET]].dropna().reset_index(drop=True)
if df_model.empty:
    raise ValueError(f"No complete rows left for target '{TARGET}' after dropna()")
X = df_model[use_cols].copy()
y = df_model[TARGET].copy()

# -------------------- RFECV with CatBoost --------------------
# NOTE(review): sklearn's RFECV appears to convert X to a numeric array before
# handing it to the estimator; if so, the string-typed categoricals reach
# CatBoost as plain numbers during selection - confirm.
print("Running RFECV with CatBoost (native categoricals)...")
# greater_is_better=False makes the scorer return negative RMSE (higher is better).
rmse_scorer = make_scorer(root_mean_squared_error, greater_is_better=False)
cv = KFold(n_splits=5, shuffle=True, random_state=42)

est = CatBoostWrapper(
    params=None,                        # None -> use default_cb_params() at fit time
    categorical_features=CATEGORICAL,   # keep identity (no copying here)
    importance_type='PredictionValuesChange'
)

rfecv = RFECV(
    estimator=est,
    step=1,
    cv=cv,
    scoring=rmse_scorer,
    n_jobs=RFECV_N_JOBS
)
rfecv.fit(X, y)
print("RFECV complete.")

# Plot RFECV curve (scores are negated back to positive RMSE)
plt.figure(figsize=(10, 6))
mean_test = -rfecv.cv_results_['mean_test_score']
plt.plot(range(1, len(mean_test) + 1), mean_test, marker='o')
plt.xlabel("Number of Features Selected")
# Label follows TARGET; the hard-coded "NEE" mislabelled this cell's curve.
plt.ylabel(f"Cross-Validated RMSE ({TARGET.upper()})")
plt.title(f"CatBoost RFECV: {TARGET} RMSE vs Number of Features")
plt.grid(True)
plt.tight_layout()
rfe_plot = os.path.join(OUT_DIR, "catboost_rfe.png")
plt.savefig(rfe_plot)
plt.close()
print(f"Saved RFECV curve: {rfe_plot}")

selected_mask = rfecv.support_
selected_features = X.columns[selected_mask].tolist()
print("\n--- RFECV Results ---")
print("Optimal # of features:", rfecv.n_features_)
print("Selected features:", selected_features)
print("---------------------\n")

# -------------------- Final CatBoost fit (all features) --------------------
# This model is trained on every available predictor, not on the RFECV subset.
final_est = CatBoostWrapper(
    params=None,
    categorical_features=CATEGORICAL,
    importance_type='PredictionValuesChange'
)
final_est.fit(X, y)

# -------------------- Importances --------------------
# 1) PredictionValuesChange (sums roughly to 100)
imp_pred = pd.Series(final_est.feature_importances_, index=X.columns).sort_values(ascending=False)

# 2) LossFunctionChange
imp_loss = pd.Series(
    final_est.model_.get_feature_importance(final_est._last_train_pool_, type='LossFunctionChange'),
    index=X.columns
).sort_values(ascending=False)

# 3) SHAP (mean |SHAP| across samples)
# The last column of the ShapValues output is dropped before averaging
# (per the CatBoost docs it holds the expected value, not a feature).
shap_vals = final_est.model_.get_feature_importance(final_est._last_train_pool_, type='ShapValues')
imp_shap = pd.Series(
    np.mean(np.abs(shap_vals[:, :-1]), axis=0),
    index=X.columns
).sort_values(ascending=False)

def _plot(series, title, out_png):
    """Save a horizontal bar chart of feature importances.

    Parameters
    ----------
    series : pandas.Series
        Importance values indexed by feature name; bars follow the series order.
    title : str
        Figure title.
    out_png : str
        Destination path handed to ``savefig``.
    """
    labels = list(series.index)
    # Explicit figure/axes so the function does not depend on pyplot's current figure.
    fig, ax = plt.subplots(figsize=(12, 10))
    # seaborn >= 0.13 deprecates `palette` without `hue`. Colouring by the
    # (redundant) feature label keeps one viridis colour per bar; dodge=False
    # keeps full-width bars on older seaborn versions.
    sns.barplot(x=series.values, y=labels, hue=labels, palette="viridis", dodge=False, ax=ax)
    legend = ax.get_legend()
    if legend is not None:
        # The legend would only repeat the y tick labels.
        legend.remove()
    ax.set_xlabel("Importance")
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(out_png)
    plt.close(fig)
    print(f"Saved: {out_png}")

# One bar chart per importance measure: (values, figure title, file name).
importance_charts = (
    (imp_pred, f"CatBoost Importance (PredictionValuesChange) — {TARGET}",
     "cb_importance_prediction_values_change.png"),
    (imp_loss, f"CatBoost Importance (LossFunctionChange) — {TARGET}",
     "cb_importance_loss_function_change.png"),
    (imp_shap, f"CatBoost Mean |SHAP| Importance — {TARGET}",
     "cb_importance_shap_mean_abs.png"),
)
for chart_values, chart_title, chart_file in importance_charts:
    _plot(chart_values, chart_title, os.path.join(OUT_DIR, chart_file))

# -------------------- Top-K by SHAP & save reduced CSV --------------------
# imp_shap is already sorted in descending order, so the first TOP_K labels are the top-K.
top_k = imp_shap.index[:TOP_K].tolist()
print(f"Top {TOP_K} predictors (by mean |SHAP|): {top_k}")

# Reduced table: the top-K predictors followed by the target column.
df_topk = df_model.loc[:, top_k + [TARGET]].copy()
out_csv = os.path.join(OUT_DIR, f"training_data_{TARGET}_top{TOP_K}_catboost.csv")
df_topk.to_csv(out_csv, index=False)
print(f"Saved reduced training CSV: {out_csv}")
print(f"Shape: {df_topk.shape}")


Running RFECV with CatBoost (native categoricals)...
RFECV complete.
Saved RFECV curve: /explore/nobackup/people/spotter5/anna_v/v2/loocv/gpp/catboost_rfe.png

--- RFECV Results ---
Optimal # of features: 23
Selected features: ['EVI', 'NDVI', 'NDWI', 'pdsi', 'srad', 'tmean_C', 'vap', 'cec_0_100cm', 'nitrogen_0_100cm', 'ocd_0_100cm', 'phh2o_0_100cm', 'sand_0_100cm', 'silt_0_100cm', 'soc_0_100cm', 'co2_cont', 'ALT', 'land_cover', 'month', 'lai', 'fpar', 'Percent_Tree_Cover', 'sm_surface', 'snow_depth']
---------------------

Saved: /explore/nobackup/people/spotter5/anna_v/v2/loocv/gpp/cb_importance_prediction_values_change.png
Saved: /explore/nobackup/people/spotter5/anna_v/v2/loocv/gpp/cb_importance_loss_function_change.png
Saved: /explore/nobackup/people/spotter5/anna_v/v2/loocv/gpp/cb_importance_shap_mean_abs.png
Top 6 predictors (by mean |SHAP|): ['tmean_C', 'month', 'srad', 'lai', 'EVI', 'snow_cover']
Saved reduced training CSV: /explore/nobackup/people/spotter5/anna_v/v2/loocv/gpp/

RFE RECO

In [6]:
#!/usr/bin/env python
# coding: utf-8

"""
CatBoost with native categoricals:
- RFECV feature selection (no one-hot needed)
- Final model importances: PredictionValuesChange, LossFunctionChange, SHAP
- Saves RFE curve, importance plots, and a reduced CSV with top-K predictors.

Outputs: /explore/nobackup/people/spotter5/anna_v/v2/loocv/<TARGET>
"""

import os
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer, root_mean_squared_error
from catboost import CatBoostRegressor, Pool

warnings.filterwarnings("ignore", category=FutureWarning)
sns.set()

# -------------------- Config --------------------
# Everything a reader may want to tune for this cell lives here.
# DATA_CSV : table loaded with pd.read_csv further down in this cell.
# TARGET   : name of the response column; also used as the output sub-folder name.
# OUT_DIR  : folder that receives the RFECV curve, importance plots and reduced CSV.
DATA_CSV = "/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_final.csv"
TARGET   = "reco"
OUT_DIR  = os.path.join("/explore/nobackup/people/spotter5/anna_v/v2/loocv", TARGET)
# Side effect at config time: the output folder is created if it does not exist.
os.makedirs(OUT_DIR, exist_ok=True)

# Candidate predictor columns. Names that are absent from the loaded table are
# dropped later with a printed warning rather than raising.
PREDICTORS = [
    'EVI', 'NDVI', 'sur_refl_b01', 'sur_refl_b02', 'sur_refl_b03',
    'sur_refl_b07', 'NDWI', 'pdsi', 'srad', 'tmean_C', 'vap', 'vs', 'swe',
    'bdod_0_100cm', 'cec_0_100cm', 'cfvo_0_100cm', 'clay_0_100cm',
    'nitrogen_0_100cm', 'ocd_0_100cm', 'phh2o_0_100cm', 'sand_0_100cm',
    'silt_0_100cm', 'soc_0_100cm', 'co2_cont', 'ALT',
    'land_cover', 'month',
    'lai', 'fpar', 'Percent_NonTree_Vegetation',
    'Percent_NonVegetated', 'Percent_Tree_Cover', 'sm_surface', 'sm_rootzone',
    'snow_cover', 'snow_depth'
]
# Columns cast to str during data prep and passed to the wrapper as categorical features.
CATEGORICAL = ['land_cover', 'month']   # handled natively by CatBoost
# Number of top features (ranked by mean |SHAP|) written to the reduced CSV.
TOP_K = 6

# For maximum robustness with loky/pickling; you can try -1 if stable on your node.
# Passed unchanged as RFECV(n_jobs=...).
RFECV_N_JOBS = 1

def default_cb_params():
    """Build the baseline CatBoost keyword arguments.

    A new dict is created on every call, so callers may modify the result
    without affecting later calls.
    """
    cb_kwargs = dict(
        loss_function='RMSE',
        eval_metric='RMSE',
        learning_rate=0.05,
        depth=8,
        n_estimators=800,
        random_seed=42,
        verbose=False,
    )
    # cb_kwargs['task_type'] = 'GPU'  # uncomment if supported in your env
    return cb_kwargs

class CatBoostWrapper(RegressorMixin, BaseEstimator):
    """
    Sklearn-compatible wrapper around ``CatBoostRegressor`` for use with RFECV.

    Parameters
    ----------
    params : dict or None
        Keyword arguments for ``CatBoostRegressor``. ``None`` means
        ``default_cb_params()`` is called at fit time.
    categorical_features : sequence of str or int, or None
        Column names and/or positional indices to pass to CatBoost as
        categorical features. Entries not found in X are skipped.
    importance_type : str
        Value passed as ``type`` to ``get_feature_importance`` when filling
        ``feature_importances_``.

    Attributes (created by ``fit`` only)
    ------------------------------------
    model_ : fitted ``CatBoostRegressor``
    feature_names_in_ : list of the column labels seen in ``fit``
    _last_train_pool_ : the ``Pool`` the model was trained on
    feature_importances_ : importances of type ``importance_type``

    Notes
    -----
    * ``__init__`` stores its arguments untouched (no copy, no validation), as
      sklearn's ``clone`` expects.
    * The mixin is listed before ``BaseEstimator``, as the scikit-learn
      developer guide asks.
    * Fitted attributes are not pre-created in ``__init__``: attributes with a
      trailing underscore mark an estimator as fitted for sklearn's
      ``check_is_fitted``.
    * Categorical features given by *name* can only be matched when X carries
      column names. NOTE(review): sklearn's RFE/RFECV appear to validate X into
      a plain numeric array before calling ``fit``; if so, name-based
      categoricals are treated as numeric inside RFECV - confirm.
    """
    def __init__(self, params=None, categorical_features=None, importance_type='PredictionValuesChange'):
        self.params = params       # keep exact object (may be None)
        self.categorical_features = categorical_features  # keep exact object (may be None)
        self.importance_type = importance_type

    def _cat_idx_for(self, X):
        """Return positional indices of the configured categorical columns found in X.

        String entries are looked up among X's column labels; other entries are
        treated as positions and kept when they fall inside X's column range.
        Entries that cannot be matched are skipped.
        """
        if self.categorical_features is None:
            return []
        cols = list(X.columns) if hasattr(X, "columns") else list(range(X.shape[1]))
        idx = []
        for c in self.categorical_features:
            if isinstance(c, str):
                if c in cols:
                    idx.append(cols.index(c))
            else:
                # already an index
                if 0 <= int(c) < len(cols):
                    idx.append(int(c))
        return idx

    def fit(self, X, y):
        """Train a fresh ``CatBoostRegressor`` on (X, y) and cache its importances.

        Returns ``self``.
        """
        if not hasattr(X, "columns"):
            # Without column labels, name-based categoricals cannot be matched.
            # Say so instead of silently training them as numeric features.
            named = self.categorical_features is not None and any(
                isinstance(c, str) for c in self.categorical_features
            )
            if named:
                warnings.warn(
                    "CatBoostWrapper.fit received X without column names; "
                    "categorical_features given by name cannot be matched and "
                    "those columns will be treated as numeric.",
                    RuntimeWarning,
                    stacklevel=2,
                )
            X = pd.DataFrame(X)
        self.feature_names_in_ = list(X.columns)
        cat_idx = self._cat_idx_for(X)

        # Build params at fit time; self.params itself is never modified.
        params_to_use = self.params if self.params is not None else default_cb_params()
        self.model_ = CatBoostRegressor(**params_to_use)

        train_pool = Pool(X, y, cat_features=cat_idx)
        self.model_.fit(train_pool)
        self._last_train_pool_ = train_pool

        # RFE reads this attribute to rank features.
        self.feature_importances_ = self.model_.get_feature_importance(
            self._last_train_pool_, type=self.importance_type
        )
        return self

    def predict(self, X):
        """Predict with the fitted model.

        Raises
        ------
        AttributeError
            If ``fit`` has not been called yet.
        """
        if getattr(self, "model_", None) is None:
            raise AttributeError("CatBoostWrapper is not fitted yet; call fit() first.")
        if not hasattr(X, "columns"):
            # Re-attach the training labels so categorical names resolve as in fit().
            X = pd.DataFrame(X, columns=self.feature_names_in_[:X.shape[1]])
        cat_idx = self._cat_idx_for(X)
        pool = Pool(X, cat_features=cat_idx)
        return self.model_.predict(pool)

# -------------------- Load & prep data --------------------
df = pd.read_csv(DATA_CSV)

# Keep only rows whose flux_method is 'EC'. When the column is absent every row
# is kept: the earlier `df.get('flux_method', 'EC') == 'EC'` collapsed to the
# scalar True in that case and `df[True]` raised a KeyError.
if 'flux_method' in df.columns:
    df = df[df['flux_method'] == 'EC'].copy()

# Create tmean_C if needed (row-wise mean of tmmn and tmmx)
if 'tmean_C' not in df.columns and {'tmmn', 'tmmx'}.issubset(df.columns):
    df['tmean_C'] = df[['tmmn', 'tmmx']].mean(axis=1)

# Ensure categoricals are strings.
# NOTE(review): astype(str) turns missing values into the string 'nan', so the
# dropna() below no longer removes rows with a missing categorical - confirm intended.
for c in CATEGORICAL:
    if c in df.columns:
        df[c] = df[c].astype(str)

# Keep only available predictors
use_cols = [c for c in PREDICTORS if c in df.columns]
missing = sorted(set(PREDICTORS) - set(use_cols))
if missing:
    print(f"Warning: missing columns ignored: {missing}")

# Fail early with a readable message instead of a bare column-lookup error.
if TARGET not in df.columns:
    raise KeyError(f"Target column '{TARGET}' not found in {DATA_CSV}")

df_model = df[use_cols + [TARGET]].dropna().reset_index(drop=True)
if df_model.empty:
    raise ValueError(f"No complete rows left for target '{TARGET}' after dropna()")
X = df_model[use_cols].copy()
y = df_model[TARGET].copy()

# -------------------- RFECV with CatBoost --------------------
# NOTE(review): sklearn's RFECV appears to convert X to a numeric array before
# handing it to the estimator; if so, the string-typed categoricals reach
# CatBoost as plain numbers during selection - confirm.
print("Running RFECV with CatBoost (native categoricals)...")
# greater_is_better=False makes the scorer return negative RMSE (higher is better).
rmse_scorer = make_scorer(root_mean_squared_error, greater_is_better=False)
cv = KFold(n_splits=5, shuffle=True, random_state=42)

est = CatBoostWrapper(
    params=None,                        # None -> use default_cb_params() at fit time
    categorical_features=CATEGORICAL,   # keep identity (no copying here)
    importance_type='PredictionValuesChange'
)

rfecv = RFECV(
    estimator=est,
    step=1,
    cv=cv,
    scoring=rmse_scorer,
    n_jobs=RFECV_N_JOBS
)
rfecv.fit(X, y)
print("RFECV complete.")

# Plot RFECV curve (scores are negated back to positive RMSE)
plt.figure(figsize=(10, 6))
mean_test = -rfecv.cv_results_['mean_test_score']
plt.plot(range(1, len(mean_test) + 1), mean_test, marker='o')
plt.xlabel("Number of Features Selected")
# Label follows TARGET; the hard-coded "NEE" mislabelled this cell's curve.
plt.ylabel(f"Cross-Validated RMSE ({TARGET.upper()})")
plt.title(f"CatBoost RFECV: {TARGET} RMSE vs Number of Features")
plt.grid(True)
plt.tight_layout()
rfe_plot = os.path.join(OUT_DIR, "catboost_rfe.png")
plt.savefig(rfe_plot)
plt.close()
print(f"Saved RFECV curve: {rfe_plot}")

selected_mask = rfecv.support_
selected_features = X.columns[selected_mask].tolist()
print("\n--- RFECV Results ---")
print("Optimal # of features:", rfecv.n_features_)
print("Selected features:", selected_features)
print("---------------------\n")

# -------------------- Final CatBoost fit (all features) --------------------
# This model is trained on every available predictor, not on the RFECV subset.
final_est = CatBoostWrapper(
    params=None,
    categorical_features=CATEGORICAL,
    importance_type='PredictionValuesChange'
)
final_est.fit(X, y)

# -------------------- Importances --------------------
# 1) PredictionValuesChange (sums roughly to 100)
imp_pred = pd.Series(final_est.feature_importances_, index=X.columns).sort_values(ascending=False)

# 2) LossFunctionChange
imp_loss = pd.Series(
    final_est.model_.get_feature_importance(final_est._last_train_pool_, type='LossFunctionChange'),
    index=X.columns
).sort_values(ascending=False)

# 3) SHAP (mean |SHAP| across samples)
# The last column of the ShapValues output is dropped before averaging
# (per the CatBoost docs it holds the expected value, not a feature).
shap_vals = final_est.model_.get_feature_importance(final_est._last_train_pool_, type='ShapValues')
imp_shap = pd.Series(
    np.mean(np.abs(shap_vals[:, :-1]), axis=0),
    index=X.columns
).sort_values(ascending=False)

def _plot(series, title, out_png):
    """Save a horizontal bar chart of feature importances.

    Parameters
    ----------
    series : pandas.Series
        Importance values indexed by feature name; bars follow the series order.
    title : str
        Figure title.
    out_png : str
        Destination path handed to ``savefig``.
    """
    labels = list(series.index)
    # Explicit figure/axes so the function does not depend on pyplot's current figure.
    fig, ax = plt.subplots(figsize=(12, 10))
    # seaborn >= 0.13 deprecates `palette` without `hue`. Colouring by the
    # (redundant) feature label keeps one viridis colour per bar; dodge=False
    # keeps full-width bars on older seaborn versions.
    sns.barplot(x=series.values, y=labels, hue=labels, palette="viridis", dodge=False, ax=ax)
    legend = ax.get_legend()
    if legend is not None:
        # The legend would only repeat the y tick labels.
        legend.remove()
    ax.set_xlabel("Importance")
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(out_png)
    plt.close(fig)
    print(f"Saved: {out_png}")

# One bar chart per importance measure: (values, figure title, file name).
importance_charts = (
    (imp_pred, f"CatBoost Importance (PredictionValuesChange) — {TARGET}",
     "cb_importance_prediction_values_change.png"),
    (imp_loss, f"CatBoost Importance (LossFunctionChange) — {TARGET}",
     "cb_importance_loss_function_change.png"),
    (imp_shap, f"CatBoost Mean |SHAP| Importance — {TARGET}",
     "cb_importance_shap_mean_abs.png"),
)
for chart_values, chart_title, chart_file in importance_charts:
    _plot(chart_values, chart_title, os.path.join(OUT_DIR, chart_file))

# -------------------- Top-K by SHAP & save reduced CSV --------------------
# imp_shap is already sorted in descending order, so the first TOP_K labels are the top-K.
top_k = imp_shap.index[:TOP_K].tolist()
print(f"Top {TOP_K} predictors (by mean |SHAP|): {top_k}")

# Reduced table: the top-K predictors followed by the target column.
df_topk = df_model.loc[:, top_k + [TARGET]].copy()
out_csv = os.path.join(OUT_DIR, f"training_data_{TARGET}_top{TOP_K}_catboost.csv")
df_topk.to_csv(out_csv, index=False)
print(f"Saved reduced training CSV: {out_csv}")
print(f"Shape: {df_topk.shape}")


Running RFECV with CatBoost (native categoricals)...
RFECV complete.
Saved RFECV curve: /explore/nobackup/people/spotter5/anna_v/v2/loocv/reco/catboost_rfe.png

--- RFECV Results ---
Optimal # of features: 24
Selected features: ['NDVI', 'sur_refl_b01', 'NDWI', 'pdsi', 'srad', 'tmean_C', 'vap', 'cfvo_0_100cm', 'clay_0_100cm', 'nitrogen_0_100cm', 'ocd_0_100cm', 'phh2o_0_100cm', 'sand_0_100cm', 'silt_0_100cm', 'soc_0_100cm', 'co2_cont', 'ALT', 'land_cover', 'fpar', 'Percent_NonTree_Vegetation', 'Percent_NonVegetated', 'Percent_Tree_Cover', 'sm_rootzone', 'snow_cover']
---------------------

Saved: /explore/nobackup/people/spotter5/anna_v/v2/loocv/reco/cb_importance_prediction_values_change.png
Saved: /explore/nobackup/people/spotter5/anna_v/v2/loocv/reco/cb_importance_loss_function_change.png
Saved: /explore/nobackup/people/spotter5/anna_v/v2/loocv/reco/cb_importance_shap_mean_abs.png
Top 6 predictors (by mean |SHAP|): ['tmean_C', 'vap', 'Percent_Tree_Cover', 'NDVI', 'snow_depth', 'land_c

RFE CH4

In [8]:
#!/usr/bin/env python
# coding: utf-8

"""
CatBoost with native categoricals:
- RFECV feature selection (no one-hot needed)
- Final model importances: PredictionValuesChange, LossFunctionChange, SHAP
- Saves RFE curve, importance plots, and a reduced CSV with top-K predictors.

Outputs: /explore/nobackup/people/spotter5/anna_v/v2/loocv/<TARGET>
"""

import os
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer, root_mean_squared_error
from catboost import CatBoostRegressor, Pool

warnings.filterwarnings("ignore", category=FutureWarning)
sns.set()

# -------------------- Config --------------------
# Everything a reader may want to tune for this cell lives here.
# DATA_CSV : path to the training table (its use is outside the visible part of this cell).
# TARGET   : name of the response column; also used as the output sub-folder name.
# OUT_DIR  : output folder for this target.
DATA_CSV = "/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_final.csv"
TARGET   = "ch4_flux_total"
OUT_DIR  = os.path.join("/explore/nobackup/people/spotter5/anna_v/v2/loocv", TARGET)
# Side effect at config time: the output folder is created if it does not exist.
os.makedirs(OUT_DIR, exist_ok=True)

# Extra table read only in this cell.
# NOTE(review): presumably lists the towers to keep for methane - how towers_df
# is used is not visible here; confirm against the rest of the cell.
TOWERS_CSV       = "/explore/nobackup/people/spotter5/anna_v/v2/methane_towers.csv"
towers_df = pd.read_csv(TOWERS_CSV)

# Candidate predictor columns (same list as the other target cells).
PREDICTORS = [
    'EVI', 'NDVI', 'sur_refl_b01', 'sur_refl_b02', 'sur_refl_b03',
    'sur_refl_b07', 'NDWI', 'pdsi', 'srad', 'tmean_C', 'vap', 'vs', 'swe',
    'bdod_0_100cm', 'cec_0_100cm', 'cfvo_0_100cm', 'clay_0_100cm',
    'nitrogen_0_100cm', 'ocd_0_100cm', 'phh2o_0_100cm', 'sand_0_100cm',
    'silt_0_100cm', 'soc_0_100cm', 'co2_cont', 'ALT',
    'land_cover', 'month',
    'lai', 'fpar', 'Percent_NonTree_Vegetation',
    'Percent_NonVegetated', 'Percent_Tree_Cover', 'sm_surface', 'sm_rootzone',
    'snow_cover', 'snow_depth'
]
# Columns intended to be passed to CatBoost as categorical features.
CATEGORICAL = ['land_cover', 'month']   # handled natively by CatBoost
# Number of top-ranked features to keep.
TOP_K = 6

# For maximum robustness with loky/pickling; you can try -1 if stable on your node.
RFECV_N_JOBS = 1

def default_cb_params():
    """Build the baseline CatBoost keyword arguments.

    A new dict is created on every call, so callers may modify the result
    without affecting later calls.
    """
    cb_kwargs = dict(
        loss_function='RMSE',
        eval_metric='RMSE',
        learning_rate=0.05,
        depth=8,
        n_estimators=800,
        random_seed=42,
        verbose=False,
    )
    # cb_kwargs['task_type'] = 'GPU'  # uncomment if supported in your env
    return cb_kwargs

class CatBoostWrapper(RegressorMixin, BaseEstimator):
    """
    Sklearn-compatible wrapper around CatBoostRegressor for RFECV.

    Parameters
    ----------
    params : dict or None
        Keyword arguments forwarded to CatBoostRegressor. When None,
        default_cb_params() is called at fit time.
    categorical_features : sequence of str or int, or None
        Categorical columns, given by column name or by position. Names that
        are not among X's columns (and positions outside its width) are skipped.
    importance_type : str
        CatBoost importance type used to fill ``feature_importances_`` in fit().

    Attributes set by fit()
    -----------------------
    model_ : the fitted CatBoostRegressor
    feature_names_in_ : list of the column labels seen in fit()
    feature_importances_ : importances of type ``importance_type``
    _last_train_pool_ : the Pool the model was trained on

    IMPORTANT: Do NOT copy/modify __init__ parameters; sklearn requires object identity
    to be preserved for cloning. We only *use* them in fit(), never mutate them.

    The mixin is listed before BaseEstimator, which is the base-class order
    scikit-learn's developer guide asks for.
    """
    def __init__(self, params=None, categorical_features=None, importance_type='PredictionValuesChange'):
        self.params = params       # keep exact object (may be None)
        self.categorical_features = categorical_features  # keep exact object (may be None)
        self.importance_type = importance_type

        # Runtime attributes (not sklearn params). They are pre-set to None so
        # that code reading them before fit() keeps working.
        self.model_ = None
        self.feature_names_in_ = None
        self._last_train_pool_ = None
        self.feature_importances_ = None

    def _cat_idx_for(self, X):
        """Return the positions in X of the configured categorical features.

        String entries are looked up among X's column labels; any other entry
        is converted with int() and used as a position. Entries that cannot be
        located in X are left out, so the result may be empty.
        """
        if self.categorical_features is None:
            return []
        cols = list(X.columns) if hasattr(X, "columns") else list(range(X.shape[1]))
        idx = []
        for c in self.categorical_features:
            if isinstance(c, str):
                if c in cols:
                    idx.append(cols.index(c))
            else:
                # already an index
                if 0 <= int(c) < len(cols):
                    idx.append(int(c))
        return idx

    def fit(self, X, y):
        """Train a new CatBoostRegressor on (X, y) and record its importances.

        Emits a UserWarning when X has no column names while
        ``categorical_features`` contains names: those names cannot be matched
        and the columns would otherwise silently lose their categorical role.

        Returns
        -------
        self
        """
        if not hasattr(X, "columns"):
            if self.categorical_features is not None:
                unresolved = [c for c in self.categorical_features if isinstance(c, str)]
                if unresolved:
                    warnings.warn(
                        "CatBoostWrapper.fit received X without column names; "
                        f"categorical features given by name {unresolved} cannot be "
                        "located and will not be passed to CatBoost as categorical.",
                        UserWarning,
                        stacklevel=2,
                    )
            # Ensure DataFrame to keep column labels (for recomputing cat indices)
            X = pd.DataFrame(X)
        self.feature_names_in_ = list(X.columns)
        cat_idx = self._cat_idx_for(X)

        # Build params at fit time; don't modify self.params
        params_to_use = self.params if self.params is not None else default_cb_params()
        self.model_ = CatBoostRegressor(**params_to_use)

        train_pool = Pool(X, y, cat_features=cat_idx)
        self.model_.fit(train_pool)
        self._last_train_pool_ = train_pool

        # Expose an sklearn-like attribute used by RFE (when needed)
        self.feature_importances_ = self.model_.get_feature_importance(
            self._last_train_pool_, type=self.importance_type
        )
        return self

    def predict(self, X):
        """Predict with the fitted model.

        Input without column names is labelled with the first ``X.shape[1]``
        names recorded in fit() so categorical positions can be recomputed.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If fit() has not been called yet.
        """
        if getattr(self, "model_", None) is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This CatBoostWrapper instance is not fitted yet; call fit() before predict()."
            )
        if not hasattr(X, "columns"):
            X = pd.DataFrame(X, columns=self.feature_names_in_[:X.shape[1]])
        cat_idx = self._cat_idx_for(X)
        pool = Pool(X, cat_features=cat_idx)
        return self.model_.predict(pool)

# -------------------- Load & prep data --------------------
df = pd.read_csv(DATA_CSV)

# Keep only rows whose flux_method is 'EC'. A table without that column is
# kept whole (indexing with the scalar comparison result would raise KeyError).
if 'flux_method' in df.columns:
    df = df[df['flux_method'] == 'EC']

# Restrict to the sites listed in the towers table, then take one explicit
# copy so the column assignments below never write into a filtered view.
df = df[df['site_reference'].isin(towers_df['site_reference'].unique())].copy()

# Create tmean_C if needed
if 'tmean_C' not in df.columns and {'tmmn', 'tmmx'}.issubset(df.columns):
    df['tmean_C'] = df[['tmmn', 'tmmx']].mean(axis=1)

# Ensure categoricals are strings
# NOTE(review): astype(str) turns missing values into the string 'nan', so the
# dropna() below does not remove rows whose categorical value is missing —
# confirm this is intended.
for c in CATEGORICAL:
    if c in df.columns:
        df[c] = df[c].astype(str)

# Keep only available predictors
use_cols = [c for c in PREDICTORS if c in df.columns]
missing = sorted(set(PREDICTORS) - set(use_cols))
if missing:
    print(f"Warning: missing columns ignored: {missing}")

# Fail early with a readable message instead of an opaque indexing error.
if TARGET not in df.columns:
    raise KeyError(f"Target column '{TARGET}' not found in {DATA_CSV}")

# Complete cases only: any row with a missing predictor or target is dropped.
df_model = df[use_cols + [TARGET]].dropna().reset_index(drop=True)
if df_model.empty:
    raise ValueError(
        f"No complete rows left for target '{TARGET}' after filtering and dropna()."
    )
X = df_model[use_cols].copy()
y = df_model[TARGET].copy()

# -------------------- RFECV with CatBoost --------------------
print("Running RFECV with CatBoost (native categoricals)...")
# greater_is_better=False makes the scorer report negated RMSE, so that
# "higher is better" holds for RFECV; the sign is flipped back for plotting.
rmse_scorer = make_scorer(root_mean_squared_error, greater_is_better=False)
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): RFE validates X internally, which presumably strips the
# DataFrame column names before CatBoostWrapper.fit is called. If so, the
# name-based CATEGORICAL lookup matches nothing during elimination and the
# categorical columns are not treated as categorical there — confirm.
est = CatBoostWrapper(
    params=None,                        # None -> use default_cb_params() at fit time
    categorical_features=CATEGORICAL,   # keep identity (no copying here)
    importance_type='PredictionValuesChange'
)

rfecv = RFECV(
    estimator=est,
    step=1,
    cv=cv,
    scoring=rmse_scorer,
    n_jobs=RFECV_N_JOBS
)
rfecv.fit(X, y)
print("RFECV complete.")

# Plot RFECV curve (plot positive RMSE)
plt.figure(figsize=(10, 6))
mean_test = -rfecv.cv_results_['mean_test_score']
# With step=1 and the default minimum of one feature, entry i of the curve
# corresponds to i + 1 selected features.
plt.plot(range(1, len(mean_test) + 1), mean_test, marker='o')
plt.xlabel("Number of Features Selected")
# Label the axis with the configured target rather than a fixed variable name.
plt.ylabel(f"Cross-Validated RMSE ({TARGET})")
plt.title(f"CatBoost RFECV: {TARGET} RMSE vs Number of Features")
plt.grid(True)
plt.tight_layout()
rfe_plot = os.path.join(OUT_DIR, "catboost_rfe.png")
plt.savefig(rfe_plot)
plt.close()
print(f"Saved RFECV curve: {rfe_plot}")

selected_mask = rfecv.support_
selected_features = X.columns[selected_mask].tolist()
print("\n--- RFECV Results ---")
print("Optimal # of features:", rfecv.n_features_)
print("Selected features:", selected_features)
print("---------------------\n")

# -------------------- Final CatBoost fit (all features) --------------------
# Train once more on every available predictor; the importances below come
# from this model, not from the RFECV-selected subset.
final_est = CatBoostWrapper(
    params=None,
    categorical_features=CATEGORICAL,
    importance_type='PredictionValuesChange'
).fit(X, y)

# -------------------- Importances --------------------
_feature_labels = X.columns
_fitted_pool = final_est._last_train_pool_

# 1) PredictionValuesChange (sums roughly to 100)
imp_pred = pd.Series(final_est.feature_importances_, index=_feature_labels)
imp_pred = imp_pred.sort_values(ascending=False)

# 2) LossFunctionChange
_loss_change = final_est.model_.get_feature_importance(_fitted_pool, type='LossFunctionChange')
imp_loss = pd.Series(_loss_change, index=_feature_labels).sort_values(ascending=False)

# 3) SHAP (mean |SHAP| across samples)
shap_vals = final_est.model_.get_feature_importance(_fitted_pool, type='ShapValues')
# The trailing column is dropped before averaging (presumably CatBoost's
# expected-value term rather than a feature — confirm).
_mean_abs_shap = np.abs(shap_vals[:, :-1]).mean(axis=0)
imp_shap = pd.Series(_mean_abs_shap, index=_feature_labels).sort_values(ascending=False)

def _plot(series, title, out_png):
    """
    Draw a horizontal bar chart of an importance Series and save it to disk.

    Parameters
    ----------
    series : pandas.Series
        Values are the bar lengths, the index supplies the bar labels.
    title : str
        Figure title.
    out_png : str
        Path the figure is written to.
    """
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.barplot(x=series.values, y=series.index, palette="viridis", ax=ax)
    ax.set_xlabel("Importance")
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(out_png)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
    print(f"Saved: {out_png}")

# One bar chart per importance measure, all written into OUT_DIR.
_importance_figures = [
    (imp_pred, f"CatBoost Importance (PredictionValuesChange) — {TARGET}",
     "cb_importance_prediction_values_change.png"),
    (imp_loss, f"CatBoost Importance (LossFunctionChange) — {TARGET}",
     "cb_importance_loss_function_change.png"),
    (imp_shap, f"CatBoost Mean |SHAP| Importance — {TARGET}",
     "cb_importance_shap_mean_abs.png"),
]
for _ranking, _heading, _png_name in _importance_figures:
    _plot(_ranking, _heading, os.path.join(OUT_DIR, _png_name))

# -------------------- Top-K by SHAP & save reduced CSV --------------------
# imp_shap is already sorted in descending order, so its head is the top-K set.
top_k = list(imp_shap.index[:TOP_K])
print(f"Top {TOP_K} predictors (by mean |SHAP|): {top_k}")

# Reduced table: the top-K predictor columns followed by the target column.
_reduced_columns = top_k + [TARGET]
df_topk = df_model[_reduced_columns].copy()
out_csv = os.path.join(OUT_DIR, f"training_data_{TARGET}_top{TOP_K}_catboost.csv")
df_topk.to_csv(out_csv, index=False)
print(f"Saved reduced training CSV: {out_csv}")
print(f"Shape: {df_topk.shape}")


Running RFECV with CatBoost (native categoricals)...
RFECV complete.
Saved RFECV curve: /explore/nobackup/people/spotter5/anna_v/v2/loocv/ch4_flux_total/catboost_rfe.png

--- RFECV Results ---
Optimal # of features: 1
Selected features: ['soc_0_100cm']
---------------------

Saved: /explore/nobackup/people/spotter5/anna_v/v2/loocv/ch4_flux_total/cb_importance_prediction_values_change.png
Saved: /explore/nobackup/people/spotter5/anna_v/v2/loocv/ch4_flux_total/cb_importance_loss_function_change.png
Saved: /explore/nobackup/people/spotter5/anna_v/v2/loocv/ch4_flux_total/cb_importance_shap_mean_abs.png
Top 6 predictors (by mean |SHAP|): ['soc_0_100cm', 'co2_cont', 'vap', 'land_cover', 'lai', 'month']
Saved reduced training CSV: /explore/nobackup/people/spotter5/anna_v/v2/loocv/ch4_flux_total/training_data_ch4_flux_total_top6_catboost.csv
Shape: (481, 7)


In [None]:
't'