In [1]:
import os
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from catboost import CatBoostRegressor

# Silence FutureWarning noise. The in-process filter takes care of this
# interpreter; the environment variable is read at interpreter start-up, so it
# is presumably meant for any worker processes launched later.
warnings.simplefilter("ignore", category=FutureWarning)
os.environ["PYTHONWARNINGS"] = "ignore::FutureWarning"

def run_catboost_loso(df, feature_cols, categorical_features, target_col):
    """Leave-one-site-out cross-validation with a CatBoost regressor.

    Each distinct value of ``df["site_reference"]`` is held out once. A fresh
    model is fitted on the rows of all other sites and evaluated on the
    held-out site.

    Args:
        df: Frame containing the feature columns, ``target_col``,
            ``"site_reference"`` and ``"date"``.
        feature_cols: Column names used as model inputs.
        categorical_features: Columns cast to pandas ``category`` dtype and
            handed to CatBoost as ``cat_features``.
        target_col: Name of the response column.

    Returns:
        ``(metrics, predictions)``. ``metrics`` has one row per site with the
        columns ``Site``, ``RMSE``, ``MAE`` and ``R2``. ``predictions``
        concatenates every fold's ``Site``, ``Date``, ``Observed`` and
        ``Predicted`` rows under a fresh integer index.
    """
    print("--- Running CatBoost LOSO ---")
    site_labels = df["site_reference"]
    target = df[target_col]
    # astype() returns a new frame, so the caller's data is left untouched.
    features = df[feature_cols].astype(
        {name: 'category' for name in categorical_features}
    )

    fold_metrics = []
    fold_predictions = []

    for held_out in site_labels.unique():
        print(f"  Processing site: {held_out}...")
        in_train = site_labels != held_out
        in_test = site_labels == held_out

        observed = target.loc[in_test]

        # Feature sub-sampling is passed as `rsm`; the run recorded with this
        # script failed when `colsample_bytree` was used instead.
        regressor = CatBoostRegressor(
            iterations=1200,
            learning_rate=0.01,
            depth=8,
            subsample=0.7,
            random_state=42,
            l2_leaf_reg=0.1,
            rsm=0.8,
            cat_features=categorical_features,
            verbose=0,
            allow_writing_files=False,
        )
        regressor.fit(features.loc[in_train], target.loc[in_train])
        predicted = regressor.predict(features.loc[in_test])

        fold_predictions.append(
            pd.DataFrame({
                "Site": held_out,
                "Date": df.loc[in_test, "date"],
                "Observed": observed,
                "Predicted": predicted,
            })
        )

        fold_metrics.append({
            "Site": held_out,
            "RMSE": np.sqrt(mean_squared_error(observed, predicted)),
            "MAE": mean_absolute_error(observed, predicted),
            "R2": r2_score(observed, predicted),
        })

    metrics_table = pd.DataFrame(fold_metrics)
    predictions_table = pd.concat(fold_predictions, ignore_index=True)
    return metrics_table, predictions_table

def run_svm_loso(df, feature_cols, target_col):
    """Leave-one-site-out cross-validation with a scaled RBF support-vector regressor.

    Columns of ``category`` or ``object`` dtype among the features are
    one-hot encoded (first level dropped) before the folds are built. Each
    distinct ``df["site_reference"]`` value is then held out once while a
    ``StandardScaler`` + ``SVR`` pipeline is fitted on the remaining rows.

    Args:
        df: Frame containing the feature columns, ``target_col``,
            ``"site_reference"`` and ``"date"``.
        feature_cols: Column names used as model inputs.
        target_col: Name of the response column.

    Returns:
        ``(metrics, predictions)``. ``metrics`` has one row per site with the
        columns ``Site``, ``RMSE``, ``MAE`` and ``R2``. ``predictions``
        concatenates every fold's ``Site``, ``Date``, ``Observed`` and
        ``Predicted`` rows under a fresh integer index.
    """
    print("--- Running SVM LOSO ---")
    site_labels = df["site_reference"]
    target = df[target_col]
    raw_features = df[feature_cols].copy()

    # Only non-numeric feature columns need dummy encoding.
    non_numeric = raw_features.select_dtypes(include=['category', 'object']).columns
    to_encode = [name for name in non_numeric if name in feature_cols]
    if to_encode:
        design = pd.get_dummies(raw_features, columns=to_encode, drop_first=True)
    else:
        design = raw_features

    fold_metrics = []
    fold_predictions = []

    for held_out in site_labels.unique():
        print(f"  Processing site: {held_out}...")
        in_train = site_labels != held_out
        in_test = site_labels == held_out

        observed = target.loc[in_test]

        # The scaler sits inside the pipeline, so it is fitted on the
        # training rows of each fold only.
        regressor = make_pipeline(StandardScaler(), SVR(kernel='rbf', C=100))
        regressor.fit(design.loc[in_train], target.loc[in_train])
        predicted = regressor.predict(design.loc[in_test])

        fold_predictions.append(
            pd.DataFrame({
                "Site": held_out,
                "Date": df.loc[in_test, "date"],
                "Observed": observed,
                "Predicted": predicted,
            })
        )

        fold_metrics.append({
            "Site": held_out,
            "RMSE": np.sqrt(mean_squared_error(observed, predicted)),
            "MAE": mean_absolute_error(observed, predicted),
            "R2": r2_score(observed, predicted),
        })

    metrics_table = pd.DataFrame(fold_metrics)
    predictions_table = pd.concat(fold_predictions, ignore_index=True)
    return metrics_table, predictions_table

def _draw_model_panel(ax, site, preds, metrics, model_label, colour):
    """Draw one model's observed-vs-predicted series for ``site`` on ``ax``.

    If the model has no prediction rows or no metrics row for ``site``, the
    panel shows a short notice instead of raising.
    """
    site_preds = preds[preds["Site"] == site].sort_values("Date")
    site_metrics = metrics[metrics["Site"] == site]

    ax.set_title(f"{model_label} Performance for Site: {site}", fontsize=16)
    ax.grid(True, linestyle='--', alpha=0.6)

    if site_preds.empty or site_metrics.empty:
        ax.text(0.5, 0.5, f"No {model_label} results for this site",
                transform=ax.transAxes, fontsize=12,
                verticalalignment='center', horizontalalignment='center')
        return

    ax.plot(site_preds["Date"], site_preds["Observed"], label="Observed",
            marker="o", linestyle='-', c='black', markersize=4)
    ax.plot(site_preds["Date"], site_preds["Predicted"], label=f"{model_label} Predicted",
            marker="x", linestyle='--', c=colour, markersize=4)
    ax.legend()

    row = site_metrics.iloc[0]
    summary = f"R²: {row['R2']:.2f}\nRMSE: {row['RMSE']:.2f}\nMAE: {row['MAE']:.2f}"
    ax.text(0.95, 0.05, summary, transform=ax.transAxes, fontsize=12, verticalalignment='bottom',
            horizontalalignment='right', bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8))


def generate_comparison_plots(catboost_preds, svm_preds, catboost_metrics, svm_metrics, target_col, figures_path):
    """Generates 1x2 side-by-side comparison plots for each site.

    For every site present in ``catboost_preds`` a figure is written to
    ``figures_path`` as ``comparison_{target_col}_{site}.png``: CatBoost on
    the left, SVM on the right, sharing the y-axis.

    Args:
        catboost_preds: Frame with ``Site``, ``Date``, ``Observed`` and
            ``Predicted`` columns for CatBoost.
        svm_preds: Same layout for the SVM.
        catboost_metrics: Frame with ``Site``, ``RMSE``, ``MAE`` and ``R2``
            columns for CatBoost.
        svm_metrics: Same layout for the SVM.
        target_col: Target name; used upper-cased as the y-axis label and
            verbatim in the file name.
        figures_path: Existing directory that receives the PNG files.
    """
    print("\n--- Generating Comparison Plots ---")

    for site in catboost_preds["Site"].unique():
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 8), sharey=True)

        # The metrics tables are only read here, never rebound: overwriting
        # `svm_metrics` with a single row would break the lookup for every
        # site after the first.
        _draw_model_panel(ax1, site, catboost_preds, catboost_metrics, "CatBoost", 'crimson')
        ax1.set_ylabel(target_col.upper(), fontsize=12)
        _draw_model_panel(ax2, site, svm_preds, svm_metrics, "SVM", 'royalblue')

        # Format the date tick labels once both panels have been drawn.
        fig.autofmt_xdate()

        # Save through the figure handle rather than pyplot's implicit
        # "current figure".
        plot_filename = f'comparison_{target_col}_{site}.png'
        plot_path = os.path.join(figures_path, plot_filename)
        fig.savefig(plot_path, dpi=300, bbox_inches='tight')
        plt.close(fig)

    print(f"All comparison plots saved to: {figures_path}")

if __name__ == '__main__':
    # Script entry point: load the training table, run both leave-one-site-out
    # evaluations for TARGET_COL and write one comparison figure per site.
    TARGET_COL = 'nee'
    
    print(f"\n{'='*50}\nRUNNING COMPARISON ANALYSIS FOR: {TARGET_COL.upper()}\n{'='*50}")

    # --- Data Loading and Feature Definition ---
    base_path = "/explore/nobackup/people/spotter5/anna_v/v2/loocv"
    full_dataset_path = "/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_final.csv"
    df_full = pd.read_csv(full_dataset_path)

    df_full['land_cover'] = df_full['land_cover'].astype('category')
    df_full['month'] = df_full['month'].astype('category')
    # Keep only rows whose flux_method is 'EC'. The explicit copy makes the
    # column assignments below act on an independent frame instead of a
    # filtered slice of the original (avoids SettingWithCopyWarning).
    df_full = df_full[df_full['flux_method'] == 'EC'].copy()
    # Row-wise mean of the tmmn and tmmx columns.
    df_full['tmean_C'] = df_full[['tmmn', 'tmmx']].mean(axis=1)
    # Timestamp for the first day of each row's year/month.
    df_full['date'] = pd.to_datetime(df_full[['year', 'month']].assign(day=1))
    
    catboost_feature_cols = [
        'EVI', 'NDVI', 'sur_refl_b01', 'sur_refl_b02', 'sur_refl_b03', 'sur_refl_b07', 
        'NDWI', 'pdsi', 'srad', 'tmean_C', 'vap', 'vs', 'bdod_0_100cm', 'cec_0_100cm', 
        'cfvo_0_100cm', 'clay_0_100cm', 'nitrogen_0_100cm', 'ocd_0_100cm', 'phh2o_0_100cm', 
        'sand_0_100cm', 'silt_0_100cm', 'soc_0_100cm', 'co2_cont', 'ALT', 'land_cover', 'month', 
        'lai', 'fpar', 'Percent_NonTree_Vegetation', 'Percent_NonVegetated', 'Percent_Tree_Cover'
    ]
    catboost_categorical_features = ['land_cover', 'month']

    # The SVM uses whichever columns the "top predictors" file lists, minus
    # the target itself.
    top_features_path = os.path.join(base_path, TARGET_COL, f'training_data_{TARGET_COL}_top_preds.csv')
    try:
        top_features_df = pd.read_csv(top_features_path)
    except FileNotFoundError:
        print(f"FATAL ERROR: Top features file for SVM not found at {top_features_path}")
        # Terminate with a non-zero status; the bare `exit()` helper is
        # provided by the `site` module and reports success (status 0).
        raise SystemExit(1)
    svm_feature_cols = [col for col in top_features_df.columns if col != TARGET_COL]

    # Each model only drops rows with gaps in the columns it actually uses,
    # so the two frames may contain different rows.
    df_catboost = df_full.dropna(subset=catboost_feature_cols + [TARGET_COL, 'site_reference'])
    df_svm = df_full.dropna(subset=svm_feature_cols + [TARGET_COL, 'site_reference'])
    
    # --- Run Models ---
    catboost_metrics_df, catboost_preds_df = run_catboost_loso(df_catboost, catboost_feature_cols, catboost_categorical_features, TARGET_COL)
    svm_metrics_df, svm_preds_df = run_svm_loso(df_svm, svm_feature_cols, TARGET_COL)

    # --- Generate Plots ---
    comparison_figures_path = os.path.join(base_path, TARGET_COL, "figures", "svm_v_cat")
    os.makedirs(comparison_figures_path, exist_ok=True)
    generate_comparison_plots(catboost_preds_df, svm_preds_df, catboost_metrics_df, svm_metrics_df, TARGET_COL, comparison_figures_path)

    print(f"\n{'='*50}\nCOMPLETED COMPARISON ANALYSIS FOR: {TARGET_COL.upper()}\n{'='*50}")


RUNNING COMPARISON ANALYSIS FOR: NEE
--- Running CatBoost LOSO ---
  Processing site: Hakasia 5yr_RU-Ha2_tower...


TypeError: __init__() got an unexpected keyword argument 'colsample_bytree'