In [5]:
#!/usr/bin/env python
# coding: utf-8

import os
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib  # <-- ADDED IMPORT
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Silence FutureWarning noise: the environment variable is exported for the
# process environment, and the filter below applies to this interpreter.
os.environ["PYTHONWARNINGS"] = "ignore::FutureWarning"
warnings.simplefilter("ignore", category=FutureWarning)

def run_loso_with_top_features(target_col):
    """
    Run Leave-One-Site-Out (LOSO) cross-validation with an RBF-kernel SVR.

    The feature list is taken from the header of the pre-selected
    ``training_data_<target_col>_top_preds.csv`` file. Categorical features
    are one-hot encoded, and every model is a ``StandardScaler`` + ``SVR``
    pipeline. Per-site metrics, all held-out predictions, per-site
    time-series plots and a pooled density plot are written under
    ``<base_path>/<target_col>``. Finally a model fit on all rows is saved
    with joblib.

    Parameters
    ----------
    target_col : str
        Name of the target column in the training dataset (for example
        ``'nee'`` or ``'gpp'``). It also selects the input/output
        sub-directory.

    Returns
    -------
    None
        Results are written to disk and summarised on stdout. The function
        returns early, after printing a message, when the top-features file
        is missing or when no site could be evaluated.
    """
    # --- 1. Load the file with the top pre-selected features ---
    base_path = "/explore/nobackup/people/spotter5/anna_v/v2/loocv"
    top_features_path = os.path.join(base_path, target_col, f'training_data_{target_col}_top_preds.csv')

    try:
        # Only the column names are used, so parse the header and no data rows.
        top_features_df = pd.read_csv(top_features_path, nrows=0)
    except FileNotFoundError:
        print(f"SKIPPING: Top features file not found for target '{target_col}'.")
        print(f"Expected at: {top_features_path}")
        print("Please run the feature selection script for this target first.\n")
        return

    # Every column except the target is treated as a feature.
    feature_cols = [col for col in top_features_df.columns if col != target_col]
    print(f"Using top selected features for '{target_col}': {feature_cols}")

    # --- 2. Load the main dataset and prepare it ---
    full_dataset_path = "/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_final.csv"
    df = pd.read_csv(full_dataset_path)

    # NOTE(review): the categorical casts happen before the row filters below,
    # so categories that only occur in filtered-out rows presumably still get
    # (all-zero) dummy columns later. Order is kept to preserve model inputs.
    df['land_cover'] = df['land_cover'].astype('category')
    df['month'] = df['month'].astype('category')
    # .copy() makes the filtered frame independent, so the column assignments
    # below never write into a slice of the original frame.
    df = df[df['flux_method'] == 'EC'].copy()
    df['tmean_C'] = df[['tmmn', 'tmmx']].mean(axis=1)
    df['date'] = pd.to_datetime(df[['year', 'month']].assign(day=1))

    # Drop rows with missing data for essential columns
    required_cols = feature_cols + [target_col, 'site_reference']
    df = df.dropna(subset=required_cols)

    # --- 3. Prepare data for modeling (including one-hot encoding) ---
    out_path = os.path.join(base_path, target_col)
    figures_path = os.path.join(out_path, "figures_svm_top_features")
    os.makedirs(figures_path, exist_ok=True)

    X_initial = df[feature_cols].copy()
    y = df[target_col]
    site_labels = df["site_reference"]
    sites = site_labels.unique()

    # One-hot encode once, before the CV split, so that every train/test fold
    # shares the same dummy columns.
    categorical_features = X_initial.select_dtypes(include=['category', 'object']).columns.tolist()
    if categorical_features:
        print(f"One-hot encoding categorical features: {categorical_features}")
        X = pd.get_dummies(X_initial, columns=categorical_features, drop_first=True)
    else:
        X = X_initial

    results = []
    all_preds_df_list = []

    # --- 4. Run Leave-One-Site-Out CV with SVM ---
    for test_site in sites:
        print(f"  Processing site: {test_site}...")
        test_idx = site_labels == test_site
        train_idx = ~test_idx

        # With a single site in the data there is nothing left to train on.
        if not train_idx.any():
            print(f"  Skipping site '{test_site}': no training rows remain when it is held out.")
            continue

        X_train, y_train = X.loc[train_idx], y.loc[train_idx]
        X_test, y_test = X.loc[test_idx], y.loc[test_idx]
        dates_test = df.loc[test_idx, "date"]

        # A fresh pipeline per fold: the scaler is fit on training rows only.
        model = make_pipeline(StandardScaler(), SVR(kernel='rbf', C=100))
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        all_preds_df_list.append(pd.DataFrame({
            "Site": test_site, "Date": dates_test.values,
            "Observed": y_test.values, "Predicted": y_pred
        }))

        results.append({
            "Site": test_site,
            "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
            "MAE": mean_absolute_error(y_test, y_pred),
            "R2": r2_score(y_test, y_pred),
        })

    # --- 5. Aggregate, Save, and Report Results ---
    if not results:
        print(f"No data processed for target '{target_col}'.")
        return

    results_df = pd.DataFrame(results)
    all_preds_df = pd.concat(all_preds_df_list, ignore_index=True)

    results_csv_path = os.path.join(out_path, f'svm_results_{target_col}_top_features.csv')
    predictions_csv_path = os.path.join(out_path, f'svm_predictions_{target_col}_top_features.csv')
    results_df.to_csv(results_csv_path, index=False)
    all_preds_df.to_csv(predictions_csv_path, index=False)
    print(f"  Results saved to: {results_csv_path}")

    # Pooled metrics: computed over all held-out predictions at once.
    rmse_all = np.sqrt(mean_squared_error(all_preds_df["Observed"], all_preds_df["Predicted"]))
    r2_all = r2_score(all_preds_df["Observed"], all_preds_df["Predicted"])
    mae_all = mean_absolute_error(all_preds_df["Observed"], all_preds_df["Predicted"])
    print(f"\n--- SVM Pooled Metrics for {target_col.upper()} (Top Features) ---")
    print(f"Pooled R²: {r2_all:.4f}, Pooled RMSE: {rmse_all:.4f}, Pooled MAE: {mae_all:.4f}")

    # Site-level summary: mean and median of the per-site metrics.
    mean_r2 = results_df['R2'].mean()
    median_r2 = results_df['R2'].median()
    mean_rmse = results_df['RMSE'].mean()
    median_rmse = results_df['RMSE'].median()
    mean_mae = results_df['MAE'].mean()
    median_mae = results_df['MAE'].median()

    print(f"\n--- SVM Summary Metrics Across Sites for {target_col.upper()} (Top Features) ---")
    print(f"  Mean R²:   {mean_r2:.4f}, Median R²:   {median_r2:.4f}")
    print(f"  Mean RMSE: {mean_rmse:.4f}, Median RMSE: {median_rmse:.4f}")
    print(f"  Mean MAE:  {mean_mae:.4f}, Median MAE:  {median_mae:.4f}")

    # --- 6. Per-site time-series plots ---
    print("\n  Generating and saving individual site plots...")
    # sort=False keeps the sites in order of first appearance.
    for site, site_rows in all_preds_df.groupby("Site", sort=False):
        fig, ax = plt.subplots(figsize=(12, 7))
        site_df = site_rows.sort_values("Date")
        site_metrics = results_df[results_df["Site"] == site].iloc[0]

        ax.plot(site_df["Date"], site_df["Observed"], label="Observed", marker="o", linestyle='-', markersize=4)
        ax.plot(site_df["Date"], site_df["Predicted"], label="Predicted", marker="x", linestyle='--', markersize=4)
        ax.set_title(f"Observed vs. Predicted {target_col} (SVM, Top Features) for Site: {site}")
        ax.legend()
        ax.grid(True)
        fig.autofmt_xdate()

        textstr = f"RMSE: {site_metrics['RMSE']:.2f}\nMAE: {site_metrics['MAE']:.2f}\nR²: {site_metrics['R2']:.2f}"
        ax.text(0.97, 0.03, textstr, transform=ax.transAxes, fontsize=10,
                verticalalignment='bottom', horizontalalignment='right',
                bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7))

        # A path separator inside a site name would otherwise be read as a
        # sub-directory and make the save fail.
        safe_site = str(site).replace(os.sep, "_").replace("/", "_")
        plot_filename = f'svm_{target_col}_{safe_site}_timeseries_top_features.png'
        plot_path = os.path.join(figures_path, plot_filename)
        fig.savefig(plot_path, dpi=300, bbox_inches='tight')
        plt.close(fig)
    print(f"  All site plots saved to: {figures_path}")

    # --- 7. Pooled Observed vs Predicted density plot (SVM, Top Features) ---
    # Clean NaNs / infs
    pooled = all_preds_df[['Observed', 'Predicted']].copy()
    pooled = pooled.replace([np.inf, -np.inf], np.nan).dropna()

    if not pooled.empty:
        fig, ax = plt.subplots(figsize=(7, 7))

        # Shared axis limits with 5% padding; fall back to a unit span when
        # the data range is degenerate.
        lo = np.nanmin([pooled['Observed'].min(), pooled['Predicted'].min()])
        hi = np.nanmax([pooled['Observed'].max(), pooled['Predicted'].max()])
        span = hi - lo
        pad = 0.05 * (span if np.isfinite(span) and span > 0 else 1.0)
        ax.set_xlim(lo - pad, hi + pad)
        ax.set_ylim(lo - pad, hi + pad)

        # Density plot (hexbin) — darker red = higher density
        hb = ax.hexbin(
            pooled['Observed'],
            pooled['Predicted'],
            gridsize=80,
            cmap='Reds',
            bins='log',
            mincnt=1
        )
        cbar = fig.colorbar(hb, ax=ax)
        cbar.set_label('log10(N points)')

        # 1:1 line in solid black
        ax.plot([lo - pad, hi + pad], [lo - pad, hi + pad], color='black', linewidth=1.5)

        ax.set_title(f'Observed vs Predicted (LOSO, SVM Top Features) — {target_col}')
        ax.set_xlabel('Observed')
        ax.set_ylabel('Predicted')
        ax.grid(True)

        # Pooled metrics, rounded to 2 decimals, in the lower-right corner
        annot = f"R² = {r2_all:.2f}\nRMSE = {rmse_all:.2f}\nMAE = {mae_all:.2f}"
        ax.text(
            0.97, 0.03, annot, transform=ax.transAxes,
            fontsize=11, va='bottom', ha='right',
            bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7)
        )

        pooled_plot_path = os.path.join(
            figures_path, f'svm_{target_col}_obs_vs_pred_all_sites_top_features.png'
        )
        fig.savefig(pooled_plot_path, dpi=300, bbox_inches='tight')
        plt.close(fig)
        print(f"  Pooled Observed vs Predicted density plot saved to: {pooled_plot_path}")
    else:
        print("  Skipped pooled density plot: no valid Observed/Predicted pairs after cleaning.")

    # --- 8. Train and Save Final Model ---
    print("\n  Training and saving final model on all data...")
    models_out_path = '/explore/nobackup/people/spotter5/anna_v/v2/models_svm_top_features'
    os.makedirs(models_out_path, exist_ok=True)

    # Same pipeline configuration as the CV folds, fit on every available row.
    final_model = make_pipeline(StandardScaler(), SVR(kernel='rbf', C=100))
    final_model.fit(X, y)

    model_filename = os.path.join(models_out_path, f'svm_{target_col}_top_features.joblib')
    joblib.dump(final_model, model_filename)
    print(f"  Final model saved to: {model_filename}")


if __name__ == '__main__':
    # Targets analysed in this run. The full candidate list noted for this
    # script is: 'gpp', 'nee', 'reco', 'ch4_flux_total'.
    targets_to_run = ['nee']
    rule = '=' * 20

    for target in targets_to_run:
        label = target.upper()
        print(f"\n{rule} RUNNING SVM ANALYSIS FOR: {label} {rule}")
        run_loso_with_top_features(target_col=target)
        print(f"{rule} COMPLETED SVM ANALYSIS FOR: {label} {rule}")




Using top selected features for 'nee': ['tmean_C', 'month', 'srad', 'lai', 'NDVI', 'cfvo_0_100cm', 'ocd_0_100cm', 'vap', 'land_cover']
One-hot encoding categorical features: ['month', 'land_cover']
  Processing site: Central Marsh_US-Cms_tower...
  Processing site: Hakasia 5yr_RU-Ha2_tower...
  Processing site: Hakasia Steppe_RU-Ha1_tower...
  Processing site: Kaamanen_FI-Kaa_tower...
  Processing site: Manitoba - Northern Old Black Spruce (former BOREAS Northern Study Area)_CA-Man_tower...
  Processing site: Nelegel_RU-Nel_tower...
  Processing site: Neleger Cutover_RU-NeC_tower...
  Processing site: Neleger larch forest_RU-NeF_tower...
  Processing site: Samoylov Island_RU-Sam_tower...
  Processing site: Saskatchewan - Western Boreal, Mature Aspen_CA-Oas_tower...
  Processing site: Saskatchewan - Western Boreal, Mature Black Spruce_CA-Obs_tower...
  Processing site: Saskatchewan - Western Boreal, forest burned in 1989_CA-SF2_tower...
  Processing site: Saskatchewan - Western Boreal,



  Processing site: Pond Inlet_CA-Pin_tower...




  Processing site: Poker Flat Research Range: Succession from fire scar to deciduous forest_US-Rpf_tower...
  Processing site: Samoylov Island_RU-Sam (open)_tower...
  Processing site: Udleg practice forest_MN-Udg_tower...
  Processing site: Daring Lake_CA-DL3_tower...
  Processing site: Elgeeii forest station_RU-Ege_tower...
  Processing site: Bonanza Creek Black Spruce_US-BZS_tower...
  Processing site: Daring Lake_CA-DL4_tower...
  Processing site: Tiksi_RU-Tks_tower...
  Processing site: Samoylov Island_RU-Sam (closed)_tower...
  Processing site: Bonanza Creek Thermokarst Bog_US-BZB_tower...
  Processing site: Poker Flat Research Range Black Spruce Forest_US-Prr_tower...
  Processing site: Bonanza Creek Rich Fen_US-BZF_tower...
  Processing site: Cascaden Ridge Fire Scar_US-Fcr_tower...




  Processing site: Lake Hazen, Ellesmere Island_CA-LHazen2-meadow wetland_tower...
  Processing site: Cherskii ecotone_RU-Eusk_cher1_tower...
  Processing site: Kenttarova_FI-Ken_tower...
  Processing site: Lompolojankka_FI-Lom_tower...
  Processing site: Sammaltunturi fell_FI-SamFell_tower...
  Processing site: ARM-NSA-Barrow_US-A10_tower...
  Processing site: Adventdalen_SJ-Adv_tower...
  Processing site: Stordalen Fen_SE-St1_tower...
  Processing site: NGEE Arctic Barrow_US-NGB_tower...
  Processing site: Cherskii disturbed forest_RU-Eusk_cher2_tower...
  Processing site: Disko_GL-Dsk_tower...
  Processing site: Havikpak Creek_CA-HPC_tower...
  Processing site: Scotty Creek Landscape_CA-SCC_tower...
  Processing site: ZOTTO Bog_RU-Zo1_tower...
  Processing site: ZOTTO Forest_RU-Zo2_tower...
  Processing site: Trail Valley Creek_CA-TVC_tower...
  Processing site: Cherskii reference_RU-Ch2_tower...
  Processing site: Flux Observations of Carbon from an Airborne Laboratory (FOCAL) Camp



  Results saved to: /explore/nobackup/people/spotter5/anna_v/v2/loocv/nee/svm_results_nee_top_features.csv

--- SVM Pooled Metrics for NEE (Top Features) ---
Pooled R²: 0.4808, Pooled RMSE: 23.5060, Pooled MAE: 16.1833

--- SVM Summary Metrics Across Sites for NEE (Top Features) ---
  Mean R²:   -1.8028, Median R²:   0.4351
  Mean RMSE: 20.7431, Median RMSE: 16.9262
  Mean MAE:  16.8918, Median MAE:  13.5380

  Generating and saving individual site plots...
  All site plots saved to: /explore/nobackup/people/spotter5/anna_v/v2/loocv/nee/figures_svm_top_features
  Pooled Observed vs Predicted density plot saved to: /explore/nobackup/people/spotter5/anna_v/v2/loocv/nee/figures_svm_top_features/svm_nee_obs_vs_pred_all_sites_top_features.png

  Training and saving final model on all data...
  Final model saved to: /explore/nobackup/people/spotter5/anna_v/v2/models_svm_top_features/svm_nee_top_features.joblib


In [6]:
#!/usr/bin/env python
# coding: utf-8

import os
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib  # <-- ADDED IMPORT
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Silence FutureWarning noise: the environment variable is exported for the
# process environment, and the filter below applies to this interpreter.
os.environ["PYTHONWARNINGS"] = "ignore::FutureWarning"
warnings.simplefilter("ignore", category=FutureWarning)

def run_loso_with_top_features(target_col):
    """
    Run Leave-One-Site-Out (LOSO) cross-validation with an RBF-kernel SVR.

    The feature list is taken from the header of the pre-selected
    ``training_data_<target_col>_top_preds.csv`` file. Categorical features
    are one-hot encoded, and every model is a ``StandardScaler`` + ``SVR``
    pipeline. Per-site metrics, all held-out predictions, per-site
    time-series plots and a pooled density plot are written under
    ``<base_path>/<target_col>``. Finally a model fit on all rows is saved
    with joblib.

    Parameters
    ----------
    target_col : str
        Name of the target column in the training dataset (for example
        ``'nee'`` or ``'gpp'``). It also selects the input/output
        sub-directory.

    Returns
    -------
    None
        Results are written to disk and summarised on stdout. The function
        returns early, after printing a message, when the top-features file
        is missing or when no site could be evaluated.
    """
    # --- 1. Load the file with the top pre-selected features ---
    base_path = "/explore/nobackup/people/spotter5/anna_v/v2/loocv"
    top_features_path = os.path.join(base_path, target_col, f'training_data_{target_col}_top_preds.csv')

    try:
        # Only the column names are used, so parse the header and no data rows.
        top_features_df = pd.read_csv(top_features_path, nrows=0)
    except FileNotFoundError:
        print(f"SKIPPING: Top features file not found for target '{target_col}'.")
        print(f"Expected at: {top_features_path}")
        print("Please run the feature selection script for this target first.\n")
        return

    # Every column except the target is treated as a feature.
    feature_cols = [col for col in top_features_df.columns if col != target_col]
    print(f"Using top selected features for '{target_col}': {feature_cols}")

    # --- 2. Load the main dataset and prepare it ---
    full_dataset_path = "/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_final.csv"
    df = pd.read_csv(full_dataset_path)

    # NOTE(review): the categorical casts happen before the row filters below,
    # so categories that only occur in filtered-out rows presumably still get
    # (all-zero) dummy columns later. Order is kept to preserve model inputs.
    df['land_cover'] = df['land_cover'].astype('category')
    df['month'] = df['month'].astype('category')
    # .copy() makes the filtered frame independent, so the column assignments
    # below never write into a slice of the original frame.
    df = df[df['flux_method'] == 'EC'].copy()
    df['tmean_C'] = df[['tmmn', 'tmmx']].mean(axis=1)
    df['date'] = pd.to_datetime(df[['year', 'month']].assign(day=1))

    # Drop rows with missing data for essential columns
    required_cols = feature_cols + [target_col, 'site_reference']
    df = df.dropna(subset=required_cols)

    # --- 3. Prepare data for modeling (including one-hot encoding) ---
    out_path = os.path.join(base_path, target_col)
    figures_path = os.path.join(out_path, "figures_svm_top_features")
    os.makedirs(figures_path, exist_ok=True)

    X_initial = df[feature_cols].copy()
    y = df[target_col]
    site_labels = df["site_reference"]
    sites = site_labels.unique()

    # One-hot encode once, before the CV split, so that every train/test fold
    # shares the same dummy columns.
    categorical_features = X_initial.select_dtypes(include=['category', 'object']).columns.tolist()
    if categorical_features:
        print(f"One-hot encoding categorical features: {categorical_features}")
        X = pd.get_dummies(X_initial, columns=categorical_features, drop_first=True)
    else:
        X = X_initial

    results = []
    all_preds_df_list = []

    # --- 4. Run Leave-One-Site-Out CV with SVM ---
    for test_site in sites:
        print(f"  Processing site: {test_site}...")
        test_idx = site_labels == test_site
        train_idx = ~test_idx

        # With a single site in the data there is nothing left to train on.
        if not train_idx.any():
            print(f"  Skipping site '{test_site}': no training rows remain when it is held out.")
            continue

        X_train, y_train = X.loc[train_idx], y.loc[train_idx]
        X_test, y_test = X.loc[test_idx], y.loc[test_idx]
        dates_test = df.loc[test_idx, "date"]

        # A fresh pipeline per fold: the scaler is fit on training rows only.
        model = make_pipeline(StandardScaler(), SVR(kernel='rbf', C=100))
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        all_preds_df_list.append(pd.DataFrame({
            "Site": test_site, "Date": dates_test.values,
            "Observed": y_test.values, "Predicted": y_pred
        }))

        results.append({
            "Site": test_site,
            "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
            "MAE": mean_absolute_error(y_test, y_pred),
            "R2": r2_score(y_test, y_pred),
        })

    # --- 5. Aggregate, Save, and Report Results ---
    if not results:
        print(f"No data processed for target '{target_col}'.")
        return

    results_df = pd.DataFrame(results)
    all_preds_df = pd.concat(all_preds_df_list, ignore_index=True)

    results_csv_path = os.path.join(out_path, f'svm_results_{target_col}_top_features.csv')
    predictions_csv_path = os.path.join(out_path, f'svm_predictions_{target_col}_top_features.csv')
    results_df.to_csv(results_csv_path, index=False)
    all_preds_df.to_csv(predictions_csv_path, index=False)
    print(f"  Results saved to: {results_csv_path}")

    # Pooled metrics: computed over all held-out predictions at once.
    rmse_all = np.sqrt(mean_squared_error(all_preds_df["Observed"], all_preds_df["Predicted"]))
    r2_all = r2_score(all_preds_df["Observed"], all_preds_df["Predicted"])
    mae_all = mean_absolute_error(all_preds_df["Observed"], all_preds_df["Predicted"])
    print(f"\n--- SVM Pooled Metrics for {target_col.upper()} (Top Features) ---")
    print(f"Pooled R²: {r2_all:.4f}, Pooled RMSE: {rmse_all:.4f}, Pooled MAE: {mae_all:.4f}")

    # Site-level summary: mean and median of the per-site metrics.
    mean_r2 = results_df['R2'].mean()
    median_r2 = results_df['R2'].median()
    mean_rmse = results_df['RMSE'].mean()
    median_rmse = results_df['RMSE'].median()
    mean_mae = results_df['MAE'].mean()
    median_mae = results_df['MAE'].median()

    print(f"\n--- SVM Summary Metrics Across Sites for {target_col.upper()} (Top Features) ---")
    print(f"  Mean R²:   {mean_r2:.4f}, Median R²:   {median_r2:.4f}")
    print(f"  Mean RMSE: {mean_rmse:.4f}, Median RMSE: {median_rmse:.4f}")
    print(f"  Mean MAE:  {mean_mae:.4f}, Median MAE:  {median_mae:.4f}")

    # --- 6. Per-site time-series plots ---
    print("\n  Generating and saving individual site plots...")
    # sort=False keeps the sites in order of first appearance.
    for site, site_rows in all_preds_df.groupby("Site", sort=False):
        fig, ax = plt.subplots(figsize=(12, 7))
        site_df = site_rows.sort_values("Date")
        site_metrics = results_df[results_df["Site"] == site].iloc[0]

        ax.plot(site_df["Date"], site_df["Observed"], label="Observed", marker="o", linestyle='-', markersize=4)
        ax.plot(site_df["Date"], site_df["Predicted"], label="Predicted", marker="x", linestyle='--', markersize=4)
        ax.set_title(f"Observed vs. Predicted {target_col} (SVM, Top Features) for Site: {site}")
        ax.legend()
        ax.grid(True)
        fig.autofmt_xdate()

        textstr = f"RMSE: {site_metrics['RMSE']:.2f}\nMAE: {site_metrics['MAE']:.2f}\nR²: {site_metrics['R2']:.2f}"
        ax.text(0.97, 0.03, textstr, transform=ax.transAxes, fontsize=10,
                verticalalignment='bottom', horizontalalignment='right',
                bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7))

        # A path separator inside a site name would otherwise be read as a
        # sub-directory and make the save fail.
        safe_site = str(site).replace(os.sep, "_").replace("/", "_")
        plot_filename = f'svm_{target_col}_{safe_site}_timeseries_top_features.png'
        plot_path = os.path.join(figures_path, plot_filename)
        fig.savefig(plot_path, dpi=300, bbox_inches='tight')
        plt.close(fig)
    print(f"  All site plots saved to: {figures_path}")

    # --- 7. Pooled Observed vs Predicted density plot (SVM, Top Features) ---
    # Clean NaNs / infs
    pooled = all_preds_df[['Observed', 'Predicted']].copy()
    pooled = pooled.replace([np.inf, -np.inf], np.nan).dropna()

    if not pooled.empty:
        fig, ax = plt.subplots(figsize=(7, 7))

        # Shared axis limits with 5% padding; fall back to a unit span when
        # the data range is degenerate.
        lo = np.nanmin([pooled['Observed'].min(), pooled['Predicted'].min()])
        hi = np.nanmax([pooled['Observed'].max(), pooled['Predicted'].max()])
        span = hi - lo
        pad = 0.05 * (span if np.isfinite(span) and span > 0 else 1.0)
        ax.set_xlim(lo - pad, hi + pad)
        ax.set_ylim(lo - pad, hi + pad)

        # Density plot (hexbin) — darker red = higher density
        hb = ax.hexbin(
            pooled['Observed'],
            pooled['Predicted'],
            gridsize=80,
            cmap='Reds',
            bins='log',
            mincnt=1
        )
        cbar = fig.colorbar(hb, ax=ax)
        cbar.set_label('log10(N points)')

        # 1:1 line in solid black
        ax.plot([lo - pad, hi + pad], [lo - pad, hi + pad], color='black', linewidth=1.5)

        ax.set_title(f'Observed vs Predicted (LOSO, SVM Top Features) — {target_col}')
        ax.set_xlabel('Observed')
        ax.set_ylabel('Predicted')
        ax.grid(True)

        # Pooled metrics, rounded to 2 decimals, in the lower-right corner
        annot = f"R² = {r2_all:.2f}\nRMSE = {rmse_all:.2f}\nMAE = {mae_all:.2f}"
        ax.text(
            0.97, 0.03, annot, transform=ax.transAxes,
            fontsize=11, va='bottom', ha='right',
            bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7)
        )

        pooled_plot_path = os.path.join(
            figures_path, f'svm_{target_col}_obs_vs_pred_all_sites_top_features.png'
        )
        fig.savefig(pooled_plot_path, dpi=300, bbox_inches='tight')
        plt.close(fig)
        print(f"  Pooled Observed vs Predicted density plot saved to: {pooled_plot_path}")
    else:
        print("  Skipped pooled density plot: no valid Observed/Predicted pairs after cleaning.")

    # --- 8. Train and Save Final Model ---
    print("\n  Training and saving final model on all data...")
    models_out_path = '/explore/nobackup/people/spotter5/anna_v/v2/models_svm_top_features'
    os.makedirs(models_out_path, exist_ok=True)

    # Same pipeline configuration as the CV folds, fit on every available row.
    final_model = make_pipeline(StandardScaler(), SVR(kernel='rbf', C=100))
    final_model.fit(X, y)

    model_filename = os.path.join(models_out_path, f'svm_{target_col}_top_features.joblib')
    joblib.dump(final_model, model_filename)
    print(f"  Final model saved to: {model_filename}")


if __name__ == '__main__':
    # Targets analysed in this run. The full candidate list noted for this
    # script is: 'gpp', 'nee', 'reco', 'ch4_flux_total'.
    targets_to_run = ['gpp']
    rule = '=' * 20

    for target in targets_to_run:
        label = target.upper()
        print(f"\n{rule} RUNNING SVM ANALYSIS FOR: {label} {rule}")
        run_loso_with_top_features(target_col=target)
        print(f"{rule} COMPLETED SVM ANALYSIS FOR: {label} {rule}")




Using top selected features for 'gpp': ['tmean_C', 'NDVI', 'silt_0_100cm', 'lai', 'srad', 'NDWI', 'sur_refl_b07', 'land_cover', 'bdod_0_100cm', 'ocd_0_100cm', 'EVI', 'cfvo_0_100cm', 'snow_depth', 'phh2o_0_100cm', 'co2_cont']
One-hot encoding categorical features: ['land_cover']
  Processing site: Central Marsh_US-Cms_tower...
  Processing site: Hakasia Steppe_RU-Ha1_tower...
  Processing site: Kaamanen_FI-Kaa_tower...
  Processing site: Manitoba - Northern Old Black Spruce (former BOREAS Northern Study Area)_CA-Man_tower...
  Processing site: Nelegel_RU-Nel_tower...
  Processing site: Neleger Cutover_RU-NeC_tower...
  Processing site: Neleger larch forest_RU-NeF_tower...
  Processing site: Saskatchewan - Western Boreal, Mature Aspen_CA-Oas_tower...
  Processing site: Saskatchewan - Western Boreal, Mature Black Spruce_CA-Obs_tower...
  Processing site: Saskatchewan - Western Boreal, forest burned in 1989_CA-SF2_tower...
  Processing site: Saskatchewan - Western Boreal, forest burned in



  Processing site: Poker Flat Research Range: Succession from fire scar to deciduous forest_US-Rpf_tower...
  Processing site: Elgeeii forest station_RU-Ege_tower...
  Processing site: Udleg practice forest_MN-Udg_tower...
  Processing site: Bonanza Creek Black Spruce_US-BZS_tower...
  Processing site: Daring Lake_CA-DL3_tower...
  Processing site: Daring Lake_CA-DL4_tower...
  Processing site: Tiksi_RU-Tks_tower...
  Processing site: Bonanza Creek Thermokarst Bog_US-BZB_tower...
  Processing site: Poker Flat Research Range Black Spruce Forest_US-Prr_tower...
  Processing site: Bonanza Creek Rich Fen_US-BZF_tower...
  Processing site: Cascaden Ridge Fire Scar_US-Fcr_tower...




  Processing site: Lake Hazen, Ellesmere Island_CA-LHazen2-meadow wetland_tower...
  Processing site: Adventdalen_SJ-Adv_tower...
  Processing site: Havikpak Creek_CA-HPC_tower...
  Processing site: Scotty Creek Landscape_CA-SCC_tower...
  Processing site: NGEE Arctic Barrow_US-NGB_tower...
  Processing site: Trail Valley Creek_CA-TVC_tower...
  Processing site: Cherskii reference_RU-Ch2_tower...
  Processing site: Barrow-BES_US-Bes_tower...
  Processing site: Varrio_FI-Var_tower...
  Processing site: Scotty Creek Bog_CA-SCB_tower...
  Processing site: Barrow-BEO_US-Beo_tower...
  Processing site: Council, Alaska_US-KOC_tower...
  Processing site: Hustai grassland_MN-Hst_tower...
  Processing site: Nalaikh grassland_MN-Nkh_tower...
  Processing site: Bibai bog_JP-Bby_tower...
  Processing site: Wolf_creek_Buckbrush_CA-WCBB_tower...
  Processing site: Wolf_creek_SparseShrub_CA-WCPLT_tower...
  Processing site: Wolf_creek_forest_CA-WCF_tower...
  Processing site: Smith Creek_CA-SMC_tower

In [7]:
#!/usr/bin/env python
# coding: utf-8

import os
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib  # <-- ADDED IMPORT
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Silence FutureWarning noise: the environment variable is exported for the
# process environment, and the filter below applies to this interpreter.
os.environ["PYTHONWARNINGS"] = "ignore::FutureWarning"
warnings.simplefilter("ignore", category=FutureWarning)

def _build_svm_pipeline():
    """Return an unfitted StandardScaler -> RBF-kernel SVR (C=100) pipeline."""
    return make_pipeline(StandardScaler(), SVR(kernel='rbf', C=100))


def _filename_safe(name):
    """Return *name* as text with path separators replaced by underscores."""
    text = str(name).replace('/', '_')
    if os.sep != '/':
        text = text.replace(os.sep, '_')
    return text


def _plot_site_timeseries(site, site_df, site_metrics, target_col, figures_path):
    """Save the observed-vs-predicted time series figure for one held-out site."""
    fig, ax = plt.subplots(figsize=(12, 7))

    ax.plot(site_df["Date"], site_df["Observed"], label="Observed", marker="o", linestyle='-', markersize=4)
    ax.plot(site_df["Date"], site_df["Predicted"], label="Predicted", marker="x", linestyle='--', markersize=4)
    ax.set_title(f"Observed vs. Predicted {target_col} (SVM, Top Features) for Site: {site}")
    ax.legend()
    ax.grid(True)
    fig.autofmt_xdate()

    textstr = f"RMSE: {site_metrics['RMSE']:.2f}\nMAE: {site_metrics['MAE']:.2f}\nR²: {site_metrics['R2']:.2f}"
    ax.text(0.97, 0.03, textstr, transform=ax.transAxes, fontsize=10,
            verticalalignment='bottom', horizontalalignment='right',
            bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7))

    # Site names are free text; keep them from being interpreted as sub-directories.
    plot_filename = f'svm_{target_col}_{_filename_safe(site)}_timeseries_top_features.png'
    plot_path = os.path.join(figures_path, plot_filename)
    fig.savefig(plot_path, dpi=300, bbox_inches='tight')
    plt.close(fig)


def _plot_pooled_density(all_preds_df, target_col, figures_path, r2_all, rmse_all, mae_all):
    """Save a hexbin density plot of pooled observed vs. predicted values."""
    # Clean NaNs / infs
    pooled = all_preds_df[['Observed', 'Predicted']].replace([np.inf, -np.inf], np.nan).dropna()

    if pooled.empty:
        print("  Skipped pooled density plot: no valid Observed/Predicted pairs after cleaning.")
        return

    fig, ax = plt.subplots(figsize=(7, 7))

    # Shared axis limits with 5% padding (fallback padding when the range is degenerate)
    lo = np.nanmin([pooled['Observed'].min(), pooled['Predicted'].min()])
    hi = np.nanmax([pooled['Observed'].max(), pooled['Predicted'].max()])
    span = hi - lo
    pad = 0.05 * (span if np.isfinite(span) and span > 0 else 1.0)
    ax.set_xlim(lo - pad, hi + pad)
    ax.set_ylim(lo - pad, hi + pad)

    # Density plot (hexbin) — darker red = higher density
    hb = ax.hexbin(
        pooled['Observed'],
        pooled['Predicted'],
        gridsize=80,
        cmap='Reds',
        bins='log',
        mincnt=1
    )
    cbar = fig.colorbar(hb, ax=ax)
    cbar.set_label('log10(N points)')

    # 1:1 line in solid black
    ax.plot([lo - pad, hi + pad], [lo - pad, hi + pad], color='black', linewidth=1.5)

    ax.set_title(f'Observed vs Predicted (LOSO, SVM Top Features) — {target_col}')
    ax.set_xlabel('Observed')
    ax.set_ylabel('Predicted')
    ax.grid(True)

    # Pooled metrics, rounded to 2 decimals, in the lower-right corner
    annot = f"R² = {r2_all:.2f}\nRMSE = {rmse_all:.2f}\nMAE = {mae_all:.2f}"
    ax.text(
        0.97, 0.03, annot, transform=ax.transAxes,
        fontsize=11, va='bottom', ha='right',
        bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7)
    )

    pooled_plot_path = os.path.join(
        figures_path, f'svm_{target_col}_obs_vs_pred_all_sites_top_features.png'
    )
    fig.savefig(pooled_plot_path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print(f"  Pooled Observed vs Predicted density plot saved to: {pooled_plot_path}")


def run_loso_with_top_features(target_col):
    """
    Run Leave-One-Site-Out (LOSO) cross-validation with an RBF-kernel SVM.

    The feature list is taken from the header of the pre-selected "top
    predictors" CSV for ``target_col``. Categorical features are one-hot
    encoded once on the full table (before the CV split); standardization
    happens inside the scikit-learn pipeline, so the scaler is re-fit on each
    training fold.

    Side effects: writes the per-site metrics CSV, the pooled predictions CSV,
    one time-series figure per site, a pooled density figure, and a joblib
    file containing a final model fit on all rows.

    Args:
        target_col: Name of the target column; also used as the sub-directory
            name under the output base path.

    Returns:
        None. Returns early (after printing a message) when the top-features
        file is missing or when no site could be evaluated.
    """
    # --- 1. Load the file with the top pre-selected features ---
    base_path = "/explore/nobackup/people/spotter5/anna_v/v2/loocv"
    top_features_path = os.path.join(base_path, target_col, f'training_data_{target_col}_top_preds.csv')

    try:
        # Only the header is needed to recover the feature names.
        top_features_df = pd.read_csv(top_features_path, nrows=0)
    except FileNotFoundError:
        print(f"SKIPPING: Top features file not found for target '{target_col}'.")
        print(f"Expected at: {top_features_path}")
        print("Please run the feature selection script for this target first.\n")
        return

    # Automatically get the feature columns from the loaded file
    feature_cols = [col for col in top_features_df.columns if col != target_col]
    print(f"Using top selected features for '{target_col}': {feature_cols}")

    # --- 2. Load the main dataset and prepare it ---
    full_dataset_path = "/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_final.csv"
    df = pd.read_csv(full_dataset_path)

    # Basic data cleaning and preparation
    df['land_cover'] = df['land_cover'].astype('category')
    df['month'] = df['month'].astype('category')
    # .copy() so the column assignments below write to an independent frame
    # instead of a filtered slice (avoids SettingWithCopyWarning).
    df = df[df['flux_method'] == 'EC'].copy()
    df['tmean_C'] = df[['tmmn', 'tmmx']].mean(axis=1)
    df['date'] = pd.to_datetime(df[['year', 'month']].assign(day=1))

    # Drop rows with missing data for essential columns
    required_cols = feature_cols + [target_col, 'site_reference']
    df = df.dropna(subset=required_cols)

    # --- 3. Prepare data for modeling (including one-hot encoding) ---
    out_path = os.path.join(base_path, target_col)
    figures_path = os.path.join(out_path, "figures_svm_top_features")
    os.makedirs(figures_path, exist_ok=True)

    X_initial = df[feature_cols].copy()
    y = df[target_col]
    sites = df["site_reference"].unique()

    # One-hot encode categorical features. Done before the CV split so every
    # fold sees the same set of dummy columns.
    categorical_features = [f for f in X_initial.select_dtypes(include=['category', 'object']).columns if f in feature_cols]
    if categorical_features:
        print(f"One-hot encoding categorical features: {categorical_features}")
        X = pd.get_dummies(X_initial, columns=categorical_features, drop_first=True)
    else:
        X = X_initial

    results = []
    all_preds_df_list = []

    # --- 4. Run Leave-One-Site-Out CV with SVM ---
    for test_site in sites:
        print(f"  Processing site: {test_site}...")
        test_idx = df["site_reference"] == test_site
        train_idx = ~test_idx

        if not test_idx.any():
            continue
        if not train_idx.any():
            # Only one site in the data: nothing left to train on.
            print(f"  Skipping site {test_site}: no training rows remain when it is held out.")
            continue

        X_train, y_train = X.loc[train_idx], y.loc[train_idx]
        X_test, y_test = X.loc[test_idx], y.loc[test_idx]
        dates_test = df.loc[test_idx, "date"]

        model = _build_svm_pipeline()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        site_df = pd.DataFrame({
            "Site": test_site, "Date": dates_test.values,
            "Observed": y_test.values, "Predicted": y_pred
        })
        all_preds_df_list.append(site_df)

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        # R² is undefined for fewer than two samples; record NaN directly
        # (the value r2_score would return) without triggering its warning.
        r2 = r2_score(y_test, y_pred) if len(y_test) >= 2 else np.nan
        mae = mean_absolute_error(y_test, y_pred)
        results.append({"Site": test_site, "RMSE": rmse, "MAE": mae, "R2": r2})

    # --- 5. Aggregate, Save, and Report Results ---
    if not results:
        print(f"No data processed for target '{target_col}'.")
        return

    results_df = pd.DataFrame(results)
    all_preds_df = pd.concat(all_preds_df_list, ignore_index=True)

    results_csv_path = os.path.join(out_path, f'svm_results_{target_col}_top_features.csv')
    predictions_csv_path = os.path.join(out_path, f'svm_predictions_{target_col}_top_features.csv')
    results_df.to_csv(results_csv_path, index=False)
    all_preds_df.to_csv(predictions_csv_path, index=False)
    print(f"  Results saved to: {results_csv_path}")

    rmse_all = np.sqrt(mean_squared_error(all_preds_df["Observed"], all_preds_df["Predicted"]))
    r2_all = r2_score(all_preds_df["Observed"], all_preds_df["Predicted"])
    mae_all = mean_absolute_error(all_preds_df["Observed"], all_preds_df["Predicted"])
    print(f"\n--- SVM Pooled Metrics for {target_col.upper()} (Top Features) ---")
    print(f"Pooled R²: {r2_all:.4f}, Pooled RMSE: {rmse_all:.4f}, Pooled MAE: {mae_all:.4f}")

    mean_r2 = results_df['R2'].mean()
    median_r2 = results_df['R2'].median()
    mean_rmse = results_df['RMSE'].mean()
    median_rmse = results_df['RMSE'].median()
    mean_mae = results_df['MAE'].mean()
    median_mae = results_df['MAE'].median()

    print(f"\n--- SVM Summary Metrics Across Sites for {target_col.upper()} (Top Features) ---")
    print(f"  Mean R²:   {mean_r2:.4f}, Median R²:   {median_r2:.4f}")
    print(f"  Mean RMSE: {mean_rmse:.4f}, Median RMSE: {median_rmse:.4f}")
    print(f"  Mean MAE:  {mean_mae:.4f}, Median MAE:  {median_mae:.4f}")

    # --- 6. Per-site time series plots ---
    print("\n  Generating and saving individual site plots...")
    for site in all_preds_df["Site"].unique():
        site_df = all_preds_df[all_preds_df["Site"] == site].sort_values("Date")
        site_metrics = results_df[results_df["Site"] == site].iloc[0]
        _plot_site_timeseries(site, site_df, site_metrics, target_col, figures_path)
    print(f"  All site plots saved to: {figures_path}")

    # --- 7. Pooled Observed vs Predicted density plot (SVM, Top Features) ---
    _plot_pooled_density(all_preds_df, target_col, figures_path, r2_all, rmse_all, mae_all)

    # --- 8. Train and Save Final Model ---
    print("\n  Training and saving final model on all data...")
    models_out_path = '/explore/nobackup/people/spotter5/anna_v/v2/models_svm_top_features'
    os.makedirs(models_out_path, exist_ok=True)

    # Same pipeline configuration as the CV folds, fit on all available rows
    final_model = _build_svm_pipeline()
    final_model.fit(X, y)

    # Save the trained model to a file
    model_filename = os.path.join(models_out_path, f'svm_{target_col}_top_features.joblib')
    joblib.dump(final_model, model_filename)
    print(f"  Final model saved to: {model_filename}")


if __name__ == '__main__':
    # Targets to model in this run. Other targets previously listed here:
    # 'gpp', 'nee', 'ch4_flux_total'.
    targets_to_run = ['reco']
    banner = '=' * 20

    for target in targets_to_run:
        label = target.upper()
        print(f"\n{banner} RUNNING SVM ANALYSIS FOR: {label} {banner}")
        run_loso_with_top_features(target_col=target)
        print(f"{banner} COMPLETED SVM ANALYSIS FOR: {label} {banner}")


Using top selected features for 'reco': ['vap', 'tmean_C', 'silt_0_100cm', 'land_cover', 'sur_refl_b07', 'NDVI', 'ocd_0_100cm', 'sur_refl_b01', 'NDWI', 'bdod_0_100cm']
One-hot encoding categorical features: ['land_cover']
  Processing site: Saskatchewan - Western Boreal, Mature Aspen_CA-Oas_tower...
  Processing site: Manitoba - Northern Old Black Spruce (former BOREAS Northern Study Area)_CA-Man_tower...
  Processing site: Saskatchewan - Western Boreal, Mature Black Spruce_CA-Obs_tower...
  Processing site: Kaamanen_FI-Kaa_tower...
  Processing site: Neleger Burnt Forest_RU-NeB_tower...
  Processing site: Neleger larch forest_RU-NeF_tower...
  Processing site: Nelegel_RU-Nel_tower...
  Processing site: Central Marsh_US-Cms_tower...
  Processing site: Zotino; Central Siberia_RU-Zfw 2_tower...




  Processing site: Neleger Cutover_RU-NeC_tower...
  Processing site: Sodankyla_FI-Sod_tower...
  Processing site: Saskatchewan - Western Boreal, forest burned in 1989_CA-SF2_tower...
  Processing site: UCI-1930 burn site_CA-NS2_tower...
  Processing site: UCI-1964 burn site_CA-NS3_tower...
  Processing site: UCI-1981 burn site_CA-NS5_tower...
  Processing site: UCI-1989 burn site_CA-NS6_tower...
  Processing site: Samoylov Island_RU-Sam_tower...
  Processing site: UCI-1998 burn site_CA-NS7_tower...
  Processing site: Zotino_RU-Zot_tower...
  Processing site: UCI-1850 burn site_CA-NS1_tower...
  Processing site: Hakasia Steppe_RU-Ha1_tower...
  Processing site: Saskatchewan - Western Boreal, forest burned in 1998_CA-SF3_tower...
  Processing site: UCI-1964 burn site wet_CA-NS4_tower...
  Processing site: Kherlenbayan Ulaan_MN-Kbu_tower...
  Processing site: Southern Khentei Taiga_MN-Skt_tower...
  Processing site: University of Alaska, Fairbanks_US-Uaf_tower...
  Processing site: Atqas



  Processing site: Pond Inlet_CA-Pin_tower...




  Processing site: Poker Flat Research Range: Succession from fire scar to deciduous forest_US-Rpf_tower...
  Processing site: Samoylov Island_RU-Sam (open)_tower...
  Processing site: Elgeeii forest station_RU-Ege_tower...
  Processing site: Udleg practice forest_MN-Udg_tower...
  Processing site: Daring Lake_CA-DL3_tower...
  Processing site: Bonanza Creek Black Spruce_US-BZS_tower...
  Processing site: Daring Lake_CA-DL4_tower...
  Processing site: Tiksi_RU-Tks_tower...
  Processing site: Samoylov Island_RU-Sam (closed)_tower...
  Processing site: Bonanza Creek Thermokarst Bog_US-BZB_tower...
  Processing site: Poker Flat Research Range Black Spruce Forest_US-Prr_tower...
  Processing site: Bonanza Creek Rich Fen_US-BZF_tower...
  Processing site: Cascaden Ridge Fire Scar_US-Fcr_tower...




  Processing site: Lake Hazen, Ellesmere Island_CA-LHazen2-meadow wetland_tower...
  Processing site: Adventdalen_SJ-Adv_tower...
  Processing site: Barrow-BEO_US-Beo_tower...
  Processing site: Havikpak Creek_CA-HPC_tower...
  Processing site: Scotty Creek Landscape_CA-SCC_tower...
  Processing site: Barrow-BES_US-Bes_tower...
  Processing site: Trail Valley Creek_CA-TVC_tower...
  Processing site: Cherskii reference_RU-Ch2_tower...
  Processing site: NGEE Arctic Barrow_US-NGB_tower...
  Processing site: Varrio_FI-Var_tower...
  Processing site: Scotty Creek Bog_CA-SCB_tower...
  Processing site: Stordalen Palsa Bog_SE-Sto_tower...
  Processing site: Council, Alaska_US-KOC_tower...
  Processing site: Hustai grassland_MN-Hst_tower...
  Processing site: Nalaikh grassland_MN-Nkh_tower...
  Processing site: Bibai bog_JP-Bby_tower...
  Processing site: Wolf_creek_Buckbrush_CA-WCBB_tower...
  Processing site: Wolf_creek_SparseShrub_CA-WCPLT_tower...
  Processing site: Wolf_creek_forest_CA-W