In [3]:
#!/usr/bin/env python
# coding: utf-8

import os
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Silence FutureWarning chatter from the libraries used below.
# The in-process filter covers this interpreter; the environment variable is
# inherited by any child processes started afterwards (presumably the parallel
# workers behind n_jobs=-1 -- confirm if the warnings still show up there).
warnings.simplefilter("ignore", FutureWarning)
os.environ.update(PYTHONWARNINGS='ignore::FutureWarning')

# Default locations; callers may override them through keyword arguments.
_DEFAULT_BASE_PATH = "/explore/nobackup/people/spotter5/anna_v/v2/loocv"
_DEFAULT_DATASET_PATH = "/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_final.csv"
# Number of folds used by the inner (hyperparameter tuning) cross-validation.
_INNER_CV_SPLITS = 5


def _load_modeling_frame(full_dataset_path, feature_cols, target_col):
    """Read the main dataset and return the rows usable for modeling.

    Keeps only rows whose ``flux_method`` is ``'EC'``, derives ``tmean_C``
    (row-wise mean of ``tmmn`` and ``tmmx``) and ``date`` (first day of the
    row's year/month), then drops rows with a missing value in any feature,
    the target, or ``site_reference``.

    Raises:
        KeyError: if a required column is absent from the dataset.
    """
    df = pd.read_csv(full_dataset_path)

    df['land_cover'] = df['land_cover'].astype('category')
    df['month'] = df['month'].astype('category')
    # .copy() so the column assignments below write to an independent frame
    # instead of a slice of the unfiltered data.
    df = df[df['flux_method'] == 'EC'].copy()
    df['tmean_C'] = df[['tmmn', 'tmmx']].mean(axis=1)
    df['date'] = pd.to_datetime(df[['year', 'month']].assign(day=1))

    required_cols = feature_cols + [target_col, 'site_reference']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(
            f"Columns missing from '{full_dataset_path}': {missing_cols}"
        )
    return df.dropna(subset=required_cols)


def _encode_features(df, feature_cols):
    """Return the feature matrix, one-hot encoding any categorical/object columns."""
    X_initial = df[feature_cols].copy()
    categorical_features = [
        f for f in X_initial.select_dtypes(include=['category', 'object']).columns
        if f in feature_cols
    ]
    if not categorical_features:
        return X_initial
    print(f"One-hot encoding categorical features: {categorical_features}")
    return pd.get_dummies(X_initial, columns=categorical_features, drop_first=True)


def _score_predictions(y_true, y_pred):
    """Return ``(rmse, mae, r2)``; R2 is NaN when fewer than two samples exist."""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    # R2 is undefined for a single observation; report NaN without going
    # through the scorer (which would emit a warning for the same result).
    r2 = r2_score(y_true, y_pred) if len(y_true) > 1 else np.nan
    return rmse, mae, r2


def run_loso_with_top_features(target_col, base_path=_DEFAULT_BASE_PATH,
                               full_dataset_path=_DEFAULT_DATASET_PATH,
                               group_inner_cv=False):
    """
    Performs Leave-One-Site-Out (LOSO) modeling using a Radial Support Vector Machine (SVM)
    with nested cross-validation for hyperparameter tuning.

    For every site, an RBF ``SVR`` (behind a ``StandardScaler``) is tuned with
    ``GridSearchCV`` on all other sites and evaluated on the held-out site.
    Per-site metrics and all predictions are written as CSV files under
    ``<base_path>/<target_col>``, and pooled metrics are printed.

    Args:
        target_col: Name of the target column; also the name of the
            sub-directory of ``base_path`` holding the top-features file.
        base_path: Directory containing one sub-directory per target.
        full_dataset_path: CSV file with the full training dataset.
        group_inner_cv: When False (default) the inner folds are a shuffled
            5-fold split over rows, as before. When True the inner folds are
            split by ``site_reference`` (``GroupKFold``), so rows from one
            site never appear on both sides of an inner split.

    Returns:
        None. Returns early (after printing a message) when the top-features
        file does not exist or when no site could be processed.

    Raises:
        KeyError: if the dataset lacks a required column.
    """
    # --- 1. Load the file with the top pre-selected features ---
    top_features_path = os.path.join(
        base_path, target_col, f'training_data_{target_col}_top_preds.csv'
    )
    try:
        top_features_df = pd.read_csv(top_features_path)
    except FileNotFoundError:
        print(f"SKIPPING: Top features file not found for target '{target_col}'.")
        return

    feature_cols = [col for col in top_features_df.columns if col != target_col]
    print(f"Using top selected features for '{target_col}': {feature_cols}")

    # --- 2. Load the main dataset and prepare it ---
    df = _load_modeling_frame(full_dataset_path, feature_cols, target_col)

    # --- 3. Prepare data for modeling (including one-hot encoding) ---
    out_path = os.path.join(base_path, target_col)
    figures_path = os.path.join(out_path, "figures_svm_tuned_top_features")
    os.makedirs(figures_path, exist_ok=True)

    X = _encode_features(df, feature_cols)
    y = df[target_col]
    site_labels = df["site_reference"]

    if group_inner_cv:
        # Imported here because the grouped splitter is only needed on request.
        from sklearn.model_selection import GroupKFold

    # The 'svr__' prefix routes each parameter to the SVR step of the pipeline.
    param_grid = {
        'svr__C': [10, 100, 500],
        'svr__gamma': ['scale', 0.1, 1]
    }

    results = []
    all_preds_df_list = []

    # --- 4. Run Nested CV: Outer loop is LOSO, Inner loop is GridSearchCV ---
    for test_site in site_labels.unique():
        print(f"  Processing site: {test_site}...")
        train_idx = site_labels != test_site
        test_idx = site_labels == test_site

        X_train, y_train = X.loc[train_idx], y.loc[train_idx]
        X_test, y_test = X.loc[test_idx], y.loc[test_idx]
        dates_test = df.loc[test_idx, "date"]

        # --- A. Choose the inner CV strategy for this outer fold ---
        fit_kwargs = {}
        if group_inner_cv:
            train_sites = site_labels.loc[train_idx]
            n_splits = min(_INNER_CV_SPLITS, train_sites.nunique())
            if n_splits < 2:
                print("    Skipping: fewer than 2 training sites for grouped inner CV.")
                continue
            inner_cv = GroupKFold(n_splits=n_splits)
            fit_kwargs["groups"] = train_sites
        else:
            if len(X_train) < _INNER_CV_SPLITS:
                print(f"    Skipping: fewer than {_INNER_CV_SPLITS} training rows for inner CV.")
                continue
            inner_cv = KFold(n_splits=_INNER_CV_SPLITS, shuffle=True, random_state=42)

        # --- B. Tune the pipeline on the outer loop's training data ---
        grid_search = GridSearchCV(
            estimator=make_pipeline(StandardScaler(), SVR(kernel='rbf')),
            param_grid=param_grid,
            cv=inner_cv,
            scoring='neg_root_mean_squared_error',
            n_jobs=-1  # Use all available CPU cores
        )
        grid_search.fit(X_train, y_train, **fit_kwargs)

        # --- C. Evaluate on the outer test set ---
        best_params = grid_search.best_params_
        print(f"    Best params for this fold: {best_params}")
        y_pred = grid_search.best_estimator_.predict(X_test)

        all_preds_df_list.append(pd.DataFrame({
            "Site": test_site, "Date": dates_test.values,
            "Observed": y_test.values, "Predicted": y_pred
        }))

        rmse, mae, r2 = _score_predictions(y_test, y_pred)
        # Keep the winning hyperparameters next to the fold's metrics.
        results.append({
            "Site": test_site, "RMSE": rmse, "MAE": mae, "R2": r2,
            "Best_C": best_params['svr__C'],
            "Best_Gamma": best_params['svr__gamma']
        })

    # --- 5. Aggregate, Save, and Report Results ---
    if not results:
        print(f"No data processed for target '{target_col}'.")
        return

    results_df = pd.DataFrame(results)
    all_preds_df = pd.concat(all_preds_df_list, ignore_index=True)

    results_csv_path = os.path.join(out_path, f'svm_tuned_results_{target_col}_top_features.csv')
    predictions_csv_path = os.path.join(out_path, f'svm_tuned_predictions_{target_col}_top_features.csv')
    results_df.to_csv(results_csv_path, index=False)
    all_preds_df.to_csv(predictions_csv_path, index=False)
    print(f"  Tuned results saved to: {results_csv_path}")

    observed = all_preds_df["Observed"]
    predicted = all_preds_df["Predicted"]
    rmse_all = np.sqrt(mean_squared_error(observed, predicted))
    r2_all = r2_score(observed, predicted)
    mae_all = mean_absolute_error(observed, predicted)
    print(f"\n--- Tuned SVM Pooled Metrics for {target_col.upper()} (Top Features) ---")
    print(f"Pooled R²: {r2_all:.4f}, Pooled RMSE: {rmse_all:.4f}, Pooled MAE: {mae_all:.4f}")

    # (Plotting code remains the same as before)
    # ...

if __name__ == '__main__':
    # Targets this script has been set up for; kept only as a reference list.
    supported_targets = ('gpp', 'nee', 'reco', 'ch4_flux_total')
    # Targets actually processed by this run.
    targets_to_run = ['nee']

    # Run the tuned-SVM LOSO analysis once per target, bracketed by banners.
    for target in targets_to_run:
        print(f"\n{'='*20} RUNNING TUNED SVM ANALYSIS FOR: {target.upper()} {'='*20}")
        run_loso_with_top_features(target_col=target)
        print(f"{'='*20} COMPLETED TUNED SVM ANALYSIS FOR: {target.upper()} {'='*20}")


Using top selected features for 'nee': ['tmean_C', 'srad', 'month', 'lai', 'cfvo_0_100cm', 'ocd_0_100cm']
One-hot encoding categorical features: ['month']
  Processing site: Central Marsh_US-Cms_tower...
    Best params for this fold: {'svr__C': 100, 'svr__gamma': 1}
  Processing site: Hakasia 5yr_RU-Ha2_tower...
    Best params for this fold: {'svr__C': 100, 'svr__gamma': 1}
  Processing site: Hakasia Steppe_RU-Ha1_tower...
    Best params for this fold: {'svr__C': 100, 'svr__gamma': 1}
  Processing site: Kaamanen_FI-Kaa_tower...
    Best params for this fold: {'svr__C': 100, 'svr__gamma': 1}
  Processing site: Manitoba - Northern Old Black Spruce (former BOREAS Northern Study Area)_CA-Man_tower...
    Best params for this fold: {'svr__C': 100, 'svr__gamma': 1}
  Processing site: Nelegel_RU-Nel_tower...
    Best params for this fold: {'svr__C': 100, 'svr__gamma': 1}
  Processing site: Neleger Cutover_RU-NeC_tower...
    Best params for this fold: {'svr__C': 100, 'svr__gamma': 1}
  Pr



    Best params for this fold: {'svr__C': 100, 'svr__gamma': 1}
  Processing site: Poker Flat Research Range: Succession from fire scar to deciduous forest_US-Rpf_tower...




    Best params for this fold: {'svr__C': 100, 'svr__gamma': 1}
  Processing site: Samoylov Island_RU-Sam (open)_tower...
    Best params for this fold: {'svr__C': 100, 'svr__gamma': 1}
  Processing site: Udleg practice forest_MN-Udg_tower...
    Best params for this fold: {'svr__C': 100, 'svr__gamma': 1}
  Processing site: Daring Lake_CA-DL3_tower...
    Best params for this fold: {'svr__C': 100, 'svr__gamma': 1}
  Processing site: Elgeeii forest station_RU-Ege_tower...
    Best params for this fold: {'svr__C': 100, 'svr__gamma': 1}
  Processing site: Bonanza Creek Black Spruce_US-BZS_tower...
    Best params for this fold: {'svr__C': 100, 'svr__gamma': 1}
  Processing site: Daring Lake_CA-DL4_tower...
    Best params for this fold: {'svr__C': 100, 'svr__gamma': 1}
  Processing site: Tiksi_RU-Tks_tower...
    Best params for this fold: {'svr__C': 100, 'svr__gamma': 1}
  Processing site: Samoylov Island_RU-Sam (closed)_tower...
    Best params for this fold: {'svr__C': 100, 'svr__gamma



In [2]:
't'

't'