LOOSO NEE

In [7]:
import os
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from scipy import stats
from scipy.special import inv_boxcox

# Suppress warnings
os.environ['PYTHONWARNINGS'] = 'ignore::FutureWarning'
warnings.filterwarnings("ignore", category=FutureWarning)
# warnings.filterwarnings("ignore", category=pd.core.common.SettingWithCopyWarning)


# 1. Load your dataset
file_path = "/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_alt_soil_lc_co2.csv"
df = pd.read_csv(file_path)

df = df[df['flux_method'] == 'EC']

# 2. Create tmean_C and date
df['tmean_C'] = df[['tmmn', 'tmmx']].mean(axis=1)
df['date'] = pd.to_datetime(df[['year', 'month']].assign(day=1))

# 3. Define predictors and target
feature_cols = [
    'EVI', 'NDVI', 'sur_refl_b01', 'sur_refl_b02', 'sur_refl_b03', 
    'sur_refl_b07', 'NDWI', 'pdsi', 'srad', 'tmean_C', 'vap', 'vs',
    'bdod_0_100cm', 'cec_0_100cm', 'cfvo_0_100cm', 'clay_0_100cm',
    'nitrogen_0_100cm', 'ocd_0_100cm', 'phh2o_0_100cm', 'sand_0_100cm',
    'silt_0_100cm', 'soc_0_100cm', 'co2_cont', 'ALT',
    'land_cover', 'month' # Categorical features are now included directly
]
target_col = 'nee'
categorical_features = ['land_cover', 'month']

# Drop rows only if the target variable or site_reference is missing.
df = df.dropna(subset=['site_reference', target_col])

# Define output path for CSVs and create it
out_path = os.path.join("/explore/nobackup/people/spotter5/anna_v/v2/loocv", target_col)
os.makedirs(out_path, exist_ok=True)

# Define a separate output path for figures and create it
figures_path = os.path.join(out_path, "figures")
os.makedirs(figures_path, exist_ok=True)

# Prepare features (X) and target (y)
X = df[feature_cols]
y = df[target_col]
sites = df["site_reference"].unique()

# Convert categorical features to 'category' dtype for CatBoost
for col in categorical_features:
    X[col] = X[col].astype('category')

results = []
all_preds_df_list = []

# Leave-One-Site-Out CV
for test_site in sites:
    print(f"Processing site: {test_site}...")
    train_idx = df["site_reference"] != test_site
    test_idx = df["site_reference"] == test_site

    if test_idx.sum() < 1:
        continue

    X_train, y_train = X.loc[train_idx], y.loc[train_idx]
    X_test, y_test = X.loc[test_idx], y.loc[test_idx]
    dates_test = df.loc[test_idx, "date"]

    # --- NEW: Box-Cox Transformation ---
    # Box-Cox requires all data to be positive. We shift the data if necessary.
    shift_value = 0
    min_y_train = y_train.min()
    if min_y_train <= 0:
        # Add a small constant (1) to ensure all values are > 0
        shift_value = -min_y_train + 1
        y_train_shifted = y_train + shift_value
    else:
        y_train_shifted = y_train

    # Apply the Box-Cox transformation and find the optimal lambda
    y_train_transformed, lmbda = stats.boxcox(y_train_shifted)
    print(f"  Applied Box-Cox with lambda={lmbda:.4f} and shift={shift_value:.4f}")
    
    # Initialize and train the CatBoost model on the TRANSFORMED data
    model = CatBoostRegressor(
        iterations=1200, #1200, 8 is best
        learning_rate=0.01,
        depth=12,
        # subsample=0.7,
        l2_leaf_reg=0.1,
        random_state=42,
        cat_features=categorical_features,
        verbose=0, # Suppress verbose output
        allow_writing_files=False
    )
    # Fit the model on the transformed target variable
    model.fit(X_train, y_train_transformed)
    
    # Predict in the transformed space
    y_pred_transformed = model.predict(X_test)

    # --- NEW: Inverse Transformation ---
    # Reverse the Box-Cox transformation to get predictions back to the original scale
    if lmbda == 0:
        # Special case for lambda = 0, which corresponds to a log transform
        y_pred_inv = np.exp(y_pred_transformed)
    else:
        y_pred_inv = inv_boxcox(y_pred_transformed, lmbda)
    
    # Reverse the initial shift
    y_pred_orig_scale = y_pred_inv - shift_value
    
    # Use the correctly scaled prediction for metrics and plotting
    y_pred = y_pred_orig_scale
    # --- End of Transformation Logic ---

    site_df = pd.DataFrame({
        "Site": test_site,
        "Date": dates_test.values,
        "Observed": y_test.values,
        "Predicted": y_pred
    })
    all_preds_df_list.append(site_df)

    # Metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    results.append({
        "Site": test_site,
        "RMSE": rmse,
        "MAE": mae,
        "R2": r2
    })

# Combine results
results_df = pd.DataFrame(results)
all_preds_df = pd.concat(all_preds_df_list, ignore_index=True)

# Save to disk with '_cat' suffix
results_csv_path = os.path.join(out_path, f'catboost_results_{target_col}_cat.csv')
predictions_csv_path = os.path.join(out_path, f'catboost_predictions_{target_col}_cat.csv')
results_df.to_csv(results_csv_path, index=False)
all_preds_df.to_csv(predictions_csv_path, index=False)
print(f"\nResults saved to: {results_csv_path}")
print(f"Predictions saved to: {predictions_csv_path}")


# Pooled metrics
rmse_all = np.sqrt(mean_squared_error(all_preds_df["Observed"], all_preds_df["Predicted"]))
r2_all = r2_score(all_preds_df["Observed"], all_preds_df["Predicted"])
mae_all = mean_absolute_error(all_preds_df["Observed"], all_preds_df["Predicted"])

print("\n--- Site-Specific Results ---")
print(results_df)
print("\n--- Pooled Metrics ---")
print(f"Pooled RMSE: {rmse_all:.4f}")
print(f"Pooled MAE:  {mae_all:.4f}")
print(f"Pooled R²:   {r2_all:.4f}")

# Median metrics across sites
if not results_df.empty:
    median_rmse = results_df["RMSE"].median()
    median_mae = results_df["MAE"].median()
    median_r2 = results_df["R2"].median()

    print("\n--- Median Metrics Across Sites ---")
    print(f"Median RMSE: {median_rmse:.4f}")
    print(f"Median MAE:  {median_mae:.4f}")
    print(f"Median R²:   {median_r2:.4f}")

# --- Plotting ---
unique_sites = all_preds_df["Site"].unique()
if not unique_sites.any():
    print("\nNo sites to plot.")
else:
    print("\nGenerating and saving individual site plots...")
    for site in unique_sites:
        fig, ax = plt.subplots(figsize=(12, 7))
        
        # Check if site exists in results_df before trying to access it
        if site in results_df["Site"].values:
            site_df_plot = all_preds_df[all_preds_df["Site"] == site].sort_values("Date")
            site_metrics = results_df[results_df["Site"] == site].iloc[0]
            rmse_val = round(site_metrics["RMSE"], 2)
            r2_val = round(site_metrics["R2"], 2)
            mae_val = round(site_metrics["MAE"], 2)

            ax.plot(site_df_plot["Date"], site_df_plot["Observed"], label="Observed", marker="o", linestyle='-', markersize=4)
            ax.plot(site_df_plot["Date"], site_df_plot["Predicted"], label="Predicted", marker="x", linestyle='--', markersize=4)
            ax.set_title(f"Observed vs. Predicted {target_col} for Site: {site} (with Box-Cox)")
            ax.set_xlabel("Date")
            ax.set_ylabel(target_col)
            ax.legend()
            ax.grid(True)
            fig.autofmt_xdate()

            textstr = f"RMSE: {rmse_val}\nMAE: {mae_val}\n$R^2$: {r2_val}"
            ax.text(
                0.97, 0.03, textstr,
                transform=ax.transAxes,
                fontsize=10,
                verticalalignment='bottom',
                horizontalalignment='right',
                bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7)
            )
            
            plot_filename = f'catboost_{target_col}_{site}_timeseries_cat.png'
            plot_path = os.path.join(figures_path, plot_filename)
            plt.savefig(plot_path, dpi=300, bbox_inches='tight')
        
        plt.close(fig) # Close the plot to free up memory
        
    print(f"All site plots saved to: {figures_path}")

out_path_model = '/explore/nobackup/people/spotter5/anna_v/v2/models'
os.makedirs(out_path_model, exist_ok = True)
model_filename = f'{target_col}.json'
model.save_model(os.path.join(out_path_model, model_filename))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype('category')


Processing site: Fyodorovskoye_RU-Fyo_tower...
  Applied Box-Cox with lambda=2.6752 and shift=218.1562
Processing site: Saskatchewan - Western Boreal, Mature Aspen_CA-Oas_tower...
  Applied Box-Cox with lambda=2.6106 and shift=218.1562
Processing site: Saskatchewan - Western Boreal, Mature Jack Pine_CA-Ojp_tower...
  Applied Box-Cox with lambda=2.6434 and shift=218.1562
Processing site: Flakaliden_SE-Fla_tower...
  Applied Box-Cox with lambda=2.4694 and shift=198.5322
Processing site: Hyytiala_FI-Hyy_tower...
  Applied Box-Cox with lambda=2.5877 and shift=218.1562
Processing site: Manitoba - Northern Old Black Spruce (former BOREAS Northern Study Area)_CA-Man_tower...
  Applied Box-Cox with lambda=2.6311 and shift=218.1562
Processing site: Saskatchewan - Western Boreal, Mature Black Spruce_CA-Obs_tower...
  Applied Box-Cox with lambda=2.6327 and shift=218.1562
Processing site: Kaamanen_FI-Kaa_tower...
  Applied Box-Cox with lambda=2.6301 and shift=218.1562
Processing site: Nelegel_RU-N



Processing site: Zotino; Central Siberia_RU-Zfw 1_tower...
  Applied Box-Cox with lambda=2.6379 and shift=218.1562
Processing site: Gunnarsholt_IS-Gun_tower...
  Applied Box-Cox with lambda=2.6344 and shift=218.1562
Processing site: Happy Valley Wet Sedge Tundra_US-HVs_tower...
  Applied Box-Cox with lambda=2.6383 and shift=218.1562
Processing site: Happy Valley_US-HVa_tower...
  Applied Box-Cox with lambda=2.6386 and shift=218.1562
Processing site: Sag River_US-Sag_tower...
  Applied Box-Cox with lambda=2.6386 and shift=218.1562

Results saved to: /explore/nobackup/people/spotter5/anna_v/v2/loocv/nee/catboost_results_nee_cat.csv
Predictions saved to: /explore/nobackup/people/spotter5/anna_v/v2/loocv/nee/catboost_predictions_nee_cat.csv

--- Site-Specific Results ---
                                                  Site       RMSE        MAE  \
0                           Fyodorovskoye_RU-Fyo_tower  28.513420  20.163252   
1    Saskatchewan - Western Boreal, Mature Aspen_CA...  55.743

In [4]:
import os
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from scipy import stats
from scipy.special import inv_boxcox

# Suppress warnings
os.environ['PYTHONWARNINGS'] = 'ignore::FutureWarning'
warnings.filterwarnings("ignore", category=FutureWarning)
# warnings.filterwarnings("ignore", category=pd.core.common.SettingWithCopyWarning)

# 1. Load your dataset
file_path = "/explore/nobackup/people/spotter5/anna_v/v2/v2_model_training_alt_soil_lc_co2.csv"
df = pd.read_csv(file_path)

df = df[df['flux_method'] == 'EC']

# 2. Create tmean_C and date
df['tmean_C'] = df[['tmmn', 'tmmx']].mean(axis=1)
df['date'] = pd.to_datetime(df[['year', 'month']].assign(day=1))

# 3. Define predictors and target
feature_cols = [
    'EVI', 'NDVI', 'sur_refl_b01', 'sur_refl_b02', 'sur_refl_b03', 
    'sur_refl_b07', 'NDWI', 'pdsi', 'srad', 'tmean_C', 'vap', 'vs',
    'bdod_0_100cm', 'cec_0_100cm', 'cfvo_0_100cm', 'clay_0_100cm',
    'nitrogen_0_100cm', 'ocd_0_100cm', 'phh2o_0_100cm', 'sand_0_100cm',
    'silt_0_100cm', 'soc_0_100cm', 'co2_cont', 'ALT',
    'land_cover', 'month' # Original categorical features
]
target_col = 'nee'
categorical_features = ['land_cover', 'month']

# Feature Engineering Section
print("Performing feature engineering...")
df['month_sin'] = np.sin(2 * np.pi * df['month']/12)
df['month_cos'] = np.cos(2 * np.pi * df['month']/12)
df['EVI_x_srad'] = df['EVI'] * df['srad']
df['NDVI_x_tmean'] = df['NDVI'] * df['tmean_C']
feature_cols.extend(['month_sin', 'month_cos', 'EVI_x_srad', 'NDVI_x_tmean'])
feature_cols.remove('month')
categorical_features.remove('month')

# Drop rows only if the target variable or site_reference is missing.
df = df.dropna(subset=['site_reference', target_col])

# Check if target variable needs shifting for Box-Cox
if df[target_col].min() <= 0:
    print("Target variable contains non-positive values. A shift will be applied for Box-Cox transformation.")

# Define output path for CSVs and create it
out_path = os.path.join("/explore/nobackup/people/spotter5/anna_v/v2/loocv", target_col + "_improved_boxcox")
os.makedirs(out_path, exist_ok=True)
figures_path = os.path.join(out_path, "figures")
os.makedirs(figures_path, exist_ok=True)

# Prepare features (X) and target (y)
X = df[feature_cols]
y = df[target_col]
sites = df["site_reference"].unique()

for col in categorical_features:
    X[col] = X[col].astype('category')

results = []
all_preds_df_list = []

# Leave-One-Site-Out CV
for test_site in sites:
    print(f"Processing site: {test_site}...")
    train_idx = df["site_reference"] != test_site
    test_idx = df["site_reference"] == test_site

    if test_idx.sum() < 20:
        print(f"  Skipping site {test_site} due to insufficient data ({test_idx.sum()} points).")
        continue

    X_train_full, y_train_full = X.loc[train_idx], y.loc[train_idx]
    X_test, y_test = X.loc[test_idx], y.loc[test_idx]
    dates_test = df.loc[test_idx, "date"]

    # --- NEW: Box-Cox Transformation of the target variable ---
    shift_value = 0
    min_y_train = y_train_full.min()
    if min_y_train <= 0:
        shift_value = -min_y_train + 1 # Add 1 to ensure all values are > 0
        y_train_shifted = y_train_full + shift_value
    else:
        y_train_shifted = y_train_full

    # Apply Box-Cox transformation on the shifted training data to find the best lambda
    y_train_transformed, lmbda = stats.boxcox(y_train_shifted)
    y_train_transformed = pd.Series(y_train_transformed, index=y_train_full.index)
    print(f"  Applied Box-Cox with lambda={lmbda:.4f} and shift={shift_value:.4f}")

    # The model will now be trained on this transformed data.
    # We split the transformed data for training and early stopping.
    X_train, X_eval, y_train_trans, y_eval_trans = train_test_split(
        X_train_full, y_train_transformed, test_size=0.15, random_state=42
    )

    model = CatBoostRegressor(
        learning_rate=0.05,
        iterations=3000,
        depth=8,
        l2_leaf_reg=1,
        random_state=42,
        cat_features=categorical_features,
        verbose=0,
        allow_writing_files=False,
        early_stopping_rounds=50
    )
    
    # Train the model on the TRANSFORMED data
    model.fit(
        X_train, y_train_trans,
        eval_set=(X_eval, y_eval_trans),
        use_best_model=True 
    )
    
    print(f"  Trained for {model.get_best_iteration()} iterations.")
    # Predict in the transformed space
    y_pred_transformed = model.predict(X_test)

    # --- NEW: Inverse Transformation to get predictions back to the original scale ---
    if lmbda == 0:
        # Handle lambda = 0 case (logarithmic transformation)
        y_pred_inv = np.exp(y_pred_transformed)
    else:
        y_pred_inv = inv_boxcox(y_pred_transformed, lmbda)
    
    # Reverse the initial shift
    y_pred_orig_scale = y_pred_inv - shift_value
    
    # Use this final, original-scale prediction for evaluation
    y_pred = y_pred_orig_scale

    # --- The rest of the script now uses the correctly scaled predictions ---
    site_df = pd.DataFrame({
        "Site": test_site,
        "Date": dates_test.values,
        "Observed": y_test.values,
        "Predicted": y_pred
    })
    all_preds_df_list.append(site_df)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    results.append({"Site": test_site, "RMSE": rmse, "MAE": mae, "R2": r2})

# Combine and save results
results_df = pd.DataFrame(results)
all_preds_df = pd.concat(all_preds_df_list, ignore_index=True)
results_csv_path = os.path.join(out_path, f'catboost_results_{target_col}_cat.csv')
predictions_csv_path = os.path.join(out_path, f'catboost_predictions_{target_col}_cat.csv')
results_df.to_csv(results_csv_path, index=False)
all_preds_df.to_csv(predictions_csv_path, index=False)
print(f"\nResults saved to: {results_csv_path}")
print(f"Predictions saved to: {predictions_csv_path}")

# Pooled and Median Metrics Calculation
rmse_all = np.sqrt(mean_squared_error(all_preds_df["Observed"], all_preds_df["Predicted"]))
r2_all = r2_score(all_preds_df["Observed"], all_preds_df["Predicted"])
mae_all = mean_absolute_error(all_preds_df["Observed"], all_preds_df["Predicted"])

print("\n--- Site-Specific Results ---")
print(results_df)
print("\n--- Pooled Metrics ---")
print(f"Pooled RMSE: {rmse_all:.4f}")
print(f"Pooled MAE:  {mae_all:.4f}")
print(f"Pooled R²:   {r2_all:.4f}")

if not results_df.empty:
    median_rmse = results_df["RMSE"].median()
    median_mae = results_df["MAE"].median()
    median_r2 = results_df["R2"].median()
    print("\n--- Median Metrics Across Sites ---")
    print(f"Median RMSE: {median_rmse:.4f}")
    print(f"Median MAE:  {median_mae:.4f}")
    print(f"Median R²:   {median_r2:.4f}")

# Plotting
unique_sites = all_preds_df["Site"].unique()
if not unique_sites.any():
    print("\nNo sites to plot.")
else:
    print("\nGenerating and saving individual site plots...")
    for site in unique_sites:
        fig, ax = plt.subplots(figsize=(12, 7))
        site_df_plot = all_preds_df[all_preds_df["Site"] == site].sort_values("Date")
        site_metrics = results_df[results_df["Site"] == site].iloc[0]
        rmse_val, r2_val, mae_val = site_metrics["RMSE"], site_metrics["R2"], site_metrics["MAE"]
        ax.plot(site_df_plot["Date"], site_df_plot["Observed"], label="Observed", marker="o", linestyle='-', markersize=4)
        ax.plot(site_df_plot["Date"], site_df_plot["Predicted"], label="Predicted", marker="x", linestyle='--', markersize=4)
        ax.set_title(f"Observed vs. Predicted {target_col} for Site: {site} (with Box-Cox)")
        ax.set_xlabel("Date")
        ax.set_ylabel(target_col)
        ax.legend()
        ax.grid(True)
        fig.autofmt_xdate()
        textstr = f"RMSE: {rmse_val:.2f}\nMAE: {mae_val:.2f}\n$R^2$: {r2_val:.2f}"
        ax.text(0.97, 0.03, textstr, transform=ax.transAxes, fontsize=10, verticalalignment='bottom', horizontalalignment='right', bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7))
        plot_filename = f'catboost_{target_col}_{site}_timeseries_cat.png'
        plot_path = os.path.join(figures_path, plot_filename)
        plt.savefig(plot_path, dpi=300, bbox_inches='tight')
        plt.close(fig)
    print(f"All site plots saved to: {figures_path}")

# Save the final model
out_path_model = '/explore/nobackup/people/spotter5/anna_v/v2/models'
os.makedirs(out_path_model, exist_ok = True)
model_filename = f'{target_col}_improved_boxcox.json'
# model.save_model(os.path.join(out_path_model, model_filename))
print(f"\nFinal model from last fold saved to: {os.path.join(out_path_model, model_filename)}")

Performing feature engineering...
Target variable contains non-positive values. A shift will be applied for Box-Cox transformation.
Processing site: Fyodorovskoye_RU-Fyo_tower...
  Applied Box-Cox with lambda=2.6752 and shift=218.1562


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype('category')


  Trained for 1015 iterations.
Processing site: Saskatchewan - Western Boreal, Mature Aspen_CA-Oas_tower...
  Applied Box-Cox with lambda=2.6106 and shift=218.1562
  Trained for 1477 iterations.
Processing site: Saskatchewan - Western Boreal, Mature Jack Pine_CA-Ojp_tower...
  Applied Box-Cox with lambda=2.6434 and shift=218.1562
  Trained for 1340 iterations.
Processing site: Flakaliden_SE-Fla_tower...
  Applied Box-Cox with lambda=2.4694 and shift=198.5322
  Trained for 1013 iterations.
Processing site: Hyytiala_FI-Hyy_tower...
  Applied Box-Cox with lambda=2.5877 and shift=218.1562
  Trained for 781 iterations.
Processing site: Manitoba - Northern Old Black Spruce (former BOREAS Northern Study Area)_CA-Man_tower...
  Applied Box-Cox with lambda=2.6311 and shift=218.1562
  Trained for 1093 iterations.
Processing site: Saskatchewan - Western Boreal, Mature Black Spruce_CA-Obs_tower...
  Applied Box-Cox with lambda=2.6327 and shift=218.1562
  Trained for 1065 iterations.
Processing sit