In [3]:

import pandas as pd
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from scipy.stats import pearsonr
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor

# Optional dependency: PyCaret supplies the AutoML comparison step.
# `has_pycaret` records whether the import worked so later code can skip it.
try:
    from pycaret.regression import setup, compare_models, predict_model
except ImportError:
    has_pycaret = False
    print("‚ö†Ô∏è PyCaret not found. AutoML will be skipped.")
else:
    has_pycaret = True

# ------------------ 1. Load Data ------------------ #
def load_data(met_path="US-UMB_2000-2014_FLUXNET2015_Met.nc",
              flux_path="US-UMB_2000-2014_FLUXNET2015_Flux.nc"):
    """Load the met and flux NetCDF files and build the modelling table.

    The two datasets are converted to data frames, joined on ``time`` with
    ``pd.merge_asof``, reduced to the predictor and target columns, and
    extended with two interaction terms and two lag-1 predictors.

    Args:
        met_path: Path to the meteorological NetCDF file.
        flux_path: Path to the flux NetCDF file.

    Returns:
        A ``(df, features)`` tuple. ``df`` is indexed by ``time`` (sorted) and
        has no missing values; ``features`` lists the predictor column names
        (raw predictors followed by the derived ones).
    """
    # Context managers release the NetCDF file handles as soon as the data
    # has been materialised into pandas.
    with xr.open_dataset(met_path) as met_ds:
        met_df = met_ds.to_dataframe().reset_index()
    with xr.open_dataset(flux_path) as flux_ds:
        flux_df = flux_ds.to_dataframe().reset_index()
    df = pd.merge_asof(met_df.sort_values('time'), flux_df.sort_values('time'), on='time')

    features_raw = ['SWdown', 'LWdown', 'Tair', 'Qair', 'RH', 'Psurf', 'Wind',
                    'CO2air', 'VPD', 'LAI', 'Ustar']
    target_vars = ['GPP', 'NEE']
    df = df[['time'] + features_raw + target_vars]

    # Take the lags on the full merged record, BEFORE incomplete rows are
    # dropped. Shifting after dropna() would pair a row with whatever earlier
    # row happened to survive, which can be far more than one step back.
    # NOTE(review): assumes one row per timestamp on a regular grid - confirm.
    swdown_lag1 = df['SWdown'].shift(1)
    tair_lag1 = df['Tair'].shift(1)

    df = df.dropna()

    # Derived features
    df['SW_LAI'] = df['SWdown'] * df['LAI']
    df['RH_Tair'] = df['RH'] * df['Tair']
    # Series assignment aligns on the row index, so each surviving row gets
    # the lag that was computed for it on the full record.
    df['SWdown_lag1'] = swdown_lag1
    df['Tair_lag1'] = tair_lag1
    # Drops rows whose previous record was missing (including the first row).
    df = df.dropna()

    df = df.set_index('time').sort_index()
    features = features_raw + ['SW_LAI', 'RH_Tair', 'SWdown_lag1', 'Tair_lag1']
    return df, features

# ------------------ 2. Evaluation ------------------ #
def evaluate(y_true, y_pred):
    """Score predictions against observations.

    Args:
        y_true: Observed values.
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        Tuple ``(r2, rmse, mae, rho)``: coefficient of determination, root
        mean squared error, mean absolute error and Pearson correlation.
    """
    # pearsonr returns (statistic, p-value); only the statistic is reported.
    correlation = pearsonr(y_true, y_pred)[0]
    squared_error = mean_squared_error(y_true, y_pred)
    return (
        r2_score(y_true, y_pred),
        np.sqrt(squared_error),
        mean_absolute_error(y_true, y_pred),
        correlation,
    )

# ------------------ 3. Model Runners ------------------ #
def run_xgb(X_train, y_train, X_test):
    """Fit an XGBoost regressor with fixed hyperparameters and predict.

    Args:
        X_train: Training predictors.
        y_train: Training target.
        X_test: Predictors to generate predictions for.

    Returns:
        Predictions for ``X_test``.
    """
    hyperparams = {
        'objective': 'reg:squarederror',
        'n_estimators': 100,
        'max_depth': 3,
        'learning_rate': 0.05,
        'colsample_bytree': 0.6,
        'subsample': 0.8,
        'reg_alpha': 0.5,
        'reg_lambda': 0.5,
    }
    regressor = xgb.XGBRegressor(**hyperparams)
    regressor.fit(X_train, y_train)
    return regressor.predict(X_test)

def run_rf(X_train, y_train, X_test):
    """Fit a random forest regressor with fixed hyperparameters and predict.

    Args:
        X_train: Training predictors.
        y_train: Training target.
        X_test: Predictors to generate predictions for.

    Returns:
        Predictions for ``X_test``.
    """
    forest = RandomForestRegressor(
        n_estimators=200,
        max_depth=None,
        max_features='sqrt',
        min_samples_split=2,
        min_samples_leaf=4,
        random_state=42,  # fixed seed
    )
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)
    return predictions

def run_automl(train_df, test_df, features, target):
    """Pick the best PyCaret model by R2 on the training data and predict.

    Args:
        train_df: Training frame containing ``features`` and ``target``.
        test_df: Frame containing ``features`` to predict on.
        features: Predictor column names.
        target: Name of the target column.

    Returns:
        Array of predictions for ``test_df``, or ``None`` when PyCaret could
        not be imported.
    """
    if not has_pycaret:
        return None

    predictors = train_df[features]
    response = train_df[target]
    train_data = pd.concat([predictors, response], axis=1).reset_index(drop=True)

    setup_options = dict(
        data=train_data,
        target=target,
        session_id=42,
        # Our own hold-out is test_df, so PyCaret's internal split is kept tiny.
        train_size=0.999,
        fold_strategy='timeseries',
        fold=3,
        fold_shuffle=False,          # must be set explicitly
        data_split_shuffle=False,    # must be set explicitly
        preprocess=True,
        numeric_features=features,
        remove_multicollinearity=True,
        multicollinearity_threshold=0.95,
    )
    setup(**setup_options)

    best = compare_models(sort='R2')
    holdout = test_df[features].reset_index(drop=True)
    scored = predict_model(best, data=holdout)
    return scored['prediction_label'].values

# ------------------ 4. Plotting ------------------ #
def plot_comparison(index, y_true, predictions, target):
    """Save a time-series comparison plot and a residual histogram as PNGs.

    Args:
        index: X-axis values for the time-series plot.
        y_true: Observed values.
        predictions: Mapping of model name to its predicted values.
        target: Target name, used in the titles and output file names.

    Writes ``{target}_comparison.png`` and ``{target}_residuals.png`` to the
    current working directory.
    """
    # Figure 1: observations (dashed black) with every model overlaid.
    series_fig, series_ax = plt.subplots(figsize=(12, 4))
    series_ax.plot(index, y_true, label='Observed', color='black', linestyle='--')
    for model_name, model_pred in predictions.items():
        series_ax.plot(index, model_pred, label=model_name)
    series_ax.set_title(f"{target} ‚Äì Observed vs Predicted")
    series_ax.legend()
    series_fig.tight_layout()
    series_fig.savefig(f"{target}_comparison.png")
    plt.close(series_fig)

    # Figure 2: distribution of residuals (observed minus predicted) per model.
    resid_fig, resid_ax = plt.subplots()
    for model_name, model_pred in predictions.items():
        residuals = y_true - model_pred
        sns.histplot(residuals, kde=True, label=model_name, bins=50, ax=resid_ax)
    resid_ax.set_title(f"{target} ‚Äì Residual Distribution")
    resid_ax.legend()
    resid_fig.tight_layout()
    resid_fig.savefig(f"{target}_residuals.png")
    plt.close(resid_fig)

# ------------------ 5. Main Execution ------------------ #
def run_all_models(train_fraction=0.7, targets=('GPP', 'NEE')):
    """Train, evaluate and plot every model for each target variable.

    The frame returned by ``load_data`` is split by row position: the first
    ``train_fraction`` of rows are used for training and the rest for testing.
    For each target, XGBoost and random forest are fitted (plus the PyCaret
    AutoML run when PyCaret is available), metrics are printed, and
    comparison plots are saved.

    Args:
        train_fraction: Share of rows used for training; must lie strictly
            between 0 and 1.
        targets: Names of the target columns to model.

    Raises:
        ValueError: If ``train_fraction`` is not strictly between 0 and 1.
    """
    if not 0 < train_fraction < 1:
        raise ValueError(
            f"train_fraction must be between 0 and 1 (exclusive), got {train_fraction!r}"
        )

    df, features = load_data()
    split_idx = int(len(df) * train_fraction)
    train_df, test_df = df.iloc[:split_idx], df.iloc[split_idx:]

    for target in targets:
        X_train, y_train = train_df[features], train_df[target]
        X_test, y_test = test_df[features], test_df[target]

        preds = {}

        preds['XGBoost'] = run_xgb(X_train, y_train, X_test)
        preds['RandomForest'] = run_rf(X_train, y_train, X_test)
        # run_automl returns None without PyCaret, which evaluate() and the
        # plots cannot handle, so the entry is only added when it is available.
        if has_pycaret:
            preds['AutoML'] = run_automl(train_df, test_df, features, target)

        print(f"\nüìä Results for {target}")
        for name, y_pred in preds.items():
            r2, rmse, mae, rho = evaluate(y_test, y_pred)
            print(f"{name}: R¬≤={r2:.4f}, RMSE={rmse:.4f}, MAE={mae:.4f}, œÅ={rho:.4f}")

        plot_comparison(y_test.index, y_test.values, preds, target)

# Entry point: run the full pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    run_all_models()


Unnamed: 0,Description,Value
0,Session id,42
1,Target,GPP
2,Target type,Regression
3,Original data shape,"(92046, 16)"
4,Transformed data shape,"(92046, 14)"
5,Transformed train set shape,"(91953, 14)"
6,Transformed test set shape,"(93, 14)"
7,Numeric features,15
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,1.2019,5.1101,2.2285,0.9074,0.3796,5.1266,2.0367
rf,Random Forest Regressor,1.2258,5.2689,2.2659,0.9041,0.3794,5.7155,8.41
lightgbm,Light Gradient Boosting Machine,1.2067,5.323,2.2739,0.9034,0.4066,2.6668,0.28
gbr,Gradient Boosting Regressor,1.216,5.3845,2.29,0.9022,0.4076,2.7737,5.9867
catboost,CatBoost Regressor,1.2277,5.4337,2.2962,0.9013,0.3874,3.7791,1.4533
ada,AdaBoost Regressor,1.3194,5.8689,2.395,0.8932,0.4517,4.1592,1.6133
xgboost,Extreme Gradient Boosting,1.2797,5.9528,2.406,0.8915,0.389,4.4877,0.1433
knn,K Neighbors Regressor,1.4099,6.6322,2.5483,0.8792,0.4218,4.8821,0.07
lr,Linear Regression,1.7146,9.1307,2.9958,0.833,0.427,9.4148,0.7433
llar,Lasso Least Angle Regression,1.7035,9.261,3.0148,0.8308,0.4121,12.7664,0.04



📊 Results for GPP
XGBoost: R²=0.9087, RMSE=2.3288, MAE=1.2731, ρ=0.9549
RandomForest: R²=0.9150, RMSE=2.2474, MAE=1.2336, ρ=0.9575
AutoML: R²=0.9139, RMSE=2.2623, MAE=1.2496, ρ=0.9563


Unnamed: 0,Description,Value
0,Session id,42
1,Target,NEE
2,Target type,Regression
3,Original data shape,"(92046, 16)"
4,Transformed data shape,"(92046, 14)"
5,Transformed train set shape,"(91953, 14)"
6,Transformed test set shape,"(93, 14)"
7,Numeric features,15
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,1.1638,4.457,2.0804,0.8903,0.3398,1.3268,1.85
rf,Random Forest Regressor,1.1833,4.5786,2.1105,0.887,0.3489,1.3463,7.16
lightgbm,Light Gradient Boosting Machine,1.1684,4.6289,2.1202,0.8857,0.3396,1.2986,0.2967
catboost,CatBoost Regressor,1.1759,4.6568,2.1251,0.8849,0.34,1.3352,1.4867
gbr,Gradient Boosting Regressor,1.1818,4.7378,2.1477,0.883,0.3376,1.3372,5.2933
xgboost,Extreme Gradient Boosting,1.2454,5.3381,2.2777,0.8668,0.3626,1.4033,0.1533
ada,AdaBoost Regressor,1.4987,5.6902,2.3707,0.8571,0.419,2.2249,1.3267
knn,K Neighbors Regressor,1.4581,6.0517,2.4406,0.8489,0.4202,1.5731,0.09
lr,Linear Regression,1.6855,8.3824,2.8679,0.7916,0.4676,2.0692,0.0567
ridge,Ridge Regression,1.6918,8.435,2.8758,0.7905,0.4682,2.1125,0.0367



📊 Results for NEE
XGBoost: R²=0.8855, RMSE=2.2796, MAE=1.2755, ρ=0.9446
RandomForest: R²=0.8941, RMSE=2.1915, MAE=1.2289, ρ=0.9478
AutoML: R²=0.8944, RMSE=2.1891, MAE=1.2406, ρ=0.9465
