In [2]:

import pandas as pd
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from scipy.stats import pearsonr
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor

# Optional: AutoML
try:
    from pycaret.regression import setup, compare_models, predict_model
    has_pycaret = True
except ImportError:
    has_pycaret = False
    print("⚠️ PyCaret not found. AutoML will be skipped.")

# ------------------ 1. Load Data ------------------ #
def load_data():
    met_ds = xr.open_dataset("FI-Hyy_1996-2014_FLUXNET2015_Met.nc")
    flux_ds = xr.open_dataset("FI-Hyy_1996-2014_FLUXNET2015_Flux.nc")
    met_df = met_ds.to_dataframe().reset_index()
    flux_df = flux_ds.to_dataframe().reset_index()
    df = pd.merge_asof(met_df.sort_values('time'), flux_df.sort_values('time'), on='time')

    features_raw = ['SWdown', 'LWdown', 'Tair', 'Qair', 'RH', 'Psurf', 'Wind',
                    'CO2air', 'VPD', 'LAI', 'Ustar']
    target_vars = ['GPP', 'NEE']
    df = df[['time'] + features_raw + target_vars].dropna()

    # Derived features
    df['SW_LAI'] = df['SWdown'] * df['LAI']
    df['RH_Tair'] = df['RH'] * df['Tair']
    df['SWdown_lag1'] = df['SWdown'].shift(1)
    df['Tair_lag1'] = df['Tair'].shift(1)
    df = df.dropna()

    df = df.set_index('time').sort_index()
    features = features_raw + ['SW_LAI', 'RH_Tair', 'SWdown_lag1', 'Tair_lag1']
    return df, features

# ------------------ 2. Evaluation ------------------ #
def evaluate(y_true, y_pred):
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    rho, _ = pearsonr(y_true, y_pred)
    return r2, rmse, mae, rho

# ------------------ 3. Model Runners ------------------ #
def run_xgb(X_train, y_train, X_test):
    model = xgb.XGBRegressor(objective='reg:squarederror',
                             n_estimators=100, max_depth=3, learning_rate=0.05,
                             colsample_bytree=0.6, subsample=0.8, reg_alpha=0.5, reg_lambda=0.5)
    model.fit(X_train, y_train)
    return model.predict(X_test)

def run_rf(X_train, y_train, X_test):
    model = RandomForestRegressor(n_estimators=200, min_samples_split=2, min_samples_leaf=4,
                                   max_features='sqrt', max_depth=None, random_state=42)
    model.fit(X_train, y_train)
    return model.predict(X_test)

def run_automl(train_df, test_df, features, target):
    if not has_pycaret:
        return None
    train_data = pd.concat([train_df[features], train_df[target]], axis=1).reset_index(drop=True)
    setup(
    data=train_data,
    target=target,
    session_id=42,
    train_size=0.999,
    fold_strategy='timeseries',
    fold=3,
    fold_shuffle=False,                  # 必须明确设置
    data_split_shuffle=False,           # 必须明确设置
    preprocess=True,
    numeric_features=features,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.95)


    
    best = compare_models(sort='R2')
    test_data = test_df[features].reset_index(drop=True)
    pred = predict_model(best, data=test_data)
    return pred['prediction_label'].values

# ------------------ 4. Plotting ------------------ #
def plot_comparison(index, y_true, predictions, target):
    plt.figure(figsize=(12, 4))
    plt.plot(index, y_true, label='Observed', color='black', linestyle='--')
    for name, pred in predictions.items():
        plt.plot(index, pred, label=name)
    plt.title(f"{target} – Observed vs Predicted")
    plt.legend()
    plt.tight_layout()
    plt.savefig(f"{target}_comparison.png")
    plt.close()

    plt.figure()
    for name, pred in predictions.items():
        sns.histplot(y_true - pred, kde=True, label=name, bins=50)
    plt.title(f"{target} – Residual Distribution")
    plt.legend()
    plt.tight_layout()
    plt.savefig(f"{target}_residuals.png")
    plt.close()

# ------------------ 5. Main Execution ------------------ #
def run_all_models():
    df, features = load_data()
    split_idx = int(len(df) * 0.7)
    train_df, test_df = df.iloc[:split_idx], df.iloc[split_idx:]

    for target in ['GPP', 'NEE']:
        X_train, y_train = train_df[features], train_df[target]
        X_test, y_test = test_df[features], test_df[target]

        preds = {}
        metrics = {}

        preds['XGBoost'] = run_xgb(X_train, y_train, X_test)
        preds['RandomForest'] = run_rf(X_train, y_train, X_test)
        if has_pycaret:
            preds['AutoML'] = run_automl(train_df, test_df, features, target)

        print(f"\n📊 Results for {target}")
        for name, y_pred in preds.items():
            r2, rmse, mae, rho = evaluate(y_test, y_pred)
            print(f"{name}: R²={r2:.4f}, RMSE={rmse:.4f}, MAE={mae:.4f}, ρ={rho:.4f}")

        plot_comparison(y_test.index, y_test.values, preds, target)

if __name__ == '__main__':
    run_all_models()


Unnamed: 0,Description,Value
0,Session id,42
1,Target,GPP
2,Target type,Regression
3,Original data shape,"(227539, 16)"
4,Transformed data shape,"(227539, 13)"
5,Transformed train set shape,"(227311, 13)"
6,Transformed test set shape,"(228, 13)"
7,Numeric features,15
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.7754,1.7476,1.3202,0.9284,0.2926,3.2645,4.86
gbr,Gradient Boosting Regressor,0.8029,1.8522,1.3595,0.9241,0.3085,2.9368,11.94
lightgbm,Light Gradient Boosting Machine,0.7936,1.8875,1.3722,0.9226,0.3127,2.4256,0.3933
rf,Random Forest Regressor,0.8148,1.9577,1.3972,0.9198,0.3066,3.6357,20.4533
catboost,CatBoost Regressor,0.8213,2.0043,1.4144,0.9178,0.3095,2.8931,2.43
xgboost,Extreme Gradient Boosting,0.8538,2.2075,1.4823,0.9097,0.3125,3.0029,0.2933
ada,AdaBoost Regressor,1.1619,2.7859,1.6677,0.8847,0.4075,13.7901,4.0633
knn,K Neighbors Regressor,1.0287,2.9143,1.707,0.8799,0.3789,5.0646,0.1967
dt,Decision Tree Regressor,1.1494,3.6375,1.9039,0.851,0.3816,7.523,0.92
ridge,Ridge Regression,1.4636,4.4873,2.1142,0.8165,0.4775,16.9748,0.5567



📊 Results for GPP
XGBoost: R²=0.9110, RMSE=1.7243, MAE=0.9985, ρ=0.9677
RandomForest: R²=0.9160, RMSE=1.6759, MAE=0.9648, ρ=0.9696
AutoML: R²=0.9140, RMSE=1.6957, MAE=0.9793, ρ=0.9680


Unnamed: 0,Description,Value
0,Session id,42
1,Target,NEE
2,Target type,Regression
3,Original data shape,"(227539, 16)"
4,Transformed data shape,"(227539, 13)"
5,Transformed train set shape,"(227311, 13)"
6,Transformed test set shape,"(228, 13)"
7,Numeric features,15
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.7683,1.5,1.223,0.9096,0.2866,1.2059,4.1133
lightgbm,Light Gradient Boosting Machine,0.7757,1.555,1.2452,0.9063,0.2921,1.2078,0.41
gbr,Gradient Boosting Regressor,0.7906,1.5824,1.2565,0.9045,0.2928,1.3721,11.4533
catboost,CatBoost Regressor,0.7924,1.6263,1.2743,0.9018,0.2966,1.2715,2.7533
rf,Random Forest Regressor,0.8156,1.6704,1.2912,0.8992,0.3117,1.3383,16.9467
xgboost,Extreme Gradient Boosting,0.8553,1.8242,1.3443,0.8906,0.3173,1.2663,0.2967
ada,AdaBoost Regressor,1.0731,2.4408,1.5621,0.8522,0.3954,2.5208,3.4867
knn,K Neighbors Regressor,1.1151,2.8817,1.6942,0.8266,0.4069,1.8434,0.21
dt,Decision Tree Regressor,1.1445,3.2235,1.794,0.8052,0.3992,1.9997,0.83
ridge,Ridge Regression,1.384,3.9379,1.9835,0.7619,0.4835,2.5281,0.1167



📊 Results for NEE
XGBoost: R²=0.8899, RMSE=1.5670, MAE=0.9675, ρ=0.9568
RandomForest: R²=0.9027, RMSE=1.4736, MAE=0.9002, ρ=0.9610
AutoML: R²=0.9010, RMSE=1.4861, MAE=0.9119, ρ=0.9593
