In [15]:
pip install 'pycaret[full]'


Collecting pycaret[full]
  Using cached pycaret-3.3.2-py3-none-any.whl.metadata (17 kB)
Collecting ipywidgets>=7.6.5 (from pycaret[full])
  Downloading ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)
Collecting pandas<2.2.0 (from pycaret[full])
  Using cached pandas-2.1.4-cp310-cp310-macosx_11_0_arm64.whl.metadata (18 kB)
Collecting jinja2>=3 (from pycaret[full])
  Downloading jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting scipy<=1.11.4,>=1.6.1 (from pycaret[full])
  Using cached scipy-1.11.4-cp310-cp310-macosx_12_0_arm64.whl.metadata (112 kB)
Collecting joblib<1.4,>=1.2.0 (from pycaret[full])
  Using cached joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting pyod>=1.1.3 (from pycaret[full])
  Using cached pyod-2.0.5-py3-none-any.whl.metadata (46 kB)
Collecting imbalanced-learn>=0.12.0 (from pycaret[full])
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting category-encoders>=2.4.0 (from pycaret[full])
  Using cached category_encod

In [8]:

import pandas as pd
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from scipy.stats import pearsonr
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor

# Optional dependency: PyCaret supplies the AutoML functions used later on.
try:
    from pycaret.regression import setup, compare_models, predict_model
except ImportError:
    # Remember that PyCaret is unavailable so AutoML code paths can be skipped.
    has_pycaret = False
    print("⚠️ PyCaret not found. AutoML will be skipped.")
else:
    has_pycaret = True

# ------------------ 1. Load Data ------------------ #
def load_data(met_path="AU-ASM_2011-2017_OzFlux_Met.nc",
              flux_path="AU-ASM_2011-2017_OzFlux_Flux.nc"):
    """Load the met and flux NetCDF files and build the modelling table.

    Parameters
    ----------
    met_path : str
        Path of the meteorological NetCDF file.
    flux_path : str
        Path of the flux NetCDF file.

    Returns
    -------
    df : pandas.DataFrame
        Table indexed by ``time`` (sorted) holding the raw features, the
        derived features and the target columns ``GPP`` and ``NEE``, with
        every row containing a NaN removed.
    features : list of str
        Names of the columns to use as model inputs (raw + derived).
    """
    # Context managers release the NetCDF file handles as soon as the data
    # has been copied into in-memory DataFrames.
    with xr.open_dataset(met_path) as met_ds:
        met_df = met_ds.to_dataframe().reset_index()
    with xr.open_dataset(flux_path) as flux_ds:
        flux_df = flux_ds.to_dataframe().reset_index()

    # merge_asof requires both inputs to be sorted on the join key.
    df = pd.merge_asof(met_df.sort_values('time'), flux_df.sort_values('time'), on='time')

    features_raw = ['SWdown', 'LWdown', 'Tair', 'Qair', 'RH', 'Psurf', 'Wind',
                    'CO2air', 'VPD', 'LAI', 'Ustar']
    target_vars = ['GPP', 'NEE']
    df = df[['time'] + features_raw + target_vars].dropna()

    # Derived features: two interaction terms and two one-row lags.
    # NOTE(review): the lags are taken after NaN rows were dropped, so across
    # a gap the "previous" row can be more than one time step earlier.
    df['SW_LAI'] = df['SWdown'] * df['LAI']
    df['RH_Tair'] = df['RH'] * df['Tair']
    df['SWdown_lag1'] = df['SWdown'].shift(1)
    df['Tair_lag1'] = df['Tair'].shift(1)
    # shift(1) leaves a NaN in the first row; drop it.
    df = df.dropna()

    df = df.set_index('time').sort_index()
    features = features_raw + ['SW_LAI', 'RH_Tair', 'SWdown_lag1', 'Tair_lag1']
    return df, features

# ------------------ 2. Evaluation ------------------ #
def evaluate(y_true, y_pred):
    """Score predictions against observations.

    Returns a 4-tuple ``(r2, rmse, mae, rho)``: the coefficient of
    determination, root-mean-squared error, mean absolute error and the
    Pearson correlation coefficient (its p-value is discarded).
    """
    coefficient_of_determination = r2_score(y_true, y_pred)
    mean_sq_err = mean_squared_error(y_true, y_pred)
    root_mean_sq_err = np.sqrt(mean_sq_err)
    mean_abs_err = mean_absolute_error(y_true, y_pred)
    correlation, _p_value = pearsonr(y_true, y_pred)
    return (coefficient_of_determination, root_mean_sq_err,
            mean_abs_err, correlation)

# ------------------ 3. Model Runners ------------------ #
def run_xgb(X_train, y_train, X_test):
    """Fit an XGBoost regressor with fixed hyper-parameters and predict.

    The model is trained on ``(X_train, y_train)`` and the predictions for
    ``X_test`` are returned.
    """
    hyperparams = dict(
        objective='reg:squarederror',
        n_estimators=100,
        max_depth=3,
        learning_rate=0.05,
        colsample_bytree=0.6,
        subsample=0.8,
        reg_alpha=0.5,
        reg_lambda=0.5,
    )
    regressor = xgb.XGBRegressor(**hyperparams)
    regressor.fit(X_train, y_train)
    predictions = regressor.predict(X_test)
    return predictions

def run_rf(X_train, y_train, X_test):
    """Fit a random-forest regressor with fixed hyper-parameters and predict.

    The forest is trained on ``(X_train, y_train)`` with a fixed
    ``random_state`` and the predictions for ``X_test`` are returned.
    """
    hyperparams = dict(
        n_estimators=200,
        min_samples_split=2,
        min_samples_leaf=4,
        max_features='sqrt',
        max_depth=None,
        random_state=42,
    )
    forest = RandomForestRegressor(**hyperparams)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)
    return predictions

def run_automl(train_df, test_df, features, target):
    """Run PyCaret model comparison on the training data and predict the test set.

    Returns ``None`` when PyCaret is not installed. Otherwise configures a
    PyCaret regression experiment on ``train_df``, picks the model ranked
    best by R2 via ``compare_models`` and returns its predictions for
    ``test_df[features]`` as an array.
    """
    if not has_pycaret:
        return None

    # PyCaret receives one frame with the features followed by the target.
    feature_block = train_df[features]
    target_column = train_df[target]
    train_data = pd.concat([feature_block, target_column], axis=1).reset_index(drop=True)

    experiment_options = dict(
        data=train_data,
        target=target,
        session_id=42,
        train_size=0.999,
        fold_strategy='timeseries',
        fold=3,
        fold_shuffle=False,           # must be set explicitly
        data_split_shuffle=False,     # must be set explicitly
        preprocess=True,
        numeric_features=features,
        remove_multicollinearity=True,
        multicollinearity_threshold=0.95,
    )
    setup(**experiment_options)

    best = compare_models(sort='R2')
    test_data = test_df[features].reset_index(drop=True)
    scored = predict_model(best, data=test_data)
    return scored['prediction_label'].values

# ------------------ 4. Plotting ------------------ #
def plot_comparison(index, y_true, predictions, target):
    """Save two diagnostic figures for one target variable.

    ``predictions`` maps a model name to its predicted values. The first
    figure overlays every model's predictions on the observed series and is
    written to ``"{target}_comparison.png"``; the second shows the residual
    (observed minus predicted) histograms and is written to
    ``"{target}_residuals.png"``. Both figures are closed after saving.
    """
    def _finish_figure(title, filename):
        # Shared tail of both plots: decorate, save to disk, release the figure.
        plt.title(title)
        plt.legend()
        plt.tight_layout()
        plt.savefig(filename)
        plt.close()

    # Figure 1: observed series against each model's predictions.
    plt.figure(figsize=(12, 4))
    plt.plot(index, y_true, label='Observed', color='black', linestyle='--')
    for model_name in predictions:
        plt.plot(index, predictions[model_name], label=model_name)
    _finish_figure(f"{target} – Observed vs Predicted", f"{target}_comparison.png")

    # Figure 2: residual distribution per model.
    plt.figure()
    for model_name, model_pred in predictions.items():
        residuals = y_true - model_pred
        sns.histplot(residuals, kde=True, label=model_name, bins=50)
    _finish_figure(f"{target} – Residual Distribution", f"{target}_residuals.png")

# ------------------ 5. Main Execution ------------------ #
def run_all_models(train_frac=0.7, targets=('GPP', 'NEE')):
    """Train, evaluate and plot every model for each target variable.

    The data is split chronologically: the first ``train_frac`` of the rows
    are used for training and the remainder for testing.

    Parameters
    ----------
    train_frac : float
        Fraction of rows (from the start of the table) used for training.
    targets : iterable of str
        Target columns to model, one after another.

    Returns
    -------
    dict
        ``{target: {model_name: {'R2', 'RMSE', 'MAE', 'rho'}}}`` with the
        test-set scores that are also printed.
    """
    df, features = load_data()
    split_idx = int(len(df) * train_frac)
    train_df, test_df = df.iloc[:split_idx], df.iloc[split_idx:]

    all_metrics = {}
    for target in targets:
        X_train, y_train = train_df[features], train_df[target]
        X_test, y_test = test_df[features], test_df[target]

        preds = {}
        preds['XGBoost'] = run_xgb(X_train, y_train, X_test)
        preds['RandomForest'] = run_rf(X_train, y_train, X_test)
        if has_pycaret:
            automl_pred = run_automl(train_df, test_df, features, target)
            # run_automl returns None when it cannot run; skip it rather than
            # passing None to the metric functions.
            if automl_pred is not None:
                preds['AutoML'] = automl_pred

        metrics = {}
        print(f"\n📊 Results for {target}")
        for name, y_pred in preds.items():
            r2, rmse, mae, rho = evaluate(y_test, y_pred)
            metrics[name] = {'R2': r2, 'RMSE': rmse, 'MAE': mae, 'rho': rho}
            print(f"{name}: R²={r2:.4f}, RMSE={rmse:.4f}, MAE={mae:.4f}, ρ={rho:.4f}")

        plot_comparison(y_test.index, y_test.values, preds, target)
        all_metrics[target] = metrics

    return all_metrics

# Entry point: run the full pipeline when this module is executed as __main__.
if __name__ == '__main__':
    run_all_models()


Unnamed: 0,Description,Value
0,Session id,42
1,Target,GPP
2,Target type,Regression
3,Original data shape,"(85914, 16)"
4,Transformed data shape,"(85914, 13)"
5,Transformed train set shape,"(85828, 13)"
6,Transformed test set shape,"(86, 13)"
7,Numeric features,15
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.4192,0.7608,0.8437,0.5441,0.2809,3.9021,5.5467
lightgbm,Light Gradient Boosting Machine,0.4218,0.8009,0.8705,0.5071,0.2872,4.1388,0.27
rf,Random Forest Regressor,0.4109,0.7999,0.87,0.5034,0.2748,4.4547,5.17
et,Extra Trees Regressor,0.4231,0.7963,0.8768,0.4738,0.2838,4.1996,1.0533
knn,K Neighbors Regressor,0.4447,0.9283,0.9315,0.4335,0.3045,5.7079,0.0867
xgboost,Extreme Gradient Boosting,0.4644,0.9018,0.926,0.4285,0.3026,5.223,0.6367
catboost,CatBoost Regressor,0.4676,0.9029,0.9312,0.4092,0.303,4.1985,1.1867
ada,AdaBoost Regressor,0.5864,0.9408,0.9579,0.3538,0.3622,5.9042,1.15
par,Passive Aggressive Regressor,0.8039,1.6759,1.184,0.1556,0.4586,5.1197,0.0467
lasso,Lasso Regression,0.5871,1.1079,1.0414,0.0777,0.3179,3.4735,0.5



📊 Results for GPP
XGBoost: R²=0.6725, RMSE=1.3974, MAE=0.7506, ρ=0.8298
RandomForest: R²=0.5640, RMSE=1.6125, MAE=0.8303, ρ=0.7802
AutoML: R²=0.6514, RMSE=1.4418, MAE=0.8100, ρ=0.8158


Unnamed: 0,Description,Value
0,Session id,42
1,Target,NEE
2,Target type,Regression
3,Original data shape,"(85914, 16)"
4,Transformed data shape,"(85914, 13)"
5,Transformed train set shape,"(85828, 13)"
6,Transformed test set shape,"(86, 13)"
7,Numeric features,15
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.5234,0.6892,0.825,0.4425,0.3328,4.3837,5.81
lightgbm,Light Gradient Boosting Machine,0.5557,0.761,0.8674,0.3783,0.3439,4.324,0.2967
et,Extra Trees Regressor,0.5603,0.7747,0.8776,0.3499,0.3417,4.2243,1.64
catboost,CatBoost Regressor,0.5923,0.8087,0.8966,0.3159,0.3506,5.1812,1.18
ada,AdaBoost Regressor,0.7181,0.955,0.9714,0.2343,0.4154,8.1798,1.83
rf,Random Forest Regressor,0.6179,0.9184,0.9575,0.187,0.3677,4.5882,6.6567
par,Passive Aggressive Regressor,0.6606,1.0407,1.011,0.1845,0.4238,6.6519,0.08
xgboost,Extreme Gradient Boosting,0.6769,1.0526,1.0203,-0.0023,0.3852,5.5133,0.15
dummy,Dummy Regressor,0.7135,1.3549,1.1419,-0.004,0.5762,1.1079,0.05
omp,Orthogonal Matching Pursuit,0.7869,1.1855,1.0878,-0.0579,0.4576,9.2026,0.0433



📊 Results for NEE
XGBoost: R²=0.6563, RMSE=1.3355, MAE=0.9248, ρ=0.8196
RandomForest: R²=0.6663, RMSE=1.3158, MAE=0.8982, ρ=0.8239
AutoML: R²=0.6080, RMSE=1.4261, MAE=0.9931, ρ=0.8112
