In [7]:
import pandas as pd
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
from pycaret.regression import *
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr

# --------------------- 1. Load & Preprocess ---------------------
# Both paths are relative, so they resolve against the current working
# directory: run this from the folder that actually holds the US-UMB files,
# otherwise xarray raises FileNotFoundError.
MET_PATH = "US-UMB_2000-2014_FLUXNET2015_Met.nc"
FLUX_PATH = "US-UMB_2000-2014_FLUXNET2015_Flux.nc"

# load_dataset() reads the file into memory and closes the handle right away
# (open_dataset() would keep the NetCDF files open for the whole session).
met_ds = xr.load_dataset(MET_PATH)
flux_ds = xr.load_dataset(FLUX_PATH)

met_df = met_ds.to_dataframe().reset_index()
flux_df = flux_ds.to_dataframe().reset_index()
# merge_asof needs both sides sorted on the key; its output stays ordered by
# 'time', which the lag features below rely on.
df = pd.merge_asof(met_df.sort_values('time'), flux_df.sort_values('time'), on='time')

features_raw = ['SWdown', 'LWdown', 'Tair', 'Qair', 'RH', 'Psurf', 'Wind',
                'CO2air', 'VPD', 'LAI', 'Ustar']
target_vars = ['GPP', 'NEE']
# .copy() makes the column subset an independent frame so the column
# assignments below do not trigger chained-assignment warnings.
df = df[['time'] + features_raw + target_vars].copy()

# Derived features
df['SW_LAI'] = df['SWdown'] * df['LAI']
df['RH_Tair'] = df['RH'] * df['Tair']
# Lags are taken BEFORE any row is dropped: shifting after dropna() would pair
# a row with whatever row happened to survive before it, not with the
# immediately preceding record.
df['SWdown_lag1'] = df['SWdown'].shift(1)
df['Tair_lag1'] = df['Tair'].shift(1)

# One pass removes rows with a missing raw value, a missing target, or a
# missing lag (the first record and any record following a gap).
df = df.dropna()

df = df.set_index('time').sort_index()
features = features_raw + ['SW_LAI', 'RH_Tair', 'SWdown_lag1', 'Tair_lag1']

# --------------------- 2. Split Train/Test ---------------------
# Positional 70/30 split on the time-sorted frame: the first 70% of the rows
# form the training set and the remaining rows the test set (no shuffling).
split_idx = int(0.7 * len(df))
train_df, test_df = df.iloc[:split_idx], df.iloc[split_idx:]

# --------------------- 3. AutoML function ---------------------
def run_pycaret_automl(target_var):
    """Run a PyCaret model comparison for one target and score it on the test split.

    The function reads the module-level ``train_df``, ``test_df`` and
    ``features``. It configures a PyCaret regression experiment on the
    training rows, keeps the model ranked first by R2 in ``compare_models``,
    predicts the test rows, prints R2 / RMSE / MAE / Pearson correlation and
    plots observed against predicted values over the test index.

    Args:
        target_var: Name of the target column in ``train_df`` and ``test_df``.

    Returns:
        None. Results are printed and shown as a matplotlib figure.
    """
    print(f"\n===================== [AutoML - {target_var}] =====================")

    # Training table: feature columns followed by the target, with the time
    # index replaced by a plain positional index.
    data = train_df.loc[:, [*features, target_var]].reset_index(drop=True)

    # Experiment configuration (PyCaret 3.x parameter names). Rows are never
    # shuffled, and train_size=0.999 leaves only 0.1% of the rows to
    # PyCaret's own internal hold-out.
    setup_options = dict(
        data=data,
        target=target_var,
        session_id=42,
        train_size=0.999,
        fold_strategy='timeseries',
        fold=3,
        fold_shuffle=False,
        data_split_shuffle=False,
        preprocess=True,
        numeric_features=features,
        remove_multicollinearity=True,
        multicollinearity_threshold=0.95,
    )
    setup(**setup_options)

    # Automatic model comparison, ranked by R2.
    best = compare_models(sort='R2')

    # Predict the test rows (features only) and collect aligned arrays.
    holdout_features = test_df[features].reset_index(drop=True)
    y_true = test_df[target_var].to_numpy()
    scored = predict_model(best, data=holdout_features)
    y_pred = scored['prediction_label'].to_numpy()

    # Evaluation metrics on the test split.
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    rho, _p_value = pearsonr(y_true, y_pred)

    print(f"Best Model: {best}")
    print(f"R² (Coefficient of Determination):   {r2:.4f}")
    print(f"RMSE (Root Mean Squared Error):      {rmse:.4f}")
    print(f"MAE (Mean Absolute Error):           {mae:.4f}")
    print(f"ρ (Pearson Correlation Coefficient): {rho:.4f}")

    # Visualisation: predicted vs. observed over the test index.
    plt.figure(figsize=(12, 4))
    timeline = test_df.index
    for series, legend_label in ((y_true, "Observed"), (y_pred, "Predicted")):
        plt.plot(timeline, series, label=legend_label, alpha=0.7)
    plt.title(f"{target_var} Prediction (AutoML - {best})")
    plt.legend()
    plt.tight_layout()
    plt.show()

# --------------------- 4. Run ---------------------
# Run the full AutoML pipeline once per target, GPP first and then NEE.
for target_name in ('GPP', 'NEE'):
    run_pycaret_automl(target_name)


FileNotFoundError: [Errno 2] No such file or directory: '/Users/kylan/Desktop/毕业论文/实验结果/FI-Hyy/US-UMB_2000-2014_FLUXNET2015_Met.nc'