In [1]:
import numpy as np
from fluxlib import *
from scitbx import Yaml
from pathlib import Path
from sklearn.model_selection import train_test_split

In [3]:
# Load the run configuration from the YAML file expected in the working directory.
cfg = Yaml("rfr_gapfill_fluxnet_cfg.yaml").load()

# Column-name settings taken from the config. `drivers` and `flux` are
# concatenated with `+` further down, so both are expected to be lists.
drivers = cfg["drivers"]
flux = cfg["flux"]
rg = cfg["rg"]
timestamp_format = cfg["timestamp_format"]

# Materialise the glob into a sorted list instead of keeping the raw generator:
# a generator is exhausted after one pass, so re-running the processing cell
# without re-running this one would silently iterate over nothing. Sorting
# also makes the per-file report order reproducible across runs/platforms.
paths = sorted(Path(cfg["source"]).glob(r"*.csv"))

In [5]:
# For every input CSV: load the data, add the tag columns, fit a regressor on
# a train split and print "<file stem>, <r2>, <rmse>" for the held-out split.
filler = Filler()
for data_path in paths:
    loader = Loader(data_path)
    df = loader.load_fluxnet(drivers + flux, timestamp_format)
    #-------------------------------------------------
    # set tags:
    # Each helper returns the updated frame plus a list of column names; the
    # lists are concatenated below to select the predictor columns.
    df, stat_tags = filler.set_stats(df, flux)
    df, season_tag = filler.set_season_tag(df)
    df, rg_tag = filler.set_rg_tag(df, rg)
    df, doy_year_tag = filler.set_doy_year_tag(df)
    #-------------------------------------------------
    # prepare and split data for RFR
    param_columns = drivers + stat_tags + season_tag + rg_tag + doy_year_tag
    # Drop incomplete rows once and reuse the result for both X and y, so the
    # predictors and targets are guaranteed to come from the same rows and the
    # frame is not scanned twice.
    df_complete = df.dropna()
    X = df_complete[param_columns]
    y = df_complete[flux]
    # Hold out 33% of the complete rows; the fixed seed keeps the split
    # reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.33,
        random_state=42
    )
    # Not consumed in this cell; kept because later cells may read them.
    # NOTE(review): the two frames are filtered independently (predictor
    # columns vs. flux columns), so their indexes may not line up row-for-row.
    # Confirm before feeding them to the model as a pair.
    X_apply = df[param_columns].dropna()
    y_apply = df[flux].dropna()
    #--------------------------------------------------
    # train and test RFR
    regr = filler.train_rfr(X_train, y_train, n_estimators=100)
    result_df, r2, rmse = filler.test_rfr(regr, X_test, y_test)
    print(f"{data_path.stem}, {np.round(r2, 4)}, {np.round(rmse, 4)}")