In [1]:
import numpy as np
from fluxlib import *
from scitbx import Yaml
from pathlib import Path
from sklearn.model_selection import train_test_split

In [6]:
# Read the gap-filling settings from the YAML config file.
cfg = Yaml("gapfill_malaysia_cfg.yaml").load()

# Column groups. They are concatenated with `+` further down, so each one is
# expected to be a list of column names.
drivers = cfg["drivers"]
flux = cfg["flux"]
bench_flux = cfg["bench_flux"]
qc = cfg["qc"]

# Handed to filler.set_rg_tag in the pipeline cell.
# NOTE(review): presumably the radiation column name -- confirm against the YAML.
rg = cfg["rg"]

# Timestamp column name and parsing format (each read exactly once).
timestamp_name = cfg["timestamp_name"]
timestamp_format = cfg["timestamp_format"]

# Input data location, passed to Loader(...) in the pipeline cell.
data_path = cfg["source"]

In [9]:
# Gap-filling pipeline: load the data, add tag columns, train a random-forest
# regressor (RFR), score it on a hold-out split, then apply it to every row
# that has a complete set of predictors.
#
# Run for Sebungan and Sabaju. The output file names are built from
# `site_name` / `run_tag`, so update them when switching site or run date
# (the input location itself comes from `data_path` in the config cell).
site_name = "sebungan"
run_tag = "0613"

filler = Filler()
loader = Loader(data_path)
#------------------------------------------------------------------------------------
# load data:
df = loader.load_lowcost(drivers + flux + qc + bench_flux, timestamp_format)
#-------------------------------------------------
# set tags:
# Each set_* call returns the updated frame plus the names of the columns it
# added; those names are concatenated into `param_columns` below.
df, stat_tags = filler.set_stats(df, flux)
df, season_tag = filler.set_season_tag(df)
df, rg_tag = filler.set_rg_tag(df, rg)
df, doy_year_tag = filler.set_doy_year_tag(df)
#-------------------------------------------------
# prepare and split data for RFR
param_columns = drivers + stat_tags + season_tag + rg_tag + doy_year_tag
# Training rows must be complete in *every* column of df (not only the
# predictors and the target). Compute the filtered frame once and reuse it so
# X and y are guaranteed to share the same rows.
complete_rows = df.dropna()
X = complete_rows[param_columns]
y = complete_rows[flux]
# Hold out 33% of the complete rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
# Rows to predict on only need complete predictors; the flux itself may be missing.
X_apply = df[param_columns].dropna()
#--------------------------------------------------
# train and test/apply RFR
regr = filler.train_rfr(X_train, y_train, n_estimators=500)
# Keep the hold-out score under its own name so the apply-stage score below
# does not overwrite it.
result_df, test_r2, rmse = filler.test_rfr(regr, X_test, y_test)
print(f"test results=> r2: {np.round(test_r2, 4)}, rmse: {np.round(rmse, 4)}")
# `r2` ends up holding the apply-stage score, as it did before.
applied_df, r2 = filler.apply_rfr(regr, X_apply, df, flux, bench_flux)
print(f"apply results=> r2:{np.round(r2, 4)}")
#--------------------------------------------------
# save results
result_df.to_csv(f"{site_name}_test_{run_tag}.csv")
applied_df.to_csv(f"{site_name}_apply_{run_tag}.csv")

test results=> r2: 0.7641, rmse: 6.8726
apply results=> r2:0.8508
