In [1]:
import numpy as np
from fluxlib import *
from scitbx import *
from pathlib import Path
from sklearn.model_selection import train_test_split

In [3]:
# Read the low-cost gap-filling configuration and unpack the settings the
# later cells rely on.
cfg = Yaml("rfr_gapfill_malaysia_cfg.yaml").load()
drivers, flux, rg, qc = (cfg[key] for key in ("drivers", "flux", "rg", "qc"))
timestamp_format = cfg["timestamp_format"]
data_path = Path(cfg["source"])

# Output locations: each folder is resolved from the "destination" section of
# the config and handed to create_all_parents before anything is written.
destinations = cfg["destination"]

# scores of the random-forest model on the artificial gaps
test_folder = Path(destinations["test"])
create_all_parents(test_folder)

# model applied to the complete record
apply_folder = Path(destinations["apply"])
create_all_parents(apply_folder)

# gap-punched input files for the MDS run
flux4mds_folder = Path(destinations["flux4mds"])
create_all_parents(flux4mds_folder)

# reference flux values at the punched positions
flux4mds_validation_folder = Path(destinations["flux4mds_validation"])
create_all_parents(flux4mds_validation_folder)

In [28]:
# create gaps:
def make_gap_pipeline(
    df,
    flux,
    total_gap_ratio=0.25,
    gap_spec=((48 * 30, 0.5), (48 * 7, 0.3), (48 * 1, 0.2)),
    seed=0,
):
    """Split the row positions of ``df`` into training rows and artificial-gap rows.

    Starting from every row position of ``df[flux]``, ``utils.make_gaps`` is
    called once per entry of ``gap_spec``. Each call receives the positions
    still available, a window size, a gap probability and the flux data, and
    returns the positions it sampled together with the positions that remain.
    The sampled positions of all calls form the test set; whatever remains
    after the last call is the training set.

    Parameters
    ----------
    df : pandas.DataFrame
        Input records.
    flux : column label or list of labels
        Selects the flux column(s) of ``df`` passed on to ``utils.make_gaps``.
    total_gap_ratio : float, default 0.25
        Overall gap ratio; each entry of ``gap_spec`` receives a share of it.
    gap_spec : sequence of (window_size, share), default long/medium/short
        One ``utils.make_gaps`` call is made per entry, in order, with
        ``p = share * total_gap_ratio``. The defaults are windows of
        ``48 * 30``, ``48 * 7`` and ``48 * 1`` records with shares 0.5, 0.3
        and 0.2 (presumably 48 records per day, i.e. half-hourly data --
        TODO confirm). The single-long-gap scenario formerly kept as
        commented-out code corresponds to
        ``total_gap_ratio=0.5, gap_spec=((48 * 200, 1),)``.
    seed : int, default 0
        Passed to ``np.random.seed`` before sampling so that the gap layout is
        repeatable (assumes ``utils.make_gaps`` draws from NumPy's global
        random state -- verify against its implementation). Note that this
        resets the global NumPy RNG for the whole session.

    Returns
    -------
    train_idx : list
        Row positions left over after all gaps were drawn.
    test_idx : list
        Row positions selected as artificial gaps, in the order drawn.
    """
    series = df[flux]
    np.random.seed(seed)
    pointers = np.arange(len(series))
    samples = []

    for window_size, share in gap_spec:
        # Each call consumes part of `pointers`; feed the remainder into the
        # next call so that the drawn gaps are taken from what is still left.
        gap_samples, pointers = utils.make_gaps(
            pointers, window_size, share * total_gap_ratio, series
        )
        samples.extend(gap_samples)

    test_idx = samples
    train_idx = pointers.tolist()
    return train_idx, test_idx
# =================================================================================

import sys

def run_filling(df, flux, rg, train_idx, test_idx):
    filler = Filler()
    #-------------------------------------------------
    # set tags:
    df, stat_tags = filler.set_stats(df, flux)
    df, season_tag = filler.set_season_tag(df)
    df, rg_tag = filler.set_rg_tag(df, rg)
    df, doy_year_tag = filler.set_doy_year_tag(df)
    #-------------------------------------------------
    # prepare and split data for RFR
    param_columns = drivers + stat_tags + season_tag + rg_tag + doy_year_tag
    # X = df.dropna()[param_columns]
    # y = df.dropna()[flux]
    X = df[param_columns]
    y = df[flux]
    # X_train, X_test, y_train, y_test = train_test_split(
    #     X, y, 
    #     test_size=0.33, 
    #     random_state=42
    # )
    X = X.interpolate(method = "pad")
    y = y.interpolate(method = "pad")
    X_train = X.iloc[train_idx, :]
    y_train = y.iloc[train_idx]
    X_test = X.iloc[test_idx, :]
    y_test =y.iloc[test_idx]

    X_apply = df[param_columns].interpolate(method = "pad")
    y_apply = df[flux]#.interpolate(method = "pad")
    #--------------------------------------------------
    # train and test/apply RFR
    regr = filler.train_rfr(X_train, y_train, n_estimators = 100)
    # xgbr = filler.train_xgb(X_train, y_train)
    result_df, r2, rmse = filler.test_rfr(regr, X_test, y_test)
    # result_df, r2, rmse = filler.test_xgb(xgbr, X_test, y_test)
    print(f"{data_path.stem}, {np.round(r2, 4)}, {np.round(rmse, 4)}")
    ## result_df.to_csv("fff.csv")
    # applied_df, r2, rmse = filler.test_rfr(regr, X_apply, y_apply)
    # print(f"apply results=> r2:{np.round(r2, 4)}, rmse: {np.round(rmse, 4)}")
    applied_df = filler.test_rfr(regr, X_apply, y_apply, stat = False)
    # applied_df = filler.test_rfr(xgbr, X_apply, y_apply, stat = False)
    return result_df, applied_df
    # sys.exit(0)
    
# ==============================================================================================

In [29]:
# Random-forest gap filling: load the site data, punch artificial gaps,
# train/score on them and write both the test and the full-record results.
loader = Loader(data_path)
df_raw = loader.load_lowcost(drivers + flux, timestamp_format)
df = df_raw.copy()

# Row positions that hold a NaN in any column; they are removed from both the
# training and the test positions.
nan_idx = np.nonzero(np.isnan(df.to_numpy()))[0]
train_idx, test_idx = (
    np.setdiff1d(idx, nan_idx) for idx in make_gap_pipeline(df, flux)
)

result_df, applied_df = run_filling(df, flux, rg, train_idx, test_idx)

# Put the original row labels back on the returned frames before saving.
result_df.index = df.index[test_idx]
applied_df.index = df.index
result_df.to_csv(test_folder / f"{data_path.stem}_test.csv")
applied_df.to_csv(apply_folder / f"{data_path.stem}_apply.csv")

Sabaju_gapfilled_Ustar_005, 0.5108, 5.8526


In [13]:
# generate artificial gaps for mds
import sys

print(data_path.stem)
loader = Loader(data_path)
df_raw = loader.load_lowcost(drivers + flux, timestamp_format)
df = df_raw.copy()
df_mds = df_raw.copy()

# Row positions that hold a NaN in any column; they are removed from both the
# training and the test positions.
nan_idx = np.where(np.isnan(df.values))[0]
train_idx, test_idx = make_gap_pipeline(df, flux)
train_idx = np.setdiff1d(train_idx, nan_idx)
test_idx = np.setdiff1d(test_idx, nan_idx)

# Keep the measured flux at the artificial-gap rows as the validation reference.
validate = df_raw.iloc[test_idx, :].loc[:, flux[0]]
validate.to_csv(
    flux4mds_validation_folder.joinpath(f"{data_path.stem}.csv")
)

# --------------------------------------------------
# Make the gaps and convert to REddyProc format:
# the artificial-gap rows get the missing-value code -9999.
df_mds.loc[df_mds.index[test_idx], flux[0]] = -9999

df_mds["Year"] = df_mds.index.map(
    lambda x: x.year
)
# Built-in int: the `np.int` alias no longer exists in current NumPy releases.
df_mds["DoY"] = df_mds.index.map(
    lambda x: int(x.strftime('%j'))
)
# Decimal hour of day, e.g. 13:30 -> 13.5.
df_mds["Hour"] = df_mds.index.map(
    lambda x: x.minute / 60 + x.hour
)
df_mds = df_mds.rename(columns = {
    "NEE_f": "NEE",
    "Rg_f": "Rg",
    "Tair_f": "Tair",
    "VPD_f": "VPD"
})
df_mds = df_mds[["Year", "DoY", "Hour", "NEE", "Rg", "Tair", "VPD"]]
df_mds = df_mds.reset_index(drop = True)

# Insert the units line as the first data row: add it under label -1, shift
# every label up by one and sort so that it ends up at position 0.
df_mds.loc[-1] = ["-", "-", "-", "umolm-2s-1", "Wm-2", "degC", "hPa"]
df_mds.index = df_mds.index + 1
df_mds = df_mds.sort_index()

# Remaining NaNs are written with the same missing-value code as the gaps.
df_mds = df_mds.fillna(-9999.)
df_mds.to_csv(
    flux4mds_folder.joinpath(f"{data_path.stem}.txt"),
    index=False,
    sep='\t',
    mode='w'
)

# df_mds.to_csv(flux4mds_folder.joinpath(f"{data_path.stem}.csv"))

# break

Sabaju_gapfilled_Ustar_005
