In [16]:
import numpy as np
import pandas as pd
from fluxlib import *
from scitbx import *
from pathlib import Path
from sklearn.model_selection import train_test_split

In [2]:
# Load the FLUXNET gap-filling configuration.
cfg = Yaml("rfr_gapfill_fluxnet_cfg.yaml").load()

# Column selections and timestamp format, read straight from the config.
drivers = cfg["drivers"]  # concatenated with `flux` when loading a site
flux = cfg["flux"]  # flux[0] is the column that gets masked / gap-filled
rg = cfg["rg"]  # handed to Filler.set_rg_tag
qc = cfg["qc"]  # qc[0] is the QC column used to mask flux[0]
timestamp_format = cfg["timestamp_format"]
# Path.glob yields files in arbitrary, filesystem-dependent order; sort so the
# processing order (and the `count` printed on failures) is reproducible.
paths = sorted(Path(cfg["source"]).glob(r"*.csv"))

# Output folders, one per product.
# NOTE(review): create_all_parents presumably creates the directory tree for
# the given path -- confirm against scitbx.
# -------------------------------------------------
test_folder = Path(cfg["destination"]["test"])
create_all_parents(test_folder)
# ------------------------------
apply_folder = Path(cfg["destination"]["apply"])
create_all_parents(apply_folder)
# ------------------------------
flux4mds_folder = Path(cfg["destination"]["flux4mds"])
create_all_parents(flux4mds_folder)
# ------------------------------
flux4mds_validation_folder = Path(cfg["destination"]["flux4mds_validation"])
create_all_parents(flux4mds_validation_folder)

In [24]:
# create gaps:
def make_gap_pipeline(df, flux):
    """Carve artificial long, medium and short gaps out of ``df[flux]``.

    The global NumPy seed is reset to 0 on every call. Starting from all
    positional indices of the series, ``utils.make_gaps`` is called three
    times (window sizes 48*30, 48*7 and 48*1), each call receiving the
    pointers left over by the previous one.

    Returns
    -------
    train_idx : list
        Pointers that remain after all three passes.
    test_idx : list
        Sampled gap indices: long gaps first, then medium, then short.
    tags : list
        One float per entry of ``test_idx``: 2 for long, 1 for medium and
        0 for short gaps.
    """
    series = df[flux]
    np.random.seed(0)
    remaining = np.arange(len(series))

    # Scenario 1: a total gap ratio of 0.25, split over three gap lengths.
    total_gap_ratio = 0.25
    # (window size, share of the total gap ratio, gap tag)
    scenarios = (
        (48 * 30, 0.5, 2),  # long gaps
        (48 * 7, 0.3, 1),   # medium gaps
        (48 * 1, 0.2, 0),   # short gaps
    )
    # A second scenario (one 48*200 window at a total gap ratio of 0.5) used
    # to be sketched here; it is disabled.

    test_idx = []
    tags = []
    for window_size, share, tag in scenarios:
        gap_idx, remaining = utils.make_gaps(
            remaining, window_size, share * total_gap_ratio, series
        )
        test_idx.extend(gap_idx)
        tags.extend(list(np.ones(len(gap_idx)) * tag))

    return remaining.tolist(), test_idx, tags
# =================================================================================

import sys

def run_filling(df, flux, rg, train_idx, test_idx, label=None):
    """Train a random-forest regressor on the training rows and fill ``flux``.

    Parameters
    ----------
    df : pandas.DataFrame
        Site data holding the driver columns and the ``flux`` column(s).
    flux
        Column label(s) of the target, used as ``df[flux]``.
    rg
        Passed through to ``Filler.set_rg_tag``.
    train_idx, test_idx
        Positional row indices (used with ``.iloc``) for training and testing.
    label : str, optional
        Text printed in front of the test scores. When omitted, the stem of
        the notebook-level ``data_path`` is used if that variable exists
        (the previous behaviour), otherwise ``"unknown"``.

    Returns
    -------
    result_df
        First value returned by ``Filler.test_rfr`` for the test rows.
    applied_df
        Value returned by ``Filler.test_rfr(..., stat=False)`` for all rows.

    Notes
    -----
    Reads the notebook-level ``drivers`` list to build the predictor set.
    """
    filler = Filler()
    # -------------------------------------------------
    # Derived predictors: each setter returns the frame plus what are
    # presumably the names of the columns it added (they are concatenated
    # with `drivers` and used as a column selection below).
    df, stat_tags = filler.set_stats(df, flux)
    df, season_tag = filler.set_season_tag(df)
    df, rg_tag = filler.set_rg_tag(df, rg)
    df, doy_year_tag = filler.set_doy_year_tag(df)
    # -------------------------------------------------
    # Prepare and split the data for the regressor.
    param_columns = drivers + stat_tags + season_tag + rg_tag + doy_year_tag
    # Forward-fill missing values. `.ffill()` replaces the deprecated
    # `.interpolate(method="pad")` and fills the same way.
    X = df[param_columns].ffill()
    y = df[flux].ffill()
    X_train, y_train = X.iloc[train_idx, :], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx, :], y.iloc[test_idx]
    # --------------------------------------------------
    # Train, then score on the artificial-gap rows.
    regr = filler.train_rfr(X_train, y_train, n_estimators=100)
    result_df, r2, rmse = filler.test_rfr(regr, X_test, y_test)
    if label is None:
        # Backward compatibility: callers used to rely on the loop variable
        # `data_path` being visible as a notebook global.
        site_path = globals().get("data_path")
        label = site_path.stem if site_path is not None else "unknown"
    print(f"{label}, {np.round(r2, 4)}, {np.round(rmse, 4)}")
    # Apply the model to every row: forward-filled predictors, with the
    # un-filled flux passed alongside.
    applied_df = filler.test_rfr(regr, X, df[flux], stat=False)
    return result_df, applied_df
    
# ==============================================================================================

In [45]:
# Run the random-forest gap filling for every site file.
for count, data_path in enumerate(paths):
    loader = Loader(data_path)
    df_raw = loader.load_fluxnet(drivers + flux, timestamp_format)
    df = df_raw.copy()
    # Blank out the target flux wherever its QC flag is non-zero.
    df.loc[df[qc[0]] != 0, flux[0]] = np.nan
    # Artificial gaps: train_idx / test_idx are positional row indices and
    # `tags` marks each test index as 0 (small), 1 (medium) or 2 (long gap).
    train_idx, test_idx, tags = make_gap_pipeline(df, flux)
    # NOTE(review): rows containing NaN are not removed from train_idx /
    # test_idx here (an earlier, disabled variant did so with np.setdiff1d and
    # also wrote a per-site "<stem>_test_tag.csv" of gap tags) -- confirm that
    # this is intended.
    # fill gaps:
    try:
        result_df, applied_df = run_filling(df, flux, rg, train_idx, test_idx)
        # Re-attach the original row index to both outputs before saving.
        result_df.index = df.index[test_idx]
        applied_df.index = df.index
        result_df.to_csv(test_folder.joinpath(f"{data_path.stem}_test.csv"))
        applied_df.to_csv(apply_folder.joinpath(f"{data_path.stem}_apply.csv"))
    except Exception as e:
        # Best effort: report which site failed and move on to the next one.
        print(count, data_path.stem)
        print(e)

AR-SLu_MF_LAT_-33.4648_LON_-66.4598
AR-Vir_ENF_LAT_-28.2395_LON_-56.1886
AT-Neu_GRA_LAT_47.11667_LON_11.3175
AU-Ade_WSA_LAT_-13.0769_LON_131.1178
AU-ASM_ENF_LAT_-22.2830_LON_133.2490
AU-Cpr_SAV_LAT_-34.0021_LON_140.5891
AU-Cum_EBF_LAT_-33.61518_LON_150.72362
AU-DaP_GRA_LAT_-14.0633_LON_131.3181
AU-DaS_SAV_LAT_-14.1593_LON_131.3881
AU-Dry_SAV_LAT_-15.2588_LON_132.3706
AU-Emr_GRA_LAT_-23.8587_LON_148.4746
AU-Fog_WET_LAT_-12.5452_LON_131.3072
AU-Gin_WSA_LAT_-31.3764_LON_115.7138
AU-GWW_SAV_LAT_-30.1913_LON_120.6541
AU-How_WSA_LAT_-12.4943_LON_131.1523
AU-Lox_DBF_LAT_-34.4704_LON_140.6551
AU-RDF_WSA_LAT_-14.5636_LON_132.4776
AU-Rig_GRA_LAT_-36.6499_LON_145.5759
AU-Rob_EBF_LAT_-17.1175_LON_145.6301
AU-Stp_GRA_LAT_-17.1507_LON_133.3502
AU-TTE_OSH_LAT_-22.2870_LON_133.6400
AU-Tum_EBF_LAT_-35.6566_LON_148.1517
AU-Wac_EBF_LAT_-37.4259_LON_145.1878
AU-Whr_EBF_LAT_-36.6732_LON_145.0294
AU-Wom_EBF_LAT_-37.4222_LON_144.0944
AU-Ync_GRA_LAT_-34.9893_LON_146.2907
BE-Bra_MF_LAT_51.30761_LON_4.51984
BE-

In [53]:
# generate artificial gaps for mds
import sys
for count, data_path in enumerate(paths):
    print(data_path.stem)
    loader = Loader(data_path)
    df_raw = loader.load_fluxnet(drivers + flux, timestamp_format)
    df = df_raw.copy()
    df_mds = df_raw.copy()
    # Blank out the target flux wherever its QC flag is non-zero.
    df.loc[df[qc[0]] != 0, flux[0]] = np.nan
    # Row positions that hold at least one NaN (may contain duplicates).
    nan_idx = np.where(np.isnan(df.values))[0]
    # make_gap_pipeline also returns gap tags, which are not needed here;
    # star-unpacking tolerates the extra return value instead of raising
    # "too many values to unpack".
    train_idx, test_idx, *_ = make_gap_pipeline(df, flux)
    train_idx = np.setdiff1d(train_idx, nan_idx)
    test_idx = np.setdiff1d(test_idx, nan_idx)
    # create dfs for validation and mds
    try:
        # Original flux values at the artificial-gap rows, kept for validation.
        validate = df_raw.iloc[test_idx, :].loc[:, flux[0]]
        validate.to_csv(
            flux4mds_validation_folder.joinpath(f"{data_path.stem}.csv")
        )
        # --------------------------------------------------
        # make gaps and convert to REddyProc format
        # -9999 is written both where the QC flag is non-zero and at the
        # artificial-gap rows.
        df_mds.loc[df_mds[qc[0]] != 0, flux[0]] = -9999
        df_mds.loc[df_mds.index[test_idx], flux[0]] = -9999

        df_mds["Year"] = df_mds.index.map(
            lambda x: x.year
        )
        # Built-in int: the `np.int` alias was removed in NumPy 1.24.
        df_mds["DoY"] = df_mds.index.map(
            lambda x: int(x.strftime('%j'))
        )
        # Fractional hour of day, e.g. 0.5 for 00:30.
        df_mds["Hour"] = df_mds.index.map(
            lambda x: x.minute / 60 + x.hour
        )
        df_mds = df_mds.rename(columns = {
            "NEE_VUT_REF": "NEE",
            "SW_IN_ERA": "Rg",
            "TA_ERA": "Tair",
            "VPD_ERA": "VPD"
        })
        df_mds = df_mds[["Year", "DoY", "Hour", "NEE", "Rg", "Tair", "VPD"]]
        df_mds = df_mds.reset_index(drop = True)
        # Prepend the units row: insert it at label -1, shift all labels by
        # one and sort so it becomes the first data row.
        df_mds.loc[-1] = ["-", "-", "-", "umolm-2s-1", "Wm-2", "degC", "hPa"]
        df_mds.index = df_mds.index + 1  # shifting index
        df_mds = df_mds.sort_index()  # sorting by index
        df_mds.to_csv(
            flux4mds_folder.joinpath(f"{data_path.stem}.txt"),
            index=False,
            sep='\t',
            mode='w'
        )
    except Exception as e:
        # Best effort: report the failing site index and move on.
        print(count)
        print(e)

    # break

In [None]:
# # Deprecated method: evenly cut range(0, length) into n_point parts and randomly select a gap window from each part.
# import pandas as pd
# np.random.seed(0)

# var_name = ["NEE_VUT_REF"]
# length = len(df)
# window = 48 * 30
# gap_ratio = 0.25
# n_point = np.int(length * gap_ratio / amonth)
# iter_theta = 5
# valid_theta = 0.5

# art_gap_idx = []
# step = np.int(length / n_point)
# anchors = np.arange(0, length, step = step).tolist()#.append(length - 1)

# for anc in anchors:
#     pos_candidates = np.random.choice(range(anc, anc + step - window), iter_theta)
#     for pos_count, pos in enumerate(pos_candidates):
#         tmp_series = df[var_name].iloc[pos: pos + window]
#         if len(tmp_series.dropna()) / len(tmp_series) > valid_theta:
#             break
#         else:
#             print(pos_count)
#     art_gap_idx.extend(np.arange(pos, pos + window).tolist())
        
#     # break
# diff = lambda list1, list2: [x for x in list1 if x not in list2]
# train_list = diff(np.arange(length).tolist(), art_gap_idx)
# # train_list