In [11]:
import warnings
import numpy as np
import pandas as pd
from netCDF4 import Dataset
from pathlib import Path
import xarray as xr 
from datetime import datetime, timedelta
from scipy import stats
from fluxlib import *
from scitbx import *
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

# Output locations: formatted per-site CSVs and the RFR results.
save_folder = Path("data/formatted_griffin_roth")
result_folder = Path("data/rfr_result_griffin_roth")
for _folder in (save_folder, result_folder):
    # create_all_parents comes from a star import; presumably it creates the
    # directory tree if missing -- TODO confirm.
    create_all_parents(_folder)

# Driver columns used as the base feature set in run_filling().
drivers = ["Rg", "Tair", "VPD"]

In [5]:
# create gaps:
def make_gap_pipeline(df, flux):
    """Split row positions into training rows and artificial-gap (test) rows.

    Three gap-making stages are run back to back through ``utils.make_gaps``,
    each stage working on the positions left over by the previous one.

    Parameters
    ----------
    df : mapping / DataFrame
        Only ``df[flux]`` is read; its length defines the number of positions.
    flux : column key (or list of keys)
        Selector passed to ``df[...]``.

    Returns
    -------
    (train_idx, test_idx) : tuple of lists
        ``test_idx`` holds the samples returned by the three stages (in stage
        order); ``train_idx`` holds the positions that were never sampled.
    """
    series = df[flux]
    # Fixed global seed so that the same gaps are drawn on every call.
    np.random.seed(0)
    remaining = np.arange(len(series))

    # Scenario 1: (window size in records, share of the total gap ratio).
    # The 48 factor presumably means 48 half-hourly records per day -- TODO confirm.
    # (A former "scenario 2" -- one stage, window 48 * 200, total ratio 0.5 --
    # is disabled.)
    total_gap_ratio = 0.25
    gap_plan = (
        (48 * 30, 0.5),  # long gaps
        (48 * 7, 0.3),
        (48 * 1, 0.2),
    )

    test_idx = []
    for window_size, share in gap_plan:
        gap_rows, remaining = utils.make_gaps(
            remaining, window_size, share * total_gap_ratio, series
        )
        test_idx.extend(gap_rows)

    return remaining.tolist(), test_idx
# =================================================================================

import sys

def run_filling(df, flux, rg, train_idx, test_idx):
    filler = Filler()
    #-------------------------------------------------
    # set tags:
    df, stat_tags = filler.set_stats(df, flux)
    df, season_tag = filler.set_season_tag(df)
    df, rg_tag = filler.set_rg_tag(df, rg)
    df, doy_year_tag = filler.set_doy_year_tag(df)
    #-------------------------------------------------
    # prepare and split data for RFR
    param_columns = drivers + stat_tags + season_tag + rg_tag + doy_year_tag
    # X = df.dropna()[param_columns]
    # y = df.dropna()[flux]
    X = df[param_columns]
    y = df[flux]
    # X_train, X_test, y_train, y_test = train_test_split(
    #     X, y, 
    #     test_size=0.33, 
    #     random_state=42
    # )
    X = X.interpolate(method = "pad")
    y = y.interpolate(method = "pad")
    X_train = X.iloc[train_idx, :]
    y_train = y.iloc[train_idx]
    X_test = X.iloc[test_idx, :]
    y_test =y.iloc[test_idx]

    X_apply = df[param_columns].interpolate(method = "pad")
    y_apply = df[flux]#.interpolate(method = "pad")
    #--------------------------------------------------
    # train and test/apply RFR
    regr = filler.train_rfr(X_train, y_train, n_estimators = 100)
    result_df, r2, rmse = filler.test_rfr(regr, X_test, y_test)
    print(f"{data_path.stem}, {np.round(r2, 4)}, {np.round(rmse, 4)}")
    ## result_df.to_csv("fff.csv")
    # applied_df, r2, rmse = filler.test_rfr(regr, X_apply, y_apply)
    # print(f"apply results=> r2:{np.round(r2, 4)}, rmse: {np.round(rmse, 4)}")
    applied_df = filler.test_rfr(regr, X_apply, y_apply, stat = False)
    return result_df, applied_df
    # sys.exit(0)
    
# ==============================================================================================

In [77]:
# Formatting data
# ======================================================================
# roth licor:

# roth rg
# NOTE(review): absolute, machine-specific path -- consider a configurable data dir.
rad_path = r"C:\workspace\malaysia\data\roth_griffin\roth_era5_rad.nc"
with xr.open_dataset(rad_path) as ds:
    df_rad = ds.to_dataframe()
    # Index the frame by the 'time' level only.
    df_rad.index = df_rad.index.get_level_values('time')
    # J/m-2 to W/m-2 (division by 3600 presumably assumes hourly accumulation
    # -- TODO confirm). Resample to half-hourly bins and back-fill the empty bins.
    # '30min' is used instead of the deprecated '30T' offset alias.
    df_rad = df_rad[["ssr"]].resample('30min').mean().bfill() / 3600
    # Negative radiation is set to zero.
    df_rad.loc[df_rad["ssr"] < 0, "ssr"] = 0
# with Dataset(rad_path, mode = "r") as ds:
#     pass

# -----------------------------------------------------------
# roth licor data
# Column selection and timestamp layout of the Roth LI-COR (EddyPro) export.
timestamp_format = r"%Y-%m-%d %H:%M:%S"
x_names = ["air_temperature", "VPD"]
y_name = ["co2_flux"]
qc_name = ["qc_co2_flux"]
data_path = Path(r"C:\workspace\malaysia\data\roth_griffin\Roth_licor\Red_EddyPro_2017-2018.csv")

df_raw = pd.read_csv(data_path, index_col=0)
df_raw.columns = df_raw.columns.str.strip()
# Parse the string index into datetimes.
df_raw.index = df_raw.index.map(
    lambda stamp: datetime.strptime(stamp, timestamp_format)
)
# -9999 marks missing values; keep only drivers, flux and its QC flag.
licor_columns = x_names + y_name + qc_name
df_raw = df_raw.replace(-9999, np.nan)[licor_columns]

# Attach radiation: restrict df_rad to the LI-COR timestamps (raises if a
# timestamp is absent from df_rad), then add it as the "Rg" column.
df_rad = df_rad.loc[df_raw.index, :]
df_raw["Rg"] = df_rad["ssr"]

# Harmonise column names with the other sites.
licor_renames = {
    "co2_flux": "NEE",
    "Rg": "Rg",
    "air_temperature": "Tair",
    "VPD": "VPD",
    "qc_co2_flux": "qcNEE"
}
df_raw = df_raw.rename(columns=licor_renames)
df_licor = df_raw.copy()

df_licor.to_csv(save_folder / "Roth_L.csv")
# ======================================================================

# Roth low_cost
timestamp_name = "Date/Time"
timestamp_format = r"%d/%m/%y %H:%M:%S"
paths = [
    r"C:\workspace\malaysia\data\roth_griffin\Roth_lowcost\Roth_N.csv",
    r"C:\workspace\malaysia\data\roth_griffin\Roth_lowcost\Roth_S.csv"
]
for path in paths:
    data_path = Path(path)
    df_raw = pd.read_csv(data_path, index_col = 0)
    df_raw.columns = df_raw.columns.str.strip()
    df_raw.index = df_raw.index.map(
        lambda x: datetime.strptime(x, timestamp_format)
    )
    # -9999 marks missing values; only the flux column "Fc" is used.
    df_raw = df_raw.replace(-9999, np.nan)
    df_raw = df_raw[["Fc"]]

    # Upsample to a 15-minute grid and fill it with a 2nd-order spline.
    # '15min' is used instead of the deprecated '15T' offset alias.
    upsampled = df_raw.resample('15min')
    interpolated = upsampled.interpolate(method='spline', order=2)

    # Keep only timestamps shared with the LI-COR table, whose meteo/QC
    # columns are reused for the low-cost sensor. df_licor is only read here
    # (pd.concat builds a new frame), so no per-iteration copy is needed.
    new_index = interpolated.index.intersection(df_licor.index)
    df_lowcost = pd.concat(
        [interpolated.loc[new_index, :], df_licor.loc[new_index, :]],
        axis = 1
    )
    # Replace the LI-COR NEE with the low-cost flux, then drop the helper column.
    df_lowcost["NEE"] = df_lowcost["Fc"]
    df_lowcost = df_lowcost.drop(["Fc"], axis = 1)
    df_lowcost.to_csv(save_folder.joinpath(f"{data_path.stem}.csv"))
    
# ======================================================================
# Griffin
# x_names = ["s_Rn", "Ta", "VPD_Taw", "WS"]
x_names = ["s_Rn", "Ta", "VPD_Taw"]
y_name = ["Fc"]
rg_name = ["s_Rn"]
qc_name = ["qcNEE"]
data_path = Path(r"C:\workspace\malaysia\data\roth_griffin\griffin.csv")
timestamp_name = "Timestamp"
timestamp_format = r"%d/%m/%Y %H:%M"

df_raw = pd.read_csv(data_path, index_col = 0)
df_raw.columns = df_raw.columns.str.strip()
df_raw.index = df_raw.index.map(
    lambda x: datetime.strptime(x, timestamp_format)
)

# Remove repeated records. A record counts as a repeat only when BOTH its
# timestamp and all of its values match an earlier row. A plain
# `drop_duplicates()` ignores the index, so it would also discard distinct
# timestamps that happen to carry identical readings (they would come back as
# NaN after resampling).
is_repeat = df_raw.reset_index().duplicated().to_numpy()
df_raw = df_raw.loc[~is_repeat]

# Regularise to half-hourly bins; rows sharing a bin are averaged.
# '30min' is used instead of the deprecated '30T' offset alias.
df_raw = df_raw.resample('30min').mean()

# NOTE(review): unlike the Roth files, -9999 is not converted to NaN here.
df_raw = df_raw[x_names + y_name + qc_name]

# Harmonise column names with the other sites.
# NOTE(review): "s_Rn" is exported as "Rg" -- confirm this is the intended proxy.
df_raw = df_raw.rename(columns = {
    "Fc": "NEE",
    "s_Rn": "Rg",
    "Ta": "Tair",
    "VPD_Taw": "VPD",
    "qcNEE": "qcNEE"
})
df_griffin = df_raw
df_griffin.to_csv(save_folder.joinpath("griffin.csv"))

In [83]:
# fill meteo

tsf = r"%Y-%m-%d %H:%M:%S"
# Sorted so that the listing is deterministic across platforms.
paths = sorted(save_folder.glob(r"*.csv"))

# The result is always written to griffin_Rg.csv, so read griffin.csv
# explicitly instead of taking whichever file the glob happens to list first.
griffin_csv = save_folder.joinpath("griffin.csv")
if griffin_csv.exists():
    print(griffin_csv.stem)
    df = pd.read_csv(griffin_csv, index_col = 0)
    df.index = df.index.map(
        lambda x: datetime.strptime(x, tsf)
    )
    # Keep records from 1997-01-23 onwards.
    df = df.loc["1997-01-23":]
    # AuxFiller comes from a star import; it is handed the Rg column only and
    # is driven through create_dataset() -> train() -> apply().
    auxfiller = AuxFiller(df[["Rg"]])
    auxfiller.create_dataset()
    auxfiller.train()
    met_var = auxfiller.apply()
    met_var.to_csv(save_folder.joinpath("griffin_Rg.csv"))
else:
    print(f"{griffin_csv} not found; skipping Rg filling")

griffin
cannot set using a slice indexer with a different length than the value
cannot set using a slice indexer with a different length than the value
cannot set using a slice indexer with a different length than the value


In [89]:
# Load the gap-filling configuration and create the output folders
# (including the ones holding the inputs prepared for MDS).

cfg = Yaml("rfr_gapfill_griffin_roth_cfg.yaml").load()

# Column selectors and timestamp layout of the source files.
drivers = cfg["drivers"]
flux = cfg["flux"]
rg = cfg["rg"]
qc = cfg["qc"]
timestamp_format = cfg["timestamp_format"]

# Input files: every CSV directly inside the configured source folder.
paths = list(Path(cfg["source"]).glob(r"*.csv"))

# Destination folders, one per kind of output.
destinations = cfg["destination"]
test_folder = Path(destinations["test"])
apply_folder = Path(destinations["apply"])
flux4mds_folder = Path(destinations["flux4mds"])
flux4mds_validation_folder = Path(destinations["flux4mds_validation"])
for _folder in (
    test_folder,
    apply_folder,
    flux4mds_folder,
    flux4mds_validation_folder,
):
    create_all_parents(_folder)

In [93]:
# Run the RFR gap-filling for every source file.
# The loop variable must stay named `data_path`: run_filling() reads it as a
# global when printing the scores.
for count, data_path in enumerate(paths):
    loader = Loader(data_path)
    df_raw = loader.load_format(drivers + flux + qc, timestamp_format)
    df = df_raw.copy()

    flux_col = flux[0]
    qc_col = qc[0]
    # Discard flux values whose QC flag is not 0 ...
    df.loc[df[qc_col] != 0, flux_col] = np.nan
    # ... and those outside the [-100, 100] range.
    out_of_range = (df[flux_col] > 100) | (df[flux_col] < -100)
    df.loc[out_of_range, flux_col] = np.nan

    # Row positions containing at least one NaN are excluded from both the
    # training and the test sets.
    nan_idx = np.where(np.isnan(df.values))[0]
    train_idx, test_idx = (
        np.setdiff1d(idx, nan_idx) for idx in make_gap_pipeline(df, flux)
    )

    try:
        result_df, applied_df = run_filling(df, flux, rg, train_idx, test_idx)
        result_df.index = df.index[test_idx]
        applied_df.index = df.index
        result_df.to_csv(test_folder.joinpath(f"{data_path.stem}_test.csv"))
        applied_df.to_csv(apply_folder.joinpath(f"{data_path.stem}_apply.csv"))
    except Exception as e:
        # Best effort: report the failing file's position and move on.
        print(count)
        print(e)

griffin, 0.676, 3.8658
Roth_L, 0.8072, 3.3104
Roth_N, 0.2547, 10.7894
Roth_S, 0.1014, 10.4688


In [74]:
# generate artificial gaps for mds
import sys

timestamp_format = r"%d/%m/%Y %H:%M"

for count, data_path in enumerate(paths):
    print(data_path.stem)
    loader = Loader(data_path)
    df_raw = loader.load_format(drivers + flux + qc, timestamp_format)
    df = df_raw.copy()
    df_mds = df_raw.copy()
    # Flux values whose QC flag is not 0 are treated as missing.
    df.loc[df[qc[0]] != 0, flux[0]] = np.nan
    # Row positions containing at least one NaN are excluded from both sets.
    nan_idx = np.where(np.isnan(df.values))[0]
    train_idx, test_idx = make_gap_pipeline(df, flux)
    train_idx = np.setdiff1d(train_idx, nan_idx)
    test_idx = np.setdiff1d(test_idx, nan_idx)
    # create dfs for validation and mds
    try:
        # Original flux values at the artificial-gap rows, kept for validation.
        validate = df_raw.iloc[test_idx, :].loc[:, flux[0]]
        validate.to_csv(
            flux4mds_validation_folder.joinpath(f"{data_path.stem}.csv")
        )
        # --------------------------------------------------
        # make gaps and convert to REddyProc format
        df_mds.loc[df_mds[qc[0]] != 0, flux[0]] = -9999
        df_mds.loc[df_mds.index[test_idx], flux[0]] = -9999

        # Time columns taken straight from the DatetimeIndex. The former
        # per-row lambda used `np.int`, which was removed in NumPy 1.24; the
        # resulting error was swallowed below and no MDS file was written.
        df_mds["Year"] = df_mds.index.year
        df_mds["DoY"] = df_mds.index.dayofyear
        df_mds["Hour"] = df_mds.index.minute / 60 + df_mds.index.hour

        df_mds = df_mds[["Year", "DoY", "Hour", "NEE", "Rg", "Tair", "VPD"]]
        df_mds = df_mds.reset_index(drop = True)
        # Insert the units row right below the header: add it at label -1,
        # shift every label by one, then sort so that it comes first.
        df_mds.loc[-1] = ["-", "-", "-", "umolm-2s-1", "Wm-2", "degC", "hPa"]
        df_mds.index = df_mds.index + 1  # shifting index
        df_mds = df_mds.sort_index()  # sorting by index
        df_mds.to_csv(
            flux4mds_folder.joinpath(f"{data_path.stem}.txt"),
            index=False,
            sep='\t',
            mode='w'
        )
    except Exception as e:
        # Best effort: report the failing file's position and move on.
        print(count)
        print(e)
        continue

    # NOTE(review): stops after the first successfully processed file; this
    # looks like a debugging leftover -- confirm before removing.
    break

griffin


In [94]:
# # check duplicates and non half-hour intervals
# df_raw = df_raw_
# # print(df_raw)
# df_raw = df_raw.resample('30T').mean()
# print(df_raw)
# df_raw[drivers] = df_raw[drivers].interpolate(method = "linear")
# print(df_raw[drivers].interpolate(method = "linear"))
# print(df_raw)

# dif = df_raw.index[1] - df_raw.index[0]
# for count, dt in enumerate(df_raw.index.values):
#     if count <= 1: continue
#     # print(count, dt)
#     if not ((dt - df_raw.index[count - 1]) == dif):
#         print(dt, df_raw.index[count - 1])
# #     if count >= 100: break