In [14]:
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime
from scipy import stats
from scitbx import *
from sklearn.metrics import mean_squared_error

In [2]:
# for fluxnet
# For every MDS output file: join the MDS gap-filled NEE ("NEE_f") with the RFR
# "truth"/"estimates" columns of the same site, save the joined table, then
# score both methods (R^2 and RMSE against "truth") on the validation timestamps.
timestamp_format = "%Y-%m-%d %H:%M:%S"

out_folder = r"C:\workspace\repositories\fluxlib\data\comparison"
out_full_folder = r"C:\workspace\repositories\fluxlib\data\comparison_full"
# DataFrame.to_csv does not create missing directories, so make sure both exist.
for folder in (out_folder, out_full_folder):
    Path(folder).mkdir(parents=True, exist_ok=True)

mds_flder = r"C:\workspace\repositories\fluxlib\data\mds_out"
flx4mds_flder = r"C:\workspace\repositories\fluxlib\data\fluxnet4mds_csv"
vld_flder = r"C:\workspace\repositories\fluxlib\data\fluxnet4mds_csv_validate"
apl_flder = r"C:\workspace\repositories\fluxlib\data\fluxnet4rfr_apply"

# Sorted so the processing order is reproducible from run to run.
mds_paths = sorted(Path(mds_flder).glob(r"*.txt"))
x_paths = sorted(Path(flx4mds_flder).glob(r"*.csv"))
x_names = [p.stem.split("_")[0] for p in x_paths]

vld_paths = sorted(Path(vld_flder).glob(r"*.csv"))
apl_paths = sorted(Path(apl_flder).glob(r"*.csv"))

# Pair files across folders by site name (the file-stem part before the first
# "_") instead of by list position, so an extra or missing file in one folder
# cannot shift the pairing. Iterating in reverse keeps the FIRST path per site.
x_by_site = {p.stem.split("_")[0]: p for p in reversed(x_paths)}
vld_by_site = {p.stem.split("_")[0]: p for p in reversed(vld_paths)}
apl_by_site = {p.stem.split("_")[0]: p for p in reversed(apl_paths)}

for pm in mds_paths:
    name = pm.stem.split("_")[0]
    missing = [
        label
        for label, by_site in (
            (flx4mds_flder, x_by_site),
            (vld_flder, vld_by_site),
            (apl_flder, apl_by_site),
        )
        if name not in by_site
    ]
    if missing:
        raise FileNotFoundError(f"{name}: no matching csv in {', '.join(missing)}")

    # The MDS input csv supplies the timestamp index shared by all tables.
    df_x = pd.read_csv(x_by_site[name], index_col=0)
    df_x.index = pd.to_datetime(df_x.index.astype(str), format=timestamp_format)

    # skiprows=[1] drops the second line of the MDS file
    # (presumably a units row -- TODO confirm). Rows are aligned to df_x by
    # position, so both files must have the same number of records.
    mds = pd.read_csv(pm, delimiter="\t", skiprows=[1]).set_index(df_x.index)[["NEE_f"]]
    mds = mds.replace(-9999, np.nan)  # -9999 is treated as "missing"

    rfr = pd.read_csv(apl_by_site[name]).set_index(df_x.index)[["truth", "estimates"]]
    res = pd.concat([rfr, mds], axis=1)
    # stem[0: -4] strips the last four characters of the MDS file stem.
    res.to_csv(Path(out_full_folder).joinpath(f"{pm.stem[0: -4]}.csv"))

    vld = pd.read_csv(vld_by_site[name], index_col=0)
    vld.index = pd.to_datetime(vld.index.astype(str), format=timestamp_format)

    # Restrict to the validation timestamps; .loc raises KeyError if any of
    # them is absent from the joined table.
    extract_vld = res.loc[vld.index]
    extract_vld.to_csv(Path(out_folder).joinpath(f"{pm.stem[0: -4]}.csv"))

    # Score only rows where truth, RFR estimate and MDS estimate are all present.
    paired = extract_vld.dropna()
    if len(paired) < 2:
        # linregress cannot fit fewer than two points; report and move on
        # rather than aborting the remaining sites.
        print(f"{name}, skipped: fewer than 2 complete validation rows")
        continue
    x = paired["truth"]
    y1 = paired["estimates"]
    y2 = paired["NEE_f"]
    # --- RFR vs. truth -------------------------
    _, _, r_value, _, _ = stats.linregress(x, y1)
    r2_1 = r_value**2
    mse = mean_squared_error(x, y1)
    rmse_1 = np.sqrt(mse)
    # --- MDS vs. truth -------------------------
    _, _, r_value, _, _ = stats.linregress(x, y2)
    r2_2 = r_value**2
    mse = mean_squared_error(x, y2)
    rmse_2 = np.sqrt(mse)

    print(f"{name}, RFR: {np.round(r2_1, 2)}, {np.round(rmse_1, 2)}; MDS: {np.round(r2_2, 2)}, {np.round(rmse_2, 2)}")

In [21]:
# for malaysia
# Same comparison as the FLUXNET cell, for the single Malaysia synthetic
# scenario: join MDS "NEE_f" with the RFR "truth"/"estimates", save the joined
# table, then score both methods on the validation timestamps.
timestamp_format = "%Y-%m-%d %H:%M:%S"

# create_all_parents comes from the `scitbx` star import; presumably it creates
# the directory tree for the given path -- TODO confirm.
out_folder = r"C:\workspace\repositories\fluxlib\data\malaysia_synthetic_scenario\comparison"
create_all_parents(out_folder)
out_full_folder = r"C:\workspace\repositories\fluxlib\data\malaysia_synthetic_scenario\comparison_full"
create_all_parents(out_full_folder)

mds_flder = r"C:\workspace\repositories\fluxlib\data\malaysia_synthetic_scenario\mds_out"
# flx4mds_flder = r"C:\workspace\repositories\fluxlib\data\malaysia_synthetic_scenario\fluxnet4mds_csv"
vld_flder = r"C:\workspace\repositories\fluxlib\data\malaysia_synthetic_scenario\malaysia4mds_csv_validate"
apl_flder = r"C:\workspace\repositories\fluxlib\data\malaysia_synthetic_scenario\malaysia4rfr_apply"

# Only the first match of each glob is used; each folder is assumed to hold a
# single file for this scenario -- TODO confirm.
mds_path = list(Path(mds_flder).glob(r"*.txt"))[0]

vld_path = list(Path(vld_flder).glob(r"*.csv"))[0]
apl_path = list(Path(apl_flder).glob(r"*.csv"))[0]

# Derive the label and output file stem from this cell's own MDS file. The
# previous code read `name` and `pm` left over from the FLUXNET loop, which
# fails on a fresh kernel and otherwise labels/names the output after
# whatever site that loop processed last.
name = mds_path.stem.split("_")[0]
out_stem = mds_path.stem[0: -4]  # same "drop last four characters" rule as the FLUXNET cell

# The RFR apply csv supplies the timestamp index shared by all tables.
df_app = pd.read_csv(apl_path, index_col=0)
df_app.index = pd.to_datetime(df_app.index.astype(str), format=timestamp_format)

# skiprows=[1] drops the second line of the MDS file
# (presumably a units row -- TODO confirm). Rows are aligned to df_app by
# position, so both files must have the same number of records.
mds = pd.read_csv(mds_path, delimiter="\t", skiprows=[1]).set_index(df_app.index)[["NEE_f"]]
mds = mds.replace(-9999, np.nan)  # -9999 is treated as "missing"

rfr = df_app[["truth", "estimates"]]
res = pd.concat([rfr, mds], axis=1)
res.to_csv(Path(out_full_folder).joinpath(f"{out_stem}.csv"))

vld = pd.read_csv(vld_path, index_col=0)
vld.index = pd.to_datetime(vld.index.astype(str), format=timestamp_format)

# Restrict to the validation timestamps; .loc raises KeyError if any of them
# is absent from the joined table.
extract_vld = res.loc[vld.index]
extract_vld.to_csv(Path(out_folder).joinpath(f"{out_stem}.csv"))

# Score only rows where truth, RFR estimate and MDS estimate are all present.
paired = extract_vld.dropna()
if len(paired) < 2:
    raise ValueError(f"{name}: fewer than 2 complete validation rows, cannot score")
x = paired["truth"]
y1 = paired["estimates"]
y2 = paired["NEE_f"]
# --- RFR vs. truth -------------------------
_, _, r_value, _, _ = stats.linregress(x, y1)
r2_1 = r_value**2
mse = mean_squared_error(x, y1)
rmse_1 = np.sqrt(mse)
# --- MDS vs. truth -------------------------
_, _, r_value, _, _ = stats.linregress(x, y2)
r2_2 = r_value**2
mse = mean_squared_error(x, y2)
rmse_2 = np.sqrt(mse)

print(f"{name}, RFR: {np.round(r2_1, 2)}, {np.round(rmse_1, 2)}; MDS: {np.round(r2_2, 2)}, {np.round(rmse_2, 2)}")

Sebungan, RFR: 0.78, 6.69; MDS: 0.75, 7.12
