In [1]:
import numpy as np
from fluxlib import *
from scitbx import *
from pathlib import Path
from sklearn.model_selection import train_test_split

In [3]:
# load lowcost config for gapfilling
cfg = Yaml("rfr_gapfill_malaysia_cfg.yaml").load()
drivers = cfg["drivers"]
flux = cfg["flux"]
rg = cfg["rg"]
qc = cfg["qc"]
timestamp_format = cfg["timestamp_format"]
data_path = Path(cfg["source"])
# -------------------------------------------------
test_folder = Path(cfg["destination"]["test"])
create_all_parents(test_folder)
# ------------------------------
apply_folder = Path(cfg["destination"]["apply"])
create_all_parents(apply_folder)
# ------------------------------
flux4mds_folder = Path(cfg["destination"]["flux4mds"])
create_all_parents(flux4mds_folder)
# ------------------------------
flux4mds_validation_folder = Path(cfg["destination"]["flux4mds_validation"])
create_all_parents(flux4mds_validation_folder)

In [6]:
# create gaps:
def make_gap_pipeline(df, flux):
    series = df[flux]
    np.random.seed(0)
    pointers = np.arange(len(series))
    samples = []

    # scenario 1:
    tgr = 0.25 # total_gap_ratio
    window_size = 48 * 30 # long gaps
    p = 0.5 * tgr
    # print(pointers.shape[0])
    samples_lg, pointers = utils.make_gaps(pointers, window_size, p, series)
    # print(pointers.shape[0] + len(samples))

    window_size = 48 * 7
    p = 0.3 * tgr
    samples_mg, pointers = utils.make_gaps(pointers, window_size, p, series)

    window_size = 48 * 1
    p = 0.2 * tgr
    samples_sg, pointers = utils.make_gaps(pointers, window_size, p, series)

    samples.extend(samples_lg)
    samples.extend(samples_mg)
    samples.extend(samples_sg)

#     # scenario 2:
#     tgr = 0.5 # total_gap_ratio
#     window_size = 48 * 200
#     p = 1 * tgr
#     samples_sl, pointers = utils.make_gaps(pointers, window_size, p, series)
#     samples.extend(samples_sl)

    # print(len(samples) / len(series))
    test_idx = samples
    train_idx = pointers.tolist()
    # pointers.shape[0] + len(samples), len(series)
    return train_idx, test_idx
# =================================================================================

import sys

def run_filling(df, flux, rg, train_idx, test_idx):
    filler = Filler()
    #-------------------------------------------------
    # set tags:
    df, stat_tags = filler.set_stats(df, flux)
    df, season_tag = filler.set_season_tag(df)
    df, rg_tag = filler.set_rg_tag(df, rg)
    df, doy_year_tag = filler.set_doy_year_tag(df)
    #-------------------------------------------------
    # prepare and split data for RFR
    param_columns = drivers + stat_tags + season_tag + rg_tag + doy_year_tag
    # X = df.dropna()[param_columns]
    # y = df.dropna()[flux]
    X = df[param_columns]
    y = df[flux]
    # X_train, X_test, y_train, y_test = train_test_split(
    #     X, y, 
    #     test_size=0.33, 
    #     random_state=42
    # )
    X = X.interpolate(method = "pad")
    y = y.interpolate(method = "pad")
    X_train = X.iloc[train_idx, :]
    y_train = y.iloc[train_idx]
    X_test = X.iloc[test_idx, :]
    y_test =y.iloc[test_idx]

    X_apply = df[param_columns].interpolate(method = "pad")
    y_apply = df[flux]#.interpolate(method = "pad")
    #--------------------------------------------------
    # train and test/apply RFR
    regr = filler.train_rfr(X_train, y_train, n_estimators = 100)
    # xgbr = filler.train_xgb(X_train, y_train)
    result_df, r2, rmse = filler.test_rfr(regr, X_test, y_test)
    # result_df, r2, rmse = filler.test_xgb(xgbr, X_test, y_test)
    print(f"{data_path.stem}, {np.round(r2, 4)}, {np.round(rmse, 4)}")
    ## result_df.to_csv("fff.csv")
    # applied_df, r2, rmse = filler.test_rfr(regr, X_apply, y_apply)
    # print(f"apply results=> r2:{np.round(r2, 4)}, rmse: {np.round(rmse, 4)}")
    applied_df = filler.test_rfr(regr, X_apply, y_apply, stat = False)
    # applied_df = filler.test_rfr(xgbr, X_apply, y_apply, stat = False)
    return result_df, applied_df
    # sys.exit(0)
    
# ==============================================================================================

In [7]:
from torch import nn
import torch
import torch.nn.functional as F
from skorch import NeuralNetClassifier, NeuralNetRegressor


class LSTM(nn.Module):
    def __init__(self, input_size = 1, hidden_size = 4, output_size = 1):
        super().__init__()
        self.lstm = torch.nn.LSTM(
            input_size = input_size,
            hidden_size = hidden_size,
            num_layers = 3,
            batch_first = True,
            dropout = 0.2
        )
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 64),
            nn.Linear(64, output_size)
        )
        # self.out = torch.nn.Linear(in_features=hidden_size, out_features=1)


    def forward(self, x):
        # 一下关于shape的注释只针对单项
        # output: [batch_size, time_step, hidden_size]
        # h_n: [num_layers,batch_size, hidden_size] # 虽然LSTM的batch_first为True,但是h_n/c_n的第一维还是num_layers
        # c_n: 同h_n
        output, (h_n, c_n) = self.lstm(x)
        # print(output.size())
        # output_in_last_timestep=output[:,-1,:] # 也是可以的
        output_in_last_timestep = h_n[-1, :, :]
        # print(output_in_last_timestep.equal(output[:,-1,:])) #ture
        # x = self.out(output_in_last_timestep)
        x = self.fc(output_in_last_timestep)
        return x

net = NeuralNetRegressor(
    LSTM,
    max_epochs=10,
    lr=0.1,
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
    batch_size = 512,
)

In [8]:
# ==========================================
# UNDER DEVELOPMENT!!
# improve method -> more general
#===========================================
import numpy as np
import pandas as pd
import xgboost as xgb
from scipy import stats
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error


loader = Loader(data_path)
df_raw = loader.load_lowcost(drivers + flux, timestamp_format)
df = df_raw.copy()
nan_idx = np.where(np.isnan(df.values))[0]
train_idx, test_idx = make_gap_pipeline(df, flux)
train_idx = np.setdiff1d(train_idx, nan_idx)
test_idx = np.setdiff1d(test_idx, nan_idx)

df["hour_dif"] = (df.index - df.index[0]).total_seconds() / 3600
lbl = ["hour_dif"]

X = df[drivers + lbl]
y = df[flux]

# -------------------
# normalize:
for col in range(X.shape[1]):
    tmp = X.iloc[:, col]
    X.iloc[:, col] = (tmp - np.min(tmp)) / (np.max(tmp) - np.min(tmp))
y = (y - np.min(y)) / (np.max(y) - np.min(y))
# --------------------

X_train = X.iloc[train_idx, :]
y_train = y.iloc[train_idx]
X_test = X.iloc[test_idx, :]
y_test =y.iloc[test_idx]

# regr = RandomForestRegressor(
#     max_depth = 20, 
#     min_samples_leaf = 3, 
#     # max_features = 10,
#     min_samples_split = 12,
#     n_estimators = 500, 
#     n_jobs = -1, 
#     random_state = 0
# )
# regr = xgb.XGBRegressor(
#     objective = "reg:squarederror", 
#     random_state = 0,
#     max_depth = 4,
#     booster = "dart",
#     eta = 0.3,
#     subsample = 1,
#     colsample_bytree = 1,
#     reg_alpha = 4,
#     reg_lambda = 0,
#     scale_pos_weight = 3000,
# )
# regr = GradientBoostingRegressor()
# regr = AdaBoostRegressor()
# regr = SVR()

# regr.fit(X_train, y_train)
# predicts = regr.predict(X_test)[:, np.newaxis]

net.fit(X_train.values.astype(np.float32)[:, :, np.newaxis], y_train.values.astype(np.float32))
predicts = net.predict_proba(X_test.values.astype(np.float32)[:, :, np.newaxis])

df = pd.DataFrame(np.concatenate([y_test, predicts], axis = 1), columns = ["truth", "estimates"])
slope, intercept, r_value, p_value, std_err = stats.linregress(df.dropna()["truth"], df.dropna()["estimates"])
r2 = r_value**2
mse = mean_squared_error(predicts, y_test)
rmse = np.sqrt(mse)
print(r2, rmse)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1        [36m0.0197[0m        [32m0.0021[0m  1.0946
      2        [36m0.0043[0m        0.0023  1.1998
      3        [36m0.0042[0m        0.0022  1.0701
      4        [36m0.0041[0m        0.0021  0.9655
      5        [36m0.0041[0m        0.0022  0.9445
      6        [36m0.0040[0m        [32m0.0020[0m  0.9814
      7        [36m0.0039[0m        0.0021  1.0592
      8        [36m0.0039[0m        0.0021  1.0592
      9        [36m0.0038[0m        0.0022  1.5042
     10        [36m0.0038[0m        0.0021  1.3644
0.2904731677427226 0.05737191470802653
