In [None]:
import sys

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import tensorflow as tf
import keras

# Add ./scripts/particles/ (relative to the kernel's working directory) to the
# import search path. The project-local modules imported in the next cell
# (data_handler, metrics, utils) are presumably found there — confirm.
sys.path.append("./scripts/particles/")

In [None]:
import data_handler as dh
import metrics
import utils

In [None]:
import matplotlib as mpl
# Set the default resolution (dots per inch) for figures created after this cell.
mpl.rcParams['figure.dpi'] = 200

In [None]:
# Columns the model predicts.
outputs = ['PM1', 'PM2.5', 'PM10']

# Columns fed to the model as features.
inputs = [
    # particle readings
    'PM1_2.5_OUT',
    'PM1_2.5_H_OUT',
    'PM2.5_OUT',
    'PM2.5_H_OUT',
    'PM2.5_10_OUT',
    'PM2.5_10_H_OUT',
    # room state
    'PERSON_NUMBER',
    'AIR_PURIFIER',
    'WINDOW',
    'AIR_CONDITIONER',
    'DOOR',
    # weather (TEMPERATURE and WIND_SPEED are currently switched off)
    # 'TEMPERATURE',
    # 'WIND_SPEED',
    'WIND_DEG',
    'HUMIDITY',
]

# Windowing and batching parameters shared by the cells below.
in_time_step, offset, out_time_step = 60, 1, 1
batch_size = 64

# Run configuration; a later cell writes it to <root_dir>/config.json.
config = {
    "name": "conv",
    "description": "High quality prediction interval model with conv1d",
    "version": "hq07",
    "root_dir": "../projects/particle/model",
    # Sub-directories (relative to the run's root directory) for each artefact.
    "dirs": {
        "weights": "training/weights",
        "history": "training/history",
        "metric": "result/metric",
        "model": "result/model",
        "predict": "result/predict",
    },
    "model": {
        "lr": 0.0001,
        "batch_size": batch_size,
        "epochs": 300,
        "window_size": in_time_step,
        "offset": offset,
        "loss": "MSE",
    },
    "data": {
        "moving_average_window": 20,
        "moving_average_method": "mean",
        "train": 0.60,
        "validation": 0.15,
        "test": 0.25,
        # Time ranges passed to dh.trim_df, written as (start, end) pairs.
        "dates": [
            {"start": start, "end": end}
            for start, end in (
                ("2022-05-07 09:40", "2022-05-17 08:38"),
                ("2022-05-17 11:25", "2022-05-30 23:26"),
                ("2022-06-01 22:40", "2022-07-02 07:00"),
                ("2022-07-02 16:40", "2022-07-09 07:13"),
                ("2022-07-09 14:30", "2022-07-12 10:00"),
                ("2022-07-25 12:00", "2022-08-01 10:00"),
                ("2022-08-03 09:00", "2022-08-11 22:18"),
                ("2022-08-12 12:14", "2022-08-20 00:00"),
                ("2022-08-20 09:38", "2022-09-01 00:00"),
            )
        ],
        # Filled in with the path of meta.csv once the data cell has run.
        "meta": None,
    },
}

In [None]:
# Everything produced by this run is stored under <root_dir>/<name>_<version>.
root_dir = f'{config["root_dir"]}/{config["name"]}_{config["version"]}'

# Artefact sub-directories, relative to root_dir.
weights_dir, history_dir, predict_dir, model_dir, metric_dir = (
    config["dirs"][key]
    for key in ("weights", "history", "predict", "model", "metric")
)

In [None]:
import json
import os
import shutil


def create_folder(path, overwrite=None):
    """Make sure the directory ``path`` exists, optionally wiping an old one.

    Args:
        path: Directory to create; missing parent directories are created too.
        overwrite: What to do when ``path`` already exists. ``True`` deletes
            it (and everything inside) before re-creating it, ``False`` keeps
            the existing contents, and ``None`` (the default) asks the user
            interactively through ``input``.
    """
    if os.path.exists(path):
        if overwrite is None:
            # Deleting is destructive, so only an explicit "y"/"yes" triggers
            # it; the prompt shows [y/N] to match that default.
            answer = input(f'Folder `{path}` already exists. Overwrite it? [y/N] ')
            overwrite = answer.strip().lower() in ('y', 'yes')
        if overwrite:
            shutil.rmtree(path)
    os.makedirs(path, exist_ok=True)

# Create one directory per artefact type below the run's root directory.
for sub_dir in (weights_dir, history_dir, predict_dir, model_dir, metric_dir):
    create_folder(root_dir + "/" + sub_dir)

# Snapshot the run configuration next to the artefacts. The `with` block closes
# the file on exit, so no explicit close() is needed.
# NOTE: config["data"]["meta"] is still None at this point; it is only set
# later, in the data-preparation cell.
with open(f"{root_dir}/config.json", "w") as outfile:
    json.dump(config, outfile)

In [None]:
# Weather features indexed by DATE. WIND_DEG (read as degrees) is replaced by
# its sine; `assign` builds a new frame instead of writing into a column slice,
# which avoids pandas' SettingWithCopyWarning.
weather_df = pd.read_csv(
    '../storage/particle/weather.csv', index_col='DATE', parse_dates=True
)[['TEMPERATURE', 'WIND_DEG', 'WIND_SPEED', 'HUMIDITY']].assign(
    WIND_DEG=lambda frame: np.sin(frame['WIND_DEG'].values * np.pi / 180)
)

df_org = dh.load_data("../storage/particle/data.csv")
df_org = dh.add_pm_diff(df_org)

# Columns passed to apply_moving_average as `excludes`; their values from
# df_org are appended afterwards and forward-filled.
excludes = ['PERSON_NUMBER', 'AIR_PURIFIER', 'AIR_CONDITIONER', 'WINDOW', 'DOOR']
df = dh.apply_moving_average(
    pd.concat([df_org, weather_df], axis=1),
    window=config['data']['moving_average_window'],
    method=config['data']['moving_average_method'],
    excludes=excludes,
    min_periods=1,
)
df = pd.concat([df, df_org[excludes]], axis=1)
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
df[excludes] = df[excludes].ffill()
df = df.dropna()

# Cut the frame into the configured date ranges and split each into
# train / validation / test parts.
dfs = dh.trim_df(df, config['data']['dates'])
val_size = config['data']['validation']
test_size = config['data']['test']

train_dfs, val_dfs, test_dfs = dh.train_test_split_df(dfs, val_size, test_size)

# Summary statistics of the training split only; they are handed to
# dh.dfs_to_dataset by the cells below.
meta_df = pd.concat(train_dfs).describe()
meta_df.to_csv(f'{root_dir}/meta.csv', index_label='component')
config['data']['meta'] = f'{root_dir}/meta.csv'

# config.json was written by an earlier cell while "meta" was still None;
# rewrite it so the file on disk records where meta.csv lives.
with open(f"{root_dir}/config.json", "w") as outfile:
    json.dump(config, outfile)

In [None]:
def to_dataset(_dfs, in_time_step):
    """Turn a list of frames into model arrays via ``dh.dfs_to_dataset``.

    Args:
        _dfs: Frames to convert.
        in_time_step: Forwarded as ``in_time_step``.

    Returns:
        Whatever ``dh.dfs_to_dataset`` returns; callers unpack it as ``(X, y)``.

    ``meta_df``, ``inputs``, ``outputs``, ``out_time_step`` and ``offset`` are
    read from the notebook's globals; ``outputs`` is also passed as ``excludes``.
    """
    window_kwargs = dict(
        in_time_step=in_time_step,
        out_time_step=out_time_step,
        offset=offset,
        excludes=outputs,
    )
    return dh.dfs_to_dataset(_dfs, meta_df, inputs, outputs, **window_kwargs)

# Convert each split with the window size recorded in the run configuration.
win_size = config['model']['window_size']
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    to_dataset(split_dfs, win_size)
    for split_dfs in (train_dfs, val_dfs, test_dfs)
)

In [None]:
from tensorflow.keras import backend
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau


# NOTE(review): ReduceLROnPlateau waits for 30 stagnant epochs but EarlyStopping
# only for 20, so training will usually stop before the learning rate is ever
# reduced — confirm this is intended.
rlr_cb = ReduceLROnPlateau(
    monitor="val_loss", factor=0.2, patience=30, mode="min", verbose=1, min_lr=1e-6
)
ely_cb = EarlyStopping(monitor="val_loss", patience=20, mode="min", verbose=1)

# Save the weights after every epoch; epoch number and validation loss are
# encoded in the file name.
mcp_cb = ModelCheckpoint(
    filepath=root_dir
    + "/"
    + config["dirs"]["weights"]
    + "/e{epoch:02d}-v{val_loss:.2f}.h5",
    monitor="val_loss",
    save_weights_only=True,
    mode="min",
    # `save_freq="epoch"` is the supported spelling of the deprecated `period=1`.
    save_freq="epoch",
    verbose=0,
)

In [None]:
# Hyper-parameters read by qd_loss when it is called.
# NOTE: later cells assign lmbd, alpha, soften and loss_type again; because
# qd_loss looks them up at call time it uses whichever values were set last.
lmbd, lmbd_2, alpha = 0.01, 0.7, 0.05

soften = 300.
loss_type = 'soft' # soft or hard


def qd_loss(y_true, y_pred):
    """Prediction-interval loss: width + bound error + coverage penalty.

    Args:
        y_true: Targets; only index 0 of the last axis is used.
        y_pred: Predictions; index 0 of the last axis is the upper bound and
            index 1 the lower bound.

    Returns:
        A scalar: the mean width of the intervals that capture their target,
        plus ``lmbd_2`` times the average of the two bound-to-target RMS
        errors, plus a penalty that is non-zero while the capture rate is
        below ``1 - alpha``.
    """
    target = y_true[:, :, 0]
    upper = y_pred[:, :, 0]
    lower = y_pred[:, :, 1]

    # Per-element indicators "target below upper bound" / "target above lower
    # bound": a steep sigmoid for 'soft', a 0/1 step for 'hard'.
    inside_upper = inside_lower = None
    if loss_type == 'soft':
        inside_upper = tf.sigmoid(soften * (upper - target))
        inside_lower = tf.sigmoid(soften * (target - lower))
    elif loss_type == 'hard':
        inside_upper = tf.maximum(0., tf.sign(upper - target))
        inside_lower = tf.maximum(0., tf.sign(target - lower))
    captured = tf.multiply(inside_upper, inside_lower)

    # Mean width over captured points; the floor on the denominator avoids a
    # division by zero when nothing is captured.
    captured_width = tf.reduce_sum(tf.multiply(upper - lower, captured))
    mpiw = captured_width / tf.maximum(tf.reduce_sum(captured), 1e-12)
    # Fraction of points captured.
    picp = tf.reduce_mean(captured)

    rms_upper = tf.sqrt(tf.reduce_mean(tf.square(upper - target)))
    rms_lower = tf.sqrt(tf.reduce_mean(tf.square(target - lower)))
    rms_bounds = (rms_upper + rms_lower) / 2

    coverage_gap = tf.maximum(0., (1 - alpha) - picp)
    penalty = lmbd * batch_size * tf.square(coverage_gap) / (alpha * (1 - alpha))
    return mpiw + rms_bounds * lmbd_2 + penalty

In [None]:
# Hyper-parameters read by this version of qd_loss_v2 when it is called.
lmbd = 0.03
gamma = 0.7
alpha = 0.05

soften = 300.
loss_type = 'soft' # soft or hard

def qd_loss_v2(y_true, y_pred):
    """Prediction-interval loss with a product-of-distances term.

    NOTE: the next cell defines ``qd_loss_v2`` again; once that cell has run,
    the later definition is the one bound to this name.

    Args:
        y_true: Targets; only index 0 of the last axis is used.
        y_pred: Predictions; index 0 of the last axis is the upper bound and
            index 1 the lower bound.

    Returns:
        A scalar: the mean width of the intervals that capture their target,
        plus ``gamma`` times the RMS of ``(upper - target) * (target - lower)``,
        plus a penalty that is non-zero while the capture rate is below
        ``1 - alpha``.
    """
    y_true = y_true[:, :, 0]
    y_u = y_pred[:, :, 0]
    y_l = y_pred[:, :, 1]

    # Per-element capture indicators: a steep sigmoid for 'soft', a 0/1 step
    # for 'hard'.
    k_u = None
    k_l = None
    if loss_type == 'soft':
        k_u = tf.sigmoid(soften * (y_u - y_true))
        k_l = tf.sigmoid(soften * (y_true - y_l))
    elif loss_type == 'hard':
        k_u = tf.maximum(0., tf.sign(y_u - y_true))
        k_l = tf.maximum(0., tf.sign(y_true - y_l))

    k = tf.multiply(k_u, k_l)
    # Mean width over captured points (denominator floored to avoid 0/0) and
    # fraction of points captured.
    mpiw = tf.reduce_sum(tf.multiply(y_u - y_l, k)) / tf.maximum(tf.reduce_sum(k), 1e-12)
    picp = tf.reduce_mean(k)
    # The former `rmse = ...` line was removed: it referenced the undefined
    # name `y` (NameError on every call) and its result was never used.

    loss = mpiw + gamma * tf.sqrt(tf.reduce_mean(tf.square(tf.multiply(y_u - y_true, y_true - y_l)))) + lmbd * batch_size * tf.square(tf.maximum(0., (1 - alpha) - picp)) / (alpha * (1 - alpha))
    return loss

In [None]:
# Hyper-parameters read by qd_loss_v2 when it is called.
lmbd, gamma, alpha = 0.015, 0.4, 0.05

soften = 300.
loss_type = 'soft' # soft or hard


def qd_loss_v2(y_true, y_pred):
    """Prediction-interval loss: width + bound RMS errors + coverage penalty.

    Args:
        y_true: Targets; only index 0 of the last axis is used.
        y_pred: Predictions; index 0 of the last axis is the upper bound and
            index 1 the lower bound.

    Returns:
        A scalar: the mean width of the intervals that capture their target,
        plus ``gamma`` times the sum of the two bound-to-target RMS errors,
        plus a penalty that is non-zero while the capture rate is below
        ``1 - alpha``.
    """
    target = y_true[:, :, 0]
    upper = y_pred[:, :, 0]
    lower = y_pred[:, :, 1]

    # Per-element capture indicators: a steep sigmoid for 'soft', a 0/1 step
    # for 'hard'.
    inside_upper = inside_lower = None
    if loss_type == 'soft':
        inside_upper = tf.sigmoid(soften * (upper - target))
        inside_lower = tf.sigmoid(soften * (target - lower))
    elif loss_type == 'hard':
        inside_upper = tf.maximum(0., tf.sign(upper - target))
        inside_lower = tf.maximum(0., tf.sign(target - lower))
    captured = tf.multiply(inside_upper, inside_lower)

    # Mean width over captured points; the floor on the denominator avoids a
    # division by zero when nothing is captured.
    captured_width = tf.reduce_sum(tf.multiply(upper - lower, captured))
    mpiw = captured_width / tf.maximum(tf.reduce_sum(captured), 1e-12)
    # Fraction of points captured.
    picp = tf.reduce_mean(captured)

    rms_upper = tf.sqrt(tf.reduce_mean(tf.square(upper - target)))
    rms_lower = tf.sqrt(tf.reduce_mean(tf.square(target - lower)))
    rmse = rms_upper + rms_lower

    coverage_gap = tf.maximum(0., (1 - alpha) - picp)
    penalty = lmbd * batch_size * tf.square(coverage_gap) / (alpha * (1 - alpha))
    return mpiw + gamma * rmse + penalty

In [None]:
# Load a previously saved model from its .h5 file; the model-building cell
# below calls it on the new input layer and adds an interval head on top.
model_org = keras.models.load_model('../projects/particle/model/conv_19/result/model/conv_19.h5')
# Mark the loaded model as trainable so fitting the wrapper also updates it.
model_org.trainable = True

In [None]:
from tensorflow.keras import Model
from tensorflow.keras.layers import (
    GRU,
    LSTM,
    Conv1D,
    Dense,
    Dropout,
    Flatten,
    GlobalMaxPooling1D,
    Input,
    LeakyReLU,
    MaxPooling1D,
    Reshape
)
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam

# Number of predicted quantities; each gets an (upper, lower) pair of units.
n_outputs = y_train.shape[2]

model_inputs = Input(shape=(X_train.shape[1], X_train.shape[2]))
x = model_org(model_inputs)

# Interval head. The initial kernel copies every output of `model_org` into
# two neighbouring units and the initial bias shifts them by +10 / -10, so the
# head starts out as "prediction + 10" and "prediction - 10" (before the ReLU).
# For three outputs the kernel equals
# [[1,1,0,0,0,0], [0,0,1,1,0,0], [0,0,0,0,1,1]].
output = Dense(
    n_outputs * 2,
    kernel_initializer=keras.initializers.Constant(
        value=np.repeat(np.eye(n_outputs), 2, axis=1).tolist()
    ),
    name='range_output',
    activation='relu',
    bias_initializer=keras.initializers.Constant(value=[10., -10.] * n_outputs),
)(x)
# Regroup the flat units into pairs: last axis index 0 = upper, 1 = lower.
output = Reshape((-1, 2))(output)

model = Model(
    inputs=model_inputs,
    outputs=output,
    name=f'{config["name"].lower()}_v{config["version"]}',
)

model.compile(
    optimizer=Adam(learning_rate=config["model"]["lr"]),
    loss=qd_loss_v2,
    # `metrics` is documented as a list; a bare metric object is rejected by
    # newer Keras versions.
    metrics=[RootMeanSquaredError()],
)
model.summary()

In [None]:
# The first target step is duplicated along a new last axis so that y_true has
# the same trailing size of 2 as the model's (upper, lower) output.
with tf.device("/device:GPU:0"):
    training_res = model.fit(
        x=X_train,
        y=np.stack((y_train[:, 0, :], y_train[:, 0, :]), axis=2),
        batch_size=batch_size,
        shuffle=False,
        # NOTE(review): config["model"]["epochs"] (written to config.json) is
        # 300, but training is capped at 100 here — confirm which is intended.
        epochs=100,
        validation_data=(X_val, np.stack((y_val[:, 0, :], y_val[:, 0, :]), axis=2)),
        callbacks=[rlr_cb, ely_cb, mcp_cb],
    )

# Persist the per-epoch history next to the other run artefacts.
pd.DataFrame(training_res.history).to_csv(
    root_dir + "/" + config["dirs"]["history"] + "/history.csv", index=False
)

# Training vs. validation loss per epoch. The plotted quantity is the compiled
# loss (qd_loss_v2), not an RMSE, so the axis is labelled accordingly.
plt.figure(figsize=(28, 10))
plt.plot(training_res.history["loss"], "o--", label="train")
plt.plot(training_res.history["val_loss"], "o--", label="valid")
plt.xlabel("Epochs", fontsize=15)
plt.ylabel("Loss (qd_loss_v2)", fontsize=15)
plt.legend(fontsize=15)

In [None]:
# Restore the weights of one specific checkpoint written during training.
# NOTE(review): the file name encodes the epoch and validation loss of a
# particular run (e50, 33.82); after a fresh training run the checkpoint names
# will differ and this line must be updated by hand.
model.load_weights(f"{root_dir}/{weights_dir}/e50-v33.82.h5")

In [None]:
def get_result(_dfs, output_scaled=False):
    """Attach interval and point predictions to copies of the given frames.

    For every frame the inputs are windowed with ``dh.dfs_to_dataset``, both
    ``model`` (upper/lower bounds) and ``model_org`` (point prediction) are
    evaluated, and three columns per output are added:
    ``<output>_U_PRED``, ``<output>_L_PRED`` and ``<output>_PRED``.

    Args:
        _dfs: Frames to predict on; they are not modified.
        output_scaled: If true, predictions are mapped back with
            ``value * (max - min) + min`` using ``meta_df``'s 'min'/'max' rows.

    Returns:
        One frame: the concatenation of all per-frame results. The first
        ``in_time_step + out_time_step + offset - 1`` rows of each input frame
        are dropped so the remaining rows line up with the predictions.
    """
    # Rows at the start of each frame that have no prediction.
    n_skipped = in_time_step + out_time_step + offset - 1

    res_dfs = []
    for _df in _dfs:
        df_cp = _df.copy()
        # Same windowing arguments as `to_dataset`, so the model sees inputs
        # prepared exactly like its training data and the number of
        # predictions matches the row slice below.
        _X, _ = dh.dfs_to_dataset(
            [df_cp],
            meta_df,
            inputs,
            outputs,
            in_time_step=in_time_step,
            out_time_step=out_time_step,
            offset=offset,
            excludes=outputs,
        )
        y_hat = model.predict(_X, verbose=0)
        org_y_hat = model_org.predict(_X, verbose=0)

        # Copy the slice so the column assignments below write to an
        # independent frame (no SettingWithCopyWarning).
        df_cp = df_cp.iloc[n_skipped:].copy()
        for idx, output in enumerate(outputs):
            upper = y_hat[:, idx, 0]
            lower = y_hat[:, idx, 1]
            point = org_y_hat[:, idx]
            if output_scaled:
                min_val = meta_df[output]['min']
                max_val = meta_df[output]['max']
                span = max_val - min_val
                upper = upper * span + min_val
                lower = lower * span + min_val
                point = point * span + min_val
            df_cp[output + '_U_PRED'] = upper
            df_cp[output + '_L_PRED'] = lower
            df_cp[output + '_PRED'] = point
        res_dfs.append(df_cp)
    return pd.concat(res_dfs)

In [None]:
# model.load_weights(f"{root_dir}/{weights_dir}/e45-v15.40.h5")

# model = tf.keras.models.load_model("../projects/particle/model/conv_hq06/result/model/conv_hq06.h5", compile=False)
# model.compile(
#     optimizer=Adam(learning_rate=config["model"]["lr"]),
#     loss=qd_loss_v2,
#     metrics=RootMeanSquaredError(),
# )

# Predict on every split and tag each result frame with the split it came from.
train_res = get_result(train_dfs).assign(TYPE='train')
val_res = get_result(val_dfs).assign(TYPE='val')
test_res = get_result(test_dfs).assign(TYPE='test')

In [None]:
from keras.utils import plot_model

# Draw the model's layer graph, annotated with tensor shapes.
plot_model(model, show_shapes=True)

In [None]:
# # model.load_weights('project/GRU/GRU09/training/weights/e23-v17.85.h5')
# model = tf.keras.models.load_model("project/GRU/GRUkt01/result/model/gru_kt01.h5")
# train_res = get_result_df(model, train_dfs, meta)
# val_res = get_result_df(model, val_dfs, meta)
# test_res = get_result_df(model, test_dfs, meta)

In [None]:
# All three splits in one frame; the TYPE column tells them apart.
total_res = pd.concat([train_res, val_res, test_res])

In [None]:
# Measured PM2.5 (1-minute means) with the predicted interval as a band.
# Only the plotted column is resampled: the frame also holds the string column
# TYPE, and averaging a non-numeric column raises on pandas >= 2.0.
# '1min' is the non-deprecated spelling of the '1T' frequency alias.
ax = train_res[['PM2.5']].resample('1min').mean().plot(y=['PM2.5'], figsize=(30, 8))
ax.fill_between(
    train_res.index,
    train_res['PM2.5_L_PRED'],
    train_res['PM2.5_U_PRED'],
    facecolor="green",
    alpha=0.2,
    interpolate=True,
)
plt.show()

In [None]:
# ax = val_res[(val_res.index >=pd.to_datetime('2022-07-25')) & (val_res.index <=pd.to_datetime('2022-08-01'))].resample('1T').mean().plot(y=['PM2.5'], figsize=(30, 8))
# Measured PM2.5 (1-minute means) with the predicted interval as a band.
# Only the plotted column is resampled: the frame also holds the string column
# TYPE, and averaging a non-numeric column raises on pandas >= 2.0.
# '1min' is the non-deprecated spelling of the '1T' frequency alias.
ax = val_res[['PM2.5']].resample('1min').mean().plot(y=['PM2.5'], figsize=(30, 8))
ax.fill_between(
    val_res.index,
    val_res['PM2.5_L_PRED'],
    val_res['PM2.5_U_PRED'],
    facecolor="green",
    alpha=0.2,
    interpolate=True,
)
plt.show()

In [None]:
# Measured PM2.5 (1-minute means) with the predicted interval as a band.
# Only the plotted column is resampled: the frame also holds the string column
# TYPE, and averaging a non-numeric column raises on pandas >= 2.0.
# '1min' is the non-deprecated spelling of the '1T' frequency alias.
ax = test_res[['PM2.5']].resample('1min').mean().plot(y=['PM2.5'], figsize=(30, 8))
ax.fill_between(
    test_res.index,
    test_res['PM2.5_L_PRED'],
    test_res['PM2.5_U_PRED'],
    facecolor="green",
    alpha=0.2,
    interpolate=True,
)
plt.show()

In [None]:
# Interval predictions for every split.
yhat_train, yhat_val, yhat_test = (
    model.predict(X_split) for X_split in (X_train, X_val, X_test)
)

# Evaluate the loss once over each whole split. As in training, the first
# target step is duplicated along a new last axis before it is passed as y_true.
train_loss, val_loss, test_loss = (
    qd_loss_v2(np.stack((y_split[:, 0, :], y_split[:, 0, :]), axis=2), yhat_split)
    for y_split, yhat_split in (
        (y_train, yhat_train),
        (y_val, yhat_val),
        (y_test, yhat_test),
    )
)
print(f'train loss: {train_loss:.3f}, val loss: {val_loss:.3f}, test loss: {test_loss:.3f}')

In [None]:
y_tmps = [y_train, y_val, y_test]
yhat_tmps = [yhat_train, yhat_val, yhat_test]

# Per split: the share of targets that fall inside [lower, upper] and the mean
# interval width, averaged over all samples and outputs. Computed with array
# operations, so it works for any number of outputs.
for y_tmp, yhat_tmp in zip(y_tmps, yhat_tmps):
    target = y_tmp[:, 0, :]
    upper = yhat_tmp[:, :, 0]
    lower = yhat_tmp[:, :, 1]

    captured = np.mean((target >= lower) & (target <= upper))
    mpiw = np.mean(upper - lower)
    print(f'Captured: {captured:.3f}, MPIW: {mpiw:.3f}')

In [None]:
# Save the full model as <root_dir>/<model dir>/<name>_<version>.h5.
model.save(
    "/".join(
        [
            root_dir,
            config["dirs"]["model"],
            f'{config["name"].lower()}_{config["version"]}.h5',
        ]
    )
)