In [None]:
import matplotlib as mpl
import sys

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import tensorflow as tf

# Extend the import search path with the project's script folder
# (presumably where data_handler, metrics and utils live -- verify).
sys.path.append("../scripts/particles/")

# Render every figure in this notebook at 200 dpi.
mpl.rcParams.update({'figure.dpi': 200})


In [None]:
import data_handler as dh
import metrics
import utils

In [None]:
# Columns the model predicts.
outputs = ["PM1", "PM2.5", "PM10"]

# Columns fed to the model. Entries that are commented out are disabled.
inputs = [
    "PM1_2.5_OUT",
    "PM1_2.5_H_OUT",
    "PM2.5_OUT",
    "PM2.5_H_OUT",
    "PM2.5_10_OUT",
    "PM2.5_10_H_OUT",
    "PERSON_NUMBER",
    "AIR_PURIFIER",
    "WINDOW",
    "AIR_CONDITIONER",
    "DOOR",
    # "TEMPERATURE",
    # "WIND_SPEED",
    "WIND_DEG",
    "HUMIDITY",
]

# Windowing / batching hyper-parameters shared by the cells below.
in_time_step = 60
offset = 1
out_time_step = 1
batch_size = 32

# Full description of this run; it is serialised to config.json further down,
# so the key order here is the key order of that file.
config = {
    "name": "conv",
    "description": "conv1D",
    "version": "25",
    "root_dir": "projects/particle/model",
    # Sub-directories created below the run's root directory.
    "dirs": {
        "weights": "training/weights",
        "history": "training/history",
        "metric": "result/metric",
        "model": "result/model",
        "predict": "result/predict",
    },
    "model": {
        "lr": 0.0001,
        "batch_size": batch_size,
        "epochs": 300,
        "window_size": in_time_step,
        "offset": offset,
        "loss": "MSE",
    },
    "data": {
        "moving_average_window": 20,
        "moving_average_method": "mean",
        # Split fractions (they add up to 1.0).
        "train": 0.60,
        "validation": 0.15,
        "test": 0.25,
        # Date ranges handed to dh.trim_df later on.
        "dates": [
            {"start": start, "end": end}
            for start, end in (
                ("2022-05-07 09:40", "2022-05-17 08:38"),
                ("2022-05-17 11:25", "2022-05-30 23:26"),
                ("2022-06-01 22:40", "2022-07-02 07:00"),
                ("2022-07-02 16:40", "2022-07-09 07:13"),
                ("2022-07-09 14:30", "2022-07-12 10:00"),
                ("2022-07-25 12:00", "2022-08-01 10:00"),
                ("2022-08-03 09:00", "2022-08-11 22:18"),
                ("2022-08-12 12:14", "2022-08-20 00:00"),
                ("2022-08-20 09:38", "2022-09-01 00:00"),
            )
        ],
        # Filled in later with the path of the normalisation statistics file.
        "meta": None,
    },
}

In [None]:
# Every artefact of this run is stored below <root_dir>/<name>_<version>.
root_dir = f'{config["root_dir"]}/{config["name"]}_{config["version"]}'

# Sub-directory names (relative to root_dir), taken from the "dirs" section.
weights_dir, history_dir, predict_dir, model_dir, metric_dir = (
    config["dirs"][key]
    for key in ("weights", "history", "predict", "model", "metric")
)

In [None]:
import json
import os
import shutil


def create_folder(path, overwrite=None):
    """Make sure the directory `path` exists, optionally wiping an existing one.

    Parameters
    ----------
    path : str or os.PathLike
        Directory to create (intermediate directories are created as needed).
    overwrite : bool or None, optional
        What to do when `path` already exists:
        * ``True``  -- delete it (and everything inside) without asking;
        * ``False`` -- keep it untouched;
        * ``None``  -- (default) ask interactively, as before. Only an explicit
          "y"/"yes" answer deletes the folder; anything else keeps it.

    The directory exists when the function returns in every case.
    """
    if os.path.exists(path):
        if overwrite is None:
            # The default answer is "no": pressing Enter never deletes data.
            answer = input(
                f'Folder name `{path}` already exists. Overwrite it? [y/N] '
            )
            overwrite = answer.strip().lower() in ('y', 'yes')
        if overwrite:
            shutil.rmtree(path)
    os.makedirs(path, exist_ok=True)

# Create the output directory tree of this run.
for sub_dir in (weights_dir, history_dir, predict_dir, model_dir, metric_dir):
    create_folder(root_dir + "/" + sub_dir)

# Store the run configuration next to the results. The `with` statement
# closes the file, so no explicit close() is needed.
with open(f"{root_dir}/config.json", "w") as outfile:
    json.dump(config, outfile)

In [None]:
def get_datasets(usable_dates, val_size, test_size, **kwargs):
    """Load, merge, smooth and split the particle and weather data.

    Parameters
    ----------
    usable_dates
        Passed unchanged to ``dh.trim_df``.
    val_size, test_size
        Passed unchanged to ``dh.train_test_split_df``.
    **kwargs
        Forwarded to ``dh.apply_moving_average``.

    Returns
    -------
    Whatever ``dh.train_test_split_df`` returns; the caller unpacks three values.

    NOTE(review): this function reads "../../storage/particle/weather.csv" and
    uses the default of ``dh.load_data()``, whereas the inline preparation cell
    further down reads "storage/particle/..." and does not exclude WIND_DEG
    from smoothing -- confirm which variant is intended.
    """
    weather_columns = ["TEMPERATURE", "WIND_DEG", "WIND_SPEED", "HUMIDITY"]
    weather = pd.read_csv(
        "../../storage/particle/weather.csv", index_col="DATE", parse_dates=True
    )[weather_columns]
    # Replace the wind direction in degrees by sin(angle / 4), angle in radians.
    weather["WIND_DEG"] = np.sin(weather["WIND_DEG"].values * np.pi / 180 / 4)

    measurements = dh.add_pm_diff(dh.load_data())

    # Columns that are handed to the moving average as `excludes` and then
    # re-attached from the un-smoothed frame.
    passthrough = [
        "PERSON_NUMBER",
        "AIR_PURIFIER",
        "AIR_CONDITIONER",
        "WINDOW",
        "DOOR",
        "WIND_DEG",
    ]
    merged = pd.concat([measurements, weather], axis=1)
    smoothed = dh.apply_moving_average(
        merged, min_periods=1, excludes=passthrough, **kwargs
    )
    selected = pd.concat([smoothed, merged[passthrough]], axis=1)[inputs + outputs]

    trimmed = dh.trim_df(selected, usable_dates)
    return dh.train_test_split_df(trimmed, val_size, test_size)

In [None]:
# Build the three splits from the settings in the "data" section of the config.
train, val, test = get_datasets(
    usable_dates=config["data"]["dates"],
    val_size=config["data"]["validation"],
    test_size=config["data"]["test"],
    window=config["data"]["moving_average_window"],
    method=config["data"]["moving_average_method"],
)


In [None]:
# --- Weather data -----------------------------------------------------------
# NOTE(review): the paths here are relative to a different directory than the
# ones used inside get_datasets ("../../storage/...") -- confirm the intended
# working directory.
weather_df = pd.read_csv(
    'storage/particle/weather.csv', index_col='DATE', parse_dates=True
)[['TEMPERATURE', 'WIND_DEG', 'WIND_SPEED', 'HUMIDITY']]
# Replace the wind direction in degrees by sin(angle / 4), angle in radians.
weather_df['WIND_DEG'] = np.sin(weather_df['WIND_DEG'].values * np.pi / 180 / 4)

# --- Particle data ----------------------------------------------------------
df_org = dh.load_data("storage/particle/data.csv")
df_org = dh.add_pm_diff(df_org)

# Columns handed to the moving average as `excludes`; they are re-attached
# from the un-smoothed frame afterwards.
excludes = ['PERSON_NUMBER', 'AIR_PURIFIER',
            'AIR_CONDITIONER', 'WINDOW', 'DOOR']
df = dh.apply_moving_average(pd.concat([df_org, weather_df], axis=1),
                             window=config['data']['moving_average_window'],
                             method=config['data']['moving_average_method'],
                             excludes=excludes,
                             min_periods=1)
df = pd.concat([df, df_org[excludes]], axis=1)
# Forward-fill gaps in the pass-through columns, then drop rows that still
# contain missing values. `.ffill()` replaces the deprecated
# `fillna(method='ffill')`; rebinding replaces `inplace=True`.
df[excludes] = df[excludes].ffill()
df = df.dropna()

# --- Trim to the usable date ranges and split --------------------------------
dfs = dh.trim_df(df, config['data']['dates'])
val_size = config['data']['validation']
test_size = config['data']['test']

train_dfs, val_dfs, test_dfs = dh.train_test_split_df(dfs, val_size, test_size)

# Summary statistics of the training split only; they are passed to
# dh.dfs_to_dataset below and stored for later reuse.
meta_df = pd.concat(train_dfs).describe()
meta_df.to_csv(f'{root_dir}/meta.csv', index_label='component')
config['data']['meta'] = f'{root_dir}/meta.csv'

# config.json was written before "meta" was known; write it again so the
# stored configuration points at the statistics file.
with open(f"{root_dir}/config.json", "w") as outfile:
    json.dump(config, outfile)


In [None]:
def to_dataset(_dfs, in_time_step):
    """Convert a list of frames into model arrays via ``dh.dfs_to_dataset``.

    The notebook-level ``meta_df``, ``inputs``, ``outputs``, ``out_time_step``
    and ``offset`` are used; only the input window length is a parameter.
    The return value is whatever ``dh.dfs_to_dataset`` returns (callers unpack
    it as an ``(X, y)`` pair).
    """
    window_kwargs = {
        "in_time_step": in_time_step,
        "out_time_step": out_time_step,
        "offset": offset,
        "excludes": outputs,
    }
    return dh.dfs_to_dataset(_dfs, meta_df, inputs, outputs, **window_kwargs)

# Input window length used for every split.
win_size = config['model']['window_size']

# Convert train / validation / test frames (in that order) into (X, y) pairs.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    to_dataset(split_dfs, win_size)
    for split_dfs in (train_dfs, val_dfs, test_dfs)
)

In [None]:
from tensorflow.keras import backend
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau


# Multiply the learning rate by 0.2 after 10 epochs without val_loss
# improvement, never going below 1e-6.
rlr_cb = ReduceLROnPlateau(
    monitor="val_loss", factor=0.2, patience=10, mode="min", verbose=1, min_lr=1e-6
)

# Stop training after 20 epochs without val_loss improvement.
ely_cb = EarlyStopping(monitor="val_loss", patience=20, mode="min", verbose=1)

# Save the weights after every epoch; epoch number and validation loss are
# encoded in the file name. `save_freq="epoch"` replaces the deprecated
# `period=1` argument (same behaviour: one checkpoint per epoch).
mcp_cb = ModelCheckpoint(
    filepath=root_dir
    + "/"
    + config["dirs"]["weights"]
    + "/e{epoch:02d}-v{val_loss:.2f}.h5",
    monitor="val_loss",
    save_weights_only=True,
    mode="min",
    save_freq="epoch",
    verbose=0,
)

In [None]:
from tensorflow.keras import Model
from tensorflow.keras.layers import (
    GRU,
    LSTM,
    Conv1D,
    Dense,
    Dropout,
    Flatten,
    GlobalMaxPooling1D,
    Input,
    LeakyReLU,
    MaxPooling1D,
    Attention,
    Permute,
)
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam

def attention_3d_block(inputs):
    """Re-weight a sequence tensor with softmax weights learned by a Dense layer.

    Parameters
    ----------
    inputs : Keras tensor
        Indexed as ``inputs.shape[1]`` / ``inputs.shape[2]`` and permuted with
        ``Permute((2, 1))``, i.e. it needs two non-batch axes
        (presumably (batch, time, features) -- confirm against the caller).

    Returns
    -------
    Keras tensor with the same shape as `inputs`: the element-wise product of
    `inputs` and the attention weights.

    Note: the second Permute layer has the fixed name ``attention_vec``.
    """
    # Swap the two non-batch axes so that Dense acts along the original axis 1.
    a = Permute((2, 1))(inputs)
    # One softmax distribution over the original axis 1 per entry of axis 2.
    a = Dense(inputs.shape[1], activation='softmax')(a)

    # Swap back so that the weights line up with `inputs`.
    a_probs = Permute((2, 1), name='attention_vec')(a)

    output_attention_mul = tf.keras.layers.multiply([inputs, a_probs])
    return output_attention_mul

def build_model(input_shape):
    """Build and compile the Conv1D regression model.

    Architecture: Conv1D(32, kernel 3, same padding) -> MaxPooling1D(5, stride 3)
    -> Flatten -> Dense(256, relu) -> Dense(n_outputs, relu).

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample (without the batch axis), passed to ``Input``.

    Returns
    -------
    tensorflow.keras.Model
        Compiled with Adam (learning rate from ``config["model"]["lr"]``), the
        loss named in ``config["model"]["loss"]`` and RMSE as metric.

    Notes
    -----
    The number of output units is read from the notebook-level ``y_train``
    (``y_train.shape[2]``), so ``y_train`` must exist before this is called.

    NOTE(review): ``y_train`` is indexed as a 3-D array here while the model
    emits a 2-D tensor (batch, n_outputs) -- verify that the loss does not
    broadcast the two against each other.
    """
    input_tensor = Input(shape=input_shape, name="input")

    x = Conv1D(32, kernel_size=3, kernel_initializer='he_uniform', activation='relu', strides=1, padding='same')(input_tensor)
    # x = Conv1D(128, kernel_size=3, activation='relu', strides=1, padding="valid")(x)
    # x = GRU(
    #     units=32,
    #     activation="tanh",
    #     kernel_initializer="glorot_uniform",
    #     return_sequences=True,
    # )(input_tensor)
    # x = GRU(
    #     units=160,
    #     activation="tanh",
    #     kernel_initializer="he_uniform",
    #     return_sequences=True,
    # )(x)
    # x = attention_3d_block(x)
    x = MaxPooling1D(pool_size=5, strides=3)(x)
    x = Flatten()(x)
    x = Dense(256, kernel_initializer='he_uniform', activation='relu')(x)
    # x = Dropout(0.5)(x)
    # ReLU on the output layer keeps the predictions non-negative.
    output = Dense(y_train.shape[2], kernel_initializer='he_uniform', activation="relu", name="output")(x)

    _model = Model(
        inputs=input_tensor,
        outputs=output,
        name=f'{config["name"].lower()}_v{config["version"]}',
    )

    _model.compile(
        optimizer=Adam(learning_rate=config["model"]["lr"]),
        loss=config["model"]["loss"].lower(),
        # `metrics` is documented as a list; a bare metric object is rejected
        # by newer Keras versions.
        metrics=[RootMeanSquaredError()],
    )

    return _model


# One sample has the shape given by axes 1 and 2 of X_train.
model = build_model(tuple(X_train.shape[1:3]))
model.summary()

In [None]:
# Train on the first GPU. Samples are not shuffled between epochs.
with tf.device("/device:GPU:0"):
    training_res = model.fit(
        x=X_train,
        y=y_train,
        batch_size=batch_size,
        shuffle=False,
        epochs=config["model"]["epochs"],
        validation_data=(X_val, y_val),
        callbacks=[rlr_cb, ely_cb, mcp_cb],
    )

# Keep the per-epoch history next to the weights.
pd.DataFrame(training_res.history).to_csv(
    root_dir + "/" + config["dirs"]["history"] + "/history.csv", index=False
)

# Learning curves. The plotted quantity is the training loss configured in
# config["model"]["loss"] (MSE), not the RMSE metric, so label it accordingly.
fig, ax = plt.subplots(figsize=(28, 10))
ax.plot(training_res.history["loss"], "o--", label="train")
ax.plot(training_res.history["val_loss"], "o--", label="valid")
ax.set_xlabel("Epochs", fontsize=15)
ax.set_ylabel(f'Loss - {config["model"]["loss"]}', fontsize=15)
ax.legend(fontsize=15);

In [None]:
def get_result(_dfs, output_scaled=False):
    """Run the model on every frame and attach the predictions as new columns.

    Parameters
    ----------
    _dfs : iterable of pandas.DataFrame
        Frames to predict on; each is processed independently.
    output_scaled : bool, optional
        If True, the model output is treated as min-max scaled and mapped back
        with the ``min`` / ``max`` rows of ``meta_df``. If False (default) the
        raw model output is stored.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all frames, each shortened by the leading rows that
        have no prediction, with one ``<output>_PRED`` column per entry of the
        notebook-level ``outputs`` list.

    Uses the notebook-level ``model``, ``meta_df``, ``inputs``, ``outputs``,
    ``in_time_step``, ``out_time_step`` and ``offset``.
    """
    # Leading rows of each frame for which no prediction is produced.
    n_skip = in_time_step + out_time_step + offset - 1

    res_dfs = []
    for _df in _dfs:
        # Work on a copy so the caller's frame is never touched.
        df_cp = _df.copy()
        # Build the inputs with exactly the arguments used for training
        # (see to_dataset), so prediction and training windows agree.
        _X, _y = dh.dfs_to_dataset(
            [df_cp],
            meta_df,
            inputs,
            outputs,
            in_time_step=in_time_step,
            out_time_step=out_time_step,
            offset=offset,
            excludes=outputs,
        )
        y_hat = model.predict(_X, verbose=0)
        # Explicit copy: new columns are added below, which must not happen
        # on a view of the sliced frame.
        df_cp = df_cp.iloc[n_skip:].copy()
        for idx, output in enumerate(outputs):
            if output_scaled:
                min_val = meta_df[output]['min']
                max_val = meta_df[output]['max']
                df_cp[output + '_PRED'] = y_hat[:, idx] * (max_val - min_val) + min_val
            else:
                df_cp[output + '_PRED'] = y_hat[:, idx]
        res_dfs.append(df_cp)
    return pd.concat(res_dfs)

In [None]:
# model.load_weights(f"{root_dir}/{weights_dir}/e27-v8.17.h5")
# train_res = get_result(train_dfs)
# train_res['TYPE'] = 'train'
# val_res = get_result(val_dfs)
# val_res['TYPE'] = 'val'
# test_res = get_result(test_dfs)
# test_res['TYPE'] = 'test'

In [None]:
# # model.load_weights('project/GRU/GRU09/training/weights/e23-v17.85.h5')
#model = tf.keras.models.load_model("../projects/particle/model/conv_17/result/model/conv_17.h5")
# Predict on each split and tag the rows with the split they belong to.
train_res = get_result(train_dfs).assign(TYPE='train')
val_res = get_result(val_dfs).assign(TYPE='val')
test_res = get_result(test_dfs).assign(TYPE='test')

In [None]:
#total_res = pd.read_csv('../projects/particle/model/conv_09/result/predict/predict.csv', index_col='DATE', parse_dates=True)
#train_res = total_res[total_res['TYPE'] == 'train']
#val_res = total_res[total_res['TYPE'] == 'val']
#test_res = total_res[total_res['TYPE'] == 'test']

In [None]:
# Training split: prediction, target, outdoor readings and room-state columns.
_ = utils.plot(
    train_res,
    [
        'PM2.5_PRED',
        'PM2.5',
        'PM2.5_OUT',
        'PM2.5_H_OUT',
        'PERSON_NUMBER',
        'AIR_PURIFIER',
        'AIR_CONDITIONER',
        'WINDOW',
        'DOOR',
    ],
)

In [None]:
# Validation split: prediction, target, outdoor readings and room-state columns.
_ = utils.plot(
    val_res,
    [
        'PM2.5_PRED',
        'PM2.5',
        'PM2.5_OUT',
        'PM2.5_H_OUT',
        'PERSON_NUMBER',
        'AIR_PURIFIER',
        'AIR_CONDITIONER',
        'WINDOW',
        'DOOR',
    ],
)

In [None]:
# Test split: prediction, target, outdoor readings and room-state columns.
_ = utils.plot(
    test_res,
    [
        'PM2.5_PRED',
        'PM2.5',
        'PM2.5_OUT',
        'PM2.5_H_OUT',
        'PERSON_NUMBER',
        'AIR_PURIFIER',
        'AIR_CONDITIONER',
        'WINDOW',
        'DOOR',
    ],
)

In [None]:
# Histogram of the PM2.5 residuals (observed minus predicted) on the test split.
test_res['PM2.5'].sub(test_res['PM2.5_PRED']).hist(bins=100, figsize=(22, 6))

In [None]:
# Histogram of the PM1 residuals (observed minus predicted) on the test split.
test_res['PM1'].sub(test_res['PM1_PRED']).hist(bins=100, figsize=(22, 6))

In [None]:
fig, ax = plt.subplots(figsize=(9, 9))

# Observed vs. predicted PM2.5: validation in yellow, test in green.
for split_res, colour in ((val_res, "y"), (test_res, "g")):
    split_res.plot.scatter(x="PM2.5", y="PM2.5_PRED", c=colour, ax=ax)

# Common range of both axes, used to draw the identity line.
axis_limits = [ax.get_xlim(), ax.get_ylim()]
lims = [np.min(axis_limits), np.max(axis_limits)]

ax.plot(lims, lims, "r-", linewidth=2, alpha=0.75, zorder=2)
ax.set_aspect("equal")

In [None]:
# Set to False to skip writing metric tables, predictions and the model.
save = True

# Lower-case names of the target columns (upper-cased again when looked up).
cols = ["pm1", "pm2.5", "pm10"]

# Result frames to evaluate, with the column label used for each of them.
total_res = pd.concat([train_res, val_res, test_res])
res_indices = ["Total", "Train", "Validation", "Test"]
res_dfs = [total_res, train_res, val_res, test_res]

# Row label and metric function, kept side by side so they cannot drift apart.
metrics_indices, metric_funcs = (
    list(column)
    for column in zip(
        ("R Square", metrics.calc_r2),
        ("Corr", metrics.calc_corrcoef),
        ("NMSE", metrics.calc_nmse),
        ("FB", metrics.calc_fb),
        ("B", metrics.calc_b),
        ("a/C", metrics.calc_a_co),
    )
)


def calc_metric(_f, _df, _col):
    """Apply metric `_f` to a target column and its prediction column.

    `_f` is called as ``_f(observed, predicted)`` where both arguments are the
    ``.values`` arrays of ``_df[_col]`` and ``_df[_col + "_PRED"]``; its return
    value is passed through unchanged.
    """
    observed = _df[_col].values
    predicted = _df[_col + "_PRED"].values
    return _f(observed, predicted)


# One table per target: rows are metrics, columns are the evaluated splits.
for col in cols:
    print(f"======== {col} prediction results ========")
    target = col.upper()
    res_dict = {"Metric": metrics_indices}
    res_dict.update({split_name: [] for split_name in res_indices})

    for metric_fn in metric_funcs:
        for split_name, split_df in zip(res_indices, res_dfs):
            res_dict[split_name].append(calc_metric(metric_fn, split_df, target))

    r_df = pd.DataFrame(res_dict)
    print(r_df)
    print()
    if save:
        # Written with three decimals, one file per target.
        metric_path = f'{root_dir}/{config["dirs"]["metric"]}/result_{col}.csv'
        r_df.to_csv(metric_path, index=False, float_format="%.3f")

In [None]:
if save:
    # Predictions of all splits in a single CSV.
    predict_path = f'{root_dir}/{config["dirs"]["predict"]}/predict.csv'
    total_res.to_csv(predict_path, index_label="DATE")

    # The trained model, stored as <name>_<version>.h5.
    model_file = f'{config["name"].lower()}_{config["version"]}.h5'
    model.save(f'{root_dir}/{config["dirs"]["model"]}/{model_file}')