In [1]:
import sys

import numpy as np
import pandas as pd
import tensorflow as tf
from pyDOE import lhs

sys.path.append("../scripts/")

In [2]:
# --- Column groups -----------------------------------------------------------
# Columns handed to translate_to_dataset as prediction targets.
TARGET_LABEL = ["PM1", "PM2.5", "PM10"]

# Columns handed to translate_to_dataset as model inputs (order is preserved).
FEATURE_LABEL = [
    "PM1_2.5_OUT", "PM1_2.5_H_OUT",
    "PM2.5_OUT", "PM2.5_H_OUT",
    "PM2.5_10_OUT", "PM2.5_10_H_OUT",
    "PERSON_NUMBER",
    "AIR_PURIFIER", "WINDOW", "AIR_CONDITIONER", "DOOR",
    "TEMPERATURE", "WIND_SPEED", "WIND_DEG", "HUMIDITY",
]

# Columns that are not activity/state flags; passed to get_meta and
# translate_to_dataset.
NON_ACTIVITY = [
    "PERSON_NUMBER",
    "PM2.5_OUT", "PM2.5_H_OUT",
    "PM1_2.5_OUT", "PM1_2.5_H_OUT",
    "PM2.5_10_OUT", "PM2.5_10_H_OUT",
    "PM1_OUT", "PM1_H_OUT",
    "PM10_OUT", "PM10_H_OUT",
    "TEMPERATURE", "WIND_SPEED", "WIND_DEG", "HUMIDITY",
]

# Whatever is a feature but not listed in NON_ACTIVITY, in FEATURE_LABEL order.
ACTIVITY = list(filter(lambda col: col not in NON_ACTIVITY, FEATURE_LABEL))

# --- Time ranges kept for modelling (passed to trim_df) ----------------------
used_data = [
    {"start": begin, "end": finish}
    for begin, finish in (
        ("2022-05-07 09:40", "2022-05-17 08:38"),
        ("2022-05-17 11:25", "2022-05-30 23:26"),
        ("2022-06-01 22:40", "2022-07-02 07:00"),
        ("2022-07-02 16:40", "2022-07-09 07:13"),
        ("2022-07-09 14:30", "2022-07-12 10:00"),
        ("2022-07-25 12:00", "2022-08-01 10:00"),
        ("2022-08-03 09:00", "2022-08-11 22:18"),
        ("2022-08-12 12:14", "2022-08-20 00:00"),
    )
]

# --- Smoothing and split settings --------------------------------------------
moving_average_window = 15
moving_average_method = 'median'

val_size = 0.15
test_size = 0.25
# Remaining fraction after the validation and test shares are taken out.
train_size = 1 - val_size - test_size

In [3]:
# Read the weather log indexed by its DATE column, keep the four columns used
# downstream, and replace the wind direction (treated as degrees) with its sine.
weather_df = (
    pd.read_csv('../data/weather.csv', index_col='DATE', parse_dates=True)
    .loc[:, ['TEMPERATURE', 'WIND_DEG', 'WIND_SPEED', 'HUMIDITY']]
    .assign(WIND_DEG=lambda frame: np.sin(frame['WIND_DEG'].values * np.pi / 180))
)

In [4]:
# Load the PM measurements. add_diff's return value is unused, so it presumably
# adds its derived columns to df_org in place -- TODO confirm.
df_org = load_pm("../data/data.csv")
add_diff(df_org)

# Join PM and weather data on the index and smooth it.
# NOTE(review): the config cell sets moving_average_method = 'median', but this
# call passes the literal 'mean'. Left unchanged to keep results stable --
# confirm which method is intended.
df = apply_moving_average(
    pd.concat([df_org, weather_df], axis=1), 'mean', moving_average_window, True
)

# Forward-fill the occupancy/state columns. The previous code called
# .fillna(method='ffill') on a column selection and discarded the result, so
# nothing was filled and dropna() below removed every row with a gap in them.
state_columns = ['PERSON_NUMBER', 'AIR_PURIFIER', 'AIR_CONDITIONER', 'WINDOW', 'DOOR']
df[state_columns] = df[state_columns].ffill()
df = df.dropna()

# Restrict to the configured time ranges, split each range into
# train/validation/test, and compute scaling metadata from the training part only.
dfs = trim_df(df, used_data)
train_dfs, val_dfs, test_dfs = train_test_split_df(dfs, val_size, test_size)
meta = get_meta(train_dfs, NON_ACTIVITY)

excludes: ['PERSON_NUMBER', 'AIR_PURIFIER', 'WINDOW', 'DOOR', 'AIR_CONDITIONER']
includes: ['PM1', 'PM2.5', 'PM10', 'PM1_OUT', 'PM2.5_OUT', 'PM10_OUT', 'PM1_H_OUT', 'PM2.5_H_OUT', 'PM10_H_OUT', 'PM1_2.5', 'PM2.5_10', 'PM1_2.5_OUT', 'PM1_2.5_H_OUT', 'PM2.5_10_OUT', 'PM2.5_10_H_OUT', 'TEMPERATURE', 'WIND_DEG', 'WIND_SPEED', 'HUMIDITY']


In [5]:
# ax = df[(df.index >= pd.to_datetime('2022-07-09 23:00')) & (df.index <= pd.to_datetime('2022-07-12'))].resample('T').first().fillna(value=np.nan).plot(kind='line', y=['PM2.5', 'PM2.5_OUT', 'PM2.5_H_OUT'], figsize=(18, 12), fontsize=17)
# ax.set_ylabel('PM2.5 $\mu g m^3$', fontsize=17)
# ax.set_xlabel('Date', fontsize=17)
# ax.legend(fontsize=17)

In [6]:
# Hyperparameter grids. Every leaf is the list/array of candidate values for one
# hyperparameter. Key order matters: the helper functions flatten these dicts in
# insertion order, and the sampled index arrays follow that same order.

# Hyperparameters shared by every model family.
basic_params = {
    "window_size": [5, 12, 16, 30, 60],
    "pool_size": list(range(2, 6)),
    "pool_strides": list(range(1, 4)),
    "dense": {
        "units": list(range(16, 16 + 32 * 8, 32)),
        "dropout": np.arange(0, 0.5+0.05, 0.05),
        "leaky_relu": np.arange(0, 0.5+0.05, 0.05),
    },
    "batch_size": list(range(32, 32 + 32 * 8, 32)),
    "lr": [0.001, 0.0001, 0.00001],
}

# Convolutional family: a None filter count for conv_1 disables that layer
# (see build_conv_layer).
conv_params = {
    "conv_0": {
        "filters": list(range(16, 16 + 32 * 8, 32)),
        "kernel_size": list(range(3, 9, 2)),
        "strides": list(range(1, 4)),
    },
    "conv_1": {
        "filters": [None, *range(16, 16 + 32 * 8, 32)],
        "kernel_size": list(range(3, 9, 2)),
        "strides": list(range(1, 4)),
    },
}

# Recurrent family: an optional convolution followed by up to two RNN layers.
rnn_params = {
    "conv_0": {
        "activated": [True, False],
        "filters": [None, *range(16, 16 + 32 * 8, 32)],
        "kernel_size": list(range(3, 9, 2)),
        "strides": list(range(1, 4)),
    },
    "rnn_0": {
        "layer": ['naive', 'lstm', 'gru'],
        "units": list(range(16, 16 + 32 * 8, 32)),
        "dropout": np.arange(0, 0.5+0.05, 0.05),
    },
    "rnn_1": {
        "layer": ['naive', 'lstm', 'gru'],
        "units": [None, *range(16, 16 + 32 * 8, 32)],
        "dropout": np.arange(0, 0.5+0.05, 0.05),
    },
}

In [7]:
# Pre-build one dataset per candidate window size so that every trial can look
# its data up by str(window_size) instead of rebuilding it.
train_ds, val_ds, test_ds = {}, {}, {}
for win in basic_params["window_size"]:
    key = str(win)
    # Same call for each split; only the source frames differ. All splits are
    # scaled with the metadata computed from the training frames (_meta=meta).
    for store, frames in ((train_ds, train_dfs), (val_ds, val_dfs), (test_ds, test_dfs)):
        store[key] = translate_to_dataset(
            frames, FEATURE_LABEL, TARGET_LABEL, win, 1, 0, NON_ACTIVITY,
            scale=True, verbose=False, _meta=meta,
        )

In [8]:
def get_param_len(param):
    """Count the leaf entries of a parameter grid.

    Nested dicts are descended into recursively; every non-dict value counts
    as one hyperparameter.
    """
    return sum(
        get_param_len(value) if type(value) == dict else 1
        for value in param.values()
    )

def get_param_len_list(param):
    """Return the number of candidate values of every leaf, in traversal order.

    Nested dicts are flattened depth-first in key insertion order, so position
    ``k`` of the result describes the ``k``-th hyperparameter of the grid.
    """
    sizes = []
    for value in param.values():
        if type(value) == dict:
            sizes.extend(get_param_len_list(value))
        else:
            sizes.append(len(value))
    return sizes

def get_param_keys(param):
    """Return the leaf key names of a parameter grid, in traversal order.

    Nested dicts are flattened depth-first in key insertion order, matching
    the order used by ``get_param_len_list``. Only the leaf names are returned;
    the names of the enclosing groups are not included.

    The previous implementation recursed into ``get_param_len_list``, so nested
    groups contributed value counts instead of key names.
    """
    keys = []
    for name, value in param.items():
        if type(value) == dict:
            keys += get_param_keys(value)
        else:
            keys.append(name)
    return keys

def get_samples(param, n_samples):
    """Draw ``n_samples`` Latin-hypercube points with the 'maximin' criterion.

    The number of dimensions is the leaf count of ``basic_params`` plus the
    leaf count of ``param``, i.e. one dimension per hyperparameter.
    """
    total_dims = get_param_len(basic_params) + get_param_len(param)
    return lhs(total_dims, samples=n_samples, criterion='maximin')

def smp_to_indices(sample, param):
    """Turn sample columns into integer indices into each candidate list.

    Column ``k`` of ``sample`` is multiplied by the number of candidates of the
    ``k``-th hyperparameter (``basic_params`` first, then ``param``) and floored.
    Prints an error and returns ``None`` when the column count does not match
    the number of hyperparameters; otherwise returns an int32 array with the
    same shape as ``sample``.
    """
    bins_per_dim = get_param_len_list(basic_params) + get_param_len_list(param)
    scaled = np.zeros(sample.shape)
    if sample.shape[1] != len(bins_per_dim):
        print('[ERROR] invalid shape')
        return None
    for col, n_bins in enumerate(bins_per_dim):
        scaled[:, col] = np.floor(sample[:, col] * n_bins)
    return np.int32(scaled)

def dict_cat(param):
    """Return a new dict holding ``basic_params`` followed by ``param``.

    Keys keep insertion order; a key present in both takes its value from
    ``param``. Neither input dict is modified (values are not copied).
    """
    return {**basic_params, **param}

def get_smp_values(param, _indices):
    """Resolve one row of sampled indices into concrete hyperparameter values.

    Args:
        param: family-specific parameter grid; it is merged after
            ``basic_params`` via ``dict_cat``.
        _indices: one index per hyperparameter, in the flattening order used by
            ``get_param_len_list`` (depth-first, key insertion order).

    Returns:
        A dict with the same nesting as the merged grid in which every leaf
        list has been replaced by the single selected candidate.

    Raises:
        ValueError: if the number of indices differs from the number of
            hyperparameters, which would silently pair indices with the wrong
            hyperparameters.
    """
    merged = dict_cat(param)
    expected = get_param_len(merged)
    if len(_indices) != expected:
        raise ValueError(
            f'expected {expected} indices for this parameter grid, got {len(_indices)}'
        )

    remaining = iter(_indices)

    def _select(grid):
        # Walk the grid in the same order used to flatten it, consuming one
        # index per leaf. Recursion supports any nesting depth, like the
        # sibling get_param_len* helpers.
        chosen = {}
        for name, options in grid.items():
            if type(options) == dict:
                chosen[name] = _select(options)
            else:
                chosen[name] = options[next(remaining)]
        return chosen

    return _select(merged)

In [9]:
# conv_smp = get_samples(conv_params, 1024)
# conv_idc = smp_to_indices(conv_smp, conv_params)

# rnn_smp = get_samples(rnn_params, 1024)
# rnn_idc = smp_to_indices(rnn_smp, rnn_params)

# Reuse the index arrays saved for run 2022-08-23_17:10 instead of drawing new
# samples.
conv_idc, rnn_idc = map(
    np.load,
    (
        'lhs_opt/2022-08-23_17:10/conv_idc.npy',
        'lhs_opt/2022-08-23_17:10/rnn_idc.npy',
    ),
)

In [10]:
# np.save('lhs_opt/2022-08-23_17:10/conv_idc.npy', conv_idc)
# np.save('lhs_opt/2022-08-23_17:10/rnn_idc.npy', rnn_idc)

In [11]:
# import datetime as dt
# proj_name = dt.datetime.strftime(dt.datetime.now(), '%Y-%m-%d_%H:%M')
# create_folder('lhs_opt/'+proj_name)

In [12]:
from tensorflow.keras import Model
from tensorflow.keras.layers import (
    GRU,
    LSTM,
    SimpleRNN,
    Conv1D,
    Dense,
    Dropout,
    Flatten,
    Input,
    LeakyReLU,
    MaxPooling1D,
)
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam

def build_conv_layer(param, _input):
    """Stack the Conv1D layers described by the ``conv_*`` entries of ``param``.

    Entries whose key does not start with ``conv`` (before the first ``_``) are
    ignored, as are conv entries whose ``filters`` value is ``None``. Layers are
    applied in the dict's iteration order; the resulting tensor is returned.
    """
    x = _input
    for name, cfg in param.items():
        if str(name).split('_')[0] != 'conv':
            continue
        if cfg["filters"] is None:
            # A None filter count means this layer is switched off.
            continue
        x = Conv1D(
            cfg["filters"],
            kernel_size=cfg["kernel_size"],
            strides=cfg["strides"],
            padding='same',
            activation='relu',
            kernel_initializer='he_uniform',
        )(x)
    return x

def build_rnn_layer(param, _input):
    """Stack the optional Conv1D and the recurrent layers described by ``param``.

    ``param`` may be the full hyperparameter dict: entries whose key does not
    start with ``conv`` or ``rnn`` (before the first ``_``) are ignored. Layers
    are applied in the dict's iteration order.

    * ``conv_*`` entries add a Conv1D only when ``activated`` is truthy and
      ``filters`` is not ``None``.
    * ``rnn_*`` entries add a SimpleRNN ('naive'), LSTM ('lstm') or GRU ('gru')
      unless ``units`` is ``None``, which disables the layer in the same way a
      ``None`` filter count disables a conv layer in ``build_conv_layer``.

    Every recurrent layer returns full sequences, so the output stays 3-D.

    Fixes over the previous version: it executed ``i += 3`` on an undefined
    local (UnboundLocalError whenever the conv was activated), parsed
    ``int(key.split('_')[1])`` for every key (crashing on keys such as
    ``window_size`` or ``dense``), and passed ``None`` units/filters to Keras.

    Raises:
        ValueError: if an ``rnn_*`` entry names an unknown layer type.
    """
    x = _input
    for name, cfg in param.items():
        layer_type = str(name).split('_')[0]
        if layer_type == 'conv':
            if not cfg.get("activated") or cfg.get("filters") is None:
                continue
            x = Conv1D(
                cfg["filters"],
                kernel_size=cfg["kernel_size"],
                kernel_initializer='he_uniform',
                activation='relu',
                strides=cfg["strides"],
                padding='same',
            )(x)
        elif layer_type == 'rnn':
            if cfg["units"] is None:
                continue
            layer_name = cfg["layer"]
            if layer_name == 'naive':
                rnn_cls = SimpleRNN
            elif layer_name == 'lstm':
                rnn_cls = LSTM
            elif layer_name == 'gru':
                rnn_cls = GRU
            else:
                raise ValueError(f'unknown rnn layer type: {layer_name!r}')
            x = rnn_cls(
                units=cfg["units"],
                dropout=cfg["dropout"],
                activation='tanh',
                kernel_initializer='glorot_uniform',
                return_sequences=True,
            )(x)
    return x

def model_builder(p, input_shape, output_size, layer_type='conv'):
    """Build and compile one candidate model from resolved hyperparameters.

    Args:
        p: hyperparameter dict holding one value per hyperparameter (the
            result of ``get_smp_values``). Reads ``pool_size``,
            ``pool_strides``, ``dense`` (``units``, ``leaky_relu``,
            ``dropout``) and ``lr``, plus the ``conv_*`` / ``rnn_*`` entries
            used by the layer builders.
        input_shape: shape of a single input sample (without the batch axis).
        output_size: number of output units.
        layer_type: ``'conv'`` uses ``build_conv_layer``, ``'rnn'`` uses
            ``build_rnn_layer``; any other value feeds the input straight into
            the pooling layer.

    Returns:
        A compiled Keras ``Model`` (Adam optimizer, MSE loss, RMSE metric).
    """
    input_tensor = Input(shape=input_shape, name="input")
    x = input_tensor
    if layer_type == 'conv':
        x = build_conv_layer(p, x)
    elif layer_type == 'rnn':
        x = build_rnn_layer(p, x)

    # Shared head: pooling -> flatten -> dense (LeakyReLU) -> dropout -> output.
    x = MaxPooling1D(pool_size=p["pool_size"], strides=p["pool_strides"], padding='same')(x)
    x = Flatten()(x)
    x = Dense(p["dense"]["units"], kernel_initializer='he_uniform', activation=LeakyReLU(p["dense"]["leaky_relu"]))(x)
    x = Dropout(p["dense"]["dropout"])(x)
    # NOTE(review): the ReLU output cannot produce negative predictions. If
    # translate_to_dataset(scale=True) standardises the targets around zero,
    # this would clip them -- confirm the target scaling.
    output = Dense(output_size, kernel_initializer='he_uniform', activation="relu", name="output")(x)

    _model = Model(
        inputs=input_tensor,
        outputs=output,
        name='test',
    )

    _model.compile(
        optimizer=Adam(learning_rate=p["lr"]),
        loss='mse',
        # compile() documents `metrics` as a list; pass one explicitly.
        metrics=[RootMeanSquaredError()],
    )
    return _model

In [13]:
import tensorflow as tf
import numpy as np
import random
from tensorflow.keras import backend as K

def reset_seeds():
    """Re-seed NumPy, ``random`` and TensorFlow with fixed seeds (1, 2, 3).

    TensorFlow 2.x is seeded through ``tf.random.set_seed``; any other version
    string falls back to ``tf.set_random_seed``.
    """
    np.random.seed(1)
    random.seed(2)
    # Only the attribute for the running TensorFlow major version is looked up.
    seed_tensorflow = tf.random.set_seed if tf.__version__[0] == '2' else tf.set_random_seed
    seed_tensorflow(3)

In [14]:
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau


# Multiply the learning rate by 0.2 after 5 epochs without a lower val_loss,
# never going below 1e-6.
rlr_cb = ReduceLROnPlateau(
    monitor="val_loss",
    mode="min",
    factor=0.2,
    patience=5,
    min_lr=1e-6,
    verbose=False,
)
# Stop after 15 epochs without a lower val_loss and restore the best weights.
ely_cb = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=15,
    restore_best_weights=True,
    verbose=False,
)

In [15]:
import json

def calc_metric(real, pred, include_mse=False):
    """Evaluate the model-quality metrics for one set of predictions.

    Args:
        real: ground-truth values.
        pred: predicted values.
        include_mse: when True, ``calc_mse`` is appended as a seventh value.
            It is off by default because the metric tables built by the
            callers have exactly six columns (r2, corr, nmse, fb, b, a/c).

    Returns:
        1-D float array with one entry per metric, in the order
        R Square, Corr, NMSE, FB, B, a/C (and MSE if requested).

    The previous version listed seven metric functions but only six labels, so
    printing the label of the seventh raised IndexError on every call.
    """
    # Keeping each label next to its function means the two cannot drift apart.
    named_metrics = [
        ("R Square", calc_r2),
        ("Corr", calc_corrcoef),
        ("NMSE", calc_nmse),
        ("FB", calc_fb),
        ("B", calc_b),
        ("a/C", calc_a_div_co),
    ]
    if include_mse:
        named_metrics.append(("MSE", calc_mse))

    res = np.zeros(len(named_metrics))
    for i, (label, metric) in enumerate(named_metrics):
        # The label is printed before the metric runs, as before.
        print(label)
        res[i] = metric(real, pred)
    return res

def optimize_params(param, idc, proj_dir, layer_type='conv'):
    """Train and evaluate one model per sampled hyperparameter row.

    For every row of ``idc`` this creates ``<proj_dir>/trialNNN``, writes the
    resolved hyperparameters to ``params.json`` and the training history to
    ``history.csv``, then evaluates on the test split and rewrites
    ``<proj_dir>/metric.csv`` so that partial results survive an interruption.

    Args:
        param: family-specific parameter grid (e.g. ``conv_params``).
        idc: 2-D array of candidate indices, one row per trial.
        proj_dir: directory that receives the trial folders and ``metric.csv``.
        layer_type: forwarded to ``model_builder``; pass ``'rnn'`` for
            ``rnn_params``. Defaults to ``'conv'``, which is what was always
            built before this argument existed.

    Fixes over the previous version: the metric table was allocated with
    ``get_param_len(param)`` columns but six column names, which only matched
    by coincidence for ``conv_params``; the redundant ``outfile.close()``
    inside the ``with`` block is gone.
    """
    metric_columns = ['r2', 'corr', 'nmse', 'fb', 'b', 'a/c']
    metric_df = pd.DataFrame(np.zeros((len(idc), len(metric_columns))), columns=metric_columns)
    n_targets = len(TARGET_LABEL)

    for i, trial_idx in enumerate(idc):
        # Same seeds for every trial so that runs differ only by hyperparameters.
        reset_seeds()
        root_dir = proj_dir+f'/trial{i:03d}'
        create_folder(root_dir, ignore_exist=True)
        param_values = get_smp_values(param, trial_idx)
        print(f'[INFO] Trial{i:03d} training start')
        with open(f"{root_dir}/params.json", "w") as outfile:
            json.dump(param_values, outfile)

        # Datasets are pre-built per window size and keyed by its string form.
        window_key = str(param_values["window_size"])
        X_train = train_ds[window_key][0]
        y_train = train_ds[window_key][1].reshape(-1, n_targets)
        X_val = val_ds[window_key][0]
        y_val = val_ds[window_key][1].reshape(-1, n_targets)
        X_test = test_ds[window_key][0]
        y_test = test_ds[window_key][1].reshape(-1, n_targets)

        model = model_builder(param_values, X_train[0].shape, y_train.shape[1], layer_type)

        with tf.device("/device:GPU:0"):
            history = model.fit(
                x=X_train,
                y=y_train,
                batch_size=param_values["batch_size"],
                shuffle=False,
                epochs=100,
                validation_data=(X_val, y_val),
                callbacks=[rlr_cb, ely_cb],
                verbose=False,
            )
            pd.DataFrame(history.history).to_csv(root_dir+'/history.csv', index=False)
            print(f'[INFO] Trial{i:03d} finished training')

            y_hat = model.predict(X_test)
            print(f'[INFO] Trial{i:03d} finished predict')
            metric = calc_metric(y_test, y_hat)
            metric_df.iloc[i] = metric
            metric_df.to_csv(f'{proj_dir}/metric.csv', index_label='index')
            print(f'[INFO] Trial{i:03d} successfully ended.. Clear session')

        # Release the graph/session state before the next trial builds a model.
        tf.compat.v1.reset_default_graph()
        del model
        K.clear_session()

In [16]:
# optimize_params(conv_params, conv_idc[:128], 'lhs_opt/2022-08-23_17:10')

In [None]:
import time

def train_model(param, _model, out_dir=None):
    """Fit ``_model`` on the pre-built datasets and save its training history.

    Args:
        param: resolved hyperparameters; ``window_size`` selects the dataset
            and ``batch_size`` is used for fitting.
        _model: compiled Keras model to train (trained in place).
        out_dir: directory that receives ``history.csv``. When omitted, the
            notebook-level ``root_dir`` variable is used, as before.

    Returns:
        The Keras ``History`` object returned by ``fit`` (previously nothing
        was returned).

    Fixes over the previous version: the batch size was read from the global
    ``param_values`` instead of the ``param`` argument, and
    ``K.clear_session()`` was called here although the caller still needs the
    model for ``predict``. The caller already clears the session once it is
    done with the model.
    """
    if out_dir is None:
        # Backward-compatible fallback for callers that rely on the global.
        out_dir = root_dir

    window_key = str(param["window_size"])
    X_train = train_ds[window_key][0]
    y_train = train_ds[window_key][1].reshape(-1, 3)
    X_val = val_ds[window_key][0]
    y_val = val_ds[window_key][1].reshape(-1, 3)

    history = _model.fit(
        x=X_train,
        y=y_train,
        batch_size=param["batch_size"],
        shuffle=False,
        epochs=100,
        validation_data=(X_val, y_val),
        callbacks=[rlr_cb, ely_cb],
        verbose=False,
    )
    pd.DataFrame(history.history).to_csv(out_dir+'/history.csv', index=False)
    print('[INFO] Finished training')
    return history

# Run the first 128 sampled configurations of the convolutional family.
idc = conv_idc[:128]
param = conv_params
proj_dir = 'lhs_opt/2022-08-23_17:10'

# One row per trial, one column per metric returned by calc_metric. The width
# now comes from the column list itself; it used to come from
# get_param_len(param), which equals 6 only by coincidence for conv_params.
metric_columns = ['r2', 'corr', 'nmse', 'fb', 'b', 'a/c']
metric_df = pd.DataFrame(np.zeros((len(idc), len(metric_columns))), columns=metric_columns)

for i, conv_idx in enumerate(idc):
    # Same seeds for every trial so that runs differ only by hyperparameters.
    reset_seeds()
    # root_dir is also read by train_model as the output directory.
    root_dir = proj_dir+f'/trial{i:03d}'
    create_folder(root_dir, ignore_exist=True)
    param_values = get_smp_values(param, conv_idx)
    print(f'[INFO] Trial{i:03d} training start')
    # The with-block closes the file; no explicit close() is needed.
    with open(f"{root_dir}/params.json", "w") as outfile:
        json.dump(param_values, outfile)

    X_test = test_ds[str(param_values["window_size"])][0]
    y_test = test_ds[str(param_values["window_size"])][1].reshape(-1, 3)

    model = model_builder(param_values, X_test[0].shape, y_test.shape[1])
    train_model(param_values, model)

    # NOTE(review): batch_size=1 makes prediction very slow on large test sets;
    # kept as-is in case it works around a memory limit -- confirm.
    y_hat = model.predict(X_test, batch_size=1)
    print(f'[INFO] Trial{i:03d} finished predict')

    # Free graph/session state before the next trial; y_hat is a plain array
    # from predict(), so the metrics can be computed afterwards.
    tf.compat.v1.reset_default_graph()
    del model
    K.clear_session()
    print(f'[INFO] Trial{i:03d} successfully ended.. Clear session')

    metric = calc_metric(y_test, y_hat)
    print(f'[INFO] Trial{i:03d} calculated metrics')
    metric_df.iloc[i] = metric
    # Rewritten after every trial so partial results survive an interruption.
    metric_df.to_csv(f'{proj_dir}/metric.csv', index_label='index')
    print(f'[INFO] Trial{i:03d} successfully saved metrics')

Creating folder in `lhs_opt/2022-08-23_17:10/trial000`
[INFO] Trial000 training start
[INFO] Finished training


In [127]:
# Summary statistics of a saved metric table (one row per trial, keyed by the 'index' column).
# NOTE(review): this reads run '2022-08-24_11:38', while the trial loop in this
# notebook writes to 'lhs_opt/2022-08-23_17:10' -- confirm this is the intended run.
pd.read_csv('lhs_opt/2022-08-24_11:38/metric.csv', index_col='index').describe()

  diff_b_a = subtract(b, a)


Unnamed: 0,r2,corr,nmse,fb,b,a/c
count,128.0,128.0,128.0,128.0,128.0,128.0
mean,-1.234757,0.999987,inf,-0.826999,0.447484,0.086904
std,2.27691,3.1e-05,,0.856268,0.341585,0.159765
min,-4.315153,0.999911,0.01842,-2.0,0.0,-0.456213
25%,-4.300174,0.999996,0.039599,-2.0,0.0,0.0
50%,0.433191,0.999999,0.057279,-0.2349,0.643407,0.095825
75%,0.583454,1.0,,-0.160986,0.715055,0.148432
max,0.772793,1.0,inf,0.024952,0.961168,0.877114
