In [1]:
import matplotlib as mpl
import sys
import json

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import tensorflow as tf

# Add the project's script folder to the import path (presumably where the
# data_handler / metrics / utils modules imported in the next cell live).
sys.path.append('../scripts/particles/')

# Render all figures at 200 dpi.
mpl.rcParams['figure.dpi'] = 200


In [2]:
import data_handler as dh
import metrics
import utils

In [3]:
# Column names passed to dh.dfs_to_dataset as the model outputs; also used to
# name the per-output result CSVs.
outputs = ['PM1', 'PM2.5', 'PM10']
# Column names passed to dh.dfs_to_dataset as the model inputs (13 entries).
inputs = [
    'PM1_2.5_OUT',
    'PM1_2.5_H_OUT',
    'PM2.5_OUT',
    'PM2.5_H_OUT',
    'PM2.5_10_OUT',
    'PM2.5_10_H_OUT',
    'PERSON_NUMBER',
    'AIR_PURIFIER',
    'WINDOW',
    'AIR_CONDITIONER',
    'DOOR',
    'WIND_DEG',
    'HUMIDITY'
]

# Location of the trained model; its config.json carries the data/model settings
# read throughout the notebook.
model_dir = '../../projects/particle/model'
model_name = 'conv_20'
model_path = model_dir + '/' + model_name

config_path = model_path + "/config.json"
# The context manager closes the handle even when json.load raises.
with open(config_path, "r") as f:
    config = json.load(f)

in_time_step = config["model"]["window_size"]
out_time_step = 1
offset = config["model"]["offset"]

# Mean of the Gaussian shift that to_dataset adds to the outdoor PM columns.
del_mean = -15

In [4]:
# Load the trained Keras model stored as <model_path>/result/model/<model_name>.h5.
model = tf.keras.models.load_model(f'{model_path}/result/model/{model_name}.h5')

In [5]:
# Weather covariates indexed by the parsed DATE column.
weather_df = pd.read_csv('../../storage/particle/weather.csv', index_col='DATE',
                         parse_dates=True)[['TEMPERATURE', 'WIND_DEG', 'WIND_SPEED', 'HUMIDITY']]
# Replace the wind direction (degrees) by its sine.
weather_df['WIND_DEG'] = np.sin(weather_df['WIND_DEG'].values * np.pi / 180)

df_org = dh.load_data('../../storage/particle/data.csv')
df_org = dh.add_pm_diff(df_org)

# Columns handed to the moving average as `excludes`; the unsmoothed originals
# are re-attached from df_org right after.
excludes = ['PERSON_NUMBER', 'AIR_PURIFIER',
            'AIR_CONDITIONER', 'WINDOW', 'DOOR']
df = dh.apply_moving_average(pd.concat([df_org, weather_df], axis=1),
                             window=config['data']['moving_average_window'],
                             method=config['data']['moving_average_method'],
                             excludes=excludes,
                             min_periods=1)
df = pd.concat([df, df_org[excludes]], axis=1)
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is
# deprecated since pandas 2.1.
df[excludes] = df[excludes].ffill()
# Reassign instead of mutating in place so the cell reads as a plain pipeline.
df = df.dropna()

dfs = dh.trim_df(df, config['data']['dates'])
val_size = config['data']['validation']
test_size = config['data']['test']

train_dfs, val_dfs, test_dfs = dh.train_test_split_df(dfs, val_size, test_size)
# Summary statistics of the training frames; passed to dh.dfs_to_dataset later.
meta_df = pd.concat(train_dfs).describe()

In [6]:
# Input window length; reads the same config entry as `in_time_step` defined earlier.
win_size = config['model']['window_size']

In [7]:
def to_dataset(_dfs, in_time_step, delta_mean, rng=None):
    """Shift the outdoor PM columns by Gaussian noise and build the model dataset.

    For every frame in ``_dfs`` one noise vector ``delta ~ N(delta_mean, 2)`` is
    drawn with one value per row. The same vector is added to ``PM1_OUT``,
    ``PM2.5_OUT`` and ``PM10_OUT``; each ``*_H_OUT`` column receives the vector
    multiplied by a fixed per-column weight. Every shifted column is clipped at
    zero. The frames in ``_dfs`` are copied, not modified.

    Parameters
    ----------
    _dfs : iterable of pandas.DataFrame
        Frames containing the six ``*_OUT`` / ``*_H_OUT`` columns named above.
    in_time_step : int
        Forwarded to ``dh.dfs_to_dataset`` as ``in_time_step``.
    delta_mean : float
        Mean of the Gaussian shift.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted the global ``np.random`` state is
        used, exactly as before; pass a seeded generator for reproducible noise.

    Returns
    -------
    Whatever ``dh.dfs_to_dataset`` returns for the shifted frames (callers in
    this notebook unpack it as ``X, y``).
    """
    raw_out_columns = ('PM1_OUT', 'PM2.5_OUT', 'PM10_OUT')
    # Weight applied to the shared noise vector for each *_H_OUT column.
    h_out_scale = {
        'PM1_H_OUT': 0.71440267,
        'PM2.5_H_OUT': 0.68195107,
        'PM10_H_OUT': 0.6384208,
    }
    sampler = np.random if rng is None else rng

    new_dfs = []
    for _df in _dfs:
        t_df = _df.copy()
        delta = sampler.normal(loc=delta_mean, scale=2, size=len(_df))
        for column in raw_out_columns:
            t_df[column] = np.maximum(t_df[column] + delta, 0)
        for column, scale in h_out_scale.items():
            t_df[column] = np.maximum(t_df[column] + delta * scale, 0)
        new_dfs.append(t_df)
    return dh.dfs_to_dataset(new_dfs, meta_df, inputs, outputs, in_time_step=in_time_step,
                             out_time_step=out_time_step, offset=offset, excludes=outputs)

# Noise-shifted test set built once; X_test feeds the reference prediction and
# y_test is the ground truth used in the interval evaluation.
X_test, y_test = to_dataset(test_dfs, win_size, del_mean)

In [8]:
# Number of stochastic forward passes per dropout rate.
NUM_ITER = 10000
# Confidence levels from 0 in steps of 0.05; the stop value is padded by one
# step so that 1.0 is meant to be included.
cis = list(np.arange(0, 1 + 0.05, 0.05))
# Dropout rates from 0.2 in steps of 0.05, padded the same way to reach 0.6.
dropouts = list(np.arange(0.2, 0.6 + 0.05, 0.05))
# Prediction of the loaded model on the noise-shifted test inputs.
pred = model.predict(X_test)



In [9]:
# Keep the trained parameters so they can be loaded into the rebuilt models.
weights = model.get_weights()

In [10]:
# Drop the loaded model and reset Keras' global state before the network is
# rebuilt; `weights` and `pred` captured earlier stay available.
del model

tf.keras.backend.clear_session()

In [11]:
from tensorflow.keras import Model
from tensorflow.keras.layers import (
    GRU,
    LSTM,
    Conv1D,
    Dense,
    Dropout,
    Flatten,
    GlobalMaxPooling1D,
    Input,
    LeakyReLU,
    MaxPooling1D,
    Attention,
    Permute,
)
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam

# Rebuild the network layer by layer; its config is later edited (dropout rate)
# and re-instantiated. The input shape is derived from the notebook settings
# instead of the literals (60, 13): window length from the config and one
# feature per entry of `inputs`.
input_tensor = Input(shape=(win_size, len(inputs)), name="input")
x = Conv1D(32, kernel_size=3, kernel_initializer='he_uniform', activation='relu', strides=1, padding='same')(input_tensor)
x = MaxPooling1D(pool_size=5, strides=3)(x)
x = Flatten()(x)
x = Dense(256, kernel_initializer='he_uniform', activation='relu')(x)
x = Dropout(0.5)(x)
# One output unit per entry on the last axis of y_test.
output = Dense(y_test.shape[2], kernel_initializer='he_uniform', activation="relu", name="output")(x)

model = Model(
    inputs=input_tensor,
    outputs=output,
    name=f'{config["name"].lower()}_v{config["version"]}',
)

model.compile(
    optimizer=Adam(learning_rate=config["model"]["lr"]),
    loss=config["model"]["loss"].lower(),
    # Keras documents `metrics` as a list of metrics.
    metrics=[RootMeanSquaredError()],
)

In [12]:
# Architecture description of the rebuilt model; passed to predict_with_dropout,
# which instantiates a model from it for each dropout rate.
conf = model.get_config()

In [13]:
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model, Sequential


def create_dropout_predict_function(_conf, _weights, _dropout):
    """Instantiate a model from ``_conf`` with every dropout rate set to ``_dropout``.

    ``Dropout`` layers get their ``rate`` replaced; any other layer whose config
    has a ``dropout`` entry gets that entry replaced. The override is applied to
    a deep copy, so the caller's ``_conf`` is left untouched and can be reused
    for the next dropout rate.

    Parameters
    ----------
    _conf : dict
        Model config as returned by ``Model.get_config()``; must contain a
        ``"layers"`` list of ``{"class_name": ..., "config": {...}}`` entries.
    _weights : list
        Weights passed to ``set_weights`` on the new model.
    _dropout : float
        Dropout rate to apply.

    Returns
    -------
    The model built by ``Model.from_config`` with ``_weights`` loaded.
    """
    import copy  # local import keeps the cell self-contained

    conf = copy.deepcopy(_conf)
    for layer in conf["layers"]:
        layer_conf = layer["config"]
        if layer["class_name"] == "Dropout":
            layer_conf["rate"] = _dropout
        elif "dropout" in layer_conf:
            layer_conf["dropout"] = _dropout

    model_dropout = Model.from_config(conf)
    model_dropout.set_weights(_weights)

    return model_dropout

def predict_with_dropout(_conf, _weights, _X, _y, _dropout, num_iter):
    """Run ``num_iter`` forward passes of the model with dropout kept active.

    A model is built from ``_conf`` / ``_weights`` with dropout rate ``_dropout``
    and called ``num_iter`` times on ``_X`` with ``training=True``.

    Parameters
    ----------
    _conf, _weights, _dropout
        Forwarded to ``create_dropout_predict_function``.
    _X : array-like
        Model input; converted to a tensor once.
    _y : numpy.ndarray
        2-D array; only its shape is used, to size the result.
    num_iter : int
        Number of forward passes.

    Returns
    -------
    numpy.ndarray of shape ``(num_iter, _y.shape[0], _y.shape[1])`` holding the
    output of each pass.
    """
    dropout_model = create_dropout_predict_function(_conf, _weights, _dropout)
    predictions = np.zeros((num_iter, _y.shape[0], _y.shape[1]))

    with tf.device('/gpu:0'):
        # The input is the same for every pass, so convert it a single time.
        x_tensor = tf.convert_to_tensor(_X)
        for i in range(num_iter):
            predictions[i] = dropout_model(x_tensor, training=True)

    return predictions

In [14]:
from tensorflow.keras.models import load_model
from scipy import stats


def calc_ci(_X, _y, predictions, conf_int, num_iter, point_pred=None):
    """Student-t confidence interval of the MC-dropout mean and its coverage of ``_y``.

    The interval is centred on the mean of ``predictions`` over the first axis and
    uses the standard error of that mean with ``num_iter - 1`` degrees of freedom.

    Note: the standard error is computed with ``ddof=1``. Passing the t
    distribution's degrees of freedom (``num_iter - 1``) as ``ddof`` to
    ``stats.sem`` makes it divide the variance by ``n - ddof = 1``, which
    inflates the scale by ``sqrt(num_iter - 1)``. If a band reflecting the
    spread of the individual draws is wanted instead of a CI of the mean, use
    ``np.std(predictions, axis=0)`` as the scale.

    Parameters
    ----------
    _X : unused
        Kept for interface compatibility.
    _y : numpy.ndarray, shape (n_samples, n_outputs)
        Ground truth.
    predictions : numpy.ndarray, shape (num_iter, n_samples, n_outputs)
        Stochastic forward passes.
    conf_int : float
        Confidence level passed to ``stats.t.interval``.
    num_iter : int
        Number of passes; sets the degrees of freedom.
    point_pred : numpy.ndarray, shape (n_samples, n_outputs), optional
        Values stored in the ``pred`` column. Defaults to the notebook-level
        ``pred`` array, as before.

    Returns
    -------
    (numpy.ndarray, list of pandas.DataFrame)
        Per-output percentage of samples with ``lower <= real <= upper``, and one
        frame per output with columns ``real``, ``lower``, ``upper``, ``pred``.
    """
    if point_pred is None:
        point_pred = pred  # notebook-level prediction of the loaded model

    dof = num_iter - 1
    num_samples = len(_y)
    num_outputs = _y.shape[1]

    center = np.mean(predictions, axis=0)
    lower, upper = stats.t.interval(
        conf_int, dof, loc=center, scale=stats.sem(predictions, ddof=1, axis=0)
    )

    _res = np.zeros(num_outputs)
    _dfs = []
    for i in range(num_outputs):
        _df = pd.DataFrame(
            {"real": _y[:, i], "lower": lower[:, i], "upper": upper[:, i], "pred": point_pred[:, i]}
        )
        inside = (_df["real"] <= _df["upper"]) & (_df["real"] >= _df["lower"])
        _res[i] = int(inside.sum()) / num_samples * 100
        _dfs.append(_df)
    return _res, _dfs

In [None]:
import datetime as dt
import json
import os
import shutil

def create_folder(path):
    """Ensure ``path`` exists as a directory, optionally wiping a previous one.

    When ``path`` already exists the user is prompted; answering ``Y`` or ``y``
    deletes the existing tree first, any other answer keeps its contents. The
    directory (with parents) is then created if it is missing.
    """
    already_there = os.path.exists(path)
    if already_there:
        answer = input(f'Folder name `{path}` already exsists. You mean overwrite?[Y/n]')
        if answer in ('Y', 'y'):
            shutil.rmtree(path)
    os.makedirs(path, exist_ok=True)

# Results of this run are stored under a folder named after the noise mean.
proj_dir = f'../../projects/particle/ci_result/pm{del_mean:02d}'

create_folder(proj_dir + "/predict")
create_folder(proj_dir + "/ci")

# Loop names are chosen so that the notebook-level `df` / `dfs` (the prepared
# data frames) are not overwritten when this cell runs.
for dropout in dropouts:
    # A freshly shifted dataset is drawn for every dropout rate.
    X, y = to_dataset(test_dfs, win_size, del_mean)
    predict = predict_with_dropout(conf, weights, X, y.reshape(-1, 3), dropout, NUM_ITER)
    np.save(f"{proj_dir}/predict/d_{dropout:.02f}.npy", predict)
    for ci in cis:
        _, ci_frames = calc_ci(X_test, y_test.reshape(-1, 3), predict, ci, NUM_ITER)
        # One CSV per output column, per dropout rate and confidence level.
        for out_name, ci_frame in zip(outputs, ci_frames):
            ci_frame.to_csv(
                f"{proj_dir}/ci/d_{dropout:.02f}_ci_{ci:.02f}_{out_name}.csv",
                index=False,
            )

In [None]:
# NOTE(review): absolute, machine-specific path into a result folder
# (2022-09-21_06:49) that differs from `proj_dir`; this cell fails on any other
# machine. Consider f'{proj_dir}/predict/d_0.25.npy' — confirm which run is meant.
test_data = np.load('/home/jiheo/workspace/deep_learning/projects/particle/ci_result/2022-09-21_06:49/predict/d_0.25.npy')

In [None]:
# Plot element [0, 0] along the first axis of the loaded array (presumably the
# successive stochastic passes for one sample and one output — confirm the
# layout of that file).
plt.plot(test_data[:, 0, 0])

In [None]:
# Recompute the interval CSVs from the predictions saved on disk.
# Loop names are chosen so that the notebook-level `df` / `dfs` (the prepared
# data frames) are not overwritten when this cell runs.
for dropout in dropouts:
    predict = np.load(f'{proj_dir}/predict/d_{dropout:.02f}.npy')
    for ci in cis:
        _, ci_frames = calc_ci(X_test, y_test.reshape(-1, 3), predict, ci, NUM_ITER)
        # One CSV per output column, per dropout rate and confidence level.
        for out_name, ci_frame in zip(outputs, ci_frames):
            ci_frame.to_csv(
                f"{proj_dir}/ci/d_{dropout:.02f}_ci_{ci:.02f}_{out_name}.csv",
                index=False,
            )

In [None]:
# Collect the coverage score of every (dropout, confidence level) pair into one
# summary CSV per output column. Iterating `outputs` directly replaces the
# hard-coded range(3), and the frame name avoids overwriting the notebook-level `df`.
for out_name in outputs:
    rows = []
    for dropout in dropouts:
        for ci in cis:
            ci_frame = pd.read_csv(
                f"{proj_dir}/ci/d_{dropout:.2f}_ci_{ci:.2f}_{out_name}.csv"
            )
            inside = (ci_frame["real"] <= ci_frame["upper"]) & (
                ci_frame["real"] >= ci_frame["lower"]
            )
            # Percentage of rows whose real value lies within [lower, upper].
            score = int(inside.sum()) / len(ci_frame) * 100
            rows.append((dropout, ci, score))
    pd.DataFrame(rows, columns=["dropout", "ci", "score"]).to_csv(
        f"{proj_dir}/ci_{out_name}.csv", index=False
    )

In [None]:
# Coverage score against confidence level for the first output, one line per
# dropout rate. The summary file is named after the entry of `outputs` ('PM1');
# a lower-case 'ci_pm1.csv' is not found on a case-sensitive filesystem.
pm1_ci = pd.read_csv(f'{proj_dir}/ci_{outputs[0]}.csv')

ax = None
for dropout in dropouts:
    # np.isclose guards against the dropout value changing in its last digit
    # when it is written to and parsed back from CSV.
    # ax=None on the first pass lets pandas create the figure.
    ax = pm1_ci[np.isclose(pm1_ci["dropout"], dropout)].plot(
        x="ci", y="score", figsize=(12, 10), ax=ax
    )

legend_labels = [f"dropout={x:.2f}" for x in dropouts]
ax.legend(legend_labels)

In [None]:
# Prediction, ground truth and interval band for dropout 0.55 at the 0.95 level,
# first output. The file name uses the entry of `outputs` ('PM1'); a lower-case
# '..._pm1.csv' is not found on a case-sensitive filesystem.
ci_test_df = pd.read_csv(f"{proj_dir}/ci/d_0.55_ci_0.95_{outputs[0]}.csv")
ax = ci_test_df.plot(y="pred", figsize=(30, 10), color="r")
ax = ci_test_df.plot(y="real", figsize=(30, 10), color="b", ax=ax)
# Shade the region between the lower and upper interval bounds.
ax.fill_between(
    ci_test_df.index,
    ci_test_df["lower"],
    ci_test_df["upper"],
    facecolor="green",
    alpha=0.2,
    interpolate=True,
)
plt.show()