In [None]:
import copy
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from plotly import graph_objects as go
from plotly.subplots import make_subplots

sys.path.append("../scripts/")

In [None]:
import particle_data
from particle_utils import *

In [None]:
# Columns the model predicts; forwarded to gen_dataset as `targets=`.
TARGET_LABEL = ["PM1", "PM2.5", "PM10"]
# Input columns; forwarded to gen_dataset as `features=`.
FEATURE_LABEL = [
    "PM1_2.5_OUT",
    "PM1_2.5_H_OUT",
    "PM2.5_OUT",
    "PM2.5_H_OUT",
    "PM2.5_10_OUT",
    "PM2.5_10_H_OUT",
    "PERSON_NUMBER",
    "AIR_PURIFIER",
    "WINDOW",
    "AIR_CONDITIONER",
    "DOOR",
]

# Windowing parameters forwarded to gen_dataset as
# `window_size=`, `offset=` and `output_size=` respectively.
WINDOW_SIZE = 30
OFFSET = 0
OUTPUT_SIZE = 1
# NOTE(review): BATCH_SIZE is not referenced anywhere in the visible notebook.
BATCH_SIZE = 64

In [None]:
def split_dfs(_dfs, target_date):
    """Split each frame in ``_dfs`` at ``target_date`` into train/test parts.

    Rows whose index is strictly before ``target_date`` go to the train list,
    rows at or after ``target_date`` go to the test list. The split is
    half-open, so no row ever appears in both lists.

    Args:
        _dfs: iterable of DataFrames whose index is comparable with
            ``target_date``.
        target_date: first index value that belongs to the test side.

    Returns:
        ``(_train_dfs, _test_dfs)``: two lists of DataFrames. Frames that lie
        entirely on one side are appended as-is (same object); frames that
        straddle the boundary contribute one piece to each list. Empty frames
        are skipped.
    """
    _train_dfs = []
    _test_dfs = []
    for _d in _dfs:
        if len(_d) == 0:
            # Nothing to assign; the original would fail on `_d.index[0]`.
            continue
        # Boolean masks instead of `.loc[:t]` / `.loc[t:]`: label slices are
        # inclusive on both ends, which duplicated the row at `target_date`
        # into train *and* test.
        _in_test = _d.index >= target_date
        if _in_test.all():
            _test_dfs.append(_d)
        elif not _in_test.any():
            _train_dfs.append(_d)
        else:
            _train_dfs.append(_d.loc[~_in_test])
            _test_dfs.append(_d.loc[_in_test])
    return _train_dfs, _test_dfs

def train_test_split_df(_dfs, val_size, test_size):
    """Partition ``_dfs`` chronologically into train / validation / test lists.

    The frames are concatenated only to obtain a combined index; the index
    values at the train and train+validation row counts are used as the two
    boundaries passed to ``split_dfs``.

    Args:
        _dfs: list of DataFrames to split.
        val_size: fraction of all rows assigned to validation.
        test_size: fraction of all rows assigned to test.

    Returns:
        ``(_train_dfs, _val_dfs, _test_dfs)`` lists of DataFrames.
    """
    _timeline = pd.concat(_dfs).index
    _n_rows = len(_timeline)
    _n_train = int((1 - val_size - test_size) * _n_rows)
    _n_val = int(val_size * _n_rows)

    # First cut: train vs. everything else; second cut: validation vs. test.
    _train_dfs, _remainder = split_dfs(_dfs, _timeline[_n_train])
    _val_dfs, _test_dfs = split_dfs(_remainder, _timeline[_n_train + _n_val])
    return _train_dfs, _val_dfs, _test_dfs

def translate_to_dataset(_dfs, _meta=None):
    """Turn a list of frames into model inputs/targets via ``gen_dataset``.

    Args:
        _dfs: list of DataFrames to window.
        _meta: scaling metadata passed through to ``gen_dataset``. When
            omitted, it is computed from ``_dfs`` with ``get_meta`` over the
            outdoor-PM and ``PERSON_NUMBER`` columns listed below.

    Returns:
        ``(_X, _y)`` exactly as returned by ``gen_dataset``.
    """
    _m = _meta
    # Identity check: `== None` is evaluated elementwise for array-like or
    # DataFrame metadata and then fails inside `if`.
    if _meta is None:
        _m = get_meta(_dfs, [
            "PERSON_NUMBER",
            "PM2.5_OUT",
            "PM2.5_H_OUT",
            "PM1_2.5_OUT",
            "PM1_2.5_H_OUT",
            "PM2.5_10_OUT",
            "PM2.5_10_H_OUT",
            "PM1_OUT",
            "PM1_H_OUT",
            "PM10_OUT",
            "PM10_H_OUT",
            # "HUMIDITY",
        ],)
    _X, _y = gen_dataset(
        _dfs,
        _m,
        features=FEATURE_LABEL,
        targets=TARGET_LABEL,
        window_size=WINDOW_SIZE,
        output_size=OUTPUT_SIZE,
        offset=OFFSET,
        # Only the continuous feature columns are scaled.
        scale_cols=[
            'PM2.5_OUT',
            'PM2.5_H_OUT',
            'PM1_2.5_OUT',
            'PM1_2.5_H_OUT',
            'PM2.5_10_OUT',
            'PM2.5_10_H_OUT',
            'PERSON_NUMBER',
        ],
        scale=True,
        verbose=True,
    )
    return _X, _y

In [None]:
import json
from tensorflow.keras import backend as K


class ProjectHanlder:
    """Loads a trained project's model and saved predictions from its config.

    The config is a JSON file at ``<src>/config.json``. The keys read here are
    ``root_dir``, ``name``, ``version`` and ``dirs`` (with ``model`` and
    ``predict`` entries).

    Attributes set by this class:
        config_dict: parsed contents of ``config.json``.
        model: the Keras model returned by ``load_model`` (``None`` until
            loaded and again after ``clear_session``).
        root_dir: ``<root_dir>/<name><version>``; set by ``load_model`` or
            ``load_prediction``.
    """

    def __init__(self, src):
        """Read ``<src>/config.json`` into ``self.config_dict``."""
        config_path = src + "/config.json"
        # Context manager closes the handle even if json.load raises.
        with open(config_path, "r") as f:
            self.config_dict = json.load(f)
        self.model = None
        self.root_dir = None

    def _resolve_root_dir(self):
        """Compute, store and return ``<root_dir>/<name><version>``."""
        self.root_dir = (
            self.config_dict["root_dir"]
            + "/"
            + self.config_dict["name"]
            + self.config_dict["version"]
        )
        return self.root_dir

    def load_model(self):
        """Load ``<root>/<dirs.model>/<name.lower()>_<version>.h5`` with Keras.

        Returns:
            The loaded model, also stored on ``self.model``.
        """
        model_path = (
            self._resolve_root_dir()
            + "/"
            + self.config_dict["dirs"]["model"]
            + "/"
            + self.config_dict["name"].lower()
            + "_"
            + self.config_dict["version"]
            + ".h5"
        )
        self.model = tf.keras.models.load_model(model_path)
        return self.model

    def load_prediction(self):
        """Read ``<root>/<dirs.predict>/predict.csv`` into a DataFrame.

        The ``DATE`` column is removed from the frame, parsed with
        ``pd.to_datetime`` and used as the index.

        Returns:
            The prediction DataFrame indexed by the parsed ``DATE`` values.
        """
        pred_path = (
            self._resolve_root_dir()
            + "/"
            + self.config_dict["dirs"]["predict"]
            + "/"
            + "predict.csv"
        )
        res_df = pd.read_csv(pred_path)
        res_df.index = res_df.pop('DATE').apply(pd.to_datetime)
        return res_df

    def clear_session(self):
        """Drop this handler's model reference and reset the Keras session.

        The attribute is reset to ``None`` rather than deleted so the method
        can be called repeatedly and ``self.model`` stays readable afterwards.
        """
        self.model = None
        K.clear_session()

In [None]:
# Load the trained model and its stored predictions under the CPU device scope.
with tf.device('/cpu:0'):
    # ProjectHanlder reads "<src>/config.json" from this directory.
    ph = ProjectHanlder("project/GRU/GRUkt01")
    model = ph.load_model()
    # NOTE(review): `prediction` is not referenced again in the visible notebook.
    prediction = ph.load_prediction()

In [None]:
# Rebuild the train/validation/test datasets using the data settings stored in
# the project's config. load_pm, add_diff, apply_moving_average and trim_df are
# not defined in this notebook (presumably supplied by the star import from
# particle_utils - TODO confirm).
df_org = load_pm("../data/data.csv")
# Return value is discarded; presumably mutates df_org in place - TODO confirm.
add_diff(df_org)
df = apply_moving_average(
    df_org, 'median', ph.config_dict["model"]["data"]["moving_average_window"], True, 3
)

dfs = trim_df(df, ph.config_dict["model"]["data"]["used_data"])
# Scaling metadata saved in the project config.
meta = ph.config_dict['model']['data']['meta']

# Chronological split using the validation/test fractions from the config.
train_dfs, val_dfs, test_dfs = train_test_split_df(dfs, ph.config_dict["model"]["data"]["validation"], ph.config_dict["model"]["data"]["test"])

# NOTE(review): the train set is built without `meta`, so translate_to_dataset
# recomputes it from train_dfs, while val/test use the config's `meta`.
# Confirm the two agree.
X_train, y_train = translate_to_dataset(train_dfs)
X_val, y_val = translate_to_dataset(val_dfs, meta)
X_test, y_test = translate_to_dataset(test_dfs, meta)

print("X_train, y_train shape: ", X_train.shape, y_train.shape)
print("X_val, y_val shape: ", X_val.shape, y_val.shape)
print("X_test, y_test shape: ", X_test.shape, y_test.shape)

In [None]:
# Number of stochastic forward passes per dropout rate; passed as `num_iter`
# to predict_with_dropout and calc_ci.
NUM_ITER = 10000
# Confidence levels passed to calc_ci as `conf_int` (0 to 1 in steps of 0.05).
# NOTE(review): np.arange with a float step can include or drop the end point
# depending on rounding; np.linspace is the documented alternative.
cis = [x for x in np.arange(0, 1 + 0.05, 0.05)]
# Dropout rates to evaluate (0.2 to 0.6 in steps of 0.05).
dropouts = [x for x in np.arange(0.2, 0.6 + 0.05, 0.05)]
# Target names used in the output CSV file names.
targets = ["pm1", "pm2.5", "pm10"]
# Prediction of the loaded model on the test set; calc_ci reads this
# module-level name for its "pred" column.
pred = model.predict(X_test)

In [None]:
# Keep the architecture config and weights so the model can be rebuilt with
# different dropout rates (see create_dropout_predict_function).
conf = model.get_config()
weights = model.get_weights()

# Drops the handler's reference to the model and clears the Keras session.
ph.clear_session()

In [None]:
# Remove the notebook-level reference to the loaded model as well.
# NOTE(review): after this cell, `model` no longer exists, so the earlier
# `pred = model.predict(X_test)` cell cannot be re-run without reloading.
del model

# NOTE(review): ph.clear_session() in the previous cell already calls
# K.clear_session(); this second call looks redundant.
K.clear_session()

In [None]:
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model, Sequential


def create_dropout_predict_function(_conf, _weights, _dropout):
    """Rebuild a model from ``_conf`` with every dropout rate set to ``_dropout``.

    Args:
        _conf: model config dict with a ``"layers"`` list (as returned by
            ``model.get_config()``). It is not modified.
        _weights: weights to load into the rebuilt model.
        _dropout: rate written to ``rate`` of ``Dropout`` layers and to the
            ``dropout`` entry of any other layer whose config has one.

    Returns:
        The rebuilt model with ``_weights`` applied.
    """
    # Work on a private copy: editing `_conf` in place would silently change
    # the caller's config between successive dropout rates.
    conf = copy.deepcopy(_conf)
    for layer in conf["layers"]:
        if layer["class_name"] == "Dropout":
            layer["config"]["rate"] = _dropout
        elif "dropout" in layer["config"]:
            layer["config"]["dropout"] = _dropout

    # NOTE(review): an earlier draft dispatched to Sequential.from_config for
    # Sequential models; confirm Model.from_config is sufficient for every
    # config passed in here.
    model_dropout = Model.from_config(conf)
    model_dropout.set_weights(_weights)

    return model_dropout

def predict_with_dropout(_conf, _weights, _X, _y, _dropout, num_iter):
    """Run ``num_iter`` stochastic forward passes with dropout enabled.

    Args:
        _conf: model config passed to ``create_dropout_predict_function``.
        _weights: model weights passed to ``create_dropout_predict_function``.
        _X: model input.
        _y: 2-D targets; only its shape is used, to size the output array.
        _dropout: dropout rate applied to the rebuilt model.
        num_iter: number of forward passes.

    Returns:
        Array of shape ``(num_iter, _y.shape[0], _y.shape[1])`` holding one
        prediction per pass.
    """
    dropout_prediction = create_dropout_predict_function(_conf, _weights, _dropout)
    predictions = np.zeros((num_iter, _y.shape[0], _y.shape[1]))

    with tf.device('/gpu:0'):
        # The input does not change between passes, so convert it once instead
        # of on every iteration.
        inputs = tf.convert_to_tensor(_X)
        for i in range(num_iter):
            # training=True keeps dropout active at inference time.
            predictions[i] = dropout_prediction(inputs, training=True)

    return predictions

In [None]:
from tensorflow.keras.models import load_model
from scipy import stats


def calc_ci(_X, _y, predictions, conf_int, num_iter, point_pred=None):
    """Build per-target intervals from stochastic predictions and score coverage.

    Args:
        _X: unused; kept for call compatibility.
        _y: 2-D array of real values, one column per target.
        predictions: array indexed as ``(iteration, sample, target)``.
        conf_int: confidence level passed to ``stats.t.interval``.
        num_iter: number of iterations; ``num_iter - 1`` is used both as the
            t-distribution degrees of freedom and as ``ddof`` for ``stats.sem``.
        point_pred: optional 2-D array written to the ``"pred"`` column. When
            omitted, the notebook-level ``pred`` is used, as before.

    Returns:
        ``(_res, _dfs)``: ``_res[i]`` is the percentage of real values of
        target ``i`` inside ``[lower, upper]``; ``_dfs[i]`` is a DataFrame with
        columns ``real``, ``lower``, ``upper``, ``pred`` for that target.
    """
    if point_pred is None:
        # Legacy behaviour: fall back to the module-level prediction array.
        point_pred = pred

    ddof = num_iter - 1
    num_samples = len(_y)
    num_targets = _y.shape[1]
    m = np.mean(predictions, axis=0)
    # With ddof = n - 1 the std inside `sem` divides by 1, so when num_iter
    # equals the number of iterations in `predictions`, the scale
    # sqrt(SS) / sqrt(n) is the population standard deviation of the samples,
    # not a standard error of the mean. Kept as-is to preserve results.
    lower, upper = stats.t.interval(
        conf_int, ddof, loc=m, scale=stats.sem(predictions, ddof=ddof, axis=0)
    )

    _res = np.zeros(num_targets)
    _dfs = []
    for i in range(num_targets):
        _df = pd.DataFrame(
            {
                "real": _y[:, i],
                "lower": lower[:, i],
                "upper": upper[:, i],
                "pred": point_pred[:, i],
            }
        )
        covered = (_df["real"] <= _df["upper"]) & (_df["real"] >= _df["lower"])
        _res[i] = covered.sum() / num_samples * 100
        _dfs.append(_df)
    return _res, _dfs

In [None]:
# FIXME: abandoned scratch cell, disabled because it cannot run:
#   - `resd[]` is an empty subscript (SyntaxError),
#   - `np_pred` and `resdpred` are not defined anywhere in this notebook.
# Original content, kept for reference until the intent is clarified:
#   pred[:, 1]
#   resd = np.zeros(len(np_pred))
#   resdpred[0, 1] - resd[]

In [None]:
# Sample position (along the second axis of np_pred / first axis of pred) to inspect.
time = 17000

# NOTE(review): `np_pred` is not defined in the visible notebook. The indexing
# suggests a 3-axis array like predict_with_dropout's output
# (iteration, sample, target) - presumably loaded from a saved .npy; confirm.
# Column 1 is the second target.
data = np_pred[:, time, 1] - pred[time, 1]
# NOTE(review): rebinds `df`, which held the smoothed input data earlier.
df = pd.DataFrame(data)

# Mean and standard deviation of the per-iteration differences.
print(df.mean()[0], df.std()[0])
df.plot(kind='hist', bins=100, figsize=(22, 10))

In [None]:
import datetime as dt

# Output directory for this run, named after the current minute.
proj_dir = f'ci_result/{dt.datetime.now().strftime("%Y-%m-%d_%H:%M")}'

create_folder(proj_dir + "/predict")
create_folder(proj_dir + "/ci")

# The reshaped targets are identical for every dropout/ci pair: compute once.
y_test_2d = y_test.reshape(-1, 3)

for dropout in dropouts:
    # Stochastic predictions for this dropout rate, saved so the interval
    # tables can be recomputed later without re-running the model.
    predict = predict_with_dropout(conf, weights, X_test, y_test_2d, dropout, NUM_ITER)
    np.save(f"{proj_dir}/predict/d_{dropout:.02f}.npy", predict)
    for ci in cis:
        # Distinct names: the original loop rebound `dfs` and `df`, which hold
        # the input data frames from the data-preparation cell.
        _coverage, ci_frames = calc_ci(X_test, y_test_2d, predict, ci, NUM_ITER)
        for idx, ci_frame in enumerate(ci_frames):
            ci_frame.to_csv(
                f"{proj_dir}/ci/d_{dropout:.02f}_ci_{ci:.02f}_{targets[idx]}.csv",
                index=False,
            )

In [None]:
# Overrides the timestamped `proj_dir` set in the previous cell with a fixed
# directory, so the cells below read and write there instead (presumably the
# output of an earlier run - confirm the directory exists before running).
proj_dir = 'ci_result/2022-08-19_20:38'

In [None]:
# Recompute the interval tables from the saved stochastic predictions in
# `proj_dir`, without running the model again.
y_test_2d = y_test.reshape(-1, 3)

for dropout in dropouts:
    predict = np.load(f'{proj_dir}/predict/d_{dropout:.02f}.npy')
    for ci in cis:
        # Distinct names: the original loop rebound `dfs` and `df`, which hold
        # the input data frames from the data-preparation cell.
        _coverage, ci_frames = calc_ci(X_test, y_test_2d, predict, ci, NUM_ITER)
        for idx, ci_frame in enumerate(ci_frames):
            ci_frame.to_csv(
                f"{proj_dir}/ci/d_{dropout:.02f}_ci_{ci:.02f}_{targets[idx]}.csv",
                index=False,
            )

In [None]:
# For every target, collect the coverage score of each (dropout, ci) pair from
# the per-pair CSVs and write one summary table `ci_<target>.csv`.
# Iterating over `targets` (instead of a hard-coded range(3)) keeps this in
# step with the target list; rows are collected in a list rather than written
# through a manual index counter.
for target in targets:
    rows = []
    for dropout in dropouts:
        for ci in cis:
            path = f"{proj_dir}/ci/d_{dropout:.2f}_ci_{ci:.2f}_{target}.csv"
            ci_frame = pd.read_csv(path)[["real", "lower", "upper", "pred"]]
            covered = (ci_frame["real"] <= ci_frame["upper"]) & (
                ci_frame["real"] >= ci_frame["lower"]
            )
            # Percentage of real values that fall inside [lower, upper].
            score = covered.sum() / len(ci_frame) * 100
            rows.append((dropout, ci, score))
    pd.DataFrame(rows, columns=["dropout", "ci", "score"]).to_csv(
        f"{proj_dir}/ci_{target}.csv", index=False
    )

In [None]:
# Coverage score versus confidence level for PM1, one line per dropout rate.
pm1_ci = pd.read_csv(f'{proj_dir}/ci_pm1.csv')

ax = None
for dropout in dropouts:
    # The dropout values were round-tripped through CSV, so match them with a
    # tolerance instead of exact float equality.
    per_dropout = pm1_ci[np.isclose(pm1_ci["dropout"], dropout)]
    # pandas creates a new Axes when ax is None and reuses it afterwards.
    ax = per_dropout.plot(x="ci", y="score", figsize=(12, 10), ax=ax)

legend_labels = [f"dropout={x:.2f}" for x in dropouts]
ax.legend(legend_labels)

In [None]:
# Prediction, real values and the [lower, upper] band for PM1 at
# dropout=0.55, ci=0.95.
ci_test_df = pd.read_csv(f"{proj_dir}/ci/d_0.55_ci_0.95_pm1.csv")

# One explicit figure/axes shared by all three layers; `plt` comes from the
# notebook's import cell (it was previously used here without being imported).
fig, ax = plt.subplots(figsize=(30, 10))
ci_test_df.plot(y="pred", color="r", ax=ax)
ci_test_df.plot(y="real", color="b", ax=ax)
ax.fill_between(
    ci_test_df.index,
    ci_test_df["lower"],
    ci_test_df["upper"],
    facecolor="green",
    alpha=0.2,
    interpolate=True,
)
plt.show()