In [None]:
import sys

import numpy as np
import pandas as pd
import json

# Add ../scripts/ to the import search path (presumably where the particle_da /
# particle_utils modules imported below live — confirm).
sys.path.append("../scripts/")

In [None]:
from particle_da import *
from particle_utils import *

In [None]:
# Column-name groups and sequence/batch sizes.
# NOTE(review): TARGET_LABEL and FEATURE_LABEL are not referenced by any cell
# visible in this notebook — presumably they mirror the model's targets and
# input features; confirm or remove.
TARGET_LABEL = ["PM1", "PM2.5", "PM10"]
FEATURE_LABEL = [
    "PM1_2.5_OUT",
    "PM1_2.5_H_OUT",
    "PM2.5_OUT",
    "PM2.5_H_OUT",
    "PM2.5_10_OUT",
    "PM2.5_10_H_OUT",
    "PERSON_NUMBER",
    "AIR_PURIFIER",
    "WINDOW",
    "AIR_CONDITIONER",
    "DOOR",
]

# Columns of interest for inspection; in the visible cells this list is only
# used by the commented-out violin-plot cell.
VIEW_TARGET = [
    "PM1",
    "PM1_OUT",
    "PM1_H_OUT",
    "PM2.5",
    "PM2.5_OUT",
    "PM2.5_H_OUT",
    "PM10",
    "PM10_OUT",
    "PM10_H_OUT",
    "PM1_2.5_OUT",
    "PM1_2.5_H_OUT",
    "PM2.5_10_OUT",
    "PM2.5_10_H_OUT",
]

# NOTE(review): none of these four sizes is used by a cell visible here;
# presumably left over from the training setup — confirm.
WINDOW_SIZE = 30
OFFSET = 0
OUTPUT_SIZE = 1
BATCH_SIZE = 64

# Load the experiment configuration. The context manager closes the file
# handle even when json.load raises (the manual open/close pair did not).
config_file = 'project/GRU/GRUkt01/config.json'
with open(config_file, 'r') as f:
    config = json.load(f)

# Data-related entries of the model configuration.
used_data = config['model']['data']['used_data']
meta = config['model']['data']['meta']

In [None]:
# Load the prediction table from
# <root_dir>/<name><version>/<predict dir>/predict.csv, indexed by the parsed
# DATE column.
df = pd.read_csv(
    '/'.join([
        config['root_dir'],
        config['name'] + config['version'],
        config['dirs']['predict'],
        'predict.csv',
    ]),
    parse_dates=True,
    index_col='DATE',
)

In [None]:
# Split the table by the value of its TYPE column.
train_df, val_df, test_df = (
    df[df["TYPE"] == split_name] for split_name in ("train", "val", "test")
)

In [None]:
# Plot the measured and predicted PM2.5 columns of the test split.
# (`plot` is not defined in this notebook; presumably it comes from the
# star-imported helper modules.)
plot(test_df, ['PM2.5', 'PM2.5_PRED'])

In [None]:
# import matplotlib.patches as mpatches

# df_tmp = df[event_cond]


# def add_label(violin, label):

#     color = violin["bodies"][0].get_facecolor().flatten()

#     return (mpatches.Patch(color=color), label)


# fig, axes = plt.subplots()
# labels = []

# train_df = df_tmp[df_tmp["TYPE"] == "train"]
# val_df = df_tmp[df_tmp["TYPE"] == "val"]
# test_df = df_tmp[df_tmp["TYPE"] == "test"]
# labels.append(add_label(axes.violinplot(train_df[VIEW_TARGET]), "train"))
# labels.append(add_label(axes.violinplot(val_df[VIEW_TARGET]), "val"))
# labels.append(add_label(axes.violinplot(test_df[VIEW_TARGET]), "test"))
# plt.legend(*zip(*labels), loc=2)

In [None]:
# Build a summary table from the three splits, then extend it with
# correlation information for PM2.5.
# NOTE(review): get_data_ratio / add_corr are not defined in this notebook
# (presumably star-imported); their exact output is not visible here.
data_ratio_df = get_data_ratio([train_df, val_df, test_df])
data_ratio_df = add_corr(data_ratio_df, [train_df, val_df, test_df], 'PM2.5')

In [None]:
# Display the summary table built in the previous cell.
data_ratio_df

In [None]:
# Pie charts, one subplot per column, of the first 10 rows of the
# Total / Train / Val / Test columns.
data_ratio_df[['Total', 'Train', 'Val', 'Test']].iloc[:10].plot(kind='pie', subplots=True, figsize=(40, 20))

In [None]:
# Plot indoor, outdoor and predicted series for the rows selected by
# get_cond(df, "01010"). Judging by how other cells use these codes, the digits
# presumably mean person / air purifier / air conditioner / window / door —
# confirm against get_cond.
plot(df[get_cond(df, "01010")], ['PM2.5', 'PM1_2.5', 'PM1_2.5_OUT', 'PM1_2.5_H_OUT', 'PM2.5_10_OUT', 'PM2.5_10_H_OUT', 'PM2.5_PRED'])

In [None]:
import matplotlib.pyplot as plt

# Column names as a 3x3 grid: one row per measurement family (no suffix,
# _OUT, _H_OUT), one entry per particle size.
target_cols = [
    ["PM1", "PM2.5", "PM10"],
    ["PM1_OUT", "PM2.5_OUT", "PM10_OUT"],
    ["PM1_H_OUT", "PM2.5_H_OUT", "PM10_H_OUT"],
]

# One figure per family; each panel is a 100-bin histogram of one column,
# restricted to the rows selected by get_cond(df, "10000").
for row, family in enumerate(target_cols):
    fig, axes = plt.subplots(ncols=3, sharey=True, figsize=(40, 10))
    for col, column_name in enumerate(family):
        target_df = df[get_cond(df, "10000")][column_name]
        ax = target_df.plot(
            kind="hist",
            bins=100,
            # density=1,
            title=f"{column_name}",
            ax=axes[col],
            fontsize=17,
        )
        ax.title.set_size(20)
        ax.set_xlabel("PM", fontsize=18)
        ax.set_ylabel("Frequency", fontsize=18)

In [None]:
def sampling(_df, _targets, sample_size, num_iter, random_state=None):
    """Bootstrap the mean of each target column.

    For each of ``num_iter`` iterations and each column in ``_targets``, draw
    ``sample_size`` values with replacement from that column and record the
    mean of the draw.

    Args:
        _df: DataFrame containing the columns named in ``_targets``.
        _targets: Sequence of column names to resample.
        sample_size: Number of values drawn (with replacement) per sample.
        num_iter: Number of resampling iterations.
        random_state: Optional int seed or ``np.random.RandomState``. When
            given, every draw comes from one shared generator so the result
            is reproducible. ``None`` (default) keeps the previous behaviour
            of drawing from NumPy's global random state.

    Returns:
        ``np.ndarray`` of shape ``(num_iter, len(_targets))`` whose entry
        ``[i, j]`` is the mean of the i-th sample of ``_targets[j]``.
    """
    # A single generator must be shared by all draws: passing the same int
    # seed to every .sample() call would return the same sample each time.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Look each column up once instead of on every iteration.
    columns = [_df[_target] for _target in _targets]

    sample_means = np.zeros((num_iter, len(_targets)))
    for i in range(num_iter):
        for j, column in enumerate(columns):
            sample_means[i, j] = column.sample(
                sample_size, replace=True, random_state=rng
            ).mean()
    return sample_means

In [None]:
# Bootstrap means (100000 samples of size 1000) of the PM2.5 columns for the
# rows selected by get_cond with "10000" and with "00000".
person_smp, no_person_smp = (
    sampling(df[get_cond(df, case_code)], ['PM2.5', 'PM2.5_OUT', 'PM2.5_H_OUT'], 1000, 100000)
    for case_code in ("10000", "00000")
)

# person_smp = pd.read_csv('../data/person_sample.csv')
# no_person_smp = pd.read_csv('../data/no_person_sample.csv')

In [None]:
# Same bootstrap as for PM2.5, applied to the PM1 columns.
person_smp_1, no_person_smp_1 = (
    sampling(df[get_cond(df, case_code)], ['PM1', 'PM1_OUT', 'PM1_H_OUT'], 1000, 100000)
    for case_code in ("10000", "00000")
)

In [None]:
# Compare the PM1 bootstrap means with and without people.
# NOTE(review): on a fresh Restart & Run All this cell raises NameError —
# `plot_sample` and `pm1_labels` are only defined in a later cell of this
# notebook. Move this call below those definitions (or move them up).
plot_sample(person_smp_1, no_person_smp_1, 0.95, pm1_labels, ['people in', 'no person'], 'CASE 00000 vs 10000 with 0.95 CI')

In [None]:
# Air purifier on (case "11000") bootstrap means.
ap_person_on_smp = sampling(df[get_cond(df, "11000")], ['PM2.5', 'PM2.5_OUT', 'PM2.5_H_OUT'], 1000, 100000)
# "Off" baseline: the case "10000" sample. This used to copy
# ac_person_off_smp, which is only assigned in a later cell (as a copy of
# person_smp) and therefore raised NameError on a fresh run.
ap_person_off_smp = np.copy(person_smp)

In [None]:
# Air purifier on (case "01000") vs. the case "00000" baseline.
ap_on_smp = sampling(df[get_cond(df, "01000")], ['PM2.5', 'PM2.5_OUT', 'PM2.5_H_OUT'], 1000, 100000)
ap_off_smp = np.copy(no_person_smp)
# NOTE(review): the next two assignments repeat the previous cell and redo an
# expensive resampling; consider deleting one of the two copies.
ap_person_on_smp = sampling(df[get_cond(df, "11000")], ['PM2.5', 'PM2.5_OUT', 'PM2.5_H_OUT'], 1000, 100000)
# This used to copy ac_person_off_smp, which is only assigned in a later cell
# (as a copy of person_smp) and therefore raised NameError on a fresh run.
ap_person_off_smp = np.copy(person_smp)

# ap_on_smp = pd.read_csv('../data/ap_on_sample.csv')
# ap_off_smp = pd.read_csv('../data/person_sample.csv')

In [None]:
# "Off" baselines reuse the samples already drawn for cases "10000" / "00000".
ac_person_off_smp = np.copy(person_smp)
ac_no_p_off_smp = np.copy(no_person_smp)

# "On" samples: bootstrap means for cases "10100" and "00100".
ac_person_on_smp, ac_no_p_on_smp = (
    sampling(df[get_cond(df, case_code)], ['PM2.5', 'PM2.5_OUT', 'PM2.5_H_OUT'], 1000, 100000)
    for case_code in ("10100", "00100")
)

# ac_person_on_smp = pd.read_csv('../data/ac_on_person_sample.csv')
# ac_person_off_smp = pd.read_csv('../data/ac_off_person_sample.csv')

# ac_no_p_smp = pd.read_csv('../data/ac_on_noperson_sample.csv')
# ac_no_p_off_smp = pd.read_csv('../data/ac_off_noperson_sample.csv')

In [None]:
# "Closed" baselines reuse the samples already drawn for cases "10000" / "00000".
wd_person_closed_smp = np.copy(person_smp)
wd_no_p_closed_smp = np.copy(no_person_smp)

# "Open" samples: bootstrap means for cases "10010" and "00010".
wd_person_open_smp, wd_no_p_open_smp = (
    sampling(df[get_cond(df, case_code)], ['PM2.5', 'PM2.5_OUT', 'PM2.5_H_OUT'], 1000, 100000)
    for case_code in ("10010", "00010")
)

# wd_person_open_smp = pd.read_csv('../data/wd_open_person_sample.csv')
# wd_person_closed_smp = pd.read_csv('../data/wd_closed_person_sample.csv')

# wd_no_p_open_smp = pd.read_csv('../data/wd_open_noperson_sample.csv')
# wd_no_p_closed_smp = pd.read_csv('../data/wd_closed_noperson_sample.csv')

In [None]:
# dr_open_smp = sampling(df[get_cond(df, "xxxx1")], ['PM2.5', 'PM2.5_OUT', 'PM2.5_H_OUT'], 1000, 100000)
# dr_closed_smp = sampling(df[get_cond(df, "xxxx0")], ['PM2.5', 'PM2.5_OUT', 'PM2.5_H_OUT'], 1000, 100000)

# Load cached samples instead of resampling.
# NOTE(review): both variables read the same file (person_sample.csv), so
# "open" and "closed" hold identical data — this looks like a placeholder;
# confirm the intended files. Neither variable is used by a cell visible here.
dr_open_smp = pd.read_csv('../data/person_sample.csv')
dr_closed_smp = pd.read_csv('../data/person_sample.csv')

In [None]:
# Cache the case "10010" bootstrap means as CSV.
# This used to reference `wd_open_smp`, which no cell defines; the target
# file name matches `wd_person_open_smp`, so that array is written instead.
pd.DataFrame(wd_person_open_smp, columns=['PM2.5', 'PM2.5_OUT', 'PM2.5_H_OUT']).to_csv('../data/wd_open_person_sample.csv', index=False)

In [None]:
import matplotlib.pyplot as plt
import scipy.stats as st

# Default font size for matplotlib text.
plt.rcParams['font.size'] = '17'
# Column-name triples passed to plot_sample as per-panel x-axis labels.
pm1_labels = ['PM1', 'PM1_OUT', 'PM1_H_OUT']
pm2_5_labels = ['PM2.5', 'PM2.5_OUT', 'PM2.5_H_OUT']
# NOTE(review): pm10_labels is not used by any cell visible in this notebook.
pm10_labels = ['PM10', 'PM10_OUT', 'PM10_H_OUT']

def get_ci(alpha, data):
    """Return the ``(low, high)`` Student-t interval around the mean of ``data``.

    Args:
        alpha: Confidence level passed to ``scipy.stats.t.interval``.
        data: 1-D sequence of values.

    Returns:
        Tuple ``(low, high)`` centred on ``np.mean(data)``, with
        ``len(data) - 1`` degrees of freedom.
    """
    dof = len(data) - 1
    center = np.mean(data)
    # NOTE(review): ``dof`` is also passed as sem's *delta* degrees of freedom,
    # so the variance divisor is n - (n - 1) = 1 and the scale equals the
    # population standard deviation of ``data``, not the usual standard error
    # (ddof=1). The plotted intervals depend on this; confirm it is intended.
    spread = st.sem(data, ddof=dof)
    return st.t.interval(alpha, dof, loc=center, scale=spread)

def find_nearest_index(arr, value):
    """Return the index of the element of ``arr`` closest to ``value``.

    ``arr`` is converted with ``np.asarray``; on ties the first (lowest)
    index wins, as ``argmin`` does.
    """
    distances = np.abs(np.asarray(arr) - value)
    return distances.argmin()

def get_near_indices(pool, values):
    """Return, for each entry of ``values``, the index of its nearest element in ``pool``.

    The result is a list in the same order as ``values``.
    """
    return [find_nearest_index(pool, target) for target in values]

def moving_average(a, n=3):
    """Return the length-``n`` moving average of ``a``.

    The output has ``len(a) - n + 1`` entries; entry ``k`` is the mean of
    ``a[k:k + n]``, computed from a float cumulative sum.
    """
    totals = np.cumsum(a, dtype=float)
    # Removing the running total from n positions earlier leaves the sum of
    # the most recent n elements.
    trailing = totals[n:] - totals[:-n]
    totals[n:] = trailing
    return totals[n - 1:] / n

def plot_sample_in_ax(data, alpha, label, xlabel, ax):
    """Draw a histogram of ``data`` on ``ax`` and shade the part inside its interval.

    Args:
        data: 1-D values to histogram.
        alpha: Confidence level forwarded to ``get_ci``.
        label: Legend label of the histogram.
        xlabel: Accepted but not used by this function.
        ax: Matplotlib axes to draw on.

    Returns:
        The same ``ax``.
    """
    bounds = get_ci(alpha, data)
    counts, edges, _patches = ax.hist(data, bins=100, label=label, alpha=0.2)
    # Indices of the bin edges closest to the lower and upper interval bounds.
    first, last = get_near_indices(edges, bounds)
    # Averaging adjacent edges gives bin centres, one per count in the slice.
    centers = moving_average(edges[first:last + 2], n=2)
    ax.fill_between(centers, counts[first:last + 1], alpha=0.6)
    return ax

def plot_sample(data_1, data_2, alpha, xlabels, labels, title):
    """Plot two sets of samples side by side and print their means and stds.

    One panel is drawn per entry of ``xlabels``; panel ``i`` overlays the
    histograms (with shaded interval) of column ``i`` of ``data_1`` and
    ``data_2``.

    Args:
        data_1: 2-D array; column ``i`` belongs to ``xlabels[i]``.
        data_2: 2-D array with the same column layout as ``data_1``.
        alpha: Confidence level forwarded to ``plot_sample_in_ax``.
        xlabels: Per-panel x-axis labels; their count sets the panel count.
        labels: Two legend labels, for ``data_1`` and ``data_2`` respectively.
        title: Figure title.

    Returns:
        None. Draws a figure and prints the summary to stdout.
    """
    # The panel count used to be hard-coded to 3 while the summary loops below
    # used len(xlabels); both now follow len(xlabels). squeeze=False keeps
    # `axes` two-dimensional so a single panel can be indexed the same way.
    fig, axes = plt.subplots(nrows=1, ncols=len(xlabels), figsize=(30, 10), squeeze=False)

    for i, (xlabel, ax) in enumerate(zip(xlabels, axes[0])):
        plot_sample_in_ax(data_1[:, i], alpha, labels[0], xlabel, ax)
        plot_sample_in_ax(data_2[:, i], alpha, labels[1], xlabel, ax)
        ax.title.set_size(20)
        ax.set_xlabel(xlabel, fontsize=18)
        ax.set_ylabel("Frequency", fontsize=18)
        ax.legend()
    fig.suptitle(title, fontsize=22)

    print('=== MEAN ===')
    for i in range(len(xlabels)):
        print(f'{xlabels[i]}: {data_1[:, i].mean():.3f}, {data_2[:, i].mean():.3f}')
    print('=== STD ===')
    for i in range(len(xlabels)):
        print(f'{xlabels[i]}: {data_1[:, i].std():.3f}, {data_2[:, i].std():.3f}')

In [None]:
# PM2.5 bootstrap means: case "10000" sample vs. case "00000" sample.
plot_sample(person_smp, no_person_smp, 0.95, pm2_5_labels, ['people in', 'no person'], 'CASE 00000 vs 10000 with 0.95 CI')

In [None]:
# PM2.5 bootstrap means: case "01000" sample vs. the case "00000" baseline.
plot_sample(ap_on_smp, ap_off_smp, 0.95, pm2_5_labels, ['ap on', 'ap off'], 'CASE 00000 vs 01000 with 0.95 CI')

In [None]:
# PM2.5 bootstrap means: case "11000" sample vs. its "ap off" baseline.
plot_sample(ap_person_on_smp, ap_person_off_smp, 0.95, pm2_5_labels, ['ap on', 'ap off'], 'CASE 10000 vs 11000 with 0.95 CI')

In [None]:
# PM2.5 bootstrap means: case "10100" sample vs. the case "10000" baseline.
plot_sample(ac_person_on_smp, ac_person_off_smp, 0.95, pm2_5_labels, ['ac on', 'ac off'], 'CASE 10000 vs 10100 with 0.95 CI')

In [None]:
# PM2.5 bootstrap means: case "00100" sample vs. the case "00000" baseline.
plot_sample(ac_no_p_on_smp, ac_no_p_off_smp, 0.95, pm2_5_labels, ['ac on', 'ac off'], 'CASE 00000 vs 00100 with 0.95 CI')

In [None]:
# PM2.5 bootstrap means: case "10010" sample vs. the case "10000" baseline.
plot_sample(wd_person_open_smp, wd_person_closed_smp, 0.95, pm2_5_labels, ['window open', 'window closed'], 'CASE 10000 vs 10010 with 0.95 CI')

In [None]:
# PM2.5 bootstrap means: case "00010" sample vs. the case "00000" baseline.
plot_sample(wd_no_p_open_smp, wd_no_p_closed_smp, 0.95, pm2_5_labels, ['window open', 'window closed'], 'CASE 00000 vs 00010 with 0.95 CI')

In [None]:
import matplotlib.pyplot as plt

# Column names as a 3x3 grid: one row per measurement family (no suffix,
# _OUT, _H_OUT), one entry per particle size.
target_cols = [
    ["PM1", "PM2.5", "PM10"],
    ["PM1_OUT", "PM2.5_OUT", "PM10_OUT"],
    ["PM1_H_OUT", "PM2.5_H_OUT", "PM10_H_OUT"],
]

# One figure per family; each panel is a 100-bin histogram of one column,
# restricted to the rows selected by get_cond(df, "00000").
for row, family in enumerate(target_cols):
    fig, axes = plt.subplots(ncols=3, sharey=True, figsize=(40, 10))
    for col, column_name in enumerate(family):
        target_df = df[get_cond(df, "00000")][column_name]
        ax = target_df.plot(
            kind="hist",
            bins=100,
            # density=1,
            title=f"{column_name}",
            ax=axes[col],
            fontsize=17,
        )
        ax.title.set_size(20)
        ax.set_xlabel("PM", fontsize=18)
        ax.set_ylabel("Frequency", fontsize=18)

In [None]:
import matplotlib.pyplot as plt

# Column names as a 3x3 grid: one row per measurement family (no suffix,
# _OUT, _H_OUT), one entry per particle size.
target_cols = [
    ["PM1", "PM2.5", "PM10"],
    ["PM1_OUT", "PM2.5_OUT", "PM10_OUT"],
    ["PM1_H_OUT", "PM2.5_H_OUT", "PM10_H_OUT"],
]

# One figure per family; each panel is a 100-bin histogram of one column,
# restricted to the rows selected by get_cond(df, "11110").
for row, family in enumerate(target_cols):
    fig, axes = plt.subplots(ncols=3, sharey=True, figsize=(40, 10))
    for col, column_name in enumerate(family):
        target_df = df[get_cond(df, "11110")][column_name]
        ax = target_df.plot(
            kind="hist",
            bins=100,
            # density=1,
            title=f"{column_name}",
            ax=axes[col],
            fontsize=17,
        )
        ax.title.set_size(20)
        ax.set_xlabel("PM", fontsize=18)
        ax.set_ylabel("Frequency", fontsize=18)

In [None]:
import matplotlib.pyplot as plt

# Splits to compare, with the names used in the panel titles.
target_dfs = [train_df, val_df, test_df]
dfs_indices = ["train", "val", "test"]
# Alternative column sets kept for quick switching:
# target_cols = ["PM1", "PM2.5", "PM10"]
# target_cols = ["PM1_OUT", "PM2.5_OUT", "PM10_OUT"]
# target_cols = ["PM1_H_OUT", "PM2.5_H_OUT", "PM10_H_OUT"]
# target_cols = ["PM1_2.5_OUT", "PM1_2.5_OUT", "PM1_2.5_OUT"]
target_cols = ["PM2.5", "PM1_2.5_OUT", "PM1_2.5_H_OUT", "PM2.5_10_OUT", "PM2.5_10_H_OUT"]


# One figure per column; its three panels show train / val / test, each
# restricted to the rows that get_cond(<split>, "00000") selects.
for row, column_name in enumerate(target_cols):
    fig, axes = plt.subplots(ncols=3, sharex=True, sharey=True, figsize=(40, 10))
    for col, (split_df, split_name) in enumerate(zip(target_dfs, dfs_indices)):
        target_df = split_df[get_cond(split_df, "00000")][column_name]
        ax = target_df.plot(
            kind="hist",
            bins=100,
            # density=1,
            title=f"{column_name} {split_name.upper()}",
            ax=axes[col],
            fontsize=17,
        )
        ax.title.set_size(20)
        ax.set_xlabel("PM", fontsize=18)
        ax.set_ylabel("Frequency", fontsize=18)

In [None]:
def calc_metric(_f, real, pred):
    """Apply the metric callable ``_f`` to ``(real, pred)`` and return its result."""
    score = _f(real, pred)
    return score

def print_metric(_df):
    """Print one metrics table per particle size (PM1, PM2.5, PM10).

    For each size the measured column (e.g. ``PM2.5``) is compared with its
    ``_PRED`` counterpart using calc_r2, calc_nmse, calc_fb, calc_b and
    calc_corrcoef, over all rows ("Total") and over the rows whose ``TYPE``
    is ``train``, ``val`` or ``test``.

    Args:
        _df: DataFrame with a ``TYPE`` column plus the measured and ``_PRED``
            columns for each particle size.
    """
    metric_fns = (calc_r2, calc_nmse, calc_fb, calc_b, calc_corrcoef)
    metric_names = ["R Square", "NMSE", "FB", "B", "Corr"]
    split_names = ("Train", "Val", "Test")

    for pm in ("pm1", "pm2.5", "pm10"):
        print(f"======== {pm} prediction results ========")
        real_col = pm.upper()
        pred_col = real_col + "_PRED"

        # "Total" covers every row; each split keeps the rows whose TYPE
        # equals the lower-cased split name.
        subsets = {"Total": _df}
        for split in split_names:
            subsets[split] = _df[_df['TYPE'] == split.lower()]

        table = {"Metric": metric_names}
        for split, subset in subsets.items():
            table[split] = [
                calc_metric(fn, subset[real_col].values, subset[pred_col].values)
                for fn in metric_fns
            ]

        with pd.option_context('display.float_format', '{:0.03f}'.format):
            print(pd.DataFrame(table))

In [None]:
# Metrics restricted to the rows selected by get_cond(df, "11110").
print_metric(df[get_cond(df, "11110")])

In [None]:
import matplotlib.pyplot as plt

# Splits to compare, and the occupancy / device-state columns to histogram.
target_dfs = [train_df, val_df, test_df]
dfs_indices = ["train", "val", "test"]
target_cols = ["PERSON_NUMBER", "WINDOW", "DOOR", "AIR_CONDITIONER", "AIR_PURIFIER"]

# A single figure: one row per column, one panel per split.
fig, axes = plt.subplots(nrows=len(target_cols), ncols=3, figsize=(40, 60))
for row, column_name in enumerate(target_cols):
    for col, (split_df, split_name) in enumerate(zip(target_dfs, dfs_indices)):
        target_df = split_df[column_name]
        ax = target_df.plot(
            kind="hist",
            bins=15,
            title=f"{column_name} {split_name.upper()}",
            ax=axes[row][col],
            fontsize=17,
        )
        ax.title.set_size(20)
        ax.set_xlabel(column_name, fontsize=18)
        ax.set_ylabel("Frequency", fontsize=18)

In [None]:
import matplotlib.pyplot as plt

# Splits to compare and the outdoor (_OUT) columns to histogram.
target_dfs = [train_df, val_df, test_df]
dfs_indices = ["train", "val", "test"]
target_cols = ["PM1_OUT", "PM2.5_OUT", "PM10_OUT"]

# One figure per column; its three panels show train / val / test.
for row, column_name in enumerate(target_cols):
    fig, axes = plt.subplots(ncols=3, sharex=True, sharey=True, figsize=(40, 10))
    for col, (split_df, split_name) in enumerate(zip(target_dfs, dfs_indices)):
        # This used to index with `cond_11100`, which no cell defines. The
        # mask is now built per split with get_cond, as the "00000" per-split
        # histograms do, so it is aligned with the frame it filters.
        target_df = split_df[get_cond(split_df, "11100")][column_name]
        ax = target_df.plot(
            kind="hist",
            bins=100,
            density=1,
            title=f"{column_name} {split_name.upper()}",
            ax=axes[col],
            fontsize=17,
            # xlim=(0, 14),
            # ylim=(0, 0.3),
        )
        ax.title.set_size(20)
        ax.set_xlabel("PM", fontsize=18)
        # density=1 normalises the histogram, so the axis shows a density,
        # not a count.
        ax.set_ylabel("Density", fontsize=18)

In [None]:
import matplotlib.pyplot as plt

# Splits to compare and the _H_OUT columns to histogram.
target_dfs = [train_df, val_df, test_df]
dfs_indices = ["train", "val", "test"]
target_cols = ["PM1_H_OUT", "PM2.5_H_OUT", "PM10_H_OUT"]

# One figure per column; its three panels show train / val / test.
for row, column_name in enumerate(target_cols):
    fig, axes = plt.subplots(ncols=3, sharex=True, sharey=True, figsize=(40, 10))
    for col, (split_df, split_name) in enumerate(zip(target_dfs, dfs_indices)):
        # This used to index with `cond_11100`, which no cell defines. The
        # mask is now built per split with get_cond, as the "00000" per-split
        # histograms do, so it is aligned with the frame it filters.
        target_df = split_df[get_cond(split_df, "11100")][column_name]
        ax = target_df.plot(
            kind="hist",
            bins=100,
            density=1,
            title=f"{column_name} {split_name.upper()}",
            ax=axes[col],
            fontsize=17,
            # xlim=(0, 40),
            # ylim=(0, 0.3),
        )
        ax.title.set_size(20)
        ax.set_xlabel("PM", fontsize=18)
        # density=1 normalises the histogram, so the axis shows a density,
        # not a count.
        ax.set_ylabel("Density", fontsize=18)

In [None]:
# Load the GRU05 prediction table and index it by its parsed DATE column.
# pd.to_datetime converts the whole column in one vectorized call; the
# previous element-wise .apply(pd.to_datetime) parsed every row separately.
ddd = pd.read_csv("project/GRU/GRU05/result/predict/predict.csv")
ddd.index = pd.to_datetime(ddd.pop("DATE"))

In [None]:
# Measured and predicted PM2.5 values of the rows of `ddd` whose timestamp
# also appears in the test split's index.
real, pred = (
    ddd.loc[ddd.index.isin(test_df.index), column_name].values
    for column_name in ("PM2.5", "PM2.5_PRED")
)

In [None]:
# Plot PM2.5, its prediction and the person count for rows where WINDOW is
# non-zero and the four other state columns are zero.
plot(
    ddd[
        ddd["PERSON_NUMBER"].eq(0)
        & ddd["AIR_PURIFIER"].eq(0)
        & ddd["AIR_CONDITIONER"].eq(0)
        & ddd["WINDOW"].ne(0)
        & ddd["DOOR"].eq(0)
    ],
    ["PM2.5", "PM2.5_PRED", "PERSON_NUMBER"],
)

In [None]:
# Download the raw particle logs for the listed machines (needs network access).
raw_particle_df = pd.read_csv(
    "http://api.khu-cpfd.com:9019/v1/logs/file/particle?machine=107,120,121,124,134,199"
)
# Index by DATE floored to the minute. pd.to_datetime parses the column in one
# vectorized call instead of row by row, and "min" replaces the "T" frequency
# alias, which recent pandas versions deprecate.
raw_particle_df.index = pd.to_datetime(raw_particle_df.pop("DATE")).dt.floor("min")

In [None]:
# One frame per machine id, selected on the MACHINE column.
df_107, df_120, df_121, df_124, df_134, df_199 = (
    raw_particle_df[raw_particle_df["MACHINE"] == machine_id]
    for machine_id in (107, 120, 121, 124, 134, 199)
)

In [None]:
# Plot the PM1 / PM2.5 / PM10 columns of machine 134's rows.
plot(df_134, ["PM1", "PM2.5", "PM10"])