In [1]:
import matplotlib as mpl
import sys
import json

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import tensorflow as tf

sys.path.append('../../codes/scripts/particles/')

mpl.rcParams['figure.dpi'] = 200


In [2]:
import data_handler as dh
import metrics
import utils

In [3]:
# Columns the network predicts (indoor PM fractions by name).
outputs = ['PM1', 'PM2.5', 'PM10']
# Feature columns handed to the dataset builder.
inputs = [
    'PM1_2.5_OUT',
    'PM1_2.5_H_OUT',
    'PM2.5_OUT',
    'PM2.5_H_OUT',
    'PM2.5_10_OUT',
    'PM2.5_10_H_OUT',
    'PERSON_NUMBER',
    'AIR_PURIFIER',
    'WINDOW',
    'AIR_CONDITIONER',
    'DOOR',
    'WIND_DEG',
    'HUMIDITY'
]

# Location of the trained model and its artefacts.
model_dir = '../../projects/particle/model'
model_name = 'conv_20'
model_path = model_dir + '/' + model_name

config_path = model_path + "/config.json"
# The context manager closes the handle even when json.load raises.
with open(config_path, "r") as f:
    config = json.load(f)


In [4]:
# Window size from the model config; passed as `in_time_step` to the dataset builder.
in_time_step = config["model"]["window_size"]
# Fixed here rather than read from the config.
out_time_step = 1
# Offset from the model config; also used when trimming rows after prediction.
offset = config["model"]["offset"]

In [5]:
# Stored predictions of the trained model, indexed by the DATE column.
pred_df = pd.read_csv(
    f'{model_dir}/{model_name}/result/predict/predict.csv',
    index_col='DATE',
    parse_dates=True,
)

# One metric table per PM fraction, in the order pm1, pm2.5, pm10.
metric_dfs = [
    pd.read_csv(f'{model_dir}/{model_name}/result/metric/result_{label}.csv', index_col='Metric')
    for label in ['pm1', 'pm2.5', 'pm10']
]

In [6]:
def get_cond_df(_df, cond_in):
    """Return the rows of ``_df`` that match a 5-character condition code.

    The characters of ``cond_in`` map, in order, to the columns
    PERSON_NUMBER, AIR_PURIFIER, AIR_CONDITIONER, WINDOW and DOOR.

    * Position 0: ``'0'`` keeps rows where PERSON_NUMBER == 0, ``'1'`` keeps
      rows where PERSON_NUMBER != 0, anything else applies no filter.
    * Positions 1-4: ``'x'``/``'X'`` applies no filter; any other character
      is converted with ``int`` and matched exactly.

    Args:
        _df: DataFrame containing the five condition columns.
        cond_in: Condition code of length 5, e.g. ``'00000'`` or ``'1x0x0'``.

    Returns:
        A filtered copy; ``_df`` itself is not modified.

    Raises:
        ValueError: If ``cond_in`` does not have exactly five characters.
    """
    cond_cols = ['PERSON_NUMBER', 'AIR_PURIFIER', 'AIR_CONDITIONER', 'WINDOW', 'DOOR']
    if len(cond_in) != len(cond_cols):
        raise ValueError('Invalid condition length')

    cond_df = _df.copy()

    # Occupancy is matched as "nobody" vs. "somebody", not as an exact count.
    # Both branches must look at position 0 of the code.
    if cond_in[0] == '0':
        cond_df = cond_df[cond_df[cond_cols[0]] == 0]
    elif cond_in[0] == '1':
        cond_df = cond_df[cond_df[cond_cols[0]] != 0]

    for col, flag in zip(cond_cols[1:], cond_in[1:]):
        if flag in ('x', 'X'):
            continue
        cond_df = cond_df[cond_df[col] == int(flag)]
    return cond_df

In [None]:
# Plot the measured and predicted PM columns for the rows matching condition '00000'.
_ = utils.plot(get_cond_df(pred_df, '00000'), ['PM1', 'PM2.5', 'PM10', 'PM1_PRED', 'PM2.5_PRED', 'PM10_PRED'])

In [7]:
# Keep only the '00000' condition rows, then split them by the TYPE label.
df = get_cond_df(pred_df, '00000')
train_df, val_df, test_df = (
    df[df['TYPE'] == split_name].copy() for split_name in ('train', 'val', 'test')
)

In [None]:
# NOTE(review): `pms` is rebound to lowercase names by later cells; re-run this cell before
# any cell that expects the uppercase column names.
pms = ['PM1', 'PM2.5', 'PM10']

# Print metrics.calc_r2 for each PM column against its `_PRED` column on the test split.
for pm in pms:
    print(metrics.calc_r2(test_df[pm].values, test_df[pm + '_PRED'].values))

In [None]:
# Plot each PM column together with its `_PRED` counterpart for the test split.
_ = utils.plot(test_df, pms + [x + '_PRED' for x in pms])

# ODE Model

In [None]:
# ODE-model predictions, one CSV per PM fraction.
# Every other project path in this notebook is anchored at '../../projects/...';
# the ODE files are addressed the same way so they resolve from the notebook's directory.
# TODO(review): confirm the ODE outputs live under ../../projects/particle/ode.
ode_dir = '../../projects/particle/ode'
ode_pm1_df = pd.read_csv(f'{ode_dir}/pm1_pred.csv', index_col='DATE', parse_dates=True)
ode_pm25_df = pd.read_csv(f'{ode_dir}/pm2.5_pred.csv', index_col='DATE', parse_dates=True)
ode_pm10_df = pd.read_csv(f'{ode_dir}/pm10_pred.csv', index_col='DATE', parse_dates=True)

# Collect the three ODE prediction columns and rename them so they do not clash
# with the `*_PRED` columns already present in pred_df.
ode_pred_df = pd.concat([ode_pm1_df['PM1_PRED'], ode_pm25_df['PM2.5_PRED'], ode_pm10_df['PM10_PRED']], axis=1)
ode_pred_df.columns = ['ODE_PM1_PRED', 'ODE_PM2.5_PRED', 'ODE_PM10_PRED']

# NOTE: this rebinds `df` (previously the '00000'-filtered frame) to the full
# prediction frame joined with the ODE columns.
df = pd.concat([pred_df, ode_pred_df], axis=1)

# Per-fraction frames restricted to rows where measurement and both predictions exist.
pm1_df = df[['PM1', 'PM1_PRED', 'ODE_PM1_PRED']].dropna()
pm25_df = df[['PM2.5', 'PM2.5_PRED', 'ODE_PM2.5_PRED']].dropna()
pm10_df = df[['PM10', 'PM10_PRED', 'ODE_PM10_PRED']].dropna()

In [None]:
# Summary statistics of the input columns over the fully populated test rows.
tt = df[df['TYPE'] == 'test'].dropna()
tt.describe()[inputs]

In [None]:
# Plot every column of pm1_df (PM1, PM1_PRED, ODE_PM1_PRED).
_ = utils.plot(pm1_df, list(pm1_df.columns))

In [None]:
# Plot every column of pm25_df; unlike the PM1 cell, the columns are passed as an Index rather than a list.
_ = utils.plot(pm25_df, pm25_df.columns)

In [None]:
# Plot every column of pm10_df; unlike the PM1 cell, the columns are passed as an Index rather than a list.
_ = utils.plot(pm10_df, pm10_df.columns)

In [None]:
# Compare the stored model predictions (first argument) with the ODE predictions (second argument),
# one PM fraction per line. The measured columns are not involved here.
print(metrics.calc_r2(pm1_df['PM1_PRED'].values, pm1_df['ODE_PM1_PRED'].values))
print(metrics.calc_r2(pm25_df['PM2.5_PRED'].values, pm25_df['ODE_PM2.5_PRED'].values))
print(metrics.calc_r2(pm10_df['PM10_PRED'].values, pm10_df['ODE_PM10_PRED'].values))

# 시나리오 분석
Cond 00000: No event

1. 각 $PM$의 높낮이에 따른 $p',\,k'$ 값 분석
2. Steady state 가정 후 DL model의 output을 사용

In [8]:
from keras.models import load_model

# Saved network for the configured model name.
model_file = f'{model_path}/result/model/{model_name}.h5'
with tf.device("/gpu:0"):
    model = load_model(model_file)

# Per-component metadata table consumed by the dataset builder; reading a CSV
# does not need the device scope.
meta_df = pd.read_csv(model_path + '/meta.csv', index_col='component')

In [9]:
from datetime import timedelta

def catch_dates(_df, min_length=in_time_step):
    """Find contiguous runs of rows where the PM2.5 columns are all present.

    A row counts as missing when any of ``PM2.5``, ``PM2.5_H_OUT`` or
    ``PM2.5_OUT`` is NaN. Consecutive non-missing rows form a run; a run is
    reported only when it contains at least ``min_length`` rows. The same
    threshold is applied to the run that is still open at the end of the
    frame, so a short trailing run is no longer reported.

    Args:
        _df: DataFrame containing the three PM2.5 columns; its index values
            are used as the reported start/end labels.
        min_length: Minimum number of rows a run must contain.

    Returns:
        DataFrame with columns ``start`` and ``end`` (inclusive index labels),
        one row per accepted run.
    """
    dates = {"start": [], "end": []}

    def _close_run(first, last, length):
        # Record a finished run only when it is long enough to be usable.
        if first is not None and last is not None and length >= min_length:
            dates["start"].append(first)
            dates["end"].append(last)

    start_date = None
    end_date = None
    cnt = 0
    for idx, row in _df.iterrows():
        if np.isnan(row['PM2.5']) or np.isnan(row['PM2.5_H_OUT']) or np.isnan(row['PM2.5_OUT']):
            # A gap ends the current run (if any) and resets the counters.
            _close_run(start_date, end_date, cnt)
            start_date = None
            end_date = None
            cnt = 0
        else:
            if start_date is None:
                start_date = idx
            end_date = idx
            cnt += 1
    # The run still open when the frame ends is held to the same length rule.
    _close_run(start_date, end_date, cnt)
    return pd.DataFrame(dates)

def to_dataset(_df):
    """Build a model dataset from one frame via ``dh.dfs_to_dataset``.

    The frame is wrapped in a single-element list and passed with the
    notebook-level ``meta_df``, ``inputs``, ``outputs`` and window settings.
    The output columns plus the stored ``*_PRED`` and ``TYPE`` columns are
    handed over as ``excludes``.
    """
    excluded_cols = outputs + ['PM1_PRED', 'PM2.5_PRED', 'PM10_PRED', 'TYPE']
    return dh.dfs_to_dataset(
        [_df],
        meta_df,
        inputs,
        outputs,
        in_time_step=in_time_step,
        out_time_step=out_time_step,
        offset=offset,
        excludes=excluded_cols,
    )

def get_dfs(_df):
    """Split ``_df`` into the gap-free segments reported by ``catch_dates``.

    The frame is resampled to a one-minute grid so that missing timestamps
    show up as NaN rows; ``catch_dates`` then returns the start/end labels of
    each usable run, and the matching slices of the original frame are
    returned.

    Only numeric columns are resampled: averaging a string column such as
    ``TYPE`` raises in pandas 2.x. The ``'1min'`` alias replaces ``'1T'``,
    which newer pandas versions deprecate.

    Args:
        _df: Time-indexed DataFrame.

    Returns:
        List of DataFrame slices, one per segment (possibly empty).
    """
    cp_df = _df.copy()
    minute_grid = cp_df.select_dtypes(include='number').resample('1min').mean()
    _dates = catch_dates(minute_grid)

    # .loc slicing with labels is inclusive on both ends.
    return [
        cp_df.loc[seg_start:seg_end]
        for seg_start, seg_end in zip(_dates['start'], _dates['end'])
    ]

def get_train_test_dfs(_df, train_date='2022-07-07 13:45'):
    """Split ``_df`` at ``train_date`` and segment each side with ``get_dfs``.

    Rows up to and including ``train_date`` form the training part; the test
    part starts one minute after ``train_date``.

    Returns:
        Tuple ``(train_segments, test_segments)`` of lists of DataFrames.
    """
    split_at = pd.to_datetime(train_date)
    train_part = _df.loc[:split_at].copy()
    test_part = _df.loc[split_at + timedelta(minutes=1):].copy()
    return get_dfs(train_part), get_dfs(test_part)

def get_predict_data(_dfs):
    """Run the loaded model on each segment and swap its outputs into the frame.

    For every segment the dataset is built with ``to_dataset``, the model is
    evaluated, and the first ``in_time_step + out_time_step + offset - 1``
    rows are dropped (presumably so the remaining rows line up with the
    model output; verify against ``dh.dfs_to_dataset``). Then, for each
    output column:

    * ``<output>_ORG`` receives the value previously stored in
      ``<output>_PRED``;
    * ``<output>`` itself is overwritten with the fresh model prediction.

    Args:
        _dfs: Iterable of segment DataFrames; they are not modified.

    Returns:
        List of new DataFrames, one per input segment.
    """
    res_dfs = []
    n_skip = in_time_step + out_time_step + offset - 1

    for td in _dfs:
        # Only the inputs are needed; the dataset targets are not used here.
        X, _ = to_dataset(td.copy())
        with tf.device("/gpu:0"):
            yhat = model.predict(X, verbose=0)
        # Take an explicit copy so the column assignments below write to an
        # independent frame instead of a slice of another one.
        df_cp = td.iloc[n_skip:].copy()
        for idx, output in enumerate(outputs):
            df_cp[output + '_ORG'] = df_cp[output + '_PRED']
            df_cp[output] = yhat[:, idx]
        res_dfs.append(df_cp)
    return res_dfs

In [None]:
# Segment whatever `df` currently refers to (it is rebound by more than one cell above)
# and replace the PM columns of each segment with model predictions.
train_dfs, test_dfs = get_train_test_dfs(df)

train_de_df = get_predict_data(train_dfs)
test_de_df = get_predict_data(test_dfs)

In [None]:
t_df = df.copy()

# One Gaussian offset per row (mean 5, std 2), shared by all three fractions.
# No random seed is set, so every run draws different offsets.
delta = np.random.normal(loc=5, scale=2, size=len(t_df))

# Shift the *_OUT columns, clipping at zero.
for out_col in ('PM1_OUT', 'PM2.5_OUT', 'PM10_OUT'):
    t_df[out_col] = np.maximum(t_df[out_col] + delta, 0)

# The *_H_OUT columns get a scaled offset. The same factor appears as the
# PM2.5 entry of `w` in the sweep cell, but here it is applied to all fractions.
h_delta = delta * 0.68195107
for hall_col in ('PM1_H_OUT', 'PM2.5_H_OUT', 'PM10_H_OUT'):
    t_df[hall_col] = np.maximum(t_df[hall_col] + h_delta, 0)

train_dfs, test_dfs = get_train_test_dfs(t_df)

train_de_df = get_predict_data(train_dfs)
test_de_df = get_predict_data(test_dfs)

In [None]:
def predict(par, _df, _target):
    """Step a three-parameter model forward over the rows of ``_df``.

    ``par`` is ``[p1, p2, k]``. Starting from the first value of the
    ``_target`` column, each step adds the elapsed time in hours multiplied by
    ``p1 * mean(<target>_OUT) + p2 * mean(<target>_H_OUT) - k * pred / 2``
    (the means are over the two neighbouring rows), then divides the result
    by ``1 + k / 2``.

    NOTE(review): the divisor does not contain the time step; confirm this is
    the intended scheme.

    Args:
        par: Sequence whose first three entries are p1, p2 and k.
        _df: Time-indexed DataFrame with ``_target``, ``_target + '_OUT'`` and
            ``_target + '_H_OUT'`` columns.
        _target: Base column name, e.g. ``'PM2.5'``.

    Returns:
        1-D numpy array with one predicted value per row of ``_df``.
    """
    p1, p2, k = par[0], par[1], par[2]

    out_name = _target + '_OUT'
    hall_name = _target + '_H_OUT'
    n_rows = _df.shape[0]

    pred = np.zeros(n_rows)
    pred[0] = _df[_target].iloc[0]

    for nxt in range(1, n_rows):
        cur = nxt - 1
        delt = (_df.index[nxt] - _df.index[cur]).total_seconds() / 3600.0
        out_term = p1 * (_df[out_name].iloc[nxt] + _df[out_name].iloc[cur]) / 2
        hall_term = p2 * (_df[hall_name].iloc[nxt] + _df[hall_name].iloc[cur]) / 2
        decay_term = k * pred[cur] / 2
        pred[nxt] = pred[cur] + delt * (out_term + hall_term - decay_term)
        pred[nxt] /= (1 + k / 2)

    return pred

def predict_without_hall(par, _df, _target):
    """Step a two-parameter model (no ``_H_OUT`` term) over the rows of ``_df``.

    ``par`` is ``[p1, k]``. Starting from the first value of the ``_target``
    column, each step adds the elapsed time in hours multiplied by
    ``p1 * mean(<target>_OUT)`` (mean over the two neighbouring rows), then
    divides the result by ``1 + k / 2``.

    NOTE(review): unlike ``predict``, there is no ``- k * pred / 2`` term in
    the increment, and the divisor does not contain the time step; confirm
    both are intended.

    Args:
        par: Sequence whose first two entries are p1 and k.
        _df: Time-indexed DataFrame with ``_target`` and ``_target + '_OUT'``.
        _target: Base column name, e.g. ``'PM2.5'``.

    Returns:
        1-D numpy array with one predicted value per row of ``_df``.
    """
    p1, k = par[0], par[1]

    out_name = _target + '_OUT'
    n_rows = _df.shape[0]

    pred = np.zeros(n_rows)
    pred[0] = _df[_target].iloc[0]

    for nxt in range(1, n_rows):
        cur = nxt - 1
        delt = (_df.index[nxt] - _df.index[cur]).total_seconds() / 3600.0
        out_term = p1 * (_df[out_name].iloc[nxt] + _df[out_name].iloc[cur]) / 2
        pred[nxt] = pred[cur] + delt * (out_term)
        pred[nxt] = pred[nxt] / (1 + k / 2)

    return pred

def loss_func(par, _dfs, _target):
    """Objective for the parameter fit: variance of the pooled residuals.

    ``predict`` is used when ``par`` has three entries, otherwise
    ``predict_without_hall``. Predictions and ``_target`` values of all
    segments are concatenated, and ``np.var`` of their difference is returned.
    Because this is a variance, a constant offset between the two series does
    not contribute to the loss.

    Args:
        par: Parameter vector forwarded to the predictor.
        _dfs: Iterable of segment DataFrames.
        _target: Base column name, e.g. ``'PM2.5'``.

    Returns:
        Scalar variance of ``measured - predicted`` over all segments.
    """
    predictor = predict if len(par) == 3 else predict_without_hall

    pred_parts = []
    meas_parts = []
    for segment in _dfs:
        pred_parts.append(predictor(par, segment, _target))
        meas_parts.append(segment[_target].values)

    measured = np.concatenate(meas_parts, axis=None)
    predicted = np.concatenate(pred_parts, axis=None)
    residual = measured - predicted
    return np.var(residual)

def get_results(par, _dfs, _target):
    """Evaluate the fitted model on each segment and return one combined frame.

    ``predict`` is used when ``par`` has three entries, otherwise
    ``predict_without_hall``. The prediction is stored in the
    ``_target + '_PRED'`` column of a copy of each segment, so the caller's
    frames are left untouched.

    Args:
        par: Fitted parameter vector.
        _dfs: Iterable of segment DataFrames.
        _target: Base column name, e.g. ``'PM2.5'``.

    Returns:
        Concatenation of the per-segment result frames.
    """
    predictor = predict if len(par) == 3 else predict_without_hall

    _res_dfs = []
    for _df in _dfs:
        res_df = _df.copy()
        # Both predictors require the target column name as third argument.
        res_df[_target + '_PRED'] = predictor(par, res_df, _target)
        _res_dfs.append(res_df)

    return pd.concat(_res_dfs)


In [None]:
import scipy.optimize as op


# Multipliers applied to the random offset for the PM1 / PM2.5 / PM10 `*_H_OUT` columns, in that order.
w = [0.71440267, 0.68195107, 0.6384208]
# NOTE: rebinds `pms` to lowercase names; earlier cells use the uppercase column names.
pms = ['pm1', 'pm2.5', 'pm10']
# Means of the Gaussian offset added to the outdoor columns.
delta_means = [5, -5, 10, -10, 15, -15]

# Number of random repetitions per (fraction, mean) pair.
n_iter = 100
# Result buffer, one row per repetition; it is reused and fully overwritten for every pair before saving.
res_data = np.zeros((n_iter, 2))

for p_idx, pm in enumerate(pms):
    for dm in delta_means:
        for i in range(0, n_iter):
            t_df = df.copy()

            # No seed is set, so the saved results differ between runs.
            delta = np.random.normal(loc=dm, scale=2, size=len(t_df))

            t_df['PM1_OUT'] = np.maximum(t_df['PM1_OUT'] + delta, 0)
            t_df['PM2.5_OUT'] = np.maximum(t_df['PM2.5_OUT'] + delta, 0)
            t_df['PM10_OUT'] = np.maximum(t_df['PM10_OUT'] + delta, 0)

            t_df['PM1_H_OUT'] = np.maximum(t_df['PM1_H_OUT'] + delta * w[0], 0)
            t_df['PM2.5_H_OUT'] = np.maximum(t_df['PM2.5_H_OUT'] + delta * w[1], 0)
            t_df['PM10_H_OUT'] = np.maximum(t_df['PM10_H_OUT'] + delta * w[2], 0)

            train_dfs, test_dfs = get_train_test_dfs(t_df)

            train_de_df = get_predict_data(train_dfs)
            # test_de_df is not used inside this loop; it only updates the notebook-level variable.
            test_de_df = get_predict_data(test_dfs)
            # Two bounded parameters, so loss_func selects predict_without_hall.
            bounds = [(0, 2)] * 2

            # Default arguments bind the current data and target name into the lambda.
            results = op.shgo(lambda x, y=train_de_df, z=pm.upper(): loss_func(x, y, z), bounds=bounds)
            res_data[i] = results.x.tolist()

        # One file per (fraction, mean); `:02d` counts the sign in the width, so -5 is written as "-5" and 5 as "05".
        np.save(f'../../projects/particle/pk/{pm}_{dm:02d}_pk_res.npy', res_data)

In [None]:
pms = ['pm1', 'pm2.5', 'pm10']
delta_means = [5, -5, 10, -10, 15, -15]
dfs = []

# For every fraction, place the saved two-column result arrays of all offset
# means side by side and label the columns pairwise as <pm>_<mean>_p / <pm>_<mean>_k.
for p_idx, pm in enumerate(pms):
    data = [np.load(f'../../projects/particle/pk/{pm}_{dm:02d}_pk_res.npy') for dm in delta_means]
    cols = [name for x in delta_means for name in (f'{pm}_{x}_p', f'{pm}_{x}_k')]

    dfs.append(pd.DataFrame(np.concatenate(data, axis=1), columns=cols))

In [None]:
def plot_p_hist(_dfs):
    """Draw histograms of every other column of each frame, one frame per row.

    The figure has three stacked axes; frame ``i`` is drawn on axis ``i``
    using the columns at even positions (0, 2, 4, ...).
    """
    fig, axes = plt.subplots(3, figsize=(22, 10))
    for row, frame in enumerate(_dfs):
        even_cols = frame.columns[0::2]
        frame[even_cols].plot(kind='hist', bins=100, ax=axes[row])
    fig.suptitle('''$p'$ distribution''', fontsize=17)

In [None]:
# Draw the histograms for the three per-fraction result frames loaded above.
plot_p_hist(dfs)

In [None]:
# NOTE(review): `pk_df` is not defined in any visible cell, so this fails on a fresh kernel
# unless it is created elsewhere.
ax = pk_df.plot(kind='hist', y=['pm1_p', 'pm25_p', 'pm10_p'], bins=15, figsize=(22, 6))

In [None]:
import scipy.optimize as op

# loss_func requires the target column name; the following cells read it from `target`.
# TODO(review): 'PM2.5' is a placeholder; set this to the fraction this single fit is meant for.
target = 'PM2.5'

# Two bounded parameters, so loss_func selects predict_without_hall.
bounds = [(0, 2)] * 2

results = op.shgo(lambda x, y=train_de_df, z=target: loss_func(x, y, z), bounds=bounds)

In [None]:
# Display the fitted parameter vector.
results.x.tolist()

In [None]:
# get_results needs the target column name as its third argument; `target` is the
# same notebook-level name the plotting and metric cells below rely on.
train_res = get_results(results.x.tolist(), train_de_df, target).dropna()
test_res = get_results(results.x.tolist(), test_de_df, target).dropna()

In [None]:
# Re-evaluate the test segments with a fixed, hard-coded parameter pair; this replaces
# the `test_res` computed from the optimiser result.
# get_results needs the target column name as its third argument.
test_res = get_results([0.201964166, 0.014322877], test_de_df, target).dropna()

In [None]:
# NOTE(review): relies on a notebook-level `target` that no visible cell defines.
_ = utils.plot(train_res, [target, target + '_PRED', target + '_ORG'])

In [None]:
# NOTE(review): relies on a notebook-level `target` that no visible cell defines.
_ = utils.plot(test_res, [target, target + '_PRED', target + '_ORG'])

In [None]:
# metrics.calc_r2 of the target column against its `_PRED` column on the test results.
metrics.calc_r2(test_res[target].values, test_res[target + '_PRED'].values)

In [None]:
# metrics.calc_r2 of the target column against its `_PRED` column on the train results.
metrics.calc_r2(train_res[target].values, train_res[target + '_PRED'].values)

In [None]:
# metrics.calc_corrcoef of the target column against its `_PRED` column on the test results.
metrics.calc_corrcoef(test_res[target].values, test_res[target + '_PRED'].values)

In [None]:
# metrics.calc_fb of the target column against its `_PRED` column on the test results.
metrics.calc_fb(test_res[target].values, test_res[target + '_PRED'].values)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit PM10_H_OUT as a linear function of PM10_OUT (single feature, hence the reshape to a column).
# NOTE(review): `df` is not filtered for NaN here — confirm both columns are complete.
reg = LinearRegression().fit(df['PM10_OUT'].values.reshape(-1, 1), df['PM10_H_OUT'].values)

In [None]:
# Score the regression on the same data it was fitted on.
reg.score(df['PM10_OUT'].values.reshape(-1, 1), df['PM10_H_OUT'].values)

In [None]:
# Show the fitted slope(s) and intercept.
print(f'coef: {reg.coef_}, bias: {reg.intercept_}')