In [None]:
from pyDOE import lhs

import pandas as pd
import tensorflow as tf
import numpy as np

In [None]:
import sys
sys.path.append("../scripts/particles/")

import data_handler as dh
import metrics
import utils

In [None]:
# Target columns and feature columns used throughout the notebook.
outputs = ['PM1', 'PM2.5', 'PM10']
inputs = [
    'PM1_2.5_OUT', 'PM1_2.5_H_OUT',
    'PM2.5_OUT', 'PM2.5_H_OUT',
    'PM2.5_10_OUT', 'PM2.5_10_H_OUT',
    'PERSON_NUMBER', 'AIR_PURIFIER', 'WINDOW', 'AIR_CONDITIONER', 'DOOR',
    # 'TEMPERATURE',
    # 'WIND_SPEED',
    'WIND_DEG', 'HUMIDITY',
]

# (start, end) timestamps of the periods handed to dh.trim_df further down.
dates = [
    {"start": period_start, "end": period_end}
    for period_start, period_end in (
        ("2022-05-07 09:40", "2022-05-17 08:38"),
        ("2022-05-17 11:25", "2022-05-30 23:26"),
        ("2022-06-01 22:40", "2022-07-02 07:00"),
        ("2022-07-02 16:40", "2022-07-09 07:13"),
        ("2022-07-09 14:30", "2022-07-12 10:00"),
        ("2022-07-25 12:00", "2022-08-01 10:00"),
        ("2022-08-03 09:00", "2022-08-11 22:18"),
        ("2022-08-12 12:14", "2022-08-20 00:00"),
        ("2022-08-20 09:38", "2022-09-01 00:00"),
    )
]

# Smoothing settings and split fractions (train gets whatever is left over).
moving_average_window, moving_average_method = 20, 'mean'
val_size, test_size = 0.15, 0.25
train_size = 1 - val_size - test_size

In [None]:
# Weather covariates indexed by the parsed DATE column.
weather_df = pd.read_csv(
    '../../storage/particle/weather.csv', index_col='DATE', parse_dates=True
)[['TEMPERATURE', 'WIND_DEG', 'WIND_SPEED', 'HUMIDITY']]
# Wind direction is given in degrees; store sin(direction) instead.
weather_df['WIND_DEG'] = np.sin(weather_df['WIND_DEG'].values * np.pi / 180)

df_org = dh.load_data("../../storage/particle/data.csv")
df_org = dh.add_pm_diff(df_org)

# Columns that are kept out of the moving average and re-attached afterwards.
excludes = ['PERSON_NUMBER', 'AIR_PURIFIER', 'AIR_CONDITIONER', 'WINDOW', 'DOOR']
df = dh.apply_moving_average(pd.concat([df_org, weather_df], axis=1),
                             window=moving_average_window,
                             method=moving_average_method,
                             excludes=excludes)
df = pd.concat([df, df_org[excludes]], axis=1)
# `fillna(method='ffill')` is deprecated in recent pandas; `.ffill()` is the
# equivalent forward fill.
df[excludes] = df[excludes].ffill()
# Re-bind instead of mutating in place so the cell stays re-runnable.
df = df.dropna()

dfs = dh.trim_df(df, dates)
train_dfs, val_dfs, test_dfs = dh.train_test_split_df(dfs, val_size, test_size)
# Summary statistics of the training split only; passed to dh.dfs_to_dataset later.
meta_df = pd.concat(train_dfs).describe()

In [None]:
# ax = df[(df.index >= pd.to_datetime('2022-07-09 23:00')) & (df.index <= pd.to_datetime('2022-07-12'))].resample('T').first().fillna(value=np.nan).plot(kind='line', y=['PM2.5', 'PM2.5_OUT', 'PM2.5_H_OUT'], figsize=(18, 12), fontsize=17)
# ax.set_ylabel('PM2.5 $\mu g m^3$', fontsize=17)
# ax.set_xlabel('Date', fontsize=17)
# ax.legend(fontsize=17)

In [None]:
# Discrete candidate values shared by every architecture.
basic_params = {
    "window_size": [5, 12, 16, 30, 60],
    "pool_size": list(range(2, 6)),
    "pool_strides": list(range(1, 4)),
    "dense": {
        "units": list(range(16, 241, 32)),
        "dropout": np.arange(0, 0.5+0.05, 0.05),
        "leaky_relu": np.arange(0, 0.5+0.05, 0.05),
    },
    "batch_size": list(range(32, 257, 32)),
    "lr": [0.001, 0.0001, 0.00001],
}

# Candidates for up to two convolution layers; a `None` filter count is a
# candidate for the second layer only.
conv_params = {
    "conv_0": {
        "filters": list(range(16, 241, 32)),
        "kernel_size": list(range(3, 8, 2)),
        "strides": list(range(1, 4)),
    },
    "conv_1": {
        "filters": [None] + list(range(16, 241, 32)),
        "kernel_size": list(range(3, 8, 2)),
        "strides": list(range(1, 4)),
    },
}

# Candidates for an optional convolution followed by up to two recurrent layers.
rnn_params = {
    "conv_0": {
        "activated": [True, False],
        "filters": [None] + list(range(16, 241, 32)),
        "kernel_size": list(range(3, 8, 2)),
        "strides": list(range(1, 4)),
    },
    "rnn_0": {
        "layer": ['naive', 'lstm', 'gru'],
        "units": list(range(16, 241, 32)),
        "dropout": np.arange(0, 0.5+0.05, 0.05),
    },
    "rnn_1": {
        "layer": ['naive', 'lstm', 'gru'],
        "units": [None] + list(range(16, 241, 32)),
        "dropout": np.arange(0, 0.5+0.05, 0.05),
    },
}

In [None]:
# Learning rates addressed by index: the 1-argument model_builder looks up
# lr_val[p_dt["lr"]], so the sampled "lr" value below is an index into this list.
lr_val = [0.001, 0.0001, 0.00001]

# NOTE: this cell rebinds `basic_params` / `conv_params`, replacing the grid
# definitions from the previous cell. Each leaf is now a [min, max] pair: the
# scaling cell reads element 0 as the lower and element 1 as the upper bound.
# NOTE(review): the scaled samples are later truncated with int() in li_to_dt,
# so the upper bound itself (e.g. index 2 of lr_val) looks unreachable if the
# unit-cube samples are strictly below 1 — confirm this is intended.
basic_params = {
    "window_size": [12, 60],
    "pool_size": [2, 6],
    "pool_strides": [1, 4],
    "dense": {
        "units": [32, 256],
        "dropout": [0, 0.5],
        "leaky_relu": [0, 0.5],
    },
    "batch_size": [32, 256],
    "lr": [0, 2],  # index range into lr_val, not a learning rate
}

conv_params = {
    "conv": {
        "filters": [32, 256],
        "kernel_size": [3, 7],
        "strides": [0, 3],  # a sampled 0 is special-cased by the 1-argument model_builder
    },
}

In [None]:
def to_dataset(_dfs, in_time_step):
    """Turn a list of frames into a dataset via ``dh.dfs_to_dataset``.

    The notebook-level ``meta_df``, ``inputs`` and ``outputs`` are passed
    through unchanged; only ``in_time_step`` varies between calls.
    """
    dataset = dh.dfs_to_dataset(_dfs, meta_df, inputs, outputs, in_time_step=in_time_step)
    return dataset


# One cache per split, keyed by the window size rendered as a string.
train_ds, val_ds, test_ds = {}, {}, {}

for win_size in basic_params["window_size"]:
    for _cache, _split_dfs in ((train_ds, train_dfs), (val_ds, val_dfs), (test_ds, test_dfs)):
        _cache[str(win_size)] = to_dataset(_split_dfs, win_size)

In [None]:
def get_param_len(param):
    """Count the leaf entries of a (possibly nested) parameter dict.

    Values that are plain dicts are descended into; every other value counts
    as one entry.
    """
    return sum(
        get_param_len(value) if type(value) == dict else 1
        for value in param.values()
    )

def get_param_len_list(param):
    """Return ``len()`` of every leaf value, in dict iteration order.

    Nested plain dicts are flattened depth-first into the same list.
    """
    lengths = []
    for value in param.values():
        if type(value) == dict:
            lengths.extend(get_param_len_list(value))
        else:
            lengths.append(len(value))
    return lengths

def get_param_keys(param):
    """Return the leaf keys of a (possibly nested) parameter dict.

    Keys whose value is a plain dict are not reported themselves; their
    children are flattened depth-first into the result instead, mirroring the
    traversal order of ``get_param_len_list``.
    """
    keys = []
    for p in param.keys():
        if type(param[p]) == dict:
            # Recurse with this function: the previous code delegated to
            # get_param_len_list and therefore appended lengths, not keys.
            keys += get_param_keys(param[p])
        else:
            keys.append(p)
    return keys

def get_samples(param, n_samples):
    """Draw ``n_samples`` maximin Latin-hypercube points.

    The dimensionality is the number of leaf entries in ``basic_params`` plus
    the number of leaf entries in ``param``.
    """
    n_dim = sum(get_param_len(space) for space in (basic_params, param))
    return lhs(n_dim, n_samples, 'maximin')

def smp_to_indices(sample, param):
    """Map unit-interval samples to integer indices per dimension.

    Column ``i`` is multiplied by the number of candidates of the i-th leaf
    (``basic_params`` first, then ``param``) and floored. Prints an error and
    returns ``None`` when the column count does not match.
    """
    n_choices_per_dim = get_param_len_list(basic_params) + get_param_len_list(param)
    if sample.shape[1] != len(n_choices_per_dim):
        print('[ERROR] invalid shape')
        return None
    indices = np.empty(sample.shape, dtype=np.int32)
    for col, n_choices in enumerate(n_choices_per_dim):
        # floor() yields whole numbers, so the cast to int32 is exact
        indices[:, col] = np.floor(sample[:, col] * n_choices)
    return indices

def dict_cat(param):
    """Return a new dict holding ``basic_params`` followed by ``param``.

    On a key clash the value from ``param`` wins. Values are shared with the
    source dicts, not copied.
    """
    return {**basic_params, **param}

def get_smp_values(param, _indices):
    """Resolve one row of sampled indices into concrete parameter values.

    ``param`` is merged behind ``basic_params`` with ``dict_cat``; the merged
    dict is walked depth-first and each leaf consumes the next entry of
    ``_indices`` as an index into its candidate list. The result mirrors the
    nesting of the merged dict.

    Nested dicts of any depth are supported, matching ``get_param_len_list``.
    Raises IndexError when ``_indices`` has fewer entries than there are leaves.
    """
    cursor = 0

    def _pick(space):
        nonlocal cursor
        picked = {}
        for name, choices in space.items():
            if type(choices) == dict:
                picked[name] = _pick(choices)
            else:
                picked[name] = choices[_indices[cursor]]
                cursor += 1
        return picked

    return _pick(dict_cat(param))

In [None]:
# Display the total number of leaf parameters across both search spaces.
get_param_len(basic_params) + get_param_len(conv_params)

In [None]:
# Draw 10000 maximin Latin-hypercube points in 11 dimensions.
# NOTE(review): 11 is hard-coded; presumably it must equal the leaf count
# displayed by the previous cell — confirm when the search space changes.
cubic_data = lhs(11, 10000, 'maximin')
# cubic_data = np.load('cubic.npy')

In [None]:
# Persist the raw (still unscaled) samples.
np.save('../../storage/particle/lhs_conv.npy', cubic_data)

In [None]:
# Keep an untouched copy; cubic_data is rescaled in place by a later cell.
cubic_org = np.copy(cubic_data)

In [None]:
# Flatten both search spaces into one ordered mapping; entries of a nested
# dict are stored under "<outer>_<inner>".
param_dict = {}

for _space in (basic_params, conv_params):
    for k in _space.keys():
        if type(_space[k]) == dict:
            for k2 in _space[k]:
                param_dict[k+'_'+k2] = _space[k][k2]
        else:
            param_dict[k] = _space[k]

In [None]:
# Rescale every unit-interval column to the [min, max] range of its parameter.
# Always start from the pristine copy (cubic_org) so that re-running this cell
# does not rescale already-scaled values a second time.
assert cubic_org.shape[1] == len(param_dict), \
    'number of sampled dimensions must match the number of parameters'
cubic_data = np.copy(cubic_org)
for i, k in enumerate(param_dict.keys()):
    p_min = param_dict[k][0]
    p_max = param_dict[k][1]
    cubic_data[:, i] = cubic_org[:, i]*(p_max - p_min) + p_min

In [None]:
# Parent directory for all sampling runs.
# NOTE: the training loops further down rebind `root_dir` to a per-trial
# directory, so re-run this cell before creating a new project directory.
root_dir = '../../projects/particle/lhs_opt'

In [None]:
# Draw a fresh set of 10000 samples sized for basic_params + conv_params.
# NOTE(review): `new_conv_smp` is not referenced by any other visible cell.
new_conv_smp = get_samples(conv_params, 10000)
# new_conv_idc = smp_to_indices(new_conv_smp, conv_params)

In [None]:
import datetime as dt
import os

# conv_smp = get_samples(conv_params, 1024)
# conv_idc = smp_to_indices(conv_smp, conv_params)

# rnn_smp = get_samples(rnn_params, 1024)
# rnn_idc = smp_to_indices(rnn_smp, rnn_params)

# Each run gets its own directory named after the current minute;
# os.makedirs raises if that directory already exists.
_run_stamp = dt.datetime.now().strftime('%Y-%m-%d_%H:%M')
proj_dir = f"{root_dir}/{_run_stamp}"
os.makedirs(proj_dir)

# Store the scaled samples next to the trial outputs.
np.save(f'{proj_dir}/conv_idc.npy', cubic_data)
# np.save(f'{proj_dir}/rnn_idc.npy', rnn_idc)

In [None]:
# Resume from a previously created run: this hard-coded path replaces the
# timestamped proj_dir from the cell above.
proj_dir = '../../projects/particle/lhs_opt/2022-09-10_11:43'
conv_idc = np.load(f'{proj_dir}/conv_idc.npy')
# rnn_idc = np.load(f'{proj_dir}/rnn_idc.npy')

In [None]:
from tensorflow.keras import Model
from tensorflow.keras.layers import (
    GRU,
    LSTM,
    SimpleRNN,
    Conv1D,
    Dense,
    Dropout,
    Flatten,
    Input,
    LeakyReLU,
    MaxPooling1D,
)
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam

def build_conv_layer(param, _input):
    """Stack one Conv1D per ``conv*`` entry of ``param`` on top of ``_input``.

    Keys are matched on the text before the first underscore. Entries whose
    ``"filters"`` is ``None`` are skipped, as are all non-conv keys. Returns
    the output of the last layer, or ``_input`` when none was added.
    """
    tensor = _input
    for name, spec in param.items():
        if str(name).split('_')[0] != 'conv':
            continue
        if spec["filters"] is None:
            continue
        conv = Conv1D(
            spec["filters"],
            kernel_size=spec["kernel_size"],
            strides=spec["strides"],
            padding='same',
            activation='relu',
            kernel_initializer='he_uniform',
        )
        tensor = conv(tensor)
    return tensor

def build_rnn_layer(param, _input):
    """Stack the optional Conv1D and the recurrent layers described by ``param``.

    Keys are matched on the text before the first underscore:

    * ``conv*`` entries add a Conv1D when ``"activated"`` is truthy and
      ``"filters"`` is not ``None``.
    * ``rnn*`` entries add a SimpleRNN (``'naive'``), LSTM (``'lstm'``) or GRU
      (``'gru'``) when ``"units"`` is not ``None``; unknown layer names are
      ignored.

    Every other key is ignored, so a full parameter dict (with entries such as
    ``"window_size"`` or ``"lr"``) can be passed. Returns the output of the
    last layer, or ``_input`` when no layer was added.
    """
    rnn_classes = {'naive': SimpleRNN, 'lstm': LSTM, 'gru': GRU}
    x = _input
    for p in param.keys():
        # Only the prefix is needed. The previous code also parsed the suffix
        # with int(), which raised on keys such as 'window_size' or 'lr'.
        layer_type = str(p).split('_')[0]
        if layer_type == 'conv':
            spec = param[p]
            # `None` filters means "no layer", as in build_conv_layer.
            if not spec["activated"] or spec["filters"] is None:
                continue
            x = Conv1D(spec["filters"],
                       kernel_size=spec["kernel_size"],
                       kernel_initializer='he_uniform',
                       activation='relu',
                       strides=spec["strides"],
                       padding='same')(x)
        elif layer_type == 'rnn':
            spec = param[p]
            rnn_cls = rnn_classes.get(spec["layer"])
            # `None` units is a candidate value in rnn_params; treated here as
            # "skip this layer".
            if rnn_cls is None or spec["units"] is None:
                continue
            x = rnn_cls(units=spec["units"],
                        dropout=spec["dropout"],
                        activation='tanh',
                        kernel_initializer='glorot_uniform',
                        return_sequences=True)(x)
    return x

def model_builder(p, input_shape, output_size, layer_type='conv'):
    """Build and compile a model from a nested parameter dict.

    Args:
        p: parameter dict providing ``"pool_size"``, ``"pool_strides"``,
            ``"lr"``, a ``"dense"`` sub-dict (``"units"``, ``"leaky_relu"``,
            ``"dropout"``) and the layer entries read by the feature builders.
        input_shape: shape passed to ``Input``.
        output_size: number of units of the final Dense layer.
        layer_type: ``'conv'`` uses ``build_conv_layer``, ``'rnn'`` uses
            ``build_rnn_layer``; any other value adds no feature layers.

    Returns:
        The compiled model (MSE loss, Adam optimizer, RMSE metric).

    NOTE: a later cell defines another ``model_builder`` taking a single
    argument; whichever cell ran last is the one in effect.
    """
    input_tensor = Input(shape=input_shape, name="input")
    x = input_tensor
    if layer_type == 'conv':
        x = build_conv_layer(p, x)
    elif layer_type == 'rnn':
        x = build_rnn_layer(p, x)

    x = MaxPooling1D(pool_size=p["pool_size"], strides=p["pool_strides"], padding='same')(x)
    x = Flatten()(x)
    x = Dense(p["dense"]["units"], kernel_initializer='he_uniform', activation=LeakyReLU(p["dense"]["leaky_relu"]))(x)
    x = Dropout(p["dense"]["dropout"])(x)
    output = Dense(output_size, kernel_initializer='he_uniform', activation="relu", name="output")(x)

    _model = Model(
        inputs=input_tensor,
        outputs=output,
        name='test',
    )

    _model.compile(
        optimizer=Adam(learning_rate=p["lr"]),
        loss='mse',
        # compile() documents `metrics` as a list; a bare metric object is
        # not accepted by every Keras version.
        metrics=[RootMeanSquaredError()],
    )
    return _model

In [None]:
def li_to_dt(li):
    """Convert one row of scaled samples into a flat parameter dict.

    Entry ``i`` of ``li`` is stored under the i-th key of ``param_dict``.
    Everything is truncated with ``int()`` except ``dense_dropout`` and
    ``dense_leaky_relu``, which are kept as given.
    """
    keep_as_is = ('dense_dropout', 'dense_leaky_relu')
    return {
        key: li[pos] if key in keep_as_is else int(li[pos])
        for pos, key in enumerate(param_dict.keys())
    }

In [None]:
from tensorflow.keras import Model
from tensorflow.keras.layers import (
    GRU,
    LSTM,
    SimpleRNN,
    Conv1D,
    Dense,
    Dropout,
    Flatten,
    Input,
    LeakyReLU,
    MaxPooling1D,
)
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam

def model_builder(p_dt):
    """Build and compile the single-convolution model from a flat parameter dict.

    ``p_dt`` is the dict produced by ``li_to_dt``: it provides
    ``window_size``, ``conv_filters``, ``conv_kernel_size``, ``conv_strides``,
    ``pool_size``, ``pool_strides``, ``dense_units``, ``dense_leaky_relu``,
    ``dense_dropout`` and ``lr`` (an index into ``lr_val``). The input has
    ``len(inputs)`` features per time step and the model predicts
    ``len(outputs)`` values.

    ``p_dt`` is not modified.

    NOTE: this definition replaces the earlier 4-parameter ``model_builder``
    once its cell has run.
    """
    # A sampled stride of 0 cannot be used by Conv1D. The previous code
    # replaced it with None (which Conv1D does not accept as a stride) and
    # wrote that back into the caller's dict. Fall back to Conv1D's default
    # stride of 1 and leave `p_dt` untouched.
    # NOTE(review): confirm stride 1 is the intended meaning of a sampled 0.
    conv_strides = p_dt["conv_strides"] if p_dt["conv_strides"] else 1

    input_tensor = Input(shape=(p_dt["window_size"], len(inputs)), name="input")
    x = input_tensor
    x = Conv1D(p_dt["conv_filters"],
               kernel_size=p_dt["conv_kernel_size"],
               kernel_initializer='he_uniform',
               activation='relu',
               strides=conv_strides,
               padding='same')(x)
    x = MaxPooling1D(pool_size=p_dt["pool_size"],
                     strides=p_dt["pool_strides"],
                     padding='same')(x)
    x = Flatten()(x)
    x = Dense(p_dt["dense_units"],
              kernel_initializer='he_uniform',
              activation=LeakyReLU(p_dt["dense_leaky_relu"]))(x)
    x = Dropout(p_dt["dense_dropout"])(x)
    output = Dense(len(outputs), kernel_initializer='he_uniform', activation="relu", name="output")(x)

    _model = Model(
        inputs=input_tensor,
        outputs=output,
        name='test',
    )

    _model.compile(
        optimizer=Adam(learning_rate=lr_val[p_dt["lr"]]),
        loss='mse',
        # compile() documents `metrics` as a list; a bare metric object is
        # not accepted by every Keras version.
        metrics=[RootMeanSquaredError()],
    )
    return _model

In [None]:
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau


# Shared training callbacks, both watching the validation loss:
# the learning rate is multiplied by 0.2 after 5 epochs without improvement
# (never below 1e-6) ...
rlr_cb = ReduceLROnPlateau(
    monitor="val_loss",
    mode="min",
    factor=0.2,
    patience=5,
    min_lr=1e-6,
    verbose=False,
)
# ... and training stops after 15 such epochs, restoring the best weights.
ely_cb = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=15,
    restore_best_weights=True,
    verbose=False,
)

In [None]:
def calc_metric(real, pred):
    """Evaluate every metric from the ``metrics`` module on ``(real, pred)``.

    Returns a float array with one entry per metric, in the order
    r2, corrcoef, nmse, fb, b, a_co, mse.
    """
    metric_names = ('calc_r2', 'calc_corrcoef', 'calc_nmse', 'calc_fb',
                    'calc_b', 'calc_a_co', 'calc_mse')
    metric_funcs = tuple(getattr(metrics, name) for name in metric_names)

    scores = np.zeros(len(metric_funcs))
    for slot, metric_func in enumerate(metric_funcs):
        scores[slot] = metric_func(real, pred)
    return scores

In [None]:
# Load the metrics written by earlier trials; this needs `proj_dir` to be set
# and metric.csv to exist already.
metric_df = pd.read_csv(f'{proj_dir}/metric.csv', index_col='index')

In [None]:
import time
import json
import os

def to_dataset(_dfs, in_time_step):
    """Build a dataset with ``dh.dfs_to_dataset`` for the given window length.

    Replaces the earlier ``to_dataset``: in addition to ``in_time_step`` it
    passes ``out_time_step=1``, ``offset=1`` and ``excludes=outputs``.
    """
    dataset_options = dict(
        in_time_step=in_time_step,
        out_time_step=1,
        offset=1,
        excludes=outputs,
    )
    return dh.dfs_to_dataset(_dfs, meta_df, inputs, outputs, **dataset_options)

def train_model(_model):
    """Fit ``_model`` on the notebook-level training data.

    Reads the globals ``X_train``, ``y_train``, ``X_val``, ``y_val`` and
    ``p`` (for ``p["batch_size"]``), all set by the trial loop, plus the
    shared callbacks ``rlr_cb`` and ``ely_cb``. Trains for at most 100 epochs
    without shuffling and clears the Keras session afterwards.
    """
    # Train the model that was passed in; the previous code ignored the
    # argument and fitted the global `model` instead.
    _model.fit(
        x=X_train,
        y=y_train,
        batch_size=p["batch_size"],
        shuffle=False,
        epochs=100,
        validation_data=(X_val, y_val),
        callbacks=[rlr_cb, ely_cb],
        verbose=False,
    )
    print(f'[INFO] Finished training')
    K.clear_session()

# One row of `idc` describes one trial; the columns follow param_dict's key order.
idc = conv_idc
metrics_indices = ['r2', 'corr', 'nmse', 'fb', 'b', 'a/c', 'mse']

# Start from an all-zero table, or resume from the CSV written by earlier runs.
metric_df = pd.DataFrame(np.zeros((len(conv_idc), len(metrics_indices))), columns=metrics_indices)
if os.path.exists(f'{proj_dir}/metric.csv'):
    print(f'Found metric_df. Read from source.')
    metric_df = pd.read_csv(f'{proj_dir}/metric.csv', index_col='index')


for i, conv_idx in enumerate(idc):
    # NOTE: rebinds the notebook-level `root_dir` to this trial's directory.
    root_dir = proj_dir+f'/trial{i:03d}'
    # An existing directory marks the trial as done and it is skipped.
    # NOTE(review): the directory is created before training, so a trial that
    # crashes is also skipped on the next run and keeps its zero metrics row.
    if os.path.exists(root_dir):
        continue
    os.makedirs(root_dir)
    print(f'[INFO] Trial{i:03d} training start')

    # `p` is read as a global by train_model (for the batch size).
    p = li_to_dt(conv_idx)
    with open(f"{root_dir}/params.json", "w") as outfile:
        json.dump(p, outfile)
        outfile.close()  # redundant: the `with` block closes the file anyway

    # The datasets are rebuilt per trial because the window size is sampled.
    win_size = p["window_size"]
    X_train, y_train = to_dataset(train_dfs, win_size)
    X_val, y_val = to_dataset(val_dfs, win_size)
    X_test, y_test = to_dataset(test_dfs, win_size)

    # Flatten the targets to (n_samples, n_outputs).
    y_train = y_train.reshape(-1, len(outputs))
    y_val = y_val.reshape(-1, len(outputs))
    y_test = y_test.reshape(-1, len(outputs))
    model = model_builder(p)
    train_model(model)

    y_hat = model.predict(X_test, verbose=False)
    print(f'[INFO] Trial{i:03d} finished predict')
    # Drop the model and reset TensorFlow/Keras state before the next trial.
    tf.compat.v1.reset_default_graph()
    del model
    K.clear_session()

    print(f'[INFO] Trial{i:03d} successfully ended.. Clear session')
    metric = calc_metric(y_test, y_hat)
    metric_df.iloc[i] = metric
    # Rewrite the whole table after every trial so progress survives interruptions.
    metric_df.to_csv(f'{proj_dir}/metric.csv', index_label='index')

In [None]:
import time
import json
import os

def train_model(param, _model):
    """Fit ``_model`` on the cached datasets selected by ``param["window_size"]``.

    Args:
        param: parameter dict of the current trial; ``"window_size"`` selects
            the entries of ``train_ds`` / ``val_ds`` and ``"batch_size"`` is
            passed to ``fit``.
        _model: compiled model to train.

    The targets are reshaped to three columns. The training history is
    written to ``<root_dir>/history.csv`` (``root_dir`` is the notebook-level
    trial directory) and the Keras session is cleared afterwards.
    """
    window_key = str(param["window_size"])
    X_train = train_ds[window_key][0]
    y_train = train_ds[window_key][1].reshape(-1, 3)
    X_val = val_ds[window_key][0]
    y_val = val_ds[window_key][1].reshape(-1, 3)

    history = _model.fit(
        x=X_train,
        y=y_train,
        # Use the argument; the previous code read the global `param_values`.
        batch_size=param["batch_size"],
        shuffle=False,
        epochs=100,
        validation_data=(X_val, y_val),
        callbacks=[rlr_cb, ely_cb],
        verbose=False,
    )
    pd.DataFrame(history.history).to_csv(root_dir+'/history.csv', index=False)
    print(f'[INFO] Finished training')
    K.clear_session()

# One row of `idc` holds one index per leaf of basic_params + conv_params.
idc = conv_idc
param = conv_params
metrics_indices = ['r2', 'corr', 'nmse', 'fb', 'b', 'a/c', 'mse']

# Start from an all-zero table, or resume from the CSV written by earlier runs.
metric_df = pd.DataFrame(np.zeros((len(conv_idc), len(metrics_indices))), columns=metrics_indices)
if os.path.exists(f'{proj_dir}/metric.csv'):
    print(f'Found metric_df. Read from source.')
    metric_df = pd.read_csv(f'{proj_dir}/metric.csv', index_col='index')


for i, conv_idx in enumerate(idc):
    # NOTE: rebinds the notebook-level `root_dir`; train_model writes history.csv there.
    root_dir = proj_dir+f'/trial{i:03d}'
    # An existing directory marks the trial as done and it is skipped.
    if os.path.exists(root_dir):
        continue

    # `param_values` was used below without ever being assigned. Resolve it
    # from this trial's index row before creating the trial directory, so a
    # failure here does not leave an empty directory that would be skipped.
    # NOTE(review): assumes `conv_idc` holds integer indices into the
    # candidate lists (as produced by smp_to_indices) — confirm.
    param_values = get_smp_values(param, conv_idx)

    os.makedirs(root_dir)
    print(f'[INFO] Trial{i:03d} training start')

    with open(f"{root_dir}/params.json", "w") as outfile:
        json.dump(param_values, outfile)

    X_test = test_ds[str(param_values["window_size"])][0]
    y_test = test_ds[str(param_values["window_size"])][1].reshape(-1, 3)

    # NOTE(review): this call needs the model_builder variant that takes
    # (p, input_shape, output_size) to be the one currently defined.
    model = model_builder(param_values, X_test[0].shape, y_test.shape[1])
    train_model(param_values, model)

    y_hat = model.predict(X_test, verbose=False)
    print(f'[INFO] Trial{i:03d} finished predict')
    # Drop the model and reset TensorFlow/Keras state before the next trial.
    tf.compat.v1.reset_default_graph()
    del model
    K.clear_session()

    print(f'[INFO] Trial{i:03d} successfully ended.. Clear session')
    metric = calc_metric(y_test, y_hat)
    metric_df.iloc[i] = metric
    # Rewrite the whole table after every trial so progress survives interruptions.
    metric_df.to_csv(f'{proj_dir}/metric.csv', index_label='index')

In [None]:
# Reload the per-trial metrics from disk for the analysis below.
metric_df = pd.read_csv(f'{proj_dir}/metric.csv', index_col='index')

In [None]:
# MSE of every trial as a plain array; used as the regression target below.
mse = metric_df['mse'].values

In [None]:
# NOTE: this rebinds `inputs` / `outputs`, which until here were the feature
# and target column lists read by to_dataset and model_builder. Re-run the
# configuration cell before training again.
inputs = conv_idc
outputs = mse

In [None]:
# One row per trial: the sampled indices plus the resulting MSE.
input_df = pd.DataFrame(inputs)
input_df['mse'] = mse
input_df.columns = [
    'window', 'max_pool_size', 'max_pool_strides',
    'dense_units', 'dense_dropout', 'leaky_relu',
    'batch_size', 'lr',
    'filter_size_1', 'kernel_size_1', 'strides_1',
    'filter_size_2', 'kernel_size_2', 'strides_2',
    'mse',
]
dt = dict_cat(conv_params)

# Candidate list behind every index column; each stored index is replaced by
# the value it points at.
_choices_by_column = {
    'window': dt['window_size'],
    'max_pool_size': dt['pool_size'],
    'max_pool_strides': dt['pool_strides'],
    'dense_units': dt['dense']['units'],
    'dense_dropout': dt['dense']['dropout'],
    'leaky_relu': dt['dense']['leaky_relu'],
    'lr': dt['lr'],
    'batch_size': dt['batch_size'],
    'filter_size_1': dt['conv_0']['filters'],
    'kernel_size_1': dt['conv_0']['kernel_size'],
    'strides_1': dt['conv_0']['strides'],
    'filter_size_2': dt['conv_1']['filters'],
    'kernel_size_2': dt['conv_1']['kernel_size'],
    'strides_2': dt['conv_1']['strides'],
}
for _column, _choices in _choices_by_column.items():
    input_df[_column] = np.array(_choices)[input_df[_column].values]

# Missing entries (e.g. a `None` filter count) become 0.
input_df = input_df.fillna(0)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the trials for evaluating the surrogate model. The seed is
# fixed so the split (and every score derived from it) is reproducible.
# NOTE: this rebinds X_train / X_test / y_train / y_test used by the training loops.
X_train, X_test, y_train, y_test = train_test_split(
    input_df.iloc[:, :-1].values, mse, test_size=0.2, random_state=42
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random-forest surrogate mapping hyper-parameters to the trial MSE.
# The seed is fixed so the fitted forest and its importances are reproducible.
model = RandomForestRegressor(n_estimators=100, criterion='squared_error', max_depth=10,
                              random_state=42)

model.fit(X_train, y_train)

# R^2 on the training and on the held-out trials.
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the forest's impurity-based importances, smallest first.
_rf_importances = model.feature_importances_
sorted_idx = _rf_importances.argsort()
_feature_names = input_df.columns[:-1]
plt.barh(_feature_names[sorted_idx], _rf_importances[sorted_idx])
plt.xlabel("Random Forest Feature Importance")

In [None]:
# Split the trials by learning rate (largest rate first).
lr_0, lr_1, lr_2 = (
    input_df[input_df['lr'] == rate] for rate in (0.001, 0.0001, 0.00001)
)

In [None]:
# MSE distribution per learning rate, one panel each.
fig, axes = plt.subplots(ncols=3, figsize=(30, 8))
_ax_lr0, _ax_lr1, _ax_lr2 = axes
# lr_0[lr_0['mse'] < 1000].hist('mse', bins=100, ax=axes[0])
# lr_1[lr_1['mse'] < 1000].hist('mse', bins=100, ax=axes[1])
# lr_2[lr_2['mse'] < 1000].hist('mse', bins=100, ax=axes[2])

lr_0.hist(column='mse', bins=100, ax=_ax_lr0)
lr_1.hist(column='mse', bins=100, ax=_ax_lr1)
lr_2.hist(column='mse', bins=100, ax=_ax_lr2)

In [None]:
# Trials without (filter_0) and with (filter_1) a second convolution layer;
# a filter count of 0 stands for "no layer" after the fillna(0) above.
_no_second_conv = input_df['filter_size_2'] == 0
filter_0 = input_df[_no_second_conv]
filter_1 = input_df[~_no_second_conv]

In [None]:
# MSE distribution without / with the second convolution layer.
fig, axes = plt.subplots(ncols=2, figsize=(18, 6))
_ax_without, _ax_with = axes
# lr_0[lr_0['mse'] < 1000].hist('mse', bins=100, ax=axes[0])
# lr_1[lr_1['mse'] < 1000].hist('mse', bins=100, ax=axes[1])
# lr_2[lr_2['mse'] < 1000].hist('mse', bins=100, ax=axes[2])

filter_0.hist(column='mse', bins=100, ax=_ax_without)
filter_1.hist(column='mse', bins=100, ax=_ax_with)

In [None]:
# Summary statistics of the trials without a second convolution layer.
filter_0.describe().transpose()

In [None]:
# Summary statistics of the trials with a second convolution layer.
filter_1.describe().transpose()

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance on the held-out trials; the seed is fixed because the
# result depends on random shuffles of each feature column.
perm_importance = permutation_importance(model, X_test, y_test, random_state=42)
sorted_idx = perm_importance.importances_mean.argsort()
plt.barh(input_df.columns[:-1][sorted_idx], perm_importance.importances_mean[sorted_idx])
plt.xlabel("Permutation Importance")

In [None]:
# cp_df = input_df.copy()
# for i in range(input_df.shape[1]):
#     for j in range(input_df.shape[1]):
#         cp_df[f'{i}*{j}'] = cp_df[i].values * cp_df[j].values

In [None]:
from sklearn.model_selection import train_test_split

# NOTE(review): `new_input_df` is not defined in any visible cell, so this line
# raises NameError on a fresh kernel — presumably `input_df` or a frame derived
# from it was meant; confirm before running.
X_train, X_test, y_train, y_test = train_test_split(new_input_df.iloc[:, :-1].values, mse, test_size=0.2)

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

# new_inputs = cp_df.values

# Degree-2 polynomial regression as an alternative surrogate for the trial MSE.
_poly_steps = [
    ("poly_features", PolynomialFeatures(degree=2, include_bias=True)),
    ("lin_reg", LinearRegression()),
]
poly_model = Pipeline(_poly_steps)
poly_model.fit(X_train, y_train)

# R^2 on the training split, then on the held-out split.
for _features, _target in ((X_train, y_train), (X_test, y_test)):
    print(poly_model.score(_features, _target))

In [None]:
# Scratch cell: displays one random float; the value is not stored.
np.random.sample()

In [None]:
# Scratch cell: displays a random 20% of the trials (unseeded, not stored).
input_df.sample(int(len(input_df) * 0.2))

In [None]:
# import os
# basedir = f'{proj_dir}'

# dirs = sorted(os.listdir(basedir))[4:]
# for d in dirs:
#     name = d[:5]
#     num = int(d[5:])
#     print(d, f'{num:05d}')
#     os.rename(os.path.join(basedir, d), os.path.join(basedir, f'trial{num:05d}'))

In [None]:
from sklearn.tree import export_graphviz

In [None]:
# Display the column names; all but the last are used as feature names below.
input_df.columns

In [None]:
# Write the first tree of the forest as a Graphviz DOT file.
# - feature_names is passed as a plain list: some scikit-learn releases reject
#   a pandas Index here.
# - class_names is omitted: it only applies to classifiers and `model` is a
#   regressor.
export_graphviz(model.estimators_[0], out_file='tree.dot',
                feature_names=list(input_df.columns[:-1]),
                rounded=True, proportion=False,
                precision=2, filled=True)

from subprocess import call
# Rendering needs the Graphviz `dot` executable on PATH. Fail loudly on a
# non-zero exit status instead of displaying a stale or missing tree.png.
_dot_status = call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
if _dot_status != 0:
    raise RuntimeError(f'dot exited with status {_dot_status}; tree.png was not rendered')

from IPython.display import Image
Image(filename = 'tree.png')