In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model

from particle_dataset import ParticleDS

In [None]:
# Windowing / batching hyper-parameters.
WINDOW_SIZE = 16  # number of consecutive rows stacked into one input sample
BATCH_SIZE = 32   # batch size handed to the ParticleDS wrappers

# Column names of the CSV: the regression target and the model inputs.
TARGET_LABEL = 'PM2.5'
FEATURE_LABEL = [
    # PM columns (these are z-score normalised during preprocessing)
    'PM1_H_OUT', 'PM2.5_H_OUT', 'PM10_H_OUT',
    'PM1_OUT', 'PM2.5_OUT', 'PM10_OUT',
    # remaining columns (not normalised)
    'PERSON_NUMBER', 'AIR_PURIFIER', 'WINDOW',
]

# File locations, given as relative paths.
WEIGHT_PATH = '../model_weights/lstm_v3_weights.h5'
DATASET_PATH = '../../datasets/summary/particles_inout.csv'

In [None]:
# Per-column summary statistics of the raw CSV, transposed so each column is one row.
# Reads through DATASET_PATH (instead of repeating the literal path) so the preview
# always describes the same file that the loading cell uses.
pd.read_csv(DATASET_PATH).describe().transpose()

In [None]:
# Load the dataset and remove rows where PM2.5 is over 150.
print('Loading dataset...')
df_org = pd.read_csv(DATASET_PATH)
# Vectorised parse of the whole column instead of one pd.to_datetime call per element.
df_org['DATE'] = pd.to_datetime(df_org['DATE'])
# `~(... > 150)` rather than `<= 150`: rows with a missing PM2.5 are kept, exactly as before.
df_org = df_org.loc[~(df_org['PM2.5'] > 150)].copy()
print('Successfully loaded!')
print('Original dataset shape: ', df_org.shape)

# Width of the smoothing (moving-average) window. This is independent of WINDOW_SIZE,
# which is the length of the model's input sequence; the log line now reports the
# value that is actually applied.
MOVING_AVERAGE_WINDOW = 10
print(f'Remove date column and calculate moving average with window size {MOVING_AVERAGE_WINDOW}')
df = (
    df_org
    .drop(columns=['DATE'])
    .rolling(window=MOVING_AVERAGE_WINDOW)
    .mean()
    .dropna()                 # the first MOVING_AVERAGE_WINDOW - 1 rows have no full window
    .reset_index(drop=True)   # contiguous 0..n-1 index for the positional slicing below
)
print('Trimed dataset shape: ', df.shape)

y_df = df[TARGET_LABEL]
# Explicit copy so the in-place normalisation below does not raise SettingWithCopyWarning.
x_df = df[FEATURE_LABEL].copy()

# Z-score normalisation for the PM features only
# (person count, air purifier and window columns are left as they are).
# NOTE(review): mean/std are computed over the whole series, i.e. including rows that
# later land in the validation/test splits — confirm this is intended.
cols = ['PM1_H_OUT', 'PM2.5_H_OUT', 'PM10_H_OUT', 'PM1_OUT', 'PM2.5_OUT', 'PM10_OUT']
x_df[cols] = (x_df[cols] - x_df[cols].mean()) / x_df[cols].std()

data_size = x_df.shape[0] - WINDOW_SIZE

# Sample i holds rows i .. i + WINDOW_SIZE - 1 of the features; its target is the
# TARGET_LABEL value of row i + WINDOW_SIZE (the row right after the window).
x_values = x_df.to_numpy()  # converted once, outside the loop
X = np.zeros((data_size, WINDOW_SIZE, x_df.shape[1]))
for i in range(data_size):
    X[i] = x_values[i:i + WINDOW_SIZE]

y = y_df.iloc[WINDOW_SIZE:].values

assert X.shape[0] == y.shape[0], 'every input window needs exactly one target value'

print('X shape: ', X.shape)
print('y shape: ', y.shape)

In [None]:
def train_test_split_with_sequence(_X, _y, test_size=0.25):
    """Split `_X` and `_y` into a leading and a trailing part without shuffling.

    The cut position is ``ceil(n * (1 - test_size))`` where ``n`` is the length of
    `_X` along axis 0; the same position is applied to `_y`.

    Args:
        _X: Array-like exposing ``.shape`` and slicing along axis 0.
        _y: Array-like sliced at the same position as `_X`.
        test_size: Fraction of the samples assigned to the trailing part.

    Returns:
        Tuple ``(X_head, X_tail, y_head, y_tail)``.
    """
    n_samples = _X.shape[0]
    cut = int(np.ceil(n_samples * (1 - test_size)))
    X_head, X_tail = _X[:cut], _X[cut:]
    y_head, y_tail = _y[:cut], _y[cut:]
    return X_head, X_tail, y_head, y_tail


# Order-preserving hold-out: the trailing ~25 % of all samples become the test set,
# then the trailing ~20 % of the remaining samples become the validation set.
X_train, X_test, y_train, y_test = train_test_split_with_sequence(X, y, test_size=0.25)
X_train, X_val, y_train, y_val = train_test_split_with_sequence(X_train, y_train, test_size=0.2)

# Report the shapes of the three resulting splits.
for _label, _features, _targets in (
    ('X_train, y_train shape: ', X_train, y_train),
    ('X_val, y_val shape: ', X_val, y_val),
    ('X_test, y_test shape: ', X_test, y_test),
):
    print(_label, _features.shape, _targets.shape)

In [None]:
# Wrap each split in a ParticleDS; all three share the same window and batch settings.
_ds_options = dict(window_size=WINDOW_SIZE, batch_size=BATCH_SIZE)
train_ds = ParticleDS(X_train, y_train, **_ds_options)
val_ds = ParticleDS(X_val, y_val, **_ds_options)
test_ds = ParticleDS(X_test, y_test, **_ds_options)

In [None]:
from tensorflow.keras.layers import Input, Dense, Conv1D, Dropout, AveragePooling1D, LSTM
from tensorflow.keras.optimizers import Adam

# Input: one window of WINDOW_SIZE rows. The feature dimension is derived from
# FEATURE_LABEL (instead of a hard-coded 9) so it stays in sync with the columns
# selected during preprocessing.
input_tensor = Input(shape=(WINDOW_SIZE, len(FEATURE_LABEL)), name='input')

# Convolutional front-end. padding='same' with strides=1 keeps the sequence length,
# so the LSTM stack still receives WINDOW_SIZE steps.
# NOTE(review): neither Conv1D layer sets an activation — confirm that is intended.
x = Conv1D(filters=128, kernel_size=3, strides=1, padding='same', name='conv1d_1')(input_tensor)
x = Conv1D(filters=64, kernel_size=3, strides=1, padding='same', name='conv1d_2')(x)
x = AveragePooling1D(pool_size=3, strides=1, padding='same', name='pooling_1')(x)

# Recurrent stack: the first two LSTMs return full sequences, the last one only
# its final output, which feeds the dense head.
x = LSTM(units=12, return_sequences=True, name='lstm_1')(x)
x = Dropout(0.4)(x)
x = LSTM(units=32, return_sequences=True, name='lstm_2')(x)
x = Dropout(0.2)(x)
x = LSTM(units=16, return_sequences=False, name='lstm_3')(x)

# Dense regression head ending in a single linear output unit.
x = Dense(128, activation='relu', name='fc_1')(x)
x = Dropout(0.5)(x)
x = Dense(64, activation='relu', name='fc_2')(x)
output = Dense(1, name='output')(x)

model = Model(inputs=input_tensor, outputs=output, name='lstm_v3')
model.summary()

In [None]:
from tensorflow.keras.metrics import mean_squared_error


# Written with the intention of using it as a metric. The original note says it was
# dropped because its result barely differs from the plain MSE, yet model.compile in
# this notebook still registers it as the metric.
def last_time_step_mse(y_true, y_pred):
    """Apply `mean_squared_error` to both tensors with their last axis-0 entry removed.

    NOTE(review): despite the name, `[:-1]` slices the first axis (presumably the
    batch axis — confirm), not a time-step axis.
    """
    truth_head = y_true[:-1]
    pred_head = y_pred[:-1]
    return mean_squared_error(truth_head, pred_head)


# Compile for regression: Adam with a learning rate of 1e-4, MSE loss, and the
# custom metric defined in this cell.
model.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss='mean_squared_error',
    metrics=[last_time_step_mse],
)

In [None]:
# Model training section. To watch the actual training run, use the code that is
# disabled inside the triple-quoted string below.
# Training takes a long time on CPU, so a Kaggle or Colab GPU environment is recommended.

"""
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt

rlr_cb = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, mode='min', verbose=1)
ely_cb = EarlyStopping(monitor='val_loss', patience=10, mode='min', verbose=1)
mcp_cb = ModelCheckpoint(
    filepath='/content/drive/MyDrive/cpfd/models/weights/lstm_v3_weights.{epoch:02d}-{val_loss:.2f}.hdf5',
    monitor='val_loss',
    save_best_only=True, save_weights_only=True, mode='min', period=1, verbose=0)

history = model.fit(train_ds, epochs=30, validation_data=val_ds, callbacks=[rlr_cb, ely_cb, mcp_cb])

plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='valid')
plt.legend()
"""

# When using an already-trained model instead of training, use the code below:
# it loads the weights stored at WEIGHT_PATH into the model.
model.load_weights(WEIGHT_PATH)

In [None]:
# Evaluate the model on the test dataset. With the compile settings used in this
# notebook this reports the MSE loss and the `last_time_step_mse` metric.
model.evaluate(test_ds)

In [None]:
# Create the result dataframe holding the real and the predicted values side by side.
# No `batch_size` is passed to predict: test_ds was already built with its own batch
# size (BATCH_SIZE), and Keras documents that `batch_size` must not be specified for
# dataset / generator / Sequence inputs.
# NOTE(review): assumes ParticleDS is such a batched input and yields exactly one
# prediction per entry of y_test — confirm against particle_dataset.
pred = model.predict(test_ds)
result = pd.DataFrame({'Real': y_test, 'Pred': pred.reshape(len(pred))})

In [None]:
# Line chart of the real and predicted series, one line per column of `result`.
ax = result.plot.line(figsize=(36, 10), fontsize=17)
ax.set_ylabel('PM 2.5', fontsize=24)
ax.set_xlabel('TIME', fontsize=24)
ax.legend(fontsize=30)

In [None]:
from sklearn.metrics import r2_score

# R² is computed only on the rows whose real value is below 80.
# NOTE(review): the variable is named `result_under_100` although the threshold
# applied is 80 — confirm which of the two is intended.
_real_below_threshold = result['Real'] < 80
result_under_100 = result[_real_below_threshold]
r2 = r2_score(
    result_under_100['Real'].values,
    result_under_100['Pred'].values,
)
print('R Square: %.4f' % r2)