## 필요한 라이브러리 임포트

In [87]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from keras.models import Sequential
from keras.layers import LSTM, Dense, SimpleRNN
from keras.callbacks import EarlyStopping
import tensorflow as tf
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr

## 데이터 전처리

In [70]:
df = pd.read_csv('./filtered_data/merged_data.csv')

# timedelta 형식 변환
df['체류시간'] = pd.to_timedelta(df['체류시간'])

# 총 초로 변환
df['체류시간_초'] = df['체류시간'].dt.total_seconds()

print("데이터 타입:", df['체류시간_초'].dtype)

print(df['체류시간_초'].head(20))

데이터 타입: float64
0             NaN
1             NaN
2             NaN
3             NaN
4             NaN
5             NaN
6     1260.000000
7     1833.230769
8     1769.761273
9     1766.533333
10    3516.881960
11    1606.153846
12    1915.721739
13    2052.310536
14    2577.720207
15    2369.219331
16    3373.529412
17    2461.250000
18    2751.818182
19    2734.857143
Name: 체류시간_초, dtype: float64


In [71]:
print(df.dtypes)

날짜                         object
시간대                        object
차량진입횟수                    float64
체류시간              timedelta64[ns]
anchored_count            float64
TotalTonnage              float64
PM10                      float64
PM2.5                     float64
오 존                       float64
이산화질소                     float64
일산화탄소                     float64
아황산가스                     float64
기온(C)                     float64
풍속(m/s)                   float64
강수량(mm)                   float64
습도(%)                     float64
체류시간_초                    float64
dtype: object


In [72]:
df = df.drop('체류시간', axis=1)
df

Unnamed: 0,날짜,시간대,차량진입횟수,anchored_count,TotalTonnage,PM10,PM2.5,오 존,이산화질소,일산화탄소,아황산가스,기온(C),풍속(m/s),강수량(mm),습도(%),체류시간_초
0,2020-09-01,01:00:00,,1.0,9751.0,16.0,16.0,0.026,0.004,0.1,0.001,22.1,2.1,0.0,84.1,
1,2020-09-01,02:00:00,,2.0,9929.0,11.0,9.0,0.029,0.003,0.1,0.001,22.2,1.7,0.0,82.4,
2,2020-09-01,03:00:00,,4.0,21044.0,11.0,8.0,0.030,0.004,0.1,0.001,22.4,2.8,0.0,81.7,
3,2020-09-01,04:00:00,,3.0,20866.0,17.0,7.0,0.031,0.004,0.1,0.001,22.5,2.8,0.0,83.5,
4,2020-09-01,05:00:00,,3.0,20866.0,12.0,11.0,0.031,0.004,0.1,0.001,22.1,2.1,0.0,84.2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7268,2021-06-30,21:00:00,49.0,81.0,,32.0,21.0,0.042,0.020,0.3,0.005,,,,,2133.816327
7269,2021-06-30,22:00:00,16.0,80.0,,27.0,24.0,0.038,0.018,0.3,0.003,,,,,994.875000
7270,2021-06-30,23:00:00,4.0,79.0,,27.0,23.0,0.037,0.014,0.3,0.003,,,,,1226.500000
7271,2021-07-01,00:00:00,,,,31.0,23.0,0.034,0.014,0.3,0.004,,,,,


In [73]:
df.fillna(0, inplace=True)


In [74]:
X = df[['차량진입횟수', '체류시간_초', 'anchored_count', 'TotalTonnage', '오 존', '이산화질소', '일산화탄소', '아황산가스', '기온(C)', '풍속(m/s)', '강수량(mm)', '습도(%)']]
y = df['PM10']  # PM10 농도

In [75]:
scaler_X = MinMaxScaler()
X_scaled = scaler_X.fit_transform(X)

scaler_y = MinMaxScaler()
y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1))

In [76]:
train_X, test_X, train_y, test_y = train_test_split(X_scaled, y_scaled, test_size=0.2, shuffle=False)

In [77]:
look_back = 24  # 과거 몇 타임스텝을 볼 것인지 정의

train_generator = TimeseriesGenerator(train_X, train_y, length=look_back, batch_size=30)
test_generator = TimeseriesGenerator(test_X, test_y, length=look_back, batch_size=1)

## LSTM

### 모델 설정, 훈련

In [78]:
model = Sequential()
model.add(LSTM(32, return_sequences=True, input_shape=(look_back, train_X.shape[1])))  
model.add(LSTM(32))
model.add(Dense(1))  

# 모델 컴파일
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mean_squared_error')

# 조기 종료 설정
early_stopping = EarlyStopping(monitor='val_loss', patience=10)

# 모델 훈련
model.fit(train_generator, epochs=100, validation_data=test_generator, callbacks=[early_stopping])


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100


<keras.src.callbacks.History at 0x1b731bbc510>

### 모델 예측

In [79]:
test_predictions = model.predict(test_generator)
test_actuals = np.array([data[1][0] for data in test_generator])



In [80]:
# 1차원 배열로 변환
test_predictions = test_predictions.ravel()
test_actuals = test_actuals.ravel()

# 역스케일링
test_predictions = scaler_y.inverse_transform(test_predictions.reshape(-1, 1)).ravel()
test_actuals = scaler_y.inverse_transform(test_actuals.reshape(-1, 1)).ravel()

### 모델 성능 평가

In [81]:
# MAE
mae = mean_absolute_error(test_actuals, test_predictions)
print("Mean Absolute Error (MAE):", mae)

# MSE 
mse = mean_squared_error(test_actuals, test_predictions)
print("Mean Squared Error (MSE):", mse)

# RMSE 
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

# 상관 계수
correlation, _ = pearsonr(test_actuals, test_predictions)
print("Correlation Coefficient:", correlation)

# 결정 계수 
r2 = r2_score(test_actuals, test_predictions)
print("Coefficient of Determination (R^2):", r2)

Mean Absolute Error (MAE): 24.149060701507526
Mean Squared Error (MSE): 3380.507981198109
Root Mean Squared Error (RMSE): 58.14213602197729
Correlation Coefficient: 0.2480137171650007
Coefficient of Determination (R^2): 0.06138759083701073


In [62]:
print(df.columns)

Index(['날짜', '시간대', '차량진입횟수', 'anchored_count', 'TotalTonnage', 'PM10',
       'PM2.5', '오 존', '이산화질소', '일산화탄소', '아황산가스', '기온(C)', '풍속(m/s)',
       '강수량(mm)', '습도(%)', '체류시간_초'],
      dtype='object')


In [63]:
# test_generator의 첫 번째 배치 추출
first_batch = test_generator[0]
input_data, actual_output = first_batch

print(actual_output.shape)


(1, 1)


In [64]:
print( actual_output[0])

[0.00760456]


In [65]:
pm10_original = scaler_y.inverse_transform(actual_output)
print("원래 PM10 값:", pm10_original)


원래 PM10 값: [[12.]]


In [66]:
print("원본 PM10 샘플 값:", df['PM10'].head())

원본 데이터의 PM10 샘플 값: 0    16.0
1    11.0
2    11.0
3    17.0
4    12.0
Name: PM10, dtype: float64


### 파라미터 조정

In [91]:
model = Sequential()
model.add(LSTM(64, return_sequences=True, input_shape=(look_back, train_X.shape[1])))
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(32))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))
# 모델 컴파일
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mean_squared_error')

# 조기 종료 설정
early_stopping = EarlyStopping(monitor='val_loss', patience=10)

# 모델 훈련
model.fit(train_generator, epochs=100, validation_data=test_generator, callbacks=[early_stopping])


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100


<keras.src.callbacks.History at 0x1b73b119ed0>

In [92]:
test_predictions = model.predict(test_generator)
test_actuals = np.array([data[1][0] for data in test_generator])
# 1차원 배열로 변환
test_predictions = test_predictions.ravel()
test_actuals = test_actuals.ravel()

# 역스케일링
test_predictions = scaler_y.inverse_transform(test_predictions.reshape(-1, 1)).ravel()
test_actuals = scaler_y.inverse_transform(test_actuals.reshape(-1, 1)).ravel()



In [93]:
# MAE
mae = mean_absolute_error(test_actuals, test_predictions)
print("Mean Absolute Error (MAE):", mae)

# MSE 
mse = mean_squared_error(test_actuals, test_predictions)
print("Mean Squared Error (MSE):", mse)

# RMSE 
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

# 상관 계수
correlation, _ = pearsonr(test_actuals, test_predictions)
print("Correlation Coefficient:", correlation)

# 결정 계수 
r2 = r2_score(test_actuals, test_predictions)
print("Coefficient of Determination (R^2):", r2)

Mean Absolute Error (MAE): 26.0219454801974
Mean Squared Error (MSE): 3541.837930861355
Root Mean Squared Error (RMSE): 59.51334246084112
Correlation Coefficient: 0.1417702123432309
Coefficient of Determination (R^2): 0.016593644611835767


## RNN

In [82]:
# RNN 모델 정의
model = Sequential()
model.add(SimpleRNN(32, return_sequences=True, input_shape=(look_back, train_X.shape[1])))
model.add(SimpleRNN(32))
model.add(Dense(1)) 

# 모델 컴파일
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mean_squared_error')

# 조기 종료 설정
early_stopping = EarlyStopping(monitor='val_loss', patience=10)

# 모델 훈련
model.fit(train_generator, epochs=100, validation_data=test_generator, callbacks=[early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100


<keras.src.callbacks.History at 0x1b724328a50>

In [84]:
# 예측값 생성
test_predictions = model.predict(test_generator)
test_actuals = np.array([data[1][0] for data in test_generator])

# 1차원 배열로 변환
test_predictions = test_predictions.ravel()
test_actuals = test_actuals.ravel()

# 역스케일링
test_predictions = scaler_y.inverse_transform(test_predictions.reshape(-1, 1)).ravel()
test_actuals = scaler_y.inverse_transform(test_actuals.reshape(-1, 1)).ravel()



In [85]:
# MAE
mae = mean_absolute_error(test_actuals, test_predictions)
print("Mean Absolute Error (MAE):", mae)

# MSE
mse = mean_squared_error(test_actuals, test_predictions)
print("Mean Squared Error (MSE):", mse)

# RMSE
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

# 상관 계수
correlation, _ = pearsonr(test_actuals, test_predictions)
print("Correlation Coefficient:", correlation)

# 결정 계수
r2 = r2_score(test_actuals, test_predictions)
print("Coefficient of Determination (R^2):", r2)

Mean Absolute Error (MAE): 78.74762543492348
Mean Squared Error (MSE): 7701.128902126091
Root Mean Squared Error (RMSE): 87.7560761550224
Correlation Coefficient: 0.17588681301081044
Coefficient of Determination (R^2): -1.1382511718068593


### 파라미터, 모델층 수 조정

In [94]:
# RNN 모델
model = Sequential()
model.add(SimpleRNN(64, return_sequences=True, input_shape=(look_back, train_X.shape[1])))
model.add(SimpleRNN(64, return_sequences=True))
model.add(SimpleRNN(32))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))

# 모델 컴파일
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mean_squared_error')

# 조기 종료 설정
early_stopping = EarlyStopping(monitor='val_loss', patience=10)

# 모델 훈련
model.fit(train_generator, epochs=100, validation_data=test_generator, callbacks=[early_stopping])


# 예측값 생성
test_predictions = model.predict(test_generator)
test_actuals = np.array([data[1][0] for data in test_generator])

# 1차원 배열로 변환
test_predictions = test_predictions.ravel()
test_actuals = test_actuals.ravel()

# 역스케일링
test_predictions = scaler_y.inverse_transform(test_predictions.reshape(-1, 1)).ravel()
test_actuals = scaler_y.inverse_transform(test_actuals.reshape(-1, 1)).ravel()

# MAE
mae = mean_absolute_error(test_actuals, test_predictions)
print("Mean Absolute Error (MAE):", mae)

# MSE
mse = mean_squared_error(test_actuals, test_predictions)
print("Mean Squared Error (MSE):", mse)

# RMSE
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

# 상관 계수
correlation, _ = pearsonr(test_actuals, test_predictions)
print("Correlation Coefficient:", correlation)

# 결정 계수
r2 = r2_score(test_actuals, test_predictions)
print("Coefficient of Determination (R^2):", r2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Mean Absolute Error (MAE): 22.40032747190037
Mean Squared Error (MSE): 3593.4471117393045
Root Mean Squared Error (RMSE): 59.94536772544902
Correlation Coefficient: 0.11364815704631082
Coefficient of Determination (R^2): 0.0022641361864148557


## CNN 모델

In [88]:
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(look_back, train_X.shape[1])))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(50, activation='relu'))
model.add(Dense(1))

# 모델 컴파일
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mean_squared_error')

# 조기 종료 설정
early_stopping = EarlyStopping(monitor='val_loss', patience=10)

# 모델 훈련
model.fit(train_generator, epochs=100, validation_data=test_generator, callbacks=[early_stopping])


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100


<keras.src.callbacks.History at 0x1b73a3db5d0>

In [89]:
# 예측값 생성
test_predictions = model.predict(test_generator)
test_actuals = np.array([data[1][0] for data in test_generator])

# 1차원 배열로 변환
test_predictions = test_predictions.ravel()
test_actuals = test_actuals.ravel()

# 역스케일링
test_predictions = scaler_y.inverse_transform(test_predictions.reshape(-1, 1)).ravel()
test_actuals = scaler_y.inverse_transform(test_actuals.reshape(-1, 1)).ravel()



In [90]:
# MAE
mae = mean_absolute_error(test_actuals, test_predictions)
print("Mean Absolute Error (MAE):", mae)

# MSE
mse = mean_squared_error(test_actuals, test_predictions)
print("Mean Squared Error (MSE):", mse)

# RMSE
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

# 상관 계수
correlation, _ = pearsonr(test_actuals, test_predictions)
print("Correlation Coefficient:", correlation)

# 결정 계수 
r2 = r2_score(test_actuals, test_predictions)
print("Coefficient of Determination (R^2):", r2)

Mean Absolute Error (MAE): 25.95836875472637
Mean Squared Error (MSE): 3482.745023615658
Root Mean Squared Error (RMSE): 59.014786482843924
Correlation Coefficient: 0.19861347635197288
Coefficient of Determination (R^2): 0.03300104147136662


### 파라미터 조정 - 필터 추가, 층 추가

In [95]:
model = Sequential()
model.add(Conv1D(filters=128, kernel_size=3, activation='relu', input_shape=(look_back, train_X.shape[1])))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))


# 모델 컴파일
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mean_squared_error')

# 조기 종료 설정
early_stopping = EarlyStopping(monitor='val_loss', patience=10)

# 모델 훈련
model.fit(train_generator, epochs=100, validation_data=test_generator, callbacks=[early_stopping])


# 예측값 생성
test_predictions = model.predict(test_generator)
test_actuals = np.array([data[1][0] for data in test_generator])

# 1차원 배열로 변환
test_predictions = test_predictions.ravel()
test_actuals = test_actuals.ravel()

# 역스케일링
test_predictions = scaler_y.inverse_transform(test_predictions.reshape(-1, 1)).ravel()
test_actuals = scaler_y.inverse_transform(test_actuals.reshape(-1, 1)).ravel()

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100


In [96]:
# MAE
mae = mean_absolute_error(test_actuals, test_predictions)
print("Mean Absolute Error (MAE):", mae)

# MSE
mse = mean_squared_error(test_actuals, test_predictions)
print("Mean Squared Error (MSE):", mse)

# RMSE
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

# 상관 계수
correlation, _ = pearsonr(test_actuals, test_predictions)
print("Correlation Coefficient:", correlation)

# 결정 계수 
r2 = r2_score(test_actuals, test_predictions)
print("Coefficient of Determination (R^2):", r2)

Mean Absolute Error (MAE): 29.56441683339373
Mean Squared Error (MSE): 3579.3152163693644
Root Mean Squared Error (RMSE): 59.82737848484893
Correlation Coefficient: 0.17151449589865764
Coefficient of Determination (R^2): 0.006187917000716414


# 결론

기본적인 RNN에서 파라미터를 조정해서 뉴런을 추가하고, 층을 추가한 모델의 평가 지수가  
```
Mean Absolute Error (MAE): 22.40032747190037
Mean Squared Error (MSE): 3593.4471117393045
Root Mean Squared Error (RMSE): 59.94536772544902
Correlation Coefficient: 0.11364815704631082
Coefficient of Determination (R^2): 0.0022641361864148557
```
으로 가장 성능이 좋았다. 하지만 전체적으로 성능이 그렇게 좋지는 않았기 때문에, 추후 중요 변수를 선택 하고 모델 예측을 해보는 것이 좋겠다. 