### LSTM 을 활용한 주가 예측 모델
##### 원본 데이터 사용

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import datetime as dt


%matplotlib inline
warnings.filterwarnings('ignore')

sqlite DB에서 데이터 불러오기

In [2]:
import sqlite3
con = sqlite3.connect(r"C:\Users\구남이\OneDrive\바탕 화면\활동들\한이음 멘토링\stock.db")
df = pd.read_sql("SELECT * FROM '039490'", con, index_col=None)
df.rename(columns={'index':'date'}, inplace=True)
df = df.set_index('date')
df.index = pd.DatetimeIndex(df.index)
df.sort_index(ascending=True, inplace=True)
df

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2004-04-23,6076,6799,5353,5353,1415109
2004-04-26,5436,5741,5261,5398,343283
2004-04-27,5452,5482,5033,5063,299489
2004-04-28,5071,5452,4995,5444,182604
2004-04-29,5117,5436,5025,5101,109911
...,...,...,...,...,...
2022-05-11,87500,87500,85500,85800,108071
2022-05-12,84700,85300,82700,83200,104007
2022-05-13,84000,87400,83500,85800,86525
2022-05-16,86900,87600,85700,85900,55383


In [3]:
from sklearn.preprocessing import MinMaxScaler

# 피처값 스케일링
scaler = MinMaxScaler()

scaled_df = scaler.fit_transform(df.drop('close', axis=1))
scaled_df = pd.DataFrame(scaled_df, columns = df.drop('close', axis=1).columns)

In [4]:
# 타켓 스케일링
scaler1 = MinMaxScaler()
scaled_df['close'] = scaler1.fit_transform(df['close'].values.reshape(-1,1))

In [5]:
scaled_df

Unnamed: 0,open,high,low,volume,close
0,0.022741,0.026226,0.019066,1.000000,0.018324
1,0.018743,0.019815,0.018467,0.241077,0.018606
2,0.018842,0.018245,0.016982,0.210068,0.016507
3,0.016462,0.018063,0.016734,0.127305,0.018894
4,0.016750,0.017967,0.016930,0.075834,0.016745
...,...,...,...,...,...
4466,0.531437,0.515237,0.540941,0.074531,0.522470
4467,0.513944,0.501906,0.522709,0.071653,0.506176
4468,0.509571,0.514631,0.527918,0.059275,0.522470
4469,0.527689,0.515843,0.542243,0.037224,0.523096


In [16]:
scaled_df.drop('close', 1)

Unnamed: 0,open,high,low,volume
0,0.02274,0.02623,0.01907,1.00000
1,0.01874,0.01981,0.01847,0.24108
2,0.01884,0.01825,0.01698,0.21007
3,0.01646,0.01806,0.01673,0.12731
4,0.01675,0.01797,0.01693,0.07583
...,...,...,...,...
4466,0.53144,0.51524,0.54094,0.07453
4467,0.51394,0.50191,0.52271,0.07165
4468,0.50957,0.51463,0.52792,0.05928
4469,0.52769,0.51584,0.54224,0.03722


In [17]:
scaled_df['close']

0      0.01832
1      0.01861
2      0.01651
3      0.01889
4      0.01674
         ...  
4466   0.52247
4467   0.50618
4468   0.52247
4469   0.52310
4470   0.53375
Name: close, Length: 4471, dtype: float64

In [6]:
#train,test 분할
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(scaled_df.drop('close', 1), scaled_df['close'], test_size=0.3, random_state=0, shuffle=False)

In [7]:
# tensorflow dataset을 활용한 시퀀스 데이터셋 구성
import tensorflow as tf
def windowed_dataset(series, window_size, batch_size, shuffle):
    series = tf.expand_dims(series, axis=-1)
    ds = tf.data.Dataset.from_tensor_slices(series)
    ds = ds.window(window_size + 1, shift=1, drop_remainder=True)
    ds = ds.flat_map(lambda w: w.batch(window_size + 1))
    if shuffle:
        ds = ds.shuffle(1000)
    ds = ds.map(lambda w: (w[:-1], w[-1]))
    return ds.batch(batch_size).prefetch(1)

In [8]:
# hyper parameter 정의 
WINDOW_SIZE=20
BATCH_SIZE=32

In [9]:
# trian_data는 학습용 데이터셋, test_data는 검증용 데이터셋
train_data = windowed_dataset(y_train, WINDOW_SIZE, BATCH_SIZE, True)
test_data = windowed_dataset(y_test, WINDOW_SIZE, BATCH_SIZE, False)


In [10]:
 # 모델 생성
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Conv1D, Lambda
from tensorflow.keras.losses import Huber
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

model = Sequential([
    # 1차원 feature map 생성
    Conv1D(filters=32, kernel_size=5,
        padding="causal",
        #활성화함수를 relu로 정의
        activation="relu",
        input_shape=[WINDOW_SIZE, 1]),
    # LSTM
    LSTM(16, activation='tanh'),    
    Dense(16, activation="relu"),
    Dense(1)])

loss = Huber()
optimizer = Adam(0.005) ##3compile 시 optimizer는 adam으로 사용
model.compile(loss=Huber(), optimizer=optimizer, metrics=['mse'])

earlystopping = EarlyStopping(monitor='val_loss', patience=10)

filename = os.path.join('tmp', 'ckeckpointer.ckpt')
checkpoint = ModelCheckpoint(filename, 
                            save_weights_only=True,
                            save_best_only=True, 
                            monitor='val_loss',
                            verbose=1)

In [11]:
# 모델 학습
history = model.fit(train_data, 
                validation_data=(test_data), 
                epochs=50,   #50번 학습
                callbacks=[checkpoint, earlystopping])

Epoch 1/50

Epoch 00001: val_loss improved from inf to 0.00241, saving model to tmp\ckeckpointer.ckpt
Epoch 2/50

Epoch 00002: val_loss improved from 0.00241 to 0.00068, saving model to tmp\ckeckpointer.ckpt
Epoch 3/50

Epoch 00003: val_loss improved from 0.00068 to 0.00051, saving model to tmp\ckeckpointer.ckpt
Epoch 4/50

Epoch 00004: val_loss improved from 0.00051 to 0.00044, saving model to tmp\ckeckpointer.ckpt
Epoch 5/50

Epoch 00005: val_loss did not improve from 0.00044
Epoch 6/50

Epoch 00006: val_loss improved from 0.00044 to 0.00041, saving model to tmp\ckeckpointer.ckpt
Epoch 7/50

Epoch 00007: val_loss did not improve from 0.00041
Epoch 8/50

Epoch 00008: val_loss improved from 0.00041 to 0.00034, saving model to tmp\ckeckpointer.ckpt
Epoch 9/50

Epoch 00009: val_loss did not improve from 0.00034
Epoch 10/50

Epoch 00010: val_loss did not improve from 0.00034
Epoch 11/50

Epoch 00011: val_loss did not improve from 0.00034
Epoch 12/50

Epoch 00012: val_loss did not improve 

In [12]:
pred = model.predict(test_data)

In [13]:
rescaled_y_test = scaler1.inverse_transform(np.array(y_test).reshape(-1, 1))
rescaled_pred = scaler1.inverse_transform(np.array(pred).reshape(-1,1))

In [14]:
# 평가지표 함수
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score

def confirm_result(y_test, y_pred):
    MAE = mean_absolute_error(y_test, y_pred)
    RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
    MSLE = mean_squared_log_error(y_test, y_pred)
    RMSLE = np.sqrt(mean_squared_log_error(y_test, y_pred))
    R2 = r2_score(y_test, y_pred)
        
    pd.options.display.float_format = '{:.5f}'.format
    Result = pd.DataFrame(data=[MAE,RMSE, RMSLE, R2],
                            index = ['MAE','RMSE', 'RMSLE', 'R2'],
                            columns=['Results'])
    return Result

In [15]:
confirm_result(rescaled_y_test[20:], rescaled_pred)

Unnamed: 0,Results
MAE,3362.0691
RMSE,4457.73061
RMSLE,0.04175
R2,0.95264
