### LSTM 을 활용한 주가 예측 모델
##### 주가 보조지표 추가

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import datetime as dt


%matplotlib inline
warnings.filterwarnings('ignore')

In [3]:
import sqlite3
con = sqlite3.connect(r"C:\Users\구남이\OneDrive\바탕 화면\활동들\한이음 멘토링\stock.db")
df = pd.read_sql("SELECT * FROM '039490'", con, index_col=None)
df.rename(columns={'index':'date'}, inplace=True)
df = df.set_index('date')
df.index = pd.DatetimeIndex(df.index)
df.sort_index(ascending=True, inplace=True)
df

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2004-04-23,6076,6799,5353,5353,1415109
2004-04-26,5436,5741,5261,5398,343283
2004-04-27,5452,5482,5033,5063,299489
2004-04-28,5071,5452,4995,5444,182604
2004-04-29,5117,5436,5025,5101,109911
...,...,...,...,...,...
2022-05-11,87500,87500,85500,85800,108071
2022-05-12,84700,85300,82700,83200,104007
2022-05-13,84000,87400,83500,85800,86525
2022-05-16,86900,87600,85700,85900,55383


In [4]:
# 이동평균선 추가
ma = [5,20,60,120]
for days in ma:
    df['ma_'+str(days)] = df['close'].rolling(window = days).mean()
df.dropna(inplace=True)
df 

Unnamed: 0_level_0,open,high,low,close,volume,ma_5,ma_20,ma_60,ma_120
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2004-10-14,3959,4066,3906,4035,171655,3940.8,3963.60,3421.083333,3346.366667
2004-10-15,4066,4096,3959,3990,128530,3959.2,3946.85,3442.283333,3335.008333
2004-10-18,3997,3997,3860,3868,77233,3944.0,3935.05,3461.383333,3322.258333
2004-10-19,3868,3914,3830,3853,63272,3935.0,3922.90,3481.183333,3312.175000
2004-10-20,3807,3853,3769,3769,118227,3903.0,3909.60,3499.450000,3298.216667
...,...,...,...,...,...,...,...,...,...
2022-05-11,87500,87500,85500,85800,108071,89460.0,94620.00,97348.333333,100015.833333
2022-05-12,84700,85300,82700,83200,104007,87480.0,93785.00,97120.000000,99788.333333
2022-05-13,84000,87400,83500,85800,86525,86360.0,93050.00,96945.000000,99603.333333
2022-05-16,86900,87600,85700,85900,55383,85740.0,92440.00,96716.666667,99435.833333


In [5]:
import ta

H, L, C, V = df['high'], df['low'], df['close'], df['volume']

# df['bol_high'] = ta.volatility.bollinger_hband(C)
# df['bol_low']  = ta.volatility.bollinger_lband(C)
df['MFI'] = ta.volume.money_flow_index(
    high=H, low=L, close=C, volume=V, fillna=True)

df['ADI'] = ta.volume.acc_dist_index(
    high=H, low=L, close=C, volume=V, fillna=True)

df['OBV'] = ta.volume.on_balance_volume(close=C, volume=V, fillna=True)
df['CMF'] = ta.volume.chaikin_money_flow(
    high=H, low=L, close=C, volume=V, fillna=True)

df['FI'] = ta.volume.force_index(close=C, volume=V, fillna=True)
df['EOM, EMV'] = ta.volume.ease_of_movement(
    high=H, low=L, volume=V, fillna=True)

df['VPT'] = ta.volume.volume_price_trend(close=C, volume=V, fillna=True)
df['NVI'] = ta.volume.negative_volume_index(close=C, volume=V, fillna=True)
df['VMAP'] = ta.volume.volume_weighted_average_price(
    high=H, low=L, close=C, volume=V, fillna=True)

# Volatility
df['ATR'] = ta.volatility.average_true_range(
    high=H, low=L, close=C, fillna=True)
df['BHB'] = ta.volatility.bollinger_hband(close=C, fillna=True)
df['BLB'] = ta.volatility.bollinger_lband(close=C, fillna=True)
df['KCH'] = ta.volatility.keltner_channel_hband(
    high=H, low=L, close=C, fillna=True)
df['KCL'] = ta.volatility.keltner_channel_lband(
    high=H, low=L, close=C, fillna=True)
df['KCM'] = ta.volatility.keltner_channel_mband(
    high=H, low=L, close=C, fillna=True)
df['DCH'] = ta.volatility.donchian_channel_hband(
    high=H, low=L, close=C, fillna=True)
df['DCL'] = ta.volatility.donchian_channel_lband(
    high=H, low=L, close=C, fillna=True)
df['DCM'] = ta.volatility.donchian_channel_mband(
    high=H, low=L, close=C, fillna=True)
df['UI'] = ta.volatility.ulcer_index(close=C, fillna=True)
# Trend
df['SMA'] = ta.trend.sma_indicator(close=C, fillna=True)
df['EMA'] = ta.trend.ema_indicator(close=C, fillna=True)
df['WMA'] = ta.trend.wma_indicator(close=C, fillna=True)
df['MACD'] = ta.trend.macd(close=C, fillna=True)
df['ADX'] = ta.trend.adx(high=H, low=L, close=C, fillna=True)
df['-VI'] = ta.trend.vortex_indicator_neg(
    high=H, low=L, close=C, fillna=True)
df['+VI'] = ta.trend.vortex_indicator_pos(
    high=H, low=L, close=C, fillna=True)
df['TRIX'] = ta.trend.trix(close=C, fillna=True)
df['MI'] = ta.trend.mass_index(high=H, low=L, fillna=True)
df['CCI'] = ta.trend.cci(high=H, low=L, close=C, fillna=True)
df['DPO'] = ta.trend.dpo(close=C, fillna=True)
df['KST'] = ta.trend.kst(close=C, fillna=True)
df['Ichimoku'] = ta.trend.ichimoku_a(high=H, low=L, fillna=True)
df['Parabolic SAR'] = ta.trend.psar_down(
    high=H, low=L, close=C, fillna=True)
df['STC'] = ta.trend.stc(close=C, fillna=True)
# Momentum
df['RSI'] = ta.momentum.rsi(close=C, fillna=True)
df['SRSI'] = ta.momentum.stochrsi(close=C, fillna=True)
df['TSI'] = ta.momentum.tsi(close=C, fillna=True)
df['UO'] = ta.momentum.ultimate_oscillator(
    high=H, low=L, close=C, fillna=True)
df['SR'] = ta.momentum.stoch(close=C, high=H, low=L, fillna=True)
df['WR'] = ta.momentum.williams_r(high=H, low=L, close=C, fillna=True)
df['AO'] = ta.momentum.awesome_oscillator(high=H, low=L, fillna=True)
df['KAMA'] = ta.momentum.kama(close=C, fillna=True)
df['ROC'] = ta.momentum.roc(close=C, fillna=True)
df['PPO'] = ta.momentum.ppo(close=C, fillna=True)
df['PVO'] = ta.momentum.pvo(volume=V, fillna=True)

df


Unnamed: 0_level_0,open,high,low,close,volume,ma_5,ma_20,ma_60,ma_120,MFI,...,SRSI,TSI,UO,SR,WR,AO,KAMA,ROC,PPO,PVO
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004-10-14,3959,4066,3906,4035,171655,3940.8,3963.60,3421.083333,3346.366667,50.000000,...,0.000000,0.000000,0.000000,80.625000,-19.375000,0.000000,4035.000000,0.000000,0.000000,0.000000
2004-10-15,4066,4096,3959,3990,128530,3959.2,3946.85,3442.283333,3335.008333,100.000000,...,0.000000,-100.000000,10.437710,44.210526,-55.789474,0.000000,4012.552770,0.000000,-0.089039,-2.042123
2004-10-18,3997,3997,3860,3868,77233,3944.0,3935.05,3461.383333,3322.258333,63.094240,...,0.000000,-100.000000,8.986175,3.389831,-96.610169,0.000000,3942.932156,0.000000,-0.400381,-6.300643
2004-10-19,3868,3914,3830,3853,63272,3935.0,3922.90,3481.183333,3312.175000,48.569734,...,0.000000,-100.000000,11.969112,8.646617,-91.353383,0.000000,3901.746168,0.000000,-0.671367,-10.668185
2004-10-20,3807,3853,3769,3769,118227,3903.0,3909.60,3499.450000,3298.216667,34.143780,...,0.000000,-100.000000,10.299003,0.000000,-100.000000,0.000000,3842.238240,0.000000,-1.046892,-11.088785
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-11,87500,87500,85500,85800,108071,89460.0,94620.00,97348.333333,100015.833333,20.752387,...,0.000000,-35.840057,35.399603,2.173913,-97.826087,-6237.352941,90199.196670,-11.180124,-2.713977,13.360541
2022-05-12,84700,85300,82700,83200,104007,87480.0,93785.00,97120.000000,99788.333333,12.838422,...,0.000000,-41.172548,30.013090,3.311258,-96.688742,-7757.058824,88144.782424,-12.421053,-3.256853,17.068667
2022-05-13,84000,87400,83500,85800,86525,86360.0,93050.00,96945.000000,99603.333333,21.838865,...,0.465648,-41.365973,35.628149,23.134328,-76.865672,-8756.470588,87818.348262,-8.917197,-3.425052,17.474909
2022-05-16,86900,87600,85700,85900,55383,85740.0,92440.00,96716.666667,99435.833333,27.820394,...,0.482154,-41.369557,35.274041,25.000000,-75.000000,-9055.294118,87435.108308,-8.519702,-3.513176,14.094875


In [6]:
df.drop('close', axis=1)

Unnamed: 0_level_0,open,high,low,volume,ma_5,ma_20,ma_60,ma_120,MFI,ADI,...,SRSI,TSI,UO,SR,WR,AO,KAMA,ROC,PPO,PVO
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004-10-14,3959,4066,3906,171655,3940.8,3963.60,3421.083333,3346.366667,50.000000,1.051387e+05,...,0.000000,0.000000,0.000000,80.625000,-19.375000,0.000000,4035.000000,0.000000,0.000000,0.000000
2004-10-15,4066,4096,3959,128530,3959.2,3946.85,3442.283333,3335.008333,100.000000,3.477555e+04,...,0.000000,-100.000000,10.437710,44.210526,-55.789474,0.000000,4012.552770,0.000000,-0.089039,-2.042123
2004-10-18,3997,3997,3860,77233,3944.0,3935.05,3461.383333,3322.258333,63.094240,-3.343754e+04,...,0.000000,-100.000000,8.986175,3.389831,-96.610169,0.000000,3942.932156,0.000000,-0.400381,-6.300643
2004-10-19,3868,3914,3830,63272,3935.0,3922.90,3481.183333,3312.175000,48.569734,-6.206059e+04,...,0.000000,-100.000000,11.969112,8.646617,-91.353383,0.000000,3901.746168,0.000000,-0.671367,-10.668185
2004-10-20,3807,3853,3769,118227,3903.0,3909.60,3499.450000,3298.216667,34.143780,-1.802876e+05,...,0.000000,-100.000000,10.299003,0.000000,-100.000000,0.000000,3842.238240,0.000000,-1.046892,-11.088785
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-11,87500,87500,85500,108071,89460.0,94620.00,97348.333333,100015.833333,20.752387,1.167620e+07,...,0.000000,-35.840057,35.399603,2.173913,-97.826087,-6237.352941,90199.196670,-11.180124,-2.713977,13.360541
2022-05-12,84700,85300,82700,104007,87480.0,93785.00,97120.000000,99788.333333,12.838422,1.161220e+07,...,0.000000,-41.172548,30.013090,3.311258,-96.688742,-7757.058824,88144.782424,-12.421053,-3.256853,17.068667
2022-05-13,84000,87400,83500,86525,86360.0,93050.00,96945.000000,99603.333333,21.838865,1.162773e+07,...,0.465648,-41.365973,35.628149,23.134328,-76.865672,-8756.470588,87818.348262,-8.917197,-3.425052,17.474909
2022-05-16,86900,87600,85700,55383,85740.0,92440.00,96716.666667,99435.833333,27.820394,1.158401e+07,...,0.482154,-41.369557,35.274041,25.000000,-75.000000,-9055.294118,87435.108308,-8.519702,-3.513176,14.094875


In [7]:
from sklearn.preprocessing import MinMaxScaler

# 피처값 스케일링
scaler = MinMaxScaler()

scaled_df = scaler.fit_transform(df.drop('close', axis=1))
scaled_df = pd.DataFrame(scaled_df, columns = df.drop('close', axis=1).columns)

In [8]:
# 타켓 스케일링
scaler1 = MinMaxScaler()
scaled_df['close'] = scaler1.fit_transform(df['close'].values.reshape(-1,1))

In [9]:
scaled_df

Unnamed: 0,open,high,low,volume,ma_5,ma_20,ma_60,ma_120,MFI,ADI,...,TSI,UO,SR,WR,AO,KAMA,ROC,PPO,PVO,close
0,0.003777,0.004016,0.003642,0.119625,0.003413,0.003428,0.000000,0.000860,0.500000,0.294518,...,0.565490,0.000000,0.806250,0.806250,0.481422,0.003620,0.337362,0.441245,0.381687,0.004173
1,0.004449,0.004199,0.003990,0.089071,0.003534,0.003313,0.000154,0.000774,1.000000,0.291081,...,0.000000,0.117365,0.442105,0.442105,0.481422,0.003467,0.337362,0.438104,0.364531,0.003890
2,0.004015,0.003596,0.003341,0.052727,0.003434,0.003232,0.000294,0.000677,0.630942,0.287749,...,0.000000,0.101043,0.033898,0.033898,0.481422,0.002991,0.337362,0.427118,0.328756,0.003121
3,0.003205,0.003090,0.003144,0.042836,0.003374,0.003148,0.000438,0.000601,0.485697,0.286351,...,0.000000,0.134584,0.086466,0.086466,0.481422,0.002710,0.337362,0.417556,0.292066,0.003026
4,0.002821,0.002718,0.002745,0.081771,0.003163,0.003057,0.000571,0.000495,0.341438,0.280576,...,0.000000,0.115805,0.000000,0.000000,0.481422,0.002303,0.337362,0.404306,0.288532,0.002496
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4347,0.528723,0.512472,0.538159,0.074576,0.567238,0.626130,0.684249,0.734314,0.207524,0.859703,...,0.362818,0.398043,0.021739,0.021739,0.334290,0.592242,0.233640,0.345484,0.493925,0.519628
4348,0.511128,0.499065,0.519817,0.071697,0.554184,0.620395,0.682586,0.732588,0.128384,0.856577,...,0.332663,0.337476,0.033113,0.033113,0.298442,0.578207,0.222128,0.326329,0.525076,0.503237
4349,0.506730,0.511862,0.525057,0.059311,0.546800,0.615346,0.681311,0.731184,0.218389,0.857336,...,0.331569,0.400613,0.231343,0.231343,0.274867,0.575977,0.254634,0.320394,0.528489,0.519628
4350,0.524953,0.513081,0.539469,0.037247,0.542712,0.611156,0.679647,0.729913,0.278204,0.855200,...,0.331549,0.396631,0.250000,0.250000,0.267818,0.573359,0.258322,0.317284,0.500094,0.520258


In [10]:
#train,test 분할
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(scaled_df.drop('close', 1), scaled_df['close'], test_size=0.3, random_state=0, shuffle=False)
    

In [11]:
# tensorflow dataset을 활용한 시퀀스 데이터셋 구성
import tensorflow as tf
def windowed_dataset(series, window_size, batch_size, shuffle):
    series = tf.expand_dims(series, axis=-1)
    ds = tf.data.Dataset.from_tensor_slices(series)
    ds = ds.window(window_size + 1, shift=1, drop_remainder=True)
    ds = ds.flat_map(lambda w: w.batch(window_size + 1))
    if shuffle:
        ds = ds.shuffle(1000)
    ds = ds.map(lambda w: (w[:-1], w[-1]))
    return ds.batch(batch_size).prefetch(1)

In [12]:
# hyper parameter 정의 
WINDOW_SIZE=20
BATCH_SIZE=32

In [13]:
# trian_data는 학습용 데이터셋, test_data는 검증용 데이터셋
train_data = windowed_dataset(y_train, WINDOW_SIZE, BATCH_SIZE, True)
test_data = windowed_dataset(y_test, WINDOW_SIZE, BATCH_SIZE, False)

In [14]:
 # 모델 생성
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Conv1D, Lambda
from tensorflow.keras.losses import Huber
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

model = Sequential([
    # 1차원 feature map 생성
    Conv1D(filters=32, kernel_size=5,
        padding="causal",
        #활성화함수를 relu로 정의
        activation="relu",
        input_shape=[WINDOW_SIZE, 1]),
    # LSTM
    LSTM(16, activation='tanh'),    
    Dense(16, activation="relu"),
    Dense(1)])

loss = Huber()
optimizer = Adam(0.005) ##3compile 시 optimizer는 adam으로 사용
model.compile(loss=Huber(), optimizer=optimizer, metrics=['mse'])

earlystopping = EarlyStopping(monitor='val_loss', patience=10)

filename = os.path.join('tmp', 'ckeckpointer.ckpt')
checkpoint = ModelCheckpoint(filename, 
                            save_weights_only=True,
                            save_best_only=True, 
                            monitor='val_loss',
                            verbose=1)

In [15]:
# 모델 학습
history = model.fit(train_data, 
                validation_data=(test_data), 
                epochs=50,   #50번 학습
                callbacks=[checkpoint, earlystopping])

Epoch 1/50

Epoch 00001: val_loss improved from inf to 0.00148, saving model to tmp\ckeckpointer.ckpt
Epoch 2/50

Epoch 00002: val_loss improved from 0.00148 to 0.00112, saving model to tmp\ckeckpointer.ckpt
Epoch 3/50

Epoch 00003: val_loss improved from 0.00112 to 0.00065, saving model to tmp\ckeckpointer.ckpt
Epoch 4/50

Epoch 00004: val_loss improved from 0.00065 to 0.00053, saving model to tmp\ckeckpointer.ckpt
Epoch 5/50

Epoch 00005: val_loss improved from 0.00053 to 0.00036, saving model to tmp\ckeckpointer.ckpt
Epoch 6/50

Epoch 00006: val_loss improved from 0.00036 to 0.00033, saving model to tmp\ckeckpointer.ckpt
Epoch 7/50

Epoch 00007: val_loss improved from 0.00033 to 0.00026, saving model to tmp\ckeckpointer.ckpt
Epoch 8/50

Epoch 00008: val_loss improved from 0.00026 to 0.00024, saving model to tmp\ckeckpointer.ckpt
Epoch 9/50

Epoch 00009: val_loss improved from 0.00024 to 0.00021, saving model to tmp\ckeckpointer.ckpt
Epoch 10/50

Epoch 00010: val_loss did not improve

In [16]:
pred = model.predict(test_data)

In [17]:
rescaled_y_test = scaler1.inverse_transform(np.array(y_test).reshape(-1, 1))
rescaled_pred = scaler1.inverse_transform(np.array(pred).reshape(-1,1))

In [18]:
# 평가지표 함수
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score

def confirm_result(y_test, y_pred):
    MAE = mean_absolute_error(y_test, y_pred)
    RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
    MSLE = mean_squared_log_error(y_test, y_pred)
    RMSLE = np.sqrt(mean_squared_log_error(y_test, y_pred))
    R2 = r2_score(y_test, y_pred)
        
    pd.options.display.float_format = '{:.5f}'.format
    Result = pd.DataFrame(data=[MAE,RMSE, RMSLE, R2],
                            index = ['MAE','RMSE', 'RMSLE', 'R2'],
                            columns=['Results'])
    return Result

In [19]:
confirm_result(rescaled_y_test[20:], rescaled_pred)

Unnamed: 0,Results
MAE,1977.3025
RMSE,2858.21843
RMSLE,0.02755
R2,0.98041
