**각 Top Features만을 사용한 RandomForestClassifier와 LSTM의 성능 비교**

In [1]:
import numpy as np, os
import pandas as pd

import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import warnings
warnings.filterwarnings(action='ignore')

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import Dropout, InputLayer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

print(tf.__version__)
print(keras.__version__)

#한글설정
import matplotlib.font_manager as fm

font_dirs = ['/usr/share/fonts/truetype/nanum', ]
font_files = fm.findSystemFonts(fontpaths=font_dirs)

for font_file in font_files:
    fm.fontManager.addfont(font_file)
    
# 한글 출력을 위해서 폰트 옵션을 설정합니다.
# "axes.unicode_minus" : 마이너스가 깨질 것을 방지

sns.set(font="NanumBarunGothic", 
        rc={"axes.unicode_minus":False},
        style='darkgrid')

2.5.0
2.5.0


# RandomForest FI Top10 list
1. Feature Importance를 구하여 Top Features 추출
2. Top Features만을 사용한 데이터로 RandomForestClassifier 학습 후 정확도 측정

In [2]:
COLS = list(pd.read_csv('/project/LSH/total_data_7727.csv')['ITEMID'].sort_values().unique())

In [3]:
import random
random.seed(42)
path = '/project/LSH/'
x =  np.load(path + 'x_(7727,4068).npy')
y = np.load(path + 'y_(7727,1).npy')

In [4]:
x.shape

(7727, 4068)

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics 
np.random.seed(42)
model = RandomForestClassifier()
model.fit(x, y)

pred = model.predict(x)

print(f'정확도 : {metrics.accuracy_score(y, pred)}')

정확도 : 1.0


In [6]:
importances = model.feature_importances_
importances

array([1.40546352e-03, 8.98727598e-04, 3.07665371e-03, ...,
       3.64771738e-04, 9.96495876e-05, 4.21937811e-05])

In [7]:
result = []
for f in range(x.shape[1]):
    result.append({'feature' : COLS[f], 'importances' : importances[f]})

In [8]:
rf_fi = pd.DataFrame(result).sort_values('importances', ascending=False)
rf_fi

Unnamed: 0,feature,importances
206,51277,0.013739
121,51006,0.009766
56,50912,0.009121
103,50983,0.007452
20,50862,0.007445
...,...,...
2281,472120006,0.000000
1604,131541070,0.000000
3217,51079079420,0.000000
1042,62401002,0.000000


In [9]:
rf_feature = rf_fi.feature[:30].to_list()
rf_feature

[51277,
 51006,
 50912,
 50983,
 50862,
 63323026201,
 51003,
 904224461,
 50882,
 50868,
 51275,
 50813,
 227194,
 51244,
 409606211,
 51274,
 50971,
 245004101,
 51256,
 51079000220,
 51493,
 55390000401,
 50960,
 50902,
 51265,
 51248,
 224275,
 51249,
 50893,
 50970]

In [10]:
#COLS에서의 index 추출 (COLS의 index가 곧 3차원데이터 feature의 index이기 때문)

rf_feature_index = [COLS.index(i) for i in rf_feature]
rf_feature_index

[206,
 121,
 56,
 103,
 20,
 3832,
 119,
 2611,
 34,
 26,
 204,
 8,
 383,
 184,
 2195,
 203,
 95,
 1903,
 193,
 3045,
 278,
 3456,
 86,
 48,
 197,
 186,
 305,
 187,
 41,
 94]

# LSTM FI Top10 list

In [11]:
lstm_fi = pd.read_csv('./data/method1_entropy.csv')
lstm_fi['abs_score'] = lstm_fi['score'].apply(lambda x:abs(x))
lstm_fi = lstm_fi.sort_values('abs_score', ascending=False)
lstm_feature = lstm_fi.feature[:30].to_list()
lstm_feature

[409606211,
 74606211,
 67434504,
 54817525,
 10019055302,
 227194,
 54858516,
 74706811,
 45006701,
 4003822,
 228125,
 50889,
 63323016501,
 58177025504,
 182138167,
 51479005520,
 51079007520,
 51200,
 406112101,
 25021011210,
 63323016101,
 58468002101,
 781155613,
 224270,
 245021201,
 51079008620,
 50383068304,
 172572810,
 63323030201,
 63323038810]

In [12]:
#COLS에서의 index 추출 (COLS의 index가 곧 3차원데이터 feature의 index이기 때문)

lstm_feature_index = [COLS.index(i) for i in lstm_feature]
lstm_feature_index

[2195,
 1276,
 1081,
 982,
 2744,
 383,
 1011,
 1306,
 845,
 469,
 392,
 37,
 3820,
 3557,
 1815,
 3283,
 3070,
 162,
 2110,
 2860,
 3818,
 3567,
 2547,
 301,
 1915,
 3076,
 2988,
 1722,
 3841,
 3852]

# With RF FI Top10

## RF

In [13]:
path = '/project/LSH/'
X =  np.load(path + 'x_(7727,4068).npy')[:,rf_feature_index]
y = np.load(path + 'y_(7727,1).npy')

idx = list(range(len(X)))
random.shuffle(idx)

i = round(X.shape[0]*0.8)
X_train, y_train = X[idx[:i],:], y[idx[:i]]
X_test, y_test = X[idx[i:],:], y[idx[i:]]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((6182, 30), (6182,), (1545, 30), (1545,))

In [14]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

pred = model.predict(X_test)

print(f'정확도 : {metrics.accuracy_score(y_test, pred)}')

정확도 : 0.7080906148867314


## LSTM

In [15]:
import random    
seed_num = 42
random.seed(seed_num)
path = '/project/LSH/'
X = np.load(path + 'x_(7727,10,4068).npy')[:,:,rf_feature_index]
y = np.load(path + 'y_(7727,1).npy')

idx = list(range(len(X)))
random.shuffle(idx)


i = round(X.shape[0]*0.8)
X_train, y_train = X[idx[:i],:,:], y[idx[:i]]
X_test, y_test = X[idx[i:],:,:], y[idx[i:]]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((6182, 10, 30), (6182,), (1545, 10, 30), (1545,))

In [16]:
# ---------------------
seed_num = 42 
# ---------------------
tf.random.set_seed(seed_num)

lstm = Sequential()
lstm.add(InputLayer(input_shape=(X_train.shape[1],X_train.shape[2])))
lstm.add(LSTM(units=128, activation='hard_sigmoid', return_sequences=True))
lstm.add(LSTM(units=64, activation='hard_sigmoid', return_sequences=True))
lstm.add(Dropout(0.2))
lstm.add(LSTM(units=64, activation='hard_sigmoid', return_sequences=True))
lstm.add(LSTM(units=32, activation='hard_sigmoid', return_sequences=False))
lstm.add(Dropout(0.2))
lstm.add(Dense(units=1, activation='sigmoid'))

from tensorflow.keras.callbacks import ModelCheckpoint
import os

MODEL_SAVE_FOLDER_PATH = './model/'
if not os.path.exists(MODEL_SAVE_FOLDER_PATH):
    os.mkdir(MODEL_SAVE_FOLDER_PATH)

model_path = MODEL_SAVE_FOLDER_PATH + 'rf_top10_seed42-{epoch:02d}-{val_loss:.4f}.hdf5'

cb_checkpoint = ModelCheckpoint(filepath=model_path, monitor='val_loss',
                                verbose=1, save_best_only=True)

early_stop = EarlyStopping(monitor='val_acc', patience=50, verbose=1, restore_best_weights=True)
lstm.compile(optimizer= keras.optimizers.Adam(learning_rate = 0.001), loss = "binary_crossentropy", metrics=['acc'])
lstm.fit(X_train, y_train, validation_split=0.25, batch_size=128, epochs=500,  callbacks=[early_stop,cb_checkpoint], shuffle=False)

Epoch 1/500

Epoch 00001: val_loss improved from inf to 0.66589, saving model to ./model/rf_top10_seed42-01-0.6659.hdf5
Epoch 2/500

Epoch 00002: val_loss improved from 0.66589 to 0.66547, saving model to ./model/rf_top10_seed42-02-0.6655.hdf5
Epoch 3/500

Epoch 00003: val_loss improved from 0.66547 to 0.64419, saving model to ./model/rf_top10_seed42-03-0.6442.hdf5
Epoch 4/500

Epoch 00004: val_loss improved from 0.64419 to 0.58553, saving model to ./model/rf_top10_seed42-04-0.5855.hdf5
Epoch 5/500

Epoch 00005: val_loss improved from 0.58553 to 0.56185, saving model to ./model/rf_top10_seed42-05-0.5619.hdf5
Epoch 6/500

Epoch 00006: val_loss improved from 0.56185 to 0.55488, saving model to ./model/rf_top10_seed42-06-0.5549.hdf5
Epoch 7/500

Epoch 00007: val_loss improved from 0.55488 to 0.55053, saving model to ./model/rf_top10_seed42-07-0.5505.hdf5
Epoch 8/500

Epoch 00008: val_loss improved from 0.55053 to 0.54650, saving model to ./model/rf_top10_seed42-08-0.5465.hdf5
Epoch 9/500



Epoch 00033: val_loss improved from 0.52610 to 0.52519, saving model to ./model/rf_top10_seed42-33-0.5252.hdf5
Epoch 34/500

Epoch 00034: val_loss did not improve from 0.52519
Epoch 35/500

Epoch 00035: val_loss improved from 0.52519 to 0.52453, saving model to ./model/rf_top10_seed42-35-0.5245.hdf5
Epoch 36/500

Epoch 00036: val_loss did not improve from 0.52453
Epoch 37/500

Epoch 00037: val_loss did not improve from 0.52453
Epoch 38/500

Epoch 00038: val_loss did not improve from 0.52453
Epoch 39/500

Epoch 00039: val_loss did not improve from 0.52453
Epoch 40/500

Epoch 00040: val_loss did not improve from 0.52453
Epoch 41/500

Epoch 00041: val_loss improved from 0.52453 to 0.52377, saving model to ./model/rf_top10_seed42-41-0.5238.hdf5
Epoch 42/500

Epoch 00042: val_loss did not improve from 0.52377
Epoch 43/500

Epoch 00043: val_loss did not improve from 0.52377
Epoch 44/500

Epoch 00044: val_loss did not improve from 0.52377
Epoch 45/500

Epoch 00045: val_loss did not improve f


Epoch 00076: val_loss did not improve from 0.52377
Epoch 77/500

Epoch 00077: val_loss did not improve from 0.52377
Epoch 78/500

Epoch 00078: val_loss did not improve from 0.52377
Epoch 79/500

Epoch 00079: val_loss did not improve from 0.52377
Epoch 80/500

Epoch 00080: val_loss did not improve from 0.52377
Epoch 81/500

Epoch 00081: val_loss did not improve from 0.52377
Epoch 82/500

Epoch 00082: val_loss did not improve from 0.52377
Epoch 83/500

Epoch 00083: val_loss did not improve from 0.52377
Epoch 84/500

Epoch 00084: val_loss did not improve from 0.52377
Epoch 85/500

Epoch 00085: val_loss did not improve from 0.52377
Epoch 86/500

Epoch 00086: val_loss did not improve from 0.52377
Epoch 87/500

Epoch 00087: val_loss did not improve from 0.52377
Epoch 88/500

Epoch 00088: val_loss did not improve from 0.52377
Epoch 89/500

Epoch 00089: val_loss did not improve from 0.52377
Epoch 90/500

Epoch 00090: val_loss did not improve from 0.52377
Epoch 91/500

Epoch 00091: val_loss di

<tensorflow.python.keras.callbacks.History at 0x7f0b68a87490>

In [17]:
from pathlib import Path
paths = sorted(Path(MODEL_SAVE_FOLDER_PATH).iterdir(), key=os.path.getmtime)[-1]
best_model_path = str(paths)
best_model_path

'model/rf_top10_seed42-41-0.5238.hdf5'

In [18]:
best_model = keras.models.load_model(best_model_path)
result = 0
for seed in range(0, 50):
    random.seed(seed)

    X = np.load(path + 'x_(7727,10,4068).npy')[:,:,rf_feature_index]
    y = np.load(path + 'y_(7727,1).npy')
    
    idx = list(range(len(x)))
    random.shuffle(idx)
    i = round(X.shape[0]*0.8)
    X_train, y_train = X[idx[:i],:,:], y[idx[:i]]
    X_test, y_test = X[idx[i:],:,:], y[idx[i:]]

    pred = best_model.predict(X_test)
    pred[pred>0.5]=1
    pred[pred<=0.5]=0
    acc = metrics.accuracy_score(y_test, pred)
    result += acc
    print(f'정확도 :{acc}, seed_num = {seed}')
print(f'평균 정확도 : {result/50}')

정확도 :0.7300970873786408, seed_num = 0
정확도 :0.7385113268608414, seed_num = 1
정확도 :0.7449838187702266, seed_num = 2
정확도 :0.7385113268608414, seed_num = 3
정확도 :0.7469255663430421, seed_num = 4
정확도 :0.7521035598705501, seed_num = 5
정확도 :0.7475728155339806, seed_num = 6
정확도 :0.7378640776699029, seed_num = 7
정확도 :0.7611650485436893, seed_num = 8
정확도 :0.7404530744336569, seed_num = 9
정확도 :0.7462783171521036, seed_num = 10
정확도 :0.7585760517799353, seed_num = 11
정확도 :0.7495145631067961, seed_num = 12
정확도 :0.7423948220064724, seed_num = 13
정확도 :0.7488673139158576, seed_num = 14
정확도 :0.7501618122977346, seed_num = 15
정확도 :0.7631067961165049, seed_num = 16
정확도 :0.7436893203883496, seed_num = 17
정확도 :0.7495145631067961, seed_num = 18
정확도 :0.7423948220064724, seed_num = 19
정확도 :0.7365695792880259, seed_num = 20
정확도 :0.7352750809061489, seed_num = 21
정확도 :0.7359223300970874, seed_num = 22
정확도 :0.7417475728155339, seed_num = 23
정확도 :0.7288025889967638, seed_num = 24
정확도 :0.7404530744336569, seed_num =

# With LSTM FI Top10

## RF

In [19]:
path = '/project/LSH/'
X =  np.load(path + 'x_(7727,4068).npy')[:,lstm_feature_index]
y = np.load(path + 'y_(7727,1).npy')

idx = list(range(len(X)))
random.shuffle(idx)

i = round(X.shape[0]*0.8)
X_train, y_train = X[idx[:i],:], y[idx[:i]]
X_test, y_test = X[idx[i:],:], y[idx[i:]]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((6182, 30), (6182,), (1545, 30), (1545,))

In [21]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

pred = model.predict(X_test)

print(f'정확도 : {metrics.accuracy_score(y_test, pred)}')

정확도 : 0.6783171521035599


## LSTM

In [22]:
import random    
seed_num = 42
random.seed(seed_num)
path = '/project/LSH/'
X = np.load(path + 'x_(7727,10,4068).npy')[:,:,lstm_feature_index]
y = np.load(path + 'y_(7727,1).npy')

idx = list(range(len(X)))
random.shuffle(idx)


i = round(X.shape[0]*0.8)
X_train, y_train = X[idx[:i],:,:], y[idx[:i]]
X_test, y_test = X[idx[i:],:,:], y[idx[i:]]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((6182, 10, 30), (6182,), (1545, 10, 30), (1545,))

In [23]:
# ---------------------
seed_num = 42 
# ---------------------
tf.random.set_seed(seed_num)

lstm = Sequential()
lstm.add(InputLayer(input_shape=(X_train.shape[1],X_train.shape[2])))
lstm.add(LSTM(units=128, activation='hard_sigmoid', return_sequences=True))
lstm.add(LSTM(units=64, activation='hard_sigmoid', return_sequences=True))
lstm.add(Dropout(0.2))
lstm.add(LSTM(units=64, activation='hard_sigmoid', return_sequences=True))
lstm.add(LSTM(units=32, activation='hard_sigmoid', return_sequences=False))
lstm.add(Dropout(0.2))
lstm.add(Dense(units=1, activation='sigmoid'))

from tensorflow.keras.callbacks import ModelCheckpoint
import os

MODEL_SAVE_FOLDER_PATH = './model/'
if not os.path.exists(MODEL_SAVE_FOLDER_PATH):
    os.mkdir(MODEL_SAVE_FOLDER_PATH)

model_path = MODEL_SAVE_FOLDER_PATH + 'lstm_top10_seed42-{epoch:02d}-{val_loss:.4f}.hdf5'

cb_checkpoint = ModelCheckpoint(filepath=model_path, monitor='val_loss',
                                verbose=1, save_best_only=True, overwrite=True,)

early_stop = EarlyStopping(monitor='val_acc', patience=50, verbose=1, restore_best_weights=True)
lstm.compile(optimizer= keras.optimizers.Adam(learning_rate = 0.001), loss = "binary_crossentropy", metrics=['acc'])
lstm.fit(X_train, y_train, validation_split=0.25, batch_size=128, epochs=500,  callbacks=[early_stop,cb_checkpoint], shuffle=False)

Epoch 1/500

Epoch 00001: val_loss improved from inf to 0.66603, saving model to ./model/lstm_top10_seed42-01-0.6660.hdf5
Epoch 2/500

Epoch 00002: val_loss did not improve from 0.66603
Epoch 3/500

Epoch 00003: val_loss improved from 0.66603 to 0.66551, saving model to ./model/lstm_top10_seed42-03-0.6655.hdf5
Epoch 4/500

Epoch 00004: val_loss improved from 0.66551 to 0.66526, saving model to ./model/lstm_top10_seed42-04-0.6653.hdf5
Epoch 5/500

Epoch 00005: val_loss improved from 0.66526 to 0.66516, saving model to ./model/lstm_top10_seed42-05-0.6652.hdf5
Epoch 6/500

Epoch 00006: val_loss improved from 0.66516 to 0.66514, saving model to ./model/lstm_top10_seed42-06-0.6651.hdf5
Epoch 7/500

Epoch 00007: val_loss did not improve from 0.66514
Epoch 8/500

Epoch 00008: val_loss did not improve from 0.66514
Epoch 9/500

Epoch 00009: val_loss improved from 0.66514 to 0.66511, saving model to ./model/lstm_top10_seed42-09-0.6651.hdf5
Epoch 10/500

Epoch 00010: val_loss improved from 0.6651


Epoch 00034: val_loss improved from 0.59993 to 0.59917, saving model to ./model/lstm_top10_seed42-34-0.5992.hdf5
Epoch 35/500

Epoch 00035: val_loss improved from 0.59917 to 0.59829, saving model to ./model/lstm_top10_seed42-35-0.5983.hdf5
Epoch 36/500

Epoch 00036: val_loss did not improve from 0.59829
Epoch 37/500

Epoch 00037: val_loss improved from 0.59829 to 0.59766, saving model to ./model/lstm_top10_seed42-37-0.5977.hdf5
Epoch 38/500

Epoch 00038: val_loss improved from 0.59766 to 0.59765, saving model to ./model/lstm_top10_seed42-38-0.5976.hdf5
Epoch 39/500

Epoch 00039: val_loss improved from 0.59765 to 0.59722, saving model to ./model/lstm_top10_seed42-39-0.5972.hdf5
Epoch 40/500

Epoch 00040: val_loss improved from 0.59722 to 0.59673, saving model to ./model/lstm_top10_seed42-40-0.5967.hdf5
Epoch 41/500

Epoch 00041: val_loss improved from 0.59673 to 0.59628, saving model to ./model/lstm_top10_seed42-41-0.5963.hdf5
Epoch 42/500

Epoch 00042: val_loss improved from 0.59628 t


Epoch 00074: val_loss did not improve from 0.59314
Epoch 75/500

Epoch 00075: val_loss did not improve from 0.59314
Epoch 76/500

Epoch 00076: val_loss did not improve from 0.59314
Epoch 77/500

Epoch 00077: val_loss did not improve from 0.59314
Epoch 78/500

Epoch 00078: val_loss did not improve from 0.59314
Epoch 79/500

Epoch 00079: val_loss did not improve from 0.59314
Epoch 80/500

Epoch 00080: val_loss did not improve from 0.59314
Epoch 81/500

Epoch 00081: val_loss did not improve from 0.59314
Epoch 82/500

Epoch 00082: val_loss did not improve from 0.59314
Epoch 83/500

Epoch 00083: val_loss did not improve from 0.59314
Epoch 84/500

Epoch 00084: val_loss did not improve from 0.59314
Epoch 85/500

Epoch 00085: val_loss did not improve from 0.59314
Epoch 86/500

Epoch 00086: val_loss did not improve from 0.59314
Epoch 87/500

Epoch 00087: val_loss did not improve from 0.59314
Epoch 88/500

Epoch 00088: val_loss did not improve from 0.59314
Epoch 89/500

Epoch 00089: val_loss di

<tensorflow.python.keras.callbacks.History at 0x7f0b482de040>

In [24]:
from pathlib import Path
paths = sorted(Path(MODEL_SAVE_FOLDER_PATH).iterdir(), key=os.path.getmtime)[-1]
best_model_path = str(paths)
best_model_path

'model/lstm_top10_seed42-53-0.5931.hdf5'

In [26]:
# best_model = keras.models.load_model('./model/lstm_top10_seed42-84-0.6057.hdf5')
best_model = keras.models.load_model(best_model_path)
result = 0
for seed in range(0, 50):
    random.seed(seed)
    X = np.load(path + 'x_(7727,10,4068).npy')[:,:,lstm_feature_index]
    y = np.load(path + 'y_(7727,1).npy')
    
    idx = list(range(len(x)))
    random.shuffle(idx)
    i = round(X.shape[0]*0.8)
    X_train, y_train = X[idx[:i],:,:], y[idx[:i]]
    X_test, y_test = X[idx[i:],:,:], y[idx[i:]]

    pred = best_model.predict(X_test)
    pred[pred>0.5]=1
    pred[pred<=0.5]=0
    acc = metrics.accuracy_score(y_test, pred)
    result += acc
    print(f'정확도 :{acc}, seed_num = {seed}')
print(f'평균 정확도 : {result/50}')

정확도 :0.6705501618122978, seed_num = 0
정확도 :0.655663430420712, seed_num = 1
정확도 :0.656957928802589, seed_num = 2
정확도 :0.6601941747572816, seed_num = 3
정확도 :0.6757281553398058, seed_num = 4
정확도 :0.6647249190938511, seed_num = 5
정확도 :0.6718446601941748, seed_num = 6
정확도 :0.6459546925566343, seed_num = 7
정확도 :0.6679611650485436, seed_num = 8
정확도 :0.6621359223300971, seed_num = 9
정확도 :0.6427184466019418, seed_num = 10
정확도 :0.6809061488673139, seed_num = 11
정확도 :0.6634304207119741, seed_num = 12
정확도 :0.6530744336569579, seed_num = 13
정확도 :0.6601941747572816, seed_num = 14
정확도 :0.683495145631068, seed_num = 15
정확도 :0.6906148867313916, seed_num = 16
정확도 :0.6614886731391586, seed_num = 17
정확도 :0.6673139158576051, seed_num = 18
정확도 :0.6595469255663431, seed_num = 19
정확도 :0.684789644012945, seed_num = 20
정확도 :0.6666666666666666, seed_num = 21
정확도 :0.654368932038835, seed_num = 22
정확도 :0.6724919093851133, seed_num = 23
정확도 :0.6705501618122978, seed_num = 24
정확도 :0.6686084142394823, seed_num = 25
정