# 과적합 피하기 - 초음파광물 사례
## 학습셋과 테스트셋으로 구분
## K-fold Cross Validation

In [1]:
import pandas as pd

# Load the sonar data; header=None makes pandas treat every row as data
# and label the columns with integers.
df = pd.read_csv('../dataset/sonar.csv', header=None)
# Show the first rows as the cell output.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,R


In [2]:
import numpy as np
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold

In [3]:
# Set one shared seed value for both the NumPy and TensorFlow random generators
seed = 2020
np.random.seed(seed)
tf.random.set_seed(seed)

In [4]:
# Data input: columns 0-59 become the float feature matrix,
# column 60 is kept as the raw label column.
dataset = df.to_numpy()
X = dataset[:, :60].astype(np.float64)
Y_obj = dataset[:, 60]

In [5]:
# Convert the string labels to numeric class codes, stored as float64
e = LabelEncoder()
e.fit(Y_obj)
Y = e.transform(Y_obj).astype(np.float64)

In [6]:
# Split into 5 stratified folds, shuffled with the shared seed
n_fold = 5
skf = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

In [7]:
# Empty list that will collect the accuracy values
accuracy = []

In [8]:
# Define, compile, train and score a model once per cross-validation fold.
# `train` / `test` are the index arrays yielded by skf.split for each fold.
for train, test in skf.split(X, Y):
    # A new model is built on every iteration, so each fold starts from
    # freshly initialised weights. The input width is taken from X itself
    # rather than being hard-coded.
    model = Sequential([
        Dense(24, input_dim=X.shape[1], activation='relu'),
        Dense(10, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy',
                  optimizer='adam', metrics=['accuracy'])
    model.fit(X[train], Y[train], epochs=100, batch_size=5, verbose=0)

    # Print the train/test sizes followed by this fold's accuracy.
    print(len(train), len(test), end='  ')
    # Index 1 of evaluate()'s result is the 'accuracy' metric (index 0 is
    # the loss). Keep it as a float: only the printed text is rounded to
    # four decimals, so the stored values are not truncated.
    k_accuracy = float(model.evaluate(X[test], Y[test], verbose=0)[1])
    print("%.4f" % k_accuracy)
    accuracy.append(k_accuracy)

166 42  0.8810
166 42  0.8095
166 42  0.7619
167 41  0.9024
167 41  0.8049


In [9]:
# Average the collected accuracy values; the bare expression on the last
# line is what the cell displays.
acc = np.asarray(accuracy, dtype=np.float64)
acc.mean()

0.83194