In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import os
import warnings
warnings.filterwarnings("ignore")

# Keras 가 Tensorflow 를 벡엔드로 사용할 수 있도록 설정합니다.
os.environ["KERAS_BACKEND"] = "tensorflow"

# 실험을 재현하고 동일한 결과를 얻을 수 있는지 확인하기 위해 seed 를 설정합니다.
seed = 2019
np.random.seed(seed)
tf.set_random_seed(seed)

In [2]:
# 데이터셋 로드
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data"
secom = pd.read_csv(url, header=None, delim_whitespace=True)
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom_labels.data"
y = pd.read_csv(url, header=None, usecols=[0], squeeze=True, delim_whitespace=True)
# delim_whitespace = True : 빈 공간(' ')을 구분자로 인식하고 데이터 읽어옴
# squeeze 만약 컬럼 하나만 읽어오면 데이터 구조를 Series로 읽어옴

print('The dataset has {} observations/rows and {} variables/columns.'.format(secom.shape[0], secom.shape[1]))
print('The majority class has {} observations, minority class {}.'.format(y[y == -1].size, y[y == 1].size))
print('The dataset is imbalanced. The ratio of majority class to minority class is {%.2f}:1.' % (float(y[y == -1].size/y[y == 1].size)))

The dataset has 1567 observations/rows and 590 variables/columns.
The majority class has 1463 observations, minority class 104.
The dataset is imbalanced. The ratio of majority class to minority class is {14.07}:1.


In [3]:
dropthese = [i for i in range(590) if secom[i].std() == 0]
secom_categorical = secom.drop(dropthese, axis = 1)

In [4]:
feature_names = secom_categorical.columns
m = list((map(lambda i: sum(secom_categorical[i].isnull()), feature_names)))

In [5]:
print(len(m))
m

474


[6,
 7,
 14,
 14,
 14,
 14,
 9,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 10,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 24,
 24,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 4,
 4,
 4,
 4,
 4,
 4,
 7,
 6,
 6,
 6,
 7,
 7,
 7,
 6,
 6,
 6,
 6,
 6,
 794,
 794,
 6,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 1,
 12,
 1341,
 0,
 0,
 0,
 51,
 51,
 6,
 2,
 2,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 2,
 2,
 6,
 6,
 6,
 6,
 1018,
 1018,
 1018,
 715,
 0,
 0,
 0,
 0,
 0,
 24,
 0,
 0,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 8,
 8,
 8,
 5,
 6,
 7,
 14,
 14,
 14,
 14,
 9,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 10,
 0,
 1429,
 1429,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 4,
 7,
 6,
 6,
 6,
 7,
 7,
 7,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 1,
 12,
 1341,
 0,
 0,
 0,
 51,
 51,
 2,
 2,
 2,
 2,
 1018,
 1018,
 1018,
 715,
 0,
 0,
 0,
 0,
 0,
 24,
 0,
 0,
 8,
 8,
 8,
 5,
 6,
 7,
 14,
 

In [6]:
1567*0.6

940.1999999999999

In [7]:
m_940thresh = list(filter(lambda i: (m[i] > 940), range(secom_categorical.shape[1])))
secom_drop_940thresh = secom_categorical.dropna(subset=m_940thresh, axis=1)

In [8]:
secom_drop_940thresh

Unnamed: 0,0,2,3,4,6,7,8,9,10,11,...,576,577,582,583,584,585,586,587,588,589
0,3030.93,2187.7333,1411.1265,1.3602,97.6133,0.1242,1.5005,0.0162,-0.0034,0.9455,...,1.6765,14.9509,0.5005,0.0118,0.0035,2.3630,,,,
1,3095.78,2230.4222,1463.6606,0.8294,102.3433,0.1247,1.4966,-0.0005,-0.0148,0.9627,...,1.1065,10.9003,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.0060,208.2045
2,2932.61,2186.4111,1698.0172,1.5102,95.4878,0.1241,1.4436,0.0041,0.0013,0.9615,...,2.0952,9.2721,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,2988.72,2199.0333,909.7926,1.3204,104.2367,0.1217,1.4882,-0.0124,-0.0033,0.9629,...,1.7585,8.5831,0.4990,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
4,3032.24,2233.3667,1326.5200,1.5334,100.3967,0.1235,1.5031,-0.0031,-0.0072,0.9569,...,1.6597,10.9698,0.4800,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432
5,2946.25,2233.3667,1326.5200,1.5334,100.3967,0.1235,1.5287,0.0167,0.0055,0.9699,...,1.6679,13.7755,0.4949,0.0189,0.0044,3.8276,0.0342,0.0151,0.0052,44.0077
6,3030.27,2230.4222,1463.6606,0.8294,102.3433,0.1247,1.5816,-0.0270,0.0105,0.9591,...,1.1958,8.3645,0.5010,0.0143,0.0042,2.8515,0.0342,0.0151,0.0052,44.0077
7,3058.88,2248.9000,1004.4692,0.7884,106.2400,0.1185,1.5153,0.0157,0.0007,0.9481,...,56.4274,16.0862,0.4984,0.0106,0.0034,2.1261,0.0204,0.0194,0.0063,95.0310
8,2967.68,2248.9000,1004.4692,0.7884,106.2400,0.1185,1.5358,0.0111,-0.0066,0.9494,...,1.3248,14.2892,0.4993,0.0172,0.0046,3.4456,0.0111,0.0124,0.0045,111.6525
9,3016.11,2248.9000,1004.4692,0.7884,106.2400,0.1185,1.5381,0.0159,0.0049,0.9440,...,0.6636,7.4181,0.4967,0.0152,0.0038,3.0687,0.0212,0.0191,0.0073,90.2294


In [9]:
for column in secom_drop_940thresh:
    mean = secom_drop_940thresh[column].mean()
    secom_drop_940thresh.loc[secom_drop_940thresh[column].isnull(), column] = mean

In [10]:
secom_drop_940thresh

Unnamed: 0,0,2,3,4,6,7,8,9,10,11,...,576,577,582,583,584,585,586,587,588,589
0,3030.930000,2187.733300,1411.126500,1.360200,97.613300,0.1242,1.500500,0.016200,-0.003400,0.945500,...,1.6765,14.9509,0.5005,0.0118,0.0035,2.3630,0.021458,0.016475,0.005283,99.670066
1,3095.780000,2230.422200,1463.660600,0.829400,102.343300,0.1247,1.496600,-0.000500,-0.014800,0.962700,...,1.1065,10.9003,0.5019,0.0223,0.0055,4.4447,0.009600,0.020100,0.006000,208.204500
2,2932.610000,2186.411100,1698.017200,1.510200,95.487800,0.1241,1.443600,0.004100,0.001300,0.961500,...,2.0952,9.2721,0.4958,0.0157,0.0039,3.1745,0.058400,0.048400,0.014800,82.860200
3,2988.720000,2199.033300,909.792600,1.320400,104.236700,0.1217,1.488200,-0.012400,-0.003300,0.962900,...,1.7585,8.5831,0.4990,0.0103,0.0025,2.0544,0.020200,0.014900,0.004400,73.843200
4,3032.240000,2233.366700,1326.520000,1.533400,100.396700,0.1235,1.503100,-0.003100,-0.007200,0.956900,...,1.6597,10.9698,0.4800,0.4766,0.1045,99.3032,0.020200,0.014900,0.004400,73.843200
5,2946.250000,2233.366700,1326.520000,1.533400,100.396700,0.1235,1.528700,0.016700,0.005500,0.969900,...,1.6679,13.7755,0.4949,0.0189,0.0044,3.8276,0.034200,0.015100,0.005200,44.007700
6,3030.270000,2230.422200,1463.660600,0.829400,102.343300,0.1247,1.581600,-0.027000,0.010500,0.959100,...,1.1958,8.3645,0.5010,0.0143,0.0042,2.8515,0.034200,0.015100,0.005200,44.007700
7,3058.880000,2248.900000,1004.469200,0.788400,106.240000,0.1185,1.515300,0.015700,0.000700,0.948100,...,56.4274,16.0862,0.4984,0.0106,0.0034,2.1261,0.020400,0.019400,0.006300,95.031000
8,2967.680000,2248.900000,1004.469200,0.788400,106.240000,0.1185,1.535800,0.011100,-0.006600,0.949400,...,1.3248,14.2892,0.4993,0.0172,0.0046,3.4456,0.011100,0.012400,0.004500,111.652500
9,3016.110000,2248.900000,1004.469200,0.788400,106.240000,0.1185,1.538100,0.015900,0.004900,0.944000,...,0.6636,7.4181,0.4967,0.0152,0.0038,3.0687,0.021200,0.019100,0.007300,90.229400


In [11]:
from sklearn.decomposition import PCA
# PCA 인스턴스 객체를 생성.
pca = PCA(n_components=35)

In [12]:
pca.fit(secom_drop_940thresh)
secom_pca = pca.transform(secom_drop_940thresh)

In [None]:
pca.explained_variance_ratio_

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
sss.get_n_splits(secom_pca, y)

In [None]:
for train_index, test_index in sss.split(secom_pca, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = secom_pca[train_index], secom_pca[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print("\n")

In [None]:
a = 0
b = 0
for i in y_test:
    if i==1:
        a = a+1
    else:
        b= b+1

print(a,b)
del a
del b

In [None]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=0, k_neighbors=10, sampling_strategy={1:1096,-1:2192})
X_res, y_res = sm.fit_resample(X_train, y_train)

In [None]:
from keras.optimizers import Adam
def mlpClassfier(shape, neurons= 20, learning_rate=0.01, activation="relu"):
    from keras.models import Sequential
    from keras.layers import Dense

    model = Sequential()
    model.add(Dense(input_dim=shape,units= neurons,activation=activation))
    model.add(Dense(1,activation="tanh"))
    optimizer = Adam(lr=learning_rate)
    model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=['accuracy'])

    return model

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
shape = [X_res.shape[1]]
model = KerasClassifier(build_fn=mlpClassfier,verbose = 0)
epochs = [30,50]
activation = ["tanh","sigmoid"]
learn_rate = [0.001, 0.01]
neurons = [30,40,50,60]
param_grid = dict(epochs=epochs, shape = shape, learning_rate = learn_rate,neurons = neurons, activation= activation)
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result = grid.fit(X_res, y_res)

In [None]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
score = grid_result.evaluate(X_test,y_test)
print("%s: %.2f%%" % (model.metrics_names[1], score[1] * 100))
y_predict = grid_result.predict(X_test)
print(y_predict)

In [None]:
def cal_confusion(y, y_test):
    for i,name in enumerate(y):
        if name<0 :
            y[i] = -1
        elif name>=0 :
            y[i] = 1

    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import matthews_corrcoef

    print(confusion_matrix(y_test, y))
    print(matthews_corrcoef(y_test,y))

In [None]:
a = confusion_matrix(y_test, y_predict)

In [None]:
tn = a[0][0]
fn = a[0][1]
fp = a[1][0]
tp = a[1][1]

In [None]:
precision = (tp/(tp+fp))*100
recall = (tp/(tp+fn))*100
acc = ((tp+tn)/(tp+tn+fp+fn))*100

In [None]:
print(precision,recall,acc)