In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import os
import warnings
warnings.filterwarnings("ignore")

# Keras 가 Tensorflow 를 벡엔드로 사용할 수 있도록 설정합니다.
os.environ["KERAS_BACKEND"] = "tensorflow"

# 실험을 재현하고 동일한 결과를 얻을 수 있는지 확인하기 위해 seed 를 설정합니다.
seed = 2019
np.random.seed(seed)
tf.set_random_seed(seed)

In [2]:
# 데이터셋 로드
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data"
secom = pd.read_csv(url, header=None, delim_whitespace=True)
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom_labels.data"
y = pd.read_csv(url, header=None, usecols=[0], squeeze=True, delim_whitespace=True)
# delim_whitespace = True : 빈 공간(' ')을 구분자로 인식하고 데이터 읽어옴
# squeeze 만약 컬럼 하나만 읽어오면 데이터 구조를 Series로 읽어옴

print('The dataset has {} observations/rows and {} variables/columns.'.format(secom.shape[0], secom.shape[1]))
print('The majority class has {} observations, minority class {}.'.format(y[y == -1].size, y[y == 1].size))
print('The dataset is imbalanced. The ratio of majority class to minority class is {%.2f}:1.' % (float(y[y == -1].size/y[y == 1].size)))

The dataset has 1567 observations/rows and 590 variables/columns.
The majority class has 1463 observations, minority class 104.
The dataset is imbalanced. The ratio of majority class to minority class is {14.07}:1.


In [3]:
dropthese = [i for i in range(590) if secom[i].std() == 0]
secom_categorical = secom.drop(dropthese, axis = 1)

In [4]:
secom_categorical.shape

(1567, 474)

In [5]:
feature_names = secom_categorical.columns
m = list((map(lambda i: sum(secom_categorical[i].isnull()), feature_names)))

In [6]:
print(len(m))
m

474


[6,
 7,
 14,
 14,
 14,
 14,
 9,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 10,
 0,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 24,
 24,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 4,
 4,
 4,
 4,
 4,
 4,
 7,
 6,
 6,
 6,
 7,
 7,
 7,
 6,
 6,
 6,
 6,
 6,
 794,
 794,
 6,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 1,
 12,
 1341,
 0,
 0,
 0,
 51,
 51,
 6,
 2,
 2,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 2,
 2,
 6,
 6,
 6,
 6,
 1018,
 1018,
 1018,
 715,
 0,
 0,
 0,
 0,
 0,
 24,
 0,
 0,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 9,
 8,
 8,
 8,
 5,
 6,
 7,
 14,
 14,
 14,
 14,
 9,
 2,
 2,
 2,
 2,
 2,
 3,
 3,
 3,
 3,
 3,
 10,
 0,
 1429,
 1429,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 4,
 7,
 6,
 6,
 6,
 7,
 7,
 7,
 6,
 6,
 6,
 6,
 6,
 6,
 6,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 24,
 1,
 12,
 1341,
 0,
 0,
 0,
 51,
 51,
 2,
 2,
 2,
 2,
 1018,
 1018,
 1018,
 715,
 0,
 0,
 0,
 0,
 0,
 24,
 0,
 0,
 8,
 8,
 8,
 5,
 6,
 7,
 14,
 

In [7]:
1567*0.6

940.1999999999999

In [8]:
m_940thresh = list(filter(lambda i: sum(secom_categorical[i].isnull()) > 940, secom_categorical.columns))
len(m_940thresh)

24

In [9]:
m_940thresh

[85,
 109,
 110,
 111,
 157,
 158,
 220,
 244,
 245,
 246,
 292,
 293,
 358,
 382,
 383,
 384,
 492,
 516,
 517,
 518,
 578,
 579,
 580,
 581]

In [10]:
secom_drop_940thresh = secom_categorical.drop(m_940thresh, 1)

In [11]:
secom_drop_940thresh.shape

(1567, 450)

In [12]:
for column in secom_drop_940thresh:
    mean = secom_drop_940thresh[column].mean()
    secom_drop_940thresh.loc[secom_drop_940thresh[column].isnull(), column] = mean

In [13]:
secom_drop_940thresh

Unnamed: 0,0,1,2,3,4,6,7,8,9,10,...,576,577,582,583,584,585,586,587,588,589
0,3030.930000,2564.00,2187.733300,1411.126500,1.360200,97.613300,0.1242,1.500500,0.016200,-0.003400,...,1.6765,14.9509,0.5005,0.0118,0.0035,2.3630,0.021458,0.016475,0.005283,99.670066
1,3095.780000,2465.14,2230.422200,1463.660600,0.829400,102.343300,0.1247,1.496600,-0.000500,-0.014800,...,1.1065,10.9003,0.5019,0.0223,0.0055,4.4447,0.009600,0.020100,0.006000,208.204500
2,2932.610000,2559.94,2186.411100,1698.017200,1.510200,95.487800,0.1241,1.443600,0.004100,0.001300,...,2.0952,9.2721,0.4958,0.0157,0.0039,3.1745,0.058400,0.048400,0.014800,82.860200
3,2988.720000,2479.90,2199.033300,909.792600,1.320400,104.236700,0.1217,1.488200,-0.012400,-0.003300,...,1.7585,8.5831,0.4990,0.0103,0.0025,2.0544,0.020200,0.014900,0.004400,73.843200
4,3032.240000,2502.87,2233.366700,1326.520000,1.533400,100.396700,0.1235,1.503100,-0.003100,-0.007200,...,1.6597,10.9698,0.4800,0.4766,0.1045,99.3032,0.020200,0.014900,0.004400,73.843200
5,2946.250000,2432.84,2233.366700,1326.520000,1.533400,100.396700,0.1235,1.528700,0.016700,0.005500,...,1.6679,13.7755,0.4949,0.0189,0.0044,3.8276,0.034200,0.015100,0.005200,44.007700
6,3030.270000,2430.12,2230.422200,1463.660600,0.829400,102.343300,0.1247,1.581600,-0.027000,0.010500,...,1.1958,8.3645,0.5010,0.0143,0.0042,2.8515,0.034200,0.015100,0.005200,44.007700
7,3058.880000,2690.15,2248.900000,1004.469200,0.788400,106.240000,0.1185,1.515300,0.015700,0.000700,...,56.4274,16.0862,0.4984,0.0106,0.0034,2.1261,0.020400,0.019400,0.006300,95.031000
8,2967.680000,2600.47,2248.900000,1004.469200,0.788400,106.240000,0.1185,1.535800,0.011100,-0.006600,...,1.3248,14.2892,0.4993,0.0172,0.0046,3.4456,0.011100,0.012400,0.004500,111.652500
9,3016.110000,2428.37,2248.900000,1004.469200,0.788400,106.240000,0.1185,1.538100,0.015900,0.004900,...,0.6636,7.4181,0.4967,0.0152,0.0038,3.0687,0.021200,0.019100,0.007300,90.229400


In [14]:
from sklearn.decomposition import PCA
# PCA 인스턴스 객체를 생성.
pca = PCA(n_components=35)

In [15]:
pca.fit(secom_drop_940thresh)
secom_pca = pca.transform(secom_drop_940thresh)

In [16]:
pca.explained_variance_ratio_.cumsum()

array([0.59266722, 0.83395676, 0.92547645, 0.94848714, 0.96308214,
       0.96827142, 0.97149832, 0.9746457 , 0.97727697, 0.97958265,
       0.98175834, 0.98382594, 0.98551675, 0.98677267, 0.98797639,
       0.9891175 , 0.99022888, 0.99115336, 0.99205826, 0.99291966,
       0.99365862, 0.99438272, 0.99504443, 0.99561719, 0.99617557,
       0.99667994, 0.99716999, 0.99763087, 0.99802504, 0.99839255,
       0.99868885, 0.9988486 , 0.99896998, 0.99908652, 0.99918096])

In [17]:
pca.explained_variance_ratio_

array([5.92667221e-01, 2.41289544e-01, 9.15196811e-02, 2.30106988e-02,
       1.45949934e-02, 5.18927688e-03, 3.22690160e-03, 3.14738695e-03,
       2.63126549e-03, 2.30568079e-03, 2.17568717e-03, 2.06760491e-03,
       1.69080392e-03, 1.25592468e-03, 1.20372003e-03, 1.14111048e-03,
       1.11138044e-03, 9.24482718e-04, 9.04891116e-04, 8.61408693e-04,
       7.38959877e-04, 7.24095393e-04, 6.61711218e-04, 5.72759006e-04,
       5.58383663e-04, 5.04370247e-04, 4.90046454e-04, 4.60879879e-04,
       3.94171186e-04, 3.67510920e-04, 2.96295489e-04, 1.59747634e-04,
       1.21382699e-04, 1.16544201e-04, 9.44348621e-05])

In [18]:
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
sss.get_n_splits(secom_pca, y)

5

In [19]:
for train_index, test_index in sss.split(secom_pca, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = secom_pca[train_index], secom_pca[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print("\n")

TRAIN: [ 932  997 1074 ...  493  648  771] TEST: [ 265  717 1160  740 1237  410 1229  777  853 1200  187  908 1393 1171
  300  786  838  515  439  952  818  923 1017  402  689 1232  108   37
  868  192  620 1187 1234 1467  186  382 1217 1419 1266 1433 1529 1502
  678 1276  142   54   15 1524 1449  714  673 1407  394 1087  549  301
 1432 1352  927 1349 1515 1028 1004 1111 1085  772 1356 1403 1397  841
 1545  825  918    8  879 1162 1188  130 1021  237 1423  712 1305 1001
 1516 1246  147  197  621  198 1115  101 1387  395 1562  677  193  514
  117  757 1226  964 1384 1447  330 1206  271  171   23 1052  984  358
  803  157  231   45 1174  949 1201  904  134 1053  961   72 1417  966
  862  425  205  388  909  600  156  941 1534 1303  445 1138 1306 1044
   39 1401  219 1459 1361  322  120  175  195  726  563 1473  657  807
  930  970 1500  962 1340  209 1131  161  571  481  888 1241  794 1297
  433 1314   78  465  412 1347  836  875 1068 1528  872  661  503  751
 1165 1270   58  643 1294  8

In [20]:
a = 0
b = 0
for i in y_test:
    if i==1:
        a = a+1
    else:
        b= b+1

print(a,b)

31 440


In [21]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=0, k_neighbors=10, sampling_strategy={1:1096,-1:2192})
X_res, y_res = sm.fit_resample(X_train, y_train)

array([[-4.53283001e+03, -2.70120922e+03,  6.24555821e+01, ...,
        -4.18707462e+01, -1.78737511e+01, -3.50206592e+01],
       [-5.35337737e+03, -8.39278142e+02,  1.30838900e+03, ...,
         2.35834270e+01, -1.56907575e+01, -5.59316906e+01],
       [ 2.33042210e+03,  1.58084673e+03,  8.13103420e+03, ...,
         1.93341235e+02,  1.10984409e+01,  1.13979771e+02],
       ...,
       [-4.45448096e+02,  4.80781193e+03, -1.02982115e+03, ...,
         3.36516810e+02, -3.52882195e+01, -3.99491737e+01],
       [-3.91249177e+03, -2.62616941e+03, -6.81207838e+02, ...,
        -2.74678162e+01, -2.04025316e+01,  1.17215178e+01],
       [ 1.12786810e+04, -3.24326317e+03,  2.29923338e+03, ...,
        -1.33139330e+01, -4.41572850e+00, -5.90038479e+01]])

In [22]:
def mlpClassfier(X_train,y_train, epoch):
    from keras.models import Sequential
    from keras.layers import Dense

    model = Sequential()
    model.add(Dense(input_dim=X_train.shape[1],units=180,activation="tanh"))
    model.add(Dense(1,activation="tanh"))
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])
    model.fit(X_train,y_train,epochs=epoch)

    return model

In [23]:
model = mlpClassfier(X_res,y_res,30)

Using TensorFlow backend.


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [24]:
score = model.evaluate(X_test,y_test)
print("%s: %.2f%%" % (model.metrics_names[1], score[1] * 100))
y_predict = model.predict(X_test)
print(y_predict)

acc: 54.78%
[[-0.6675798 ]
 [-0.07908066]
 [-0.79964805]
 [ 0.6873282 ]
 [-0.85928893]
 [-0.93242383]
 [-0.8615811 ]
 [-0.2098934 ]
 [-0.3542706 ]
 [-0.76706505]
 [-0.995931  ]
 [-0.98492837]
 [-0.91122824]
 [-0.25981826]
 [-0.7095047 ]
 [-0.8512254 ]
 [ 0.7486659 ]
 [-0.6694933 ]
 [ 0.32073775]
 [-0.8297759 ]
 [-0.6442944 ]
 [-0.39665183]
 [-0.97111696]
 [-0.6688563 ]
 [-0.6558436 ]
 [ 0.5403204 ]
 [-0.15264897]
 [-0.36530194]
 [ 0.3163061 ]
 [-0.61166954]
 [-0.94290566]
 [-0.8968594 ]
 [-0.8167537 ]
 [-0.899777  ]
 [-0.46592233]
 [-0.93594956]
 [-0.4916094 ]
 [-0.8253094 ]
 [-0.9250593 ]
 [ 0.17282161]
 [-0.94954544]
 [-0.52655756]
 [-0.9499422 ]
 [-0.658774  ]
 [-0.81536657]
 [ 0.0374088 ]
 [-0.6029957 ]
 [-0.85600746]
 [-0.9303463 ]
 [ 0.01699043]
 [-0.53955334]
 [-0.864064  ]
 [-0.9863941 ]
 [-0.95479256]
 [ 0.6857656 ]
 [-0.94896597]
 [-0.35182682]
 [-0.40254274]
 [-0.5057626 ]
 [-0.93294024]
 [ 0.41023135]
 [ 0.6924733 ]
 [-0.6770303 ]
 [ 0.17806034]
 [-0.4377676 ]
 [-0.8901658 

In [25]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef

In [26]:
def cal_confusion(y, y_test):
    for i,name in enumerate(y):
        if name<0 :
            y[i] = -1
        elif name>=0 :
            y[i] = 1

    print(confusion_matrix(y, y_test))
    print(matthews_corrcoef(y,y_test))

In [27]:
cal_confusion(y_predict, y_test)
a = confusion_matrix(y_predict,y_test)

[[342  27]
 [ 98   4]]
-0.056404064389912925


In [28]:
tn = a[0][0]
fn = a[0][1]
fp = a[1][0]
tp = a[1][1]

In [29]:
precision = (tp/(tp+fp))*100
recall = (tp/(tp+fn))*100
sensitivity = (tp/(tp+fn))*100
specitivity = (tn/(tn+fp))*100
acc = ((tp+tn)/(tp+tn+fp+fn))*100

In [30]:
print("acc : %.2f%%, | precision : %.2f%% | tpr : %.2f%% | sensitivity : %.2f%% | specitivity : %.2f%%" %(acc, precision, recall, sensitivity, specitivity))
print(acc, precision, recall, sensitivity, specitivity)

acc : 73.46%, | precision : 3.92% | tpr : 12.90% | sensitivity : 12.90% | specitivity : 77.73%
73.46072186836517 3.9215686274509802 12.903225806451612 12.903225806451612 77.72727272727272
