In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import os
import warnings
warnings.filterwarnings("ignore")

# Select TensorFlow as the Keras backend (set here, ahead of the later `keras` imports).
os.environ["KERAS_BACKEND"] = "tensorflow"

# Fix the random seeds so the experiment can be re-run and checked for identical results.
seed = 2019
for _apply_seed in (np.random.seed, tf.set_random_seed):
    _apply_seed(seed)

In [2]:
# Load the SECOM dataset: the feature table and the matching -1 / 1 labels.
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data"
# sep=r"\s+" splits fields on runs of whitespace (same parsing as delim_whitespace=True,
# without relying on that deprecated keyword).
secom = pd.read_csv(url, header=None, sep=r"\s+")
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom_labels.data"
# Only the first column is read; selecting it by label returns a Series, which is
# what squeeze=True produced, and works on pandas versions where squeeze was removed.
y = pd.read_csv(url, header=None, usecols=[0], sep=r"\s+")[0]

n_majority = y[y == -1].size
n_minority = y[y == 1].size

print('The dataset has {} observations/rows and {} variables/columns.'.format(secom.shape[0], secom.shape[1]))
print('The majority class has {} observations, minority class {}.'.format(n_majority, n_minority))
# Pure %-formatting: the previous '{%.2f}' printed literal braces around the ratio.
print('The dataset is imbalanced. The ratio of majority class to minority class is %.2f:1.' % (n_majority / n_minority))

# Columns whose standard deviation is exactly 0 hold one repeated value.
# Iterate over the actual column labels instead of assuming there are 590 of them.
dropthese = [col for col in secom.columns if secom[col].std() == 0]
secom_categorical = secom.drop(dropthese, axis=1)
print(secom_categorical.shape)

print('There are {} columns which have identical values recorded. We will drop these.'.format(len(dropthese)))
print('The data set now has {} columns.'.format(secom_categorical.shape[1]))

KeyboardInterrupt: 

In [None]:
# Count missing values per remaining column, addressing columns by label.
# After the constant columns were dropped the labels are no longer 0..n-1, so
# indexing the raw frame with range(n) counted the wrong columns and never
# looked at the highest-numbered ones.
missing_counts = secom_categorical.isnull().sum()
m = missing_counts.tolist()
# Labels (not positions) of the columns with more than 200 missing values.
m_200thresh = missing_counts[missing_counts > 200].index.tolist()
print('The number of columns with more than 200 missing values: {}'.format(len(m_200thresh)))

In [None]:
# Keep only the columns with at most 200 missing values.
# dropna(axis=1, subset=...) is not suitable here: with axis=1 the subset is read
# as ROW labels, and every column with a NaN in any of those rows gets dropped.
secom_drop_200thresh = secom_categorical.loc[:, secom_categorical.isnull().sum() <= 200]
print('No. of columns after dropping columns with more than 200 missing entries: {}'.format(secom_drop_200thresh.shape[1]))

In [None]:
from sklearn.model_selection import train_test_split
# random_state pins the split to the notebook seed so it does not depend on how
# much of the global NumPy random stream earlier cells consumed.
# stratify=y keeps the (imbalanced) class ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    secom_drop_200thresh, y, test_size=0.2, random_state=seed, stratify=y)
# The inputs are a DataFrame and a Series, so the splits are pandas objects too.
print(X_train.shape)  # (n_train_rows, n_features)
print(X_test.shape)   # (n_test_rows, n_features)
print(y_train.shape)  # (n_train_rows,)
print(y_test.shape)   # (n_test_rows,)

In [None]:
# Plot a histogram of the per-column missing-value counts in the training split.
df_X_train = pd.DataFrame(X_train)
feature_names = df_X_train.columns
m = [sum(df_X_train[name].isnull()) for name in feature_names]

fig, ax = plt.subplots()
ax.hist(m)
ax.set_title("Distribution of missing values")
ax.set_xlabel("No. of missing values in a column")
ax.set_ylabel("Columns(count)")
plt.show()

In [None]:
criteriaList = [400, 500, 600, 700, 800, 900, 1000]

# Count the missing values once instead of re-scanning every column per threshold.
missing_per_column = df_X_train.isnull().sum()
# Use the actual number of training rows for the percentage rather than a fixed 1253.
n_train_rows = len(df_X_train)

for criteria in criteriaList:
    columns_filtered = missing_per_column[missing_per_column > criteria].index.tolist()
    print('The number of columns with more than {:>4d} missing values(about {}%): {:>2d}'.format(criteria, int((criteria / n_train_rows) * 100), len(columns_filtered)))

# Wrap the splits as DataFrames, the input type fill_NaN_by_Gaussian works on.
df_X_train = pd.DataFrame(X_train)
df_X_test = pd.DataFrame(X_test)
print(df_X_train.shape)
print(df_X_test.shape)

In [None]:
def fill_NaN_by_Gaussian(df_X_train, df_X_test):
    """Replace missing values with random draws from a per-column Gaussian.

    For each column of the training frame, the mean and standard deviation of
    the observed training values are computed. Missing entries of that column
    in BOTH frames are then filled with samples from N(mean, std); the test
    frame deliberately reuses the training statistics (it is assumed to follow
    the training distribution).

    The input frames are not modified; filled copies are returned.

    Args:
        df_X_train: training features (DataFrame), may contain NaN.
        df_X_test: test features (DataFrame) with the same columns, may contain NaN.

    Returns:
        Tuple ``(filled_train, filled_test)`` of new DataFrames.

    Notes:
        * If the standard deviation is undefined (only one observed training
          value), missing entries are filled with the mean itself.
        * A column with no observed training value at all has no statistics to
          draw from; its mean is NaN and its missing entries stay NaN.
        * Draws come from the global ``np.random`` state, so seed it beforehand
          for reproducible output.
    """
    # Work on copies so the caller's frames (and anything sharing their data) stay intact.
    filled_train = df_X_train.copy()
    filled_test = df_X_test.copy()

    for column in df_X_train.columns.values:
        mean = df_X_train[column].mean()
        std = df_X_train[column].std()
        if np.isnan(std):
            # Sample std is undefined here; a NaN scale would only generate NaN.
            std = 0.0

        train_missing = df_X_train[column].isnull()
        test_missing = df_X_test[column].isnull()

        # Train is filled before test for every column, keeping the draw order stable.
        if train_missing.any():
            filled_train.loc[train_missing, column] = np.random.normal(
                mean, std, size=int(train_missing.sum()))
        if test_missing.any():
            filled_test.loc[test_missing, column] = np.random.normal(
                mean, std, size=int(test_missing.sum()))

    return (filled_train, filled_test)
# Run the Gaussian imputation on both splits and report the resulting shapes.
_imputed_frames = fill_NaN_by_Gaussian(df_X_train, df_X_test)
df_X_train, df_X_test = _imputed_frames
for _imputed in (df_X_train, df_X_test):
    print(_imputed.shape)

In [None]:
# Standardisation (zero mean, unit variance per column).
# The mean / standard deviation learned on the training columns are applied unchanged to the test columns.
def standardProcess(df_X_train, df_X_test):
    """Scale both frames with a StandardScaler fitted on the training frame only.

    Args:
        df_X_train: training features (DataFrame); used to fit the scaler.
        df_X_test: test features (DataFrame); transformed with the training parameters.

    Returns:
        Tuple ``(scaled_df_X_train, scaled_df_X_test)`` of DataFrames that keep
        the index and column labels of their respective inputs.
    """
    from sklearn.preprocessing import StandardScaler

    # Fit on the raw training values; the fitted scaler is reused for both frames.
    fitted_scaler = StandardScaler().fit(df_X_train.values)

    def _rescale(frame):
        # transform() returns a plain array, so re-attach the original labels.
        return pd.DataFrame(fitted_scaler.transform(frame),
                            index=frame.index,
                            columns=frame.columns)

    return _rescale(df_X_train), _rescale(df_X_test)

def minmaxProcess(df_X_train, df_X_test):
    """Scale both frames with a MinMaxScaler fitted on the training frame only.

    Args:
        df_X_train: training features (DataFrame); used to fit the scaler.
        df_X_test: test features (DataFrame); transformed with the training parameters.

    Returns:
        Tuple ``(scaled_df_X_train, scaled_df_X_test)`` of DataFrames that keep
        the index and column labels of their respective inputs.
    """
    from sklearn.preprocessing import MinMaxScaler

    # Learn the per-column range from the training values only.
    fitted_scaler = MinMaxScaler().fit(df_X_train.values)

    scaled_frames = []
    for frame in (df_X_train, df_X_test):
        scaled_values = fitted_scaler.transform(frame)
        # Re-attach the original index / column labels to the scaled array.
        scaled_frames.append(
            pd.DataFrame(scaled_values, index=frame.index, columns=frame.columns))

    scaled_df_X_train, scaled_df_X_test = scaled_frames
    return scaled_df_X_train, scaled_df_X_test

In [None]:
# Ask which scaler to use: "mms" -> MinMaxScaler, "std" -> StandardScaler.
select = input("사용할 Scaler 함수를 입력 / mms = MinMaxScaler / std = StandardScaler\n")

# (message to print, scaling function) for each recognised answer.
_scaler_choices = {
    "std": ("선택한 함수는 Standard Scaler", standardProcess),
    "mms": ("선택한 함수는 MinMaxScaler", minmaxProcess),
}
# Any other answer falls back to the StandardScaler.
_fallback_choice = ("다른 값 입력으로 default로 Standard Scaler를 사용합니다.", standardProcess)

_chosen_message, _chosen_scaler = _scaler_choices.get(select, _fallback_choice)
print(_chosen_message)
scaled_df_X_train, scaled_df_X_test = _chosen_scaler(df_X_train, df_X_test)

In [None]:
from sklearn.decomposition import PCA

# Build a 35-component PCA and fit it on the scaled TRAINING data only,
# so the same projection can then be applied to the scaled test data.
pca = PCA(n_components=35).fit(scaled_df_X_train)

# transform() returns ndarrays; they are wrapped back into DataFrames below.
X_train_after_PCA, X_test_after_PCA = (
    pca.transform(scaled_frame)
    for scaled_frame in (scaled_df_X_train, scaled_df_X_test)
)

# Keep the row index of the scaled frames on the projected frames.
df_X_train_after_PCA = pd.DataFrame(X_train_after_PCA, index=scaled_df_X_train.index)
df_X_test_after_PCA = pd.DataFrame(X_test_after_PCA, index=scaled_df_X_test.index)

print(*df_X_train_after_PCA.shape)

In [None]:
# The CGAN code starts below.

from keras.models import Sequential, Model
from keras.layers import Dense, LeakyReLU, BatchNormalization
from keras.layers import Input, Flatten, Embedding, multiply, Dropout
from keras.optimizers import Adam
from keras import initializers

# Length of the noise vector fed to the generator.
latent_dim = 100
# Generated samples have as many features as the PCA-projected training data.
secom_dim = df_X_train_after_PCA.shape[1]
init = initializers.RandomNormal(stddev=0.02)

# Generator network: three hidden Dense blocks (76 -> 152 -> 304 units) followed by
# an output layer of width secom_dim.
# NOTE(review): the tanh output limits every generated feature to [-1, 1] —
# confirm the PCA-projected training data actually lies in that range.
generator = Sequential([
    # Input layer and hidden layer 1 (batch norm before the activation)
    Dense(76, input_shape=(latent_dim,), kernel_initializer=init),
    BatchNormalization(),
    LeakyReLU(0.2),
    # Hidden layer 2 (activation before batch norm)
    Dense(152),
    LeakyReLU(0.2),
    BatchNormalization(),
    # Hidden layer 3 (activation before batch norm)
    Dense(304),
    LeakyReLU(0.2),
    BatchNormalization(),
    # Output layer
    Dense(secom_dim, activation='tanh'),
])



In [None]:
# Condition the generator on a class label by embedding the label and
# multiplying it element-wise into the latent vector.
num_classes = 2
# Label input: one int32 class index per sample, embedded into a latent_dim-wide
# vector. The Embedding table is created with num_classes rows.
label = Input(shape=(1,), dtype='int32')
label_embedding = Embedding(num_classes, latent_dim)(label)
label_embedding = Flatten()(label_embedding)
# Latent (noise) input
z = Input(shape=(latent_dim,))
# Merge inputs: element-wise product of z and the label embedding
input_generator = multiply([z, label_embedding])

# Feed the merged input through the Sequential generator defined earlier.
secom_generate = generator(input_generator)
# Conditional generator taking [z, label].
# NOTE(review): this rebinds the name `generator`, so re-running this cell wraps
# the already-wrapped model again — re-run the generator definition cell first.
generator = Model([z, label], secom_generate)

In [None]:
# Discriminator network: three hidden Dense blocks (76 -> 152 -> 304 units) with
# LeakyReLU activations and a single sigmoid output unit.
discriminator = Sequential([
    # Input layer and hidden layer 1
    Dense(76, input_shape=(secom_dim,), kernel_initializer=init),
    LeakyReLU(0.2),
    # Hidden layer 2
    Dense(152),
    LeakyReLU(0.2),
    # Hidden layer 3
    Dense(304),
    LeakyReLU(0.2),
    # Output layer
    Dense(1, activation='sigmoid'),
])

In [None]:
# Condition the discriminator on a class label by embedding the label and
# multiplying it element-wise into the sample.

# Label input: one int32 class index per sample, embedded into a secom_dim-wide vector.
label_d = Input(shape=(1,), dtype='int32')
label_embedding_d = Embedding(num_classes, secom_dim)(label_d)
label_embedding_d = Flatten()(label_embedding_d)

# Sample input (a secom_dim-wide feature vector; the "img" name is kept as-is)
img_d = Input(shape=(secom_dim,))

# Merge inputs: element-wise product of the sample and the label embedding
input_discriminator = multiply([img_d, label_embedding_d])

# Output of the Sequential discriminator for the merged input
validity = discriminator(input_discriminator)

# Conditional discriminator taking [sample, label]; rebinds the name `discriminator`.
discriminator = Model([img_d, label_d], validity)

# Optimizer (the same instance is passed to both compile() calls below)
opt = Adam(lr=0.0002, beta_1=0.5)

discriminator.compile(opt, loss='binary_crossentropy',
                      metrics=['binary_accuracy'])

# Set after the discriminator is compiled and before the combined model is built —
# presumably so that training d_g only updates the generator; confirm against the
# Keras version in use.
discriminator.trainable = False

# Stack generator and discriminator: [z, label] -> generated sample -> validity.
validity = discriminator([generator([z, label]), label])

# Combined model used for the generator updates.
d_g = Model([z, label], validity)

d_g.compile(opt, loss='binary_crossentropy',
            metrics=['binary_accuracy'])

In [None]:
# Training hyper-parameters.
epochs = 1000
batch_size = 64
# Amount subtracted from the "real" targets when they are used for label smoothing.
smooth = 0.1

# Target vectors for one batch: 1 = real, 0 = fake.
real = np.ones(shape=(batch_size, 1))
fake = np.zeros(shape=(batch_size, 1))

# Per-epoch loss histories.
d_loss = []
d_g_loss = []
# Plain NumPy views of the training data. `.values` replaces DataFrame.as_matrix(),
# which no longer exists in pandas 1.0+, and returns the same array.
np_X_train = df_X_train_after_PCA.values
np_y_train = pd.DataFrame(y_train).values

In [None]:
from sklearn.manifold import TSNE

In [None]:
# One-column frame holding the tag value 2 for each of 100 rows.
fake_y_train_df = pd.DataFrame([2] * 100)
# 2-D t-SNE embedder.
tsneEmbedded = TSNE(n_components=2, perplexity=30, learning_rate=200)

In [None]:
def _to_embedding_index(raw_labels):
    """Return class labels as an (n, 1) int32 column of Embedding indices.

    The label column holds -1 / 1, but an Embedding built with num_classes rows
    only accepts indices 0 .. num_classes - 1. Label 1 maps to index 1 and every
    other value to index 0, so already-encoded 0 / 1 labels pass through unchanged.
    """
    return (np.asarray(raw_labels).reshape(-1, 1) == 1).astype('int32')


def _save_minority_snapshot(epoch, samples=100, out_dir="csvdirectory"):
    """Write real vs. generated class-1 samples to CSV and save a t-SNE scatter plot.

    Args:
        epoch: epoch number, used in the output file names and the plot title.
        samples: number of samples to generate.
        out_dir: directory for the CSV / PNG files (created if missing).
    """
    os.makedirs(out_dir, exist_ok=True)

    # Generate `samples` points conditioned on class 1.
    noise = np.random.normal(loc=0, scale=1, size=(samples, latent_dim))
    minority_labels = np.ones((samples, 1), dtype='int32')
    x_fake = generator.predict([noise, minority_labels])
    # Last column is a tag: 2 marks generated rows, 1 marks real class-1 rows.
    X_y_fake = pd.DataFrame(np.column_stack([x_fake, np.full(samples, 2)]))

    # Real class-1 rows, selected on the label column only. Testing every cell of
    # a row for == 1 could match a feature value or append the same row twice.
    X_y_PCA = pd.concat([df_X_train_after_PCA, y_train], axis=1).values
    X_y_except = pd.DataFrame(X_y_PCA[X_y_PCA[:, -1] == 1])

    X_y_concat = pd.concat([X_y_except, X_y_fake], axis=0)
    X_y_concat.to_csv(os.path.join(out_dir, "csv%d.csv" % epoch))

    # Embed the features only; the tag column must not leak into t-SNE.
    features = X_y_concat.iloc[:, :-1]
    tags = X_y_concat.iloc[:, -1].values
    X_embedded = tsneEmbedded.fit_transform(features)
    xs, ys = X_embedded[:, 0], X_embedded[:, 1]

    fig, ax = plt.subplots()
    for tag, legend_label, colour in ((1, "default", "blue"), (2, "cGAN", "yellow")):
        mask = tags == tag
        if mask.any():
            # One scatter call per group instead of one call per point.
            ax.scatter(xs[mask], ys[mask], label=legend_label, c=colour)
    ax.legend()
    ax.set_title("%d" % epoch)
    fig.savefig(os.path.join(out_dir, '%dfig.png' % epoch))
    plt.show()
    # Release the figure; a snapshot is produced every 25 epochs.
    plt.close(fig)


batches_per_epoch = len(np_X_train) // batch_size
# range(epochs + 1) so that the final pass (e == epochs) also produces a snapshot.
total_passes = epochs + 1

for e in range(total_passes):
    for i in range(batches_per_epoch):
        # Train Discriminator weights
        discriminator.trainable = True

        # Real samples
        batch = slice(i * batch_size, (i + 1) * batch_size)
        X_batch = np_X_train[batch]
        real_labels = _to_embedding_index(np_y_train[batch])
        d_loss_real = discriminator.train_on_batch(x=[X_batch, real_labels],
                                                   y=real * (1 - smooth))

        # Fake samples. The noise is kept in `noise`, not `z`, so the Keras
        # Input tensor named `z` is not overwritten by this cell.
        noise = np.random.normal(loc=0, scale=1, size=(batch_size, latent_dim))
        random_labels = np.random.choice([0, 1], batch_size).reshape(-1, 1)
        X_fake = generator.predict_on_batch([noise, random_labels])

        d_loss_fake = discriminator.train_on_batch(x=[X_fake, random_labels], y=fake)

        # Discriminator loss
        d_loss_batch = 0.5 * (d_loss_real[0] + d_loss_fake[0])

        # Train Generator weights
        discriminator.trainable = False

        noise = np.random.normal(loc=0, scale=1, size=(batch_size, latent_dim))
        random_labels = np.random.choice([0, 1], batch_size).reshape(-1, 1)
        d_g_loss_batch = d_g.train_on_batch(x=[noise, random_labels], y=real)

        print(
            'epoch = %d/%d, batch = %d/%d, d_loss=%.3f, g_loss=%.3f' % (
            e + 1, total_passes, i + 1, batches_per_epoch, d_loss_batch, d_g_loss_batch[0]),
            100 * ' ',
            end='\r'
        )

    # Keep the losses of the last batch as the epoch's values.
    d_loss.append(d_loss_batch)
    d_g_loss.append(d_g_loss_batch[0])
    print('epoch = %d/%d, d_loss=%.3f, g_loss=%.3f' % (e + 1, total_passes, d_loss[-1], d_g_loss[-1]), 100 * ' ')

    if e % 25 == 0:
        _save_minority_snapshot(e)