<a href="https://colab.research.google.com/github/shhoff/shhoff/blob/main/mcd_vae.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import random
import argparse
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import set_random_seed

from keras import regularizers
from keras import backend as K
from keras.models import Model
from keras.utils import plot_model
from keras.losses import mse, binary_crossentropy
from keras.layers import Lambda, Input, Dense, Dropout

# GLOBAL_SEED drives every library-level RNG; LOCAL_SEED is reserved for
# scikit-learn's `random_state` arguments further down the notebook.
GLOBAL_SEED, LOCAL_SEED = 1, 42

# Seed TensorFlow, NumPy and the stdlib RNG (in that order) with the same value.
for _seed_fn in (set_random_seed, np.random.seed, random.seed):
    _seed_fn(GLOBAL_SEED)

# Access Google Drive

In [None]:
# Define PATH to file
# `path` is both the input CSV location and the prefix for every file this
# notebook writes (path + '_For_Test.csv', path + 'For_training.csv', ...).
# Exactly one assignment should be active; the commented lines are the other
# datasets this notebook has been run against.
# NOTE(review): the path is relative ('gdrive/My Drive/...'), so it presumably
# expects Google Drive to be mounted in the working directory - confirm.
path = 'gdrive/My Drive/Generators/DataSets/Selected/breast-cancer-wisconsin/wdbc.data'
# path = 'gdrive/My Drive/Generators/DataSets/Selected/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
# path = 'gdrive/My Drive/Generators/DataSets/Selected/balance-scale/balance-scale.data'
# path = 'gdrive/My Drive/Generators/DataSets/Selected/pima-indians-diabetes/pima-indians-diabetes.data'
# path = 'gdrive/My Drive/Generators/DataSets/Selected/tic-tac-toe/tic-tac-toe.data'
# path = 'gdrive/My Drive/Generators/DataSets/Selected/annealing/anneal.data'
# path = 'gdrive/My Drive/Generators/DataSets/Selected/breast-cancer/breast-cancer.data'
# path = 'gdrive/My Drive/Generators/DataSets/Selected/cylinder-bands/bands.data'
# path = 'gdrive/My Drive/Generators/DataSets/Selected/credit-screening/crx.data'
# path = 'gdrive/My Drive/Generators/DataSets/Selected/statlog/australian/australian.dat'
# path = 'gdrive/My Drive/Generators/DataSets/Selected/statlog/german/german.data'
# path = 'gdrive/My Drive/Generators/DataSets/Selected/statlog/german/german.data-numeric'
# path = 'gdrive/My Drive/Generators/DataSets/Selected/spectrometer/lrs.data'
# path = 'gdrive/My Drive/Generators/DataSets/Selected/soybean/soybean-large.data'

In [None]:
# Width of the first hidden Dense layer passed to both VAE and DropoutVAE;
# the second hidden layer in each model uses half of this value.
intermediate_dim = 512

# Read Data

In [None]:
# Re-seed every RNG so this cell behaves the same when re-run on its own.
set_random_seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)
import pandas as pd
# Only '?' and a single space count as missing values; pandas' built-in NA
# strings are switched off below through keep_default_na=False.
na_values = {'?', ' '}
df = pd.read_csv(path,
                 sep=',',
                 header=None,
                 na_filter=True, 
                 verbose=False, 
                 skip_blank_lines=True, 
                 na_values=na_values,
                 keep_default_na=False)
print('Origin dataset:')                 
print(df.head())
# Drop N/A 
df.dropna(axis=0, how='any', inplace=True)
# NOTE(review): this replacement runs AFTER dropna, so any 'U' cell becomes a
# NaN that is never dropped and flows into the later encoding/scaling cells.
# Confirm whether it was meant to run before dropna.
df.replace('U',np.NaN, inplace=True)
# print(df.head())

# For Breast cancer 
# Drop ID column
# (column label 0 is the first column because the file is read with header=None)
df.drop([0], axis=1, inplace=True)

# For German; 20 attribute; the last columns
# df.drop([20], axis=1, inplace=True)

df = df.reset_index(drop=True)
print("After remove")
print(df.head())

# Rename the remaining columns positionally to 'X0', 'X1', ...
col_names = list(df)
new_names = {}
for i, name in enumerate(col_names):
    new_names[name] = 'X' + str(i)
df.rename(columns=new_names, inplace=True)


# For soybean
# colnums = len(df.columns)
# for i in df.columns:
#     df[i] = df[i].astype('category')

# For Pima diabetes
# df['X9'] = df['X9'].astype('category')
# df['X8'] = df['X8'].astype('category')

# For Breast Cancer wincosin
# df['X9'] = df['X9'].astype('category')
# Sort columns by name; the sort is lexicographic, so 'X10' precedes 'X2'.
df = df.reindex(sorted(df.columns), axis=1)
print(df.head())

Origin dataset:
         0  1      2      3       4   ...      27      28      29      30       31
0    842302  M  17.99  10.38  122.80  ...  0.6656  0.7119  0.2654  0.4601  0.11890
1    842517  M  20.57  17.77  132.90  ...  0.1866  0.2416  0.1860  0.2750  0.08902
2  84300903  M  19.69  21.25  130.00  ...  0.4245  0.4504  0.2430  0.3613  0.08758
3  84348301  M  11.42  20.38   77.58  ...  0.8663  0.6869  0.2575  0.6638  0.17300
4  84358402  M  20.29  14.34  135.10  ...  0.2050  0.4000  0.1625  0.2364  0.07678

[5 rows x 32 columns]
After remove
  1      2      3       4       5   ...      27      28      29      30       31
0  M  17.99  10.38  122.80  1001.0  ...  0.6656  0.7119  0.2654  0.4601  0.11890
1  M  20.57  17.77  132.90  1326.0  ...  0.1866  0.2416  0.1860  0.2750  0.08902
2  M  19.69  21.25  130.00  1203.0  ...  0.4245  0.4504  0.2430  0.3613  0.08758
3  M  11.42  20.38   77.58   386.1  ...  0.8663  0.6869  0.2575  0.6638  0.17300
4  M  20.29  14.34  135.10  1297.0  ...  0.20

In [None]:
# Latent size is half the number of columns, with a floor of 2.
latent_dim = max(len(df.columns) // 2, 2)
print('Latent Dim = ', latent_dim)

Latent Dim =  15


# Split dataset

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
np.random.seed(GLOBAL_SEED)
vals = np.copy(df.values)
total_nums = vals.shape[0]

# 50/50 shuffled split: one half trains the generators, the other half is
# written to disk as the held-out evaluation set.
df_train, df_validation = train_test_split(
    df,
    shuffle=True,
    random_state=LOCAL_SEED,
    test_size=0.5,
)
# Write the test dataset
df_validation = df_validation.reindex(columns=sorted(df_validation.columns))
df_validation.to_csv(path + '_For_Test.csv', index=False)
print(df_validation.head())

    X0     X1      X10     X11  ...      X6       X7       X8      X9
204  B  12.47  0.06373  0.3961  ...  0.1058  0.08005  0.03821  0.1925
70   M  18.94  0.05461  0.7888  ...  0.1029  0.10800  0.07951  0.1582
131  M  15.46  0.05796  0.4743  ...  0.1223  0.14660  0.08087  0.1931
431  B  12.40  0.07102  0.1767  ...  0.1316  0.07741  0.02799  0.1811
540  B  11.54  0.06782  0.2784  ...  0.1120  0.06737  0.02594  0.1818

[5 rows x 31 columns]


# Recognize categorical columns

In [None]:
# Continue on an independent (deep) copy of the training half so that
# df_train itself is not modified by the following cells.
df = df_train.copy()
print(df.head())

    X0     X1      X10     X11  ...       X6       X7        X8      X9
423  B  13.66  0.06181  0.2244  ...  0.11470  0.09657  0.048120  0.1848
546  B  10.32  0.06201  0.2104  ...  0.04994  0.01012  0.005495  0.1885
119  M  17.95  0.05025  0.5506  ...  0.06722  0.07293  0.055960  0.2129
386  B  12.21  0.06154  0.2666  ...  0.07823  0.06839  0.025340  0.1646
367  B  12.21  0.05916  0.2527  ...  0.07175  0.04392  0.020270  0.1695

[5 rows x 31 columns]


In [None]:
# For breast cancer
# df['X9'] = df['X9'].astype('category')
# For Pima Diabetes
# df['X8'] = df['X8'].astype('category')

# Promote every string-typed (object) column to pandas' 'category' dtype so
# the next cell can find the columns to one-hot encode via select_dtypes.
# Numeric columns are left untouched: the previous `df[i].astype('float32')`
# discarded its result and therefore never changed anything, and the bare
# `except: continue` around it could only hide genuine errors.
colnums = len(df.columns)
for col in df.columns:
    if df[col].dtype.name == 'object':
        df[col] = df[col].astype('category')
print(df.head())
print(df.describe())

    X0     X1      X10     X11  ...       X6       X7        X8      X9
423  B  13.66  0.06181  0.2244  ...  0.11470  0.09657  0.048120  0.1848
546  B  10.32  0.06201  0.2104  ...  0.04994  0.01012  0.005495  0.1885
119  M  17.95  0.05025  0.5506  ...  0.06722  0.07293  0.055960  0.2129
386  B  12.21  0.06154  0.2666  ...  0.07823  0.06839  0.025340  0.1646
367  B  12.21  0.05916  0.2527  ...  0.07175  0.04392  0.020270  0.1695

[5 rows x 31 columns]
               X1         X10         X11  ...          X7          X8          X9
count  284.000000  284.000000  284.000000  ...  284.000000  284.000000  284.000000
mean    14.382239    0.062422    0.414679  ...    0.091528    0.050420    0.181324
std      3.658528    0.007022    0.311197  ...    0.080519    0.038811    0.026450
min      8.219000    0.050250    0.111500  ...    0.000000    0.000000    0.116700
25%     11.790000    0.057260    0.238775  ...    0.029635    0.020698    0.163075
50%     13.435000    0.061270    0.333550  ... 

In [None]:
# df['X9'] = df['X9'].astype('category')
# df['X8'] = df['X8'].astype('category')
categorical = df.select_dtypes(['category']).columns
print(categorical)

# One indicator frame per categorical column, named '<column>_<level>';
# the '_' separator is what the later "restore categoricals" cells key on.
indicator_frames = [
    pd.get_dummies(df[col], prefix=col, prefix_sep='_')
    for col in categorical
]

# Append the indicators to the right of the frame, then
# drop original categorical features
df = pd.concat([df] + indicator_frames, axis=1)
df.drop(categorical, axis=1, inplace=True)

Index(['X0'], dtype='object')


In [None]:
# Persist the one-hot encoded training frame; both the VAE and the Dropout VAE
# sections re-read this exact file name.
# NOTE(review): unlike the other outputs ('_For_Test.csv', '_vae.csv', ...) this
# suffix has no leading underscore. The readers use the same spelling, so it
# works, but the naming is inconsistent.
df.to_csv(path + 'For_training.csv', index=False)

# VAE

## Split train and test data

In [None]:
set_random_seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)

# Reload the encoded training frame written by the previous section.
df = pd.read_csv(path + 'For_training.csv')
vae_train = np.copy(df.values)
# The former `vae_train.astype('float32')` line was removed: ndarray.astype
# returns a new array and the result was discarded, so it never had any effect.
# The float32 cast that matters happens on x_train / x_test in the next cell.
scaler = MinMaxScaler()
print(np.amax(vae_train[:, 2]))

# Scale every feature to [0, 1] (required by the sigmoid output layer and the
# binary cross-entropy reconstruction loss). `scaler` is reused later to map
# generated samples back to the original units.
vae_train = scaler.fit_transform(vae_train)
x_train, x_test = train_test_split(vae_train, test_size=0.5,
                                   random_state=LOCAL_SEED,
                                   shuffle=True)

print(x_train.shape)
print(x_test.shape)
print(np.amax(x_train))
print(np.amax(x_test))

2.873
(142, 32)
(142, 32)
1.0000000000000002
1.0000000000000002


In [None]:
for _seed_fn in (set_random_seed, np.random.seed, random.seed):
    _seed_fn(GLOBAL_SEED)

# Number of input features; also read by the model classes defined below.
original_dim = x_train.shape[1]

# Flatten to (n_samples, original_dim) and cast to float32 for Keras.
x_train = x_train.reshape(-1, original_dim).astype('float32')
x_test = x_test.reshape(-1, original_dim).astype('float32')
for _split in (x_train, x_test):
    print(_split.shape)

(142, 32)
(142, 32)


## Define VAE class

In [None]:
set_random_seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)
class VAE:
    """Fully connected variational autoencoder for tabular data scaled to [0, 1].

    Attributes set by the constructor:
        encoder: Keras Model mapping inputs to ``[z_mean, z_log_var, z]``.
        decoder: Keras Model mapping a latent vector to a reconstruction.
        vae:     End-to-end Model (encoder's sampled ``z`` fed to the decoder),
                 compiled with Adam and the VAE loss attached via ``add_loss``.
    """

    def __init__(self, input_shape=(original_dim,),
                 intermediate_dim=128, latent_dim=2, summary=False):
        """Build and compile the model.

        Args:
            input_shape: Shape of one sample, e.g. ``(n_features,)``.
            intermediate_dim: Units in the widest hidden layer; the adjacent
                hidden layer uses ``intermediate_dim // 2``.
            latent_dim: Size of the latent space.
            summary: If True, print the Keras summary of the full model.
        """
        self._build_model(input_shape, intermediate_dim, latent_dim, summary)

    def _build_model(self, input_shape, intermediate_dim, latent_dim,
                     summary=False):
        """Create encoder, decoder and the compiled end-to-end model."""
        # Take the feature count from `input_shape` instead of the notebook
        # global `original_dim`, so the output layer and the loss scaling always
        # match the declared input even when a non-default shape is passed.
        feature_dim = input_shape[-1]

        # ---- encoder ----
        inputs = Input(shape=input_shape, name='encoder_input')
        x = Dense(intermediate_dim, activation='relu')(inputs)
        x = Dense(intermediate_dim // 2, activation='relu')(x)

        z_mean = Dense(latent_dim, name='z_mean')(x)
        z_log_var = Dense(latent_dim, name='z_log_var')(x)

        # Reparameterisation trick: z = mean + sigma * epsilon.
        z = Lambda(self.sampling, output_shape=(latent_dim,),
                   name='z')([z_mean, z_log_var])

        self.encoder = Model(inputs, [z_mean, z_log_var, z],
                             name='encoder')

        # ---- decoder (mirror of the encoder, sigmoid output in [0, 1]) ----
        latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
        x = Dense(intermediate_dim // 2, activation='relu')(latent_inputs)
        x = Dense(intermediate_dim, activation='relu')(x)
        outputs = Dense(feature_dim, activation='sigmoid')(x)

        self.decoder = Model(latent_inputs, outputs, name='decoder')

        # ---- end-to-end model: decode the sampled z (encoder output #2) ----
        outputs = self.decoder(self.encoder(inputs)[2])
        self.vae = Model(inputs, outputs, name='vae_mlp')

        # binary_crossentropy averages over features; rescale it to a sum.
        reconstruction_loss = binary_crossentropy(inputs, outputs)
        reconstruction_loss *= feature_dim
        # KL divergence between N(z_mean, exp(z_log_var)) and N(0, I).
        kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
        kl_loss = K.sum(kl_loss, axis=-1)
        kl_loss *= -0.5

        vae_loss = K.mean(reconstruction_loss + kl_loss)

        self.vae.add_loss(vae_loss)
        self.vae.compile(optimizer='adam')
        if summary:
            # summary() prints the table itself and returns None, so wrapping
            # it in print() only added a stray "None" line to the output.
            self.vae.summary()

    def sampling(self, args):
        """Draw z ~ N(z_mean, exp(z_log_var)) with the reparameterisation trick.

        Args:
            args: Pair ``(z_mean, z_log_var)`` of backend tensors.
        """
        z_mean, z_log_var = args
        batch = K.shape(z_mean)[0]
        dim = K.int_shape(z_mean)[1]
        epsilon = K.random_normal(shape=(batch, dim))
        return z_mean + K.exp(0.5 * z_log_var) * epsilon

    def fit(self, x_train, x_test, epochs=100, batch_size=100,
            verbose=1):
        """Train on ``x_train`` and report the loss on ``x_test`` each epoch.

        No targets are passed because the loss was attached with ``add_loss``.
        """
        self.vae.fit(x_train,
                     shuffle=True,
                     epochs=epochs,
                     batch_size=batch_size,
                     verbose=verbose,
                     validation_data=(x_test, None))

    def encoder_predict(self, x_test, batch_size=100):
        """Return the encoder outputs ``[z_mean, z_log_var, z]`` for ``x_test``."""
        return self.encoder.predict(x_test,
                                    batch_size=batch_size)

    def generate(self, latent_val, batch_size=100):
        """Decode latent vectors into feature space (still scaled to [0, 1]).

        ``batch_size`` is now forwarded to Keras; it was silently ignored before.
        """
        return self.decoder.predict(latent_val, batch_size=batch_size)

    def predict(self, x_test, batch_size=1):
        """Encode, sample and decode ``x_test`` in one pass.

        NOTE(review): ``batch_size`` is accepted for interface compatibility but
        is not forwarded, so Keras' own default batch size applies.
        """
        prediction = self.vae.predict(x_test)
        return prediction

## Training VAE

Just let the last value to test

In [None]:
# Sanity check before training: shape and value range of the scaled inputs.
for _stat in (x_train.shape, np.amax(x_train), np.amin(x_train)):
    print(_stat)

(142, 32)
1.0
0.0


In [None]:
# Re-seed immediately before model construction so weight initialisation and
# batch shuffling start from the same RNG state on every run of this cell.
set_random_seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)

# Build the plain VAE with the notebook-level layer sizes and train it;
# x_test is passed to fit() only as validation data.
vae = VAE(intermediate_dim=intermediate_dim, latent_dim=latent_dim)
vae.fit(x_train, x_test, epochs=150)

W0614 07:22:30.286930 140529874950016 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0614 07:22:30.299089 140529874950016 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0614 07:22:30.300383 140529874950016 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0614 07:22:30.351479 140529874950016 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4115: The name tf.random_normal is deprecated. Please use tf.random.normal instead.

W0614 07:22:30.425449 140529874950016 deprecation_wra

Train on 142 samples, validate on 142 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
E

## Generate data with VAE

In [None]:
for _seed_fn in (set_random_seed, np.random.seed, random.seed):
    _seed_fn(GLOBAL_SEED)

x_test = x_test.reshape((-1, original_dim))
# The encoder returns [z_mean, z_log_var, z]; stacking them gives an array of
# shape (3, n_samples, latent_dim).
x_test_encoded = np.asarray(vae.encoder.predict(x_test))

print(x_test_encoded.shape)

(3, 142, 15)


In [None]:
for _seed_fn in (set_random_seed, np.random.seed, random.seed):
    _seed_fn(GLOBAL_SEED)

# Number of synthetic rows generated per encoded test row.
total_nums = 2
z_means, z_log_vars = x_test_encoded[0], x_test_encoded[1]
latent_size = x_test_encoded.shape[2]

results = []
for i in range(x_test_encoded.shape[1]):
    # Sample z = mean + exp(0.5 * log_var) * eps, with one fresh eps per row.
    latent_gen = np.asarray([
        z_means[i, :]
        + np.exp(z_log_vars[i, :]*0.5)*np.random.normal(0., 1., latent_size)
        for _ in range(total_nums)
    ])
    results.append(vae.generate(latent_gen))

# Flatten to (n_samples * total_nums, original_dim) and undo the scaling.
results = np.asarray(results).reshape((-1, original_dim))
print(results.shape)
results = scaler.inverse_transform(results)

(284, 32)


## Handling generated data

In [None]:
# Row count of the generated matrix and its first value.
print(results.shape[0])
print(results[0][0])

284
12.172525


In [None]:
# Re-attach the column names of the encoded training frame (still held in
# `df`) to the generated matrix, matching columns by position.
names = list(df)
d = {name: results[:, i] for i, name in enumerate(names)}
df = pd.DataFrame(data=d)

## Restore categorical columns from generated data

In [None]:
names = list(df)

# Group the one-hot columns by source feature. get_dummies named them
# '<feature>_<level>', so everything before the first '_' is the feature name.
# Columns are matched with startswith() rather than a substring test, so a
# prefix that merely occurs inside another column name cannot be picked up.
dummy_groups = {}
for n in names:
    if '_' in n:
        prefix = n[:n.index('_') + 1]
        dummy_groups[prefix[:-1]] = [c for c in names if c.startswith(prefix)]

for key, items in dummy_groups.items():
    # rename() returns a new frame, which avoids the SettingWithCopyWarning the
    # previous in-place rename on a slice of `df` produced. The lambda strips
    # '<feature>_' so that idxmax yields the bare level label.
    dummies = df[items].rename(columns=lambda c: c[c.index('_') + 1:])
    # The level with the largest generated indicator value wins.
    df[key] = dummies.idxmax(axis=1)
    df.drop(items, axis=1, inplace=True)
print(df.head())

          X1       X10       X11       X12  ...        X7        X8        X9  X0
0  12.172525  0.059887  0.236538  1.170456  ...  0.035399  0.025448  0.171790   B
1  14.687969  0.062780  0.468621  1.265368  ...  0.089523  0.057920  0.182368   B
2  12.299572  0.062163  0.273778  1.184938  ...  0.050540  0.030058  0.174591   B
3  12.190534  0.063317  0.266791  1.307913  ...  0.035017  0.022288  0.174360   B
4  13.374092  0.061587  0.366202  1.241324  ...  0.076827  0.049000  0.176581   B

[5 rows x 31 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [None]:
# Alphabetical column order, then write the VAE-generated dataset.
df = df.reindex(columns=sorted(df.columns))
df.to_csv(path + '_vae.csv', index=False)

# Dropout VAE

## Split train and test data

In [None]:
# Reload the encoded training frame so this section starts from the same data
# as the plain VAE section.
df = pd.read_csv(path + 'For_training.csv')
train = np.copy(df.values)
# The former `train.astype('float32')` line was removed: ndarray.astype returns
# a new array and the result was discarded, so it never had any effect. The
# float32 cast that matters happens on x_train / x_test in the next cell.
scaler = MinMaxScaler()
print(np.amax(train[:, 2]))

# Scale to [0, 1]; `scaler` is reused later to invert the generated samples.
train = scaler.fit_transform(train)
x_train, x_test = train_test_split(train, test_size=0.5,
                                   random_state=LOCAL_SEED,
                                   shuffle=True)
print(x_train.shape)
print(x_test.shape)
print(np.amax(x_train))
print(np.amax(x_test))

2.873
(142, 32)
(142, 32)
1.0000000000000002
1.0000000000000002


In [None]:
for _seed_fn in (set_random_seed, np.random.seed, random.seed):
    _seed_fn(GLOBAL_SEED)

# Number of input features; also read by the DropoutVAE class defined below.
original_dim = x_train.shape[1]

# Flatten to (n_samples, original_dim) and cast to float32 for Keras.
x_train = x_train.reshape(-1, original_dim).astype('float32')
x_test = x_test.reshape(-1, original_dim).astype('float32')
for _split in (x_train, x_test):
    print(_split.shape)

(142, 32)
(142, 32)


## Define Dropout VAE

In [None]:
from keras.regularizers import l2
from keras.losses import categorical_crossentropy
set_random_seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)

class DropoutVAE:
    """Autoencoder whose randomness comes from decoder dropout, not latent sampling.

    Differences from the plain VAE in this notebook:
      * the encoder exposes only ``[z_mean, z_log_var]`` (there is no sampled z);
      * the decoder is fed ``z_mean`` directly;
      * the decoder has Dropout after each hidden layer plus L2 regularisation.
    The KL term is still part of the loss, so ``z_log_var`` is trained through
    that term only.

    Attributes set by the constructor:
        encoder: Keras Model mapping inputs to ``[z_mean, z_log_var]``.
        decoder: Keras Model mapping a latent vector to a reconstruction.
        vae:     End-to-end Model compiled with Adam and the attached loss.
    """

    def __init__(self, input_shape=(original_dim,),
                 intermediate_dim=32, latent_dim=3, dropout=0.05,
                 summary=False):
        """Build and compile the model.

        Args:
            input_shape: Shape of one sample, e.g. ``(n_features,)``.
            intermediate_dim: Units in the widest hidden layer; the adjacent
                hidden layer uses ``intermediate_dim // 2``.
            latent_dim: Size of the latent space.
            dropout: Dropout rate applied after each hidden decoder layer.
            summary: If True, print the Keras summary of the full model.
        """
        self._build_model(input_shape, intermediate_dim,
                          latent_dim, summary, dropout)

    def _build_model(self, input_shape, intermediate_dim, latent_dim,
                     summary=False, dropout=0.05):
        """Create encoder, decoder and the compiled end-to-end model."""
        # Take the feature count from `input_shape` instead of the notebook
        # global `original_dim`, so the output layer and the loss scaling always
        # match the declared input even when a non-default shape is passed.
        feature_dim = input_shape[-1]

        # ---- encoder ----
        inputs = Input(shape=input_shape, name='encoder_input')
        x = Dense(intermediate_dim, activation='relu')(inputs)
        x = Dense(intermediate_dim // 2, activation='relu')(x)

        z_mean = Dense(latent_dim, name='z_mean')(x)
        z_log_var = Dense(latent_dim, name='z_log_var')(x)

        # No sampled z layer here (unlike the plain VAE).
        self.encoder = Model(inputs, [z_mean, z_log_var],
                             name='encoder')

        # ---- decoder: L2-regularised Dense layers with Dropout in between ----
        latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
        x = Dense(intermediate_dim // 2, activation='relu',
                  kernel_regularizer=l2(1e-4),
                  bias_regularizer=l2(1e-4))(latent_inputs)
        x = Dropout(dropout)(x)
        x = Dense(intermediate_dim, activation='relu',
                  kernel_regularizer=l2(1e-4),
                  bias_regularizer=l2(1e-4))(x)
        x = Dropout(dropout)(x)
        outputs = Dense(feature_dim, activation='sigmoid',
                        kernel_regularizer=l2(1e-4),
                        bias_regularizer=l2(1e-4))(x)

        self.decoder = Model(latent_inputs,
                             outputs,
                             name='decoder')

        # Here we take the mean (encoder output #0), not a sampled z.
        outputs = self.decoder(self.encoder(inputs)[0])
        self.vae = Model(inputs, outputs,
                         name='vae_mlp')

        # binary_crossentropy averages over features; rescale it to a sum.
        reconstruction_loss = binary_crossentropy(inputs, outputs)
        reconstruction_loss *= feature_dim
        # KL divergence between N(z_mean, exp(z_log_var)) and N(0, I).
        kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
        kl_loss = K.sum(kl_loss, axis=-1)
        kl_loss *= -0.5

        vae_loss = K.mean(reconstruction_loss + kl_loss)

        self.vae.add_loss(vae_loss)
        self.vae.compile(optimizer='adam')
        if summary:
            # summary() prints the table itself and returns None, so wrapping
            # it in print() only added a stray "None" line to the output.
            self.vae.summary()

    def fit(self, x_train, x_test, epochs=100, batch_size=100,
            verbose=1):
        """Train on ``x_train`` and report the loss on ``x_test`` each epoch.

        No targets are passed because the loss was attached with ``add_loss``.
        """
        self.vae.fit(x_train,
                     shuffle=True,
                     epochs=epochs,
                     batch_size=batch_size,
                     verbose=verbose,
                     validation_data=(x_test, None))

    def encoder_predict(self, x_test, batch_size=100):
        """Return the encoder outputs ``[z_mean, z_log_var]`` for ``x_test``."""
        return self.encoder.predict(x_test,
                                    batch_size=batch_size)

    def generate(self, latent_val, batch_size=100):
        """Decode latent vectors into feature space (still scaled to [0, 1]).

        ``batch_size`` is now forwarded to Keras; it was silently ignored before.
        """
        return self.decoder.predict(latent_val, batch_size=batch_size)

    def predict(self, x_test, batch_size=1, nums=1000):
        """Run the full model ``nums`` times and concatenate the outputs.

        Returns an array of shape ``(nums * len(x_test), n_features)`` because
        each pass is appended with ``extend``.

        NOTE(review): ``Model.predict`` presumably runs in inference mode, in
        which Dropout is inactive, so the ``nums`` passes would be identical.
        Use ``mean_predict`` for stochastic passes. ``batch_size`` is accepted
        but not forwarded.
        """
        Yt_hat = []
        for _ in range(nums):
            Yt_hat.extend(self.vae.predict(x_test))

        return np.asarray(Yt_hat)

    def mean_predict(self, x_test, batch_size=1, nums=1000):
        """Decode ``z_mean`` of ``x_test`` ``nums`` times with dropout active.

        The decoder is evaluated through a backend function with the learning
        phase set to 1 (training mode), so each pass uses a fresh dropout mask.

        Returns:
            Array of shape ``(nums, 1, len(x_test), n_features)``. The
            singleton axis exists because the backend function returns a
            one-element list of outputs. Despite the method name, the passes
            are NOT averaged.

        NOTE(review): ``batch_size`` is accepted but unused.
        """
        predict_stochastic = K.function([self.decoder.layers[0].input,
                                         K.learning_phase()],
                                        [self.decoder.get_output_at(0)])
        latents = self.encoder.predict(x_test)[0]
        Yt_hat = [predict_stochastic([latents, 1]) for _ in range(nums)]
        return np.asarray(Yt_hat)

## Train and evaluate Dropout VAE

In [None]:
# Re-seed immediately before model construction so weight initialisation and
# batch shuffling start from the same RNG state on every run of this cell.
set_random_seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
random.seed(GLOBAL_SEED)

# NOTE: this rebinds `vae`, replacing the plain VAE trained earlier.
# A dropout rate of 0.2 is used here instead of the class default of 0.05.
vae = DropoutVAE(intermediate_dim=intermediate_dim,
                 dropout=0.2, latent_dim=latent_dim,
                 summary=True)
vae.fit(x_train, x_test, epochs=150)

W0615 16:12:59.132649 140515461060480 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0615 16:12:59.145390 140515461060480 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0615 16:12:59.147334 140515461060480 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0615 16:12:59.219371 140515461060480 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0615 16:12:59.229673 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_input (InputLayer)   (None, 32)                0         
_________________________________________________________________
encoder (Model)              [(None, 16), (None, 16)]  156448    
_________________________________________________________________
decoder (Model)              (None, 32)                152352    
Total params: 308,800
Trainable params: 308,800
Non-trainable params: 0
_________________________________________________________________
None
Train on 142 samples, validate on 142 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 2

## Generate data with Dropout VAE

In [None]:
for _seed_fn in (set_random_seed, np.random.seed, random.seed):
    _seed_fn(GLOBAL_SEED)

# Shape checks: the full test matrix, and one row viewed as a batch of one.
x_test = x_test.reshape((-1, original_dim))
print(x_test.shape)
print(x_test[0].reshape(-1, original_dim).shape)

(142, 32)
(1, 32)


In [None]:
for _seed_fn in (set_random_seed, np.random.seed, random.seed):
    _seed_fn(GLOBAL_SEED)

# Number of stochastic decoder passes, i.e. synthetic rows per test row.
total_nums = 2

# Despite its name this holds DECODED samples of shape
# (total_nums, 1, n_samples, original_dim), not latent codes.
x_test_encoded = vae.mean_predict(x_test, nums=total_nums)
print(x_test_encoded.shape)

# Collapse the pass and singleton axes into rows, then undo the scaling.
n_rows = total_nums*x_test_encoded.shape[2]
results = x_test_encoded.reshape(n_rows, original_dim)
results = scaler.inverse_transform(results)
print(results.shape)

(2, 1, 142, 32)
(284, 32)


## Handling Generated data

In [None]:
# Re-attach the column names of the encoded training frame (still held in
# `df`) to the generated matrix, matching columns by position.
names = list(df)
d = {name: results[:, i] for i, name in enumerate(names)}
df = pd.DataFrame(data=d)

## Restore categorical columns from generated data

In [None]:
names = list(df)

# Group the one-hot columns by source feature. get_dummies named them
# '<feature>_<level>', so everything before the first '_' is the feature name.
# Columns are matched with startswith() rather than a substring test, so a
# prefix that merely occurs inside another column name cannot be picked up.
dummy_groups = {}
for n in names:
    if '_' in n:
        prefix = n[:n.index('_') + 1]
        dummy_groups[prefix[:-1]] = [c for c in names if c.startswith(prefix)]

for key, items in dummy_groups.items():
    # rename() returns a new frame, which avoids the SettingWithCopyWarning the
    # previous in-place rename on a slice of `df` produced. The lambda strips
    # '<feature>_' so that idxmax yields the bare level label.
    dummies = df[items].rename(columns=lambda c: c[c.index('_') + 1:])
    # The level with the largest generated indicator value wins.
    df[key] = dummies.idxmax(axis=1)
    df.drop(items, axis=1, inplace=True)
print(df.head())

          X1       X10       X11       X12  ...        X7        X8        X9  X0
0  16.453360  0.057111  0.389558  1.073235  ...  0.052454  0.033973  0.168655   B
1  12.289575  0.061267  0.193680  0.582741  ...  0.056750  0.029594  0.178056   B
2  10.439020  0.066813  0.346121  1.787749  ...  0.031502  0.016874  0.165296   B
3  12.747104  0.059958  0.361127  1.825076  ...  0.058108  0.028500  0.150278   B
4  20.998669  0.056213  0.634688  0.740990  ...  0.172398  0.103170  0.168748   M

[5 rows x 31 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [None]:
# Alphabetical column order, then write the Dropout-VAE-generated dataset.
df = df.reindex(columns=sorted(df.columns))
df.to_csv(path + '_dropout.csv', index=False)

# Encoding categorical data

In [None]:
# All three files are parsed with the same missing-value options as the raw
# dataset ('?' and ' ' are NA; pandas' default NA strings are disabled).
_read_opts = dict(na_filter=True,
                  verbose=False,
                  skip_blank_lines=True,
                  na_values=na_values,
                  keep_default_na=False)

# Held-out real data, Dropout-VAE samples and plain-VAE samples respectively.
df = pd.read_csv(path + '_For_Test.csv', **_read_opts)
df_mc = pd.read_csv(path + '_dropout.csv', **_read_opts)
df_vae = pd.read_csv(path + '_vae.csv', **_read_opts)
names = list(df)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Mark the string-typed columns of the held-out frame as categorical. The bare
# `except: continue` that used to wrap this was removed because it could only
# hide genuine errors.
colnums = len(df.columns)
for name in df.columns:
    if df[name].dtype.name == 'object':
        df[name] = df[name].astype('category')
cat_columns = df.select_dtypes(['category']).columns
print(cat_columns)

# Encode each categorical column with ONE encoder shared by the three frames so
# the integer codes are comparable across them.
for col in cat_columns:
    le = LabelEncoder()
    # Fit on the union of labels. Fitting on the held-out split alone made
    # transform() raise ValueError whenever a generated frame contained a level
    # that happened to be absent from that split. When no such level exists the
    # resulting codes are identical to the previous behaviour.
    le.fit(np.concatenate([np.asarray(df[col]),
                           np.asarray(df_mc[col]),
                           np.asarray(df_vae[col])]))
    df[col] = le.transform(df[col].values)
    df_mc[col] = le.transform(df_mc[col].values)
    df_vae[col] = le.transform(df_vae[col].values)
    

Index(['X0'], dtype='object')


In [None]:
# Alphabetical column order for all three frames before writing.
df = df.reindex(columns=sorted(df.columns))
df_mc = df_mc.reindex(columns=sorted(df_mc.columns))
df_vae = df_vae.reindex(columns=sorted(df_vae.columns))

# NOTE(review): unlike the earlier to_csv calls these omit index=False, so the
# row index is written as an unnamed first column - confirm that downstream
# readers expect it.
for _frame, _suffix in ((df, '_For_Test_encoded.csv'),
                        (df_mc, '_dropout_encoded.csv'),
                        (df_vae, '_vae_encoded.csv')):
    _frame.to_csv(path + _suffix)