In [1]:
'''
Copyright by Steven CY Chuang.
All rights are reserved and explained by the author.
The ways to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software must be agreed by the author.


Created on July 19, 2018.

This module provides several classes of the autoencoder series.
Users can simply use these APIs without defining the network structure themselves.

@author: steven.cy.chuang
'''

'\nCopyright by Steven CY Chuang.\nAll rights are reserved and explained by the author.\nThe ways to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software must be agreed by the author.\n\n\nCreated on July 19, 2018.\n\nThis module provides several classes of the autoencoder series.\nUsers can simply use these APIs without defining the network structure themselves.\n\n@author: steven.cy.chuang\n'

In [2]:
from time import time
from keras.layers import Input, Dense, Lambda, Conv2D, Conv2DTranspose, Activation, Flatten, Reshape
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Model
from keras import backend as K
from keras import metrics

Using TensorFlow backend.


In [3]:
class VAE():
    '''
    Variational autoencoder built from fully connected (Dense) layers.

    After construction the instance exposes three Keras models:
        encoder: maps the input to a sampled latent vector.
        decoder: maps a latent vector back to the input dimension.
        autoencoder: decoder applied to the output of encoder.
    '''
    # Standard deviation of the noise drawn in sampling().
    stdEps = 1.0
    
    def __init__(self, 
                 dimInput, 
                 layerDense=[64, 2], actDense='relu',
                 ratRecon=0.998
                ):
        '''
        The basic properties and pipeline will be defined in the initialization.
        It should be noted that layerDense defines the first half (encoder) of the network. 
        The decoder mirrors that structure.
        For example, [64, 16, 2] means the nodes of the decoder will be [2, 16, 64]. 
        Also note that ratRecon=0.5 does not mean both losses contribute equally,
        because the KL loss and the reconstruction loss are not on the same scale.
        Args:
            dimInput (int): the number of input dimensions. All features are flattened as a vector.
            layerDense (list[int]): the numbers of nodes of each dense layer; the last entry is the
                latent dimension. Must not be empty. Default is [64, 2].
            actDense (string): the activation function. Default is 'relu'.
            ratRecon (float): the weight of the reconstruction loss; the KL loss is weighted by (1 - ratRecon).
        Raises:
            ValueError: if layerDense is empty.
        '''
        # Fail early with a clear message instead of an IndexError on layerDense[-1].
        if len(layerDense) == 0:
            raise ValueError('layerDense must contain at least one entry (the latent dimension).')
        
        # Initialize some settings 
        self.dimInput = dimInput # dimInput is width*height
        self.inputs = Input(shape=(dimInput,)) 
        self.dimEncode = layerDense[-1]
        self.ratRecon = ratRecon
        
        self.encoding(layerDense, actDense)
        
        self.decoding(layerDense, actDense)
        
        self.autoencoder = Model(self.inputs, self.decoder(self.encoder(self.inputs)), name='autoencoder')

        
    def encoding(self, layerDense, actDense):
        '''
        Build the encoder model and store it in self.encoder.
        The tensors self.zMean and self.zSigmaLog are kept because the KL loss reads them.
        Args:
            layerDense (list[int]): the numbers of nodes of each dense layer; the last entry is the latent dimension.
            actDense (string): the activation function of the hidden dense layers.
        '''
        x = self.inputs

        # Stack of hidden Dense layers (every entry except the last one)
        for numFilt in layerDense[:-1]:
            x = Dense(numFilt, activation=actDense)(x)
        # Two linear heads of latent size: the mean and the log of sigma
        self.zMean = Dense(self.dimEncode)(x)
        self.zSigmaLog = Dense(self.dimEncode)(x) # log for linear dense

        # Construct the latent as the output and build the encoder pipeline
        z = Lambda(self.sampling)([self.zMean, self.zSigmaLog])
        self.encoder = Model(self.inputs, z, name='encoder')

        
    def decoding(self, layerDense, actDense):
        '''
        Build the decoder model and store it in self.decoder.
        Args:
            layerDense (list[int]): the encoder layer sizes; the hidden sizes are reused in reverse order.
            actDense (string): the activation function of the hidden dense layers.
        '''
        # Build the Decoder Model
        inputLatent = Input(shape=(self.dimEncode,), name='decoder_input')
        x = inputLatent
        # layerDense[-2::-1] walks the hidden sizes backwards and skips the latent size itself
        for numFilt in layerDense[-2::-1]:
            x = Dense(numFilt, activation=actDense)(x)
            
        # Reconstruct the pixels as the output and build the decoder pipeline
        outputs = Dense(self.dimInput, activation='sigmoid', name='decoder_output')(x)
        self.decoder = Model(inputLatent, outputs, name='decoder')
        
        
    def sampling(self, args):
        '''
        Draw a latent sample as zMean + exp(zSigmaLog) * epsilon.
        Args:
            args (list): the pair [zMean, zSigmaLog] of tensors.
        Returns:
            The sampled latent tensor; epsilon is normal noise with mean 0 and stddev stdEps.
        '''
        zMean, zSigmaLog = args
        epsilon = K.random_normal(shape=(K.shape(zMean)[0], self.dimEncode),
                                  mean=0., stddev=self.stdEps)
        return zMean + K.exp(zSigmaLog) * epsilon  
        
        
    def lossVAE(self, tensorInput, tensorDecode):
        '''
        The training loss: ratRecon * reconstruction loss + (1 - ratRecon) * KL loss.
        Both tensors are flattened before the binary cross-entropy is taken.
        Args:
            tensorInput: the tensor of the original input.
            tensorDecode: the tensor reconstructed by the autoencoder.
        Returns:
            The weighted sum of the reconstruction loss and the KL loss.
        '''
        ratRecon = self.ratRecon
        
        lossRecon = metrics.binary_crossentropy(K.flatten(tensorInput), K.flatten(tensorDecode))
        # Reuse lossKL() so the KL formula lives in exactly one place
        return ratRecon * lossRecon + (1 - ratRecon) * self.lossKL()
    
    
    def lossKL(self):
        '''
        The KL loss computed from self.zMean and self.zSigmaLog, summed over the last axis.
        Returns:
            -0.5 * sum(1 + 2 * zSigmaLog - zMean^2 - exp(zSigmaLog)^2)
        '''
        zMean = self.zMean
        zSigmaLog = self.zSigmaLog
        lossKL = - 0.5 * K.sum(1 + 2 * zSigmaLog - K.square(zMean) - K.square(K.exp(zSigmaLog)), axis=-1)
        return lossKL        
        
        
    def fit(self,
            xTrain, xValid,
            numEpochs=50, sizeBatch=32, nameOptim='adam',
            tempPathBest=None, patience=3,
           ):
        '''
        The method is for the training process. 
        The users can call this method simply by passing the training and validation datasets.
        The dimension of dataset is determined by [#instance, *dimInput].
        For example, if dimInput is flattened as a number, the dimension of dataset is [#instance, #feature].
        If dimInput is a list to represent [width, height, channels], the dimension of dataset is [#instance, width, height, channels].
        Args:
            xTrain (numpy ndarray): the training dataset.
            xValid (numpy ndarray): the validation dataset.
            numEpochs (int): the maximal epochs for training. Default is 50.
            sizeBatch (int): the batch size. Default is 32.
            nameOptim (string): the method for optimization. Default is adam.
            tempPathBest (string): the temporary path of the best model for early-stop. Default None means without early-stop. 
                The checkpoint file name is appended to this string as-is, so a directory should end with a path separator.
            patience (int): the number of epochs to allow further trying if current loss is not better than the best. 
        Returns:
            history: the object returned by the fit call of the Keras model.
            timeTrain (float): the elapsed training time in seconds.
        '''
        # The model is (re)compiled on every call with the chosen optimizer and the VAE loss
        self.autoencoder.compile(optimizer=nameOptim, loss=self.lossVAE)

        if tempPathBest is None:
            callbacks = None
        else:
            cbEarlyStop = EarlyStopping(monitor='val_loss', patience=patience, verbose=1, mode='auto')
            chkpt = tempPathBest + 'Conv_AutoEncoder.{epoch:02d}-{loss:.2f}-{val_loss:.2f}.hdf5'
            cbCheckPoint = ModelCheckpoint(filepath = chkpt, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
            callbacks = [cbEarlyStop, cbCheckPoint]
        
        
        # Train the autoencoder; the input is also the target
        tic = time()
        history = self.autoencoder.fit(xTrain, xTrain,
                                       epochs=numEpochs,
                                       batch_size=sizeBatch, shuffle=True,
                                       callbacks=callbacks,
                                       validation_data=(xValid, xValid)
                                      )
        timeTrain = time() - tic
        
        return history, timeTrain

In [4]:
class ConvVAE(VAE):
    '''
    Variational autoencoder with a convolutional front end.

    It reuses sampling, the losses and fit from VAE; only the construction of
    the encoder and the decoder differs.
    '''
    
    def __init__(self, 
                 dimInput, 
                 layerConv=[8, 32], sizeKernel=3, strides=2, actConv='relu', padding='same',
                 layerDense=[64, 2], actDense='relu',
                 ratRecon=0.998):
        '''
        Store the settings and build the encoder, the decoder and the chained autoencoder.
        The input is expected to be picture-like, described by a list [width, height, channels].
        layerDense describes the dense part of the encoder; the decoder reuses the hidden sizes
        in reverse order, e.g. [64, 16, 2] gives decoder nodes [2, 16, 64].
        layerConv is reversed for the decoder as well, but the decoder is not purely symmetric
        for the convolution layers in this version.
        Note that ratRecon=0.5 does not mean both losses contribute equally,
        because the KL loss and the reconstruction loss are not on the same scale.
        Args:
            dimInput (list[int]): the dimension of input. E.g. [32, 28, 3] means 32 by 28 RGB pixels.
            layerConv (list[int]): the numbers of filters of each convolution layer. Default is [8, 32].
            sizeKernel (int): the size of filter kernel. Default 3 means 3 by 3.
            strides (int): the stride for convolution. Default is 2.
            actConv (string): the activation function of each convolution layer. Default is 'relu'.
            padding (string): the padding method for convolution. Default is 'same'.
            layerDense (list[int]): the numbers of nodes of each dense layer. Default is [64, 2].
            actDense (string): the activation function of each dense layer. Default is 'relu'.
            ratRecon (float): the weight of the reconstruction loss against the KL loss.
        '''
        
        # Plain settings first; dimInput is (width, height, channels)
        self.dimInput = dimInput
        self.ratRecon = ratRecon
        self.dimEncode = layerDense[-1]
        self.inputs = Input(shape=dimInput)
        
        # Both builders take the very same arguments
        argsBuild = (layerConv, sizeKernel, strides, actConv, padding, layerDense, actDense)
        self.encoding(*argsBuild)
        self.decoding(*argsBuild)
        
        chained = self.decoder(self.encoder(self.inputs))
        self.autoencoder = Model(self.inputs, chained, name='autoencoder')
        

        
    def encoding(self, 
                 layerConv, sizeKernel, strides, actConv, padding,
                 layerDense, actDense,
                ):
        '''
        Build the encoder model and store it in self.encoder.
        self.shape keeps the shape after the last convolution for the decoder,
        and self.zMean / self.zSigmaLog keep the two latent heads for the KL loss.
        '''
        tensor = self.inputs
        
        # Convolution stack, one Conv2D per entry of layerConv
        for numFilt in layerConv:
            conv = Conv2D(filters=numFilt,
                          kernel_size=sizeKernel,
                          strides=strides,
                          activation=actConv,
                          padding=padding)
            tensor = conv(tensor)

        # Remember the feature-map shape; decoding() reads it back
        self.shape = K.int_shape(tensor)

        # Dense stack on the flattened feature map (all entries except the last)
        tensor = Flatten()(tensor)
        for numUnit in layerDense[:-1]:
            tensor = Dense(numUnit, activation=actDense)(tensor)
            
        # Linear heads of latent size: mean and log of sigma
        sizeLatent = self.dimEncode
        self.zMean = Dense(sizeLatent)(tensor)
        self.zSigmaLog = Dense(sizeLatent)(tensor)

        # The sampled latent is the encoder output
        latent = Lambda(self.sampling)([self.zMean, self.zSigmaLog])
        self.encoder = Model(self.inputs, latent, name='encoder')

        
    def decoding(self,
                 layerConv, sizeKernel, strides, actConv, padding,
                 layerDense, actDense,
                ):
        '''
        Build the decoder model and store it in self.decoder.
        It must run after encoding(), because it reads self.shape.
        '''
        rows, cols, chans = self.shape[1], self.shape[2], self.shape[3]
        
        # Latent input followed by the hidden dense sizes in reverse order
        inputLatent = Input(shape=(self.dimEncode,), name='decoder_input')
        tensor = inputLatent
        for numUnit in layerDense[-2::-1]:
            tensor = Dense(numUnit, activation=actDense)(tensor)
            
        # Expand back to the size of the last encoder feature map
        tensor = Dense(rows * cols * chans)(tensor)
        tensor = Reshape((rows, cols, chans))(tensor)

        # Transposed convolutions with the filter counts reversed
        for numFilt in layerConv[::-1]:
            deconv = Conv2DTranspose(filters=numFilt,
                                     kernel_size=sizeKernel,
                                     strides=strides,
                                     activation=actConv,
                                     padding=padding)
            tensor = deconv(tensor)

        # Last transposed convolution maps to the number of input channels;
        # no strides argument is passed here on purpose, so the layer default is used
        tensor = Conv2DTranspose(filters=self.dimInput[-1],
                                 kernel_size=sizeKernel,
                                 padding=padding)(tensor)

        # Sigmoid on top gives the reconstructed pixels
        outputs = Activation('sigmoid', name='decoder_output')(tensor)
        self.decoder = Model(inputLatent, outputs, name='decoder')