# Importing Libraries and Packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from keras.datasets import mnist

from scipy.stats import norm

import tensorflow as tf
import keras
from keras import layers
from keras.models import Model
from keras import metrics
from keras import backend as K 

# Seed NumPy's global RNG so NumPy-driven randomness is repeatable.
# NOTE(review): no TensorFlow seed is set in this cell, so weight initialisation
# and latent sampling are presumably still non-deterministic -- confirm.
np.random.seed(237)
# Reset Keras' global state before any layer is created.
K.clear_session()

from tensorflow.python.framework.ops import disable_eager_execution

# Switch TensorFlow to graph (non-eager) execution for the rest of the notebook.
disable_eager_execution()

# Importing Dataset

In [None]:
# Read the two MNIST splits from their CSV files.
train_csv_path = '../input/mnist-in-csv/mnist_train.csv'
test_csv_path = '../input/mnist-in-csv/mnist_test.csv'
df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

# Show the first rows of the training frame as a quick sanity check.
df_train.head()

In [None]:
# A cell only renders its last expression, so the shape has to be printed
# explicitly or it is silently discarded.
print(df_train.shape)
df_test.head()

# Data Pre-Processing

In [None]:
# Separate the 'label' column (targets) from the remaining columns of the
# training split.
y_train = df_train['label']
df_train = df_train.drop(columns=['label'])

# Same separation for the test split.
y_test = df_test['label']
df_test = df_test.drop(columns=['label'])
print(df_test.head())
y_test.dtype

In [None]:
# Turn each pixel table into an array of 28x28 single-channel images and divide
# by 255 (presumably mapping 8-bit intensities into [0, 1] -- confirm).
# From here on df_train / df_test hold NumPy arrays, not DataFrames.

df_train = df_train.to_numpy().reshape(-1, 28, 28, 1) / 255.0
df_test = df_test.to_numpy().reshape(-1, 28, 28, 1) / 255.0

In [None]:
# Dividing by 1.0 promotes the labels to a floating-point dtype
# (presumably they were read as integers -- confirm why floats are needed).
y_test = y_test/1.0

# Visualizing the dataset

In [None]:
# Show four training images in a 2x2 grid (channel axis dropped for imshow).
sample_indices = (13, 690, 2375, 4213)

plt.figure(1)
for position, sample_idx in enumerate(sample_indices, start=1):
    plt.subplot(2, 2, position)
    plt.imshow(df_train[sample_idx][:, :, 0])
plt.show()

# Model
In VAEs, **instead of encoding an input as a single point, we encode it as a distribution over the latent space**. The model is then trained as follows:
- First, the input is encoded as a distribution over the latent space
- Second, a point is sampled from that distribution
- Third, the sampled point is decoded and the reconstruction error is computed
- Finally, the reconstruction error is backpropagated through the network
<br> <p>
    
    
The training loss of a VAE is defined as the sum of two terms: the reconstruction loss and the similarity loss. The reconstruction loss measures how far the reconstructed output is from the input; it is often the mean squared error, while the code below uses binary cross-entropy. The similarity loss is the KL divergence between the latent space distribution and a standard Gaussian (zero mean and unit variance). The loss function is then the sum of these two losses.


The encoder has two output layers:
- the mean of the latent distribution
- the log-variance of the latent distribution

In [None]:
# Shape of one input image, passed to the encoder's Input layer.
img_shape = (28, 28, 1)
# Mini-batch size used by vae.fit and encoder.predict below.
batch_size = 16
# Width of the latent mean / log-variance heads and of each latent sample.
latent_dim = 2                            ## no. of latent dimension parameters

## Encoder Architecture

In [None]:
# Symbolic input for one image of shape img_shape.
encoder_inputs = keras.Input(shape = img_shape)

# Convolutional feature extractor; the second layer's (2, 2) stride is the only
# strided step in the stack.
x = layers.Conv2D(32, 3, padding='same', activation='relu')(encoder_inputs)
x = layers.Conv2D(64, 3, padding='same', activation='relu', strides=(2, 2))(x)
x = layers.Conv2D(64, 3, padding='same', activation='relu')(x)
x = layers.Conv2D(64, 3, padding='same', activation='relu')(x)
# Remember the conv output shape (batch dimension included) so the decoder can
# rebuild a feature map of the same size.

shape_before_flattening = K.int_shape(x)
x = layers.Flatten()(x)
x = layers.Dense(16, activation="relu")(x)

# Two linear heads: latent mean and latent (log)variance.
# NOTE: despite its name, z_log_sigma enters the KL term of the loss as a
# log-variance.
z_mu = layers.Dense(latent_dim)(x)
z_log_sigma = layers.Dense(latent_dim)(x)

## Creating a sampling layer

In [None]:
# sampling function
def sampling(args):
    """Draw a latent sample with the reparameterisation trick.

    Computes ``z = z_mu + sigma * epsilon`` with ``epsilon ~ N(0, 1)``.

    Args:
        args: pair ``(z_mu, z_log_sigma)`` of tensors. ``z_log_sigma`` is
            interpreted as the log-variance of the latent distribution, which
            is how the KL term of the loss in this notebook uses it.

    Returns:
        A tensor with one ``latent_dim``-wide sample per row of ``z_mu``.
    """
    z_mu, z_log_sigma = args
    epsilon = tf.keras.backend.random_normal(shape=(tf.keras.backend.shape(z_mu)[0], latent_dim),
                              mean=0., stddev=1.)
    # Standard deviation is exp(0.5 * log-variance). Using exp(log-variance)
    # here would scale the noise by the variance and disagree with the KL term.
    return z_mu + tf.keras.backend.exp(0.5 * z_log_sigma) * epsilon

# sample vector from the latent distribution
z = layers.Lambda(sampling)([z_mu, z_log_sigma])

## Decoder Architecture

In [None]:
# decoder takes the latent distribution sample as input
decoder_input = layers.Input(K.int_shape(z)[1:])

# Expand the latent vector to as many units as the encoder's last conv feature
# map holds (product of its non-batch dimensions), not to the 784 image pixels.
x = layers.Dense(np.prod(shape_before_flattening[1:]), activation='relu')(decoder_input)

# reshape back into that feature-map shape
x = layers.Reshape(shape_before_flattening[1:])(x)

# use Conv2DTranspose (stride 2) to reverse the strided conv from the encoder,
# then a sigmoid conv to produce a single-channel image
x = layers.Conv2DTranspose(32, 3, padding='same', activation='relu', strides=(2, 2))(x)
x = layers.Conv2D(1, 3, padding='same', activation='sigmoid')(x)
# Standalone decoder model: latent vector -> image. Reused later for generation.
decoder_z = keras.Model(decoder_input, x)

# apply the decoder to the sample from the latent distribution
# NOTE: `decoder` is the decoded output tensor, not a model.
decoder = decoder_z(z)

## Defining the loss

In [None]:
# construct a custom layer to calculate the loss
class CustomVariationalLayer(keras.layers.Layer):
    """Pass-through layer whose only job is to register the VAE loss.

    ``call`` returns its first input unchanged and attaches the loss computed
    by ``vae_loss`` through ``add_loss``.
    """

    def vae_loss(self, x, decoder):
        """Return the scalar VAE loss for a batch.

        Args:
            x: tensor of input images.
            decoder: tensor of decoded (reconstructed) images.

        Returns:
            Mean of (binary cross-entropy + weighted KL term).

        Note:
            Reads the module-level tensors ``z_mu`` and ``z_log_sigma`` rather
            than receiving them as arguments.
        """
        # Both tensors are flattened to 1-D before the cross-entropy is taken.
        x = tf.keras.backend.flatten(x)
        decoder = tf.keras.backend.flatten(decoder)
        # Reconstruction loss
        xent_loss = keras.metrics.binary_crossentropy(x, decoder)
        # KL divergence; z_log_sigma is used as a log-variance in this formula.
        # NOTE(review): the weight is -5e-4 rather than -0.5 -- presumably a
        # deliberate down-weighting of the KL term; confirm.
        kl_loss = -5e-4 * K.mean(1 + z_log_sigma - K.square(z_mu) - K.exp(z_log_sigma), axis=-1)
        return K.mean(xent_loss + kl_loss)

    # adds the custom loss to the class
    def call(self, inputs):
        """Register the loss for ``inputs = [images, decoded]`` and return the images."""
        x = inputs[0]
        decoder = inputs[1]
        loss = self.vae_loss(x, decoder)
        self.add_loss(loss, inputs=inputs)
        # The output is the untouched input; it is not the reconstruction.
        return x

# apply the custom loss to the input images and the decoded latent distribution sample
y = CustomVariationalLayer()([encoder_inputs, decoder])

In [None]:
# VAE model statement
# The model maps the input images to the output of CustomVariationalLayer.
vae = Model(encoder_inputs, y)
# loss=None: the only loss is the one registered via add_loss inside
# CustomVariationalLayer.
# NOTE(review): fit() is called without targets, so the "accuracy" metric
# presumably has nothing to compare against -- confirm it is meaningful.
vae.compile(optimizer='rmsprop', loss=None, metrics=["accuracy"], experimental_run_tf_function=False)
vae.summary()

In [None]:
# Train on the images alone; y is None because the loss is attached inside the
# model via add_loss instead of being computed against targets.
vae.fit(x = df_train, y = None, shuffle = True, epochs=10, batch_size=batch_size)

In [None]:
# Persist the trained model to 'vae.h5' in the working directory.
# NOTE(review): the model contains a custom layer and a Lambda layer; reloading
# it presumably needs those definitions available -- confirm.
vae.save('vae.h5')

In [None]:
# Run the trained model on the test images.
# NOTE(review): the model's output layer returns its first input unchanged
# (CustomVariationalLayer.call returns inputs[0]), so this is presumably the
# test images themselves rather than reconstructions or labels -- confirm.
y_pred = vae.predict(df_test, verbose = 1)
# These two expressions are not the last line of the cell, so nothing is
# displayed for them.
type(y_pred)
y_pred.shape
# Keep only the first element along axis 0 (a single sample).
y_pred = y_pred[0]
# series = pd.Series(y_pred) 

In [None]:
def compute_err(y_true, y_pred):
    """Return the fraction of positions where ``y_true`` and ``y_pred`` agree.

    Despite its name, this returns an accuracy, not an error rate.

    Args:
        y_true: sized iterable of reference values.
        y_pred: iterable of predicted values, compared element-wise.

    Returns:
        ``matches / len(y_true)`` as a float, or ``0.0`` when ``y_true`` is
        empty (previously a ZeroDivisionError).

    Note:
        Pairs are formed with ``zip``, so comparison stops at the shorter
        input while the denominator is always ``len(y_true)``.
    """
    total = len(y_true)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    # iterate over each label and count the exact matches
    correct_predictions = sum(
        1 for true, predicted in zip(y_true, y_pred) if true == predicted
    )
    # compute the accuracy
    return correct_predictions / total
# NOTE(review): y_pred was already reduced to a single sample in the previous
# cell, so y_pred[0] is one slice of that sample rather than per-image class
# predictions, and zip() inside compute_err stops at the shorter sequence.
# Comparing it against the digit labels looks unintended -- confirm which
# metric is actually wanted.
acc = compute_err(y_test, y_pred[0])
# NOTE(review): 0.99 is an unexplained constant; final_acc is 0.99 minus the
# fraction computed above.
final_acc = 0.99- acc
final_acc

In [None]:
# Display final_acc again (same value as computed in the previous cell).
final_acc

# Different classes on the latent 2d plane

In [None]:
# Encoder-only model: image -> latent mean.
encoder = Model(encoder_inputs, z_mu)

# Project the training images into the 2-D latent space.
x_valid_noTest_encoded = encoder.predict(df_train, batch_size=batch_size)

# Scatter the latent means, coloured by digit label.
fig, ax = plt.subplots(figsize=(10, 10))
points = ax.scatter(
    x_valid_noTest_encoded[:, 0],
    x_valid_noTest_encoded[:, 1],
    c=y_train,
    cmap='brg',
)
fig.colorbar(points, ax=ax)

plt.show()

In [None]:
# # set colormap so that 11's are gray
# custom_cmap = plt.cm.get_cmap('brg')
# custom_cmap.set_over('gray')

# x_valid_encoded = encoder.predict(X_valid, batch_size=batch_size)
# plt.figure(figsize=(10, 10))
# gray_marker = mpatches.Circle(4,radius=0.1,color='gray', label='Test')
# plt.legend(handles=[gray_marker], loc = 'best')
# plt.scatter(x_valid_encoded[:, 0], x_valid_encoded[:, 1], c=y_valid, cmap=custom_cmap)
# plt.clim(0, 9)
# plt.colorbar()

Each of these colored clusters is a type of digit. Close clusters are digits that are structurally similar (i.e. digits that share information in the latent space).

## Visualization of the latent manifold that "generates" the MNIST digits.

In [None]:
# Display a 2D manifold of the digits
n = 20  # figure with 20*20 digits
digit_size = 28
# Sample an n x n grid of latent points spanning [-20, 20] on both axes.
grid_x = np.linspace(-20, 20, n)
grid_y = np.linspace(-20, 20, n)

# Row i of the figure uses grid_x[i] as the second latent coordinate and
# column j uses grid_y[j] as the first, exactly as the per-point loop did.
z_sample = np.array([[xi, yi] for yi in grid_x for xi in grid_y])

# Decode the whole grid in one predict call instead of n*n single-sample calls.
x_decoded = decoder_z.predict(z_sample)

# (n*n, 28, 28, 1) -> (row, col, h, w) -> (row, h, col, w) -> one tiled image.
figure = (
    x_decoded.reshape(n, n, digit_size, digit_size)
    .transpose(0, 2, 1, 3)
    .reshape(n * digit_size, n * digit_size)
)

plt.figure(figsize=(20, 20))
plt.imshow(figure)
plt.show()

[Reference 1](https://blog.keras.io/building-autoencoders-in-keras.html)

[Reference 2](https://stackoverflow.com/questions/65366442/cannot-convert-a-symbolic-keras-input-output-to-a-numpy-array-typeerror-when-usi)