<a href="https://colab.research.google.com/github/praneetheddu/LDL/blob/main/tutorials/MNIST_DLL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Model training using Tensorflow and MNIST


In [1]:
# TF imports
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical

# Python imports
import numpy as np
import logging

# only print out errors and supress warnings
tf.get_logger().setLevel(logging.ERROR) 
tf.random.set_seed(7) # Reproducable randomness

# Hyper params
EPOCHS = 20
BATCH_SIZE = 1

In [2]:
# Load mnist dataset
mnist = keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Standardize the test dataset
mean = np.mean(train_images)
stddev = np.std(train_images)
train_images = (train_images - mean) / stddev
test_images = (test_images - mean) / stddev

# one hot enccoding
train_labels = to_categorical(train_labels, num_classes=10)
test_labels = to_categorical(test_labels, num_classes=10)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [3]:
# Initialize weights
initializer = keras.initializers.RandomUniform(minval = -0.1, maxval = 0.1)
'''
Create Sequential models Fully Connected Model
- 2 layers: hidden and output layer
Input Layer = 784 inputs flattened by 28 x 288 image + 1 bias input 
Hidden Layer = 25 Inputs
Output Layer = 10 Inputs 
'''
model = keras.Sequential([
        keras.layers.Flatten(input_shape=(28, 28)),
        keras.layers.Dense(25, activation='tanh',
                          kernel_initializer=initializer,
                          bias_initializer='zeros'),
        keras.layers.Dense(10, activation='sigmoid',
                          kernel_initializer=initializer,
                          bias_initializer='zeros')])

In [None]:
'''
Use Stochastic Gradient Descent Function to optimize our loss funciton
with learning rate = 0.01
'''

opt = keras.optimizers.SGD(learning_rate=0.01)
model.compile(loss='mean_squared_error', optimizer=opt, metrics=['accuracy'])

# Train the model for 20 Epochs and shuffle the order of the inputs
# Update weights after each epoch

history = model.fit(
          train_images, train_labels, 
          validation_data=(test_images, test_labels),
          epochs=EPOCHS, batch_size=BATCH_SIZE,
          verbose=2, shuffle=True)

Epoch 1/20
60000/60000 - 84s - loss: 0.0526 - accuracy: 0.7047 - val_loss: 0.0266 - val_accuracy: 0.8893 - 84s/epoch - 1ms/step
Epoch 2/20
60000/60000 - 71s - loss: 0.0217 - accuracy: 0.8960 - val_loss: 0.0176 - val_accuracy: 0.9107 - 71s/epoch - 1ms/step
Epoch 3/20
60000/60000 - 69s - loss: 0.0166 - accuracy: 0.9112 - val_loss: 0.0152 - val_accuracy: 0.9181 - 69s/epoch - 1ms/step
Epoch 4/20
60000/60000 - 69s - loss: 0.0147 - accuracy: 0.9193 - val_loss: 0.0140 - val_accuracy: 0.9219 - 69s/epoch - 1ms/step
Epoch 5/20
60000/60000 - 71s - loss: 0.0135 - accuracy: 0.9238 - val_loss: 0.0132 - val_accuracy: 0.9253 - 71s/epoch - 1ms/step
Epoch 6/20
60000/60000 - 71s - loss: 0.0127 - accuracy: 0.9281 - val_loss: 0.0125 - val_accuracy: 0.9270 - 71s/epoch - 1ms/step
Epoch 7/20
60000/60000 - 68s - loss: 0.0121 - accuracy: 0.9313 - val_loss: 0.0121 - val_accuracy: 0.9304 - 68s/epoch - 1ms/step
Epoch 8/20
60000/60000 - 69s - loss: 0.0117 - accuracy: 0.9335 - val_loss: 0.0117 - val_accuracy: 0.9296

## Nueron Saturation

tanh and signmoid functions have saturated regions outside of their area of interests causing the gradient descent function to make no change at all for high magnitudes. To overcome this scenario, we can use nonlinear activation funciton to adjust weights to deal its respective gradient descent magnitude. 

Momentum can be tacked to the current optimizer for faster convergence and to get out of local peaks. Keep in mind that this could cause exploding gradients which can overthow the gradient descent with seemingly large values. Gradient clipping can be used to avoid this scenario.

#### Training the model using Adam optimizer which uses adaptive learning rate and momentum.

In [None]:
# Additional Hyper parameters to avoid neuron saturation
opt = keras.optimizers.Adam()
loss = 'categorical_crossentropy'
initializer = keras.initializers.RandomUniform(minval = -0.1, maxval = 0.1)
model.compile(loss=loss, optimizer=opt, metrics=['accuracy'])

# Train the model for 20 Epochs and shuffle the order of the inputs
# Update weights after each epoch

history = model.fit(
          train_images, train_labels, 
          validation_data=(test_images, test_labels),
          epochs=EPOCHS, batch_size=BATCH_SIZE,
          verbose=2, shuffle=True)

#### Training the model using different optimizers

In [13]:
Optimizers = []
adaOpt = keras.optimizers.Adagrad(learning_rate=0.01, epsilon=None)
RMSPropOpt = keras.optimizers.RMSprop(learning_rate=0.001, rho=0.8, epsilon=None)
AdamOpt = keras.optimizers.Adam(learning_rate=0.01, decay=0.0, epsilon=0.1)
Optimizers.append(adaOpt)
Optimizers.append(RMSPropOpt)
Optimizers.append(AdamOpt)

# Reducing epochs to test save computational power and time for testing additional hyperparams
EPOCHS = 7 
lossFunction = ['mean_squared_error', 'categorical_crossentropy']
for opt in Optimizers:
  for loss in lossFunction:
    model.compile(loss=loss, optimizer=opt, metrics=['accuracy'])
    history = model.fit(
          train_images, train_labels, 
          validation_data=(test_images, test_labels),
          epochs=EPOCHS, batch_size=BATCH_SIZE,
          verbose=2, shuffle=True)

Epoch 1/7
60000/60000 - 81s - loss: 0.0293 - accuracy: 0.9175 - val_loss: 0.0189 - val_accuracy: 0.9233 - 81s/epoch - 1ms/step
Epoch 2/7
60000/60000 - 81s - loss: 0.0176 - accuracy: 0.9250 - val_loss: 0.0167 - val_accuracy: 0.9219 - 81s/epoch - 1ms/step
Epoch 3/7
60000/60000 - 79s - loss: 0.0160 - accuracy: 0.9279 - val_loss: 0.0157 - val_accuracy: 0.9249 - 79s/epoch - 1ms/step
Epoch 4/7
60000/60000 - 90s - loss: 0.0151 - accuracy: 0.9301 - val_loss: 0.0151 - val_accuracy: 0.9245 - 90s/epoch - 2ms/step
Epoch 5/7
60000/60000 - 81s - loss: 0.0144 - accuracy: 0.9320 - val_loss: 0.0147 - val_accuracy: 0.9265 - 81s/epoch - 1ms/step
Epoch 6/7
60000/60000 - 80s - loss: 0.0140 - accuracy: 0.9330 - val_loss: 0.0143 - val_accuracy: 0.9274 - 80s/epoch - 1ms/step
Epoch 7/7
60000/60000 - 78s - loss: 0.0136 - accuracy: 0.9337 - val_loss: 0.0141 - val_accuracy: 0.9265 - 78s/epoch - 1ms/step
Epoch 1/7
60000/60000 - 91s - loss: 0.2298 - accuracy: 0.9338 - val_loss: 0.2381 - val_accuracy: 0.9303 - 91s/e

In [18]:
# Training our model using best configuration
'''
Create Sequential models Fully Connected Model
- 2 layers: hidden and output layer
Input Layer = 784 inputs flattened by 28 x 288 image + 1 bias input 
Hidden Layer = 25 Inputs
Output Layer = 10 Inputs 
'''
EPOCHS = 20
BATCH_SIZE = 64  
model = keras.Sequential([
        keras.layers.Flatten(input_shape=(28, 28)),
        keras.layers.Dense(25, activation='relu',
                          kernel_initializer='he_normal',
                          bias_initializer='zeros'),
        keras.layers.Dense(10, activation='softmax',
                          kernel_initializer='glorot_uniform',
                          bias_initializer='zeros')])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(
      train_images, train_labels, 
      validation_data=(test_images, test_labels),
      epochs=EPOCHS, batch_size=BATCH_SIZE,
      verbose=2, shuffle=True)

Epoch 1/20
938/938 - 3s - loss: 0.3473 - accuracy: 0.8958 - val_loss: 0.2153 - val_accuracy: 0.9389 - 3s/epoch - 3ms/step
Epoch 2/20
938/938 - 2s - loss: 0.2007 - accuracy: 0.9424 - val_loss: 0.1827 - val_accuracy: 0.9468 - 2s/epoch - 2ms/step
Epoch 3/20
938/938 - 2s - loss: 0.1685 - accuracy: 0.9510 - val_loss: 0.1649 - val_accuracy: 0.9498 - 2s/epoch - 3ms/step
Epoch 4/20
938/938 - 2s - loss: 0.1453 - accuracy: 0.9574 - val_loss: 0.1532 - val_accuracy: 0.9546 - 2s/epoch - 2ms/step
Epoch 5/20
938/938 - 2s - loss: 0.1331 - accuracy: 0.9604 - val_loss: 0.1405 - val_accuracy: 0.9592 - 2s/epoch - 3ms/step
Epoch 6/20
938/938 - 2s - loss: 0.1215 - accuracy: 0.9640 - val_loss: 0.1444 - val_accuracy: 0.9580 - 2s/epoch - 2ms/step
Epoch 7/20
938/938 - 2s - loss: 0.1122 - accuracy: 0.9663 - val_loss: 0.1436 - val_accuracy: 0.9581 - 2s/epoch - 2ms/step
Epoch 8/20
938/938 - 2s - loss: 0.1049 - accuracy: 0.9688 - val_loss: 0.1342 - val_accuracy: 0.9622 - 2s/epoch - 2ms/step
Epoch 9/20
938/938 - 2s 

In [17]:
print(history.history)

{'loss': [0.5518086552619934, 0.4158424735069275, 0.3728443682193756, 0.35512587428092957, 0.35235071182250977, 0.3318481147289276, 0.3247310519218445], 'accuracy': [0.845883309841156, 0.8780999779701233, 0.8924166560173035, 0.8969666957855225, 0.8981166481971741, 0.9038000106811523, 0.9064333438873291], 'val_loss': [0.4441230893135071, 0.3959222137928009, 0.364071786403656, 0.3189278841018677, 0.3516539931297302, 0.3111044764518738, 0.32032835483551025], 'val_accuracy': [0.8693000078201294, 0.8860999941825867, 0.8959000110626221, 0.9096999764442444, 0.8938000202178955, 0.9128000140190125, 0.9068999886512756]}
