In [None]:
import numpy as np                   # advanced math library
import matplotlib.pyplot as plt      # MATLAB like plotting routines
import random  
import tensorflow as tf

                      # for generating random numbers
from tensorflow import keras
from keras.datasets import mnist     # MNIST dataset is included in Keras
from keras.models import Sequential  # Model type to be used
from keras.optimizers import Adam
from tensorflow.keras import layers

from keras.layers.core import Dense, Dropout, Activation # Types of layers to be used in our model
from keras.layers import LSTM
from keras.utils import np_utils                         # NumPy related tools

**Vanishing Gradient Issue** During neural network training with backpropagation, a (local) minimum of the error function is found by iteratively taking small steps in the direction of the negative derivative of the error with respect to the network's weights (i.e. the gradients). With each subsequent layer the magnitude of the gradients gets exponentially smaller (vanishes), which makes the steps very small and results in very slow learning of the weights in the lower layers of a deep network. This matters particularly for RNNs: gradients are propagated back through every time step, so information from early in a long sequence is hard to learn.

**Importance of Gates** GRU and LSTM carry an internal state (the cell state in an LSTM, the hidden state in a GRU). The gates regulate the flow of information into this state. These gates can learn which data in a sequence is important and which is not. By doing that, they can pass relevant information along long sequences. In this way, LSTM and GRU address the short-term memory problem during the learning process.
 

**Difference between LSTM and GRU** The key difference between GRU and LSTM is that GRU has two gates (reset and update), while LSTM has three gates (input, output, and forget). GRU is less complex than LSTM because it has fewer gates.

LSTM 32 Units

In [None]:
# Binarize the images
def binarize(images, threshold=0.1):
  """Threshold pixel intensities into a 0/1 mask.

  Args:
    images: Numeric pixel values as a NumPy array or any array-like
      (nested list, tuple, scalar).
    threshold: Cut-off value; entries strictly greater than it become 1.0.

  Returns:
    A float32 NumPy array shaped like `images`, holding 1.0 where the value
    exceeds `threshold` and 0.0 everywhere else.
  """
  # np.asarray lets plain Python sequences and scalars through; an ndarray
  # input is used as-is, so the result for arrays is unchanged.
  return (np.asarray(images) > threshold).astype('float32')

Loading Training Data

In [None]:
# Load the MNIST train/test splits bundled with Keras.
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Report the dimensions of each array, one line per array.
for label, array in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{label} shape", array.shape)

X_train shape (60000, 28, 28)
y_train shape (60000,)
X_test shape (10000, 28, 28)
y_test shape (10000,)


In [None]:
# Convert the images to float32 and divide by 255.
# The integer-dtype guard makes this cell safe to re-run: once an array has
# been converted to float32 it is not divided by 255 a second time.
if np.issubdtype(X_train.dtype, np.integer):
    X_train = X_train.astype("float32") / 255.0
if np.issubdtype(X_test.dtype, np.integer):
    X_test = X_test.astype("float32") / 255.0

In [None]:
# Single-layer LSTM classifier. The input shape is taken from the training
# images (everything after the sample axis); the softmax head has 10 outputs.
model = Sequential([
    layers.LSTM(
        32,
        input_shape=X_train.shape[1:],
        activation='relu',
        return_sequences=False,  # only the final step's output feeds the dense head
    ),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(10, activation='softmax'),
])



In [None]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential_21"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_22 (LSTM)               (None, 32)                7808      
_________________________________________________________________
dropout_45 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_38 (Dense)             (None, 64)                2112      
_________________________________________________________________
dropout_46 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_39 (Dense)             (None, 10)                650       
Total params: 10,570
Trainable params: 10,570
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Adam optimizer with an explicit learning rate of 0.001.
opt = keras.optimizers.Adam(learning_rate=0.001)

In [None]:
# The sparse variant of categorical cross-entropy is used because the labels
# are passed as a 1-D array rather than one-hot vectors.
model.compile(
    optimizer=opt,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
    loss_weights=None,
    weighted_metrics=None,
    run_eagerly=None,
)

In [None]:
# Train for 10 epochs in mini-batches of 256 samples.
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=256,
    verbose=1,
)
# Score the trained model on the test split; being the last expression in the
# cell, the returned values are displayed as the cell output.
model.evaluate(
    X_test,
    y_test,
    batch_size=256,
    verbose=2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
40/40 - 1s - loss: 0.1203 - accuracy: 0.9637


[0.1202707514166832, 0.963699996471405]

LSTM 64 Units

In [None]:
import numpy as np                   # advanced math library
import matplotlib.pyplot as plt      # MATLAB like plotting routines
import random  
import tensorflow as tf

                      # for generating random numbers
from tensorflow import keras
from keras.datasets import mnist     # MNIST dataset is included in Keras
from keras.models import Sequential  # Model type to be used
from keras.optimizers import Adam
from tensorflow.keras import layers

from keras.layers.core import Dense, Dropout, Activation # Types of layers to be used in our model
from keras.layers import LSTM
from keras.utils import np_utils                         # NumPy related tools
# Binarize the images
def binarize(images, threshold=0.1):
  """Threshold pixel intensities into a 0/1 mask.

  Args:
    images: Numeric pixel values as a NumPy array or any array-like
      (nested list, tuple, scalar).
    threshold: Cut-off value; entries strictly greater than it become 1.0.

  Returns:
    A float32 NumPy array shaped like `images`, holding 1.0 where the value
    exceeds `threshold` and 0.0 everywhere else.
  """
  # np.asarray lets plain Python sequences and scalars through; an ndarray
  # input is used as-is, so the result for arrays is unchanged.
  return (np.asarray(images) > threshold).astype('float32')

# Reload MNIST, convert the images to float32 and divide by 255.
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train.astype("float32") / 255.0
X_test = X_test.astype("float32") / 255.0

# Single-layer LSTM classifier with 64 units; the input shape is taken from
# the training images (everything after the sample axis).
model = Sequential([
    layers.LSTM(
        64,
        input_shape=X_train.shape[1:],
        activation='relu',
        return_sequences=False,  # only the final step's output feeds the dense head
    ),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(10, activation='softmax'),
])
model.summary()

opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=opt,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
    loss_weights=None,
    weighted_metrics=None,
    run_eagerly=None,
)

# Train for 10 epochs, then score on the test split (shown as the cell output).
model.fit(X_train, y_train, epochs=10, batch_size=256, verbose=1)
model.evaluate(X_test, y_test, batch_size=256, verbose=2)



Model: "sequential_22"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_23 (LSTM)               (None, 64)                23808     
_________________________________________________________________
dropout_47 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_40 (Dense)             (None, 64)                4160      
_________________________________________________________________
dropout_48 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_41 (Dense)             (None, 10)                650       
Total params: 28,618
Trainable params: 28,618
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
40/4

[0.06920892000198364, 0.9797999858856201]

GRU 32 Units

In [None]:
import numpy as np                   # advanced math library
import matplotlib.pyplot as plt      # MATLAB like plotting routines
import random  
import tensorflow as tf

                      # for generating random numbers
from tensorflow import keras
from keras.datasets import mnist     # MNIST dataset is included in Keras
from keras.models import Sequential  # Model type to be used
from keras.optimizers import Adam
from tensorflow.keras import layers

from keras.layers.core import Dense, Dropout, Activation # Types of layers to be used in our model
from keras.layers import LSTM
from keras.utils import np_utils                         # NumPy related tools
# Binarize the images
def binarize(images, threshold=0.1):
  """Threshold pixel intensities into a 0/1 mask.

  Args:
    images: Numeric pixel values as a NumPy array or any array-like
      (nested list, tuple, scalar).
    threshold: Cut-off value; entries strictly greater than it become 1.0.

  Returns:
    A float32 NumPy array shaped like `images`, holding 1.0 where the value
    exceeds `threshold` and 0.0 everywhere else.
  """
  # np.asarray lets plain Python sequences and scalars through; an ndarray
  # input is used as-is, so the result for arrays is unchanged.
  return (np.asarray(images) > threshold).astype('float32')

# Reload MNIST, convert the images to float32 and divide by 255.
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train.astype("float32") / 255.0
X_test = X_test.astype("float32") / 255.0

# Single-layer GRU classifier with 32 units; the input shape is taken from
# the training images (everything after the sample axis).
model = Sequential([
    layers.GRU(
        32,
        activation='relu',
        input_shape=X_train.shape[1:],
        return_sequences=False,  # only the final step's output feeds the dense head
    ),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(10, activation='softmax'),
])
model.summary()

opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=opt,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
    loss_weights=None,
    weighted_metrics=None,
    run_eagerly=None,
)

# Train for 10 epochs, then score on the test split (shown as the cell output).
model.fit(X_train, y_train, epochs=10, batch_size=256, verbose=1)
model.evaluate(X_test, y_test, batch_size=256, verbose=2)

Model: "sequential_23"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_6 (GRU)                  (None, 32)                5952      
_________________________________________________________________
dropout_49 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_42 (Dense)             (None, 64)                2112      
_________________________________________________________________
dropout_50 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_43 (Dense)             (None, 10)                650       
Total params: 8,714
Trainable params: 8,714
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
40/40 

[0.1359158158302307, 0.9603000283241272]

GRU 64 Units

In [None]:
import numpy as np                   # advanced math library
import matplotlib.pyplot as plt      # MATLAB like plotting routines
import random  
import tensorflow as tf

                      # for generating random numbers
from tensorflow import keras
from keras.datasets import mnist     # MNIST dataset is included in Keras
from keras.models import Sequential  # Model type to be used
from keras.optimizers import Adam
from tensorflow.keras import layers

from keras.layers.core import Dense, Dropout, Activation # Types of layers to be used in our model
from keras.layers import LSTM
from keras.utils import np_utils                         # NumPy related tools
# Binarize the images
def binarize(images, threshold=0.1):
  """Threshold pixel intensities into a 0/1 mask.

  Args:
    images: Numeric pixel values as a NumPy array or any array-like
      (nested list, tuple, scalar).
    threshold: Cut-off value; entries strictly greater than it become 1.0.

  Returns:
    A float32 NumPy array shaped like `images`, holding 1.0 where the value
    exceeds `threshold` and 0.0 everywhere else.
  """
  # np.asarray lets plain Python sequences and scalars through; an ndarray
  # input is used as-is, so the result for arrays is unchanged.
  return (np.asarray(images) > threshold).astype('float32')

# Reload MNIST, convert the images to float32 and divide by 255.
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train.astype("float32") / 255.0
X_test = X_test.astype("float32") / 255.0

# Single-layer GRU classifier with 64 units; the input shape is taken from
# the training images (everything after the sample axis).
model = Sequential([
    layers.GRU(
        64,
        activation='relu',
        input_shape=X_train.shape[1:],
        return_sequences=False,  # only the final step's output feeds the dense head
    ),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(10, activation='softmax'),
])
model.summary()

opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=opt,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
    loss_weights=None,
    weighted_metrics=None,
    run_eagerly=None,
)

# Train for 10 epochs, then score on the test split (shown as the cell output).
model.fit(X_train, y_train, epochs=10, batch_size=256, verbose=1)
model.evaluate(X_test, y_test, batch_size=256, verbose=2)

Model: "sequential_24"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_7 (GRU)                  (None, 64)                18048     
_________________________________________________________________
dropout_51 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_44 (Dense)             (None, 64)                4160      
_________________________________________________________________
dropout_52 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_45 (Dense)             (None, 10)                650       
Total params: 22,858
Trainable params: 22,858
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
40/4

[0.0807291567325592, 0.9754999876022339]

Stacked LSTM

In [None]:
import numpy as np                   # advanced math library
import matplotlib.pyplot as plt      # MATLAB like plotting routines
import random  
import tensorflow as tf

                      # for generating random numbers
from tensorflow import keras
from keras.datasets import mnist     # MNIST dataset is included in Keras
from keras.models import Sequential  # Model type to be used
from keras.optimizers import Adam
from tensorflow.keras import layers

from keras.layers.core import Dense, Dropout, Activation # Types of layers to be used in our model
from keras.layers import LSTM
from keras.utils import np_utils                         # NumPy related tools
# Binarize the images
def binarize(images, threshold=0.1):
  """Threshold pixel intensities into a 0/1 mask.

  Args:
    images: Numeric pixel values as a NumPy array or any array-like
      (nested list, tuple, scalar).
    threshold: Cut-off value; entries strictly greater than it become 1.0.

  Returns:
    A float32 NumPy array shaped like `images`, holding 1.0 where the value
    exceeds `threshold` and 0.0 everywhere else.
  """
  # np.asarray lets plain Python sequences and scalars through; an ndarray
  # input is used as-is, so the result for arrays is unchanged.
  return (np.asarray(images) > threshold).astype('float32')

# Reload MNIST, convert the images to float32 and divide by 255.
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train.astype("float32") / 255.0
X_test = X_test.astype("float32") / 255.0

model = Sequential()

# First recurrent layer: the only one that declares the input shape.
model.add(
    layers.LSTM(
        32,
        input_shape=X_train.shape[1:],
        activation='relu',
        return_sequences=True,
    )
)
model.add(Dropout(0.2))

# Three more LSTM layers, each followed by dropout. The first two keep
# return_sequences=True so the next LSTM still receives a sequence; the last
# one returns only its final output for the dense head.
for emit_sequence in (True, True, False):
    model.add(layers.LSTM(32, activation='relu', return_sequences=emit_sequence))
    model.add(Dropout(0.2))

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(10, activation='softmax'))

model.summary()

opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=opt,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
    loss_weights=None,
    weighted_metrics=None,
    run_eagerly=None,
)

# Train for 10 epochs, then score on the test split (shown as the cell output).
model.fit(X_train, y_train, epochs=10, batch_size=256, verbose=1)
model.evaluate(X_test, y_test, batch_size=256, verbose=2)

Model: "sequential_28"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_30 (LSTM)               (None, 28, 32)            7808      
_________________________________________________________________
dropout_64 (Dropout)         (None, 28, 32)            0         
_________________________________________________________________
lstm_31 (LSTM)               (None, 28, 32)            8320      
_________________________________________________________________
dropout_65 (Dropout)         (None, 28, 32)            0         
_________________________________________________________________
lstm_32 (LSTM)               (None, 28, 32)            8320      
_________________________________________________________________
dropout_66 (Dropout)         (None, 28, 32)            0         
_________________________________________________________________
lstm_33 (LSTM)               (None, 32)              

KeyboardInterrupt: ignored

Stacked GRU

In [None]:
import numpy as np                   # advanced math library
import matplotlib.pyplot as plt      # MATLAB like plotting routines
import random  
import tensorflow as tf

                      # for generating random numbers
from tensorflow import keras
from keras.datasets import mnist     # MNIST dataset is included in Keras
from keras.models import Sequential  # Model type to be used
from keras.optimizers import Adam
from tensorflow.keras import layers

from keras.layers.core import Dense, Dropout, Activation # Types of layers to be used in our model
from keras.layers import LSTM
from keras.utils import np_utils                         # NumPy related tools
# Binarize the images
def binarize(images, threshold=0.1):
  """Threshold pixel intensities into a 0/1 mask.

  Args:
    images: Numeric pixel values as a NumPy array or any array-like
      (nested list, tuple, scalar).
    threshold: Cut-off value; entries strictly greater than it become 1.0.

  Returns:
    A float32 NumPy array shaped like `images`, holding 1.0 where the value
    exceeds `threshold` and 0.0 everywhere else.
  """
  # np.asarray lets plain Python sequences and scalars through; an ndarray
  # input is used as-is, so the result for arrays is unchanged.
  return (np.asarray(images) > threshold).astype('float32')

# Reload MNIST, convert the images to float32 and divide by 255.
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train.astype("float32") / 255.0
X_test = X_test.astype("float32") / 255.0

model = Sequential()

# First recurrent layer: the only one that declares the input shape.
model.add(
    layers.GRU(
        32,
        input_shape=X_train.shape[1:],
        activation='relu',
        return_sequences=True,
    )
)
model.add(Dropout(0.2))

# Three more GRU layers, each followed by dropout. The first two keep
# return_sequences=True so the next GRU still receives a sequence; the last
# one returns only its final output for the dense head.
for emit_sequence in (True, True, False):
    model.add(layers.GRU(32, activation='relu', return_sequences=emit_sequence))
    model.add(Dropout(0.2))

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(10, activation='softmax'))

model.summary()

opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=opt,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
    loss_weights=None,
    weighted_metrics=None,
    run_eagerly=None,
)

# Train for 10 epochs, then score on the test split (shown as the cell output).
model.fit(X_train, y_train, epochs=10, batch_size=256, verbose=1)
model.evaluate(X_test, y_test, batch_size=256, verbose=2)

Model: "sequential_26"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_8 (GRU)                  (None, 28, 32)            5952      
_________________________________________________________________
dropout_56 (Dropout)         (None, 28, 32)            0         
_________________________________________________________________
gru_9 (GRU)                  (None, 32)                6336      
_________________________________________________________________
dropout_57 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_48 (Dense)             (None, 64)                2112      
_________________________________________________________________
dropout_58 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_49 (Dense)             (None, 10)              

[0.09008524566888809, 0.9735999703407288]

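The accuracy is fairly high, which suggests that some overfitting is possible; comparing training accuracy against accuracy on held-out data would confirm this. 
The GRU models also took noticeably less computation time than the LSTM models.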