In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.datasets import mnist
import numpy as np

# Enable on-demand GPU memory allocation. Without this, TensorFlow 2 can fail
# with CUBLAS_STATUS_ALLOC_FAILED, so the following code is necessary.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # The memory-growth setting has to be identical on every GPU,
        # so it is applied to each physical device in turn.
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as err:
        # Raised when memory growth is configured after the GPUs were
        # already initialized; report it and carry on.
        print(err)

1 Physical GPUs, 1 Logical GPUs


In [2]:
# Step 1: Data Preparation
# Loads MNIST, reshapes the images to (N, H, W, 1), scales pixels to [0, 1]
# and one-hot encodes the labels.

(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Number of distinct classes, taken from the training labels.
num_labels = len(np.unique(y_train))

# Reshape: add a trailing channel axis so the images fit Conv2D's
# (height, width, channels) input layout.
image_size = x_train.shape[1]

x_train = np.reshape(x_train, [-1, image_size, image_size, 1])
x_test = np.reshape(x_test, [-1, image_size, image_size, 1])

# Normalizing: cast to float32 before dividing. Dividing the uint8 arrays
# directly would promote them to float64 and double the memory footprint.
x_train = x_train.astype("float32") / 255.0
x_test = x_test.astype("float32") / 255.0

# One-hot encoding: pass the class count explicitly so the train and test
# label matrices always have `num_labels` columns, instead of each width
# being inferred from the largest label present in that split.
y_train = to_categorical(y_train, num_classes=num_labels)
y_test = to_categorical(y_test, num_classes=num_labels)

In [3]:
# Step 2: Model construction
# Builds a three-layer CNN classifier: Conv2D -> MaxPooling2D -> Conv2D ->
# MaxPooling2D -> Conv2D -> Flatten -> Dropout -> Dense -> softmax.

# Hyperparameters (batch_size is used later by the fit/evaluate cells).
input_shape = (image_size, image_size, 1)
batch_size = 128
kernel_size = 3
pool_size = 2
filters = 64
dropout = 0.2

# model is a stack of CNN-ReLU-MaxPooling

model = Sequential()
# The major change here is the use of the Conv2D layers. The ReLU activation function is already an argument of Conv2D.
# The ReLU function can be brought out as an Activation layer when the batch normalization layer is included in the model.
# Batch normalization is used in deep CNNs so that large learning rates can be utilized without causing instability
# during training.

# If, in the MLP model, the number of units characterizes the Dense layers, the kernel characterizes the CNN operations.
# The kernel can be visualized as a rectangular patch or window that slides through the whole image from left to right,
# and from top to bottom. This operation is called convolution.
# It transforms the input image into a feature map, which is a representation of what the kernel has learned from the
# input image. The feature map is then transformed into another feature map in the succeeding layer and so on.
# The number of feature maps generated per Conv2D is controlled by the filters argument.

# You'll notice that the resulting feature map is smaller than the original input image, this is because the convolution
# is only performed on valid elements. The kernel cannot go beyond the borders of the image.
# If the dimensions of the input should be the same as the output feature maps, Conv2D accepts the option padding='same'.
# The input is padded with zeros around its borders to keep the dimensions unchanged after the convolution.

# First convolution: the only layer that declares the input shape.
model.add(Conv2D(filters=filters,
                 kernel_size=kernel_size,
                 activation='relu',
                 input_shape=input_shape))

# The significance of MaxPooling2D is the reduction in feature map size, which translates to an increase in receptive field
# size. For example, after MaxPooling2D(2), the 2 × 2 kernel is now approximately convolving with a 4 × 4 patch.
# The CNN has learned a new set of feature maps for a different receptive field size.

# In Conv2D and MaxPooling2D, both pool_size and kernel_size can be non-square. In these cases, both the row and column
# sizes must be indicated. For example, pool_size = (1, 2) and kernel_size = (3, 5)

model.add(MaxPooling2D(pool_size))

model.add(Conv2D(filters=filters,
                 kernel_size=kernel_size,
                 activation='relu'))
model.add(MaxPooling2D(pool_size))
# Third convolution: no input_shape here. The input shape belongs only on the
# first layer of a Sequential model; later layers take theirs from the
# preceding layer's output.
model.add(Conv2D(filters=filters,
                 kernel_size=kernel_size,
                 activation='relu'))

# The output of the last Conv2D operation is a stack of feature maps. The role of Flatten is to convert the stack of
# feature maps into a vector format that is suitable for either Dropout or Dense layers, similar to the MLP model output
# layer.

model.add(Flatten())

# dropout added as regularizer
model.add(Dropout(dropout))

# output layer: one unit per class, turned into class probabilities by softmax
model.add(Dense(num_labels))
model.add(Activation('softmax'))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 26, 26, 64)        640       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 13, 13, 64)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 11, 11, 64)        36928     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 5, 5, 64)          0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 3, 3, 64)          36928     
_________________________________________________________________
flatten (Flatten)            (None, 576)               0         
_________________________________________________________________
dropout (Dropout)            (None, 576)               0

In [4]:
# Step 3: Model compile
# Configure training: Adam optimizer, categorical cross-entropy loss
# (the labels are one-hot encoded), and accuracy as the reported metric.

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [5]:
# Step 4: Model fit
# Train on the training set for 10 epochs in mini-batches of `batch_size`.
# The call is left as the cell's final expression, so the returned History
# object is echoed as the cell output.

model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2678747ec70>

In [6]:
# Model evaluation on the held-out test set.
# A maximum test accuracy of 99.3% can be achieved for a 3-layer network with
# 64 feature maps per layer using the Adam optimizer with dropout=0.2.
# No random seed is set in this notebook, so the figure varies between runs.
# CNNs are more parameter efficient and have a higher accuracy than MLPs.
# Likewise, CNNs are also suitable for learning representations from
# sequential data, images, and videos.

# evaluate() yields the loss followed by the compiled metric (accuracy);
# only the accuracy is kept.
_, acc = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=0)
print("\nTest accuracy: %.1f%%" % (100.0 * acc))


Test accuracy: 99.0%
