In [1]:
# loading libraries for data manipulation
import numpy as np
import pandas as pd

# loading libraries for data visualization
import matplotlib.pyplot as plt
from plotnine import *

# import tensorflow and keras packages
import tensorflow as tf
from tensorflow import keras

import warnings
warnings.filterwarnings('ignore')

We will load the MNIST data again.

In [13]:
# Pull the MNIST train/test splits that ship with keras.datasets
(X_train_mnist, y_train_mnist), (X_test_mnist, y_test_mnist) = keras.datasets.mnist.load_data()

# Flatten every 28x28 image into a 784-long float32 row and divide by 255.0
X_train_mnist, X_test_mnist = (
    images.reshape(-1, 784).astype('float32') / 255.0
    for images in (X_train_mnist, X_test_mnist)
)

# Turn the integer digit labels into 10-column one-hot vectors
y_train_mnist, y_test_mnist = (
    keras.utils.to_categorical(labels, num_classes=10)
    for labels in (y_train_mnist, y_test_mnist)
)



Let's first build a fully connected deep neural network to classify digits. The only regularization used here is dropout (rate 0.2) after each hidden layer.

In [3]:
# Fully connected baseline: two 256-unit ReLU hidden layers, each followed by
# 20% dropout, and a 10-unit softmax output layer (one unit per digit class).
model = keras.Sequential([
        # Declare the input shape with keras.Input rather than the deprecated
        # `input_shape=` layer argument; each sample is a 784-long vector.
        keras.Input(shape=(784,)),
        keras.layers.Dense(256,activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(256,activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(10,activation='softmax') # output layer
    ])

# categorical_crossentropy pairs with the one-hot encoded labels
model.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])

# NOTE: the test split is passed as validation data, so the reported val_*
# metrics are test-set metrics rather than metrics on a held-out validation set.
model.fit(X_train_mnist,y_train_mnist,
                           epochs=5,verbose=1,batch_size=128,
                           validation_data=(X_test_mnist,y_test_mnist))

Epoch 1/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9092 - loss: 0.3077 - val_accuracy: 0.9624 - val_loss: 0.1275
Epoch 2/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9612 - loss: 0.1268 - val_accuracy: 0.9719 - val_loss: 0.0880
Epoch 3/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9711 - loss: 0.0931 - val_accuracy: 0.9735 - val_loss: 0.0796
Epoch 4/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9769 - loss: 0.0744 - val_accuracy: 0.9749 - val_loss: 0.0767
Epoch 5/5
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9804 - loss: 0.0608 - val_accuracy: 0.9805 - val_loss: 0.0624


<keras.src.callbacks.history.History at 0x159b85be0>

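This is a feed-forward, fully connected neural network. That is, information flows in one direction from the input to the output (with gradients propagated backward during training), and every node in one layer is connected to every node in the next layer. Because each image is flattened into a vector of 784 values before it enters the network, this architecture is not able to capture any spatial properties in the input data.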

Now let's train a Convolutional Neural Network. 

First we will reload our input data but now in the form of images. We will keep the 28x28 shape of the inputs. Note that the 1 in 28x28x1 indicates the number of channels. Here it is one to indicate grayscale images. 

In [4]:
# Reload the raw MNIST splits so the images can be kept in their 2-D layout
(X_train_mnist, y_train_mnist), (X_test_mnist, y_test_mnist) = keras.datasets.mnist.load_data()

# Reshape to (samples, 28, 28, 1) -- the trailing 1 is the single grayscale
# channel -- then cast to float32 and divide by 255.0
X_train_mnist, X_test_mnist = (
    images.reshape((-1, 28, 28, 1)).astype('float32') / 255.0
    for images in (X_train_mnist, X_test_mnist)
)

# Turn the integer digit labels into 10-column one-hot vectors
y_train_mnist, y_test_mnist = (
    keras.utils.to_categorical(labels, num_classes=10)
    for labels in (y_train_mnist, y_test_mnist)
)

There are some new layers to get familiar with. 

- Conv2D - convolution layer with filter size, number of filters, stride, and padding
- MaxPooling2D - max pooling layer with pool size and stride

In [None]:
# Baseline CNN: two Conv2D + MaxPooling2D stages followed by a small dense
# classifier. A literal `[...]` placeholder is not a valid layer list (it holds
# the Ellipsis object), so a concrete architecture is spelled out here.
# Stride and padding are written out explicitly (these are the Conv2D defaults)
# so they are easy to modify.
model = keras.Sequential([
        keras.Input(shape=(28,28,1)),  # 28x28 images with 1 (grayscale) channel
        keras.layers.Conv2D(filters=32,kernel_size=(3,3),strides=(1,1),
                            padding='valid',activation='relu'),
        keras.layers.MaxPooling2D(pool_size=(2,2),strides=(2,2)),
        keras.layers.Conv2D(filters=64,kernel_size=(3,3),strides=(1,1),
                            padding='valid',activation='relu'),
        keras.layers.MaxPooling2D(pool_size=(2,2),strides=(2,2)),
        keras.layers.Flatten(),  # feature maps -> vector for the dense layers
        keras.layers.Dense(128,activation='relu'),
        keras.layers.Dense(10,activation='softmax') # output layer
    ])

# categorical_crossentropy pairs with the one-hot encoded labels
model.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])

# NOTE: the test split is passed as validation data, so the reported val_*
# metrics are test-set metrics rather than metrics on a held-out validation set.
model.fit(X_train_mnist,y_train_mnist,
                           epochs=5,verbose=1,batch_size=128,
                           validation_data=(X_test_mnist,y_test_mnist))

The model summary will show that each convolution layer now outputs a stack of feature maps (one per filter), which the max pooling layers then downsample to further reduce their spatial size. Note that there are no additional parameters to learn from the pooling layers.

In [None]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the current `model`
model.summary()

Let's modify the padding and stride values. 

- Larger stride values reduce spatial resolution: the feature map shrinks faster
- Also speeds up training but the network might miss finer details

- padding = "same" retains spatial dimensions, helps detect objects touching the boundaries
- Also means you can stack more layers in the network without shrinking feature maps too much

- padding = "valid" means no padding: reduces feature map dimensions
- Faster but loses edge information

In [None]:
# CNN variant with modified stride and padding. A literal `[...]` placeholder
# is not a valid layer list (it holds the Ellipsis object), so a concrete
# architecture is spelled out here.
# - strides=(2,2) in the convolutions shrinks the feature maps faster
# - padding='same' zero-pads the input so the border pixels are still covered
model = keras.Sequential([
        keras.Input(shape=(28,28,1)),  # 28x28 images with 1 (grayscale) channel
        keras.layers.Conv2D(filters=32,kernel_size=(3,3),strides=(2,2),
                            padding='same',activation='relu'),
        keras.layers.MaxPooling2D(pool_size=(2,2),strides=(2,2)),
        keras.layers.Conv2D(filters=64,kernel_size=(3,3),strides=(2,2),
                            padding='same',activation='relu'),
        keras.layers.MaxPooling2D(pool_size=(2,2),strides=(2,2)),
        keras.layers.Flatten(),  # feature maps -> vector for the dense layers
        keras.layers.Dense(128,activation='relu'),
        keras.layers.Dense(10,activation='softmax') # output layer
    ])

# categorical_crossentropy pairs with the one-hot encoded labels
model.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])

# NOTE: the test split is passed as validation data, so the reported val_*
# metrics are test-set metrics rather than metrics on a held-out validation set.
model.fit(X_train_mnist,y_train_mnist,
                           epochs=5,verbose=1,batch_size=128,
                           validation_data=(X_test_mnist,y_test_mnist))

In [None]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the current `model`
model.summary()

In [None]:

# Score the test images, then collapse both the predicted probabilities and
# the one-hot labels to integer class ids
y_pred_probs = model.predict(X_test_mnist)
y_pred = y_pred_probs.argmax(axis=1)  # predicted classes
y_true = y_test_mnist.argmax(axis=1)  # actual classes

# Positions in the test set where the prediction agrees / disagrees with the label
is_match = y_pred == y_true
correct_indices = np.nonzero(is_match)[0]
incorrect_indices = np.nonzero(~is_match)[0]

# display up to 9 predictions in each case
def show_images(indices, title, n=9):
    """Plot up to ``n`` test images in a single row with their labels.

    Reads the module-level ``X_test_mnist``, ``y_true`` and ``y_pred`` arrays;
    these must already be defined when the function is called.

    Parameters
    ----------
    indices : sequence of int
        Positions in the test set to display; only the first ``n`` are used.
    title : str
        Figure-level title.
    n : int, default 9
        Maximum number of images to show.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``. If there is nothing to plot,
        a short message is printed instead of an empty figure.
    """
    shown = indices[:n]
    if len(shown) == 0:
        # e.g. no misclassified images -- avoid rendering a blank figure
        print(f"{title} (no images to display)")
        return

    # Size the grid to the number of images actually available so that a short
    # list does not leave empty columns; squeeze=False keeps `axes` 2-D even
    # when only one image is shown.
    fig, axes = plt.subplots(1, len(shown), figsize=(10, 4), squeeze=False)
    for ax, idx in zip(axes[0], shown):
        ax.imshow(X_test_mnist[idx].reshape(28, 28), cmap='gray')
        ax.set_title(f"True: {y_true[idx]}\nPred: {y_pred[idx]}")
        ax.axis('off')  # pixel-coordinate ticks add nothing for digit images
    fig.suptitle(title)
    fig.tight_layout()
    plt.show()

# Render one row of examples for the correct predictions, then one for the misses
for subset, heading in ((correct_indices, "Correct!"),
                        (incorrect_indices, "Incorrect!")):
    show_images(subset, heading)
