### Code

In [38]:
import math
import numpy as np
import tensorflow as tf
from tensorflow.keras import optimizers
from tensorflow.keras.datasets import mnist

#### Simple Dense Class

In [2]:
# Dense Class
class NaiveDense:
    """A minimal fully connected layer computing ``activation(inputs @ W + b)``."""

    def __init__(self, input_size, output_size, activation):
        """Create the layer's trainable parameters.

        Args:
            input_size: Number of rows of the weight matrix ``W``.
            output_size: Number of columns of ``W`` and length of the bias ``b``.
            activation: Callable applied to the affine output in the forward
                pass, e.g. ``tf.nn.relu`` or ``tf.nn.softmax``.
        """
        # Function applied as the last step of the forward pass.
        self.activation = activation

        # Weight matrix of shape (input_size, output_size), initialised with
        # random uniform values between 0 and 0.1.
        self.W = tf.Variable(
            tf.random.uniform((input_size, output_size), minval=0, maxval=1e-1)
        )

        # Bias vector of shape (output_size,), initialised to zeros.
        self.b = tf.Variable(tf.zeros((output_size,)))

    def __call__(self, inputs):
        """Run the forward pass: matrix product with ``W``, add ``b``, activate."""
        affine_output = tf.matmul(inputs, self.W) + self.b
        return self.activation(affine_output)

    @property
    def weights(self):
        """The layer's parameters as the two-element list ``[W, b]``."""
        return [self.W, self.b]

#### Simple Sequential Class 

In [3]:
# Sequential Class
class NaiveSequential:
    """Chains layers so that each layer's output becomes the next layer's input."""

    def __init__(self, layers):
        """Keep the ordered sequence of layers to apply.

        Args:
            layers: Sequence of callables; each must also expose a ``weights``
                attribute for the ``weights`` property to work.
        """
        self.layers = layers

    def __call__(self, inputs):
        """Pass ``inputs`` through every layer in order and return the final output."""
        activations = inputs
        for apply_layer in self.layers:
            activations = apply_layer(activations)
        return activations

    @property
    def weights(self):
        """Flat list of all layers' weights, concatenated in layer order."""
        return [param for layer in self.layers for param in layer.weights]

#### Mock Keras Model

In [4]:
# Two-layer network: 28 * 28 = 784 inputs -> 512 units (ReLU) -> 10 units (softmax).
# The layers are created in the same order in which they are applied.
hidden_layer = NaiveDense(input_size=28 * 28, output_size=512, activation=tf.nn.relu)
output_layer = NaiveDense(input_size=512, output_size=10, activation=tf.nn.softmax)

model = NaiveSequential([hidden_layer, output_layer])

In [16]:
# model.weights is ordered [W, b] for each layer in turn, so indices 0 and 1
# are the weight matrix and bias of the first layer, and indices 2 and 3 are
# the weight matrix and bias of the output layer.
for weight_index in range(4):
    print(model.weights[weight_index].shape)

(784, 512)
(512,)
(512, 10)
(10,)


#### Data Batch Generator

In [19]:
# Batch Generator Class
class BatchGenerator:
    """Hands out consecutive fixed-size slices of paired images and labels."""

    def __init__(self, images, labels, batch_size=128):
        """Set up iteration over the dataset.

        Args:
            images: Sliceable collection of samples; must support ``len()``.
            labels: Sliceable collection of labels, sliced with the same
                indices as ``images``.
            batch_size: Number of samples per batch (default 128).
        """
        # Position at which the next batch starts.
        self.index = 0
        self.images = images
        self.labels = labels
        self.batch_size = batch_size
        # Rounded up so that a final, smaller batch is still counted.
        self.num_batches = math.ceil(len(images) / batch_size)

    def next(self):
        """Return the next ``(images, labels)`` batch and advance the cursor.

        Slicing past the end of the data is not guarded against here; the
        result is whatever the underlying containers return for that slice.
        """
        start = self.index
        stop = start + self.batch_size

        image_batch = self.images[start:stop]
        label_batch = self.labels[start:stop]

        # Move the cursor to the start of the following batch.
        self.index = stop
        return image_batch, label_batch

#### One Training Step Calculating Function

**Process:** Updating the weights of the model after running it on one batch of data

**Steps:**
1. Compute the predictions of the model for the images in the batch.
2. Compute the loss value for these predictions, given the actual labels.
3. Compute the gradient of the loss with regard to the model’s weights.
4. Move the weights by a small amount in the direction opposite to the gradient.

##### Tensorflow `GradientTape` Object | Calculate Gradients | Update Weights

Nowadays people implement neural networks in modern frameworks that are capable of automatic differentiation, such as TensorFlow. 

Automatic differentiation is implemented with the help of a computation graph.

**GradientTape in Tensorflow**

The API through which you can leverage TensorFlow’s powerful automatic differentiation capabilities is the GradientTape. 

It’s a Python scope that will “record” the tensor operations that run inside it, in the form of a computation graph (sometimes called a “tape”). 

This graph can then be used to retrieve the gradient of any output with respect to any variable or set of variables (instances of the tf.Variable class). 

A tf.Variable is a specific kind of tensor meant to hold mutable state—for instance, the weights of a neural network are always tf.Variable instances.

In [23]:
# Scalar example: a Variable starting at 0 and the linear function y = 2x + 3.
x = tf.Variable(0.)

# Tensor operations run inside the tape's scope are recorded for differentiation.
with tf.GradientTape() as tape:
    y = 2 * x + 3

# Ask the tape for the gradient of the output y with respect to the variable x.
grad_of_y_wrt_x = tape.gradient(y, x)

# Print the variable, the output and the gradient, one labelled line each.
for label, value in (("x =", x), ("y =", y), ("gradient =", grad_of_y_wrt_x)):
    print(label, value)

x = <tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.0>
y = tf.Tensor(3.0, shape=(), dtype=float32)
gradient = tf.Tensor(2.0, shape=(), dtype=float32)


In [24]:
# The same tape mechanics applied to a 2x2 Variable instead of a scalar.
x = tf.Variable(tf.random.uniform((2, 2)))

# Record the operation y = 2x + 3 on the tape.
with tf.GradientTape() as tape:
    y = 2 * x + 3

# Gradient of y with respect to x.
grad_of_y_wrt_x = tape.gradient(y, x)

# Print the variable, the output and the gradient, one labelled entry each.
for label, value in (("x =", x), ("y =", y), ("gradient =", grad_of_y_wrt_x)):
    print(label, value)

x = <tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[0.89283407, 0.88217425],
       [0.9997405 , 0.37762022]], dtype=float32)>
y = tf.Tensor(
[[4.7856684 4.7643485]
 [4.999481  3.7552404]], shape=(2, 2), dtype=float32)
gradient = tf.Tensor(
[[2. 2.]
 [2. 2.]], shape=(2, 2), dtype=float32)


In [25]:
# Gradients can be requested for several variables at once by passing a list.
W = tf.Variable(tf.random.uniform((2, 2)))
b = tf.Variable(tf.zeros((2,)))
x = tf.random.uniform((2, 2))

# Record the affine transform y = x @ W + b on the tape.
with tf.GradientTape() as tape:
    y = tf.matmul(x, W) + b

# The result is a list with one gradient per entry of [W, b], in that order.
grad_of_y_wrt_W_and_b = tape.gradient(y, [W, b])

# Print every input, the output and the gradients, one labelled entry each.
for label, value in (
    ("W =", W),
    ("b =", b),
    ("x =", x),
    ("y =", y),
    ("gradient =", grad_of_y_wrt_W_and_b),
):
    print(label, value)

W = <tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[0.5926453 , 0.4134344 ],
       [0.03614008, 0.11172056]], dtype=float32)>
b = <tf.Variable 'Variable:0' shape=(2,) dtype=float32, numpy=array([0., 0.], dtype=float32)>
x = tf.Tensor(
[[0.49944055 0.98948157]
 [0.3720045  0.22848749]], shape=(2, 2), dtype=float32)
y = tf.Tensor(
[[0.33175105 0.31703132]
 [0.22872427 0.1793262 ]], shape=(2, 2), dtype=float32)
gradient = [<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[0.87144506, 0.87144506],
       [1.2179691 , 1.2179691 ]], dtype=float32)>, <tf.Tensor: shape=(2,), dtype=float32, numpy=array([2., 2.], dtype=float32)>]


##### Training Function - One Step

In [30]:
def one_training_step(model, images_batch, labels_batch):
    """Run one forward/backward pass on a batch and update the model's weights.

    Args:
        model: Callable model exposing a ``weights`` list.
        images_batch: Inputs fed to ``model``.
        labels_batch: Labels passed to sparse categorical cross-entropy as
            the true values.

    Returns:
        The mean loss over the batch, computed before the weight update.
    """
    # The forward pass and loss must run inside the tape so they are recorded.
    with tf.GradientTape() as tape:
        outputs = model(images_batch)
        batch_loss = tf.reduce_mean(
            tf.keras.losses.sparse_categorical_crossentropy(labels_batch, outputs)
        )

    # One gradient per entry of model.weights, in the same order.
    grads = tape.gradient(batch_loss, model.weights)

    # Apply the update using the module-level update_weights function.
    update_weights(grads, model.weights)

    return batch_loss

##### `update_weights` Function

The “weight update” step (the `update_weights` call in the training step above) moves the weights by “a bit” in a direction that will reduce the loss on this batch. 

The magnitude of the move is determined by the “learning rate,” typically a small quantity.

In [26]:
# Step size used by the manual weight update below.
learning_rate = 1e-3


def update_weights(gradients, weights):
    """Subtract ``gradient * learning_rate`` from each weight, in place.

    Args:
        gradients: Iterable of gradients, paired positionally with ``weights``.
        weights: Iterable of objects providing ``assign_sub``.
    """
    for grad, variable in zip(gradients, weights):
        step = grad * learning_rate
        # assign_sub is the in-place equivalent of ``variable -= step``.
        variable.assign_sub(step)

In practice we never implement a weight update step like this.

Instead, we would use an Optimizer instance from Keras.

In [29]:
# Keras SGD optimizer with a learning rate of 1e-3.
optimizer = optimizers.SGD(learning_rate=1e-3)


# Same signature as the manual version, so existing callers keep working;
# the update itself is delegated to the optimizer.
def update_weights(gradients, weights):
    """Apply ``gradients`` to ``weights`` through the module-level optimizer."""
    grads_and_vars = zip(gradients, weights)
    optimizer.apply_gradients(grads_and_vars)

#### Fit Function - Full Training Loop Using One Training Step Function

In [31]:
def fit(model, images, labels, epochs, batch_size=128):
    """Train ``model`` with mini-batch updates for ``epochs`` passes over the data.

    Args:
        model: Model passed through to ``one_training_step``.
        images: Training inputs; sliced into batches by ``BatchGenerator``.
        labels: Training labels, sliced with the same indices as ``images``.
        epochs: Number of full passes over the data.
        batch_size: Number of samples per batch (default 128).

    Prints the epoch number at the start of each epoch and the batch loss
    every 100 batches. Returns nothing.
    """
    for epoch_counter in range(epochs):
        print(f"Epoch {epoch_counter}")

        # A fresh generator per epoch restarts iteration at index 0.
        # batch_size has to be forwarded explicitly; without it the generator
        # silently uses its own default and the caller's value is ignored.
        batch_generator = BatchGenerator(images, labels, batch_size=batch_size)

        # num_batches is derived from the dataset length and the batch size.
        for batch_counter in range(batch_generator.num_batches):
            images_batch, labels_batch = batch_generator.next()

            # Computes the batch loss and updates the model's weights.
            loss = one_training_step(model, images_batch, labels_batch)

            # Report progress every 100 batches.
            if batch_counter % 100 == 0:
                print(f"loss at batch {batch_counter}: {loss:.2f}")

#### Test Drive of the Train Function

In [34]:
# Load MNIST; load_data() returns a (train, test) pair of (images, labels) tuples
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

In [35]:
# Flatten each image into a 28 * 28 = 784-element vector and divide by 255
# (presumably pixel values are 0-255, giving a 0-1 range - confirm).
# The sample count is taken from the data instead of being hard-coded.
# NOTE: this cell overwrites its own inputs, so re-running it without first
# reloading the data divides by 255 a second time.
train_images = train_images.reshape((len(train_images), 28 * 28))
train_images = train_images.astype("float32") / 255
test_images = test_images.reshape((len(test_images), 28 * 28))
test_images = test_images.astype("float32") / 255

In [36]:
# Train the model on the training set for 10 epochs, requesting batches of 128 samples
fit(model, train_images, train_labels, epochs=10, batch_size=128)

Epoch 0
loss at batch 0: 4.66
loss at batch 100: 2.25
loss at batch 200: 2.25
loss at batch 300: 2.12
loss at batch 400: 2.28
Epoch 1
loss at batch 0: 1.94
loss at batch 100: 1.89
loss at batch 200: 1.88
loss at batch 300: 1.73
loss at batch 400: 1.89
Epoch 2
loss at batch 0: 1.61
loss at batch 100: 1.59
loss at batch 200: 1.54
loss at batch 300: 1.44
loss at batch 400: 1.56
Epoch 3
loss at batch 0: 1.35
loss at batch 100: 1.35
loss at batch 200: 1.27
loss at batch 300: 1.22
loss at batch 400: 1.31
Epoch 4
loss at batch 0: 1.14
loss at batch 100: 1.16
loss at batch 200: 1.06
loss at batch 300: 1.05
loss at batch 400: 1.14
Epoch 5
loss at batch 0: 0.99
loss at batch 100: 1.02
loss at batch 200: 0.92
loss at batch 300: 0.93
loss at batch 400: 1.01
Epoch 6
loss at batch 0: 0.88
loss at batch 100: 0.91
loss at batch 200: 0.81
loss at batch 300: 0.84
loss at batch 400: 0.92
Epoch 7
loss at batch 0: 0.80
loss at batch 100: 0.82
loss at batch 200: 0.73
loss at batch 300: 0.77
loss at batch 40

#### Evaluating the Model

In [57]:
# Run the model on the whole test set; the result has one row per sample
# and one column per class (softmax output of the last layer)
predictions = model(test_images)
print("Shape", predictions.shape)
print("First 2 Prediction as Tensorflow Variable/Tensor\n", predictions[0:2])

print()

# Convert the TensorFlow tensor returned by the model into a NumPy array
# NOTE(review): this rebinds `predictions` to a different type
predictions = predictions.numpy()
print("Shape", predictions.shape)
print("First 2 Prediction as Numpy Tensor\n", predictions[0:2])

print()

# For each row, take the column index of the largest value;
# that index is used as the predicted class label
predicted_labels = np.argmax(predictions, axis=1)
print("Shape", predicted_labels.shape)
# NOTE(review): the label text below is copy-pasted; the values printed are predicted class labels
print("First 2 Prediction as Numpy Tensor\n", predicted_labels[0:2])

print()

# Element-wise comparison of predicted and actual labels: True where they agree
matches = predicted_labels == test_labels
print("Shape", matches.shape)
# NOTE(review): the label text below is copy-pasted; the values printed are match flags
print("First 2 Prediction as Numpy Tensor\n", matches[0:2])

print()

# Accuracy is the mean of the match flags, i.e. True / (True + False)
print(f"accuracy: {matches.mean():.2f}")

Shape (10000, 10)
First 2 Prediction as Tensorflow Variable/Tensor
 tf.Tensor(
[[1.2674748e-02 1.5277988e-03 4.2626788e-03 1.5320774e-02 1.8537574e-02
  7.2525530e-03 2.5645422e-03 8.4755456e-01 8.8109467e-03 8.1493877e-02]
 [6.8721779e-02 2.1453366e-02 3.0072197e-01 2.7287203e-01 2.3117899e-03
  7.3553540e-02 1.8272300e-01 5.7506381e-04 7.4266292e-02 2.8011312e-03]], shape=(2, 10), dtype=float32)

Shape (10000, 10)
First 2 Prediction as Numpy Tensor
 [[1.2674748e-02 1.5277988e-03 4.2626788e-03 1.5320774e-02 1.8537574e-02
  7.2525530e-03 2.5645422e-03 8.4755456e-01 8.8109467e-03 8.1493877e-02]
 [6.8721779e-02 2.1453366e-02 3.0072197e-01 2.7287203e-01 2.3117899e-03
  7.3553540e-02 1.8272300e-01 5.7506381e-04 7.4266292e-02 2.8011312e-03]]

Shape (10000,)
First 2 Prediction as Numpy Tensor
 [7 2]

Shape (10000,)
First 2 Prediction as Numpy Tensor
 [ True  True]

accuracy: 0.82
