In [1]:
import math
import numpy as np
from keras.datasets import mnist
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split


def initialize_parameters(layer_dims):
    # Initializing the variables
    np.random.seed(1)
    parameters = {}
    # Create each layer based on the dimentions received
    for layer in range(1, len(layer_dims)):
        n = layer_dims[layer - 1]
        parameters["W" + str(layer)] = np.random.randn(
            layer_dims[layer], layer_dims[layer - 1]
        ) * np.sqrt(2 / n)
        parameters["b" + str(layer)] = np.zeros((layer_dims[layer], 1))
    return parameters


def linear_forward(A, W, b):
    # Calculate Z
    Z = np.dot(W, A) + b
    linear_cache = {"A": A, "W": W, "b": b}
    return Z, linear_cache


def softmax(Z):
    # Apply the softmax activation function
    exp = np.exp(Z - np.max(Z))
    A = exp / np.sum(exp, axis=0)
    return A, Z


def relu(Z):
    # Apply the relu activation function
    A = np.maximum(0, Z)
    activation_cache = Z
    return A, activation_cache


def linear_activation_forward(A_prev, W, B, activation):
    # Initializing the variables
    activation_function = {"softmax": softmax, "relu": relu}
    Z, linear_cache = linear_forward(A_prev, W, B)
    # Apply the activation function
    A, activation_cache = activation_function[activation](Z)
    layer_cache = {"linear_cache": linear_cache, "activation_cache": activation_cache}
    return A, layer_cache


def apply_batchnorm(A):
    epsilon = 0.00000001
    norm = (A - A.mean()) / np.sqrt(A.var() + epsilon)
    return norm


def L_model_forward(X, parameters, use_batchnorm=True):
    # Initializng the variables
    caches = []
    A = X
    layers = len(parameters) // 2
    # Activate all layers except of the last with relu
    for layer in range(1, layers):
        A, layer_cache = linear_activation_forward(
            A, parameters["W" + str(layer)], parameters["b" + str(layer)], "relu"
        )
        caches.append(layer_cache)
        A = A if not use_batchnorm else apply_batchnorm(A)
    # Activate the last layer with softmax
    AL, layer_cache = linear_activation_forward(
        A,
        parameters["W" + str(layers)],
        parameters["b" + str(layers)],
        "softmax",
    )
    caches.append(layer_cache)
    return AL, caches



def compute_cost(AL, Y):
    batch_size = Y.shape[1]
    # Calculating cost
    cost = -1.0 / batch_size * np.sum(Y * np.log(AL) + (1 - Y) * np.log(1 - AL))
    return cost


def Linear_backward(dZ, cache):
    # Initializng the variables
    A_prev = cache["A"]
    W = cache["W"]
    batch_size = A_prev.shape[1]
    # Calculating previous dA, dW, db
    dA_prev = np.dot(W.T, dZ)
    dW = 1.0 / batch_size * np.dot(dZ, A_prev.T)
    db = 1.0 / batch_size * np.sum(dZ, axis=1, keepdims=True)
    return dA_prev, dW, db


def relu_backward(dA, activation_cache):
    # Calculates the relu derivative
    dZ = np.array(dA)
    dZ[activation_cache <= 0] = 0
    return dZ


def softmax_backward(dA, activation_cache):
    # Calculates the softmax derivative
    sm, cache = softmax(activation_cache)
    dZ = dA * (sm * (1 - sm))
    return dZ


def linear_activation_backward(dA, cache, activation):
    # Initializing the variables
    linear_cache = cache["linear_cache"]
    activation_cache = cache["activation_cache"]
    activation_backward_functions = {"relu": relu_backward, "softmax": softmax_backward}
    # Apply the activation function
    dZ = activation_backward_functions[activation](dA, activation_cache)
    return Linear_backward(dZ, linear_cache)


def L_model_backward(AL, Y, caches):
    # Initializng the variables
    grads = {}
    layers = len(caches)
    # To make sure Y has the same shape as AL
    Y = Y.reshape(AL.shape)
    # Calculates the post activation gradient
    dAL = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    # For the last layer, apply softmax
    cache = caches[layers - 1]
    (
        grads["dA" + str(layers - 1)],
        grads["dW" + str(layers)],
        grads["db" + str(layers)],
    ) = linear_activation_backward(dAL, cache, "softmax")
    # For all other layers, apply relu
    for layer in reversed(range(1, layers)):
        cache = caches[layer - 1]
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(
            grads["dA" + str(layer)], cache, "relu"
        )
        grads["dA" + str(layer - 1)] = dA_prev_temp
        grads["dW" + str(layer)] = dW_temp
        grads["db" + str(layer)] = db_temp
    return grads


def Update_parameters(parameters, grads, learning_rate):
    # Updates the weights and biases of each layer, based on the learning rate
    for layer in range(1, (len(parameters) // 2) + 1):
        parameters["W" + str(layer)] = (
            parameters["W" + str(layer)] - learning_rate * grads["dW" + str(layer)]
        )
        parameters["b" + str(layer)] = (
            parameters["b" + str(layer)] - learning_rate * grads["db" + str(layer)]
        )
    return parameters


def L_layer_model(X, Y, layers_dims, learning_rate, num_iterations, batch_size):
    # Initializing variables
    costs = []
    parameters = initialize_parameters(layers_dims)
    x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size=0.2)
    stop_count = 0
    step = 0
    val_acc = 0
    y_temp = to_categorical(y_train)
    # Run until iteration limit reached
    for i in range(1, num_iterations + 1):
        # Run on all batches
        for j in range(math.ceil(X.shape[1] / batch_size)):
            step +=1
            start = j * batch_size
            end = (j + 1) * batch_size
            x_batch = x_train.T[:, start:end]
            y_batch = y_temp.T[:, start:end]
            # Forward propagation
            AL, caches = L_model_forward(x_batch, parameters)
            # Cost computation
            cost = compute_cost(AL, y_batch)
            # Backward propagation
            grads = L_model_backward(AL, y_batch, caches)
            # Parameters updated
            parameters = Update_parameters(parameters, grads, learning_rate)
            # Check validation set accuracy
            temp = Predict(x_val, y_val, parameters)
            # Save cost each 100 training steps
            if step % 100 == 0:
                costs.append({"Step " +str(step):cost})
            # Count steps with small or no improvement
            if temp <= val_acc + 0.0001:
                stop_count += 1
            else:
                val_acc = temp
                stop_count = 0
            # If stop criterion reached, break the loop
            if stop_count >= 100:
                break
        # If stop criterion reached, break the loop
        if stop_count >= 100:
            print("Stop criterion reached after "+ str(i)+" epochs")
            break
    # Print the model accuracy on the train and validation sets
    print("Train Accuracy = ", str(Predict(x_train, y_train, parameters)))
    print("Validation Accuracy = ", str(Predict(x_val, y_val, parameters)))
    return parameters, costs


def Predict(X, y, parameters):
    # Gets the probabilities for each sample
    probs, caches = L_model_forward(X.T, parameters)
    probs = probs.T
    # Gets the predicted categorization of each sample
    pr = np.argmax(probs, axis=1)
    # Calculates the accuracy based on correct predictions
    acc = np.mean(pr == y)
    return acc

# Loads the data and flattens the input
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.reshape(x_train.shape[0], -1)
x_train = x_train / 255
x_test = x_test.reshape(x_test.shape[0], -1)
x_test = x_test / 255
# Initializing hyperparameters
dims = [x_train.shape[1], 20, 7, 5, 10]
learn_rate = 0.009
iters = 750
batch_size = 28
params, costs = L_layer_model(x_train, y_train, dims, learn_rate, iters, batch_size)
# Print the model accuracy on the test set
print("Test Accuracy = ", str(Predict(x_test, y_test, params)))
# Print the costs of the 100th training step iterations
print("Costs: ",costs)

Stop criterion reached after 271 epochs
Train Accuracy =  0.785875
Validation Accuracy =  0.7808333333333334
Test Accuracy =  0.7939
Costs:  [{'Step 100': 3.061221368422179}, {'Step 200': 2.5833868158406403}, {'Step 300': 2.574453038951019}, {'Step 400': 2.5372178368863034}, {'Step 500': 2.4319804882206175}, {'Step 600': 2.2315156512018404}, {'Step 700': 2.25172659536874}, {'Step 800': 2.305951450636306}, {'Step 900': 2.1157310170394745}, {'Step 1000': 2.082152374094138}, {'Step 1100': 2.0094809763924317}, {'Step 1200': 1.9780949053525554}, {'Step 1300': 1.834433266333368}, {'Step 1400': 1.9147221448393283}, {'Step 1500': 2.0041870257895185}, {'Step 1600': 1.9498583118342048}, {'Step 1700': 1.89568394274596}, {'Step 1800': 1.7932269031413106}, {'Step 1900': 1.6998971974108454}, {'Step 2000': 1.5979601861592418}, {'Step 2100': 1.6707318966410636}, {'Step 2200': 1.8077101244714513}, {'Step 2300': 1.7689401412778156}, {'Step 2400': 1.747732123860466}, {'Step 2500': 1.6420903961380062}, {'