In [1]:
import numpy as np
from keras.datasets import mnist

In [2]:
NORMALISATION = 255
NUM_DIGITS = 10
PIXELS = 28 * 28
TRAIN_SIZE = 1000
VALIDATION_SIZE = int(TRAIN_SIZE * 20 / 100)

ALPHA = 0.001
ITERATIONS = 40
HIDDEN_SIZE = 100
BATCH_SIZE = 100

In [3]:
# Create training, validation, and test set
(x_train, y_train), (x_test, y_test) = mnist.load_data()
train_images = x_train[:TRAIN_SIZE].reshape(TRAIN_SIZE, PIXELS) / NORMALISATION    # Flatten original 3D tensor to 2D, many neural nets prefer this as input
train_labels = np.eye(NUM_DIGITS)[y_train[:TRAIN_SIZE]]

validation_images = x_train[TRAIN_SIZE:TRAIN_SIZE+VALIDATION_SIZE].reshape(VALIDATION_SIZE, PIXELS) / NORMALISATION
validation_labels = np.eye(NUM_DIGITS)[y_train[TRAIN_SIZE:TRAIN_SIZE+VALIDATION_SIZE]]

test_images = x_test.reshape(len(x_test), PIXELS) / NORMALISATION                       # NORMALISATION HELPS TO PREVENT OVERFLOW
test_labels = np.eye(NUM_DIGITS)[y_test]

In [4]:
# ReLU functions
def relu(arr: np.ndarray) -> np.ndarray:
    return np.maximum(0, arr)

def relu2deriv(arr: np.ndarray) -> np.ndarray:
    return (arr >= 0).astype(int)

In [5]:
# Initialise weights
def initialise_weights():
    np.random.seed(1)                                                                   # Set to make results reproducable 
    weights_0_1 = np.random.uniform(-0.1, 0.1, (PIXELS, HIDDEN_SIZE))                   # Weights have to be of shape (input, output)
    weights_1_2 = np.random.uniform(-0.1, 0.1, (HIDDEN_SIZE, NUM_DIGITS))
    return weights_0_1, weights_1_2

In [6]:
# Train and test functions
def train():
    weights_0_1, weights_1_2 = initialise_weights()

    for iteration in range(ITERATIONS): 
        error = correct = 0

        # Training loop
        for i in range(int(len(train_images) / BATCH_SIZE)):
            batch_start, batch_end = (i * BATCH_SIZE), ((i + 1) * BATCH_SIZE)

            layer_0 = train_images[batch_start:batch_end]               # Even if only one row need to slice, not index, so shape is (1, pixels)
            layer_1 = relu(np.dot(layer_0, weights_0_1))
            mask = np.random.randint(2, size=layer_1.shape) * 2        # Allegedly, mask better if applied to only first layer 
            layer_1 *= mask
            layer_2 = np.dot(layer_1, weights_1_2)
            
            error += np.sum((train_labels[batch_start:batch_end] - layer_2) ** 2)
            
            for k in range(BATCH_SIZE):
                correct += int(np.argmax(layer_2[k:k+1]) == np.argmax(train_labels[batch_start+k:batch_start+k+1])) 

                layer_2_delta = (train_labels[batch_start:batch_end] - layer_2) / BATCH_SIZE        # Averaging delta                     
                layer_1_delta = layer_2_delta.dot(weights_1_2.T) * relu2deriv(layer_1)              # Unclear why this is used
                layer_1_delta *= mask    
                
                weights_1_2 += ALPHA * layer_1.T.dot(layer_2_delta)                                 # Need to do calculation by hand to trace correction
                weights_0_1 += ALPHA * layer_0.T.dot(layer_1_delta)                                 

        # Validation loop
        if iteration % 20 == 0:
            validation(weights_0_1, weights_1_2)

        print(f"I:{iteration} Train-Err:{str(error/float(len(train_images)))[0:5]} Train-Acc:{correct/float(len(train_images))}")
            
    return weights_0_1, weights_1_2

def validation(weights_0_1, weights_1_2):
    val_error = val_correct = 0

    for i in range(int(len(validation_images) / BATCH_SIZE)):
        batch_start, batch_end = (i * BATCH_SIZE), ((i + 1) * BATCH_SIZE)
        val_layer_0 = validation_images[batch_start:batch_end]
        val_layer_1 = relu(np.dot(val_layer_0, weights_0_1))
        val_layer_2 = np.dot(val_layer_1, weights_1_2)

        val_error += np.sum((validation_labels[batch_start:batch_end] - val_layer_2) ** 2)

        val_correct += np.sum(np.argmax(val_layer_2, axis=1) == np.argmax(validation_labels[batch_start:batch_end], axis=1))
    
    print(f"Validation-Err:{val_error / len(validation_images)} Validation-Acc:{val_correct / len(validation_images)}")

def test(weights_0_1, weights_1_2):
    error = correct = 0

    for i in range(int(len(test_images) / BATCH_SIZE)):
        batch_start, batch_end = (i * BATCH_SIZE), ((i + 1) * BATCH_SIZE)
        inputs = test_images[batch_start:batch_end]
        layer_1 = relu(np.dot(inputs, weights_0_1))
        pred = np.dot(layer_1, weights_1_2)

        error += np.sum((test_labels[batch_start:batch_end] - pred) ** 2)
        correct += np.sum(np.argmax(pred, axis=1) == np.argmax(test_labels[batch_start:batch_end], axis=1))

    print(f"Test-Err:{str(error/float(len(test_images)))[0:5]} Test-Acc:{correct/float(len(test_images))}")

In [7]:
# Train model and apply to test data
print("Training model ...")
weights_0_1, weights_1_2 = train()
print()
print("Applying model to test data ...")
test(weights_0_1, weights_1_2)

Training model ...
Validation-Err:0.7754553743229216 Validation-Acc:0.47
I:0 Train-Err:1.218 Train-Acc:0.185
I:1 Train-Err:0.829 Train-Acc:0.413
I:2 Train-Err:0.738 Train-Acc:0.532
I:3 Train-Err:0.693 Train-Acc:0.608
I:4 Train-Err:0.673 Train-Acc:0.616
I:5 Train-Err:0.643 Train-Acc:0.65
I:6 Train-Err:0.632 Train-Acc:0.655
I:7 Train-Err:0.611 Train-Acc:0.687
I:8 Train-Err:0.600 Train-Acc:0.686
I:9 Train-Err:0.577 Train-Acc:0.71
I:10 Train-Err:0.567 Train-Acc:0.71
I:11 Train-Err:0.569 Train-Acc:0.707
I:12 Train-Err:0.552 Train-Acc:0.715
I:13 Train-Err:0.543 Train-Acc:0.729
I:14 Train-Err:0.549 Train-Acc:0.724
I:15 Train-Err:0.525 Train-Acc:0.74
I:16 Train-Err:0.523 Train-Acc:0.736
I:17 Train-Err:0.508 Train-Acc:0.759
I:18 Train-Err:0.514 Train-Acc:0.737
I:19 Train-Err:0.520 Train-Acc:0.739
Validation-Err:0.49334761530406085 Validation-Acc:0.79
I:20 Train-Err:0.503 Train-Acc:0.747
I:21 Train-Err:0.495 Train-Acc:0.765
I:22 Train-Err:0.497 Train-Acc:0.768
I:23 Train-Err:0.491 Train-Acc:0.76