In [1]:
import numpy as np
from keras.datasets import mnist

In [2]:
NORMALISATION = 255
NUM_DIGITS = 10
PIXELS = 28 * 28
TRAIN_SIZE = 1000
VALIDATION_SIZE = int(TRAIN_SIZE * 20 / 100)

ALPHA = 0.001
ITERATIONS = 100
HIDDEN_SIZE = 100
BATCH_SIZE = 100

In [3]:
# Create training, validation, and test set
(x_train, y_train), (x_test, y_test) = mnist.load_data()
train_images = x_train[:TRAIN_SIZE].reshape(TRAIN_SIZE, PIXELS) / NORMALISATION    # Flatten original 3D tensor to 2D, many neural nets prefer this as input
train_labels = np.eye(NUM_DIGITS)[y_train[:TRAIN_SIZE]]

validation_images = x_train[TRAIN_SIZE:TRAIN_SIZE+VALIDATION_SIZE].reshape(VALIDATION_SIZE, PIXELS) / NORMALISATION
validation_labels = np.eye(NUM_DIGITS)[y_train[TRAIN_SIZE:TRAIN_SIZE+VALIDATION_SIZE]]

test_images = x_test.reshape(len(x_test), PIXELS) / NORMALISATION                       # NORMALISATION HELPS TO PREVENT OVERFLOW
test_labels = np.eye(NUM_DIGITS)[y_test]

In [4]:
# ReLU functions
def relu(arr: np.ndarray) -> np.ndarray:
    return np.maximum(0, arr)

def relu2deriv(arr: np.ndarray) -> np.ndarray:
    return (arr >= 0).astype(int)

In [25]:
# Initialise weights
def initialise_weights():
    np.random.seed(1)                                                                   # Set to make results reproducable 
    weights_0_1 = np.random.uniform(-0.1, 0.1, (PIXELS, HIDDEN_SIZE))                   # Weights have to be of shape (input, output)
    weights_1_2 = np.random.uniform(-0.1, 0.1, (HIDDEN_SIZE, NUM_DIGITS))
    return weights_0_1, weights_1_2

In [32]:
# Train and test functions
def train():
    weights_0_1, weights_1_2 = initialise_weights()
    val_idx = 0

    for iteration in range(ITERATIONS): 
        error = correct = 0

        # Training loop
        for i in range(int(len(train_images) / BATCH_SIZE)):
            batch_start, batch_end = (i * BATCH_SIZE), ((i + 1) * BATCH_SIZE)

            layer_0 = train_images[batch_start:batch_end]               # Even if only one row need to slice, not index, so shape is (1, pixels)
            layer_1 = relu(np.dot(layer_0, weights_0_1))
            mask = np.random.randint(2, size=layer_1.shape) * 2        # Allegedly, mask better if applied to only first layer 
            layer_1 *= mask
            layer_2 = np.dot(layer_1, weights_1_2)
            
            error += np.sum((train_labels[batch_start:batch_end] - layer_2) ** 2)
            
            for k in range(BATCH_SIZE):
                correct += int(np.argmax(layer_2[k:k+1]) == np.argmax(train_labels[batch_start+k:batch_start+k+1])) 

                layer_2_delta = (train_labels[batch_start:batch_end] - layer_2) / BATCH_SIZE        # Averaging delta                     
                layer_1_delta = layer_2_delta.dot(weights_1_2.T) * relu2deriv(layer_1)              # Unclear why this is used
                layer_1_delta *= mask    
                
                weights_1_2 += ALPHA * layer_1.T.dot(layer_2_delta)                                 # Need to do calculation by hand to trace correction
                weights_0_1 += ALPHA * layer_0.T.dot(layer_1_delta)                                 

        # Validation loop
        # if i % 100 == 0 and val_idx < VALIDATION_SIZE:
        #     val_error = val_correct = 0

        #     val_batch_start, val_batch_end = (val_idx * BATCH_SIZE), ((val_idx + 1) * BATCH_SIZE)
        #     val_layer_0 = validation_images[val_batch_start:val_batch_end]
        #     val_layer_1 = np.dot(val_layer_0, weights_0_1)
        #     val_layer_2 = np.dot(val_layer_1, weights_1_2)

        #     val_error += np.sum((validation_labels[val_batch_start: val_batch_end] - val_layer_2) ** 2)

        #     for j in range(BATCH_SIZE):
        #         val_correct += int(np.argmax(val_layer_2[j:j+1]) == np.argmax(validation_labels[val_batch_start+j:val_batch_start+j+1]))

            
        print(f"\rI:{iteration} Train-Err:{str(error/float(len(train_images)))[0:5]} Train-Acc:{correct/float(len(train_images))}", end="")
    return weights_0_1, weights_1_2

def test(weights_0_1, weights_1_2):
    error = correct = 0

    for i in range(len(test_images)):
        inputs = test_images[i:i + 1]
        layer_1 = relu(np.dot(inputs, weights_0_1))
        pred = np.dot(layer_1, weights_1_2)

        error += np.sum((test_labels[i:i + 1] - pred) ** 2)
        correct += int(np.argmax(pred) == np.argmax(test_labels[i:i + 1]))

    print(f"Test-Err:{str(error/float(len(test_images)))[0:5]} Test-Acc:{correct/float(len(test_images))}")

In [34]:
# Train model and apply to test data
print("Training model ...")
weights_0_1, weights_1_2 = train()
print()
print()
print("Applying model to test data ...")
test(weights_0_1, weights_1_2)

Training model ...
I:99 Train-Err:0.414 Train-Acc:0.825
Applying model to test data ...
Test-Err:0.433 Test-Acc:0.8001
