In [1]:
import numpy as np
from keras.datasets import mnist

In [2]:
NORMALISATION = 255
NUM_DIGITS = 10
PIXELS = 28 * 28
NUM_SAMPLES = 1000

ALPHA = 0.005
ITERATIONS = 100
HIDDEN_SIZE = 40

In [3]:
# Create training set
(x_train, y_train), (x_test, y_test) = mnist.load_data()
train_images = x_train[:NUM_SAMPLES].reshape(NUM_SAMPLES, PIXELS) / NORMALISATION    # Flatten original 3D tensor to 2D, many neural nets prefer this as input

train_labels = np.zeros((len(y_train[:NUM_SAMPLES]), NUM_DIGITS))                      

for i, label in enumerate(y_train[:NUM_SAMPLES]):
    train_labels[i][label] = 1

# Create test set
test_images = x_test.reshape(len(x_test), PIXELS) / NORMALISATION                       # Allegedly, normalisation has a variety of benefits 
                                                                                        # Without it, the model does not learn due to overflow in squared errors
test_labels = np.zeros((len(y_test), NUM_DIGITS))

for i, label in enumerate(y_test):
    test_labels[i][label] = 1

In [4]:
# ReLU functions
def relu(arr: np.ndarray) -> np.ndarray:
    return np.maximum(0, arr)

def relu2deriv(arr: np.ndarray) -> np.ndarray:
    return (arr >= 0).astype(int)

In [5]:
# Initialise weights
np.random.seed(1)                                                                   # Set to make results reproducable 
weights_0_1 = np.random.uniform(-0.1, 0.1, (PIXELS, HIDDEN_SIZE))                   # Weights have to be of shape (input, output)
weights_1_2 = np.random.uniform(-0.1, 0.1, (HIDDEN_SIZE, NUM_DIGITS))

In [7]:
# Run training loop. It currently processes one sample at a
# time. 

for iteration in range(ITERATIONS): 
    error, correct = 0, 0

    for i in range(len(train_images)):
        
        layer_0 = train_images[i: i + 1]               # Slicing like this retains shape of array needed for dot product
        layer_1 = relu(np.dot(layer_0, weights_0_1))
        mask = np.random.randint(2, size=layer_1.shape) * 2        # Seem better if applied to only first layer 
        layer_1 *= mask
        layer_2 = np.dot(layer_1, weights_1_2)
        
        layer_2_delta = train_labels[i: i + 1] - layer_2
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) * relu2deriv(layer_1)              # Unclear why this is used
        layer_1_delta *= mask

        error += np.sum(layer_2_delta ** 2)
        correct += int(np.argmax(layer_2) == np.argmax(train_labels[i: i + 1])) 
        
        weights_1_2 += ALPHA * layer_1.T.dot(layer_2_delta)                                 # Unclear why the dot product is the gradient
        weights_0_1 += ALPHA * layer_0.T.dot(layer_1_delta)                                 # Same

    print(f"\r I:{iteration} Train-Err:{str(error/float(len(train_images)))[0:5]} Train-Acc:{correct/float(len(train_images))}", end="")


 I:99 Train-Err:0.420 Train-Acc:0.817

In [8]:
# Apply model to test data
error, correct = 0, 0

for i in range(len(test_images)):
    inputs = test_images[i:i + 1]
    layer_1 = relu(np.dot(inputs, weights_0_1))
    pred = np.dot(layer_1, weights_1_2)

    error += np.sum((test_labels[i:i + 1] - pred) ** 2)
    correct += int(np.argmax(pred) == np.argmax(test_labels[i:i + 1]))

print(f"Test-Err:{str(error/float(len(test_images)))[0:5]} Test-Acc:{correct/float(len(test_images))}")

Test-Err:0.429 Test-Acc:0.7942
