In [6]:
import numpy as np
class AdamOptimizer:
    """Adam update rule for a single parameter array.

    Keeps exponential moving averages of the gradient (``m``) and of the
    squared gradient (``v``), applies bias correction using the step
    counter ``t``, and takes a step scaled by ``alpha``.

    Parameters
    ----------
    weights : initial parameter values; stored as ``self.theta``.
    alpha : step size.
    beta1 : decay rate for the first-moment average.
    beta2 : decay rate for the second-moment average.
    epsilon : small constant added to the denominator so the division
        stays well defined when the second-moment estimate is ~0.
    """

    def __init__(self, weights, alpha=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.alpha = alpha
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        # Moment estimates start as scalar zeros and broadcast against the
        # first gradient they are combined with.
        self.m = 0
        self.v = 0
        self.t = 0
        self.theta = weights

    def backward_pass(self, gradient):
        """Apply one Adam step for ``gradient`` and return the new parameters.

        ``self.theta`` is rebound to the result of the update expression
        rather than modified in place.
        """
        self.t = self.t + 1
        self.m = self.beta1*self.m + (1 - self.beta1)*gradient
        self.v = self.beta2*self.v + (1 - self.beta2)*(gradient**2)
        # Bias-corrected moment estimates.
        m_hat = self.m/(1 - self.beta1**self.t)
        v_hat = self.v/(1 - self.beta2**self.t)
        # epsilon is ADDED to sqrt(v_hat): subtracting it makes the
        # denominator zero or negative for small gradients, which yields
        # inf/NaN or a step in the wrong direction.
        self.theta = self.theta - self.alpha*(m_hat/(np.sqrt(v_hat) + self.epsilon))
        return self.theta

In [7]:
class Conv3x3:
    """A bank of 3x3 filters slid over a 2-D image with stride 1 and no padding."""

    def __init__(self, num_filters):
        self.num_filters = num_filters
        # Random initial filters, scaled down by 9.
        self.filters = np.random.randn(num_filters, 3, 3) / 9
        # An Adam optimizer is created for the filters, but backprop below
        # applies a plain gradient-descent update and does not call it.
        self.adam = AdamOptimizer(self.filters)

    def iterate_regions(self, image):
        """Yield ``(patch, row, col)`` for every 3x3 patch of a 2-D ``image``."""
        rows, cols = image.shape
        for top in range(rows - 2):
            for left in range(cols - 2):
                yield image[top:top + 3, left:left + 3], top, left

    def forward(self, input):
        """Convolve ``input`` (h, w) with every filter.

        Returns an array of shape (h - 2, w - 2, num_filters) and caches
        ``input`` for the backward pass.
        """
        self.last_input = input
        rows, cols = input.shape
        output = np.zeros((rows - 2, cols - 2, self.num_filters))
        for patch, top, left in self.iterate_regions(input):
            # Broadcast the patch against all filters, then reduce each 3x3.
            output[top, left] = (patch * self.filters).sum(axis=(1, 2))
        return output

    def backprop(self, d_L_d_out, learn_rate):
        """Update the filters from the loss gradient w.r.t. this layer's output.

        ``d_L_d_out`` is indexed as [row, col, filter]. The filters are
        updated in place; no gradient w.r.t. the input is computed and the
        method returns None.
        """
        grad_filters = np.zeros(self.filters.shape)
        for patch, top, left in self.iterate_regions(self.last_input):
            for k in range(self.num_filters):
                grad_filters[k] += d_L_d_out[top, left, k] * patch

        # In-place update, so anything holding a reference to the filter
        # array sees the new values.
        self.filters -= learn_rate * grad_filters
        return None

In [8]:
class MaxPool2:
    """2x2 max pooling with stride 2 over an (h, w, channels) array."""

    def iterate_regions(self, image):
        """Yield ``(window, row, col)`` for each non-overlapping 2x2 window.

        Trailing rows/columns that do not fill a whole window are skipped,
        because the window counts use integer division by 2.
        """
        rows, cols, _ = image.shape
        for r in range(rows // 2):
            for c in range(cols // 2):
                top, left = r * 2, c * 2
                yield image[top:top + 2, left:left + 2], r, c

    def forward(self, input):
        """Return the per-channel maximum of every 2x2 window.

        Output shape is (h // 2, w // 2, channels); ``input`` is cached for
        the backward pass.
        """
        self.last_input = input
        rows, cols, channels = input.shape
        pooled = np.zeros((rows // 2, cols // 2, channels))
        for window, r, c in self.iterate_regions(input):
            pooled[r, c] = np.amax(window, axis=(0, 1))
        return pooled

    def backprop(self, d_L_d_out):
        """Route each output gradient back to the input position(s) holding the max.

        Every position equal to its window's maximum receives the gradient,
        so ties all get it; every other position stays zero.
        """
        d_L_d_input = np.zeros(self.last_input.shape)

        for window, r, c in self.iterate_regions(self.last_input):
            peaks = np.amax(window, axis=(0, 1))
            height, width, channels = window.shape

            for dr in range(height):
                for dc in range(width):
                    for ch in range(channels):
                        if window[dr, dc, ch] != peaks[ch]:
                            continue
                        d_L_d_input[r * 2 + dr, c * 2 + dc, ch] = d_L_d_out[r, c, ch]

        return d_L_d_input


In [9]:
class Softmax:
    """Fully-connected layer followed by a softmax activation.

    Parameters
    ----------
    input_len : number of values in the flattened input.
    nodes : number of output classes.
    """

    def __init__(self, input_len, nodes):
        # Random weights scaled down by the input length; biases start at 0.
        self.weights = np.random.randn(input_len, nodes) / input_len
        self.biases = np.zeros(nodes)

    def forward(self, input):
        """Flatten ``input``, apply the linear layer, and return softmax probabilities.

        Caches the input shape, the flattened input and the pre-softmax
        totals for use by ``backprop``.
        """
        self.last_input_shape = input.shape

        input = input.flatten()
        self.last_input = input

        totals = np.dot(input, self.weights) + self.biases
        self.last_totals = totals

        # Shift by the max before exponentiating: softmax is invariant to a
        # constant shift, and this keeps np.exp from overflowing to inf
        # (which would turn the output into NaN).
        exp = np.exp(totals - np.max(totals))
        return exp / np.sum(exp, axis=0)

    def backprop(self, d_L_d_out, learn_rate):
        """Update weights/biases by gradient descent and return the input gradient.

        Parameters
        ----------
        d_L_d_out : loss gradient w.r.t. the softmax output, one entry per node.
        learn_rate : step size for the parameter update.

        Returns
        -------
        Loss gradient w.r.t. this layer's input, reshaped to the shape that
        was passed to ``forward``. An all-zero ``d_L_d_out`` yields an
        all-zero gradient (and leaves the parameters unchanged) instead of
        returning None.
        """
        d_L_d_out = np.asarray(d_L_d_out, dtype=float)

        # Recompute the softmax output with the same overflow-safe shift
        # used in forward().
        shifted = np.exp(self.last_totals - np.max(self.last_totals))
        probs = shifted / np.sum(shifted)

        # Softmax Jacobian is d out_i / d t_k = p_i * (delta_ik - p_k), so
        # d_L_d_t[k] = p_k * (g_k - sum_i g_i * p_i). This covers every
        # entry of d_L_d_out, not only the first non-zero one.
        d_L_d_t = probs * (d_L_d_out - np.dot(d_L_d_out, probs))

        # totals = input @ weights + biases, hence:
        d_L_d_w = self.last_input[np.newaxis].T @ d_L_d_t[np.newaxis]
        d_L_d_b = d_L_d_t
        # Must use the weights as they were during forward(), so compute
        # this before the in-place update below.
        d_L_d_inputs = self.weights @ d_L_d_t

        self.weights -= learn_rate * d_L_d_w
        self.biases -= learn_rate * d_L_d_b

        return d_L_d_inputs.reshape(self.last_input_shape)

In [10]:
import keras
from keras.datasets import mnist

# Load MNIST and keep only the first 1000 examples of each split.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

train_images, train_labels = x_train[:1000], y_train[:1000]
test_images, test_labels = x_test[:1000], y_test[:1000]

# Network: 8-filter 3x3 convolution -> 2x2 max pool -> 10-way softmax.
conv = Conv3x3(8)
pool = MaxPool2()
# 13 * 13 * 8 is the flattened input length given to the softmax layer
# (presumably 28x28 images -> 26x26x8 after conv -> 13x13x8 after pooling;
# confirm against the dataset's image size).
softmax = Softmax(13 * 13 * 8, 10)

def forward(image, label):
    """Run one image through conv -> pool -> softmax and score it against ``label``.

    The image is divided by 255 and shifted by -0.5 before entering the
    network. Returns ``(out, loss, acc)``: the softmax output, the negative
    log of the probability assigned to ``label``, and 1 if the arg-max of
    the output equals ``label`` (otherwise 0).
    """
    scaled = (image / 255) - 0.5
    probs = softmax.forward(pool.forward(conv.forward(scaled)))

    is_hit = np.argmax(probs) == label
    return probs, -np.log(probs[label]), (1 if is_hit else 0)


def train(im, label, lr=.005):
    """Do one forward/backward pass on a single example.

    ``lr`` is passed as the learning rate to the softmax and conv layers.
    Returns the ``(loss, acc)`` pair computed by ``forward`` before the
    parameter update.
    """
    out, loss, acc = forward(im, label)

    # Gradient of -log(out[label]) w.r.t. the softmax output: non-zero only
    # at the true label.
    d_out = np.zeros(10)
    d_out[label] = -1 / out[label]

    # Propagate back through the layers in reverse order.
    d_pool_out = softmax.backprop(d_out, lr)
    d_conv_out = pool.backprop(d_pool_out)
    conv.backprop(d_conv_out, lr)
    return loss, acc


# Train for 3 epochs, reporting running loss/accuracy every 100 steps.
for epoch in range(3):
    print('--- Epoch %d ---' % (epoch + 1))

    # Reshuffle the training examples at the start of every epoch.
    permutation = np.random.permutation(len(train_images))
    train_images = train_images[permutation]
    train_labels = train_labels[permutation]

    loss = 0
    num_correct = 0
    for i, (im, label) in enumerate(zip(train_images, train_labels)):
        step_loss, acc = train(im, label)
        loss += step_loss
        num_correct += acc

        # Report AFTER training on the current sample so that each window
        # really contains 100 samples. Reporting before the train step made
        # the first window of every epoch hold only 99 samples (while still
        # dividing by 100) and left the epoch's final sample unreported.
        if i % 100 == 99:
            print(
                '[Step %d] Past 100 steps: Average Loss %.3f | Accuracy: %d%%' %
                (i + 1, loss / 100, num_correct)
              )
            loss = 0
            num_correct = 0

--- Epoch 1 ---
[Step 100] Past 100 steps: Average Loss 2.211 | Accuracy: 19%
[Step 200] Past 100 steps: Average Loss 2.087 | Accuracy: 30%
[Step 300] Past 100 steps: Average Loss 1.694 | Accuracy: 54%
[Step 400] Past 100 steps: Average Loss 1.162 | Accuracy: 60%
[Step 500] Past 100 steps: Average Loss 0.872 | Accuracy: 72%
[Step 600] Past 100 steps: Average Loss 0.803 | Accuracy: 77%
[Step 700] Past 100 steps: Average Loss 0.735 | Accuracy: 77%
[Step 800] Past 100 steps: Average Loss 0.759 | Accuracy: 79%
[Step 900] Past 100 steps: Average Loss 0.771 | Accuracy: 75%
[Step 1000] Past 100 steps: Average Loss 0.550 | Accuracy: 82%
--- Epoch 2 ---
[Step 100] Past 100 steps: Average Loss 0.644 | Accuracy: 79%
[Step 200] Past 100 steps: Average Loss 0.668 | Accuracy: 80%
[Step 300] Past 100 steps: Average Loss 0.573 | Accuracy: 81%
[Step 400] Past 100 steps: Average Loss 0.479 | Accuracy: 86%
[Step 500] Past 100 steps: Average Loss 0.521 | Accuracy: 85%
[Step 600] Past 100 steps: Average Lo

In [11]:
# Running totals for the test pass, plus the predicted label for each test image.
loss = num_correct = 0
prediction = list()

def get_label(probs):
    """Return the index of the largest entry in ``probs``.

    On ties the earliest index wins, because a later entry only replaces
    the current best when it is strictly greater.
    """
    best_idx, best_prob = 0, probs[0]
    for idx in range(1, len(probs)):
        candidate = probs[idx]
        if candidate > best_prob:
            best_idx, best_prob = idx, candidate
    return best_idx

# Forward-only pass over the test images: accumulate loss and hit count,
# and record the predicted label for each image.
for im, label in zip(test_images, test_labels):
    out, l, acc = forward(im, label)
    loss += l
    num_correct += acc
    prediction.append(get_label(out))

num_tests = len(test_images)
test_accuracy = num_correct / num_tests
error_rate = 1 - test_accuracy

print('mnist test loss:', loss / num_tests)
print('mnist test accuracy:', test_accuracy)
print('mnist error rate:', error_rate)

mnist test loss: 0.5341096210899458
mnist test accuracy: 0.826
mnist error rate: 0.17400000000000004
