In [None]:
import os
import random

import numpy as np

Implementation of a basic neural network with NumPy to get to grips with the backpropagation algorithm. It uses sigmoid activations and a mean squared error cost function.


## Equations of backprop
Expression for output error
$$\delta^L = \frac{\partial C}{\partial z^L} = \frac{\partial C}{\partial a^L} * \frac{\partial a^L}{\partial z^L}$$

Error in the intermediate layers
$$\delta^l = \frac{\partial C}{\partial z^l} = \frac{\partial C}{\partial z^{l+1}}* \frac {\partial z^{l+1}}{\partial z^l}$$
<br>
Rate of change of cost with respect to weights and biases <br>
$$\frac{\partial C}{\partial w^l} = a^{l-1} \delta^l$$

$$\frac{\partial C}{\partial b^l} = \delta^l$$

## Cost function
  $$C = \frac{1}{2n} \sum_x \|y(x)-a^L(x)\|^2$$

## Code

In [None]:
class NN():
    """Fully connected feed-forward network with sigmoid activations.

    The network is trained with mini-batch stochastic gradient descent on a
    quadratic cost; gradients are computed with backpropagation.

    neurons: list of integers defining the number of neurons in each layer.
    """

    def __init__(self, neurons):
        """Create randomly initialised weights and biases.

        neurons is a list such as [6, 4, 3]: the first entry is the number of
        elements in the input, every following entry is the number of neurons
        in the next layer (here a layer of 4 neurons followed by a layer of 3).
        """
        self.neurons = neurons
        self.num_layers = len(neurons)
        # One (num, 1) bias column per non-input layer.
        self.biases = [np.random.randn(num, 1) for num in neurons[1:]]
        # Weights have shape (outp, inp): row i belongs to the i-th neuron of
        # the receiving layer, column j to the j-th element of the layer
        # feeding it.
        self.weights = [np.random.randn(outp, inp)
                        for inp, outp in zip(neurons[:-1], neurons[1:])]

    def feedforward(self, test_data):
        """Return the output activations for a single input sample.

        The sample is flattened into a column vector, so its number of
        elements must match the size of the input layer.
        """
        activation = np.asarray(test_data).reshape(-1, 1)
        for w, b in zip(self.weights, self.biases):
            z = np.dot(w, activation) + b
            activation = sigmoid(z)
        return activation

    def SGD(self, training_data, epochs, mini_batch_size, eta,
            test_data=None):
        """Train with mini-batch stochastic gradient descent.

        training_data: list of (x, y) pairs; it is shuffled IN PLACE at the
            start of every epoch.
        epochs: number of passes over training_data.
        mini_batch_size: number of samples per gradient step.
        eta: learning rate.
        test_data: optional list of (x, y) pairs; when given (and non-empty)
            the number of correctly classified samples is printed after each
            epoch.
        """
        len_trn = len(training_data)
        for j in range(epochs):
            random.shuffle(training_data)
            mini_batches = [training_data[k:k+mini_batch_size]
                            for k in range(0, len_trn, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_minibatch(mini_batch, eta)
            if test_data:
                # len(test_data) is only taken here so that the default
                # test_data=None does not raise.
                print ("Epoch {0}: {1} / {2}".format(
                    j, self.evaluate(test_data), len(test_data)))
            else:
                print ("Epoch {0} complete".format(j))

    def update_minibatch(self, mini_batch, eta):
        """Apply one gradient-descent step using the whole mini-batch.

        The per-sample gradients from backprop are summed first; the weights
        and biases are then updated once with the averaged gradient.
        """
        if len(mini_batch) == 0:
            # Nothing to learn from; also avoids dividing by zero below.
            return
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        # The update must happen after the loop: updating inside it would
        # apply the partial gradient sums over and over.
        step = eta/len(mini_batch)
        self.weights = [w - step * nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b - step * nb
                       for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        """Return (nabla_b, nabla_w), the cost gradient for one sample.

        nabla_b and nabla_w are layer-by-layer lists of arrays with the same
        shapes as self.biases and self.weights.
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        #_____feedforward______
        activation = np.asarray(x).reshape(-1, 1)
        #list of activations, layer by layer
        activations = [activation]
        #list of z values (weighted inputs), layer by layer
        zs = []
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)

        #____Backward pass____
        #output error: dC/da * da/dz
        delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])

        #dC/dw and dC/db for last layer
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())

        #dC/dw and dC/db for every previous layer; l counts layers from the end
        for l in range(2, self.num_layers):
            delta = np.dot(self.weights[-l+1].transpose(), delta) * sigmoid_prime(zs[-l])
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
        return (nabla_b, nabla_w)

    def cost_derivative(self, y_est, y_real):
        """Derivative of the quadratic cost with respect to the output activations."""
        return y_est-y_real

    def evaluate(self, test_data):
        """Return how many (x, y) pairs have matching argmax of output and target."""
        test_results = [(np.argmax(self.feedforward(x)), np.argmax(y))
                        for (x, y) in test_data]
        return sum(int(x == y) for (x, y) in test_results)

In [None]:
def sigmoid(z):
    """The sigmoid function 1 / (1 + e^(-z)), applied element-wise."""
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator

def sigmoid_prime(z):
    """Derivative of the sigmoid function: sigmoid(z) * (1 - sigmoid(z))."""
    s = sigmoid(z)
    return s * (1 - s)

In [None]:
#Function to one hot encode labels
def one_hot(j, num_classes=10):
    """One-hot encode a class label as a column vector.

    j: index of the class to mark.
    num_classes: length of the encoding; defaults to 10.

    Returns a (num_classes, 1) array of zeros with 1.0 at index j.
    """
    e = np.zeros((num_classes, 1))
    e[j] = 1.0
    return e

In [None]:
# 784 inputs, four layers of 60, 40, 30 and 20 neurons, and a final layer of 10 neurons.
net = NN(neurons=[784, 60, 40, 30, 20, 10])

In [None]:
#Path to MNIST in .npz format, resolved relative to the current user's home directory
path = os.path.expanduser('~/.keras/datasets/mnist.npz')
#np.load keeps the .npz archive open; the context manager closes it once the arrays are read
with np.load(path) as data:
    X_train, y_train, X_test, y_test = data['x_train'], data['y_train'], data['x_test'], data['y_test']
#Flatten every 28x28 image into a row of 784 values
X_train = X_train.reshape(len(X_train), 28*28)
X_test = X_test.reshape(len(X_test), 28*28)


In [None]:
# Replace every label with its one-hot encoding.
y_train = list(map(one_hot, y_train))
y_test = list(map(one_hot, y_test))

In [None]:
# Pair every input row with its target as (input, target) tuples.
training_data = [(image, target) for image, target in zip(X_train, y_train)]
test_data = [(image, target) for image, target in zip(X_test, y_test)]

In [None]:
# 50 epochs, mini-batches of 64 samples, learning rate 0.005; progress is reported on test_data.
net.SGD(
    training_data,
    epochs=50,
    mini_batch_size=64,
    eta=0.005,
    test_data=test_data,
)

## Train with random data

In [None]:
# 6400 random inputs of 784 values, each with a random class label in 0..9.
x = np.random.randn(6400, 784)
result = [random.randint(0, 9) for _ in range(6400)]
# The network's targets are (10, 1) one-hot column vectors, not raw integer
# labels: an integer would broadcast against all ten outputs in the cost
# derivative, and np.argmax of it is always 0.
targets = np.zeros((len(result), 10, 1))
targets[np.arange(len(result)), result, 0] = 1.0
training_data = list(zip(x, targets))