<a href="https://colab.research.google.com/github/stiepan/MLCourse/blob/master/MNIST_Neural_Network_6_P3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import random
import numpy as np
from torchvision import datasets, transforms

In [0]:
# Let's read the MNIST dataset

def load_mnist(path='.'):
    """Load MNIST through torchvision and return NumPy arrays.

    path -- directory handed to ``datasets.MNIST`` as its root; the dataset
            is requested with ``download=True``.

    Returns ``(x_train, y_train), (x_test, y_test)``. Pixel arrays are
    divided by 255 and labels are one-hot encoded into 10 columns.
    """
    def _as_arrays(is_train):
        # Convert one split into (scaled pixels, one-hot labels).
        split = datasets.MNIST(path, train=is_train, download=True)
        pixels = split.data.numpy() / 255.
        labels = split.targets.numpy()

        n_samples = labels.shape[0]
        one_hot = np.zeros((n_samples, 10))
        # Set a single 1 per row, in the column given by the digit label.
        one_hot[np.arange(n_samples), labels] = 1
        return pixels, one_hot

    train_arrays = _as_arrays(True)
    test_arrays = _as_arrays(False)
    return train_arrays, test_arrays

# Load both splits once at module level; the training cells below read
# x_train / y_train / x_test / y_test as globals.
(x_train, y_train), (x_test, y_test) = load_mnist()

In this exercise your task is to fill in the gaps in this code by implementing the backpropagation algorithm.
Once this is done, you can run the network on the MNIST example and see how it performs. Feel free to play with the parameters.

If you found this task too easy, try to implement a "fully vectorized" version, i.e. one using matrix operations instead of going over examples one by one.

In [3]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)); NumPy applies it elementwise to arrays."""
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator

def sigmoid_prime(z):
    """Derivative of the sigmoid, s(z) * (1 - s(z)).

    The sigmoid is evaluated once and reused rather than computed twice.
    """
    s = sigmoid(z)
    return s * (1 - s)

class Network(object):
    """Feed-forward network of sigmoid layers trained by mini-batch gradient descent.

    Gradients are computed by backpropagation, one training example at a time.
    """

    def __init__(self, sizes):
        """Create the network with randomly initialised parameters.

        sizes -- number of units in each layer, input layer first
                 (e.g. [784, 30, 10]).
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        # Biases and weights are drawn from a standard normal distribution.
        # Weights are indexed by target node first: the matrix between layers
        # l and l+1 has shape (sizes[l+1], sizes[l]).
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[:-1], sizes[1:])]

    def feedforward(self, a):
        """Return the network output for a single input column vector `a`."""
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a) + b)
        return a

    def update_mini_batch(self, x_mini_batch, y_mini_batch, eta):
        """Apply one gradient-descent step computed from a mini-batch.

        x_mini_batch -- sequence of inputs, each with sizes[0] elements.
        y_mini_batch -- sequence of targets, each with sizes[-1] elements.
        eta          -- learning rate.

        Per-example gradients from `backprop` are summed and the step is
        scaled by eta / len(x_mini_batch).
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        # Column-vector shapes come from the layer sizes rather than being
        # hard-coded, so networks of any input/output width work.
        n_in, n_out = self.sizes[0], self.sizes[-1]
        for x, y in zip(x_mini_batch, y_mini_batch):
            delta_nabla_b, delta_nabla_w = self.backprop(
                x.reshape(n_in, 1), y.reshape(n_out, 1))
            nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        step = eta / len(x_mini_batch)
        self.weights = [w - step * nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b - step * nb
                       for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        """Return (delta_nabla_b, delta_nabla_w) for one example (x, y).

        x and y are column vectors. Each returned list holds one array per
        layer, shaped like the corresponding entry of self.biases /
        self.weights.
        """
        delta_nabla_b = [np.zeros_like(p) for p in self.biases]
        delta_nabla_w = [np.zeros_like(p) for p in self.weights]

        # Forward pass: remember pre-activations (zs) and activations.
        # zs[0] is a placeholder so that zs[l] and activations[l] line up.
        zs = [None]
        activations = [x]
        for b, w in zip(self.biases, self.weights):
            zs.append(w @ activations[-1] + b)
            activations.append(sigmoid(zs[-1]))

        # Backward pass: `upstream` is the cost derivative with respect to
        # the activations of the layer currently being processed.
        upstream = self.cost_derivative(activations[-1], y)
        for i in reversed(range(len(self.weights))):
            delta_nabla_b[i] = upstream * sigmoid_prime(zs[i + 1])
            delta_nabla_w[i] = np.outer(delta_nabla_b[i], activations[i])
            upstream = self.weights[i].T @ delta_nabla_b[i]

        return delta_nabla_b, delta_nabla_w

    def evaluate(self, x_test_data, y_test_data):
        """Return the fraction of examples whose predicted class is correct.

        The predicted class is the argmax of the network output; the true
        class is the argmax of the corresponding row of y_test_data.
        """
        n_in = self.sizes[0]
        hits = [
            int(np.argmax(self.feedforward(x.reshape(n_in, 1))) == np.argmax(y))
            for x, y in zip(x_test_data, y_test_data)
        ]
        return np.mean(hits)

    def cost_derivative(self, output_activations, y):
        """Return output - target, the gradient of 0.5 * ||output - y||^2."""
        return (output_activations - y)

    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        """Train with mini-batch gradient descent.

        training_data   -- tuple (x_train, y_train) of arrays indexed by sample.
        epochs          -- number of passes over the training data.
        mini_batch_size -- number of samples per update.
        eta             -- learning rate.
        test_data       -- optional tuple (x_test, y_test); when given, the
                           accuracy on it is printed after every epoch.

        Mini-batches are taken in order and the data is not shuffled. A final
        shorter batch is used when the number of samples is not a multiple of
        mini_batch_size, so no training samples are skipped.
        """
        x_train, y_train = training_data
        if test_data:
            x_test, y_test = test_data
        n_samples = x_train.shape[0]
        for j in range(epochs):
            for start in range(0, n_samples, mini_batch_size):
                stop = start + mini_batch_size
                self.update_mini_batch(x_train[start:stop], y_train[start:stop], eta)
            if test_data:
                print("Epoch: {0}, Accuracy: {1}".format(j, self.evaluate(x_test, y_test)))
            else:
                print("Epoch: {0}".format(j))


# Train a 784-30-10 network with the per-example backprop implementation:
# 50 epochs, mini-batches of 100, learning rate 3; test accuracy is printed
# after every epoch.
network = Network([784,30,10])
network.SGD((x_train, y_train), epochs=50, mini_batch_size=100, eta=3., test_data=(x_test, y_test))



Epoch: 0, Accuracy: 0.796
Epoch: 1, Accuracy: 0.8607
Epoch: 2, Accuracy: 0.8855
Epoch: 3, Accuracy: 0.8986
Epoch: 4, Accuracy: 0.9045
Epoch: 5, Accuracy: 0.9093
Epoch: 6, Accuracy: 0.9132
Epoch: 7, Accuracy: 0.9165
Epoch: 8, Accuracy: 0.9192
Epoch: 9, Accuracy: 0.921
Epoch: 10, Accuracy: 0.923
Epoch: 11, Accuracy: 0.9244
Epoch: 12, Accuracy: 0.9259
Epoch: 13, Accuracy: 0.9278
Epoch: 14, Accuracy: 0.9286
Epoch: 15, Accuracy: 0.9298
Epoch: 16, Accuracy: 0.9308
Epoch: 17, Accuracy: 0.9321
Epoch: 18, Accuracy: 0.9332
Epoch: 19, Accuracy: 0.9337
Epoch: 20, Accuracy: 0.9349
Epoch: 21, Accuracy: 0.935
Epoch: 22, Accuracy: 0.9356
Epoch: 23, Accuracy: 0.9361
Epoch: 24, Accuracy: 0.9362
Epoch: 25, Accuracy: 0.9369
Epoch: 26, Accuracy: 0.937
Epoch: 27, Accuracy: 0.938
Epoch: 28, Accuracy: 0.9386
Epoch: 29, Accuracy: 0.9389
Epoch: 30, Accuracy: 0.9392
Epoch: 31, Accuracy: 0.9394
Epoch: 32, Accuracy: 0.9397
Epoch: 33, Accuracy: 0.9397
Epoch: 34, Accuracy: 0.9405
Epoch: 35, Accuracy: 0.94
Epoch: 36,

In [14]:
class NetworkM(object):
    """Vectorised feed-forward sigmoid network.

    A whole mini-batch is pushed through the network with matrix operations
    instead of looping over examples. Inputs are arrays with one sample per
    row; internally activations are stored with one sample per column.
    """

    def __init__(self, sizes):
        """Create the network with randomly initialised parameters.

        sizes -- number of units in each layer, input layer first
                 (e.g. [784, 30, 10]).
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        # Biases and weights are drawn from a standard normal distribution.
        # Weights are indexed by target node first: the matrix between layers
        # l and l+1 has shape (sizes[l+1], sizes[l]).
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[:-1], sizes[1:])]

    def feedforward(self, X):
        """Run the network on a batch and return one output column per sample.

        X holds one sample per row; trailing dimensions are flattened first,
        so both (n, features) arrays and raw image stacks are accepted.
        """
        a = self.x_feed_shape(X).T
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a) + b)
        return a

    def update_mini_batch(self, x_mini_batch, y_mini_batch, eta):
        """Apply one gradient-descent step computed from a mini-batch.

        The gradients returned by `backprop` are already summed over the
        batch; the step is scaled by eta / len(x_mini_batch).
        """
        nabla_b, nabla_w = self.backprop(x_mini_batch, y_mini_batch)
        step = eta / len(x_mini_batch)
        self.weights = [w - step * nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b - step * nb
                       for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, X, ys):
        """Return (nabla_b, nabla_w) summed over the batch (X, ys).

        X  -- inputs, one sample per row (trailing dimensions are flattened).
        ys -- targets, one sample per row.

        Each returned list holds one array per layer, shaped like the
        corresponding entry of self.biases / self.weights.
        """
        X = self.x_feed_shape(X)
        nabla_b = [None] * len(self.biases)
        nabla_w = [None] * len(self.weights)

        # Forward pass: remember pre-activations (zs) and activations.
        # zs[0] is a placeholder so that zs[l] and activations[l] line up.
        zs = [None]
        activations = [X.T]
        for b, w in zip(self.biases, self.weights):
            zs.append(w @ activations[-1] + b)
            activations.append(sigmoid(zs[-1]))

        # Backward pass: every column of `upstream` / `delta` belongs to one
        # sample of the batch.
        upstream = self.cost_derivative(activations[-1], ys)
        for i in reversed(range(len(self.weights))):
            delta = upstream * sigmoid_prime(zs[i + 1])
            # Summing over columns adds up the per-sample bias gradients and
            # keeps the (units, 1) shape of the bias vectors.
            nabla_b[i] = delta.sum(axis=1, keepdims=True)
            # The matrix product sums the per-sample outer products.
            nabla_w[i] = delta @ activations[i].T
            upstream = self.weights[i].T @ delta

        return nabla_b, nabla_w

    def evaluate(self, x_test_data, y_test_data):
        """Return the fraction of samples whose predicted class is correct.

        Predictions are the argmax over each output column; true classes are
        the argmax over each row of y_test_data.
        """
        ps = np.argmax(self.feedforward(x_test_data), axis=0)
        ys = np.argmax(y_test_data, axis=1)
        return np.mean(ps == ys)

    def cost_derivative(self, output_activations, y):
        """Return output - target; y is transposed to one sample per column."""
        return (output_activations - y.T)

    def x_feed_shape(self, X):
        """Flatten every sample of X into a row, giving shape (samples, features).

        The feature count is cast to int because np.prod of an empty shape
        (1-D input) is a float, which reshape does not accept.
        """
        samples = X.shape[0]
        features = int(np.prod(X.shape[1:]))
        return X.reshape([samples, features])

    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        """Train with mini-batch gradient descent.

        training_data   -- tuple (x_train, y_train) of arrays indexed by sample.
        epochs          -- number of passes over the training data.
        mini_batch_size -- number of samples per update.
        eta             -- learning rate.
        test_data       -- optional tuple (x_test, y_test); when given, the
                           accuracy on it is printed after every epoch.

        Mini-batches are taken in order and the data is not shuffled. A final
        shorter batch is used when the number of samples is not a multiple of
        mini_batch_size, so no training samples are skipped.
        """
        x_train, y_train = training_data
        x_train = self.x_feed_shape(x_train)
        if test_data:
            x_test, y_test = test_data
            x_test = self.x_feed_shape(x_test)
        n_samples = x_train.shape[0]
        for j in range(epochs):
            for start in range(0, n_samples, mini_batch_size):
                stop = start + mini_batch_size
                self.update_mini_batch(x_train[start:stop], y_train[start:stop], eta)
            if test_data:
                print("Epoch: {0}, Accuracy: {1}".format(j, self.evaluate(x_test, y_test)))
            else:
                print("Epoch: {0}".format(j))


# Train a 784-30-10 network with the vectorised implementation:
# 100 epochs, mini-batches of 10, learning rate 10; test accuracy is printed
# after every epoch.
network_m = NetworkM([784,30,10])
network_m.SGD((x_train, y_train), epochs=100, mini_batch_size=10, eta=10., test_data=(x_test, y_test))



Epoch: 0, Accuracy: 0.915
Epoch: 1, Accuracy: 0.9222
Epoch: 2, Accuracy: 0.9253
Epoch: 3, Accuracy: 0.9363
Epoch: 4, Accuracy: 0.9313
Epoch: 5, Accuracy: 0.9438
Epoch: 6, Accuracy: 0.942
Epoch: 7, Accuracy: 0.9411
Epoch: 8, Accuracy: 0.9438
Epoch: 9, Accuracy: 0.9426
Epoch: 10, Accuracy: 0.9413
Epoch: 11, Accuracy: 0.9451
Epoch: 12, Accuracy: 0.9417
Epoch: 13, Accuracy: 0.9418
Epoch: 14, Accuracy: 0.945
Epoch: 15, Accuracy: 0.9477
Epoch: 16, Accuracy: 0.9492
Epoch: 17, Accuracy: 0.9492
Epoch: 18, Accuracy: 0.9493
Epoch: 19, Accuracy: 0.9528
Epoch: 20, Accuracy: 0.9523
Epoch: 21, Accuracy: 0.9513
Epoch: 22, Accuracy: 0.9528
Epoch: 23, Accuracy: 0.949
Epoch: 24, Accuracy: 0.9519
Epoch: 25, Accuracy: 0.9516
Epoch: 26, Accuracy: 0.9519
Epoch: 27, Accuracy: 0.9478
Epoch: 28, Accuracy: 0.9495
Epoch: 29, Accuracy: 0.9488
Epoch: 30, Accuracy: 0.9504
Epoch: 31, Accuracy: 0.9507
Epoch: 32, Accuracy: 0.9533
Epoch: 33, Accuracy: 0.9516
Epoch: 34, Accuracy: 0.952
Epoch: 35, Accuracy: 0.9506
Epoch: 