In [1]:
import random
import numpy as np
from torchvision import datasets, transforms

In [2]:
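# Manual fallback, left disabled (presumably for when torchvision's own download mirrors fail -- confirm):
# !wget www.di.ens.fr/~lelarge/MNIST.tar.gz
# !tar -zxvf MNIST.tar.gz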

In [3]:
# Let's read the MNIST dataset

def load_mnist(path='.'):
    """Load the MNIST train and test splits as NumPy arrays.

    path -- root directory handed to ``torchvision.datasets.MNIST``; the
            datasets are requested with ``download=True``, so torchvision may
            fetch the files into this directory.

    Returns ``((x_train, y_train), (x_test, y_test))`` where each ``x`` is the
    split's image array divided by 255 and each ``y`` is a float array with one
    row per label and 10 columns, holding a 1 in the column of the label and 0
    elsewhere (one-hot encoding).
    """
    def _load_split(is_train):
        # One MNIST split -> (scaled images, one-hot labels).
        split = datasets.MNIST(path, train=is_train, download=True)
        images = split.data.numpy() / 255.
        labels = split.targets.numpy()

        n_examples = labels.shape[0]
        one_hot = np.zeros((n_examples, 10))
        one_hot[np.arange(n_examples), labels] = 1
        return images, one_hot

    # Training split is loaded first, then the test split.
    train_pair = _load_split(True)
    test_pair = _load_split(False)
    return train_pair, test_pair

# Load both MNIST splits using the default root directory ('.'); images come
# back divided by 255 and labels as 10-column one-hot rows.
(x_train, y_train), (x_test, y_test) = load_mnist()

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 503: Service Unavailable

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./MNIST/raw/train-images-idx3-ubyte.gz


HBox(children=(FloatProgress(value=0.0, max=9912422.0), HTML(value='')))


Extracting ./MNIST/raw/train-images-idx3-ubyte.gz to ./MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./MNIST/raw/train-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=0.0, max=28881.0), HTML(value='')))


Extracting ./MNIST/raw/train-labels-idx1-ubyte.gz to ./MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./MNIST/raw/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 503: Service Unavailable

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./MNIST/raw/t10k-images-idx3-ubyte.gz


HBox(children=(FloatProgress(value=0.0, max=1648877.0), HTML(value='')))


Extracting ./MNIST/raw/t10k-images-idx3-ubyte.gz to ./MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./MNIST/raw/t10k-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=0.0, max=4542.0), HTML(value='')))


Extracting ./MNIST/raw/t10k-labels-idx1-ubyte.gz to ./MNIST/raw

Processing...
Done!


  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In this exercise, your task is to fill in the gaps in this code by implementing the backpropagation algorithm.
Once this is done, you can run the network on the MNIST example and see how it performs. Feel free to play with the parameters.

If you found this task too easy, try to implement a "fully vectorized" version, i.e. one using matrix operations instead of going over examples one by one.

In [12]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated with NumPy.

    Works on scalars and, element-wise, on arrays.
    """
    exp_neg_z = np.exp(-z)
    return 1.0 / (1.0 + exp_neg_z)

def sigmoid_prime(z):
    """Derivative of the sigmoid at ``z``: s * (1 - s) with s = sigmoid(z)."""
    s = sigmoid(z)
    return s * (1 - s)

class Network(object):
    """Fully connected feed-forward network with sigmoid activations.

    Parameters are trained by mini-batch gradient descent. Gradients come from
    a vectorised backpropagation pass in which every column of the input
    matrix is one example. Layer widths are taken from ``sizes`` everywhere,
    so the class is not tied to the 784-30-10 MNIST architecture.
    """

    def __init__(self, sizes):
        """Create a network with randomly initialised parameters.

        sizes -- sequence of layer widths, input layer first
                 (e.g. ``[784, 30, 10]``).

        Biases and weights are drawn from the standard normal distribution
        using NumPy's global RNG. Weights are indexed by target node first:
        ``weights[l]`` has shape ``(sizes[l + 1], sizes[l])`` and ``biases[l]``
        has shape ``(sizes[l + 1], 1)``.
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]

    def feedforward(self, a):
        """Return the output activations for input ``a``.

        a -- 2-D array of shape ``(sizes[0], n)`` with one example per column
             (``n == 1`` evaluates a single case).

        Returns an array of shape ``(sizes[-1], n)``.
        """
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a) + b)
        return a

    def update_mini_batch(self, x_mini_batch, y_mini_batch, eta):
        """Apply one gradient-descent step computed on a mini-batch.

        x_mini_batch -- sequence/array of inputs; each example is flattened to
                        ``sizes[0]`` values.
        y_mini_batch -- sequence/array of targets; each example is flattened
                        to ``sizes[-1]`` values.
        eta          -- learning rate; the summed batch gradient is scaled by
                        ``eta / batch_size``.

        An empty mini-batch leaves the parameters untouched.
        """
        batch_size = len(x_mini_batch)
        if batch_size == 0:
            # No examples means no gradient; also avoids dividing by zero below.
            return

        # Stack the examples column-wise: (features, batch) / (outputs, batch).
        xs = np.asarray(x_mini_batch).reshape(batch_size, self.sizes[0]).T
        ys = np.asarray(y_mini_batch).reshape(batch_size, self.sizes[-1]).T
        nabla_b, nabla_w = self.backprop(xs, ys)

        step = eta / batch_size
        self.weights = [w - step * nw for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b - step * nb for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        """Return ``(nabla_b, nabla_w)``, the gradients summed over a batch.

        x -- inputs, shape ``(sizes[0], n)``, one example per column.
        y -- targets, shape ``(sizes[-1], n)``, one example per column.

        ``nabla_b`` and ``nabla_w`` are lists of arrays shaped like
        ``self.biases`` and ``self.weights``. Each entry is the sum (not the
        mean) of the per-example gradients; ``update_mini_batch`` does the
        averaging.
        """
        # Forward pass, remembering the activation of every layer.
        activations = [x]
        for b, w in zip(self.biases, self.weights):
            activations.append(sigmoid(np.dot(w, activations[-1]) + b))

        # Backward pass. grad_a is dC/da for the current layer; delta is dC/dz,
        # using sigmoid'(z) = a * (1 - a) so pre-activations need not be stored.
        n_layers = len(self.weights)
        deltas = [None] * n_layers
        grad_a = self.cost_derivative(activations[-1], y)
        for layer in range(n_layers - 1, -1, -1):
            a = activations[layer + 1]
            delta = grad_a * a * (1 - a)
            deltas[layer] = delta
            if layer > 0:
                # The gradient w.r.t. the network input is never needed.
                grad_a = np.dot(self.weights[layer].T, delta)

        # dC/dW = delta . a_prev^T sums over the batch; dC/db sums delta columns.
        nabla_w = [np.dot(delta, a_prev.T)
                   for delta, a_prev in zip(deltas, activations[:-1])]
        nabla_b = [np.sum(delta, axis=1, keepdims=True) for delta in deltas]
        return nabla_b, nabla_w

    def evaluate(self, x_test_data, y_test_data):
        """Return the fraction of test examples classified correctly.

        x_test_data -- sequence/array of inputs; each example is flattened to
                       ``sizes[0]`` values.
        y_test_data -- sequence/array of target vectors (e.g. one-hot rows).

        The predicted class is the index of the largest output activation and
        is compared with the index of the largest target entry. All examples
        are pushed through the network in a single batched forward pass.
        """
        n_examples = len(x_test_data)
        inputs = np.asarray(x_test_data).reshape(n_examples, self.sizes[0]).T
        predicted = np.argmax(self.feedforward(inputs), axis=0)
        expected = np.argmax(np.asarray(y_test_data).reshape(n_examples, -1), axis=1)
        # return accuracy
        return np.mean(predicted == expected)

    def cost_derivative(self, output_activations, y):
        """Derivative of the quadratic cost w.r.t. the output activations."""
        return (output_activations - y)

    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        """Train the network with mini-batch gradient descent.

        training_data   -- pair ``(x_train, y_train)``.
        epochs          -- number of full passes over the training data.
        mini_batch_size -- examples per gradient step (positive integer).
        eta             -- learning rate.
        test_data       -- optional pair ``(x_test, y_test)``; when given, the
                           test accuracy is printed after every epoch.

        Mini-batches are consecutive slices taken in the stored order (no
        shuffling). The final batch of an epoch may be smaller than
        ``mini_batch_size`` so that no training example is skipped.

        Raises ValueError if ``mini_batch_size`` is smaller than 1.
        """
        if mini_batch_size < 1:
            raise ValueError("mini_batch_size must be a positive integer")

        x_train, y_train = training_data
        if test_data:
            x_test, y_test = test_data

        n_train = len(x_train)
        for j in range(epochs):
            # Step through the data in strides so the trailing partial batch
            # is included instead of being silently dropped.
            for start in range(0, n_train, mini_batch_size):
                stop = start + mini_batch_size
                self.update_mini_batch(x_train[start:stop], y_train[start:stop], eta)
            if test_data:
                print("Epoch: {0}, Accuracy: {1:.2f}%".format(j, 100.*self.evaluate(x_test, y_test)))
            else:
                print("Epoch: {0}".format(j))


# Seed NumPy's global RNG so the random weight initialisation in
# Network.__init__ (np.random.randn) -- and therefore the accuracies printed
# by SGD -- is the same on every "Restart & Run All".
np.random.seed(0)

# 784 inputs (flattened images) -> 30 hidden sigmoid units -> 10 outputs (one per class).
network = Network([784,30,10])
network.SGD((x_train, y_train), epochs=50, mini_batch_size=100, eta=3., test_data=(x_test, y_test))



Epoch: 0, Accuracy: 81.44%
Epoch: 1, Accuracy: 86.06%
Epoch: 2, Accuracy: 88.12%
Epoch: 3, Accuracy: 89.38%
Epoch: 4, Accuracy: 90.05%
Epoch: 5, Accuracy: 90.52%
Epoch: 6, Accuracy: 90.91%
Epoch: 7, Accuracy: 91.23%
Epoch: 8, Accuracy: 91.47%
Epoch: 9, Accuracy: 91.79%
Epoch: 10, Accuracy: 91.91%
Epoch: 11, Accuracy: 92.06%
Epoch: 12, Accuracy: 92.24%
Epoch: 13, Accuracy: 92.38%
Epoch: 14, Accuracy: 92.46%
Epoch: 15, Accuracy: 92.59%
Epoch: 16, Accuracy: 92.65%
Epoch: 17, Accuracy: 92.76%
Epoch: 18, Accuracy: 92.86%
Epoch: 19, Accuracy: 92.91%
Epoch: 20, Accuracy: 92.95%
Epoch: 21, Accuracy: 93.02%
Epoch: 22, Accuracy: 93.11%
Epoch: 23, Accuracy: 93.18%
Epoch: 24, Accuracy: 93.20%
Epoch: 25, Accuracy: 93.26%
Epoch: 26, Accuracy: 93.31%
Epoch: 27, Accuracy: 93.39%
Epoch: 28, Accuracy: 93.45%
Epoch: 29, Accuracy: 93.49%
Epoch: 30, Accuracy: 93.52%
Epoch: 31, Accuracy: 93.59%
Epoch: 32, Accuracy: 93.59%
Epoch: 33, Accuracy: 93.59%
Epoch: 34, Accuracy: 93.59%
Epoch: 35, Accuracy: 93.63%
Ep