In [1]:
from network import Network

In [2]:
import mnist_loader

In [3]:
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()

In [110]:
test_data[0][0].shape

(784, 1)

In [111]:
test_data[0][1]

7

In [4]:
net = Network([784, 30, 10]) #where does 784 come from??

In [None]:
net.SGD(training_data=training_data, epochs=30, mini_batch_size=10, eta=3.0, test_data=test_data)

# --------------------

In [4]:
import random
import numpy as np
from copy import copy, deepcopy

# needed to make sure we always get the same set of randomized weights when we begin
np.random.seed(2015)

In [5]:
def sigmoid(z):
    return 1./(1. + np.exp(-z))
def sigmoid_prime(z):
    return sigmoid(z)*(1-sigmoid(z))

In [6]:
def cost(x, y):
    # use squared error
    return 0.5 * (np.linalg.norm(x-y) ** 2)

In [7]:
sizes = [784, 30, 10]

x = training_data[0][0]
y = training_data[0][1]

num_layers = len(sizes)
biases = [np.random.randn(s, 1) for s in sizes[1:]]
weights = [np.random.randn(s1, s2) for s1, s2 in zip(sizes[:-1], sizes[1:])]

In [115]:
old_weights = deepcopy(weights)
old_biases = deepcopy(biases)

In [296]:
weights = deepcopy(old_weights)
biases = deepcopy(old_biases)

In [297]:
# feedforward
# initialize
activations = [x]
z_inputs = [x]
z = np.dot(weights[0].T, x)
for w, b in zip(weights, biases):
    z = np.dot(w.T, activations[-1]) + b
    z_inputs.append(z)
    a = sigmoid(z)
    activations.append(a)

In [305]:
# backpropagate
# for layer L
# assume we have more than one layer...
eta = 0.3
nabla_b = [np.zeros((s, 1)) for s in sizes[1:]]
nabla_w = [np.zeros((s1, s2)) for s1, s2 in zip(sizes[:-1], sizes[1:])]

#initialize (for layer L = 2)
# note that the length of activations and z_inputs are different from nabla_b and nabla_w
# because activations and z_inputs includes the input
delta = (activations[-1] - y) * sigmoid_prime(z_inputs[-1])
nabla_b[-1] = delta
nabla_w[-1] = np.dot(activations[-2], delta.T)
biases[-1] = biases[-1] - eta * nabla_b[-1]
weights[-1] = weights[-1] - eta * nabla_w[-1]

# for all previous layers
for l in range(num_layers-1, 1, -1):
    delta = np.dot(weights[l-1], delta) * sigmoid_prime(z_inputs[l-1])
    nabla_b[l-2] = delta
    nabla_w[l-2] = np.dot(activations[l-2], delta.T)
    biases[l-2] = biases[l-2] - eta * nabla_b[l-2]
    weights[l-2] = weights[l-2] - eta * nabla_w[l-2]

In [231]:
nabla_b_old = deepcopy(nabla_b)
nabla_w_old = deepcopy(nabla_w)

We can determine whether or not the gradient is being calculated correctly by comparing our nabla_b and nabla_w matrices to another calculation of that gradient. To do so, we calculate: dC/dw_1_ij = (cost(weights[1][i][j] + epsilon) - cost(weights[1][i][j])) / epsilon. Then we compare that value to nabla_w[1][i][j]. 

We can do this for all the entries in each weight matrix. I did this for a few random entries and spot-checking it that way

In [30]:
batch_size = 3
n = len(data)
# batches = [data[i*batch_size:i*batch_size+batch_size] for i in xrange(0, n, batch_size)]
batches = [data[i:i+batch_size] for i in xrange(0, n, batch_size)]

In [83]:
data = training_data
batch_size = 1
n = len(data)
batch_data = [data[i:i+batch_size] for i in xrange(0, n, batch_size)]

In [86]:
type(batch_data)

list

In [99]:
training_data[100][0][0]

array([ 0.], dtype=float32)

In [91]:
import numpy as np

class NeuralNetwork:
    def __init__(self, sizes):
        self.sizes = sizes
        self.num_layers = len(sizes)
        self.biases = [np.random.randn(s, 1) for s in sizes[1:]]
        self.weights = [np.random.randn(s1, s2) for s1, s2 in zip(sizes[:-1], sizes[1:])]
        self.activations = []
        self.z_inputs = []
        
        np.random.seed(2015)
    
    def train(self, data, num_epochs=1, batch_size=1):
        for idx_epoch in range(num_epochs):
            n = len(data)
            batch_data = [data[i:i+batch_size] for i in xrange(0, n, batch_size)]
            for idx, batch in enumerate(batch_data):
                self.feedforward(x=batch[0][0], y=batch[0][1]) ## need to address this for the batch size
                self.backpropagate(y=batch[0][1])

    def test(self, data):
        error_rate = 0
        for test_data in data:
            x_test = test_data[0]
            y_test = test_data[1]
            self.feedforward(x_test, y_test)
            error = (y_test != self.activations[-1])
            if error:
                num_error += 1
        
        return num_error / (1.*len(data))

    def feedforward(self, x, y):
        self.activations = [x]
        self.z_inputs = [x]

        for w, b in zip(self.weights, self.biases):
            z = np.dot(w.T, self.activations[-1]) + b
            self.z_inputs.append(z)
            a = sigmoid(z)
            self.activations.append(a)
    
    def backpropagate(self, y, eta=0.3):
        # assume we have more than one layer...
        nabla_b = [np.zeros((s, 1)) for s in self.sizes[1:]]
        nabla_w = [np.zeros((s1, s2)) for s1, s2 in zip(self.sizes[:-1], self.sizes[1:])]

        # for layer L
        # note that the length of activations and z_inputs are different from nabla_b and nabla_w
        # because activations and z_inputs includes the input in the first entry
        delta = (self.activations[-1] - y) * sigmoid_prime(self.z_inputs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(self.activations[-2], delta.T)
        self.biases[-1] = self.biases[-1] - eta * nabla_b[-1]
        self.weights[-1] = self.weights[-1] - eta * nabla_w[-1]

        # for all previous layers
        for l in range(num_layers-1, 1, -1):
            delta = np.dot(self.weights[l-1], delta) * sigmoid_prime(self.z_inputs[l-1])
            nabla_b[l-2] = delta
            nabla_w[l-2] = np.dot(self.activations[l-2], delta.T)
            self.biases[l-2] = self.biases[l-2] - eta * nabla_b[l-2]
            self.weights[l-2] = self.weights[l-2] - eta * nabla_w[l-2]
                
#     def update_mini_batch(self, mini_batch, eta):
#         batch_size = len(mini_batch)
#         nabla_b = [np.zeros((s, 1)) for s in self.sizes[1:]]

In [88]:
sizes = [784, 30, 10]
nn = NeuralNetwork(sizes)

In [100]:
len(training_data)

50000

In [109]:
training_data[10000][1]

array([[ 0.],
       [ 0.],
       [ 0.],
       [ 1.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.]])

In [93]:
nn.train(training_data[0:10])

In [94]:
nn.test(test_data)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()