### Data Set

Source - MNIST DATA SET

1. Training Data : 60,000 Samples
        - Split : 50,000 for training, 10,000 for Validation
            
2. Testing Data  : 10,000 Samples

### Importing Packages

In [5]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import random

import gzip
import pickle

### Loading MNIST Data Set

In [6]:
def load_data():
    f = gzip.open('mnist.pkl.gz', 'rb')
    training_data, validation_data, test_data = pickle.load(f, encoding='latin1')
    f.close()
    return (training_data, validation_data, test_data)


def load_data_wrapper():
    tr_d, va_d, te_d = load_data()
    
    training_inputs = [np.reshape(x, (784, 1)) for x in tr_d[0]]
    training_results = [vectorized_result(y) for y in tr_d[1]]
    training_data = list(zip(training_inputs, training_results))
    
    validation_inputs = [np.reshape(x, (784, 1)) for x in va_d[0]]
    validation_data = list(zip(validation_inputs, va_d[1]))
    
    test_inputs = [np.reshape(x, (784, 1)) for x in te_d[0]]
    test_data = list(zip(test_inputs, te_d[1]))

    return (training_data, validation_data, test_data)


def vectorized_result(j):
    e = np.zeros((10, 1))
    e[j] = 1.0
    return e

### MAIN NETWORK CLASS

In [7]:
class Network(object):
    
    # Initializing weights and biases -
    ''' Method that initalizes Weight Matrix and Biases with Random Values. 
        E.g: If we take a Neural Network of 3 layers, such that:
             - 2 input neurons in 1st layer, 3 hidden neurons in 2nd layer, 1 output neuron in 3rd layer
             - So, 3 bias values in 2nd layer, 1 bias value in 3rd layer
             - So, 3x2 weight matrix for 2nd layer, 1x3 weight matrix for 3rd layer '''
    
    def __init__(self, sizes):
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y,1) for y in sizes[1:]]
        self.weights = [np.random.randn(x,y) for x, y in zip(sizes[1:], sizes[:2])]
        
    
    # Feed Forwarding -
    ''' Feedforward is producing an ouput given an input to a function without traversing back and forth, 
        or in a loop, producing results in a uni-directional way. '''
    
    def feedforward(self, X):
        for bias, weight in zip(self.biases, self.weights):
            activation_value = X.dot(weight) + bias 
            output = sigmoid(activation_value)
        return output
    
    
    # Gradient Descent Algorithm -
    ''' Train the neural network using mini-batch Stochastic Gradient Descent. The "training_data" 
        is a list of tuples. If "test_data" is provided then the network will be evaluated 
        against the test data after each epoch, and partial progress printed out. 
   
        -- KEYWORDS -- 
   
        GradientDescent - Method to apply GD algorithm on training data set.
        epochs          - Number of iterations known as epochs
        mini_batch_size - Size of batches for stochastic process
        LR              - Learning Rate '''

    def GradientDescent(self, training_data, epochs, mini_batch_size, LR, test_data=None):
        
        len_train = len(training_data)
        
        if test_data != None: 
            len_test = len(test_data)

        for i in range(epochs):
            ## Shuffling Training Data
            random.shuffle(training_data)

            ## Dividing Training Data into mini batches of given size and storing in mini_batches as list
            for k in range(0, len_train, mini_batch_size):
                mini_batches = training_data[k : k + mini_batch_size]   
            
            ## Applying Step-wise Gradient Descent to each Batch with learning rate(LR)
            for mini_batch in mini_batches:
                self.StochasticGradientDescent(mini_batch, LR)

#         if test_data:
#             print(Epoch {0}: {1} / {2}).format(i, self.evaluate(test_data), len_test)
#         else:
#             print(Epoch {0} complete).format(i)
    
    
    # Updating Weights and biases using Stochastic GD -
    ''' Update the network's weights and biases by applying gradient descent using backpropagation 
        to a single mini batch. '''
        
    def StochasticGradientDescent(self, mini_batch, LR):

        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]

        self.weights = [w-(eta/len(mini_batch))*nw for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b-(eta/len(mini_batch))*nb for b, nb in zip(self.biases, nabla_b)]
        
        
        
    # Back Propogation - 
    ''' Return a tuple ``(nabla_b, nabla_w)`` representing the gradient for the cost function C_x. 
        ``nabla_b`` and ``nabla_w`` are layer-by-layer lists of numpy arrays, similar to 
        ``self.biases`` and ``self.weights``.'''
    
    def backprop(self, x, y):
        
        for b in self.biases:
            nabla_b = np.zeros(b.shape)
        for w in self.weights:
            nabla_w = np.zeros(w.shape)
        
        # feedforward
        activation = x
        activations = [x]  # list to store all the activations, layer by layer
        zs = []            # list to store all the z vectors, layer by layer
        
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation)+b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)
        
        # backward pass
        delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())
        
        # Note that the variable l in the loop below is used a little
        # differently to the notation in Chapter 2 of the book.  Here,
        # l = 1 means the last layer of neurons, l = 2 is the
        # second-last layer, and so on.  It's a renumbering of the
        # scheme in the book, used here to take advantage of the fact
        # that Python can use negative indices in lists.
        
        for l in xrange(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
        return (nabla_b, nabla_w)
    

    # Evaluation - 
    ''' Return the number of test inputs for which the neural network outputs the correct result. 
        Note that the neural network's output is assumed to be the index of whichever
        neuron in the final layer has the highest activation. '''

    def evaluate(self, test_data):
        
        for (x, y) in test_data:
            # Storing in a tuple:(x, y)
            test_results = (np.argmax(self.feedforward(x)), y)
            
        return sum(int(x == y) for (x, y) in test_results)                
    
        
    # Cost Function Derivative -    
    '''Return the vector of partial derivatives \partial C_x / \partial a for the output activations.'''

    def cost_derivative(self, output_activations, y):
        return (output_activations - y)

### Sigmoid Functions

In [8]:
''' The Neuron function is assumed here is a SIGMOID Function over PERCEPTRON's STEP function.
    - Sigmoid function is a continous function which is diffrentiable at every point unlike perceptron 
      which is a step function and non-diffrentiable at 0. '''

def sigmoid(z):
    return 1.0/(1.0 + np.exp(-z))


## Derivative of the Sigmoid function.
def sigmoid_prime(z):
    return sigmoid(z)*(1-sigmoid(z))

### Implementing 

In [9]:
training_data, validation_data, test_data = load_data_wrapper()

In [10]:
net = Network([784, 30, 10])
net.GradientDescent(training_data, 30, 10, 3.0, test_data=test_data)

ValueError: too many values to unpack (expected 2)