**Python Version   : 2.7.11** <br>
**IPython Version  : 4.0.1**

In [35]:
import os
import six
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

In [36]:
# Forward and backward functions for the linear, softmax, cross-entropy and ReLU nodes.

def linear_forward(x, W, b):
    """Affine map ``x . W^T + b``.

    Args:
        x: input array; its last axis is contracted against the last axis
           of ``W`` (the ``Linear`` node passes a 2-D batch here).
        W: weight matrix; ``Linear`` stores it as ``(nout, nin)``.
        b: bias, broadcast-added to the product.

    Returns:
        The affine projection of ``x``.
    """
    projected = np.dot(x, np.transpose(W))
    return projected + b

def linear_backward(dz, x, W, b):
    """Gradients of the affine map ``z = x . W^T + b``.

    Args:
        dz: gradient of the loss with respect to the output ``z``.
        x:  input that was fed to the forward pass.
        W:  weight matrix used in the forward pass.
        b:  bias used in the forward pass (not needed for the gradients,
            kept for a signature parallel to ``linear_forward``).

    Returns:
        Tuple ``(dx, dW, db)``: gradients with respect to the input, the
        weights, and the bias (``dz`` summed over axis 0).
    """
    grad_input = np.dot(dz, W)
    grad_weight = np.dot(np.transpose(dz), x)
    grad_bias = np.sum(dz, axis=0)
    return grad_input, grad_weight, grad_bias

def cross_entropy_loss_forward(p, y):
    """Mean negative log-likelihood of the target classes.

    Args:
        p: 2-D array of per-row class probabilities.
        y: integer class index for each row of ``p``.

    Returns:
        ``-mean(log(p[i, y[i]]))`` over all rows.
    """
    row_index = np.arange(len(y))
    target_prob = p[row_index, y]
    return -np.mean(np.log(target_prob))

def cross_entropy_loss_backward(p, y):
    """Gradient of the mean cross-entropy loss with respect to ``p``.

    For ``L = -mean_i log p[i, y[i]]`` the derivative is
    ``-1 / (N * p[i, y[i]])`` at each target entry and exactly zero
    everywhere else.

    The division is applied only at the target entries.  Dividing the whole
    zero-filled matrix by ``p`` would evaluate ``0 / 0`` (NaN) wherever a
    non-target probability has underflowed to zero, and that NaN would then
    propagate through the rest of the backward pass.

    Args:
        p: 2-D array of per-row class probabilities.
        y: integer class index for each row of ``p``.

    Returns:
        Array shaped like ``p`` holding dL/dp.
    """
    p = np.asarray(p)
    rows = np.arange(len(y))
    dp = np.zeros_like(p)
    # Same arithmetic as before at the target entries: (-1/N) / p_target.
    dp[rows, y] = -(1.0 / len(y)) / p[rows, y]
    return dp

def softmax_forward(z):
    """Row-wise softmax of a 2-D array.

    The per-row maximum is subtracted before exponentiating so that the
    largest exponent is 0, which keeps ``exp`` from overflowing.

    Args:
        z: 2-D array of scores; normalisation runs along axis 1.

    Returns:
        Array shaped like ``z`` whose rows sum to 1.
    """
    row_peak = np.max(z, axis=1, keepdims=True)
    shifted_exp = np.exp(z - row_peak)
    return shifted_exp / np.sum(shifted_exp, axis=1, keepdims=True)

def softmax_backward(dp, z):
    """Back-propagate a gradient through the row-wise softmax.

    Recomputes ``p = softmax_forward(z)`` and applies the softmax
    Jacobian-vector product ``p * (dp - sum_j(p_j * dp_j))`` per row.

    Args:
        dp: gradient of the loss with respect to the softmax output.
        z:  scores that were fed to ``softmax_forward``.

    Returns:
        Gradient of the loss with respect to ``z``.
    """
    probs = softmax_forward(z)
    weighted = probs * dp
    row_total = weighted.sum(axis=1, keepdims=True)
    return weighted - probs * row_total

def relu_forward(hin):
    """Rectified linear unit: element-wise ``max(hin, 0)``."""
    floor = 0
    return np.maximum(hin, floor)

def relu_backward(dh, hin):
    """Back-propagate a gradient through the ReLU.

    The incoming gradient passes through wherever the forward input was
    non-negative (an input of exactly 0 counts as active) and is zeroed
    elsewhere.

    Args:
        dh:  gradient of the loss with respect to the ReLU output.
        hin: input that was fed to ``relu_forward``.

    Returns:
        Gradient of the loss with respect to ``hin``.
    """
    active = np.greater_equal(hin, 0)
    blocked = np.zeros_like(dh)
    return np.where(active, dh, blocked)

In [37]:
def load_mnist_data(path, num_train=60000):
    """Load a pickled MNIST dictionary and split it into train/test parts.

    The pickle must hold a mapping with a ``'data'`` array and a
    ``'target'`` array.  Pixel values are converted to ``float32`` and
    divided by 255, images are reshaped to ``(N, 1, 28, 28)``, and labels
    are cast to ``int32``.

    Args:
        path: location of the pickle file.
        num_train: index at which to split; the first ``num_train`` samples
            form the training set and the remainder the test set.

    Returns:
        Tuple ``(input_train, target_train, input_test, target_test)``.
    """
    # NOTE(security): unpickling can execute arbitrary code -- only load
    # files from a trusted source.
    # The context manager closes the file even if unpickling raises.
    with open(path, 'rb') as mnist_pickle:
        mnist = six.moves.cPickle.load(mnist_pickle)

    data = mnist['data'].astype(np.float32)
    data /= 255
    data = data.reshape(data.shape[0], 1, 28, 28)
    target = mnist['target'].astype(np.int32)

    input_train, input_test = np.split(data, [num_train])
    target_train, target_test = np.split(target, [num_train])
    return input_train, target_train, input_test, target_test

In [38]:
class Node(object):
    """Base class for graph nodes that remember forward-pass values.

    Values are kept in ``self.cache`` (a dict keyed by name) so that the
    backward pass can read them later.
    """

    def __init__(self):
        self.cache = dict()

    def store_cache(self, **kwargs):
        """Store a private copy of every keyword argument under its name.

        A fresh array is allocated for each value, so later changes to the
        caller's array do not affect what is cached.
        """
        for name in kwargs:
            value = kwargs[name]
            buf = self.cache[name] = np.empty_like(value)
            buf[...] = value

    def read_cache(self, key):
        """Return the array cached under ``key`` (``KeyError`` if absent)."""
        return self.cache[key]

In [39]:
class ReLU(Node):
    """ReLU activation node; caches its input for the backward pass.

    Construction is inherited unchanged from ``Node``.
    """

    def forward(self, x):
        """Cache ``x`` and return ``relu_forward(x)``."""
        self.store_cache(x=x)
        return relu_forward(x)

    def backward(self, dy):
        """Return the gradient with respect to the cached forward input."""
        return relu_backward(dy, self.read_cache('x'))

In [40]:
# Z = x W^T + b  (W is stored as (nout, nin); see linear_forward)
class Linear(Node):
    """Fully connected layer with accumulated parameter gradients.

    Attributes:
        W:  weights, shape ``(nout, nin)``, drawn from a normal distribution
            with mean 0 and standard deviation ``1 / sqrt(nin)``.
        b:  bias, shape ``(1, nout)``, initialised to zero.
        dW: gradient buffer shaped like ``W``.
        db: gradient buffer shaped like ``b``.
    """

    def __init__(self, nin, nout):
        super(Linear, self).__init__()
        weight_std = 1.0 / np.sqrt(nin)
        self.W = np.random.normal(loc=0, scale=weight_std, size=(nout, nin))
        self.b = np.zeros((1, nout))
        self.dW = np.zeros_like(self.W)
        self.db = np.zeros_like(self.b)

    def forward(self, x):
        """Cache ``x`` and return the affine projection."""
        self.store_cache(x=x)
        return linear_forward(x, self.W, self.b)

    def backward(self, dy):
        """Accumulate parameter gradients and return the input gradient.

        Gradients are *added* to ``dW``/``db``; call ``zerograds`` first to
        start from zero.
        """
        cached_x = self.read_cache('x')
        grad_x, grad_W, grad_b = linear_backward(dy, cached_x, self.W, self.b)
        self.dW += grad_W
        self.db += grad_b
        return grad_x

    def zerograds(self):
        """Reset both gradient buffers to zero in place."""
        for buf in (self.dW, self.db):
            buf.fill(0)
In [41]:
class SoftmaxCrossEntropyLoss(Node):
    """Softmax followed by mean cross-entropy, treated as a single node."""

    def __init__(self):
        super(SoftmaxCrossEntropyLoss, self).__init__()

    def forward(self, x, y):
        """Return the mean cross-entropy of ``softmax(x)`` against ``y``.

        Args:
            x: 2-D array of scores, one row per sample.
            y: integer class index for each row of ``x``.

        Caches ``x``, ``y`` and the softmax probabilities for ``backward``.
        """
        softmax = softmax_forward(x)
        self.store_cache(x=x, y=y, softmax=softmax)
        return cross_entropy_loss_forward(softmax, y)

    def backward(self):
        """Return the gradient of the loss with respect to the scores ``x``.

        Uses the fused closed form ``(softmax - onehot(y)) / N``.  Chaining
        the separate cross-entropy and softmax gradients is mathematically
        the same, but divides by the softmax probabilities first and so
        yields NaN/inf whenever one of them underflows to zero.
        """
        targets = self.read_cache('y')
        # Work on a copy so the cached probabilities stay intact.
        grad = self.read_cache('softmax').copy()
        grad[np.arange(len(targets)), targets] -= 1.0
        grad /= len(targets)
        return grad

In [42]:
def sgd(params, grads, lr=0.1):
    """Apply one plain SGD step to every parameter, in place.

    Args:
        params: iterable of parameter arrays; each is modified in place.
        grads:  iterable of gradients, paired with ``params`` by position.
        lr:     learning rate.
    """
    for weight, gradient in zip(params, grads):
        step = lr * gradient
        weight -= step

In [43]:
# --- Data -------------------------------------------------------------------
input_train, target_train, input_test, target_test = load_mnist_data('data/mnist.pkl')
N_train = len(input_train)
N_test = len(input_test)

# --- Hyper-parameters -------------------------------------------------------
ninput = 784     # 28 * 28 pixels per flattened image
nhidden = 1024
noutput = 10
eta = 0.1        # learning rate passed to sgd

# --- Model: Linear -> ReLU -> Linear -> softmax cross-entropy ---------------
linearObject_0 = Linear(ninput, nhidden)
relu = ReLU()
linearObject_1 = Linear(nhidden, noutput)
loss_func = SoftmaxCrossEntropyLoss()

# Flat lists that sgd() walks in lock-step.  They hold references to the
# layers' own arrays, so in-place updates reach the layers directly.
params = []
grads = []
for layer in (linearObject_0, linearObject_1):
    params.extend([layer.W, layer.b])
    grads.extend([layer.dW, layer.db])

# --- One pass over the training set in mini-batches -------------------------
input_train = input_train.reshape((N_train, ninput))
batch_size = 16
for i in range(0, N_train, batch_size):
    xbatch = input_train[i:i + batch_size]
    ybatch = target_train[i:i + batch_size]

    # Forward
    hin = linearObject_0.forward(xbatch)
    h = relu.forward(hin)
    out = linearObject_1.forward(h)
    loss = loss_func.forward(out, ybatch)

    # Batch accuracy, taken from the pre-update forward pass
    pred = np.argmax(out, axis=1)
    acc = (pred == ybatch).mean()

    # Clear the gradient buffers (Linear.backward accumulates into them)
    linearObject_0.zerograds()
    linearObject_1.zerograds()

    # Backward
    dout = loss_func.backward()
    dh = linearObject_1.backward(dout)
    dhin = relu.backward(dh)
    _ = linearObject_0.backward(dhin)

    # Parameter update
    sgd(params, grads, lr=eta)

    # Print statistics
    print('{} loss={} acc={}'.format(i, loss, acc))

0 loss=2.44147796679 acc=0.0625
16 loss=2.32885853951 acc=0.125
32 loss=2.28554443158 acc=0.1875
48 loss=1.94327649904 acc=0.5
64 loss=1.95903196827 acc=0.25
80 loss=1.83370690378 acc=0.25
96 loss=1.82254903722 acc=0.5
112 loss=1.58850261332 acc=0.5625
128 loss=1.9585568706 acc=0.5625
144 loss=1.59126458772 acc=0.625
160 loss=1.63108175358 acc=0.625
176 loss=1.67522841624 acc=0.5625
192 loss=1.39350911545 acc=0.5
208 loss=1.26772560783 acc=0.6875
224 loss=1.44667073098 acc=0.5
240 loss=1.33562879228 acc=0.625
256 loss=1.52468738816 acc=0.625
272 loss=1.16653804258 acc=0.6875
288 loss=0.900708300301 acc=0.8125
304 loss=1.23460338226 acc=0.6875
320 loss=1.08412674766 acc=0.75
336 loss=1.05524182195 acc=0.6875
352 loss=0.90561616298 acc=0.8125
368 loss=0.819596330356 acc=0.8125
384 loss=1.01037035728 acc=0.8125
400 loss=0.917922624814 acc=0.8125
416 loss=0.907832750358 acc=0.8125
432 loss=0.885694367621 acc=0.8125
448 loss=0.8149093784 acc=0.875
464 loss=0.879504590384 acc=0.8125
480 loss

In [46]:
# Compute test accuracy: forward pass only, counting correct predictions
# batch by batch and normalising by the test-set size at the end.
input_test = input_test.reshape((N_test, ninput))
test_acc = 0
for i in range(0, len(input_test), batch_size):
    xbatch = input_test[i:i + batch_size]
    ybatch = target_test[i:i + batch_size]

    # Forward (the loss node is skipped; argmax of the scores is enough)
    hin = linearObject_0.forward(xbatch)
    h = relu.forward(hin)
    out = linearObject_1.forward(h)

    pred = np.argmax(out, axis=1)
    test_acc += np.sum(pred == ybatch)
test_acc = float(test_acc) / N_test
print("Test accuracy={}".format(test_acc))

Test accuracy=0.9607
