**Python Version   : 2.7.11** <br>
**IPython Version  : 4.0.1**

In [14]:
import os
import six
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

In [15]:
# Forward and backward functions for the linear, softmax, cross-entropy and tanh nodes.

def linear_forward(x, W, b):
    """Affine map: multiply ``x`` by the transpose of ``W`` and add ``b``.

    Returns ``np.dot(x, W.T) + b``.
    """
    projected = np.dot(x, W.T)
    return projected + b

def linear_backward(dz, x, W, b):
    """Back-propagate the upstream gradient ``dz`` through the affine map.

    Returns a tuple ``(dx, dW, db)``:
      * ``dx`` -- ``dz`` multiplied by ``W``
      * ``dW`` -- transpose of ``dz`` multiplied by ``x``
      * ``db`` -- ``dz`` summed over axis 0

    ``b`` is accepted for signature symmetry with the forward pass but is not
    read here.
    """
    grad_bias = dz.sum(axis=0)
    grad_weight = np.dot(dz.T, x)
    grad_input = np.dot(dz, W)
    return grad_input, grad_weight, grad_bias

def cross_entropy_loss_forward(p, y):
    """Mean negative log-probability of the target classes.

    For each row ``i`` of ``p`` the entry at column ``y[i]`` is selected; the
    result is minus the mean of the logs of those entries.
    """
    row_index = np.arange(len(y))
    target_prob = p[row_index, y]
    return -np.log(target_prob).mean()

def cross_entropy_loss_backward(p, y):
    """Gradient of the mean cross-entropy loss with respect to ``p``.

    Parameters
    ----------
    p : 2-D array of probabilities, one row per sample.
    y : integer class index for each row of ``p``.

    Returns
    -------
    Array shaped like ``p`` that is zero everywhere except at
    ``[i, y[i]]``, where it holds ``(-1 / len(y)) / p[i, y[i]]``.
    """
    rows = np.arange(len(y))
    grad = np.zeros_like(p)
    # Only the target entries have a non-zero derivative, so divide only
    # there.  Dividing the whole (mostly zero) array by ``p`` would evaluate
    # 0/0 = NaN wherever a non-target probability has underflowed to exactly
    # zero, and that NaN then propagates into every downstream gradient.
    grad[rows, y] = (-1.0 / len(y)) / p[rows, y]
    return grad

def softmax_forward(z):
    """Row-wise softmax of a 2-D array.

    The per-row maximum is subtracted before exponentiating so the largest
    exponent is zero.
    """
    shifted = z - z.max(axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=1, keepdims=True)

def softmax_backward(dp, z):
    """Back-propagate ``dp`` (gradient w.r.t. the softmax output) to the logits ``z``.

    The softmax of ``z`` is recomputed here rather than passed in.  With
    ``p = softmax(z)`` the result is ``p * dp - p * sum(p * dp, axis=1)``.
    """
    probs = softmax_forward(z)
    weighted = probs * dp
    row_total = weighted.sum(axis=1, keepdims=True)
    return weighted - probs * row_total

def tanh_forward(x):
    """Apply the hyperbolic tangent element-wise to ``x``."""
    activated = np.tanh(x)
    return activated

def tanh_backward(dy, y):
    """Back-propagate ``dy`` through tanh, given the forward *output* ``y``.

    Uses the identity d tanh(x)/dx = 1 - tanh(x)**2, so only ``y`` is needed,
    not the original input.
    """
    local_slope = 1.0 - y**2
    return local_slope * dy

In [16]:
def load_mnist_data(path, num_train=60000):
    """Load a pickled MNIST dictionary and split it into train and test parts.

    The pickle must contain a mapping with the keys ``'data'`` and
    ``'target'``.  ``'data'`` is converted to float32, divided by 255 and
    reshaped to ``(N, 1, 28, 28)``; ``'target'`` is converted to int32.

    Parameters
    ----------
    path : path of the pickle file to read.
    num_train : number of leading samples that form the training split;
        everything after them forms the test split.

    Returns
    -------
    ``(input_train, target_train, input_test, target_test)``
    """
    # NOTE(security): unpickling can execute arbitrary code -- only load
    # files from a trusted source.
    # The ``with`` block closes the file handle even if loading fails.
    with open(path, 'rb') as mnist_pickle:
        mnist = six.moves.cPickle.load(mnist_pickle)

    data = mnist['data'].astype(np.float32)
    data /= 255
    data = data.reshape(data.shape[0], 1, 28, 28)
    target = mnist['target'].astype(np.int32)

    input_train, input_test = np.split(data, [num_train])
    target_train, target_test = np.split(target, [num_train])
    return input_train, target_train, input_test, target_test

In [17]:
class Node(object):
    """Base class for graph nodes; owns a dict of values saved for later use."""

    def __init__(self):
        self.cache = dict()

    def store_cache(self, **kwargs):
        """Save a private copy of every keyword argument under its keyword name.

        A fresh array is allocated and filled, so later changes to the
        caller's array do not affect the cached value.
        """
        for name in kwargs:
            value = kwargs[name]
            snapshot = np.empty_like(value)
            snapshot[...] = value
            self.cache[name] = snapshot

    def read_cache(self, key):
        """Return the array previously stored under ``key``."""
        return self.cache[key]

In [18]:
class Tanh(Node):
    """Element-wise tanh activation node."""

    def __init__(self):
        super(Tanh, self).__init__()

    def forward(self, x):
        """Return tanh(x) and cache the output for the backward pass."""
        activation = tanh_forward(x)
        self.store_cache(y=activation)
        return activation

    def backward(self, dy):
        """Map the upstream gradient ``dy`` to a gradient w.r.t. the input."""
        return tanh_backward(dy, y=self.read_cache('y'))

In [19]:
# Z = Wx + b
class Linear(Node):
    """Fully connected layer computing ``x . W^T + b``.

    ``W`` has shape ``(nout, nin)`` and is drawn from a normal distribution
    with mean 0 and standard deviation ``1/sqrt(nin)``; ``b`` has shape
    ``(1, nout)`` and starts at zero.  ``dW`` and ``db`` are accumulation
    buffers of matching shape.
    """

    def __init__(self, nin, nout):
        super(Linear, self).__init__()
        init_std = 1.0/np.sqrt(nin)
        self.W = np.random.normal(0, init_std, (nout, nin))
        self.b = np.zeros((1, nout))
        self.dW = np.zeros_like(self.W)
        self.db = np.zeros_like(self.b)

    def forward(self, x):
        """Cache the input and return the affine output."""
        self.store_cache(x=x)
        return linear_forward(x, self.W, self.b)

    def backward(self, dy):
        """Add this call's parameter gradients to ``dW``/``db``; return the input gradient."""
        cached_x = self.read_cache('x')
        grad_x, grad_W, grad_b = linear_backward(dy, cached_x, self.W, self.b)
        # Gradients are accumulated in place, never rebound, so outside
        # references to the buffers stay valid.
        self.dW += grad_W
        self.db += grad_b
        return grad_x

    def zerograds(self):
        """Reset both gradient buffers to zero in place."""
        for buf in (self.dW, self.db):
            buf.fill(0)

In [20]:
class SoftmaxCrossEntropyLoss(Node):
    """Softmax followed by mean cross-entropy, as a single loss node."""

    def __init__(self):
        super(SoftmaxCrossEntropyLoss, self).__init__()

    def forward(self, x, y):
        """Return the loss for logits ``x`` and integer targets ``y``.

        The logits, the targets and the softmax probabilities are cached for
        the backward pass.
        """
        self.store_cache(x=x, y=y)
        probs = softmax_forward(x)
        self.store_cache(softmax=probs)
        return cross_entropy_loss_forward(probs, y)

    def backward(self):
        """Return the gradient of the loss with respect to the cached logits."""
        grad_probs = cross_entropy_loss_backward(
            self.read_cache('softmax'), self.read_cache('y'))
        return softmax_backward(grad_probs, self.read_cache('x'))

In [21]:
def sgd(params, grads, lr=0.1):
    """One step of plain gradient descent.

    Each entry of ``params`` is paired with the entry of ``grads`` at the
    same position and has ``lr * grad`` subtracted from it with ``-=``, so
    array parameters are modified in place.
    """
    for weights, gradient in zip(params, grads):
        step = lr * gradient
        weights -= step

In [22]:
# Load MNIST; images come back as (N, 1, 28, 28) arrays.
input_train, target_train, input_test, target_test = load_mnist_data('data/mnist.pkl')
N_train = len(input_train)
N_test = len(input_test)

# Network sizes and learning rate.
ninput = 784     # 28 * 28 pixels per flattened image
nhidden = 1024
noutput = 10
eta = 0.1

# Two-layer perceptron: Linear -> Tanh -> Linear -> softmax cross-entropy.
linearObject_0 = Linear(ninput, nhidden)
tanh = Tanh()
linearObject_1 = Linear(nhidden, noutput)
loss_func = SoftmaxCrossEntropyLoss()

# `params` and `grads` hold references to the layers' own arrays, so the
# in-place updates made by sgd() and zerograds() act on the layers directly.
params = []
grads = []

for layer in [linearObject_0, linearObject_1]:
    params += [layer.W, layer.b]
    grads += [layer.dW, layer.db]

# Flatten each image to a vector for the fully connected layers.
input_train = input_train.reshape((N_train,ninput))
batch_size = 16
# A single pass over the training set, one mini-batch at a time.
for i in range(0,N_train,batch_size):
    xbatch = input_train[i:i+batch_size]
    ybatch = target_train[i:i+batch_size]

    # Forward
    hin = linearObject_0.forward(xbatch)
    h = tanh.forward(hin)
    out = linearObject_1.forward(h)
    loss = loss_func.forward(out, ybatch)

    # Clear the gradient buffer (Linear.backward accumulates with +=)
    linearObject_0.zerograds()
    linearObject_1.zerograds()

    # Backward
    dout = loss_func.backward()
    dh = linearObject_1.backward(dout)
    dhin = tanh.backward(dh)
    dx = linearObject_0.backward(dhin)

    # Parameter update
    sgd(params, grads, lr = eta)

    # Compute accuracy on this batch (measured before the update above)
    pred = np.argmax(out, axis=1)
    acc = (pred==ybatch).mean()

    # Print statistics.  The single-argument call form prints the same text
    # on Python 2 and is also valid on Python 3.
    print('{} loss={} acc={}'.format(i, loss, acc))

0 loss=2.2763555368 acc=0.125
16 loss=2.27386806915 acc=0.0625
32 loss=2.31361921232 acc=0.1875
48 loss=1.73830357904 acc=0.375
64 loss=1.68357825195 acc=0.5625
80 loss=1.43382806832 acc=0.5625
96 loss=1.41022805513 acc=0.625
112 loss=1.10570397395 acc=0.6875
128 loss=1.59004625084 acc=0.625
144 loss=1.16588914775 acc=0.6875
160 loss=1.26940649366 acc=0.6875
176 loss=1.27240149343 acc=0.6875
192 loss=0.997602545277 acc=0.6875
208 loss=0.950229475484 acc=0.75
224 loss=1.07652106588 acc=0.75
240 loss=1.09567899015 acc=0.625
256 loss=1.20474164146 acc=0.625
272 loss=0.886932083001 acc=0.6875
288 loss=0.633736556264 acc=0.875
304 loss=0.845196492506 acc=0.75
320 loss=0.790317349152 acc=0.6875
336 loss=0.855348195052 acc=0.6875
352 loss=0.699846344071 acc=0.8125
368 loss=0.576414579823 acc=0.8125
384 loss=0.731216335059 acc=0.875
400 loss=0.710466759378 acc=0.8125
416 loss=0.723464068755 acc=0.8125
432 loss=0.669289007028 acc=0.8125
448 loss=0.734324798209 acc=0.875
464 loss=0.631581197231 

In [25]:
# Compute test accuracy
# Flatten the test images the same way the training images were flattened.
input_test = input_test.reshape((N_test,ninput))
test_acc = 0
for i in range(0, len(input_test), batch_size):
    xbatch = input_test[i:i + batch_size]
    ybatch = target_test[i:i + batch_size]

    # Forward (no loss node: only the arg-max of the logits is needed)
    hin = linearObject_0.forward(xbatch)
    h = tanh.forward(hin)
    out = linearObject_1.forward(h)

    pred = np.argmax(out, axis=1)
    # Count correct predictions; normalised once after the loop.
    test_acc += (pred==ybatch).sum()
test_acc = float(test_acc) / N_test
# The single-argument call form prints the same text on Python 2 and is also
# valid on Python 3.
print("Test accuracy={}".format(test_acc))

Test accuracy=0.9163
