## Neural Network and Convolutional Neural Network Practice

Self learning practicing code. Textbook: Deep learning O'Reilly

In [4]:
import os
import numpy as np
from mnist import MNIST

### Helper functions

Because step function is not a good function for activation cell in practical use. Sigmoid and relu functions are commonly used as replacement methods. Softmax function is used for multi-class clasification problems. 

In [32]:
# helper functions

# sigmoid function, usually used for 2 classes clasification problem
def sigmoid(x):
    return 1 / ( 1 + np.exp(-x) )

# gradient of sigmoid function
def sigmoid_grad(x):
    return ( 1.0 - sigmoid(x) ) * sigmoid(x)

# ReLu
def relu(x):
    return np.maximum(0, x)

# soft max function, usually used for multi-class clasification problem
def softmax(x):
    x = x - np.max(x)
    return np.exp(x) / np.sum( np.exp(x) )

### Loss function
For all machine learning problems, we need a loss function to help our model learning. Both mean square error function and cross entropy error function are commonly used in neural network. 

$E = - \sum t_k\log y_k$

In [6]:
# error functions

# mse
def mse(y, t):
    return 0.5 * np.sum( (y-t)**2 )

# cross-entropy error
def cross_entropy_error(y, t):
    delta = 1e-7 # prevent log function error
    return -np.sum( t * np.log(y+delta) )

# batch version
def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    
    batch_size = y.shape[0]
    return -np.sum(np.log( y[np.arange(batch_size), t])) / batch_size

### Practice 1: 1 layer network
Build a simple 1 layer network with 2 cells and 3 different classes to predict.

In [15]:
# simple net practice
class simpleNet:
    def __init__(self):
        self.W = np.random.randn(2,3)
    
    def predict(self, x):
        return np.dot(x,self.W)
    
    def loss(self, x, t):
        z = self.predict(x)
        y = softmax(z)
        loss = cross_entropy_error(y, t)
        
        return loss

In [19]:
# 1 layer 3 cells simple network
net = simpleNet()

# assume inputs x1, x2 = 0.6, 0.9
x = np.array([0.6, 0.9])
y_hat = net.predict(x)
print('Prediction: ')
print(y_hat)

# assume actual result = 0, 0, 1
t =np.array([0, 0, 1])

# cross entropy error
error = net.loss(x, t)
print('Error: ')
print(error)

Prediction: 
[ 2.01593832 -0.17561157  1.47297297]
Error: 
-1.069329044419211


### Practice 2: 2 layers network
Build a simple 2 layers network with 100 cells in each layer to predict 10 different labels.

In [30]:
# 2 layers
class TwoLayerNet:
    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        self.params = {}
        
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

        
    def predict(self, x):
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']
        
        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        
        a2 = np.dot(z1, W2) + b2
        y = softmax(a2)
        
        return y
    
    def loss(self, x, t):
        y = self.predict(x)
        return cross_entropy_error(y, t)
    
    # numerical gradient
    def numerical_gradient(self, x, t):
        loss_W = lambda W: self.loss(x, t)
        
        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])
        
        return grads
    
    # graph gradient
    def gradient(self, x, t):
        W1, W2 = self.params['W1'], self.params['W2']
        b1, b2 = self.params['b1'], self.params['b2']
        grads = {}
        
        batch_num = x.shape[0]
        
        # forward
        a1 = np.dot(x, W1) + b1
        z1 = sigmoid(a1)
        a2 = np.dot(z1, W2) + b2
        y = softmax(a2)
        
        # backward
        dy = (y-t) / batch_num
        grads['W2'] = np.dot(z1.T, dy)
        grads['b2'] = np.sum(dy, axis = 0)
        
        dz1 = np.dot(dy, W2.T)
        da1 = sigmoid_grad(a1) * dz1
        grads['W1'] = np.dot(x.T, da1)
        grads['b1'] = np.sum(da1, axis = 0)
        
        return grads

Note, the matrix size is important. From input (784) to 1st layer, the computational equations can be expressed as matrix form x(784) * W1 (784,100) and bias term b1 (100,1). From later1 output to layer2 output, the equations can be expressed as (x * W1 + b1) * W2 (100,10) + b2 (10,).

In [35]:
# initiate two layers with 100 cells and output to predict 10 digits
net = TwoLayerNet(input_size = 784, hidden_size = 100, output_size= 10)
print(net.params['W1'].shape)
print(net.params['b1'].shape)
print(net.params['W2'].shape)
print(net.params['b2'].shape)

# assume 100 pics (28*28 pixel)
x = np.random.rand(100,784)
y = net.predict(x)
t = np.random.rand(100, 10)

# gradients
numerical_grads = net.numerical_gradient(x, t)
compute_graph_grads = net.gradient(x, t)
print('Numerical gradients: ')
print(numerical_grads)
print('Computational graph gradients: ')
print(compute_graph_grads)


(784, 100)
(100,)
(100, 10)
(10,)


Above practice is to know that computing numerical gradient is very slow. Using computational graph method is way faster than numerical gradient.

### Apply 2 layers model to train and test MNIST data

In [None]:
mnidata = MNIST(os.getcwd()+'/Data')
train_img, train_lab = mnidata.load_training()
test_img, test_lab = mnidata.load_testing()