# 3-layer Neural Network for Classification 
without the deep learning framework (only python)

## 0.  Import dependency package

In [1]:
import numpy as np
import gzip
from PIL import Image
from matplotlib import pyplot as plt
%matplotlib inline

## 1. Load data
The following functions are retrieved from https://stackoverflow.com/questions/40427435/extract-images-from-idx3-ubyte-file-or-gzip-via-python

In [2]:
def training_images():
    with gzip.open('data/train-images-idx3-ubyte.gz', 'r') as f:
        # first 4 bytes is a magic number
        magic_number = int.from_bytes(f.read(4), 'big')
        # second 4 bytes is the number of images
        image_count = int.from_bytes(f.read(4), 'big')
        # third 4 bytes is the row count
        row_count = int.from_bytes(f.read(4), 'big')
        # fourth 4 bytes is the column count
        column_count = int.from_bytes(f.read(4), 'big')
        # rest is the image pixel data, each pixel is stored as an unsigned byte
        # pixel values are 0 to 255
        image_data = f.read()
        images = np.frombuffer(image_data, dtype=np.uint8)\
            .reshape((image_count, row_count, column_count))
        return images


def training_labels():
    with gzip.open('data/train-labels-idx1-ubyte.gz', 'r') as f:
        # first 4 bytes is a magic number
        magic_number = int.from_bytes(f.read(4), 'big')
        # second 4 bytes is the number of labels
        label_count = int.from_bytes(f.read(4), 'big')
        # rest is the label data, each label is stored as unsigned byte
        # label values are 0 to 9
        label_data = f.read()
        labels = np.frombuffer(label_data, dtype=np.uint8)
        return labels

In [3]:
X_t = training_images()
Y_t = training_labels()

X_t = X_t.reshape(-1,784)

# normalization
X_t = X_t / 255

In [4]:
print(X_t.shape)

(60000, 784)


## 2. Generate Neural Network

### 2.0 Activation Function

In [5]:
class ReLU:
    def prop(self, X):
        return np.maximum(0, X)
    
    def derivative(self, X):
        result = np.copy(X)
        result[result >= 0] = 1
        result[result < 0] = 0
        return result

class LeakyReLU:
    def __init__(self, hyper):
        self.hyper = hyper
        
    def prop(self, X):
        return np.maximum(self.hyper*X, X)
    
    def derivative(self, X):
        result = np.copy(X)
        result[result >= 0] = 1
        result[result < 0] = self.hyper
        return result

### 2.1 Layer_Dense

In [6]:
class Layer_dense:
    def __init__(self, input_size, output_size, activation):
        self.W = np.random.normal(size=(input_size, output_size))
        self.b = np.random.normal(size=(1,output_size))
        self.L = None
        self.activation_func = activation()
    
    def prop(self, X):
        self.L = np.dot(X, self.W) + self.b
        return self.activation_func.prop(self.L)
    
    def derivative(self):
        return self.W
    
    def activation_derivative(self):
        return self.activation_func.derivative(self.L)
    
    def update(self, learning_rate, dW, db):
        self.W -= learning_rate * dW
        self.b -= learning_rate * db

### 2.2 Softmax

In [7]:
class Softmax:
    def prop(self, X):
        return np.exp(X)/np.sum(np.exp(X), axis=0)
    
    def derivative(self, Y_hat, Y):
        "I will use this function for derivative loss_func(softmax(Y_hat),Y)"
        result = np.copy(Y_hat)
        result[range(result.shape[0]),Y] -= 1
        return result 
    
    def _derivative(self, X):
        """I will not use this function, just implementation of partial derivative"""
        result = np.zeros((*X.shape,X.shape[-1]))
        S = Softmax.prop(X)
        for k, V in enumerate(X):
            for i,x1 in enumerate(*V):
                for j,x2 in enumerate(*V):
                    if i==j:
                        result[k,i,j] = S[k,i] * (1 - S[k,j])
                    else:
                        result[k,i,j] = S[k,i] * S[k,j] * -1
        return result

### 2.3 Multi-Class Cross entropy function

In [8]:
class Cross_entropy:
    def prop(self, Y_hat, Y):
        return -1 * np.sum(np.log(Y_hat[range(Y_hat.shape[0]),Y]))
    

    def derivative(self, Y_hat, Y):
        "I will not use this function, just implementation of derivative"
        result = np.zeros(Y_hat.shape)
        result[range(Y_hat.shape[0]),Y] = -1 * (1/Y_hat[range(Y_hat.shape[0]),Y])
        return result

### 2.4 Neural Network

In [9]:
class NN:
    def __init__(self, X_t, Y_t, loss, learning_rate, batch_size, epoch):
        self.X_t = X_t
        self.Y_t = Y_t
        
        self.loss_func = loss()
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.epoch = epoch
        
        self.layers = []
        self.inputs = []

        
    def add(self, layer):
        self.layers.append(layer)
    
    def prop(self, X):
        result = X
        self.inputs = [X]
        for i, layer in enumerate(self.layers):
            result = layer.prop(result)
            self.inputs.append(result)
            
        return result
    
    def backprop(self, Y):
        dZ = self.layers[-1].derivative(self.inputs[-1], Y) # softmax+cost 
        for i,layer in enumerate(reversed(self.layers[:-1]),3):
            dZ = np.multiply(layer.activation_derivative(), dZ)
            dW = np.dot(self.inputs[-i].T,dZ) / self.batch_size
            db = np.sum(dZ, axis=0, keepdims = True) / self.batch_size
            dZ = np.dot(dZ,layer.derivative().T)
            layer.update(self.learning_rate, dW, db)
    
    def get_loss(self, Y):
        return self.loss_func.prop(self.inputs[-1],Y) / self.batch_size
    
    def fit(self, verbose=True):
        
        div = int(self.X_t.shape[0] / self.batch_size)
        mod = self.X_t.shape[0] % self.batch_size
        n = div if mod < 100 else div + 1
        
        for i in range(self.epoch):
            for j,X,Y in self.batch_iter(div,mod,n):
                self.prop(X)
                self.backprop(Y)
#                 if verbose:
#                     print(f"[{j}/{n}] cost: {self.get_loss(Y)}")
            if verbose:
                print(f"[{i}/{self.epoch}] cost: {self.get_loss(Y)}")

    
    def batch_iter(self,div,mod,n):
        for i in range(n):
            if i == n - 1:
                yield i, self.X_t[i*self.batch_size:(i+1)*self.batch_size+mod+1], self.Y_t[i*self.batch_size:(i+1)*self.batch_size+mod+1]
            else:
                yield i, self.X_t[i*self.batch_size:(i+1)*self.batch_size], self.Y_t[i*self.batch_size:(i+1)*self.batch_size]
    
    def predict(self, X):
        for layer in layers:
            X = layer(X)

In [10]:
# X_t, Y_t, loss, learning_rate, batch_size, epoch
nn = NN(X_t, Y_t, Cross_entropy, 0.01, 256, 1000)
nn.add(Layer_dense(784,64,ReLU))
nn.add(Layer_dense(64,10,ReLU))
nn.add(Softmax())

In [11]:
nn.fit()

  return np.exp(X)/np.sum(np.exp(X), axis=0)
  return np.exp(X)/np.sum(np.exp(X), axis=0)


[0/1000] cost: nan
[1/1000] cost: nan
[2/1000] cost: nan
[3/1000] cost: nan
[4/1000] cost: nan
[5/1000] cost: nan
[6/1000] cost: nan
[7/1000] cost: nan
[8/1000] cost: nan
[9/1000] cost: nan
[10/1000] cost: nan
[11/1000] cost: nan
[12/1000] cost: nan
[13/1000] cost: nan
[14/1000] cost: nan
[15/1000] cost: nan
[16/1000] cost: nan
[17/1000] cost: nan
[18/1000] cost: nan
[19/1000] cost: nan
[20/1000] cost: nan
[21/1000] cost: nan
[22/1000] cost: nan


KeyboardInterrupt: 

### Trivial

In [None]:
a = np.arange(10).reshape(5,-1)
b = np.array([1,2])
np.multiply(a,b)