In [1]:
import numpy as np
from tqdm import tqdm
import math

## Neural network class

In [2]:
class NeuralNetwork():
    """
    Sequential container: layers are applied in insertion order and trained
    against a single loss object.

    Parameters:
    Loss_function: (class) Loss class, instantiated here with no arguments
    validation_data: (tuple) Optional validation data (X, y)
    """
    def __init__(self, Loss_function, validation_data=None):
        self.layers = []
        self.loss_function = Loss_function()
        # Per-epoch loss history, filled in by fit()
        self.errors = {"training": [], "validation": []}
        if validation_data:
            val_X, val_y = validation_data
            self.val_set = {"X": val_X, "y": val_y}
        else:
            self.val_set = None

    def add(self, layer):
        """ Append a layer, feeding it the output shape of the layer before it """
        if len(self.layers) > 0:
            previous = self.layers[-1]
            layer.set_input_shape(shape=previous.get_output_shape())
        # Layers exposing `initialize` are set up once their input shape is known
        if hasattr(layer, 'initialize'):
            layer.initialize()
        self.layers.append(layer)

    def train_on_batch(self, X, y):
        """ One forward pass plus one backward pass; returns (mean loss, accuracy) """
        predictions = self._forward(X)
        # The loss object returns unreduced values; the mean is taken here
        batch_loss = np.mean(self.loss_function.loss(y, predictions))
        batch_acc = self.loss_function.acc(y, predictions)
        # Layers update their own parameters while the gradient flows back
        self._backward(loss_grad=self.loss_function.gradient(y, predictions))
        return batch_loss, batch_acc

    def test_on_batch(self, X, y):
        """ Forward pass only; returns (mean loss, accuracy) """
        predictions = self._forward(X)
        batch_loss = np.mean(self.loss_function.loss(y, predictions))
        batch_acc = self.loss_function.acc(y, predictions)
        return batch_loss, batch_acc

    def fit(self, X, y, n_epochs, batch_size):
        """ Train for n_epochs; returns the (training, validation) loss histories """
        for _ in tqdm(range(n_epochs)):
            batch_results = (
                self.train_on_batch(X_batch, y_batch)
                for X_batch, y_batch in Batcher(X, y, batch_size=batch_size)
            )
            epoch_losses = [loss for loss, acc in batch_results]

            # One training entry per epoch: the mean of the per-batch losses
            self.errors["training"].append(np.mean(epoch_losses))
            if self.val_set is not None:
                val_loss, _ = self.test_on_batch(self.val_set["X"], self.val_set["y"])
                self.errors["validation"].append(val_loss)
        return self.errors["training"], self.errors["validation"]

    def predict(self, X):
        """ Return the raw network output for X """
        return self._forward(X)

    def _forward(self, X):
        """ Pass X through every layer in order and return the final output """
        signal = X
        for layer in self.layers:
            signal = layer.forward(signal)
        return signal

    def _backward(self, loss_grad):
        """ Hand the gradient from the last layer back to the first """
        upstream = loss_grad
        for layer in self.layers[::-1]:
            upstream = layer.backward(upstream)

    def print_network(self):
        """ Print one row per layer (name, output shape, input shape) and the loss name """
        sep = '\t\t'
        print("***** Current network *****")
        print('layer', sep, 'output_shape', sep, 'Input_shape')
        for layer in self.layers:
            print(layer.layer_name, sep, layer.output_shape, sep, layer.input_shape)
        print('Loss function ', sep, self.loss_function.loss_name)
        print("***************************")

In [3]:
def batcher(X, y=None, batch_size=64):
    """ Batch generator.

    Walks the first axis of X in consecutive windows of at most `batch_size`
    samples; the last window is shorter when the sample count is not a
    multiple of `batch_size`.

    Parameters:
    X: array exposing `.shape[0]` and slice indexing (e.g. a numpy array)
    y: optional array sliced with the same windows as X
    batch_size: (int) maximum number of samples per batch, must be positive

    Yields:
    (X_batch, y_batch) when y is given, otherwise X_batch alone.

    Raises:
    ValueError: when iteration starts and batch_size is not positive
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive, got {}".format(batch_size))
    n_samples = X.shape[0]
    for start in range(0, n_samples, batch_size):
        end = min(start + batch_size, n_samples)
        if y is None:
            # y is optional per the signature; indexing None would raise TypeError
            yield X[start:end, ...]
        else:
            yield X[start:end, ...], y[start:end, ...]

class Batcher:
    """ Re-iterable mini-batch source.

    Every call to iter() starts a fresh pass over the data, so one instance
    can be looped over more than once.

    Parameters:
    X: array exposing `.shape[0]` and slice indexing (e.g. a numpy array)
    y: optional array sliced with the same windows as X
    batch_size: (int) maximum number of samples per batch, must be positive
    """
    def __init__(self, X, y=None, batch_size=64):
        self.X = X
        self.y = y
        self.batch_size = batch_size

    def __iter__(self):
        return self.batcher(self.X, self.y, self.batch_size)

    def batcher(self, X, y=None, batch_size=64):
        """ Batch generator.

        Yields (X_batch, y_batch) when y is given, otherwise X_batch alone.
        The last batch is shorter when the sample count is not a multiple of
        batch_size. Raises ValueError once iteration starts if batch_size is
        not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be positive, got {}".format(batch_size))
        n_samples = X.shape[0]
        for start in range(0, n_samples, batch_size):
            end = min(start + batch_size, n_samples)
            if y is None:
                # y is optional per the signature; indexing None would raise TypeError
                yield X[start:end, ...]
            else:
                yield X[start:end, ...], y[start:end, ...]
    

In [4]:
class Layer(object):
    """ Abstract base for network layers; subclasses supply forward and backward """

    def set_input_shape(self, shape):
        """ Store the shape this layer should expect as input """
        self.input_shape = shape

    def forward(self, X):
        """ Forward pass through the layer; not implemented on the base class """
        raise NotImplementedError

    def backward(self, XdA):
        """ Backward pass through the layer; not implemented on the base class """
        raise NotImplementedError

In [5]:
class Activation(object):
    """ Abstract activation function: callable for the forward value, plus a gradient.

    NOTE: the name `Activation` is rebound to a Layer subclass further down in
    this file; Sigmoid, Relu and Softmax inherit from this class because they
    are defined before that rebinding.
    """
    def __call__(self, Z):
        """ Forward evaluation; not implemented on the base class """
        raise NotImplementedError

    def gradient(self, A):
        """ Derivative of the activation; not implemented on the base class """
        raise NotImplementedError

In [6]:
class Loss(object):
    """ Abstract loss: subclasses supply the loss value, its gradient and an accuracy """

    def loss(self, y_true, y_pred):
        """ Loss value; not implemented on the base class """
        raise NotImplementedError

    def gradient(self, y, y_pred):
        """ Gradient of the loss with respect to y_pred; not implemented on the base class """
        raise NotImplementedError

    def acc(self, y, y_pred):
        """ Accuracy of y_pred against y; not implemented on the base class """
        raise NotImplementedError

## Activation functions

In [7]:
def sigmoid(Z):
    """ Element-wise logistic function 1 / (1 + exp(-Z)) """
    denominator = 1 + np.exp(-Z)
    return 1 / denominator
    

def relu(Z):
    """ Element-wise rectifier: keeps entries with Z >= 0 and replaces the rest by 0 """
    keep = Z >= 0
    return np.where(keep, Z, 0)
    
def softmax(Z):
    """ Row-wise softmax over axis 1 of a 2-D array of logits.

    The row maximum is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps exp() from overflowing to inf
    (and the ratio from becoming NaN) for large logits.

    Returns an array of the same shape as Z whose rows sum to 1.
    """
    shifted = Z - np.max(Z, axis=1, keepdims=True)
    e_Z = np.exp(shifted)
    return e_Z / np.sum(e_Z, axis=1, keepdims=True)

class Sigmoid(Activation):
    """
    Logistic activation: forward value and element-wise derivative.
    """
    def __call__(self, Z): # Forward
        return 1 / (1 + np.exp(-Z))

    def gradient(self, Z):
        """ Derivative s * (1 - s), where s is the sigmoid of the pre-activation Z """
        # Evaluate the sigmoid once and reuse it instead of recomputing exp()
        s = self(Z)
        return s * (1 - s)

class Relu(Activation):
    """
    Rectifier activation: forward value and element-wise derivative.
    """
    def __call__(self, Z):
        """ Keep entries with Z >= 0, replace the rest by 0 """
        keep = Z >= 0
        return np.where(keep, Z, 0)

    def gradient(self, Z):
        """ 1 where Z >= 0, else 0 """
        active = Z >= 0
        return np.where(active, 1, 0)


class Softmax(Activation):
    """
    Row-wise softmax activation: forward value and full Jacobian.
    """
    def __call__(self, Z):
        """ Softmax over axis 1 of a 2-D array of logits.

        The row maximum is subtracted first. The result is mathematically
        unchanged, but exp() can no longer overflow to inf and turn the
        ratio into NaN.
        """
        shifted = Z - np.max(Z, axis=1, keepdims=True)
        e_Z = np.exp(shifted)
        out = e_Z / np.sum(e_Z, axis=1, keepdims=True)
        assert Z.shape == out.shape
        return out

    def gradient(self, Z):
        """ Per-sample Jacobian of the softmax.

        For p = softmax(Z) with two axes, the result has three axes and
        grad[n, i, j] = p[n, i] * (delta_ij - p[n, j]).
        """
        p = self(Z)
        # Off-diagonal entries: -p_i * p_j
        grad = - p[:, :, np.newaxis] * p[:, np.newaxis, :]
        # Diagonal entries: p_i * (1 - p_i)
        diag = np.arange(p.shape[-1])
        grad[:, diag, diag] = p * (1 - p)

        return grad

## Dense Layers

In [8]:
class Dense(Layer):
    """
    Fully connected layer computing Z = A_prev @ W + b. The gradient-descent
    update of W and b happens inside backward().

    Parameters: 
    out_units: (int) number of output units
    input_shape: (tuple) shape of one input sample; only its first entry is
        used. May be left as None when the layer is added after another one
        through NeuralNetwork.add, which sets it.
    initializer: (string) 'normal' or 'ng'
        'normal': uniform in [-lim, lim] with lim = sqrt(6 / (fan_in + fan_out))
                  (uniform despite the name)
        'ng':     standard normal scaled by 1 / sqrt(fan_in)
    lr: (float) learning rate
    
    """
    def __init__(self, out_units, input_shape=None, initializer = 'normal', lr = 0.06):
        self.layer_name ='dense'
        self.input_shape = input_shape
        self.output_shape = (out_units,)
        self.initializer = initializer
        self.lr = lr
        self.layer_input = None
        self.W = None
        self.b = None
        # When False, backward() still propagates the gradient but skips the update
        self.train = True


    def get_output_shape(self):
        return self.output_shape

    def initialize(self):
        """ Create W with shape (fan_in, fan_out) and a zero bias of shape (1, fan_out).

        Raises:
        ValueError: if input_shape has not been set, or the initializer name
            is not one of 'normal' / 'ng'.
        """
        if self.input_shape is None:
            raise ValueError(
                "Dense layer has no input_shape: pass input_shape=... or add it after another layer")
        fan_in, fan_out = self.input_shape[0], self.output_shape[0]
        if self.initializer == 'normal':
            lim = np.sqrt(6) / math.sqrt(fan_in + fan_out)
            self.W = np.random.uniform(-lim, lim, (fan_in, fan_out))
        elif self.initializer == 'ng':
            self.W = np.random.randn(fan_in, fan_out) / np.sqrt(fan_in)
        else:
            # Previously W stayed None and the failure surfaced later as an
            # AttributeError on `None.shape`
            raise ValueError(
                "Unknown initializer {!r}; expected 'normal' or 'ng'".format(self.initializer))

        self.b = np.zeros(shape = (1, fan_out))
        assert self.W.shape == (self.input_shape[0], self.output_shape[0])
        assert self.b.shape == (1,self.output_shape[0])

    def forward(self, A_prev): 
        """ Return Z = A_prev @ W + b and cache A_prev for the backward pass """
        self.layer_input = A_prev
        self.Z = np.matmul(A_prev, self.W) + self.b
        assert self.Z.shape == (A_prev.shape[0], self.W.shape[1])
        return self.Z

    def backward(self, dZ):
        """ Update W and b (when self.train is set) and return dL/dA_prev = dZ @ W^T """
        # Keep a handle on the pre-update weights: the gradient handed to the
        # previous layer must use the W that produced the forward output.
        W = self.W
        A_prev = self.layer_input
        norm = A_prev.shape[0]
        if self.train:
            # dL/dW = A_prev^T @ dL/dZ, averaged over the batch
            dW = np.matmul(A_prev.T, dZ) / norm
            # dL/db = sum over the batch of dL/dZ
            # NOTE(review): unlike dW this is NOT divided by the batch size, so
            # the bias effectively trains with lr * batch_size. Left unchanged
            # because the division was commented out in the original - confirm
            # whether that was intentional.
            db = np.sum(dZ, axis = 0, keepdims = True)
            assert dW.shape == W.shape
            assert db.shape == self.b.shape
            self.W = self.W - self.lr * dW
            self.b = self.b - self.lr * db

        # dL/dA_prev = dL/dZ @ W^T
        return np.matmul(dZ, W.T)

In [9]:
class Flatten(Layer):
    """A layer that collapses every non-batch axis of its input into one.

    Parameters:
    -----------
    input_shape: tuple
        Shape of a single sample, batch axis excluded.
    """

    def __init__(self, input_shape = None):
        self.layer_name = 'flatten'
        self.input_shape = input_shape

    def initialize(self):
        # The flattened width is the product of all per-sample dimensions
        n_features = 1
        for dim in self.input_shape:
            n_features = n_features * dim
        self.output_shape = (n_features,)

    def get_output_shape(self):
        return self.output_shape

    def forward(self, Z):
        """ Reshape to (batch, flattened width) """
        return Z.reshape((Z.shape[0], self.output_shape[0]))

    def backward(self, dA):
        """ Restore the gradient to (batch,) + input_shape """
        return dA.reshape((dA.shape[0],) + self.input_shape)

## Activation layers

In [10]:
# Registry used by the Activation layer: layer name -> activation class
activation_functions = dict(sigmoid=Sigmoid, relu=Relu)

class Activation(Layer):
    """
    A layer that applies an element-wise activation function to its input.
        Parameters:
    -----------
    layer_name: (string) Key into `activation_functions`; also used as the layer's name

    """
    def __init__(self, layer_name):
        self.layer_name = layer_name
        self.input_shape = None
        activation_cls = activation_functions[layer_name]
        self.activation_func = activation_cls()

    def initialize(self):
        """ 
        The activation keeps the shape: output shape equals input shape
        """
        self.output_shape = self.input_shape

    def get_output_shape(self):
        return self.output_shape

    def forward(self, Z):
        """ Apply the activation and cache Z for the backward pass """
        self.layer_input = Z
        activated = self.activation_func(Z)
        assert Z.shape == activated.shape
        return activated

    def backward(self, dA):
        """ Chain rule: dZ = activation'(Z) * dA, element-wise """
        cached = self.layer_input
        local_grad = self.activation_func.gradient(cached)
        assert cached.shape == dA.shape
        grad_in = local_grad * dA
        assert grad_in.shape == cached.shape
        return grad_in

In [11]:
class Activation_SoftMax(Layer):
    """
    A layer that applies the softmax activation to its input.

    """
    def __init__(self, input_shape = None):
        self.layer_name = 'softmax'
        self.input_shape = input_shape
        self.activation_func = Softmax()

    def initialize(self):
        """ 
        The activation keeps the shape: output shape equals input shape
        """
        self.output_shape = self.input_shape

    def get_output_shape(self):
        return self.output_shape

    def forward(self, Z):
        """ Apply the softmax and cache Z for the backward pass """
        self.layer_input = Z
        probs = self.activation_func(Z)
        assert Z.shape == probs.shape
        return probs

    def backward(self, dA):
        """ Contract the per-sample Jacobian with dA over its last axis """
        cached = self.layer_input
        jacobian = self.activation_func.gradient(cached)
        assert cached.shape == dA.shape
        # dA gains a middle axis so it broadcasts against each Jacobian row
        weighted = np.multiply(jacobian, dA[:, np.newaxis, :])
        grad_in = weighted.sum(axis = 2)
        assert grad_in.shape == cached.shape
        return grad_in

In [12]:
class CrossEntropy(Loss):
    """ Element-wise binary cross-entropy on probabilities AL against targets y """
    def __init__(self):
        self.loss_name = 'CrossEntropy'

    def loss(self, y, AL):
        """ Unreduced loss, same shape as AL; the caller takes the mean """
        # Clip so that log() never sees 0
        AL = np.clip(AL, 1e-15, 1 - 1e-15)
        return (- y * np.log(AL) - (1 - y) * np.log(1 - AL))

    def acc(self, y, AL):
        """ Fraction of correct predictions over the whole batch.

        With several columns (one-hot targets) a sample counts as correct when
        the arg-max of AL matches the arg-max of y, as in the other loss
        classes in this file. With a single column or 1-D input, predictions
        are thresholded at 0.5.

        The previous implementation looked only at the first sample and
        compared probabilities to labels with exact equality.
        """
        y, AL = np.asarray(y), np.asarray(AL)
        if y.ndim > 1 and y.shape[1] > 1:
            return np.mean(np.argmax(AL, axis=1) == np.argmax(y, axis=1))
        return np.mean((AL >= 0.5) == (y >= 0.5))

    def gradient(self, y, AL):
        """ dL/dAL, element-wise and un-normalised """
        # Clip so that neither division hits 0
        AL = np.clip(AL, 1e-15, 1 - 1e-15)
        return (- (y / AL) + (1 - y) / (1 - AL))


class MultiClassCrossEntropy(Loss):
    """ Categorical cross-entropy on probabilities AL against one-hot targets y """
    def __init__(self):
        self.loss_name = 'MultiClassCrossEntropy'

    def loss(self, y, AL):
        """ Per-sample loss with shape (batch, 1); the caller takes the mean """
        # Bound AL away from 0: log(0) = -inf, and 0 * -inf would give NaN.
        # Only the lower bound is applied so that a prediction of exactly 1
        # still yields a loss of exactly 0.
        AL = np.clip(AL, 1e-15, None)
        return -np.sum(y * np.log(AL), axis= 1, keepdims = True)
    
    def acc(self, y_true, AL):
        """ Fraction of samples whose arg-max prediction matches the arg-max label """
        return np.mean(np.argmax(AL, axis=1) == np.argmax(y_true, axis=1))

    def gradient(self, y, AL):
        """ dL/dAL = -y / AL, un-normalised """
        # Avoid division by zero
        AL = np.clip(AL, 1e-15, 1 - 1e-15)
        return - (y / AL)

class SoftmaxCrossEntropy(Loss):
    """ Softmax and categorical cross-entropy fused; takes raw logits Z, not probabilities """
    def __init__(self):
        self.loss_name = 'SoftmaxCrossEntropy'

    def loss(self, y, Z):
        """ Per-sample loss with shape (batch,); the caller takes the mean """
        # Shift by the row maximum so exp() cannot overflow; the log-softmax
        # value is unaffected by the shift
        shifted = Z - np.max(Z, axis = 1, keepdims=True)
        log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
        return -np.sum(y * log_probs, axis= 1)

    def acc(self, y, Z):
        """ Fraction of samples whose arg-max logit matches the arg-max label """
        # Softmax preserves the ordering within a row, so the arg-max of the
        # logits is the predicted class; this also avoids exp() overflow
        return np.mean(np.argmax(Z, axis=1) == np.argmax(y, axis=1))

    def gradient(self, y, Z):
        """ dL/dZ = softmax(Z) - y, un-normalised """
        # Work on a shifted copy: `Z -= ...` would modify the caller's array
        # in place (the network passes the last layer's output here)
        shifted = Z - np.max(Z, axis = 1, keepdims=True)
        e_Z = np.exp(shifted)
        p = e_Z / np.sum(e_Z, axis=1, keepdims=True)
        return (-y + p)

In [13]:
# Loading data

def vectorized_result(y, n_classes=10):
    """ One-hot encode a class index as a column vector.

    Parameters:
    y: (int) class index
    n_classes: (int) length of the encoding; defaults to 10

    Returns an (n_classes, 1) float array that is 1.0 at row y and 0 elsewhere.
    """
    e = np.zeros((n_classes, 1))
    e[y] = 1.0
    return e

from tensorflow import keras

def load_dataset(flatten=False, unsqueeze = False):
    """ Load MNIST through keras, scale pixels to [0, 1] and one-hot encode the labels.

    The last 10000 training examples are held out as the validation split.

    Parameters:
    flatten: (bool) reshape every image to a single feature axis
    unsqueeze: (bool) insert a new axis right after the sample axis
        flatten and unsqueeze cannot be combined.

    Returns:
    X_train, y_train, X_val, y_val, X_test, y_test

    Raises:
    ValueError: if both flatten and unsqueeze are requested
    """
    if flatten and unsqueeze:
        # Previously this surfaced as an IndexError after the data was loaded
        raise ValueError("flatten and unsqueeze cannot both be True")

    (X_train, y_train), (X_test, y_test) = keras.datasets.mnist.load_data()

    # normalize x
    X_train = X_train.astype(float) / 255.
    X_test = X_test.astype(float) / 255.

    # One-hot encode: row k of the identity matrix is the encoding of class k
    n_classes = 10
    y_train = np.eye(n_classes)[y_train]
    y_test = np.eye(n_classes)[y_test]

    # we reserve the last 10000 training examples for validation
    X_train, X_val = X_train[:-10000,...], X_train[-10000:,...]
    y_train, y_val = y_train[:-10000,...], y_train[-10000:,...]

    if flatten:
        # Each split is reshaped using its OWN sample count (the validation and
        # test splits previously used each other's)
        X_train = X_train.reshape((X_train.shape[0], -1))
        X_val = X_val.reshape((X_val.shape[0], -1))
        X_test = X_test.reshape((X_test.shape[0], -1))

    if unsqueeze:
        X_train = X_train[:,np.newaxis, :,:]
        X_val = X_val[:,np.newaxis, :,:]
        X_test = X_test[:,np.newaxis, :,:]

    return X_train, y_train, X_val, y_val, X_test, y_test

# Keep the images 2-D and add an axis after the sample axis
train_x, train_y, val_x, val_y, test_x, test_y = load_dataset(flatten=False, unsqueeze=True)

In [14]:
# Building network: Flatten -> Dense(100) -> relu -> Dense(200) -> relu -> Dense(10) -> softmax,
# trained with MultiClassCrossEntropy on the softmax probabilities

print('MultiClassCrossEntropy')
md=NeuralNetwork(MultiClassCrossEntropy)
# Seed numpy's global RNG before any Dense layer is added: Dense.initialize draws
# its weights from np.random, so this makes the initial weights repeatable
np.random.seed(1)

n_x = 28*28    # flattened image size; not referenced again in this cell
lr = 0.05    # learning rate passed to every Dense layer below
md.add(Flatten(input_shape = (1, 28, 28)))
md.add(Dense(100, initializer = 'normal', lr = lr))
md.add(Activation('relu'))
md.add(Dense(200, initializer = 'normal', lr = lr))
md.add(Activation('relu'))
md.add(Dense(10, initializer = 'normal', lr = lr))
md.add(Activation_SoftMax())

md.print_network()

# Train; fit returns the (training, validation) per-epoch loss histories
hist = md.fit(train_x, train_y, n_epochs=5, batch_size=32)


def accuracy(test_x, test_y):
    """ Fraction of samples whose arg-max prediction matches the arg-max label.

    Uses the global model `md`.
    """
    predicted = np.argmax(md.predict(test_x), axis=1)
    expected = np.argmax(test_y, axis=1)
    return np.mean(predicted == expected)

## Evaluation
print('Training accuarecy: {}'.format(accuracy(train_x, train_y)))
print('Test accuarecy: {}'.format(accuracy(test_x, test_y)))
print('Training loss: {}'.format(hist[0][-1]))

MultiClassCrossEntropy
***** Current network *****
layer 		 output_shape 		 Input_shape
flatten 		 (784,) 		 (1, 28, 28)
dense 		 (100,) 		 (784,)
relu 		 (100,) 		 (100,)
dense 		 (200,) 		 (100,)
relu 		 (200,) 		 (200,)
dense 		 (10,) 		 (200,)
softmax 		 (10,) 		 (10,)
Loss function  		 MultiClassCrossEntropy
***************************


100%|███████████████████████████████████████████████| 5/5 [00:10<00:00,  2.13s/it]


Training accuarecy: 0.96326
Test accuarecy: 0.9543
Training loss: 0.08606785005410234


In [15]:
# Same architecture as a softmax classifier, but without a softmax layer: the
# network outputs raw logits and SoftmaxCrossEntropy applies the softmax itself
print('SoftmaxCrossEntropy')
md=NeuralNetwork(SoftmaxCrossEntropy)
# Seed numpy's global RNG before any Dense layer is added: Dense.initialize draws
# its weights from np.random, so this makes the initial weights repeatable
np.random.seed(1)
lr = 0.05  # learning rate passed to every Dense layer below
n_x = 784    # flattened image size; not referenced again in this cell
# NOTE(review): input_shape is given as (28, 28, 1), while the data was loaded
# with unsqueeze=True, which inserts the new axis right after the sample axis.
# The flattened width (784) is the same either way - confirm the intended layout.
md.add(Flatten(input_shape = (28, 28, 1,)))
md.add(Dense(100, initializer = 'normal', lr = lr))
md.add(Activation('relu'))
md.add(Dense(200, initializer = 'normal', lr = lr))
md.add(Activation('relu'))
md.add(Dense(10, initializer = 'normal', lr = lr))

md.print_network()

# Train; fit returns the (training, validation) per-epoch loss histories
hist = md.fit(train_x, train_y, n_epochs=5, batch_size=32)

def accuracy(test_x, test_y):
    """ Fraction of samples whose arg-max output matches the arg-max label.

    Uses the global model `md`.
    """
    predicted = np.argmax(md.predict(test_x), axis=1)
    expected = np.argmax(test_y, axis=1)
    return np.mean(predicted == expected)

## Evaluation
print('Training accuarecy: {}'.format(accuracy(train_x, train_y)))
print('Test accuarecy: {}'.format(accuracy(test_x, test_y)))
print('Training loss: {}'.format(hist[0][-1]))

SoftmaxCrossEntropy
***** Current network *****
layer 		 output_shape 		 Input_shape
flatten 		 (784,) 		 (28, 28, 1)
dense 		 (100,) 		 (784,)
relu 		 (100,) 		 (100,)
dense 		 (200,) 		 (100,)
relu 		 (200,) 		 (200,)
dense 		 (10,) 		 (200,)
Loss function  		 SoftmaxCrossEntropy
***************************


100%|███████████████████████████████████████████████| 5/5 [00:12<00:00,  2.46s/it]


Training accuarecy: 0.96326
Test accuarecy: 0.9543
Training loss: 0.08606785005410234


In [18]:
# Same hidden layers, but with a sigmoid output layer trained with the
# element-wise (binary) CrossEntropy loss on the one-hot targets
print('CrossEntropy')
md=NeuralNetwork(CrossEntropy)
# Seed numpy's global RNG before any Dense layer is added: Dense.initialize draws
# its weights from np.random, so this makes the initial weights repeatable
np.random.seed(1)

n_x = 28*28    # flattened image size; not referenced again in this cell
lr = 0.05    # learning rate passed to every Dense layer below
md.add(Flatten(input_shape = (1, 28, 28)))
md.add(Dense(100, initializer = 'normal', lr = lr))
md.add(Activation('relu'))
md.add(Dense(200, initializer = 'normal', lr = lr))
md.add(Activation('relu'))
md.add(Dense(10, initializer = 'normal', lr = lr))
md.add(Activation('sigmoid'))

md.print_network()

# Train; fit returns the (training, validation) per-epoch loss histories
hist = md.fit(train_x, train_y, n_epochs=5, batch_size=32)


def accuracy(test_x, test_y):
    """ Fraction of samples whose arg-max output matches the arg-max label.

    Uses the global model `md`.
    """
    predicted = np.argmax(md.predict(test_x), axis=1)
    expected = np.argmax(test_y, axis=1)
    return np.mean(predicted == expected)

## Evaluation
print('Training accuarecy: {}'.format(accuracy(train_x, train_y)))
print('Test accuarecy: {}'.format(accuracy(test_x, test_y)))
print('Training loss: {}'.format(hist[0][-1]))

CrossEntropy
***** Current network *****
layer 		 output_shape 		 Input_shape
flatten 		 (784,) 		 (1, 28, 28)
dense 		 (100,) 		 (784,)
relu 		 (100,) 		 (100,)
dense 		 (200,) 		 (100,)
relu 		 (200,) 		 (200,)
dense 		 (10,) 		 (200,)
sigmoid 		 (10,) 		 (10,)
Loss function  		 CrossEntropy
***************************


100%|███████████████████████████████████████████████| 5/5 [00:11<00:00,  2.30s/it]


Training accuarecy: 0.97302
Test accuarecy: 0.9635
Training loss: 0.01668239152653071
