# COMP 551 - Mini-project 3
Group 63

### Load the data and flatten each image into a vector so it can be fed to the MLP

In [2]:
#import libraries
import tensorflow as tf
import numpy as np

# Load MNIST and keep the pixel values exactly as returned (no normalization).
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Vectorization: keep the sample axis and collapse every remaining axis of each
# image into a single row.
num_train = x_train.shape[0]
num_test = x_test.shape[0]
x_train = x_train.reshape(num_train, -1)
x_test = x_test.reshape(num_test, -1)

### Build the network

In [3]:
# L2 regularisation
def compute_cost_with_regularization(A3, Y, parameters, lambd):
    """Return the cross-entropy cost plus an L2 penalty on the weights.

    Args:
        A3: Output-layer probabilities; forwarded unchanged to
            ``cross_entropy_loss`` as ``y_pred``.
        Y: True labels; forwarded unchanged to ``cross_entropy_loss`` as
            ``y_true``. Its first axis is taken as the number of examples.
        parameters: Mapping that holds the weight matrices under the keys
            ``"W1"``, ``"W2"`` and ``"W3"``.
        lambd: L2 regularisation strength.

    Returns:
        ``cross_entropy + lambd * (sum of squared weights) / (2 * m)``.
    """
    # The number of examples is read from axis 0, the same axis that
    # cross_entropy_loss uses as its batch size, so 1-D label vectors work.
    m = Y.shape[0]

    # Cross-entropy part of the cost.
    cross_entropy_cost = cross_entropy_loss(A3, Y)

    # L2 part of the cost: sum of squared entries of every weight matrix.
    squared_weights = sum(
        np.sum(np.square(parameters[key])) for key in ("W1", "W2", "W3")
    )
    L2_regularization_cost = lambd * squared_weights / (2 * m)

    return cross_entropy_cost + L2_regularization_cost

In [4]:
# Define the activation-function factories, the softmax function and the loss function

def relu():
    """Create and return a new ``Relu`` activation layer."""
    layer = Relu()
    return layer


def sigmoid():
    """Create and return a new ``Sigmoid`` activation layer."""
    layer = Sigmoid()
    return layer


def tanh():
    """Create and return a new ``Tanh`` activation layer."""
    layer = Tanh()
    return layer


def softmax(x):
    """Apply the softmax function along axis 1 of ``x``.

    Args:
        x: Array of scores; each slice along axis 1 is normalised
            independently.

    Returns:
        Array of the same shape as ``x`` whose entries along axis 1 sum to 1.
    """
    # Shift by the per-row maximum before exponentiating so the largest
    # exponent is exp(0).
    row_max = np.max(x, axis=1, keepdims=True)
    exponentials = np.exp(x - row_max)
    row_total = np.sum(exponentials, axis=1, keepdims=True)
    return exponentials / row_total


def cross_entropy_loss(y_pred, y_true):
    """Return the mean cross-entropy of ``y_pred`` against integer labels.

    Args:
        y_pred: Array of predicted probabilities, indexed as
            ``[sample, class]``.
        y_true: 1-D integer array holding the correct class index of each
            sample.

    Returns:
        The average over the batch of ``-log(p + 1e-7)``, where ``p`` is the
        probability predicted for each sample's correct class.
    """
    batch = y_true.shape[0]
    # Pair every row with its own label. Indexing with ``[:, y_true]`` would
    # instead pick the columns in ``y_true`` for every row, giving a
    # (batch, batch) matrix.
    true_class_prob = y_pred[np.arange(batch), y_true]
    # The small constant keeps log() finite when a probability is exactly 0.
    return -np.sum(np.log(true_class_prob + 1e-7)) / batch


In [5]:
class Relu:
    """ReLU activation layer with a forward pass and its backward pass."""

    def forward(self, x):
        """Return a copy of ``x`` with every entry ``<= 0`` set to 0.

        The input and the mask of zeroed positions are stored on the
        instance for use by ``backward``.
        """
        self.x = x
        self.mask = x <= 0

        # Work on a copy so the caller's array is left untouched.
        out = x.copy()
        out[self.mask] = 0

        return out

    def backward(self, d):
        """Return the upstream gradient ``d`` with masked positions zeroed.

        The gradient is zero wherever the forward input was ``<= 0``.
        ``d`` itself is not modified; a new array is returned.
        """
        # Copy first, mirroring forward(), so the caller's gradient array is
        # not changed in place.
        grad = d.copy()
        grad[self.mask] = 0
        return grad


In [6]:
class Affine:
    """Affine layer computing ``x . w + b`` together with its gradients."""

    def __init__(self, w, b):
        """Keep references to the weight array ``w`` and bias array ``b``."""
        self.w, self.b = w, b

    def forward(self, x):
        """Store ``x`` and return ``np.dot(x, w) + b`` (also kept as ``self.y``)."""
        self.x = x
        output = np.dot(x, self.w) + self.b
        self.y = output
        return output

    def backward(self, d):
        """Back-propagate the upstream gradient ``d``.

        Stores the weight gradient in ``self.dw`` and the bias gradient
        (``d`` summed over axis 0) in ``self.db``; returns the gradient with
        respect to the layer input.
        """
        grad_input = np.dot(d, self.w.T)
        inputs_t = self.x.T
        self.dw = np.dot(inputs_t, d)
        self.db = np.sum(d, axis=0)
        return grad_input


class SoftmaxLoss:
    """Output layer that combines ``softmax`` with ``cross_entropy_loss``."""

    def __init__(self):
        """No state is needed until ``forward`` is called."""

    def forward(self, x, y_true):
        """Store the labels and softmax probabilities, then return the loss.

        Args:
            x: Scores passed to ``softmax``.
            y_true: Integer class labels, one per sample.

        Returns:
            The value of ``cross_entropy_loss`` (also stored as ``self.loss``).
        """
        self.y_true = y_true
        self.y_pred = softmax(x)
        loss = cross_entropy_loss(self.y_pred, self.y_true)
        self.loss = loss
        return loss

    def backward(self):
        """Return ``(y_pred - one_hot(y_true)) / batch`` from the last forward pass."""
        n_samples = self.y_true.shape[0]
        n_classes = self.y_pred.shape[1]

        # One-hot encode the labels: a 1 in each row at that row's label.
        targets = np.zeros((n_samples, n_classes))
        targets[np.arange(n_samples), self.y_true] = 1

        difference = self.y_pred - targets
        return difference / n_samples


In [7]:
class Sigmoid:
    """Sigmoid activation layer with a forward pass and its backward pass."""

    def forward(self, x):
        """Return ``1 / (1 + exp(-x))`` element-wise and cache it as ``self.y``.

        The value is computed from ``exp(-|x|)``, which always lies in
        (0, 1], so large negative inputs do not overflow ``exp``.
        """
        self.x = x
        z = np.asarray(x)
        if z.dtype.kind != "f":
            # Non-float inputs are promoted so negation and abs() are exact.
            z = z.astype(np.float64)

        decay = np.exp(-np.abs(z))
        # For z >= 0: 1 / (1 + e^-z).  For z < 0: e^z / (1 + e^z).
        # Both are the same function, written so that only exp(-|z|) is needed.
        self.y = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
        return self.y

    def backward(self, d):
        """Return ``d * y * (1 - y)`` using the output cached by ``forward``."""
        return d * (1.0 - self.y) * self.y


class Tanh:
    """Tanh activation layer with a forward pass and its backward pass."""

    def forward(self, x):
        """Return ``np.tanh(x)``; the input and output are cached on the instance."""
        self.x = x
        # Cache the output so backward() does not have to recompute tanh.
        self.y = np.tanh(x)
        return self.y

    def backward(self, d):
        """Return ``d * (1 - tanh(x)^2)`` using the output cached by ``forward``."""
        return d * (1 - self.y ** 2)


### Build the MLP architecture

In [8]:
# Define the multilayer perceptron class used to fit the model and predict image labels

class MLP:
    """Multilayer perceptron trained with mini-batch gradient descent.

    The network is a chain of ``Affine`` layers, each hidden one followed by
    an activation layer, and ends in a ``SoftmaxLoss`` output layer.
    """

    def __init__(self, input_layer=784, hidden_layer=None, output_layer=10, max_episode=50000,
                 active_func=relu, batch_size=200, learing_rate=0.1):
        """Build the layers and draw the initial weights.

        Args:
            input_layer: Number of input features (784 pixels by default).
            hidden_layer: Sequence with the width of each hidden layer;
                ``None`` or an empty sequence means no hidden layer.
            output_layer: Number of output classes (10 by default, for the
                digits 0-9).
            max_episode: Number of mini-batch updates performed by ``fit``.
            active_func: Zero-argument callable returning a new activation
                layer that has ``forward`` and ``backward`` methods.
            batch_size: Number of samples drawn for each update.
            learing_rate: Step size of each update (the spelling is kept
                because callers pass it by keyword).
        """
        # None stands in for "no hidden layers" to avoid a mutable default.
        hidden_layer = [] if hidden_layer is None else hidden_layer

        size = input_layer
        self.layers = []        # parameter dicts {'w', 'b'}, one per Affine layer
        self.cal_layer = []     # layers in forward order (Affine and activations)

        # NOTE(review): weights and biases come from an unscaled standard
        # normal. With raw, unnormalized inputs this presumably gives very
        # large pre-activations; consider a scaled initialisation.
        for hidden in hidden_layer:
            w, b = np.random.randn(size, hidden), np.random.randn(1, hidden)        # w, b refer to "weight" and "bias"
            self.layers.append({'w': w, 'b': b})
            self.cal_layer.append(Affine(w, b))
            self.cal_layer.append(active_func())
            size = hidden

        w, b = np.random.randn(size, output_layer), np.random.randn(1, output_layer)
        self.cal_layer.append(Affine(w, b))
        self.layers.append({'w': w, 'b': b})
        self.active_func = active_func
        self.max_episode = max_episode
        self.batch_size = batch_size
        self.lr = learing_rate

        self.out_layer = SoftmaxLoss()

    def fit(self, X, y):
        """Train the network on ``X`` (one sample per row) and labels ``y``.

        Runs ``max_episode`` updates. Each one draws ``batch_size`` row
        indices with ``np.random.choice``, does a forward and a backward
        pass, and applies one gradient step to every weight and bias.
        """
        # Only Affine layers hold parameters. They appear in the same order
        # as self.layers, so the two sequences can be walked together.
        affine_layers = [cal for cal in self.cal_layer if isinstance(cal, Affine)]

        for _ in range(self.max_episode):
            # Sample a mini-batch of row indices.
            batch = np.random.choice(X.shape[0], self.batch_size)
            x, y_true = X[batch], y[batch]

            # Forward pass.
            for cal in self.cal_layer:
                x = cal.forward(x)
            self.out_layer.forward(x, y_true)

            # Backward pass: each layer stores its own gradients.
            d = self.out_layer.backward()
            for cal in reversed(self.cal_layer):
                d = cal.backward(d)

            # Gradient step. The arrays in self.layers are the same objects
            # the Affine layers hold, so the in-place update reaches both.
            for layer, affine in zip(self.layers, affine_layers):
                layer['w'] -= self.lr * affine.dw
                layer['b'] -= self.lr * affine.db

    def predict(self, X):
        """Return the predicted class index for every row of ``X``."""
        x = X
        for cal in self.cal_layer:
            x = cal.forward(x)

        return np.argmax(softmax(x), axis=1)

    def evaluate(self, y_true, y_predict):
        """Return the fraction of entries where ``y_true`` equals ``y_predict``."""
        return np.mean(np.asarray(y_true) == np.asarray(y_predict))

### Define the hyperparameter grid and run the grid search

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
# Hyperparameter grid for this run: only the learning rate varies.
# Alternative for "hidden_layer" kept from the original cell:
#   [[], [128], [128, 128]]
parameters = dict(
    hidden_layer=[[]],
    max_episode=[10000],
    batch_size=[100],
    learing_rate=[0.01, 0.1, 1, 2, 5],
)

# Grid Search

from  copy import deepcopy
def generate_all(param_grid=None):
    """Return every combination of the values in a hyperparameter grid.

    Args:
        param_grid: Mapping from parameter name to a list of candidate
            values. When omitted, the module-level ``parameters`` grid is
            used.

    Returns:
        A list of dicts, one per combination, with keys in the grid's own
        order. The first key varies fastest. Every value is deep-copied, so
        the dicts share no mutable object with the grid or with each other.
        An empty grid yields ``[{}]``.
    """
    from itertools import product

    grid = parameters if param_grid is None else param_grid
    keys = list(grid)
    candidates = [grid[key] for key in keys]

    # product() varies its LAST argument fastest. Feeding it the value lists
    # in reverse and flipping each tuple back makes the FIRST key vary
    # fastest instead.
    return [
        {key: deepcopy(value) for key, value in zip(keys, reversed(combo))}
        for combo in product(*reversed(candidates))
    ]

# Best candidate seen so far: the trained model, its settings, its accuracy.
best = best_p = None
best_acc = 0

# Train one model per parameter combination and score it on the test split.
# NOTE(review): candidates are ranked by test-set accuracy; a separate
# validation split would keep the test set out of model selection.
for p in generate_all():
    mlp = MLP(**p)
    mlp.fit(x_train, y_train)
    acc = mlp.evaluate(y_test, mlp.predict(x_test))
    if acc > best_acc:
        best, best_p, best_acc = mlp, p, acc
    print("Acc ", acc, " with parameter: ", p)

print("Best acc: ", best_acc)
print("Best mlp parameters: ", best_p)

Acc  0.9009  with parameter:  {'hidden_layer': [], 'max_episode': 10000, 'batch_size': 100, 'learing_rate': 0.01}
Acc  0.8744  with parameter:  {'hidden_layer': [], 'max_episode': 10000, 'batch_size': 100, 'learing_rate': 0.1}
Acc  0.8976  with parameter:  {'hidden_layer': [], 'max_episode': 10000, 'batch_size': 100, 'learing_rate': 1}
Acc  0.8304  with parameter:  {'hidden_layer': [], 'max_episode': 10000, 'batch_size': 100, 'learing_rate': 2}
Acc  0.8926  with parameter:  {'hidden_layer': [], 'max_episode': 10000, 'batch_size': 100, 'learing_rate': 5}
Best acc:  0.9009
Best mlp parameters:  {'hidden_layer': [], 'max_episode': 10000, 'batch_size': 100, 'learing_rate': 0.01}


In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
# Hyperparameter grid for this run: only the hidden-layer layout varies.
# Alternatives kept from the original cell:
#   "hidden_layer": [[]]
#   "learing_rate": [0.1, 1, 2, 5]
parameters = dict(
    hidden_layer=[[], [128], [128, 128]],
    max_episode=[10000],
    batch_size=[100],
    learing_rate=[0.01],
)

# Grid Search

from  copy import deepcopy
def generate_all(param_grid=None):
    """Return every combination of the values in a hyperparameter grid.

    Args:
        param_grid: Mapping from parameter name to a list of candidate
            values. When omitted, the module-level ``parameters`` grid is
            used.

    Returns:
        A list of dicts, one per combination, with keys in the grid's own
        order. The first key varies fastest. Every value is deep-copied, so
        the dicts share no mutable object with the grid or with each other.
        An empty grid yields ``[{}]``.
    """
    from itertools import product

    grid = parameters if param_grid is None else param_grid
    keys = list(grid)
    candidates = [grid[key] for key in keys]

    # product() varies its LAST argument fastest. Feeding it the value lists
    # in reverse and flipping each tuple back makes the FIRST key vary
    # fastest instead.
    return [
        {key: deepcopy(value) for key, value in zip(keys, reversed(combo))}
        for combo in product(*reversed(candidates))
    ]

# Best candidate seen so far: the trained model, its settings, its accuracy.
best = best_p = None
best_acc = 0

# Train one model per parameter combination and score it on the test split.
# NOTE(review): candidates are ranked by test-set accuracy; a separate
# validation split would keep the test set out of model selection.
for p in generate_all():
    mlp = MLP(**p)
    mlp.fit(x_train, y_train)
    acc = mlp.evaluate(y_test, mlp.predict(x_test))
    if acc > best_acc:
        best, best_p, best_acc = mlp, p, acc
    print("Acc ", acc, " with parameter: ", p)

print("Best acc: ", best_acc)
print("Best mlp parameters: ", best_p)

Acc  0.8914  with parameter:  {'hidden_layer': [], 'max_episode': 10000, 'batch_size': 100, 'learing_rate': 0.01}
Acc  0.115  with parameter:  {'hidden_layer': [128], 'max_episode': 10000, 'batch_size': 100, 'learing_rate': 0.01}
Acc  0.1135  with parameter:  {'hidden_layer': [128, 128], 'max_episode': 10000, 'batch_size': 100, 'learing_rate': 0.01}
Best acc:  0.8914
Best mlp parameters:  {'hidden_layer': [], 'max_episode': 10000, 'batch_size': 100, 'learing_rate': 0.01}


In [None]:
"""2 -> Acc  0.6848  with parameter:  {'hidden_layer': [], 'max_episode': 10000, 'batch_size': 100, 'learing_rate': 2}
Acc  0.0982  with parameter:  {'hidden_layer': [128], 'max_episode': 10000, 'batch_size': 100, 'learing_rate': 2}
Acc  0.0983  with parameter:  {'hidden_layer': [128, 128], 'max_episode': 10000, 'batch_size': 100, 'learing_rate': 2}
Best acc:  0.6848
Best mlp parameters:  {'hidden_layer': [], 'max_episode': 10000, 'batch_size': 100, 'learing_rate': 2}"""

### Test the models

In [12]:
def test(mlp):
    """Train ``mlp`` on the training split and print its accuracy on both splits.

    Uses the module-level ``x_train``/``y_train`` and ``x_test``/``y_test``
    arrays. The test-set accuracy is printed first, then the training-set
    accuracy, then a blank line.
    """
    mlp.fit(x_train, y_train)

    test_accuracy = mlp.evaluate(y_test, mlp.predict(x_test))
    print("after train, on test set:", test_accuracy)

    train_accuracy = mlp.evaluate(y_train, mlp.predict(x_train))
    print("after train, on train set:", train_accuracy)

    print()

With no hidden layer, a learning rate of 0.01 gives good accuracy

In [13]:
test(MLP(hidden_layer=[], max_episode=20000, learing_rate=0.01))

after train, on test set: 0.9144
after train, on train set: 0.9239666666666667



One hidden layer with the ReLU activation function and a learning rate of 0.01 does not perform very well

In [14]:
test(MLP(hidden_layer=[128], max_episode=20000, learing_rate=0.01))

after train, on test set: 0.1632
after train, on train set: 0.17436666666666667



Two hidden layers with the ReLU activation function, tried with different learning rates, also do not perform very well

In [15]:
test(MLP(hidden_layer=[128, 128], max_episode=20000, learing_rate=0.01))

after train, on test set: 0.1135
after train, on train set: 0.11236666666666667



In [16]:
test(MLP(hidden_layer=[128, 64], max_episode=20000, learing_rate=0.01))

after train, on test set: 0.1135
after train, on train set: 0.11236666666666667



In [18]:
test(MLP(hidden_layer=[128, 128], max_episode=100000, learing_rate=0.01))

after train, on test set: 0.1137
after train, on train set: 0.11245



In [20]:
test(MLP(hidden_layer=[128, 128], max_episode=100000, learing_rate=0.0001))

after train, on test set: 0.2185
after train, on train set: 0.22263333333333332



#### Lowering the learning rate considerably (to 0.000001) gives good accuracy with the ReLU activation function

In [21]:
test(MLP(hidden_layer=[128, 128], max_episode=100000, learing_rate=0.000001))

after train, on test set: 0.9039
after train, on train set: 0.9131333333333334



#### A learning rate of 0.1 gives good accuracy for two hidden layers with the sigmoid activation function

In [19]:
test(MLP(hidden_layer=[128, 128], max_episode=100000, learing_rate=0.1, active_func = sigmoid))

  


after train, on test set: 0.9425
after train, on train set: 0.9558833333333333



#### A learning rate of 0.1 gives good accuracy for two hidden layers with the tanh activation function

In [22]:
test(MLP(hidden_layer=[128, 128], max_episode=100000, learing_rate=0.1, active_func = tanh))

after train, on test set: 0.8913
after train, on train set: 0.89425

