In [None]:
import numpy as np
import matplotlib.pyplot as plt

#Activation functions
def tanh(x):
    """Hyperbolic tangent activation, applied element-wise through ``np.tanh``."""
    activated = np.tanh(x)
    return activated
def d_tanh(x):
    """Derivative of tanh evaluated element-wise at ``x``: ``1 - tanh(x)**2``."""
    t = np.tanh(x)
    return 1 - np.square(t)

def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, applied element-wise.

    The exponential is only ever taken of a non-positive number, so inputs
    with a large negative value no longer overflow ``np.exp`` (the naive
    formula computes ``exp(+large)`` there and emits a RuntimeWarning).

    x: scalar or array of pre-activations.
    Returns the activations with the same shape as ``x``.
    """
    x = np.asarray(x)
    e = np.exp(-np.abs(x))                                              #In (0, 1], never overflows
    #x >= 0: 1/(1+exp(-x));  x < 0: exp(x)/(1+exp(x)) -- the same function, rewritten
    out = np.where(x >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    return out[()]                                                      #0-d result -> scalar, arrays are returned unchanged
def d_sigmoid(x):
    """Derivative of the sigmoid at ``x``: ``(1 - s) * s`` with ``s = sigmoid(x)``."""
    s = sigmoid(x)
    return (1 - s) * s

def leakyReLU(x, alpha=0.01):
    """Leaky ReLU: element-wise maximum of ``alpha*x`` and ``x``.

    alpha: slope factor applied to ``x`` for the first operand (default 0.01).
    """
    scaled = alpha * x
    return np.maximum(scaled, x)
def d_leakyReLU(x, alpha=0.01):
    """Derivative of the leaky ReLU, element-wise.

    Returns ``alpha`` where ``x < 0``, ``1`` where ``x > 0`` and ``0`` where
    ``x == 0`` (the same convention d_ReLU uses at the kink).

    The previous ``np.maximum(alpha*sign, sign)`` returned ``-alpha`` for
    negative inputs, i.e. a slope with the wrong sign, which pushes the
    gradient of every negative pre-activation in the wrong direction.
    """
    sign = np.sign(x)                                                   #-1, 0 or +1 (NaN stays NaN)
    slope = np.where(sign < 0, alpha, sign)                             #Only the negative side takes the leak slope
    return slope[()]                                                    #0-d result -> scalar, arrays are returned unchanged

def ReLU(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    floor = 0
    return np.maximum(floor, x)
def d_ReLU(x):
    """Derivative of ReLU, element-wise: 1 where ``x > 0``, otherwise 0."""
    return np.maximum(0, np.sign(x))

#Loss function (log loss / binary cross-entropy) and its derivative with respect to the prediction "a"
def logloss(y, a, eps=1e-15):
    """Element-wise log loss (binary cross-entropy) between targets and predictions.

    y:   targets (0 or 1), scalar or array.
    a:   predicted probabilities, same shape as ``y``.
    eps: predictions are clipped to ``[eps, 1 - eps]`` before taking logs, so
         a prediction that has saturated to exactly 0 or 1 yields a large but
         finite loss instead of inf/NaN from ``log(0)``.

    Returns ``-(y*log(a) + (1-y)*log(1-a))`` per element (not averaged).
    """
    a = np.clip(np.asarray(a, dtype=float), eps, 1 - eps)
    return -(y*np.log(a) + (1-y)*np.log(1-a))
def d_logloss(y, a, eps=1e-15):
    """Derivative of the log loss with respect to the prediction ``a``, element-wise.

    y:   targets (0 or 1), scalar or array.
    a:   predicted probabilities, same shape as ``y``.
    eps: predictions are clipped to ``[eps, 1 - eps]`` so the denominator
         ``a*(1-a)`` can never be zero. Without the clip a saturated
         prediction gives ``0/0`` or ``x/0`` and the resulting NaN/inf
         propagates into every weight during backpropagation.

    Returns ``(a - y) / (a * (1 - a))`` per element.
    """
    a = np.clip(np.asarray(a, dtype=float), eps, 1 - eps)
    return (a - y)/(a*(1 - a))

class Layer:
    """Fully connected layer holding its weights, biases and Adam moment estimates.

    Batches are stored column-wise: ``feedforward`` takes an array of shape
    (inputs, batch) and returns one of shape (neurons, batch). Each backward
    pass uses the values cached by the most recent ``feedforward`` call and
    updates the parameters in place.
    """

    #Activation name -> (function, derivative)
    activationFunctions = {
        'tanh': (tanh, d_tanh),
        'sigmoid': (sigmoid, d_sigmoid),
        'ReLU': (ReLU, d_ReLU),
        'leakyReLU': (leakyReLU, d_leakyReLU)
    }

    def __init__(self, inputs, neurons, activationF):
        """Create the layer.

        inputs:      number of inputs per sample.
        neurons:     number of neurons (outputs) of the layer.
        activationF: key of ``activationFunctions``.

        Raises ValueError for an unknown activation name (previously this
        surfaced as an opaque "cannot unpack NoneType" TypeError).
        """
        if activationF not in self.activationFunctions:
            raise ValueError(
                "Unknown activation function %r; expected one of %s"
                % (activationF, sorted(self.activationFunctions)))

        self.w = np.random.randn(neurons, inputs) * np.sqrt(2./inputs)  #Initial weights, scaled by sqrt(2/inputs)
        self.b = np.zeros((neurons, 1))                                 #Initial biases
        self.act, self.d_act = self.activationFunctions[activationF]

        #First and second moment estimates for Adam. Each gets its own array:
        #sharing one buffer between m_* and v_* would corrupt both as soon as
        #either is ever updated in place.
        self.m_dw = np.zeros((neurons, inputs))
        self.v_dw = np.zeros((neurons, inputs))
        self.m_db = np.zeros((neurons, 1))
        self.v_db = np.zeros((neurons, 1))

    def feedforward(self, x):
        """Compute the layer output for a batch ``x`` of shape (inputs, batch).

        Caches ``x``, the batch size ``m`` and the pre-activation ``z`` for
        the following backward pass.
        """
        self.x = x                                                      #Input from the previous layer
        self.m = x.shape[1]                                             #Size of the batch
        self.z = self.w @ self.x + self.b                               #Bias column is broadcast over the batch
        self.y = self.act(self.z)                                       #Output from the current layer
        return self.y

    def _gradients(self, dJdy, lambd):
        """Back-propagate ``dJdy`` through the cached forward pass; returns (dJdw, dJdb, dJdx)."""
        dJdz = dJdy * self.d_act(self.z)                                #Chain rule through the activation
        dJdw = dJdz @ self.x.T + lambd/self.m*self.w                    #Weight gradient plus the regularization term (weights only)
        dJdb = np.sum(dJdz, axis=1, keepdims=True)                      #Bias gradient: sum over the batch
        dJdx = self.w.T @ dJdz                                          #Gradient handed to the previous layer (uses the pre-update weights)
        return dJdw, dJdb, dJdx

    def backprop(self, dJdy, learning_rate, lambd):
        """Plain gradient-descent update.

        dJdy:          gradient of the cost with respect to this layer's output.
        learning_rate: step size.
        lambd:         regularization strength.

        Returns the gradient with respect to this layer's input.
        """
        dJdw, dJdb, dJdx = self._gradients(dJdy, lambd)

        self.w -= learning_rate * dJdw
        self.b -= learning_rate * dJdb
        return dJdx

    def Adam_backprop(self, dJdy, learning_rate, lambd, beta1, beta2, epsilon, epoch):
        """Adam update.

        dJdy, learning_rate, lambd: as in ``backprop``.
        beta1, beta2: decay rates of the first and second moment estimates.
        epsilon:      small constant added to the denominator.
        epoch:        1-based timestep used in the bias-correction terms
                      ``1 - beta**epoch``. It should count the updates
                      applied so far; the name is kept for compatibility.

        Returns the gradient with respect to this layer's input.
        Raises ValueError if ``epoch < 1`` (the correction term would be 0
        and the update would divide by zero).
        """
        if epoch < 1:
            raise ValueError("epoch must be >= 1 for Adam bias correction, got %r" % (epoch,))

        dJdw, dJdb, dJdx = self._gradients(dJdy, lambd)

        self.m_dw = beta1*self.m_dw + (1-beta1)*dJdw                    #Mean
        self.m_db = beta1*self.m_db + (1-beta1)*dJdb

        self.v_dw = beta2*self.v_dw + (1-beta2)*np.square(dJdw)         #Uncentered variance
        self.v_db = beta2*self.v_db + (1-beta2)*np.square(dJdb)

        m_corr = 1-beta1**epoch                                         #Bias corrector terms
        v_corr = 1-beta2**epoch

        self.w -= learning_rate * (self.m_dw/m_corr) / (np.sqrt(self.v_dw/v_corr) + epsilon)
        self.b -= learning_rate * (self.m_db/m_corr) / (np.sqrt(self.v_db/v_corr) + epsilon)
        return dJdx

class ANN:
    """Feed-forward network built from a stack of ``Layer`` objects.

    X and Y must be in matrix form: (number of inputs or outputs, dataset size).
    """

    def __init__(self, nodes, activation):
        """Build one layer per entry of ``activation``.

        nodes:      layer sizes; layer i maps nodes[i] inputs to nodes[i+1]
                    neurons, so it needs len(activation) + 1 entries.
        activation: activation name of each layer.
        """
        self.nlayers = len(activation)
        self.layers = [Layer(nodes[i], nodes[i+1], activation[i])
                       for i in range(self.nlayers)]

    def test(self, x):
        """Forward pass of ``x`` through every layer; returns the network output."""
        for layer in self.layers:
            x = layer.feedforward(x)
        return x

    def _cost(self, x_train, y_train):
        """Mean log loss of the current network over the whole training set."""
        y = self.test(x_train)
        return 1/x_train.shape[1] * np.sum(logloss(y_train, y))

    def train(self, x_train, y_train, epochs, optimizer, lr, lambd):
        """Train the network and return the cost recorded once per epoch.

        x_train, y_train: training data, one sample per column.
        epochs:    number of passes over the dataset.
        optimizer: 'GD' (full batch), 'SGD' (one sample per update) or
                   'Adam' (one sample per update, beta1=0.9, beta2=0.999,
                   epsilon=1e-08; recommended learning rate 0.001).
        lr:        learning rate.
        lambd:     regularization strength.

        For an unknown optimizer a message is printed and an empty list is
        returned.
        """
        costs = []
        m = x_train.shape[1]

        if optimizer == 'GD':                                           #Gradient descent
            for epoch in range(epochs):
                y = self.test(x_train)

                dJdy = d_logloss(y_train, y)/m
                for layer in reversed(self.layers):
                    dJdy = layer.backprop(dJdy, lr, lambd)

                costs.append(1/m * np.sum(logloss(y_train, y)))         #Cost of the forward pass made before this update

        elif optimizer in ('SGD', 'Adam'):                              #One parameter update per training sample
            beta1, beta2, epsilon = 0.9, 0.999, 1e-08                   #Adam constants (unused by SGD)
            step = 0                                                    #Number of Adam updates applied so far
            for epoch in range(epochs):
                for i in range(y_train.shape[1]):
                    x_i = x_train[:, i:i+1]                             #Sample i as a column vector
                    y_i = y_train[:, i:i+1]
                    dJdy = d_logloss(y_i, self.test(x_i))

                    #Adam's bias correction must use the update count, not the
                    #epoch number: the moments are refreshed on every sample,
                    #so passing epoch+1 keeps the correction frozen for a whole
                    #epoch and mis-scales the step.
                    step += 1
                    for layer in reversed(self.layers):
                        if optimizer == 'Adam':
                            dJdy = layer.Adam_backprop(dJdy, lr, lambd, beta1, beta2, epsilon, step)
                        else:
                            dJdy = layer.backprop(dJdy, lr, lambd)

                costs.append(self._cost(x_train, y_train))              #Cost after the epoch's updates
        else:
            print('No optimizer found. This library offers GD, SGD and Adam')

        return costs



In [None]:
## NN Training
np.random.seed(0)   #Fix the RNG so the random weight initialization (and the whole run) is reproducible; change it to try another initialization

x_train = np.array([[0,0,1,1], [0,1,0,1]])   #XOR inputs: one sample per column (2 features x 4 samples)
y_train = np.array([[0,1,1,0]])              #XOR targets: 1 output x 4 samples

# Hyperparameters
epochs = 1000       #Times that the entire dataset is used to train the NN
lr = 0.1            #Learning rate
lambd = 0.001       #Regularization term
nodes = [2, 5, 1]   #Layer sizes: inputs, hidden neurons, outputs
activation = ['ReLU','sigmoid']
optimizer = 'GD'

# NN and training
neuralnetwork = ANN(nodes, activation)
costs = neuralnetwork.train(x_train, y_train, epochs, optimizer, lr, lambd)

plt.plot(range(len(costs)),costs)   #Use len(costs) so the axes always match, even if no epoch was recorded
plt.xlabel('Epoch')
plt.ylabel('Cost function error')
plt.show()

In [None]:
## NN Prediction
x = x_train
y = neuralnetwork.test(x)                   #Network outputs, one column per sample

vect = np.rint(y)                           #Round each output to the nearest integer label
#Fraction of matching outputs. np.mean divides by the total number of
#outputs, so the value stays within 0-100 % even with several output rows
#(dividing by the sample count alone would not); for a single output row
#the result is unchanged.
accuracy = np.mean(vect == y_train)*100
print('\n Accuracy in %')
print(accuracy)

In [None]:
#Scratch check: one sample is a column vector of shape (2, 1), so its
#transpose has length 1 (the number of samples).
x = np.zeros((2, 1), dtype=int)
len(x.T)