# Imports

In [1]:
import numpy as np

# Activation and cost functions

In [2]:
def sigmoid(x, derivative=False):
    """Logistic sigmoid activation.

    With ``derivative=False`` returns ``1 / (1 + exp(-x))`` element-wise.

    With ``derivative=True`` returns ``x * (1 - x)``; in that mode ``x`` is
    treated as a value that has *already* been passed through the sigmoid
    (i.e. the layer output), not as the pre-activation input.
    """
    if not derivative:
        return 1 / (1 + np.exp(-x))

    complement = 1 - x
    return np.multiply(x, complement)


def relu(x, derivative=False):
    """Rectified linear unit activation.

    With ``derivative=False`` returns ``max(0, x)`` element-wise.

    With ``derivative=True`` returns an integer mask that is 1 where
    ``x > 0`` and 0 elsewhere; ``x`` must support ``.astype`` (e.g. a
    NumPy array).
    """
    if not derivative:
        return np.maximum(0, x)

    positive_mask = x > 0
    return positive_mask.astype(int)


def linear(x, derivative=False):
    """Identity activation.

    Returns ``x`` unchanged, or the constant slope ``1`` when
    ``derivative`` is truthy.
    """
    return 1 if derivative else x


def mse(target, output):
    """Mean squared error: the average of ``(target - output)**2`` over all elements."""
    residual = target - output
    return np.mean(np.square(residual))

# Optimizers

In [11]:
class SGD:
    """Plain gradient descent with a fixed learning rate."""

    def __init__(self, learning_rate):
        self.lr = learning_rate

    def proceed(self, weights, biases, gradients, gradients_b):
        """Take one descent step on every layer and return ``(weights, biases)``.

        All four arguments are lists indexed by layer. Index 0 is skipped
        (the loop starts at 1). The entries of ``weights`` and ``biases``
        are replaced in the passed-in lists, which are also returned.
        """
        for layer in range(1, len(weights)):
            weight_step = self.lr * gradients[layer]
            weights[layer] = weights[layer] - weight_step

            bias_step = self.lr * gradients_b[layer]
            biases[layer] = biases[layer] - bias_step

        return weights, biases
    
    
class ADAM:
    """Adam optimizer: per-parameter steps scaled by running gradient moments.

    Parameters
    ----------
    learning_rate:
        Step size applied to the bias-corrected update.
    inner_layers:
        Number of trainable layers (``len(weights) - 1``). Used only to
        pre-allocate the moment buffers; if it turns out to be too small
        the buffers are grown on the first call to :meth:`proceed`.
    decay_rate_1, decay_rate_2:
        Exponential decay rates of the first and second moment estimates.
    epsilon:
        Small constant added to the denominator to avoid division by zero.
    """

    def __init__(self, learning_rate, inner_layers,
                 decay_rate_1=0.9, decay_rate_2=0.999, epsilon=10 ** -8):
        self.lr = learning_rate
        # One slot per entry of the weights list; slot 0 is never updated.
        self.ilayers = inner_layers + 1

        self.decay_rate_1 = decay_rate_1
        self.decay_rate_2 = decay_rate_2
        self.epsilon = epsilon

        # First moments (running mean of the gradients).
        self.l_w_m = [0] * self.ilayers
        self.l_b_m = [0] * self.ilayers
        # Second moments (running mean of the squared gradients).
        self.l_w_v = [0] * self.ilayers
        self.l_b_v = [0] * self.ilayers

        # Number of update steps taken so far.
        self.t = 0

    def _ensure_slots(self, count):
        """Grow the moment buffers so that indices ``0..count-1`` exist."""
        missing = count - len(self.l_w_m)
        if missing > 0:
            for buffer in (self.l_w_m, self.l_b_m, self.l_w_v, self.l_b_v):
                buffer.extend([0] * missing)
            self.ilayers = len(self.l_w_m)

    def proceed(self, weights, biases, gradients, gradients_b):
        """Apply one Adam step to every layer and return ``(weights, biases)``.

        All four arguments are lists indexed by layer; index 0 is skipped.
        The arrays in ``weights`` and ``biases`` are updated in place.
        """
        # Guard against a constructor count that does not match the model;
        # without this, indexing the moment buffers raises IndexError.
        self._ensure_slots(len(weights))

        decay_rate_1 = self.decay_rate_1
        decay_rate_2 = self.decay_rate_2
        epsilon = self.epsilon

        self.t += 1  # Increment time step

        # Bias-correction denominators depend only on t, not on the layer.
        correction_1 = 1 - (decay_rate_1 ** self.t)
        correction_2 = 1 - (decay_rate_2 ** self.t)

        for n in range(1, len(weights)):
            # Update the running first and second moments for this layer.
            self.l_w_m[n] = self.l_w_m[n] * decay_rate_1 + (1 - decay_rate_1) * gradients[n]
            self.l_b_m[n] = self.l_b_m[n] * decay_rate_1 + (1 - decay_rate_1) * gradients_b[n]

            self.l_w_v[n] = self.l_w_v[n] * decay_rate_2 + (1 - decay_rate_2) * np.square(gradients[n])
            self.l_b_v[n] = self.l_b_v[n] * decay_rate_2 + (1 - decay_rate_2) * np.square(gradients_b[n])

            # Bias-corrected moments (compensate for the zero initialisation).
            l_w_m_corrected = self.l_w_m[n] / correction_1
            l_b_m_corrected = self.l_b_m[n] / correction_1

            l_w_v_corrected = self.l_w_v[n] / correction_2
            l_b_v_corrected = self.l_b_v[n] / correction_2

            # Normalised update direction.
            w_update = l_w_m_corrected / (np.sqrt(l_w_v_corrected) + epsilon)
            b_update = l_b_m_corrected / (np.sqrt(l_b_v_corrected) + epsilon)

            weights[n] -= (self.lr * w_update)
            biases[n] -= (self.lr * b_update)

        return weights, biases

# Neural Net Model

In [12]:
class NeuralNet:
    """Fully connected feed-forward network trained by back-propagation.

    Parameters
    ----------
    nodes:
        Layer sizes, input layer first (e.g. ``[2, 8, 8, 1]``).
    activations:
        One activation callable per non-input layer. Each is called as
        ``f(x)`` in the forward pass and ``f(output, derivative=True)`` in
        the backward pass, where ``output`` is the layer's activated output.
    loss:
        Callable ``loss(target, output)`` returning the loss value.
    optimizer:
        Object with ``proceed(weights, biases, gradients, gradients_b)``
        returning the updated ``(weights, biases)``.

    Data layout: samples are columns, i.e. inputs are
    ``(nodes[0], batch)`` and targets are ``(nodes[-1], batch)``.

    Raises
    ------
    ValueError
        If the number of activations is not ``len(nodes) - 1``.
    """

    def __init__(self, nodes, activations, loss, optimizer):
        activations = list(activations)
        # Back-propagation pairs layers and activations through negative
        # indices, so a length mismatch would silently apply the wrong
        # derivative instead of failing. Reject it up front.
        if len(activations) != len(nodes) - 1:
            raise ValueError(
                "expected %d activations (one per non-input layer), got %d"
                % (len(nodes) - 1, len(activations))
            )

        self.layers = len(nodes)
        self.nodes = nodes
        # Index 0 is a placeholder so list indices line up with layer numbers.
        self.activations = [None] + activations
        self.loss_function = loss
        self.optimizer = optimizer
        # Loss of the most recent back_propagation call (None until then).
        self.last_loss = None

        self.weights, self.biases = self.init_weights()

    def init_weights(self):
        """Return freshly initialised ``(weights, biases)`` lists.

        Both lists have a ``None`` placeholder at index 0. Layer ``n`` gets
        a weight matrix of shape ``(nodes[n], nodes[n-1])`` and a bias
        column of shape ``(nodes[n], 1)``, each drawn uniformly from
        ``[-sqrt(1/nodes[n-1]), sqrt(1/nodes[n-1]))``.
        """
        weights = [None]
        biases = [None]

        for n in range(1, self.layers):
            edge = np.sqrt(1 / self.nodes[n - 1])

            # Shape: (this layer, previous layer).
            weights.append(np.random.uniform(-edge, edge, (self.nodes[n], self.nodes[n - 1])))

            # One bias per neuron, stored as a column so it broadcasts over the batch.
            biases.append(np.random.uniform(-edge, edge, (self.nodes[n], 1)))
        return weights, biases

    def feedforward(self, input_signal):
        """Run a forward pass and return the activated output of every layer.

        The returned list has ``input_signal`` at index 0 and the network
        output at index -1.
        """
        outputs = [input_signal]

        for i in range(1, self.layers):
            sum_product = np.dot(self.weights[i], outputs[i - 1]) + self.biases[i]
            outputs.append(self.activations[i](sum_product))
        return outputs

    def predict(self, input_signal):
        """Return only the final-layer output for ``input_signal``."""
        prediction = self.feedforward(input_signal)[-1]
        return prediction

    def back_propagation(self, outputs, target):
        """Compute batch-averaged gradients for every layer's weights and biases.

        Parameters
        ----------
        outputs:
            The list returned by :meth:`feedforward`.
        target:
            Expected outputs, shape ``(nodes[-1], batch)``.

        Returns
        -------
        (weight_gradients, bias_gradients)
            Two lists indexed by layer, with ``None`` at index 0.

        The output-layer error is the derivative of a squared-error loss
        averaged over output neurons, regardless of which ``loss`` callable
        was supplied. The value of the supplied loss is stored in
        ``self.last_loss`` for monitoring.
        """
        batch_size = target.shape[1]

        weight_gradients = [None for i in range(self.layers)]
        bias_gradients = [None for i in range(self.layers)]
        deltas = [None for i in range(self.layers)]

        # Keep the loss observable instead of computing and discarding it.
        self.last_loss = self.loss_function(target, outputs[-1])

        # Output layer error and delta. len(target) is the number of output
        # neurons; the division by batch size happens in the gradient step.
        output_error = -(2 / len(target)) * (target - outputs[-1])
        deltas[-1] = output_error * self.activations[-1](outputs[-1], derivative=True)

        # Hidden layers, walking backwards from the last hidden layer.
        for i in range(-2, -self.layers, -1):
            error = np.dot(self.weights[i + 1].T, deltas[i + 1])
            deltas[i] = error * self.activations[i](outputs[i], derivative=True)

        # Gradients for every trainable layer, averaged over the batch.
        for i in range(-1, -self.layers, -1):
            weight_gradients[i] = np.dot(deltas[i], outputs[i - 1].T) / batch_size
            bias_gradients[i] = np.mean(deltas[i], axis=1, keepdims=True)

        return weight_gradients, bias_gradients

    def update_weights(self, gradients, gradients_b):
        """Let the optimizer update ``self.weights`` and ``self.biases``.

        Stores the optimizer's result on the instance; returns ``None``.
        """
        self.weights, self.biases = self.optimizer.proceed(self.weights, self.biases, gradients, gradients_b)

# Example

In [32]:
%%time
# XOR truth table. Transposed so that every column is one training sample.
input_signal = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]).T
target_signal = np.array([[0], [1], [1], [0]]).T


# Build a 2-8-8-1 network: two ReLU hidden layers and a sigmoid output,
# trained on mean squared error with ADAM (learning rate 0.1, 3 trainable layers).
layer_sizes = [2, 8, 8, 1]
layer_activations = [relu, relu, sigmoid]
nn1 = NeuralNet(
    nodes=layer_sizes,
    activations=layer_activations,
    loss=mse,
    optimizer=ADAM(0.1, 3),
)

# Full-batch training: forward pass, backward pass, optimizer step.
for epochs in range(2000):
    layer_outputs = nn1.feedforward(input_signal)
    weight_grads, bias_grads = nn1.back_propagation(layer_outputs, target_signal)
    nn1.update_weights(weight_grads, bias_grads)

# Show the trained network's output for the four XOR inputs.
prediction = nn1.predict(input_signal)
print(prediction)

[[0.00313122 0.9997586  0.99952005 0.00313122]]
Wall time: 400 ms
