### Setup

In [1]:
import numpy as np 
import math
from sklearn import datasets

def relu(X):
    """Rectified linear unit: element-wise maximum of ``X`` and zero."""
    floor = 0
    rectified = np.maximum(X, floor)
    return rectified

def relu_derivative(X):
    """Derivative of ReLU: 1 where ``X`` is strictly positive, 0 elsewhere."""
    positive_mask = X > 0
    # Multiplying by 1 turns the boolean mask into integers.
    return positive_mask * 1

def sum_square(arr):
    """Return the sum of the squares of every element of ``arr``."""
    squared = np.square(arr)
    return squared.sum()

### Neural Network

Adapted from [`three_layer_network.py` in jldbc/numpy_neural_net](https://github.com/jldbc/numpy_neural_net/blob/master/three_layer_network.py), with some modifications.

In [2]:
class NN:
    """Fully connected ReLU network with a softmax output layer.

    The network is trained with full-batch gradient descent on the
    cross-entropy loss plus an L2 penalty on the weights.

    Attributes:
        num_layers: number of weight layers, i.e. ``len(shapes) - 1``.
        w: list of weight matrices; ``w[i]`` has shape ``(shapes[i], shapes[i+1])``.
        b: list of bias rows; ``b[i]`` has shape ``(1, shapes[i+1])``.
        learning_rate: step size used by ``backprop``; overwritten by ``train``.
        reg_lambda: L2 regularisation strength; overwritten by ``train``.
    """

    def __init__(self, shapes=None, epsilon=0.12):
        """Allocate the weights and biases of every layer.

        Args:
            shapes: layer sizes ``[n_inputs, n_hidden_1, ..., n_outputs]``.
                At least one hidden layer is required.
            epsilon: weights are drawn uniformly from ``[-epsilon, epsilon)``;
                biases start at zero.
        """
        assert shapes, "Please provide the shape of the NN"
        # feed_forward needs at least one hidden layer; fail here with a clear
        # message instead of with an IndexError later on.
        assert len(shapes) >= 3, "shapes must hold the input, at least one hidden and the output size"

        self.num_layers = len(shapes) - 1
        self.w = []
        self.b = []
        for i in range(self.num_layers):
            shape = (shapes[i], shapes[i+1])

            w = np.random.uniform(-epsilon, epsilon, shape)
            b = np.zeros((1, shape[1]))

            self.w.append(w)
            self.b.append(b)

        # Same defaults as train(), so calculate_loss() and backprop() can be
        # called on a freshly built network without raising AttributeError.
        self.learning_rate = 0.1
        self.reg_lambda = 0.1

    def feed_forward(self, x):
        """Run a forward pass.

        Args:
            x: input batch with one example per row.

        Returns:
            A tuple ``(a1, out)`` where ``a1`` is the activation of the first
            hidden layer and ``out`` holds the softmax probabilities, one row
            per example.
        """
        hidden = []
        activation = x
        for i in range(self.num_layers - 1):
            # Each hidden layer consumes the *activation* of the previous one.
            activation = relu(activation.dot(self.w[i]) + self.b[i])
            hidden.append(activation)

        logits = hidden[-1].dot(self.w[-1]) + self.b[-1]
        # Shifting by the row maximum leaves the softmax unchanged but keeps
        # np.exp from overflowing on large logits.
        logits = logits - np.max(logits, axis=1, keepdims=True)
        exp_logits = np.exp(logits)

        out = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)
        return hidden[0], out

    def calculate_loss(self, X, y):
        """Return the mean regularised cross-entropy loss of the network on ``(X, y)``.

        Args:
            X: input batch with one example per row.
            y: integer class index of every example.
        """
        num_examples = X.shape[0]

        # Forward propagation to calculate our predictions (already normalised)
        _, probs = self.feed_forward(X)

        # Calculating the loss
        correct_logprobs = -np.log(probs[range(num_examples), y])
        loss = np.sum(correct_logprobs)

        # Add regularization term to loss (optional)
        loss += self.reg_lambda/2 * sum(sum_square(w) for w in self.w)
        return 1./num_examples * loss

    def backprop(self, X, y, a1, output):
        """Take one gradient-descent step on every weight matrix and bias.

        Gradients are summed (not averaged) over the batch. ``output`` is left
        untouched.

        Args:
            X: input batch that produced ``a1`` and ``output``.
            y: integer class index of every example.
            a1: first hidden activation returned by ``feed_forward(X)``.
            output: softmax probabilities returned by ``feed_forward(X)``.
        """
        num_examples = X.shape[0]

        # Input seen by each weight layer: X, a1, then any deeper hidden
        # activations, which are rebuilt from a1 (nothing to rebuild when the
        # network has a single hidden layer).
        layer_inputs = [X, a1]
        for i in range(1, self.num_layers - 1):
            layer_inputs.append(relu(layer_inputs[-1].dot(self.w[i]) + self.b[i]))

        # Gradient of the summed cross-entropy w.r.t. the logits is
        # (probabilities - one_hot(y)); work on a copy so the caller's array
        # is not modified.
        delta = output.copy()
        delta[range(num_examples), y] -= 1

        dW = [None] * self.num_layers
        db = [None] * self.num_layers
        for i in reversed(range(self.num_layers)):
            dW[i] = layer_inputs[i].T.dot(delta)
            db[i] = np.sum(delta, axis=0)
            if i > 0:
                # Propagate the error through layer i and the ReLU feeding it.
                delta = delta.dot(self.w[i].T) * relu_derivative(layer_inputs[i])

        # All gradients are computed before any parameter changes, so the
        # regularisation term and the update can share one loop.
        for i in range(self.num_layers):
            grad_w = dW[i] + self.reg_lambda * self.w[i]
            self.w[i] -= self.learning_rate * grad_w
            self.b[i] -= self.learning_rate * db[i]

    def train(self, X, y, epochs=10000, reg_lambda = 0.1, learning_rate=0.1, verbose=1000):
        """Train with full-batch gradient descent.

        Args:
            X: training inputs, one example per row.
            y: integer class index of every example.
            epochs: number of gradient steps.
            reg_lambda: L2 regularisation strength.
            learning_rate: gradient-descent step size.
            verbose: compute and print the loss every ``verbose`` iterations;
                a falsy value (``0`` or ``None``) disables logging.

        Returns:
            The list of losses recorded at the logged iterations.
        """
        self.learning_rate = learning_rate
        self.reg_lambda    = reg_lambda

        # Batch gradient descent
        losses = []
        for i in range(epochs):

            # feed forward
            a1, output = self.feed_forward(X)

            # backpropagation
            self.backprop(X, y, a1, output)

            # `verbose and ...` avoids a modulo-by-zero when logging is off.
            if verbose and i % verbose == 0:
                loss = self.calculate_loss(X, y)
                losses.append(loss)
                print(f"Loss after iteration {i}: {loss}")
        return losses

### Main

In [4]:
# Seed both NumPy (used for the weight initialisation inside NN) and the
# dataset generator so the losses printed below are reproducible on
# Restart Kernel -> Run All.
SEED = 42
np.random.seed(SEED)

train_x, train_y = datasets.make_moons(16, noise=0.10, random_state=SEED)
# Layer sizes: one input per feature, 150 hidden units, 2 output classes.
shapes = [train_x.shape[1], 150, 2]
model = NN(shapes=shapes)
# The trailing semicolon keeps the notebook from echoing the returned loss list.
model.train(train_x, train_y, reg_lambda=0.001, learning_rate=0.001, epochs=50000, verbose=10000);

Loss after iteration 0: 0.6810301186100619
Loss after iteration 10000: 0.06212428849314495
Loss after iteration 20000: 0.01554714278965735
Loss after iteration 30000: 0.009528478262497558
Loss after iteration 40000: 0.007564513297852371
