# Neural Net Built From Scratch
## By James Camacho

In [43]:
# Imports
from abc import ABC, abstractmethod
import numpy as np
from tqdm.notebook import tqdm
rng = np.random.default_rng(0)

Activation functions.

In [63]:
class Activation(object):
    def __init__(self, f, df):
        self.f = f
        self.df = df
    def __call__(self, x):
        return self.f(x)
    
    def forward(self, x):
        return self.f(x)
    
    def grad(self, x):
        return self.df(x)
        
def sig(x):
    return 1 / (1 + np.exp(-x))

def dsig(x):
    return np.exp(-x) / (1 + np.exp(-x))**2

def relu(x):
    return np.maximum(0.1*x, x)

def drelu(x):
    return np.piecewise(x, [x <= 0, x > 0], [0.1, 1])

Sigmoid = Activation(sig, dsig)
ReLU = Activation(relu, drelu)

Loss function.

In [165]:
class Loss(object):
    def __init__(self, f, df):
        self.f = f
        self.df = df
    
    def __call__(self, x, y):
        return self.f(x, y)
    
    def forward(self, x, y):
        return self.f(x, y)
    
    def grad(self, x, y):
        return self.df(x, y)
    
def loss(output, y):
    return np.sum((output-y)**2) / len(y)

def dloss(output, y):
    return 2*(output - y) / len(y)

MSE = Loss(loss, dloss)

Neural nets should have a bunch of layers. We're using sigmoid activation.

In [486]:
class Layer(object):
    def __init__(self, ins, outs, activation=ReLU):
        self.w = rng.normal(1, 0.1, size=(ins, outs))
        self.b = rng.normal(0, 0.1, size=outs)
        self.a = activation
        
    def __call__(self, x):
        return self.forward(x)
    
    def forward(self, x):
        return self.a(x @ self.w + self.b)
    
    def grad(self, x):
        d = self.a.grad(x @ self.w + self.b)
        return d
    
class NeuralNet(object):
    def __init__(self, layers):
        self.layers = layers
        
    def __call__(self, x):
        return self.forward(x)
    
    def forward(self, x):
        for layer in layers:
            x = layer(x)
        
        return x
    
    def train(self, x, y, loss=MSE):
        for layer in layers:
            layer.db = layer.grad(x)
            x = layer(x)
        
        dx = loss.grad(x, y)
        for layer in reversed(layers):
            layer.db *= dx
            layer.dw = layer.w[None,] * layer.db[:,None]
            dx = dx @ layer.w.T
            
        for layer in self.layers:
            layer.db = np.sum(layer.db, axis=0)
            layer.dw = np.sum(layer.dw, axis=0)

Optimizer function.

In [487]:
class Optimizer(ABC):
    def __init__(self, nn):
        self.nn = nn
    
    @abstractmethod
    def backprop(self):
        pass
    
def penalize(nn, penalty=1e-3):
    # penalize nn for large weights
    for layer in nn.layers:
        layer.db += layer.b * penalty
        layer.dw += layer.w * penalty
    
class SGD(Optimizer):
    def __init__(self, nn, alpha=1e-3, penalty=1e-3):
        self.nn = nn
        self.alpha = alpha
        self.penalty = penalty
    
    def backprop(self):
        penalize(nn, self.penalty)
        for layer in self.nn.layers:
            layer.b -= self.alpha * layer.db
            layer.w -= self.alpha * layer.dw
    
class Adam(Optimizer):
    def __init__(self, nn, alpha=1e-3, beta=0.1, penalty=1e-3):
        """
        nn - NeuralNet to optimize.
        alpha - learning rate
        beta - exponential decay rate (for signal/noise weighted mean)
        """
        super().__init__(nn)
        self.signals_b = None
        self.noise_b = None
        self.signals_w = None
        self.noise_w = None
        self.a = alpha
        self.b = 1 - beta
        self.bk = 1
        self.penalty = penalty
        
    def reset(self):
        self.signals_b = None
        self.noise_b = None
        self.signals_w = None
        self.noise_w = None
        self.bk = 1
    
    def backprop(self):
        """
        Should call nn.train before this.
        """
        # Add penalty for big values
        penalize(nn, self.penalty)
        
        if self.signals_b is None:
            self.signals_b = {}
            self.noise_b = {}
            self.signals_w = {}
            self.noise_w = {}
            for layer in self.nn.layers:
                self.signals_b[layer] = layer.db
                self.noise_b[layer] = layer.db ** 2
                
                self.signals_w[layer] = layer.dw
                self.noise_w[layer] = layer.dw ** 2
        
        self.bk *= self.b
        for layer in self.nn.layers:
            self.signals_b[layer] = ((1-self.b) * layer.db + self.b * self.signals_b[layer] * (1-self.bk)) / (1-self.bk*self.b)
            self.noise_b[layer] = ((1-self.b) * layer.db**2 + self.b * self.noise_b[layer] * (1-self.bk)) / (1-self.bk*self.b)
            
            self.signals_w[layer] = ((1-self.b) * layer.db + self.b * self.signals_w[layer] * (1-self.bk)) / (1-self.bk*self.b)
            self.noise_w[layer] = ((1-self.b) * layer.db**2 + self.b * self.noise_w[layer] * (1-self.bk)) / (1-self.bk*self.b)
            
            db = self.a * self.signals_b[layer] / self.noise_b[layer]**0.5
            dw = self.a * self.signals_w[layer] / self.noise_w[layer]**0.5
            layer.b[np.isfinite(db)] -= db[np.isfinite(db)]
            layer.w[np.isfinite(dw)] -= dw[np.isfinite(dw)]

We're going to have it learn the AND function.

In [488]:
train_x = rng.integers(2, size=(100, 2))
train_y = np.vstack(train_x[:, 0] & train_x[:, 1])

Create our Neural Net.

In [489]:
layers = [Layer(2, 100), Layer(100, 1)]
nn = NeuralNet(layers)

Optimize. Intially start with SGD, then finish with Adam.

In [490]:
optim = SGD(nn, alpha=1e-4)
epochs = 1000
pbar = tqdm(range(epochs))
for i in pbar:
    nn.train(train_x, train_y)
    optim.backprop()
    if i % 100 == 0:
        pred = nn(train_x)
        loss = MSE(pred, train_y)
        pbar.set_description("Loss: %.3f" % loss)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [491]:
optim = Adam(nn, alpha=1e-4)
epochs = 10000
pbar = tqdm(range(epochs))
for i in pbar:
    nn.train(train_x, train_y)
    optim.backprop()
    if i % 100 == 0:
        pred = nn(train_x)
        loss = MSE(pred, train_y)
        pbar.set_description("Loss: %.3f" % loss)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [492]:
for i in [0, 1]:
    for j in [0, 1]:
        print(f"{i} AND {j} is {nn([i,j])[0]:.5f}")

0 AND 0 is -0.15923
0 AND 1 is -0.01149
1 AND 0 is -0.01174
1 AND 1 is 1.02833
