# In [7]:
import numpy as np

################################################
#         Layers
################################################
class Input():
    """Input layer holding one sample as a column vector with a trailing 1.

    Attributes set on construction:
        name  -- the fixed label "Input"
        input -- ``data`` flattened, with the constant 1 appended, shaped (k+1, 1)
        a     -- the same array object as ``input`` (read by the next layer)
        size  -- number of entries in ``input`` (including the appended 1)
    """

    def __init__(self, data):
        self.name = "Input"
        column = data.reshape(-1, 1)
        # np.append flattens its first argument, hence the second reshape.
        augmented = np.append(column, 1).reshape(-1, 1)
        self.input = augmented
        self.a = augmented
        self.size = augmented.size

class Dense():
    """Description of a fully connected layer.

    Parameters:
        size          -- stored as ``self.size`` (presumably the number of
                         units; confirm against the code that builds W)
        activation    -- stored as ``self.activation``; the network calls
                         ``activation.value(h)`` on it during the forward pass
        intialization -- initializer object (parameter spelling kept so that
                         existing keyword callers keep working); presumably an
                         object exposing ``weights_biases(n_prev, n_curr)``
        name          -- label stored as ``self.name``

    The weights themselves are not created here yet: doing so needs the size
    of the previous layer, which this constructor does not receive.
    """

    def __init__(self, size, activation, intialization, name):
        self.name = name
        self.size = size
        self.activation = activation
        # Keep the initializer instead of discarding it, so that whoever
        # builds W/b later can still reach it.
        self.initialization = intialization
        # Code for initialization
        # self.W = 
        
################################################
#         Initializers
################################################
class RandomNormal():
    """Initializer that samples weights and biases from a normal distribution.

    ``mean`` and ``stddev`` are passed to ``np.random.normal`` as ``loc`` and
    ``scale`` for both the weight matrix and the bias vector.
    """

    def __init__(self, mean = 0.0, stddev = 1.0):
        self.mean, self.stddev = mean, stddev

    def weights_biases(self, n_prev, n_curr):
        """Return ``(W, b)`` with shapes ``(n_prev, n_curr)`` and ``(n_curr,)``."""
        def _sample(shape):
            return np.random.normal(loc=self.mean, scale=self.stddev, size=shape)

        # W is drawn before b, so the global RNG is consumed in that order.
        weights = _sample((n_prev, n_curr))
        biases = _sample((n_curr,))
        return weights, biases
    
class XavierUniform():
    """Initializer drawing weights uniformly from [-limit, limit).

    ``limit`` is ``sqrt(6 / (n_prev + n_curr))``; biases are all zeros.
    """

    def __init__(self):
        pass

    def weights_biases(self, n_prev, n_curr):
        """Return ``(W, b)`` with shapes ``(n_prev, n_curr)`` and ``(n_curr,)``."""
        limit = np.sqrt(6.0/(n_prev + n_curr))
        weights = np.random.uniform(
            low=-1*limit,
            high=limit,
            size=(n_prev, n_curr),
        )
        biases = np.zeros((n_curr,), dtype=np.float64)
        return weights, biases
    
################################################
#         Activations
################################################
class Sigmoid():
    """Logistic activation ``1 / (1 + exp(-c * (x + b)))``.

    ``c`` scales the input and ``b`` shifts it; the defaults give the
    standard sigmoid.
    """

    def __init__(self, c=1, b=0):
        self.c, self.b = c, b

    def value(self, x):
        """Evaluate the activation element-wise at ``x``."""
        exponent = -self.c*(x + self.b)
        return 1/(1 + np.exp(exponent))

    def diff(self, x):
        """Evaluate the derivative with respect to ``x``: ``c * s * (1 - s)``."""
        s = self.value(x)
        return self.c*s*(1-s)

class Tanh():
    """Hyperbolic-tangent activation."""

    def __init__(self):
        pass

    def value(self, x):
        """Return ``tanh(x)`` element-wise.

        ``np.tanh`` is used rather than the explicit
        ``(e^x - e^-x) / (e^x + e^-x)`` ratio, because that ratio evaluates
        to inf/inf = NaN once ``exp`` overflows for large ``|x|``.
        """
        return np.tanh(x)

    def diff(self, x):
        """Return the derivative ``1 - tanh(x)**2`` element-wise."""
        y = self.value(x)
        return 1 - y**2

class Relu():
    """Rectified linear unit activation, ``max(x, 0)``."""

    def __init__(self):
        pass

    def value(self, x):
        """Return ``max(x, 0)`` element-wise as a new array.

        The input is left untouched. Zeroing the negatives in place would
        overwrite the caller's pre-activation values, which ``diff`` needs
        afterwards.
        """
        return np.maximum(x, 0)

    def diff(self, x):
        """Return the derivative at ``x``: 1.0 where ``x > 0``, otherwise 0.0.

        The mask is built from ``x`` itself; at ``x == 0`` the derivative is
        taken to be 0. The result is a float array with the shape of ``x``.
        """
        return np.where(np.asarray(x) > 0, 1.0, 0.0)

class Softmax():
    """Softmax activation, normalised over ALL entries of the input."""

    def __init__(self):
        pass

    def value(self, x):
        """Return ``exp(x) / sum(exp(x))``, summing over every element of ``x``.

        The maximum of ``x`` is subtracted before exponentiating. This leaves
        the result mathematically unchanged but prevents ``exp`` from
        overflowing to inf (and the ratio from becoming NaN) for large logits.
        """
        x = np.asarray(x)
        if x.size == 0:
            # Nothing to normalise; np.max would raise on an empty array.
            return np.exp(x)
        shifted = np.exp(x - np.max(x))
        return shifted/np.sum(shifted)

    def diff(self, x):
        """Return ``1 - softmax(x)`` element-wise.

        NOTE(review): ``1 - y`` is the diagonal of the Jacobian of
        log-softmax; the diagonal of the softmax Jacobian itself is
        ``y * (1 - y)``. Behaviour is left as-is; confirm that the
        (not yet written) backward pass expects this form.
        """
        y = self.value(x)
        # Motivation for condensed equation:
        # https://eli.thegreenplace.net/2016/the-softmax-function-and-its-derivative/
        val = (np.ones(y.shape) - y)
        return val

################################################
#         Optimizers
################################################
class Momentum():
    """Gradient descent with classical momentum.

    Parameters:
        eta   -- learning rate multiplying the gradient
        gamma -- momentum coefficient multiplying the previous update

    Both default to ``None`` and must be set to numbers before
    ``get_update`` is called.
    """

    def __init__(self, eta=None, gamma=None):
        self.update = 0
        # Store the learning rate: a bare ``self.eta`` expression would raise
        # AttributeError here and leave the attribute unset.
        self.eta = eta
        self.gamma = gamma

    def get_update(self, W, grad):
        """Return the updated weights and remember the step taken.

        ``update <- gamma * update + eta * grad``; the result is
        ``W - update``. ``W`` itself is not modified in place.
        """
        self.update = self.gamma*self.update + self.eta*grad
        W = W - self.update
        return W

class Nesterov():
    """Nesterov accelerated gradient descent.

    Parameters:
        eta   -- learning rate multiplying the gradient
        gamma -- momentum coefficient multiplying the previous update
    """

    def __init__(self, eta=None, gamma=None):
        self.update = 0
        self.eta = eta
        self.gamma = gamma

    def get_update(self, W, grad=None):
        """Return the updated weights using a look-ahead gradient.

        Parameters:
            W    -- current weights
            grad -- either a callable ``grad(W_lookahead)`` returning the
                    gradient at the look-ahead point, or an already computed
                    gradient (the caller is then responsible for having
                    evaluated it at ``W - gamma * update``). When omitted,
                    a module-level ``gradient`` callable is used, which must
                    be defined elsewhere.
        """
        W_lookahead = W - self.gamma*self.update
        if grad is None:
            # Legacy path: depend on a global ``gradient`` function.
            grad = gradient
        grad_value = grad(W_lookahead) if callable(grad) else grad
        self.update = self.gamma*self.update + self.eta*grad_value
        W = W - self.update
        return W
        

class AdaGrad():
    """AdaGrad: per-parameter step scaled by accumulated squared gradients.

    ``eta`` is the base learning rate and ``eps`` a small stabiliser that is
    added to the accumulator *inside* the square root.
    """

    def __init__(self, eta=1e-3, eps=1e-7):
        self.v = 0
        self.eta, self.eps = eta, eps

    def get_update(self, W, grad):
        """Accumulate ``grad**2`` into ``self.v`` and return the new weights."""
        # eps value as in keras
        accumulated = self.v + grad**2
        self.v = accumulated
        step_size = self.eta/(accumulated+self.eps)**0.5
        return W - step_size*grad

class RMSProp():
    """RMSProp: step scaled by an exponential moving average of ``grad**2``.

    ``beta`` is the decay of the moving average, ``eta`` the base learning
    rate and ``eps`` a stabiliser added *inside* the square root.
    """

    def __init__(self, beta=0.9, eta = 1e-3, eps = 1e-7):
        self.v = 0
        self.beta, self.eta, self.eps = beta, eta, eps

    def get_update(self, W, grad):
        """Refresh the running average ``self.v`` and return the new weights."""
        decay = self.beta
        running = decay*self.v + (1-decay)*(grad**2)
        self.v = running
        step_size = self.eta/(running+self.eps)**0.5
        return W - step_size*grad

class Adam():
    """Adam optimizer with bias-corrected first and second moment estimates.

    ``beta1`` / ``beta2`` are the decay rates of the first / second moment,
    ``eta`` is the learning rate, and ``eps`` is added to the corrected
    second moment *inside* the square root. ``self.iter`` counts calls to
    ``get_update`` starting from 1 and drives the bias correction.
    """

    def __init__(self, beta1=0.9, beta2=0.999, eta=1e-3, eps=1e-7):
        self.m = self.v = 0
        self.beta1, self.beta2 = beta1, beta2
        self.eta, self.eps = eta, eps
        self.iter = 1

    def get_update(self, W, grad):
        """Update both moment estimates and return the new weights."""
        b1, b2, t = self.beta1, self.beta2, self.iter

        first = b1*self.m + (1-b1)*grad
        second = b2*self.v + (1-b2)*(grad**2)
        self.m, self.v = first, second

        # Bias-corrected estimates for the current step count.
        first_hat = first/(1-b1**t)
        second_hat = second/(1-b2**t)

        scale = self.eta/(second_hat+self.eps)**0.5
        self.iter = t + 1
        return W - scale*first_hat

class Nadam():
    """Nadam: Adam with a Nesterov-style look-ahead on the first moment.

    Hyper-parameters mirror Adam: ``beta1`` / ``beta2`` decay rates, ``eta``
    learning rate, ``eps`` added *inside* the square root. ``self.iter``
    counts calls to ``get_update`` starting from 1.
    """
    # Reference: https://ruder.io/optimizing-gradient-descent/index.html#nadam

    def __init__(self, beta1=0.9, beta2=0.999, eta=1e-3, eps=1e-7):
        self.m = self.v = 0
        self.beta1, self.beta2 = beta1, beta2
        self.eta, self.eps = eta, eps
        self.iter = 1

    def get_update(self, W, grad):
        """Update both moment estimates and return the new weights."""
        b1, b2, t = self.beta1, self.beta2, self.iter

        first = b1*self.m + (1-b1)*grad
        second = b2*self.v + (1-b2)*(grad**2)
        self.m, self.v = first, second

        first_hat = first/(1-b1**t)
        second_hat = second/(1-b2**t)

        # Blend the corrected momentum with the bias-corrected raw gradient.
        direction = b1*first_hat + ((1-b1)/(1-b1**t))*grad
        scale = self.eta/(second_hat+self.eps)**0.5
        self.iter = t + 1
        return W - scale*direction
        
################################################
#         Network
################################################

class NeuralNetwork():
    """Container tying a sequence of layers to a loss and a batch size.

    Parameters:
        layers     -- ordered sequence of layer objects; the first one must
                      expose ``a`` and every later one ``W``, ``b`` and
                      ``activation`` (an object with a ``value`` method)
        loss       -- stored as ``self.loss``; not used by the visible code
        batch_size -- stored as ``self.batch_size``; not used by the visible
                      code
    """

    def __init__(self, layers, loss, batch_size):
        self.layers = layers
        self.batch_size = batch_size
        self.loss = loss

    def forward_propogation(self, epochs):
        """Run one forward pass, filling ``h`` and ``a`` on every layer after the first.

        For each consecutive pair of layers the pre-activation is
        ``h = W @ a_prev + b`` and the output is ``a = activation.value(h)``.
        ``epochs`` is accepted for interface compatibility but is currently
        unused.
        """
        # Walk adjacent (previous, current) pairs; only the previous layer's
        # ``a`` is needed, so the first layer's ``input`` is never read.
        for previous, current in zip(self.layers, self.layers[1:]):
            current.h = current.W @ previous.a + current.b
            current.a = current.activation.value(current.h)

    def backward_propogation(self):
        """Placeholder for the backward pass; not implemented yet."""
        pass