In [3]:
import numpy as np

## Weight initializers

In [4]:
class RandomInitializer():
    """Weight initializer that samples from a standard normal distribution."""

    def initialize(self, shape):
        """Return an array of the requested shape filled with N(0, 1) samples.

        Args:
            shape: sequence of ints giving the array dimensions. Any number
                of dimensions is accepted.

        Returns:
            np.ndarray with dimensions ``tuple(shape)`` drawn via
            ``np.random.randn``.
        """
        # Unpack the whole shape rather than indexing shape[0] / shape[1]:
        # indexing silently dropped extra dimensions and failed on 1-D shapes.
        W = np.random.randn(*shape)
        return W
    
class ZerosInitializer():
    """Initializer that produces all-zero arrays."""

    def initialize(self, shape):
        """Build and return a zero-filled array with dimensions ``shape``."""
        return np.zeros(shape)

class HeInitializer():
    """Initializer that scales standard normal samples by sqrt(2 / shape[1])."""

    def initialize(self, shape):
        """Return a ``(shape[0], shape[1])`` array of scaled normal samples.

        Args:
            shape: two-element sequence ``(rows, cols)``. The scale factor is
                derived from ``cols`` (the second entry).

        Returns:
            np.ndarray of shape ``(rows, cols)``.
        """
        n_rows, n_cols = shape[0], shape[1]
        scale = np.sqrt(2 / n_cols)
        return np.random.randn(n_rows, n_cols) * scale


## Activation functions

In [5]:
class RELU():
    """Rectified linear unit activation."""

    def activate(self, Z):
        """Return ``max(Z, 0)`` element-wise.

        ``np.maximum`` is used instead of ``Z * (Z > 0)`` because the product
        form turns ``-inf`` into NaN (``-inf * 0``) and yields ``-0.0`` for
        negative inputs.
        """
        return np.maximum(Z, 0)

    def derivative(self, Z):
        """Return 1 where ``Z > 0`` and 0 elsewhere (including at ``Z == 0``)."""
        return 1 * (Z > 0)


class Sigmoid():
    """Logistic sigmoid activation."""

    def activate(self, Z):
        """Return ``1 / (1 + exp(-Z))`` element-wise without overflowing.

        The value is computed as ``exp(-log(1 + exp(-Z)))``, where the inner
        logarithm comes from ``np.logaddexp(0, -Z)``. That avoids evaluating
        ``exp(-Z)`` directly, which overflows for large negative ``Z``.
        """
        return np.exp(-np.logaddexp(0, -Z))

    def derivative(self, Z):
        """Return the sigmoid derivative ``s(Z) * (1 - s(Z))`` element-wise."""
        # Evaluate the activation once and reuse it for both factors.
        A = self.activate(Z)
        return A * (1 - A)
    

class Linear():
    """Identity activation: the output equals the pre-activation."""

    def activate(self, Z):
        """Return ``Z`` itself (no copy is made)."""
        return Z

    def derivative(self, Z):
        """Return an array of ones with the same shape as ``Z``."""
        dims = Z.shape
        return np.ones(dims)
    
    

## Costs

In [6]:
class BinaryCrossEntropy():
    """Binary cross-entropy cost and its gradient with respect to predictions."""

    def _clipped(self, y_pred):
        """Clamp predictions into [eps, 1 - eps] to keep logs and divisions finite."""
        eps = 10 ** -15
        return np.clip(y_pred, eps, 1 - eps)

    def compute_cost(self, y_pred, y_true):
        """Return the mean cross-entropy over the examples.

        Args:
            y_pred: predicted probabilities, shape (1, m_examples).
            y_true: binary labels, shape (1, m_examples).

        Returns:
            Array produced by summing along axis 1 with ``keepdims=True`` and
            dividing by the number of examples (``y_true.shape[1]``).
        """
        n_examples = y_true.shape[1]
        p = self._clipped(y_pred)

        per_example = -(y_true * np.log(p) + (1 - y_true) * np.log(1 - p))
        total = np.sum(per_example, axis=1, keepdims=True)

        return total * (1 / n_examples)

    def derivative(self, y_pred, y_true):
        """Return the element-wise gradient of the cost w.r.t. ``y_pred``.

        The result is not divided by the number of examples.
        """
        p = self._clipped(y_pred)
        return - (y_true / p) + (1 - y_true) / (1 - p)

## Layers

In [59]:
class Layer():
    """Fully connected layer holding parameters, forward cache and gradients.

    Typical use: construct, call ``initialize(n_units_prev)`` to allocate the
    weights, then alternate ``forward_propogation`` and ``back_propogation``.
    An optimizer reads ``dW``/``db`` and rewrites ``W``/``b`` (and the moment
    buffers ``V_*``/``S_*``) afterwards.
    """

    def __init__(self, n_units, activation, l2_reg=0, weight_initializer=HeInitializer):
        """Set up an un-initialized layer.

        Args:
            n_units: number of units (rows of ``W`` and ``b``).
            activation: activation *class*; it is instantiated here and must
                provide ``activate(Z)`` and ``derivative(Z)``.
            l2_reg: L2 regularization strength applied to ``W`` in backprop.
            weight_initializer: initializer *class* providing
                ``initialize(shape)``; instantiated in ``initialize``.
        """
        self.n_units = n_units
        self.l2_reg = l2_reg
        self.activation = activation()

        # forward cache (filled by forward_propogation)
        self.A_prev = None
        self.Z = None
        self.A = None

        # parameters; W is created in initialize() once the previous layer's
        # width is known
        self.initializer = weight_initializer
        self.W = None
        self.b = ZerosInitializer().initialize((n_units, 1))

        # first-moment buffers used by Momentum and Adam
        self.V_dW = None
        self.V_db = ZerosInitializer().initialize((n_units, 1))
        # second-moment buffers used by Adam; named S_dW (capital W) to match
        # initialize() and the optimizer that reads it
        self.S_dW = None
        self.S_db = ZerosInitializer().initialize((n_units, 1))

        # gradients (filled by back_propogation)
        self.dZ = None
        self.dA = None

        self.dW = None
        self.db = None

    def initialize(self, n_units_prev):
        """Allocate ``W`` and the weight moment buffers.

        Args:
            n_units_prev: number of units (or input features) feeding this
                layer; ``W`` gets shape ``(n_units, n_units_prev)``.
        """
        shape = (self.n_units, n_units_prev)
        self.W = self.initializer().initialize(shape)

        # zeroed moment buffers in case Momentum or Adam is used
        self.V_dW = ZerosInitializer().initialize(shape)
        self.S_dW = ZerosInitializer().initialize(shape)

    def forward_propogation(self, A_prev):
        """Compute and cache ``Z = W . A_prev + b`` and ``A = activation(Z)``.

        Args:
            A_prev: activations of the previous layer (or the input data).

        Returns:
            The activations ``A`` of this layer.
        """
        # keep A_prev for backprop
        self.A_prev = A_prev

        self.Z = np.dot(self.W, A_prev) + self.b
        self.A = self.activation.activate(self.Z)

        return (self.A)

    def back_propogation(self, W_next=None, dZ_next=None, dA_final=None):
        """Compute and store ``dA``, ``dZ``, ``dW`` and ``db`` for this layer.

        For the output layer pass ``dA_final`` (the cost derivative with
        respect to this layer's activations). For any other layer pass the
        next layer's ``W_next`` and ``dZ_next``. If ``dA_final`` is given it
        takes precedence.

        Raises:
            ValueError: if ``dA_final`` is missing and either ``W_next`` or
                ``dZ_next`` is missing as well.
        """
        # number of columns of the cached pre-activation = examples in batch
        batch_size = self.Z.shape[1]

        # check for valid input
        if dA_final is None:
            if W_next is None or dZ_next is None:
                raise ValueError("Either both W_next and dZ_next must be provided, or dA_final must be provided.")

        # dA_final is the special case where this layer is the last one and
        # the gradient comes straight from the cost function
        if dA_final is not None:
            self.dA = dA_final
        else:
            self.dA = np.dot(W_next.T, dZ_next)

        self.dZ = self.dA * self.activation.derivative(self.Z)

        # L2 regularization contribution to the weight gradient
        l2_term = (self.l2_reg / batch_size) * self.W

        self.dW = np.dot(self.dZ, self.A_prev.T) * (1 / batch_size) + l2_term
        self.db = np.sum(self.dZ, axis=1, keepdims=True) * (1 / batch_size)
        

## Optimizers

In [115]:
class GradientDescent():
    """Plain gradient descent: parameters move against their gradients."""

    def __init__(self, learning_rate):
        """Store the step size and start the step counter at 1."""
        self.learning_rate = learning_rate
        self.counter = 1

    def update(self, layer):
        """Replace ``layer.W`` and ``layer.b`` with their updated values.

        Reads ``layer.dW`` and ``layer.db``; new arrays are assigned rather
        than modifying the existing ones in place.
        """
        lr = self.learning_rate
        layer.W = layer.W - lr * layer.dW
        layer.b = layer.b - lr * layer.db

    def tick(self):
        """Advance the step counter by one."""
        self.counter = self.counter + 1
        

class Momentum():
    """Gradient descent with an exponentially weighted average of gradients."""

    def __init__(self, learning_rate, beta=0.9, bias_correction=False):
        """Configure the optimizer.

        Args:
            learning_rate: step size.
            beta: decay rate of the running gradient average.
            bias_correction: if True, divide the running average by
                ``1 - beta ** counter`` before taking the step.
        """
        self.counter = 1
        self.learning_rate = learning_rate
        self.beta = beta

        self.bias_correction = bias_correction
        # not used by update(); kept so the attribute remains available
        self.epsilon = 10 ** -8

    def update(self, layer):
        """Refresh the layer's velocity buffers and step ``W`` and ``b``.

        Reads ``layer.dW``/``layer.db``, rewrites ``layer.V_dW``/``layer.V_db``
        with the new (uncorrected) running averages, and assigns new arrays
        to ``layer.W``/``layer.b``.
        """
        # compute new velocities
        layer.V_dW = self.beta * layer.V_dW + (1 - self.beta) * layer.dW
        layer.V_db = self.beta * layer.V_db + (1 - self.beta) * layer.db

        step_W = layer.V_dW
        step_b = layer.V_db

        if self.bias_correction:
            # Correct a local copy only. Writing the corrected value back into
            # the layer would feed it into the next running average and
            # compound the correction on every step.
            correction = 1 - self.beta ** self.counter
            step_W = layer.V_dW / correction
            step_b = layer.V_db / correction

        # update params
        layer.W = layer.W - self.learning_rate * step_W
        layer.b = layer.b - self.learning_rate * step_b

    def tick(self):
        """Advance the step counter used for bias correction."""
        self.counter += 1
        

class Adam():
    """Adam optimizer: momentum-style first moment plus RMSprop-style second moment."""

    def __init__(self, alpha, beta1=0.9, beta2=0.99, bias_correction=False):
        """Configure the optimizer.

        Args:
            alpha: step size.
            beta1: decay rate of the first-moment (gradient) average.
            beta2: decay rate of the second-moment (squared gradient) average.
            bias_correction: if True, divide each moment by
                ``1 - beta ** counter`` before taking the step.
        """
        self.counter = 1

        self.alpha = alpha
        self.beta1 = beta1  # Momentum
        self.beta2 = beta2  # RMSprop

        self.bias_correction = bias_correction

        # added to the denominator of the step to avoid division by zero
        self.epsilon = 10 ** -8

    def update(self, layer):
        """Refresh the layer's moment buffers and step ``W`` and ``b``.

        Reads ``layer.dW``/``layer.db``, rewrites ``layer.V_*`` and
        ``layer.S_*`` with the new (uncorrected) running averages, and
        assigns new arrays to ``layer.W``/``layer.b``.
        """
        # compute new first moments
        layer.V_dW = self.beta1 * layer.V_dW + (1 - self.beta1) * layer.dW
        layer.V_db = self.beta1 * layer.V_db + (1 - self.beta1) * layer.db

        # compute new second moments
        layer.S_dW = self.beta2 * layer.S_dW + (1 - self.beta2) * np.square(layer.dW)
        layer.S_db = self.beta2 * layer.S_db + (1 - self.beta2) * np.square(layer.db)

        V_dW, V_db = layer.V_dW, layer.V_db
        S_dW, S_db = layer.S_dW, layer.S_db

        if self.bias_correction:
            # Correct local copies only. Storing the corrected moments back
            # on the layer would feed them into the next running averages and
            # compound the correction on every step.
            v_correction = 1 - self.beta1 ** self.counter
            s_correction = 1 - self.beta2 ** self.counter

            V_dW = V_dW / v_correction
            V_db = V_db / v_correction

            S_dW = S_dW / s_correction
            S_db = S_db / s_correction

        # update parameters
        layer.W = layer.W - self.alpha * (V_dW / (np.sqrt(S_dW) + self.epsilon))
        layer.b = layer.b - self.alpha * (V_db / (np.sqrt(S_db) + self.epsilon))

    def tick(self):
        """Advance the step counter used for bias correction."""
        self.counter += 1
        
    
        
        
        

In [116]:
# Stand-alone sigmoid layer with three units.
l1 = Layer(3, activation=Sigmoid)

# Toy inputs: written one example per row (two values each), then transposed
# so that X has shape (2, 5) with one column per example.
X = np.array([
    [3, 4],
    [0, 3],
    [2, 4],
    [0, 0],
    [3, 0],
]).T

# One binary label per example, shape (1, 5).
y = np.array([[1, 0, 1, 0, 1]])


In [131]:
# Two-layer network: 3 RELU units followed by a single sigmoid output unit.
layer1 = Layer(n_units=3, activation=RELU)
layer2 = Layer(1, Sigmoid)
costf = BinaryCrossEntropy()

# Allocate the weights; each layer is told the width of whatever feeds it.
layer1.initialize(2)
layer2.initialize(3)

alpha = 0.01
epochs = 100

# Print the cost about ten times per run. The lower bound of 1 keeps the
# modulo in the loop from dividing by zero when epochs < 10.
log_every = max(1, epochs // 10)

optimizer = Adam(alpha)

for i in range(epochs+1):
    # forward pass
    a1 = layer1.forward_propogation(X)
    a2 = layer2.forward_propogation(a1)

    cost = costf.compute_cost(a2, y)
    if i % log_every == 0:
        print("i = {}, cost = {}".format(i, cost))

    # gradient of the cost with respect to the network output
    dA_final = costf.derivative(a2, y)

    # back_prop: output layer first, then the hidden layer using the output
    # layer's weights and dZ
    layer2.back_propogation(dA_final=dA_final)
    layer1.back_propogation(layer2.W, layer2.dZ)

    # update params; tick once per iteration after both layers are updated
    optimizer.update(layer1)
    optimizer.update(layer2)
    optimizer.tick()
    

i = 0, cost = [[0.68381745]]
i = 10, cost = [[0.56138149]]
i = 20, cost = [[0.47808063]]
i = 30, cost = [[0.41353049]]
i = 40, cost = [[0.34077586]]
i = 50, cost = [[0.26387362]]
i = 60, cost = [[0.19297757]]
i = 70, cost = [[0.13953807]]
i = 80, cost = [[0.11494505]]
i = 90, cost = [[0.09826149]]
i = 100, cost = [[0.08696589]]


In [105]:
# Forward pass through both layers with their current parameters.
# NOTE(review): this cell's execution count is lower than the training cell's,
# so the saved output below may come from an earlier run — re-run to refresh.
a1 = layer1.forward_propogation(X)
a2 = layer2.forward_propogation(a1)

# Bare expression so the notebook displays the output-layer activations.
a2

array([[3.39771970e-103, 1.62133231e-061, 1.14811064e-107,
        1.00000000e+000, 1.00000000e+000]])

In [66]:
# Display the current weight matrix of the first layer.
layer1.W

array([[ 1.66965741,  0.0046513 ],
       [-0.03837817, -0.28983827],
       [-1.48759324,  0.12612852]])