In [9]:
import numpy as np

## Weight initializers

In [13]:
class RandomInitializer():
    """Weight initializer that samples from a standard normal distribution."""

    def initialize(self, shape):
        """Return an array of the requested shape filled with N(0, 1) samples.

        Parameters
        ----------
        shape : sequence of int
            Dimensions of the array to create, e.g. ``(n_units, n_units_prev)``.

        Returns
        -------
        numpy.ndarray
            Array with shape ``shape`` drawn with ``np.random.randn`` (uses the
            global NumPy random state).
        """
        # Unpack every dimension rather than only shape[0] and shape[1], so a
        # shape of any rank is honoured instead of being silently truncated.
        # For 2-D shapes the draws are identical to the previous behaviour.
        W = np.random.randn(*shape)
        return W
    
class ZerosInitializer():
    """Initializer that produces an all-zero array."""

    def initialize(self, shape):
        """Build and return a zero-filled array with the requested ``shape``."""
        return np.zeros(shape)

class HeInitializer():
    """Initializer that scales standard-normal samples by ``sqrt(2 / shape[1])``."""

    def initialize(self, shape):
        """Return a ``(shape[0], shape[1])`` array of scaled Gaussian samples.

        ``shape[1]`` is used as the divisor of the variance scale; ``Layer``
        passes ``(n_units, n_units_prev)``, so it is the number of inputs.
        """
        n_rows, n_cols = shape[0], shape[1]
        gaussian = np.random.randn(n_rows, n_cols)
        scale = np.sqrt(2 / n_cols)
        return gaussian * scale


## Activation functions

In [43]:
class RELU():
    """Rectified linear unit: element-wise ``max(Z, 0)``."""

    def activate(self, Z):
        """Return ``Z`` with every negative entry replaced by zero.

        ``np.maximum`` is used instead of ``Z * (Z > 0)`` because the product
        form evaluates ``-inf * 0`` to NaN (and yields ``-0.0`` for ordinary
        negative inputs), which would poison every later computation.
        """
        return np.maximum(Z, 0)

    def derivative(self, Z):
        """Return 1 where ``Z > 0`` and 0 elsewhere (including at ``Z == 0``)."""
        return 1 * (Z > 0)


class Sigmoid():
    """Logistic sigmoid activation ``1 / (1 + exp(-Z))``."""

    def activate(self, Z):
        """Return the element-wise sigmoid of ``Z`` without overflowing.

        ``exp`` is only ever evaluated on non-positive arguments:
        for ``Z >= 0`` the result is ``1 / (1 + exp(-Z))`` and for ``Z < 0``
        the algebraically equal ``exp(Z) / (1 + exp(Z))``. The naive formula
        overflows in ``exp(-Z)`` for large negative ``Z``.

        The result is a NumPy array (0-d for scalar input).
        """
        decay = np.exp(-np.abs(Z))  # always in (0, 1]
        return np.where(Z >= 0, 1 / (1 + decay), decay / (1 + decay))

    def derivative(self, Z):
        """Return ``sigmoid(Z) * (1 - sigmoid(Z))`` element-wise."""
        # Evaluate the activation once and reuse it for both factors.
        A = self.activate(Z)
        return A * (1 - A)
    

class Linear():
    """Identity activation."""

    def activate(self, Z):
        """Return ``Z`` itself, unchanged (no copy is made)."""
        return Z

    def derivative(self, Z):
        """Return an array of ones with the same shape as ``Z``."""
        unit_slope = np.ones(Z.shape)
        return unit_slope
    
    

array([[0.25      , 0.19661193, 0.00664806, 0.04124902]])

## Costs

In [72]:
class BinaryCrossEntropy():
    """Binary cross-entropy cost and its gradient w.r.t. the predictions."""

    # Predictions are limited to [_EPS, 1 - _EPS] so that neither log(0)
    # nor a division by zero can occur.
    _EPS = 10 ** -15

    def _clipped(self, y_pred):
        """Return ``y_pred`` clipped into the closed interval [_EPS, 1 - _EPS]."""
        return np.clip(y_pred, self._EPS, 1 - self._EPS)

    def compute_cost(self, y_pred, y_true):
        """Return the cross-entropy averaged over the examples.

        ``y_pred`` and ``y_true`` are expected to have shape (1, m_examples);
        the number of examples is read from ``y_true.shape[1]``. The sum runs
        along axis 1 with ``keepdims=True``, so the result keeps two
        dimensions.
        """
        n_examples = y_true.shape[1]
        p = self._clipped(y_pred)

        positive_part = y_true * np.log(p)
        negative_part = (1 - y_true) * np.log(1 - p)
        per_example = -(positive_part + negative_part)

        return np.sum(per_example, axis=1, keepdims=True) * (1 / n_examples)

    def derivative(self, y_pred, y_true):
        """Return the element-wise gradient ``-y/p + (1 - y)/(1 - p)``.

        ``p`` is the clipped prediction; the result is not divided by the
        number of examples.
        """
        p = self._clipped(y_pred)

        positive_part = y_true / p
        negative_part = (1 - y_true) / (1 - p)

        return -positive_part + negative_part

## Layers

In [60]:
class Layer():
    """Fully connected layer: ``A = activation(W @ A_prev + b)``.

    Parameters
    ----------
    n_units : int
        Number of units (rows of ``W`` and ``b``).
    activation : callable
        Class or factory called with no arguments; the resulting object must
        provide ``activate(Z)`` and ``derivative(Z)``.
    l2_reg : float, optional
        L2 regularization strength added to ``dW`` as
        ``(l2_reg / batch_size) * W``.
    weight_initializer : callable, optional
        Class or factory called with no arguments; the resulting object must
        provide ``initialize(shape)``. Used by ``initialize`` to create ``W``.

    Notes
    -----
    The method names keep their original spelling (``propogation``) so that
    existing callers continue to work.
    """

    def __init__(self, n_units, activation, l2_reg=0, weight_initializer=HeInitializer):
        self.n_units = n_units
        self.l2_reg = l2_reg

        # Only the instantiated activation object is stored.
        self.activation = activation()

        # Forward-pass cache, filled by forward_propogation.
        self.A_prev = None
        self.Z = None
        self.A = None

        # Parameters. W cannot be created yet because the size of the previous
        # layer is unknown until initialize() is called.
        self.initializer = weight_initializer
        self.W = None
        self.b = ZerosInitializer().initialize((n_units, 1))

        # Gradients, filled by back_propogation.
        self.dZ = None
        self.dA = None
        self.dW = None
        self.db = None

    def initialize(self, n_units_prev):
        """Create ``W`` with shape ``(n_units, n_units_prev)``."""
        shape = (self.n_units, n_units_prev)
        self.W = self.initializer().initialize(shape)

    def forward_propogation(self, A_prev):
        """Compute, cache and return this layer's activations.

        ``A_prev`` is stored for the backward pass. Columns are treated as
        examples (the backward pass reads the batch size from ``Z.shape[1]``).
        ``initialize`` must have been called first so that ``W`` exists.
        """
        # Keep A_prev for backprop.
        self.A_prev = A_prev

        self.Z = np.dot(self.W, A_prev) + self.b
        self.A = self.activation.activate(self.Z)

        return self.A

    def back_propogation(self, W_next=None, dZ_next=None, dA=None):
        """Compute and store ``dA``, ``dZ``, ``dW`` and ``db`` for this layer.

        Parameters
        ----------
        W_next : numpy.ndarray, optional
            Weights of the following layer.
        dZ_next : numpy.ndarray, optional
            ``dZ`` of the following layer.
        dA : numpy.ndarray, optional
            Gradient of the cost w.r.t. this layer's activations. Supply it
            for the output layer, which has no following layer (e.g. the
            value returned by the cost's ``derivative``). When omitted it is
            computed as ``W_next.T @ dZ_next``.

        Raises
        ------
        ValueError
            If neither ``dA`` nor both ``W_next`` and ``dZ_next`` are given.
        """
        if dA is None:
            if W_next is None or dZ_next is None:
                raise ValueError(
                    "back_propogation needs either dA or both W_next and dZ_next"
                )
            dA = np.dot(W_next.T, dZ_next)

        batch_size = self.Z.shape[1]

        # Compute and keep gradients.
        self.dA = dA
        self.dZ = self.dA * self.activation.derivative(self.Z)

        # Regularization contribution to the weight gradient.
        l2_term = (self.l2_reg / batch_size) * self.W

        self.dW = np.dot(self.dZ, self.A_prev.T) * (1 / batch_size) + l2_term
        self.db = np.sum(self.dZ, axis=1, keepdims=True) * (1 / batch_size)
        

In [61]:
# Stand-alone layer instance; it is not referenced again in the visible cells.
l1 = Layer(3, activation=Sigmoid)

# Toy data set: five examples with two features each. The transpose stores
# one example per column, so X has shape (2, 5).
X = np.array([
    [3, 4],
    [0, 3],
    [2, 4],
    [0, 0],
    [3, 0],
]).T

# One binary label per example, shape (1, 5).
y = np.array([[1, 0, 1, 0, 1]])


In [78]:
# Seed the global NumPy generator so the random weight draws, and every
# number derived from them, are the same on each Restart & Run All.
np.random.seed(0)

# Two-layer network: 3 RELU units followed by a single sigmoid output unit.
layer1 = Layer(n_units=3, activation=RELU)
layer2 = Layer(n_units=1, activation=Sigmoid)
costf = BinaryCrossEntropy()

# Size each weight matrix from what actually feeds the layer instead of
# repeating the literals: X has one row per feature, and layer2 consumes
# layer1's units.
layer1.initialize(X.shape[0])
layer2.initialize(layer1.n_units)

# Forward pass.
a1 = layer1.forward_propogation(X)
a2 = layer2.forward_propogation(a1)

# Cost and its gradient with respect to the network output a2.
cost = costf.compute_cost(a2, y)
dA = costf.derivative(a2, y)

In [79]:
# Display the gradient of the cost with respect to the network output a2.
dA

array([[-1.04124473e+00,  1.74107941e+03, -1.00433171e+00,
         2.00000000e+00, -6.67283823e+00]])