## Session 36 
Aug 20, 2021

In [5]:
import numpy as np

In [6]:
class BinaryCrossEntropy:
    """Summed binary cross-entropy loss and its gradient w.r.t. the predictions.

    Labels may have the same shape as the predictions, or hold one value per
    prediction row in any orientation (``(n,)``, ``(1, n)`` or ``(n, 1)``).
    They are aligned to the prediction rows before use, so a ``(1, n)`` label
    row can no longer broadcast against ``(n, 1)`` predictions into an
    ``(n, n)`` matrix and silently inflate the loss.
    """

    # Keeps log() in __call__ and the divisions in grad_input away from 0.
    EPS = 1e-10

    @staticmethod
    def _align(y, ypred):
        """Return ``y`` as a float array that lines up with ``ypred``.

        Raises:
            ValueError: if the number of labels matches neither the number of
                prediction rows nor the number of prediction entries.
        """
        y = np.asarray(y, dtype=float)
        if y.ndim == 0 or y.shape == ypred.shape:
            return y
        if ypred.ndim >= 1 and y.size == ypred.shape[0]:
            # One label per row: turn it into a column so it spreads over
            # every output column of that row.
            return y.reshape((-1,) + (1,) * (ypred.ndim - 1))
        if y.size == ypred.size and max(y.shape) == y.size:
            # A flat vector carrying exactly one label per prediction entry.
            return y.reshape(ypred.shape)
        raise ValueError(
            f"Cannot align labels of shape {y.shape} with predictions of shape {ypred.shape}"
        )

    def __call__(self, ytrue, ypred):
        """Return the cross-entropy summed over every prediction entry.

        ytrue: labels (see the class docstring for accepted shapes)
        ypred: predicted probabilities
        """
        ypred = np.asarray(ypred, dtype=float)
        ytrue = self._align(ytrue, ypred)
        eps = self.EPS
        return -np.sum(ytrue * np.log(ypred + eps) + (1 - ytrue) * np.log(1 - ypred + eps))

    def grad_input(self, y, ypred):
        """Return dLoss/dypred, an array with the same shape as ``ypred``.

        For labels that are exactly 0 or 1 this is ``1/(1-ypred)`` and
        ``-1/ypred`` respectively; other label values get the general
        ``(1-y)/(1-ypred) - y/ypred`` instead of a zero gradient.
        """
        ypred = np.asarray(ypred, dtype=float)
        y = self._align(y, ypred)
        # Clipping leaves ordinary probabilities untouched but keeps the
        # gradient finite when a prediction saturates at exactly 0 or 1.
        p = np.clip(ypred, self.EPS, 1 - self.EPS)
        return (1 - y) / (1 - p) - y / p

In [7]:
class Sigmoid:
    """Element-wise logistic sigmoid ``1 / (1 + e**-x)`` and its Jacobian."""

    def __call__(self, X):
        """Shorthand for :meth:`eval`."""
        return self.eval(X)

    def eval(self, X):
        """Return the sigmoid of every entry of ``X``.

        Only ``exp(-|x|)`` is ever computed, which lies in (0, 1], so large
        negative inputs no longer overflow the way ``e**-x`` does.
        """
        X = np.asarray(X, dtype=float)
        decay = np.exp(-np.abs(X))
        # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- the same value.
        out = np.where(X >= 0, 1 / (1 + decay), decay / (1 + decay))
        # [()] hands back a scalar for scalar input and the array otherwise.
        return out[()]

    def grad_input(self, X):
        """Return the Jacobian of the sigmoid at ``X`` as a diagonal matrix.

        The row of derivatives ``s * (1 - s)`` is broadcast against an
        identity matrix of size ``X.shape[1]``, which places it on the
        diagonal. NOTE(review): this is only a Jacobian when ``X`` has a
        single row; multi-row input is not handled here.
        """
        X = np.asarray(X, dtype=float)
        s = self.eval(X)  # evaluate once and reuse
        return np.identity(X.shape[1]) * (s * (1 - s))

In [31]:
# Sanity check: multiplying the identity by a vector broadcasts the vector
# onto the diagonal -- the same trick Sigmoid.grad_input uses.
s = Sigmoid()
I, b = np.identity(2), np.array([2, 3])
print(b, I, I*b, sep="\n")
s.grad_input(np.array([[2, 3]])).shape

[2 3]
[[1. 0.]
 [0. 1.]]
[[2. 0.]
 [0. 3.]]


(2, 2)

In [42]:
class Dot:
    """Affine map ``X.dot(W) + b`` with randomly initialised parameters."""

    def __init__(self, input_size, units):
        """Draw W of shape (input_size, units) and b of shape (1, units)."""
        # W is drawn before b; keep this order so a seeded run is unchanged.
        self.W = np.random.randn(input_size, units)
        self.b = np.random.randn(1, units)

    def __call__(self, X):
        """Shorthand for :meth:`eval`."""
        return self.eval(X)

    def eval(self, X):
        """Return ``X.dot(W) + b``."""
        projected = X.dot(self.W)
        return projected + self.b

    def grad_input(self, X):
        """Return the derivative of the output w.r.t. the input: ``W.T``."""
        return self.W.T

    def grad_w(self, X):
        """Return the derivative of the output w.r.t. W.

        The result has shape (input_size, units, units); slice ``i`` is the
        identity scaled by ``X[0][i]``. Only the first row of ``X`` is read.
        """
        n_inputs = self.W.shape[0]
        eye = np.identity(self.b.shape[1])
        return np.stack([eye * X[0][i] for i in range(n_inputs)], axis=0)

    def grad_b(self):
        """Return the derivative of the output w.r.t. b: a units x units identity."""
        return np.identity(self.b.shape[1])

    def get_parameter_shape(self):
        """Return ``(W.shape, b.shape)``."""
        return self.W.shape, self.b.shape


In [43]:
class Dense:
    """Fully connected layer: a ``Dot`` affine map followed by an activation."""

    def __init__(self, input_size, activation, units):
        """
        input_size: no. of neurons in previous layer
        activation: some activation function
        units: no. of neurons in current layer
        """
        self.dot = Dot(input_size, units)
        self.units = units
        self.activation = activation

    def eval(self, X):
        """Return ``activation(dot(X))``."""
        pre_activation = self.dot(X)
        return self.activation(pre_activation)

    def grad_input(self, X):
        """Chain rule: activation Jacobian times the affine map's input Jacobian."""
        d_act = self.activation.grad_input(self.dot(X))
        d_dot = self.dot.grad_input(X)
        return d_act.dot(d_dot)

    def grad_parameters(self, X):
        """Return (d_output/d_W, d_output/d_b) for this layer at ``X``.

        The weight gradient comes out of ``dot`` with the activation axis
        first; axes 0 and 1 are swapped before returning.
        """
        d_act = self.activation.grad_input(self.dot(X))
        d_act_dW = d_act.dot(self.dot.grad_w(X))
        d_act_db = d_act.dot(self.dot.grad_b())
        return np.transpose(d_act_dW, (1, 0, 2)), d_act_db

    def update(self, grad, optimizer):
        """ grad: (dL_dwi, dL_dbi)"""
        affine = self.dot
        affine.W = optimizer.minimize(affine.W, grad[0])
        affine.b = optimizer.minimize(affine.b, grad[1])

    def get_parameter_shape(self):
        """Return ``(W.shape, b.shape)`` of the underlying affine map."""
        return self.dot.get_parameter_shape()

    def get_total_parameters(self):
        """Return the number of scalar parameters (weights plus biases)."""
        shapes = self.dot.get_parameter_shape()
        n_weights, n_biases = (np.prod(shape) for shape in shapes)
        return n_weights + n_biases

In [10]:
class GradientDescentOptimizer:
    """Plain gradient descent: step against the gradient, scaled by ``lr``."""

    def __init__(self, lr):
        """lr: learning rate multiplied into every gradient step."""
        self.lr = lr

    def minimize(self, w, grad):
        """Return ``w - lr * grad``; ``w`` and ``grad`` must share a shape."""
        assert w.shape == grad.shape, f"Shape mismatch w shape {w.shape} != grad shape {grad.shape}"
        step = self.lr * grad
        return w - step

In [44]:
class Sequential:
    """A stack of layers trained end-to-end against a single loss object.

    Each layer must provide ``eval``, ``grad_input``, ``grad_parameters``,
    ``update``, ``get_parameter_shape`` and ``get_total_parameters``; the loss
    must be callable as ``loss(y, ypred)`` and provide ``grad_input``.
    """

    def __init__(self, loss):
        """loss: loss object used for reporting and for the output gradient."""
        self.layers = []
        self.loss = loss

    def add(self, layer):
        """Append ``layer`` to the end of the stack."""
        self.layers.append(layer)

    def forward_propagation(self, X):
        """Run ``X`` through every layer, caching outputs and local gradients.

        Returns:
            (outputs, grads): ``outputs[i]`` is the output of layer ``i`` and
            ``grads[i]`` is a dict with keys ``'input'``, ``'w'`` and ``'b'``
            holding that layer's gradients evaluated at its own input.
        """
        output = X
        outputs = []
        grads = []
        for layer in self.layers:
            g = {}
            # Gradients are taken at the layer's input, before it is replaced
            # by the layer's output below.
            g['input'] = layer.grad_input(output)
            g['w'], g['b'] = layer.grad_parameters(output)
            grads.append(g)
            output = layer.eval(output)
            outputs.append(output)
        return outputs, grads

    def back_propagate(self, grads, outputs, y):
        """Update every layer, walking from the last layer to the first.

        Uses the optimizer stored on ``self.optimizer`` by :meth:`fit`.
        """
        grad_loss = self.loss.grad_input(y, outputs[-1])  # dL/dlast_layer_output
        for layer, grad in reversed(list(zip(self.layers, grads))):
            dL_dwi = grad_loss.dot(grad['w'])
            dL_dbi = grad_loss.dot(grad['b'])
            # The weight gradient carries an extra leading axis from the
            # matrix product; [0] drops it before handing it to the layer.
            layer.update((dL_dwi[0], dL_dbi), self.optimizer)
            # Uses the cached (pre-update) input gradient of this layer.
            grad_loss = grad_loss.dot(grad['input'])  # update grad loss for prev layer

    def fit(self, X, y, epochs, optimizer, learning_rate, verbose=1):
        """Train for ``epochs`` full passes over ``(X, y)``.

        optimizer: optimizer class; instantiated as ``optimizer(learning_rate)``
        verbose: 1 rewrites a progress line every epoch, 0 prints one summary
            line after the last epoch. Nothing is printed when ``epochs`` is 0.

        The reported loss is computed from the forward pass made before that
        epoch's parameter update.
        """
        self.optimizer = optimizer(learning_rate)
        outputs = None  # stays None when the loop body never runs
        for epoch in range(1, epochs + 1):
            outputs, grads = self.forward_propagation(X)
            self.back_propagate(grads, outputs, y)
            if verbose == 1:
                print(f"\rEpoch: {epoch} Loss: {self.loss(y, outputs[-1])}", end="")
        if verbose == 0 and outputs is not None:
            # 1-based epoch count, matching the verbose=1 progress line.
            print(f"Epoch: {epochs} Loss: {self.loss(y, outputs[-1])}")

    def eval(self, X):
        """Return the output of the last layer for ``X``.

        Only ``layer.eval`` is called, so no gradients are computed. A model
        without layers returns ``X`` unchanged.
        """
        output = X
        for layer in self.layers:
            output = layer.eval(output)
        return output

    def summary(self):
        """Print a table of per-layer parameter shapes and counts."""
        # Imported here so that only summary() needs tabulate installed.
        from tabulate import tabulate
        headers = ["#", "Layer Type", "W.shape", "b.shape", "Total parameters"]
        table = []
        total_p = 0
        for i, layer in enumerate(self.layers):
            w_shape, b_shape = layer.get_parameter_shape()
            p = layer.get_total_parameters()  # total parameters of a layer
            table.append([i+1, layer.__class__.__name__, w_shape, b_shape, p])
            total_p += p
        print(tabulate(table, headers, tablefmt="pretty"))
        print("Total no. of model parameters", total_p)
            

In [45]:
# Seed NumPy's global generator before the first random draw: Dot takes its
# initial weights from np.random, so without a fixed seed every
# Restart & Run All starts from a different model and different numbers.
np.random.seed(36)

# 2 -> 3 -> 2 -> 1 sigmoid network trained with binary cross-entropy.
model = Sequential(BinaryCrossEntropy())
model.add(Dense(input_size = 2, activation=Sigmoid(), units=3))
model.add(Dense(input_size = 3, activation=Sigmoid(), units=2))
model.add(Dense(input_size = 2, activation=Sigmoid(), units=1))

In [46]:
# Print a per-layer table of W/b shapes and parameter counts, plus the total.
model.summary()

+---+------------+---------+---------+------------------+
| # | Layer Type | W.shape | b.shape | Total parameters |
+---+------------+---------+---------+------------------+
| 1 |   Dense    | (2, 3)  | (1, 3)  |        9         |
| 2 |   Dense    | (3, 2)  | (1, 2)  |        8         |
| 3 |   Dense    | (2, 1)  | (1, 1)  |        3         |
+---+------------+---------+---------+------------------+
Total no. of model parameters 20


In [47]:
# Dot.grad_w only reads the first row of X, so keep this at a single sample.
n_samples = 1
n_features = 2
X = np.random.randn(n_samples, n_features)
# One label per row, shaped like the (n_samples, 1) predictions so that the
# loss compares them element by element instead of broadcasting.
y = np.random.choice(a=[0,1], size=(n_samples, 1))
print("X:", X.shape, "Y:", y.shape)
ypred = model.eval(X)

# before fitting/ training
print("Loss:", model.loss(y, ypred))
print("Ypred:", ypred.shape, ypred)

X: (1, 2) Y: (1, 1)
Loss: 0.24175454244267242
Ypred: (1, 1) [[0.2147511]]


In [48]:
# Train with plain gradient descent; verbose=1 rewrites a single progress
# line (carriage return, no newline) after every epoch.
model.fit(X, y, epochs=1500, optimizer=GradientDescentOptimizer, learning_rate=0.008, verbose=1)

Epoch: 1500 Loss: 0.048172993179788345

In [49]:
# Re-evaluate the same sample with the trained weights for comparison with
# the pre-training prediction.
ypred = model.eval(X) # after training
print(ypred)

[[0.04700436]]


In [50]:
# Loss after training. fit() reports the loss of the forward pass made before
# each update, so this value reflects one more update than its last line.
model.loss(y, ypred)

0.04814494977918185