In [1]:
from copy import deepcopy
import numpy as np 
from sklearn.datasets import make_classification

from templates import AutoDiffFunction, Layer, Loss, Optimizer

Defining the activation functions

In [2]:
class Sigmoid(AutoDiffFunction):
    def __init__(self) -> None:
        super().__init__()

    def forward(self, x):
        self.saved_for_backward = 1/(1 + np.exp(-x))
        return self.saved_for_backward

    def compute_grad(self, x):
        y = self.saved_for_backward

        return {"x": y*(1-y)}

    def backward(self, dy):
        return dy * self.grad["x"]      


class RelU(AutoDiffFunction):
    def __init__(self) -> None:
        super().__init__()

    def forward(self, x):
        self.saved_for_backward = np.where(x>0.0, 1.0, 0.0)

        return x * self.saved_for_backward

    def compute_grad(self, x):
        return {"x": self.saved_for_backward}

    def backward(self, dy):
        return dy * self.grad["x"]
     
class Softmax(AutoDiffFunction):
    def __init__(self) -> None:
        super().__init__()

    def forward(self, x):
        v = np.exp(x)
        self.saved_for_backward = v

        return v / np.sum(v, axis=1, keepdims=True)

    def compute_grad(self, x):
        pass

    def backward(self, dy):
        return dy * self.grad["x"]


Defining the layers

In [3]:
class FC(Layer):
    def __init__(self, in_dim, out_dim) -> None:
        super().__init__()
        self.initialize_weights(in_dim, out_dim)

    def initialize_weights(self, in_dim, out_dim):
        
        scaling_factor = 1/np.sqrt(in_dim)
        self.weights["w"] = np.random.randn(in_dim, out_dim) / scaling_factor
        self.weights["b"] = np.random.randn(1, out_dim) / scaling_factor

    def compute_grad(self, x):
        
        gradients = {}

        # y = x * w + b        
        # we compute gradients wrt w and x 
        # gradient wrt b is not required explicitly since we know that it's value is 1
        gradients["w"] = self.saved_for_backward["x"].T
        gradients["x"] = self.weights["w"].T

        return gradients


    def forward(self, x):
        
        output = x @ self.weights["w"] + self.weights["b"]
        self.saved_for_backward["x"] = x
        
        return output

    def backward(self, dy):
        #print(dy.shape)
        #print(self.grad["x"].shape)
        #print(self.grad["w"].shape)
        
        dx = dy @ self.grad["x"]
        
        # calculating gradients wrt weights
        dw = self.grad["w"] @ dy
        db = np.sum(dy, axis=0, keepdims=True)

        self.absolute_gradients = {"w": dw, "b": db}

        return dx

    def update_weights(self):
        self.optimizer.step(self)

Defining the loss function

For this particular problem, we require CrossEntropy Loss for classification

In [4]:
x = np.array([[0.94, 0.04, 0.02],[0.3, 0.64, 0.06]])

y = np.array([[1,0,0],[0,1,0]])

z = -y * np.log(x)
print(z)
z = np.sum(z, axis=1)
print(z)
print(np.mean(z))

[[ 0.0618754 -0.        -0.       ]
 [-0.         0.4462871 -0.       ]]
[0.0618754 0.4462871]
0.2540812531732535


In [5]:
class CrossEntropyLossFromLogits(Loss):

    @staticmethod
    def softmax(x):
        v = np.exp(x)

        return v / np.sum(v, axis=1, keepdims=True)

    @staticmethod
    def encode(y): 
        d = len(np.unique(y))
        encoded_y = np.zeros(shape=(len(y), d))

        for i in range(len(y)):
            encoded_y[i,y[i]] = 1

        return encoded_y

    def forward(self, y_pred, y_true):
         
        probabilities = self.softmax(y_pred)
        y_true_encoded = self.encode(y_true)

        loss_value = np.mean(np.sum(- y_true_encoded * np.log(probabilities), axis=1))

        self.saved_for_backward["probabilities"] = probabilities
        self.saved_for_backward["y_true"] = y_true_encoded

        return loss_value

    def compute_grad(self, y_pred, y_true):

        return {"x": self.saved_for_backward["probabilities"] - self.saved_for_backward["y_true"]}        

Creating an optimizer for the loss

In [6]:
class SGD(Optimizer):
    def __init__(self, lr, batch_size=32, lamda=0):
        self.lr = lr
        self.lamda = lamda

    def step(self, layer):

        ## adding 2*lambda*w to gradient if regularisation is there
        for grad_name, _ in layer.absolute_gradients.items():
            layer.absolute_gradients[grad_name] = layer.absolute_gradients[grad_name] + \
                                                        2 * self.lamda * layer.weights[grad_name]


        for weight_name, _ in layer.weights.items():
            layer.weights[weight_name] = layer.weights[weight_name] - self.lr * layer.absolute_gradients[weight_name]

class Nadam(Optimizer):
    def __init__(self, lr, beta_1, beta_2, epsilon):
        self.lr = lr
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        self.t = 0

    def step(self, layer):
        
        # we have 2 parameters to remember m(t) and v(t) for all weights in the layer
        if self.remember == {}:
            for weight_name, weight in layer.weights.items():
                self.remember[weight_name] = {}
                self.remember[weight_name]["v"] = np.zeros_like(weight)
                self.remember[weight_name]["m"] = np.zeros_like(weight)

        for weight_name, weight in layer.weights.items():
            
            self.remember[weight_name]["m"] = self.beta_1 * self.remember[weight_name]["m"] + \
                                                (1 -self.beta_1) * layer.absolute_gradients[weight_name]

            self.remember[weight_name]["v"] = self.beta_2 * self.remember[weight_name]["v"] + \
                                                (1 - self.beta_2) * layer.absolute_gradients[weight_name]**2

            # bias correction step 
            m_hat = self.remember[weight_name]["m"]/(1 - self.beta_1 ** self.t)
            v_hat = self.remember[weight_name]["v"]/(1 - self.beta_2 ** self.t)

            d = self.lr / (np.sqrt(v_hat) + self.epsilon) * (self.beta_1*m_hat + (1-self.beta_1)/
                                                (1-self.beta_1 ** self.t) * layer.absolute_gradients[weight_name]) 

            layer.weights[weight_name] = layer.weights[weight_name] - d

        self.t += 1

Creating the structure for an actual neural network

In [7]:
class NeuralNet():
    def __init__(self, layers) -> None:
        self.layers = layers

    def __call__(self, *args, **kwds):
        return self.forward(*args, **kwds)

    def compile(self, loss, optimizer):
        self.loss = loss

        for layer in self.layers:
            if isinstance(layer, Layer):
                layer.optimizer = deepcopy(optimizer)

    def calculate_loss(self, y_pred, y_true):
        return self.loss(y_pred, y_true)

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)

        return x

    def backward(self):

        gradient = self.loss.backward()
        for layer in reversed(self.layers):
            gradient = layer.backward(gradient)

        return gradient

    def update_weights(self):

        for layer in reversed(self.layers):
            if isinstance(layer, Layer):
                layer.update_weights()

Create a custom classification dataset to test out the function <br> dsdsd

In [8]:
## creating a dummy dataset to test out stuff ##

X, y = make_classification(n_samples=32*6, n_features=20, n_informative=15, n_classes=3)

def create_batches(X, y, batch_size=32):
    batches = []

    for i in range(len(y) // batch_size):
        start_idx = batch_size * i
        end_idx = batch_size * (i + 1)

        batches.append([X[start_idx: end_idx], y[start_idx: end_idx]])

    return batches

In [9]:
## Utility functions ##
def probs_to_labels(y): 
    return np.argmax(y, axis=1)


def encoded_to_labels(y):
    return np.where(y==1)[1]

def accuracy_score(y_pred, y_true):

    pred_labels = probs_to_labels(y_pred)

    return np.sum(pred_labels == y_true) / len(y_true)

batches = create_batches(X, y, batch_size=32)
len(batches)

6

In [10]:
def fit_model(model, batches, loss, optimizer, epochs=10):

    training_stats = []
    num_batches = len(batches)
    
    model.compile(loss=loss, optimizer=optimizer)

    for epoch in range(1, epochs+1):

        total_loss = 0
        total_accuracy = 0

        for X, y in batches:

            preds = model(X)
            total_loss += model.loss(preds, y)
            total_accuracy += accuracy_score(preds, y)

            _ = model.backward()
            model.update_weights()

        loss_per_epoch = total_loss / num_batches
        accuracy = total_accuracy / num_batches

        print(f"Epoch: {epoch} Train Loss: {loss_per_epoch} Train Accuracy: {accuracy}")

        training_stats.append({"Epoch" : epoch, 
                                "Train Loss": loss_per_epoch,
                                "Train Accuracy": accuracy})

    
    return training_stats

In [11]:
# Initializing the model and setting up loss and optimizer
model = NeuralNet([FC(20, 32), RelU(), FC(32, 3)])
optimizer = SGD(lr = 0.001, lamda=1e-3)
loss = CrossEntropyLossFromLogits()

training_stats = fit_model(model, batches, loss, optimizer, epochs=50)

Epoch: 1 Train Loss: 34.134649303259756 Train Accuracy: 0.28125
Epoch: 2 Train Loss: 14.080383490859802 Train Accuracy: 0.3958333333333333
Epoch: 3 Train Loss: 9.013354405502621 Train Accuracy: 0.4583333333333333
Epoch: 4 Train Loss: 5.896979104371906 Train Accuracy: 0.5625
Epoch: 5 Train Loss: 4.565701865042431 Train Accuracy: 0.6302083333333334
Epoch: 6 Train Loss: 3.797673161103596 Train Accuracy: 0.6875
Epoch: 7 Train Loss: 3.3453340828833498 Train Accuracy: 0.7395833333333334
Epoch: 8 Train Loss: 2.9882099247704694 Train Accuracy: 0.6979166666666666
Epoch: 9 Train Loss: 2.608024725526159 Train Accuracy: 0.7239583333333334
Epoch: 10 Train Loss: 2.4205949508906737 Train Accuracy: 0.7447916666666666
Epoch: 11 Train Loss: 2.16154099837062 Train Accuracy: 0.765625
Epoch: 12 Train Loss: 1.9834257504833064 Train Accuracy: 0.78125
Epoch: 13 Train Loss: 1.8410954369710264 Train Accuracy: 0.7604166666666666
Epoch: 14 Train Loss: 1.7361605808951757 Train Accuracy: 0.7552083333333334
Epoch: 1

Load MNIST Dataset