In [11]:
from copy import deepcopy
import numpy as np 
from sklearn.datasets import make_classification

from templates import AutoDiffFunction, Layer, Loss, Optimizer

# Activation Functions

In [12]:
class Sigmoid(AutoDiffFunction):
    """Element-wise logistic sigmoid ``1 / (1 + exp(-x))``.

    The forward output is cached in ``self.saved_for_backward`` because the
    local derivative can be written purely in terms of it: ``y * (1 - y)``.
    """

    def __init__(self) -> None:
        super().__init__()

    def forward(self, x):
        """Return ``sigmoid(x)`` element-wise and cache it for the backward pass.

        The value is built from ``exp(-|x|)``, which never exceeds 1, so inputs
        of large magnitude cannot overflow ``np.exp`` the way the textbook form
        ``1 / (1 + exp(-x))`` does for very negative ``x``.
        """
        decay = np.exp(-np.abs(x))
        # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- the same function.
        self.saved_for_backward = np.where(
            x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay)
        )
        return self.saved_for_backward

    def compute_grad(self, x):
        """Return the local derivative ``{"x": y * (1 - y)}`` using the cached output ``y``."""
        y = self.saved_for_backward

        return {"x": y * (1 - y)}

    def backward(self, dy):
        """Chain rule: scale the upstream gradient ``dy`` by the local derivative.

        NOTE(review): ``self.grad`` is never assigned in this class; presumably
        the ``AutoDiffFunction`` base stores the result of ``compute_grad``
        there -- confirm in ``templates``.
        """
        return dy * self.grad["x"]


class RelU(AutoDiffFunction):
    """Rectified linear unit: passes positive inputs through and zeroes the rest."""

    def __init__(self) -> None:
        super().__init__()

    def forward(self, x):
        """Return ``x`` multiplied by a 0/1 mask of its strictly positive entries.

        The mask is cached in ``self.saved_for_backward``; it doubles as the
        local derivative of the activation.
        """
        positive_mask = np.where(x > 0.0, 1.0, 0.0)
        self.saved_for_backward = positive_mask

        return x * positive_mask

    def compute_grad(self, x):
        """Return the cached 0/1 mask as the derivative with respect to ``x``."""
        local_grad = self.saved_for_backward
        return {"x": local_grad}

    def backward(self, dy):
        """Scale the upstream gradient ``dy`` by the stored local derivative."""
        local_grad = self.grad["x"]
        return dy * local_grad
    
class Tanh(AutoDiffFunction):
    """Element-wise hyperbolic tangent activation.

    The forward output ``y`` is cached in ``self.saved_for_backward`` because
    the local derivative is ``1 - y**2``.
    """

    def __init__(self) -> None:
        super().__init__()

    def forward(self, x):
        """Return ``tanh(x)`` element-wise and cache it for the backward pass.

        ``np.tanh`` is used instead of the explicit
        ``(e^x - e^-x) / (e^x + e^-x)`` ratio: for inputs of large magnitude
        both exponentials in that ratio overflow to ``inf`` and the quotient
        becomes ``nan``, whereas ``np.tanh`` saturates at +/-1.
        """
        self.saved_for_backward = np.tanh(x)

        return self.saved_for_backward

    def compute_grad(self, x):
        """Return the local derivative ``{"x": 1 - y**2}`` using the cached output ``y``."""
        y = self.saved_for_backward

        return {"x": 1 - y**2}

    def backward(self, dy):
        """Chain rule: scale the upstream gradient ``dy`` by the local derivative.

        NOTE(review): ``self.grad`` is never assigned in this class; presumably
        the ``AutoDiffFunction`` base stores the result of ``compute_grad``
        there -- confirm in ``templates``.
        """
        return dy * self.grad["x"]
    
class Softmax(AutoDiffFunction):
    """Row-wise softmax over ``axis=1`` of a 2-D input.

    The probabilities produced by ``forward`` are cached in
    ``self.saved_for_backward`` and reused by ``backward``.
    """

    def __init__(self) -> None:
        super().__init__()

    def forward(self, x):
        """Return the softmax of every row of ``x`` and cache the result.

        Each row is shifted by its maximum before exponentiating. Softmax is
        invariant to that shift, and it keeps ``np.exp`` from overflowing to
        ``inf`` (which would turn the normalised row into ``nan``).
        """
        shifted = x - np.max(x, axis=1, keepdims=True)
        v = np.exp(shifted)
        self.saved_for_backward = v / np.sum(v, axis=1, keepdims=True)

        return self.saved_for_backward

    def compute_grad(self, x):
        """Return the cached probabilities under the key ``"x"``.

        Unlike the element-wise activations, the softmax Jacobian is not
        diagonal, so no element-wise local derivative exists. The Jacobian is
        never materialised; ``backward`` applies it directly from the
        probabilities returned here.
        """
        return {"x": self.saved_for_backward}

    def backward(self, dy):
        """Return the gradient with respect to the softmax input.

        For one row with probabilities ``s`` and upstream gradient ``dy`` the
        Jacobian-vector product is ``s * (dy - sum(dy * s))``.
        """
        probabilities = self.saved_for_backward
        weighted = dy * probabilities

        return weighted - probabilities * np.sum(weighted, axis=1, keepdims=True)

# Layers

In [13]:
class FC(Layer):
    """Fully connected (affine) layer computing ``x @ w + b``.

    Parameters are kept in ``self.weights`` under ``"w"`` (shape
    ``(in_dim, out_dim)``) and ``"b"`` (shape ``(1, out_dim)``).
    """

    def __init__(self, in_dim, out_dim) -> None:
        super().__init__()
        self.initialize_weights(in_dim, out_dim)

    def initialize_weights(self, in_dim, out_dim):
        """Fill ``w`` and ``b`` with samples from a standard normal distribution."""
        weight_matrix = np.random.randn(in_dim, out_dim)
        bias_row = np.random.randn(1, out_dim)

        self.weights["w"] = weight_matrix
        self.weights["b"] = bias_row

    def compute_grad(self, x):
        """Return the local derivatives of ``y = x @ w + b``.

        ``"w"`` holds the transposed input saved by ``forward`` and ``"x"`` the
        transposed weight matrix. The bias needs no entry: its derivative is 1,
        so ``backward`` only has to sum the upstream gradient over the batch.
        """
        return {
            "w": self.saved_for_backward["x"].T,
            "x": self.weights["w"].T,
        }

    def forward(self, x):
        """Apply the affine map to ``x`` and remember ``x`` for the backward pass."""
        projected = x @ self.weights["w"]
        output = projected + self.weights["b"]
        self.saved_for_backward["x"] = x

        return output

    def backward(self, dy):
        """Propagate ``dy`` through the layer.

        Stores the parameter gradients in ``self.absolute_gradients`` (keys
        ``"w"`` and ``"b"``) and returns the gradient with respect to the input.
        """
        # gradient flowing back to the previous layer
        input_grad = dy @ self.grad["x"]

        # gradients of the layer's own parameters
        weight_grad = self.grad["w"] @ dy
        bias_grad = np.sum(dy, axis=0, keepdims=True)

        self.absolute_gradients = {"w": weight_grad, "b": bias_grad}

        return input_grad

    def update_weights(self):
        """Let the attached optimizer update this layer's parameters."""
        self.optimizer.step(self)

# Loss Functions

In [14]:
class CrossEntropyLossFromLogits(Loss):
    """Softmax followed by cross-entropy, taking raw logits and integer class labels.

    NOTE(review): ``self.num_classes`` is read by ``encode`` but never assigned
    in this class; presumably the ``Loss`` base provides it -- confirm in
    ``templates``.
    """

    @staticmethod
    def _log_softmax(x):
        """Row-wise ``log(softmax(x))`` computed with the log-sum-exp shift.

        After subtracting the row maximum every exponent is <= 0 and the sum
        of exponentials is >= 1, so the result is finite for finite logits.
        """
        shifted = x - np.max(x, axis=1, keepdims=True)

        return shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))

    @staticmethod
    def softmax(x):
        """Row-wise softmax; rows are shifted by their maximum so ``np.exp`` cannot overflow."""
        v = np.exp(x - np.max(x, axis=1, keepdims=True))

        return v / np.sum(v, axis=1, keepdims=True)

    def encode(self, y):
        """One-hot encode the labels ``y`` into a ``(len(y), self.num_classes)`` array."""
        encoded_y = np.zeros(shape=(len(y), self.num_classes))

        for row, label in enumerate(y):
            encoded_y[row, label] = 1

        return encoded_y

    def forward(self, y_pred, y_true):
        """Return the cross-entropy of logits ``y_pred`` against labels ``y_true``, averaged over rows.

        The probabilities and one-hot targets are stored in
        ``self.saved_for_backward`` for ``compute_grad``.
        """
        probabilities = self.softmax(y_pred)
        y_true_encoded = self.encode(y_true)

        # Use log-softmax rather than log(softmax): a probability that
        # underflows to 0 would otherwise produce 0 * log(0) = nan.
        log_probabilities = self._log_softmax(y_pred)
        loss_value = np.mean(-np.sum(y_true_encoded * log_probabilities, axis=1))

        self.saved_for_backward["probabilities"] = probabilities
        self.saved_for_backward["y_true"] = y_true_encoded

        return loss_value

    def compute_grad(self, y_pred, y_true):
        """Return ``{"x": probabilities - one_hot}`` from the values saved by ``forward``.

        NOTE(review): this is the per-sample gradient; it is not divided by the
        batch size even though ``forward`` averages over the batch. The scale
        is left unchanged because it affects the effective learning rate.
        """
        return {"x": self.saved_for_backward["probabilities"] - self.saved_for_backward["y_true"]}
    
class MSELoss(Loss):
    """Squared error between softmax probabilities and one-hot targets.

    Takes raw logits and integer class labels.

    NOTE(review): ``self.num_classes`` is read by ``encode`` but never assigned
    in this class; presumably the ``Loss`` base provides it -- confirm in
    ``templates``.
    """

    @staticmethod
    def softmax(x):
        """Row-wise softmax; rows are shifted by their maximum so ``np.exp`` cannot overflow."""
        v = np.exp(x - np.max(x, axis=1, keepdims=True))

        return v / np.sum(v, axis=1, keepdims=True)

    def encode(self, y):
        """One-hot encode the labels ``y`` into a ``(len(y), self.num_classes)`` array."""
        encoded_y = np.zeros(shape=(len(y), self.num_classes))

        for row, label in enumerate(y):
            encoded_y[row, label] = 1

        return encoded_y

    @staticmethod
    def indicator(i, j):
        """Kronecker delta: 1 when ``i == j``, otherwise 0."""
        return 1 if i == j else 0

    def forward(self, y_pred, y_true):
        """Return the per-row sum of squared errors, averaged over rows.

        The probabilities and one-hot targets are stored in
        ``self.saved_for_backward`` for ``compute_grad``.
        """
        probabilities = self.softmax(y_pred)
        y_true_encoded = self.encode(y_true)

        loss_value = np.mean(np.sum((probabilities - y_true_encoded)**2, axis=1))

        self.saved_for_backward["probabilities"] = probabilities
        self.saved_for_backward["y_true"] = y_true_encoded

        return loss_value

    def compute_grad(self, y_pred, y_true):
        """Return ``{"x": grad}``, the gradient of the per-row loss with respect to the logits.

        With ``p = softmax(logits)`` and one-hot target ``t``, differentiating
        ``sum_j (p_j - t_j)**2`` through the softmax gives

            grad_i = 2 * sum_j p_j * (p_j - t_j) * (delta_ij - p_i)
                   = 2 * (g_i - p_i * sum_j g_j),   where g = p * (p - t).

        The probabilities and one-hot targets saved by ``forward`` are used,
        not the raw ``y_pred`` / ``y_true`` arguments, which are logits and
        integer labels. As in ``CrossEntropyLossFromLogits.compute_grad`` the
        result is per-sample, i.e. not divided by the batch size.
        """
        probabilities = self.saved_for_backward["probabilities"]
        y_true_encoded = self.saved_for_backward["y_true"]

        weighted_error = probabilities * (probabilities - y_true_encoded)
        row_totals = np.sum(weighted_error, axis=1, keepdims=True)
        grad = 2.0 * (weighted_error - probabilities * row_totals)

        return {"x": grad}

In [15]:
# Sanity check of the row-wise softmax formula on a small 2x3 example.
arr1 = np.array([[1, 2, 3], [4, 5, 7]])
v = np.exp(arr1)
print(v / v.sum(axis=1, keepdims=True))

[[0.09003057 0.24472847 0.66524096]
 [0.04201007 0.1141952  0.84379473]]


# Optimizer

In [34]:
class SGD(Optimizer):
    """Plain gradient descent: ``w <- w - lr * grad`` for every weight of a layer."""

    def __init__(self, lr):
        self.lr = lr

    def step(self, layer):
        """Replace each entry of ``layer.weights`` with its value after one descent step.

        Gradients are read from ``layer.absolute_gradients`` under the same keys.
        """
        for name in layer.weights:
            current = layer.weights[name]
            layer.weights[name] = current - self.lr * layer.absolute_gradients[name]
            
class Momentum(Optimizer):
    """Gradient descent with classical momentum.

    Update rule per weight: ``v <- gamma * v + lr * grad`` then ``w <- w - v``.

    NOTE(review): ``self.remember`` is not created in ``__init__``; it is
    presumably supplied as a dict by the ``Optimizer`` base -- confirm in
    ``templates``.
    """

    def __init__(self, lr, gamma):
        self.lr = lr
        self.gamma = gamma

    def step(self, layer):
        """Update every entry of ``layer.weights`` from ``layer.absolute_gradients``."""
        for weight_name, weight in layer.weights.items():
            # Create the velocity buffer the first time a weight is seen.
            if weight_name not in self.remember:
                self.remember[weight_name] = {"v": np.zeros_like(weight)}

            state = self.remember[weight_name]

            # Momentum update rule
            state["v"] = self.gamma * state["v"] + \
                         self.lr * layer.absolute_gradients[weight_name]
            layer.weights[weight_name] = weight - state["v"]

"""
class NAG(Optimizer):
    def __init__(self, lr, gamma):
        self.lr = lr
        self.gamma = gamma
        
    def step(self, layer):
        
        #Initialise update history
        if self.remember == {}:
            self.remember[weight_name] = {}
            self.remember[weight_name]["v"] = np.zeros_like(weight)
            self.remember[weight_name]["look_ahead"] = np.zeros_like(weight)
        
        #NAG update rule
        for weight_name, weight in layers.weights.items():
            self.remember[weight_name]["look_ahead"] = layer.weights[weight_name] - self.gamma * self.remember[weight_name]["v"]
            #self.remember[weight_name]["v"] = 
            layer.weights[weight_name] = layer.weights[weight_name] - self.remember[weight_name]["v"]
            
        pass
"""

class RMSprop(Optimizer):
    """RMSprop: scales each step by a running average of squared gradients.

    Update rule per weight::

        v <- beta * v + (1 - beta) * grad**2
        w <- w - lr / sqrt(v + epsilon) * grad

    NOTE(review): ``self.remember`` is not created in ``__init__``; it is
    presumably supplied as a dict by the ``Optimizer`` base -- confirm in
    ``templates``.
    """

    def __init__(self, lr, beta, epsilon):
        self.lr = lr
        self.beta = beta
        self.epsilon = epsilon

    def step(self, layer):
        """Update every entry of ``layer.weights`` from ``layer.absolute_gradients``."""
        for weight_name, weight in layer.weights.items():
            # Create the squared-gradient average the first time a weight is seen.
            if weight_name not in self.remember:
                self.remember[weight_name] = {"v": np.zeros_like(weight)}

            state = self.remember[weight_name]
            gradient = layer.absolute_gradients[weight_name]

            # RMSprop update rule
            state["v"] = self.beta * state["v"] + (1 - self.beta) * (gradient ** 2)

            # The step is the *gradient* scaled by the adaptive rate.
            layer.weights[weight_name] = weight - \
                (self.lr / np.sqrt(state["v"] + self.epsilon)) * gradient

class Adam(Optimizer):
    """Adam: bias-corrected first and second moment estimates of the gradient.

    Update rule per weight, with ``t`` counting calls to ``step`` from 1::

        m <- beta_1 * m + (1 - beta_1) * grad
        v <- beta_2 * v + (1 - beta_2) * grad**2
        m_hat = m / (1 - beta_1**t)
        v_hat = v / (1 - beta_2**t)
        w <- w - lr / (sqrt(v_hat) + epsilon) * m_hat

    ``epsilon`` is added outside the square root, matching ``Nadam`` in this
    file.

    NOTE(review): ``self.remember`` is not created in ``__init__``; it is
    presumably supplied as a dict by the ``Optimizer`` base -- confirm in
    ``templates``.
    """

    def __init__(self, lr, beta_1, beta_2, epsilon):
        self.lr = lr
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        self.t = 0

    def step(self, layer):
        """Update every entry of ``layer.weights`` from ``layer.absolute_gradients``."""
        # Advance the step counter first: with t == 0 the bias-correction
        # denominators 1 - beta**t are exactly zero.
        self.t += 1

        for weight_name, weight in layer.weights.items():
            # Create the moment buffers the first time a weight is seen.
            if weight_name not in self.remember:
                self.remember[weight_name] = {
                    "m": np.zeros_like(weight),
                    "v": np.zeros_like(weight),
                }

            state = self.remember[weight_name]
            gradient = layer.absolute_gradients[weight_name]

            # Update m_t and v_t
            state["m"] = self.beta_1 * state["m"] + (1 - self.beta_1) * gradient
            state["v"] = self.beta_2 * state["v"] + (1 - self.beta_2) * (gradient ** 2)

            # Bias correction
            m_hat = state["m"] / (1 - self.beta_1 ** self.t)
            v_hat = state["v"] / (1 - self.beta_2 ** self.t)

            # Update parameters
            layer.weights[weight_name] = weight - \
                self.lr / (np.sqrt(v_hat) + self.epsilon) * m_hat
            
class Nadam(Optimizer):
    """Nadam: Adam with a Nesterov-style look-ahead on the first moment.

    Update rule per weight, with ``t`` counting calls to ``step`` from 1::

        m <- beta_1 * m + (1 - beta_1) * grad
        v <- beta_2 * v + (1 - beta_2) * grad**2
        m_hat = m / (1 - beta_1**t)
        v_hat = v / (1 - beta_2**t)
        w <- w - lr / (sqrt(v_hat) + epsilon)
                 * (beta_1 * m_hat + (1 - beta_1) / (1 - beta_1**t) * grad)

    NOTE(review): ``self.remember`` is not created in ``__init__``; it is
    presumably supplied as a dict by the ``Optimizer`` base -- confirm in
    ``templates``.
    """

    def __init__(self, lr, beta_1, beta_2, epsilon):
        self.lr = lr
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        self.t = 0

    def step(self, layer):
        """Update every entry of ``layer.weights`` from ``layer.absolute_gradients``."""
        # Advance the step counter first: with t == 0 the bias-correction
        # denominators 1 - beta**t are exactly zero.
        self.t += 1

        for weight_name, weight in layer.weights.items():
            # Two buffers per weight, m(t) and v(t), created on first use.
            if weight_name not in self.remember:
                self.remember[weight_name] = {
                    "v": np.zeros_like(weight),
                    "m": np.zeros_like(weight),
                }

            state = self.remember[weight_name]
            gradient = layer.absolute_gradients[weight_name]

            state["m"] = self.beta_1 * state["m"] + (1 - self.beta_1) * gradient
            state["v"] = self.beta_2 * state["v"] + (1 - self.beta_2) * gradient ** 2

            # bias correction step
            first_correction = 1 - self.beta_1 ** self.t
            m_hat = state["m"] / first_correction
            v_hat = state["v"] / (1 - self.beta_2 ** self.t)

            # Nesterov look-ahead: blend the corrected momentum with the
            # bias-corrected current gradient.
            d = self.lr / (np.sqrt(v_hat) + self.epsilon) * (
                self.beta_1 * m_hat + (1 - self.beta_1) / first_correction * gradient
            )

            layer.weights[weight_name] = weight - d

# Neural Network Structure

In [17]:
class NeuralNet():
    """Sequential container that chains its layers for the forward and backward passes."""

    def __init__(self, layers) -> None:
        self.layers = layers

    def __call__(self, *args, **kwargs):
        """Calling the network is shorthand for ``forward``."""
        return self.forward(*args, **kwargs)

    def compile(self, loss, optimizer):
        """Attach ``loss`` and give every ``Layer`` instance its own deep copy of ``optimizer``."""
        self.loss = loss

        for candidate in self.layers:
            if not isinstance(candidate, Layer):
                continue
            candidate.optimizer = deepcopy(optimizer)

    def calculate_loss(self, y_pred, y_true):
        """Evaluate the attached loss on ``y_pred`` and ``y_true``."""
        return self.loss(y_pred, y_true)

    def forward(self, x):
        """Feed ``x`` through the layers in order and return the final output."""
        activation = x
        for layer in self.layers:
            activation = layer(activation)

        return activation

    def backward(self):
        """Propagate the loss gradient through the layers in reverse order.

        Returns whatever the first layer's ``backward`` returns.
        """
        upstream = self.loss.backward()
        for layer in reversed(self.layers):
            upstream = layer.backward(upstream)

        return upstream

    def update_weights(self):
        """Call ``update_weights`` on every ``Layer`` instance, last layer first."""
        for candidate in reversed(self.layers):
            if not isinstance(candidate, Layer):
                continue
            candidate.update_weights()

In [18]:
# Network: 20 inputs -> fully connected (32 units) -> ReLU -> fully connected (3 outputs).
model = NeuralNet(
    [
        FC(20, 32),
        RelU(),
        FC(32, 3),
    ]
)
optimizer = SGD(lr=0.001)

def fit_model(model, batches, optimizer, epochs=10):
    """Train ``model`` on ``batches`` with a cross-entropy loss and report per-epoch metrics.

    Args:
        model: network exposing ``compile``, ``loss``, ``backward`` and
            ``update_weights``, and callable on a batch of inputs.
        batches: sized iterable of ``(X, y)`` pairs; ``len(batches)`` is the
            divisor for the per-epoch averages.
        optimizer: optimizer handed to ``model.compile``.
        epochs: number of passes over ``batches``.

    Returns:
        A list with one dict per epoch holding ``"Epoch"``, ``"Train Loss"``
        and ``"Train Accuracy"``.

    NOTE(review): ``accuracy_score`` is called as ``accuracy_score(preds, y)``
    but is neither defined nor imported in the visible part of the notebook --
    confirm which cell provides it.
    """
    history = []
    batch_count = len(batches)

    criterion = CrossEntropyLossFromLogits()
    model.compile(loss=criterion, optimizer=optimizer)

    for epoch in range(1, epochs + 1):
        running_loss = 0
        running_accuracy = 0

        for features, targets in batches:
            outputs = model(features)

            # The loss call must come before backward(): it records the
            # state the backward pass starts from.
            running_loss += model.loss(outputs, targets)
            running_accuracy += accuracy_score(outputs, targets)

            model.backward()
            model.update_weights()

        loss_per_epoch = running_loss / batch_count
        accuracy = running_accuracy / batch_count

        print(f"Epoch: {epoch} Train Loss: {loss_per_epoch} Train Accuracy: {accuracy}")

        epoch_record = {
            "Epoch": epoch,
            "Train Loss": loss_per_epoch,
            "Train Accuracy": accuracy,
        }
        history.append(epoch_record)

    return history