## Building the whole pipeline in bits !! ##

In [None]:
import numpy as np

In [None]:
# pip install nnfs

### Layer Construction ###

In [1]:
class Dense:
    """Fully connected layer: ``output = inputs . weights + bias``.

    After ``forward_prop`` the layer holds ``inputs`` and ``output``; after
    ``backward_prop`` it holds ``dweights``, ``dbiases`` and ``dinputs``.
    """

    def __init__(self, inputs, neurons):
        """Create parameters for ``inputs`` features feeding ``neurons`` units."""
        # Small random weights scaled by 0.01; biases start at zero as a
        # (1, neurons) row so they broadcast over the batch.
        self.weights = np.random.randn(inputs, neurons) * 0.01
        self.bias = np.zeros(shape=(1, neurons))

    def forward_prop(self, inputs):
        """Store ``inputs`` and compute the layer output for the batch."""
        self.inputs = inputs
        weighted_sum = np.dot(inputs, self.weights)
        self.output = weighted_sum + self.bias

    def backward_prop(self, dvalues):
        """Compute gradients from ``dvalues``, the gradient w.r.t. the output."""
        # Gradient passed back to the previous layer.
        self.dinputs = np.dot(dvalues, self.weights.T)
        # Bias gradient: sum over the sample axis, keeping the row shape.
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        # Weight gradient uses the inputs remembered by forward_prop.
        self.dweights = np.dot(self.inputs.T, dvalues)

### Activation Function ###

In [None]:
class activateReLu:
    """Rectified linear unit activation and its gradient."""

    def forward_pass(self, inputs):
        """Store ``inputs`` and set ``output`` to ``max(0, inputs)`` element-wise."""
        # The raw inputs are kept because the backward pass masks on them.
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward_pass(self, dvalues):
        """Set ``dinputs`` to ``dvalues`` with entries zeroed where input <= 0."""
        blocked = self.inputs <= 0
        # Work on a copy so the caller's gradient array is left untouched.
        grad = dvalues.copy()
        grad[blocked] = 0
        self.dinputs = grad

In [None]:
class activateSoftmax:
    """Row-wise softmax activation."""

    def forward_pass(self, inputs):
        """Set ``output`` to the softmax of each row of ``inputs``."""
        # Subtract each row's maximum before exponentiating so the largest
        # exponent is exp(0); the normalised result is unchanged.
        shifted = inputs - np.max(inputs, axis=1, keepdims=True)
        exps = np.exp(shifted)
        self.output = exps / np.sum(exps, axis=1, keepdims=True)

### Loss Function ###

In [None]:
class Loss:
    """Base class for losses.

    Relies on ``self.forward_pass(output, y)``, which is not defined here and
    must be supplied by the concrete loss, to produce per-sample losses.
    """

    def lossCal(self, output, y):
        """Return the mean of the per-sample losses for ``output`` against ``y``."""
        per_sample = self.forward_pass(output, y)
        return np.mean(per_sample)

In [None]:
class CategoricalCrossEntropyFunc(Loss) :
    """Categorical cross-entropy over predicted class probabilities.

    Labels may be sparse class indices (1-D) or one-hot rows (2-D).
    """

    def forward_pass(self, y_pred, y_true):
        """Return the negative log-likelihood of the true class per sample."""
        n_samples = len(y_pred)
        # Clip away exact 0 and 1 so the logarithm below stays finite.
        clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        label_rank = len(y_true.shape)
        if label_rank == 1:
            # Sparse labels: pick the predicted probability of each true class.
            picked = clipped[range(n_samples), y_true]
        elif label_rank == 2:
            # One-hot labels: the mask-and-sum selects the same probability.
            picked = np.sum(y_true * clipped, axis=1)

        return -np.log(picked)

    def backward_prop(self, dvalues, y_true):
        """Set ``dinputs`` to the loss gradient w.r.t. the predictions ``dvalues``."""
        n_samples = len(dvalues)
        n_labels = len(dvalues[0])

        # Expand sparse labels to one-hot rows so they line up with dvalues.
        if len(y_true.shape) == 1:
            y_true = np.eye(n_labels)[y_true]

        grad = -y_true / dvalues
        # Divide by the batch size to match the mean taken in the forward loss.
        self.dinputs = grad / n_samples

In [None]:
class Softmax_Categorical_Loss():
    """Softmax activation combined with categorical cross-entropy loss.

    Combining the two lets the backward pass use the simplified gradient
    ``(predictions - one_hot_labels) / samples``.
    """

    def __init__(self):
        self.activation = activateSoftmax()
        self.loss = CategoricalCrossEntropyFunc()

    def forward_prop(self, inputs, y_true):
        """Apply softmax to ``inputs``, store it in ``output``, return the mean loss."""
        softmax = self.activation
        softmax.forward_pass(inputs)
        self.output = softmax.output
        return self.loss.lossCal(self.output, y_true)

    def backward_prop(self, dvalues, y_true):
        """Set ``dinputs`` from the softmax outputs ``dvalues`` and the labels."""
        n_samples = len(dvalues)

        # One-hot labels are reduced to class indices for the indexing below.
        if len(y_true.shape) == 2:
            y_true = np.argmax(y_true, axis=1)

        # Copy first so the caller's probabilities are not modified.
        grad = dvalues.copy()
        # Subtract 1 at each sample's true class.
        grad[range(n_samples), y_true] -= 1
        self.dinputs = grad / n_samples

### Final Testing on Data !!! ###

In [None]:
import nnfs
from nnfs.datasets import spiral_data
import matplotlib.pyplot as plt
# Call nnfs.init() before any data is generated.
# NOTE(review): presumably this fixes the random seed / default dtype so runs
# are repeatable — confirm against the nnfs documentation.
nnfs.init()

# Build the spiral dataset (presumably 100 points for each of the 3 classes —
# TODO confirm) and plot the two feature columns, coloured by class label y.
X,y = spiral_data(samples=100, classes=3)
plt.scatter(X[:,0], X[:,1], c=y, cmap='brg')
plt.show()

In [None]:
# Disabled smoke test: everything below sits inside a string literal, so none
# of it executes. It runs a single forward and backward pass through a small
# Dense(2,3) -> ReLU -> Dense(3,3) -> softmax/cross-entropy stack and prints
# the loss, accuracy and gradients.
"""X,y = spiral_data(samples=100, classes=3)
layer1 = Dense(2,3)
activation_layer_1 = activateReLu()

layer2 = Dense(3,3)
loss_activation = Softmax_Categorical_Loss()

layer1.forward_prop(X)
activation_layer_1.forward_pass(layer1.output)

layer2.forward_prop(activation_layer_1.output)

loss = loss_activation.forward_prop(layer2.output,y)

print("### Results before forward propagation ###")
print(loss_activation.output[:5])

print("Loss : ", loss)

preds = np.argmax(loss_activation.output, axis=1)
# print(preds)
if len(y.shape) == 2:
    y = np.argmax(y,axis=1)
# print(y)

acc = np.mean(preds == y)
print("Accuracy : ", acc)

loss_back = loss_activation.backward_prop(loss_activation.output,y)
layer2.backward_prop(loss_activation.dinputs)
activation_layer_1.backward_pass(layer2.dinputs)
layer1.backward_prop(activation_layer_1.dinputs)
print("### Results before backward propagation ###")
# Print gradients
print(layer1.dweights)
print(layer1.dbiases)
print(layer2.dweights)
print(layer2.dbiases)
"""

In [None]:
# print(preds)

### Building the Optimizer ###

In [None]:
class Optimizer_GD:
    """Gradient descent with a fixed learning rate."""

    def __init__(self, learning_rate=0.09):
        self.learning_rate = learning_rate

    def update_params(self, layer):
        """Step ``layer.weights`` and ``layer.bias`` against their gradients.

        Reads ``layer.dweights`` / ``layer.dbiases`` and modifies the
        parameter arrays in place.
        """
        step = self.learning_rate
        layer.weights -= step * layer.dweights
        layer.bias -= step * layer.dbiases

In [None]:
# Train a Dense(2,64) -> ReLU -> Dense(64,3) -> softmax network on the spiral
# data using Optimizer_GD with its default settings.
X, y = spiral_data(samples=100, classes=3)

layer1 = Dense(2, 64)
activation_layer_1 = activateReLu()
layer2 = Dense(64, 3)
loss_activation = Softmax_Categorical_Loss()
optim = Optimizer_GD()

epochs = []
accuracies = []
for epoch in range(10001):
    # Forward pass through both layers, ending in the combined softmax/loss.
    layer1.forward_prop(X)
    activation_layer_1.forward_pass(layer1.output)
    layer2.forward_prop(activation_layer_1.output)
    loss = loss_activation.forward_prop(layer2.output, y)

    # Accuracy: predicted class versus label (one-hot labels are reduced to
    # class indices first).
    preds = np.argmax(loss_activation.output, axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    acc = np.mean(preds == y)

    # Report and record every 100th epoch.
    if epoch % 100 == 0:
        print(f'epoch: {epoch}, '
              f'acc: {acc:.3f}, '
              f'loss: {loss:.3f}')
        epochs.append(epoch)
        accuracies.append(acc)

    # Backward pass in reverse layer order.
    loss_activation.backward_prop(loss_activation.output, y)
    layer2.backward_prop(loss_activation.dinputs)
    activation_layer_1.backward_pass(layer2.dinputs)
    layer1.backward_prop(activation_layer_1.dinputs)

    # Apply the gradients to both dense layers.
    optim.update_params(layer1)
    optim.update_params(layer2)

# Plot the recorded accuracy against the epoch number.
plt.figure(figsize=(8, 5))
plt.scatter(epochs, accuracies, color='blue', label='Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Epochs vs Accuracy')
plt.legend()
plt.show()

### Learning Rate Decay ###

In [None]:
class Optimizer_GD:
    """Gradient descent with optional inverse-time learning-rate decay.

    Per training step call ``pre_update_params``, then ``update_params`` for
    each layer, then ``post_update_params``.
    """

    def __init__(self, learning_rate=0.0, decay=0.0):
        self.iterations = 0
        self.decay = decay
        self.learning_rate = learning_rate
        # Rate actually applied; equals learning_rate until decay kicks in.
        self.curr_learning_rate = learning_rate

    def pre_update_params(self):
        """Recompute the current rate as ``lr * 1 / (1 + decay * iterations)``."""
        if not self.decay:
            return
        shrink = 1.0 / (1 + self.decay * self.iterations)
        self.curr_learning_rate = self.learning_rate * shrink

    def update_params(self, layer):
        """Step the layer's weights and bias in place using the current rate."""
        step = self.curr_learning_rate
        layer.weights -= step * layer.dweights
        layer.bias -= step * layer.dbiases

    def post_update_params(self):
        """Count one completed update step."""
        self.iterations += 1

In [None]:
# Train a Dense(2,64) -> ReLU -> Dense(64,3) -> softmax network on the spiral
# data using Optimizer_GD with learning-rate decay.
X, y = spiral_data(samples=100, classes=3)

layer1 = Dense(2, 64)
activation_layer_1 = activateReLu()
layer2 = Dense(64, 3)
loss_activation = Softmax_Categorical_Loss()
optim = Optimizer_GD(learning_rate=2.0, decay=1e-3)

lrs = []
accuracies = []
for epoch in range(20001):
    # Forward pass through both layers, ending in the combined softmax/loss.
    layer1.forward_prop(X)
    activation_layer_1.forward_pass(layer1.output)
    layer2.forward_prop(activation_layer_1.output)
    loss = loss_activation.forward_prop(layer2.output, y)

    # Accuracy: predicted class versus label (one-hot labels are reduced to
    # class indices first).
    preds = np.argmax(loss_activation.output, axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    acc = np.mean(preds == y)

    # Report and record every 100th epoch, including the rate in effect.
    if epoch % 100 == 0:
        print(f'epoch: {epoch}, '
              f'acc: {acc:.3f}, '
              f'loss: {loss:.3f},'
              f'lr: {optim.curr_learning_rate}')
        lrs.append(optim.curr_learning_rate)
        accuracies.append(acc)

    # Backward pass in reverse layer order.
    loss_activation.backward_prop(loss_activation.output, y)
    layer2.backward_prop(loss_activation.dinputs)
    activation_layer_1.backward_pass(layer2.dinputs)
    layer1.backward_prop(activation_layer_1.dinputs)

    # Refresh the learning rate, update both layers, then count the step.
    optim.pre_update_params()
    optim.update_params(layer1)
    optim.update_params(layer2)
    optim.post_update_params()

# Plot the recorded accuracy against the learning rate in effect.
plt.figure(figsize=(8, 5))
plt.scatter(lrs, accuracies, color='blue', label='Accuracy')
plt.xlabel('LRs')
plt.ylabel('Accuracy')
plt.title('LRs vs Accuracy')
plt.legend()
plt.show()

### Momentum ###

In [None]:
class Optimizer_GD:
    """Gradient descent with optional learning-rate decay and momentum.

    Per training step call ``pre_update_params``, then ``update_params`` for
    each layer, then ``post_update_params``.

    Parameters
    ----------
    learning_rate : initial step size.
    decay : inverse-time decay factor; 0 disables decay.
    momen : momentum coefficient; 0 gives plain gradient descent.
    """

    def __init__(self,learning_rate=0.0, decay= 0.0, momen=0.0):
        self.learning_rate = learning_rate
        self.decay = decay
        # Rate actually applied; equals learning_rate until decay kicks in.
        self.curr_learning_rate = learning_rate
        self.iterations = 0
        self.momentum = momen

    def pre_update_params(self):
        """Recompute the current rate as ``lr * 1 / (1 + decay * iterations)``."""
        if self.decay:
            self.curr_learning_rate = self.learning_rate * (1.0/(1+self.decay*self.iterations))

    def update_params(self,layer):
        """Update ``layer.weights`` and ``layer.bias`` in place.

        Reads ``layer.dweights`` / ``layer.dbiases``. With momentum enabled,
        the previous update is kept on the layer as ``weights_momentums`` /
        ``bias_momentums`` and blended into the new one.
        """
        if self.momentum:
            # Create the momentum buffers the first time this layer is seen.
            if not hasattr(layer,'weights_momentums'):
                layer.weights_momentums = np.zeros_like(layer.weights)
                layer.bias_momentums = np.zeros_like(layer.bias)

            weights_updates = self.momentum * layer.weights_momentums - self.curr_learning_rate*layer.dweights
            layer.weights_momentums = weights_updates

            bias_updates = self.momentum * layer.bias_momentums - self.curr_learning_rate*layer.dbiases
            layer.bias_momentums = bias_updates
        else:
            # Plain gradient descent. Only compute the updates here; they are
            # applied once below. Previously this branch applied them itself
            # and then crashed on the undefined update variables.
            weights_updates = -self.curr_learning_rate*layer.dweights
            bias_updates = -self.curr_learning_rate*layer.dbiases

        layer.weights += weights_updates
        layer.bias += bias_updates

    def post_update_params(self):
        """Count one completed update step."""
        self.iterations +=1

In [None]:
# Train a Dense(2,64) -> ReLU -> Dense(64,3) -> softmax network on the spiral
# data using Optimizer_GD with learning-rate decay and momentum.
X, y = spiral_data(samples=100, classes=3)

layer1 = Dense(2, 64)
activation_layer_1 = activateReLu()
layer2 = Dense(64, 3)
loss_activation = Softmax_Categorical_Loss()
optim = Optimizer_GD(learning_rate=2.0, decay=1e-3, momen=0.09)

lrs = []
accuracies = []
for epoch in range(10001):
    # Forward pass through both layers, ending in the combined softmax/loss.
    layer1.forward_prop(X)
    activation_layer_1.forward_pass(layer1.output)
    layer2.forward_prop(activation_layer_1.output)
    loss = loss_activation.forward_prop(layer2.output, y)

    # Accuracy: predicted class versus label (one-hot labels are reduced to
    # class indices first).
    preds = np.argmax(loss_activation.output, axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    acc = np.mean(preds == y)

    # Report and record every 100th epoch, including the rate in effect.
    if epoch % 100 == 0:
        print(f'epoch: {epoch}, '
              f'acc: {acc:.3f}, '
              f'loss: {loss:.3f},'
              f'lr: {optim.curr_learning_rate}')
        lrs.append(optim.curr_learning_rate)
        accuracies.append(acc)

    # Backward pass in reverse layer order.
    loss_activation.backward_prop(loss_activation.output, y)
    layer2.backward_prop(loss_activation.dinputs)
    activation_layer_1.backward_pass(layer2.dinputs)
    layer1.backward_prop(activation_layer_1.dinputs)

    # Refresh the learning rate, update both layers, then count the step.
    optim.pre_update_params()
    optim.update_params(layer1)
    optim.update_params(layer2)
    optim.post_update_params()

# Plot the recorded accuracy against the learning rate in effect.
plt.figure(figsize=(8, 5))
plt.scatter(lrs, accuracies, color='blue', label='Accuracy')
plt.xlabel('LRs')
plt.ylabel('Accuracy')
plt.title('LRs vs Accuracy')
plt.legend()
plt.show()

### Adagrad Optimizer ###

In [None]:
class Optimizer_Adagrad:
    """Adagrad: per-parameter steps scaled by the accumulated squared gradients.

    Per training step call ``pre_update_params``, then ``update_params`` for
    each layer, then ``post_update_params``.

    Parameters
    ----------
    learning_rate : initial step size.
    decay : inverse-time decay factor; 0 disables decay.
    epsilion : constant added to the cache inside the square root so the
        division is never by zero.
        NOTE(review): the 0.09 default is far larger than the ~1e-7 usually
        used for this term — confirm it is intended.
    """

    def __init__(self,learning_rate=0.0, decay=0., epsilion=0.09):
        self.learning_rate = learning_rate
        # Rate actually applied; equals learning_rate until decay kicks in.
        self.curr_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilion = epsilion

    def pre_update_params(self):
        """Recompute the current rate as ``lr * 1 / (1 + decay * iterations)``."""
        if self.decay:
            # The whole denominator must be parenthesised: the previous
            # ``1.0/1.0+(...)`` evaluated to ``1 + decay*iterations`` and made
            # the learning rate grow over time instead of shrinking.
            self.curr_learning_rate = self.learning_rate * (1.0/(1.0+self.decay*self.iterations))

    def update_params(self,layer):
        """Update ``layer.weights`` and ``layer.bias`` in place.

        Reads ``layer.dweights`` / ``layer.dbiases`` and keeps running sums of
        their squares on the layer as ``weights_cache`` / ``bias_cache``.
        """
        # Create the caches the first time this layer is seen.
        if not hasattr(layer,'weights_cache'):
            layer.weights_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.bias)

        layer.weights_cache += layer.dweights**2
        layer.bias_cache += layer.dbiases**2

        layer.weights += -self.curr_learning_rate*layer.dweights/(np.sqrt(layer.weights_cache+self.epsilion))
        layer.bias += -self.curr_learning_rate*layer.dbiases/(np.sqrt(layer.bias_cache+self.epsilion))

    def post_update_params(self):
        """Count one completed update step."""
        self.iterations +=1

In [None]:
# Train a Dense(2,64) -> ReLU -> Dense(64,3) -> softmax network on the spiral
# data using Optimizer_Adagrad.
X, y = spiral_data(samples=100, classes=3)

layer1 = Dense(2, 64)
activation_layer_1 = activateReLu()
layer2 = Dense(64, 3)
loss_activation = Softmax_Categorical_Loss()
optim = Optimizer_Adagrad(learning_rate=1.0, decay=1e-3)

lrs = []
accuracies = []
for epoch in range(10001):
    # Forward pass through both layers, ending in the combined softmax/loss.
    layer1.forward_prop(X)
    activation_layer_1.forward_pass(layer1.output)
    layer2.forward_prop(activation_layer_1.output)
    loss = loss_activation.forward_prop(layer2.output, y)

    # Accuracy: predicted class versus label (one-hot labels are reduced to
    # class indices first).
    preds = np.argmax(loss_activation.output, axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    acc = np.mean(preds == y)

    # Report and record every 100th epoch, including the rate in effect.
    if epoch % 100 == 0:
        print(f'epoch: {epoch}, '
              f'acc: {acc:.3f}, '
              f'loss: {loss:.3f},'
              f'lr: {optim.curr_learning_rate}')
        lrs.append(optim.curr_learning_rate)
        accuracies.append(acc)

    # Backward pass in reverse layer order.
    loss_activation.backward_prop(loss_activation.output, y)
    layer2.backward_prop(loss_activation.dinputs)
    activation_layer_1.backward_pass(layer2.dinputs)
    layer1.backward_prop(activation_layer_1.dinputs)

    # Refresh the learning rate, update both layers, then count the step.
    optim.pre_update_params()
    optim.update_params(layer1)
    optim.update_params(layer2)
    optim.post_update_params()

# Plot the recorded accuracy against the learning rate in effect.
plt.figure(figsize=(8, 5))
plt.scatter(lrs, accuracies, color='blue', label='Accuracy')
plt.xlabel('LRs')
plt.ylabel('Accuracy')
plt.title('LRs vs Accuracy')
plt.legend()
plt.show()

### RMSProp Optimizer ###

In [None]:
class Optimizer_RMSProp:
    """RMSProp: per-parameter steps scaled by a moving average of squared gradients.

    Per training step call ``pre_update_params``, then ``update_params`` for
    each layer, then ``post_update_params``.

    Parameters
    ----------
    learning_rate : initial step size.
    decay : inverse-time decay factor; 0 disables decay.
    epsilion : constant added to the cache inside the square root so the
        division is never by zero.
        NOTE(review): the 0.09 default is far larger than the ~1e-7 usually
        used for this term — confirm it is intended.
    rho : weight given to the previous cache value in the moving average.
    """

    def __init__(self,learning_rate=0.0, decay=0., epsilion=0.09, rho= 0.5):
        self.learning_rate = learning_rate
        # Rate actually applied; equals learning_rate until decay kicks in.
        self.curr_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.rho = rho
        self.epsilion = epsilion

    def pre_update_params(self):
        """Recompute the current rate as ``lr * 1 / (1 + decay * iterations)``."""
        if self.decay:
            # The whole denominator must be parenthesised: the previous
            # ``1.0/1.0+(...)`` evaluated to ``1 + decay*iterations`` and made
            # the learning rate grow over time instead of shrinking.
            self.curr_learning_rate = self.learning_rate * (1.0/(1.0+self.decay*self.iterations))

    def update_params(self,layer):
        """Update ``layer.weights`` and ``layer.bias`` in place.

        Reads ``layer.dweights`` / ``layer.dbiases`` and keeps the moving
        averages on the layer as ``weights_cache`` / ``bias_cache``.
        """
        # Create the caches the first time this layer is seen.
        if not hasattr(layer,'weights_cache'):
            layer.weights_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.bias)

        # Blend the old cache with the new squared gradient.
        layer.weights_cache = self.rho*layer.weights_cache + (1-self.rho)*layer.dweights**2
        layer.bias_cache = self.rho*layer.bias_cache + (1-self.rho)*layer.dbiases**2

        layer.weights += -self.curr_learning_rate*layer.dweights/(np.sqrt(layer.weights_cache+self.epsilion))
        layer.bias += -self.curr_learning_rate*layer.dbiases/(np.sqrt(layer.bias_cache+self.epsilion))

    def post_update_params(self):
        """Count one completed update step."""
        self.iterations +=1

In [None]:
# Train a Dense(2,64) -> ReLU -> Dense(64,3) -> softmax network on the spiral
# data using Optimizer_RMSProp.
X, y = spiral_data(samples=100, classes=3)

layer1 = Dense(2, 64)
activation_layer_1 = activateReLu()
layer2 = Dense(64, 3)
loss_activation = Softmax_Categorical_Loss()
optim = Optimizer_RMSProp(learning_rate=1.0, decay=1e-3)

lrs = []
accuracies = []
for epoch in range(10001):
    # Forward pass through both layers, ending in the combined softmax/loss.
    layer1.forward_prop(X)
    activation_layer_1.forward_pass(layer1.output)
    layer2.forward_prop(activation_layer_1.output)
    loss = loss_activation.forward_prop(layer2.output, y)

    # Accuracy: predicted class versus label (one-hot labels are reduced to
    # class indices first).
    preds = np.argmax(loss_activation.output, axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    acc = np.mean(preds == y)

    # Report and record every 100th epoch, including the rate in effect.
    if epoch % 100 == 0:
        print(f'epoch: {epoch}, '
              f'acc: {acc:.3f}, '
              f'loss: {loss:.3f},'
              f'lr: {optim.curr_learning_rate}')
        lrs.append(optim.curr_learning_rate)
        accuracies.append(acc)

    # Backward pass in reverse layer order.
    loss_activation.backward_prop(loss_activation.output, y)
    layer2.backward_prop(loss_activation.dinputs)
    activation_layer_1.backward_pass(layer2.dinputs)
    layer1.backward_prop(activation_layer_1.dinputs)

    # Refresh the learning rate, update both layers, then count the step.
    optim.pre_update_params()
    optim.update_params(layer1)
    optim.update_params(layer2)
    optim.post_update_params()

# Plot the recorded accuracy against the learning rate in effect.
plt.figure(figsize=(8, 5))
plt.scatter(lrs, accuracies, color='blue', label='Accuracy')
plt.xlabel('LRs')
plt.ylabel('Accuracy')
plt.title('LRs vs Accuracy')
plt.legend()
plt.show()

### ADAM Optimizer ###

In [None]:
class Optimizer_Adam:
    """Adam: bias-corrected moving averages of the gradient and its square.

    Per training step call ``pre_update_params``, then ``update_params`` for
    each layer, then ``post_update_params``.

    ``momen`` is stored but not used by any method in this class.
    NOTE(review): the defaults beta_1=0.09, beta_2=0.009 and epsilion=0.09
    differ from the 0.9 / 0.999 / ~1e-7 commonly used with Adam — confirm
    they are intended.
    """

    def __init__(self, learning_rate=0., decay=0., momen=0., epsilion=0.09, beta_1=0.09, beta_2=0.009):
        self.iterations = 0
        self.learning_rate = learning_rate
        # Rate actually applied; equals learning_rate until decay kicks in.
        self.curr_learning_rate = learning_rate
        self.decay = decay
        self.momen = momen
        self.epsilion = epsilion
        self.beta_1 = beta_1
        self.beta_2 = beta_2

    def pre_update_params(self):
        """Recompute the current rate as ``lr * 1 / (1 + decay * iterations)``."""
        if not self.decay:
            return
        shrink = 1.0 / (1.0 + self.decay * self.iterations)
        self.curr_learning_rate = self.learning_rate * shrink

    def update_params(self, layer):
        """Update ``layer.weights`` and ``layer.bias`` in place.

        Reads ``layer.dweights`` / ``layer.dbiases`` and keeps the running
        averages on the layer as ``*_momentums`` and ``*_cache``.
        """
        # Create the running-average buffers the first time a layer is seen.
        if not hasattr(layer, 'weights_cache'):
            layer.weights_momentums = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.bias)
            layer.weights_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.bias)

        b1, b2 = self.beta_1, self.beta_2

        # Moving average of the gradients.
        layer.weights_momentums = b1 * layer.weights_momentums + (1 - b1) * layer.dweights
        layer.bias_momentums = b1 * layer.bias_momentums + (1 - b1) * layer.dbiases

        # Moving average of the squared gradients.
        layer.weights_cache = b2 * layer.weights_cache + (1 - b2) * layer.dweights ** 2
        layer.bias_cache = b2 * layer.bias_cache + (1 - b2) * layer.dbiases ** 2

        # Bias correction: both averages start at zero, so divide by
        # 1 - beta**step, with the step counted from 1.
        step = self.iterations + 1
        momentum_fix = 1 - b1 ** step
        cache_fix = 1 - b2 ** step

        w_momentum_hat = layer.weights_momentums / momentum_fix
        b_momentum_hat = layer.bias_momentums / momentum_fix
        w_cache_hat = layer.weights_cache / cache_fix
        b_cache_hat = layer.bias_cache / cache_fix

        rate = self.curr_learning_rate
        layer.weights -= rate * (w_momentum_hat / (np.sqrt(w_cache_hat) + self.epsilion))
        layer.bias -= rate * (b_momentum_hat / (np.sqrt(b_cache_hat) + self.epsilion))

    def post_update_params(self):
        """Count one completed update step."""
        self.iterations += 1

In [None]:
# Train a Dense(2,64) -> ReLU -> Dense(64,3) -> softmax network on the spiral
# data using Optimizer_Adam.
X, y = spiral_data(samples=100, classes=3)

layer1 = Dense(2, 64)
activation_layer_1 = activateReLu()
layer2 = Dense(64, 3)
loss_activation = Softmax_Categorical_Loss()
optim = Optimizer_Adam(learning_rate=1.0, decay=1e-3)

lrs = []
accuracies = []
for epoch in range(10001):
    # Forward pass through both layers, ending in the combined softmax/loss.
    layer1.forward_prop(X)
    activation_layer_1.forward_pass(layer1.output)
    layer2.forward_prop(activation_layer_1.output)
    loss = loss_activation.forward_prop(layer2.output, y)

    # Accuracy: predicted class versus label (one-hot labels are reduced to
    # class indices first).
    preds = np.argmax(loss_activation.output, axis=1)
    if len(y.shape) == 2:
        y = np.argmax(y, axis=1)
    acc = np.mean(preds == y)

    # Report and record every 100th epoch, including the rate in effect.
    if epoch % 100 == 0:
        print(f'epoch: {epoch}, '
              f'acc: {acc:.3f}, '
              f'loss: {loss:.3f},'
              f'lr: {optim.curr_learning_rate}')
        lrs.append(optim.curr_learning_rate)
        accuracies.append(acc)

    # Backward pass in reverse layer order.
    loss_activation.backward_prop(loss_activation.output, y)
    layer2.backward_prop(loss_activation.dinputs)
    activation_layer_1.backward_pass(layer2.dinputs)
    layer1.backward_prop(activation_layer_1.dinputs)

    # Refresh the learning rate, update both layers, then count the step.
    optim.pre_update_params()
    optim.update_params(layer1)
    optim.update_params(layer2)
    optim.post_update_params()

# Plot the recorded accuracy against the learning rate in effect.
plt.figure(figsize=(8, 5))
plt.scatter(lrs, accuracies, color='blue', label='Accuracy')
plt.xlabel('LRs')
plt.ylabel('Accuracy')
plt.title('LRs vs Accuracy')
plt.legend()
plt.show()

### Testing Data ###

In [None]:
# Evaluate the network objects left in scope by the preceding training cell
# (layer1, activation_layer_1, layer2, loss_activation) on a newly generated
# spiral dataset. Forward pass only: no gradients, no parameter updates.
X_test, y_test = spiral_data(classes=3, samples=100)
layer1.forward_prop(X_test)
activation_layer_1.forward_pass(layer1.output)
layer2.forward_prop(activation_layer_1.output)
loss = loss_activation.forward_prop(layer2.output, y_test)

preds = np.argmax(loss_activation.output, axis=1)
# Reduce one-hot labels to class indices before comparing with preds.
# ``shape`` is a tuple, so the rank must be tested with len(); the previous
# ``y_test.shape == 2`` compared a tuple with an int and was always False.
if len(y_test.shape) == 2:
    y_test = np.argmax(y_test, axis=1)

accuracy = np.mean(preds == y_test)
print(f'validation, acc: {accuracy:.3f}, loss: {loss:.3f}')