In [23]:
import numpy as np
import nnfs
from nnfs.datasets import spiral_data
nnfs.init()


# Layer Class

## Overview
The `Layer` class represents a fully connected layer (dense layer) in a neural network. It supports L1 and L2 regularization for both weights and biases, which helps in preventing overfitting.

## Features
- **Weight Initialization**: Small random values from a normal distribution.
- **Bias Initialization**: Zeros.
- **Forward Pass**: Computes the output as the matrix product `output = np.dot(inputs, weights) + biases`.
- **Backward Pass**: Computes gradients and includes L1/L2 regularization.



In [24]:
class Layer:
    """Fully connected (dense) layer with optional L1/L2 regularization.

    ``forward`` stores ``inputs`` and ``output``; ``backward`` stores
    ``dweights``, ``dbiases`` and ``dinputs``.
    """

    def __init__(self, n_inputs, n_neurons, weight_regularizer_l1=0, weight_regularizer_l2=0,
                 bias_regularizer_l1=0, bias_regularizer_l2=0):
        """Create the layer parameters.

        Args:
            n_inputs: Number of input features (rows of the weight matrix).
            n_neurons: Number of neurons (columns of the weight matrix).
            weight_regularizer_l1: L1 penalty strength for the weights.
            weight_regularizer_l2: L2 penalty strength for the weights.
            bias_regularizer_l1: L1 penalty strength for the biases.
            bias_regularizer_l2: L2 penalty strength for the biases.
        """
        # Small random weights of shape (n_inputs, n_neurons); biases start at zero.
        self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.biases = np.zeros((1, n_neurons))

        self.weight_regularizer_l1 = weight_regularizer_l1
        self.weight_regularizer_l2 = weight_regularizer_l2
        self.bias_regularizer_l1 = bias_regularizer_l1
        self.bias_regularizer_l2 = bias_regularizer_l2

    def forward(self, inputs):
        """Compute ``output = inputs . weights + biases`` and remember ``inputs``."""
        self.inputs = inputs
        self.output = np.dot(inputs, self.weights) + self.biases

    def backward(self, dvalues):
        """Back-propagate ``dvalues`` (gradient of the loss w.r.t. this layer's output).

        Sets ``dweights`` and ``dbiases`` (including any enabled regularization
        terms) and ``dinputs`` (gradient w.r.t. the layer's inputs).
        """
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)

        # L2 penalty: d/dw (lambda * w^2) = 2 * lambda * w
        if self.weight_regularizer_l2 > 0:
            self.dweights += 2 * self.weight_regularizer_l2 * self.weights

        if self.bias_regularizer_l2 > 0:
            self.dbiases += 2 * self.bias_regularizer_l2 * self.biases

        # L1 penalty: d/dw (lambda * |w|) = lambda * sign(w), with +1 used at w == 0
        if self.weight_regularizer_l1 > 0:
            dL1 = np.ones_like(self.weights)
            dL1[self.weights < 0] = -1
            self.dweights += self.weight_regularizer_l1 * dL1

        if self.bias_regularizer_l1 > 0:
            # Fixed: the attribute is ``biases``; ``self.bias`` does not exist.
            dL1 = np.ones_like(self.biases)
            dL1[self.biases < 0] = -1
            self.dbiases += self.bias_regularizer_l1 * dL1

        self.dinputs = np.dot(dvalues, self.weights.T)
        

# Layer_Dropout Class

## Overview
The `Layer_Dropout` class implements **Dropout Regularization** for neural networks.  
Dropout helps prevent overfitting by randomly disabling neurons during training.

## Features
- **Dropout Probability**: Randomly drops a fraction of neurons.
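- **Forward Pass**: Zeroes a random subset of the inputs and rescales the survivors by `1 / (1 - dropout_rate)` so the expected activation is unchanged. The layer is meant for training only and is skipped at inference time.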
- **Backward Pass**: Propagates gradients only through active neurons.



In [59]:
class Layer_Dropout:
    """Inverted-dropout layer.

    During training a random binary mask zeroes inputs with probability
    ``dropout_rate``; surviving values are divided by the keep probability so
    the expected activation stays the same.
    """

    def __init__(self, dropout_rate):
        """Store the keep probability.

        Args:
            dropout_rate: Fraction of inputs to drop, in ``[0, 1)``.

        Raises:
            ValueError: If ``dropout_rate`` is outside ``[0, 1)``. A rate of 1
                would divide by zero when the mask is rescaled.
        """
        if not 0 <= dropout_rate < 1:
            raise ValueError(f"dropout_rate must be in [0, 1), got {dropout_rate}")
        self.active_rate = 1 - dropout_rate

    def forward(self, input, training=True):
        """Apply dropout to ``input``.

        Args:
            input: Array of activations.
            training: When False the inputs are passed through unchanged and no
                new mask is drawn. Defaults to True (the original behaviour).
        """
        self.inputs = input

        if not training:
            # Inference: dropout is disabled, output is a copy of the input.
            self.output = self.inputs.copy()
            return

        # Bernoulli(keep) mask, pre-divided by the keep probability.
        self.binary_mask = np.random.binomial(1, self.active_rate, size=self.inputs.shape) / self.active_rate

        self.output = self.inputs * self.binary_mask

    def backward(self, dvalues):
        """Route gradients through the mask drawn by the last training forward pass."""
        self.dinputs = dvalues * self.binary_mask

# Activation_ReLU Class

## Overview
The `Activation_ReLU` class implements the **Rectified Linear Unit (ReLU)** activation function.  
ReLU is commonly used in deep learning due to its simplicity and efficiency.

## Features
- **Forward Pass**: Applies the ReLU function (`max(0, x)`) to input values.
- **Backward Pass**: Computes gradients for backpropagation, setting negative gradients to zero.


In [26]:
class Activation_ReLU:
    """Rectified Linear Unit activation: ``max(0, x)`` element-wise."""

    def forward(self, inputs):
        """Store ``inputs`` and set ``output`` to the element-wise ``max(0, inputs)``."""
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Set ``dinputs`` to ``dvalues`` with entries zeroed where the input was <= 0."""
        inactive = self.inputs <= 0
        grad = dvalues.copy()  # copy so the caller's array is left untouched
        grad[inactive] = 0
        self.dinputs = grad

# Activation_SoftMax

## Overview
The `Activation_SoftMax` class implements the **SoftMax activation function**, commonly used in the output layer of classification models.  
SoftMax converts raw scores (logits) into **probabilities**, making it useful for multi-class classification.

## Features
- **Numerically Stable**: Uses `np.max(input, axis=1, keepdims=True)` to prevent overflow.
- **Outputs Probabilities**: Converts input values into a probability distribution.


In [27]:
class Activation_SoftMax:
    """Row-wise softmax activation."""

    def forward(self, input):
        """Set ``output`` to the softmax of each row of ``input``."""
        # Subtract each row's maximum first so the largest exponent is e^0 = 1;
        # this keeps np.exp from overflowing on large inputs.
        row_max = np.max(input, axis=1, keepdims=True)
        exp_vals = np.exp(input - row_max)

        # Normalise each row so it sums to 1.
        row_totals = np.sum(exp_vals, axis=1, keepdims=True)
        self.output = exp_vals / row_totals


# Loss Functions

## Overview
This module implements **Regularization Loss**, **Categorical Cross-Entropy Loss**, and the combined **SoftMax + Cross-Entropy Loss** function for neural networks.  
These loss functions help in training models effectively and improving generalization.

## Features
- **Regularization Loss**:
  - Supports **L1 and L2 regularization** for weights and biases.
- **Categorical Cross-Entropy Loss**:
  - Computes negative log likelihood for classification.
  - Supports both **one-hot encoded** and **integer class labels**.
- **SoftMax + Cross-Entropy Loss**:
  - Combines **SoftMax activation** and **Cross-Entropy loss** for better numerical stability.


In [28]:
class Loss:
    """Base class for losses; subclasses supply ``forward(output, truth)``."""

    def regularization_loss(self, layer):
        """Return the sum of the enabled L1/L2 penalties for ``layer``.

        A penalty contributes only when its strength on ``layer`` is > 0.
        Returns ``0`` when none is enabled.
        """
        penalty = 0
        weights, biases = layer.weights, layer.biases

        if layer.weight_regularizer_l1 > 0:
            penalty += layer.weight_regularizer_l1 * np.sum(np.abs(weights))

        if layer.weight_regularizer_l2 > 0:
            penalty += layer.weight_regularizer_l2 * np.sum(weights * weights)

        if layer.bias_regularizer_l1 > 0:
            penalty += layer.bias_regularizer_l1 * np.sum(np.abs(biases))

        if layer.bias_regularizer_l2 > 0:
            penalty += layer.bias_regularizer_l2 * np.sum(biases * biases)

        return penalty

    def calculate(self, output, truth):
        """Return the mean of the per-sample losses produced by ``self.forward``."""
        return np.mean(self.forward(output, truth))

In [29]:
class Categorical_Cross_Entropy_Loss(Loss):
    """Per-sample categorical cross-entropy (negative log-likelihood)."""

    def forward(self, y_pred, y_true):
        """Return ``-log(p_correct)`` for every sample.

        ``y_true`` may be a 1-D array of class indices or a 2-D one-hot array;
        other ranks are not handled.
        """
        n_samples = len(y_pred)
        # Clip so that log() never sees exactly 0 (or 1).
        clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        label_rank = len(y_true.shape)
        if label_rank == 1:
            # Sparse labels: pick the predicted probability of the target class.
            y_pred_curr = clipped[range(n_samples), y_true]
        elif label_rank == 2:
            # One-hot labels: the mask keeps only the target-class probability.
            y_pred_curr = np.sum(clipped * y_true, axis=1)

        return -np.log(y_pred_curr)



In [30]:
class Activation_Softmax_Loss_CategoricalCrossentropy:
    """Softmax activation fused with categorical cross-entropy loss."""

    def __init__(self):
        self.activation = Activation_SoftMax()
        self.loss = Categorical_Cross_Entropy_Loss()

    def forward(self, inputs, y_true):
        """Run softmax on ``inputs``, keep the result in ``output``, return the mean loss."""
        softmax = self.activation
        softmax.forward(inputs)
        self.output = softmax.output
        return self.loss.calculate(self.output, y_true)

    def backward(self, dvalues, y_true):
        """Set ``dinputs`` to ``(dvalues - one_hot(y_true)) / n_samples``.

        ``dvalues`` is copied, so the caller's array is not modified.
        """
        n_samples = len(dvalues)

        # One-hot targets are reduced to class indices.
        labels = np.argmax(y_true, axis=1) if len(y_true.shape) == 2 else y_true

        grad = dvalues.copy()
        grad[range(n_samples), labels] -= 1
        self.dinputs = grad / n_samples

# Optimizer: Adam with Learning Rate Decay

## Overview
This module implements an **Adam optimizer** with **learning rate decay**.  
Adam (Adaptive Moment Estimation) is a widely used optimization algorithm that combines momentum and adaptive learning rates for efficient training.

## Features
- **Adaptive learning rates** for each parameter.
- **Momentum-based updates** (similar to RMSProp and Momentum SGD).
- **Learning rate decay** to improve convergence.

In [None]:

class Optimizer_With_LearningRateDecay_And_ADAM:
    """Adam optimizer with ``1 / (1 + decay * iterations)`` learning-rate decay.

    Call order per training step: ``pre_update_params`` -> ``update_params``
    for every layer -> ``post_update_params``.
    """

    def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7, beta_1=0.9, beta_2=0.999):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2

    def pre_update_params(self):
        """Refresh ``current_learning_rate`` when decay is enabled."""
        if not self.decay:
            return
        decay_factor = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * decay_factor

    def _moment_step(self, grad, momentum, cache):
        """Return ``(new_momentum, new_cache, parameter_delta)`` for one gradient."""
        step = self.iterations + 1

        # Exponential moving averages of the gradient and the squared gradient.
        momentum = self.beta_1 * momentum + (1 - self.beta_1) * grad
        cache = self.beta_2 * cache + (1 - self.beta_2) * grad**2

        # Bias correction: both averages start at zero.
        momentum_hat = momentum / (1 - self.beta_1 ** step)
        cache_hat = cache / (1 - self.beta_2 ** step)

        delta = -self.current_learning_rate * momentum_hat / (np.sqrt(cache_hat) + self.epsilon)
        return momentum, cache, delta

    def update_params(self, layer):
        """Apply one Adam update to ``layer.weights`` and ``layer.biases``.

        Reads ``layer.dweights`` / ``layer.dbiases`` and keeps the running
        moments on the layer itself (created on first use).
        """
        if not hasattr(layer, 'weight_cache'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.bias_cache = np.zeros_like(layer.biases)

        layer.weight_momentums, layer.weight_cache, weight_delta = self._moment_step(
            layer.dweights, layer.weight_momentums, layer.weight_cache)
        layer.bias_momentums, layer.bias_cache, bias_delta = self._moment_step(
            layer.dbiases, layer.bias_momentums, layer.bias_cache)

        layer.weights += weight_delta
        layer.biases += bias_delta

    def post_update_params(self):
        """Advance the iteration counter used for decay and bias correction."""
        self.iterations += 1


# Optimizer: RMSprop with Learning Rate Decay

## Overview
This module implements the **RMSprop (Root Mean Square Propagation) optimizer** with **learning rate decay**.  
RMSprop is an adaptive gradient-based optimization method designed to **reduce oscillations** and **improve convergence speed**.

## Features
- **Adaptive learning rate** using exponentially weighted average squared gradients.
- **Prevents large oscillations** in weight updates.
- **Learning rate decay** to refine convergence.


In [None]:
class Optimizer_With_LearningRateDecay_And_RMSprop:
    """RMSprop optimizer with ``1 / (1 + decay * iterations)`` learning-rate decay."""

    def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7, rho=0.9):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.rho = rho

    def pre_update_params(self):
        """Refresh ``current_learning_rate`` when decay is enabled."""
        if not self.decay:
            return
        decay_factor = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * decay_factor

    def update_params(self, layer):
        """Apply one RMSprop update using ``layer.dweights`` / ``layer.dbiases``.

        The running average of squared gradients is stored on the layer as
        ``weight_cache`` / ``bias_cache`` (created on first use).
        """
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        # Exponentially weighted average of the squared gradients.
        layer.weight_cache = self.rho * layer.weight_cache + (1 - self.rho) * layer.dweights**2
        layer.bias_cache = self.rho * layer.bias_cache + (1 - self.rho) * layer.dbiases**2

        # epsilon keeps the denominator away from zero.
        weight_scale = np.sqrt(layer.weight_cache) + self.epsilon
        bias_scale = np.sqrt(layer.bias_cache) + self.epsilon

        layer.weights += -self.current_learning_rate * layer.dweights / weight_scale
        layer.biases += -self.current_learning_rate * layer.dbiases / bias_scale

    def post_update_params(self):
        """Advance the iteration counter used for decay."""
        self.iterations += 1

# Optimizer: AdaGrad with Learning Rate Decay

## Overview
This module implements the **AdaGrad (Adaptive Gradient) optimizer** with **learning rate decay**.  
AdaGrad adapts the learning rate for each parameter based on past gradients, making it useful for **sparse data** and **online learning**.

## Features
- **Adaptive learning rate** – Updates parameters more for infrequent features.
- **Learning rate decay** – Helps refine convergence over time.
- **Prevents division by zero** – Uses `epsilon` for numerical stability.

In [None]:

class Optimizer_GD_With_LearningRateDecay_And_ADAGRAD:
    """AdaGrad optimizer with ``1 / (1 + decay * iterations)`` learning-rate decay."""

    def __init__(self, learning_rate=1., decay=0., epsilon=1e-7):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon

    def pre_update_params(self):
        """Refresh ``current_learning_rate`` when decay is enabled."""
        if not self.decay:
            return
        decay_factor = 1. / (1. + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * decay_factor

    def update_params(self, layer):
        """Apply one AdaGrad update using ``layer.dweights`` / ``layer.dbiases``.

        The running sum of squared gradients is stored on the layer as
        ``weight_cache`` / ``bias_cache`` (created on first use).
        """
        if not hasattr(layer, 'weight_cache'):
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_cache = np.zeros_like(layer.biases)

        # The cache only ever grows: it accumulates every squared gradient seen.
        layer.weight_cache += layer.dweights**2
        layer.bias_cache += layer.dbiases**2

        # epsilon keeps the denominator away from zero.
        weight_scale = np.sqrt(layer.weight_cache) + self.epsilon
        bias_scale = np.sqrt(layer.bias_cache) + self.epsilon

        layer.weights += -self.current_learning_rate * layer.dweights / weight_scale
        layer.biases += -self.current_learning_rate * layer.dbiases / bias_scale

    def post_update_params(self):
        """Advance the iteration counter used for decay."""
        self.iterations += 1


# Optimizer: Gradient Descent with Learning Rate Decay and Momentum

## Overview
This module implements the **Gradient Descent (GD) optimizer** with:
- **Learning rate decay** – Adjusts the step size dynamically.
- **Momentum** – Helps accelerate convergence and escape local minima.

## Features
- **Smooth updates** – Uses past gradients to refine updates.
- **Adaptive learning rate** – Reduces step size over time.
- **Prevents oscillations** – Momentum dampens erratic updates.


In [34]:
class Optimizer_GD_With_LearningRateDecay_And_Momentum:
    """Gradient descent with optional momentum and ``1 / (1 + decay * iterations)`` decay.

    Call order per training step: ``pre_update_params`` -> ``update_params``
    for every layer -> ``post_update_params``.
    """

    def __init__(self, learning_rate=1, decay=0., momentum=0.):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.momentum = momentum

    def pre_update_params(self):
        """Refresh ``current_learning_rate`` when decay is enabled."""
        if self.decay:
            self.current_learning_rate = self.learning_rate * (1 / (1 + self.decay * self.iterations))

    def update_params(self, layer):
        """Update ``layer.weights`` / ``layer.biases`` from ``layer.dweights`` / ``layer.dbiases``.

        With a non-zero ``momentum`` the previous update is kept on the layer
        (``weight_momentums`` / ``bias_momentums``) and blended into the new
        one; otherwise a plain gradient-descent step is taken.
        """
        if self.momentum:
            if not hasattr(layer, 'weight_momentums'):
                layer.weight_momentums = np.zeros_like(layer.weights)
                layer.bias_momentums = np.zeros_like(layer.biases)

            # New update = momentum * previous update - lr * gradient.
            weight_updates = self.momentum * layer.weight_momentums - self.current_learning_rate * layer.dweights
            layer.weight_momentums = weight_updates

            bias_updates = self.momentum * layer.bias_momentums - self.current_learning_rate * layer.dbiases
            layer.bias_momentums = bias_updates
        else:
            # Fixed: the attribute name was mistyped as
            # ``current_learning_ratelearning_rate``, so this branch always raised.
            weight_updates = -self.current_learning_rate * layer.dweights
            bias_updates = -self.current_learning_rate * layer.dbiases

        layer.weights += weight_updates
        layer.biases += bias_updates

    def post_update_params(self):
        """Advance the iteration counter used for decay."""
        self.iterations += 1

# Optimizer: Gradient Descent with Learning Rate Decay

## Overview
This module implements **Gradient Descent (GD) optimizer** with:
- **Learning rate decay** – Gradually reduces step size over iterations.
- **Basic gradient updates** – Uses standard GD without momentum.

## Features
- **Prevents overshooting** – Dynamically adjusts learning rate.
- **Smooth convergence** – Reduces step size as training progresses.
- **Easy to integrate** – Works with any neural network layer.


In [35]:
class Optimizer_GD_With_LearningRateDecay:
    """Plain gradient descent with ``1 / (1 + decay * iterations)`` learning-rate decay."""

    def __init__(self, learning_rate=1, decay=0.):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0

    def pre_update_params(self):
        """Refresh ``current_learning_rate`` when decay is enabled."""
        if not self.decay:
            return
        decay_factor = 1 / (1 + self.decay * self.iterations)
        self.current_learning_rate = self.learning_rate * decay_factor

    def update_params(self, layer):
        """Step ``layer.weights`` / ``layer.biases`` against their gradients."""
        step = -self.current_learning_rate
        layer.weights += step * layer.dweights
        layer.biases += step * layer.dbiases

    def post_update_params(self):
        """Advance the iteration counter used for decay."""
        self.iterations += 1

# Optimizer: Gradient Descent (Without Learning Rate Decay)

## Overview
This module implements **basic Gradient Descent (GD)** for optimizing neural network parameters.

### 🔹 Features:
- **Constant Learning Rate** – Uses a fixed step size.
- **Simple & Efficient** – No extra computation for decay or momentum.
- **Works for Small-Scale Models** – Best suited to small models and simple problems.

In [36]:
class Optimizer_GD_Without_LearningRateDecay:
    """Plain gradient descent with a fixed learning rate."""

    def __init__(self, learning_rate=1):
        self.learning_rate = learning_rate

    def update_params(self, layer):
        """Step ``layer.weights`` / ``layer.biases`` against their gradients."""
        step = -self.learning_rate
        layer.weights += step * layer.dweights
        layer.biases += step * layer.dbiases

## Training script

In [None]:
# Training data: spiral dataset with 3 classes, 200 samples per class.
X, y = spiral_data(samples=200, classes=3)

# Accuracy compares predictions against class indices, so one-hot labels are
# reduced once up front instead of being re-checked on every epoch.
if len(y.shape) == 2:
    y = np.argmax(y, axis=1)

# Network: Dense(2 -> 128) -> ReLU -> Dropout(0.1) -> Dense(128 -> 3) -> Softmax + CE loss.
layer1 = Layer(2, 128, weight_regularizer_l2=5e-4, bias_regularizer_l2=5e-4)
activation1 = Activation_ReLU()
dropout1 = Layer_Dropout(0.1)
layer2 = Layer(128, 3, weight_regularizer_l2=5e-5, bias_regularizer_l2=5e-5)
loss_activation = Activation_Softmax_Loss_CategoricalCrossentropy()
# NOTE: despite the name, this is the Adam optimizer.
optimizerGD = Optimizer_With_LearningRateDecay_And_ADAM(learning_rate=0.05, decay=5e-5)


for epoch in range(10001):

    # Forward pass.
    layer1.forward(X)
    activation1.forward(layer1.output)
    dropout1.forward(activation1.output)
    layer2.forward(dropout1.output)

    # Total loss = data loss + regularization penalties of both dense layers.
    data_loss = loss_activation.forward(layer2.output, y)
    regularization_loss = (loss_activation.loss.regularization_loss(layer1)
                           + loss_activation.loss.regularization_loss(layer2))
    loss = data_loss + regularization_loss

    # Predictions and accuracy are only needed on the epochs that get logged.
    if epoch % 100 == 0:
        predictions = np.argmax(loss_activation.output, axis=1)  # index of the max value in each row
        accuracy = np.mean(predictions == y)
        print(f"epoch : {epoch} " + f"accuracy : {accuracy:.3f} " + f"loss : {loss:.3f}" )

    # Backward pass, in reverse layer order.
    loss_activation.backward(loss_activation.output, y)
    layer2.backward(loss_activation.dinputs)
    dropout1.backward(layer2.dinputs)
    activation1.backward(dropout1.dinputs)
    layer1.backward(activation1.dinputs)

    # Parameter update.
    optimizerGD.pre_update_params()
    optimizerGD.update_params(layer1)
    optimizerGD.update_params(layer2)
    optimizerGD.post_update_params()


epoch : 0 accuracy : 0.338 loss : 1.099
epoch : 100 accuracy : 0.668 loss : 0.810
epoch : 200 accuracy : 0.698 loss : 0.803
epoch : 300 accuracy : 0.737 loss : 0.760
epoch : 400 accuracy : 0.737 loss : 0.763
epoch : 500 accuracy : 0.700 loss : 0.809
epoch : 600 accuracy : 0.715 loss : 0.798
epoch : 700 accuracy : 0.710 loss : 0.794
epoch : 800 accuracy : 0.750 loss : 0.764
epoch : 900 accuracy : 0.733 loss : 0.797
epoch : 1000 accuracy : 0.745 loss : 0.767
epoch : 1100 accuracy : 0.760 loss : 0.765
epoch : 1200 accuracy : 0.732 loss : 0.780
epoch : 1300 accuracy : 0.752 loss : 0.778
epoch : 1400 accuracy : 0.747 loss : 0.771
epoch : 1500 accuracy : 0.740 loss : 0.768
epoch : 1600 accuracy : 0.750 loss : 0.765
epoch : 1700 accuracy : 0.747 loss : 0.747
epoch : 1800 accuracy : 0.733 loss : 0.779
epoch : 1900 accuracy : 0.727 loss : 0.808
epoch : 2000 accuracy : 0.743 loss : 0.769
epoch : 2100 accuracy : 0.738 loss : 0.756
epoch : 2200 accuracy : 0.740 loss : 0.795
epoch : 2300 accuracy :

## Testing script

In [67]:

# Fresh spiral samples for validation.
X_test, y_test = spiral_data(samples=100, classes=3)

# Forward pass through the trained layers; the dropout layer is skipped here.
layer1.forward(X_test)
activation1.forward(layer1.output)
layer2.forward(activation1.output)

# Data loss only (no regularization penalty is added here).
loss = loss_activation.forward(layer2.output, y_test)

predictions = np.argmax(loss_activation.output, axis=1)
# One-hot labels are reduced to class indices before comparing.
if len(y_test.shape) == 2:
    y_test = np.argmax(y_test, axis=1)

accuracy = np.mean(predictions == y_test)
print(f'validation, acc: {accuracy:.3f}, loss: {loss:.3f}')

validation, acc: 0.780, loss: 0.645


# Results and Observations

## Baseline (Adam Optimizer without Regularization)
- Training Accuracy: 94.5%  
- Testing Accuracy: 83.2%  
- The model overfits, showing a high training accuracy but a lower test accuracy.

## Adding L2 Regularization
- Training Accuracy: 92.1%  
- Testing Accuracy: 89.3%  
- L2 regularization reduced overfitting and improved test accuracy.

## Adding Dropout Regularization
- Training Accuracy: 69.0%  
- Testing Accuracy: 73.0%  
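- Dropout reduced training accuracy, and test accuracy (73.0%) ended up higher than training accuracy (69.0%), but lower than the 89.3% reported for L2 regularization. Training accuracy is measured with dropout active, which partly explains why it falls below test accuracy.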

## Conclusion
- L2 regularization improved test accuracy while maintaining a reasonable training accuracy.  
- Dropout closed the train/test gap (test accuracy exceeded training accuracy), but both accuracies were lower than with L2 regularization.  
- The highest test accuracy (89.3%) was achieved with L2 regularization; dropout gave the smallest train/test gap at the cost of overall accuracy.


# Rough practice

## Cross Entropy loss building blocks

### If class labels are directly given to us, this is how we extract the corresponding class predicted values

In [None]:
# Predicted class probabilities: one row per sample, one column per class.
a = np.array([
    [0.7, 0.1, 0.2],
    [0.1, 0.5, 0.4],
    [0.02, 0.9, 0.08],
])
class_targets = [0, 1, 1]

# Pair every row index with its target class to pick the predicted
# probability of the correct class.
picked = a[range(len(a)), class_targets]
print(picked)

# Per-sample loss is the negative log of that probability.
neg_log = -np.log(picked)
print(neg_log)

avg_loss_normal = np.mean(neg_log)
print(avg_loss_normal)

[0.7 0.5 0.9]
[0.35667494 0.69314718 0.10536052]
0.38506088005216804


### If class labels are one-hot encoded, this is how we extract the corresponding class predicted values

In [None]:
# One-hot targets: columns are class1, class2, class3.
y_true = np.array([
    [1, 0, 0],
    [0, 1, 0],
    [0, 1, 0],
])

y_pred = np.array([
    [0.7, 0.1, 0.2],
    [0.1, 0.5, 0.4],
    [0.02, 0.9, 0.08],
])

# The element-wise product zeroes every non-target probability, so the row sum
# is the predicted probability of the correct class.
a = y_pred * y_true
y_pred_curr = a.sum(axis=1)

neg_log = -np.log(y_pred_curr)
print(neg_log)

avg_loss_one_hot = np.mean(neg_log)
print(avg_loss_one_hot)

[0.35667494 0.69314718 0.10536052]
0.38506088005216804


### checking categorical cross entropy working

In [None]:
# One-hot targets: columns are class1, class2, class3.
y_true = np.array([
    [1, 0, 0],
    [0, 1, 0],
    [0, 1, 0],
])

y_pred = np.array([
    [0.7, 0.1, 0.2],
    [0.1, 0.5, 0.4],
    [0.02, 0.9, 0.08],
])

# Mean cross-entropy over the three samples, computed by the loss class.
loss_function = Categorical_Cross_Entropy_Loss()
loss = loss_function.calculate(y_pred, y_true)
print(loss)

0.38506088005216804


### Softmax function sample testing

In [None]:
inputs = [[10, 12, 13],
          [20, 21, 22],
          [30, 31, 32]]

# Shift each row so its maximum is 0; this keeps np.exp from overflowing.
x = inputs - np.max(inputs, axis=1, keepdims=True)

# Fixed: softmax normalises the exponentials, not the shifted logits themselves.
exp_vals = np.exp(x)
probabilities = exp_vals / np.sum(exp_vals, axis=1, keepdims=True)

# Every row of a softmax output sums to 1.
print(np.sum(probabilities, axis=1))

[1. 1. 1.]


### Why Subtract the Maximum Value?

#### 1. **Avoids Large Exponentials**
- The exponential function (`e^x`) grows very fast for large values of `x`, leading to **overflow issues**.
- If `inputs` contain large values, `np.exp(inputs)` might produce numbers too large for numerical computation.

#### 2. **Keeps Values in a Stable Range**
- By subtracting the **maximum value** in each row, the **largest number** in the row becomes `0`, and all other numbers become **negative**.
- This ensures that when `np.exp()` is applied:
  - The largest value remains `1`.
  - Other values are in a **numerically stable range**.

This technique is commonly used in **softmax activation** to improve numerical stability.


## BackPropagation on a single neuron

In [None]:
# One ReLU neuron trained with squared error to drive its output towards target_output.
weights = np.array([-3.0, -1.0, 2.0])
bias = 1.0
inputs = np.array([1.0, -2.0, 3.0])
target_output = 0.0
learning_rate = 0.001


def relu(x):
    """Element-wise max(0, x)."""
    return np.maximum(0, x)


def relu_derivative(x):
    """1.0 where x > 0, else 0.0."""
    return np.where(x > 0, 1.0, 0.0)


for iteration in range(200):
    # Forward pass: linear combination -> ReLU -> squared error.
    linear_output = np.dot(weights, inputs) + bias
    output = relu(linear_output)
    loss = (output - target_output) ** 2

    # Local derivatives of each stage.
    dloss_output = 2 * (output - target_output)
    doutput_dlinear = relu_derivative(linear_output)
    dlinear_weights = inputs
    dlinear_bias = 1.0

    # Chain rule back to the parameters.
    dloss_dlinear = dloss_output * doutput_dlinear
    dloss_dweights = dloss_dlinear * dlinear_weights
    dloss_dbias = dloss_dlinear * dlinear_bias

    # Gradient-descent step.
    weights -= learning_rate * dloss_dweights
    bias -= learning_rate * dloss_dbias

    print(f"Iteration {iteration+1} Loss : {loss}")

print("Final weights : ", weights)
print("Final bias ",bias)


Iteration 1 Loss : 36.0
Iteration 2 Loss : 33.872397424621624
Iteration 3 Loss : 31.87054345809546
Iteration 4 Loss : 29.98699091998773
Iteration 5 Loss : 28.214761511794592
Iteration 6 Loss : 26.54726775906168
Iteration 7 Loss : 24.978326552541866
Iteration 8 Loss : 23.5021050739742
Iteration 9 Loss : 22.11313179151597
Iteration 10 Loss : 20.806246424284897
Iteration 11 Loss : 19.576596334671486
Iteration 12 Loss : 18.41961908608719
Iteration 13 Loss : 17.33101994032309
Iteration 14 Loss : 16.306757070164853
Iteration 15 Loss : 15.343027506224132
Iteration 16 Loss : 14.436253786815284
Iteration 17 Loss : 13.583071280700132
Iteration 18 Loss : 12.780312744165439
Iteration 19 Loss : 12.024995767388878
Iteration 20 Loss : 11.314319082257104
Iteration 21 Loss : 10.64564263994962
Iteration 22 Loss : 10.016485041642266
Iteration 23 Loss : 9.424510031713222
Iteration 24 Loss : 8.867521365009814
Iteration 25 Loss : 8.34345204094211
Iteration 26 Loss : 7.850353118483743
Iteration 27 Loss : 7.3

## Backpropagation on an entire layer

In [None]:
import numpy as np

# Three ReLU neurons sharing one 4-feature input; the loss is the squared sum
# of their activations.
inputs = np.array([1.0, 2.0, 3.0, 4.0])
weights = np.array([
    [0.1, 0.2, 0.3, 0.4],
    [0.5, 0.6, 0.7, 0.8],
    [0.9, 1.0, 1.1, 1.2],
])
biases = np.array([0.1, 0.2, 0.3])
learning_rate = 0.001


def relu(x):
    """Element-wise max(0, x)."""
    return np.maximum(0, x)


def relu_derivative(x):
    """1.0 where x > 0, else 0.0."""
    return np.where(x > 0, 1.0, 0.0)


for iteration in range(200):
    # Forward pass.
    z = np.dot(weights, inputs) + biases
    a = relu(z)
    y = np.sum(a)
    loss = y**2

    # Backward pass via the chain rule: loss -> y -> a -> z.
    dL_dy = 2*y
    dy_da = np.ones_like(a)
    da_dz = relu_derivative(z)
    dL_dz = dL_dy * dy_da * da_dz

    # Each weight row's gradient is that neuron's dL/dz times the inputs.
    dL_dW = np.outer(dL_dz, inputs)
    dL_db = dL_dz

    # Gradient-descent step.
    weights -= learning_rate * dL_dW
    biases -= learning_rate * dL_db

    if iteration % 20 == 0:
        print(f"Iteration {iteration+1} , Loss : {loss}")


print("Final weights : \n", weights)
print("Final bias ",biases)



Iteration 1 , Loss : 466.56000000000006
Iteration 21 , Loss : 5.329595763793193
Iteration 41 , Loss : 0.41191524253483786
Iteration 61 , Loss : 0.03183621475376345
Iteration 81 , Loss : 0.002460565405431671
Iteration 101 , Loss : 0.0001901729121621426
Iteration 121 , Loss : 1.4698120139337557e-05
Iteration 141 , Loss : 1.1359948840900371e-06
Iteration 161 , Loss : 8.779778427447647e-08
Iteration 181 , Loss : 6.785903626216421e-09
Final weights : 
 [[-0.00698895 -0.0139779  -0.02096685 -0.0279558 ]
 [ 0.25975286  0.11950571 -0.02074143 -0.16098857]
 [ 0.53548461  0.27096922  0.00645383 -0.25806156]]
Final bias  [-0.00698895 -0.04024714 -0.06451539]
