In [64]:
import ipywidgets as widgets
import torch
import torch.autograd as autograd
import matplotlib.pyplot as plt
import numpy as np
from ipywidgets import fixed  # Importing fixed
import torch.nn.functional as F

In [22]:
def plot_multiple_functions(func_list, x_range_start=-10, x_range_end=10, num_points=100, log_scale=False, **kwargs):
    """Plot every callable in ``func_list`` over one shared, evenly spaced x-grid.

    Args:
        func_list: Iterable of callables. Each one is called once with the
            whole NumPy grid and must return y-values that ``plt.plot`` accepts.
        x_range_start: Left edge of the grid passed to ``np.linspace``.
        x_range_end: Right edge of the grid passed to ``np.linspace``.
        num_points: Number of grid points.
        log_scale: When true, the y-axis is switched to a logarithmic scale.
        **kwargs: Forwarded unchanged to every ``plt.plot`` call.
    """
    x = np.linspace(x_range_start, x_range_end, num_points)

    plotted_any = False
    for func in func_list:
        y = func(x)
        # Not every callable carries __name__ (functools.partial objects and
        # callable instances such as nn.Module do not), so fall back to the
        # type name instead of raising AttributeError.
        label = getattr(func, '__name__', None) or type(func).__name__
        plt.plot(x, y, label=label, **kwargs)
        plotted_any = True

    if log_scale:
        plt.yscale('log')

    plt.xlabel('x')
    plt.ylabel('y')
    plt.title('Plot of Multiple Functions')
    plt.grid(True)
    # A legend without any labelled line only produces a matplotlib warning.
    if plotted_any:
        plt.legend()
    plt.show()


In [25]:
## Lets start with a simple scalar example
def sigmoid(x):
    """Element-wise logistic sigmoid, ``1 / (1 + exp(-x))``.

    Args:
        x: A ``torch.Tensor``, a NumPy array, or a Python scalar.

    Returns:
        A ``torch.Tensor`` with the sigmoid of every element of ``x``.
    """
    # torch.exp only accepts tensors, but plot_multiple_functions hands its
    # callables a NumPy grid. as_tensor leaves an existing tensor untouched.
    x = torch.as_tensor(x)
    return 1 / (1 + torch.exp(-x))

In [26]:
# Slider for the left edge of the plotted interval: range [-20, 0] in steps
# of 0.5, starting at -10.
x_range_start_slider = widgets.FloatSlider(
    value=-10,
    min=-20,
    max=0,
    step=0.5,
    description='x_range_start',
)

# Hook the slider up to the plot. Every argument wrapped in `fixed` is handed
# to the plotting function as a constant, so only x_range_start is adjustable.
widgets.interact(
    plot_multiple_functions,
    func_list=fixed([sigmoid]),
    x_range_start=x_range_start_slider,
    x_range_end=fixed(10),
    num_points=fixed(100),
    log_scale=fixed(False),
)


interactive(children=(FloatSlider(value=-10.0, description='x_range_start', max=0.0, min=-20.0, step=0.5), Out…

<function __main__.plot_multiple_functions(func_list, x_range_start=-10, x_range_end=10, num_points=100, log_scale=False, **kwargs)>

In [80]:
# Affine layer mapping 5 input features to 3 outputs; used as the running example.
my_test_linear_transform = torch.nn.Linear(in_features=5, out_features=3)

# Print each parameter the layer registers, by name.
for name, param in my_test_linear_transform.named_parameters():
    print(name, param)

weight Parameter containing:
tensor([[ 0.4286,  0.3634,  0.3349, -0.0226,  0.3046],
        [-0.1771, -0.4120, -0.4424,  0.0497, -0.0045],
        [-0.1930, -0.1363, -0.0225,  0.2749, -0.0244]], requires_grad=True)
bias Parameter containing:
tensor([ 0.2026, -0.1183,  0.1449], requires_grad=True)


In [95]:
## Toy dataset: 10 samples with 5 features each, plus one integer class label
## in {0, 1, 2} per sample.
x = torch.randn(10, 5)
y_target = torch.randint(0, 3, (10,))

# Work on gradient-free copies of the layer's parameters so that nothing below
# is tracked by autograd.
x.requires_grad_(False)
weight_matrix = my_test_linear_transform.weight.detach().clone()
weight_matrix.requires_grad_(False)
bias = my_test_linear_transform.bias.detach().clone()
bias.requires_grad_(False)

## Forward pass and cross-entropy loss, computed without building a graph.
with torch.no_grad():
    # Logits from the layer itself ...
    y_a_manual = my_test_linear_transform(x)
    # ... and the same affine map written out as a matrix product.
    y_a_manual_mm = x @ weight_matrix.t() + bias
    # Class probabilities: softmax across the class dimension of each row.
    y_a_softmax_manual = F.softmax(y_a_manual, dim=1)
    # One loss value per sample, then their mean.
    loss_manual_vec = F.cross_entropy(y_a_manual, y_target, reduction='none')
    loss_manual = loss_manual_vec.mean()



In [96]:
# # Example tensors
# matrix_tensor = torch.randn(10, 5)  # A (10, 5) tensor
# index_tensor = torch.randint(0, 5, (10,))  # An index tensor of shape (10) with values between 0 and 4

# # Extract values
# extracted_values = matrix_tensor[torch.arange(matrix_tensor.size(0)), index_tensor]


To derive the gradient of the softmax activation function, let's start with the definition of the softmax function. The softmax function is often used in multi-class classification tasks, and it is defined as follows:

For a vector $\mathbf{z} = [z_1, z_2, \dots, z_n]$, the softmax function produces an output vector $\mathbf{y} = [y_1, y_2, \dots, y_n]$, where each component $y_i$ is given by:

$$
y_i = \frac{e^{z_i}}{\sum_{j=1}^{n} e^{z_j}}
$$

Here, $\mathbf{y}$ represents the probabilities for each class, and each $y_i$ is constrained to lie between 0 and 1, with the sum of all $y_i$ equal to 1.

### Deriving the Gradient

The goal is to find the gradient of the softmax function with respect to its inputs $\mathbf{z}$. Specifically, we want to compute $\frac{\partial y_i}{\partial z_k}$ for each pair of $i$ and $k$.

**Case 1: When $i = k$**

Let's start with the derivative of $y_i$ with respect to $z_i$:

$$
\frac{\partial y_i}{\partial z_i} = \frac{\partial}{\partial z_i} \left(\frac{e^{z_i}}{\sum_{j=1}^{n} e^{z_j}}\right)
$$

Using the quotient rule:

$$
\frac{\partial y_i}{\partial z_i} = \frac{ \left(\sum_{j=1}^{n} e^{z_j}\right) \cdot \frac{\partial e^{z_i}}{\partial z_i} - e^{z_i} \cdot \frac{\partial \sum_{j=1}^{n} e^{z_j}}{\partial z_i}}{\left(\sum_{j=1}^{n} e^{z_j}\right)^2}
$$

Since $\frac{\partial e^{z_i}}{\partial z_i} = e^{z_i}$ and $\frac{\partial \sum_{j=1}^{n} e^{z_j}}{\partial z_i} = e^{z_i}$, we have:

$$
\frac{\partial y_i}{\partial z_i} = \frac{e^{z_i} \sum_{j=1}^{n} e^{z_j} - e^{z_i} \cdot e^{z_i}}{\left(\sum_{j=1}^{n} e^{z_j}\right)^2}
$$

$$
\frac{\partial y_i}{\partial z_i} = \frac{e^{z_i} \left(\sum_{j=1}^{n} e^{z_j} - e^{z_i}\right)}{\left(\sum_{j=1}^{n} e^{z_j}\right)^2}
$$

$$
\frac{\partial y_i}{\partial z_i} = \frac{e^{z_i}}{\sum_{j=1}^{n} e^{z_j}} \cdot \left(1 - \frac{e^{z_i}}{\sum_{j=1}^{n} e^{z_j}}\right)
$$

$$
\frac{\partial y_i}{\partial z_i} = y_i \cdot (1 - y_i)
$$

**Case 2: When $i \neq k$**

Now, consider the derivative of $y_i$ with respect to $z_k$ where $i \neq k$:

$$
\frac{\partial y_i}{\partial z_k} = \frac{\partial}{\partial z_k} \left(\frac{e^{z_i}}{\sum_{j=1}^{n} e^{z_j}}\right)
$$

Again, using the quotient rule:

$$
\frac{\partial y_i}{\partial z_k} = \frac{0 \cdot \sum_{j=1}^{n} e^{z_j} - e^{z_i} \cdot \frac{\partial \sum_{j=1}^{n} e^{z_j}}{\partial z_k}}{\left(\sum_{j=1}^{n} e^{z_j}\right)^2}
$$

Since $\frac{\partial \sum_{j=1}^{n} e^{z_j}}{\partial z_k} = e^{z_k}$, we have:

$$
\frac{\partial y_i}{\partial z_k} = - \frac{e^{z_i} \cdot e^{z_k}}{\left(\sum_{j=1}^{n} e^{z_j}\right)^2}
$$

$$
\frac{\partial y_i}{\partial z_k} = - \frac{e^{z_i}}{\sum_{j=1}^{n} e^{z_j}} \cdot \frac{e^{z_k}}{\sum_{j=1}^{n} e^{z_j}}
$$

$$
\frac{\partial y_i}{\partial z_k} = - y_i \cdot y_k
$$

### Putting It All Together

The gradient of the softmax function with respect to its inputs can be expressed as:

$$
\frac{\partial y_i}{\partial z_k} = 
\begin{cases}
y_i \cdot (1 - y_i) & \text{if } i = k \\
- y_i \cdot y_k & \text{if } i \neq k
\end{cases}
$$

This result can also be written more compactly using the Kronecker delta notation $\delta_{ik}$, which is 1 if $i = k$ and 0 otherwise:

$$
\frac{\partial y_i}{\partial z_k} = y_i \cdot (\delta_{ik} - y_k)
$$

This is the gradient of the softmax function with respect to its inputs, and it's crucial for calculating gradients in neural networks during backpropagation when using softmax as the output activation function.




# Gradient Flow in Classification Problems

## Introduction

In a typical classification problem, we have:

- **N**: The number of data points in the dataset.
- **C**: The number of classes or categories the model can predict.

For each data point $x_i$, the model outputs a probability distribution over the **C** classes using a softmax layer. The training process involves minimizing a loss function, often the cross-entropy loss, and updating the model's parameters using gradient descent.

## Cross-Entropy Loss Function

Given the true label $y_i$ (one-hot encoded) and the predicted probability vector $\hat{y}_i$ from the softmax layer, the cross-entropy loss for a single data point is:

$$
L_i = -\sum_{j=1}^{C} y_{ij} \log(\hat{y}_{ij})
$$

The total loss over all **N** data points is:

$$
L = -\frac{1}{N} \sum_{i=1}^{N} \sum_{j=1}^{C} y_{ij} \log(\hat{y}_{ij})
$$

## Softmax Output

The softmax function converts the logits (raw outputs of the network) $z_i$ into probabilities $\hat{y}_i$ for each class:

$$
\hat{y}_{ij} = \frac{e^{z_{ij}}}{\sum_{k=1}^{C} e^{z_{ik}}}
$$

Where:
- $z_{ij}$ is the logit for the $j$-th class of the $i$-th data point.
- $\hat{y}_{ij}$ is the predicted probability of the $i$-th data point belonging to class $j$.

## Gradient Flow

### Gradient with Respect to the Softmax Output

The gradient of the per-sample loss $L_i$ with respect to the softmax output $\hat{y}_{ij}$ is:

$$
\frac{\partial L_i}{\partial \hat{y}_{ij}} = -\frac{y_{ij}}{\hat{y}_{ij}}
$$

### Gradient with Respect to the Logits

To compute the gradient with respect to the logits $z_{ij}$, we apply the chain rule. The key steps are:

1. **Softmax Gradient**:

   - For the same class (the derivative of $\hat{y}_{ij}$ with respect to its own logit $z_{ij}$):

     $$
     \frac{\partial \hat{y}_{ij}}{\partial z_{ij}} = \hat{y}_{ij}(1 - \hat{y}_{ij})
     $$

   - For a different class $k \neq j$:

     $$
     \frac{\partial \hat{y}_{ik}}{\partial z_{ij}} = -\hat{y}_{ij}\hat{y}_{ik}
     $$

2. **Chain Rule Application**:

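   The gradient of the per-sample loss $L_i$ with respect to the logits is obtained by summing the contributions of all classes $k$ and using $\sum_{k} y_{ik} = 1$:

   $$
   \frac{\partial L_i}{\partial z_{ij}} = \sum_{k=1}^{C} \frac{\partial L_i}{\partial \hat{y}_{ik}} \frac{\partial \hat{y}_{ik}}{\partial z_{ij}} = -y_{ij}(1 - \hat{y}_{ij}) + \sum_{k \neq j} y_{ik}\hat{y}_{ij} = \hat{y}_{ij} - y_{ij}
   $$

   This result shows that the gradient is the difference between the predicted probability $\hat{y}_{ij}$ and the true label $y_{ij}$. For the total loss $L$, which averages over the $N$ data points, this is scaled by $\frac{1}{N}$.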

### Gradient with Respect to Weights and Biases

Assume the logits $z_{ij}$ are computed as a linear combination of the input features $x_i$ with weights $w_j$ and bias $b_j$:

$$
z_{ij} = w_j^T x_i + b_j
$$

The gradients with respect to the weights and biases are:

- **Weights**:

  $$
  \frac{\partial L}{\partial w_j} = \frac{1}{N} \sum_{i=1}^{N} \frac{\partial L_i}{\partial z_{ij}} \cdot \frac{\partial z_{ij}}{\partial w_j} = \frac{1}{N} \sum_{i=1}^{N} (\hat{y}_{ij} - y_{ij}) x_i
  $$

- **Biases**:

  $$
  \frac{\partial L}{\partial b_j} = \frac{1}{N} \sum_{i=1}^{N} (\hat{y}_{ij} - y_{ij})
  $$

These gradients are then used to update the model's parameters during the training process using gradient descent or a variant like stochastic gradient descent (SGD).

## Summary

- The gradient of the per-sample loss with respect to the logits $z_{ij}$ is simply the difference between the predicted probability and the true label.
- The gradients with respect to the weights and biases are derived using the chain rule and are used in the optimization process to minimize the loss function and improve the model's performance.



In [122]:
# Ten uniform random values; note this rebinds `x`.
x = torch.rand((10,))
def sigmoid_f(x: torch.Tensor) -> torch.Tensor:
    """Apply the logistic function ``1 / (1 + exp(-x))`` to each element of ``x``."""
    denominator = 1 + torch.exp(-x)
    return 1 / denominator
def softmax_f(x: torch.Tensor) -> torch.Tensor:
    """Softmax taken over all elements of ``x`` (the sum has no ``dim``).

    Args:
        x: Tensor of logits.

    Returns:
        Tensor of the same shape as ``x`` whose elements are
        ``exp(x_i) / sum_j exp(x_j)``.
    """
    if x.numel() == 0:
        # max() is undefined on an empty tensor; there is nothing to normalise.
        return torch.exp(x)
    # Softmax is unchanged when the same constant is added to every logit, so
    # subtract the largest one: every exponent is then <= 0 and exp cannot
    # overflow to inf (which would give inf / inf = nan). The shift is detached
    # so autograd treats it as a constant.
    shifted = x - x.detach().max()
    exp_x = torch.exp(shifted)
    return exp_x / exp_x.sum()

In [121]:
# Element-wise sigmoid of the sample vector x.
print(sigmoid_f(x))

tensor([0.5274, 0.6246, 0.5872, 0.5913, 0.5987, 0.5771, 0.5442, 0.5708, 0.5868,
        0.5979])


In [123]:
# s * (1 - s) with s = sigmoid(x); evaluate the sigmoid once and reuse it.
sigmoid_of_x = sigmoid_f(x)
print(sigmoid_of_x * (1 - sigmoid_of_x))

tensor([0.2293, 0.2044, 0.2483, 0.2175, 0.2471, 0.2340, 0.2267, 0.2224, 0.2499,
        0.2470])


In [125]:
# y_i * (1 - y_i) with y = softmax(x): the i = k case of the softmax gradient.
softmax_of_x = softmax_f(x)
print(softmax_of_x * (1 - softmax_of_x))

tensor([0.0967, 0.1271, 0.0657, 0.1113, 0.0689, 0.0904, 0.1000, 0.1053, 0.0581,
        0.0690])


In [127]:
# Matrix of all products y_i * y_k with y = softmax(x); the i != k case of the
# softmax gradient is the negative of these entries.
torch.outer(softmax_f(x), softmax_f(x))

tensor([[0.0118, 0.0162, 0.0077, 0.0138, 0.0081, 0.0109, 0.0122, 0.0130, 0.0067,
         0.0081],
        [0.0162, 0.0223, 0.0106, 0.0191, 0.0111, 0.0150, 0.0168, 0.0179, 0.0093,
         0.0111],
        [0.0077, 0.0106, 0.0050, 0.0090, 0.0053, 0.0071, 0.0080, 0.0085, 0.0044,
         0.0053],
        [0.0138, 0.0191, 0.0090, 0.0163, 0.0095, 0.0128, 0.0144, 0.0153, 0.0079,
         0.0095],
        [0.0081, 0.0111, 0.0053, 0.0095, 0.0055, 0.0075, 0.0084, 0.0089, 0.0046,
         0.0056],
        [0.0109, 0.0150, 0.0071, 0.0128, 0.0075, 0.0101, 0.0113, 0.0120, 0.0062,
         0.0075],
        [0.0122, 0.0168, 0.0080, 0.0144, 0.0084, 0.0113, 0.0127, 0.0135, 0.0070,
         0.0084],
        [0.0130, 0.0179, 0.0085, 0.0153, 0.0089, 0.0120, 0.0135, 0.0143, 0.0074,
         0.0089],
        [0.0067, 0.0093, 0.0044, 0.0079, 0.0046, 0.0062, 0.0070, 0.0074, 0.0038,
         0.0046],
        [0.0081, 0.0111, 0.0053, 0.0095, 0.0056, 0.0075, 0.0084, 0.0089, 0.0046,
         0.0056]])

In [124]:
from torch.autograd.functional import jacobian, hessian
from torch.autograd import grad

# Autograd Jacobians of both functions at x, to compare with the closed-form
# expressions evaluated in the cells above.
jacobian(sigmoid_f, x), jacobian(softmax_f, x)

(tensor([[0.2293, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000],
         [0.0000, 0.2044, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000],
         [0.0000, 0.0000, 0.2483, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000],
         [0.0000, 0.0000, 0.0000, 0.2175, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.2471, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2340, 0.0000, 0.0000, 0.0000,
          0.0000],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2267, 0.0000, 0.0000,
          0.0000],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2224, 0.0000,
          0.0000],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2499,
          0.0000],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         

## Now we will see how local gradient is calculated for linear transformations of neural networks.

### Formulation of the Problem

Given a weight matrix $W$ and a bias vector $b$, the output of a linear transformation is given by:

$$
z = Wx + b
$$

where:
- $W$ is an $m \times n$ matrix.
- $x$ is an $n \times 1$ column vector.
- $b$ is an $m \times 1$ column vector.
- $z$ is an $m \times 1$ column vector.

At this point it helps to view a matrix as a transformation of space. The matrix $W$ transforms the input vector $x$ from an $n$-dimensional space to an $m$-dimensional space. The bias vector $b$ shifts the transformed vector in the $m$-dimensional space. The columns of the matrix $W$ are the directions in which the input vector $x$ is transformed. Multiplying the matrix $W$ by a vector $x$ gives a linear combination of the columns of $W$, with the entries of $x$ as the coefficients.

$$
\begin{aligned}
W &= \begin{bmatrix} | & | & | \\ \vec{w}_1 & \vec{w}_2 & \vec{w}_3 \\ | & | & | \end{bmatrix} \\
W \vec{x} &= x_1 \vec{w}_1 + x_2 \vec{w}_2 + x_3 \vec{w}_3
\end{aligned}
$$

In terms of basis vectors, the matrix $W$ transforms the standard basis vectors $\vec{e}_1, \vec{e}_2, \ldots, \vec{e}_n$ to the columns of the matrix $W$. The output of the linear transformation is a linear combination of the transformed basis vectors.

### Local Gradient

Let's derive the local gradient of the linear transformation. This is a vector-valued transformation, and for a vector-valued transformation the local gradient is a Jacobian matrix: the matrix of partial derivatives of the output vector with respect to the input vector. Computing the Jacobian matrix involves computing the partial derivative of each output element with respect to each input element.

$$
\begin{aligned}
z_i & = \sum_{j=1}^{n} W_{ij}x_j + b_i \\
\frac{\partial z_i}{\partial x_j} & = W_{ij} \\
\end{aligned}
$$

The Jacobian matrix is given by:

$$
\begin{aligned}
J & = \begin{bmatrix} \frac{\partial z_1}{\partial x_1} & \frac{\partial z_1}{\partial x_2} & \ldots & \frac{\partial z_1}{\partial x_n} \\ \frac{\partial z_2}{\partial x_1} & \frac{\partial z_2}{\partial x_2} & \ldots & \frac{\partial z_2}{\partial x_n} \\ \vdots & \vdots & \ddots & \vdots \\ \frac{\partial z_m}{\partial x_1} & \frac{\partial z_m}{\partial x_2} & \ldots & \frac{\partial z_m}{\partial x_n} \end{bmatrix} \\
\end{aligned}
$$