# Gradinent FLow

In this notebook lets look at gradient flow through different functions, scalar fields and vector fields.

In [88]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd.functional import jacobian

In [2]:
def my_sum(x):
    return torch.sum(x)

def my_mean(x):
    return torch.mean(x)

def my_exp(x):
    return torch.exp(x)


def my_log(x):
    return torch.log(x)

### Lets look at gradient of sum function

In [3]:
x = torch.tensor([1.0, 2.0, 3.0, 4.0], requires_grad=True)
y = my_sum(x)
y.backward()
print(x.grad)

## reset the gradient
x.grad.zero_()
print(jacobian(my_sum, x))

tensor([1., 1., 1., 1.])
tensor([1., 1., 1., 1.])


### Lets look at gradient of mean function

In [4]:
y  = my_mean(x)
y.backward()
print(x.grad)

## reset the gradient
x.grad.zero_()
print(jacobian(my_mean, x))

tensor([0.2500, 0.2500, 0.2500, 0.2500])
tensor([0.2500, 0.2500, 0.2500, 0.2500])


### Lets look at gradient of exp function

In [5]:
y = my_exp(x)
y.backward(torch.ones_like(x))
print(x.grad)

## reset the gradient
x.grad.zero_()
print(jacobian(my_exp, x))

tensor([ 2.7183,  7.3891, 20.0855, 54.5982])
tensor([[ 2.7183,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  7.3891,  0.0000,  0.0000],
        [ 0.0000,  0.0000, 20.0855,  0.0000],
        [ 0.0000,  0.0000,  0.0000, 54.5982]])


In [55]:
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
W = torch.tensor([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], requires_grad=True)
b = torch.tensor([0.1, 0.2], requires_grad=True)

# Forward pass
def my_affine(x, W, b):
    return torch.matmul(W, x) + b

affine_jacobian = jacobian(my_affine, (x, W, b))

In [56]:
affine_jacobian

(tensor([[0.1000, 0.2000, 0.3000],
         [0.4000, 0.5000, 0.6000]]),
 tensor([[[1., 2., 3.],
          [0., 0., 0.]],
 
         [[0., 0., 0.],
          [1., 2., 3.]]]),
 tensor([[1., 0.],
         [0., 1.]]))

### Lets look at gradient of log function

In [6]:
y = my_log(x)
y.backward(torch.ones_like(x))
print(x.grad)

## reset the gradient
x.grad.zero_()
print(jacobian(my_log, x))

tensor([1.0000, 0.5000, 0.3333, 0.2500])
tensor([[1.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.5000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.3333, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.2500]])


In [24]:
import torch

# Initialize tensors
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
W = torch.tensor([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], requires_grad=True)
b = torch.tensor([0.1, 0.2], requires_grad=True)

# Forward pass
z = W @ x + b
z.retain_grad()

# Compute L
L = z.sum()

# Clear previous gradients
W.grad = None
b.grad = None
x.grad = None

# Backward pass to get dL/dW (global gradient)
L.backward(retain_graph=True)

# Check W.grad (dL/dW)
print("W.grad (dL/dW):", W.grad)

# Compute local gradients
with torch.no_grad():
    # dL/dz
    grad_L_z = torch.autograd.grad(L, z, retain_graph=True)[0]
    print("dL/dz:", grad_L_z)

    # dz/dW
    grad_z_W = torch.autograd.grad(z, W, grad_outputs=torch.ones_like(z), retain_graph=True)[0]
    print("dz/dW:", grad_z_W)

# Checking z.grad
print("z.grad:", z.grad)


W.grad (dL/dW): tensor([[1., 2., 3.],
        [1., 2., 3.]])
dL/dz: tensor([1., 1.])
dz/dW: tensor([[1., 2., 3.],
        [1., 2., 3.]])
z.grad: tensor([3., 3.])


In [20]:
z.grad

tensor([3., 3.])

In [21]:
import torch

# Initialize tensors
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
W = torch.tensor([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], requires_grad=True)
b = torch.tensor([0.1, 0.2], requires_grad=True)

# Forward pass
z = W @ x + b

# Compute L
L = z.sum()

# Backward pass to get dL/dW (global gradient)
L.backward(retain_graph=True)

# Check W.grad (dL/dW)
print("W.grad (dL/dW):", W.grad)

# Compute local gradients
# dL/dz
grad_L_z = torch.autograd.grad(L, z, retain_graph=True)[0]
print("dL/dz:", grad_L_z)

# dz/dW
grad_z_W = torch.autograd.grad(z, W, grad_outputs=torch.ones_like(z), retain_graph=True)[0]
print("dz/dW:", grad_z_W)

# Checking z.grad
print("z.grad:", z.grad)


W.grad (dL/dW): tensor([[1., 2., 3.],
        [1., 2., 3.]])
dL/dz: tensor([1., 1.])
dz/dW: tensor([[1., 2., 3.],
        [1., 2., 3.]])
z.grad: None


  print("z.grad:", z.grad)


In [26]:
import torch

# Initialize tensors
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
W = torch.tensor([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], requires_grad=True)
b = torch.tensor([0.1, 0.2], requires_grad=True)

# Forward pass
z = W @ x + b

# Retain gradient for z, which is not a leaf tensor
z.retain_grad()

# Compute L
L = z.sum()

# Backward pass to get dL/dW (global gradient)
L.backward(retain_graph=True)

# Check W.grad (dL/dW)
print("W.grad (dL/dW):", W.grad)

# Check z.grad after backward pass
print("z.grad:", z.grad)

# Compute local gradients manually
# dL/dz
grad_L_z = torch.autograd.grad(L, z, retain_graph=True)[0]
print("dL/dz:", grad_L_z)

# dz/dW
grad_z_W = torch.autograd.grad(z, W, grad_outputs=torch.ones_like(z), retain_graph=True)[0]
print("dz/dW:", grad_z_W)

with torch.no_grad():
    # Compute dL/dW
    grad_L_W = grad_L_z @ grad_z_W
    print("dL/dW:", grad_L_W)


W.grad (dL/dW): tensor([[1., 2., 3.],
        [1., 2., 3.]])
z.grad: tensor([1., 1.])
dL/dz: tensor([1., 1.])
dz/dW: tensor([[1., 2., 3.],
        [1., 2., 3.]])
dL/dW: tensor([2., 4., 6.])


In [58]:
import torch

# Initialize tensors
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
W = torch.tensor([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], requires_grad=True)
b = torch.tensor([0.1, 0.2], requires_grad=True)

# Forward pass
z = W @ x + b

# Retain gradient for z, which is not a leaf tensor
z.retain_grad()

# Compute L
L = z.sum()

# Backward pass to get dL/dW (global gradient)
L.backward(retain_graph=True)

# Check W.grad (dL/dW)
print("W.grad (dL/dW):", W.grad)

# Check z.grad after backward pass
print("z.grad:", z.grad)

# Compute local gradients manually
# dL/dz
grad_L_z = torch.autograd.grad(L, z, retain_graph=True)[0]
print("dL/dz:", grad_L_z)

# dz/dW
grad_z_W = torch.autograd.grad(z, W, grad_outputs=torch.ones_like(z), retain_graph=True)[0]
print("dz/dW:", grad_z_W)

# Compute dL/dW correctly
# Since dz/dW is a tensor of shape (2, 3) (like W), we need to properly sum the contributions.
# dL/dW = grad_L_z (2, 1) * dz/dW (2, 3)
# We should use matrix multiplication and broadcasting

grad_L_W_manual = torch.outer(grad_L_z, x)  # This gives you the correct (2, 3) shape
print("Manual dL/dW:", grad_L_W_manual)


W.grad (dL/dW): tensor([[1., 2., 3.],
        [1., 2., 3.]])
z.grad: tensor([1., 1.])
dL/dz: tensor([1., 1.])
dz/dW: tensor([[1., 2., 3.],
        [1., 2., 3.]])
Manual dL/dW: tensor([[1., 2., 3.],
        [1., 2., 3.]], grad_fn=<MulBackward0>)


In [87]:
# Forward pass
def my_affine(x, W, b):
    return torch.matmul(W, x) + b

with torch.no_grad():
    affine_jacobian = jacobian(my_affine, (x, W, b))

jacob_dz_dx, jacbo_dz_dW, jacob_dz_db = affine_jacobian

TypeError: 'Tensor' object is not callable

In [76]:
jacbo_dz_dW

tensor([[[1., 2., 3.],
         [0., 0., 0.]],

        [[0., 0., 0.],
         [1., 2., 3.]]])

In [74]:
grad_z_W

tensor([[1., 2., 3.],
        [1., 2., 3.]])

In [83]:
grad_L_z, jacbo_dz_dW

(tensor([1., 1.]),
 tensor([[[1., 2., 3.],
          [0., 0., 0.]],
 
         [[0., 0., 0.],
          [1., 2., 3.]]]))

In [82]:
grad_L_z @ jacbo_dz_dW 

tensor([[1., 2., 3.],
        [1., 2., 3.]])

In [79]:
grad_L_z.shape

torch.Size([2])

In [80]:
jacbo_dz_dW.shape

torch.Size([2, 2, 3])

In [109]:
x = torch.tensor([10.0, 15.0, 20.0, 30.0, 40.0], requires_grad=True)
W = torch.tensor([[0.1, 0.2, 0.3, 0.4, 0.5], [0.4, 0.5, 0.6, 0.8, 1.0],  [0.4, 0.5, 0.6, 0.8, 1.0]], requires_grad=True)
b = torch.tensor([0.1, 0.2, 0.7], requires_grad=True)
y_out = torch.tensor([1.0, 5.0, 10.0], requires_grad=False)

def my_affine(x, W, b):
    return W @ x + b

def my_affine_sum(x, W, b):
    y_out = torch.tensor([1.0, 5.0, 10.0], requires_grad=False)
    affine_out = my_affine(x, W, b)
    print(affine_out)
    return F.mse_loss(affine_out, y_out)

affine_jacobian = jacobian(my_affine, (x, W, b))
affine_sum_jacobian = jacobian(my_affine_sum, (x, W, b))

## now we will compute via backward pass

tensor([42.1000, 87.7000, 88.2000], grad_fn=<AddBackward0>)


In [110]:
affine_sum_jacobian

(tensor([ 45.6467,  59.1133,  72.5800,  96.7733, 120.9667]),
 tensor([[ 274.0000,  411.0000,  548.0000,  822.0000, 1096.0000],
         [ 551.3333,  827.0000, 1102.6666, 1654.0000, 2205.3333],
         [ 521.3333,  782.0000, 1042.6666, 1564.0000, 2085.3333]]),
 tensor([27.4000, 55.1333, 52.1333]))

In [121]:
input_1 = torch.tensor([42.1000, 87.7000, 88.2000], requires_grad=False)
input_2 = torch.tensor([1.0, 5.0, 10.0], requires_grad=False)

print(2*(input_2 - input_1)/3)

jacobian(F.mse_loss,(input_1, input_2))

tensor([-27.4000, -55.1333, -52.1333])


(tensor([27.4000, 55.1333, 52.1333]), tensor([-27.4000, -55.1333, -52.1333]))

In [99]:
## now lets compute the jacobian via backward pass
z = my_affine(x, W, b)
z.retain_grad()

L = z.sum()
L.backward(retain_graph=True)

## check the global gradient
print("W.grad" ,W.grad)
# Check z.grad after backward pass
print("z.grad:", z.grad)


## 
grad_L_z = torch.autograd.grad(L, z, retain_graph=True)[0]
print("dL/dz:", grad_L_z)


W.grad tensor([[10., 15., 20.],
        [10., 15., 20.]])
z.grad: tensor([1., 1.])
dL/dz: tensor([1., 1.])


In [100]:
affine_sum_jacobian

(tensor([0.5000, 0.7000, 0.9000]),
 tensor([[10., 15., 20.],
         [10., 15., 20.]]),
 tensor([1., 1.]))

In [101]:
affine_jacobian

(tensor([[0.1000, 0.2000, 0.3000],
         [0.4000, 0.5000, 0.6000]]),
 tensor([[[10., 15., 20.],
          [ 0.,  0.,  0.]],
 
         [[ 0.,  0.,  0.],
          [10., 15., 20.]]]),
 tensor([[1., 0.],
         [0., 1.]]))

In [103]:
y_out = torch.tensor([1.0, 5.0])
y_out_1 = torch.tensor([2.0, 6.0])



tensor(1.)

In [124]:
# # Define functions
# def z(W, x, b):
#     return torch.matmul(W, x) + b

# def a(z):
#     return F.softmax(z, dim=1)

# def L(a, truth):
#     return F.mse_loss(a, truth)

# # Create input tensors
# W = torch.randn(2, 3, requires_grad=True)
# x = torch.randn(3, 1, requires_grad=True)
# b = torch.randn(2, 1, requires_grad=True)
# truth = torch.randn(2, 1)



In [139]:
# Define inputs
x = torch.tensor([1.0, 2.0], requires_grad=True)
W = torch.tensor([[0.5, -0.5], [0.3, 0.8]], requires_grad=True)
b = torch.tensor([0.1, 0.2], requires_grad=True)

# Ground truth
truth = torch.tensor([0.0, 1.0])

# Forward pass
z = W @ x + b
a = F.softmax(z, dim=0)
L = F.mse_loss(a, truth)

# Perform backward pass to get gradients from PyTorch
L.backward()

# Save the gradients before zeroing them out
dL_dW_pytorch = W.grad.clone()
dL_db_pytorch = b.grad.clone()
dL_dx_pytorch = x.grad.clone()

# Reset gradients to manually compute
W.grad.zero_()
b.grad.zero_()
x.grad.zero_()

print("PyTorch gradients:")
print("dL/dW (PyTorch):", dL_dW_pytorch)
print("dL/db (PyTorch):", dL_db_pytorch)
print("dL/dx (PyTorch):", dL_dx_pytorch)

PyTorch gradients:
dL/dW (PyTorch): tensor([[ 0.0106,  0.0213],
        [-0.0106, -0.0213]])
dL/db (PyTorch): tensor([ 0.0106, -0.0106])
dL/dx (PyTorch): tensor([ 0.0021, -0.0138])


In [140]:
dL_da = 2 * (a - truth) / a.size(0)

# Compute the gradient of a with respect to z (Jacobian)
a_diag = torch.diag(a)
J = a_diag - torch.outer(a, a)

# Compute the gradient of z with respect to W, x, and b
dz_dW = x.unsqueeze(0)  # W is applied to x: z = W @ x, so dz/dW = x^T
dz_db = torch.eye(z.size(0))  # dz/db = I
dz_dx = W.T  # W is applied to x: z = W @ x, so dz/dx = W^T

# Chain rule: compute dL/dz
dL_dz = dL_da @ J

# Chain rule: compute dL/dW, dL/db, dL/dx
dL_dW_manual = dL_dz.unsqueeze(1) @ dz_dW.unsqueeze(0)
dL_db_manual = dL_dz @ dz_db
dL_dx_manual = dL_dz @ dz_dx

print("\nManual gradients:")
print("dL/dW (Manual):", dL_dW_manual)
print("dL/db (Manual):", dL_db_manual)
print("dL/dx (Manual):", dL_dx_manual)


Manual gradients:
dL/dW (Manual): tensor([[[ 0.0106,  0.0213],
         [-0.0106, -0.0213]]], grad_fn=<CloneBackward0>)
dL/db (Manual): tensor([ 0.0106, -0.0106], grad_fn=<SqueezeBackward4>)
dL/dx (Manual): tensor([ 0.0106, -0.0053], grad_fn=<SqueezeBackward4>)


In [141]:
print("\nComparison:")
print("dL/dW difference:", torch.allclose(dL_dW_manual, dL_dW_pytorch))
print("dL/db difference:", torch.allclose(dL_db_manual, dL_db_pytorch))
print("dL/dx difference:", torch.allclose(dL_dx_manual, dL_dx_pytorch))


Comparison:
dL/dW difference: True
dL/db difference: True
dL/dx difference: False


In [142]:
# Recompute dL/dz using the correct softmax gradient and MSE loss derivative
dL_dz = dL_da @ J

# Compute dL/dx manually
dL_dx_manual = dL_dz @ dz_dx

print("Recomputed dL/dx (Manual):", dL_dx_manual)
print("dL/dx (PyTorch):", dL_dx_pytorch)

Recomputed dL/dx (Manual): tensor([ 0.0106, -0.0053], grad_fn=<SqueezeBackward4>)
dL/dx (PyTorch): tensor([ 0.0021, -0.0138])


In [143]:
# Step 1: Compute dL/da
dL_da = 2 * (a - truth) / a.size(0)

# Step 2: Compute Jacobian da/dz
a_diag = torch.diag(a)
J = a_diag - torch.outer(a, a)

# Step 3: Compute dL/dz
dL_dz = dL_da @ J

# Step 4: Compute dz/dx
dz_dx = W.T

# Step 5: Manually compute dL/dx
dL_dx_manual = dL_dz @ dz_dx

print("Recomputed dL/dx (Manual):", dL_dx_manual)
print("dL/dx (PyTorch):", dL_dx_pytorch)


Recomputed dL/dx (Manual): tensor([ 0.0106, -0.0053], grad_fn=<SqueezeBackward4>)
dL/dx (PyTorch): tensor([ 0.0021, -0.0138])


In [144]:
import torch
import torch.nn.functional as F

# Dimensions
n = 5  # Input dimension
m = 3  # Output dimension

# Set random seed for reproducibility
torch.manual_seed(0)

# Initialize variables
x = torch.randn(n, requires_grad=True)
W = torch.randn(m, n, requires_grad=True)
b = torch.randn(m, requires_grad=True)
truth = torch.randn(m)

# Forward pass
z = W @ x + b  # Shape: (m,)
a = F.softmax(z, dim=0)  # Shape: (m,)
L = F.mse_loss(a, truth, reduction='mean')  # Scalar loss

# Autograd computation
L.backward()

# Save gradients computed by autograd
grad_W_autograd = W.grad.clone()
grad_b_autograd = b.grad.clone()
grad_x_autograd = x.grad.clone()

# Zero gradients to prevent accumulation
W.grad.zero_()
b.grad.zero_()
x.grad.zero_()

# Manually compute gradients
mse_factor = 1.0 / m

# Step 1: Compute dL/da
dL_da = 2 * mse_factor * (a.detach() - truth)  # Shape: (m,)

# Step 2: Compute s = a^T * dL/da
s = torch.dot(a.detach(), dL_da)  # Scalar

# Step 3: Compute dL/dz
dL_dz = a.detach() * (dL_da - s)  # Shape: (m,)

# Step 4: Compute gradients w.r.t. W, b, x
dL_dW_manual = dL_dz.unsqueeze(1) @ x.detach().unsqueeze(0)  # Shape: (m, n)
dL_db_manual = dL_dz  # Shape: (m,)
dL_dx_manual = W.detach().t() @ dL_dz  # Shape: (n,)

# Compare gradients
print("Maximum difference in W.grad:", torch.max(torch.abs(grad_W_autograd - dL_dW_manual)))
print("Maximum difference in b.grad:", torch.max(torch.abs(grad_b_autograd - dL_db_manual)))
print("Maximum difference in x.grad:", torch.max(torch.abs(grad_x_autograd - dL_dx_manual)))


Maximum difference in W.grad: tensor(0.)
Maximum difference in b.grad: tensor(0.)
Maximum difference in x.grad: tensor(0.)


In [145]:
# Compare gradients

print("\nGradients w.r.t W:")
print("Autograd gradient:\n", grad_W_autograd)
print("Manual gradient:\n", dL_dW_manual)
print("Difference:\n", grad_W_autograd - dL_dW_manual)

print("\nGradients w.r.t b:")
print("Autograd gradient:\n", grad_b_autograd)
print("Manual gradient:\n", dL_db_manual)
print("Difference:\n", grad_b_autograd - dL_db_manual)

print("\nGradients w.r.t x:")
print("Autograd gradient:\n", grad_x_autograd)
print("Manual gradient:\n", dL_dx_manual)
print("Difference:\n", grad_x_autograd - dL_dx_manual)



Gradients w.r.t W:
Autograd gradient:
 tensor([[-5.1207e-04,  9.7506e-05,  7.2401e-04, -1.8889e-04,  3.6038e-04],
        [ 1.3595e-01, -2.5887e-02, -1.9222e-01,  5.0148e-02, -9.5679e-02],
        [-1.3544e-01,  2.5789e-02,  1.9149e-01, -4.9959e-02,  9.5318e-02]])
Manual gradient:
 tensor([[-5.1207e-04,  9.7506e-05,  7.2401e-04, -1.8889e-04,  3.6038e-04],
        [ 1.3595e-01, -2.5887e-02, -1.9222e-01,  5.0148e-02, -9.5679e-02],
        [-1.3544e-01,  2.5789e-02,  1.9149e-01, -4.9959e-02,  9.5318e-02]])
Difference:
 tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])

Gradients w.r.t b:
Autograd gradient:
 tensor([-0.0003,  0.0882, -0.0879])
Manual gradient:
 tensor([-0.0003,  0.0882, -0.0879])
Difference:
 tensor([0., 0., 0.])

Gradients w.r.t x:
Autograd gradient:
 tensor([-0.0630,  0.0657, -0.1086,  0.1757,  0.0383])
Manual gradient:
 tensor([-0.0630,  0.0657, -0.1086,  0.1757,  0.0383])
Difference:
 tensor([0., 0., 0., 0., 0.])
