In [2]:
import torch
# Print the installed PyTorch version so the recorded outputs can be tied to a release
print(torch.__version__)

2.7.0


# 1. Introduction to Autograd in PyTorch

Autograd is PyTorch's automatic differentiation engine. 

It allows you to compute gradients automatically for tensor operations. 

When you set `requires_grad=True` for a tensor, PyTorch tracks all operations on it to enable automatic backpropagation.


# 2. Manual Derivative Example (Without PyTorch)

This is a simple Python example that evaluates the derivative of $y = x^2$ by hand: we work out $dy/dx = 2x$ ourselves and code it as a plain Python function.


In [5]:
def dy_dx(x):
    """Return the hand-derived derivative of y = x^2 at x, i.e. dy/dx = 2x."""
    slope = 2 * x
    return slope


# Evaluate the derivative at x = 3
dy_dx(3)


6

# 3. Computing Gradients Using PyTorch Autograd

PyTorch can compute gradients automatically using its `autograd` engine. Let's compute the gradient of $y = x^2$ for a tensor created with `requires_grad=True`.


In [9]:
import torch

# Leaf tensor: requires_grad=True asks autograd to record operations on x
x = torch.tensor(3.0, requires_grad=True)

# y = x^2; the result keeps a grad_fn describing how it was produced
y = x ** 2

# Show both tensors in "name -> tensor" form
print("x", x, sep=" -> ")
print("y", y, sep=" -> ")


x -> tensor(3., requires_grad=True)
y -> tensor(9., grad_fn=<PowBackward0>)


### Explanation:
tensor(3., requires_grad=True) --> This is the tensor x.

Value: 3.0

### `requires_grad=True`: 

This means PyTorch will track operations on this tensor so that it can compute gradients later (for backpropagation).

tensor(9., grad_fn=<PowBackward0>) --> This is the tensor y, which was computed as x ** 2 (i.e., 3.0 ** 2 = 9.0).

Value: 9.0

### `grad_fn=<PowBackward0>`:

This tells us that y was created by an operation that supports gradient computation.

Specifically, `PowBackward0` is the function that will be used during the backward pass to compute the gradient of y with respect to x (i.e., the derivative of x^2).

This shows that y is part of a `computational graph` and gradients can be propagated from it.

### What It Means Practically:

You're setting up a small computational graph where:

x is the input with gradients enabled.

y = x² is the output, and PyTorch remembers how it was computed.

If you later call y.backward(), PyTorch will compute dy/dx = 2x and store it in x.grad.

In [10]:
# Backpropagate from y: autograd computes dy/dx and stores it in x.grad
y.backward()

# Inspect the stored gradient (dy/dx = 2x, evaluated at the x defined above)
x.grad

tensor(6.)

# 4. Gradient Through a Nonlinear Function: sin(x^2)

We compute the derivative of $z = \sin(x^2)$ using both a manual function and PyTorch's autograd.


Manual Derivation

In [12]:
import math


def dz_dx(x):
    """Return the hand-derived derivative of z = sin(x^2): dz/dx = 2x * cos(x^2)."""
    inner = x ** 2  # u = x^2, the argument of sin
    return 2 * x * math.cos(inner)  # chain rule: du/dx * dz/du


# Evaluate at x = 4
dz_dx(4)

-7.661275842587077

PyTorch Autograd

In [15]:
# Build the graph x -> y = x^2 -> z = sin(y) with gradient tracking on x
x = torch.tensor(4.0, requires_grad=True)
y = x ** 2
z = torch.sin(y)

# Show every node of the graph in "name -> tensor" form
print("x", x, sep=" -> ")
print("y", y, sep=" -> ")
print("z", z, sep=" -> ")

x -> tensor(4., requires_grad=True)
y -> tensor(16., grad_fn=<PowBackward0>)
z -> tensor(-0.2879, grad_fn=<SinBackward0>)


In [16]:
# Backward pass from z: autograd applies the chain rule through sin and pow
z.backward()

In [17]:
# dz/dx as computed by autograd; compare with the manual dz_dx(4) result above
x.grad

tensor(-7.6613)

In [18]:
# y is an intermediate (non-leaf) tensor, so autograd did not keep its gradient:
# y.grad is None here and reading it makes PyTorch emit a warning
y.grad

  y.grad


### Warning message

You're trying to access .grad on a non-leaf tensor (y).

PyTorch didn’t store the gradient for it, so y.grad is None.

If you really need it, you should call y.retain_grad() before backward().

In [19]:
# Rebuild the same graph: x -> y = x^2 -> z = sin(y)
x = torch.tensor(4.0, requires_grad=True)
y = x ** 2
z = torch.sin(y)

# Ask autograd to keep the gradient of the non-leaf tensor y;
# this has to happen before backward() runs
y.retain_grad()

# Backward pass
z.backward()

# dz/dy is now available on y
print(y.grad)

tensor(-0.9577)


# 5. Manual Gradient Calculation - Forward & Backward Pass on a Single-Neuron Network

### 5.1. Setup Inputs and Parameters

You start with a single input feature (x = 6.7) and its true label (y = 0).

You initialize model parameters — weight (w) and bias (b) — without gradient tracking.

In [20]:
# A single training example
x = torch.tensor(6.7)  # Input feature
y = torch.tensor(0.0)  # Ground-truth label

# Model parameters as plain tensors: requires_grad is not set,
# so autograd does not track them in this section
w = torch.tensor(1.0)  # Weight
b = torch.tensor(0.0)  # Bias

### 5.2. Define Binary Cross-Entropy (BCE) Loss

$\text{loss} = -\left(y \log(y_{pred}) + (1 - y) \log(1 - y_{pred})\right)$

Implement the BCE loss function manually, which measures how close the model’s prediction is to the true label.

The loss grows without bound as a prediction becomes confidently wrong, so confident mistakes are penalized most heavily.



In [None]:
# Binary Cross-Entropy Loss Function
def binary_cross_entropy_loss(prediction, target):
    """Compute the element-wise binary cross-entropy loss.

    loss = -(target * log(prediction) + (1 - target) * log(1 - prediction))

    Args:
        prediction: Tensor of predicted probabilities (e.g. sigmoid outputs).
        target: Tensor of ground-truth labels, combined element-wise with
            ``prediction``.

    Returns:
        Tensor of per-element losses; no reduction is applied.
    """
    # Integer tensors have no machine epsilon; promote them so finfo() works.
    if not prediction.is_floating_point():
        prediction = prediction.to(torch.get_default_dtype())

    # Clamp away from exactly 0 and 1 to prevent log(0). The margin must be the
    # dtype's own machine epsilon: a fixed 1e-8 is below float32 resolution, so
    # `1 - 1e-8` rounds to 1.0 and a saturated prediction would still hit log(0),
    # giving inf (or nan through 0 * -inf).
    epsilon = torch.finfo(prediction.dtype).eps
    prediction = torch.clamp(prediction, epsilon, 1 - epsilon)
    return -(target * torch.log(prediction) + (1 - target) * torch.log(1 - prediction))

### 5.3 Forward Pass (Prediction)

Compute a linear combination: 

$z = (w \times x) + b$

Apply the sigmoid function to squash  $z$ into a probability $y_{pred}$ between 0 and 1.

$y_{pred} = \sigma(z) = \frac{1}{1 + e^{-z}}$

In [23]:
# Forward pass: linear combination z = w * x + b, then sigmoid to obtain y_pred
z = w * x + b
y_pred = torch.sigmoid(z)

### 5.4 Calculate Loss

Use the BCE loss function to compute how well the prediction matches the target label.

In [24]:
# BCE loss between the prediction y_pred and the label y; displayed as the cell output
loss = binary_cross_entropy_loss(y_pred, y)
loss


tensor(6.7012)

### 5.5 Manually Compute Gradients

Calculate how the loss changes with respect to the prediction $(dloss/dy_{pred})$.

Calculate how the prediction changes with respect to the linear output $(dy_{pred}/dz)$.

Calculate how the linear output changes with respect to the parameters $(dz/dw, dz/db)$.

Use the chain rule to combine these derivatives and get gradients of loss with respect to $w$ and $b$.

In [27]:
# Local derivatives of each stage of the forward pass
dloss_dy_pred = (y_pred - y) / (y_pred * (1 - y_pred))  # BCE w.r.t. the prediction
dy_pred_dz = y_pred * (1 - y_pred)                      # sigmoid w.r.t. its input
dz_dw = x                                               # z = w * x + b  ->  dz/dw = x
dz_db = 1                                               # z = w * x + b  ->  dz/db = 1

# Chain rule: both parameters share the upstream term dloss/dz
dloss_dz = dloss_dy_pred * dy_pred_dz
dL_dw = dloss_dz * dz_dw
dL_db = dloss_dz * dz_db

print(f"Manual Gradient of loss w.r.t weight (dw): {dL_dw}")
print(f"Manual Gradient of loss w.r.t bias (db): {dL_db}")


Manual Gradient of loss w.r.t weight (dw): 6.691762447357178
Manual Gradient of loss w.r.t bias (db): 0.998770534992218


# 6. PyTorch Autograd Gradient Calculation - Forward & Backward Pass on a Single-Neuron Network

Now we use PyTorch autograd to compute the same gradients automatically for weight and bias.


In [32]:
# Same training example as in the manual walk-through
x = torch.tensor(6.7)  # Input feature
y = torch.tensor(0.0)  # Ground truth label

# Parameters with gradient tracking enabled, so backward() fills w.grad and b.grad
w = torch.tensor(1.0, requires_grad=True)
b = torch.tensor(0.0, requires_grad=True)

# Forward pass: linear -> sigmoid -> BCE loss
z = w * x + b
y_pred = torch.sigmoid(z)
loss = binary_cross_entropy_loss(y_pred, y)

# Backward pass
loss.backward()

# Autograd results (expected to match the manual dw / db printed below)
print("Pytorch Autograd Gradient w.r.t weight", w.grad)
print("Pytorch Autograd Gradient w.r.t bias", b.grad)

# Manual results from the previous section, for side-by-side comparison
print(f"Manual Gradient of loss w.r.t weight (dw): {dL_dw}")
print(f"Manual Gradient of loss w.r.t bias (db): {dL_db}")


Pytorch Autograd Gradient w.r.t weight tensor(6.6918)
Pytorch Autograd Gradient w.r.t bias tensor(0.9988)
Manual Gradient of loss w.r.t weight (dw): 6.691762447357178
Manual Gradient of loss w.r.t bias (db): 0.998770534992218


# 7. Gradient of Mean Squared Function

Compute gradient of the mean of squared elements in a tensor.


In [33]:
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)

# y = mean(x_i^2): a scalar, so backward() needs no explicit gradient argument
y = torch.mean(x ** 2)

# Backward pass
y.backward()

# dy/dx_i = 2 * x_i / n
x.grad

tensor([0.6667, 1.3333, 2.0000])

# 8. Clearing Gradients in PyTorch

PyTorch accumulates (adds) gradients into `.grad` on every `backward()` call, so always clear old gradients before performing another backward pass.


In [68]:
x = torch.tensor(2.0, requires_grad=True)
y = x ** 2
y.backward()

# Gradient right after the backward pass
print(f"gradient of x {x.grad}")

# Reset the stored gradient in place, then show it again
x.grad.zero_()
print(f"gradient of x {x.grad}")

gradient of x 4.0
gradient of x 0.0


# 9. Disabling Gradient Tracking

Use different methods to disable gradient tracking in PyTorch.


Option 1: requires_grad_(False)

In [81]:
# Option 1: requires_grad_(False) switches tracking off on the tensor itself, in place
x = torch.tensor(2.0, requires_grad=True)
x.requires_grad_(False)

# x is no longer tracked, so the result carries no grad_fn
y = x ** 2
print("y = x**2 = without grad_fn ", y)


y = x**2 = without grad_fn  tensor(4.)


Option 2: detach()

In [80]:
# Option 2: detach() returns a tensor that is cut off from the computation graph
x = torch.tensor(2.0, requires_grad=True)
print("x", x)
z = x.detach()  # z is detached; x itself is still tracked

# Computation on x is still tracked (result has a grad_fn)
y = x ** 2
print("y = x**2 = with grad_fn ", y)

# Computation on the detached z is not tracked (no grad_fn)
y1 = z ** 2
print("y1 = z**2 = without grad_fn ", y1)


x tensor(2., requires_grad=True)
y = x**2 = with grad_fn  tensor(4., grad_fn=<PowBackward0>)
y1 = z**2 = without grad_fn  tensor(4.)


Option 3: with torch.no_grad()

In [82]:
# Option 3: torch.no_grad() disables tracking for everything computed inside the block
x = torch.tensor(2.0, requires_grad=True)

with torch.no_grad():
    # x still has requires_grad=True, but the result gets no grad_fn
    y = x ** 2
    print("y = x**2 = without grad_fn ", y)


y = x**2 = without grad_fn  tensor(4.)
