In [5]:
import torch

In [6]:
# Tensors built with requires_grad=True have every operation on them recorded by autograd.

# Scalar input whose gradient we will ask for later.
x = torch.tensor(2.0, requires_grad=True)

# y = x^2, written with the functional form of the power operator.
y = torch.pow(x, 2)

print("y:", y)
print("Since requires_grad=True, PyTorch remembers how y was created (from squaring x).")


y: tensor(4., grad_fn=<PowBackward0>)
Since requires_grad=True, PyTorch remembers how y was created (from squaring x).


In [7]:
# Uses x and y from the previous cell (neither is defined here), so that cell must run first.
y.backward()   # backpropagate from y; the gradient for x is then read from x.grad
print("dy/dx at x=2:", x.grad)


dy/dx at x=2: tensor(4.)


## 🔹 1. requires_grad
✅ Flow:

If requires_grad=True → tensor participates in autograd.

If requires_grad=False → tensor is treated as a constant (no gradient is computed for it).

In [8]:
import torch

# One tensor that autograd tracks, and one that it treats as a plain constant.
x = torch.tensor(2.0, requires_grad=True)
y = torch.tensor(3.0, requires_grad=False)

# Show each flag on its own line: True for x, then False for y.
print(x.requires_grad, y.requires_grad, sep="\n")


True
False


## 🔹 2. .backward()

In [None]:
# Leaf tensor we differentiate with respect to.
x = torch.tensor(2.0, requires_grad=True)

# Forward pass: y = x^2 + 1.
y = x.pow(2) + 1

# Backward pass: populates x.grad with dy/dx.
y.backward()

print(x.grad)  # dy/dx = 2x, which is 4 at x = 2


tensor(4.)


✅ Flow:

Build computation graph → Call .backward() → Autograd applies chain rule → Store result in .grad.

## 🔹 3. .grad

In [12]:
x = torch.tensor(2.0, requires_grad=True)
y = torch.pow(x, 3)  # y = x^3
y.backward()         # fills x.grad with dy/dx
print(x.grad)        # dy/dx = 3x^2, which is 12 at x = 2


tensor(12.)


## 🔹 4. .zero_grad()
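Gradients accumulate in PyTorch by default, so before each new backward pass we usually reset them to zero. On a single tensor this is done in place with `w.grad.zero_()`; optimizers and `nn.Module`s expose the same operation for all their parameters as `.zero_grad()`.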

In [13]:
# Leaf tensor reused across three backward passes.
w = torch.tensor(2.0, requires_grad=True)

# Pass 1: d(3w)/dw = 3 is written into a previously empty .grad.
y1 = 3 * w
y1.backward()
print("After first backward:", w.grad)  # 3

# Pass 2: .grad was not cleared, so d(4w)/dw = 4 is added on top of the 3.
y2 = 4 * w
y2.backward()
print("After second backward:", w.grad)  # 3 + 4 = 7 (accumulated!)

# Clear the stored gradient in place, then run pass 3: d(5w)/dw = 5.
w.grad.zero_()
y3 = 5 * w
y3.backward()
print("After zero_grad and backward:", w.grad)  # 5


After first backward: tensor(3.)
After second backward: tensor(7.)
After zero_grad and backward: tensor(5.)


## 🔹 5. .detach()
✅ Flow:

detach() returns a tensor that is cut off from the computation history → gradients do not flow back through it.

Useful in GANs, RNNs, or when freezing part of a model.

In [None]:
x = torch.tensor(3.0, requires_grad=True)
y = 2 * x       # recorded in the autograd graph
z = y.detach()  # same value as y, but excluded from the graph

print(y.requires_grad)  # True
print(z.requires_grad)  # False


True
False


## 🔹 6. torch.no_grad()
✅ Flow:

Unlike .detach(), which applies to a single tensor, torch.no_grad() disables gradient tracking for all operations inside the block.

In [15]:
x = torch.tensor(3.0, requires_grad=True)

# Inside this context nothing is recorded, even though x itself is a tracked tensor.
with torch.no_grad():
    y = 2 * x

print(y.requires_grad)  # False


False


## 🔹 Putting It All Together (Mini Training Example)
✅ Flow:

requires_grad=True → track w.

.backward() → compute gradient.

.grad → access gradient.

torch.no_grad() → update parameters safely (no tracking).

.zero_grad() → reset for next iteration.

In [16]:
# A simple linear regression example: fit one weight w so that w * x matches target,
# using hand-written gradient descent.
LEARNING_RATE = 0.1  # step size applied to each gradient-descent update
NUM_EPOCHS = 3       # number of update steps to run

# The single training pair never changes, so build it once instead of on every epoch.
x = torch.tensor(2.0)
target = torch.tensor(5.0)

# The only trainable parameter.
w = torch.tensor(1.0, requires_grad=True)

for epoch in range(NUM_EPOCHS):
    # Forward pass
    y = w * x
    loss = (y - target) ** 2  # squared error for the single sample

    # Backward pass: fills w.grad with dloss/dw
    loss.backward()
    print(f"Epoch {epoch+1}, Grad: {w.grad.item()}")

    # Update weight in place; no_grad keeps the update itself out of the graph
    with torch.no_grad():
        w -= LEARNING_RATE * w.grad  # gradient descent

    # Gradients accumulate across backward() calls, so clear before the next epoch
    w.grad.zero_()


Epoch 1, Grad: -12.0
Epoch 2, Grad: -2.3999996185302734
Epoch 3, Grad: -0.4799995422363281


## 🌟 Final Summary

requires_grad → Tell PyTorch to track gradients for a tensor.

.backward() → Compute gradients (autograd).

.grad → Stores gradient result.

.zero_grad() → Reset gradients (important in loops).

.detach() → Get a tensor with the same data but no gradient tracking (it shares storage with the original; it is not a copy).

torch.no_grad() → Disable gradient tracking inside a block (used in inference).

In [18]:
import torch

# Step 1: Two 2x2 input matrices, both tracked by autograd
A = torch.tensor([[2.0, 3.0], [1.0, 4.0]], requires_grad=True)
B = torch.tensor([[1.0, 2.0], [3.0, 1.0]], requires_grad=True)

# Step 2: Forward pass -- matrix product, then reduce to a scalar loss
C = A @ B
loss = torch.sum(C)  # backward() is called on this scalar below

print("Matrix A:\n", A)
print("Matrix B:\n", B)
print("Matrix C (A*B):\n", C)
print("Loss (sum of C):", loss.item())

# Step 3: Backward pass -- fills A.grad and B.grad
loss.backward()

# Step 4: Inspect the gradients of the loss with respect to each input
print("\nGradients:")
print("dLoss/dA:\n", A.grad)
print("dLoss/dB:\n", B.grad)

# Step 5: Clear the stored gradients in place so a later backward() starts from zero
A.grad.zero_()
B.grad.zero_()
print("\nAfter zero_grad():")
print("A.grad:\n", A.grad)
print("B.grad:\n", B.grad)

# Step 6: detach() gives a tensor with A's values that is not part of the graph;
# A itself is still tracked
A_detached = A.detach()
print("\nDetached A (no gradient tracking):\n", A_detached)

# Step 7: Under no_grad() the product is computed without recording a graph (useful for inference)
with torch.no_grad():
    C_no_grad = A @ B
print("\nC computed under no_grad:\n", C_no_grad)


Matrix A:
 tensor([[2., 3.],
        [1., 4.]], requires_grad=True)
Matrix B:
 tensor([[1., 2.],
        [3., 1.]], requires_grad=True)
Matrix C (A*B):
 tensor([[11.,  7.],
        [13.,  6.]], grad_fn=<MmBackward0>)
Loss (sum of C): 37.0

Gradients:
dLoss/dA:
 tensor([[3., 4.],
        [3., 4.]])
dLoss/dB:
 tensor([[3., 3.],
        [7., 7.]])

After zero_grad():
A.grad:
 tensor([[0., 0.],
        [0., 0.]])
B.grad:
 tensor([[0., 0.],
        [0., 0.]])

Detached A (no gradient tracking):
 tensor([[2., 3.],
        [1., 4.]])

C computed under no_grad:
 tensor([[11.,  7.],
        [13.,  6.]])
