In [88]:
import torch
import torch.nn as nn

In [89]:
X1 = torch.tensor([[40.2]], dtype=torch.float32)
y1 = torch.tensor([[20.1]], dtype=torch.float32)

In [90]:
model = nn.Linear(1,1)
model

Linear(in_features=1, out_features=1, bias=True)

In [91]:
loss_fn = torch.nn.MSELoss()

In [92]:
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
optimizer

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)

In [93]:
for i in model.parameters():
  print(i)  #1st one is weight, 2nd is bias

Parameter containing:
tensor([[0.3963]], requires_grad=True)
Parameter containing:
tensor([0.2359], requires_grad=True)


In [94]:
#Before training
model.weight, model.bias

(Parameter containing:
 tensor([[0.3963]], requires_grad=True),
 Parameter containing:
 tensor([0.2359], requires_grad=True))

In [95]:
# Check gradients before zeroing
print("Gradients before zeroing:")
for name, param in model.named_parameters():
    print(f"{name}: {param.grad}")

Gradients before zeroing:
weight: None
bias: None


In [96]:
#Start Training
optimizer.zero_grad()  #To reset the gradients of all the parameters in an optimizer to zero.

In [97]:
print("\nGradients after zeroing:")
for name, param in model.named_parameters():
    if param.grad is not None:
        print(f"{name}: {param.grad.norm().item()}")
    else:
        print(f"{name}: None (gradients cleared)")


Gradients after zeroing:
weight: None (gradients cleared)
bias: None (gradients cleared)


In [98]:
outputs = model(X1)
outputs

tensor([[16.1684]], grad_fn=<AddmmBackward0>)

In [99]:
loss = loss_fn(outputs, y1)
loss

tensor(15.4578, grad_fn=<MseLossBackward0>)

##What loss.backward() Does

####**Computes Gradients:** It calculates the partial derivatives of the loss function with respect to each parameter in the model (e.g., weights and biases) that has requires_grad=True. These gradients are stored in the **.grad attribute** of each parameter.
####**Backpropagation:** It performs backpropagation, propagating the loss gradient backward through the computational graph (from the loss to the inputs) using the **chain rule.**
####**Accumulates Gradients:** Gradients are accumulated into the .grad attribute of each parameter. If .grad already contains values (from a previous backward() call), new gradients are added unless cleared with optimizer.zero_grad().

In [100]:
loss.backward()
loss

tensor(15.4578, grad_fn=<MseLossBackward0>)

###**optimizer.step()** is a key method in PyTorch optimizers (like optim.SGD or optim.Adam) that updates the model's parameters based on the gradients computed during the backward pass. It applies the optimization algorithm to adjust the weights and biases, effectively taking a "step" towards minimizing the loss.

In [101]:
optimizer.step()
optimizer

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)

In [102]:
#After training
model.weight,model.bias

(Parameter containing:
 tensor([[0.7124]], requires_grad=True),
 Parameter containing:
 tensor([0.2438], requires_grad=True))

In [103]:
y1_pred = model(X1)
y1_pred

tensor([[28.8836]], grad_fn=<AddmmBackward0>)