In [None]:
import torch,torchvision 
# Seed torch's RNG so the random input and labels below come out the same on
# every Restart & Run All; unseeded, each run would use different data.
torch.manual_seed(0)

# ResNet-18 built with pretrained=True.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# `weights=` — confirm against the installed version before switching.
model = torchvision.models.resnet18(pretrained=True)

# A single random 3-channel 64x64 input and a random 1000-element label vector.
data = torch.rand(1, 3, 64, 64)
labels = torch.rand(1, 1000)

In [None]:
labels.shape

torch.Size([1, 1000])

In [None]:
# Forward pass: feed the random input `data` through the model to get its output.
prediction = model(data)

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


In [None]:
# Toy loss: sum of the element-wise difference between prediction and labels.
loss = torch.sum(prediction - labels)
# Backward pass: autograd computes the gradients of `loss` and stores them in
# the .grad attribute of the tensors that require gradients.
loss.backward()

In [None]:
# SGD over all model parameters: learning rate 0.01, momentum 0.9.
# Note: a later cell does `from torch import nn, optim`, which rebinds the name
# `optim` to the module, so this object is only reachable until that cell runs.
optim = torch.optim.SGD(
    model.parameters(),
    lr=0.01,
    momentum=0.9,
)

In [None]:
# Apply one SGD update using the gradients produced by loss.backward().
# `optim` must still be the SGD instance created above; the later
# `from torch import nn, optim` cell rebinds the name to the torch.optim module.
optim.step()

In [None]:
# Two tensors created with requires_grad=True so autograd tracks every
# operation performed on them.
a = torch.tensor([2.0, 3.0], requires_grad=True)
b = torch.tensor([6.0, 4.0], requires_grad=True)

In [None]:
# Element-wise Q = 3a^3 - b^2; Q depends on both a and b.
Q = 3 * a.pow(3) - b.pow(2)

In [None]:
#When we call .backward() on Q, autograd calculates these gradients and stores them in the respective tensors’ .grad attribute.

In [None]:
"""We need to explicitly pass a `gradient` argument to Q.backward() because Q is a vector, not a scalar.
`gradient` is a tensor of the same shape as Q, and it represents the gradient of Q w.r.t. itself, i.e.
dQ/dQ = 1."""

In [None]:
# Q is a vector, so backward() needs an explicit upstream gradient dQ/dQ.
# Build it as all-ones with Q's own shape and dtype instead of a hard-coded
# length-2 tensor, so this cell keeps working if a and b change length.
external_grad = torch.ones_like(Q)
# Populates a.grad and b.grad.
Q.backward(gradient=external_grad)

In [None]:
# Compare autograd's result with the analytic derivatives, element by element:
# dQ/da = 9a^2 and dQ/db = -2b.
print(torch.eq(9 * a**2, a.grad))
print(torch.eq(-2 * b, b.grad))

tensor([True, True])
tensor([True, True])


In [None]:
'''torch.autograd tracks operations on all tensors which have their requires_grad flag set to True.
For tensors that don’t require gradients, setting this attribute to False excludes them from the gradient computation DAG.
The output tensor of an operation will require gradients even if only a single input tensor has requires_grad=True.'''

In [None]:
# m and n are plain tensors; z is the only input created with requires_grad=True.
m, n = torch.rand(5, 5), torch.rand(5, 5)
z = torch.rand(5, 5, requires_grad=True)

# Neither operand requires gradients, so the sum does not either.
a = torch.add(m, n)
# One operand (z) requires gradients, which is enough for the result to as well.
b = torch.add(m, z)

print(f"Does `a` require gradients? : {a.requires_grad}")
print(f"Does `b` require gradients?: {b.requires_grad}")

Does `a` require gradients? : False
Does `b` require gradients?: True


In [None]:
'''In a NN, parameters that don’t compute gradients are usually called frozen parameters. 
It is useful to “freeze” part of your model if you know in advance that you 
won’t need the gradients of those parameters (this offers some performance benefits by reducing autograd computations).'''

In [None]:
#Another common use case where exclusion from the DAG is important is finetuning a pretrained network.

In [None]:
'''In finetuning, we freeze most of the model and typically only modify the classifier layers to make predictions on new labels.
 Let’s walk through a small example to demonstrate this. As before, we load a pretrained resnet18 model, and freeze all the parameters.'''

In [None]:
from torch import nn, optim
# Build a new pretrained ResNet-18; this rebinds `model`, replacing the one
# created at the top of the notebook.
model = torchvision.models.resnet18(pretrained=True)

# Freeze the whole network: with requires_grad switched off, autograd stops
# computing gradients for these parameters.
for param in model.parameters():
    param.requires_grad = False

In [None]:
'''Let’s say we want to finetune the model on a new dataset with 10 labels. In resnet, the classifier is the last linear layer model.fc. 
We can simply replace it with a new linear layer (unfrozen by default) that acts as our classifier.'''

In [None]:
# Replace the classifier head with a fresh 10-class linear layer.
# The input width is read from the layer being replaced (512 for ResNet-18)
# rather than hard-coded, so the cell still works if the backbone is changed.
# A newly constructed nn.Linear is unfrozen by default.
model.fc = nn.Linear(model.fc.in_features, 10)

In [None]:
'''Now all parameters in the model, except the parameters of model.fc, are frozen.
 The only parameters that compute gradients are the weights and bias of model.fc.'''

In [None]:
# SGD (learning rate 0.01, momentum 0.9) registered with every model parameter,
# frozen and trainable alike.
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=0.01,
    momentum=0.9,
)

In [None]:
'''Notice that although we register all the parameters in the optimizer, 
the only parameters that are computing gradients (and hence are updated in gradient descent) are the weights and bias of the classifier.'''

In [None]:
'''torch.no_grad
Context manager that disables gradient calculation.
In this mode, the result of every computation will have requires_grad=False, even when the inputs have requires_grad=True.
'''