<a href="https://colab.research.google.com/github/shu65/pyorch_performance_tuning_guide_examples/blob/main/Use_parameter_grad_%3D_None.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

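This notebook benchmarks a recommendation from the PyTorch Performance Tuning Guide: reset gradients by setting `param.grad = None` instead of calling `model.zero_grad()` or `optimizer.zero_grad()`. Reference section of the guide: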

https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#use-parameter-grad-none-instead-of-model-zero-grad-or-optimizer-zero-grad



In [None]:
# Record the environment: print the lines of `pip list` output that contain "torch".
!pip list | grep torch

torch                         1.8.1+cu101   
torchsummary                  1.5.1         
torchtext                     0.9.1         
torchvision                   0.9.1+cu101   


In [None]:
# Record the environment: print the version banner of the installed CUDA compiler (nvcc).
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Wed_Jul_22_19:09:09_PDT_2020
Cuda compilation tools, release 11.0, V11.0.221
Build cuda_11.0_bu.TC445_37.28845127_0


In [None]:
# Record the environment: show the GPU model, driver version and current GPU memory/utilisation.
!nvidia-smi

Mon Apr 26 13:21:26 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8     6W /  75W |      0MiB /  7611MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import torch

# The benchmark cells below use torch.cuda.Event, so a CUDA device is required.
device = 'cuda'
n_classes = 1000
batch_size = 32
seed = 0

# A dedicated, seeded generator makes the synthetic data reproducible across
# kernel restarts without modifying the global RNG state.
rng = torch.Generator().manual_seed(seed)

# Synthetic image batch of shape (batch_size, 3, 224, 224), created on the CPU.
input_batch_cpu = torch.randn(batch_size, 3, 224, 224, generator=rng)
# Copying from the CPU to the CUDA device already allocates a new tensor, and
# the randn output does not require grad, so no clone()/detach() is needed.
input_batch = input_batch_cpu.to(device)

# One random integer class label in [0, n_classes) per sample.
target = torch.randint(0, n_classes, (batch_size,), generator=rng).to(device)

In [None]:
# Fetch a pretrained ResNet-50 from the torchvision hub repository (tag v0.9.0);
# the first run downloads the repository archive and the weights.
model = torch.hub.load('pytorch/vision:v0.9.0', 'resnet50', pretrained=True)
# Move the parameters and buffers to the benchmark device.
model = model.to(device)
# Last expression of the cell: display the module structure.
model

Downloading: "https://github.com/pytorch/vision/archive/v0.9.0.zip" to /root/.cache/torch/hub/v0.9.0.zip
Downloading: "https://download.pytorch.org/models/resnet50-19c8e357.pth" to /root/.cache/torch/hub/checkpoints/resnet50-19c8e357.pth


HBox(children=(FloatProgress(value=0.0, max=102502400.0), HTML(value='')))




ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [None]:
# Cross-entropy loss between the model outputs and the integer class targets.
criterion = torch.nn.CrossEntropyLoss()
criterion = criterion.to(device)

In [None]:
# SGD hyperparameters used for every benchmark variant.
lr = 0.1
momentum = 0.9
weight_decay = 1e-4

# One optimizer instance over all model parameters; the benchmark cells reuse it.
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=lr,
    momentum=momentum,
    weight_decay=weight_decay,
)

In [None]:
import numpy as np

# Benchmark variant 1: reset gradients by zero-filling them with optimizer.zero_grad().
max_iterations = 100
# Untimed iterations executed first so that one-time start-up costs (and the very
# first step, where no gradients exist yet to be zeroed) do not skew the averages.
warmup_iterations = 10
MS_PER_SEC = 1000  # torch.cuda.Event.elapsed_time() reports milliseconds

iteration_start = torch.cuda.Event(enable_timing=True)
iteration_end = torch.cuda.Event(enable_timing=True)
reset_grad_start = torch.cuda.Event(enable_timing=True)
reset_grad_end = torch.cuda.Event(enable_timing=True)

iteration_elapsed_times = []
reset_grad_elapsed_times = []

# Training mode is loop-invariant, so it is set once outside the timed region.
model.train()
torch.cuda.synchronize()
for iter_i in range(warmup_iterations + max_iterations):
    iteration_start.record()

    output = model(input_batch)
    loss = criterion(output, target)

    reset_grad_start.record()
    # set_to_none=False is passed explicitly: newer PyTorch releases default to
    # set_to_none=True, which would silently turn this cell into the
    # "set gradients to None" variant instead of the zero-filling baseline.
    optimizer.zero_grad(set_to_none=False)
    reset_grad_end.record()

    loss.backward()
    optimizer.step()
    iteration_end.record()

    # Wait until the GPU has passed all recorded events before reading them.
    torch.cuda.synchronize()
    if iter_i < warmup_iterations:
        continue

    elapsed_time = iteration_start.elapsed_time(iteration_end) / MS_PER_SEC
    iteration_elapsed_times.append(elapsed_time)

    elapsed_time = reset_grad_start.elapsed_time(reset_grad_end) / MS_PER_SEC
    reset_grad_elapsed_times.append(elapsed_time)

# Compute each average once and reuse it for the ratio.
avg_iteration_time = np.average(iteration_elapsed_times)
avg_reset_grad_time = np.average(reset_grad_elapsed_times)
print("average iteration time:", avg_iteration_time, "sec.")
print("average reset grad time:", avg_reset_grad_time, "sec.")
print("(reset grad time)/(iteration time):", avg_reset_grad_time / avg_iteration_time * 100, "%")

average iteration time: 0.4129881353759767 sec.
average reset grad time: 0.0009318704015016555 sec.
(reset grad time)/(iteration time): 0.22564096197419764 %


In [None]:
import numpy as np

# Benchmark variant 2: reset gradients by assigning None to every parameter's .grad.
max_iterations = 100
# Untimed iterations executed first so that start-up costs do not skew the
# averages and every variant is measured with the same methodology.
warmup_iterations = 10
MS_PER_SEC = 1000  # torch.cuda.Event.elapsed_time() reports milliseconds

iteration_start = torch.cuda.Event(enable_timing=True)
iteration_end = torch.cuda.Event(enable_timing=True)
reset_grad_start = torch.cuda.Event(enable_timing=True)
reset_grad_end = torch.cuda.Event(enable_timing=True)

iteration_elapsed_times = []
reset_grad_elapsed_times = []

# Training mode is loop-invariant, so it is set once outside the timed region.
model.train()
torch.cuda.synchronize()
for iter_i in range(warmup_iterations + max_iterations):
    iteration_start.record()

    output = model(input_batch)
    loss = criterion(output, target)

    reset_grad_start.record()
    # Dropping the gradient tensors should launch no CUDA kernels, so this
    # event pair brackets almost no GPU work (unlike zero-filling).
    for param in model.parameters():
        param.grad = None
    reset_grad_end.record()

    loss.backward()
    optimizer.step()
    iteration_end.record()

    # Wait until the GPU has passed all recorded events before reading them.
    torch.cuda.synchronize()
    if iter_i < warmup_iterations:
        continue

    elapsed_time = iteration_start.elapsed_time(iteration_end) / MS_PER_SEC
    iteration_elapsed_times.append(elapsed_time)

    elapsed_time = reset_grad_start.elapsed_time(reset_grad_end) / MS_PER_SEC
    reset_grad_elapsed_times.append(elapsed_time)

# Compute each average once and reuse it for the ratio.
avg_iteration_time = np.average(iteration_elapsed_times)
avg_reset_grad_time = np.average(reset_grad_elapsed_times)
print("average iteration time:", avg_iteration_time, "sec.")
print("average reset grad time:", avg_reset_grad_time, "sec.")
print("(reset grad time)/(iteration time):", avg_reset_grad_time / avg_iteration_time * 100, "%")

average iteration time: 0.4105947021484375 sec.
average reset grad time: 2.32480002567172e-06 sec.
(reset grad time)/(iteration time): 0.0005662031228136164 %


In [None]:
import numpy as np

# Benchmark variant 3: reset gradients with optimizer.zero_grad(set_to_none=True).
max_iterations = 100
# Untimed iterations executed first so that start-up costs do not skew the
# averages and every variant is measured with the same methodology.
warmup_iterations = 10
MS_PER_SEC = 1000  # torch.cuda.Event.elapsed_time() reports milliseconds

iteration_start = torch.cuda.Event(enable_timing=True)
iteration_end = torch.cuda.Event(enable_timing=True)
reset_grad_start = torch.cuda.Event(enable_timing=True)
reset_grad_end = torch.cuda.Event(enable_timing=True)

iteration_elapsed_times = []
reset_grad_elapsed_times = []

# Training mode is loop-invariant, so it is set once outside the timed region.
model.train()
torch.cuda.synchronize()
for iter_i in range(warmup_iterations + max_iterations):
    iteration_start.record()

    output = model(input_batch)
    loss = criterion(output, target)

    reset_grad_start.record()
    # Built-in equivalent of assigning None to each parameter's .grad.
    optimizer.zero_grad(set_to_none=True)
    reset_grad_end.record()

    loss.backward()
    optimizer.step()
    iteration_end.record()

    # Wait until the GPU has passed all recorded events before reading them.
    torch.cuda.synchronize()
    if iter_i < warmup_iterations:
        continue

    elapsed_time = iteration_start.elapsed_time(iteration_end) / MS_PER_SEC
    iteration_elapsed_times.append(elapsed_time)

    elapsed_time = reset_grad_start.elapsed_time(reset_grad_end) / MS_PER_SEC
    reset_grad_elapsed_times.append(elapsed_time)

# Compute each average once and reuse it for the ratio.
avg_iteration_time = np.average(iteration_elapsed_times)
avg_reset_grad_time = np.average(reset_grad_elapsed_times)
print("average iteration time:", avg_iteration_time, "sec.")
print("average reset grad time:", avg_reset_grad_time, "sec.")
print("(reset grad time)/(iteration time):", avg_reset_grad_time / avg_iteration_time * 100, "%")

average iteration time: 0.4114357562255859 sec.
average reset grad time: 2.3651200218591838e-06 sec.
(reset grad time)/(iteration time): 0.0005748455223134309 %
