<a href="https://colab.research.google.com/github/shu65/pyorch_performance_tuning_guide_examples/blob/main/Fuse_pointwise_operations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


PERFORMANCE TUNING GUIDE:

https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#fuse-pointwise-operations

In [1]:
!pip list | grep torch

torch                         1.8.1+cu101   
torchsummary                  1.5.1         
torchtext                     0.9.1         
torchvision                   0.9.1+cu101   


In [2]:
!nvidia-smi

Sat May  8 13:13:56 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import os
import time

import torch

In [4]:
def gelu(x):
    return x * 0.5 * (1.0 + torch.erf(x / 1.41421))

In [5]:
input_batch_cpu = torch.randn(128, 3, 224, 224)
input_batch_gpu = input_batch_cpu.clone().detach().to('cuda')

In [6]:
# CPU default

n_trials = 100
out = gelu(input_batch_cpu)

start = time.time()
with torch.no_grad():
  for i in range(n_trials):
    out = gelu(input_batch_cpu)
elapsed_time = time.time() - start

print("avg cpu default:", elapsed_time/n_trials, 'sec.')

avg cpu default: 0.10588892936706543 sec.


In [7]:
# CPU torch.jit.script
torch.jit._state._jit_function_overload_caching.clear()
torch.jit._state._jit_caching_layer.clear()

n_trials = 100
scripted_gelu = torch.jit.script(gelu)
out = scripted_gelu(input_batch_cpu)

start = time.time()
with torch.no_grad():
  for i in range(n_trials):
    out = scripted_gelu(input_batch_cpu)
elapsed_time = time.time() - start

print("avg cpu torch.jit.script:", elapsed_time/n_trials, 'sec.')

avg cpu torch.jit.script: 0.10547802448272706 sec.


In [8]:
# GPU default

n_trials = 1000
out = gelu(input_batch_gpu)

torch.cuda.synchronize()
start = time.time()
with torch.no_grad():
  for i in range(n_trials):
    out = gelu(input_batch_gpu)
torch.cuda.synchronize()
elapsed_time = time.time() - start

print("avg gpu default:", elapsed_time/n_trials, 'sec.')

avg gpu default: 0.003560582160949707 sec.


In [9]:
# GPU torch.jit.script
torch.jit._state._jit_function_overload_caching.clear()
torch.jit._state._jit_caching_layer.clear()

n_trials = 1000
scripted_gelu = torch.jit.script(gelu)
out = scripted_gelu(input_batch_gpu)

torch.cuda.synchronize()
start = time.time()
with torch.no_grad():
  for i in range(n_trials):
    out = scripted_gelu(input_batch_gpu)
torch.cuda.synchronize()
elapsed_time = time.time() - start

print("avg gpu torch.jit.script:", elapsed_time/n_trials, 'sec.')

avg gpu torch.jit.script: 0.000788639783859253 sec.


In [10]:
# CPU torch.jit.script graph 

torch.jit._state._jit_function_overload_caching.clear()
torch.jit._state._jit_caching_layer.clear()

scripted_gelu = torch.jit.script(gelu)


out = scripted_gelu(input_batch_cpu)
print("1st graph ",torch.jit.last_executed_optimized_graph())
out = scripted_gelu(input_batch_cpu)
print("2nd graph ",torch.jit.last_executed_optimized_graph())

1st graph  graph(%x.1 : Tensor):
  %1 : int = prim::Constant[value=1]()
  %2 : float = prim::Constant[value=0.5]() # <ipython-input-4-5971bb91cfd0>:2:15
  %3 : float = prim::Constant[value=1.]() # <ipython-input-4-5971bb91cfd0>:2:22
  %4 : float = prim::Constant[value=1.41421]() # <ipython-input-4-5971bb91cfd0>:2:42
  %10 : Tensor = prim::profile[profiled_type=Float(128, 3, 224, 224, strides=[150528, 50176, 224, 1], requires_grad=0, device=cpu)](%x.1)
  %5 : Tensor = aten::mul(%10, %2) # <ipython-input-4-5971bb91cfd0>:2:11
  %11 : Tensor = prim::profile[profiled_type=Float(128, 3, 224, 224, strides=[150528, 50176, 224, 1], requires_grad=0, device=cpu)](%x.1)
  %6 : Tensor = aten::div(%11, %4) # <ipython-input-4-5971bb91cfd0>:2:38
  %12 : Tensor = prim::profile[profiled_type=Float(128, 3, 224, 224, strides=[150528, 50176, 224, 1], requires_grad=0, device=cpu)](%6)
  %7 : Tensor = aten::erf(%12) # <ipython-input-4-5971bb91cfd0>:2:28
  %13 : Tensor = prim::profile[profiled_type=Float(128,

In [11]:
# GPU torch.jit.script graph 

torch.jit._state._jit_function_overload_caching.clear()
torch.jit._state._jit_caching_layer.clear()

scripted_gelu = torch.jit.script(gelu)


out = scripted_gelu(input_batch_gpu)
print("1st graph ",torch.jit.last_executed_optimized_graph())
out = scripted_gelu(input_batch_gpu)
print("2nd graph ",torch.jit.last_executed_optimized_graph())

1st graph  graph(%x.1 : Tensor):
  %1 : int = prim::Constant[value=1]()
  %2 : float = prim::Constant[value=0.5]() # <ipython-input-4-5971bb91cfd0>:2:15
  %3 : float = prim::Constant[value=1.]() # <ipython-input-4-5971bb91cfd0>:2:22
  %4 : float = prim::Constant[value=1.41421]() # <ipython-input-4-5971bb91cfd0>:2:42
  %10 : Tensor = prim::profile[profiled_type=Float(128, 3, 224, 224, strides=[150528, 50176, 224, 1], requires_grad=0, device=cuda:0)](%x.1)
  %5 : Tensor = aten::mul(%10, %2) # <ipython-input-4-5971bb91cfd0>:2:11
  %11 : Tensor = prim::profile[profiled_type=Float(128, 3, 224, 224, strides=[150528, 50176, 224, 1], requires_grad=0, device=cuda:0)](%x.1)
  %6 : Tensor = aten::div(%11, %4) # <ipython-input-4-5971bb91cfd0>:2:38
  %12 : Tensor = prim::profile[profiled_type=Float(128, 3, 224, 224, strides=[150528, 50176, 224, 1], requires_grad=0, device=cuda:0)](%6)
  %7 : Tensor = aten::erf(%12) # <ipython-input-4-5971bb91cfd0>:2:28
  %13 : Tensor = prim::profile[profiled_type=F

In [12]:
# GPU torch.jit.script graph without optimized_execution

torch.jit._state._jit_function_overload_caching.clear()
torch.jit._state._jit_caching_layer.clear()

with torch.jit.optimized_execution(False):
    scripted_gelu = torch.jit.script(gelu)


    out = scripted_gelu(input_batch_gpu)
    print("1st graph ",torch.jit.last_executed_optimized_graph())
    out = scripted_gelu(input_batch_gpu)
    print("2nd graph ",torch.jit.last_executed_optimized_graph())

1st graph  graph(%x.1 : Tensor):
  %1 : int = prim::Constant[value=1]()
  %2 : float = prim::Constant[value=0.5]() # <ipython-input-4-5971bb91cfd0>:2:15
  %3 : float = prim::Constant[value=1.]() # <ipython-input-4-5971bb91cfd0>:2:22
  %4 : float = prim::Constant[value=1.41421]() # <ipython-input-4-5971bb91cfd0>:2:42
  %5 : Tensor = aten::mul(%x.1, %2) # <ipython-input-4-5971bb91cfd0>:2:11
  %6 : Tensor = aten::div(%x.1, %4) # <ipython-input-4-5971bb91cfd0>:2:38
  %7 : Tensor = aten::erf(%6) # <ipython-input-4-5971bb91cfd0>:2:28
  %8 : Tensor = aten::add(%7, %3, %1) # <string>:5:9
  %9 : Tensor = aten::mul(%5, %8) # <ipython-input-4-5971bb91cfd0>:2:11
  return (%9)

2nd graph  graph(%x.1 : Tensor):
  %1 : int = prim::Constant[value=1]()
  %2 : float = prim::Constant[value=0.5]() # <ipython-input-4-5971bb91cfd0>:2:15
  %3 : float = prim::Constant[value=1.]() # <ipython-input-4-5971bb91cfd0>:2:22
  %4 : float = prim::Constant[value=1.41421]() # <ipython-input-4-5971bb91cfd0>:2:42
  %5 : T