<a href="https://colab.research.google.com/github/shu65/blog-pytorch-notebooks/blob/main/pytorch_CUDA_Graph.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Sat Oct 23 01:26:36 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.74       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P8    30W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip3 install torch==1.10.0+cu111 torchvision==0.11.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.10.0+cu111
  Downloading https://download.pytorch.org/whl/cu111/torch-1.10.0%2Bcu111-cp37-cp37m-linux_x86_64.whl (2137.6 MB)
[K     |████████████▌                   | 834.1 MB 1.9 MB/s eta 0:11:21tcmalloc: large alloc 1147494400 bytes == 0x5560a3aa4000 @  0x7f2cbfa22615 0x556069f0f4cc 0x556069fef47a 0x556069f122ed 0x55606a003e1d 0x556069f85e99 0x556069f809ee 0x556069f13bda 0x556069f85d00 0x556069f809ee 0x556069f13bda 0x556069f82737 0x55606a004c66 0x556069f81daf 0x55606a004c66 0x556069f81daf 0x55606a004c66 0x556069f81daf 0x556069f14039 0x556069f57409 0x556069f12c52 0x556069f85c25 0x556069f809ee 0x556069f13bda 0x556069f82737 0x556069f809ee 0x556069f13bda 0x556069f81915 0x556069f13afa 0x556069f81c0d 0x556069f809ee
[K     |███████████████▉                | 1055.7 MB 1.6 MB/s eta 0:11:19tcmalloc: large alloc 1434370048 bytes == 0x5560e80fa000 @  0x7f2cbfa22615 0x556069f0f4cc 0x556069fef47a 0x556069f12

In [3]:
!pip list | grep torch

torch                         1.10.0+cu111
torchsummary                  1.5.1
torchtext                     0.10.0
torchvision                   0.11.1+cu111


In [4]:
import torch

static_input = torch.empty((5,), device="cuda")
# Warmup before capture
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    for _ in range(3):
        static_output = static_input * 2
torch.cuda.current_stream().wait_stream(s)

# Captures the graph
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    static_output = static_input * 2

# Fills the graph's input memory with new data to compute on
static_input.copy_(torch.full((5,), 3, device="cuda"))
print("input of cuda graph", static_input)
g.replay()
# static_output holds the results
print("output of cuda graph", static_output)  # full of 3 * 2 = 6

# Fills the graph's input memory with more data to compute on
static_input.copy_(torch.full((5,), 4, device="cuda"))
print("input of cuda graph", static_input)
g.replay()
print("output of cuda graph",static_output)  # full of 4 * 2 = 8

input of cuda graph tensor([3., 3., 3., 3., 3.], device='cuda:0')
output of cuda graph tensor([6., 6., 6., 6., 6.], device='cuda:0')
input of cuda graph tensor([4., 4., 4., 4., 4.], device='cuda:0')
output of cuda graph tensor([8., 8., 8., 8., 8.], device='cuda:0')


In [5]:
import time

import torch

def training_step(model, loss_fn, optimizer, data, target):
    y_pred = model(data)
    loss = loss_fn(y_pred, target)
    loss.backward()
    optimizer.step()

N, D_in, H, D_out = 32, 128, 256, 16
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.Dropout(p=0.2),
    torch.nn.Linear(H, D_out),
    torch.nn.Dropout(p=0.1)
).cuda()
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Placeholders used for capture
static_input = torch.randn(N, D_in, device='cuda')
static_target = torch.randn(N, D_out, device='cuda')

# warmup
torch.cuda.synchronize()
for _ in range(3):
    optimizer.zero_grad(set_to_none=True)
    training_step(model, loss_fn, optimizer, static_input, static_target)
torch.cuda.synchronize()

# capture
g = torch.cuda.CUDAGraph()
optimizer.zero_grad(set_to_none=True)
with torch.cuda.graph(g):
    training_step(model, loss_fn, optimizer, static_input, static_target)

n_trials = 10
real_inputs = [torch.rand_like(static_input) for _ in range(n_trials)]
real_targets = [torch.rand_like(static_target) for _ in range(n_trials)]

torch.cuda.synchronize()
start = time.time()
for data, target in zip(real_inputs, real_targets):
    # Fills the graph's input memory with new data to compute on
    static_input.copy_(data)
    static_target.copy_(target)
    training_step(model, loss_fn, optimizer, static_input, static_target)
torch.cuda.synchronize()
elapsed_time = time.time() - start
print("avg cuda default:", elapsed_time/n_trials, 'sec.')

torch.cuda.synchronize()
start = time.time()
for data, target in zip(real_inputs, real_targets):
    # Fills the graph's input memory with new data to compute on
    static_input.copy_(data)
    static_target.copy_(target)
    g.replay()
torch.cuda.synchronize()
elapsed_time = time.time() - start

print("avg cuda graph:", elapsed_time/n_trials, 'sec.')

avg cuda default: 0.0011144161224365234 sec.
avg cuda graph: 0.00047113895416259763 sec.


In [7]:
import time

import torch

def gelu(x):
    return x * 0.5 * (1.0 + torch.erf(x / 1.41421))

n_trials = 10000
input_batch_cpu = torch.randn(1, 3, 224, 224)
#input_batch_cpu = torch.randn(32, 3, 224, 224)

input_batch_gpu = input_batch_cpu.clone().detach().to('cuda')

# default
out = gelu(input_batch_gpu)
torch.cuda.synchronize()
start = time.time()
for i in range(n_trials):
    out = gelu(input_batch_gpu)
torch.cuda.synchronize()
elapsed_time = time.time() - start
print("avg default:", elapsed_time/n_trials, 'sec.')

# torch.jit.script
torch.jit._state._jit_function_overload_caching.clear()
torch.jit._state._jit_caching_layer.clear()

scripted_gelu = torch.jit.script(gelu)
out = scripted_gelu(input_batch_gpu)

torch.cuda.synchronize()
start = time.time()
for i in range(n_trials):
    out = scripted_gelu(input_batch_gpu)
torch.cuda.synchronize()
elapsed_time = time.time() - start
print("avg torch.jit.script:", elapsed_time/n_trials, 'sec.')

# CUDA Graph
gelu_graph = torch.cuda.CUDAGraph()
static_input = torch.empty_like(input_batch_gpu)
# Warmup before capture
torch.cuda.synchronize()
static_output = gelu(static_input)
torch.cuda.synchronize()

with torch.cuda.graph(gelu_graph):
    static_output = gelu(static_input)

torch.cuda.synchronize()
start = time.time()
for i in range(n_trials):
    gelu_graph.replay()
torch.cuda.synchronize()
elapsed_time = time.time() - start
print("avg cuda graph:", elapsed_time/n_trials, 'sec.')

scripted_gelu_graph = torch.cuda.CUDAGraph()
static_input = torch.empty_like(input_batch_gpu)
# Warmup before capture
torch.cuda.synchronize()
static_output = scripted_gelu(static_input)
torch.cuda.synchronize()

with torch.cuda.graph(scripted_gelu_graph):
    static_output = scripted_gelu(static_input)

torch.cuda.synchronize()
start = time.time()
for i in range(n_trials):
    scripted_gelu_graph.replay()
torch.cuda.synchronize()
elapsed_time = time.time() - start
print("avg torch.jit.script and cuda graph:", elapsed_time/n_trials, 'sec.')

avg default: 7.09254503250122e-05 sec.
avg torch.jit.script: 3.8933825492858885e-05 sec.
avg cuda graph: 6.485035419464111e-05 sec.
avg torch.jit.script and cuda graph: 3.563470840454101e-05 sec.


In [6]:
import time

import torch

def gelu(x):
    return x * 0.5 * (1.0 + torch.erf(x / 1.41421))

n_trials = 10000
#input_batch_cpu = torch.randn(1, 3, 224, 224)
input_batch_cpu = torch.randn(32, 3, 224, 224)

input_batch_gpu = input_batch_cpu.clone().detach().to('cuda')

# default
out = gelu(input_batch_gpu)
torch.cuda.synchronize()
start = time.time()
for i in range(n_trials):
    out = gelu(input_batch_gpu)
torch.cuda.synchronize()
elapsed_time = time.time() - start
print("avg default:", elapsed_time/n_trials, 'sec.')

# torch.jit.script
torch.jit._state._jit_function_overload_caching.clear()
torch.jit._state._jit_caching_layer.clear()

scripted_gelu = torch.jit.script(gelu)
out = scripted_gelu(input_batch_gpu)

torch.cuda.synchronize()
start = time.time()
for i in range(n_trials):
    out = scripted_gelu(input_batch_gpu)
torch.cuda.synchronize()
elapsed_time = time.time() - start
print("avg torch.jit.script:", elapsed_time/n_trials, 'sec.')

# CUDA Graph
gelu_graph = torch.cuda.CUDAGraph()
static_input = torch.empty_like(input_batch_gpu)
# Warmup before capture
torch.cuda.synchronize()
static_output = gelu(static_input)
torch.cuda.synchronize()

with torch.cuda.graph(gelu_graph):
    static_output = gelu(static_input)

torch.cuda.synchronize()
start = time.time()
for i in range(n_trials):
    gelu_graph.replay()
torch.cuda.synchronize()
elapsed_time = time.time() - start
print("avg cuda graph:", elapsed_time/n_trials, 'sec.')

scripted_gelu_graph = torch.cuda.CUDAGraph()
static_input = torch.empty_like(input_batch_gpu)
# Warmup before capture
torch.cuda.synchronize()
static_output = scripted_gelu(static_input)
torch.cuda.synchronize()

with torch.cuda.graph(scripted_gelu_graph):
    static_output = scripted_gelu(static_input)

torch.cuda.synchronize()
start = time.time()
for i in range(n_trials):
    scripted_gelu_graph.replay()
torch.cuda.synchronize()
elapsed_time = time.time() - start
print("avg torch.jit.script and cuda graph:", elapsed_time/n_trials, 'sec.')

avg default: 0.001322664523124695 sec.
avg torch.jit.script: 0.00042458438873291016 sec.
avg cuda graph: 0.0013399296522140502 sec.
avg torch.jit.script and cuda graph: 0.00037378573417663573 sec.
