<a href="https://colab.research.google.com/github/shu65/pytorch_2_compile_example/blob/main/torch_2_0_compile.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U torch torchvision torchaudio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch
  Downloading torch-2.0.0-cp39-cp39-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting torchvision
  Downloading torchvision-0.15.1-cp39-cp39-manylinux1_x86_64.whl (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m62.3 MB/s[0m eta [36m0:00:00[0m
Collecting torchaudio
  Downloading torchaudio-2.0.1-cp39-cp39-manylinux1_x86_64.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m74.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cusparse-cu11==11.7.4.91
  Downloading nvidia_cusparse_cu11-11.7.4.91-py3-none-manylinux1_x86_64.whl (173.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m173.2/173.2 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hCo

In [2]:
!pip list | grep torch

torch                         2.0.0
torchaudio                    2.0.1
torchsummary                  1.5.1
torchtext                     0.14.1
torchvision                   0.15.1


In [3]:
!nvidia-smi

Fri Mar 17 18:26:12 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   65C    P0    28W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
import time 

import torch
import torchvision.models as models
import torch._dynamo


batch_size = 64
n_warmup_iters = 10
n_iters = 500

x = torch.randn(batch_size, 3, 224, 224).cuda()

def get_mode():
    return models.resnet18()

In [5]:
model = get_mode().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
for i in range(n_warmup_iters):
    optimizer.zero_grad()
    torch.cuda.synchronize()
    start = time.time()
    out = model(x)
    torch.cuda.synchronize()
    forward_elapsed_time = time.time() - start
    torch.cuda.synchronize()
    start = time.time()
    out.sum().backward()
    backward_elapsed_time = time.time() - start
    print(f"default {i} iter forward: {forward_elapsed_time/1000:.3e} msec., backward: {backward_elapsed_time/1000:.3e} msec.")
    optimizer.step()

print("-"*10)

torch.cuda.synchronize()
start = time.time()
for i in range(n_iters):
    optimizer.zero_grad()
    out = model(x)
    out.sum().backward()
    optimizer.step()
torch.cuda.synchronize()
elapsed_time = time.time() - start

print(f"default total:{elapsed_time:.3e} sec. {batch_size*n_iters/elapsed_time:.3e} imgs/sec.")

default 0 iter forward: 5.549e-03 msec., backward: 6.485e-04 msec.
default 1 iter forward: 5.869e-05 msec., backward: 6.034e-06 msec.
default 2 iter forward: 5.193e-05 msec., backward: 5.990e-06 msec.
default 3 iter forward: 5.335e-05 msec., backward: 5.827e-06 msec.
default 4 iter forward: 5.128e-05 msec., backward: 3.760e-06 msec.
default 5 iter forward: 5.155e-05 msec., backward: 4.232e-06 msec.
default 6 iter forward: 5.188e-05 msec., backward: 6.186e-06 msec.
default 7 iter forward: 5.234e-05 msec., backward: 5.688e-06 msec.
default 8 iter forward: 5.200e-05 msec., backward: 3.709e-06 msec.
default 9 iter forward: 5.167e-05 msec., backward: 6.084e-06 msec.
----------
default total:7.868e+01 sec. 4.067e+02 imgs/sec.


In [6]:
torch._dynamo.reset()

model = get_mode().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# compile
compiled_model = torch.compile(model)
for i in range(n_warmup_iters):
    optimizer.zero_grad()
    torch.cuda.synchronize()
    start = time.time()
    out = compiled_model(x)
    torch.cuda.synchronize()
    forward_elapsed_time = time.time() - start
    torch.cuda.synchronize()
    start = time.time()
    out.sum().backward()
    backward_elapsed_time = time.time() - start
    print(f"with compile {i} iter forward: {forward_elapsed_time/1000:.3e} msec., backward: {backward_elapsed_time/1000:.3e} msec.")
    optimizer.step()

print("-"*10)

torch.cuda.synchronize()
start = time.time()
for i in range(n_iters):
    optimizer.zero_grad()
    out = compiled_model(x)
    out.sum().backward()
    optimizer.step()
torch.cuda.synchronize()
elapsed_time = time.time() - start

print(f"with compile total:{elapsed_time:.3e} sec. {batch_size*n_iters/elapsed_time:.3e} imgs/sec.")

with compile 0 iter forward: 2.029e-02 msec., backward: 1.319e-02 msec.
with compile 1 iter forward: 5.798e-05 msec., backward: 6.389e-06 msec.
with compile 2 iter forward: 5.125e-05 msec., backward: 9.748e-06 msec.
with compile 3 iter forward: 5.196e-05 msec., backward: 6.076e-06 msec.
with compile 4 iter forward: 4.972e-05 msec., backward: 6.641e-06 msec.
with compile 5 iter forward: 4.980e-05 msec., backward: 6.386e-06 msec.
with compile 6 iter forward: 4.960e-05 msec., backward: 5.938e-06 msec.
with compile 7 iter forward: 4.988e-05 msec., backward: 5.989e-06 msec.
with compile 8 iter forward: 5.028e-05 msec., backward: 6.197e-06 msec.
with compile 9 iter forward: 5.009e-05 msec., backward: 6.032e-06 msec.
----------
with compile total:7.337e+01 sec. 4.361e+02 imgs/sec.


In [7]:
torch._dynamo.reset()

model = get_mode().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
# compile 
compiled_model = torch.compile(model, mode="reduce-overhead")
for i in range(n_warmup_iters):
    optimizer.zero_grad()
    torch.cuda.synchronize()
    start = time.time()
    out = compiled_model(x)
    torch.cuda.synchronize()
    forward_elapsed_time = time.time() - start
    torch.cuda.synchronize()
    start = time.time()
    out.sum().backward()
    backward_elapsed_time = time.time() - start
    print(f"with compile reduce-overhead {i} iter forward: {forward_elapsed_time/1000:.3e} msec., backward: {backward_elapsed_time/1000:.3e} msec.")
    optimizer.step()

print("-"*10)

torch.cuda.synchronize()
start = time.time()
for i in range(n_iters):
    optimizer.zero_grad()
    out = compiled_model(x)
    out.sum().backward()
    optimizer.step()
torch.cuda.synchronize()
elapsed_time = time.time() - start

print(f"with compile reduce-overhead total:{elapsed_time:.3e} sec. {batch_size*n_iters/elapsed_time:.3e} imgs/sec.")

with compile reduce-overhead 0 iter forward: 7.952e-03 msec., backward: 2.075e-03 msec.
with compile reduce-overhead 1 iter forward: 5.474e-05 msec., backward: 3.117e-06 msec.
with compile reduce-overhead 2 iter forward: 4.764e-05 msec., backward: 3.864e-06 msec.
with compile reduce-overhead 3 iter forward: 4.787e-05 msec., backward: 2.983e-06 msec.
with compile reduce-overhead 4 iter forward: 4.822e-05 msec., backward: 2.951e-06 msec.
with compile reduce-overhead 5 iter forward: 4.855e-05 msec., backward: 3.038e-06 msec.
with compile reduce-overhead 6 iter forward: 4.937e-05 msec., backward: 3.098e-06 msec.
with compile reduce-overhead 7 iter forward: 4.825e-05 msec., backward: 2.992e-06 msec.
with compile reduce-overhead 8 iter forward: 4.902e-05 msec., backward: 2.908e-06 msec.
with compile reduce-overhead 9 iter forward: 4.765e-05 msec., backward: 3.263e-06 msec.
----------
with compile reduce-overhead total:7.752e+01 sec. 4.128e+02 imgs/sec.


In [8]:
torch._dynamo.reset()

model = get_mode().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# compile
compiled_model = torch.compile(model, mode="max-autotune")
for i in range(n_warmup_iters):
    optimizer.zero_grad()
    torch.cuda.synchronize()
    start = time.time()
    out = compiled_model(x)
    torch.cuda.synchronize()
    forward_elapsed_time = time.time() - start
    torch.cuda.synchronize()
    start = time.time()
    out.sum().backward()
    backward_elapsed_time = time.time() - start
    print(f"with compile max-autotune {i} iter forward: {forward_elapsed_time/1000:.3e} msec., backward: {backward_elapsed_time/1000:.3e} msec.")
    optimizer.step()

print("-"*10)

torch.cuda.synchronize()
start = time.time()
for i in range(n_iters):
    optimizer.zero_grad()
    out = compiled_model(x)
    out.sum().backward()
    optimizer.step()
torch.cuda.synchronize()
elapsed_time = time.time() - start

print(f"with compile max-autotune total:{elapsed_time:.3e} sec. {batch_size*n_iters/elapsed_time:.3e} imgs/sec.")



with compile max-autotune 0 iter forward: 6.270e-03 msec., backward: 1.910e-03 msec.
with compile max-autotune 1 iter forward: 5.046e-05 msec., backward: 3.277e-06 msec.
with compile max-autotune 2 iter forward: 5.078e-05 msec., backward: 3.359e-06 msec.
with compile max-autotune 3 iter forward: 5.099e-05 msec., backward: 3.483e-06 msec.
with compile max-autotune 4 iter forward: 5.013e-05 msec., backward: 5.197e-06 msec.
with compile max-autotune 5 iter forward: 5.044e-05 msec., backward: 3.144e-06 msec.
with compile max-autotune 6 iter forward: 5.031e-05 msec., backward: 3.129e-06 msec.
with compile max-autotune 7 iter forward: 5.008e-05 msec., backward: 3.624e-06 msec.
with compile max-autotune 8 iter forward: 5.063e-05 msec., backward: 3.101e-06 msec.
with compile max-autotune 9 iter forward: 5.024e-05 msec., backward: 3.330e-06 msec.
----------
with compile max-autotune total:7.335e+01 sec. 4.362e+02 imgs/sec.
