<a href="https://colab.research.google.com/github/shu65/pytorch_2_compile_example/blob/main/torch_2_0_compile.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
!pip install -U torch torchvision torchaudio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [13]:
# List the installed packages whose name contains "torch", with their versions.
!pip list | grep torch

torch                         2.0.0
torchaudio                    2.0.1
torchsummary                  1.5.1
torchtext                     0.14.1
torchvision                   0.15.1


In [14]:
# Print the GPU, driver and CUDA version visible to this runtime.
!nvidia-smi

Thu Mar 16 22:03:27 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   61C    P0    30W /  70W |   9529MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [15]:
import time 

import torch
import torchvision.models as models
import torch._dynamo


# Benchmark settings shared by the measurement cells below.
batch_size = 64       # images per forward/backward pass
n_warmup_iters = 10   # untimed iterations run before each measurement
n_iters = 500         # timed iterations per measurement

# One fixed random batch, moved to the GPU once and reused as the model input.
input_shape = (batch_size, 3, 224, 224)
x = torch.randn(input_shape).cuda()

def get_mode():
    """Build and return a new torchvision ResNet-18 with the constructor's default arguments.

    NOTE(review): the name looks like a typo for ``get_model``; it is kept
    because other cells call ``get_mode()``.
    """
    resnet = models.resnet18()
    return resnet

In [16]:
# Baseline: eager-mode training throughput (no torch.compile).
model = get_mode().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Warm-up runs the same full training step as the timed loop (including
# optimizer.step()) so one-time setup costs stay out of the measurement.
for _ in range(n_warmup_iters):
    optimizer.zero_grad()
    out = model(x)
    out.sum().backward()
    optimizer.step()

# CUDA work is queued asynchronously: wait for it to finish before starting
# and before stopping the clock.
torch.cuda.synchronize()
# perf_counter() is monotonic and high-resolution, unlike wall-clock time.time().
start = time.perf_counter()
for _ in range(n_iters):
    optimizer.zero_grad()
    out = model(x)
    out.sum().backward()
    optimizer.step()
torch.cuda.synchronize()
elapsed_time = time.perf_counter() - start

print(f"default:{elapsed_time} sec. {batch_size*n_iters/elapsed_time} imgs/sec.")

default:78.65860414505005 sec. 406.82135600818117 imgs/sec.


In [17]:
# torch.compile with default settings.
# Reset TorchDynamo state so compilation from earlier cells is not reused.
torch._dynamo.reset()

model = get_mode().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# compile
compiled_model = torch.compile(model)

# Warm-up triggers compilation and runs the same full training step as the
# timed loop (including optimizer.step()), keeping one-time costs out of the
# measurement.
for _ in range(n_warmup_iters):
    optimizer.zero_grad()
    out = compiled_model(x)
    out.sum().backward()
    optimizer.step()

# CUDA work is queued asynchronously: wait for it to finish before starting
# and before stopping the clock.
torch.cuda.synchronize()
# perf_counter() is monotonic and high-resolution, unlike wall-clock time.time().
start = time.perf_counter()
for _ in range(n_iters):
    optimizer.zero_grad()
    out = compiled_model(x)
    out.sum().backward()
    optimizer.step()
torch.cuda.synchronize()
elapsed_time = time.perf_counter() - start

print(f"with compile:{elapsed_time} sec. {batch_size*n_iters/elapsed_time} imgs/sec.")

with compile:74.9407467842102 sec. 427.0040181497405 imgs/sec.


In [18]:
# torch.compile with mode="reduce-overhead".
# Reset TorchDynamo state so compilation from earlier cells is not reused.
torch._dynamo.reset()

model = get_mode().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# compile
compiled_model = torch.compile(model, mode="reduce-overhead")

# Warm-up triggers compilation and runs the same full training step as the
# timed loop (including optimizer.step()), keeping one-time costs out of the
# measurement.
for _ in range(n_warmup_iters):
    optimizer.zero_grad()
    out = compiled_model(x)
    out.sum().backward()
    optimizer.step()

# CUDA work is queued asynchronously: wait for it to finish before starting
# and before stopping the clock.
torch.cuda.synchronize()
# perf_counter() is monotonic and high-resolution, unlike wall-clock time.time().
start = time.perf_counter()
for _ in range(n_iters):
    optimizer.zero_grad()
    out = compiled_model(x)
    out.sum().backward()
    optimizer.step()
torch.cuda.synchronize()
elapsed_time = time.perf_counter() - start

print(f"with compile reduce-overhead:{elapsed_time} sec. {batch_size*n_iters/elapsed_time} imgs/sec.")

with compile reduce-overhead:79.5311849117279 sec. 402.35789313986675 imgs/sec.


In [19]:
# torch.compile with mode="max-autotune".
# Reset TorchDynamo state so compilation from earlier cells is not reused.
torch._dynamo.reset()

model = get_mode().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# compile
compiled_model = torch.compile(model, mode="max-autotune")

# Warm-up triggers compilation and runs the same full training step as the
# timed loop (including optimizer.step()), keeping one-time costs out of the
# measurement.
for _ in range(n_warmup_iters):
    optimizer.zero_grad()
    out = compiled_model(x)
    out.sum().backward()
    optimizer.step()

# CUDA work is queued asynchronously: wait for it to finish before starting
# and before stopping the clock.
torch.cuda.synchronize()
# perf_counter() is monotonic and high-resolution, unlike wall-clock time.time().
start = time.perf_counter()
for _ in range(n_iters):
    optimizer.zero_grad()
    out = compiled_model(x)
    out.sum().backward()
    optimizer.step()
torch.cuda.synchronize()
elapsed_time = time.perf_counter() - start

print(f"with compile max-autotune:{elapsed_time} sec. {batch_size*n_iters/elapsed_time} imgs/sec.")

with compile max-autotune:74.86910438537598 sec. 427.4126191664514 imgs/sec.
