In [None]:
import torch 
import torchvision 
import pandas as pd
import numpy as np
import time
from torch.profiler import profile, record_function, ProfilerActivity
import numpy
from torch.nn import Linear, ReLU, CrossEntropyLoss, Sequential, Conv2d, MaxPool2d, Module, Softmax, BatchNorm2d, Dropout
from torch.optim import Adam, SGD
from torchvision.transforms import ToTensor
import torchvision.transforms as transforms

In [None]:
# Training pipeline: light augmentation (padded random crop + horizontal flip),
# then tensor conversion and per-channel normalisation with mean 0.5 / std 0.5.
train_transforms = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Evaluation pipeline: same tensor conversion and normalisation, but NO random
# augmentation, so results on the test split are deterministic and are
# measured on the unmodified images.
test_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])


# CIFAR-10 is downloaded into `root` on first use and reused afterwards.
train_data = torchvision.datasets.CIFAR10(root='/data',train=True,download=True,transform=train_transforms)
test_data = torchvision.datasets.CIFAR10(root='/data',train=False,download=True,transform=test_transforms)

# Shuffle only the training split; a fixed order keeps evaluation reproducible.
train_loader = torch.utils.data.DataLoader(train_data, batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=128, shuffle=False)


Files already downloaded and verified
Files already downloaded and verified


Conv Model

In [None]:
class cnn_model(Module):
  """Small convolutional classifier for image batches shaped (N, C, H, W).

  Four 3x3 convolutions (each followed by a ReLU) with three 2x2 max-pools
  in between, then global average pooling, flatten and a linear layer that
  produces one logit per class.

  Args:
    in_channels: number of channels of the input images (3 for RGB data
      such as CIFAR-10).
    num_classes: number of output logits.
  """
  def __init__(self, in_channels=3, num_classes=10):
    super().__init__()

    self.cnn_layer=Sequential(
        # Each conv's input width must equal the previous conv's output width.
        Conv2d(in_channels, 512, kernel_size=3, stride=1, padding=1),
        ReLU(),
        Conv2d(512, 256, kernel_size=3, stride=1, padding=1),
        ReLU(),
        MaxPool2d(kernel_size=2, stride=2),
        Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
        ReLU(),
        MaxPool2d(kernel_size=2, stride=2),
        Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
        ReLU(),
        MaxPool2d(kernel_size=2, stride=2),
        # Collapse the remaining spatial grid to 1x1 so the classifier always
        # receives exactly 256 features, whatever the input resolution.
        torch.nn.AdaptiveAvgPool2d(1),
        torch.nn.Flatten(),
        Linear(256, num_classes)
    )
  def forward(self,input):
    """Return class logits of shape (N, num_classes) for a batch of images."""
    return self.cnn_layer(input)
    

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Alternative kept for reference: a pretrained ResNet-18 whose final layer is
# replaced by a 10-way classifier.
#model1 = torchvision.models.resnet18(pretrained=True)
#num_ftrs = model1.fc.in_features
#model1.fc = torch.nn.Linear(num_ftrs, 10)
#model1.to(device)

# Build the custom CNN and move its parameters to the selected device.
model1 = cnn_model()
model1 = model1.to(device)

In [None]:

# Profile CPU time and memory of a few SGD steps of the custom CNN.
num_profiled_steps = 10  # only a handful of mini-batches are needed for a profile

model1.train()  # harmless if the model is already in training mode
# Keep the loss module on the same device as the model and the data.
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model1.parameters(), lr=0.001, momentum=0.9)
train_losses=[]
i=0  # number of optimisation steps actually executed
with profile(activities=[ProfilerActivity.CPU],profile_memory=True, record_shapes=True) as prof:
  train_loss=0
  for step, batch_data in enumerate(train_loader):
    if step==num_profiled_steps:
      break
    inputs, labels = batch_data[0].to(device=device), batch_data[1].to(device=device)
    optimizer.zero_grad()
    outputs = model1(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    train_loss+=loss.item()
    i+=1

  # Average over the batches that were really processed; dividing by
  # len(train_loader) would understate the loss because the loop stops early.
  train_losses.append(train_loss/max(i, 1))
  print('Total loss: ',np.mean(train_losses))


# Show the ten operators that allocated the most CPU memory themselves.
print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))

Total loss:  0.06016756445550553
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            aten::empty         0.25%      61.221ms         0.25%      61.221ms       5.105us       1.58 Gb       1.58 Gb         11992  
                               aten::threshold_backward         0.41%     102.181ms         0.41%     102.181ms     601.065us     230.00 Mb     230.00 Mb           170  
                 aten::max_pool2d_with_indices_backward         0.09%      22.905ms         0.19%      47.364ms      

VGG Model

In [None]:
# Profile CPU (and, when available, CUDA) cost of a few SGD steps of VGG-11.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_profiled_steps = 5  # only a handful of mini-batches are needed for a profile

# NOTE(review): the classifier head is left as created by torchvision; if the
# goal is 10-class CIFAR-10 training the last layer presumably needs replacing
# (as in the commented-out ResNet code in this notebook) — confirm.
model2 = torchvision.models.vgg11(pretrained=True)
model2.to(device)
model2.train()  # harmless if the model is already in training mode
# Keep the loss module on the same device as the model and the data.
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model2.parameters(), lr=0.001, momentum=0.9)

# Request CUDA profiling only when a GPU exists; asking for it on a CPU-only
# machine just makes the profiler emit a warning and disable it.
profiler_activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
  profiler_activities.append(ProfilerActivity.CUDA)

train_losses=[]
i=0  # number of optimisation steps actually executed
with profile(activities=profiler_activities,profile_memory=True, record_shapes=True) as prof:
  train_loss=0
  for step, batch_data in enumerate(train_loader):
    if step==num_profiled_steps:
      break
    inputs, labels = batch_data[0].to(device=device), batch_data[1].to(device=device)
    optimizer.zero_grad()
    outputs = model2(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    train_loss+=loss.item()
    i+=1
  # Average over the batches that were really processed; dividing by
  # len(train_loader) would understate the loss because the loop stops early.
  train_losses.append(train_loss/max(i, 1))
  print('Total loss: ',np.mean(train_losses))

# Show the ten operators that allocated the most CPU memory themselves.
print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))

  warn("CUDA is not available, disabling CUDA profiling")


Total loss:  0.13966851100287475
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                               aten::mm        21.72%        7.939s        21.72%        7.939s     264.633ms       2.38 Gb       2.38 Gb            30  
                                            aten::empty         0.04%      15.336ms         0.04%      15.336ms       3.145us     728.50 Mb     728.50 Mb          4877  
                                    aten::empty_strided         0.03%      11.313ms         0.03%      11.313ms      

In [None]:
!pip install onnx
!pip install onnxruntime
!pip install onnxoptimizer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting onnx
  Downloading onnx-1.13.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: onnx
Successfully installed onnx-1.13.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting onnxruntime
  Downloading onnxruntime-1.14.1-cp39-cp39-manylinux_2_27_x86_64.whl (5.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
Collecting coloredlogs
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting humanfriendly>=9.1
  Downloading humanfriendly-10.0-py2.py3-none-

Run time decreases with ONNX

In [None]:
import onnxruntime

# Open an ONNX Runtime inference session for the model file in the working
# directory.
# NOTE(review): "vgg16-7.onnx" is not created by any cell shown here;
# presumably it was downloaded or uploaded beforehand — confirm and document.
ort_session = onnxruntime.InferenceSession("vgg16-7.onnx")

def to_numpy(tensor):
    """Convert a torch tensor to a NumPy array held in host memory.

    A tensor that tracks gradients is detached from the autograd graph first,
    because such tensors cannot be converted to NumPy directly.
    """
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()
# Take a single mini-batch to use as the common input for both runtimes.
inputs, labels = next(iter(train_loader))
inputs, labels = inputs.to(device=device), labels.to(device=device)

# compute ONNX Runtime output prediction
ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(inputs)}
ort_outs = ort_session.run(None, ort_inputs)

# compare ONNX Runtime and PyTorch results
# The PyTorch model must be in eval mode: in training mode the dropout layers
# of torchvision's VGG classifier randomise the output, so no fixed reference
# could match it. Autograd is switched off because only the forward values
# are needed.
model2.eval()
with torch.no_grad():
  torch_out = model2(inputs)
# NOTE(review): model2 is built from torchvision's vgg11 in this notebook while
# the session loads a file named "vgg16-7.onnx"; these look like different
# networks, so this check presumably cannot pass as written — confirm which
# model the ONNX file was exported from.
np.testing.assert_allclose(to_numpy(torch_out), ort_outs[0], rtol=1e-03, atol=1e-05)

Run time decreases further with the optimized ONNX model

In [None]:
# Run the onnxoptimizer command-line module on vgg16-7.onnx; the second
# argument is presumably the output path for the optimized graph — confirm
# against the onnxoptimizer CLI help.
!python -m onnxoptimizer vgg16-7.onnx vgg16-7_opt.onnx

In [None]:
import time
from time import perf_counter

def to_numpy(tensor):
    """Convert a torch tensor to a NumPy array held in host memory.

    A tensor that tracks gradients is detached from the autograd graph first,
    because such tensors cannot be converted to NumPy directly.
    """
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()

def time_ort_model_evaluation(model_path, input_tensor=None, n_runs=10):
    """Measure the mean ONNX Runtime inference latency of a model file.

    Args:
        model_path: path of the ``.onnx`` file to load.
        input_tensor: torch tensor fed to the model's first input. When
            omitted, the notebook-level ``inputs`` batch is used, which keeps
            existing ``time_ort_model_evaluation(path)`` calls working.
        n_runs: number of timed ``session.run`` calls to average over.

    Returns:
        Mean wall-clock time of one ``session.run`` call, in milliseconds.

    Raises:
        ValueError: if ``n_runs`` is smaller than 1 (there would be nothing
            to average).
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")
    if input_tensor is None:
        # Backward-compatible default: fall back to the batch defined at
        # notebook level.
        input_tensor = inputs

    sess_options = onnxruntime.SessionOptions()
    sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
    session = onnxruntime.InferenceSession(model_path, sess_options)

    # The feed dict does not change between runs, so build it once instead of
    # converting the tensor on every iteration.
    ort_inputs = {session.get_inputs()[0].name: to_numpy(input_tensor)}

    time_per_inference = []
    for _ in range(n_runs):
        start = perf_counter()
        session.run(None, ort_inputs)
        # perf_counter() is in seconds; store milliseconds.
        time_per_inference.append(1000 * (perf_counter() - start))

    return np.mean(time_per_inference)

# Report the mean latency (milliseconds) of each model file.
# NOTE(review): no visible cell creates 'vgg16-7_opt_quant.onnx' (the
# optimizer cell writes 'vgg16-7_opt.onnx'); confirm where the quantized file
# comes from.
print('Average runtime of ONNX Model in TPU: ', time_ort_model_evaluation('vgg16-7.onnx'), sep='')
print('Average runtime of ONNX Quantized Model in TPU: ', time_ort_model_evaluation('vgg16-7_opt_quant.onnx'), sep='')
