# Optimizing Vision Transformer model for deployment

In [1]:
from PIL import Image
import torch
import timm
import requests
import torchvision.transforms as transforms
from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD

print(torch.__version__)

1.12.0


In [2]:
model = torch.hub.load("facebookresearch/deit:main", "deit_base_patch16_224", pretrained=True)
model.eval()

Using cache found in C:\Users\SB15/.cache\torch\hub\facebookresearch_deit_main


VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate=none)
        (drop1): Dropout(p=0.0, inplace=False)
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (drop2): Dropout(p=0.0, inplace=Fal

In [3]:
transform = transforms.Compose([
    transforms.Resize(256, interpolation=3),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
])



In [4]:
img = Image.open(requests.get("https://raw.githubusercontent.com/pytorch/ios-demo-app/master/HelloWorld/HelloWorld/HelloWorld/image.png", stream=True).raw)

img = transform(img)[None, ]
out = model(img)
clsidx = torch.argmax(out)
print(clsidx.item())

269


## Scripting DeiT

In [5]:
model = torch.hub.load("facebookresearch/deit:main", "deit_base_patch16_224", pretrained=True)
model.eval()
scripted_model = torch.jit.script(model)
scripted_model.save("./data/fbdeit_scripted.pt")

Using cache found in C:\Users\SB15/.cache\torch\hub\facebookresearch_deit_main


## Quantizing DeiT

In [6]:
backend = "fbgemm"

model.qconfig = torch.quantization.get_default_qconfig(backend)
torch.backends.quantized.engine = backend

quantized_model = torch.quantization.quantize_dynamic(model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8)
scripted_quantized_model = torch.jit.script(quantized_model)
scripted_quantized_model.save("./data/fbdeit_scripted_quantized.pt")



In [7]:
out = scripted_quantized_model(img)
clsidx = torch.argmax(out)
print(clsidx.item())

269


## Optimizing DeiT

In [8]:
from torch.utils.mobile_optimizer import optimize_for_mobile
optimized_scripted_quantized_model = optimize_for_mobile(scripted_quantized_model)
optimized_scripted_quantized_model.save("./data/fbdeit_optimized_scripted_quantized.pt")

In [9]:
out = optimized_scripted_quantized_model(img)
clsidx = torch.argmax(out)
print(clsidx.item())

269


  return forward_call(*input, **kwargs)


## Using Lite Interpreter

In [11]:
optimized_scripted_quantized_model._save_for_lite_interpreter("./data/fbdeit_optimized_scripted_quantized_lite.ptl")
ptl = torch.jit.load("./data/fbdeit_optimized_scripted_quantized_lite.ptl")

## Comparing Inference Speed


In [13]:
with torch.autograd.profiler.profile(use_cuda=False) as prof1:
    out = model(img)
with torch.autograd.profiler.profile(use_cuda=False) as prof2:
    out = scripted_model(img)
with torch.autograd.profiler.profile(use_cuda=False) as prof3:
    out = scripted_quantized_model(img)
with torch.autograd.profiler.profile(use_cuda=False) as prof4:
    out = optimized_scripted_quantized_model(img)
with torch.autograd.profiler.profile(use_cuda=False) as prof5:
    out = ptl(img)

print("original model: {:.2f}ms".format(prof1.self_cpu_time_total/1000))
print("scripted model: {:.2f}ms".format(prof2.self_cpu_time_total/1000))
print("scripted & quantized model: {:.2f}ms".format(prof3.self_cpu_time_total/1000))
print("scripted & quantized & optimized model: {:.2f}ms".format(prof4.self_cpu_time_total/1000))
print("lite model: {:.2f}ms".format(prof5.self_cpu_time_total/1000))


original model: 317.48ms
scripted model: 474.38ms
scripted & quantized model: 212.56ms
scripted & quantized & optimized model: 132.56ms
lite model: 146.29ms


In [14]:
import pandas as pd
import numpy as np

df = pd.DataFrame({'Model': ['original model','scripted model', 'scripted & quantized model', 'scripted & quantized & optimized model', 'lite model']})
df = pd.concat([df, pd.DataFrame([
    ["{:.2f}ms".format(prof1.self_cpu_time_total/1000), "0%"],
    ["{:.2f}ms".format(prof2.self_cpu_time_total/1000),
     "{:.2f}%".format((prof1.self_cpu_time_total-prof2.self_cpu_time_total)/prof1.self_cpu_time_total*100)],
    ["{:.2f}ms".format(prof3.self_cpu_time_total/1000),
     "{:.2f}%".format((prof1.self_cpu_time_total-prof3.self_cpu_time_total)/prof1.self_cpu_time_total*100)],
    ["{:.2f}ms".format(prof4.self_cpu_time_total/1000),
     "{:.2f}%".format((prof1.self_cpu_time_total-prof4.self_cpu_time_total)/prof1.self_cpu_time_total*100)],
    ["{:.2f}ms".format(prof5.self_cpu_time_total/1000),
     "{:.2f}%".format((prof1.self_cpu_time_total-prof5.self_cpu_time_total)/prof1.self_cpu_time_total*100)]],
    columns=['Inference Time', 'Reduction'])], axis=1)

print(df)

                                    Model Inference Time Reduction
0                          original model       317.48ms        0%
1                          scripted model       474.38ms   -49.42%
2              scripted & quantized model       212.56ms    33.05%
3  scripted & quantized & optimized model       132.56ms    58.25%
4                              lite model       146.29ms    53.92%


'\n        Model                             Inference Time    Reduction\n0   original model                             1236.69ms           0%\n1   scripted model                             1226.72ms        0.81%\n2   scripted & quantized model                  593.19ms       52.03%\n3   scripted & quantized & optimized model      598.01ms       51.64%\n4   lite model                                  600.72ms       51.43%\n'