# 尝试自动量化

## 模型导入

以 PyTorch 前端为例阐述 TVM 自动量化机制。

创建由单层卷积、批量归一化与 ReLU 组成的简单模型：

In [1]:
from torch import nn

class Model(nn.Module):
    """Minimal Conv2d -> BatchNorm2d -> ReLU network used as the quantization example."""

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # 3 input channels -> 16 output channels, 3x3 kernel, stride 1, padding 1.
        self.conv = nn.Conv2d(
            in_channels=3,
            out_channels=16,
            kernel_size=3,
            stride=1,
            padding=1,
        )
        self.bn = nn.BatchNorm2d(num_features=16)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply convolution, batch normalization and ReLU, in that order."""
        return self.relu(self.bn(self.conv(x)))

TVM 接受 {func}`torch.jit.trace` 后的模型：

In [2]:
import torch
from tvm import relay

# Seed PyTorch so the randomly initialized weights and the random tracing
# input are identical on every run; otherwise the accuracy figures reported
# later in the notebook change after each kernel restart.
torch.manual_seed(0)

pt_model = Model().eval().float()
ishape = (1, 3, 4, 4)
input_shapes = [("data", ishape)]
# script_module = torch.jit.script(pt_model)
# mod, params = relay.frontend.from_pytorch(script_module, input_shapes)
# Example input used only for tracing; the dtype is spelled out so it always
# matches the float32 model.
idata = torch.rand(ishape, dtype=torch.float32)
traced_model = torch.jit.trace(pt_model, idata)
# Translate traced_model into a TVM (Relay) front-end module.
mod, params = relay.frontend.from_pytorch(traced_model, input_shapes, use_parser_friendly_name=True)
print(mod["main"])

fn (%data: Tensor[(1, 3, 4, 4), float32] /* span=aten___convolution_0_data:0:0 */, %aten___convolution_0_weight: Tensor[(16, 3, 3, 3), float32] /* span=aten___convolution_0_weight:0:0 */, %aten___convolution_0_bias: Tensor[(16), float32] /* span=aten___convolution_0_bias:0:0 */, %aten__batch_norm_0_weight: Tensor[(16), float32] /* span=aten__batch_norm_0_weight:0:0 */, %aten__batch_norm_0_bias: Tensor[(16), float32] /* span=aten__batch_norm_0_bias:0:0 */, %aten__batch_norm_0_mean: Tensor[(16), float32] /* span=aten__batch_norm_0_mean:0:0 */, %aten__batch_norm_0_var: Tensor[(16), float32] /* span=aten__batch_norm_0_var:0:0 */) {
  %0 = nn.conv2d(%data, %aten___convolution_0_weight, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* span=aten___convolution_0:0:0 */;
  %1 = nn.bias_add(%0, %aten___convolution_0_bias) /* span=aten___convolution_0:0:0 */;
  %2 = nn.batch_norm(%1, %aten__batch_norm_0_weight, %aten__batch_norm_0_bias, %aten__batch_norm_0_mean, %aten__batch_norm_0_var) 

量化 TVM 模型：

In [3]:
import tvm
# Enter the pass context and the quantization config in a single `with`.
# skip_conv_layers is given an empty list; the printed function shows the
# convolution operating on int8 inputs.
with tvm.transform.PassContext(opt_level=3), relay.quantize.qconfig(skip_conv_layers=[]):
    qmod = relay.quantize.quantize(mod, params)
print(qmod["main"])

fn (%data: Tensor[(1, 3, 4, 4), float32] /* ty=Tensor[(1, 3, 4, 4), float32] span=aten___convolution_0_data:0:0 */) -> Tensor[(1, 16, 4, 4), float32] {
  %0 = multiply(%data, 16f /* ty=float32 */) /* ty=Tensor[(1, 3, 4, 4), float32] */;
  %1 = round(%0) /* ty=Tensor[(1, 3, 4, 4), float32] */;
  %2 = clip(%1, a_min=-127f, a_max=127f) /* ty=Tensor[(1, 3, 4, 4), float32] */;
  %3 = cast(%2, dtype="int8") /* ty=Tensor[(1, 3, 4, 4), int8] */;
  %4 = nn.conv2d(%3, meta[relay.Constant][0] /* ty=Tensor[(16, 3, 3, 3), int8] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3], out_dtype="int32") /* ty=Tensor[(1, 16, 4, 4), int32] */;
  %5 = add(%4, meta[relay.Constant][1] /* ty=Tensor[(16, 1, 1), int32] */) /* ty=Tensor[(1, 16, 4, 4), int32] */;
  %6 = add(%5, meta[relay.Constant][2] /* ty=Tensor[(16, 1, 1), int32] */) /* ty=Tensor[(1, 16, 4, 4), int32] */;
  %7 = nn.relu(%6) /* ty=Tensor[(1, 16, 4, 4), int32] */;
  %8 = add(%7, 256 /* ty=int32 */) /* ty=Tensor[(1, 16, 4, 4), int32] */;
  

## 验证结果

In [4]:
import numpy as np
# CPU device handle passed to the VirtualMachine instances created below.
dev = tvm.cpu()
# Draw the validation input from a seeded generator so the float/quantized
# comparison yields the same numbers on every run.
rng = np.random.default_rng(0)
data_np = rng.uniform(low=-1, high=1, size=[1, 3, 4, 4]).astype("float32")
# The key matches the input name "data" declared when the model was imported.
input_dict = {"data": data_np}

量化前结果：

In [5]:
# Compile the pre-quantization module for the Relay VM with the "llvm" target.
with tvm.transform.PassContext(opt_level=3):
    vm_exec = relay.vm.compile(mod, target="llvm", params=params)
vm = tvm.runtime.vm.VirtualMachine(vm_exec, dev)
# Bind the inputs of "main" by name, then execute it.
vm.set_input("main", **input_dict)
tvm_res = vm.run()

One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.


量化后结果：

In [6]:
# Compile the quantized module for the Relay VM with the "llvm" target.
# NOTE(review): the printed qmod["main"] only takes %data, so passing `params`
# here looks redundant — confirm before removing.
with tvm.transform.PassContext(opt_level=3):
    qvm_exec = relay.vm.compile(qmod, target="llvm", params=params)
qvm = tvm.runtime.vm.VirtualMachine(qvm_exec, dev)
# Feed the same input dictionary as the float run so the outputs are comparable.
qvm.set_input("main", **input_dict)
tvm_qres = qvm.run()

对比 Torch 结果与 TVM 浮点结果：

In [7]:
# Reference output from the traced PyTorch model; gradients are not needed.
with torch.no_grad():
    torch_res = traced_model(torch.from_numpy(data_np))
# The float TVM output must agree with PyTorch within 1e-5 (relative and absolute).
np.testing.assert_allclose(
    actual=tvm_res.numpy(),
    desired=torch_res.numpy(),
    rtol=1e-5,
    atol=1e-5,
)

查看量化前后的余弦相似度与 $L2$ 损失：

In [8]:
from tvm_book.testing.metric import cosine_similarity, l2_loss

In [9]:
# Compare the float and quantized VM outputs; the cell displays the pair
# (cosine similarity, L2 loss).
similarity = cosine_similarity(tvm_res.numpy(), tvm_qres.numpy())
loss = l2_loss(tvm_res.numpy(), tvm_qres.numpy())
(similarity, loss)

(0.9974910020828247, 0.0001684804738033563)

## 源码解析

可以打印完整的量化流程：

In [10]:
@tvm.instrument.pass_instrument
class PrintIR:
    """Pass instrument that prints the pass info and the "main" function before each pass."""

    def run_before_pass(self, mod, info):
        """Print the pass info, then the current "main" function of ``mod``."""
        header = f"运行 pass: {info}"
        print(header)
        print(mod["main"])


# Re-run quantization with a PrintIR instrument attached to the pass context,
# so the IR is printed before every pass.
pass_ctx = tvm.transform.PassContext(opt_level=3, instruments=[PrintIR()])
with pass_ctx, relay.quantize.qconfig(skip_conv_layers=[]):
    qmod = relay.quantize.quantize(mod, params)

运行 pass: The meta data of the pass - pass name: sequential, opt_level: 0, required passes: []

fn (%data: Tensor[(1, 3, 4, 4), float32] /* span=aten___convolution_0_data:0:0 */) {
  %0 = nn.conv2d(%data, meta[relay.Constant][0], padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* span=aten___convolution_0:0:0 */;
  %1 = nn.bias_add(%0, meta[relay.Constant][1]) /* span=aten___convolution_0:0:0 */;
  %2 = nn.batch_norm(%1, meta[relay.Constant][2], meta[relay.Constant][3], meta[relay.Constant][4], meta[relay.Constant][5]) /* span=aten__batch_norm_0:0:0 */;
  %3 = %2.0 /* span=aten__batch_norm_0:0:0 */;
  nn.relu(%3) /* span=aten__relu_0:0:0 */
} /* ty=fn (Tensor[(1, 3, 4, 4), float32]) -> meta[IncompleteType][0] */

运行 pass: The meta data of the pass - pass name: InferType, opt_level: 0, required passes: []

fn (%data: Tensor[(1, 3, 4, 4), float32] /* span=aten___convolution_0_data:0:0 */) {
  %0 = nn.conv2d(%data, meta[relay.Constant][0], padding=[1, 1, 1, 1], channels=16, kernel_s