# Post training quantization

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.optim as optim
import torchvision

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## 量化的基本公式
 $$S=\frac{r_{max}-r_{min}}{q_{max}-q_{min}}$$
 $$Z = round(q_{max}-\frac{r_{max}}{S})$$
 $$r=S(q-Z)$$
 $$q=round(\frac{r}{S}+Z)$$

In [2]:
def calcScaleZeroPoint(min_val, max_val, num_bits=8):   # Calculate Zero_Point
    qmin = 0.
    qmax = 2. ** num_bits - 1.
    scale = float((max_val - min_val) / (qmax - qmin)) # S=(rmax-rmin)/(qmax-qmin)

    zero_point = qmax - max_val / scale    # Z=round(qmax-rmax/scale)

    if zero_point < qmin:
        zero_point = qmin
    elif zero_point > qmax:
        zero_point = qmax
    
    zero_point = int(zero_point)   # Integer

    return scale, zero_point

def quantize_tensor(x, scale, zero_point, num_bits=8, signed=False):   # 把tensor quantize
    if signed:
        qmin = - 2. ** (num_bits - 1)
        qmax = 2. ** (num_bits - 1) - 1
    else:
        qmin = 0.
        qmax = 2.**num_bits - 1.
 
    q_x = zero_point + x / scale
    q_x.clamp_(qmin, qmax).round_()     # q=round(r/S+Z)
    
    return q_x.float()  # 由于pytorch不支持int类型的运算，因此我们还是用float来表示整数

def dequantize_tensor(q_x, scale, zero_point):   # Dequantize
    return scale * (q_x - zero_point)

前面提到，在后训练量化过程中，需要先统计样本以及中间层的 min、max，同时也频繁涉及到一些量化、反量化操作，

因此我们可以把这些功能都封装成一个 `QParam` 类：update 函数就是用来统计 min、max 的。

In [3]:
class QParam:
    def __init__(self, num_bits=8):
        self.num_bits = num_bits
        self.scale = None
        self.zero_point = None
        self.min = None
        self.max = None

    def update(self, tensor):
        if self.max is None or self.max < tensor.max():
            self.max = tensor.max()
        self.max = 0 if self.max < 0 else self.max  # 这是什么语法？
        
        if self.min is None or self.min > tensor.min():
            self.min = tensor.min()
        self.min = 0 if self.min > 0 else self.min
        
        self.scale, self.zero_point = calcScaleZeroPoint(self.min, self.max, self.num_bits)
    
    def quantize_tensor(self, tensor):
        return quantize_tensor(tensor, self.scale, self.zero_point, num_bits=self.num_bits)

    def dequantize_tensor(self, q_x):
        return dequantize_tensor(q_x, self.scale, self.zero_point)

## 量化网络模块

下面要来实现一些最基本网络模块的量化形式，包括 *conv、relu、maxpooling* 以及 *fc* 层。

首先我们定义一个量化**基类**，这样可以减少一些重复代码，也能让代码结构更加清晰：这个基类规定了每个量化模块都需要提供的方法。

`__init__` 函数，除了指定量化的位数外，还需指定是否提供量化输入 (qi) 及输出参数 (qo)。在前面也提到，不是每一个网络模块都需要统计输入的 min、max，大部分中间层都是用上一层的 qo 来作为自己的 qi 的，另外有些中间层的激活函数也是直接用上一层的 qi 来作为自己的 qi 和 qo。

其次是 `freeze` 函数，这个函数会在统计完 min、max 后发挥作用。正如上文所说的，公式中有很多项是可以提前计算好的，`freeze` 就是把这些项提前固定下来，同时也将网络的权重由浮点实数转化为定点整数。

最后是 `quantize_inference`，这个函数主要是量化 inference 的时候会使用。实际 inference 的时候和正常的 forward 会有一些差异，可以根据之后的代码体会一下

In [4]:
class QModule(nn.Module):

    def __init__(self, qi=True, qo=True, num_bits=8):
        super(QModule, self).__init__()
        if qi:
            self.qi = QParam(num_bits=num_bits)
        if qo:
            self.qo = QParam(num_bits=num_bits)

    def freeze(self):
        pass

    def quantize_inference(self, x):
        raise NotImplementedError('quantize_inference should be implemented.')

### 量化卷积层
- QConv2d
- QLinear
- QReLU
- QMaxPooling2d
- QConvBNReLU

#### QConv2d
- 首先是 `__init__` 函数，可以看到我传入了一个 `conv_module` 模块，这个模块对应全精度的卷积层，
另外的 qw 参数则是用来统计 weight 的 min、max 以及对 weight 进行量化用的。
- 其次是 freeze 函数，这个函数主要就是计算公式中的 $M,q_w$ 以及 $q_b$ 。由于完全实现公式的加速效果需要更底层代码的支持，因此在 pytorch 中我用了更简单的实现方式，即优化前的公式:
$$q_a=M(\sum_i^N (q_w-Z_w)(q_x-Z_x)+q_b)+Z_a$$
注意到 freeze 函数可能会传入 qi 或者 qo，这也是之前提到的，有些中间的模块不会有自己的 qi，而是复用之前层的 qo 作为自己的 qi。
- 接着是 `forward` 函数，这个函数和正常的 forward 一样，也是在 float 上进行的，只不过需要统计输入输出以及 weight 的 min、max 而已。
有读者可能会疑惑为什么需要对 weight 量化到 int8 然后又反量化回 float，这里其实就是所谓的**伪量化节点**，因为我们在实际量化 inference 的时候会把 weight 量化到 int8，这个过程本身是有精度损失的 (来自四舍五入的 round 带来的截断误差)，所以在统计 min、max 的时候，需要把这个过程带来的误差也模拟进去。
- 最后是 `quantize_inference` 函数，这个函数在实际 inference 的时候会被调用，对应的就是上面的公式。注意，这个函数里面的卷积操作是在 int 上进行的，这是量化推理加速的关键「当然，由于 pytorch 的限制，我们仍然是在 float 上计算，只不过数值都是整数。这也可以看出量化推理是跟底层实现紧密结合的技术」。

In [5]:
from torch.autograd import Function
class FakeQuantize(Function):

    @staticmethod
    def forward(ctx, x, qparam):
        x = qparam.quantize_tensor(x)
        x = qparam.dequantize_tensor(x)
        return x

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output, None

class QConv2d(QModule):

    def __init__(self, conv_module, qi=True, qo=True, num_bits=8):
        super(QConv2d, self).__init__(qi=qi, qo=qo, num_bits=num_bits)
        self.num_bits = num_bits
        self.conv_module = conv_module
        self.qw = QParam(num_bits=num_bits)

    def freeze(self, qi=None, qo=None):
        
        if hasattr(self, 'qi') and qi is not None:         # hasattr(object, name)  如果对象有该属性返回 True，否则返回 False。
            raise ValueError('qi has been provided in init function.')
        if not hasattr(self, 'qi') or qi is None:
            raise ValueError('qi is not existed, should be provided.')

        if hasattr(self, 'qo') and qo is not None:
            raise ValueError('qo has been provided in init function.')
        if not hasattr(self, 'qo') or qo is None:
            raise ValueError('qo is not existed, should be provided.')

        if qi is not None:
            self.qi = qi
        if qo is not None:
            self.qo = qo
        self.M = self.qw.scale * self.qi.scale / self.qo.scale  # Calculate M

        self.conv_module.weight.data = self.qw.quantize_tensor(self.conv_module.weight.data) # 把权重量化  标准：qw的参数
        self.conv_module.weight.data = self.conv_module.weight.data - self.qw.zero_point     # 减去零点

        self.conv_module.bias.data = quantize_tensor(self.conv_module.bias.data, scale=self.qi.scale * self.qw.scale,
                                                     zero_point=0, num_bits=32, signed=True)

    def forward(self, x):
        if hasattr(self, 'qi'):
            self.qi.update(x)
            x = FakeQuantize.apply(x, self.qi)

        self.qw.update(self.conv_module.weight.data)

        x = F.conv2d(x, FakeQuantize.apply(self.conv_module.weight, self.qw), self.conv_module.bias, 
                     stride=self.conv_module.stride,
                     padding=self.conv_module.padding, dilation=self.conv_module.dilation, 
                     groups=self.conv_module.groups)

        if hasattr(self, 'qo'):
            self.qo.update(x)
            x = FakeQuantize.apply(x, self.qo)

        return x

    def quantize_inference(self, x):
        x = x - self.qi.zero_point
        x = self.conv_module(x)
        x = self.M * x
        x.round_() 
        x = x + self.qo.zero_point        
        x.clamp_(0., 2.**self.num_bits-1.).round_()
        return x


In [6]:
class QLinear(QModule):

    def __init__(self, fc_module, qi=True, qo=True, num_bits=8):
        super(QLinear, self).__init__(qi=qi, qo=qo, num_bits=num_bits)
        self.num_bits = num_bits
        self.fc_module = fc_module
        self.qw = QParam(num_bits=num_bits)

    def freeze(self, qi=None, qo=None):

        if hasattr(self, 'qi') and qi is not None:
            raise ValueError('qi has been provided in init function.')
        if not hasattr(self, 'qi') and qi is None:
            raise ValueError('qi is not existed, should be provided.')

        if hasattr(self, 'qo') and qo is not None:
            raise ValueError('qo has been provided in init function.')
        if not hasattr(self, 'qo') and qo is None:
            raise ValueError('qo is not existed, should be provided.')

        if qi is not None:
            self.qi = qi
        if qo is not None:
            self.qo = qo
        self.M = self.qw.scale * self.qi.scale / self.qo.scale

        self.fc_module.weight.data = self.qw.quantize_tensor(self.fc_module.weight.data)
        self.fc_module.weight.data = self.fc_module.weight.data - self.qw.zero_point
        self.fc_module.bias.data = quantize_tensor(self.fc_module.bias.data, scale=self.qi.scale * self.qw.scale,
                                                   zero_point=0, num_bits=32, signed=True)

    def forward(self, x):
        if hasattr(self, 'qi'):
            self.qi.update(x)
            x = FakeQuantize.apply(x, self.qi)

        self.qw.update(self.fc_module.weight.data)

        x = F.linear(x, FakeQuantize.apply(self.fc_module.weight, self.qw), self.fc_module.bias)

        if hasattr(self, 'qo'):
            self.qo.update(x)
            x = FakeQuantize.apply(x, self.qo)

        return x

    def quantize_inference(self, x):
        x = x - self.qi.zero_point
        x = self.fc_module(x)
        x = self.M * x
        x.round_() 
        x = x + self.qo.zero_point
        x.clamp_(0., 2.**self.num_bits-1.).round_()
        return x


In [7]:
class QReLU(QModule):

    def __init__(self, qi=False, num_bits=None):
        super(QReLU, self).__init__(qi=qi, num_bits=num_bits)

    def freeze(self, qi=None):
        
        if hasattr(self, 'qi') and qi is not None:
            raise ValueError('qi has been provided in init function.')
        if not hasattr(self, 'qi') and qi is None:
            raise ValueError('qi is not existed, should be provided.')

        if qi is not None:
            self.qi = qi

    def forward(self, x):
        if hasattr(self, 'qi'):
            self.qi.update(x)
            x = FakeQuantize.apply(x, self.qi)

        x = F.relu(x)

        return x
    
    def quantize_inference(self, x):
        x = x.clone()
        x[x < self.qi.zero_point] = self.qi.zero_point
        return x

In [8]:
class QMaxPooling2d(QModule):

    def __init__(self, kernel_size=2, stride=2, padding=0, qi=False, num_bits=None):
        super(QMaxPooling2d, self).__init__(qi=qi, num_bits=num_bits)
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

    def freeze(self, qi=None):
        if hasattr(self, 'qi') and qi is not None:
            raise ValueError('qi has been provided in init function.')
        if not hasattr(self, 'qi') and qi is None:
            raise ValueError('qi is not existed, should be provided.')
        if qi is None:
            self.qi = qi

    def forward(self, x):
        if hasattr(self, 'qi'):
            self.qi.update(x)
            x = FakeQuantize.apply(x, self.qi)

        x = F.max_pool2d(x, self.kernel_size, self.stride, self.padding)

        return x

    def quantize_inference(self, x):
        return F.max_pool2d(x, self.kernel_size, self.stride, self.padding)


## Net

In [9]:

from collections import OrderedDict
class Net(nn.Module):

    def __init__(self, num_channels=1):
        super(Net, self).__init__()
        self.cnn = nn.Sequential(OrderedDict([
            ('conv1',nn.Conv2d(1, 16, kernel_size=3, stride=1,padding=1)),
            ('relu1',nn.ReLU()),
            ('max1',nn.MaxPool2d(2)),
            ('conv2',nn.Conv2d(16, 32, kernel_size=3, stride=1,padding=1)),
            ('relu2',nn.ReLU()),
            ('max2',nn.MaxPool2d(2)),
            ('Flat',nn.Flatten()),
            ('Softmax',nn.Linear(32*7*7, 10))
        ]))
        
    def forward(self, x):
        return self.cnn(x)

    def quantize(self, num_bits=8):
        self.qconv1 = QConv2d(self.cnn[0], qi=True, qo=True, num_bits=num_bits)
        self.qrelu1 = QReLU()
        self.qmaxpool2d_1 = QMaxPooling2d(kernel_size=2, stride=0, padding=0)
        self.qconv2 = QConv2d(self.cnn[3], qi=False, qo=True, num_bits=num_bits)
        self.qrelu2 = QReLU()
        self.qmaxpool2d_2 = QMaxPooling2d(kernel_size=2, stride=0, padding=0)
        self.qfc = QLinear(self.cnn[7], qi=False, qo=True, num_bits=num_bits)

    def quantize_forward(self, x):
        x = self.qconv1(x)
        x = self.qrelu1(x)
        x = self.qmaxpool2d_1(x)
        x = self.qconv2(x)
        x = self.qrelu2(x)
        x = self.qmaxpool2d_2(x)
        x = x.view(-1, 32*7*7)
        x = self.qfc(x)
        return x

    def freeze(self):
        self.qconv1.freeze()
        self.qrelu1.freeze(self.qconv1.qo)
        self.qmaxpool2d_1.freeze(self.qconv1.qo)
        self.qconv2.freeze(qi=self.qconv1.qo)
        self.qrelu2.freeze(self.qconv2.qo)
        self.qmaxpool2d_2.freeze(self.qconv2.qo)
        self.qfc.freeze(qi=self.qconv2.qo)

    def quantize_inference(self, x):
        qx = self.qconv1.qi.quantize_tensor(x)
        qx = self.qconv1.quantize_inference(qx)
        qx = self.qrelu1.quantize_inference(qx)
        qx = self.qmaxpool2d_1.quantize_inference(qx)
        qx = self.qconv2.quantize_inference(qx)
        qx = self.qrelu2.quantize_inference(qx)
        qx = self.qmaxpool2d_2.quantize_inference(qx)
        qx = qx.view(-1, 32*7*7)
        qx = self.qfc.quantize_inference(qx)
        out = self.qfc.qo.dequantize_tensor(qx)
        return out

## Train

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_loader = torch.utils.data.DataLoader(
  torchvision.datasets.MNIST('data', train=True, download=True, 
                 transform=transforms.Compose([
                   transforms.ToTensor(),
                   transforms.Normalize((0.1307,), (0.3081,))
                 ])),
  batch_size=64, shuffle=True, num_workers=0
)

test_loader = torch.utils.data.DataLoader(
  torchvision.datasets.MNIST('data', train=False, transform=transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
  ])),
  batch_size=64, shuffle=True, num_workers=0
)

model = Net().to(device)

  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [11]:
from torchsummaryX import summary
summary(Net(),torch.zeros(1,1,28,28).to("cpu"))

                        Kernel Shape     Output Shape  Params Mult-Adds
Layer                                                                  
0_cnn.Conv2d_conv1     [1, 16, 3, 3]  [1, 16, 28, 28]   160.0  112.896k
1_cnn.ReLU_relu1                   -  [1, 16, 28, 28]       -         -
2_cnn.MaxPool2d_max1               -  [1, 16, 14, 14]       -         -
3_cnn.Conv2d_conv2    [16, 32, 3, 3]  [1, 32, 14, 14]   4.64k  903.168k
4_cnn.ReLU_relu2                   -  [1, 32, 14, 14]       -         -
5_cnn.MaxPool2d_max2               -    [1, 32, 7, 7]       -         -
6_cnn.Flatten_Flat                 -        [1, 1568]       -         -
7_cnn.Linear_Softmax      [1568, 10]          [1, 10]  15.69k    15.68k
-------------------------------------------------------------------------
                         Totals
Total params             20.49k
Trainable params         20.49k
Non-trainable params        0.0
Mult-Adds             1.031744M


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
  df_sum = df.sum()


Unnamed: 0_level_0,Kernel Shape,Output Shape,Params,Mult-Adds
Layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0_cnn.Conv2d_conv1,"[1, 16, 3, 3]","[1, 16, 28, 28]",160.0,112896.0
1_cnn.ReLU_relu1,-,"[1, 16, 28, 28]",,
2_cnn.MaxPool2d_max1,-,"[1, 16, 14, 14]",,
3_cnn.Conv2d_conv2,"[16, 32, 3, 3]","[1, 32, 14, 14]",4640.0,903168.0
4_cnn.ReLU_relu2,-,"[1, 32, 14, 14]",,
5_cnn.MaxPool2d_max2,-,"[1, 32, 7, 7]",,
6_cnn.Flatten_Flat,-,"[1, 1568]",,
7_cnn.Linear_Softmax,"[1568, 10]","[1, 10]",15690.0,15680.0


In [12]:
print(model)

Net(
  (cnn): Sequential(
    (conv1): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu1): ReLU()
    (max1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (relu2): ReLU()
    (max2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (Flat): Flatten(start_dim=1, end_dim=-1)
    (Softmax): Linear(in_features=1568, out_features=10, bias=True)
  )
)


In [13]:
model.cnn[2]

MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)

In [14]:
def train(model, device, train_loader, optimizer, epoch):
    model.train()
    lossLayer = torch.nn.CrossEntropyLoss()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = lossLayer(output, target)
        loss.backward()
        optimizer.step()

        if batch_idx % 50 == 0:
            print('Train Epoch: {} [{}/{}]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset), loss.item()
            ))

def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    lossLayer = torch.nn.CrossEntropyLoss(reduction='sum')
    for data, target in test_loader:
        data, target = data.to(device), target.to(device)
        output = model(data)
        test_loss += lossLayer(output, target).item()
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
    
    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {:.0f}%\n'.format(
        test_loss, 100. * correct / len(test_loader.dataset)
    ))


if __name__ == "__main__":
    batch_size = 64
    test_batch_size = 64
    seed = 1
    epochs = 15
    lr = 0.01
    momentum = 0.5
    save_model = True
    using_bn = False

    torch.manual_seed(seed)
    if using_bn:
        model = NetBN().to(device)
    else:
        model = Net().to(device)

    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)

    for epoch in range(1, epochs + 1):
        train(model, device, train_loader, optimizer, epoch)
        test(model, device, test_loader)
    

    torch.save(model.state_dict(), './mnist_cnn.pt')

Train Epoch: 1 [0/60000]	Loss: 2.306624
Train Epoch: 1 [3200/60000]	Loss: 0.759732
Train Epoch: 1 [6400/60000]	Loss: 0.558619
Train Epoch: 1 [9600/60000]	Loss: 0.530320
Train Epoch: 1 [12800/60000]	Loss: 0.356621
Train Epoch: 1 [16000/60000]	Loss: 0.214483
Train Epoch: 1 [19200/60000]	Loss: 0.208317
Train Epoch: 1 [22400/60000]	Loss: 0.286776
Train Epoch: 1 [25600/60000]	Loss: 0.156135
Train Epoch: 1 [28800/60000]	Loss: 0.135184
Train Epoch: 1 [32000/60000]	Loss: 0.172798
Train Epoch: 1 [35200/60000]	Loss: 0.161894
Train Epoch: 1 [38400/60000]	Loss: 0.125891
Train Epoch: 1 [41600/60000]	Loss: 0.163238
Train Epoch: 1 [44800/60000]	Loss: 0.094096
Train Epoch: 1 [48000/60000]	Loss: 0.190847
Train Epoch: 1 [51200/60000]	Loss: 0.110937
Train Epoch: 1 [54400/60000]	Loss: 0.124313
Train Epoch: 1 [57600/60000]	Loss: 0.117271

Test set: Average loss: 0.1254, Accuracy: 96%

Train Epoch: 2 [0/60000]	Loss: 0.053423
Train Epoch: 2 [3200/60000]	Loss: 0.141812
Train Epoch: 2 [6400/60000]	Loss: 0.2387

Train Epoch: 10 [22400/60000]	Loss: 0.065324
Train Epoch: 10 [25600/60000]	Loss: 0.043995
Train Epoch: 10 [28800/60000]	Loss: 0.020818
Train Epoch: 10 [32000/60000]	Loss: 0.084497
Train Epoch: 10 [35200/60000]	Loss: 0.037622
Train Epoch: 10 [38400/60000]	Loss: 0.005648
Train Epoch: 10 [41600/60000]	Loss: 0.087433
Train Epoch: 10 [44800/60000]	Loss: 0.055323
Train Epoch: 10 [48000/60000]	Loss: 0.142046
Train Epoch: 10 [51200/60000]	Loss: 0.014084
Train Epoch: 10 [54400/60000]	Loss: 0.013668
Train Epoch: 10 [57600/60000]	Loss: 0.010178

Test set: Average loss: 0.0365, Accuracy: 99%

Train Epoch: 11 [0/60000]	Loss: 0.022897
Train Epoch: 11 [3200/60000]	Loss: 0.020405
Train Epoch: 11 [6400/60000]	Loss: 0.002943
Train Epoch: 11 [9600/60000]	Loss: 0.001846
Train Epoch: 11 [12800/60000]	Loss: 0.034315
Train Epoch: 11 [16000/60000]	Loss: 0.033893
Train Epoch: 11 [19200/60000]	Loss: 0.004896
Train Epoch: 11 [22400/60000]	Loss: 0.013451
Train Epoch: 11 [25600/60000]	Loss: 0.053857
Train Epoch: 1

## 后训练量化

In [15]:
model = Net()
model.load_state_dict(torch.load('./mnist_cnn.pt'))
model.quantize(num_bits=8)

def direct_quantize(model, test_loader):
    for i, (data, target) in enumerate(test_loader, 1):
        output = model.quantize_forward(data)
        if i % 200 == 0:
            break
    print('direct quantization finish')
    
model.freeze()

def quantize_inference(model, test_loader):
    correct = 0
    for i, (data, target) in enumerate(test_loader, 1):
        output = model.quantize_inference(data)
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
    print('\nTest set: Quant Model Accuracy: {:.0f}%\n'.format(100. * correct / len(test_loader.dataset)))

quantize_inference(model, test_loader)

ValueError: qi is not existed, should be provided.

In [None]:
model

In [130]:
model.qconv1

QConv2d(
  (conv_module): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)

In [16]:
import torch.nn as nn
import torch.nn.functional as F

# some base_op, such as ``Add``、``Concat``
from micronet.base_module.op import *

# ``quantize`` is quant_module, ``QuantConv2d``, ``QuantLinear``, ``QuantMaxPool2d``, ``QuantReLU`` are quant_op
from micronet.compression.quantization.wbwtab.quantize import QuantConv2d as quant_conv_wbwtab
from micronet.compression.quantization.wbwtab.quantize import ActivationQuantizer as quant_relu_wbwtab
from micronet.compression.quantization.wqaq.dorefa.quantize import QuantConv2d as quant_conv_dorefa
from micronet.compression.quantization.wqaq.dorefa.quantize import QuantLinear as quant_linear_dorefa
from micronet.compression.quantization.wqaq.iao.quantize import QuantConv2d as quant_conv_iao
from micronet.compression.quantization.wqaq.iao.quantize import QuantLinear as quant_linear_iao
from micronet.compression.quantization.wqaq.iao.quantize import QuantMaxPool2d as quant_max_pool_iao
from micronet.compression.quantization.wqaq.iao.quantize import QuantReLU as quant_relu_iao


class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)
        self.max_pool = nn.MaxPool2d(kernel_size=2)
        self.relu = nn.ReLU(inplace=True)
        
    def forward(self, x):
        x = self.relu(self.max_pool(self.conv1(x)))
        x = self.relu(self.max_pool(self.conv2(x)))
        x = x.view(-1, 320)
        x = self.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

class QuantLeNetWbWtAb(nn.Module):
    def __init__(self):
        super(QuantLeNetWbWtAb, self).__init__()
        self.conv1 = quant_conv_wbwtab(1, 10, kernel_size=5)
        self.conv2 = quant_conv_wbwtab(10, 20, kernel_size=5)
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)
        self.max_pool = nn.MaxPool2d(kernel_size=2)
        self.relu = quant_relu_wbwtab()

    def forward(self, x):
        x = self.relu(self.max_pool(self.conv1(x)))
        x = self.relu(self.max_pool(self.conv2(x)))
        x = x.view(-1, 320)
        x = self.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

class QuantLeNetDoReFa(nn.Module):
    def __init__(self):
        super(QuantLeNetDoReFa, self).__init__()
        self.conv1 = quant_conv_dorefa(1, 10, kernel_size=5)
        self.conv2 = quant_conv_dorefa(10, 20, kernel_size=5)
        self.fc1 = quant_linear_dorefa(320, 50)
        self.fc2 = quant_linear_dorefa(50, 10)
        self.max_pool = nn.MaxPool2d(kernel_size=2)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        x = self.relu(self.max_pool(self.conv1(x)))
        x = self.relu(self.max_pool(self.conv2(x)))
        x = x.view(-1, 320)
        x = self.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

class QuantLeNetIAO(nn.Module):
    def __init__(self):
        super(QuantLeNetIAO, self).__init__()
        self.conv1 = quant_conv_iao(1, 10, kernel_size=5)
        self.conv2 = quant_conv_iao(10, 20, kernel_size=5)
        self.fc1 = quant_linear_iao(320, 50)
        self.fc2 = quant_linear_iao(50, 10)
        self.max_pool = quant_max_pool_iao(kernel_size=2)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        x = self.relu(self.max_pool(self.conv1(x)))
        x = self.relu(self.max_pool(self.conv2(x)))
        x = x.view(-1, 320)
        x = self.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

lenet = LeNet()
quant_lenet_wbwtab = QuantLeNetWbWtAb()
quant_lenet_dorefa = QuantLeNetDoReFa()
quant_lenet_iao = QuantLeNetIAO()

print('***ori_model***\n', lenet)
print('\n***quant_model_wbwtab***\n', quant_lenet_wbwtab)
print('\n***quant_model_dorefa***\n', quant_lenet_dorefa)
print('\n***quant_model_iao***\n', quant_lenet_iao)

print('\nquant_model is ready')
print('micronet is ready')


***ori_model***
 LeNet(
  (conv1): Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(10, 20, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=320, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (max_pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (relu): ReLU(inplace=True)
)

***quant_model_wbwtab***
 QuantLeNetWbWtAb(
  (conv1): QuantConv2d(
    1, 10, kernel_size=(5, 5), stride=(1, 1)
    (weight_quantizer): WeightQuantizer()
  )
  (conv2): QuantConv2d(
    10, 20, kernel_size=(5, 5), stride=(1, 1)
    (weight_quantizer): WeightQuantizer()
  )
  (fc1): Linear(in_features=320, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (max_pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (relu): ActivationQuantizer(
    (relu): ReLU(inplace=True)
  )
)

***quant_model_dorefa***
 QuantLeNetDoReFa(
  (conv1): Quan