In [3]:
import torch
import numpy as np
import torchvision
from torchvision import datasets, transforms
import torchvision.transforms as transforms
# Device selection: set `no_cuda` to True to stay on the CPU even if CUDA is available.
no_cuda = False
if no_cuda:
    use_gpu = False
else:
    use_gpu = torch.cuda.is_available()
device = torch.device("cuda") if use_gpu else torch.device("cpu")
print(device)

cuda


### Load dataset

In [2]:
batch_size = 16

# Both splits are read from the 'data' directory with the same ToTensor() transform.
to_tensor = transforms.ToTensor()

# Only the training split requests a download; the test split is built without download=True.
trainset = datasets.FashionMNIST('data', train=True, download=True, transform=to_tensor)
testset = datasets.FashionMNIST('data', train=False, transform=to_tensor)

# Training batches are shuffled; test samples are served one at a time in fixed order.
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=1, shuffle=False)

print('Train: {}\nTest: {}'.format(len(trainset), len(testset)))

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to data\FashionMNIST\raw\train-images-idx3-ubyte.gz


100.0%


Extracting data\FashionMNIST\raw\train-images-idx3-ubyte.gz to data\FashionMNIST\raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to data\FashionMNIST\raw\train-labels-idx1-ubyte.gz


100.0%


Extracting data\FashionMNIST\raw\train-labels-idx1-ubyte.gz to data\FashionMNIST\raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to data\FashionMNIST\raw\t10k-images-idx3-ubyte.gz


100.0%


Extracting data\FashionMNIST\raw\t10k-images-idx3-ubyte.gz to data\FashionMNIST\raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to data\FashionMNIST\raw\t10k-labels-idx1-ubyte.gz


100.0%

Extracting data\FashionMNIST\raw\t10k-labels-idx1-ubyte.gz to data\FashionMNIST\raw

Train: 60000
Test: 10000





### Define model

In [4]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Small CNN classifier: two conv/ReLU/max-pool stages, then three linear layers.

    `fc1` expects 16*6*6 flattened features, which is what the two conv/pool
    stages produce for a 1x28x28 input (28 -> 28 -> 14 -> 12 -> 6).
    The forward pass ends with a softmax over dim 1, so it returns
    per-class probabilities rather than raw logits.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor
        self.conv1 = nn.Conv2d(1, 6, 3, padding=1)
        self.conv2 = nn.Conv2d(6, 16, 3, padding=0)
        # A single ReLU and a single pooling module are reused at every stage.
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(2, 2)
        # Classifier head
        self.fc1 = nn.Linear(16*6*6, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class probabilities of shape (batch, 10) for image batch `x`."""
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        # Keep the batch dimension, flatten everything else for the linear layers.
        x = torch.flatten(x, 1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.softmax(self.fc3(x))

# Instantiate the float network and place it on the device selected in the first cell.
model_to_quantize = Net().to(device)

### Load pretrained model

In [6]:
# Load the pretrained weights.
# - map_location lets a checkpoint that was saved on a GPU machine load on a
#   CPU-only one (and vice versa).
# - strict=False is kept, but the key mismatches it would otherwise hide are
#   reported, so a wrong checkpoint cannot silently leave layers at their
#   random initialisation.
state_dict = torch.load('./model/mnist.pth', map_location=device)
load_result = model_to_quantize.load_state_dict(state_dict, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print('WARNING: checkpoint and model keys do not match:', load_result)
# Inference mode; also the cell's displayed value (the module repr).
model_to_quantize.eval()

Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (relu): ReLU()
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
  (softmax): Softmax(dim=1)
)

## Quantize

In [5]:
from torch.ao.quantization import get_default_qconfig
from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx
from torch.ao.quantization import QConfigMapping

### set quantization config and prepare model

In [6]:
# Quantization configuration; swap in one of the alternatives from the string below to experiment.
qconfig = get_default_qconfig(backend='qnnpack')
""" (below is example of different configuration)
qconfig = get_default_qconfig("fbgemm")
qconfig = torch.ao.quantization.default_qconfig
qconfig = torch.ao.quantization.qconfig.QConfig(
    activation=torch.ao.quantization.observer.HistogramObserver.with_args(
        qscheme=torch.per_tensor_symmetric, 
        dtype=torch.qint8, 
    ),
    weight=torch.ao.quantization.observer.PerChannelMinMaxObserver.with_args(
        #ch_axis=1,  
        qscheme=torch.per_channel_symmetric,
        dtype=torch.qint8,
        ))
"""
# Register the chosen qconfig as the global (default) entry of the mapping.
qconfig_mapping = QConfigMapping()
qconfig_mapping.set_global(qconfig)

In [7]:
# prepare_fx takes `example_inputs` as a tuple of positional arguments for
# model.forward. Parentheses alone do not make a tuple, so the trailing comma
# is required to wrap the image batch in a 1-tuple.
example_batch, _ = next(iter(trainloader))
example_inputs = (example_batch,)  # lets prepare_fx know the model input data type
# Prepare the model for quantization: fuse modules (e.g. Conv+BN+ReLU) and insert observers.
prepared_model = prepare_fx(model_to_quantize, qconfig_mapping, example_inputs)

Please use torch.ao.quantization.get_default_qconfig_mapping or torch.ao.quantization.get_default_qat_qconfig_mapping. Example:
    qconfig_mapping = get_default_qconfig_mapping("fbgemm")
    model = prepare_fx(model, qconfig_mapping, example_inputs)


### Calibration (use representative data)

In [8]:
def calibrate(model, device, data_loader, max_batches=None):
    """Run batches from `data_loader` through `model` without tracking gradients.

    In this notebook the model is the output of `prepare_fx`, so these forward
    passes are what feed data to the inserted observers.

    Args:
        model: module to run; it is moved to `device` and switched to eval mode.
        device: device the model and every input batch are moved to.
        data_loader: iterable yielding `(data, target)` pairs. Targets are
            ignored, so they do not need to be tensors.
        max_batches: optional cap on the number of batches that are run.
            `None` (the default) consumes the whole loader.
    """
    model.to(device)
    model.eval()
    with torch.no_grad():
        for batch_idx, (data, _target) in enumerate(data_loader):
            if max_batches is not None and batch_idx >= max_batches:
                break
            # Only the inputs are needed; the model output is discarded.
            model(data.to(device))
# NOTE(review): calibration reads `testloader`, the same data `compare` is later
# evaluated on — confirm this is intended rather than a training/held-out subset.
calibrate(prepared_model, 'cpu', testloader)  # run calibration on sample data

In [9]:
# Must run after calibration: convert_fx is given the model that `calibrate` has already been run on.
quantized_model = convert_fx(prepared_model) # convert the calibrated model to a quantized model

### check quantized model

In [10]:
# Show the original float model's layers, for comparison with the quantized model.
print(model_to_quantize)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (relu): ReLU()
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
  (softmax): Softmax(dim=1)
)


In [11]:
# Show the converted model: its (quantized) submodules and its forward() code.
print(quantized_model)

GraphModule(
  (conv1): QuantizedConvReLU2d(1, 6, kernel_size=(3, 3), stride=(1, 1), scale=0.008702474646270275, zero_point=0, padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): QuantizedConvReLU2d(6, 16, kernel_size=(3, 3), stride=(1, 1), scale=0.021146273240447044, zero_point=0)
  (fc1): QuantizedLinearReLU(in_features=576, out_features=120, scale=0.039127349853515625, zero_point=0, qscheme=torch.per_tensor_affine)
  (fc2): QuantizedLinearReLU(in_features=120, out_features=84, scale=0.0579749271273613, zero_point=0, qscheme=torch.per_tensor_affine)
  (fc3): QuantizedLinear(in_features=84, out_features=10, scale=0.22861747443675995, zero_point=175, qscheme=torch.per_tensor_affine)
  (softmax): Softmax(dim=1)
)



def forward(self, x):
    conv1_input_scale_0 = self.conv1_input_scale_0
    conv1_input_zero_point_0 = self.conv1_input_zero_point_0
    quantize_per_tensor = torch.quantize_per_tensor(x, conv1_input_scale_0, conv1

### performance analysis

In [12]:
import os
def print_size_of_model(model):
    """Print the on-disk size, in MB, of the model's serialized state_dict.

    The state_dict is saved to a scratch file inside a private temporary
    directory. The scratch file is therefore removed even when saving fails,
    and a file named "temp.p" in the current working directory is never
    overwritten or deleted.

    Args:
        model: model whose size needs to be determined

    """
    import tempfile  # local import: only this helper needs it

    with tempfile.TemporaryDirectory() as scratch_dir:
        # Same base name as before, only the directory differs.
        scratch_path = os.path.join(scratch_dir, "temp.p")
        torch.save(model.state_dict(), scratch_path)
        print('Size of the model(MB):', os.path.getsize(scratch_path)/1e6)

In [13]:
def compare(model, device, test_loader, quantize=False):
    """Evaluate `model` on `test_loader` and print its size and top-1 accuracy.

    Args:
        model: module to evaluate; it is moved to `device` and set to eval mode.
        device: device the model and every batch are moved to.
        test_loader: iterable yielding `(images, labels)` batches.
        quantize: unused; kept so callers that pass it keep working.
    """
    model.to(device)
    model.eval()

    total = 0
    correct = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            # the class with the highest score is what we choose as prediction
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # An empty loader leaves total at 0; report 0% instead of dividing by zero.
    accuracy = 100. * correct / total if total else 0.0

    print("========================================= PERFORMANCE =============================================")
    print_size_of_model(model)
    print('\nAccuracy: {}/{} ({:.0f}%)\n'.format(correct, total, accuracy))
    print("====================================================================================================")

In [14]:
# Baseline: size and accuracy of the original float model, evaluated on the CPU.
device = 'cpu'
compare(model_to_quantize, device, testloader)

Size of the model(MB): 0.327587

Accuracy: 9048/10000 (90%)



In [15]:
# Same evaluation for the quantized model, again on the CPU.
device = 'cpu'
compare(quantized_model, device, testloader)

Size of the model(MB): 0.088283

Accuracy: 9024/10000 (90%)



### Extract layer name

In [16]:
from torchvision.models.feature_extraction import get_graph_node_names

# List the graph node names of the quantized model; the eval-mode list is the one printed.
node_names = get_graph_node_names(quantized_model)
train_nodes, eval_nodes = node_names
print(eval_nodes)

['x', 'quantize_per_tensor', 'conv1', 'pool', 'conv2', 'pool_1', 'flatten', 'fc1', 'fc2', 'fc3', 'dequantize', 'softmax']


### Extract trained weights

In [17]:
# Change the layer name (conv1 here) to dump another layer's quantized weight.
conv1_weight = quantized_model.conv1.weight()
# Iterating the weight tensor yields one slice per index along its first dimension.
for weight_slice in conv1_weight:
    print(weight_slice)

tensor([[[-0.2616,  0.1744,  0.2791],
         [-0.2093, -0.2616,  0.2268],
         [ 0.4535, -0.3314, -0.1221]]], size=(1, 3, 3), dtype=torch.qint8,
       quantization_scheme=torch.per_tensor_affine, scale=0.017443126067519188,
       zero_point=0)
tensor([[[ 0.4361, -0.2093, -0.2442],
         [ 0.4186, -0.0174, -0.3489],
         [ 0.3489,  0.2268, -0.3663]]], size=(1, 3, 3), dtype=torch.qint8,
       quantization_scheme=torch.per_tensor_affine, scale=0.017443126067519188,
       zero_point=0)
tensor([[[-1.0117,  0.0000,  0.1221],
         [-2.2327, -1.0291, -0.3489],
         [-0.9768, -0.6454,  0.2616]]], size=(1, 3, 3), dtype=torch.qint8,
       quantization_scheme=torch.per_tensor_affine, scale=0.017443126067519188,
       zero_point=0)
tensor([[[ 0.4361,  0.2442, -0.2093],
         [ 0.5407,  0.2093,  0.1047],
         [ 0.0872,  0.2616,  0.5407]]], size=(1, 3, 3), dtype=torch.qint8,
       quantization_scheme=torch.per_tensor_affine, scale=0.017443126067519188,
       zero_p

### Extract input activation

In [18]:
import torch
from torchvision.models.feature_extraction import create_feature_extractor
# Change layer name to get each layer value
layer_name = "quantize_per_tensor"
model = create_feature_extractor(quantized_model, [layer_name])

# Walk the loader to its final batch (the loop leaves `images`/`labels` bound to the last one).
# NOTE(review): this visits every test sample just to keep the last one. If the
# first sample is enough, `images, labels = next(iter(testloader))` is far
# cheaper — confirm which sample is wanted.
for images, labels in testloader:
    pass

torch.set_printoptions(profile='full')
outputs = model(images)

print(outputs[layer_name].int_repr())

# Open with 'w', not 'a': re-running the cell regenerates the dump instead of
# appending a second copy of the same values.
with open(layer_name + '.txt', 'w') as f:
    for name, activation in outputs.items():
        print(activation.shape)
        # One int_repr() call for the whole tensor. flatten() walks the elements
        # in the same row-major order as nested index loops and works for any rank.
        raw_values = activation.int_repr().flatten().tolist()
        # One 8-digit binary string per line. Masking with 0xFF turns negative
        # values into their 8-bit two's-complement pattern; values in 0..255 are unchanged.
        f.writelines(format(value & 0xFF, '08b') + "\n" for value in raw_values)

tensor([[[[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
             0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
          [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
             0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
          [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
             0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
          [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
             0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
          [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
             0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
          [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
             0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
          [  0,   0,  