In [1]:
!gdown 1oHoYT7J4-xKfNu6cfBOMKHyO0QBqdYsI
!unzip /content/calibration_data2.zip -d /content/calibration_data

Downloading...
From: https://drive.google.com/uc?id=1oHoYT7J4-xKfNu6cfBOMKHyO0QBqdYsI
To: /content/calibration_data2.zip
100% 134M/134M [00:01<00:00, 73.9MB/s]
Archive:  /content/calibration_data2.zip
   creating: /content/calibration_data/data/
  inflating: /content/calibration_data/data/ILSVRC2012_val_00000001.JPEG  
  inflating: /content/calibration_data/data/ILSVRC2012_val_00000002.JPEG  
  inflating: /content/calibration_data/data/ILSVRC2012_val_00000003.JPEG  
  inflating: /content/calibration_data/data/ILSVRC2012_val_00000004.JPEG  
  inflating: /content/calibration_data/data/ILSVRC2012_val_00000005.JPEG  
  inflating: /content/calibration_data/data/ILSVRC2012_val_00000006.JPEG  
  inflating: /content/calibration_data/data/ILSVRC2012_val_00000007.JPEG  
  inflating: /content/calibration_data/data/ILSVRC2012_val_00000008.JPEG  
  inflating: /content/calibration_data/data/ILSVRC2012_val_00000009.JPEG  
  inflating: /content/calibration_data/data/ILSVRC2012_val_00000010.JPEG  
  in

In [2]:
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.quantization.qconfig import QConfig
from torch.quantization.observer import MinMaxObserver, MovingAverageMinMaxObserver, MovingAveragePerChannelMinMaxObserver, PerChannelMinMaxObserver
import yaml
from torch.ao.quantization.fake_quantize import FakeQuantize
import torch
import torchvision
import os
import torch.nn as nn
import torch.ao.quantization.quantize_fx as quantize_fx
import copy

In [3]:
  example_inputs = (torch.randn(1, 3, 224, 224),)
  model_fp = torchvision.models.resnet18(pretrained=True)
  model_to_quantize = copy.deepcopy(model_fp)
  quantize_fx.fuse_fx(model_to_quantize.eval())
  transform_cali = transforms.Compose([
      transforms.Resize(256),
      transforms.CenterCrop(224),
      transforms.ToTensor(),  # Convert the image to a PyTorch tensor
      transforms.Normalize(
          mean=[0.485, 0.456, 0.406],  # ImageNet dataset mean
          std=[0.229, 0.224, 0.225]  # ImageNet dataset standard deviation
      )
  ])
  transform_val = transforms.Compose([
      transforms.Resize(256),
      transforms.CenterCrop(224),
      transforms.ToTensor(),  # Convert the image to a PyTorch tensor
      transforms.Normalize(
          mean=[0.485, 0.456, 0.406],  # ImageNet dataset mean
          std=[0.229, 0.224, 0.225]  # ImageNet dataset standard deviation
      )
  ])


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 133MB/s]


In [4]:
from PIL import Image
from torch.utils.data import Dataset
import torchvision.transforms as transforms
import random
import os

class MyDataset(Dataset):
    def __init__(self, txt_file, transform=None, dir=None):
        self.data = []
        with open(txt_file, 'r') as f:
            for line in f:
                image_path = line.split(' ')[0]
                label = line.split(' ')[1].split('\n')[0]
                self.data.append((image_path, int(label)))
        self.transform = transform
        self.dir = dir
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image_path, label = self.data[idx]
        if self.dir:
            image_path = os.path.join(self.dir, image_path)
        image = Image.open(image_path).convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image, label


class CustomDataset(Dataset):
    def __init__(self, original_dataset, samples_per_epoch):
        self.original_dataset = original_dataset
        self.samples_per_epoch = samples_per_epoch

    def __getitem__(self, index):
        random_index = random.randint(0, len(self.original_dataset) - 1)
        return self.original_dataset[random_index]

    def __len__(self):
        return self.samples_per_epoch

In [5]:
val_dataset = MyDataset('/content/calibration_data/samples.txt',
                        transform=transform_val)

In [6]:
qconfig = QConfig(activation=FakeQuantize.with_args(observer=MinMaxObserver,
                                                          quant_min=0,
                                                          quant_max=255,
                                                          qscheme=torch.per_tensor_affine,
                                                          reduce_range=False,),
                        weight=FakeQuantize.with_args(observer=PerChannelMinMaxObserver,
                                                      quant_min=-128,
                                                      quant_max=127,
                                                      dtype=torch.qint8,
                                                      qscheme=torch.per_channel_symmetric,
                                                      #per_tensor_symmetric
                                                      reduce_range=False,
                                                      ch_axis=0,
                                                        ))

In [7]:
from torch.ao.quantization import (
  get_default_qconfig_mapping,
  get_default_qat_qconfig_mapping,
  QConfigMapping,
)
qconfig_mapping = QConfigMapping().set_global(qconfig)

In [8]:
model_prepared = quantize_fx.prepare_fx(model_to_quantize, qconfig_mapping, example_inputs)

In [9]:
criterion = nn.CrossEntropyLoss()

In [10]:
def evaluate_model(model, validation_dataloader, criterion, device):
    model.to(device)
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for data in validation_dataloader:
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # Forward pass
            outputs = model(inputs)

            # Compute the loss
            loss = criterion(outputs, labels)
            running_loss += loss.item()

            # Calculate accuracy
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    avg_loss = running_loss / len(validation_dataloader)
    accuracy = 100 * correct / total

    return avg_loss, accuracy

In [11]:
val_loader = DataLoader(val_dataset, batch_size=100, shuffle=False, num_workers=8, drop_last=True)



In [12]:
loss, acc = evaluate_model(model_prepared, val_loader, criterion,'cuda')

In [13]:
acc

69.75

In [14]:
model_prepared = model_prepared.to('cpu')

In [15]:
model_quantized = quantize_fx.convert_fx(model_prepared)

In [16]:
loss, acc = evaluate_model(model_quantized, val_loader, criterion,'cpu')

In [17]:
acc

66.625

In [18]:
model_prepared

GraphModule(
  (activation_post_process_0): FakeQuantize(
    fake_quant_enabled=tensor([1], dtype=torch.uint8), observer_enabled=tensor([1], dtype=torch.uint8), quant_min=0, quant_max=255, dtype=torch.quint8, qscheme=torch.per_tensor_affine, ch_axis=-1, scale=tensor([0.0187]), zero_point=tensor([114], dtype=torch.int32)
    (activation_post_process): MinMaxObserver(min_val=-2.1179039478302, max_val=2.640000104904175)
  )
  (conv1): ConvReLU2d(
    (0): QuantizedConv2d(Reference)(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
    (1): ReLU(inplace=True)
  )
  (activation_post_process_1): FakeQuantize(
    fake_quant_enabled=tensor([1], dtype=torch.uint8), observer_enabled=tensor([1], dtype=torch.uint8), quant_min=0, quant_max=255, dtype=torch.quint8, qscheme=torch.per_tensor_affine, ch_axis=-1, scale=tensor([0.0273]), zero_point=tensor([0], dtype=torch.int32)
    (activation_post_process): MinMaxObserver(min_val=0.0, max_val=6.968907356262207)
  )
  (maxpool): MaxPool2d(kern

#TASK:
1. Split the 1000 samples into two groups, 50%,50%, one of them will be named calibration_dataset while the other will be validation_dataset. You should use calibration dataset to calibrate scale and zero point before performing actual quantization.
2. Make PTQ training for MobileNetV2, using PerchannelMinMax Quantization for weights and PerTensorMinMax Quantization separately.
3. Make Perchannel MovingAverageMinMax Quantization for weights and MovingAverage Pertensor Quantization for activation. Compare to results in 2., Check which one is better. Compare some scales and zero points of from the observers. Explain why one of the solution is better?

# 1. Split
We use pytorch's `random_split` function to split the data on two datasets, each one containing a 50% of the original datasets' data.

In [24]:
from torch.utils.data import random_split
calibration_dataset, validation_dataset = random_split(val_dataset, (0.5, 0.5))

# 2. PTQ training for MobileNetV2

# 2.1 PerChannelMinMax

In [46]:
model_mnv2 = torchvision.models.quantization.mobilenet_v2(pretrained=True)
model_to_quantize = copy.deepcopy(model_mnv2)
quantize_fx.fuse_fx(model_to_quantize.eval())
qconfig = QConfig(activation=FakeQuantize.with_args(observer=MinMaxObserver,
                                                    quant_min=0,
                                                    quant_max=255,
                                                    qscheme=torch.per_tensor_affine,
                                                    reduce_range=False,),
                  weight=FakeQuantize.with_args(observer=PerChannelMinMaxObserver,
                                              quant_min=-128,
                                              quant_max=127,
                                              dtype=torch.qint8,
                                              qscheme=torch.per_channel_symmetric,
                                              #per_tensor_symmetric
                                              reduce_range=False,
                                              ch_axis=0,
                                              ))
qconfig_mapping = QConfigMapping().set_global(qconfig)
model_prepared = quantize_fx.prepare_fx(model_to_quantize, qconfig_mapping, example_inputs)
calibration_loader = DataLoader(calibration_dataset, batch_size=100, shuffle=False, num_workers=8, drop_last=True)
loss, acc = evaluate_model(model_prepared, calibration_loader, criterion,'cuda')
print('Loss:', loss)
print('Accuracy:', acc)
model_prepared = model_prepared.to('cpu')
model_quantized = quantize_fx.convert_fx(model_prepared)
validation_loader = DataLoader(validation_dataset, batch_size=100, shuffle=False, num_workers=8, drop_last=True)
loss, acc = evaluate_model(model_quantized, validation_loader, criterion,'cpu')
print('Quantized loss:', loss)
print('Quantized accuracy:', acc)

Loss: 1.0652364492416382
Accuracy: 70.75
Quantized loss: 1.530437707901001
Quantized accuracy: 63.25


# 2.2 PerTensorMinMax

In [47]:
model_mnv2 = torchvision.models.quantization.mobilenet_v2(pretrained=True)
model_to_quantize = copy.deepcopy(model_mnv2)
quantize_fx.fuse_fx(model_to_quantize.eval())
qconfig = QConfig(activation=FakeQuantize.with_args(observer=MinMaxObserver,
                                                    quant_min=0,
                                                    quant_max=255,
                                                    qscheme=torch.per_tensor_affine,
                                                    reduce_range=False,),
                  weight=FakeQuantize.with_args(observer=MinMaxObserver,
                                              quant_min=-128,
                                              quant_max=127,
                                              dtype=torch.qint8,
                                              qscheme=torch.per_tensor_symmetric,
                                              #per_tensor_symmetric
                                              reduce_range=False,
                                              ))
qconfig_mapping = QConfigMapping().set_global(qconfig)
model_prepared = quantize_fx.prepare_fx(model_to_quantize, qconfig_mapping, example_inputs)
calibration_loader = DataLoader(calibration_dataset, batch_size=100, shuffle=False, num_workers=8, drop_last=True)
loss, acc = evaluate_model(model_prepared, calibration_loader, criterion,'cuda')
print('Loss:', loss)
print('Accuracy:', acc)
model_prepared = model_prepared.to('cpu')
model_quantized = quantize_fx.convert_fx(model_prepared)
validation_loader = DataLoader(validation_dataset, batch_size=100, shuffle=False, num_workers=8, drop_last=True)
loss, acc = evaluate_model(model_quantized, validation_loader, criterion,'cpu')
print('Quantized loss:', loss)
print('Quantized accuracy:', acc)

Loss: 1.0652364492416382
Accuracy: 70.75
Quantized loss: 1.3668818175792694
Quantized accuracy: 67.5


# 3. Moving average

In [48]:
model_mnv2 = torchvision.models.quantization.mobilenet_v2(pretrained=True)
model_to_quantize = copy.deepcopy(model_mnv2)
quantize_fx.fuse_fx(model_to_quantize.eval())
qconfig = QConfig(activation=FakeQuantize.with_args(observer=MovingAverageMinMaxObserver,
                                                    quant_min=0,
                                                    quant_max=255,
                                                    qscheme=torch.per_tensor_affine,
                                                    reduce_range=False,),
                  weight=FakeQuantize.with_args(observer=MovingAveragePerChannelMinMaxObserver,
                                              quant_min=-128,
                                              quant_max=127,
                                              dtype=torch.qint8,
                                              qscheme=torch.per_channel_symmetric,
                                              #per_tensor_symmetric
                                              reduce_range=False,
                                              ch_axis=0
                                              ))
qconfig_mapping = QConfigMapping().set_global(qconfig)
model_prepared = quantize_fx.prepare_fx(model_to_quantize, qconfig_mapping, example_inputs)
calibration_loader = DataLoader(calibration_dataset, batch_size=100, shuffle=False, num_workers=8, drop_last=True)
loss, acc = evaluate_model(model_prepared, calibration_loader, criterion,'cuda')
print('Loss:', loss)
print('Accuracy:', acc)
model_prepared = model_prepared.to('cpu')
model_quantized = quantize_fx.convert_fx(model_prepared)
validation_loader = DataLoader(validation_dataset, batch_size=100, shuffle=False, num_workers=8, drop_last=True)
loss, acc = evaluate_model(model_quantized, validation_loader, criterion,'cpu')
print('Quantized loss:', loss)
print('Quantized accuracy:', acc)

Loss: 1.0440984219312668
Accuracy: 71.5
Quantized loss: 1.5116087198257446
Quantized accuracy: 65.25


# Conclussions

The best result can be observed when using `PerTensorMinMax` (point 2.2) on the weights.

The second best result can be observed when using the moving averaged observers (point 3).

If we compare the quantized accuracy to pre-quantization accuracy we can get have a better idea of what settings performed better:

- 2.1: $\frac{63.25}{70.75} \approx 0.89$ 

- 2.2: $\frac{67.5}{70.75} \approx 0.95$ 

- 3: $\frac{65.25}{71.5} \approx 0.91$ 