Source:
    https://pytorch.org/blog/int8-quantization/

 By reducing the precision of the model’s weights and activations from 32-bit floating-point (FP32) to 8-bit integer (INT8), INT8 quantization can significantly improve the inference speed and reduce memory requirements without sacrificing accuracy.

Step by step work:
 1. Create a simple model 
    2. Use any random dataset as the input
    3. Train the model
    4. Convert the model into tflite
    5. Create a representative dataset 
    6. Perform INt8 quantization
    7. Save the quantizated model
    8. Run the inference
    
Question to answer:
1. How does INT8 quatnization is working?

In [1]:
import tensorflow as tf
from tensorflow.keras import layers, models
import numpy as np


2024-02-08 09:54:23.367087: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def create_model():
    model=models.Sequential([
        layers.Input(shape=(10,)),
        layers.Dense(5,activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    return model

In [3]:
# Generate random input data and labels
np.random.seed(0)
num_samples = 1000
input_data = np.random.randint(2001, size=(num_samples, 10)) 
labels = np.random.randint(2, size=(num_samples, 1))

In [4]:
# Create and compile model
model = create_model()
model.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics = ['accuracy'])

In [5]:
# Train the model
model.fit(input_data, labels, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7ff8d4743610>

In [6]:
# Save the model
model.save('random_model.h5')

  saving_api.save_model(


In [7]:
model = tf.keras.models.load_model('random_model.h5')


In [8]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 5)                 55        
                                                                 
 dense_1 (Dense)             (None, 1)                 6         
                                                                 
Total params: 61 (244.00 Byte)
Trainable params: 61 (244.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [9]:
converter = tf.lite.TFLiteConverter.from_keras_model(model)

tflite_model = converter.convert()

INFO:tensorflow:Assets written to: /tmp/sraj/tmpxg_5bung/assets


INFO:tensorflow:Assets written to: /tmp/sraj/tmpxg_5bung/assets
2024-02-08 09:54:45.952555: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:364] Ignored output_format.
2024-02-08 09:54:45.952599: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:367] Ignored drop_control_dependency.
2024-02-08 09:54:45.953046: I tensorflow/cc/saved_model/reader.cc:45] Reading SavedModel from: /tmp/sraj/tmpxg_5bung
2024-02-08 09:54:45.953980: I tensorflow/cc/saved_model/reader.cc:91] Reading meta graph with tags { serve }
2024-02-08 09:54:45.954004: I tensorflow/cc/saved_model/reader.cc:132] Reading SavedModel debug info (if present) from: /tmp/sraj/tmpxg_5bung
2024-02-08 09:54:45.956897: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:375] MLIR V1 optimization pass is not enabled
2024-02-08 09:54:45.957793: I tensorflow/cc/saved_model/loader.cc:231] Restoring SavedModel bundle.
2024-02-08 09:54:45.999000: I tensorflow/cc/saved_model/loader.cc:215] Runnin

In [10]:
# Generate a representative dataset
representative_data = input_data[:100]  # Use a subset of the input data as the representative dataset


In [11]:

# Define a generator function to provide representative data
def representative_dataset_generator():
    for data in representative_data:
        yield [data.reshape(1, -1).astype(np.float32)]


In [12]:
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8
converter.representative_dataset = representative_dataset_generator


In [13]:
# Save the quantized model to a file
with open('random_model_quantized.tflite', 'wb') as f:
    f.write(tflite_model)

In [14]:
# Checking the accuracy

In [15]:
# Load the unquantized model
unquantized_model = tf.keras.models.load_model('random_model.h5')
unquantized_loss, unquantized_accuracy = unquantized_model.evaluate(input_data, labels)




In [16]:
# Load the quantized model
interpreter = tf.lite.Interpreter(model_path='random_model_quantized.tflite')
interpreter.allocate_tensors()


INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


In [17]:
# prepare he input and output tensors
input_index = interpreter.get_input_details()

In [18]:
input_index

[{'name': 'serving_default_input_1:0',
  'index': 0,
  'shape': array([ 1, 10], dtype=int32),
  'shape_signature': array([-1, 10], dtype=int32),
  'dtype': numpy.float32,
  'quantization': (0.0, 0),
  'quantization_parameters': {'scales': array([], dtype=float32),
   'zero_points': array([], dtype=int32),
   'quantized_dimension': 0},
  'sparsity_parameters': {}}]

In [19]:
# Prepare the input data with the correct shape
input_data = input_data.astype(np.float32)
input_data = np.expand_dims(input_data, axis=0)  # Add a batch dimension
output_index = interpreter.get_output_details()[0]['index']


In [20]:
# Run inference on the quantized model and compute accuracy
num_correct = 0
for input_data, label in zip(input_data, labels):
    input_data = input_data.astype(np.float32)
    interpreter.set_tensor(input_index, input_data)
    interpreter.invoke()
    output = interpreter.get_tensor(output_index)
    predicted_label = np.argmax(output)
    if predicted_label == label:
        num_correct += 1

TypeError: SetTensor(): incompatible function arguments. The following argument types are supported:
    1. (self: tensorflow.lite.python.interpreter_wrapper._pywrap_tensorflow_interpreter_wrapper.InterpreterWrapper, i: int, value: handle, subgraph_index: int = 0) -> object

Invoked with: <tensorflow.lite.python.interpreter_wrapper._pywrap_tensorflow_interpreter_wrapper.InterpreterWrapper object at 0x7ff8cc629270>, [{'name': 'serving_default_input_1:0', 'index': 0, 'shape': array([ 1, 10], dtype=int32), 'shape_signature': array([-1, 10], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}], array([[ 684.,  559., 1653., ..., 1383., 1033., 1747.],
       [ 277., 1778., 1828., ..., 1420.,  314.,  705.],
       [1510.,  551.,   87., ...,  537.,  845.,   72.],
       ...,
       [ 481., 1163., 1169., ...,  364., 1060., 1288.],
       [ 303., 1797.,  542., ...,  514.,  212.,  856.],
       [1516.,  909.,  702., ..., 1104., 1299., 1700.]], dtype=float32)

![image](https://pytorch.org/assets/images/int8/pytorch_quant_x86_1.jpg)

![image_s](https://pytorch.org/assets/images/int8/pytorch_quant_x86_2.jpg)

https://pytorch.org/blog/int8-quantization/
https://www.intel.com/content/www/us/en/developer/articles/technical/int8-quantization-for-x86-cpu-in-pytorch.html#:~:text=INT8%20Quantization%20for%20x86%20CPU%20in%20PyTorch*,-Overview&text=By%20reducing%20the%20precision%20of,memory%20requirements%20without%20sacrificing%20accuracy.
https://www.tensorflow.org/lite/performance/post_training_integer_quant

In [21]:
import torch

class M(torch.nn.Module):
    def __init__(self):
        super(M, self).__init__()
        self.quant = torch.quantization.QuantStub()
        self.conv = torch.nn.Conv2d(1, 1, 1)
        self.relu = torch.nn.ReLU()
        self.dequant = torch.quantization.DeQuantStub()

    def forward(self, x):
        x = self.quant(x)
        x = self.conv(x)
        x = self.relu(x)
        x = self.dequant(x)
        return x

model_fp32 = M()

model_fp32.eval()


model_fp32.qconfig = torch.quantization.get_default_qconfig('fbgemm')


model_fp32_fused = torch.quantization.fuse_modules(model_fp32, [['conv', 'relu']])


model_fp32_prepared = torch.quantization.prepare(model_fp32_fused)

# calibrate the prepared model to determine quantization parameters for activations
# in a real world setting, the calibration would be done with a representative dataset
input_fp32 = torch.randn(4, 1, 4, 4)
model_fp32_prepared(input_fp32)

# Convert the observed model to a quantized model. This does several things:
# quantizes the weights, computes and stores the scale and bias value to be
# used with each activation tensor, and replaces key operators with quantized
# implementations.
model_int8 = torch.quantization.convert(model_fp32_prepared)

# run the model, relevant calculations will happen in int8
res = model_int8(input_fp32)



In [22]:
model_fp32

M(
  (quant): QuantStub()
  (conv): Conv2d(1, 1, kernel_size=(1, 1), stride=(1, 1))
  (relu): ReLU()
  (dequant): DeQuantStub()
)

In [23]:
model_fp32_fused

M(
  (quant): QuantStub()
  (conv): ConvReLU2d(
    (0): Conv2d(1, 1, kernel_size=(1, 1), stride=(1, 1))
    (1): ReLU()
  )
  (relu): Identity()
  (dequant): DeQuantStub()
)

In [24]:
model_fp32_prepared

M(
  (quant): QuantStub(
    (activation_post_process): HistogramObserver(min_val=-2.759754180908203, max_val=3.021393299102783)
  )
  (conv): ConvReLU2d(
    (0): Conv2d(1, 1, kernel_size=(1, 1), stride=(1, 1))
    (1): ReLU()
    (activation_post_process): HistogramObserver(min_val=0.19379568099975586, max_val=1.7738090753555298)
  )
  (relu): Identity()
  (dequant): DeQuantStub()
)

In [25]:
input_fp32

tensor([[[[ 0.6188, -1.3656,  0.1770,  0.1060],
          [-0.1552, -0.3424, -0.6614, -0.1574],
          [-0.0652, -0.6068,  0.8175, -0.8872],
          [ 0.0636, -0.8041, -1.1435,  0.0300]]],


        [[[ 1.7816, -0.3857, -2.7598,  1.2740],
          [ 0.2323, -0.2197, -0.3202, -1.2060],
          [ 0.4900,  0.3746, -0.8379, -0.3279],
          [-0.5364, -1.5201,  3.0214, -0.8101]]],


        [[[-1.4131, -1.5440, -0.8876,  0.9838],
          [ 2.3838,  0.8122, -0.7225, -0.2188],
          [-1.1641,  0.1184, -1.5926,  0.6182],
          [-0.7598,  0.1797, -0.8831,  0.1931]]],


        [[[-0.3019, -1.3952,  0.5295, -0.2410],
          [-0.9327,  2.2946, -0.8485, -0.3899],
          [ 0.0132,  1.2915, -0.4745,  0.4994],
          [ 0.5789, -0.1150,  0.7701,  1.1317]]]])

In [26]:
model_int8

M(
  (quant): Quantize(scale=tensor([0.0455]), zero_point=tensor([61]), dtype=torch.quint8)
  (conv): QuantizedConvReLU2d(1, 1, kernel_size=(1, 1), stride=(1, 1), scale=0.013960925862193108, zero_point=0)
  (relu): Identity()
  (dequant): DeQuantize()
)

In [27]:
res

tensor([[[[1.1169, 0.5724, 0.9912, 0.9773],
          [0.9075, 0.8516, 0.7679, 0.9075],
          [0.9354, 0.7818, 1.1727, 0.7120],
          [0.9633, 0.7260, 0.6422, 0.9633]]],


        [[[1.4380, 0.8516, 0.1955, 1.2984],
          [1.0052, 0.8795, 0.8656, 0.6143],
          [1.0890, 1.0471, 0.7260, 0.8656],
          [0.7958, 0.5445, 1.7591, 0.7260]]],


        [[[0.5584, 0.5305, 0.6980, 1.2146],
          [1.5915, 1.1727, 0.7539, 0.8795],
          [0.6282, 0.9912, 0.5166, 1.1169],
          [0.7399, 0.9912, 0.7120, 0.9912]]],


        [[[0.8656, 0.5584, 1.1029, 0.8795],
          [0.6980, 1.5636, 0.7120, 0.8377],
          [0.9493, 1.2984, 0.8237, 1.0890],
          [1.1029, 0.9075, 1.1588, 1.2565]]]])