In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Dynamic Quantization

In [10]:
import torch
import torch.quantization
import torch.nn as nn

# set the seed for reproducibility
torch.manual_seed(0)

class SampleLSTM(nn.Module):
  """ Sample LSTM model """

  def __init__(self, in_dim, out_dim, depth):
    super(SampleLSTM, self).__init__()
    self.lstm = nn.LSTM(in_dim, out_dim, depth)

  def forward(self, inputs, hidden):
    out, hidden = self.lstm(inputs, hidden)
    return out, hidden

# shape parameters
model_dimension=20
sequence_length=10
batch_size=1
lstm_depth=1

# random data for input
inputs = torch.randn(sequence_length, batch_size, model_dimension)

# hidden is actually is a tuple of the initial hidden state and the initial cell state
hidden = (torch.randn(lstm_depth, batch_size, model_dimension), torch.randn(lstm_depth, batch_size,
                                                                           model_dimension))

In [12]:
# here is our floating point instance
original_lstm = SampleLSTM(model_dimension, model_dimension, lstm_depth)

# apply quantization on the model
quantized_lstm = torch.quantization.quantize_dynamic(original_lstm, {nn.LSTM, nn.Linear},
                                                     dtype=torch.qint8)

# show the changes that were made
print('Original model:', original_lstm)
print('Quantized model:', quantized_lstm)

Original model: SampleLSTM(
  (lstm): LSTM(20, 20)
)
Quantized model: SampleLSTM(
  (lstm): DynamicQuantizedLSTM(20, 20)
)


In [13]:
import os

# save the model and check the model size
def print_size_of_model(model, label=""):
  torch.save(model.state_dict(), "temp.p")
  size = os.path.getsize("temp.p")
  print("model: ",label, '\t', 'Size (KB):', size/1e3)
  os.remove('temp.p')
  return size

In [15]:
f = print_size_of_model(original_lstm, "fp32")
q = print_size_of_model(quantized_lstm, "int8")
print("{0:.2f} times smaller".format(f/q))

model:  fp32 	 Size (KB): 15.224
model:  int8 	 Size (KB): 6.072
2.51 times smaller


In [17]:
# Compare inference latency
print("Floating point FP32: ", original_lstm.forward(inputs, hidden))
print("Quantized INT8: ", quantized_lstm.forward(inputs, hidden))

Floating point FP32:  (tensor([[[-0.0406, -0.0728,  0.0719, -0.0153, -0.0372,  0.2424,  0.1788,
           0.0262, -0.5749,  0.2221, -0.0901, -0.2661,  0.0282,  0.3131,
           0.0984,  0.2402, -0.4956, -0.1383,  0.3905, -0.0604]],

        [[ 0.0365, -0.2455,  0.0893,  0.0454, -0.1193, -0.0401,  0.2068,
          -0.0450, -0.2900, -0.0086, -0.0600,  0.1077, -0.0627,  0.1368,
           0.0693,  0.1299, -0.2468, -0.1691,  0.2747,  0.0209]],

        [[-0.3509, -0.1391,  0.3322, -0.0668, -0.0907,  0.0641,  0.0674,
          -0.0018, -0.1955,  0.0989,  0.0234,  0.1796, -0.1917,  0.0570,
          -0.0981,  0.2644, -0.0846, -0.0656,  0.1799, -0.0516]],

        [[-0.0267, -0.1448,  0.2102, -0.0033,  0.0037,  0.0032,  0.1495,
           0.2492, -0.2192, -0.0496,  0.0523,  0.0708, -0.3022,  0.0394,
          -0.0246,  0.0592,  0.0746, -0.0228,  0.2326,  0.2190]],

        [[ 0.0448, -0.0760,  0.2978,  0.0458,  0.0085, -0.0897, -0.0399,
           0.0513, -0.2755, -0.2126,  0.0404,  0.041

In [18]:
# Compare accuracy
out1, hidden1 = original_lstm(inputs, hidden)
mag1 = torch.mean(abs(out1)).item()
print('mean absolute value of output tensor values in the FP32 model is {0:.5f} '.format(mag1))

# run the quantized model
out2, hidden2 = quantized_lstm(inputs, hidden)
mag2 = torch.mean(abs(out2)).item()
print('mean absolute value of output tensor values in the FP32 model is {0:.5f} '.format(mag2))

# compare them
mag3 = torch.mean(abs(out1-out2)).item()
print(mag3)

mean absolute value of output tensor values in the FP32 model is 0.13233 
mean absolute value of output tensor values in the FP32 model is 0.13235 
0.00181041588075459


## Static Quantization

In [19]:
# A model with few linear layer
class SampleLinearModel(torch.nn.Module):
  def __init__(self):
    super(SampleLinearModel, self).__init__()
    # QuantStub converts the incoming floating point tensors into a quantized tensor
    self.quant = torch.quantization.QuantStub()
    self.linear1 = torch.nn.Linear(10, 100)
    self.linear2 = torch.nn.Linear(100, 100)
    self.linear3 = torch.nn.Linear(100, 100)
    self.linear4 = torch.nn.Linear(100, 100)
    self.linear5 = torch.nn.Linear(100, 1)

    # DeQuanStub converts the given quantized tensor into a tensor in floating point
    self.dequant = torch.quantization.DeQuantStub()

  def forward(self, x):
    # using QuantStub and DeQuanStub operations, we can indicate the region for quantization
    # point to quantized in the quantized model
    x = self.quant(x)
    x = self.linear1(x)
    x = self.linear2(x)
    x = self.linear3(x)
    x = self.linear4(x)
    x = self.linear5(x)
    x = self.dequant(x)

    return x

In [20]:
# prepare model for static quantization
original_model = SampleLinearModel()
print(original_model)

SampleLinearModel(
  (quant): QuantStub()
  (linear1): Linear(in_features=10, out_features=100, bias=True)
  (linear2): Linear(in_features=100, out_features=100, bias=True)
  (linear3): Linear(in_features=100, out_features=100, bias=True)
  (linear4): Linear(in_features=100, out_features=100, bias=True)
  (linear5): Linear(in_features=100, out_features=1, bias=True)
  (dequant): DeQuantStub()
)


## Apply Quantization

In [23]:
class CustomCalibrationDataset(torch.utils.data.Dataset):
  def __init__(self):
    self.num_samples=100
    self.data=torch.rand([self.num_samples, 10])
    self.label=torch.rand([self.num_samples, 1])

  def __len__(self):
    return self.num_samples

  def __getitem__(self, idx):
    return self.data[idx], self.label[idx]

calibration_dataset = CustomCalibrationDataset()
calibration_data_loader = torch.utils.data.DataLoader(calibration_dataset)

In [24]:
original_model.eval()
original_model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
quantized_model = torch.quantization.prepare(original_model)

quantized_model.eval()
for data, label in calibration_data_loader:
  quantized_model(data)

torch.quantization.convert(quantized_model, inplace=True)
print(quantized_model)



SampleLinearModel(
  (quant): Quantize(scale=tensor([0.0079]), zero_point=tensor([0]), dtype=torch.quint8)
  (linear1): QuantizedLinear(in_features=10, out_features=100, scale=0.020363805815577507, zero_point=60, qscheme=torch.per_channel_affine)
  (linear2): QuantizedLinear(in_features=100, out_features=100, scale=0.012322427704930305, zero_point=65, qscheme=torch.per_channel_affine)
  (linear3): QuantizedLinear(in_features=100, out_features=100, scale=0.006431824527680874, zero_point=62, qscheme=torch.per_channel_affine)
  (linear4): QuantizedLinear(in_features=100, out_features=100, scale=0.004961901344358921, zero_point=58, qscheme=torch.per_channel_affine)
  (linear5): QuantizedLinear(in_features=100, out_features=1, scale=0.0001570889144204557, zero_point=127, qscheme=torch.per_channel_affine)
  (dequant): DeQuantize()
)


In [25]:
# compare model size
f = print_size_of_model(original_model, "fp32")
q = print_size_of_model(quantized_model, "int8")
print("{0:.2f} times smaller".format(f/q))

model:  fp32 	 Size (KB): 15.224
model:  int8 	 Size (KB): 6.072
2.51 times smaller


## Quantization aware training in Pytorch

In [26]:
# create a sample model
original_model = SampleLinearModel()

training_dataset = CustomCalibrationDataset()
training_data_loader = torch.utils.data.DataLoader(calibration_dataset, 5)

## Apply quantization

In [27]:
original_model.train()
original_model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
quantized_model = torch.quantization.prepare_qat(original_model)

# train the model
quantized_model.train()
mse_loss = torch.nn.MSELoss()
optimizer = torch.optim.SGD(original_model.parameters(), lr=0.001, momentum=0.9)
for data, label in training_data_loader:
  optimizer.zero_grad()
  pred = quantized_model(data)
  loss = mse_loss(pred, label)
  loss.backward()
  optimizer.step()

quantized_model.eval()
torch.quantization.convert(quantized_model, inplace=True)
print(quantized_model)

SampleLinearModel(
  (quant): Quantize(scale=tensor([0.0079]), zero_point=tensor([0]), dtype=torch.quint8)
  (linear1): QuantizedLinear(in_features=10, out_features=100, scale=0.02046726830303669, zero_point=62, qscheme=torch.per_channel_affine)
  (linear2): QuantizedLinear(in_features=100, out_features=100, scale=0.011270151473581791, zero_point=68, qscheme=torch.per_channel_affine)
  (linear3): QuantizedLinear(in_features=100, out_features=100, scale=0.007421422284096479, zero_point=65, qscheme=torch.per_channel_affine)
  (linear4): QuantizedLinear(in_features=100, out_features=100, scale=0.004118180833756924, zero_point=56, qscheme=torch.per_channel_affine)
  (linear5): QuantizedLinear(in_features=100, out_features=1, scale=0.0009085990022867918, zero_point=0, qscheme=torch.per_channel_affine)
  (dequant): DeQuantize()
)




## Compare model size

In [29]:
# compare model size
f = print_size_of_model(original_model, "fp32")
q = print_size_of_model(quantized_model, "int8")
print("{0:.2f} times smaller".format(f/q))

model:  fp32 	 Size (KB): 129.422
model:  int8 	 Size (KB): 48.586
2.66 times smaller
