## Custom Quantization functions and classses

### Necessary Imports

In [9]:
import pandas as pd
import numpy as np
import tensorflow as tf
import time
from tensorflow.lite.python.interpreter import Interpreter
import os
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras import models, layers, utils
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.applications.resnet50 import ResNet50


### Functions for quantizing the models

In [10]:

## This is the function for linear quantization and the steps for quantization are explained in the report 
def lin_quant(tensor, n_bits):
    q_min_val = -(2 ** (n_bits - 1))
    q_max_val = (2 ** (n_bits - 1)) - 1
    scl = (np.max(tensor) - np.min(tensor)) / (q_max_val - q_min_val)
    zero_pt = q_min_val - np.round(np.min(tensor) / scl)
    q_tensor = np.round(tensor / scl) - zero_pt
    q_tensor = np.clip(q_tensor, q_min_val, q_max_val).astype(np.int32)
    dq_tensor = (q_tensor + zero_pt) * scl
    return q_tensor, dq_tensor 

## This function is used to just quantize the biases in the network
def lin_quant_bias(tnsr):
    n_bits = 32
    q_min_val = -(2 ** (n_bits - 1))
    q_max_val = (2 ** (n_bits - 1)) - 1
    scl = (np.max(tnsr) - np.min(tnsr)) / (q_max_val - q_min_val)
    zero_pt = q_min_val - np.round(np.min(tnsr) / scl)
    q_tnsr = np.round(tnsr / scl) - zero_pt
    q_tnsr = np.clip(q_tnsr, q_min_val, q_max_val).astype(np.int32)
    dq_tnsr = (q_tnsr + zero_pt) * scl
    return q_tnsr, dq_tnsr

## This is a weighted quantization method. The expalantion for this method is included in the report
def non_uni_quant(tnsr, n_bits, centrs=None):
    if centrs is None:
        centrs = np.linspace(np.min(tnsr), np.max(tnsr), num=2 ** n_bits)
    
    q_tnsr = np.zeros_like(tnsr, dtype=np.int32)
    for i in range(2 ** n_bits):
        if i == 0:
            msk = np.abs(tnsr - centrs[i]) <= np.abs(tnsr - centrs[i + 1])
        elif i == 2 ** n_bits - 1:
            msk = np.abs(tnsr - centrs[i]) < np.abs(tnsr - centrs[i - 1])
        else:
            msk = (np.abs(tnsr - centrs[i]) < np.abs(tnsr - centrs[i - 1])) & \
                   (np.abs(tnsr - centrs[i]) <= np.abs(tnsr - centrs[i + 1]))
        q_tnsr[msk] = i

    dq_tnsr = np.array([centrs[i] for i in q_tnsr])
    return q_tnsr, dq_tnsr



In [11]:
# Quantize just the weights not the biases
def quantize_model_weights(model, quantization_function, num_bits):
    quantized_weights = []
    dequantized_weights = []
    for weight in model.get_weights():
        if len(weight.shape) > 1:
            q_weight, deq_weight = quantization_function(weight, num_bits)
            quantized_weights.append(q_weight)
            dequantized_weights.append(deq_weight)
        else:
            quantized_weights.append(weight)
            dequantized_weights.append(weight)  # Keep biases unchanged
    return quantized_weights, dequantized_weights

# Quantize the weights alongwith the biases
def quantize_model_weights_biases(model, quantization_function, num_bits):
    quantized_weights = []
    dequantized_weights = []
    for weight in model.get_weights():
        q_weight, deq_weight = quantization_function(weight, num_bits)
        quantized_weights.append(q_weight)
        dequantized_weights.append(deq_weight)
    return quantized_weights, dequantized_weights

# Clone the floating point model and then replace the weights of the original model 
def replace_all_weights(model, quantized_weights,actual_loss = 'categorical_crossentropy'):
    quantized_model = tf.keras.models.clone_model(model)
    quantized_model.set_weights(quantized_weights)
    quantized_model.compile(optimizer='adam',
                        loss=actual_loss,
                        metrics=['accuracy'])
    return quantized_model


### Function for Measuring Inference

In [12]:
def measure_inference_time(model_instance, input_data, iterations=100):
    
    # Warm up the cache by executing the model for one step
    model_instance.predict(input_data, verbose=0)

# Measure the prediction time
    start_timestamp = time.time()
    for _ in range(iterations):
        model_instance.predict(input_data, verbose=0)
    end_timestamp = time.time()

# Calculate and return the average prediction time per run
    average_duration = (end_timestamp - start_timestamp) / iterations
    return average_duration


**We make two additional function to load the tflite model and measure its inference time. Since predicting using a tflite model is less starightforward than a .h5 model. We usually need to use an interpreter for evaluation. Tensorflow doesn't provide a simple way to predict using the tflite model.**

In [1]:
def measure_tflite_inference_time(model_filepath, input_data, runs=100):
    
   # A Tflite model requires an interpreter for evaluation
    tflite_evaluator = Interpreter(model_path=model_filepath)
    tflite_evaluator.allocate_tensors()

    i_info = tflite_evaluator.get_input_details()
    o_info = tflite_evaluator.get_output_details()

    # Warm up the cache by executing the model for one step
    data_input = np.array(input_data, dtype=i_info[0]['dtype'])
    tflite_evaluator.set_tensor(i_info[0]['index'], data_input)
    tflite_evaluator.invoke()

    # Measure the execution duration
    start_timestamp = time.time()
    for _ in range(runs):
        tflite_evaluator.set_tensor(i_info[0]['index'], data_input)
        tflite_evaluator.invoke()
    end_timestamp = time.time()

    # Calculate and return the average execution time per run
    avg_duration = (end_timestamp - start_timestamp) / runs
    return avg_duration




### Function for calculating Accuracy of a TFLite model

In [2]:
def evaluate_tflite_model_accuracy(model_filepath, test_data_x, test_data_y):
# A Tflite model needs an interpreter for evaluation
    tflite_evaluator = Interpreter(model_path=model_filepath)
    tflite_evaluator.allocate_tensors()
    i_spec = tflite_evaluator.get_input_details()
    o_spec = tflite_evaluator.get_output_details()

# Prepare data for evaluation
    # Prepare data for evaluation
    correct_predictions = 0
    total_samples = len(test_data_x)

    for idx in range(total_samples):
        input_sample = np.array(test_data_x[idx:idx+1], dtype=i_spec[0]['dtype'])
        tflite_evaluator.set_tensor(i_spec[0]['index'], input_sample)
        tflite_evaluator.invoke()
        output_sample = tflite_evaluator.get_tensor(o_spec[0]['index'])

    # Get the predicted label and compare with the ground truth label
        estimated_label = np.argmax(output_sample)
        actual_label = np.argmax(test_data_y[idx])

        if estimated_label == actual_label:
            correct_predictions += 1

# Calculate and return the accuracy
    model_precision = correct_predictions / total_samples
    return model_precision



## Testing our functions on a very simple model on MNIST Dataset


In [15]:
# Load MNIST dataset
(Training_Data, Training_Labels), (Testing_Data, Testing_Labels) = tf.keras.datasets.mnist.load_data()

# Normalize the data
Training_Data = Training_Data.reshape(-1, 784) / 255.0  # Reshape and normalize input data
Testing_Data = Testing_Data.reshape(-1, 784) / 255.0  # Reshape and normalize input data

# A very simple model
inputs = tf.keras.layers.Input(shape=(784,))
x = tf.keras.layers.Dense(128, activation='relu')(inputs)
x = tf.keras.layers.Dense(10, activation='softmax')(x)
Simple_Model = tf.keras.models.Model(inputs=inputs, outputs=x)

# Compile and train the model
Simple_Model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
Simple_Model.fit(Training_Data, Training_Labels, epochs=4, validation_split=0.2)

# Evaluate the model
loss, accuracy = Simple_Model.evaluate(Testing_Data, Testing_Labels)
print(f'Test accuracy: {accuracy:.3f}')


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Test accuracy: 0.974


## Comparing the Quantization accuracy for the Simple Model using our Qunatization functions

In [17]:
num_bits = [2,4,8,16]
Simple_results = pd.DataFrame(columns=['num_bits', 'Original', 'Linear', 'Non_Uniform',
                                         'Original_Size_MB', 'Linear_Size_MB', 'Non_Uniform_Size_MB',
                                         'Original_Time (s)', 'Linear_Time (s)', 'Non_Uniform_Time (s)'])


for num_bit in num_bits:
    print(f"Accuracies for {num_bit} bit quantization")
    quantized_weights_linear, dequantized_weights_linear = quantize_model_weights(Simple_Model, lin_quant, num_bit)
    quantized_weights_non_uniform, dequantized_weights_non_uniform = quantize_model_weights(Simple_Model, non_uni_quant, num_bit)

    # Evaluate the original model
    loss, original_accuracy = Simple_Model.evaluate(Testing_Data, Testing_Labels)
    print("Original Model Test Accuracy: {:.2f}".format(original_accuracy))

    # Evaluate the model with linear quantization
    simple_model_linear = replace_all_weights(Simple_Model, dequantized_weights_linear, actual_loss='sparse_categorical_crossentropy')
    loss, linear_accuracy = simple_model_linear.evaluate(Testing_Data, Testing_Labels)
    print("Linear Dequantized Model Test Accuracy: {:.2f}".format(linear_accuracy))

    # Evaluate the model with non-uniform quantization
    simple_model_non_uniform = replace_all_weights(Simple_Model, dequantized_weights_non_uniform, actual_loss='sparse_categorical_crossentropy')
    loss, non_uniform_accuracy = simple_model_non_uniform.evaluate(Testing_Data, Testing_Labels)
    print("Non-Uniform Dequantized Model Test Accuracy: {:.2f}".format(non_uniform_accuracy))

    # Save the models
    simple_model_linear.save(f'simple_model_linear_quantized_{num_bit}_bits.h5')
    simple_model_non_uniform.save(f'simple_model_non_uniform_quantized_{num_bit}_bits.h5')

    # Get the size of the models
    linear_size = os.path.getsize(f'simple_model_linear_quantized_{num_bit}_bits.h5')
    non_uniform_size = os.path.getsize(f'simple_model_non_uniform_quantized_{num_bit}_bits.h5')

    # Prepare the test data
    test_data = Testing_Data[:1]

    # Measure the inference time of the models
    original_model_time = measure_inference_time(Simple_Model, test_data)
    linear_quantized_time = measure_inference_time(simple_model_linear, test_data)
    non_uniform_quantized_time = measure_inference_time(simple_model_non_uniform, test_data)

    # Save the original model
    Simple_Model.save('Simple_model_original.h5')

    # Get the size of the original model
    original_size = os.path.getsize('Simple_model_original.h5')

     # Append the results to the DataFrame
    new_row = pd.DataFrame({'num_bits': [num_bit],
                            'Original': [round(original_accuracy, 4)],
                            'Linear': [round(linear_accuracy, 4)],
                            'Non_Uniform': [round(non_uniform_accuracy, 4)],
                            'Original_Size_MB': [round(original_size / (1024 * 1024), 4)],
                            'Linear_Size_MB': [round(linear_size / (1024 * 1024), 4)],
                            'Non_Uniform_Size_MB': [round(non_uniform_size / (1024 * 1024), 4)],
                            'Original_Time (s)': [round(original_model_time, 6)],
                            'Linear_Time (s)': [round(linear_quantized_time, 6)],
                            'Non_Uniform_Time (s)': [round(non_uniform_quantized_time, 6)]})

    
    Simple_results = pd.concat([Simple_results, new_row], ignore_index=True)
    
print(Simple_results)

Accuracies for 2 bit quantization
Original Model Test Accuracy: 0.97
Linear Dequantized Model Test Accuracy: 0.66
Non-Uniform Dequantized Model Test Accuracy: 0.40
Accuracies for 4 bit quantization
Original Model Test Accuracy: 0.97
Linear Dequantized Model Test Accuracy: 0.97
Non-Uniform Dequantized Model Test Accuracy: 0.97
Accuracies for 8 bit quantization
Original Model Test Accuracy: 0.97
Linear Dequantized Model Test Accuracy: 0.97
Non-Uniform Dequantized Model Test Accuracy: 0.97
Accuracies for 16 bit quantization
Original Model Test Accuracy: 0.97
Linear Dequantized Model Test Accuracy: 0.97
Non-Uniform Dequantized Model Test Accuracy: 0.97
  num_bits  Original  Linear  Non_Uniform  Original_Size_MB  Linear_Size_MB  \
0        2    0.9739  0.6598       0.4021            1.1893          0.4045   
1        4    0.9739  0.9710       0.9744            1.1893          0.4045   
2        8    0.9739  0.9732       0.9734            1.1893          0.4045   
3       16    0.9739  0.973

In [18]:
file_name = 'Accuracy_Simple_model.csv'
Simple_results.to_csv(file_name, encoding='utf-8', index=False)

## Quantizing with the Official Tensorflow Quantization Libraries

In [19]:
# Full 8-bit Integer Quantization
eight_bit_converter = tf.lite.TFLiteConverter.from_keras_model(Simple_Model)
eight_bit_converter.optimizations = [tf.lite.Optimize.DEFAULT]

tflite_quantized_model = eight_bit_converter.convert()

# Quantized Tensorflow Model Final Save in tflite Format
with open("Simple_quantized_model.tflite", "wb") as f:
    f.write(tflite_quantized_model)

## I now load the tensorflow quantized model and check it's size , inference time and accuracy

In [20]:
# Save the quantized TensorFlow Lite model
quantized_model_path = "Simple_quantized_model.tflite"
with open(quantized_model_path, "wb") as f:
    f.write(tflite_quantized_model)

# Get the size of the quantized model
quantized_model_size = os.path.getsize(quantized_model_path)

# Print the size of the quantized model in MB
print("Quantized Model Size: {:.2f} MB".format(quantized_model_size / (1024 * 1024)))

Quantized Model Size: 0.10 MB


**We can observe from the results above that the tensorflow quantized model is 0.1 MB whereas with our quantization the size was 0.4 MB. And these two values are still smaller than the original size of 1.19 MB**

In [21]:
# Measure the inference time of the Tensorflow Quantized model
quantized_original_model_time = measure_tflite_inference_time(quantized_model_path, test_data)
print("Quantized Model Average Inference Time: {:.6f} seconds".format(quantized_original_model_time))

Quantized Model Average Inference Time: 0.000018 seconds


**The TFLite quantized inference time is much less than our implementation i.e 0.000018 seconds**

In [22]:
accuracy_simple_quantized = evaluate_tflite_model_accuracy(quantized_model_path, Testing_Data, Testing_Labels)
print(f"Accuracy of the TFLite model: {accuracy_simple_quantized  * 100:.2f}%")

Accuracy of the TFLite model: 9.99%


**TFLite quantized model has a signifiacant degradation in accuracy here**

# Now we test our functions on a Large CNN Model

## Model to test our quantization functions

In [23]:
# Load and preprocess CIFAR-10 data
(Xtrain, Ytrain), (Xtest, Ytest) = cifar10.load_data()

#Normalization of the training and test values 
Xtrain = Xtrain.astype('float32') / 255
Xtest = Xtest.astype('float32') / 255

# Encoding the labels to vectors of one and zero
Ytrain = utils.to_categorical(Ytrain, 10)
Ytest = utils.to_categorical(Ytest, 10)


# Model for CIFAR-10 dataset
Large_CNN_Model = models.Sequential([
    layers.Input(shape=(32, 32, 3)),
    layers.Conv2D(32, kernel_size=(3, 3), padding='same'),
    layers.LeakyReLU(alpha=0.1),
    layers.BatchNormalization(),
    layers.Conv2D(32, kernel_size=(3, 3), padding='same'),
    layers.LeakyReLU(alpha=0.1),
    layers.BatchNormalization(),
    layers.MaxPooling2D(pool_size=(2, 2)),
    layers.Dropout(0.3),

    layers.Conv2D(64, kernel_size=(3, 3), padding='same'),
    layers.LeakyReLU(alpha=0.1),
    layers.BatchNormalization(),
    layers.Conv2D(64, kernel_size=(3, 3), padding='same'),
    layers.LeakyReLU(alpha=0.1),
    layers.BatchNormalization(),
    layers.MaxPooling2D(pool_size=(2, 2)),
    layers.Dropout(0.5),

    layers.Conv2D(128, kernel_size=(3, 3), padding='same'),
    layers.LeakyReLU(alpha=0.1),
    layers.BatchNormalization(),
    layers.Conv2D(128, kernel_size=(3, 3), padding='same'),
    layers.LeakyReLU(alpha=0.1),
    layers.BatchNormalization(),
    layers.MaxPooling2D(pool_size=(2, 2)),
    layers.Dropout(0.5),

    layers.Flatten(),
    layers.Dense(128),
    layers.LeakyReLU(alpha=0.1),
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    layers.Dense(10, activation='softmax')
])

# Compile the model with Adam optimizer
Large_CNN_Model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Model Training Part
batch_size = 32
epochs = 40

training_of_the_model = Large_CNN_Model.fit(
    x=Xtrain, y=Ytrain, batch_size=batch_size,
    epochs=epochs, validation_data=(Xtest, Ytest)
)

# Model Evaluation Part
score = Large_CNN_Model.evaluate(Xtest, Ytest, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test loss: 0.402946799993515
Test accuracy: 0.866100013256073


### Comparing the accuracies, inference time and file size of the Large CNN model using our quantization functions


In [24]:
Large_CNN_results = pd.DataFrame(columns=['num_bits', 'Original', 'Linear', 'Non_Uniform',
                                         'Original_Size_MB', 'Linear_Size_MB', 'Non_Uniform_Size_MB',
                                         'Original_Time (s)', 'Linear_Time (s)', 'Non_Uniform_Time (s)'])

num_bits = [2, 4, 8, 16]

for num_bit in num_bits:
    print(f"Accuracies for {num_bit} bit quantization")
    quantized_weights_linear, dequantized_weights_linear = quantize_model_weights(Large_CNN_Model, lin_quant, num_bit)
    quantized_weights_non_uniform, dequantized_weights_non_uniform = quantize_model_weights(Large_CNN_Model, non_uni_quant, num_bit)

    # Evaluate the original model
    loss, original_accuracy = Large_CNN_Model.evaluate(Xtest, Ytest)
    print("Original Model Test Accuracy: {:.2f}".format(original_accuracy))

    # Evaluate the model with linear quantization
    Large_model_linear = replace_all_weights(Large_CNN_Model, dequantized_weights_linear, actual_loss='categorical_crossentropy')
    loss, linear_accuracy = Large_model_linear.evaluate(Xtest, Ytest)
    print("Linear Dequantized Model Test Accuracy: {:.2f}".format(linear_accuracy))

    # Evaluate the model with non-uniform quantization
    Large_model_non_uniform = replace_all_weights(Large_CNN_Model, dequantized_weights_non_uniform, actual_loss='categorical_crossentropy')
    loss, non_uniform_accuracy = Large_model_non_uniform.evaluate(Xtest, Ytest)
    print("Non-Uniform Dequantized Model Test Accuracy: {:.2f}".format(non_uniform_accuracy))

    # Save the models
    Large_model_linear.save(f'Large_model_linear_quantized_{num_bit}_bits.h5')
    Large_model_non_uniform.save(f'Large_model_non_uniform_quantized_{num_bit}_bits.h5')

    # Get the size of the models
    linear_size = os.path.getsize(f'Large_model_linear_quantized_{num_bit}_bits.h5')
    non_uniform_size = os.path.getsize(f'Large_model_non_uniform_quantized_{num_bit}_bits.h5')

    # Prepare the test data
    test_data = Xtest[:1]

    # Measure the inference time of the models
    original_model_time = measure_inference_time(Large_CNN_Model, test_data)
    linear_quantized_time = measure_inference_time(Large_model_linear, test_data)
    non_uniform_quantized_time = measure_inference_time(Large_model_non_uniform, test_data)


    # Save the original model
    Large_CNN_Model.save('Large_CNN_model_original.h5')

    # Get the size of the original model
    original_size = os.path.getsize('Large_CNN_model_original.h5')
    # Append the results to the DataFrame
    new_row = pd.DataFrame({'num_bits': [num_bit],
                            'Original': [round(original_accuracy, 4)],
                            'Linear': [round(linear_accuracy, 4)],
                            'Non_Uniform': [round(non_uniform_accuracy, 4)],
                            'Original_Size_MB': [round(original_size / (1024 * 1024), 4)],
                            'Linear_Size_MB': [round(linear_size / (1024 * 1024), 4)],
                            'Non_Uniform_Size_MB': [round(non_uniform_size / (1024 * 1024), 4)],
                            'Original_Time (s)': [round(original_model_time, 6)],
                            'Linear_Time (s)': [round(linear_quantized_time, 6)],
                            'Non_Uniform_Time (s)': [round(non_uniform_quantized_time, 6)]})

    
    Large_CNN_results = pd.concat([Large_CNN_results, new_row], ignore_index=True)
    
print(Large_CNN_results)

# Print the size of the original model

print('Original Model Size: {:.2f} MB'.format(original_size / (1024 * 1024)))

Accuracies for 2 bit quantization
Original Model Test Accuracy: 0.87
Linear Dequantized Model Test Accuracy: 0.10
Non-Uniform Dequantized Model Test Accuracy: 0.13
Accuracies for 4 bit quantization
Original Model Test Accuracy: 0.87
Linear Dequantized Model Test Accuracy: 0.82
Non-Uniform Dequantized Model Test Accuracy: 0.84
Accuracies for 8 bit quantization
Original Model Test Accuracy: 0.87
Linear Dequantized Model Test Accuracy: 0.86
Non-Uniform Dequantized Model Test Accuracy: 0.87
Accuracies for 16 bit quantization
Original Model Test Accuracy: 0.87
Linear Dequantized Model Test Accuracy: 0.86
Non-Uniform Dequantized Model Test Accuracy: 0.87
  num_bits  Original  Linear  Non_Uniform  Original_Size_MB  Linear_Size_MB  \
0        2    0.8661  0.1008       0.1312            6.4538          2.1926   
1        4    0.8661  0.8237       0.8435            6.4538          2.1926   
2        8    0.8661  0.8633       0.8656            6.4538          2.1926   
3       16    0.8661  0.862

In [25]:
file_name = 'Accuracy_Large_CNN_model.csv'
Large_CNN_results.to_csv(file_name, encoding='utf-8', index=False)

## Quantizing the Large CNN model using TFLite

In [26]:
# Full 8-bit Integer Quantization
eight_bit_converter = tf.lite.TFLiteConverter.from_keras_model(Large_CNN_Model)
eight_bit_converter.optimizations = [tf.lite.Optimize.DEFAULT]

tflite_quantized_model = eight_bit_converter.convert()

# Quantized Tensorflow Model Final Save in tflite Format
with open("Large_quantized_model.tflite", "wb") as f:
    f.write(tflite_quantized_model)



### Size, Inference Times and the Latency of the TFLite quantized model

In [27]:
# Save the quantized TensorFlow Lite model
quantized_model_path = "Large_quantized_model.tflite"
with open(quantized_model_path, "wb") as f:
    f.write(tflite_quantized_model)

# Get the size of the quantized model
quantized_model_size = os.path.getsize(quantized_model_path)

# Print the size of the quantized model in MB
print("Quantized Model Size: {:.2f} MB".format(quantized_model_size / (1024 * 1024)))

Quantized Model Size: 0.55 MB


In [28]:
# Measure the inference time of the Tensorflow Quantized model
quantized_original_model_time = measure_tflite_inference_time(quantized_model_path, test_data)
print("Quantized Model Average Inference Time: {:.6f} seconds".format(quantized_original_model_time))

Quantized Model Average Inference Time: 0.002209 seconds


In [29]:
accuracy_simple_quantized = evaluate_tflite_model_accuracy(quantized_model_path, Xtest, Ytest)
print(f"Accuracy of the TFLite model: {accuracy_simple_quantized  * 100:.2f}%")

Accuracy of the TFLite model: 86.59%


## Testing our functions on bigger models like ResNet 50

In [31]:
# Load and preprocess CIFAR-10 data
(Xtrain, Ytrain), (Xtest, Ytest) = tf.keras.datasets.cifar10.load_data()

#Normalization of the training and test values 
Xtrain = Xtrain.astype('float32') / 255
Xtest = Xtest.astype('float32') / 255

# One hot encoding the labels or we can use sparse_categorical_crossentropy as our loss function
Ytrain = utils.to_categorical(Ytrain, 10)
Ytest = utils.to_categorical(Ytest, 10)


# Training a ResNet50 model
ResNet_50_model = ResNet50(weights=None, include_top=True, input_shape=(32, 32, 3), classes=10)
ResNet_50_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

batch_size = 256

# Train the model
ResNet_50_model.fit(Xtrain, Ytrain, epochs=40,batch_size = batch_size,validation_data=(Xtest, Ytest))



Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7f0a60787f70>

### Quantized Accuracy using our funtions for ResNet 50

In [32]:
Resnet_results = pd.DataFrame(columns=['num_bits', 'Original', 'Linear', 'Non_Uniform',
                                         'Original_Size_MB', 'Linear_Size_MB', 'Non_Uniform_Size_MB',
                                         'Original_Time (s)', 'Linear_Time (s)', 'Non_Uniform_Time (s)'])

num_bits = [2, 4, 8]

for num_bit in num_bits:
    print(f"Accuracies for {num_bit} bit quantization")
    quantized_weights_linear, dequantized_weights_linear = quantize_model_weights(ResNet_50_model, lin_quant, num_bit)
    quantized_weights_non_uniform, dequantized_weights_non_uniform = quantize_model_weights(ResNet_50_model, non_uni_quant, num_bit)

    # Evaluate the original model
    loss, original_accuracy = ResNet_50_model.evaluate(Xtest, Ytest)
    print("Original Model Test Accuracy: {:.2f}".format(original_accuracy))

    # Evaluate the model with linear quantization
    Resnet_model_linear = replace_all_weights(ResNet_50_model, dequantized_weights_linear, actual_loss='categorical_crossentropy')
    loss, linear_accuracy = Resnet_model_linear.evaluate(Xtest, Ytest)
    print("Linear Dequantized Model Test Accuracy: {:.2f}".format(linear_accuracy))

    # Evaluate the model with non-uniform quantization
    Resnet_model_non_uniform = replace_all_weights(ResNet_50_model, dequantized_weights_non_uniform, actual_loss='categorical_crossentropy')
    loss, non_uniform_accuracy = Resnet_model_non_uniform.evaluate(Xtest, Ytest)
    print("Non-Uniform Dequantized Model Test Accuracy: {:.2f}".format(non_uniform_accuracy))

    # Save the models
    Resnet_model_linear.save(f'Large_model_linear_quantized_{num_bit}_bits.h5')
    Resnet_model_non_uniform.save(f'Large_model_non_uniform_quantized_{num_bit}_bits.h5')

    # Get the size of the models
    linear_size = os.path.getsize(f'Large_model_linear_quantized_{num_bit}_bits.h5')
    non_uniform_size = os.path.getsize(f'Large_model_non_uniform_quantized_{num_bit}_bits.h5')

    # Prepare the test data
    test_data = Xtest[:1]

    # Measure the inference time of the models
    original_model_time = measure_inference_time(ResNet_50_model, test_data)
    linear_quantized_time = measure_inference_time(Resnet_model_linear, test_data)
    non_uniform_quantized_time = measure_inference_time(Resnet_model_non_uniform, test_data)
    # Save the original model
    ResNet_50_model.save('Resnet_model_original.h5')

    # Get the size of the original model
    original_size = os.path.getsize('Resnet_model_original.h5')
    # Append the results to the DataFrame
    new_row = pd.DataFrame({'num_bits': [num_bit],
                            'Original': [round(original_accuracy, 4)],
                            'Linear': [round(linear_accuracy, 4)],
                            'Non_Uniform': [round(non_uniform_accuracy, 4)],
                            'Original_Size_MB': [round(original_size / (1024 * 1024), 4)],
                            'Linear_Size_MB': [round(linear_size / (1024 * 1024), 4)],
                            'Non_Uniform_Size_MB': [round(non_uniform_size / (1024 * 1024), 4)],
                            'Original_Time (s)': [round(original_model_time, 6)],
                            'Linear_Time (s)': [round(linear_quantized_time, 6)],
                            'Non_Uniform_Time (s)': [round(non_uniform_quantized_time, 6)]})

    
    Resnet_results = pd.concat([Resnet_results, new_row], ignore_index=True)


print(Resnet_results)

# Print the size of the original model

print('Original Model Size: {:.2f} MB'.format(original_size / (1024 * 1024)))



Accuracies for 2 bit quantization
Original Model Test Accuracy: 0.67
Linear Dequantized Model Test Accuracy: 0.10
Non-Uniform Dequantized Model Test Accuracy: 0.10
Accuracies for 4 bit quantization
Original Model Test Accuracy: 0.67
Linear Dequantized Model Test Accuracy: 0.58
Non-Uniform Dequantized Model Test Accuracy: 0.64
Accuracies for 8 bit quantization
Original Model Test Accuracy: 0.67
Linear Dequantized Model Test Accuracy: 0.67
Non-Uniform Dequantized Model Test Accuracy: 0.67
  num_bits  Original  Linear  Non_Uniform  Original_Size_MB  Linear_Size_MB  \
0        2    0.6724  0.1000       0.1000          270.6505         90.5555   
1        4    0.6724  0.5758       0.6363          270.6505         90.5555   
2        8    0.6724  0.6693       0.6712          270.6505         90.5555   

   Non_Uniform_Size_MB  Original_Time (s)  Linear_Time (s)  \
0              90.5555           0.061806         0.061334   
1              90.5555           0.074481         0.061416   
2    

In [33]:
file_name = 'Accuracy_Resnet_model.csv'
Resnet_results.to_csv(file_name, encoding='utf-8', index=False)

## Quantizing the ResNet model using TFLite

In [34]:

# Load your Keras model
resnet_model = tf.keras.models.load_model("Resnet_model_original.h5")

# 8-bit quantized model TFLite
eight_converter = tf.lite.TFLiteConverter.from_keras_model(resnet_model)
eight_converter.optimizations = [tf.lite.Optimize.DEFAULT]

tflite_quantized_model = eight_converter.convert()

# Save the quantized TensorFlow Lite model
with open("Resnet_quantized_model.tflite", "wb") as f:
    f.write(tflite_quantized_model)




### Size, Inference Times and the Accuracy of the TFLite quantized model

In [35]:

# Save the quantized TensorFlow Lite model
quantized_model_path = "Resnet_quantized_model.tflite"
with open(quantized_model_path, "wb") as f:
    f.write(tflite_quantized_model)

# Get the size of the quantized model
quantized_model_size = os.path.getsize(quantized_model_path)

# Print the size of the quantized model in MB
print("Quantized Model Size: {:.2f} MB".format(quantized_model_size / (1024 * 1024)))


Quantized Model Size: 22.84 MB


In [36]:
# Inference Time of the TFLite Quantized model
quantized_resnet_model_time = measure_tflite_inference_time(quantized_model_path, test_data)
print("Quantized Model Average Inference Time: {:.6f} seconds".format(quantized_resnet_model_time))

Quantized Model Average Inference Time: 0.011498 seconds


In [37]:
# Accuracy for the tflite model
accuracy_resnet_quantized = evaluate_tflite_model_accuracy(quantized_model_path, Xtest, Ytest)
print(f"Accuracy of the TFLite model: {accuracy_resnet_quantized  * 100:.2f}%")

Accuracy of the TFLite model: 67.23%
