### Quantized model benchmarking

In [1]:
# Enable IPython's autoreload extension so edits to imported modules are
# picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys 
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.mobilenet import preprocess_input
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
# ---------------------------------------------------------------------------
# Configuration: everything a reader may want to tune lives in this cell.
# ---------------------------------------------------------------------------

# -- Paths --
# NOTE(review): absolute, machine-specific path; adjust it before running on
# another machine.
TEST_DATA_DIR = '/Users/sanchit/Documents/Projects/Datasets/animals/test/'
MODEL_PATH = "./models/mobilenet.h5"
TFLITE_MODEL_DIR = "./models/tflite/"

# -- Quantization variant to benchmark --
# Human-readable description for each supported quantization type.
QUANT_NAME_MAP = {
    "no_quant": "no quantization",
    "w_int8": "weights 8-bit INT quantized",
    "w_fp16": "weights 16-bit FP quantized",
}
QUANT_TYPE = "w_fp16"  # one of: no_quant, w_int8 or w_fp16

# -- Dataset --
LABELS = ["cats", "dogs", "panda"]
NUM_CLASSES = 3
TEST_SAMPLES = 450  # number of test images pushed through the interpreter

# -- Input geometry / batching --
IMG_WIDTH = 224
IMG_HEIGHT = 224
BATCH_SIZE = 64

### Load the TFLite model for the selected quantization type

In [4]:
# Build the TFLite interpreter for the quantization variant selected by
# QUANT_TYPE.

# TFLite model file (inside TFLITE_MODEL_DIR) for each supported type.
TFLITE_MODEL_FILES = {
    "no_quant": "mobilenet_no_quant.tflite",               # model without any quantization
    "w_int8": "mobilenet_weights_int8_quant.tflite",       # weights INT8 quantization
    "w_fp16": "mobilenet_weights_float16_quant.tflite",    # weights FP16 quantization
}

if QUANT_TYPE not in TFLITE_MODEL_FILES:
    # Report the raw value: an unsupported type is not a key of QUANT_NAME_MAP,
    # so looking it up there would raise KeyError and hide the real problem.
    # Raising (instead of sys.exit(0)) stops the notebook with a visible error.
    raise ValueError(
        f"Wrong quantization type has been chosen: {QUANT_TYPE!r}; "
        f"expected one of {sorted(TFLITE_MODEL_FILES)}"
    )

print(f"interpreter for {QUANT_NAME_MAP[QUANT_TYPE]} loading ...")
interpret = tf.lite.Interpreter(model_path=TFLITE_MODEL_DIR + TFLITE_MODEL_FILES[QUANT_TYPE])
interpret.allocate_tensors()  # allocate memory to the model

interpreter for weights 16-bit FP quantized loading ...


In [5]:
# Look up the tensor indices used to feed an image into the interpreter and
# to read the prediction back out (first input / first output entry).
input_details = interpret.get_input_details()
output_details = interpret.get_output_details()

input_ind = input_details[0]["index"]
out_ind = output_details[0]["index"]

### Create test generator

In [6]:
# Stream the test images from disk one at a time, in a fixed order, applying
# the MobileNet preprocessing function to each image.
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
test_generator = test_datagen.flow_from_directory(
    TEST_DATA_DIR,
    # Keras expects target_size as (height, width); the original passed
    # (width, height), which only worked because both are equal.
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=1,       # the interpreter is fed a single image per invoke()
    shuffle=False,      # keep order aligned with test_generator.classes
    class_mode='categorical')

Found 450 images belonging to 3 classes.


References for using the TFLite interpreter:
- TensorFlow Lite post-training float16 quantization guide (see the "Evaluate the models" section): https://www.tensorflow.org/lite/performance/post_training_float16_quant 
- Example that uses a batch generator and batch predictions (TODO: adopt this approach later): https://thinkmobile.dev/testing-tensorflow-lite-image-classification-model/ 
- TinyML book, chapter on the Interpreter

In [7]:
# Run the interpreter over the test set one image at a time, saving the
# predicted class label (highest score) of every sample in `results` and
# counting how many predictions match the ground truth.
print(f"computing results for {QUANT_NAME_MAP[QUANT_TYPE]} ... \n")

# Rewind the generator so that sample i lines up with
# test_generator.classes[i], even if this cell is re-run or an earlier run
# was interrupted part-way through.
test_generator.reset()

results = []
accuracy_count = 0
for i in range(TEST_SAMPLES):

    # Fetch the next batch (a single image, since batch_size=1). The builtin
    # next() is used because the iterator's .next() method is not available
    # in newer Keras versions.
    test_batch = next(test_generator)

    # set the input image to the input index (element 0 of the batch is the image data)
    interpret.set_tensor(input_ind, test_batch[0])

    # run the inference
    interpret.invoke()

    # Read the predictions from the output tensor. get_tensor() returns a
    # copy, so no reference to the interpreter's internal buffer is kept.
    predictions = interpret.get_tensor(out_ind)

    # get the highest predicted class
    pred_class = np.argmax(predictions[0])

    results.append(pred_class)

    if pred_class == test_generator.classes[i]:
        accuracy_count += 1

computing results for weights 16-bit FP quantized ... 



In [8]:
# Share of correctly classified test samples, as a percentage rounded to
# three decimal places.
accuracy_pct = round((accuracy_count / TEST_SAMPLES) * 100, 3)
quant_name = QUANT_NAME_MAP[QUANT_TYPE]
print(f"accuracy percentage for {quant_name}: {accuracy_pct}% \n")

accuracy percentage for weights 16-bit FP quantized: 97.556% 



In [9]:
# Print the confusion matrix and the per-class classification report for
# the predictions collected in `results`.
separator = "-" * 50
quant_name = QUANT_NAME_MAP[QUANT_TYPE]
y_true = test_generator.classes

print(separator)
print(f"Confusion matrix for {quant_name}: \n")
print(confusion_matrix(y_true=y_true, y_pred=results))
print(separator)
print(f"Classification report for {quant_name}: \n")
print(classification_report(y_true=y_true, y_pred=results, target_names=LABELS))
print(separator)

--------------------------------------------------
Confusion matrix for weights 16-bit FP quantized: 

[[146   4   0]
 [  7 143   0]
 [  0   0 150]]
--------------------------------------------------
Classification report for weights 16-bit FP quantized: 

              precision    recall  f1-score   support

        cats       0.95      0.97      0.96       150
        dogs       0.97      0.95      0.96       150
       panda       1.00      1.00      1.00       150

    accuracy                           0.98       450
   macro avg       0.98      0.98      0.98       450
weighted avg       0.98      0.98      0.98       450

--------------------------------------------------
