### Quantized model benchmarking

In [19]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
import sys 
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.mobilenet import preprocess_input
from sklearn.metrics import classification_report, confusion_matrix

In [21]:
# configuration parameters 
TEST_DATA_DIR = '/Users/sanchit/Documents/Projects/Datasets/animals/test/'
MODEL_PATH = "./models/mobilenet.h5"
TFLITE_MODEL_DIR = "./models/tflite/"
QUANT_TYPE = "both_int8" # no_quant, w_int8, w_fp16, both_int8, both_fp16
TEST_SAMPLES = 450
NUM_CLASSES = 3
IMG_WIDTH, IMG_HEIGHT = 224, 224
BATCH_SIZE = 64
LABELS = ["cats", "dogs", "panda"]
QUANT_NAME_MAP = {"no_quant": "no quantization", "w_int8": "weights 8-bit INT quantized", 
                  "w_fp16": "weights 16-bit FP quantized", "both_int8": "both weights and activations INT8 quantized", 
                 "both_fp16": "both weights and activations FP-16 quantized"}

### Load quantized model according to the type

In [22]:
if QUANT_TYPE == "no_quant":
    # model without any quantization
    print(f"interpreter for {QUANT_NAME_MAP[QUANT_TYPE]} loading ...")
    interpret = tf.lite.Interpreter(model_path = TFLITE_MODEL_DIR + "mobilenet_no_quant.tflite")
    interpret.allocate_tensors() # allocate memory to the model
    
elif QUANT_TYPE == "w_int8":
    # model with weights INT8 quantization
    print(f"interpreter for {QUANT_NAME_MAP[QUANT_TYPE]} loading ...")
    interpret = tf.lite.Interpreter(model_path = TFLITE_MODEL_DIR + "mobilenet_weights_int8_quant.tflite")
    interpret.allocate_tensors() # allocate memory to the model
    
elif QUANT_TYPE == "w_fp16":
    # model with weights FP16 quantization 
    print(f"interpreter for {QUANT_NAME_MAP[QUANT_TYPE]} loading ...")
    interpret = tf.lite.Interpreter(model_path = TFLITE_MODEL_DIR + "mobilenet_weights_float16_quant.tflite")
    interpret.allocate_tensors() # allocate memory to the model
    
elif QUANT_TYPE == "both_int8":
    # model with both weights and activations INT8 quantization 
    print(f"interpreter for {QUANT_NAME_MAP[QUANT_TYPE]} loading ...")
    interpret = tf.lite.Interpreter(model_path = TFLITE_MODEL_DIR + "mobilenet_both_int8_quant.tflite")
    interpret.allocate_tensors() # allocate memory to the model
    
elif QUANT_TYPE == "both_fp16":
    # model with both weights and activations INT8 quantization 
    print(f"interpreter for {QUANT_NAME_MAP[QUANT_TYPE]} loading ...")
    interpret = tf.lite.Interpreter(model_path = TFLITE_MODEL_DIR + "mobilenet_both_fp16_quant.tflite")
    interpret.allocate_tensors() # allocate memory to the model

else:
    print(f"Wrong quantization type has been chosen for {QUANT_NAME_MAP[QUANT_TYPE]}")
    sys.exit(0)

interpreter for both weights and activations INT8 quantized loading ...


In [23]:
# get indices of input and output tensors for each model 
input_ind = interpret.get_input_details()[0]["index"]
out_ind   = interpret.get_output_details()[0]["index"]

#input_ind_w_int8 = interpret_weights_int8.get_input_details()[0]["index"]
#out_ind_w_int8   = interpret_weights_int8.get_output_details()[0]["index"]

#input_ind_w_fp16 = interpret_weights_fp16.get_input_details()[0]["index"]
#out_ind_w_fp16   = interpret_weights_fp16.get_output_details()[0]["index"]

### Create test generator

In [24]:
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
test_generator = test_datagen.flow_from_directory(
    TEST_DATA_DIR,
    target_size=(IMG_WIDTH, IMG_HEIGHT),
    batch_size=1,
    shuffle=False,
    class_mode='categorical')

Found 450 images belonging to 3 classes.


References for using interpreter: 
- (check evaluate the models): https://www.tensorflow.org/lite/performance/post_training_float16_quant 
- it uses batch generator and batch predictions (TODO later): https://thinkmobile.dev/testing-tensorflow-lite-image-classification-model/ 
- TinyML book chapter on Interpreter

In [25]:
# save the predicted class label (highest probability one) in a list
print(f"computing results for {QUANT_NAME_MAP[QUANT_TYPE]} ... \n")
results = []
accuracy_count = 0
for i in range(TEST_SAMPLES): 
    
    print(f"computing results for {i}th image ...")
    
    # generate a batch of images 
    test_image = test_generator.next() 
    
    # set the input image to the input index 
    interpret.set_tensor(input_ind, test_image[0]) 
    
    # run the inference 
    interpret.invoke() 
    
    # read the predictions from the output tensor
    predictions = interpret.tensor(out_ind) # or, get_tensor(out_ind)
    
    # get the highest predicted class
    pred_class = np.argmax(predictions()[0])
    
    #print("predicted class: ", pred_class, " and actual class: ", test_generator.classes[i])
    
    results.append(pred_class)
    
    if pred_class == test_generator.classes[i]:
        accuracy_count += 1 

computing results for both weights and activations INT8 quantized ... 

computing results for 0th image ...
computing results for 1th image ...
computing results for 2th image ...
computing results for 3th image ...
computing results for 4th image ...
computing results for 5th image ...
computing results for 6th image ...
computing results for 7th image ...
computing results for 8th image ...
computing results for 9th image ...
computing results for 10th image ...
computing results for 11th image ...
computing results for 12th image ...
computing results for 13th image ...
computing results for 14th image ...
computing results for 15th image ...
computing results for 16th image ...
computing results for 17th image ...
computing results for 18th image ...
computing results for 19th image ...
computing results for 20th image ...
computing results for 21th image ...
computing results for 22th image ...
computing results for 23th image ...
computing results for 24th image ...
computing res

computing results for 217th image ...
computing results for 218th image ...
computing results for 219th image ...
computing results for 220th image ...
computing results for 221th image ...
computing results for 222th image ...
computing results for 223th image ...
computing results for 224th image ...
computing results for 225th image ...
computing results for 226th image ...
computing results for 227th image ...
computing results for 228th image ...
computing results for 229th image ...
computing results for 230th image ...
computing results for 231th image ...
computing results for 232th image ...
computing results for 233th image ...
computing results for 234th image ...
computing results for 235th image ...
computing results for 236th image ...
computing results for 237th image ...
computing results for 238th image ...
computing results for 239th image ...
computing results for 240th image ...
computing results for 241th image ...
computing results for 242th image ...
computing re

computing results for 433th image ...
computing results for 434th image ...
computing results for 435th image ...
computing results for 436th image ...
computing results for 437th image ...
computing results for 438th image ...
computing results for 439th image ...
computing results for 440th image ...
computing results for 441th image ...
computing results for 442th image ...
computing results for 443th image ...
computing results for 444th image ...
computing results for 445th image ...
computing results for 446th image ...
computing results for 447th image ...
computing results for 448th image ...
computing results for 449th image ...


In [28]:
# compute the accuracy percentage
print(f"accuracy percentage for {QUANT_NAME_MAP[QUANT_TYPE]}: {round((accuracy_count / TEST_SAMPLES) * 100, 3)}% \n")

accuracy percentage for both weights and activations INT8 quantized: 97.556% 



In [29]:
# Plot confusion matrix, classification report
print("-"*50)
print(f"Confusion matrix for {QUANT_NAME_MAP[QUANT_TYPE]}: \n")
print(confusion_matrix(y_true=test_generator.classes, y_pred=results))
print("-"*50)
print(f"Classification report for {QUANT_NAME_MAP[QUANT_TYPE]}: \n")
print(classification_report(y_true=test_generator.classes, y_pred=results, target_names=LABELS))
print("-"*50)

--------------------------------------------------
Confusion matrix for both weights and activations INT8 quantized: 

[[147   3   0]
 [  8 142   0]
 [  0   0 150]]
--------------------------------------------------
Classification report for both weights and activations INT8 quantized: 

              precision    recall  f1-score   support

        cats       0.95      0.98      0.96       150
        dogs       0.98      0.95      0.96       150
       panda       1.00      1.00      1.00       150

    accuracy                           0.98       450
   macro avg       0.98      0.98      0.98       450
weighted avg       0.98      0.98      0.98       450

--------------------------------------------------
