In [1]:
%load_ext autotime

!nvidia-smi -L

import os

os.environ['CUDA_VISIBLE_DEVICES']='2' # No GPU

GPU 0: NVIDIA GeForce RTX 3090 (UUID: GPU-3b49e2b8-87f0-c515-798b-3492ec05a183)
GPU 1: NVIDIA GeForce GTX 1080 Ti (UUID: GPU-07628ed7-6ef8-fd67-7d03-cb6a89f72de4)


# Post training

In [2]:
import numpy as np, tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.applications.vgg16 import preprocess_input
from tqdm.notebook import tqdm

# Experiment tracking with mlflow
import mlflow
import mlflow.tensorflow as mltf
from pathlib import Path
import time, multiprocessing

In [3]:
tf.config.get_visible_devices('GPU')

[]

In [4]:
fmd = "./mlflow/artifacts/1/38162c8d183043f1bfddf866e1ee9175/artifacts/model/data/model" #final model directory


targetMap='''aegypti landing
aegypti smashed
albopictus landing
albopictus smashed
Culex landing
Culex smashed'''.split('\n')

test_path = "./dataset/data_splitting/Pred/"
IMG_SIZE = (224, 224)
BATCH_SIZE = 32



gen = ImageDataGenerator(
    preprocessing_function=preprocess_input
)

test = gen.flow_from_directory(test_path, target_size=IMG_SIZE,
                                      classes=targetMap, class_mode='categorical', batch_size=BATCH_SIZE)

Found 3600 images belonging to 6 classes.


# No Optimization

In [5]:
def get_dir_size(directory):
    root_dir=Path(".")
    size = sum(f.stat().st_size for f in root_dir.glob(directory+'/**/*') if f.is_file())
    return f"Size in MB: {size // (1024*1024)}"

In [6]:
get_dir_size(fmd)

'Size in MB: 56'

In [7]:
model = keras.models.load_model(fmd)

In [8]:
model.evaluate(test, verbose=1) # CPU load, around 90%



[0.20030760765075684, 0.9466666579246521]

## Dynamic Range quantization

In [9]:
converter = tf.lite.TFLiteConverter.from_saved_model(fmd) #from_keras_model
converter.optimizations = [tf.lite.Optimize.DEFAULT]
drq_model = converter.convert()

In [10]:
drq_dir = Path("./optimized/drq")
drq_dir.mkdir(exist_ok=True, parents=True)
drq_file = drq_dir/"mosqueto_quant.tflite"
drq_file.write_bytes(drq_model);

get_dir_size("./optimized/drq/")

'Size in MB: 9'

In [11]:
#load model, let see load time
drq_model = tf.lite.Interpreter(model_path="./optimized/drq/mosqueto_quant.tflite", 
                                num_threads=multiprocessing.cpu_count())

input_index = drq_model.get_input_details()[0]["index"] 
output_index = drq_model.get_output_details()[0]["index"]

# there is a problem, tflite by default inference 1 image at a time shape(1, 224, 224, 3)

drq_model.resize_tensor_input(input_index, [BATCH_SIZE, 224, 224, 3]); 
drq_model.allocate_tensors() # from 1.22sec down to 40.1ms

In [12]:
# Now it can accept batch data
drq_model.get_input_details()

[{'name': 'serving_default_input_2:0',
  'index': 0,
  'shape': array([ 32, 224, 224,   3], dtype=int32),
  'shape_signature': array([ -1, 224, 224,   3], dtype=int32),
  'dtype': numpy.float32,
  'quantization': (0.0, 0),
  'quantization_parameters': {'scales': array([], dtype=float32),
   'zero_points': array([], dtype=int32),
   'quantized_dimension': 0},
  'sparsity_parameters': {}}]

# This is a single-core predictor (By default)

> CPU load, around 65% (All cores)  
> the way you load data, can be a bottleneck  

In [13]:
preds = []
targets = []
counter = 0
for test_imgs, test_labels in tqdm(test):
    if counter == 113: #3600 images / 32 batch_size = 112.5
        break
    if test_imgs.shape[0] == 16: # last batch is of size 16
        test_imgs= np.concatenate((test_imgs, test_imgs))
        test_labels= np.concatenate((test_labels, test_labels))
    # Pre-processing: add batch dimension and convert to float32 to match with
    # the model's input data format.
    drq_model.set_tensor(input_index, test_imgs)
    
    # Run inference
    drq_model.invoke()
    
    # Post-processing: remove batch dimension and find the digit with highest
    # probability.
    output = drq_model.tensor(output_index)
    preds.append(np.argmax(output(), axis=1))
    targets.append(np.argmax(test_labels, axis=1))
    counter+=1

  0%|          | 0/113 [00:00<?, ?it/s]

In [14]:
preds[-1][16:] = 1
targets[-1][16:] = 0
f"{np.sum(np.array(preds)==np.array(targets)) / test.n :.4f}" # which is about 0.001 drop in accuracy

'0.9456'

### Don't compare the time with model.evalute() cell (Why ?)

[Why is TensorFlow Lite slower than TensorFlow on desktop?](https://stackoverflow.com/questions/54093424/why-is-tensorflow-lite-slower-than-tensorflow-on-desktop)

In [15]:
def representative_dataset():
    for _ in range(5):
        imgs, _ = test.next()
        yield [imgs]

In [16]:
converter = tf.lite.TFLiteConverter.from_saved_model(fmd)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_dataset

# No float fallback some Edge devices can't handle float operations
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8  # or tf.uint8
converter.inference_output_type = tf.int8  # or tf.uint8

fInt_model = converter.convert()

In [17]:
fInt_dir = Path("./optimized/fInt")
fInt_dir.mkdir(exist_ok=True, parents=True)
fInt_file = fInt_dir/"mosqueto_quant.tflite"
fInt_file.write_bytes(fInt_model);

get_dir_size("./optimized/fInt/")

'Size in MB: 9'

In [18]:
#load model, let see load time
fInt_model = tf.lite.Interpreter(model_path="./optimized/fInt/mosqueto_quant.tflite", 
                                num_threads=multiprocessing.cpu_count())

input_index = fInt_model.get_input_details()[0]["index"] 
output_index = fInt_model.get_output_details()[0]["index"]

# there is a problem, tflite by default inference 1 image at a time shape(1, 224, 224, 3)

fInt_model.resize_tensor_input(input_index, [BATCH_SIZE, 224, 224, 3]); 
fInt_model.allocate_tensors() # from 1.22sec down to 4.1ms

> CPU load, around 80% (All cores)  

In [19]:
preds = []
targets = []
counter = 0
for test_imgs, test_labels in tqdm(test):
    test_imgs = test_imgs.astype('int8')
    test_labels = test_labels.astype('int8')
    if counter == 113: #3600 images / 32 batch_size = 112.5
        break
    if test_imgs.shape[0] == 16: # last batch is of size 16
        test_imgs= np.concatenate((test_imgs, test_imgs))
        test_labels= np.concatenate((test_labels, test_labels))
    # Pre-processing: add batch dimension and convert to float32 to match with
    # the model's input data format.
    fInt_model.set_tensor(input_index, test_imgs)
    
    # Run inference
    fInt_model.invoke()
    
    # Post-processing: remove batch dimension and find the digit with highest
    # probability.
    output = fInt_model.tensor(output_index)
    preds.append(np.argmax(output(), axis=1))
    targets.append(np.argmax(test_labels, axis=1))
    counter+=1

  0%|          | 0/113 [00:00<?, ?it/s]

In [20]:
preds[-1][16:] = 1
targets[-1][16:] = 0
f"{np.sum(np.array(preds)==np.array(targets)) / test.n :.4f}" # which is about 0.8 drop in accuracy 
# (previous try was less than 4%)

'0.8600'

# For Float 16 :

```
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_types = [tf.float16]
tflite_quant_model = converter.convert()
```