### Conversion from tensorflow graphdefs to TensorFlow Lite's flat buffer format 
(see [notebook](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/lite/g3doc/performance/post_training_quant.ipynb#scrollTo=BTC1rDAuei_1))

Quantize by converting 32-bit floats to more efficient 8-bit integers.

Efficient model format, using a FlatBuffer that is optimized for small size and portability.

### Train a TensorFlow model

In [2]:
import logging
logging.getLogger("tensorflow").setLevel(logging.DEBUG)

import tensorflow as tf
from tensorflow import keras
import numpy as np
import pathlib

# Load MNIST dataset
mnist = keras.datasets.mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Normalize the input image so that each pixel value is between 0 to 1.
train_images = train_images / 255.0
test_images = test_images / 255.0

# Define the model architecture
model = keras.Sequential([
  keras.layers.InputLayer(input_shape=(28, 28)),
  keras.layers.Reshape(target_shape=(28, 28, 1)),
  keras.layers.Conv2D(filters=12, kernel_size=(3, 3), activation=tf.nn.relu),
  keras.layers.MaxPooling2D(pool_size=(2, 2)),
  keras.layers.Flatten(),
  keras.layers.Dense(10)
])

# Train the digit classification model
model.compile(optimizer='adam',
              loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
model.fit(
  train_images,
  train_labels,
  epochs=1,
  validation_data=(test_images, test_labels)
)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


<tensorflow.python.keras.callbacks.History at 0x7f18e7fb8b70>

Optimizations:

- **Quantization** can reduce both the size of model and the time required for inference. For many models, there is only a minimal loss of accuracy.

There are several post-training quantization options to choose from. Here is a summary table of the choices and the benefits they provide:

1. Dynamic range quantization: 4x smaller, 2x-3x speedup	CPU

2. Full integer quantization: 4x smaller, 3x+ speedup	CPU, Edge TPU, 

3. Microcontrollers: Float16 quantization	2x smaller, GPU acceleration	CPU, GPU

### Conversion

In [3]:
import tensorflow as tf

#from Saved Model
'''
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
tflite_model = converter.convert()
open("converted_model.tflite", "wb").write(tflite_model)
'''

#from Session
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert() 

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: /tmp/tmpn4qzu9wv/assets


In [5]:
#Convert to TF Lite without quantization
tflite_models_dir = pathlib.Path("/tmp/mnist_tflite_models/")
tflite_models_dir.mkdir(exist_ok=True, parents=True)
tflite_model_file = tflite_models_dir/"mnist_model.tflite"
tflite_model_file.write_bytes(tflite_model)

84452

### Dynamic range quantization
quantizes only the weights from floating point to integer, which has 8-bits of precision (see [details](https://www.tensorflow.org/lite/performance/post_training_quantization#dynamic_range_quantization))

In [6]:
#To quantize the model on export, set the optimizations flag to optimize for size
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_quant_model = converter.convert()
tflite_model_quant_file = tflite_models_dir/"mnist_model_quant.tflite"
tflite_model_quant_file.write_bytes(tflite_quant_model)

INFO:tensorflow:Assets written to: /tmp/tmp2plb7ffn/assets


INFO:tensorflow:Assets written to: /tmp/tmp2plb7ffn/assets


23840

### Full integer quantization
Further latency improvements, reductions in peak memory usage, and compatibility with integer only hardware devices or accelerators by making sure all model math is integer quantized. (see [details](https://www.tensorflow.org/lite/performance/post_training_quantization#full_integer_quantization))

In [10]:
optimize_lite_model = True  
num_calibration_steps = 60  
representative_dataset = None

In [12]:
#For full integer quantization, you need to measure the dynamic range of activations and inputs by supplying sample input data to the converter. 
#See the representative_dataset_gen()

#to fully integer quantize a model, but use float operators when they don't have an integer implementation (to ensure conversion occurs smoothly):
'''
def representative_dataset_gen():
  for _ in range(num_calibration_steps):
    # Get sample input data as a numpy array in a method of your choosing.
    yield [input]
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_dataset_gen
tflite_quant_model = converter.convert()
'''

#integer only to ensure compatibility with integer only devices (such as 8-bit microcontrollers) and accelerators (such as the Coral Edge TPU):
def representative_dataset_gen():
  for _ in range(num_calibration_steps):
    # Get sample input data as a numpy array in a method of your choosing.
    yield [input]
converter.optimizations = [tf.lite.Optimize.DEFAULT]
#converter.representative_dataset = representative_dataset_gen #optional
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8  # or tf.uint8
converter.inference_output_type = tf.int8  # or tf.uint8
tflite_quant_model = converter.convert() #NOTE: The converter will throw an error if it encounters an operation it cannot currently quantize.

INFO:tensorflow:Assets written to: /tmp/tmpogjso5ll/assets


INFO:tensorflow:Assets written to: /tmp/tmpogjso5ll/assets


AttributeError: ignored

### Float16 quantization

You can reduce the size of a floating point model by quantizing the weights to float16. It reduces model size by up to half and causes minimal loss in accuracy.


In [13]:
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_types = [tf.float16]
tflite_quant_model = converter.convert()

INFO:tensorflow:Assets written to: /tmp/tmp3ys4n80v/assets


INFO:tensorflow:Assets written to: /tmp/tmp3ys4n80v/assets


ValueError: ignored

### Inference

In [14]:
#load model
interpreter = tf.lite.Interpreter(model_path=str(tflite_model_file))
interpreter.allocate_tensors()

In [16]:
#load quantized model
interpreter_quant = tf.lite.Interpreter(model_path=str(tflite_model_quant_file))
interpreter_quant.allocate_tensors()

In [17]:
test_image = np.expand_dims(test_images[0], axis=0).astype(np.float32)

input_index = interpreter.get_input_details()[0]["index"]
output_index = interpreter.get_output_details()[0]["index"]

interpreter.set_tensor(input_index, test_image)
interpreter.invoke()
predictions = interpreter.get_tensor(output_index)

In [18]:
# A helper function to evaluate the TF Lite model using "test" dataset.
def evaluate_model(interpreter):
  input_index = interpreter.get_input_details()[0]["index"]
  output_index = interpreter.get_output_details()[0]["index"]

  # Run predictions on every image in the "test" dataset.
  prediction_digits = []
  for test_image in test_images:
    # Pre-processing: add batch dimension and convert to float32 to match with
    # the model's input data format.
    test_image = np.expand_dims(test_image, axis=0).astype(np.float32)
    interpreter.set_tensor(input_index, test_image)

    # Run inference.
    interpreter.invoke()

    # Post-processing: remove batch dimension and find the digit with highest
    # probability.
    output = interpreter.tensor(output_index)
    digit = np.argmax(output()[0])
    prediction_digits.append(digit)

  # Compare prediction results with ground truth labels to calculate accuracy.
  accurate_count = 0
  for index in range(len(prediction_digits)):
    if prediction_digits[index] == test_labels[index]:
      accurate_count += 1
  accuracy = accurate_count * 1.0 / len(prediction_digits)

  return accuracy

print("evaluation on the model:", evaluate_model(interpreter))
print("evaluation on the dynamic range quantized model: ", evaluate_model(interpreter_quant))

evaluation on the model: 0.9645
evaluation on the dynamic range quantized model:  0.9646
