In [1]:
# This example was tested on the following NVIDIA Docker image: nvcr.io/nvidia/tensorflow:20.08-tf2-py3
# !nvidia-docker run --shm-size 8g --ulimit memlock=-1 -it -v $PWD:/examples -v ~/.aws/:/.aws --network=host nvcr.io/nvidia/tensorflow:20.08-tf2-py3

In [2]:
# Prepare your imagenet TFRecord using the following:
# https://github.com/tensorflow/models/blob/archive/research/inception/inception/data/download_and_preprocess_imagenet.sh

In [3]:
# !pip install --upgrade pip -q
# !pip install matplotlib pandas -q

In [4]:
import os
import time
import shutil
import json
import time
import pandas as pd
import numpy as np
import requests
from functools import partial

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.python.saved_model import tag_constants, signature_constants
from tensorflow.python.framework import convert_to_constants

from tensorflow.compiler.tf2tensorrt.wrap_py_utils import get_linked_tensorrt_version
print(f"TensorRT version: {get_linked_tensorrt_version()}")
print(f"TensorFlow version: {tf.__version__}")

TensorRT version: (7, 1, 3)
TensorFlow version: 2.2.0


#### Download Keras Resnet50 model

In [5]:
def load_save_resnet50_model(saved_model_dir = 'resnet50_saved_model'):
    model = ResNet50(weights='imagenet')
    shutil.rmtree(saved_model_dir, ignore_errors=True)
    model.save(saved_model_dir, include_optimizer=False, save_format='tf')

In [6]:
saved_model_dir = 'resnet50_saved_model' 
load_save_resnet50_model(saved_model_dir)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: resnet50_saved_model/assets


In [7]:
def deserialize_image_record(record):
    feature_map = {'image/encoded': tf.io.FixedLenFeature([], tf.string, ''),
                  'image/class/label': tf.io.FixedLenFeature([1], tf.int64, -1),
                  'image/class/text': tf.io.FixedLenFeature([], tf.string, '')}
    obj = tf.io.parse_single_example(serialized=record, features=feature_map)
    imgdata = obj['image/encoded']
    label = tf.cast(obj['image/class/label'], tf.int32)   
    label_text = tf.cast(obj['image/class/text'], tf.string)   
    return imgdata, label, label_text

def val_preprocessing(record):
    imgdata, label, label_text = deserialize_image_record(record)
    label -= 1
    image = tf.io.decode_jpeg(imgdata, channels=3, 
                              fancy_upscaling=False, 
                              dct_method='INTEGER_FAST')

    shape = tf.shape(image)
    height = tf.cast(shape[0], tf.float32)
    width = tf.cast(shape[1], tf.float32)
    side = tf.cast(tf.convert_to_tensor(256, dtype=tf.int32), tf.float32)

    scale = tf.cond(tf.greater(height, width),
                  lambda: side / width,
                  lambda: side / height)
    
    new_height = tf.cast(tf.math.rint(height * scale), tf.int32)
    new_width = tf.cast(tf.math.rint(width * scale), tf.int32)
    
    image = tf.image.resize(image, [new_height, new_width], method='bicubic')
    image = tf.image.resize_with_crop_or_pad(image, 224, 224)
    
    image = tf.keras.applications.resnet50.preprocess_input(image)
    
    return image, label, label_text

def get_dataset(batch_size, use_cache=False):
    data_dir = '/examples/datasets/*'
    files = tf.io.gfile.glob(os.path.join(data_dir))
    dataset = tf.data.TFRecordDataset(files)
    
    dataset = dataset.map(map_func=val_preprocessing, num_parallel_calls=8)
    dataset = dataset.batch(batch_size=batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    dataset = dataset.repeat(count=1)
    
    if use_cache:
        shutil.rmtree('tfdatacache', ignore_errors=True)
        os.mkdir('tfdatacache')
        dataset = dataset.cache(f'./tfdatacache/imagenet_val')
    
    return dataset

#### Predict using GPU + Keras

In [8]:
%%time
model = tf.keras.models.load_model(saved_model_dir)
batch_size = 128
display_every = 5000
display_threshold = display_every

pred_labels = []
actual_labels = []
iter_times = []

# Get the tf.data.TFRecordDataset object for the ImageNet2012 validation dataset
dataset = get_dataset(batch_size)  

for i, (validation_ds, batch_labels, _) in enumerate(dataset):
    start_time = time.time()
    pred_prob_keras = model(validation_ds)
    iter_times.append(time.time() - start_time)
       
    actual_labels.extend(label for label_list in batch_labels.numpy() for label in label_list)
    pred_labels.extend(list(np.argmax(pred_prob_keras, axis=1)))
    
    if i*batch_size >= display_threshold:
        print(f'Images {i*batch_size}/50000. Average i/s {np.mean(batch_size/np.array(iter_times[-display_every:]))}')
        display_threshold+=display_every

iter_times = np.array(iter_times)
acc_keras_gpu = np.sum(np.array(actual_labels) == np.array(pred_labels))/len(actual_labels)

Images 5120/50000. Average i/s 1202.2403815597772
Images 10112/50000. Average i/s 1255.4613821650041
Images 15104/50000. Average i/s 1274.7982025827173
Images 20096/50000. Average i/s 1282.2455741806853
Images 25088/50000. Average i/s 1286.8815339138637
Images 30080/50000. Average i/s 1292.0688698725226
Images 35072/50000. Average i/s 1295.2494914105964
Images 40064/50000. Average i/s 1296.9882964127578
Images 45056/50000. Average i/s 1299.6527953350485
CPU times: user 9min 48s, sys: 59.1 s, total: 10min 47s
Wall time: 3min 13s


In [9]:
results = pd.DataFrame(columns = [f'keras_gpu_{batch_size}'])
results.loc['instance_type']           = [requests.get('http://169.254.169.254/latest/meta-data/instance-type').text]
results.loc['user_batch_size']         = [batch_size]
results.loc['accuracy']                = [acc_keras_gpu]
results.loc['prediction_time']         = [np.sum(iter_times)]
results.loc['images_per_sec_mean']     = [np.mean(batch_size / iter_times)]
results.loc['images_per_sec_std']      = [np.std(batch_size / iter_times, ddof=1)]
results.loc['latency_mean']            = [np.mean(iter_times) * 1000]
results.loc['latency_99th_percentile'] = [np.percentile(iter_times, q=99, interpolation="lower") * 1000]
results.loc['latency_median']          = [np.median(iter_times) * 1000]
results.loc['latency_min']             = [np.min(iter_times) * 1000]
display(results)

Unnamed: 0,keras_gpu_128
instance_type,g4dn.2xlarge
user_batch_size,128
accuracy,0.74956
prediction_time,46.5699
images_per_sec_mean,1300.16
images_per_sec_std,141.432
latency_mean,119.105
latency_99th_percentile,482.203
latency_median,97.7492
latency_min,73.1673


#### Predict using GPU + TensorRT

In [10]:
def build_fn(batch_size, dataset):
    for i, (build_image, _, _) in enumerate(dataset):
        if i > 1:
            break
        yield (build_image,)

def calibrate_fn(n_calib, batch_size, dataset):
    for i, (calib_image, _, _) in enumerate(dataset):
        if i > n_calib // batch_size:
            break
        yield (calib_image,)

def build_tensorrt_engine(precision, batch_size, dataset):
    from tensorflow.python.compiler.tensorrt import trt_convert as trt
    conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(precision_mode=precision.upper(),
                                                                   max_workspace_size_bytes=(1<<32),
                                                                   maximum_cached_engines=2)
    converter = trt.TrtGraphConverterV2(input_saved_model_dir='resnet50_saved_model',
                                        conversion_params=conversion_params)
    
    if precision.lower() == 'int8':
        n_calib=100
        converter.convert(calibration_input_fn=partial(calibrate_fn, n_calib, batch_size, 
                                                       dataset.shuffle(buffer_size=n_calib, reshuffle_each_iteration=True)))
    else:
        converter.convert()
        
    trt_compiled_model_dir = f'resnet50_trt_saved_models/resnet50_{precision}_{batch_size}'
    shutil.rmtree(trt_compiled_model_dir, ignore_errors=True)

    converter.build(input_fn=partial(build_fn, batch_size, dataset))
    converter.save(output_saved_model_dir=trt_compiled_model_dir)
    print(f'\nOptimized for {precision} and batch size {batch_size}, directory:{trt_compiled_model_dir}\n')
    return trt_compiled_model_dir

In [11]:
def trt_predict_benchmark(precision, batch_size, use_cache=False, display_every=100, warm_up=10):

    print('\n=======================================================')
    print(f'Benchmark results for precision: {precision}, batch size: {batch_size}')
    print('=======================================================\n')
    
    dataset = get_dataset(batch_size)
    
    # If caching is enabled, cache dataset for better i/o performance
    if use_cache:
        print('Caching dataset ...')
        start_time = time.time()
        for (img,_,_) in dataset:
            continue
        print(f'Finished caching {time.time() - start_time}')
    
    trt_compiled_model_dir = build_tensorrt_engine(precision, batch_size, dataset)
    saved_model_trt = tf.saved_model.load(trt_compiled_model_dir, tags=[tag_constants.SERVING])
    model_trt = saved_model_trt.signatures[signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
    
    pred_labels = []
    actual_labels = []
    iter_times = []
    
    display_every = 5000
    display_threshold = display_every
    initial_time = time.time()
    
    walltime_start = time.time()
    for i, (validation_ds, batch_labels, _) in enumerate(dataset):
        if i==0:
            for w in range(warm_up):
                _ = model_trt(validation_ds);
                
        start_time = time.time()
        trt_results = model_trt(validation_ds);
        iter_times.append(time.time() - start_time)
        
        actual_labels.extend(label for label_list in batch_labels.numpy() for label in label_list)
        pred_labels.extend(list(tf.argmax(trt_results['predictions'], axis=1).numpy()))
        if (i)*batch_size >= display_threshold:
            print(f'Images {(i)*batch_size}/50000. Average i/s {np.mean(batch_size/np.array(iter_times[-display_every:]))}')
            display_threshold+=display_every
    
    print(f'Wall time: {time.time() - walltime_start}')

    acc_trt = np.sum(np.array(actual_labels) == np.array(pred_labels))/len(actual_labels)
    iter_times = np.array(iter_times)
   
    results = pd.DataFrame(columns = [f'trt_{precision}_{batch_size}'])
    results.loc['instance_type']           = [requests.get('http://169.254.169.254/latest/meta-data/instance-type').text]
    results.loc['user_batch_size']         = [batch_size]
    results.loc['accuracy']                = [acc_trt]
    results.loc['prediction_time']         = [np.sum(iter_times)]
    results.loc['images_per_sec_mean']     = [np.mean(batch_size / iter_times)]
    results.loc['images_per_sec_std']      = [np.std(batch_size / iter_times, ddof=1)]
    results.loc['latency_mean']            = [np.mean(iter_times) * 1000]
    results.loc['latency_99th_percentile'] = [np.percentile(iter_times, q=99, interpolation="lower") * 1000]
    results.loc['latency_median']          = [np.median(iter_times) * 1000]
    results.loc['latency_min']             = [np.min(iter_times) * 1000]
    display(results.T)
   
    return results, iter_times

In [12]:
import itertools
bench_options = {
    'batch_size': [128],
    'precision': ['fp32', 'int8']
}

bname, bval = zip(*bench_options.items())
blist = [dict(zip(bname, h)) for h in itertools.product(*bval)]

print('Benchmark sweep combinations:')
for b in blist:
    print(b)

Benchmark sweep combinations:
{'batch_size': 128, 'precision': 'fp32'}
{'batch_size': 128, 'precision': 'int8'}


In [13]:
iter_ds = pd.DataFrame()
col_name = lambda boption: f'trt_{boption["precision"]}_{boption["batch_size"]}'

for boption in blist:
    res, it = trt_predict_benchmark(**boption)
    iter_ds = pd.concat([iter_ds, pd.DataFrame(it, columns=[col_name(boption)])], axis=1)
    results = pd.concat([results, res], axis=1)

display(results)


Benchmark results for precision: fp32, batch size: 128

INFO:tensorflow:Linked TensorRT version: (7, 1, 3)
INFO:tensorflow:Loaded TensorRT version: (7, 1, 3)
INFO:tensorflow:Assets written to: resnet50_trt_saved_models/resnet50_fp32_128/assets

Optimized for fp32 and batch size 128, directory:resnet50_trt_saved_models/resnet50_fp32_128

Images 5120/50000. Average i/s 7670.863059439031
Images 10112/50000. Average i/s 7708.883268765703
Images 15104/50000. Average i/s 7781.459817044685
Images 20096/50000. Average i/s 7866.970004647157
Images 25088/50000. Average i/s 7847.518386147604
Images 30080/50000. Average i/s 7894.069897762841
Images 35072/50000. Average i/s 7891.932053748691
Images 40064/50000. Average i/s 7853.895544474464
Images 45056/50000. Average i/s 7832.923218960731
Wall time: 119.47140908241272


Unnamed: 0,instance_type,user_batch_size,accuracy,prediction_time,images_per_sec_mean,images_per_sec_std,latency_mean,latency_99th_percentile,latency_median,latency_min
trt_fp32_128,g4dn.2xlarge,128,0.74956,6.75721,7869.37,1240.29,17.2819,23.9022,16.0117,8.42524



Benchmark results for precision: int8, batch size: 128

INFO:tensorflow:Linked TensorRT version: (7, 1, 3)
INFO:tensorflow:Loaded TensorRT version: (7, 1, 3)
INFO:tensorflow:Assets written to: resnet50_trt_saved_models/resnet50_int8_128/assets

Optimized for int8 and batch size 128, directory:resnet50_trt_saved_models/resnet50_int8_128

Images 5120/50000. Average i/s 7396.425219329722
Images 10112/50000. Average i/s 7541.030847957469
Images 15104/50000. Average i/s 7576.9405442854595
Images 20096/50000. Average i/s 7608.913582959295
Images 25088/50000. Average i/s 7650.61839783195
Images 30080/50000. Average i/s 7707.159023536961
Images 35072/50000. Average i/s 7731.482805650456
Images 40064/50000. Average i/s 7743.64490248324
Images 45056/50000. Average i/s 7757.7598333462
Wall time: 63.807910203933716


Unnamed: 0,instance_type,user_batch_size,accuracy,prediction_time,images_per_sec_mean,images_per_sec_std,latency_mean,latency_99th_percentile,latency_median,latency_min
trt_int8_128,g4dn.2xlarge,128,0.74816,6.5471,7764.95,945.311,16.7445,22.0578,16.4688,8.5845


Unnamed: 0,keras_gpu_128,trt_fp32_128,trt_int8_128
instance_type,g4dn.2xlarge,g4dn.2xlarge,g4dn.2xlarge
user_batch_size,128,128,128
accuracy,0.74956,0.74956,0.74816
prediction_time,46.5699,6.75721,6.5471
images_per_sec_mean,1300.16,7869.37,7764.95
images_per_sec_std,141.432,1240.29,945.311
latency_mean,119.105,17.2819,16.7445
latency_99th_percentile,482.203,23.9022,22.0578
latency_median,97.7492,16.0117,16.4688
latency_min,73.1673,8.42524,8.5845
