# GPU inference with NVIDIA T4 on Amazon EC2 G4 instance
This example demonstrates GPU inference with:
* GPU accelerated TensorFlow/Keras
* NVIDIA TensorRT optimizer and runtime

This example was tested on Amazon EC2 `g4dn.xlarge` using the following AWS Deep Learning AMI:
`Deep Learning AMI (Ubuntu 18.04) Version 35.0`

And the following NVIDIA TensorFlow Docker image: 
`nvcr.io/nvidia/tensorflow:20.08-tf2-py3`

Create a Docker container:<br>
`nvidia-docker run --shm-size 8g --ulimit memlock=-1 -it -v $PWD:/examples -v ~/.aws/:/.aws --network=host nvcr.io/nvidia/tensorflow:20.08-tf2-py3`

Prepare your imagenet validation TFRecord files using the following helper script:
https://github.com/tensorflow/models/blob/archive/research/inception/inception/data/download_and_preprocess_imagenet.sh

Save it to `/examples/datasets/` or update the dataset location in the `get_dataset()` function

In [1]:
# !pip install --upgrade pip -q
# !pip install matplotlib pandas -q

In [2]:
import os
import time
import shutil
import json
import time
import pandas as pd
import numpy as np
import requests
from functools import partial

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.python.saved_model import tag_constants, signature_constants
from tensorflow.python.framework import convert_to_constants

from tensorflow.compiler.tf2tensorrt.wrap_py_utils import get_linked_tensorrt_version

print(f"TensorRT version: {get_linked_tensorrt_version()}")
print(f"TensorFlow version: {tf.__version__}")

TensorRT version: (7, 1, 3)
TensorFlow version: 2.2.0


In [3]:
results = None
batch_size = 8

### Download Keras Resnet50 model

In [4]:
def load_save_resnet50_model(saved_model_dir = 'resnet50_saved_model'):
    model = ResNet50(weights='imagenet')
    shutil.rmtree(saved_model_dir, ignore_errors=True)
    model.save(saved_model_dir, include_optimizer=False, save_format='tf')

In [5]:
saved_model_dir = 'resnet50_saved_model' 
load_save_resnet50_model(saved_model_dir)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: resnet50_saved_model/assets


### Use `tf.data` to read ImageNet validation dataset

In [6]:
def deserialize_image_record(record):
    feature_map = {'image/encoded': tf.io.FixedLenFeature([], tf.string, ''),
                  'image/class/label': tf.io.FixedLenFeature([1], tf.int64, -1),
                  'image/class/text': tf.io.FixedLenFeature([], tf.string, '')}
    obj = tf.io.parse_single_example(serialized=record, features=feature_map)
    imgdata = obj['image/encoded']
    label = tf.cast(obj['image/class/label'], tf.int32)   
    label_text = tf.cast(obj['image/class/text'], tf.string)   
    return imgdata, label, label_text

def val_preprocessing(record):
    imgdata, label, label_text = deserialize_image_record(record)
    label -= 1
    image = tf.io.decode_jpeg(imgdata, channels=3, 
                              fancy_upscaling=False, 
                              dct_method='INTEGER_FAST')

    shape = tf.shape(image)
    height = tf.cast(shape[0], tf.float32)
    width = tf.cast(shape[1], tf.float32)
    side = tf.cast(tf.convert_to_tensor(256, dtype=tf.int32), tf.float32)

    scale = tf.cond(tf.greater(height, width),
                  lambda: side / width,
                  lambda: side / height)
    
    new_height = tf.cast(tf.math.rint(height * scale), tf.int32)
    new_width = tf.cast(tf.math.rint(width * scale), tf.int32)
    
    image = tf.image.resize(image, [new_height, new_width], method='bicubic')
    image = tf.image.resize_with_crop_or_pad(image, 224, 224)
    
    image = tf.keras.applications.resnet50.preprocess_input(image)
    
    return image, label, label_text

def get_dataset(batch_size, use_cache=False):
    data_dir = '/examples/datasets/*'
    files = tf.io.gfile.glob(os.path.join(data_dir))
    dataset = tf.data.TFRecordDataset(files)
    
    dataset = dataset.map(map_func=val_preprocessing, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(batch_size=batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    dataset = dataset.repeat(count=1)
    
    if use_cache:
        shutil.rmtree('tfdatacache', ignore_errors=True)
        os.mkdir('tfdatacache')
        dataset = dataset.cache(f'./tfdatacache/imagenet_val')
    
    return dataset

#### Predict using GPU + Keras

In [7]:
model = tf.keras.models.load_model(saved_model_dir)
display_every = 5000
display_threshold = display_every

pred_labels = []
actual_labels = []
iter_times = []

# Get the tf.data.TFRecordDataset object for the ImageNet2012 validation dataset
dataset = get_dataset(batch_size)  

walltime_start = time.time()
for i, (validation_ds, batch_labels, _) in enumerate(dataset):
    start_time = time.time()
    pred_prob_keras = model(validation_ds)
    iter_times.append(time.time() - start_time)
    
    actual_labels.extend(label for label_list in batch_labels.numpy() for label in label_list)
    pred_labels.extend(list(np.argmax(pred_prob_keras, axis=1)))
    
    if i*batch_size >= display_threshold:
        print(f'Images {i*batch_size}/50000. Average i/s {np.mean(batch_size/np.array(iter_times[-display_every:]))}')
        display_threshold+=display_every

iter_times = np.array(iter_times)
acc_keras_gpu = np.sum(np.array(actual_labels) == np.array(pred_labels))/len(actual_labels)

results = pd.DataFrame(columns = [f'keras_gpu_{batch_size}'])
results.loc['instance_type']           = [requests.get('http://169.254.169.254/latest/meta-data/instance-type').text]
results.loc['user_batch_size']         = [batch_size]
results.loc['accuracy']                = [acc_keras_gpu]
results.loc['prediction_time']         = [np.sum(iter_times)]
results.loc['wall_time']               = [time.time() - walltime_start]
results.loc['images_per_sec_mean']     = [np.mean(batch_size / iter_times)]
results.loc['images_per_sec_std']      = [np.std(batch_size / iter_times, ddof=1)]
results.loc['latency_mean']            = [np.mean(iter_times) * 1000]
results.loc['latency_99th_percentile'] = [np.percentile(iter_times, q=99, interpolation="lower") * 1000]
results.loc['latency_median']          = [np.median(iter_times) * 1000]
results.loc['latency_min']             = [np.min(iter_times) * 1000]
display(results.T)

Images 5000/50000. Average i/s 113.52902935209637
Images 10000/50000. Average i/s 114.4822357792094
Images 15000/50000. Average i/s 114.17100473485702
Images 20000/50000. Average i/s 114.61583324110076
Images 25000/50000. Average i/s 114.84187563894713
Images 30000/50000. Average i/s 115.14781546576788
Images 35000/50000. Average i/s 115.10561798972904
Images 40000/50000. Average i/s 115.20453046771337
Images 45000/50000. Average i/s 115.71023295020717


Unnamed: 0,keras_gpu_8
instance_type,g4dn.xlarge
user_batch_size,8
accuracy,0.74956
prediction_time,440.113
wall_time,443.712
images_per_sec_mean,115.746
images_per_sec_std,7.3476
latency_mean,70.418
latency_99th_percentile,84.4612
latency_median,69.0285


#### Predict using GPU + TensorRT

In [8]:
def build_fn(batch_size, dataset):
    for i, (build_image, _, _) in enumerate(dataset):
        if i > 1:
            break
        yield (build_image,)

def calibrate_fn(n_calib, batch_size, dataset):
    for i, (calib_image, _, _) in enumerate(dataset):
        if i > n_calib // batch_size:
            break
        yield (calib_image,)

def build_tensorrt_engine(precision, batch_size, dataset):
    from tensorflow.python.compiler.tensorrt import trt_convert as trt
    conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(precision_mode=precision.upper(),
                                                                   max_workspace_size_bytes=(1<<32),
                                                                   maximum_cached_engines=2)
    converter = trt.TrtGraphConverterV2(input_saved_model_dir='resnet50_saved_model',
                                        conversion_params=conversion_params)
    
    if precision.lower() == 'int8':
        n_calib=50
        converter.convert(calibration_input_fn=partial(calibrate_fn, n_calib, batch_size, 
                                                       dataset.shuffle(buffer_size=n_calib, reshuffle_each_iteration=True)))
    else:
        converter.convert()
        
    trt_compiled_model_dir = f'resnet50_trt_saved_models/resnet50_{precision}_{batch_size}'
    shutil.rmtree(trt_compiled_model_dir, ignore_errors=True)

    converter.build(input_fn=partial(build_fn, batch_size, dataset))
    converter.save(output_saved_model_dir=trt_compiled_model_dir)
    print(f'\nOptimized for {precision} and batch size {batch_size}, directory:{trt_compiled_model_dir}\n')
    return trt_compiled_model_dir

In [9]:
def trt_predict_benchmark(precision, batch_size, use_cache=False, display_every=100, warm_up=10):

    print('\n=======================================================')
    print(f'Benchmark results for precision: {precision}, batch size: {batch_size}')
    print('=======================================================\n')
    
    dataset = get_dataset(batch_size)
    
    # If caching is enabled, cache dataset for better i/o performance
    if use_cache:
        print('Caching dataset ...')
        start_time = time.time()
        for (img,_,_) in dataset:
            continue
        print(f'Finished caching {time.time() - start_time}')
    
    trt_compiled_model_dir = build_tensorrt_engine(precision, batch_size, dataset)
    saved_model_trt = tf.saved_model.load(trt_compiled_model_dir, tags=[tag_constants.SERVING])
    model_trt = saved_model_trt.signatures[signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
    
    pred_labels = []
    actual_labels = []
    iter_times = []
    
    display_every = 5000
    display_threshold = display_every
    initial_time = time.time()
    
    walltime_start = time.time()
    for i, (validation_ds, batch_labels, _) in enumerate(dataset):
        if i==0:
            for w in range(warm_up):
                _ = model_trt(validation_ds);
                
        start_time = time.time()
        trt_results = model_trt(validation_ds);
        iter_times.append(time.time() - start_time)
        
        actual_labels.extend(label for label_list in batch_labels.numpy() for label in label_list)
        pred_labels.extend(list(tf.argmax(trt_results['predictions'], axis=1).numpy()))
        if (i)*batch_size >= display_threshold:
            print(f'Images {(i)*batch_size}/50000. Average i/s {np.mean(batch_size/np.array(iter_times[-display_every:]))}')
            display_threshold+=display_every
    
    print(f'Wall time: {time.time() - walltime_start}')

    acc_trt = np.sum(np.array(actual_labels) == np.array(pred_labels))/len(actual_labels)
    iter_times = np.array(iter_times)
   
    results = pd.DataFrame(columns = [f'trt_{precision}_{batch_size}'])
    results.loc['instance_type']           = [requests.get('http://169.254.169.254/latest/meta-data/instance-type').text]
    results.loc['user_batch_size']         = [batch_size]
    results.loc['accuracy']                = [acc_trt]
    results.loc['prediction_time']         = [np.sum(iter_times)]
    results.loc['wall_time']               = [time.time() - walltime_start]   
    results.loc['images_per_sec_mean']     = [np.mean(batch_size / iter_times)]
    results.loc['images_per_sec_std']      = [np.std(batch_size / iter_times, ddof=1)]
    results.loc['latency_mean']            = [np.mean(iter_times) * 1000]
    results.loc['latency_99th_percentile'] = [np.percentile(iter_times, q=99, interpolation="lower") * 1000]
    results.loc['latency_median']          = [np.median(iter_times) * 1000]
    results.loc['latency_min']             = [np.min(iter_times) * 1000]
    display(results.T)
   
    return results, iter_times

In [10]:
import itertools
bench_options = {
    'batch_size': [batch_size],
    'precision': ['fp32', 'fp16', 'int8']
}

bname, bval = zip(*bench_options.items())
blist = [dict(zip(bname, h)) for h in itertools.product(*bval)]

print('Benchmark sweep combinations:')
for b in blist:
    print(b)

Benchmark sweep combinations:
{'batch_size': 8, 'precision': 'fp32'}
{'batch_size': 8, 'precision': 'fp16'}
{'batch_size': 8, 'precision': 'int8'}


In [11]:
iter_ds = pd.DataFrame()

if results is None:
    results = pd.DataFrame()

col_name = lambda boption: f'trt_{boption["precision"]}_{boption["batch_size"]}'

for boption in blist:
    res, it = trt_predict_benchmark(**boption)
    iter_ds = pd.concat([iter_ds, pd.DataFrame(it, columns=[col_name(boption)])], axis=1)
    results = pd.concat([results, res], axis=1)

display(results)


Benchmark results for precision: fp32, batch size: 8

INFO:tensorflow:Linked TensorRT version: (7, 1, 3)
INFO:tensorflow:Loaded TensorRT version: (7, 1, 3)
INFO:tensorflow:Assets written to: resnet50_trt_saved_models/resnet50_fp32_8/assets

Optimized for fp32 and batch size 8, directory:resnet50_trt_saved_models/resnet50_fp32_8

Images 5000/50000. Average i/s 1706.9338144076587
Images 10000/50000. Average i/s 1709.7124824008995
Images 15000/50000. Average i/s 1714.181552149894
Images 20000/50000. Average i/s 1706.435347541865
Images 25000/50000. Average i/s 1694.5647994188168
Images 30000/50000. Average i/s 1686.1055872763206
Images 35000/50000. Average i/s 1691.5992314594068
Images 40000/50000. Average i/s 1690.6736552055474
Images 45000/50000. Average i/s 1678.790774983944
Wall time: 143.3079001903534


Unnamed: 0,instance_type,user_batch_size,accuracy,prediction_time,wall_time,images_per_sec_mean,images_per_sec_std,latency_mean,latency_99th_percentile,latency_median,latency_min
trt_fp32_8,g4dn.xlarge,8,0.74956,38.1336,143.327,1666.69,960.928,6.10138,13.797,5.91063,1.36304



Benchmark results for precision: fp16, batch size: 8

INFO:tensorflow:Linked TensorRT version: (7, 1, 3)
INFO:tensorflow:Loaded TensorRT version: (7, 1, 3)
INFO:tensorflow:Assets written to: resnet50_trt_saved_models/resnet50_fp16_8/assets

Optimized for fp16 and batch size 8, directory:resnet50_trt_saved_models/resnet50_fp16_8

Images 5000/50000. Average i/s 1972.9929443064034
Images 10000/50000. Average i/s 1931.125588304386
Images 15000/50000. Average i/s 1897.5221155505612
Images 20000/50000. Average i/s 1897.409528086548
Images 25000/50000. Average i/s 1903.1808092268618
Images 30000/50000. Average i/s 1907.7603788948525
Images 35000/50000. Average i/s 1881.8081660423734
Images 40000/50000. Average i/s 1811.5798501140848
Images 45000/50000. Average i/s 1725.9618582526007
Wall time: 135.05888485908508


Unnamed: 0,instance_type,user_batch_size,accuracy,prediction_time,wall_time,images_per_sec_mean,images_per_sec_std,latency_mean,latency_99th_percentile,latency_median,latency_min
trt_fp16_8,g4dn.xlarge,8,0.74968,38.0335,135.078,1707.24,1016.37,6.08536,14.1668,5.91636,1.43266



Benchmark results for precision: int8, batch size: 8

INFO:tensorflow:Linked TensorRT version: (7, 1, 3)
INFO:tensorflow:Loaded TensorRT version: (7, 1, 3)
INFO:tensorflow:Assets written to: resnet50_trt_saved_models/resnet50_int8_8/assets

Optimized for int8 and batch size 8, directory:resnet50_trt_saved_models/resnet50_int8_8

Images 5000/50000. Average i/s 1879.6287615037268
Images 10000/50000. Average i/s 1890.5233308310728
Images 15000/50000. Average i/s 1904.7501508674482
Images 20000/50000. Average i/s 1898.7457632383791
Images 25000/50000. Average i/s 1902.8776155291969
Images 30000/50000. Average i/s 1898.16488970591
Images 35000/50000. Average i/s 1889.473046700565
Images 40000/50000. Average i/s 1894.5937887248815
Images 45000/50000. Average i/s 1893.7721136475534
Wall time: 133.06834959983826


Unnamed: 0,instance_type,user_batch_size,accuracy,prediction_time,wall_time,images_per_sec_mean,images_per_sec_std,latency_mean,latency_99th_percentile,latency_median,latency_min
trt_int8_8,g4dn.xlarge,8,0.74924,34.3497,133.087,1895.03,1086.22,5.49594,12.2826,5.27298,1.44053


Unnamed: 0,keras_gpu_8,trt_fp32_8,trt_fp16_8,trt_int8_8
instance_type,g4dn.xlarge,g4dn.xlarge,g4dn.xlarge,g4dn.xlarge
user_batch_size,8,8,8,8
accuracy,0.74956,0.74956,0.74968,0.74924
prediction_time,440.113,38.1336,38.0335,34.3497
wall_time,443.712,143.327,135.078,133.087
images_per_sec_mean,115.746,1666.69,1707.24,1895.03
images_per_sec_std,7.3476,960.928,1016.37,1086.22
latency_mean,70.418,6.10138,6.08536,5.49594
latency_99th_percentile,84.4612,13.797,14.1668,12.2826
latency_median,69.0285,5.91063,5.91636,5.27298
