In [1]:
# !nvidia-docker run --shm-size 8g --ulimit memlock=-1 -it -v $PWD:/examples -v ~/.aws/:/.aws --network=host nvcr.io/nvidia/tensorflow:20.08-tf2-py3

In [2]:
# !aws s3 sync s3://imagenet-dataset-us-west-2/imagenet-data/tfrecords/validation/ /examples/datasets/

In [3]:
# !pip install matplotlib pandas

In [4]:
import os
import time
import shutil
import json
import time
import pandas as pd
import numpy as np
from functools import partial

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.python.saved_model import tag_constants, signature_constants
from tensorflow.python.framework import convert_to_constants

from tensorflow.compiler.tf2tensorrt.wrap_py_utils import get_linked_tensorrt_version
print(f"TensorRT version: {get_linked_tensorrt_version()}")
print(f"TensorFlow version: {tf.__version__}")

TensorRT version: (7, 1, 3)
TensorFlow version: 2.2.0


#### Download Keras Resnet50 model

In [5]:
def load_save_resnet50_model(saved_model_dir = 'resnet50_saved_model'):
    model = ResNet50(weights='imagenet')
    shutil.rmtree(saved_model_dir, ignore_errors=True)
    model.save(saved_model_dir, include_optimizer=False, save_format='tf')

In [6]:
saved_model_dir = 'resnet50_saved_model' 
load_save_resnet50_model(saved_model_dir)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: resnet50_saved_model/assets


In [7]:
def deserialize_image_record(record):
    feature_map = {'image/encoded': tf.io.FixedLenFeature([], tf.string, ''),
                  'image/class/label': tf.io.FixedLenFeature([1], tf.int64, -1),
                  'image/class/text': tf.io.FixedLenFeature([], tf.string, '')}
    obj = tf.io.parse_single_example(serialized=record, features=feature_map)
    imgdata = obj['image/encoded']
    label = tf.cast(obj['image/class/label'], tf.int32)   
    label_text = tf.cast(obj['image/class/text'], tf.string)   
    return imgdata, label, label_text

In [8]:
def val_preprocessing(record):
    imgdata, label, label_text = deserialize_image_record(record)
    label -= 1
    image = tf.io.decode_jpeg(imgdata, channels=3, 
                              fancy_upscaling=False, 
                              dct_method='INTEGER_FAST')

    shape = tf.shape(image)
    height = tf.cast(shape[0], tf.float32)
    width = tf.cast(shape[1], tf.float32)
    side = tf.cast(tf.convert_to_tensor(256, dtype=tf.int32), tf.float32)

    scale = tf.cond(tf.greater(height, width),
                  lambda: side / width,
                  lambda: side / height)
    
    new_height = tf.cast(tf.math.rint(height * scale), tf.int32)
    new_width = tf.cast(tf.math.rint(width * scale), tf.int32)
    
    image = tf.image.resize(image, [new_height, new_width], method='bicubic')
    image = tf.image.resize_with_crop_or_pad(image, 224, 224)
    
    image = tf.keras.applications.resnet50.preprocess_input(image)
    
    return image, label, label_text

In [9]:
def get_dataset(batch_size):
    data_dir = '/examples/datasets/*'
    files = tf.io.gfile.glob(os.path.join(data_dir))
    dataset = tf.data.TFRecordDataset(files)
    
    dataset = dataset.map(map_func=val_preprocessing, num_parallel_calls=8)
    dataset = dataset.batch(batch_size=batch_size).cache('./tfdatacache')
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    dataset = dataset.repeat(count=1)
    return dataset

#### Visualize random images from the dataset

In [10]:
# import matplotlib.pyplot as plt

# ds = get_dataset(1).shuffle(buffer_size=10,reshuffle_each_iteration=True)

# plt.figure()
# ds, lb, lt = next(ds.as_numpy_iterator())
# plt.imshow(ds[0]/255)
# # label_dict[str(actual_labels[r])]
# print(lt[0].decode("utf-8"))

#### Predict using GPU + Keras

In [11]:
# %%time
# model = tf.keras.models.load_model(saved_model_dir)
# pred_prob_keras = model.predict(imagenet_val_np_images, batch_size=5, verbose=1)

# pred_labels_keras = []
# for i in range(pred_prob_keras.shape[0]):
#     label_num_keras = np.argmax(pred_prob_keras[i])
#     pred_labels_keras.append(label_num_keras)

# np.sum(np.array(actual_labels) == np.array(pred_labels_keras))/len(actual_labels)

#### Predict using GPU + TensorRT

In [12]:
def build_fn(batch_size, dataset):
    for i, (build_image, _, _) in enumerate(dataset):
        if i > 1:
            break
        yield (build_image,)

In [13]:
def calibrate_fn(n_calib, batch_size, dataset):
    for i, (calib_image, _, _) in enumerate(dataset):
        if i > n_calib // batch_size:
            break
        yield (calib_image,)

In [14]:
def build_tensorrt_engine(precision, batch_size, dataset):
    from tensorflow.python.compiler.tensorrt import trt_convert as trt
    conversion_params = trt.DEFAULT_TRT_CONVERSION_PARAMS._replace(precision_mode=precision.upper(),
                                                                   max_workspace_size_bytes=(1<<32),
                                                                   maximum_cached_engines=2)
    converter = trt.TrtGraphConverterV2(input_saved_model_dir='resnet50_saved_model',
                                        conversion_params=conversion_params)
    
    if precision.lower() == 'int8':
        n_calib=100
        converter.convert(calibration_input_fn=partial(calibrate_fn, n_calib, batch_size, 
                                                       dataset.shuffle(buffer_size=n_calib, reshuffle_each_iteration=True)))
    else:
        converter.convert()
        
    trt_compiled_model_dir = f'resnet50_trt_saved_models/resnet50_{precision}_{batch_size}'
    shutil.rmtree(trt_compiled_model_dir, ignore_errors=True)

    converter.build(input_fn=partial(build_fn, batch_size, dataset))
    converter.save(output_saved_model_dir=trt_compiled_model_dir)
    print(f'\nOptimized for {precision} and batch size {batch_size}, directory:{trt_compiled_model_dir}\n')
    return trt_compiled_model_dir

In [15]:
def trt_predict_benchmark(precision, batch_size, display_every=100, warm_up=50):

    print('\n=======================================================')
    print(f'Benchmark results for precision: {precision}, batch size: {batch_size}')
    print('=======================================================\n')
    
    start_time = time.time()
    dataset = get_dataset(batch_size)
    for (img,_,_) in dataset:
        continue
    print(f'Finished caching {time.time() - start_time}')
    
    trt_compiled_model_dir = build_tensorrt_engine(precision, batch_size, dataset)
    saved_model_trt = tf.saved_model.load(trt_compiled_model_dir, tags=[tag_constants.SERVING])
    model_trt = saved_model_trt.signatures[signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
#     model_trt = convert_to_constants.convert_variables_to_constants_v2(model_trt)
    
    pred_labels = []
    actual_labels = []
    iter_times = []
    
    display_threshold = 0
    initial_time = time.time()
    
    for i, (validation_ds, label, _) in enumerate(dataset):
        if i==0:
            for w in range(warm_up):
                _ = model_trt(validation_ds);
                
        start_time = time.time()
        trt_results = model_trt(validation_ds);
        iter_times.append(time.time() - start_time)

        actual_labels.extend(l for k in label.numpy() for l in k)
        pred_labels.extend(list(tf.argmax(trt_results['predictions'], axis=1).numpy()))
        if (i+1)*batch_size >= display_threshold:
            print(f'Images {(i+1)*batch_size}/50000. Average i/s {np.mean(batch_size/np.array(iter_times))}')
            display_threshold+=10000

    acc_trt = np.sum(np.array(actual_labels) == np.array(pred_labels))/len(actual_labels)
    
    iter_times = np.array(iter_times)
    results = pd.DataFrame([[f'trt_{precision}_{batch_size}',
                          acc_trt, 
                          np.sum(iter_times), 
                          np.mean(batch_size / iter_times),
                          np.std(batch_size / iter_times, ddof=1),
                          np.percentile(iter_times, q=99, interpolation="lower") * 1000, 
                          np.mean(iter_times) * 1000,
                          np.std(iter_times*1000, ddof=1),
                          np.median(iter_times) * 1000, 
                          np.min(iter_times) * 1000]], 
                           columns = ['model', 
                                      'accuracy', 
                                      'prediction_time', 
                                      'images_per_sec_mean', 
                                      'images_per_sec_std',
                                      'latency_99th_percentile', 
                                      'latency_mean', 
                                      'latency_std', 
                                      'latency_median', 
                                      'latency_min'])
    return results, iter_times

In [16]:
import itertools
bench_options = {
    'batch_size': [1, 5, 8, 50, 128],
    'precision': ['int8', 'fp16', 'fp32']
}

# bench_options = {
#     'batch_size': [8],
#     'precision': ['fp16', 'fp32']
# }

bname, bval = zip(*bench_options.items())
blist = [dict(zip(bname, h)) for h in itertools.product(*bval)]
for b in blist:
    print(b)

{'batch_size': 1, 'precision': 'int8'}
{'batch_size': 1, 'precision': 'fp16'}
{'batch_size': 1, 'precision': 'fp32'}
{'batch_size': 5, 'precision': 'int8'}
{'batch_size': 5, 'precision': 'fp16'}
{'batch_size': 5, 'precision': 'fp32'}
{'batch_size': 8, 'precision': 'int8'}
{'batch_size': 8, 'precision': 'fp16'}
{'batch_size': 8, 'precision': 'fp32'}
{'batch_size': 50, 'precision': 'int8'}
{'batch_size': 50, 'precision': 'fp16'}
{'batch_size': 50, 'precision': 'fp32'}
{'batch_size': 128, 'precision': 'int8'}
{'batch_size': 128, 'precision': 'fp16'}
{'batch_size': 128, 'precision': 'fp32'}


In [17]:
# iter_ds = pd.DataFrame()
# col_name = lambda boption: str(boption).replace('\'','').replace(': ','_').replace(', ','_').strip('{}')
# boption = {'batch_size': 128, 'precision': 'fp16'}
# iter_ds = pd.concat([iter_ds, pd.DataFrame([1,2,3,4], columns=[col_name(boption)])],axis=1)
# boption = {'batch_size': 1, 'precision': 'fp16'}
# iter_ds = pd.concat([iter_ds, pd.DataFrame([1,2,3,4,5,6,7,8,9], columns=[col_name(boption)])],axis=1)
# iter_ds

In [None]:
results = pd.DataFrame()
iter_ds = pd.DataFrame()

col_name = lambda boption: f'trt_{boption["precision"]}_{boption["batch_size"]}'

for boption in blist:
    res, it = trt_predict_benchmark(**boption)
    iter_ds = pd.concat([iter_ds, pd.DataFrame(it, columns=[col_name(boption)])], axis=1)
    display(iter_ds.head())
    
    if results.empty:
        results = res
    else:
        results = results.append(res)

results = results.reset_index(drop=True)


Benchmark results for precision: int8, batch size: 1

Finished caching 127.72667479515076
INFO:tensorflow:Linked TensorRT version: (7, 1, 3)
INFO:tensorflow:Loaded TensorRT version: (7, 1, 3)
INFO:tensorflow:Assets written to: resnet50_trt_saved_models/resnet50_int8_1/assets

Optimized for int8 and batch size 1, directory:resnet50_trt_saved_models/resnet50_int8_1

Images 1/50000. Average i/s 282.6350404312669


Unnamed: 0,trt_int8_1
0,0.003538
1,0.001835
2,0.001791
3,0.001693
4,0.001744



Benchmark results for precision: fp16, batch size: 1

Finished caching 224.7132318019867
INFO:tensorflow:Linked TensorRT version: (7, 1, 3)
INFO:tensorflow:Loaded TensorRT version: (7, 1, 3)
INFO:tensorflow:Assets written to: resnet50_trt_saved_models/resnet50_fp16_1/assets

Optimized for fp16 and batch size 1, directory:resnet50_trt_saved_models/resnet50_fp16_1

Images 1/50000. Average i/s 175.97986070319712


Unnamed: 0,trt_int8_1,trt_fp16_1
0,0.003538,0.005682
1,0.001835,0.002238
2,0.001791,0.002167
3,0.001693,0.001686
4,0.001744,0.001795



Benchmark results for precision: fp32, batch size: 1

Finished caching 226.2343053817749
INFO:tensorflow:Linked TensorRT version: (7, 1, 3)
INFO:tensorflow:Loaded TensorRT version: (7, 1, 3)
INFO:tensorflow:Assets written to: resnet50_trt_saved_models/resnet50_fp32_1/assets

Optimized for fp32 and batch size 1, directory:resnet50_trt_saved_models/resnet50_fp32_1

Images 1/50000. Average i/s 57.27028687685186


Unnamed: 0,trt_int8_1,trt_fp16_1,trt_fp32_1
0,0.003538,0.005682,0.017461
1,0.001835,0.002238,0.001832
2,0.001791,0.002167,0.001745
3,0.001693,0.001686,0.001702
4,0.001744,0.001795,0.001705



Benchmark results for precision: int8, batch size: 5

Finished caching 142.1171326637268
INFO:tensorflow:Linked TensorRT version: (7, 1, 3)
INFO:tensorflow:Loaded TensorRT version: (7, 1, 3)
INFO:tensorflow:Assets written to: resnet50_trt_saved_models/resnet50_int8_5/assets

Optimized for int8 and batch size 5, directory:resnet50_trt_saved_models/resnet50_int8_5

Images 5/50000. Average i/s 1352.5649790390196
Images 10000/50000. Average i/s 2937.065229774058
Images 20000/50000. Average i/s 2930.0735211276387
Images 30000/50000. Average i/s 2929.4013512179963


Unnamed: 0,trt_int8_1,trt_fp16_1,trt_fp32_1,trt_int8_5
0,0.003538,0.005682,0.017461,0.003697
1,0.001835,0.002238,0.001832,0.00177
2,0.001791,0.002167,0.001745,0.001666
3,0.001693,0.001686,0.001702,0.001681
4,0.001744,0.001795,0.001705,0.001647



Benchmark results for precision: fp16, batch size: 5

Finished caching 176.504634141922
INFO:tensorflow:Linked TensorRT version: (7, 1, 3)
INFO:tensorflow:Loaded TensorRT version: (7, 1, 3)
INFO:tensorflow:Assets written to: resnet50_trt_saved_models/resnet50_fp16_5/assets

Optimized for fp16 and batch size 5, directory:resnet50_trt_saved_models/resnet50_fp16_5

Images 5/50000. Average i/s 929.1356164990475
Images 10000/50000. Average i/s 2926.5615160748184
Images 20000/50000. Average i/s 2923.443926830864
Images 30000/50000. Average i/s 2920.3310122363846


Unnamed: 0,trt_int8_1,trt_fp16_1,trt_fp32_1,trt_int8_5,trt_fp16_5
0,0.003538,0.005682,0.017461,0.003697,0.005381
1,0.001835,0.002238,0.001832,0.00177,0.001827
2,0.001791,0.002167,0.001745,0.001666,0.001682
3,0.001693,0.001686,0.001702,0.001681,0.001673
4,0.001744,0.001795,0.001705,0.001647,0.001725



Benchmark results for precision: fp32, batch size: 5

Finished caching 168.16408467292786
INFO:tensorflow:Linked TensorRT version: (7, 1, 3)
INFO:tensorflow:Loaded TensorRT version: (7, 1, 3)
INFO:tensorflow:Assets written to: resnet50_trt_saved_models/resnet50_fp32_5/assets

Optimized for fp32 and batch size 5, directory:resnet50_trt_saved_models/resnet50_fp32_5

Images 5/50000. Average i/s 286.0155749219208
Images 10000/50000. Average i/s 2977.987118575803
Images 20000/50000. Average i/s 2973.526521654071
Images 30000/50000. Average i/s 2970.6918011417183


Unnamed: 0,trt_int8_1,trt_fp16_1,trt_fp32_1,trt_int8_5,trt_fp16_5,trt_fp32_5
0,0.003538,0.005682,0.017461,0.003697,0.005381,0.017482
1,0.001835,0.002238,0.001832,0.00177,0.001827,0.001731
2,0.001791,0.002167,0.001745,0.001666,0.001682,0.001667
3,0.001693,0.001686,0.001702,0.001681,0.001673,0.001656
4,0.001744,0.001795,0.001705,0.001647,0.001725,0.001659



Benchmark results for precision: int8, batch size: 8

Finished caching 173.93588590621948
INFO:tensorflow:Linked TensorRT version: (7, 1, 3)
INFO:tensorflow:Loaded TensorRT version: (7, 1, 3)
INFO:tensorflow:Assets written to: resnet50_trt_saved_models/resnet50_int8_8/assets

Optimized for int8 and batch size 8, directory:resnet50_trt_saved_models/resnet50_int8_8

Images 8/50000. Average i/s 2397.601429081815
Images 10000/50000. Average i/s 4629.989438625767
Images 20000/50000. Average i/s 4627.123598697512
Images 30000/50000. Average i/s 4629.141094222522
Images 40000/50000. Average i/s 4627.292680298871
Images 50000/50000. Average i/s 4628.824276280757


Unnamed: 0,trt_int8_1,trt_fp16_1,trt_fp32_1,trt_int8_5,trt_fp16_5,trt_fp32_5,trt_int8_8
0,0.003538,0.005682,0.017461,0.003697,0.005381,0.017482,0.003337
1,0.001835,0.002238,0.001832,0.00177,0.001827,0.001731,0.001767
2,0.001791,0.002167,0.001745,0.001666,0.001682,0.001667,0.001702
3,0.001693,0.001686,0.001702,0.001681,0.001673,0.001656,0.001673
4,0.001744,0.001795,0.001705,0.001647,0.001725,0.001659,0.001684



Benchmark results for precision: fp16, batch size: 8

Finished caching 224.16503524780273
INFO:tensorflow:Linked TensorRT version: (7, 1, 3)
INFO:tensorflow:Loaded TensorRT version: (7, 1, 3)
INFO:tensorflow:Assets written to: resnet50_trt_saved_models/resnet50_fp16_8/assets

Optimized for fp16 and batch size 8, directory:resnet50_trt_saved_models/resnet50_fp16_8

Images 8/50000. Average i/s 1497.364094783346
Images 10000/50000. Average i/s 4569.783372101516
Images 20000/50000. Average i/s 4600.736583228089
Images 30000/50000. Average i/s 4609.001940746131
Images 40000/50000. Average i/s 4606.382380962096
Images 50000/50000. Average i/s 4607.4650268580335


Unnamed: 0,trt_int8_1,trt_fp16_1,trt_fp32_1,trt_int8_5,trt_fp16_5,trt_fp32_5,trt_int8_8,trt_fp16_8
0,0.003538,0.005682,0.017461,0.003697,0.005381,0.017482,0.003337,0.005343
1,0.001835,0.002238,0.001832,0.00177,0.001827,0.001731,0.001767,0.001784
2,0.001791,0.002167,0.001745,0.001666,0.001682,0.001667,0.001702,0.001646
3,0.001693,0.001686,0.001702,0.001681,0.001673,0.001656,0.001673,0.001645
4,0.001744,0.001795,0.001705,0.001647,0.001725,0.001659,0.001684,0.001879



Benchmark results for precision: fp32, batch size: 8

Finished caching 224.25335907936096
INFO:tensorflow:Linked TensorRT version: (7, 1, 3)
INFO:tensorflow:Loaded TensorRT version: (7, 1, 3)
INFO:tensorflow:Assets written to: resnet50_trt_saved_models/resnet50_fp32_8/assets

Optimized for fp32 and batch size 8, directory:resnet50_trt_saved_models/resnet50_fp32_8

Images 8/50000. Average i/s 461.5211267605634
Images 10000/50000. Average i/s 4682.4155171489165
Images 20000/50000. Average i/s 4688.6087254970225
Images 30000/50000. Average i/s 4688.805226601394
Images 40000/50000. Average i/s 4681.180041858195
Images 50000/50000. Average i/s 4683.573184768727


Unnamed: 0,trt_int8_1,trt_fp16_1,trt_fp32_1,trt_int8_5,trt_fp16_5,trt_fp32_5,trt_int8_8,trt_fp16_8,trt_fp32_8
0,0.003538,0.005682,0.017461,0.003697,0.005381,0.017482,0.003337,0.005343,0.017334
1,0.001835,0.002238,0.001832,0.00177,0.001827,0.001731,0.001767,0.001784,0.00173
2,0.001791,0.002167,0.001745,0.001666,0.001682,0.001667,0.001702,0.001646,0.001728
3,0.001693,0.001686,0.001702,0.001681,0.001673,0.001656,0.001673,0.001645,0.001665
4,0.001744,0.001795,0.001705,0.001647,0.001725,0.001659,0.001684,0.001879,0.001695



Benchmark results for precision: int8, batch size: 50

Finished caching 225.1445529460907
INFO:tensorflow:Linked TensorRT version: (7, 1, 3)
INFO:tensorflow:Loaded TensorRT version: (7, 1, 3)


In [None]:
d = iter_ds.loc[:,['batch_size_5_precision_fp16']].dropna()
d.hist()

# Debug

In [None]:
# # trt_compiled_model_dir = build_tensorrt_engine('fp16', 8)
# # shutil.rmtree(trt_compiled_model_dir, ignore_errors=True)
# # saved_model_trt = tf.saved_model.load(trt_compiled_model_dir, tags=[tag_constants.SERVING])
# saved_model_trt = tf.saved_model.load('resnet50_trt_model_fp16_8', tags=[tag_constants.SERVING])

# model_trt = saved_model_trt.signatures[signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
# model_trt = convert_to_constants.convert_variables_to_constants_v2(model_trt)

In [None]:
# dataset = get_dataset(8)
# labels_append = []
# actual_labels = []
# for i, (a,b,c) in enumerate(dataset):
#         res = model_trt(a);
#         actual_labels.extend(p[0] for p in b.numpy())
#         labels_append.extend(list(tf.argmax(res[0], axis=1).numpy()))
# #         print(b.numpy())
# #         print(list(b.numpy()))
#         if i > 2:
#             break
# print(labels_append)
# print(actual_labels)

In [None]:
# !python /opt/tensorflow/tensorflow-source/tensorflow/python/tools/import_pb_to_tensorboard.py --model_dir resnet50_trt_model_fp32 --log_dir tblogs/0