#### Neuron workflow

In [None]:
# !aws s3 sync s3://imagenet-dataset-us-west-2/imagenet-data/tfrecords/validation/ /home/ubuntu/datasets/
# !pip install matplotlib pandas

In [1]:
!/opt/aws/neuron/bin/neuron-cli reset
import os
import time
import shutil
import json
import requests
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.neuron as tfn
import tensorflow.compat.v1.keras as keras
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
from concurrent import futures
from itertools import compress

### Resnet50 FP16 saved model

In [None]:
# Build the FP16 frozen graph with three helper scripts run from the notebook's
# working directory. The scripts themselves are not part of this notebook.
# Step 1: presumably writes resnet50_fp32_keras.pb, which step 2 reads — TODO confirm.
!python gen_resnet50_keras.py
# Step 2: reads the FP32 graph and writes resnet50_fp32_keras_opt.pb.
!python optimize_for_inference.py --graph resnet50_fp32_keras.pb --out_graph resnet50_fp32_keras_opt.pb
# Step 3: reads the optimized FP32 graph and writes resnet50_fp16_keras_opt.pb,
# which the next cell converts into a SavedModel.
!python fp32tofp16.py  --graph resnet50_fp32_keras_opt.pb --out_graph resnet50_fp16_keras_opt.pb

In [None]:
def pb_to_saved_model(pb_path, input_names, output_names, model_dir):
    """Convert a serialized GraphDef (.pb) file into a SavedModel directory.

    Args:
        pb_path: Path of the binary GraphDef file to load.
        input_names: Dict mapping each signature input key to the name of the
            graph tensor that feeds it (e.g. ``{"input_1:0": "input_1:0"}``).
        output_names: Dict mapping each signature output key to the name of the
            graph tensor that produces it.
        model_dir: Export directory handed to ``tf.saved_model.simple_save``.
    """
    graph_def = tf.GraphDef()
    # Read through a context manager so the file handle is closed right away
    # instead of being left to the garbage collector.
    with open(pb_path, 'rb') as pb_file:
        graph_def.ParseFromString(pb_file.read())

    # Import into a fresh graph so nothing from the default graph leaks into the export.
    with tf.Session(graph=tf.Graph()) as sess:
        tf.import_graph_def(graph_def, name='')
        inputs = {name: sess.graph.get_tensor_by_name(ts_name) for name, ts_name in input_names.items()}
        outputs = {name: sess.graph.get_tensor_by_name(ts_name) for name, ts_name in output_names.items()}
        tf.saved_model.simple_save(sess, model_dir, inputs, outputs)

saved_model_dir = 'resnet50_saved_model_fp16'
# The SavedModel export fails if the target directory already exists, so remove
# any previous export first (the FP32 export cell does the same). This keeps the
# cell re-runnable.
shutil.rmtree(saved_model_dir, ignore_errors=True)
pb_to_saved_model("resnet50_fp16_keras_opt.pb", 
                  {"input_1:0": "input_1:0"}, 
                  {"probs/Softmax:0" : "probs/Softmax:0"}, 
                  saved_model_dir)

### Resnet50 FP32 saved model

In [None]:
# Export the ImageNet-weighted Keras ResNet50 as an FP32 SavedModel.
saved_model_dir = 'resnet50_saved_model_fp32'
shutil.rmtree(saved_model_dir, ignore_errors=True)

# Learning phase 0 is set before the model is built.
keras.backend.set_learning_phase(0)
model = ResNet50(weights='imagenet')

# Signature keys match the ones used for the FP16 export.
signature_inputs = {'input_1:0': model.inputs[0]}
signature_outputs = {'probs/Softmax:0': model.outputs[0]}
tf.saved_model.simple_save(keras.backend.get_session(),
                           saved_model_dir,
                           inputs=signature_inputs,
                           outputs=signature_outputs)

### Compile models with different batch sizes and cores

| Batch | nc1 | nc2 | nc4 | nc8 | nc12 | nc16 |
| :-: | :-: | :-: | :-: |:-:|:-:|:-:|
|1|      1|   1|   1|   2|   2|    2|
|2|      1|   1|   0|   1|   2|    2|
|3|      1|   1|   1|   1|   1|    1|
|4|      1|   1|   0|   0|   1|    0|
|5|      1|   1|   0|   0|   1|    0|

0 - Failed, 1 - Compiled, 2 - Compiled with static weights

In [None]:
def compile_inf1_model(saved_model_dir, batch_size=1, num_cores=1, use_static_weights=False):
    """Compile a SavedModel with ``tfn.saved_model.compile`` and print a summary.

    The output goes to
    ``resnet50_inf1_saved_models/resnet50_<tag>_batch_<batch_size>_inf1_cores_<num_cores>``,
    where ``<tag>`` is the last four characters of ``saved_model_dir``. Any
    previous output in that directory is deleted first.

    Args:
        saved_model_dir: Source SavedModel directory. Its last two characters are
            appended to ``'float'`` to form the example input dtype, so the name
            is expected to end in e.g. ``fp32`` or ``fp16``.
        batch_size: Batch dimension of the example input; also feeds ``--sb_size``.
        num_cores: Value passed to ``--num-neuroncores``.
        use_static_weights: When true, ``--static-weights`` is added to the
            compiler arguments.

    Returns:
        None. The compile result is only printed.
    """
    print(f'-----------batch size: {batch_size}, num cores: {num_cores}----------')
    print('Compiling...')

    # The precision tag and the dtype suffix are both read off the end of the directory name.
    precision_tag = saved_model_dir[-4:]
    dtype_bits = saved_model_dir[-2:]
    inf1_compiled_model_dir = os.path.join(
        'resnet50_inf1_saved_models',
        f'resnet50_{precision_tag}_batch_{batch_size}_inf1_cores_{num_cores}',
    )

    shutil.rmtree(inf1_compiled_model_dir, ignore_errors=True)

    example_input = np.zeros([batch_size, 224, 224, 3], dtype='float' + dtype_bits)

    compiler_args = [
        '--batching_en', '--rematerialization_en', '--spill_dis',
        '--sb_size', str((batch_size + 6)*10),
        '--enable-replication', 'True', '-O2',
        '--verbose', '1', '--num-neuroncores', str(num_cores),
    ]
    compiler_args += ['--static-weights'] if use_static_weights else []

    compile_started = time.time()
    compiled_res = tfn.saved_model.compile(
        model_dir=saved_model_dir,
        model_feed_dict={'input_1:0': example_input},
        new_model_dir=inf1_compiled_model_dir,
        dynamic_batch_size=True,
        compiler_workdir=f'./compiler-workdir/{inf1_compiled_model_dir}',
        compiler_args=compiler_args,
    )
    print(f'Compile time: {time.time() - compile_started}')
    print(inf1_compiled_model_dir)
    print(compiled_res)
    print('----------- Done! ----------- \n')
    

In [None]:
# Batch-size / NeuronCore combinations to compile for each precision.
options = [
    dict(batch_size=1, num_cores=1),
    dict(batch_size=1, num_cores=4),
    dict(batch_size=1, num_cores=16, use_static_weights=True),
    dict(batch_size=2, num_cores=16, use_static_weights=True),
    dict(batch_size=5, num_cores=1),
    dict(batch_size=5, num_cores=2),
    dict(batch_size=5, num_cores=12),
]

# Same order as before: every FP32 variant first, then every FP16 variant.
for source_model_dir in ('resnet50_saved_model_fp32', 'resnet50_saved_model_fp16'):
    for opt in options:
        compile_inf1_model(source_model_dir, **opt)

### Prepare dataset

In [2]:
def deserialize_image_record(record):
    """Parse one serialized ``tf.Example`` record.

    Args:
        record: A serialized example with the features ``image/encoded``,
            ``image/class/label`` and ``image/class/text``.

    Returns:
        Tuple ``(imgdata, label, label_text)``: the encoded image string, the
        label cast to int32 (shape ``[1]``), and the label text string.
    """
    feature_spec = {
        'image/encoded': tf.io.FixedLenFeature([], tf.string, ''),
        'image/class/label': tf.io.FixedLenFeature([1], tf.int64, -1),
        'image/class/text': tf.io.FixedLenFeature([], tf.string, ''),
    }
    parsed = tf.io.parse_single_example(serialized=record, features=feature_spec)
    return (
        parsed['image/encoded'],
        tf.cast(parsed['image/class/label'], tf.int32),
        tf.cast(parsed['image/class/text'], tf.string),
    )

def val_preprocessing(record):
    """Turn one serialized validation record into a model-ready image.

    Steps: decode the JPEG, resize (bicubic) so the shorter side is 256 pixels,
    crop/pad to 224x224 with ``tf.image.resize_with_crop_or_pad``, then apply the
    ResNet50 ``preprocess_input`` through ``tf.py_function``.

    Args:
        record: Serialized example accepted by ``deserialize_image_record``.

    Returns:
        Tuple ``(image, label, label_text)`` where ``image`` is float32 and
        ``label`` is the stored label minus one.
    """
    encoded_jpeg, label, label_text = deserialize_image_record(record)
    # Presumably the stored labels are 1-based and the model's are 0-based — TODO confirm.
    label = label - 1
    decoded = tf.io.decode_jpeg(encoded_jpeg,
                                channels=3,
                                fancy_upscaling=False,
                                dct_method='INTEGER_FAST')

    dims = tf.shape(decoded)
    height = tf.cast(dims[0], tf.float32)
    width = tf.cast(dims[1], tf.float32)
    target_side = tf.cast(tf.convert_to_tensor(256, dtype=tf.int32), tf.float32)

    # Scale by the shorter side so that it becomes `target_side` pixels.
    scale = tf.cond(tf.greater(height, width),
                    lambda: target_side / width,
                    lambda: target_side / height)

    resized_shape = [tf.cast(tf.math.rint(height * scale), tf.int32),
                     tf.cast(tf.math.rint(width * scale), tf.int32)]

    resized = tf.image.resize(decoded, resized_shape, method='bicubic')
    cropped = tf.image.resize_with_crop_or_pad(resized, 224, 224)

    [preprocessed,] = tf.py_function(preprocess_input, [cropped], [tf.float32])

    return preprocessed, label, label_text

def get_dataset(batch_size, use_cache=False, data_pattern='/home/ubuntu/datasets/*'):
    """Build the batched validation ``tf.data`` pipeline.

    Args:
        batch_size: Number of images per batch.
        use_cache: When true, the ``tfdatacache`` directory is recreated and the
            batched dataset is cached to ``./tfdatacache/imagenet_val``.
        data_pattern: Glob pattern of the TFRecord files to read. Defaults to
            the location this notebook downloads the validation set to.

    Returns:
        A dataset yielding ``(image, label, label_text)`` batches for one pass
        over the matched files.

    Raises:
        FileNotFoundError: If ``data_pattern`` matches no files.
    """
    files = tf.io.gfile.glob(data_pattern)
    if not files:
        # Fail here with a clear message rather than later, deep inside the
        # benchmark, when the pipeline turns out to have nothing to read.
        raise FileNotFoundError(f'No TFRecord files match {data_pattern!r}')

    dataset = tf.data.TFRecordDataset(files)

    dataset = dataset.map(map_func=val_preprocessing, num_parallel_calls=8)
    dataset = dataset.batch(batch_size=batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    dataset = dataset.repeat(count=1)

    if use_cache:
        # Start from an empty cache directory so stale cache files are never reused.
        shutil.rmtree('tfdatacache', ignore_errors=True)
        os.mkdir('tfdatacache')
        dataset = dataset.cache('./tfdatacache/imagenet_val')

    return dataset

## Single-threaded execution: single-core and multi-core pipeline models

In [None]:
def inf1_predict_benchmark_single_threaded(neuron_saved_model_name, user_batch_size, use_cache=True, warm_up=10):
    """Benchmark a compiled SavedModel from a single thread.

    Loads the model as a predictor, runs it over every batch produced by
    ``get_dataset`` and records the latency of each predictor call.

    Args:
        neuron_saved_model_name: Path of the compiled SavedModel directory.
        user_batch_size: Number of images fed to the predictor per call.
        use_cache: When true, the dataset is iterated once up front to fill the
            on-disk cache before the timed pass.
        warm_up: Number of untimed predictor calls made on the first batch
            before timing starts.

    Returns:
        Tuple ``(results, iter_times)``: a one-row DataFrame of accuracy,
        throughput and latency statistics (latencies in milliseconds), and the
        array of per-call latencies in seconds.
    """
    print(f'Running model {neuron_saved_model_name}, user_batch_size: {user_batch_size}\n')

    model_inf1 = tf.contrib.predictor.from_saved_model(neuron_saved_model_name)

    iter_times = []
    pred_labels = []
    actual_labels = []
    display_threshold = 0

    ds = get_dataset(user_batch_size, use_cache)

    ds_iter = ds.make_initializable_iterator()
    ds_next = ds_iter.get_next()
    ds_init_op = ds_iter.initializer

    with tf.Session() as sess:
        if use_cache:
            # One complete untimed pass so the cache is populated before measuring.
            sess.run(ds_init_op)
            print('\nCaching dataset ...')
            start_time = time.time()
            try:
                while True:
                    (validation_ds,label,_) = sess.run(ds_next)
            except tf.errors.OutOfRangeError:
                pass
            print(f'Caching finished: {time.time()-start_time} sec')  

        try:
            sess.run(ds_init_op)
            counter = 0
            ipname = list(model_inf1.feed_tensors.keys())[0]
            resname = list(model_inf1.fetch_tensors.keys())[0]

            while True:
                (validation_ds,label,_) = sess.run(ds_next)

                model_feed_dict={ipname: validation_ds}

                if counter == 0:
                    # Untimed warm-up calls, honouring the caller's `warm_up` count.
                    for _ in range(warm_up):
                        model_inf1(model_feed_dict)

                start_time = time.time()
                inf1_results = model_inf1(model_feed_dict)
                iter_times.append(time.time() - start_time)

                # Each label has shape [1], so flatten the batch into a plain list.
                actual_labels.extend(l for k in label for l in k)
                pred_labels.extend(list(np.argmax(inf1_results[resname], axis=1)))

                if (counter+1)*user_batch_size >= display_threshold:
                    print(f'Images {(counter+1)*user_batch_size}/50000. Average i/s {np.mean(user_batch_size/np.array(iter_times))}')
                    display_threshold+=5000

                counter+=1

        except tf.errors.OutOfRangeError:
            # Raised by the iterator once the dataset is exhausted.
            pass
        
    acc_inf1 = np.sum(np.array(actual_labels) == np.array(pred_labels))/len(actual_labels)
    iter_times = np.array(iter_times)

    # Take the label from the argument rather than from a notebook-level global,
    # so the function works on a fresh kernel and never reports a stale model name.
    model_label = os.path.basename(os.path.normpath(neuron_saved_model_name))
    
    results = pd.DataFrame()
    results['model']                   = [f'{model_label}_user_batch_{user_batch_size}_single_core']
    results['accuracy']                = [acc_inf1]
    results['prediction_time']         = [np.sum(iter_times)]
    results['images_per_sec_mean']     = [np.mean(user_batch_size/np.array(iter_times))]
    results['latency_per_thread_99th_percentile'] = [np.percentile(iter_times, q=99, interpolation="lower") * 1000]
    results['latency_per_thread_mean']            = [np.mean(iter_times) * 1000]
    results['latency_per_thread_median']          = [np.median(iter_times) * 1000]
    results['latency_per_thread_min']             = [np.min(iter_times) * 1000]
    
    display(results)
    return results, iter_times

In [None]:
# Benchmark one compiled model from a single thread.
batch_size = 1
num_cores = 1
compiled_model_precision = 'fp16'

saved_model_dir = f'resnet50_saved_model_{compiled_model_precision}'
parent_dir = 'resnet50_inf1_saved_models'

compiled_model_dir = f'resnet50_{compiled_model_precision}_batch_{batch_size}_inf1_cores_{num_cores}'
inf1_compiled_model_dir = os.path.join(parent_dir, compiled_model_dir)

print(f'inf1_compiled_model_dir: {inf1_compiled_model_dir}')
print(f'compiled_model_precision: {compiled_model_precision}')

# Call the benchmark defined above; the previous `..._single_threaded_1` name is
# not defined anywhere in the visible notebook and fails on a fresh kernel.
results, latency_per_thread = inf1_predict_benchmark_single_threaded(inf1_compiled_model_dir, 
                                                                     user_batch_size = batch_size*10, 
                                                                     use_cache=False, 
                                                                     warm_up=10)

## Multi-threaded execution
### Benchmark: Measure using latency of first thread

In [3]:
# # Multi-process multi-core batched
# def inf1_benchmark_latency_first_threads(neuron_saved_model_name, user_batch_size, num_model_copies, threads_per_model = 1, use_cache=True, warm_up=True):
#     try:
#         predictor_list = [tf.contrib.predictor.from_saved_model(neuron_saved_model_name) for _ in range(num_model_copies)]
#     except Exception as e:
#         print(str(e))

#     predictor_list = predictor_list * threads_per_model
#     inference_threads = len(predictor_list)
        
#     global latency_per_thread, counter, total_images_predicted, total_latency, total_throughput
#     latency_per_thread = [[] for _ in range(inference_threads)]
#     total_images_predicted = [0 for _ in range(inference_threads)]
#     total_latency = [0 for _ in range(inference_threads)]
#     total_throughput = []
#     counter = 0
    
#     ipname = list(predictor_list[0].feed_tensors.keys())[0]
#     resname = list(predictor_list[0].fetch_tensors.keys())[0]

#     iter_times = []
#     pred_labels = []
#     actual_labels = []

#     ds = get_dataset(user_batch_size, use_cache)

#     ds_iter = ds.make_initializable_iterator()
#     ds_next = ds_iter.get_next()
#     ds_init_op = ds_iter.initializer
    
#     if use_cache:
#         with tf.Session() as sess:
#             sess.run(ds_init_op)
#             print('\nCaching dataset ...')
#             start_time = time.time()
#             try:
#                 while True:
#                     (validation_ds,label,_) = sess.run(ds_next)
#             except tf.errors.OutOfRangeError:
#                 pass
#             print(f'Caching finished: {time.time()-start_time} sec')  
            
#     ds_iter = ds.make_initializable_iterator()
#     ds_next = ds_iter.get_next()
#     ds_init_op = ds_iter.initializer
    
#     def inf1_predict(predictor, model_feed_dict, counter, inference_threads, thread_assignment, user_batch_size, warm_up):
#         global latency_per_thread, total_images_predicted, total_latency
#         if counter in range(inference_threads):
#             if warm_up:
#                 _ = predictor(model_feed_dict)
       
#         start_time = time.time()
#         pred = predictor(model_feed_dict)
#         latency = time.time() - start_time
        
#         latency_per_thread[thread_assignment].append(latency)
#         total_images_predicted[thread_assignment] += user_batch_size
#         total_latency[thread_assignment] += latency
        
#         return {'index':counter,'latency':latency, **pred}
    
#     def measure_throughput(user_batch_size):
#         global counter, total_images_predicted, total_latency, total_throughput, throughput
#         display_threshold = 0
#         start_time = 0
#         throughput = 0
#         while counter >= 0:
#             if total_latency[0] > 1.0:
#                 throughput = sum(total_images_predicted)/total_latency[0]
#                 total_throughput.append(throughput)
#                 total_images_predicted = [0 for _ in range(len(total_images_predicted))]
#                 total_latency = [0 for _ in range(len(total_latency))]
#             if (counter)*user_batch_size >= display_threshold:
#                 print(f'Images: {counter*user_batch_size}/50000. Average images/sec across threads: {throughput}')
#                 display_threshold+=5000

#     # submit each image to predictors in a round-robin fashion
#     future_list = []
#     with futures.ThreadPoolExecutor(max_workers=inference_threads + 1) as executor:
#         executor.submit(measure_throughput, user_batch_size)
#         inference_walltime = time.time()
#         with tf.Session() as sess:
#             try:
#                 sess.run(ds_init_op)
#                 while True:
#                     (validation_ds,label,_) = sess.run(ds_next)
#                     model_feed_dict={ipname: validation_ds}
#                     actual_labels.extend(l for k in label for l in k)
                    
#                     thread_assignment = counter % len(predictor_list)
#                     predictor = predictor_list[thread_assignment]
#                     ex_results = executor.submit(inf1_predict, predictor, model_feed_dict, counter, inference_threads, thread_assignment, user_batch_size, warm_up)
                    
#                     future_list.append(ex_results)
#                     counter+=1

#             except tf.errors.OutOfRangeError:
#                 counter = -1
#                 pass

#         pred_labels = []
#         img_index = []
#         iter_times_all_threads = []
#         for f in future_list:
#             res = f.result()
#             img_index.append(res['index'])
#             iter_times_all_threads.append(res['latency'])
#             pred_labels.extend(np.argmax(res['probs/Softmax:0'], axis=1))
#         inference_walltime = time.time() - inference_walltime
        
#     iter_times_all_threads = np.array(iter_times_all_threads)
#     acc_inf1 = np.sum(np.array(actual_labels) == np.array(pred_labels))/len(actual_labels)
    
#     results = pd.DataFrame()
#     results['model']                              = [neuron_saved_model_name]
#     results['instance_type']                      = [requests.get('http://169.254.169.254/latest/meta-data/instance-type').text]
#     results['user_batch_size']                    = [user_batch_size]
#     results['num_model_copies']                   = [num_model_copies]
#     results['threads_per_model']                  = [threads_per_model]
#     results['accuracy']                           = [acc_inf1]
#     results['prediction_time']                    = [np.sum(iter_times_all_threads)]
#     results['images_per_sec_mean']                = [np.mean(total_throughput)]
#     results['images_per_sec_per_thread_mean']     = [np.mean(user_batch_size / iter_times_all_threads)]
#     results['latency_per_thread_99th_percentile'] = [np.percentile(iter_times_all_threads, q=99, interpolation="lower") * 1000]
#     results['latency_per_thread_mean']            = [np.mean(iter_times_all_threads) * 1000]
#     results['latency_per_thread_median']          = [np.median(iter_times_all_threads) * 1000]
#     results['latency_per_thread_min']             = [np.min(iter_times_all_threads) * 1000]
#     results['inference_walltime']                 = [inference_walltime]

#     display(results)
#     return results, latency_per_thread, iter_times_all_threads

## Multi-threaded execution
### Benchmark: Measure using max latency thread

In [4]:
# Calculate images per 1 sec, Multi-process multi-core batched
def inf1_benchmark_latency_max_threads(neuron_saved_model_name, user_batch_size, num_model_copies, threads_per_model = 1, use_cache=True, warm_up=True):
    """Benchmark a compiled SavedModel with several predictor threads.

    ``num_model_copies`` predictors are loaded and the list is repeated
    ``threads_per_model`` times. Batches are submitted to the predictors
    round-robin on a thread pool, while an extra monitor thread periodically
    samples throughput from the per-thread counters.

    Args:
        neuron_saved_model_name: Path of the compiled SavedModel directory.
        user_batch_size: Number of images fed to a predictor per call.
        num_model_copies: Number of predictor instances to load.
        threads_per_model: How many worker slots share each predictor.
        use_cache: When true, the dataset is iterated once up front to fill the
            on-disk cache before the timed pass.
        warm_up: When true, the first ``inference_threads`` submissions each
            make one untimed predictor call before the timed one.

    Returns:
        Tuple ``(results, latency_per_thread, iter_times_all_threads)``: a
        one-row DataFrame of statistics (latencies in milliseconds), the
        per-thread lists of latencies in seconds, and the array of all
        latencies in submission order.

    Raises:
        Exception: Whatever ``tf.contrib.predictor.from_saved_model`` raises
            when a predictor cannot be loaded (printed, then re-raised).
    """
    try:
        predictor_list = [tf.contrib.predictor.from_saved_model(neuron_saved_model_name) for _ in range(num_model_copies)]
    except Exception as e:
        # Report the load failure and stop here; continuing would only fail a
        # few lines later with an unrelated "predictor_list is not defined".
        print(str(e))
        raise

    predictor_list = predictor_list * threads_per_model
    inference_threads = len(predictor_list)
        
    # Module-level state shared between the worker threads and the monitor thread.
    global latency_per_thread, counter, total_images_predicted, total_latency, total_throughput
    latency_per_thread = [[] for _ in range(inference_threads)]
    total_images_predicted = [0 for _ in range(inference_threads)]
    total_latency = [0 for _ in range(inference_threads)]
    total_throughput = []
    counter = 0
    
    ipname = list(predictor_list[0].feed_tensors.keys())[0]
    resname = list(predictor_list[0].fetch_tensors.keys())[0]

    actual_labels = []

    ds = get_dataset(user_batch_size, use_cache)

    ds_iter = ds.make_initializable_iterator()
    ds_next = ds_iter.get_next()
    ds_init_op = ds_iter.initializer
    
    if use_cache:
        # One complete untimed pass so the cache is populated before measuring.
        with tf.Session() as sess:
            sess.run(ds_init_op)
            print('\nCaching dataset ...')
            start_time = time.time()
            try:
                while True:
                    (validation_ds,label,_) = sess.run(ds_next)
            except tf.errors.OutOfRangeError:
                pass
            print(f'Caching finished: {time.time()-start_time} sec')  
            
    # Fresh iterator for the timed pass.
    ds_iter = ds.make_initializable_iterator()
    ds_next = ds_iter.get_next()
    ds_init_op = ds_iter.initializer
    
    def inf1_predict(predictor, model_feed_dict, counter, inference_threads, thread_assignment, user_batch_size, warm_up):
        """Run one timed prediction and record its latency for its thread slot."""
        global latency_per_thread, total_images_predicted, total_latency
        if counter in range(inference_threads):
            if warm_up:
                _ = predictor(model_feed_dict)
       
        start_time = time.time()
        pred = predictor(model_feed_dict)
        latency = time.time() - start_time
        
        latency_per_thread[thread_assignment].append(latency)
        total_images_predicted[thread_assignment] += user_batch_size
        total_latency[thread_assignment] += latency
        
        return {'index':counter,'latency':latency, **pred}
    
    def measure_throughput(user_batch_size):
        """Monitor loop: sample throughput until `counter` becomes negative."""
        global counter, total_images_predicted, total_latency, total_throughput, throughput
        display_threshold = 0
        start_time = 0
        throughput = 0
        while counter >= 0:
            if max(total_latency) -  start_time > 0.1:
                # NOTE(review): images / seconds is already images per second;
                # the extra factor of 10 is kept from the original — confirm it is intended.
                throughput = 10*sum(total_images_predicted)/max(total_latency)
                total_throughput.append(throughput)
                start_time = 0
                total_images_predicted = [0 for _ in range(len(total_images_predicted))]
                total_latency = [0 for _ in range(len(total_latency))]
            if (counter)*user_batch_size >= display_threshold:
                print(f'Images: {counter*user_batch_size}/50000. Average images/sec across threads: {throughput}')
                display_threshold+=5000
            # Yield briefly instead of busy-spinning, so the monitor does not
            # compete with the inference threads for the interpreter.
            time.sleep(0.001)

    # submit each image to predictors in a round-robin fashion
    future_list = []
    with futures.ThreadPoolExecutor(max_workers=inference_threads + 1) as executor:
        executor.submit(measure_throughput, user_batch_size)
        try:
            with tf.Session() as sess:
                try:
                    sess.run(ds_init_op)
                    while True:
                        (validation_ds,label,_) = sess.run(ds_next)
                        model_feed_dict={ipname: validation_ds}
                        # Each label has shape [1], so flatten the batch into a plain list.
                        actual_labels.extend(l for k in label for l in k)
                        
                        thread_assignment = counter % len(predictor_list)
                        predictor = predictor_list[thread_assignment]
                        ex_results = executor.submit(inf1_predict, predictor, model_feed_dict, counter, inference_threads, thread_assignment, user_batch_size, warm_up)
                        
                        future_list.append(ex_results)
                        counter+=1

                except tf.errors.OutOfRangeError:
                    # Raised by the iterator once the dataset is exhausted.
                    pass
        finally:
            # Always stop the monitor thread, including when an unexpected error
            # occurs; otherwise the executor would wait on it forever at shutdown.
            counter = -1

        pred_labels = []
        iter_times_all_threads = []
        for f in future_list:
            res = f.result()
            iter_times_all_threads.append(res['latency'])
            # Index by the predictor's own output key rather than a hard-coded name.
            pred_labels.extend(np.argmax(res[resname], axis=1))
    
    iter_times_all_threads = np.array(iter_times_all_threads)
    acc_inf1 = np.sum(np.array(actual_labels) == np.array(pred_labels))/len(actual_labels)
    
    results = pd.DataFrame()
    results['model']                              = [neuron_saved_model_name]
    results['instance_type']                      = [requests.get('http://169.254.169.254/latest/meta-data/instance-type').text]
    results['user_batch_size']                    = [user_batch_size]
    results['num_model_copies']                   = [num_model_copies]
    results['threads_per_model']                  = [threads_per_model]
    results['accuracy']                           = [acc_inf1]
    results['prediction_time']                    = [np.sum(iter_times_all_threads)]
    results['images_per_sec_mean']                = [np.mean(total_throughput)]
    results['images_per_sec_per_thread_mean']     = [np.mean(user_batch_size / iter_times_all_threads)]
    results['latency_per_thread_99th_percentile'] = [np.percentile(iter_times_all_threads, q=99, interpolation="lower") * 1000]
    results['latency_per_thread_mean']            = [np.mean(iter_times_all_threads) * 1000]
    results['latency_per_thread_median']          = [np.median(iter_times_all_threads) * 1000]
    results['latency_per_thread_min']             = [np.min(iter_times_all_threads) * 1000]
    display(results)
    return results, latency_per_thread, iter_times_all_threads

In [11]:
def submit_inference_request(batch_size, compiled_num_cores, saved_model_precision, num_model_copies=0, threads_per_model=2):
    """Run the multi-threaded benchmark for one compiled model configuration.

    Args:
        batch_size: Batch size the model was compiled with. The benchmark feeds
            ``batch_size * 10`` images per predictor call.
        compiled_num_cores: Number of NeuronCores the model was compiled for.
        saved_model_precision: Precision tag used in the compiled model's
            directory name (e.g. ``'fp32'`` or ``'fp16'``).
        num_model_copies: Number of predictor copies to load. ``0`` (the
            default) derives it from the instance type as
            ``available cores // compiled_num_cores``.
        threads_per_model: Number of worker slots per model copy.

    Returns:
        The one-row results DataFrame from ``inf1_benchmark_latency_max_threads``.

    Raises:
        ValueError: If no model copy can be placed, because the instance type is
            not in the known table or the model needs more cores than available.
    """
    avail_neuroncores_dict = {
        'inf1.xlarge' : 4,
        'inf1.2xlarge' : 4,
        'inf1.6xlarge' : 16,
        'inf1.24xlarge' : 64
    }
    # Bound the metadata request so the cell cannot hang if the endpoint is unreachable.
    instance_type = requests.get('http://169.254.169.254/latest/meta-data/instance-type', timeout=5).text
    avail_num_cores = avail_neuroncores_dict.get(instance_type, 0)
    
    if not num_model_copies:
        num_model_copies = avail_num_cores//compiled_num_cores

    if num_model_copies < 1:
        # Zero copies would otherwise show up as an IndexError deep inside the benchmark.
        raise ValueError(
            f'Cannot place any model copy: instance type {instance_type!r} provides '
            f'{avail_num_cores} known NeuronCores and the model needs {compiled_num_cores}. '
            'Pass num_model_copies explicitly to override.')
    
    user_batch_size = batch_size*10

    parent_dir = 'resnet50_inf1_saved_models'
    compiled_model_dir = f'resnet50_{saved_model_precision}_batch_{batch_size}_inf1_cores_{compiled_num_cores}'
    inf1_compiled_model_dir = os.path.join(parent_dir, compiled_model_dir)

    print(f"""
-> Compiled batch size: {batch_size}
-> Compiled neuron cores: {compiled_num_cores}
-> Saved model precision: {saved_model_precision}

-> Available neuron cores: {avail_num_cores}
-> Number of model copies that fits on {instance_type}: {num_model_copies}
-> Number of CPU threads to feed each model: {threads_per_model}
-> User batch size: {user_batch_size}

-> Compiled model dir: {inf1_compiled_model_dir}
            """)
    
    results, _, _ = inf1_benchmark_latency_max_threads(inf1_compiled_model_dir, 
                                                          user_batch_size = user_batch_size, 
                                                          num_model_copies = num_model_copies, 
                                                          threads_per_model = threads_per_model,
                                                          use_cache=True)
    
    return results

In [12]:
# Sweep the number of feeder threads per model for the FP32, batch-5, single-core build.
model_list = [{'batch_size':5, 'compiled_num_cores':1, 'saved_model_precision':'fp32', 'threads_per_model':1},
              {'batch_size':5, 'compiled_num_cores':1, 'saved_model_precision':'fp32', 'threads_per_model':2},
              {'batch_size':5, 'compiled_num_cores':1, 'saved_model_precision':'fp32', 'threads_per_model':4}]

# Collect the one-row frames and concatenate once. DataFrame.append no longer
# exists in recent pandas, and growing a frame inside a loop is quadratic.
results = pd.concat([submit_inference_request(**m) for m in model_list], ignore_index=True)
results


-> Compiled batch size: 5
-> Compiled neuron cores: 1
-> Saved model precision: fp32

-> Available neuron cores: 16
-> Number of model copies that fits on inf1.6xlarge: 16
-> Number of CPU threads to feed each model: 1
-> User batch size: 50

-> Compiled model dir: resnet50_inf1_saved_models/resnet50_fp32_batch_5_inf1_cores_1
            
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:The specified SavedModel has no variables; no checkpoints were restored.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:The specified SavedModel has no variables; no checkpoints were restored.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:The specified SavedModel has no variables; no checkpoints were restored.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:The specified Saved

Unnamed: 0,model,instance_type,user_batch_size,num_model_copies,threads_per_model,accuracy,prediction_time,images_per_sec_mean,images_per_sec_per_thread_mean,latency_per_thread_99th_percentile,latency_per_thread_mean,latency_per_thread_median,latency_per_thread_min
0,resnet50_inf1_saved_models/resnet50_fp32_batch...,inf1.6xlarge,50,16,1,0.74862,214.939892,2656.106881,265.068709,1117.420673,214.939892,182.432055,171.118975



-> Compiled batch size: 5
-> Compiled neuron cores: 1
-> Saved model precision: fp32

-> Available neuron cores: 16
-> Number of model copies that fits on inf1.6xlarge: 16
-> Number of CPU threads to feed each model: 2
-> User batch size: 50

-> Compiled model dir: resnet50_inf1_saved_models/resnet50_fp32_batch_5_inf1_cores_1
            
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:The specified SavedModel has no variables; no checkpoints were restored.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:The specified SavedModel has no variables; no checkpoints were restored.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:The specified SavedModel has no variables; no checkpoints were restored.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:The specified Saved

Unnamed: 0,model,instance_type,user_batch_size,num_model_copies,threads_per_model,accuracy,prediction_time,images_per_sec_mean,images_per_sec_per_thread_mean,latency_per_thread_99th_percentile,latency_per_thread_mean,latency_per_thread_median,latency_per_thread_min
0,resnet50_inf1_saved_models/resnet50_fp32_batch...,inf1.6xlarge,50,16,2,0.74862,685.417934,1484.43441,142.486072,2080.97291,685.417934,544.714093,173.460007



-> Compiled batch size: 5
-> Compiled neuron cores: 1
-> Saved model precision: fp32

-> Available neuron cores: 16
-> Number of model copies that fits on inf1.6xlarge: 16
-> Number of CPU threads to feed each model: 4
-> User batch size: 50

-> Compiled model dir: resnet50_inf1_saved_models/resnet50_fp32_batch_5_inf1_cores_1
            
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:The specified SavedModel has no variables; no checkpoints were restored.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:The specified SavedModel has no variables; no checkpoints were restored.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:The specified SavedModel has no variables; no checkpoints were restored.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:The specified Saved

Unnamed: 0,model,instance_type,user_batch_size,num_model_copies,threads_per_model,accuracy,prediction_time,images_per_sec_mean,images_per_sec_per_thread_mean,latency_per_thread_99th_percentile,latency_per_thread_mean,latency_per_thread_median,latency_per_thread_min
0,resnet50_inf1_saved_models/resnet50_fp32_batch...,inf1.6xlarge,50,16,4,0.74862,1358.257376,769.428956,78.040777,2826.501369,1358.257376,1413.931966,210.726976


Unnamed: 0,model,instance_type,user_batch_size,num_model_copies,threads_per_model,accuracy,prediction_time,images_per_sec_mean,images_per_sec_per_thread_mean,latency_per_thread_99th_percentile,latency_per_thread_mean,latency_per_thread_median,latency_per_thread_min
0,resnet50_inf1_saved_models/resnet50_fp32_batch...,inf1.6xlarge,50,16,1,0.74862,214.939892,2656.106881,265.068709,1117.420673,214.939892,182.432055,171.118975
1,resnet50_inf1_saved_models/resnet50_fp32_batch...,inf1.6xlarge,50,16,2,0.74862,685.417934,1484.43441,142.486072,2080.97291,685.417934,544.714093,173.460007
2,resnet50_inf1_saved_models/resnet50_fp32_batch...,inf1.6xlarge,50,16,4,0.74862,1358.257376,769.428956,78.040777,2826.501369,1358.257376,1413.931966,210.726976


In [None]:
# Sweep the number of feeder threads per model for the FP16, batch-5, single-core build.
model_list = [{'batch_size':5, 'compiled_num_cores':1, 'saved_model_precision':'fp16', 'threads_per_model':1},
              {'batch_size':5, 'compiled_num_cores':1, 'saved_model_precision':'fp16', 'threads_per_model':2},
              {'batch_size':5, 'compiled_num_cores':1, 'saved_model_precision':'fp16', 'threads_per_model':4},
              {'batch_size':5, 'compiled_num_cores':1, 'saved_model_precision':'fp16', 'threads_per_model':8}]

# Collect the one-row frames and concatenate once. DataFrame.append no longer
# exists in recent pandas, and growing a frame inside a loop is quadratic.
results = pd.concat([submit_inference_request(**m) for m in model_list], ignore_index=True)
results

In [None]:
# Unpack (index, latency, probabilities) from every completed future into three lists.
# NOTE(review): `future_list` is only ever assigned as a local inside
# inf1_benchmark_latency_max_threads, and the result dicts built there use the
# key 'index', not 'i_num'. This cell likely depends on state from a removed
# cell and will not run on a fresh kernel — confirm or delete.
img_index, iter_times_per_thread, pred_prob = map(list,zip(*[(f.result()['i_num'],f.result()['latency'],f.result()['probs/Softmax:0']) for f in future_list]))

In [None]:
# Time how long it takes to collect indices, latencies and predicted labels
# from the futures.
# NOTE(review): `future_list` is only ever assigned as a local inside
# inf1_benchmark_latency_max_threads, and the result dicts built there use the
# key 'index', not 'i_num'. This cell likely depends on state from a removed
# cell and will not run on a fresh kernel — confirm or delete.
s = time.time()
pred_labels = []
img_index = []
iter_times_per_thread = []

for f in future_list:
    res = f.result()
    img_index.append(res['i_num'])
    iter_times_per_thread.append(res['latency'])
    # Predicted class = index of the largest probability in each row.
    pred_labels.extend(np.argmax(res['probs/Softmax:0'], axis=1))

# Elapsed seconds for the collection loop above.
print(time.time()-s)

In [None]:
# Inspect how many entries latency_per_thread holds.
# NOTE(review): within this part of the notebook latency_per_thread is only
# assigned by benchmark cells further down, so this cell depends on execution
# order -- confirm it still works after Restart & Run All.
len(latency_per_thread)

In [None]:
# --- Compiled-model selection ---------------------------------------------
compiled_model_precision = 'fp32'
batch_size, compiled_num_cores = 1, 1

# --- Inference-time settings ----------------------------------------------
infer_num_cores, threads_per_model = 1, 6

# --- Paths derived from the settings above --------------------------------
parent_dir = 'resnet50_inf1_saved_models'
saved_model_dir = f'resnet50_saved_model_{compiled_model_precision}'
compiled_model_dir = f'resnet50_{compiled_model_precision}_batch_{batch_size}_inf1_cores_{compiled_num_cores}'
inf1_compiled_model_dir = os.path.join(parent_dir, compiled_model_dir)

print(f'inf1_compiled_model_dir: {inf1_compiled_model_dir}')
print(f'compiled_model_precision: {compiled_model_precision}')

# Requests carry ten times the compiled batch size.
# NOTE(review): this cell unpacks three return values, while other cells in
# this notebook unpack a different number from the same function -- confirm
# which version of inf1_predict_benchmark_multi_threaded is current.
results, latency_per_thread, pred_prob = inf1_predict_benchmark_multi_threaded(
    inf1_compiled_model_dir,
    user_batch_size=batch_size * 10,
    infer_num_cores=infer_num_cores,
    threads_per_model=threads_per_model,
    use_cache=False,
    warm_up=10,
)

In [None]:
# Inspect the length of the first entry of latency_per_thread
# (presumably the number of latency samples recorded by one thread -- confirm).
len(latency_per_thread[0])

In [None]:
# Compiled-model selection: fp32, batch 5, compiled for a single core.
compiled_model_precision = 'fp32'
batch_size, num_cores = 5, 1

parent_dir = 'resnet50_inf1_saved_models'
saved_model_dir = f'resnet50_saved_model_{compiled_model_precision}'
compiled_model_dir = f'resnet50_{compiled_model_precision}_batch_{batch_size}_inf1_cores_{num_cores}'
inf1_compiled_model_dir = os.path.join(parent_dir, compiled_model_dir)

print(f'inf1_compiled_model_dir: {inf1_compiled_model_dir}')
print(f'compiled_model_precision: {compiled_model_precision}')

# Run the same benchmark with 1 and then 2 threads per model. As before, only
# the outputs of the last run remain bound to results / latency_per_thread.
for n_threads in (1, 2):
    results, latency_per_thread = inf1_predict_benchmark_multi_threaded(
        inf1_compiled_model_dir,
        user_batch_size=batch_size * 10,
        infer_num_cores=16,
        threads_per_model=n_threads,
        use_cache=True,
        warm_up=10,
    )

In [None]:
# Compiled-model variants to benchmark. 'use_static_weights' appears on two
# entries but is not read anywhere in this cell.
model_attributes = [
    {'batch_size': 1, 'num_cores': 1},
    {'batch_size': 5, 'num_cores': 1},
    {'batch_size': 1, 'num_cores': 16, 'use_static_weights': True},
    {'batch_size': 2, 'num_cores': 16, 'use_static_weights': True},
    {'batch_size': 1, 'num_cores': 4},
    {'batch_size': 5, 'num_cores': 2},
    {'batch_size': 5, 'num_cores': 12},
]

saved_model_dir = 'resnet50_saved_model_fp16'
parent_dir = 'resnet50_inf1_saved_models'

# The loop variable is deliberately NOT called `model`: that name is bound to
# the Keras ResNet50 in the FP32 export cell, and rebinding it to a dict here
# would silently break any later cell that expects the network object.
for model_attr in model_attributes:
    batch_size = model_attr['batch_size']
    num_cores = model_attr['num_cores']

    # saved_model_dir[-4:] is the precision suffix of the directory name ('fp16' here).
    compiled_model_dir = f'resnet50_{saved_model_dir[-4:]}_batch_{batch_size}_inf1_cores_{num_cores}'
    inf1_compiled_model_dir = os.path.join(parent_dir, compiled_model_dir)

    # Models compiled for one core are benchmarked with infer_num_cores=16;
    # models compiled for several cores are benchmarked with infer_num_cores=1.
    infer_num_cores = 16 if num_cores == 1 else 1

    print(inf1_compiled_model_dir)

    # The return value is intentionally discarded, as in the original cell.
    inf1_predict_benchmark_multi_threaded(inf1_compiled_model_dir,
                                          user_batch_size=batch_size * 10,
                                          infer_num_cores=infer_num_cores,
                                          use_cache=False,
                                          warm_up=10)
    

In [None]:
# Benchmark the auto_bfloat16 model (compiled batch 1, compiled for 1 core)
# with infer_num_cores=4 and 50-image requests, then show the results.
boption = dict(compiled_batch_size=1, precision='auto_bfloat16', compiled_num_cores=1)
results, future_list, latency_per_thread = inf1_predict_benchmark_multi_threaded(
    boption,
    user_batch_size=50,
    infer_num_cores=4,
    use_cache=False,
    warm_up=10,
)
results

In [None]:
# Benchmark the fp16 model (compiled batch 1, compiled for 1 core)
# with infer_num_cores=4 and 50-image requests, then show the results.
boption = dict(compiled_batch_size=1, compiled_num_cores=1, precision='fp16')
results, future_list, latency_per_thread = inf1_predict_benchmark_multi_threaded(
    boption,
    user_batch_size=50,
    infer_num_cores=4,
    use_cache=False,
    warm_up=10,
)
results

In [None]:
# Benchmark the fp16 model (compiled batch 5, compiled for 1 core)
# with infer_num_cores=4 and 50-image requests, then show the results.
boption = dict(compiled_batch_size=5, compiled_num_cores=1, precision='fp16')
results, future_list, latency_per_thread = inf1_predict_benchmark_multi_threaded(
    boption,
    user_batch_size=50,
    infer_num_cores=4,
    use_cache=False,
    warm_up=10,
)
results

In [None]:
# Benchmark the fp16 model (compiled batch 5, compiled for 1 core)
# with infer_num_cores=16 and 50-image requests, then show the results.
boption = dict(compiled_batch_size=5, compiled_num_cores=1, precision='fp16')
results, future_list, latency_per_thread = inf1_predict_benchmark_multi_threaded(
    boption,
    user_batch_size=50,
    infer_num_cores=16,
    use_cache=False,
    warm_up=10,
)
results

In [None]:
# Benchmark the fp16 model (compiled batch 5, compiled for 1 core)
# with infer_num_cores=32 and 50-image requests, then show the results.
boption = dict(compiled_batch_size=5, compiled_num_cores=1, precision='fp16')
results, future_list, latency_per_thread = inf1_predict_benchmark_multi_threaded(
    boption,
    user_batch_size=50,
    infer_num_cores=32,
    use_cache=False,
    warm_up=10,
)
results

In [None]:
# Benchmark the fp16 model (compiled batch 1, compiled for 16 cores)
# with infer_num_cores=1 and 50-image requests, then show the results.
boption = dict(compiled_batch_size=1, compiled_num_cores=16, precision='fp16')
results, future_list, latency_per_thread = inf1_predict_benchmark_multi_threaded(
    boption,
    user_batch_size=50,
    infer_num_cores=1,
    use_cache=False,
    warm_up=10,
)
results

In [None]:
# Load the compiled fp16 SavedModel (batch 1, 1 core) as a predictor.
predictor = tf.contrib.predictor.from_saved_model(
    'resnet50_inf1_saved_models/resnet50_fp16_batch_1_cores_1'
)

# Name of the first entry in the predictor's fetch_tensors; later cells use it
# to index each result dict.
fetch_names = list(predictor.fetch_tensors.keys())
resname = fetch_names[0]

In [None]:
# Resolve each future once, then transpose the per-request records into three
# parallel lists: image index, latency, and the output stored under `resname`.
thread_outputs = [f.result() for f in future_list]
img_index, iter_times_per_thread, pred_prob = (
    list(column)
    for column in zip(*[(out['i_num'], out['latency'], out[resname])
                        for out in thread_outputs])
)

In [None]:
# Ratio of 10 to the mean of the recorded per-request latencies.
# NOTE(review): presumably a per-thread throughput estimate, with 10 standing
# for the number of images per request -- confirm that 10 matches the
# user_batch_size used by the run that filled iter_times_per_thread (the
# boption cells in this notebook pass user_batch_size=50).
10/np.mean(iter_times_per_thread)

In [None]:
# Flatten the per-request outputs into one list of argmax labels: for each
# entry of pred_prob take argmax along axis 1 and chain the results together.
a = [label for p in pred_prob for label in np.argmax(p, axis=1)]
# This bare len() is not the cell's last expression, so its value is not
# displayed; only `a` below is rendered.
len(a)
a

In [None]:
# Shape of the argmax taken along axis 0 of pred_prob.
# NOTE(review): the neighbouring cell reduces each pred_prob entry along
# axis 1; axis 0 here reduces across the entries themselves -- confirm that is
# what this probe is meant to inspect.
np.argmax(pred_prob, axis=0).shape

In [None]:
# Put the latencies and outputs into ascending img_index order.
idx_sorted = np.argsort(img_index)

iter_times_sorted, pred_prob_sorted = [], []
for i in idx_sorted:
    iter_times_sorted.append(iter_times_per_thread[i])
    pred_prob_sorted.append(pred_prob[i])

idx_sorted

In [None]:
# Alternative ways of loading a compiled model directly, kept for reference:
# !/opt/aws/neuron/bin/neuron-cli reset
# model_inf1 = tf.contrib.predictor.from_saved_model('resnet50_inf1_saved_models/resnet50_auto_bfloat16_batch_1_cores_1')
# model_inf1 = tf.contrib.predictor.from_saved_model('resnet50_inf1_saved_models/resnet50_fp16_batch_1_cores_1')
# model_inf1 = tf.contrib.predictor.from_saved_model('resnet50_inf1_saved_models/resnet50_fp16_batch_5_cores_1')
# model_inf1 = tf.contrib.predictor.from_saved_model('resnet50_inf1_saved_models/resnet50_fp16_batch_1_cores_16')

# Benchmark option dicts: compiled batch size, precision and core count.
blist = [
    dict(compiled_batch_size=1, precision='auto_bfloat16', num_cores=1),
    dict(compiled_batch_size=1, precision='fp16', num_cores=1),
    dict(compiled_batch_size=5, precision='fp16', num_cores=1),
    dict(compiled_batch_size=1, precision='fp16', num_cores=16),
]

# Benchmark only the fourth option (fp16, batch 1, 16 cores) with 10-image requests.
results, iter_times = inf1_predict_benchmark(
    blist[3],
    user_batch_size=10,
    use_cache=True,
)

In [None]:
def col_name(boption):
    """Return the iter_ds column label for one benchmark-option dict.

    Reads the 'precision', 'compiled_batch_size' and 'user_batch_size' keys.
    """
    return f'inf1_{boption["precision"]}_batchcompiled_{boption["compiled_batch_size"]}_batchuser_{boption["user_batch_size"]}'


use_cache = False

# NOTE(review): this loop reads boption['user_batch_size'], but the blist
# literal defined in the preceding cell has no such key -- confirm blist is
# extended before this cell runs, otherwise it raises KeyError.
# NOTE(review): iter_ds and results must already exist when this cell runs;
# iter_ds is not assigned in the nearby cells -- confirm where it is created.
for boption in blist:
    # Double quotes inside the replacement fields: reusing the outer quote
    # character inside an f-string is a SyntaxError before Python 3.12.
    model_path = f'resnet50_inf1_saved_models/resnet50_{boption["precision"]}_{boption["compiled_batch_size"]}'
    # use_cache is passed by keyword: a positional argument may not follow a
    # keyword argument. Assumes get_dataset accepts `use_cache` -- TODO confirm.
    dataset = get_dataset(batch_size=boption['user_batch_size'], use_cache=use_cache)
    if use_cache:
        # Iterate the dataset once so the timed run below reads from the cache.
        print('Start caching ...')
        start_time = time.time()
        for _ in dataset:
            continue
        print(f'Caching finished: {time.time()-start_time} sec')

    res, it = inf1_predict_benchmark(dataset, model_path, **boption)
    # One column of iteration times per benchmarked option.
    iter_ds = pd.concat([iter_ds, pd.DataFrame(it, columns=[col_name(boption)])], axis=1)
    # pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in 2.0).
    results = res if results.empty else pd.concat([results, res])
    display(results)

results = results.reset_index(drop=True)