#### Neuron workflow

In [None]:
# !aws s3 sync s3://imagenet-dataset-us-west-2/imagenet-data/tfrecords/validation/ /home/ubuntu/datasets/
# !pip install matplotlib pandas

In [2]:
!/opt/aws/neuron/bin/neuron-cli reset
import os
import time
import shutil
import json
import requests
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.neuron as tfn
import tensorflow.compat.v1.keras as keras
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
from concurrent import futures
from itertools import compress

### Resnet50 FP32 saved model

In [3]:
# Export SavedModel
saved_model_dir = 'resnet50_saved_model'
shutil.rmtree(saved_model_dir, ignore_errors=True)

keras.backend.set_learning_phase(0)
model = ResNet50(weights='imagenet')
tf.saved_model.simple_save(session = keras.backend.get_session(),
                           export_dir = saved_model_dir,
                           inputs = {'input_1:0': model.inputs[0]},
                           outputs = {'probs/Softmax:0': model.outputs[0]})

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.simple_save.
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: resnet50_saved_model/saved_model.pb


### Compile models with different batch sizes and cores

In [4]:
def compile_inf1_model(saved_model_dir, inf1_model_dir, batch_size=1, num_cores=1, use_static_weights=False):
    print(f'-----------batch size: {batch_size}, num cores: {num_cores}----------')
    print('Compiling...')
    
    compiled_model_dir = f'resnet50_batch_{batch_size}_inf1_cores_{num_cores}'
    inf1_compiled_model_dir = os.path.join(inf1_model_dir, compiled_model_dir)
    shutil.rmtree(inf1_compiled_model_dir, ignore_errors=True)

    example_input = np.zeros([batch_size,224,224,3], dtype='float32')

    compiler_args = ['--verbose','1', '--num-neuroncores', str(num_cores)]
    if use_static_weights:
        compiler_args.append('--static-weights')
    
    start_time = time.time()
    compiled_res = tfn.saved_model.compile(model_dir = saved_model_dir,
                            model_feed_dict={'input_1:0': example_input},
                            new_model_dir = inf1_compiled_model_dir,
                            dynamic_batch_size=True,
                            compiler_workdir=f'./compiler-workdir/{inf1_compiled_model_dir}',
                            compiler_args = compiler_args)
    print(f'Compile time: {time.time() - start_time}')
    
    compile_success = False
    perc_on_inf = compiled_res['OnNeuronRatio'] * 100
    if perc_on_inf > 50:
        compile_success = True
            
    print(inf1_compiled_model_dir)
    print(compiled_res)
    print('----------- Done! ----------- \n')
    
    return compile_success

### Use `tf.data` to read ImageNet validation dataset

In [5]:
def deserialize_image_record(record):
    feature_map = {'image/encoded': tf.io.FixedLenFeature([], tf.string, ''),
                  'image/class/label': tf.io.FixedLenFeature([1], tf.int64, -1),
                  'image/class/text': tf.io.FixedLenFeature([], tf.string, '')}
    obj = tf.io.parse_single_example(serialized=record, features=feature_map)
    imgdata = obj['image/encoded']
    label = tf.cast(obj['image/class/label'], tf.int32)   
    label_text = tf.cast(obj['image/class/text'], tf.string)   
    return imgdata, label, label_text

def val_preprocessing(record):
    imgdata, label, label_text = deserialize_image_record(record)
    label -= 1
    image = tf.io.decode_jpeg(imgdata, channels=3, 
                              fancy_upscaling=False, 
                              dct_method='INTEGER_FAST')

    shape = tf.shape(image)
    height = tf.cast(shape[0], tf.float32)
    width = tf.cast(shape[1], tf.float32)
    side = tf.cast(tf.convert_to_tensor(256, dtype=tf.int32), tf.float32)

    scale = tf.cond(tf.greater(height, width),
                  lambda: side / width,
                  lambda: side / height)
    
    new_height = tf.cast(tf.math.rint(height * scale), tf.int32)
    new_width = tf.cast(tf.math.rint(width * scale), tf.int32)
    
    image = tf.image.resize(image, [new_height, new_width], method='bicubic')
    image = tf.image.resize_with_crop_or_pad(image, 224, 224)
    
    [image,] = tf.py_function(preprocess_input, [image], [tf.float32])
    
    return image, label, label_text

def get_dataset(batch_size, use_cache=False):
    data_dir = '/home/ubuntu/datasets/*'
    files = tf.io.gfile.glob(os.path.join(data_dir))
    dataset = tf.data.TFRecordDataset(files)
    
    dataset = dataset.map(map_func=val_preprocessing, num_parallel_calls=8)
    dataset = dataset.batch(batch_size=batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    dataset = dataset.repeat(count=1)
    
    if use_cache:
        shutil.rmtree('tfdatacache', ignore_errors=True)
        os.mkdir('tfdatacache')
        dataset = dataset.cache(f'./tfdatacache/imagenet_val')
    
    return dataset

## Single AWS Inferentia chip execution
* Single core compiled models with automatic data parallel model upto 4 cores
* Multi-core compiled models for pipeline execution

In [6]:
def inf1_predict_benchmark_single_threaded(neuron_saved_model_name, batch_size, user_batch_size, num_cores, use_cache=False, warm_up=10):
    print(f'Running model {neuron_saved_model_name}, user_batch_size: {user_batch_size}\n')

    model_inf1 = tf.contrib.predictor.from_saved_model(neuron_saved_model_name)

    iter_times = []
    pred_labels = []
    actual_labels = []
    display_threshold = 0
    warm_up = 10

    ds = get_dataset(user_batch_size, use_cache)

    ds_iter = ds.make_initializable_iterator()
    ds_next = ds_iter.get_next()
    ds_init_op = ds_iter.initializer

    with tf.Session() as sess:
        if use_cache:
            sess.run(ds_init_op)
            print('\nCaching dataset ...')
            start_time = time.time()
            try:
                while True:
                    (validation_ds,label,_) = sess.run(ds_next)
            except tf.errors.OutOfRangeError:
                pass
            print(f'Caching finished: {time.time()-start_time} sec')  

        try:
            sess.run(ds_init_op)
            counter = 0
            
            display_every = 5000
            display_threshold = display_every
            
            ipname = list(model_inf1.feed_tensors.keys())[0]
            resname = list(model_inf1.fetch_tensors.keys())[0]
            
            walltime_start = time.time()

            while True:
                (validation_ds,batch_labels,_) = sess.run(ds_next)

                model_feed_dict={ipname: validation_ds}

                if counter == 0:
                    for i in range(warm_up):
                        _ = model_inf1(model_feed_dict);                    

                start_time = time.time()
                inf1_results = model_inf1(model_feed_dict);
                iter_times.append(time.time() - start_time)
                
                actual_labels.extend(label for label_list in batch_labels for label in label_list)
                pred_labels.extend(list(np.argmax(inf1_results[resname], axis=1)))

                if counter*user_batch_size >= display_threshold:
                    print(f'Images {counter*user_batch_size}/50000. Average i/s {np.mean(user_batch_size/np.array(iter_times[-display_every:]))}')
                    display_threshold+=display_every

                counter+=1

        except tf.errors.OutOfRangeError:
            pass
        
    acc_inf1 = np.sum(np.array(actual_labels) == np.array(pred_labels))/len(actual_labels)
    iter_times = np.array(iter_times)
    
    results = pd.DataFrame(columns = [f'inf1_compiled_batch_size_{batch_size}_compiled_cores_{num_cores}'])
    results.loc['instance_type']           = [requests.get('http://169.254.169.254/latest/meta-data/instance-type').text]
    results.loc['compiled_batch_size']     = [user_batch_size]
    results.loc['user_batch_size']         = [user_batch_size]
    results.loc['accuracy']                = [acc_inf1]
    results.loc['prediction_time']         = [np.sum(iter_times)]
    results.loc['wall_time']               = [time.time() - walltime_start]
    results.loc['images_per_sec_mean']     = [np.mean(user_batch_size / iter_times)]
    results.loc['images_per_sec_std']      = [np.std(user_batch_size / iter_times, ddof=1)]
    results.loc['latency_mean']            = [np.mean(iter_times) * 1000]
    results.loc['latency_99th_percentile'] = [np.percentile(iter_times, q=99, interpolation="lower") * 1000]
    results.loc['latency_median']          = [np.median(iter_times) * 1000]
    results.loc['latency_min']             = [np.min(iter_times) * 1000]
    display(results.T)

    return results, iter_times

In [7]:
inf1_model_dir = 'resnet50_inf1_saved_models'
saved_model_dir = 'resnet50_saved_model'

compile_inf1_model(saved_model_dir, inf1_model_dir, batch_size=1, num_cores=1)
compile_inf1_model(saved_model_dir, inf1_model_dir, batch_size=5, num_cores=1)

-----------batch size: 1, num cores: 1----------
Compiling...
INFO:tensorflow:Restoring parameters from resnet50_saved_model/variables/variables
INFO:tensorflow:Froze 320 variables.
INFO:tensorflow:Converted 320 variables to const ops.
INFO:tensorflow:fusing subgraph neuron_op_d6f098c01c780733 with neuron-cc; log file is at /home/ubuntu/examples/ai-accelerators-examples/compiler-workdir/resnet50_inf1_saved_models/resnet50_batch_1_inf1_cores_1/neuron_op_d6f098c01c780733/graph_def.neuron-cc.log
INFO:tensorflow:Number of operations in TensorFlow session: 4638
INFO:tensorflow:Number of operations after tf.neuron optimizations: 556
INFO:tensorflow:Number of operations placed on Neuron runtime: 554
INFO:tensorflow:No assets to save.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: resnet50_inf1_saved_models/resnet50_batch_1_inf1_cores_1/saved_model.pb
INFO:tensorflow:Successfully converted resnet50_saved_model to resnet50_inf1_saved_models/resnet50_batch_1_inf1_core

True

In [8]:
inf1_model_dir = 'resnet50_inf1_saved_models'

compile_options = [{'batch_size': 1, 'num_cores': 1},
                  {'batch_size': 5, 'num_cores': 1}]

iter_ds = pd.DataFrame()
results = pd.DataFrame()

for opt in compile_options:
    batch_size = opt["batch_size"]
    num_cores = opt["num_cores"]
    compiled_model_dir = f'resnet50_batch_{batch_size}_inf1_cores_{num_cores}'
    inf1_compiled_model_dir = os.path.join(inf1_model_dir, compiled_model_dir)
   
    print(f'inf1_compiled_model_dir: {inf1_compiled_model_dir}')
    col_name = lambda opt: f'inf1_{batch_size}_multicores_{num_cores}'
    
    res, iter_times = inf1_predict_benchmark_single_threaded(inf1_compiled_model_dir,
                                                                     batch_size = batch_size,
                                                                     user_batch_size = batch_size*10,
                                                                     num_cores = num_cores,
                                                                     use_cache=False, 
                                                                     warm_up=10)
    
    iter_ds = pd.concat([iter_ds, pd.DataFrame(iter_times, columns=[col_name(opt)])], axis=1)
    results = pd.concat([results, res], axis=1)
    
display(results)

inf1_compiled_model_dir: resnet50_inf1_saved_models/resnet50_batch_1_inf1_cores_1
Running model resnet50_inf1_saved_models/resnet50_batch_1_inf1_cores_1, user_batch_size: 10

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.loader.load or tf.compat.v1.saved_model.load. There will be a new function for importing SavedModels in Tensorflow 2.0.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:The specified SavedModel has no variables; no checkpoints were restored.
Instructions for updating:
Use `fo

Unnamed: 0,instance_type,compiled_batch_size,user_batch_size,accuracy,prediction_time,wall_time,images_per_sec_mean,images_per_sec_std,latency_mean,latency_99th_percentile,latency_median,latency_min
inf1_compiled_batch_size_1_compiled_cores_1,inf1.6xlarge,10,10,0.74852,89.8638,95.6517,559.222,39.71,17.9728,19.9103,18.1652,16.1915


inf1_compiled_model_dir: resnet50_inf1_saved_models/resnet50_batch_5_inf1_cores_1
Running model resnet50_inf1_saved_models/resnet50_batch_5_inf1_cores_1, user_batch_size: 50

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:The specified SavedModel has no variables; no checkpoints were restored.
Images 5000/50000. Average i/s 738.6758460621063
Images 10000/50000. Average i/s 740.3487413644553
Images 15000/50000. Average i/s 740.3791603988406
Images 20000/50000. Average i/s 739.9697937942765
Images 25000/50000. Average i/s 739.4084359109809
Images 30000/50000. Average i/s 739.4808519265042
Images 35000/50000. Average i/s 739.8308391039278
Images 40000/50000. Average i/s 739.8149770681252
Images 45000/50000. Average i/s 739.6895689153628


Unnamed: 0,instance_type,compiled_batch_size,user_batch_size,accuracy,prediction_time,wall_time,images_per_sec_mean,images_per_sec_std,latency_mean,latency_99th_percentile,latency_median,latency_min
inf1_compiled_batch_size_5_compiled_cores_1,inf1.6xlarge,50,50,0.7486,67.9414,74.0794,739.384,50.7144,67.9414,74.1017,67.953,60.1182


Unnamed: 0,inf1_compiled_batch_size_1_compiled_cores_1,inf1_compiled_batch_size_5_compiled_cores_1
instance_type,inf1.6xlarge,inf1.6xlarge
compiled_batch_size,10,50
user_batch_size,10,50
accuracy,0.74852,0.7486
prediction_time,89.8638,67.9414
wall_time,95.6517,74.0794
images_per_sec_mean,559.222,739.384
images_per_sec_std,39.71,50.7144
latency_mean,17.9728,67.9414
latency_99th_percentile,19.9103,74.1017
