# Amazon Elastic Inference (EI) inference on Amazon EC2 CPU instance
This example demonstrates Amazon Elastic Inference with Amazon EI enabled TensorFlow

This example was tested on Amazon EC2 `c5.2xlarge` the following AWS Deep Learning AMI: 
`Deep Learning AMI (Ubuntu 18.04) Version 35.0`

Run this notebook using the following conda environment:
`amazonei_tensorflow2_p36`

Prepare your imagenet validation TFRecord files using the following helper script:
https://github.com/tensorflow/models/blob/archive/research/inception/inception/data/download_and_preprocess_imagenet.sh

Save it to `/home/ubuntu/datasets/` or update the dataset location in the `get_dataset()` function

In [1]:
# !pip install matplotlib pandas

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
from ei_for_tf.python.predictor.ei_predictor import EIPredictor
import numpy as np
import pandas as pd
import shutil
import requests
import time
import json
import os
import boto3
tf.__version__

'2.0.2'

In [3]:
# https://github.com/tensorflow/tensorflow/issues/29931
temp = tf.zeros([8, 224, 224, 3])
_ = tf.keras.applications.resnet50.preprocess_input(temp)

In [4]:
results = None
batch_size = 8

ei_client = boto3.client('elastic-inference')
print(json.dumps(ei_client.describe_accelerators()['acceleratorSet'], indent=1))

[
 {
  "acceleratorHealth": {
   "status": "Ok"
  },
  "acceleratorType": "eia2.large",
  "acceleratorId": "eia-63a6cf28f02841469c58055bff078a95",
  "availabilityZone": "us-west-2a",
  "attachedResource": "arn:aws:ec2:us-west-2:453691756499:instance/i-00487fc33ad7ef5eb"
 },
 {
  "acceleratorHealth": {
   "status": "Ok"
  },
  "acceleratorType": "eia2.xlarge",
  "acceleratorId": "eia-ef9561df7dd74b308ecefbd8b362ca69",
  "availabilityZone": "us-west-2a",
  "attachedResource": "arn:aws:ec2:us-west-2:453691756499:instance/i-00487fc33ad7ef5eb"
 }
]


In [5]:
def load_save_resnet50_model(saved_model_dir = 'resnet50_saved_model'):
    model = ResNet50(weights='imagenet')
    shutil.rmtree(saved_model_dir, ignore_errors=True)
    model.save(saved_model_dir, include_optimizer=False, save_format='tf')

In [6]:
saved_model_dir = 'resnet50_saved_model' 
# load_save_resnet50_model(saved_model_dir)

In [7]:
def deserialize_image_record(record):
    feature_map = {'image/encoded': tf.io.FixedLenFeature([], tf.string, ''),
                  'image/class/label': tf.io.FixedLenFeature([1], tf.int64, -1),
                  'image/class/text': tf.io.FixedLenFeature([], tf.string, '')}
    obj = tf.io.parse_single_example(serialized=record, features=feature_map)
    imgdata = obj['image/encoded']
    label = tf.cast(obj['image/class/label'], tf.int32)   
    label_text = tf.cast(obj['image/class/text'], tf.string)   
    return imgdata, label, label_text

def val_preprocessing(record):
    imgdata, label, label_text = deserialize_image_record(record)
    label -= 1
    image = tf.io.decode_jpeg(imgdata, channels=3, 
                              fancy_upscaling=False, 
                              dct_method='INTEGER_FAST')

    shape = tf.shape(image)
    height = tf.cast(shape[0], tf.float32)
    width = tf.cast(shape[1], tf.float32)
    side = tf.cast(tf.convert_to_tensor(256, dtype=tf.int32), tf.float32)

    scale = tf.cond(tf.greater(height, width),
                  lambda: side / width,
                  lambda: side / height)
    
    new_height = tf.cast(tf.math.rint(height * scale), tf.int32)
    new_width = tf.cast(tf.math.rint(width * scale), tf.int32)
    
    image = tf.image.resize(image, [new_height, new_width], method='bicubic')
    image = tf.image.resize_with_crop_or_pad(image, 224, 224)
    
    image = tf.keras.applications.resnet50.preprocess_input(image)
    
    return image, label, label_text

def get_dataset(batch_size, use_cache=False):
    data_dir = '/home/ubuntu/datasets/*'
    files = tf.io.gfile.glob(os.path.join(data_dir))
    dataset = tf.data.TFRecordDataset(files)
    
    dataset = dataset.map(map_func=val_preprocessing, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(batch_size=batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    dataset = dataset.repeat(count=1)
    
    if use_cache:
        shutil.rmtree('tfdatacache', ignore_errors=True)
        os.mkdir('tfdatacache')
        dataset = dataset.cache(f'./tfdatacache/imagenet_val')
    
    return dataset

In [8]:
print('\n=======================================================')
print(f'Benchmark results for CPU Keras, batch size: {batch_size}')
print('=======================================================\n')

model = tf.keras.models.load_model(saved_model_dir)
display_every = 5000
display_threshold = display_every

pred_labels = []
actual_labels = []
iter_times = []

# Get the tf.data.TFRecordDataset object for the ImageNet2012 validation dataset
dataset = get_dataset(batch_size)  

walltime_start = time.time()
for i, (validation_ds, batch_labels, _) in enumerate(dataset):
    start_time = time.time()
    pred_prob_keras = model(validation_ds)
    iter_times.append(time.time() - start_time)
    
    actual_labels.extend(label for label_list in batch_labels.numpy() for label in label_list)
    pred_labels.extend(list(np.argmax(pred_prob_keras, axis=1)))
    
    if i*batch_size >= display_threshold:
        print(f'Images {i*batch_size}/50000. Average i/s {np.mean(batch_size/np.array(iter_times[-display_every:]))}')
        display_threshold+=display_every

iter_times = np.array(iter_times)
acc_keras_gpu = np.sum(np.array(actual_labels) == np.array(pred_labels))/len(actual_labels)

results = pd.DataFrame(columns = [f'keras_cpu_{batch_size}'])
results.loc['instance_type']           = [requests.get('http://169.254.169.254/latest/meta-data/instance-type').text]
results.loc['accelerator']             = ['NA']
results.loc['user_batch_size']         = [batch_size]
results.loc['accuracy']                = [acc_keras_gpu]
results.loc['prediction_time']         = [np.sum(iter_times)]
results.loc['wall_time']               = [time.time() - walltime_start]
results.loc['images_per_sec_mean']     = [np.mean(batch_size / iter_times)]
results.loc['images_per_sec_std']      = [np.std(batch_size / iter_times, ddof=1)]
results.loc['latency_mean']            = [np.mean(iter_times) * 1000]
results.loc['latency_99th_percentile'] = [np.percentile(iter_times, q=99, interpolation="lower") * 1000]
results.loc['latency_median']          = [np.median(iter_times) * 1000]
results.loc['latency_min']             = [np.min(iter_times) * 1000]
display(results.T)


Benchmark results for CPU Keras, batch size: 8

Images 5000/50000. Average i/s 26.555694032421247
Images 10000/50000. Average i/s 26.676666543597392
Images 15000/50000. Average i/s 26.77406612095138
Images 20000/50000. Average i/s 26.822275491462182
Images 25000/50000. Average i/s 26.847471484622154
Images 30000/50000. Average i/s 26.859330729648033
Images 35000/50000. Average i/s 26.865594015573578
Images 40000/50000. Average i/s 26.873174015987328
Images 45000/50000. Average i/s 26.91567530151017


Unnamed: 0,instance_type,accelerator,user_batch_size,accuracy,prediction_time,wall_time,images_per_sec_mean,images_per_sec_std,latency_mean,latency_99th_percentile,latency_median,latency_min
keras_cpu_8,c5.2xlarge,,8,0.74956,1860.75,1864.85,26.8839,0.502054,297.721,330.937,296.569,286.248


In [9]:
def ei_predict_benchmark(saved_model_dir, batch_size, accelerator_id):
    
    ei_size = ei_client.describe_accelerators()['acceleratorSet'][accelerator_id]['acceleratorType']

    print('\n=======================================================')
    print(f'Benchmark results for EI: {ei_size}, batch size: {batch_size}')
    print('=======================================================\n')
    
    eia_model = EIPredictor(saved_model_dir, 
                                accelerator_id=1)

    display_every = 5000
    display_threshold = display_every

    pred_labels = []
    actual_labels = []
    iter_times = []

    # Get the tf.data.TFRecordDataset object for the ImageNet2012 validation dataset
    dataset = get_dataset(batch_size)  

    walltime_start = time.time()
    ipname = list(eia_model.feed_tensors.keys())[0]
    resname = list(eia_model.fetch_tensors.keys())[0]

    for i, (validation_ds, batch_labels, _) in enumerate(dataset):

        model_feed_dict={'input_1': validation_ds.numpy()}
        start_time = time.time()
        pred_prob = eia_model(model_feed_dict)
        iter_times.append(time.time() - start_time)

        actual_labels.extend(label for label_list in batch_labels.numpy() for label in label_list)
        pred_labels.extend(list(np.argmax(pred_prob['probs'], axis=1)))

        if i*batch_size >= display_threshold:
            print(f'Images {i*batch_size}/50000. Average i/s {np.mean(batch_size/np.array(iter_times[-display_every:]))}')
            display_threshold+=display_every

    iter_times = np.array(iter_times)
    acc_keras_gpu = np.sum(np.array(actual_labels) == np.array(pred_labels))/len(actual_labels)
    
    results = pd.DataFrame(columns = [f'EI_{batch_size}_{ei_size}'])
    results.loc['instance_type']           = [requests.get('http://169.254.169.254/latest/meta-data/instance-type').text]
    results.loc['accelerator']             = [ei_size]
    results.loc['user_batch_size']         = [batch_size]
    results.loc['accuracy']                = [acc_keras_gpu]
    results.loc['prediction_time']         = [np.sum(iter_times)]
    results.loc['wall_time']               = [time.time() - walltime_start]
    results.loc['images_per_sec_mean']     = [np.mean(batch_size / iter_times)]
    results.loc['images_per_sec_std']      = [np.std(batch_size / iter_times, ddof=1)]
    results.loc['latency_mean']            = [np.mean(iter_times) * 1000]
    results.loc['latency_99th_percentile'] = [np.percentile(iter_times, q=99, interpolation="lower") * 1000]
    results.loc['latency_median']          = [np.median(iter_times) * 1000]
    results.loc['latency_min']             = [np.min(iter_times) * 1000]
    display(results.T)
    
    return results, iter_times

In [10]:
ei_options = [{'ei_acc_id': 0}]

iter_ds = pd.DataFrame()
if results is None:
    results = pd.DataFrame()

col_name = lambda ei_acc_id: f'ei_{ei_client.describe_accelerators()["acceleratorSet"][ei_acc_id]["acceleratorType"]}_batch_size_{batch_size}'

    
for opt in ei_options:
    ei_acc_id = opt["ei_acc_id"]
    res, iter_times = ei_predict_benchmark(saved_model_dir, batch_size, ei_acc_id)
    
    iter_ds = pd.concat([iter_ds, pd.DataFrame(iter_times, columns=[col_name(ei_acc_id)])], axis=1)
    results = pd.concat([results, res], axis=1)
    
display(results)


Benchmark results for EI: eia2.large, batch size: 8

Using DEFAULT_SERVING_SIGNATURE_DEF_KEY .....
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.loader.load or tf.compat.v1.saved_model.load. There will be a new function for importing SavedModels in Tensorflow 2.0.
INFO:tensorflow:Restoring parameters from resnet50_saved_model/variables/variables
Images 5000/50000. Average i/s 160.79150457173685
Images 10000/50000. Average i/s 160.57224536199263
Images 15000/50000. Average i/s 160.17887928377442
Images 20000/50000. Average i/s 159.55135762825725
Images 25000/50000. Average i/s 159.05273547195634
Images 30000/50000. Average i/s 158.55027160196224
Images 35000/50000. Average i/s 158.2158252593362
Images 40000/50000. Average i/s 157.88468338480075
Images 45000/50000. Average i/s 157.15614275808505


Unnamed: 0,instance_type,accelerator,user_batch_size,accuracy,prediction_time,wall_time,images_per_sec_mean,images_per_sec_std,latency_mean,latency_99th_percentile,latency_median,latency_min
EI_8_eia2.large,c5.2xlarge,eia2.large,8,0.74956,321.635,331.082,157.271,5.04556,51.4616,55.3734,50.7524,47.5709


Unnamed: 0,keras_cpu_8,EI_8_eia2.large
instance_type,c5.2xlarge,c5.2xlarge
accelerator,,eia2.large
user_batch_size,8,8
accuracy,0.74956,0.74956
prediction_time,1860.75,321.635
wall_time,1864.85,331.082
images_per_sec_mean,26.8839,157.271
images_per_sec_std,0.502054,5.04556
latency_mean,297.721,51.4616
latency_99th_percentile,330.937,55.3734
