# Optimized input payload for CV & SageMaker Endpoints with TF

In this example you'll see how to create a custom handler for a SageMaker endpoint that serves a TF2.x computer vision (CV) model and processes images. The images are sent JPEG-compressed to keep the payload as small as possible.

This solution minimizes the overhead of traditional payload formats such as JSON, CSV, and protobuf. The [custom TF serving handler for SageMaker](https://sagemaker.readthedocs.io/en/stable/frameworks/tensorflow/deploying_tensorflow_serving.html) also uses gRPC to communicate with the TensorFlow Serving server, which reduces the data transfer overhead even further. Keep in mind, however, that even gRPC has a linear overhead when translating the predictions from proto/tensor format to NumPy. If you need to improve on this, you have to invoke the model directly in TensorFlow and remove TensorFlow Serving.

If you're looking for a more elegant solution than just concatenating .jpeg files to create a stream, try [RecordIO](https://mesos.apache.org/documentation/latest/recordio/) or [TFRecord](https://www.tensorflow.org/tutorials/load_data/tfrecord). You can change the way you're encoding your payload and the way you're reading it inside the container before sending to the model.

## Initialize SageMaker Session

In [None]:
import sagemaker
# Session object reused by the upload and model-creation cells below
sagemaker_session = sagemaker.Session()
# Execution role that is later passed to the model definition
role = sagemaker.get_execution_role()
# Default bucket of this session; the model archive is uploaded into it
bucket = sagemaker_session.default_bucket()
print(f'Default bucket: {bucket}')

## Download some images

In [None]:
import urllib.request

# Sample photos; each one is stored locally as image_<position>.jpg
img_urls = [
    'https://images.unsplash.com/photo-1501386761578-eac5c94b800a',
    'https://images.unsplash.com/photo-1543465077-db45d34b88a5',
    'https://images.unsplash.com/photo-1552249007-6759fe2742b6',
]
for idx, img_url in enumerate(img_urls):
    local_name = f"image_{idx}.jpg"
    urllib.request.urlretrieve(img_url, local_name)

## Get a pre-trained model and upload it to S3

In [None]:
# `import urllib` alone does not guarantee that the `request` submodule is
# loaded, so import the submodule explicitly (this still binds `urllib`).
import urllib.request

# Download the pre-trained model archive and push its bytes straight to the
# default bucket, without writing a temporary file to local disk.
with urllib.request.urlopen('https://spock.cloud/models/yolov3-keras.tar.gz') as m:
    s3_uri = sagemaker_session.upload_string_as_file_body(m.read(), bucket, 'models/yolov3-keras/model.tar.gz')
    print(s3_uri)

In [None]:
import os
# Directory that receives requirements.txt and inference.py in the next cells
os.makedirs('code', exist_ok=True)

In [None]:
%%writefile code/requirements.txt
opencv-python
tensorflow==2.4.0

In [None]:
%%writefile code/inference.py
import subprocess
# Runs when the module is imported, before cv2 is imported further down.
# NOTE(review): the chmod on /tmp presumably lets apt write its temp files - confirm.
subprocess.call(["chmod", "777", "/tmp"])  # exit status is deliberately not checked
# libgl1 provides libGL, which opencv requires
for apt_args in (["update", "-y"], ["install", "-y", "libgl1"]):
    subprocess.check_call(["apt", *apt_args])

import io
import cv2
import grpc
import json
import time
import pickle
import numpy as np
import tensorflow as tf
from tensorflow_serving.apis import predict_pb2
from tensorflow_serving.apis import prediction_service_pb2_grpc

# Sizing assumptions used to derive the gRPC message limits below
max_batch_size = 10
max_image_side = 512 + 5  # grpc overhead
yolov3_output_size_per_image = 6 * 1024 * 1024  # ~5.3MB of output per image

# Request cap in bytes: batch x 3 values per pixel x side^2 pixels x 4 bytes per value
MAX_GRPC_SEND_MESSAGE_LENGTH = 4 * 3 * max_batch_size * max_image_side ** 2
# Response cap in bytes: one raw network output per image in the batch
MAX_GRPC_RECEIVE_MESSAGE_LENGTH = yolov3_output_size_per_image * max_batch_size

client = None  # gRPC stub; stays None until init_client has run once


def init_client(grpc_port):
    """Create the module-wide gRPC stub and the reusable request object once.

    Every call after the first returns immediately, so a single stub and a
    single PredictRequest are shared by all invocations in this process.

    Args:
        grpc_port: port used to build the channel target `0.0.0.0:<port>`.
    """
    global client, request
    if client is not None:
        return

    target = f'0.0.0.0:{grpc_port}'
    # raise the gRPC message caps so a full batch and its raw output fit
    size_limits = [
        ('grpc.max_send_message_length', MAX_GRPC_SEND_MESSAGE_LENGTH),
        ('grpc.max_receive_message_length', MAX_GRPC_RECEIVE_MESSAGE_LENGTH),
    ]
    client = prediction_service_pb2_grpc.PredictionServiceStub(
        grpc.insecure_channel(target, options=size_limits)
    )

    request = predict_pb2.PredictRequest()
    # this is the same name of the root dir in your model.tar.gz
    request.model_spec.name = 'yolov3'
    request.model_spec.signature_name = 'serving_default'
    print('Client initialized')

def predict(payload):
    """Run one batch through the model and return all raw outputs as one 1-D array.

    Relies on the module-level `client` and `request` that init_client creates.

    Args:
        payload: the batch to place in the model input named 'input_1'.

    Returns:
        The three output tensors, each flattened, concatenated in the order
        'conv2d_58', 'conv2d_66', 'conv2d_74'.
    """
    global client, request

    # overwrite the input of the shared request with the new batch
    model_input = request.inputs['input_1']
    model_input.Clear()
    model_input.CopyFrom(tf.make_tensor_proto(payload))

    # invoke the model; 25 is passed as the second positional argument of the call
    prediction = client.Predict(request, 25)

    # ATTENTION: you need to post-process the network output first!
    # reference: https://github.com/qqwweee/keras-yolo3/blob/master/yolo.py

    # this is just a quick-n-dirty way to return all raw values in NPY format
    flat_outputs = [
        tf.make_ndarray(prediction.outputs[name]).flatten()
        for name in ('conv2d_58', 'conv2d_66', 'conv2d_74')
    ]
    result = np.concatenate(flat_outputs)
    return result

def _read_exact(stream, size):
    """Read up to `size` bytes from `stream`, returning fewer only when EOF comes first."""
    chunks = []
    remaining = size
    while remaining > 0:
        chunk = stream.read(remaining)
        if not chunk:
            break
        chunks.append(chunk)
        remaining -= len(chunk)
    return b''.join(chunks)


def _decode_image(raw):
    """Decode one encoded image into a float32 RGB array with a leading batch axis."""
    img = cv2.imdecode(np.frombuffer(raw, dtype=np.uint8), cv2.IMREAD_COLOR)
    if img is None:
        # imdecode signals failure by returning None instead of raising
        raise ValueError('Invalid input data: image could not be decoded')
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # convert from BGR to RGB
    img = img.astype(np.float32) / 255. # normalize
    return np.expand_dims(img, axis=0) # put in NHWC format


# customizing the request handler
# More info: https://sagemaker.readthedocs.io/en/stable/frameworks/tensorflow/deploying_tensorflow_serving.html
def handler(data, context):
    """Decode a stream of length-prefixed images, run the model and return NPY bytes.

    The request body is a sequence of records, each made of a 4-byte big-endian
    length followed by that many bytes of encoded image. The stream ends at EOF
    or at a record whose length is zero.

    Args:
        data: file-like object holding the request body; only read() is used.
        context: request context; request_content_type, accept_header and
            grpc_port are read from it.

    Returns:
        A tuple (response_bytes, content_type). response_bytes is the flat
        prediction array in NPY format; content_type echoes the accept header.

    Raises:
        ValueError: for an unsupported content/accept type, a truncated
            stream, an image that cannot be decoded, or an empty batch.
    """
    if context.request_content_type != 'application/octet-stream':
        raise ValueError(f'Unsupported content type: {context.request_content_type or "unknown"}')
    if context.accept_header != 'application/x-npy':
        raise ValueError(f'Unsupported accept type: {context.accept_header or "unknown"}')

    # iterate through all the images and create a batch
    payload = []
    while True:
        header = _read_exact(data, 4)
        if not header:
            break # clean end of stream
        if len(header) != 4:
            raise ValueError('Truncated input data: incomplete image length header')
        data_len = int.from_bytes(header, 'big')
        if data_len == 0:
            break # an explicit zero length also terminates the stream
        raw = _read_exact(data, data_len)
        if len(raw) != data_len:
            raise ValueError(f'Truncated input data: expected {data_len} image bytes, got {len(raw)}')
        payload.append(_decode_image(raw))
    if not payload:
        raise ValueError('Empty input data!')

    # the gRPC client is only created once the input is known to be usable
    init_client(context.grpc_port)

    # invoke the model
    result = predict(np.vstack(payload))

    # Serialize as real NPY (not a pickle), so the body matches the
    # 'application/x-npy' content type and clients can load it with
    # allow_pickle=False.
    buffer = io.BytesIO()
    np.save(buffer, result, allow_pickle=False)

    return buffer.getvalue(), context.accept_header

In [None]:
from sagemaker.tensorflow import TensorFlowModel
# Create a SageMaker model from the serialized model in S3.
# The custom handler and its requirements.txt are taken from the local 'code' directory.
model = TensorFlowModel(
    entry_point='code/inference.py', # https://sagemaker.readthedocs.io/en/stable/frameworks/tensorflow/using_tf.html#id33
    source_dir='code',
    framework_version="2.4",
    model_data=s3_uri,
    role=role,
    sagemaker_session=sagemaker_session, # comment to make local mode work
)

In [None]:
# Deploy the model to an endpoint backed by a single ml.g4dn.xlarge instance
predictor = model.deploy(
    instance_type='ml.g4dn.xlarge',
    initial_instance_count=1,
)

## Build a big binary stream with all your .jpeg files
This is one of the most optimized ways to send multiple images to a remote webservice.
It is not the most elegant way, though. There are dedicated formats for this kind of operation, such as RecordIO or TFRecord, but both introduce some overhead. If that overhead is not a problem for you, I recommend using one of these open formats to serialize your data. However, if you're looking for something that minimizes the overhead at any cost, this approach can be a solution. Just remember that you need to control the byte stream on both sides (client and server).

What we're going to do now is:
- read all the images
- make them square by padding with 0's
- resize them to the shape expected by the neural network
- compress each image back to .jpg
- serialize all the images into one stream, prefixing each image with its size in bytes.

To read this stream, start by reading 4 bytes (a big-endian unsigned 32-bit integer) to get the number of bytes of the first image, then read that many bytes from the stream to rebuild the first .jpg file. Repeat this process for every image in the stream until you reach EOF.

In [10]:
%%time
import io
import cv2
import numpy as np

batch_size = 10
img_size = 512
num_source_images = 3  # image_0.jpg .. image_2.jpg are cycled to fill the batch
encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 95]
buffer = io.BytesIO()
for i in range(batch_size):
    img_path = f'image_{i % num_source_images}.jpg'
    img = cv2.imread(img_path)
    if img is None:
        # imread returns None instead of raising when the file is missing or unreadable
        raise FileNotFoundError(f'Could not read image: {img_path}')
    h, w, c = img.shape
    if h != w: # make it square but keep aspect ratio
        sqr_size = max(h, w)
        sqr_img = np.zeros((sqr_size, sqr_size, c), dtype=np.uint8)
        sqr_img[:h, :w] = img  # the image sits in the top-left corner, the rest stays 0
        img = sqr_img
    img = cv2.resize(img, (img_size, img_size))
    # compress again to jpeg (minimize the payload)
    encoded_ok, img = cv2.imencode('.jpeg', img, encode_param)
    if not encoded_ok:
        raise RuntimeError(f'Could not encode image as JPEG: {img_path}')
    # record layout: 4-byte big-endian length, then the JPEG bytes
    buffer.write(len(img).to_bytes(4, 'big'))
    buffer.write(img.tobytes())
print(f'Buffer length: {buffer.getbuffer().nbytes} bytes')

Buffer length: 1031488 bytes
CPU times: user 3.78 s, sys: 945 ms, total: 4.73 s
Wall time: 4.68 s


In [14]:
from sagemaker.serializers import IdentitySerializer
from sagemaker.deserializers import NumpyDeserializer

# Send the buffer as-is and let the deserializer turn the response into a NumPy array.
# NOTE(review): relies on these classes' default content/accept types matching
# what the handler checks ('application/octet-stream' / 'application/x-npy').
predictor.serializer, predictor.deserializer = IdentitySerializer(), NumpyDeserializer()
buffer.seek(0)  # rewind so the following read() returns the whole stream
%time preds = predictor.predict(buffer.read())

CPU times: user 257 ms, sys: 168 ms, total: 425 ms
Wall time: 2.51 s


### Rebuild the output just to validate the process
The recommendation is to add a post-processing step inside your code (in the container)

In [15]:
# N=1 -> 0.03s (output=5.3MB)
# N=5 -> 0.151s (output=26.14MB)
# N=10 -> 0.335s (output=52.29MB)
# (N, 16, 16, 255) -> N*255KB, (N, 32, 32, 255) --> N*1020KB, (N, 64, 64, 255) -> N*4080KB
# The flat prediction vector holds three blocks back to back; slice each one
# off in order and restore its (N, side, side, 255) shape.
output = []
offset = 0
for grid_side in (16, 32, 64):
    pivot = offset + batch_size * grid_side * grid_side * 255
    block = preds[offset:pivot]
    output.append(block.reshape((batch_size, grid_side, grid_side, 255)))
    offset = pivot
[arr.shape for arr in output]

[(10, 16, 16, 255), (10, 32, 32, 255), (10, 64, 64, 255)]