## Setup

In [9]:
import numpy as np
import time
import json
import requests
import boto3
import os
import sagemaker

In [113]:
from sagemaker import get_execution_role, Session, image_uris

# One SageMaker session supplies the region and default bucket used below.
sess = Session()
role = get_execution_role()  # execution role handed to the model and the recommender job
bucket = sess.default_bucket()
region = sess.boto_region_name
# Low-level SageMaker client used for the Model Registry / Inference Recommender calls.
sm_client = boto3.client("sagemaker", region_name=region)

## Load model from Torch Hub

In [1]:
import torch

In [59]:
# Show the local PyTorch version; the same value is used further down to derive
# the framework version registered with the model package.
print(torch.__version__)

1.7.1


In [4]:
# NOTE(review): 'yolov5.pt' is not created anywhere in the visible notebook, and
# `model` is rebound by the torch.hub.load cell that follows — this looks like a
# leftover experiment; confirm whether the cell is still needed.
model = torch.jit.load('yolov5.pt')

In [41]:
# Fetch the pretrained model from Torch Hub; the hub entry point is the
# lower-cased model name.
model_name = "YoloV5s"
hub_entrypoint = model_name.lower()  # or yolov5m, yolov5l, yolov5x, custom
model = torch.hub.load('ultralytics/yolov5', hub_entrypoint)

Using cache found in /home/ec2-user/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2022-4-8 torch 1.7.1 CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 


Python 3.7.0 required by YOLOv5, but Python 3.6.13 is currently installed


In [42]:
# Dummy input of shape [1, 3, 640, 640]; its values are irrelevant, it only
# drives the tracer through the forward pass.
image = torch.zeros([1, 3, 640, 640], dtype=torch.float32)
# Trace in inference mode. The exported graph is served for prediction only, and
# tracing records whatever branch the module takes at trace time, so tracing in
# train mode would freeze training-mode behaviour into the saved artifact.
model.eval()
model_trace = torch.jit.trace(model, image)
# Save the traced model under the file name that is packaged into the model archive
model_trace.save('model.pth')

### Create inference script

In [15]:
!mkdir code/

In [97]:
%%writefile code/inference.py
import torch
import os
import logging
import io
import json
from PIL import Image
import numpy as np
import torchvision
from torchvision import transforms

# Module-level logger for the inference handlers, set to emit DEBUG and above.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Side length, in pixels, that input_fn resizes incoming images to.
IMAGE_SIZE = 640

# No custom predict_fn is defined in this script; the override below is left disabled.
# def predict_fn(image, model):
#     return model(image)

def input_fn(request_body, content_type):
    """Decode an encoded image request body into a single-image tensor batch.

    Args:
        request_body: Bytes of an encoded image in any format PIL can open.
        content_type: MIME type of the request. Accepted for the handler
            signature but not inspected.

    Returns:
        A tensor with a leading batch dimension of 1 and three colour channels,
        resized to IMAGE_SIZE x IMAGE_SIZE and moved to CUDA when available,
        otherwise left on the CPU.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    iobytes = io.BytesIO(request_body)
    # Normalize to three channels: grayscale, palette or RGBA images would
    # otherwise produce 1- or 4-channel tensors that do not match the
    # [1, 3, 640, 640] input the model was traced with.
    img = Image.open(iobytes).convert('RGB')
    preprocess = transforms.Compose([
        transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
        transforms.ToTensor()
        ])
    input_tensor = preprocess(img)
    input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model

    return input_batch.to(device)

# postprocess
def output_fn(predictions, content_type):
    """Serialize the prediction tensor to JSON text and return its first 100 characters.

    `content_type` is accepted for the handler signature but not used.

    NOTE(review): cutting the string at 100 characters generally leaves
    incomplete JSON; this looks like a debugging / response-size shortcut —
    confirm before consuming the response body.
    """
    as_nested_lists = predictions.cpu().numpy().tolist()
    serialized = json.dumps(as_nested_lists)
    return serialized[:100]

Overwriting code/inference.py


### Create model archive

In [98]:
# File name of the tarball that bundles the traced model with the inference code.
model_archive_name = 'model.tar.gz'

In [99]:
!tar -cvzf {model_archive_name} model.pth code/

model.pth
code/
code/.ipynb_checkpoints/
code/.ipynb_checkpoints/inference-checkpoint.py
code/inference.py


In [100]:
# Push the model package tarball (model artifact + inference code) to the
# session's S3 bucket under the 'model' prefix and report where it landed.
model_url = sess.upload_data(path=model_archive_name, key_prefix='model')
print(f'model uploaded to: {model_url}')

model uploaded to: s3://sagemaker-us-east-2-156991241640/model/model.tar.gz


### Create model and test inference

In [101]:
from sagemaker.pytorch.model import PyTorchModel
from sagemaker.predictor import Predictor

# Framework / Python versions passed to the SageMaker PyTorch model.
framework_version = '1.7.1'
py_version = 'py36'

# Wrap the uploaded archive and the inference script as a deployable model.
sm_model = PyTorchModel(
    model_data=model_url,
    role=role,
    entry_point='code/inference.py',
    framework_version=framework_version,
    py_version=py_version,
    sagemaker_session=sess,
)

In [103]:
# Deploy the model to a single-instance real-time endpoint; the returned
# predictor's endpoint name is used by the timing cell below.
instance_type = 'ml.g4dn.xlarge'
uncompiled_predictor = sm_model.deploy(initial_instance_count=1, instance_type=instance_type)

Creating model with name: pytorch-inference-2022-04-08-21-26-41-032
Creating endpoint-config with name pytorch-inference-2022-04-08-21-26-41-333
Creating endpoint with name pytorch-inference-2022-04-08-21-26-41-333


--------!

In [104]:
import boto3
import requests
# Runtime client used to invoke the deployed endpoint.
client = boto3.client('sagemaker-runtime', region_name=region)
# Content type sent with the request, and a sample image downloaded once so the
# timing cell measures only the endpoint call.
content_type = 'application/x-image'
sample_img_url = "https://github.com/ultralytics/yolov5/raw/master/data/images/zidane.jpg"
body = requests.get(sample_img_url).content

In [107]:
%%timeit
# Timed statement: one round trip to the endpoint with the sample image bytes.
rv = client.invoke_endpoint(EndpointName=uncompiled_predictor.endpoint_name, Body=body, ContentType=content_type)

1.52 s ± 9.97 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Inference Recommender

In [108]:
# ML framework details
framework = 'pytorch'
# Note that only the framework major and minor version is supported for Neo compilation
framework_version = '.'.join(torch.__version__.split('.')[:-1])
# model name as standardized by model zoos or a similar open source model
model_name = "yolov5"
# ML model details
ml_domain = "COMPUTER_VISION"
ml_task = "OBJECT_DETECTION"

print("PyT Version", framework_version)

PyT Version 1.7


In [161]:
# Look up the inference container image for the framework/version chosen above.
# This rebinds `instance_type`, which the deployment cell earlier set to a different value.
instance_type = "ml.c5.xlarge"  # Note: you can use any CPU-based instance type here, this is just to get a CPU tagged image
dlc_uri = image_uris.retrieve(
    framework,
    region,
    version=framework_version,
    py_version="py3",
    instance_type=instance_type,
    image_scope="inference",
)
dlc_uri

'763104351884.dkr.ecr.us-east-2.amazonaws.com/pytorch-inference:1.7-cpu-py3'

### Create sample payload archive

In [162]:
# File name of the tarball holding the sample request payloads.
payload_archive_name = "payload.tar.gz"

In [163]:
import os

## optional: download sample images
SAMPLES_BUCKET = "sagemaker-sample-files"
PREFIX = "datasets/image/pets/"
payload_location = "./sample-payload/"

# Make sure the local target directory is there, reporting which case applied.
if os.path.exists(payload_location):
    print("Directory ", payload_location, " already exists")
else:
    os.makedirs(payload_location)
    print("Directory ", payload_location, " Created ")

# Copy every object under PREFIX in the samples bucket into the local directory.
sess.download_data(payload_location, SAMPLES_BUCKET, PREFIX)

Directory  ./sample-payload/  already exists


### Tar the payload

In [118]:
!cd ./sample-payload/ && tar czvf ../payload.tar.gz *

boxer_dog.jpg
british_blue_shorthair_cat.jpg
english_cocker_spaniel_dog.jpg
shiba_inu_dog.jpg


### Upload to S3

Next, we'll upload the packaged payload examples (payload.tar.gz) that were created above to S3. The S3 location will be used as input to our Inference Recommender job later in this notebook.

In [120]:
# Upload the payload archive under the 'payload' prefix; the returned S3 URL is
# later passed as SamplePayloadUrl when registering the model package.
sample_payload_url = sess.upload_data(path=payload_archive_name, key_prefix="payload")

### Register model in Model Registry

In order to use Inference Recommender, you must have a versioned model in SageMaker Model Registry.  To register a model in the Model Registry, you must have a model artifact packaged in a tarball and an inference container image.  Registering a model includes the following steps:


1) **Create Model Group:** This is a one-time task per machine learning use case. A Model Group contains one or more versions of your packaged model. 

2) **Register Model Version/Package:** This task is performed for each new packaged model version. 

### Create Model Group

In [164]:
# The current epoch second is appended so each run gets its own group name.
model_package_group_name = f"{framework}-models-{round(time.time())}"
model_package_group_description = f"{ml_task.lower()} models"

model_package_group_input_dict = dict(
    ModelPackageGroupName=model_package_group_name,
    ModelPackageGroupDescription=model_package_group_description,
)

create_model_package_group_response = sm_client.create_model_package_group(
    **model_package_group_input_dict
)
group_arn = create_model_package_group_response["ModelPackageGroupArn"]
print(f"ModelPackageGroup Arn : {group_arn}")

ModelPackageGroup Arn : arn:aws:sagemaker:us-east-2:156991241640:model-package-group/pytorch-models-1649457428


### Register Model Version/Package

In this step, you'll register your pretrained model that was packaged in the prior steps as a new version in SageMaker Model Registry.  First, you'll configure the model package/version identifying which model package group this new model should be registered within as well as identify the initial approval status. You'll also identify the domain and task for your model.  These values were set earlier in the notebook 
where `ml_domain = 'COMPUTER_VISION'` and `ml_task = 'OBJECT_DETECTION'`

*Note: ModelApprovalStatus is a configuration parameter that can be used in conjunction with SageMaker Projects to trigger automated deployment pipeline.*  

In [165]:
model_package_description = f"{framework} {model_name} inference recommender"

# New versions start out awaiting manual approval.
model_approval_status = "PendingManualApproval"

# Base request for create_model_package; the inference specification is merged
# into this dict in a later cell.
create_model_package_input_dict = dict(
    ModelPackageGroupName=model_package_group_name,
    Domain=ml_domain.upper(),
    Task=ml_task.upper(),
    SamplePayloadUrl=sample_payload_url,
    ModelPackageDescription=model_package_description,
    ModelApprovalStatus=model_approval_status,
)

### Set up inference specification

You'll now set up the inference specification configuration for your model version. This contains information on how the model should be hosted.

Inference Recommender expects a single input MIME type for sending requests. Learn more about [common inference data formats on SageMaker](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-inference.html). This MIME type will be sent in the Content-Type header when invoking your endpoint.

In [166]:
# Content types registered with the model package (SupportedContentTypes).
input_mime_types = ["application/octet-stream"]

If you specify a set of instance types below (i.e. a non-empty list), Inference Recommender will only provide recommendations for the instance types in that set. For this example, we provide a list of common instance types used for object detection models.

In [167]:
# Instance types registered as SupportedRealtimeInferenceInstanceTypes for the model package.
supported_realtime_inference_types = [
    "ml.c5.xlarge",
    "ml.inf1.xlarge",
    "ml.g4dn.xlarge"
]

### Optional: Model optimization

[Amazon SageMaker Neo](https://aws.amazon.com/sagemaker/neo) is a capability of SageMaker that automatically optimizes your ML models for any target instance type. With Neo, you don’t need to set up third-party or framework-specific compiler software, or tune the model manually for optimizing inference performance. 

Inference Recommender compiles your model using SageMaker Neo if the `ModelInput` field is provided. To prepare the inputs for model compilation, specify the input shape for your trained model.

In [168]:
# Input shape passed as ModelInput.DataInputConfig; it matches the
# [1, 3, 640, 640] dummy tensor the model was traced with.
data_input_configuration = "[[1,3,640,640]]"

In [169]:
# Single container definition: serving image, framework metadata, compilation
# input shape and the S3 location of the model data.
container_spec = {
    "Image": dlc_uri,
    "Framework": framework.upper(),
    "FrameworkVersion": framework_version,
    "NearestModelName": model_name,
    "ModelInput": {"DataInputConfig": data_input_configuration},
    "ModelDataUrl": model_url,
}

modelpackage_inference_specification = {
    "InferenceSpecification": {
        "Containers": [container_spec],
        "SupportedContentTypes": input_mime_types,  # required, must be non-null
        "SupportedResponseMIMETypes": [],
        "SupportedRealtimeInferenceInstanceTypes": supported_realtime_inference_types,  # optional
    }
}

In [170]:
# Merge the InferenceSpecification entry into the create_model_package request.
create_model_package_input_dict.update(modelpackage_inference_specification)

In [171]:
# Register the model version and keep its ARN for the recommender job.
create_mode_package_response = sm_client.create_model_package(**create_model_package_input_dict)
model_package_arn = create_mode_package_response["ModelPackageArn"]
print(f"ModelPackage Version ARN : {model_package_arn}")

ModelPackage Version ARN : arn:aws:sagemaker:us-east-2:156991241640:model-package/pytorch-models-1649457428/1


In [172]:
# Display the registered model package to check what was stored.
sm_client.describe_model_package(ModelPackageName=model_package_arn)

{'ModelPackageGroupName': 'pytorch-models-1649457428',
 'ModelPackageVersion': 1,
 'ModelPackageArn': 'arn:aws:sagemaker:us-east-2:156991241640:model-package/pytorch-models-1649457428/1',
 'ModelPackageDescription': 'pytorch yolov5 inference recommender',
 'CreationTime': datetime.datetime(2022, 4, 8, 22, 37, 18, 405000, tzinfo=tzlocal()),
 'InferenceSpecification': {'Containers': [{'Image': '763104351884.dkr.ecr.us-east-2.amazonaws.com/pytorch-inference:1.7-cpu-py3',
    'ImageDigest': 'sha256:1b6b7276ef97a34269479d73c180775b1fedd31bedaa083d406d7cce9ae633c4',
    'ModelDataUrl': 's3://sagemaker-us-east-2-156991241640/model/model.tar.gz',
    'ModelInput': {'DataInputConfig': '[[1,3,640,640]]'},
    'Framework': 'PYTORCH',
    'FrameworkVersion': '1.7',
    'NearestModelName': 'yolov5'}],
  'SupportedRealtimeInferenceInstanceTypes': ['ml.c5.xlarge',
   'ml.inf1.xlarge',
   'ml.g4dn.xlarge'],
  'SupportedContentTypes': ['application/octet-stream'],
  'SupportedResponseMIMETypes': []},
 

### Create a SageMaker Inference Recommender Default Job

Now with your model in Model Registry, you can kick off a 'Default' job to get instance recommendations. This only requires your `ModelPackageVersionArn` and comes back with recommendations within an hour. 

The output is a list of instance type recommendations with associated environment variables, cost, throughput and latency metrics.

In [173]:
import boto3
import uuid
from sagemaker import get_execution_role

# NOTE: `client` and `role` are rebound here; `client` previously referred to
# the sagemaker-runtime client used for endpoint invocation.
client = boto3.client("sagemaker", region)
role = get_execution_role()

# A fresh UUID serves as the job name.
default_job = uuid.uuid1()
job_request = dict(
    JobName=str(default_job),
    JobDescription="Job Description",
    JobType="Default",
    RoleArn=role,
    InputConfig={"ModelPackageVersionArn": model_package_arn},
)
default_response = client.create_inference_recommendations_job(**job_request)

print(default_response)

{'JobArn': 'arn:aws:sagemaker:us-east-2:156991241640:inference-recommendations-job/73515cfc-b78c-11ec-9bfd-06a8b5eb1f1a', 'ResponseMetadata': {'RequestId': '1fe573e2-4bc8-4fab-9e6a-c08b363c347b', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '1fe573e2-4bc8-4fab-9e6a-c08b363c347b', 'content-type': 'application/x-amz-json-1.1', 'content-length': '120', 'date': 'Fri, 08 Apr 2022 22:37:19 GMT'}, 'RetryAttempts': 0}}


## Instance Recommendation Results

Each inference recommendation includes `InstanceType`, `InitialInstanceCount`, `EnvironmentParameters` which are tuned environment variable parameters for better performance. We also include performance and cost metrics such as `MaxInvocations`, `ModelLatency`, `CostPerHour` and `CostPerInference`. We believe these metrics will help you narrow down to a specific endpoint configuration that suits your use case. 

Example:   

If your motivation is overall price-performance with an emphasis on throughput, then you should focus on `CostPerInference` metrics  
If your motivation is a balance between latency and throughput, then you should focus on `ModelLatency` / `MaxInvocations` metrics

| Metric | Description |
| --- | --- |
| ModelLatency | The interval of time taken by a model to respond as viewed from SageMaker. This interval includes the local communication times taken to send the request and to fetch the response from the container of a model and the time taken to complete the inference in the container. <br /> Units: Milliseconds |
| MaxInvocations | The maximum number of InvokeEndpoint requests sent to an endpoint per minute. <br /> Units: None |
| CostPerHour | The estimated cost per hour for your real-time endpoint. <br /> Units: US Dollars |
| CostPerInference | The estimated cost per inference for your real-time endpoint. <br /> Units: US Dollars |

In [None]:
import pprint
import pandas as pd

# Poll the recommender job every 5 minutes until it reaches a terminal status.
finished = False
while not finished:
    inference_recommender_job = sm_client.describe_inference_recommendations_job(
        JobName=str(default_job)
    )
    if inference_recommender_job["Status"] in ["COMPLETED", "STOPPED", "FAILED"]:
        finished = True
    else:
        print("In progress")
        time.sleep(300)

# Report the outcome. STOPPED is reported separately so a stopped job is not
# announced as completed, and a missing FailureReason no longer raises KeyError.
job_status = inference_recommender_job["Status"]
if job_status == "FAILED":
    print("Inference recommender job failed ")
    print(
        "Failed Reason: {}".format(
            inference_recommender_job.get("FailureReason", "not reported")
        )
    )
elif job_status == "STOPPED":
    print("Inference recommender job stopped before completion")
else:
    print("Inference recommender job completed")

In progress


### Detailing out the result

In [None]:
# Flatten each recommendation's endpoint configuration, model configuration and
# metrics into one row.
recommendations = inference_recommender_job["InferenceRecommendations"]
data = [
    {**rec["EndpointConfiguration"], **rec["ModelConfiguration"], **rec["Metrics"]}
    for rec in recommendations
]
# Build the table without the VariantName column.
df = pd.DataFrame(data).drop(columns="VariantName")
pd.set_option("max_colwidth", 400)
df.head()

### CloudWatch metrics

In [154]:
from cloudwatch import *

In [150]:
# CloudWatch client in the notebook's region, passed to get_endpoint_metrics below.
cw_client = boto3.client("cloudwatch", region)

In [160]:
# Fetch per-endpoint metrics for the recommender job, with plots enabled.
# NOTE(review): get_endpoint_metrics presumably comes from the star import of
# the local `cloudwatch` module — confirm.
get_endpoint_metrics(sm_client, cw_client, region, str(default_job), include_plots=True)

Unnamed: 0,InstanceType,MaximumInvocations,ModelLatency,CostPerHour,CostPerInference,EndpointName,VariantName,InitialCount,EnvParameters,StartTime,EndTime
0,ml.g4dn.xlarge,41,4410,0.7360000014305114,0.0002991869987454,sm-epc-476aa900-8445-467e-8919-327b16270596,sm-epc-476aa900-8445-467e-8919-327b16270596,1,[],2022-04-08 21:42:51.425000+00:00,2022-04-08 22:19:25.090000+00:00
1,ml.g4dn.xlarge,42,4614,0.7360000014305114,0.0002920634869951,sm-epc-7d9aff47-fafd-462b-b2c6-ef8868be46fc,sm-epc-7d9aff47-fafd-462b-b2c6-ef8868be46fc,1,[],2022-04-08 21:42:51.425000+00:00,2022-04-08 22:19:25.090000+00:00
2,ml.inf1.6xlarge,1155,1356,1.4160000085830688,2.04329007829e-05,sm-epc-93bb16bb-7eb3-40e2-932c-26afae165533,sm-epc-93bb16bb-7eb3-40e2-932c-26afae165533,1,"[{'Key': 'NEURONCORE_GROUP_SIZES', 'ValueType': 'string', 'Value': '1'}]",2022-04-08 21:42:51.425000+00:00,2022-04-08 22:19:25.090000+00:00
