# Reduce costs with Elastic Inference 

Create this script…

In [1]:
%%bash
cat create_sm_tarball.py

import torch, torchvision
import subprocess

# Toggle inference mode
model = torchvision.models.densenet121(pretrained=True).eval()
cv_input = torch.rand(1,3,224,224)
model = torch.jit.trace(model,cv_input)
torch.jit.save(model, 'model.pt')
subprocess.call(['tar', '-czvf', 'densenet121_traced.tar.gz', 'model.pt'])


And and empty file call script.py as placeholder.

In [2]:
! touch script.py

# Create the endpoint

In [3]:
import sagemaker
from sagemaker.pytorch import PyTorchModel

sagemaker_session = sagemaker.Session()
region = 'us-west-2'
role = 'arn:aws:iam::921212210452:role/service-role/AmazonSageMaker-ExecutionRole-20191122T164449'

In [4]:
instance_type = 'c5.large'
accelerator_type = 'eia2.medium'

ecr_image = '763104351884.dkr.ecr.{}.amazonaws.com/pytorch-inference-eia:1.3.1-cpu-py3'.format(region)

# Satisfy regex
endpoint_name = 'pt-ei-densenet121-tracedV-{}-{}'.format(instance_type, accelerator_type).replace('.', '').replace('_', '')
tar_filename = 'densenet121_traced.tar.gz'

# script.py should be blank to use default EI model_fn and predict_fn
# For non-EI PyTorch usage, must implement own model_fn
entry_point = 'script.py'

# Returns S3 bucket URL
print('Upload tarball to S3')
model_data = sagemaker_session.upload_data(path=tar_filename)

Upload tarball to S3


In [5]:


pytorch = PyTorchModel(framework_version='1.4.0', model_data=model_data, 
                role=role, image=ecr_image, entry_point=entry_point, sagemaker_session=sagemaker_session)

# Function will exit before endpoint is finished creating
predictor = pytorch.deploy(initial_instance_count=1, instance_type='ml.' + instance_type, 
                accelerator_type='ml.' + accelerator_type, endpoint_name=endpoint_name, wait=False)


In [6]:
import sagemaker
from sagemaker.pytorch import PyTorchPredictor
import torch
import boto3
import datetime
import math
import numpy as np
import time

instance_type = 'c5.large'
accelerator_type = 'eia2.medium'

In [8]:
endpoint_name = 'pt-ei-densenet121-tracedV-{}-{}'.format(instance_type, accelerator_type).replace('.', '').replace('_', '')
predictor = PyTorchPredictor(endpoint_name)
data = torch.rand(1,3,224,224)

# Do warmup round of 100 inferences to warm up routers
print('Doing warmup round of 100 inferences (not counted)')
for i in range(100):
  output = predictor.predict(data)
time.sleep(15)

client_times = []
print('Running 1000 inferences for {}:'.format(endpoint_name))
cw_start = datetime.datetime.utcnow()
for i in range(1000):
  client_start = time.time()
  output = predictor.predict(data)
  client_end = time.time()
  client_times.append((client_end - client_start)*1000)
cw_end = datetime.datetime.utcnow()

print('Client end-to-end latency percentiles:')
client_avg = np.mean(client_times)
client_p50 = np.percentile(client_times, 50)
client_p90 = np.percentile(client_times, 90)
client_p95 = np.percentile(client_times, 95)
client_p100 = np.percentile(client_times, 100)
print('Avg | P50 | P90 | P95 | P100')
print('{:.4f} | {:.4f} | {:.4f} | {:.4f}\n'.format(client_avg, client_p50, client_p90, client_p95, client_p100))

print('Getting Cloudwatch:')
cloudwatch = boto3.client('cloudwatch')
statistics=['SampleCount', 'Average', 'Minimum', 'Maximum']
extended=['p50', 'p90', 'p95', 'p100']

# Give 5 minute buffer to end
cw_end += datetime.timedelta(minutes=5)

# Period must be 1, 5, 10, 30, or multiple of 60
# Calculate closest multiple of 60 to the total elapsed time
factor = math.ceil((cw_end - cw_start).total_seconds() / 60)
period = factor * 60
print('Time elapsed: {} seconds'.format((cw_end - cw_start).total_seconds()))
print('Using period of {} seconds\n'.format(period))

cloudwatch_ready = False
# Keep polling CloudWatch metrics until datapoints are available
while not cloudwatch_ready:
  time.sleep(30)
  print('Waiting 30 seconds ...')
  # Must use default units of microseconds
  model_latency_metrics = cloudwatch.get_metric_statistics(MetricName='ModelLatency',
                                             Dimensions=[{'Name': 'EndpointName',
                                                          'Value': endpoint_name},
                                                         {'Name': 'VariantName',
                                                          'Value': "AllTraffic"}],
                                             Namespace="AWS/SageMaker",
                                             StartTime=cw_start,
                                             EndTime=cw_end,
                                             Period=period,
                                             Statistics=statistics,
                                             ExtendedStatistics=extended
                                             )

  # Should be 1000
  if len(model_latency_metrics['Datapoints']) > 0:
    print('{} latency datapoints ready'.format(model_latency_metrics['Datapoints'][0]['SampleCount']))
    print('Side-car latency percentiles:')
    side_avg = model_latency_metrics['Datapoints'][0]['Average'] / 1000
    side_p50 = model_latency_metrics['Datapoints'][0]['ExtendedStatistics']['p50'] / 1000
    side_p90 = model_latency_metrics['Datapoints'][0]['ExtendedStatistics']['p90'] / 1000
    side_p95 = model_latency_metrics['Datapoints'][0]['ExtendedStatistics']['p95'] / 1000
    side_p100 = model_latency_metrics['Datapoints'][0]['ExtendedStatistics']['p100'] / 1000
    print('Avg | P50 | P90 | P95 | P100')
    print('{:.4f} | {:.4f} | {:.4f} | {:.4f}\n'.format(side_avg, side_p50, side_p90, side_p95, side_p100))

    cloudwatch_ready = True

Doing warmup round of 100 inferences (not counted)
Running 1000 inferences for pt-ei-densenet121-tracedV-c5large-eia2medium:
Client end-to-end latency percentiles:
Avg | P50 | P90 | P95 | P100
65.0304 | 63.2639 | 72.3619 | 76.7848

Getting Cloudwatch:
Time elapsed: 365.033259 seconds
Using period of 420 seconds

Waiting 30 seconds ...
740.0 latency datapoints ready
Side-car latency percentiles:
Avg | P50 | P90 | P95 | P100
50.4217 | 49.1494 | 57.2891 | 61.2198

