# A/B testing, traffic shifting and autoscaling
In this lab you will create an endpoint with multiple variants, splitting the traffic between them. Then, after testing and reviewing the endpoint performance metrics, you will shift the traffic to one variant and configure it to autoscale.

In [2]:
# please ignore warning messages during the installation
!pip install --disable-pip-version-check -q sagemaker==2.35.0
!conda install -q -y pytorch==1.6.0 -c pytorch
!pip install --disable-pip-version-check -q transformers==3.5.1

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

import boto3, sagemaker, pandas as pd, botocore

# Shared botocore configuration: appends a custom suffix to the user agent of
# every client created with it below.
config = botocore.config.Config(user_agent_extra='dlai-pds/c3/w2')

# Low-level boto3 clients. `sm` is used later to create models, endpoint
# configurations and endpoints; `sm_runtime` is handed to the SDK session.
sm = boto3.client(service_name='sagemaker', config=config)
sm_runtime = boto3.client('sagemaker-runtime', config=config)
# SageMaker SDK session built on top of the two clients above.
sess = sagemaker.Session(sagemaker_client=sm, sagemaker_runtime_client=sm_runtime)

# Session-derived settings. `role` and `region` are used by later cells;
# `bucket` is not referenced anywhere in the visible part of this notebook.
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = sess.boto_region_name

# CloudWatch client, used later to read endpoint metrics.
cw = boto3.client(service_name='cloudwatch', config=config)

# Application Auto Scaling client, used later to register the scalable target
# and attach the scaling policy.
autoscale = boto3.client(service_name="application-autoscaling", config=config)

[0mCollecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

[0m

<a name='c3w2-1.'></a>
# 1. Create an endpoint with multiple variants
Let's deploy an endpoint splitting the traffic between these two models 50/50 to perform A/B Testing. Instead of creating a PyTorch Model object and calling `model.deploy()` function, you will create an `Endpoint configuration` with multiple model variants. Here is the workflow you will follow to create an endpoint:
<img src="./c3w2/images/endpoint-workflow.png" width="60%" align="center">

In [None]:
# Two models are saved in the following S3 paths. These `tar.gz` files contain
# the model artifacts, which result from model training.
model_a_s3_uri = 's3://dlai-practical-data-science/models/ab/variant_a/model.tar.gz'
model_b_s3_uri = 's3://dlai-practical-data-science/models/ab/variant_b/model.tar.gz'

# Instance type used both to look up the inference image and to host each variant.
inference_instance_type = 'ml.m5.large'

# Look up the ECR image URI of the PyTorch 1.6.0 inference container for this
# region and instance type.
inference_image_uri = sagemaker.image_uris.retrieve(
    framework="pytorch",version='1.6.0',instance_type=inference_instance_type, 
    region=region, py_version='py3', image_scope='inference')
print(inference_image_uri)

# Create Amazon SageMaker Models.
# A single epoch timestamp is shared by the model, endpoint-configuration and
# endpoint names created in this notebook, so resources of one run belong together.
import time
from pprint import pprint
timestamp = int(time.time())
model_name_a = '{}-{}'.format('a', timestamp)
model_name_b = '{}-{}'.format('b', timestamp)

def check_model_existence(model_name):
    """Return True if a SageMaker model named exactly `model_name` exists.

    The previous implementation inspected only the first page returned by
    `list_models()`, so a model that existed on a later page was reported as
    missing. This version walks every page through the boto3 paginator and asks
    the service to pre-filter by name to keep the listing small.

    Args:
        model_name: Name of the model to look for.

    Returns:
        bool: True when a model with exactly this name is found, False otherwise.
    """
    paginator = sm.get_paginator('list_models')
    # `NameContains` is a substring filter, so the exact comparison below is still required.
    for page in paginator.paginate(NameContains=model_name):
        if any(model['ModelName'] == model_name for model in page['Models']):
            return True
    return False

# Create an Amazon SageMaker Model from the model_a_s3_uri artifacts.
# The existence check makes the cell safe to re-run with the same `timestamp`:
# the model is only created when no model with that name is found.
if not check_model_existence(model_name_a):
    model_a = sm.create_model(ModelName=model_name_a, ExecutionRoleArn=role,
                              PrimaryContainer={'ModelDataUrl': model_a_s3_uri, 'Image':inference_image_uri })
    pprint(model_a)
else:
    print("Model {} already exists".format(model_name_a))

# Create an Amazon SageMaker Model from the model_b_s3_uri artifacts.
# Same container image and execution role as model A; only the artifacts differ.
if not check_model_existence(model_name_b):
    model_b = sm.create_model(ModelName=model_name_b, ExecutionRoleArn=role, 
                              PrimaryContainer={'ModelDataUrl': model_b_s3_uri, 'Image': inference_image_uri})
    pprint(model_b)
else:
    print("Model {} already exists".format(model_name_b))

# Set up Amazon SageMaker production variants.
# A production variant is a packaged SageMaker Model combined with the
# configuration related to how that model will be hosted. You have constructed
# the models in the section above. The hosting configuration includes the number
# and type of instances, a pointer to the SageMaker model, a variant name and a
# variant weight. A single SageMaker Endpoint can include multiple production
# variants.
# Create one production variant per SageMaker Model:
from sagemaker.session import production_variant

# Variant A and Variant B get equal weights (50 each), i.e. an even traffic split.
variantA = production_variant(
    model_name=model_name_a, instance_type=inference_instance_type,
    initial_weight=50,  # traffic distribution weight
    initial_instance_count=1, variant_name='VariantA',) # production variant name
print(variantA)

variantB = production_variant(
    model_name=model_name_b, instance_type=inference_instance_type, initial_weight=50, 
    initial_instance_count=1, variant_name='VariantB',)
print(variantB)

#Configure and create the endpoint;
#Check if the endpoint configuration and endpoint itself already exist in Amazon SageMaker.
def check_endpoint_config_existence(endpoint_config_name):
    """Return True if an endpoint configuration with exactly this name exists.

    The previous implementation inspected only the first page returned by
    `list_endpoint_configs()`, so a configuration that existed on a later page
    was reported as missing. This version walks every page through the boto3
    paginator and asks the service to pre-filter by name.

    Args:
        endpoint_config_name: Name of the endpoint configuration to look for.

    Returns:
        bool: True when a configuration with exactly this name is found,
        False otherwise.
    """
    paginator = sm.get_paginator('list_endpoint_configs')
    # `NameContains` is a substring filter, so the exact comparison below is still required.
    for page in paginator.paginate(NameContains=endpoint_config_name):
        if any(cfg['EndpointConfigName'] == endpoint_config_name for cfg in page['EndpointConfigs']):
            return True
    return False

def check_endpoint_existence(endpoint_name):
    """Return True if a SageMaker endpoint named exactly `endpoint_name` exists.

    The previous implementation inspected only the first page returned by
    `list_endpoints()`, so an endpoint that existed on a later page was reported
    as missing. This version walks every page through the boto3 paginator and
    asks the service to pre-filter by name.

    Args:
        endpoint_name: Name of the endpoint to look for.

    Returns:
        bool: True when an endpoint with exactly this name is found, False otherwise.
    """
    paginator = sm.get_paginator('list_endpoints')
    # `NameContains` is a substring filter, so the exact comparison below is still required.
    for page in paginator.paginate(NameContains=endpoint_name):
        if any(endpoint['EndpointName'] == endpoint_name for endpoint in page['Endpoints']):
            return True
    return False

# Create the endpoint configuration.
# It bundles both production variants, so one endpoint serves model A and model B.
endpoint_config_name = '{}-{}'.format('ab', timestamp)

# Only create the configuration when none with this name is found, so the cell
# can be re-run with the same `timestamp`.
if not check_endpoint_config_existence(endpoint_config_name):
    endpoint_config = sm.create_endpoint_config(
        EndpointConfigName=endpoint_config_name, 
        ProductionVariants=[variantA, variantB])
    pprint(endpoint_config)
else:
    print("Endpoint configuration {} already exists".format(endpoint_config_name))

# Create the endpoint from that configuration.
# The endpoint deliberately gets the same 'ab-<timestamp>' name as its configuration.
model_ab_endpoint_name = '{}-{}'.format('ab', timestamp)
if not check_endpoint_existence(model_ab_endpoint_name):
    endpoint_response = sm.create_endpoint(EndpointName=model_ab_endpoint_name, EndpointConfigName=endpoint_config_name )
    print('Creating endpoint {}'.format(model_ab_endpoint_name))
    pprint(endpoint_response)
else:
    print("Endpoint {} already exists".format(model_ab_endpoint_name))

# Review the created endpoint configuration and endpoint in the AWS console.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from the internal `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# `target="_blank"` opens each link in a new tab (plain "blank" would instead
# name a single window that every such link reuses).
display(HTML('<b>Review <a target="_blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/endpointConfig/{}">REST Endpoint configuration</a></b>'.format(region, endpoint_config_name)))
display(HTML('<b>Review <a target="_blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/endpoints/{}">SageMaker REST endpoint</a></b>'.format(region, model_ab_endpoint_name)))

In [None]:
# Wait for the endpoint: block this cell until the `endpoint_in_service` waiter
# succeeds, so the following cells only run against a deployed endpoint.
#%%time
waiter = sm.get_waiter('endpoint_in_service')
waiter.wait(EndpointName=model_ab_endpoint_name)

## Test model on a few sample strings

In [None]:
# Create an Amazon SageMaker Predictor based on the deployed endpoint.
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONLinesSerializer
from sagemaker.deserializers import JSONLinesDeserializer

# Sample payload: one record per review text, each under the "features" key.
inputs = [{"features": ["I love this product!"]}, {"features": ["OK, but not great."]}, {"features": ["This is not the right product."]},]

predictor = Predictor(
    endpoint_name=model_ab_endpoint_name,
    serializer=JSONLinesSerializer(), #a serializer object, used to encode data for an inference endpoint
    deserializer=JSONLinesDeserializer(),sagemaker_session=sess)

# Send all sample records to the endpoint in a single request.
predicted_classes = predictor.predict(inputs)

# Each returned item is read through its 'predicted_label' and 'probability' keys.
for predicted_class in predicted_classes:
    print("Predicted class {} with probability {}".format(predicted_class['predicted_label'], predicted_class['probability']))

In [None]:
#Generate traffic and review the endpoint performance metrics; Now you will generate traffic. To analyze the endpoint performance you will review some of the metrics that Amazon SageMaker emits in CloudWatch: CPU Utilization, Latency and Invocations. Full list of namespaces and metrics can be found [here](https://docs.aws.amazon.com/sagemaker/latest/dg/monitoring-cloudwatch.html). CloudWatch `get_metric_statistics` documentation can be found [here](https://docs.aws.amazon.com/AmazonCloudWatch/latest/APIReference/API_GetMetricStatistics.html).

def plot_endpoint_metrics_for_variants(endpoint_name, namespace_name, metric_name, variant_names, start_time, 
                                     end_time):
    """Plot one CloudWatch metric as a line chart with one series per production variant.

    For every variant the per-minute "Sum" statistic of `metric_name` is fetched
    from CloudWatch; the per-variant series are outer-joined on their timestamps
    and plotted together.

    Plotting stays best-effort (a failure never aborts the notebook), but
    problems are no longer swallowed silently: when no datapoints exist yet,
    "Metrics not yet available" is printed, and any other error is reported
    with its cause. Unlike the former bare `except:`, KeyboardInterrupt is not
    intercepted, so the cell can still be interrupted.

    Args:
        endpoint_name: Value of the "EndpointName" metric dimension.
        namespace_name: CloudWatch namespace of the metric, e.g. "AWS/SageMaker".
        metric_name: Name of the metric, e.g. "CPUUtilization".
        variant_names: Iterable of values for the "VariantName" metric dimension.
        start_time: Time stamp that determines the first data point to return.
        end_time: Time stamp that determines the last data point to return.

    Returns:
        None. The chart is drawn as a side effect.
    """
    try:
        joint_variant_metrics = None

        for variant_name in variant_names:
            metrics = cw.get_metric_statistics( # extracts the results in a dictionary format
                Namespace=namespace_name, # the namespace of the metric, e.g. "AWS/SageMaker"
                MetricName=metric_name, # the name of the metric, e.g. "CPUUtilization"
                StartTime=start_time, # the time stamp that determines the first data point to return
                EndTime=end_time, # the time stamp that determines the last data point to return
                Period=60, # the granularity, in seconds, of the returned data points
                Statistics=["Sum"], # the metric statistics
                Dimensions=[ # dimensions, as CloudWatch treats each unique combination of dimensions as a separate metric
                    {"Name": "EndpointName", "Value": endpoint_name}, 
                    {"Name": "VariantName", "Value": variant_name}])

            if not metrics["Datapoints"]: # no data for this variant (yet): skip it
                continue

            # access the results from the dictionary using the key "Datapoints"
            df_metrics = pd.DataFrame(metrics["Datapoints"]) \
                .sort_values("Timestamp") \
                .set_index("Timestamp") \
                .drop("Unit", axis=1) \
                .rename(columns={"Sum": variant_name}) # rename the column with the metric results as a variant_name

            if joint_variant_metrics is None:
                joint_variant_metrics = df_metrics
            else:
                # outer join keeps timestamps that only one of the variants reported
                joint_variant_metrics = joint_variant_metrics.join(df_metrics, how="outer")

        if joint_variant_metrics is None:
            # None of the variants returned datapoints; previously this surfaced as a
            # swallowed AttributeError on `None.plot`.
            print("Metrics not yet available")
            return

        joint_variant_metrics.plot(title=metric_name)
    except Exception as error:
        # Best-effort: keep the notebook running, but tell the reader what went wrong.
        print("Could not plot metric {} for endpoint {}: {!r}".format(metric_name, endpoint_name, error))

# Establish wide enough time bounds to show all the charts using the same timeframe.
# The window spans 30 minutes before and after "now", so the metric plots later
# in the notebook can reuse the same `start_time` / `end_time`.
from datetime import datetime, timedelta
start_time = datetime.now() - timedelta(minutes=30)
end_time = datetime.now() + timedelta(minutes=30)
print('Start Time: {}'.format(start_time))
print('End Time: {}'.format(end_time))

# Set the list of the variant names to analyze.
variant_names = [variantA["VariantName"], variantB["VariantName"]]
print(variant_names)

# Run some predictions and view the metrics for each variant.
# 100 requests give both variants traffic to report on.
#%%time
for i in range(0, 100):
    predicted_classes = predictor.predict(inputs)

In [None]:
# Query CloudWatch for a few metrics that are split across variants. If you see
# "Metrics not yet available", please be patient: metrics may take a few minutes
# to appear in CloudWatch.
time.sleep(30)  # sleep to accommodate a slight delay in metrics gathering

# (namespace, metric name) pairs, plotted in this order.
_metrics_to_plot = [
    # CPUUtilization: the sum of each individual CPU core's utilization. Each core
    # ranges between 0 and 100, so with four CPUs the value ranges from 0% to 400%.
    ("/aws/sagemaker/Endpoints", "CPUUtilization"),
    # Invocations: the number of requests sent to a model endpoint.
    ("AWS/SageMaker", "Invocations"),
    # InvocationsPerInstance: the number of invocations sent to a model,
    # normalized by InstanceCount in each production variant.
    ("AWS/SageMaker", "InvocationsPerInstance"),
    # ModelLatency: the interval of time taken by a model to respond as viewed
    # from SageMaker (in microseconds).
    ("AWS/SageMaker", "ModelLatency"),
]

for _namespace, _metric in _metrics_to_plot:
    plot_endpoint_metrics_for_variants(
        endpoint_name=model_ab_endpoint_name,
        namespace_name=_namespace,
        metric_name=_metric,
        variant_names=variant_names,
        start_time=start_time,
        end_time=end_time,
    )

## 3. Shift the traffic to one variant and review the endpoint performance metrics
Generally, the winning model would need to be chosen. The decision would be made based on the endpoint performance metrics and some other business-related evaluations. Here you can assume that the winning model is Variant B and shift all traffic to it.

In [None]:
# Construct a list with the updated endpoint weights: Variant A drops to 0 and
# Variant B receives 100, i.e. all traffic goes to Variant B.
# No downtime occurs during this traffic-shift activity, but it takes time.

updated_endpoint_config = [
    {"VariantName": variantA["VariantName"],"DesiredWeight": 0,},
    {"VariantName": variantB["VariantName"], "DesiredWeight": 100,},]

# Update the variant weights in the configuration of the existing endpoint.
# This takes time. There is no downtime while the update is applying.
sm.update_endpoint_weights_and_capacities(EndpointName=model_ab_endpoint_name, DesiredWeightsAndCapacities=updated_endpoint_config)

# Review the endpoint in the AWS console.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from the internal `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# `target="_blank"` opens the link in a new tab (plain "blank" would instead
# name a single window that every such link reuses).
display(HTML('<b>Review <a target="_blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/endpoints/{}">SageMaker REST endpoint</a></b>'.format(region, model_ab_endpoint_name)))

In [None]:
# Wait until the `endpoint_in_service` waiter succeeds again, i.e. the weight
# update requested above has been applied.
waiter = sm.get_waiter("endpoint_in_service")
waiter.wait(EndpointName=model_ab_endpoint_name)

In [None]:
#%%time

# Generate traffic after the weight change: 100 prediction requests.
for i in range(0, 100):
    predicted_classes = predictor.predict(inputs)

In [None]:
# Run some more predictions and view the metrics for each variant.
#%%time
for i in range(0, 100):
    predicted_classes = predictor.predict(inputs)
# Make sure the predictions above ran successfully. If you see "Metrics not yet
# available", please be patient as metrics may take a few minutes to appear in
# CloudWatch. Compare the results with the plots above.

In [None]:
# Plot the same four metrics again, now that all traffic goes to one variant.
# (namespace, metric name) pairs, plotted in this order.
_metrics_to_plot = [
    # CPUUtilization: the sum of each individual CPU core's utilization. Each core
    # ranges between 0 and 100, so with four CPUs the value ranges from 0% to 400%.
    ("/aws/sagemaker/Endpoints", "CPUUtilization"),
    # Invocations: the number of requests sent to a model endpoint.
    ("AWS/SageMaker", "Invocations"),
    # InvocationsPerInstance: the number of invocations sent to a model,
    # normalized by InstanceCount in each production variant.
    ("AWS/SageMaker", "InvocationsPerInstance"),
    # ModelLatency: the interval of time taken by a model to respond as viewed
    # from SageMaker (in microseconds).
    ("AWS/SageMaker", "ModelLatency"),
]

for _namespace, _metric in _metrics_to_plot:
    plot_endpoint_metrics_for_variants(
        endpoint_name=model_ab_endpoint_name,
        namespace_name=_namespace,
        metric_name=_metric,
        variant_names=variant_names,
        start_time=start_time,
        end_time=end_time,
    )

<a name='c3w2-4.'></a>
# 4. Configure one variant to autoscale

Let's configure Variant B to autoscale. You would not autoscale Variant A since no traffic is being passed to it at this time.
First, you need to define a scalable target. It is an AWS resource and in this case you want to scale a `sagemaker` resource, as indicated in the `ServiceNamespace` parameter. The `ResourceId` identifies the SageMaker Endpoint variant to scale. Because autoscaling is used by other AWS resources, you'll see a few parameters that will remain static for scaling SageMaker Endpoints. Thus the `ScalableDimension` is a set value for SageMaker Endpoint scaling.
You also need to specify a few key parameters that control the min and max behavior for your Machine Learning instances. The `MinCapacity` indicates the minimum number of instances you plan to scale in to. The `MaxCapacity` is the maximum number of instances you want to scale out to. So in this case you always want to have at least 1 instance running and a maximum of 2 during peak periods. 

In [None]:
# Register Variant B of the endpoint as a scalable target that may run between
# 1 and 2 instances. No scaling activity is suspended.
autoscale.register_scalable_target(
    ServiceNamespace="sagemaker", ResourceId="endpoint/" + model_ab_endpoint_name + "/variant/VariantB",
    ScalableDimension="sagemaker:variant:DesiredInstanceCount", MinCapacity=1, MaxCapacity=2, RoleARN=role,
    SuspendedState={"DynamicScalingInSuspended": False, "DynamicScalingOutSuspended": False, "ScheduledScalingSuspended": False,},)

# Block until the `endpoint_in_service` waiter succeeds before continuing.
waiter = sm.get_waiter("endpoint_in_service")
waiter.wait(EndpointName=model_ab_endpoint_name)

# Check that the parameters from the call above are in the description of the scalable target.
# Lists up to 100 targets in the "sagemaker" namespace; as the last expression of
# the cell, the response is shown as the cell output.
autoscale.describe_scalable_targets(ServiceNamespace="sagemaker", MaxResults=100,)

Define and apply scaling policy using the `put_scaling_policy` function. The scaling policy provides additional information about the scaling behavior for your instance. `TargetTrackingScaling` refers to a specific autoscaling type supported by SageMaker, that uses a scaling metric and a target value as the indicator to scale.
In the scaling policy configuration, you have the predefined metric `PredefinedMetricSpecification` which is the number of invocations on your instance and the `TargetValue` which indicates the number of invocations per ML instance you want to allow before triggering your scaling policy. A scale out cooldown of 60 seconds means that after autoscaling successfully scales out it starts to calculate the cooldown time. The scaling policy won’t increase the desired capacity again until the cooldown period ends.
The scale-in cooldown setting of 300 seconds means that SageMaker will not attempt to start another scale-in activity within 300 seconds of when the last one completed.

In [None]:
# Attach a target-tracking scaling policy to the same resource and dimension
# that were registered as the scalable target (Variant B's instance count).
autoscale.put_scaling_policy(
    PolicyName="bert-reviews-autoscale-policy", ServiceNamespace="sagemaker",
    ResourceId="endpoint/" + model_ab_endpoint_name + "/variant/VariantB",
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
    PolicyType="TargetTrackingScaling",
    TargetTrackingScalingPolicyConfiguration={
        "TargetValue": 2.0, # the number of invocations per ML instance you want to allow before triggering your scaling policy
        "PredefinedMetricSpecification": {
            "PredefinedMetricType": "SageMakerVariantInvocationsPerInstance", # scaling metric
        },
        "ScaleOutCooldown": 60, # wait time, in seconds, before beginning another scale out activity after last one completes
        "ScaleInCooldown": 300,},) # wait time, in seconds, before beginning another scale in activity after last one completes

# Block until the `endpoint_in_service` waiter succeeds before generating traffic.
waiter = sm.get_waiter("endpoint_in_service")
waiter.wait(EndpointName=model_ab_endpoint_name)

# Generate traffic again and review the endpoint in the AWS console.
# NOTE(review): presumably 100 back-to-back requests exceed the TargetValue of 2.0
# and trigger a scale-out; confirm in the console.
#%%time
for i in range(0, 100):
    predicted_classes = predictor.predict(inputs)
    
# Review the autoscaling in the AWS console.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from the internal `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# `target="_blank"` opens the link in a new tab (plain "blank" would instead
# name a single window that every such link reuses).
display(HTML('<b>Review <a target="_blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/endpoints/{}">SageMaker REST endpoint</a></b>'.format(region, model_ab_endpoint_name)))