In [1]:
#!pip install --upgrade pip

import os
import sagemaker
print(sagemaker.__version__)

from sagemaker.pytorch import PyTorch
from sagemaker.pytorch.model import PyTorchModel

sagemaker_session = sagemaker.Session()

role = sagemaker.get_execution_role()

2.127.0


In [2]:
role

'arn:aws:iam::249517808360:role/service-role/AmazonSageMaker-ExecutionRole-20220920T155276'

In [4]:
# upload new model to s3 with boto3

import boto3
# Define session and bucket

bucket = sagemaker_session.default_bucket()
region = boto3.session.Session().region_name

s3_client = boto3.client('s3', region_name=region)
prefix = "newmodel"
s3_client.upload_file('model.tar.gz', bucket, prefix + '/model.tar.gz')

model_uri = 's3://{}/model.tar.gz'.format(bucket)
new_model_uir = 's3://{}/{}/model.tar.gz'.format(bucket,prefix)
print(model_uri,new_model_uir)


s3://sagemaker-cn-northwest-1-383709301087/model.tar.gz s3://sagemaker-cn-northwest-1-383709301087/newmodel/model.tar.gz


## Sagemaker endpoint Autoscaling 配置

### 列出现有的线上的endpoint
根据已经部署好的终端节点endpoint 确定终端节点名称endpoint name 以及 终端节点配置endpoint config

In [88]:
import pprint
import json
from datetime import date, datetime

pp = pprint.PrettyPrinter(indent=4, depth=4)

sagemaker_client = boto3.client('sagemaker')
response = sagemaker_client.list_endpoints(SortBy='CreationTime',StatusEquals='InService',NameContains='pytorch',CreationTimeAfter=datetime(2022, 10, 1))
pp.pprint(response)

{   'Endpoints': [   {   'CreationTime': datetime.datetime(2022, 12, 1, 2, 56, 41, 972000, tzinfo=tzlocal()),
                         'EndpointArn': 'arn:aws-cn:sagemaker:cn-northwest-1:383709301087:endpoint/pytorch-inference-2022-12-01-02-56-41-738',
                         'EndpointName': 'pytorch-inference-2022-12-01-02-56-41-738',
                         'EndpointStatus': 'InService',
                         'LastModifiedTime': datetime.datetime(2022, 12, 2, 13, 27, 6, 93000, tzinfo=tzlocal())}],
    'ResponseMetadata': {   'HTTPHeaders': {   'content-length': '289',
                                               'content-type': 'application/x-amz-json-1.1',
                                               'date': 'Fri, 02 Dec 2022 '
                                                       '13:39:01 GMT',
                                               'x-amzn-requestid': '4833c855-238b-420b-bd0c-ce362413ac67'},
                            'HTTPStatusCode': 200,
                    

In [89]:
endpoint_name = response['Endpoints'][0]['EndpointName']
endpoint_name

'pytorch-inference-2022-12-01-02-56-41-738'

In [93]:
# fetch endpoint details
response = client.describe_endpoint(EndpointName=endpoint_name)
pp.pprint(response)
old_endpointconfig = response['EndpointConfigName']

{   'CreationTime': datetime.datetime(2022, 12, 1, 2, 56, 41, 972000, tzinfo=tzlocal()),
    'EndpointArn': 'arn:aws-cn:sagemaker:cn-northwest-1:383709301087:endpoint/pytorch-inference-2022-12-01-02-56-41-738',
    'EndpointConfigName': 'new-endpoint-config',
    'EndpointName': 'pytorch-inference-2022-12-01-02-56-41-738',
    'EndpointStatus': 'InService',
    'LastModifiedTime': datetime.datetime(2022, 12, 2, 13, 27, 6, 93000, tzinfo=tzlocal()),
    'ProductionVariants': [   {   'CurrentInstanceCount': 2,
                                  'CurrentWeight': 1.0,
                                  'DeployedImages': [{...}],
                                  'DesiredInstanceCount': 2,
                                  'DesiredWeight': 1.0,
                                  'VariantName': 'Variant2'}],
    'ResponseMetadata': {   'HTTPHeaders': {   'content-length': '778',
                                               'content-type': 'application/x-amz-json-1.1',
                         

In [115]:
response = client.describe_endpoint_config(EndpointConfigName =old_endpointconfig)
old_endpointVariants = response['ProductionVariants'][0]
old_endpointVariants

{'VariantName': 'AllTraffic',
 'ModelName': 'pytorch-inference-2022-12-01-02-56-41-175',
 'InitialInstanceCount': 2,
 'InstanceType': 'ml.g4dn.2xlarge',
 'InitialVariantWeight': 1.0}

### 配置自动扩展策略

In [127]:

asg_client = boto3.client('application-autoscaling')

register_variants = client.describe_endpoint(EndpointName=endpoint_name)['ProductionVariants'][0]['VariantName']
resource_id='endpoint/' + endpoint_name + '/variant/' + register_variants # This is the format in which application autoscaling references the endpoint

response = asg_client.register_scalable_target(
    ServiceNamespace='sagemaker', #
    ResourceId=resource_id,
    ScalableDimension='sagemaker:variant:DesiredInstanceCount',
    MinCapacity=1,
    MaxCapacity=20
)

response

{'ResponseMetadata': {'RequestId': 'bf509636-75ce-4249-8f3b-294bd89d988a',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'bf509636-75ce-4249-8f3b-294bd89d988a',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '2',
   'date': 'Fri, 02 Dec 2022 14:18:59 GMT'},
  'RetryAttempts': 0}}

## 更新线上终端节点
首先创建model（指定container，model文件），再用production_variant生成新的模型版本（variant_new）,然后用create_endpoint_config创建新的模型部署定义（instance型号数量等），update_endpoint更新endpoint。

### 列出现有的注册的模型
根据已经注册的模型拿到在用的推理镜像uri

In [51]:
response = sagemaker_client.list_models(SortBy='CreationTime',CreationTimeAfter=datetime(2022, 10, 1))
model_name = response['Models'][0]['ModelName']

response = sagemaker_client.describe_model(
    ModelName = model_name
)
pp.pprint(response)
image_uri = response['PrimaryContainer']['Image']
image_uri

{   'CreationTime': datetime.datetime(2022, 12, 1, 2, 56, 41, 621000, tzinfo=tzlocal()),
    'EnableNetworkIsolation': False,
    'ExecutionRoleArn': 'arn:aws-cn:iam::383709301087:role/sgbootcamp-internal-SageMakerExecutionRole-1HIZ19SFK76J4',
    'ModelArn': 'arn:aws-cn:sagemaker:cn-northwest-1:383709301087:model/pytorch-inference-2022-12-01-02-56-41-175',
    'ModelName': 'pytorch-inference-2022-12-01-02-56-41-175',
    'PrimaryContainer': {   'Environment': {   'SAGEMAKER_CONTAINER_LOG_LEVEL': '20',
                                               'SAGEMAKER_MODEL_SERVER_WORKERS': '2',
                                               'SAGEMAKER_PROGRAM': 'inference.py',
                                               'SAGEMAKER_REGION': 'cn-northwest-1',
                                               'SAGEMAKER_SUBMIT_DIRECTORY': '/opt/ml/model/code'},
                            'Image': '727897471807.dkr.ecr.cn-northwest-1.amazonaws.com.cn/pytorch-inference:1.10.2-gpu-py38',
          

'727897471807.dkr.ecr.cn-northwest-1.amazonaws.com.cn/pytorch-inference:1.10.2-gpu-py38'

### 注册新的模型

In [54]:
response = sagemaker_client.create_model(ModelName = model_name+'-new', ExecutionRoleArn = role, PrimaryContainer={
        'Image': image_uri,
        'ImageConfig': {
            'RepositoryAccessMode': 'Platform'
        },
        'Mode': 'SingleModel',
        'ModelDataUrl': new_model_uir       
    })
response

{'ModelArn': 'arn:aws-cn:sagemaker:cn-northwest-1:383709301087:model/pytorch-inference-2022-12-01-02-56-41-175-new',
 'ResponseMetadata': {'RequestId': '9e4c4c71-2943-40f4-bac5-3c9275054ad6',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '9e4c4c71-2943-40f4-bac5-3c9275054ad6',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '115',
   'date': 'Fri, 02 Dec 2022 12:42:52 GMT'},
  'RetryAttempts': 0}}

### 创建新的生产变体variant
最好先将新模型变体的'InitialVariantWeight'设置为0，等部署完成之后再调整两个变体之间的权重

In [110]:
from sagemaker.session import production_variant

variant2 = production_variant(model_name=model_name+'-new',
                              instance_type='ml.g4dn.2xlarge',
                              initial_instance_count=2,
                              variant_name='VariantNew',
                              initial_weight=1)


In [111]:
variant_new

{'ModelName': 'pytorch-inference-2022-12-01-02-56-41-175-new',
 'VariantName': 'VariantNew',
 'InitialVariantWeight': 1.0,
 'InitialInstanceCount': 1,
 'InstanceType': 'ml.g4dn.4xlarge'}

In [116]:
old_endpointVariants

{'VariantName': 'AllTraffic',
 'ModelName': 'pytorch-inference-2022-12-01-02-56-41-175',
 'InitialInstanceCount': 2,
 'InstanceType': 'ml.g4dn.2xlarge',
 'InitialVariantWeight': 1.0}

### 创建新的终端节点配置
新的终端节点配置里面包括 旧模型的生产变体，新模型的生产变体

In [129]:
newEndpointConfigName = 'pytorch-inference-new-endpoint-config'
response = sagemaker_client.create_endpoint_config( EndpointConfigName = newEndpointConfigName,
                                                   ProductionVariants= [old_endpointVariants,variant_new])

In [131]:
pp.pprint(client.describe_endpoint_config(EndpointConfigName =newEndpointConfigName))

{   'CreationTime': datetime.datetime(2022, 12, 2, 14, 5, 57, 745000, tzinfo=tzlocal()),
    'EndpointConfigArn': 'arn:aws-cn:sagemaker:cn-northwest-1:383709301087:endpoint-config/pytorch-inference-new-endpoint-config',
    'EndpointConfigName': 'pytorch-inference-new-endpoint-config',
    'ProductionVariants': [   {   'InitialInstanceCount': 2,
                                  'InitialVariantWeight': 1.0,
                                  'InstanceType': 'ml.g4dn.2xlarge',
                                  'ModelName': 'pytorch-inference-2022-12-01-02-56-41-175',
                                  'VariantName': 'AllTraffic'},
                              {   'InitialInstanceCount': 1,
                                  'InitialVariantWeight': 1.0,
                                  'InstanceType': 'ml.g4dn.4xlarge',
                                  'ModelName': 'pytorch-inference-2022-12-01-02-56-41-175-new',
                                  'VariantName': 'VariantNew'}],
    'Respo

### 更新终端节点
需要先取消注册自动扩展才能更新终端节点

In [128]:
asg_client = boto3.client('application-autoscaling')
deregister_variant= client.describe_endpoint(EndpointName=endpoint_name)['ProductionVariants'][0]['VariantName'] 

resource_id='endpoint/' + endpoint_name + '/variant/' + deregister_variant
response = asg_client.deregister_scalable_target(
    ServiceNamespace='sagemaker',
    ResourceId=resource_id,
    ScalableDimension='sagemaker:variant:DesiredInstanceCount')

新的终端节点有两个变体，一个对应新模型，一个对应旧模型，更新的时候endpoint会变成 updating的状态，大约需要花 7-8 mins

In [132]:
response = sagemaker_client.update_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=newEndpointConfigName)

可以调整两个变体(即新旧模型)之间的流量比例，大约花费1min

In [134]:
variant1= client.describe_endpoint(EndpointName=endpoint_name)['ProductionVariants'][0]['VariantName'] 
variant2= client.describe_endpoint(EndpointName=endpoint_name)['ProductionVariants'][-1]['VariantName']

In [135]:
response = sagemaker_client.update_endpoint_weights_and_capacities(
    EndpointName=endpoint_name,
    DesiredWeightsAndCapacities=[
        {
            "DesiredWeight": 75,
            "VariantName": variant1
        },
        {
            "DesiredWeight": 25,
            "VariantName": variant2
        }
    ]
)