In [9]:
account_id_map = {
    'us-east-1': '785573368785',
    'us-east-2': '007439368137',
    'us-west-1': '710691900526',
    'us-west-2': '301217895009',
    'eu-west-1': '802834080501',
    'eu-west-2': '205493899709',
    'eu-west-3': '254080097072',
    'eu-north-1': '601324751636',
    'eu-south-1': '966458181534',
    'eu-central-1': '746233611703',
    'ap-east-1': '110948597952',
    'ap-south-1': '763008648453',
    'ap-northeast-1': '941853720454',
    'ap-northeast-2': '151534178276',
    'ap-southeast-1': '324986816169',
    'ap-southeast-2': '355873309152',
    'cn-northwest-1': '474822919863',
    'cn-north-1': '472730292857',
    'sa-east-1': '756306329178',
    'ca-central-1': '464438896020',
    'me-south-1': '836785723513',
    'af-south-1': '774647643957'
}

In [None]:
!pip install transformers[torch]

In [None]:
!pip install nvidia-pyindex
!pip install tritonclient[http]

!pip install -qU pip awscli boto3 sagemaker transformers


**Test how to create BERT torchscript model**

In [None]:
from transformers import BertModel, BertTokenizer, BertConfig
import torch

enc = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenizing input text
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = enc.tokenize(text)
print(f"BERT:Tokenized:Text={tokenized_text}:::")

# Masking one of the input tokens
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
print(f"BERT:indexed_tokens:={indexed_tokens}::")

# -- segments id's
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

# Creating a dummy input
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
dummy_model_input = {'input_ids':tokens_tensor, 'attention_mask':segments_tensors }
print(f"BERT:Combining:DICT: all: creating dummy:input:Model:={dummy_model_input}::")

dummy_input = [tokens_tensor, segments_tensors]
print(f"BERT:Finally combining all: creating dummy:input={dummy_input}::")

# Initializing the model with the torchscript flag
# Flag set to True even though it is not necessary as this model does not have an LM Head.
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
    num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, torchscript=True)

# Instantiating the model
model = BertModel(config)

# The model needs to be in evaluation mode for torchscript 
model.eval()

# If you are instantiating the model with `from_pretrained` you can also easily set the TorchScript flag
model = BertModel.from_pretrained("bert-base-uncased", torchscript=True)

# Creating the trace
traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
#torch.jit.save(traced_model, "./bert-uc/traced_bert.pt")

In [None]:
print("Using the BERT Tokensizer::")

print(indexed_tokens)
print(dummy_input)
print(dummy_model_input)


dummy_model_input

**Test Tokenizers**

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# load model and tokenizer
model_id = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
dummy_model_bert_input = tokenizer("This is a sample", return_tensors="pt")

print("Using the BERT:AUTO:TOKENSIZER: Tokenizer::")
print(dummy_model_bert_input) # -- dict -- input id's and attention mask


### Export as ONYX

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# export
torch.onnx.export(
    model, 
    tuple(dummy_model_input.values()), #tuple(dummy_model_input.values()),
    f="./bert-uc/torch-model.onnx",  
    input_names=['input_ids', 'attention_mask'], 
    output_names=['logits'], 
    dynamic_axes={'input_ids': {0: 'batch_size', 1: 'sequence'}, 
                  'attention_mask': {0: 'batch_size', 1: 'sequence'}, 
                  'logits': {0: 'batch_size', 1: 'sequence'}}, 
    do_constant_folding=True, 
    opset_version=13, 
)


In [None]:
[tokens_tensor, segments_tensors]

In [None]:
import torch

In [None]:
%%writefile triton-serve/bert-uc/config.pbtxt
platform: "pytorch_libtorch"
max_batch_size: 32
input [
  {
    name: "INPUT__0"
    data_type: TYPE_INT32
    dims: [512]
  },
  {
    name: "INPUT__1"
    data_type: TYPE_INT32
    dims: [512]
  }
]
output [
  {
    name: "OUTPUT__0"
    data_type: TYPE_FP32
    dims: [512, 768]
  },
  {
    name: "1634__1"
    data_type: TYPE_FP32
    dims: [768]
  }
]
instance_group {
  count: 1
  kind: KIND_GPU
}
dynamic_batching {
  preferred_batch_size: 32
}

### Run for Triton server

**Note**: Amazon SageMaker expects the model tarball file to have a top level directory with the same name as the model defined in the `config.pbtxt`. Below is the sample model directory structure

```
bert-uc
├── 1
│   └── model.pt
└── config.pbtxt
```

**Have to use the same Tokenizer to generate the input to test as BERT uncased**

In [4]:
from transformers import BertModel, BertTokenizer, BertConfig
import torch

enc = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenizing input text
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = enc.tokenize(text)
print(f"BERT:Tokenized:Text={tokenized_text}:::")

# Masking one of the input tokens
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
print(f"BERT:indexed_tokens:={indexed_tokens}::")

# -- segments id's -- CAN WE GENERATE THEM via model
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

# Creating a dummy input
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
print(tokens_tensor)
print(segments_tensors)

BERT:Tokenized:Text=['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', 'henson', 'was', 'a', 'puppet', '##eer', '[SEP]']:::
BERT:indexed_tokens:=[101, 2040, 2001, 3958, 27227, 1029, 102, 3958, 103, 2001, 1037, 13997, 11510, 102]::
tensor([[  101,  2040,  2001,  3958, 27227,  1029,   102,  3958,   103,  2001,
          1037, 13997, 11510,   102]])
tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]])


### Create the BERT Model in Torch Script mode -- .pt model
use the ore trained and use torchscript flag here

In [None]:
from transformers import BertModel, BertTokenizer, BertConfig
import torch


# If you are instantiating the model with `from_pretrained` you can also easily set the TorchScript flag
model = BertModel.from_pretrained("bert-base-uncased", torchscript=True)

# Change to eva lmodel
model.eval()

# run a dummy prediction of tokens by tensors
output = model(tokens_tensor)
print(len(output), type(output), type(output[0]))

# Creating the trace
traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
torch.jit.save(traced_model, "./triton-serve/bert-uc/1/model.pt")

### Create the LARGE CASE BERT Model in Torch Script using dummy inputs -- .pt model
Create using the dummy inputs

In [54]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {} device".format(device))


model = BertModel.from_pretrained("bert-large-uncased", torchscript=True)
enc = BertTokenizer.from_pretrained("bert-large-uncased")

bs = 1
seq_len = 512
dummy_inputs = [
    torch.randint(1000, (bs, seq_len)).to(device),
    torch.zeros(bs, seq_len, dtype=torch.int).to(device),
]
model = model.eval()
model.to(device)

traced_model = torch.jit.trace(model, dummy_inputs)
torch.jit.save(traced_model, "./triton-serve/bert-uc/1/model.pt")

print("Saved {}".format(traced_model))

Using cpu device


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Saved BertModel(
  original_name=BertModel
  (embeddings): BertEmbeddings(
    original_name=BertEmbeddings
    (word_embeddings): Embedding(original_name=Embedding)
    (position_embeddings): Embedding(original_name=Embedding)
    (token_type_embeddings): Embedding(original_name=Embedding)
    (LayerNorm): LayerNorm(original_name=LayerNorm)
    (dropout): Dropout(original_name=Dropout)
  )
  (encoder): BertEncoder(
    original_name=BertEncoder
    (layer): ModuleList(
      original_name=ModuleList
      (0): BertLayer(
        original_name=BertLayer
        (attention): BertAttention(
          original_name=BertAttention
          (self): BertSelfAttention(
            original_name=BertSelfAttention
            (query): Linear(original_name=Linear)
            (key): Linear(original_name=Linear)
            (value): Linear(original_name=Linear)
            (dropout): Dropout(original_name=Dropout)
          )
          (output): BertSelfOutput(
            original_name=BertSelfO

#### Test encoders various methods

In [64]:
enc(
    "Triton Inference Server provides a cloud and edge inferencing solution optimized for both CPUs and GPUs.", 
    padding="max_length", 
    max_length=64
   )

{'input_ids': [101, 13012, 2669, 28937, 8241, 3640, 1037, 6112, 1998, 3341, 1999, 7512, 2368, 6129, 5576, 23569, 27605, 5422, 2005, 2119, 17368, 2015, 1998, 14246, 2271, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [68]:
encoded_tokens = enc.encode_plus(
    "Triton Inference Server provides a cloud and edge inferencing solution optimized for both CPUs and GPUs.",
    add_special_tokens = True, # Add '[CLS]' and '[SEP]'
    max_length = 512,           
    pad_to_max_length = True, # Pad & truncate all sentences
)
#encoded_tokens 

**Predict test using the traced model Needs Tokens and Attention mask both**

In [86]:
import torch
import torch.nn.functional as F

encoded_input = enc(
    "Triton Inference Server provides a cloud and edge inferencing solution optimized for both CPUs and GPUs.", 
    return_tensors='pt',
    add_special_tokens = True, # Add '[CLS]' and '[SEP]'
    max_length = 512,           
    pad_to_max_length = True, # Pad & truncate all sentences

)
unscripted_output = model(**encoded_input)
scripted_output = traced_model(encoded_input['input_ids'],encoded_input['attention_mask'])
#enc.decode(output_sequences[0], skip_special_tokens=True)

# unscripted_output = r18(dummy_input)         # Get the unscripted model's prediction...
# scripted_output = r18_scripted(dummy_input)  # ...and do the same for the scripted version

unscripted_top5 = unscripted_output #F.softmax(unscripted_output, dim=1).topk(5).indices
scripted_top5 = scripted_output #F.softmax(scripted_output, dim=1).topk(5).indices

print('Python model top 5 results:\n  {}'.format(unscripted_top5))
print('TorchScript model top 5 results:\n  {}'.format(scripted_top5))

Python model top 5 results:
  (tensor([[[-0.4472,  0.3378, -0.1825,  ..., -0.8584, -1.3538,  0.4175],
         [-0.2535,  0.1439, -0.1481,  ..., -0.5559, -0.3288, -0.1320],
         [-0.1767, -0.2466, -0.3732,  ..., -0.2728,  0.2910,  0.3882],
         ...,
         [ 0.1781,  0.3419, -0.1034,  ...,  0.2751, -0.5518, -0.1267],
         [ 0.0978,  0.4017,  0.0016,  ...,  0.2461, -0.4993, -0.1283],
         [ 0.1179,  0.4896,  0.0395,  ..., -0.0642, -0.5680, -0.2543]]],
       grad_fn=<NativeLayerNormBackward0>), tensor([[-0.9507, -0.9791,  0.9999,  ..., -0.9996,  0.8956, -0.9869]],
       grad_fn=<TanhBackward0>))
TorchScript model top 5 results:
  (tensor([[[-0.4472,  0.3378, -0.1825,  ..., -0.8584, -1.3538,  0.4175],
         [-0.2535,  0.1439, -0.1481,  ..., -0.5559, -0.3288, -0.1320],
         [-0.1767, -0.2466, -0.3732,  ..., -0.2728,  0.2910,  0.3882],
         ...,
         [ 0.1781,  0.3419, -0.1034,  ...,  0.2751, -0.5518, -0.1267],
         [ 0.0978,  0.4017,  0.0016,  ...,  0

In [80]:
output_sequences

(tensor([[[-0.4472,  0.3378, -0.1825,  ..., -0.8584, -1.3538,  0.4175],
          [-0.2535,  0.1439, -0.1481,  ..., -0.5559, -0.3288, -0.1320],
          [-0.1767, -0.2466, -0.3732,  ..., -0.2728,  0.2910,  0.3882],
          ...,
          [ 0.1781,  0.3419, -0.1034,  ...,  0.2751, -0.5518, -0.1267],
          [ 0.0978,  0.4017,  0.0016,  ...,  0.2461, -0.4993, -0.1283],
          [ 0.1179,  0.4896,  0.0395,  ..., -0.0642, -0.5680, -0.2543]]],
        grad_fn=<NativeLayerNormBackward0>),
 tensor([[-0.9507, -0.9791,  0.9999,  ..., -0.9996,  0.8956, -0.9869]],
        grad_fn=<TanhBackward0>))

In [53]:
output

(tensor([[[-1.1981e-03,  3.5823e-01,  1.1576e-01,  ..., -5.2510e-01,
           -4.8926e-01,  4.9398e-01],
          [-1.1116e-01,  2.2301e-01,  2.2965e-01,  ...,  3.2444e-01,
           -4.7153e-01, -1.2064e-01],
          [-2.6466e-01,  3.8914e-01,  2.2249e-01,  ...,  1.4599e-01,
           -4.2386e-01,  1.8661e-01],
          ...,
          [-1.0477e-01,  2.0685e-01, -1.8756e-01,  ...,  2.3559e-01,
            1.5210e-02,  2.3419e-01],
          [-1.0014e+00, -5.2059e-02, -1.4015e+00,  ..., -2.5614e-01,
            8.7576e-01, -7.5121e-01],
          [ 3.1511e-01,  4.4562e-01,  2.2009e-01,  ..., -5.3646e-02,
           -3.9441e-01,  2.3170e-01]]], grad_fn=<NativeLayerNormBackward0>),
 tensor([[-0.9942, -0.9572,  0.9960,  ...,  0.1145,  0.9608, -0.7614]],
        grad_fn=<TanhBackward0>))

In [None]:
enc.encode_plus

### UPLOAD of the Model.tar after it has been created correctly by 

Because we share the same model tar with bloom and with bert-uc
rm model.tar.gz in the triton-serve directory

tar --exclude=".git" --exclude=".gitattributes" --exclude="model.tar.gz" --exclude="*.bin" -zcvf model.tar.gz

**Upload the model.tar.gz to S3 location**

In [None]:
import sagemaker
from sagemaker import get_execution_role, Session, image_uris
from sagemaker.utils import name_from_base
import boto3
region = boto3.Session().region_name
role = get_execution_role()
sm_client = boto3.client(service_name="sagemaker")
runtime_sm_client = boto3.client("sagemaker-runtime")


In [None]:
s3_model_path_triton = sagemaker.s3.S3Uploader().upload(
    local_path="./triton-serve/model.tar.gz",
    desired_s3_uri="s3://sagemaker-us-east-1-622343165275/bloom/triton_models/bert-uc",
    sagemaker_session=session
)
s3_mme_model_path='s3://sagemaker-us-east-1-622343165275/bloom/triton_models/'
print(s3_model_path_triton)
print(s3_mme_model_path)

#### Start Single Model Triton for starting

**Triton Image download and sagemaker variables**

In [10]:
from sagemaker import get_execution_role, Session, image_uris
import boto3
from sagemaker.utils import name_from_base

region = boto3.Session().region_name
role = get_execution_role()
sm_client = boto3.client(service_name="sagemaker")
runtime_sm_client = boto3.client("sagemaker-runtime")

base = "amazonaws.com.cn" if region.startswith("cn-") else "amazonaws.com"
triton_image_uri = "{account_id}.dkr.ecr.{region}.{base}/sagemaker-tritonserver:22.07-py3".format(
    account_id=account_id_map[region], region=region, base=base
)
print(triton_image_uri)

785573368785.dkr.ecr.us-east-1.amazonaws.com/sagemaker-tritonserver:22.07-py3


**Model creation**

In [None]:
endpoint_name_p5 = name_from_base(f"p5-bert-uc-")
print(endpoint_name_p5)

container_p5 = {
    'Image': triton_image_uri,
    'ModelDataUrl': s3_model_path_triton,
    'Environment': {
        #'SAGEMAKER_PROGRAM' : 'inference.py',
        #'SAGEMAKER_SUBMIT_DIRECTORY' : 'code',
        'SAGEMAKER_TRITON_DEFAULT_MODEL_NAME': 'bert-uc',
        "SAGEMAKER_TRITON_BATCH_SIZE": "16",
        "SAGEMAKER_TRITON_MAX_BATCH_DELAY": "1000",
        "SAGEMAKER_TRITON_SHM_DEFAULT_BYTE_SIZE" : "16777216", #"16777216000",
        "SAGEMAKER_TRITON_SHM_GROWTH_BYTE_SIZE": "1048576"
    }
}
create_model_response = sm_client.create_model(
    ModelName=endpoint_name_p5, ExecutionRoleArn=role, PrimaryContainer=container_p5
)
print(create_model_response)

**Endpoint config**

In [None]:
create_endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_name_p5,
    ProductionVariants=[
        {
            "InstanceType": "ml.g5.8xlarge", #"ml.g4dn.xlarge",
            "InitialVariantWeight": 1,
            "InitialInstanceCount": 1,
            "ModelName": endpoint_name_p5,
            "VariantName": "AllTraffic",
        }
    ],
)

print("Endpoint Config Arn: " + create_endpoint_config_response["EndpointConfigArn"])


**Endpoint**

In [None]:
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name_p5, EndpointConfigName=endpoint_name_p5
)

print("Endpoint Arn: " + create_endpoint_response["EndpointArn"])


In [None]:
import time
resp = sm_client.describe_endpoint(EndpointName=endpoint_name_p5)
status = resp["EndpointStatus"]
print("SINGLE:Model:endpoint:Triton:Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name_p5)
    status = resp["EndpointStatus"]
    print("Single:model:triton:Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Single:model:triton:Status: " + status)

**Now Invoke The endpoint**

In [27]:
import tritonclient.http as httpclient
from transformers import BertTokenizer
import numpy as np


def tokenize_text(text, enc, max_length=512):
    #enc = BertTokenizer.from_pretrained("bert-base-uncased")
    print(f"Tokenize:text:why??::max_length={max_length}::Tokenizer={enc}")
    encoded_text = enc(text, padding="max_length", max_length=max_length)
    return encoded_text["input_ids"], encoded_text["attention_mask"]


def _get_sample_tokenized_text_binary(text, input_names, output_names, enc, max_length=512):
    inputs = []
    outputs = []
    inputs.append(httpclient.InferInput(input_names[0], [1, max_length], "INT32"))
    inputs.append(httpclient.InferInput(input_names[1], [1, max_length], "INT32"))
    indexed_tokens, attention_mask = tokenize_text(text,enc)

    indexed_tokens = np.array(indexed_tokens, dtype=np.int32)
    indexed_tokens = np.expand_dims(indexed_tokens, axis=0)
    inputs[0].set_data_from_numpy(indexed_tokens, binary_data=True)

    attention_mask = np.array(attention_mask, dtype=np.int32)
    attention_mask = np.expand_dims(attention_mask, axis=0)
    inputs[1].set_data_from_numpy(attention_mask, binary_data=True)

    outputs.append(httpclient.InferRequestedOutput(output_names[0], binary_data=True))
    outputs.append(httpclient.InferRequestedOutput(output_names[1], binary_data=True))
    request_body, header_length = httpclient.InferenceServerClient.generate_request_body(
        inputs, outputs=outputs
    )
    return request_body, header_length


def get_sample_tokenized_text_binary_pt(text, enc, max_length=512):
    return _get_sample_tokenized_text_binary(
        text, ["INPUT__0", "INPUT__1"], ["OUTPUT__0", "1634__1"], enc, max_length
    )


def get_sample_tokenized_text_binary_trt(text, enc):
    return _get_sample_tokenized_text_binary(text, ["token_ids", "attn_mask"], ["output", "1634"], enc, max_length)

def get_decoded_text(tensors_tokens, enc):
    return_text=tokenizer.batch_decode(gen_tokens)[0]
    return return_text

In [56]:
%%time

import json
max_seq_length=512
text_triton = "Triton Inference Server provides a cloud and edge inferencing solution optimized for both CPUs and GPUs."
print(f"Leverage the Tokenizer={enc}::max_seq_length={max_seq_length}:: create above when creating the model ")

input_ids, attention_mask = tokenize_text(text_triton, enc, max_length=max_seq_length)

payload = {
    "inputs": [
        {"name": "INPUT__0", "shape": [1, max_seq_length], "datatype": "INT32", "data": input_ids},
        {"name": "INPUT__1", "shape": [1, max_seq_length], "datatype": "INT32", "data": attention_mask},
    ]
}

response = runtime_sm_client.invoke_endpoint(
    EndpointName=endpoint_name_p5, ContentType="application/octet-stream", Body=json.dumps(payload)
)

output = json.loads(response["Body"].read().decode("utf8"))

print(output.keys())

Leverage the Tokenizer=PreTrainedTokenizer(name_or_path='bert-large-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})::max_seq_length=512:: create above when creating the model 
Tokenize:text:why??::max_length=512::Tokenizer=PreTrainedTokenizer(name_or_path='bert-large-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
dict_keys(['model_name', 'model_version', 'outputs'])
CPU times: user 135 ms, sys: 28.2 ms, total: 163 ms
Wall time: 291 ms


In [58]:
output['outputs'][0]

{'name': 'OUTPUT__0',
 'datatype': 'FP32',
 'shape': [1, 512, 768],
 'data': [-0.9098714590072632,
  -0.2597026824951172,
  -0.06315510720014572,
  0.09869029372930527,
  -0.30474230647087097,
  -0.027747122570872307,
  -0.11878933012485504,
  0.2429538518190384,
  -0.02532930113375187,
  -0.4470613896846771,
  -0.09216994047164917,
  -0.06129676103591919,
  -0.30296772718429565,
  0.5541083216667175,
  0.2988329231739044,
  -0.1416700929403305,
  -0.2583746910095215,
  0.6833750009536743,
  0.23119167983531952,
  0.19423067569732666,
  -0.5840367078781128,
  -0.6408720016479492,
  0.24452438950538635,
  -0.028376661241054535,
  -0.34075701236724854,
  0.03934742882847786,
  -0.24103625118732452,
  -0.33337512612342834,
  0.18526393175125122,
  0.30861321091651917,
  -0.48096364736557007,
  0.2748537063598633,
  -0.17834848165512085,
  -0.7473604679107666,
  0.7897679209709167,
  -0.3581537902355194,
  -0.2610751986503601,
  -0.1931782215833664,
  -0.15656787157058716,
  0.026670292019

#### Use the Binary Headers for Triton - faster but same results - BUT ERRORS out as NO RESPONSE is recieved 

We can also use binary+json as the payload format to get better performance for the inference call. The specification of this format is provided here.

Note: With the binary+json format, we have to specify the length of the request metadata in the header to allow Triton to correctly parse the binary payload. This is done using a custom Content-Type header application/vnd.sagemaker-triton.binary+json;json-header-size={}.

Please not, this is different from using Inference-Header-Content-Length header on a stand-alone Triton server since custom headers are not allowed in SageMaker.


In [43]:
%%time

import json
max_seq_length=512
text_triton = "Triton Inference Server provides a cloud and edge inferencing solution optimized for both CPUs and GPUs."
print(f"Leverage the Tokenizer={enc}::max_seq_length={max_seq_length}:: create above when creating the model ")

#input_ids, attention_mask = tokenize_text(text_triton, enc, max_length=max_seq_length)

request_body, header_length = get_sample_tokenized_text_binary_pt(text_triton, enc) # this returns 


response_binary = runtime_sm_client.invoke_endpoint(
    EndpointName=endpoint_name_p5, 
    ContentType="application/vnd.sagemaker-triton.binary+json;json-header-size={}".format(header_length), 
    Body=request_body
)

# Parse json header size length from the response
header_length_prefix = "application/vnd.sagemaker-triton.binary+json;json-header-size="
header_length_str = response["ContentType"][len(header_length_prefix) :]

try:
    # Read response body
    result = httpclient.InferenceServerClient.parse_response_body(
        response_binary["Body"].read(), header_length=int(header_length_str)
    )
    output0_data = result.as_numpy("OUTPUT__0")
    output1_data = result.as_numpy("1634__1")
    print(output0_data)
    print(output1_data)
except:
    print("Error in parsing respinse -- probably the body is empty")

Leverage the Tokenizer=PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})::max_seq_length=512:: create above when creating the model 
Tokenize:text:why??::max_length=512::Tokenizer=PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
Error in parsing respinse -- probably the body is empty
CPU times: user 3.46 ms, sys: 9.35 ms, total: 12.8 ms
Wall time: 32.3 ms


In [44]:
endpoint_name_p5

'p5-bert-uc--2022-09-08-03-02-53-774'

In [47]:
%%time
import json

max_seq_length=512
text_triton = """This is a creative writing exercise. Below, you'll be given a prompt. Your story should be based on the prompt.

Prompt: A scary story about a haunted mouse
Story: On a dark and stormy night, the mouse crept in the shadows. """


print(f"Leverage the Tokenizer={enc}::max_seq_length={max_seq_length}:: create above when creating the model ")
input_ids, attention_mask = tokenize_text(text_triton, enc, max_length=max_seq_length)

payload = {
    "inputs": [
        {"name": "INPUT__0", "shape": [1, max_seq_length], "datatype": "INT32", "data": input_ids}, # -- enc.tokenize(text)}, #
        {"name": "INPUT__1", "shape": [1, max_seq_length], "datatype": "INT32", "data": attention_mask},
    ]
}

max_run = 100
for ii in range(0, max_run):
    response = runtime_sm_client.invoke_endpoint(
        EndpointName=endpoint_name_p5, ContentType="application/octet-stream", Body=json.dumps(payload)
    )

    output_dict = json.loads(response["Body"].read().decode("utf8"))

    # -- output_dict['outputs'][0]['data']  -- has 0 and 1 as 2 indexes in list 
    output_dict.keys()

    #enc.decode(output_dict['outputs'][0]['data'], skip_special_tokens=True)
output_dict.keys()

Leverage the Tokenizer=PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})::max_seq_length=512:: create above when creating the model 
Tokenize:text:why??::max_length=512::Tokenizer=PreTrainedTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
CPU times: user 14 s, sys: 1.07 s, total: 15.1 s
Wall time: 28.5 s


dict_keys(['model_name', 'model_version', 'outputs'])

In [22]:
output_dict['outputs'][0]

{'name': 'OUTPUT__0',
 'datatype': 'FP32',
 'shape': [1, 512, 768],
 'data': [-0.15243282914161682,
  -0.8572331666946411,
  0.06608188152313232,
  -0.20899571478366852,
  0.35779935121536255,
  -0.4324319064617157,
  0.21307486295700073,
  0.7328435778617859,
  0.2850395441055298,
  -0.8913273811340332,
  0.2898162603378296,
  -0.2516830265522003,
  0.17877909541130066,
  0.22467152774333954,
  -0.16646161675453186,
  0.21520552039146423,
  0.4105544686317444,
  0.49988511204719543,
  0.15959863364696503,
  0.11675862967967987,
  0.012811945751309395,
  -0.6604794859886169,
  0.46912506222724915,
  0.11688332259654999,
  0.15712383389472961,
  -0.03815995156764984,
  -0.13969361782073975,
  0.015787124633789062,
  0.11993402242660522,
  0.3402771055698395,
  -0.6780798435211182,
  0.1458730399608612,
  -0.25922656059265137,
  -0.7832548022270203,
  0.28116920590400696,
  -0.12131515890359879,
  -0.28475871682167053,
  -0.2089376002550125,
  -0.11570572853088379,
  0.35242488980293274,

In [23]:
enc.decode(output_dict['outputs'][0]['data'])

'[PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [UNK] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [UNK] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [unused0] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [UNK] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

In [None]:
input_ids
attention_mask 

# open file in write mode
with open(r'./temp-bloom/input_ids.txt', 'w') as fp:
    for item in input_ids:
        # write each item on a new line
        fp.write("%s\n" % item)
    print('Done input_ids')
    
# open file in write mode
with open(r'./temp-bloom/attention_mask.txt', 'w') as fp:
    for item in attention_mask:
        # write each item on a new line
        fp.write("%s\n" % item)
    print('Done attention_mask')


### Clean up

In [None]:
sm_client.delete_endpoint(EndpointName=endpoint_name_p5)
sm_client.delete_endpoint_config(EndpointConfigName=endpoint_name_p5)
sm_client.delete_model(ModelName=endpoint_name_p5)

In [None]:
# general imports
import boto3
import json
import os
import re
import copy
import time
from time import gmtime, strftime
import numpy as np
import datetime
import pprint
import pandas as pd

# sagemaker
import sagemaker
from sagemaker import get_execution_role

# triton
import tritonclient.http as httpclient

# transformers
from transformers import BertTokenizer

# custom CloudWatch
#from cloudwatch import get_endpoint_metrics


In [None]:
!docker run --gpus=all --rm -it  -v `pwd`/workspace:/workspace nvcr.io/nvidia/pytorch:21.08-py3 /bin/bash generate_models.sh

## START MME for triton 

**Upload first**

### Upload multiple copies for MME

In [None]:
for ii in range(1,100):
    s3_model_path_triton_mme = sagemaker.s3.S3Uploader().upload(
        local_path="./triton-serve/model.tar.gz",
        desired_s3_uri=f"s3://sagemaker-us-east-1-622343165275/bloom/triton_models/bert-uc/model-{ii}",
        sagemaker_session=session
    )
s3_model_path_mme='s3://sagemaker-us-east-1-622343165275/bloom/triton_models/bert-uc'
print("MULTIPLE:Uplodas:")
print(s3_model_path_triton_mme)
print(s3_model_path_mme)

In [None]:
model

In [None]:
s3_model_path_mme

**Create the model**

In [None]:
endpoint_name_p5_mme = name_from_base(f"p5-bert-uc-mme")
print(endpoint_name_p5_mme)

container_p5_mme = {
    'Image': triton_image_uri,
    'ModelDataUrl': s3_model_path_mme,
    'Mode':'MultiModel',
    'Environment': {
        #'SAGEMAKER_PROGRAM' : 'inference.py',
        #'SAGEMAKER_SUBMIT_DIRECTORY' : 'code',
        'SAGEMAKER_TRITON_DEFAULT_MODEL_NAME': 'model-1',
        "SAGEMAKER_TRITON_BATCH_SIZE": "16",
        "SAGEMAKER_TRITON_MAX_BATCH_DELAY": "1000",
        "SAGEMAKER_TRITON_SHM_DEFAULT_BYTE_SIZE" : "16777216", #"16777216000",
        "SAGEMAKER_TRITON_SHM_GROWTH_BYTE_SIZE": "1048576"
    }
}
create_model_response_mme = sm_client.create_model(
    ModelName=endpoint_name_p5_mme, ExecutionRoleArn=role, PrimaryContainer=container_p5_mme
)
print(create_model_response_mme)

**Create the Endpoint config**

In [None]:
create_endpoint_config_response_mme = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_name_p5_mme,
    ProductionVariants=[
        {
            "InstanceType": "ml.g4dn.xlarge", #"ml.g4dn.xlarge",ml.g5.8xlarge
            "InitialVariantWeight": 1,
            "InitialInstanceCount": 1,
            "ModelName": endpoint_name_p5_mme,
            "VariantName": "AllTraffic",
        }
    ],
)

print("Endpoint Config Arn: " + create_endpoint_config_response_mme["EndpointConfigArn"])

**Create the endpoint**

In [None]:
create_endpoint_response_mme = sm_client.create_endpoint(
    EndpointName=endpoint_name_p5_mme, EndpointConfigName=endpoint_name_p5_mme
)

print("Endpoint Arn: " + create_endpoint_response_mme["EndpointArn"])


In [None]:
import time
resp = sm_client.describe_endpoint(EndpointName=endpoint_name_p5_mme)
status = resp["EndpointStatus"]
print("MME:Model:endpoint:Triton:Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name_p5_mme)
    status = resp["EndpointStatus"]
    print("MME:model:triton:Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("MME:model:triton:Status: " + status)

**Test the end point**

In [None]:
%%time

import json
max_seq_length=512
text_triton = "Triton Inference Server provides a cloud and edge inferencing solution optimized for both CPUs and GPUs."
print(f"Leverage the Tokenizer={enc}::max_seq_length={max_seq_length}:: create above when creating the model ")
input_ids, attention_mask = tokenize_text(text_triton, enc, max_length=max_seq_length)

payload = {
    "inputs": [
        {"name": "INPUT__0", "shape": [1, max_seq_length], "datatype": "INT32", "data": input_ids},
        {"name": "INPUT__1", "shape": [1, max_seq_length], "datatype": "INT32", "data": attention_mask},
    ]
}

response = runtime_sm_client.invoke_endpoint(
    EndpointName=endpoint_name_p5_mme, ContentType="application/octet-stream", Body=json.dumps(payload), TargetModel  = "/model-9/model.tar.gz"
)

output_dict = json.loads(response["Body"].read().decode("utf8"))

# -- output_dict['outputs'][0]['data']  -- has 0 and 1 as 2 indexes in list 
output_dict.keys()

enc.decode(output_dict['outputs'][0]['data'], skip_special_tokens=True)

In [None]:
endpoint_name_p5_mme

In [None]:
response = runtime_sm_client.invoke_endpoint(
    EndpointName=endpoint_name_p5_mme, ContentType="text/json", Body=json.dumps(payload), TargetModel  = "/model-9/model.tar.gz"
)
output_dict = json.loads(response["Body"].read().decode("utf8"))

# -- output_dict['outputs'][0]['data']  -- has 0 and 1 as 2 indexes in list 
output_dict.keys()

**set up in S3 payload to be used for inference load testing**

In [None]:
max_seq_length=512
text_triton = """
                Create payload JSON and upload it on S3. 
                This will be used by Inference Recommender to run the load test.
              """

input_ids, attention_mask = tokenize_text(text_triton, enc, max_length=max_seq_length)

payload = {
    "inputs": [
        {"name": "INPUT__0", "shape": [1, max_seq_length], "datatype": "INT32", "data": input_ids},
        {"name": "INPUT__1", "shape": [1, max_seq_length], "datatype": "INT32", "data": attention_mask},
    ]
}

print(f"Sample payload to be used with Inference Recommender")
print(payload)

payload_location = "./sample-payload/"
!mkdir -p $payload_location

payload_archive_name = "payload.tar.gz"

with open(payload_location + "request.json", "w") as f:
    json.dump(payload, f)


!cd ./sample-payload/ && tar czvf ../payload.tar.gz *

print(f"payload.tar.gz created at {payload_location}/{payload_archive_name}")

**Upload sample payload to S3**

In [None]:
s3_sample_data_path_triton = sagemaker.s3.S3Uploader().upload(
    local_path=f"{payload_archive_name}",
    desired_s3_uri="s3://sagemaker-us-east-1-622343165275/bloom/triton_test_data",
    sagemaker_session=session
)
s3_sample_data_path_triton

## Inference Load test set up
### DOES NOT WORK FOR MME -- SO SKIP this section

In [None]:
ml_domain = "NATURAL_LANGUAGE_PROCESSING"
ml_task = "FILL_MASK"
ml_framework = "PYTORCH"
framework_version = "1.6.0"
model_tested = "bert-base-uncased"

In [None]:
ts = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
sm_model_name = "pt-triton-benchmark-model-" + ts
model_package_group_name = "pt-triton-benchmark-model-group-" + ts
advanced_job = "pt-triton-benchmark-advanced-job-" + ts

print(f"SageMaker Model Name: {sm_model_name}")
print(f"SageMaker Mode Package Name: {model_package_group_name}")
print(f"SageMaker Advanced Job Name: {advanced_job}")

In [None]:
s3_model_path_mme

In [None]:
container_infrec_mme = {
    'Image': triton_image_uri,
    "NearestModelName": model_tested, #'model-1',
    "Framework": ml_framework,
    'ModelDataUrl': s3_model_path_mme,
    #'Mode':'MultiModel',
    'Environment': {
        #'SAGEMAKER_PROGRAM' : 'inference.py',
        #'SAGEMAKER_SUBMIT_DIRECTORY' : 'code',
        'SAGEMAKER_TRITON_DEFAULT_MODEL_NAME': 'model-1',
        "SAGEMAKER_TRITON_BATCH_SIZE": "16",
        "SAGEMAKER_TRITON_MAX_BATCH_DELAY": "1000",
        "SAGEMAKER_TRITON_SHM_DEFAULT_BYTE_SIZE" : "16777216", #"16777216000",
        "SAGEMAKER_TRITON_SHM_GROWTH_BYTE_SIZE": "1048576"
    },
}


In [None]:
model_pacakge_group_response = sm_client.create_model_package_group(
    ModelPackageGroupName=str(model_package_group_name),
    ModelPackageGroupDescription="BERT large uncased Model group for Triton Serving",
)
print(f"Model Registry package group: {model_pacakge_group_response}")


In [None]:
model_package_version_response = sm_client.create_model_package(
    ModelPackageGroupName=str(model_package_group_name),
    ModelPackageDescription="BERT large uncased Model group for Triton Serving",
    Domain=ml_domain,
    Task=ml_task,
    SamplePayloadUrl=s3_sample_data_path_triton,
    InferenceSpecification={
        "Containers": [container_infrec_mme],
        "SupportedRealtimeInferenceInstanceTypes": [
            "ml.g4dn.4xlarge",
            "ml.g4dn.4xlarge",
        ],
        "SupportedContentTypes": ["application/octet-stream"],
        "SupportedResponseMIMETypes": ["application/json"],
    },
)
model_package_version_response

In [None]:
advanced_response = sm_client.create_inference_recommendations_job(
    JobName=advanced_job,
    JobDescription="nlp triton Inference Advanced Recommender Job",
    JobType="Advanced",
    RoleArn=role,
    InputConfig={
        "ModelPackageVersionArn": model_package_version_response["ModelPackageArn"],
        "JobDurationInSeconds": 7200,
        "EndpointConfigurations": [
            #{"InstanceType": "ml.p3.8xlarge"},
            #{"InstanceType": "ml.p3.2xlarge"},
            {"InstanceType": "ml.p2.16xlarge"},
            {"InstanceType": "ml.g4dn.xlarge"},
            {"InstanceType": "ml.g4dn.8xlarge"},
            {"InstanceType": "ml.g4dn.4xlarge"},
            {"InstanceType": "ml.g4dn.2xlarge"},
            {"InstanceType": "ml.g4dn.12xlarge"},
        ],
        "TrafficPattern": {
            "TrafficType": "PHASES",
            "Phases": [
                {
                    "InitialNumberOfUsers": 2,
                    "SpawnRate": 3,
                    "DurationInSeconds": 900,
                },  # simulating 50 users, 2 initial and 3 new users every minute for 16 minutes
            ],  # second phase, we will strt with 50 users, steady traffic for 5 minutes
        },
        "ResourceLimit": {"MaxNumberOfTests": 10, "MaxParallelOfTests": 5},
    },
    StoppingConditions={
        "MaxInvocations": 30000,
        "ModelLatencyThresholds": [{"Percentile": "P95", "ValueInMilliseconds": 500}],
    },
)

print(advanced_response)


In [None]:
%%time

ended = False
while not ended:
    inference_recommender_job = sm_client.describe_inference_recommendations_job(
        JobName=str(advanced_job)
    )
    if inference_recommender_job["Status"] in ["COMPLETED", "STOPPED", "FAILED"]:
        print(f"Inference recommender job status: {inference_recommender_job['Status']} ")
        ended = True
    else:
        print("Inference recommender job in progress")
        time.sleep(300)

if inference_recommender_job["Status"] == "FAILED":
    print("Inference recommender job failed ")
    print("Failed Reason: {}".inference_recommender_job["FailedReason"])
else:
    print("Inference recommender job completed")

## Clean up

In [None]:
sm_client.delete_endpoint(EndpointName=endpoint_name_p5_mme)
sm_client.delete_endpoint_config(EndpointConfigName=endpoint_name_p5_mme)
sm_client.delete_model(ModelName=endpoint_name_p5_mme)