In [None]:
!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.rpm.sh | sudo bash

In [None]:
!sudo yum install git-lfs -y

In [None]:
!git lfs install


In [None]:
!git clone https://huggingface.co/bigscience/bloom-3b

In [None]:
!mkdir -p bloom-3b/code

In [7]:
account_id_map = {
    'us-east-1': '785573368785',
    'us-east-2': '007439368137',
    'us-west-1': '710691900526',
    'us-west-2': '301217895009',
    'eu-west-1': '802834080501',
    'eu-west-2': '205493899709',
    'eu-west-3': '254080097072',
    'eu-north-1': '601324751636',
    'eu-south-1': '966458181534',
    'eu-central-1': '746233611703',
    'ap-east-1': '110948597952',
    'ap-south-1': '763008648453',
    'ap-northeast-1': '941853720454',
    'ap-northeast-2': '151534178276',
    'ap-southeast-1': '324986816169',
    'ap-southeast-2': '355873309152',
    'cn-northwest-1': '474822919863',
    'cn-north-1': '472730292857',
    'sa-east-1': '756306329178',
    'ca-central-1': '464438896020',
    'me-south-1': '836785723513',
    'af-south-1': '774647643957'
}

In [65]:
%%writefile bloom-3b/code/inference.py

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch


def model_fn(model_dir):
    print(f"Bloom:LLM:model_fn()::called dir={model_dir}::")
    model_8bit = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", load_in_8bit=True)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    return model_8bit, tokenizer


def predict_fn(data, model_and_tokenizer):
    print(f"Bloom:LLM:predict_fn()::called dir={model_and_tokenizer}::")
    model, tokenizer = model_and_tokenizer
    text = data.pop("inputs", data)
    encoded_input = tokenizer(text, return_tensors='pt')
    output_sequences = model.generate(input_ids=encoded_input['input_ids'].cuda(), **data)
    return tokenizer.decode(output_sequences[0], skip_special_tokens=True)


def torch_predict_fn(input_data, model):
    device = torch.device("cpu")
    model = model.to(device)
    input_data = data.to(device)
    model.eval()
    with torch.jit.optimized_execution(True, {"target_device": "eia:0"}):
        output = model(input_data)

Overwriting bloom-3b/code/inference.py


In [66]:
%%writefile bloom-3b/code/requirements.txt
bitsandbytes
accelerate
git+https://github.com/huggingface/transformers.git@main#egg=transformers

Overwriting bloom-3b/code/requirements.txt


In [None]:
# tar --exclude=".git" --exclude=".gitattributes" -zcvf model.tar.gz *
# tar -tf model.tar.gz
# -- /home/ec2-user/SageMaker/bloom-3b

# -- TRITON Is at triton_serve
# - tar --exclude=".git" --exclude=".gitattributes" --exclude="model.tar.gz" -zcvf model.tar.gz triton-serve

In [68]:
import sagemaker

session=sagemaker.Session()
role=sagemaker.get_execution_role()

In [69]:
s3_model_path = sagemaker.s3.S3Uploader().upload(
    local_path="./bloom-3b/model.tar.gz",
    desired_s3_uri="s3://sagemaker-us-east-1-622343165275/bloom/models",
    sagemaker_session=session
)
print(s3_model_path)

s3://sagemaker-us-east-1-622343165275/bloom/models/model.tar.gz


In [56]:
s3_model_path='s3://sagemaker-us-east-1-622343165275/bloom/models/model.tar.gz'

In [70]:
from sagemaker.utils import name_from_base
from sagemaker.huggingface.model import HuggingFaceModel

model_id='bloom-3b'
endpoint_name = name_from_base(f"{model_id}-bnb")

# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
    model_data=s3_model_path,
    role=role,
    transformers_version="4.17",
    pytorch_version="1.10",
    py_version='py38',
)



In [None]:
# deploy the endpoint endpoint
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g4dn.xlarge",
    endpoint_name=endpoint_name
)

-------------

In [75]:
predictor.endpoint

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


'bloom-3b-bnb-2022-09-07-22-12-51-451'

In [77]:
data = {
    "inputs" : "Transformers with bnb-Int8 work best on",
    "do_sample" : True,
    "temperature" : 0.5
}
res = predictor.predict(data=data)
print(res)

Transformers with bnb-Int8 work best on Linux.
If you want to use a b


In [25]:
data = {
    "inputs" : "the man worked as a carpenter.", #"Tramsformers with bnb-Int8 work best on",
    "do_sample" : True,
    "temperature" : 0.5
}
res = predictor.predict(data=data)
print(res)

the man worked as a carpenter. He was a tall, thin, wiry
man,


In [29]:
%%time

text = """This is a creative writing exercise. Below, you'll be given a prompt. Your story should be based on the prompt.

Prompt: A scary story about a haunted mouse
Story: On a dark and stormy night, the mouse crept in the shadows. """

data = {
    "inputs": text,
    "do_sample": True,
    "temperature": 0.7,
    "max_new_tokens":200,
    #"min_tokens": 100,
    "repetition_penalty": 1.1,
    "top_p": 500,

}


#body = json.dumps(data)

res = predictor.predict(data=data)
print(res)

This is a creative writing exercise. Below, you'll be given a prompt. Your story should be based on the prompt.

Prompt: A scary story about a haunted mouse
Story: On a dark and stormy night, the mouse crept in the shadows.  He was scared.  
What happened to him? What did he think?  How would you describe his behavior?

Hints:

Don't forget that your characters are living.
Be sure to include a description of their appearance and activities.
Keep details short so they won't get lost when you write more material later.
A good idea is to include some humor as well.

A:

The answer is actually quite simple :)

 The mouse has no hair

and 

 it lives in a hole (or under a rock, or something like this).

because the mouse doesn't have any fur but it does live in a place where its hair might fall out due to rain etc
CPU times: user 4.83 ms, sys: 383 µs, total: 5.22 ms
Wall time: 13 s


## Deploy on p5 instances

In [18]:
%%writefile triton-serve/bloom-3b/config.pbtxt
platform: "pytorch_libtorch"
max_batch_size: 32
input [
  {
    name: "INPUT__0"
    data_type: TYPE_INT32
    dims: [512]
  },
  {
    name: "INPUT__1"
    data_type: TYPE_INT32
    dims: [512]
  }
]
output [
  {
    name: "OUTPUT__0"
    data_type: TYPE_FP32
    dims: [512, 768]
  },
  {
    name: "1634__1"
    data_type: TYPE_FP32
    dims: [768]
  }
]
instance_group [
    {
      count: 2
      kind: KIND_GPU
    }
  ]
dynamic_batching {
   preferred_batch_size: 16
   max_queue_delay_microseconds: 1000
 }


Overwriting triton-serve/bloom-3b/config.pbtxt


#### Load the model to convert to a .pt state for TRITON server

#### This saves as PyTorch model we need torchscript model format

In [None]:
import torch
PATH="./bloom-3b/pytorch_model.bin"
model_bin = torch.load(PATH)  # --    collections.OrderedDict
#torch.save(model_bin,"./triton-serve/bloom-3b/1/pytorch_model.pt")
torch.save(model_bin,"./triton-serve/bloom-3b/1/model.pt")


#### This saves as a PyTorch SCRIPT mode based model which is what we need for the model to load in TritonServer

In [60]:
#!pip install accelerate

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com, https://pypi.ngc.nvidia.com
Collecting accelerate
  Downloading accelerate-0.12.0-py3-none-any.whl (143 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.0/144.0 KB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.12.0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p38/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [78]:
import torch
import torchvision

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import json

PATH="./bloom-3b/pytorch_model.bin"
model_bin = torch.load(PATH)


In [87]:
from transformers import BloomModel, BloomConfig

# Initializing a Bloom configuration
configuration = BloomConfig(**model_bin)

# Initializing a model from the configuration
model = BloomModel(configuration)

# Accessing the model configuration
configuration = model.config
print(type(configuration))

model.eval()

<class 'transformers.models.bloom.configuration_bloom.BloomConfig'>


BloomModel(
  (word_embeddings): Embedding(250880, 64)
  (word_embeddings_layernorm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
  (h): ModuleList(
    (0): BloomBlock(
      (input_layernorm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (self_attention): BloomAttention(
        (query_key_value): Linear(in_features=64, out_features=192, bias=True)
        (dense): Linear(in_features=64, out_features=64, bias=True)
        (attention_dropout): Dropout(p=0.0, inplace=False)
      )
      (post_attention_layernorm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mlp): BloomMLP(
        (dense_h_to_4h): Linear(in_features=64, out_features=256, bias=True)
        (dense_4h_to_h): Linear(in_features=256, out_features=64, bias=True)
        (gelu_impl): BloomGelu()
      )
    )
    (1): BloomBlock(
      (input_layernorm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (self_attention): BloomAttention(
        (query_key_value): Linear(in_fe

In [123]:
print(tokens_tensor)
segments_tensors

tensor([[  6168,    632,    267, 113695,  21624,  44001,     17, 138829,     15,
          30845,    722,   8885,    267,  39841,     17,  32465,  26143,   3403,
            722,  11173,    664,    368,  39841,   6149,  55061,   1309,     29,
            419,   3359,   1912,  26143,   3638,    267,   1207, 160174,  35184,
            189, 146903,     29,   4867,    267,  32046,    530,  55379,     92,
          19783,     15,    368,  35184,   2214,   1309,    361,    368, 228895,
           1865]])


tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1]])

In [126]:
text

"This is a creative writing exercise. Below, you'll be given a prompt. Your story should be based on the prompt.\n\nPrompt: A scary story about a haunted mouse\nStory: On a dark and stormy night, the mouse crept in the shadows. "

In [125]:

# An example input you would normally provide to your model's forward() method.
data = {
    "inputs" : "Transformers with bnb-Int8 work best on",
    "do_sample" : True,
    "temperature" : 0.5
}
tokenizer = AutoTokenizer.from_pretrained("./bloom-3b")
encoded_input = tokenizer(text, return_tensors='pt').convert_to_tensors()
print(type(encoded_input), encoded_input)

tokens_tensor = encoded_input['input_ids']
segments_tensors = encoded_input['attention_mask']

# -- works only on GPU devices
#output_sequences = model.generate(input_ids=encoded_input['input_ids'].cuda(), **data)
#print(output_sequences)
#return tokenizer.decode(output_sequences[0], skip_special_tokens=True)

# Use torch.jit.trace to generate a torch.jit.ScriptModule via tracing.
traced_script_module = torch.jit.trace(
    model, 
    [tokens_tensor, segments_tensors] ,
    strict=False)

# Save the TorchScript model
traced_script_module.save("./triton-serve/bloom-3b/1/model.pt")


<class 'transformers.tokenization_utils_base.BatchEncoding'> {'input_ids': tensor([[  6168,    632,    267, 113695,  21624,  44001,     17, 138829,     15,
          30845,    722,   8885,    267,  39841,     17,  32465,  26143,   3403,
            722,  11173,    664,    368,  39841,   6149,  55061,   1309,     29,
            419,   3359,   1912,  26143,   3638,    267,   1207, 160174,  35184,
            189, 146903,     29,   4867,    267,  32046,    530,  55379,     92,
          19783,     15,    368,  35184,   2214,   1309,    361,    368, 228895,
           1865]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1]])}


IndexError: tuple index out of range

In [127]:
import torch
from transformers import BloomTokenizerFast, BloomForCausalLM

tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom")
model = BloomForCausalLM.from_pretrained("bigscience/bloom")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs, labels=inputs["input_ids"])
loss = outputs.loss
logits = outputs.logits
print(loss)
print(logits)
print(outputs)
print(len(tokens_tensor))



Downloading tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/13.8M [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading pytorch_model_00003-of-00072.bin:   0%|          | 0.00/4.59G [00:00<?, ?B/s]

Downloading pytorch_model_00004-of-00072.bin:   0%|          | 0.00/4.59G [00:00<?, ?B/s]

Downloading pytorch_model_00005-of-00072.bin:   0%|          | 0.00/4.59G [00:00<?, ?B/s]

Downloading pytorch_model_00006-of-00072.bin:   0%|          | 0.00/4.59G [00:00<?, ?B/s]

Downloading pytorch_model_00007-of-00072.bin:   0%|          | 0.00/4.59G [00:00<?, ?B/s]

Downloading pytorch_model_00008-of-00072.bin:   0%|          | 0.00/4.59G [00:00<?, ?B/s]

Downloading pytorch_model_00009-of-00072.bin:   0%|          | 0.00/4.59G [00:00<?, ?B/s]

Downloading pytorch_model_00010-of-00072.bin:   0%|          | 0.00/4.59G [00:00<?, ?B/s]

Downloading pytorch_model_00011-of-00072.bin:   0%|          | 0.00/4.59G [00:00<?, ?B/s]

OSError: [Errno 28] No space left on device

In [None]:
traced_script_module = torch.jit.trace(
    model, 
    [tokens_tensor, segments_tensors] ,
    strict=False)

# Save the TorchScript model
traced_script_module.save("./triton-serve/bloom-3b/1/model.pt")

In [99]:
s3_model_path_triton = sagemaker.s3.S3Uploader().upload(
    local_path="./triton-serve/model.tar.gz",
    desired_s3_uri="s3://sagemaker-us-east-1-622343165275/bloom/triton_models/bloom-3b",
    sagemaker_session=session
)
s3_mme_model_path='s3://sagemaker-us-east-1-622343165275/bloom/triton_models/'
print(s3_model_path_triton)
print(s3_mme_model_path)

s3://sagemaker-us-east-1-622343165275/bloom/triton_models/bloom-3b/model.tar.gz
s3://sagemaker-us-east-1-622343165275/bloom/triton_models/


In [105]:
#!rm model.tar.gz

In [102]:
s3_mme_model_path

's3://sagemaker-us-east-1-622343165275/bloom/triton_models/'

In [103]:
from sagemaker import get_execution_role, Session, image_uris
import boto3
region = boto3.Session().region_name
role = get_execution_role()
sm_client = boto3.client(service_name="sagemaker")
runtime_sm_client = boto3.client("sagemaker-runtime")

base = "amazonaws.com.cn" if region.startswith("cn-") else "amazonaws.com"
triton_image_uri = "{account_id}.dkr.ecr.{region}.{base}/sagemaker-tritonserver:22.07-py3".format(
    account_id=account_id_map[region], region=region, base=base
)
print(triton_image_uri)

785573368785.dkr.ecr.us-east-1.amazonaws.com/sagemaker-tritonserver:22.07-py3


## Standard Single Model

In [104]:
s3_mme_model_path

's3://sagemaker-us-east-1-622343165275/bloom/triton_models/'

**Single Model from EXACT s3 location**

In [105]:
endpoint_name_p5 = name_from_base(f"p5-{model_id}-bnb")
print(endpoint_name_p5)

container_p5 = {
    'Image': triton_image_uri,
    'ModelDataUrl': s3_model_path_triton,
    'Environment': {
        'SAGEMAKER_PROGRAM' : 'inference.py',
        'SAGEMAKER_SUBMIT_DIRECTORY' : 'code',
        'SAGEMAKER_TRITON_DEFAULT_MODEL_NAME': 'bloom-3b',
        "SAGEMAKER_TRITON_BATCH_SIZE": "16",
        "SAGEMAKER_TRITON_MAX_BATCH_DELAY": "1000",
        "SAGEMAKER_TRITON_SHM_DEFAULT_BYTE_SIZE" : "16777216000", #"16777216",
        "SAGEMAKER_TRITON_SHM_GROWTH_BYTE_SIZE": "1048576"
    }
}
create_model_response = sm_client.create_model(
    ModelName=endpoint_name_p5, ExecutionRoleArn=role, PrimaryContainer=container_p5
)
print(create_model_response)

p5-bloom-3b-bnb-2022-09-07-22-54-22-926
{'ModelArn': 'arn:aws:sagemaker:us-east-1:622343165275:model/p5-bloom-3b-bnb-2022-09-07-22-54-22-926', 'ResponseMetadata': {'RequestId': '7a121ad0-d5f9-4ef8-b931-cf4aa0cabde3', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '7a121ad0-d5f9-4ef8-b931-cf4aa0cabde3', 'content-type': 'application/x-amz-json-1.1', 'content-length': '101', 'date': 'Wed, 07 Sep 2022 22:54:22 GMT'}, 'RetryAttempts': 0}}


**SingleModel end point config**

In [106]:
create_endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_name_p5,
    ProductionVariants=[
        {
            "InstanceType": "ml.g5.8xlarge", #"ml.g4dn.xlarge",
            "InitialVariantWeight": 1,
            "InitialInstanceCount": 1,
            "ModelName": endpoint_name_p5,
            "VariantName": "AllTraffic",
        }
    ],
)

print("Endpoint Config Arn: " + create_endpoint_config_response["EndpointConfigArn"])


Endpoint Config Arn: arn:aws:sagemaker:us-east-1:622343165275:endpoint-config/p5-bloom-3b-bnb-2022-09-07-22-54-22-926


**Finally create the end point -- SINGLE model**

In [107]:
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name_p5, EndpointConfigName=endpoint_name_p5
)

print("Endpoint Arn: " + create_endpoint_response["EndpointArn"])


Endpoint Arn: arn:aws:sagemaker:us-east-1:622343165275:endpoint/p5-bloom-3b-bnb-2022-09-07-22-54-22-926


In [108]:
import time
resp = sm_client.describe_endpoint(EndpointName=endpoint_name_p5)
status = resp["EndpointStatus"]
print("SINGLE:Model:endpoint:Triton:Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name_p5)
    status = resp["EndpointStatus"]
    print("Single:model:triton:Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Single:model:triton:Status: " + status)

SINGLE:Model:endpoint:Triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: Creating
Single:model:triton:Status: F

**Now Invoke the Single Model**

In [None]:
%%time
import json

text = """This is a creative writing exercise. Below, you'll be given a prompt. Your story should be based on the prompt.

Prompt: A scary story about a haunted mouse
Story: On a dark and stormy night, the mouse crept in the shadows. """

payload = {
    "inputs": [
        {
            "name": "INPUT__0",
            "data": text,
            "do_sample": True,
            "temperature": 0.7,
            "max_new_tokens":200,
            #"min_tokens": 100,
            "repetition_penalty": 1.1,
            "top_p": 500,

        }
    ]
}


response = runtime_sm_client.invoke_endpoint(
    EndpointName=endpoint_name_p5, ContentType="text/csv", 
    Body=json.dumps(payload),
)

print(json.loads(response["Body"].read().decode("utf8")))


**SINGLE Model Clean up**

In [109]:
sm_client.delete_endpoint(EndpointName=endpoint_name_p5)
sm_client.delete_endpoint_config(EndpointConfigName=endpoint_name_p5)
sm_client.delete_model(ModelName=endpoint_name_p5)

{'ResponseMetadata': {'RequestId': '475d36bc-b27e-463c-8cfb-ee3ee1eca0d1',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '475d36bc-b27e-463c-8cfb-ee3ee1eca0d1',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Wed, 07 Sep 2022 23:27:13 GMT'},
  'RetryAttempts': 0}}

## START TRITON MME 

#### MME container

**1. Create The MME container**

In [120]:
endpoint_name_p5 = name_from_base(f"p5-{model_id}-bnb")
print(endpoint_name_p5)

container_p5 = {
    'Image': triton_image_uri,
    'ModelDataUrl': s3_mme_model_path,
    "Mode" : "MultiModel",
    'Environment': {
        'SAGEMAKER_TRITON_DEFAULT_MODEL_NAME': 'bloom-3b',
        "SAGEMAKER_TRITON_BATCH_SIZE": "16",
        "SAGEMAKER_TRITON_MAX_BATCH_DELAY": "1000"
    }
}
create_model_response = sm_client.create_model(
    ModelName=endpoint_name_p5, ExecutionRoleArn=role, PrimaryContainer=container_p5
)
print(create_model_response)

p5-bloom-3b-bnb-2022-09-05-06-17-07-707
{'ModelArn': 'arn:aws:sagemaker:us-east-1:622343165275:model/p5-bloom-3b-bnb-2022-09-05-06-17-07-707', 'ResponseMetadata': {'RequestId': '89e5122a-b2dd-4cb3-8944-480cb0bbeabf', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '89e5122a-b2dd-4cb3-8944-480cb0bbeabf', 'content-type': 'application/x-amz-json-1.1', 'content-length': '101', 'date': 'Mon, 05 Sep 2022 06:17:07 GMT'}, 'RetryAttempts': 0}}


In [121]:
endpoint_name_p5

'p5-bloom-3b-bnb-2022-09-05-06-17-07-707'

**2. Create The MME End Point CONFIG P5**

In [122]:
create_endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_name_p5,
    ProductionVariants=[
        {
            "InstanceType": "ml.g4dn.xlarge",
            "InitialVariantWeight": 1,
            "InitialInstanceCount": 1,
            "ModelName": endpoint_name_p5,
            "VariantName": "AllTraffic",
        }
    ],
)

print("Endpoint Config Arn: " + create_endpoint_config_response["EndpointConfigArn"])


Endpoint Config Arn: arn:aws:sagemaker:us-east-1:622343165275:endpoint-config/p5-bloom-3b-bnb-2022-09-05-06-17-07-707


**3. Create The MME ENDPoint P5**

In [123]:
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name_p5, EndpointConfigName=endpoint_name_p5
)

print("Endpoint Arn: " + create_endpoint_response["EndpointArn"])


Endpoint Arn: arn:aws:sagemaker:us-east-1:622343165275:endpoint/p5-bloom-3b-bnb-2022-09-05-06-17-07-707


In [124]:
import time
resp = sm_client.describe_endpoint(EndpointName=endpoint_name_p5)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name_p5)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: InService
Arn: arn:aws:sagemaker:us-east-1:622343165275:endpoint/p5-bloom-3b-bnb-2022-09-05-06-17-07-707
Status: InService


#### Now invoke the MME end point

In [125]:
prefix = "bloom-3b/model.tar.gz"

In [None]:
%%time
import json

text = """This is a creative writing exercise. Below, you'll be given a prompt. Your story should be based on the prompt.

Prompt: A scary story about a haunted mouse
Story: On a dark and stormy night, the mouse crept in the shadows. """

payload = {
    "inputs": [
        {
            "name": "INPUT__0",
            "data": text,
            "do_sample": True,
            "temperature": 0.7,
            "max_new_tokens":200,
            #"min_tokens": 100,
            "repetition_penalty": 1.1,
            "top_p": 500,

        }
    ]
}


response = runtime_sm_client.invoke_endpoint(
    EndpointName=endpoint_name_p5, ContentType="text/csv", 
    Body=json.dumps(payload),
    TargetModel="bloom-3b/model.tar.gz",
)

print(json.loads(response["Body"].read().decode("utf8")))


## Clean up MME

In [128]:
sm_client.delete_endpoint(EndpointName=endpoint_name_p5)
sm_client.delete_endpoint_config(EndpointConfigName=endpoint_name_p5)
sm_client.delete_model(ModelName=endpoint_name_p5)

{'ResponseMetadata': {'RequestId': 'a6cda7b5-60f7-4e27-8b72-9d2e9080561d',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'a6cda7b5-60f7-4e27-8b72-9d2e9080561d',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Mon, 05 Sep 2022 06:30:27 GMT'},
  'RetryAttempts': 0}}

### Now predict on P5

In [33]:
%%time

text = """This is a creative writing exercise. Below, you'll be given a prompt. Your story should be based on the prompt.

Prompt: A scary story about a haunted mouse
Story: On a dark and stormy night, the mouse crept in the shadows. """

data = {
    "inputs": text,
    "do_sample": True,
    "temperature": 0.7,
    "max_new_tokens":200,
    #"min_tokens": 100,
    "repetition_penalty": 1.1,
    "top_p": 500,

}


#body = json.dumps(data)

res = predictor_p5.predict(data=data)
print(res)

This is a creative writing exercise. Below, you'll be given a prompt. Your story should be based on the prompt.

Prompt: A scary story about a haunted mouse
Story: On a dark and stormy night, the mouse crept in the shadows.  He was scared... but he had to stay inside because it was too cold outside.
The mouse was alone for hours. He could hear the wind howling and the rain shaking him down. Then... all of a sudden...
What did you do?
Follow your nose... and go find out what it smells like.

Hints:

You can use flashbacks or descriptions to help us imagine where the hero is.
Using pictures will help us picture and remember places we have never been before.
It's best if the mouse's own words are used.

A:

I think this one might be good:  

 The mouse crept through the darkness of the night into his little hole (a cave) that he shared with his sister.  They were scared by the storm in their tree house, but they knew there would always be food at the bottom just beyond the thicket.  
 Whe

In [37]:
%%time
data = {
    "inputs" : "the man worked as a carpenter.", #"Tramsformers with bnb-Int8 work best on",
    "do_sample" : True,
    "temperature" : 0.5
}
res = predictor_p5.predict(data=data)
print(res)

the man worked as a carpenter. He was a handsome young man, and had
a very
CPU times: user 3.74 ms, sys: 1.68 ms, total: 5.41 ms
Wall time: 3.12 s


In [39]:
%%time
data = {
    "inputs" : "Transformers with bnb-Int8 work best on",
    "do_sample" : True,
    "temperature" : 0.5
}
res = predictor_p5.predict(data=data)
print(res)

Transformers with bnb-Int8 work best on this.

With the above, it is possible
CPU times: user 4.88 ms, sys: 0 ns, total: 4.88 ms
Wall time: 3.23 s


In [41]:
predictor_p5.delete_endpoint()

## BOTO 3 way to do this

In [None]:
%%time

text = """This is a creative writing exercise. Below, you'll be given a prompt. Your story should be based on the prompt.

Prompt: A scary story about a haunted mouse
Story: On a dark and stormy night, the mouse crept in the shadows. """

parameters = {
    "do_sample": True,
    "temperature": 0.7,
    "max_new_tokens":200,
    #"min_tokens": 100,
    "repetition_penalty": 1.1,
    "top_p": 500,
    }

data = {
    "inputs": {
        "text_inputs": text,
        "parameters": parameters
    }
}


body = json.dumps(data)


response = sagemaker_runtime.invoke_endpoint( 
        EndpointName=endpoint_name, 
        Body = body, 
        ContentType = 'application/json'
)