In [None]:
!pip install -U pip awscli boto3 sagemaker transformers==4.21.3

In [None]:
# Show which transformers version the kernel actually imports.
import transformers
transformers.__version__

In [None]:
# Download the packagecloud git-lfs RPM repository script and run it in a root shell.
# NOTE(review): this pipes a remote script straight into `sudo bash` — confirm that is acceptable here.
!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.rpm.sh | sudo bash

In [None]:
# Install the git-lfs package non-interactively (-y).
!sudo yum install git-lfs -y

In [None]:
# Enable git-lfs for this user before cloning the model repository below.
!git lfs install


In [None]:
# Clone the bigscience/bloom-3b repository from the Hugging Face Hub into ./bloom-3b.
!git clone https://huggingface.co/bigscience/bloom-3b

In [None]:
# Create bloom-3b/code, where the cells below write inference.py and requirements.txt.
!mkdir -p bloom-3b/code

In [None]:
# AWS account id per region. The entry for the current region is substituted into the
# ECR URI of the sagemaker-tritonserver image when `triton_image_uri` is built further down.
account_id_map = {
    'us-east-1': '785573368785',
    'us-east-2': '007439368137',
    'us-west-1': '710691900526',
    'us-west-2': '301217895009',
    'eu-west-1': '802834080501',
    'eu-west-2': '205493899709',
    'eu-west-3': '254080097072',
    'eu-north-1': '601324751636',
    'eu-south-1': '966458181534',
    'eu-central-1': '746233611703',
    'ap-east-1': '110948597952',
    'ap-south-1': '763008648453',
    'ap-northeast-1': '941853720454',
    'ap-northeast-2': '151534178276',
    'ap-southeast-1': '324986816169',
    'ap-southeast-2': '355873309152',
    'cn-northwest-1': '474822919863',
    'cn-north-1': '472730292857',
    'sa-east-1': '756306329178',
    'ca-central-1': '464438896020',
    'me-south-1': '836785723513',
    'af-south-1': '774647643957'
}

In [None]:
%%writefile bloom-3b/code/inference.py

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch


def model_fn(model_dir):
    """Load the causal language model and its tokenizer from ``model_dir``.

    The model is loaded with ``device_map="auto"`` and ``load_in_8bit=True``.

    Args:
        model_dir: Directory holding the model and tokenizer files.

    Returns:
        A ``(model, tokenizer)`` pair, in that order.
    """
    print(f"Bloom:LLM:model_fn()::called dir={model_dir}::")
    load_options = {"device_map": "auto", "load_in_8bit": True}
    model_8bit = AutoModelForCausalLM.from_pretrained(model_dir, **load_options)
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    return (model_8bit, tokenizer)


def predict_fn(data, model_and_tokenizer):
    """Generate text for one request.

    Args:
        data: Request payload. Must contain the prompt under ``"inputs"``; every
            other top-level key is forwarded to ``model.generate`` as a keyword
            argument (e.g. ``do_sample``, ``temperature``, ``max_new_tokens``).
        model_and_tokenizer: The ``(model, tokenizer)`` pair returned by ``model_fn``.

    Returns:
        The decoded first generated sequence, with special tokens removed.

    Raises:
        ValueError: If the payload has no ``"inputs"`` key.
    """
    model, tokenizer = model_and_tokenizer
    # Log only the model class: formatting the whole (model, tokenizer) tuple dumps
    # the full module repr into the log on every request.
    print(f"Bloom:LLM:predict_fn()::called model={type(model).__name__}::")

    # Work on a copy so the caller's payload is not mutated by the pop below.
    generate_kwargs = dict(data)
    if "inputs" not in generate_kwargs:
        raise ValueError('request payload must contain an "inputs" key holding the prompt text')
    text = generate_kwargs.pop("inputs")

    encoded_input = tokenizer(text, return_tensors='pt')
    # Place the ids on whatever device the model was loaded on instead of assuming CUDA.
    input_ids = encoded_input['input_ids'].to(model.device)
    output_sequences = model.generate(input_ids=input_ids, **generate_kwargs)
    return tokenizer.decode(output_sequences[0], skip_special_tokens=True)


def torch_predict_fn(input_data, model):
    """Run ``model`` on ``input_data`` on the CPU and return the result.

    Args:
        input_data: Tensor passed to the model; it is moved to the CPU first.
        model: ``torch.nn.Module`` (or scripted module); moved to the CPU and put in eval mode.

    Returns:
        Whatever ``model(input_data)`` returns.
    """
    device = torch.device("cpu")
    model = model.to(device)
    # The original read an undefined name `data` here, which raised NameError.
    input_data = input_data.to(device)
    model.eval()
    try:
        # Two-argument form with a target-device mapping; presumably only accepted by
        # Elastic Inference enabled PyTorch builds — TODO confirm.
        execution_ctx = torch.jit.optimized_execution(True, {"target_device": "eia:0"})
    except TypeError:
        # Stock PyTorch only accepts the boolean flag.
        execution_ctx = torch.jit.optimized_execution(True)
    with execution_ctx:
        output = model(input_data)
    # The original computed the output but never returned it.
    return output

In [None]:
%%writefile bloom-3b/code/requirements.txt
# Extra packages for bloom-3b/code/inference.py.
# NOTE(review): transformers is installed from the tip of `main` (unpinned), so the
# deployed version can differ between builds — consider pinning a release or commit.
bitsandbytes
accelerate
git+https://github.com/huggingface/transformers.git@main#egg=transformers

In [None]:
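# Manual packaging steps (not executed by this notebook; the upload cell below expects ./bloom-3b/model.tar.gz to exist):
# tar --exclude=".git" --exclude=".gitattributes" -zcvf model.tar.gz *
# tar -tf model.tar.gz      (lists the archive contents to verify it)
# -- working directory: /home/ec2-user/SageMaker/bloom-3b

# -- The Triton model repository is in the triton-serve directory:
# - tar --exclude=".git" --exclude=".gitattributes" --exclude="model.tar.gz" -zcvf model.tar.gz triton-serve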

In [None]:
import sagemaker

# SageMaker session and execution role reused by the upload and deployment cells below.
session=sagemaker.Session()
role=sagemaker.get_execution_role()

In [None]:
# Upload the packaged model to the session's default SageMaker bucket instead of a
# hard-coded bucket name, so the notebook works in any account and region.
s3_model_path = sagemaker.s3.S3Uploader().upload(
    local_path="./bloom-3b/model.tar.gz",
    desired_s3_uri=f"s3://{session.default_bucket()}/bloom/models",
    sagemaker_session=session
)
print(s3_model_path)

In [None]:
# Shortcut for re-runs where the archive is already uploaded: rebuild the S3 URI from the
# session's default bucket rather than hard-coding an account-specific bucket name.
s3_model_path = f"s3://{session.default_bucket()}/bloom/models/model.tar.gz"

In [None]:
from sagemaker.utils import name_from_base
from sagemaker.huggingface.model import HuggingFaceModel

model_id = "bloom-3b"
# Endpoint name derived from the model id with a "-bnb" suffix.
endpoint_name = name_from_base(model_id + "-bnb")

# Hugging Face model definition: container versions plus the uploaded model archive.
hf_container_versions = dict(
    transformers_version="4.17",
    pytorch_version="1.10",
    py_version="py38",
)
huggingface_model = HuggingFaceModel(
    model_data=s3_model_path,
    role=role,
    **hf_container_versions,
)



In [None]:
# Deploy the model to a real-time endpoint on a single ml.g4dn.xlarge instance.
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g4dn.xlarge",
    endpoint_name=endpoint_name
)

In [None]:
# `endpoint` is the deprecated SDK v1 attribute name; v2 predictors expose `endpoint_name`.
predictor.endpoint_name

In [None]:
# "inputs" is the prompt; the remaining keys are forwarded to generate() by predict_fn in inference.py.
data = {
    "inputs" : "Transformers with bnb-Int8 work best on",
    "do_sample" : True,
    "temperature" : 0.5
}
res = predictor.predict(data=data)
print(res)

In [None]:
# Same request shape as the previous cell, with a different prompt.
data = {
    "inputs" : "the man worked as a carpenter.", #"Transformers with bnb-Int8 work best on",
    "do_sample" : True,
    "temperature" : 0.5
}
res = predictor.predict(data=data)
print(res)

In [None]:
%%time

text = """This is a creative writing exercise. Below, you'll be given a prompt. Your story should be based on the prompt.

Prompt: A scary story about a haunted mouse
Story: On a dark and stormy night, the mouse crept in the shadows. """

data = {
    "inputs": text,
    "do_sample": True,
    "temperature": 0.7,
    "max_new_tokens":200,
    #"min_tokens": 100,
    "repetition_penalty": 1.1,
    "top_p": 500,

}


#body = json.dumps(data)

res = predictor.predict(data=data)
print(res)

## Deploy on p5 instances

In [None]:
%%writefile triton-serve/bloom-3b/config.pbtxt
# Triton model configuration for the TorchScript model saved under triton-serve/bloom-3b/1/model.pt.
# NOTE(review): the output section (trailing dimension 768 and a second output named "1634__1")
# looks carried over from a BERT-style example — confirm the names and shapes against what the
# traced BLOOM model actually returns.
platform: "pytorch_libtorch"
max_batch_size: 32
# Two INT32 inputs of length 512 each.
input [
  {
    name: "INPUT__0"
    data_type: TYPE_INT32
    dims: [512]
  },
  {
    name: "INPUT__1"
    data_type: TYPE_INT32
    dims: [512]
  }
]
output [
  {
    name: "OUTPUT__0"
    data_type: TYPE_FP32
    dims: [512, 768]
  },
  {
    name: "1634__1"
    data_type: TYPE_FP32
    dims: [768]
  }
]
# Two model instances on GPU.
instance_group [
    {
      count: 2
      kind: KIND_GPU
    }
  ]
dynamic_batching {
   preferred_batch_size: 16
   max_queue_delay_microseconds: 1000
 }


#### Load the model to convert to a .pt state for TRITON server

#### This loads the checkpoint as a plain PyTorch state dict, but Triton needs the model in TorchScript format

In [None]:
import torch
# Weight file that came with the cloned bloom-3b repository.
PATH="./bloom-3b/pytorch_model.bin"
# Per the original note, torch.load returns a collections.OrderedDict here.
model_bin = torch.load(PATH)  # --    collections.OrderedDict



#### This saves the model in TorchScript format, which is what Triton Server needs in order to load it

In [None]:
#!pip install accelerate

In [None]:
import torch
import torchvision

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import json

# Same load as the earlier cell: read the raw weight file into `model_bin`.
# NOTE(review): `torchvision` and the second `import torch` are not used by the cells visible here.
PATH="./bloom-3b/pytorch_model.bin"
model_bin = torch.load(PATH) # collections.OrderedDict


In [None]:
# Size of the first dimension of the word-embedding weight (presumably the vocabulary size).
len(model_bin['word_embeddings.weight'])

In [None]:
# Size of the first layer's input layer-norm weight (presumably the hidden size).
len(model_bin['h.0.input_layernorm.weight'])

In [None]:
from transformers import BloomModel, BloomConfig

# Read the architecture settings from the checkpoint directory's config.json.
# The original passed the weight state dict (`model_bin`) as BloomConfig keyword
# arguments, which yields a default-sized configuration and a randomly initialised model.
# `torchscript=True` prepares the model for torch.jit.trace in the cells below.
configuration = BloomConfig.from_pretrained("./bloom-3b", torchscript=True)

# Build the model and load the pretrained weights from the same directory.
model = BloomModel.from_pretrained("./bloom-3b", config=configuration)

# Accessing the model configuration
configuration = model.config
print(type(configuration))

model.eval()

In [None]:
# `tokens_tensor` and `segments_tensors` were never assigned in the cells above, so this cell
# raised NameError on a fresh kernel. Build them from a tokenized example, padded to the
# 512-token input length declared in config.pbtxt.
# BLOOM has no segment ids, so `segments_tensors` carries the attention mask.
example_encoding = AutoTokenizer.from_pretrained("./bloom-3b")(
    "Transformers with bnb-Int8 work best on",
    padding="max_length",
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
tokens_tensor = example_encoding["input_ids"]
segments_tensors = example_encoding["attention_mask"]

print(tokens_tensor)
segments_tensors

In [None]:
# Displaying the whole state dict would print every weight tensor of the model; show the
# names and shapes of the first few entries instead.
{name: tuple(tensor.shape) for name, tensor in list(model_bin.items())[:10]}

In [None]:
import os

from transformers import AutoModelForCausalLM, AutoTokenizer

# An example input you would normally provide to your model's forward() method.
text =  "Transformers with bnb-Int8 work best on"
tokenizer = AutoTokenizer.from_pretrained("./bloom-3b") # - class type is BloomTokenizerFast
encoded_input = tokenizer(text, return_tensors='pt')
print(type(encoded_input), encoded_input)

PATH="./bloom-3b/pytorch_model.bin"

# - load the model from the checkpoint directory.
# The original serialised the weight tensors with json.dumps (TypeError: tensors are not JSON
# serialisable) and then pointed from_pretrained at that JSON file. from_pretrained expects the
# directory that holds config.json and the weights.
model_8bit = AutoModelForCausalLM.from_pretrained(
    "./bloom-3b", device_map="auto", load_in_8bit=True, torchscript=True
)
# device_map="auto" already places the weights, so the original `.to(device)` call on the
# 8-bit model is dropped.
model_8bit = model_8bit.eval()

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {} device".format(device))

bs = 1
seq_len = 512
dummy_inputs = [
    torch.randint(1000, (bs, seq_len)).to(device),          # token ids
    torch.ones(bs, seq_len, dtype=torch.int).to(device),    # attention mask: attend to every position
]


class _CausalLMTraceWrapper(torch.nn.Module):
    """Expose forward(input_ids, attention_mask) for tracing.

    Passed positionally, the second tensor would land in the model's `past_key_values`
    parameter, so the wrapper forwards both by keyword.
    """

    def __init__(self, wrapped):
        super().__init__()
        self.wrapped = wrapped

    def forward(self, input_ids, attention_mask):
        outputs = self.wrapped(
            input_ids=input_ids,
            attention_mask=attention_mask,
            use_cache=False,
            return_dict=False,
        )
        return outputs[0]


# NOTE(review): tracing a bitsandbytes int8 model is not verified here — confirm it is
# supported, and that config.pbtxt matches the single output this wrapper returns.
traced_model = torch.jit.trace(_CausalLMTraceWrapper(model_8bit), tuple(dummy_inputs))

# Save the TorchScript model; create the version directory first because nothing else does.
os.makedirs("./triton-serve/bloom-3b/1", exist_ok=True)
traced_model.save("./triton-serve/bloom-3b/1/model.pt")

print(":PyTorch:TorchScript:Model:Saved {}".format(traced_model))




In [None]:
import os

# Example inputs for tracing, padded to the 512-token length declared in config.pbtxt.
# BLOOM has no segment ids, so `segments_tensors` carries the attention mask.
trace_encoding = AutoTokenizer.from_pretrained("./bloom-3b")(
    "Transformers with bnb-Int8 work best on",
    padding="max_length",
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
tokens_tensor = trace_encoding["input_ids"]
segments_tensors = trace_encoding["attention_mask"]


class _BloomTraceWrapper(torch.nn.Module):
    """Expose forward(input_ids, attention_mask) for tracing.

    Passed positionally, the second tensor would land in BloomModel's `past_key_values`
    parameter, so the wrapper forwards both by keyword.
    """

    def __init__(self, wrapped):
        super().__init__()
        self.wrapped = wrapped

    def forward(self, input_ids, attention_mask):
        outputs = self.wrapped(
            input_ids=input_ids,
            attention_mask=attention_mask,
            use_cache=False,
            return_dict=False,
        )
        return outputs[0]


traced_script_module = torch.jit.trace(
    _BloomTraceWrapper(model),
    (tokens_tensor, segments_tensors),
    strict=False)

# Save the TorchScript model; create the version directory first because nothing else does.
os.makedirs("./triton-serve/bloom-3b/1", exist_ok=True)
traced_script_module.save("./triton-serve/bloom-3b/1/model.pt")

In [None]:
# S3 prefix that holds every Triton model archive; used as the multi-model endpoint root.
# Built from the session's default bucket instead of a hard-coded, account-specific bucket.
s3_mme_model_path = f"s3://{session.default_bucket()}/bloom/triton_models/"

# Upload the packaged Triton model repository under <prefix>/bloom-3b/.
s3_model_path_triton = sagemaker.s3.S3Uploader().upload(
    local_path="./triton-serve/model.tar.gz",
    desired_s3_uri=f"{s3_mme_model_path}bloom-3b",
    sagemaker_session=session
)
print(s3_model_path_triton)
print(s3_mme_model_path)

In [None]:
#!rm model.tar.gz

In [None]:
# Display the S3 prefix used as the multi-model endpoint root.
s3_mme_model_path

In [None]:
from sagemaker import get_execution_role, Session, image_uris
import boto3

# Region, execution role and the two boto3 clients used by the Triton cells below.
region = boto3.Session().region_name
role = get_execution_role()
sm_client = boto3.client(service_name="sagemaker")
runtime_sm_client = boto3.client("sagemaker-runtime")

# China regions use a different ECR domain suffix.
if region.startswith("cn-"):
    base = "amazonaws.com.cn"
else:
    base = "amazonaws.com"

# ECR URI of the SageMaker Triton image, using the per-region account id from account_id_map.
triton_image_uri = (
    f"{account_id_map[region]}.dkr.ecr.{region}.{base}/sagemaker-tritonserver:22.07-py3"
)
print(triton_image_uri)

## Standard Single Model

In [None]:
# Display the multi-model S3 prefix again for reference.
s3_mme_model_path

**Single Model from EXACT s3 location**

In [None]:
# One generated name is reused below for the model, the endpoint config and the endpoint.
endpoint_name_p5 = name_from_base(f"p5-{model_id}-bnb")
print(endpoint_name_p5)

# Container definition: the Triton image plus the single model archive uploaded above.
# NOTE(review): SAGEMAKER_PROGRAM / SAGEMAKER_SUBMIT_DIRECTORY look like script-mode settings —
# confirm the Triton image reads them.
container_p5 = {
    'Image': triton_image_uri,
    'ModelDataUrl': s3_model_path_triton,
    'Environment': {
        'SAGEMAKER_PROGRAM' : 'inference.py',
        'SAGEMAKER_SUBMIT_DIRECTORY' : 'code',
        'SAGEMAKER_TRITON_DEFAULT_MODEL_NAME': 'bloom-3b',
        "SAGEMAKER_TRITON_BATCH_SIZE": "16",
        "SAGEMAKER_TRITON_MAX_BATCH_DELAY": "1000",
        "SAGEMAKER_TRITON_SHM_DEFAULT_BYTE_SIZE" : "16777216000", #"16777216",
        "SAGEMAKER_TRITON_SHM_GROWTH_BYTE_SIZE": "1048576"
    }
}
create_model_response = sm_client.create_model(
    ModelName=endpoint_name_p5, ExecutionRoleArn=role, PrimaryContainer=container_p5
)
print(create_model_response)

**SingleModel end point config**

In [None]:
# Endpoint config with a single production variant: one ml.g5.8xlarge instance serving
# the model created above.
create_endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_name_p5,
    ProductionVariants=[
        {
            "InstanceType": "ml.g5.8xlarge", #"ml.g4dn.xlarge",
            "InitialVariantWeight": 1,
            "InitialInstanceCount": 1,
            "ModelName": endpoint_name_p5,
            "VariantName": "AllTraffic",
        }
    ],
)

print("Endpoint Config Arn: " + create_endpoint_config_response["EndpointConfigArn"])


**Finally create the end point -- SINGLE model**

In [None]:
# Create the endpoint from the config above; the polling cell that follows watches its status.
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name_p5, EndpointConfigName=endpoint_name_p5
)

print("Endpoint Arn: " + create_endpoint_response["EndpointArn"])


In [None]:
import time

# Poll once a minute until the endpoint leaves the "Creating" state.
resp = sm_client.describe_endpoint(EndpointName=endpoint_name_p5)
status = resp["EndpointStatus"]
print("SINGLE:Model:endpoint:Triton:Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name_p5)
    status = resp["EndpointStatus"]
    print("Single:model:triton:Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Single:model:triton:Status: " + status)

# Stop here if creation did not succeed, instead of letting the invoke cell fail later
# with a less helpful error.
if status != "InService":
    raise RuntimeError(
        "Endpoint {} ended in status {}: {}".format(
            endpoint_name_p5, status, resp.get("FailureReason", "no failure reason reported")
        )
    )

**Now Invoke the Single Model**

In [None]:
%%time
import json

text = """This is a creative writing exercise. Below, you'll be given a prompt. Your story should be based on the prompt.

Prompt: A scary story about a haunted mouse
Story: On a dark and stormy night, the mouse crept in the shadows. """

payload = {
    "inputs": [
        {
            "name": "INPUT__0",
            "data": text,
            "do_sample": True,
            "temperature": 0.7,
            "max_new_tokens":200,
            #"min_tokens": 100,
            "repetition_penalty": 1.1,
            "top_p": 500,

        }
    ]
}


response = runtime_sm_client.invoke_endpoint(
    EndpointName=endpoint_name_p5, ContentType="text/csv", 
    Body=json.dumps(payload),
)

print(json.loads(response["Body"].read().decode("utf8")))


**SINGLE Model Clean up**

In [None]:
# Delete the endpoint, then its config, then the model; all three share the name endpoint_name_p5.
sm_client.delete_endpoint(EndpointName=endpoint_name_p5)
sm_client.delete_endpoint_config(EndpointConfigName=endpoint_name_p5)
sm_client.delete_model(ModelName=endpoint_name_p5)

## START TRITON MME 

#### MME container

**1. Create The MME container**

In [None]:
# Generate a fresh name for the multi-model resources; this rebinds endpoint_name_p5.
endpoint_name_p5 = name_from_base(f"p5-{model_id}-bnb")
print(endpoint_name_p5)

# Multi-model container: ModelDataUrl is the S3 prefix holding the model archives
# (s3_mme_model_path) rather than a single archive, and Mode is "MultiModel".
container_p5 = {
    'Image': triton_image_uri,
    'ModelDataUrl': s3_mme_model_path,
    "Mode" : "MultiModel",
    'Environment': {
        'SAGEMAKER_TRITON_DEFAULT_MODEL_NAME': 'bloom-3b',
        "SAGEMAKER_TRITON_BATCH_SIZE": "16",
        "SAGEMAKER_TRITON_MAX_BATCH_DELAY": "1000"
    }
}
create_model_response = sm_client.create_model(
    ModelName=endpoint_name_p5, ExecutionRoleArn=role, PrimaryContainer=container_p5
)
print(create_model_response)

In [None]:
# Display the name generated for the multi-model resources.
endpoint_name_p5

**2. Create The MME End Point CONFIG P5**

In [None]:
# Endpoint config for the multi-model endpoint: one ml.g4dn.xlarge instance, single variant.
create_endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_name_p5,
    ProductionVariants=[
        {
            "InstanceType": "ml.g4dn.xlarge",
            "InitialVariantWeight": 1,
            "InitialInstanceCount": 1,
            "ModelName": endpoint_name_p5,
            "VariantName": "AllTraffic",
        }
    ],
)

print("Endpoint Config Arn: " + create_endpoint_config_response["EndpointConfigArn"])


**3. Create The MME ENDPoint P5**

In [None]:
# Create the multi-model endpoint from the config above.
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name_p5, EndpointConfigName=endpoint_name_p5
)

print("Endpoint Arn: " + create_endpoint_response["EndpointArn"])


In [None]:
import time

# Poll once a minute until the endpoint leaves the "Creating" state.
resp = sm_client.describe_endpoint(EndpointName=endpoint_name_p5)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name_p5)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

# Stop here if creation did not succeed, instead of letting the invoke cell fail later
# with a less helpful error.
if status != "InService":
    raise RuntimeError(
        "Endpoint {} ended in status {}: {}".format(
            endpoint_name_p5, status, resp.get("FailureReason", "no failure reason reported")
        )
    )

#### Now invoke the MME end point

In [None]:
# Path of the model archive relative to the multi-model S3 prefix (s3_mme_model_path).
prefix = "bloom-3b/model.tar.gz"

In [None]:
%%time
import json

text = """This is a creative writing exercise. Below, you'll be given a prompt. Your story should be based on the prompt.

Prompt: A scary story about a haunted mouse
Story: On a dark and stormy night, the mouse crept in the shadows. """

payload = {
    "inputs": [
        {
            "name": "INPUT__0",
            "data": text,
            "do_sample": True,
            "temperature": 0.7,
            "max_new_tokens":200,
            #"min_tokens": 100,
            "repetition_penalty": 1.1,
            "top_p": 500,

        }
    ]
}


response = runtime_sm_client.invoke_endpoint(
    EndpointName=endpoint_name_p5, ContentType="text/csv", 
    Body=json.dumps(payload),
    TargetModel="bloom-3b/model.tar.gz",
)

print(json.loads(response["Body"].read().decode("utf8")))


## Clean up MME

In [None]:
# Delete the multi-model endpoint, then its config, then the model; all share the name endpoint_name_p5.
sm_client.delete_endpoint(EndpointName=endpoint_name_p5)
sm_client.delete_endpoint_config(EndpointConfigName=endpoint_name_p5)
sm_client.delete_model(ModelName=endpoint_name_p5)

### Now predict on P5

In [None]:
%%time

text = """This is a creative writing exercise. Below, you'll be given a prompt. Your story should be based on the prompt.

Prompt: A scary story about a haunted mouse
Story: On a dark and stormy night, the mouse crept in the shadows. """

data = {
    "inputs": text,
    "do_sample": True,
    "temperature": 0.7,
    "max_new_tokens":200,
    #"min_tokens": 100,
    "repetition_penalty": 1.1,
    "top_p": 500,

}


#body = json.dumps(data)

res = predictor_p5.predict(data=data)
print(res)

In [None]:
%%time
# NOTE(review): `predictor_p5` is not assigned in any cell visible in this notebook.
data = {
    "inputs" : "the man worked as a carpenter.", #"Transformers with bnb-Int8 work best on",
    "do_sample" : True,
    "temperature" : 0.5
}
res = predictor_p5.predict(data=data)
print(res)

In [None]:
%%time
# NOTE(review): `predictor_p5` is not assigned in any cell visible in this notebook.
data = {
    "inputs" : "Transformers with bnb-Int8 work best on",
    "do_sample" : True,
    "temperature" : 0.5
}
res = predictor_p5.predict(data=data)
print(res)

In [None]:
# NOTE(review): `predictor_p5` is not assigned in any cell visible in this notebook, and the
# endpoint behind `predictor` (created by the deploy cell) is not deleted in the cells visible
# here — confirm which endpoint this is meant to remove.
predictor_p5.delete_endpoint()

## The boto3 way to do this

In [None]:
%%time

text = """This is a creative writing exercise. Below, you'll be given a prompt. Your story should be based on the prompt.

Prompt: A scary story about a haunted mouse
Story: On a dark and stormy night, the mouse crept in the shadows. """

parameters = {
    "do_sample": True,
    "temperature": 0.7,
    "max_new_tokens":200,
    #"min_tokens": 100,
    "repetition_penalty": 1.1,
    "top_p": 500,
    }

data = {
    "inputs": {
        "text_inputs": text,
        "parameters": parameters
    }
}


body = json.dumps(data)


response = sagemaker_runtime.invoke_endpoint( 
        EndpointName=endpoint_name, 
        Body = body, 
        ContentType = 'application/json'
)