# Deploy gpt-j on SageMaker

Now, we will deploy the model on SageMaker realtime endpoint, which is also trained on SageMaker with deepspeed on multiple nodes.

In [1]:
import sagemaker
import boto3

sess = sagemaker.Session()
role = sagemaker.get_execution_role()

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")


sagemaker role arn: arn:aws:iam::687912291502:role/webui-notebook-stack-ExecutionRole-62U5FV4LJQS
sagemaker bucket: sagemaker-us-west-2-687912291502
sagemaker session region: us-west-2


We trained the Flan-T5-XXL, and the model is saved as BF16 format. We will use Huggingface accelerate to speed up the model inference. 

In [2]:
!mkdir deploy_code

In [3]:
%%writefile deploy_code/requirements.txt
accelerate==0.16.0
transformers==4.26.0
bitsandbytes==0.37.0

Writing deploy_code/requirements.txt


In [4]:
%%writefile deploy_code/serving.properties
engine=DeepSpeed
option.tensor_parallel_degree=8
option.s3url=s3://sagemaker-us-west-2-687912291502/llm/models/gpt-j/deepspeed/

Writing deploy_code/serving.properties


## for local test only

In [4]:
!pip install -r deploy_code/requirements.txt
!pip install https://publish.djl.ai/deepspeed/deepspeed-0.8.0-py2.py3-none-any.whl

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting deepspeed==0.8.0
  Downloading https://publish.djl.ai/deepspeed/deepspeed-0.8.0-py2.py3-none-any.whl (756 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m756.7/756.7 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting ninja
  Downloading ninja-1.11.1-py2.py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (145 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m146.0/146.0 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Collecting hjson
  Downloading hjson-3.1.0-py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.0/54.0 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
Collecting py-cpuinfo
  Downloading py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)
Installing collected packages: py-cpuinfo, ninja, hjson, deepspee

In [7]:
import deepspeed
import torch
import logging
import math
import os
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

tensor_parallel = 8
model_location = "/home/ec2-user/SageMaker/gpt-j-sample/deploy/djl_deepspeed/model/"
logging.info(f"Loading model in {model_location}")

tokenizer = AutoTokenizer.from_pretrained(model_location)
model = AutoModelForCausalLM.from_pretrained(model_location,
                                             low_cpu_mem_usage=True)

data = ["""hello ,how are you ?"""]

parameters = {
  #"early_stopping": True,
  #"length_penalty": 2.0,
  "max_new_tokens": 50,
  "temperature": 0,
  "min_length": 10,
  "no_repeat_ngram_size": 2,
}

model.to(torch.float16)
logging.info(f"Starting DeepSpeed init with TP={tensor_parallel}")
model = deepspeed.init_inference(model,
                                 mp_size=tensor_parallel,
                                 dtype=model.dtype,
                                 replace_method='auto',
                                 replace_with_kernel_inject=True)
tokenizer.pad_token = tokenizer.eos_token
input_tokens = tokenizer.batch_encode_plus(data,
                                           return_tensors="pt",
                                           padding=True)
for t in input_tokens:
    if torch.is_tensor(input_tokens[t]):
        input_tokens[t] = input_tokens[t].to(torch.cuda.current_device())
outputs = model.generate(**input_tokens, **parameters)

[2023-04-24 09:04:53,927] [INFO] [logging.py:68:log_dist] [Rank -1] DeepSpeed info: version=0.8.0+4f7229c, git-hash=4f7229c, git-branch=HEAD
[2023-04-24 09:04:53,929] [INFO] [logging.py:68:log_dist] [Rank -1] quantize_bits = 8 mlp_extra_grouping = False, quantize_groups = 1
[2023-04-24 09:04:53,932] [INFO] [comm.py:639:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment...
[2023-04-24 09:04:57,261] [INFO] [comm.py:693:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=172.16.60.74, master_port=29500
[2023-04-24 09:04:57,264] [INFO] [comm.py:657:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl


In [5]:
%%writefile deploy_code/model.py
from djl_python import Input, Output
import deepspeed
import torch
import logging
import math
import os
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer



def load_model(properties):
    tensor_parallel = properties["tensor_parallel_degree"]
    model_location = properties['model_dir']
    if "model_id" in properties:
        model_location = properties['model_id']
    logging.info(f"Loading model in {model_location}")
    
    tokenizer = AutoTokenizer.from_pretrained(model_location)
    
    model = AutoModelForCausalLM.from_pretrained(model_location,
                                                 low_cpu_mem_usage=True)
    if "dtype" in properties:
        if properties["dtype"] == "float16":
            model.to(torch.float16)
        if properties["dtype"] == "bfloat16":
            model.to(torch.bfloat16)
    
    logging.info(f"Starting DeepSpeed init with TP={tensor_parallel}")
    model = deepspeed.init_inference(model,
                                     mp_size=tensor_parallel,
                                     dtype=model.dtype,
                                     replace_method='auto',
                                     replace_with_kernel_inject=True)
    return model.module, tokenizer


model = None
tokenizer = None
generator = None


def run_inference(model, tokenizer, data, params):
    generate_kwargs = params
    tokenizer.pad_token = tokenizer.eos_token
    input_tokens = tokenizer.batch_encode_plus(data,
                                               return_tensors="pt",
                                               padding=True)
    for t in input_tokens:
        if torch.is_tensor(input_tokens[t]):
            input_tokens[t] = input_tokens[t].to(torch.cuda.current_device())
    outputs = model.generate(**input_tokens, **generate_kwargs)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)


def handle(inputs: Input):
    global model, tokenizer
    if not model:
        model, tokenizer = load_model(inputs.get_properties())

    if inputs.is_empty():
        # Model server makes an empty call to warmup the model on startup
        return None
    data = inputs.get_as_json()
    
    input_sentences = data["inputs"]
    params = data["parameters"]
    
    outputs = run_inference(model, tokenizer, input_sentences, params)
    result = {"outputs": outputs}
    return Output().add_as_json(result)

Writing deploy_code/model.py


We will use LMI (large model inference) container on SageMaker to serve the LLM.

In [23]:
import sagemaker

sess = sagemaker.Session()
region = sess._region_name

inference_image_uri = (
    f"763104351884.dkr.ecr.{region}.amazonaws.com/djl-inference:0.21.0-deepspeed0.8.0-cu117"
)

print(f"Image going to be used is ---- > {inference_image_uri}")

Image going to be used is ---- > 763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.21.0-deepspeed0.8.0-cu117


In [24]:
!rm model-gpt-j-deepspeed.tar.gz
!tar czvf model-gpt-j-deepspeed.tar.gz -C deploy_code .

./
./requirements.txt
./serving.properties
./model.py


In [25]:
s3_code_prefix = 'hf-large-model-djl-deploy/code-gpt-j-deepspeed'
bucket = sess.default_bucket() 
s3_code_artifact = sess.upload_data("model-gpt-j-deepspeed.tar.gz", bucket, s3_code_prefix)
print(f"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}")

S3 Code or Model tar ball uploaded to --- > s3://sagemaker-us-west-2-687912291502/hf-large-model-djl-deploy/code-gpt-j-deepspeed/model-gpt-j-deepspeed.tar.gz


In [29]:
from sagemaker.utils import name_from_base
import boto3
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

model_name = name_from_base(f"code-gpt-j-deepspeed")
print(model_name)
print(f"Image going to be used is ---- > {inference_image_uri}")

create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer={
        "Image": inference_image_uri,
        "ModelDataUrl": s3_code_artifact
    },
    
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

code-gpt-j-deepspeed-2023-04-24-00-45-11-348
Image going to be used is ---- > 763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.21.0-deepspeed0.8.0-cu117
Created Model: arn:aws:sagemaker:us-west-2:687912291502:model/code-gpt-j-deepspeed-2023-04-24-00-45-11-348


In [30]:
endpoint_config_name = f"{model_name}-config-88"
endpoint_name = f"{model_name}-endpoint"

endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[
        {
            "VariantName": "variant1",
            "ModelName": model_name,
            "InstanceType": "ml.g5.48xlarge",
            "InitialInstanceCount": 1,
            #"ModelDataDownloadTimeoutInSeconds": 2400,
            "ContainerStartupHealthCheckTimeoutInSeconds": 2400,
        },
    ],
)
endpoint_config_response

{'EndpointConfigArn': 'arn:aws:sagemaker:us-west-2:687912291502:endpoint-config/code-gpt-j-deepspeed-2023-04-24-00-45-11-348-config-88',
 'ResponseMetadata': {'RequestId': 'af3fcf19-4d00-46ea-98a6-5601eee20bb1',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'af3fcf19-4d00-46ea-98a6-5601eee20bb1',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '135',
   'date': 'Mon, 24 Apr 2023 00:45:11 GMT'},
  'RetryAttempts': 0}}

In [31]:
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=f"{endpoint_name}", EndpointConfigName=endpoint_config_name
)
print(f"Created Endpoint: {create_endpoint_response['EndpointArn']}")

Created Endpoint: arn:aws:sagemaker:us-west-2:687912291502:endpoint/code-gpt-j-deepspeed-2023-04-24-00-45-11-348-endpoint


In [32]:
import time

resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: InService
Arn: arn:aws:sagemaker:us-west-2:687912291502:endpoint/code-gpt-j-deepspeed-2023-04-24-00-45-11-348-endpoint
Status: InService


Use the low level boto3 API to generate context.

In [36]:
%%time
import json
import boto3

smr_client = boto3.client("sagemaker-runtime")

prompts = """Summarize the following news article:
Peter and Elizabeth took a taxi to attend the night party in the city. While in the party, Elizabeth collapsed and was rushed to the hospital.
Since she was diagnosed with a brain injury, the doctor told Peter to stay besides her until she gets well. Therefore, Peter stayed with her at the hospital for 3 days without leaving.
Summary:
"""

prompts = """hello ,how are you ?"""

parameters = {
  #"early_stopping": True,
  #"length_penalty": 2.0,
  "max_new_tokens": 50,
  "temperature": 0,
  "min_length": 10,
  "no_repeat_ngram_size": 2,
}


response_model = smr_client.invoke_endpoint(
            EndpointName=endpoint_name,
            Body=json.dumps(
            {
                "inputs": [prompts],
                "parameters": parameters
            }
            ),
            ContentType="application/json",
        )

response_model['Body'].read().decode('utf8')

CPU times: user 15.5 ms, sys: 0 ns, total: 15.5 ms
Wall time: 1.13 s


'{\n  "outputs":[\n    "hello,how are you??\\n\\nionion,ion ision.ionIONionioionoion\\nIONIONIOionIOIONDEion youion yourion?ionDEIONioION, you areion withion andionionsion:ion toion"\n  ]\n}'