In [1]:
!pip install boto3==1.24.68

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting boto3==1.24.68
  Downloading boto3-1.24.68-py3-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.5/132.5 KB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: boto3
  Attempting uninstall: boto3
    Found existing installation: boto3 1.24.79
    Uninstalling boto3-1.24.79:
      Successfully uninstalled boto3-1.24.79
Successfully installed boto3-1.24.68
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [2]:
%%sh
# Pull the deepjavalibrary/djl-serving image tagged 0.18.0-deepspeed into the
# local Docker image cache; it is re-tagged and pushed to ECR further down.
docker pull deepjavalibrary/djl-serving:0.18.0-deepspeed

0.18.0-deepspeed: Pulling from deepjavalibrary/djl-serving
d5fd17ec1767: Pulling fs layer
602a45a9c0c5: Pulling fs layer
e1bae4c1f40f: Pulling fs layer
d9d586ab2510: Pulling fs layer
2b44adc78060: Pulling fs layer
cd4d84563a60: Pulling fs layer
e19a4e23074d: Pulling fs layer
a69bd65705b8: Pulling fs layer
7145f7b4815b: Pulling fs layer
e367c0f08642: Pulling fs layer
eee5fed2d5ca: Pulling fs layer
46b22735db66: Pulling fs layer
dc2cc42a02f3: Pulling fs layer
41ae858b91d1: Pulling fs layer
d46f0903e5a5: Pulling fs layer
3cb1e2965cce: Pulling fs layer
0391b9b1de9f: Pulling fs layer
d4d8280fcfac: Pulling fs layer
eee5fed2d5ca: Waiting
46b22735db66: Waiting
dc2cc42a02f3: Waiting
41ae858b91d1: Waiting
d46f0903e5a5: Waiting
3cb1e2965cce: Waiting
0391b9b1de9f: Waiting
d4d8280fcfac: Waiting
d9d586ab2510: Waiting
2b44adc78060: Waiting
cd4d84563a60: Waiting
e19a4e23074d: Waiting
a69bd65705b8: Waiting
7145f7b4815b: Waiting
e367c0f08642: Waiting
602a45a9c0c5: Verifying Checksum
602a45a9c0c5: Downlo

In [3]:
# List the local Docker images to confirm the djl-serving image was pulled.
!docker images

REPOSITORY                                                                   TAG                IMAGE ID       CREATED        SIZE
catboost-sagemaker-multimodel                                                latest             befea775c8ef   12 hours ago   1.35GB
874199810560.dkr.ecr.us-east-1.amazonaws.com/catboost-sagemaker-multimodel   latest             befea775c8ef   12 hours ago   1.35GB
ubuntu                                                                       18.04              35b3f4f76a24   3 weeks ago    63.1MB
deepjavalibrary/djl-serving                                                  0.18.0-deepspeed   c2edba9c6d73   2 months ago   12.7GB


In [4]:
%%sh

# Stop at the first failing command so that a failed lookup, login or tag is
# never followed by a push.
set -e

# Name of the ECR repository that will hold the image
img=djl_deepspeed

# Account ID of the caller, used to build the registry hostname
account=$(aws sts get-caller-identity --query Account --output text)

# Region defined in the current AWS CLI configuration
region=$(aws configure get region)

registry="${account}.dkr.ecr.${region}.amazonaws.com"
fullname="${registry}/${img}:latest"

# Create the repository in ECR only if it does not exist yet.
if ! aws ecr describe-repositories --repository-names "${img}" > /dev/null 2>&1
then
    aws ecr create-repository --repository-name "${img}" > /dev/null
fi

# Authenticate Docker against the registry host (not the image URI).
aws ecr get-login-password --region "${region}" | docker login --username AWS --password-stdin "${registry}"

# Nothing is built here: the DJL Serving image pulled earlier is re-tagged with
# the ECR name and pushed.
docker tag deepjavalibrary/djl-serving:0.18.0-deepspeed "${fullname}"
docker push "${fullname}"

Login Succeeded
The push refers to repository [874199810560.dkr.ecr.us-east-1.amazonaws.com/djl_deepspeed]
42737ae997a9: Preparing
4e257e38b819: Preparing
b16fc1914e14: Preparing
7e20e12fff52: Preparing
01b6a2323efe: Preparing
55df8d287560: Preparing
61d781c1452a: Preparing
dbc3bf935e02: Preparing
1a5fac543081: Preparing
a8d0c4c62eef: Preparing
7ed9a71261c7: Preparing
a1eeba43cdbe: Preparing
6127942867a5: Preparing
e592fe6d10a9: Preparing
f42691182163: Preparing
68016c5bb65c: Preparing
8034550a3bbe: Preparing
bf8cedc62fb3: Preparing
7ed9a71261c7: Waiting
a1eeba43cdbe: Waiting
6127942867a5: Waiting
e592fe6d10a9: Waiting
f42691182163: Waiting
68016c5bb65c: Waiting
8034550a3bbe: Waiting
bf8cedc62fb3: Waiting
61d781c1452a: Waiting
dbc3bf935e02: Waiting
1a5fac543081: Waiting
a8d0c4c62eef: Waiting
55df8d287560: Waiting
01b6a2323efe: Pushed
7e20e12fff52: Pushed
b16fc1914e14: Pushed
61d781c1452a: Pushed
55df8d287560: Pushed
dbc3bf935e02: Pushed
a8d0c4c62eef: Pushed
a1eeba43cdbe: Pushed
4e257e3

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



#### Write model.py

In [None]:
# Create the djl/ directory that the following %%writefile cells write into
# (-p makes this a no-op when the directory already exists).
!mkdir -p djl

In [5]:
%%writefile djl/model.py

import os
from typing import Optional

import deepspeed
import torch
from djl_python import Input, Output
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

predictor = None


def get_model():
    """Build the GPT-J text-generation pipeline on top of a DeepSpeed engine.

    The tensor-parallel degree is read from ``TENSOR_PARALLEL_DEGREE``
    (default ``"2"``) and the target device index from ``LOCAL_RANK``
    (default ``"0"``).

    Returns:
        The ``transformers`` text-generation pipeline wrapping the
        DeepSpeed-initialised model and its tokenizer.
    """
    checkpoint = "EleutherAI/gpt-j-6B"
    mp_degree = int(os.getenv("TENSOR_PARALLEL_DEGREE", "2"))
    rank = int(os.getenv("LOCAL_RANK", "0"))

    # Load the float32 revision of the checkpoint together with its tokenizer.
    base_model = AutoModelForCausalLM.from_pretrained(
        checkpoint, revision="float32", torch_dtype=torch.float32
    )
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # Hand the model to DeepSpeed, keeping the dtype it was loaded with.
    engine = deepspeed.init_inference(
        base_model,
        mp_size=mp_degree,
        dtype=base_model.dtype,
        replace_method="auto",
        replace_with_kernel_inject=True,
    )

    return pipeline(
        task="text-generation", model=engine, tokenizer=tokenizer, device=rank
    )


def handle(inputs: Input) -> Optional[Output]:
    """Entry point called by the model server for every request.

    The generation pipeline is created on the first call and cached in the
    module-level ``predictor`` for later calls.

    Args:
        inputs: The incoming request. Its payload is read as a string and
            used as the generation prompt.

    Returns:
        ``None`` for an empty request, otherwise an ``Output`` carrying the
        pipeline result.
    """
    global predictor
    # Compare against None explicitly so the cache check does not depend on
    # the truthiness of the pipeline object.
    if predictor is None:
        predictor = get_model()

    if inputs.is_empty():
        # Model server makes an empty call to warmup the model on startup
        return None

    data = inputs.get_as_string()
    # `min_tokens` is not a generate() argument; depending on the transformers
    # version it is silently ignored or rejected. `min_length` is the
    # long-standing parameter for a lower bound on the sequence length
    # (prompt plus generated tokens).
    result = predictor(data, do_sample=True, min_length=200, max_new_tokens=256)
    return Output().add(result)

Writing djl/model.py


In [6]:
%%writefile djl/serving.properties
# Engine name handed to DJL Serving for this model.
# NOTE(review): "Rubikon" is presumably the engine name expected by the
# 0.18.0-deepspeed image - confirm against the DJL Serving docs for that version.
engine = Rubikon
# Lower and upper bound on the number of GPU workers for this model.
# NOTE(review): presumably each worker uses TENSOR_PARALLEL_DEGREE GPUs - confirm.
gpu.minWorkers=1
gpu.maxWorkers=2

Writing djl/serving.properties


In [7]:
import sagemaker, boto3

# Resolve the account, region and default bucket from the active SageMaker session.
session = sagemaker.Session()
account = session.account_id()
region = session.boto_region_name

# URI of the image pushed to ECR above (repository "djl_deepspeed", tag "latest").
img = "djl_deepspeed"
fullname = f"{account}.dkr.ecr.{region}.amazonaws.com/{img}:latest"

# S3 prefix under which the model tarball is uploaded.
bucket = session.default_bucket()
path = f"s3://{bucket}/DEMO-djl-big-model"

#### Tar structure

In [9]:
%%sh

# Always start from a fresh archive; -f keeps the very first run (when no
# archive exists yet) from reporting an error.
rm -f gpt-j.tar.gz

# Bundle the djl/ directory (model.py and serving.properties) into the tarball.
tar -czvf gpt-j.tar.gz djl/


djl/
djl/model.py
djl/serving.properties


In [10]:
# Upload the tarball under the S3 prefix and show the resulting object URI.
model_s3_url = sagemaker.s3.S3Uploader.upload(
    local_path="gpt-j.tar.gz",
    desired_s3_uri=path,
    kms_key=None,
    sagemaker_session=session,
)
model_s3_url

's3://sagemaker-us-east-1-874199810560/DEMO-djl-big-model/gpt-j.tar.gz'

In [18]:
from datetime import datetime

sm_client = boto3.client("sagemaker")

# One timestamp is shared by the model, endpoint-config and endpoint names.
time_stamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
model_name = f"gpt-j-{time_stamp}"

# Container definition: the ECR image, the model tarball in S3, and the
# tensor-parallel degree passed to the container as an environment variable.
primary_container = {
    "Image": fullname,
    "ModelDataUrl": model_s3_url,
    "Environment": {"TENSOR_PARALLEL_DEGREE": "2"},
}

create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=session.get_caller_identity_arn(),
    PrimaryContainer=primary_container,
)

Now we create an endpoint configuration that SageMaker hosting services uses to deploy models. Note that we configured ModelDataDownloadTimeoutInSeconds and ContainerStartupHealthCheckTimeoutInSeconds to accommodate the large size of our model.

In [19]:
initial_instance_count = 1
# The model is created with TENSOR_PARALLEL_DEGREE="2", so it is sharded across
# two GPUs. ml.p3.2xlarge exposes a single GPU and cannot host it;
# ml.g5.12xlarge provides four GPUs. "ml.g5.48xlarge" (eight GPUs) also works.
instance_type = "ml.g5.12xlarge"
variant_name = "AllTraffic"
endpoint_config_name = "t-j-config-" + time_stamp

production_variants = [
    {
        "VariantName": variant_name,
        "ModelName": model_name,
        "InitialInstanceCount": initial_instance_count,
        "InstanceType": instance_type,
        # Generous timeouts: the container has to download and load a
        # multi-gigabyte model before it passes its health check.
        "ModelDataDownloadTimeoutInSeconds": 1800,
        "ContainerStartupHealthCheckTimeoutInSeconds": 3600,
    }
]

endpoint_config = {
    "EndpointConfigName": endpoint_config_name,
    "ProductionVariants": production_variants,
}

ep_conf_res = sm_client.create_endpoint_config(**endpoint_config)
ep_conf_res

{'EndpointConfigArn': 'arn:aws:sagemaker:us-east-1:874199810560:endpoint-config/t-j-config-2022-10-02-19-45-08',
 'ResponseMetadata': {'RequestId': '952b2a36-2f43-4177-bb36-0ae71d44af93',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '952b2a36-2f43-4177-bb36-0ae71d44af93',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '111',
   'date': 'Sun, 02 Oct 2022 19:45:08 GMT'},
  'RetryAttempts': 0}}

In [20]:
# Create the endpoint from the configuration defined above; the response
# (including the endpoint ARN) is displayed as the cell output.
endpoint_name = f"gpt-j{time_stamp}"
ep_res = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)
ep_res

{'EndpointArn': 'arn:aws:sagemaker:us-east-1:874199810560:endpoint/gpt-j2022-10-02-19-45-08',
 'ResponseMetadata': {'RequestId': '7644b098-7d61-4595-b513-6adb218b04c5',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '7644b098-7d61-4595-b513-6adb218b04c5',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '92',
   'date': 'Sun, 02 Oct 2022 19:45:09 GMT'},
  'RetryAttempts': 0}}

In [None]:
print("Waiting for {} endpoint to be in service...".format(endpoint_name))
waiter = sm_client.get_waiter("endpoint_in_service")
# The default waiter gives up after 120 polls at 30 s (60 minutes). The variant
# allows up to 1800 s of model download plus 3600 s of container start-up, so
# the polling budget is raised to 240 polls (120 minutes) to outlast both.
waiter.wait(
    EndpointName=endpoint_name,
    WaiterConfig={"Delay": 30, "MaxAttempts": 240},
)

print("Created {} endpoint is in Service and ready to invoke ...".format(endpoint_name))

Waiting for gpt-j2022-10-02-19-45-08 endpoint to be in service...


In [None]:
import json

client = boto3.client("sagemaker-runtime")

# MIME type of the request body and the prompt sent for inference.
content_type = "text/plain"
payload = "Amazon.com is the best"

request_args = {
    "EndpointName": endpoint_name,
    "ContentType": content_type,
    "Body": payload,
}
response = client.invoke_endpoint(**request_args)

# The body is a stream; read it once and print the raw bytes.
print(response["Body"].read())

#### Clean up

In [None]:
# boto3 client methods accept keyword arguments only; a positional call raises
# TypeError. Delete the endpoint first, then the endpoint configuration and the
# model created earlier so that nothing is left behind.
sm_client.delete_endpoint(EndpointName=endpoint_name)
sm_client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
sm_client.delete_model(ModelName=model_name)