In [1]:
!pip install boto3==1.24.68

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting boto3==1.24.68
  Downloading boto3-1.24.68-py3-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.5/132.5 KB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: boto3
  Attempting uninstall: boto3
    Found existing installation: boto3 1.24.79
    Uninstalling boto3-1.24.79:
      Successfully uninstalled boto3-1.24.79
Successfully installed boto3-1.24.68
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [2]:
%%sh
# Fetch the DJL Serving image (tag 0.18.0-deepspeed); a later cell re-tags it and pushes it to ECR.
docker pull deepjavalibrary/djl-serving:0.18.0-deepspeed

0.18.0-deepspeed: Pulling from deepjavalibrary/djl-serving
d5fd17ec1767: Pulling fs layer
602a45a9c0c5: Pulling fs layer
e1bae4c1f40f: Pulling fs layer
d9d586ab2510: Pulling fs layer
2b44adc78060: Pulling fs layer
cd4d84563a60: Pulling fs layer
e19a4e23074d: Pulling fs layer
a69bd65705b8: Pulling fs layer
7145f7b4815b: Pulling fs layer
e367c0f08642: Pulling fs layer
eee5fed2d5ca: Pulling fs layer
46b22735db66: Pulling fs layer
dc2cc42a02f3: Pulling fs layer
41ae858b91d1: Pulling fs layer
d46f0903e5a5: Pulling fs layer
3cb1e2965cce: Pulling fs layer
0391b9b1de9f: Pulling fs layer
d4d8280fcfac: Pulling fs layer
eee5fed2d5ca: Waiting
46b22735db66: Waiting
dc2cc42a02f3: Waiting
41ae858b91d1: Waiting
d46f0903e5a5: Waiting
3cb1e2965cce: Waiting
0391b9b1de9f: Waiting
d4d8280fcfac: Waiting
d9d586ab2510: Waiting
2b44adc78060: Waiting
cd4d84563a60: Waiting
e19a4e23074d: Waiting
a69bd65705b8: Waiting
7145f7b4815b: Waiting
e367c0f08642: Waiting
602a45a9c0c5: Verifying Checksum
602a45a9c0c5: Downlo

In [3]:
# List local images to confirm the djl-serving pull completed.
!docker images

REPOSITORY                                                                   TAG                IMAGE ID       CREATED        SIZE
catboost-sagemaker-multimodel                                                latest             befea775c8ef   12 hours ago   1.35GB
874199810560.dkr.ecr.us-east-1.amazonaws.com/catboost-sagemaker-multimodel   latest             befea775c8ef   12 hours ago   1.35GB
ubuntu                                                                       18.04              35b3f4f76a24   3 weeks ago    63.1MB
deepjavalibrary/djl-serving                                                  0.18.0-deepspeed   c2edba9c6d73   2 months ago   12.7GB


In [4]:
%%sh

# Name of the ECR repository the image is published to
img=djl_deepspeed

# AWS account that owns the target ECR registry
account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration
region=$(aws configure get region)

# docker login expects the registry host; the image reference adds repository and tag
registry="${account}.dkr.ecr.${region}.amazonaws.com"
fullname="${registry}/${img}:latest"

# If the repository doesn't exist in ECR, create it.
if ! aws ecr describe-repositories --repository-names "${img}" > /dev/null 2>&1
then
    aws ecr create-repository --repository-name "${img}" > /dev/null
fi

# Authenticate the local Docker client against the ECR registry
aws ecr get-login-password --region "${region}" | docker login --username AWS --password-stdin "${registry}"

# Re-tag the public DJL Serving image pulled earlier (nothing is built here) and push it to ECR
docker tag deepjavalibrary/djl-serving:0.18.0-deepspeed "${fullname}"

docker push "${fullname}"

Login Succeeded
The push refers to repository [874199810560.dkr.ecr.us-east-1.amazonaws.com/djl_deepspeed]
42737ae997a9: Preparing
4e257e38b819: Preparing
b16fc1914e14: Preparing
7e20e12fff52: Preparing
01b6a2323efe: Preparing
55df8d287560: Preparing
61d781c1452a: Preparing
dbc3bf935e02: Preparing
1a5fac543081: Preparing
a8d0c4c62eef: Preparing
7ed9a71261c7: Preparing
a1eeba43cdbe: Preparing
6127942867a5: Preparing
e592fe6d10a9: Preparing
f42691182163: Preparing
68016c5bb65c: Preparing
8034550a3bbe: Preparing
bf8cedc62fb3: Preparing
7ed9a71261c7: Waiting
a1eeba43cdbe: Waiting
6127942867a5: Waiting
e592fe6d10a9: Waiting
f42691182163: Waiting
68016c5bb65c: Waiting
8034550a3bbe: Waiting
bf8cedc62fb3: Waiting
61d781c1452a: Waiting
dbc3bf935e02: Waiting
1a5fac543081: Waiting
a8d0c4c62eef: Waiting
55df8d287560: Waiting
01b6a2323efe: Pushed
7e20e12fff52: Pushed
b16fc1914e14: Pushed
61d781c1452a: Pushed
55df8d287560: Pushed
dbc3bf935e02: Pushed
a8d0c4c62eef: Pushed
a1eeba43cdbe: Pushed
4e257e3

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



#### Write model.py

In [39]:
# Staging directory; its contents are archived into gpt-j.tar.gz further down.
!mkdir -p djl

In [None]:
import gc
import math
import os
import logging
import torch
import deepspeed
from djl_python import Input, Output
from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from transformers.utils import is_offline_mode


### Model loading and instantiating on GPUs
def get_repo_root(model_name_or_path, revision=None):
    """Return the snapshot location reported by ``snapshot_download`` for a repo.

    Args:
        model_name_or_path: Repository identifier handed to ``snapshot_download``.
        revision: Optional revision forwarded unchanged to ``snapshot_download``.

    Returns:
        The value returned by ``snapshot_download`` for the requested snapshot.
    """
    # When transformers reports offline mode, restrict the lookup to local files.
    offline = bool(is_offline_mode())
    if offline:
        logging.info("Offline mode: forcing local_files_only=True")

    # Request every file of the repository (pattern "*").
    return snapshot_download(
        model_name_or_path,
        allow_patterns=["*"],
        local_files_only=offline,
        revision=revision,
    )


def load_model():
    """Load the pre-sharded int8 BLOOM checkpoint through DeepSpeed-Inference.

    Reads the tensor-parallel degree from the ``TENSOR_PARALLEL_DEGREE``
    environment variable (default ``1``), initialises the distributed backend,
    builds the model skeleton on the ``meta`` device and lets
    ``deepspeed.init_inference`` load the checkpoint described by
    ``ds_inference_config.json`` in the downloaded repository.

    Returns:
        tuple: ``(model, tokenizer)`` where ``model`` is the ``.module``
        attribute of the object returned by ``deepspeed.init_inference``.
    """
    tensor_parallel = int(os.getenv('TENSOR_PARALLEL_DEGREE', '1'))
    model_name = "microsoft/bloom-deepspeed-inference-int8"
    # Set up the NCCL process group before any DeepSpeed engine is created.
    deepspeed.init_distributed("nccl")
    logging.info(f"Loading the model {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    config = AutoConfig.from_pretrained(model_name)
    # Construct model with fake meta tensors, later will be replaced during ds-inference ckpt load
    # NOTE(review): OnDevice is given float16 while from_config is given bfloat16 - confirm this mix is intended.
    with deepspeed.OnDevice(dtype=torch.float16, device="meta"):
        model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16)
    model = model.eval()
    torch.cuda.empty_cache()

    ### Deepspeed-Inference Loading
    repo_root = get_repo_root(model_name)
    # tp presharded repos come with their own checkpoints config file
    checkpoints_json = os.path.join(repo_root, "ds_inference_config.json")
    model = deepspeed.init_inference(
        model,
        mp_size=tensor_parallel,
        base_dir=repo_root,
        dtype=torch.int8,
        checkpoint=checkpoints_json,
        replace_with_kernel_inject=True)
    # Release cached GPU memory and Python garbage before reporting memory usage.
    torch.cuda.empty_cache()
    gc.collect()
    deepspeed.runtime.utils.see_memory_usage("post-ds-inference-init", force=True)
    # Hand back the wrapped module rather than the DeepSpeed engine object.
    model = model.module
    return model, tokenizer


def batch_generation(batch_size):
    """Return the first ``batch_size`` prompts from a fixed list of eight.

    When more than eight prompts are requested, the list is repeated as often
    as needed before slicing, so prompts recur in the same order.

    Args:
        batch_size: Number of prompts wanted; applied as a plain slice bound.

    Returns:
        list[str]: The selected prompts.
    """
    prompts = [
        "DeepSpeed is a machine learning framework",
        "He is working on",
        "He has a",
        "He got all",
        "Everyone is happy and I can",
        "The new movie that got Oscar this year",
        "In the far far distance from our galaxy,",
        "Peace is the only way",
    ]
    available = len(prompts)
    # Repeat the canned list only when the request exceeds what is available.
    repeats = math.ceil(batch_size / available) if batch_size > available else 1
    return (prompts * repeats)[:batch_size]


# Populated on the first handle() call and reused afterwards.
model = None
tokenizer = None


def handle(inputs: Input):
    """Entry point called by the model server for each request.

    The request JSON must provide ``batch_size`` (number of canned prompts to
    run) and ``text_length`` (used both as ``min_length`` and as
    ``max_new_tokens``). Generation is greedy (``do_sample=False``).

    Returns:
        ``None`` for an empty request, otherwise an ``Output`` carrying the
        decoded generations as JSON.
    """
    global model, tokenizer
    if not model:
        # Nothing cached yet: load the weights once and keep them at module level.
        model, tokenizer = load_model()

    # Model server makes an empty call to warmup the model on startup
    if inputs.is_empty():
        return None

    request = inputs.get_as_json()
    num_prompts = request["batch_size"]
    new_tokens = request["text_length"]

    prompts = batch_generation(num_prompts)
    encoded = tokenizer.batch_encode_plus(prompts, return_tensors="pt", padding=True)
    # Move every tensor entry onto the current CUDA device.
    for name in encoded:
        value = encoded[name]
        if torch.is_tensor(value):
            encoded[name] = value.to(torch.cuda.current_device())

    generated = model.generate(
        **encoded,
        min_length=new_tokens,
        max_new_tokens=new_tokens,
        do_sample=False,
    )
    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    return Output().add_as_json(decoded)

In [40]:
%%writefile djl/model.py

from djl_python import Input, Output
import os
import deepspeed
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

predictor = None

def _get_model_bloom_176(model_dir=None):
    """Build a text-generation pipeline from an 8-bit checkpoint spread over GPUs.

    The original body referenced an undefined ``model_dir`` and therefore
    raised ``NameError`` on any call; the location is now an optional argument.

    Args:
        model_dir: Local directory or Hub id of the checkpoint. Defaults to
            ``"EleutherAI/gpt-j-6B"``, the id this function already declared.

    Returns:
        A ``transformers`` text-generation pipeline wrapping the
        DeepSpeed-initialised model.
    """
    if model_dir is None:
        model_dir = "EleutherAI/gpt-j-6B"
    tensor_parallel = int(os.getenv("TENSOR_PARALLEL_DEGREE", "2"))
    local_rank = int(os.getenv("LOCAL_RANK", "0"))

    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    print("Bloom:176:LLM:Tokenizer is initialized")
    model = AutoModelForCausalLM.from_pretrained(
        model_dir,
        device_map="auto",
        load_in_8bit=True,
        # Device 0 gets no weight budget; devices 1-7 get 26GIB each.
        max_memory={
            0: "0GIB",
            1: "26GIB",
            2: "26GIB",
            3: "26GIB",
            4: "26GIB",
            5: "26GIB",
            6: "26GIB",
            7: "26GIB",
        },
    )
    model = deepspeed.init_inference(
        model,
        mp_size=tensor_parallel,
        dtype=model.dtype,
        replace_method="auto",
        replace_with_kernel_inject=True,
    )
    generator = pipeline(
        task="text-generation", model=model, tokenizer=tokenizer, device=local_rank
    )
    return generator
    


def get_model():
    """Create the GPT-J text-generation pipeline served by this module.

    ``TENSOR_PARALLEL_DEGREE`` (default ``"2"``) sets DeepSpeed's ``mp_size``
    and ``LOCAL_RANK`` (default ``"0"``) selects the pipeline device.

    Returns:
        A ``transformers`` text-generation pipeline around the
        DeepSpeed-initialised model.
    """
    hub_id = "EleutherAI/gpt-j-6B"
    tp_degree = int(os.getenv("TENSOR_PARALLEL_DEGREE", "2"))
    rank = int(os.getenv("LOCAL_RANK", "0"))

    # Load the float32 revision of the weights, then the matching tokenizer.
    base_model = AutoModelForCausalLM.from_pretrained(
        hub_id,
        revision="float32",
        torch_dtype=torch.float32,
    )
    text_tokenizer = AutoTokenizer.from_pretrained(hub_id)

    # Hand the model to DeepSpeed with kernel injection, keeping the loaded dtype.
    engine = deepspeed.init_inference(
        base_model,
        mp_size=tp_degree,
        dtype=base_model.dtype,
        replace_method="auto",
        replace_with_kernel_inject=True,
    )
    return pipeline(
        task="text-generation",
        model=engine,
        tokenizer=text_tokenizer,
        device=rank,
    )


def handle(inputs: Input) -> "Output | None":
    """Serve one text-generation request.

    The pipeline is created on the first call and cached in the module-level
    ``predictor``.

    Args:
        inputs: Request object; its payload is read as a plain string prompt.

    Returns:
        ``None`` for the empty warm-up call, otherwise an ``Output`` holding
        the pipeline result.
    """
    global predictor
    if predictor is None:
        predictor = get_model()

    if inputs.is_empty():
        # Model server makes an empty call to warmup the model on startup
        return None

    data = inputs.get_as_string()
    # `min_tokens` is not a generate() argument (ignored or rejected depending on
    # the transformers version); `min_length` is the supported lower bound.
    result = predictor(data, do_sample=True, min_length=200, max_new_tokens=256)
    return Output().add(result)

Overwriting djl/model.py


In [41]:
%%writefile djl/serving.properties
# NOTE(review): confirm "Rubikon" is the engine name expected by the 0.18.0-deepspeed image
engine = Rubikon
# Minimum and maximum GPU workers are both set to 1
gpu.minWorkers=1
gpu.maxWorkers=1

Overwriting djl/serving.properties


In [42]:
# Sub-directory that receives inference.py in the next cell.
!mkdir -p djl/code

In [43]:
%%writefile djl/code/inference.py
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import json

import boto3
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

s3_client = boto3.client("s3")

def download_files(files, input_path):
    """Download S3 objects into a local directory using a pool of five threads.

    Args:
        files: Iterable of ``"<bucket>/<key>"`` strings.
        input_path: Local directory; each object is stored under the last
            path segment of its key.

    Raises:
        Exception: Whatever the second download attempt raises for an object.
    """

    def _download_file(s3_path, input_path):
        """Fetch one object, retrying once after a one-second pause."""
        # Everything before the first "/" is the bucket, the rest is the key.
        bucket, _, key = s3_path.partition("/")
        local_file_path = os.path.join(input_path, s3_path.split("/")[-1])

        try:
            s3_client.download_file(bucket, key, local_file_path)
        except Exception:
            # Retry ordinary failures once; KeyboardInterrupt/SystemExit are
            # deliberately not caught so shutdown is not delayed by a retry.
            time.sleep(1)
            s3_client.download_file(bucket, key, local_file_path)

        return local_file_path

    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(_download_file, file, input_path) for file in files]
        for future in as_completed(futures):
            # result() re-raises any exception from the worker thread.
            print(f"Bloom:176:LLM:downloaded: {future.result()}")


def model_fn(model_dir):
    """Download the model artifacts from S3 and load them in 8-bit across GPUs.

    The bucket and key prefix come from the ``MODEL_S3_BUCKET`` and
    ``MODEL_S3_PREFIX`` environment variables. Files are placed in
    ``/tmp/model``; the ``model_dir`` argument is ignored, as in the original.

    Args:
        model_dir: Unused; kept for the hosting toolkit's calling convention.

    Returns:
        tuple: ``(model, tokenizer)``.

    Raises:
        ValueError: If either environment variable is missing.
        FileNotFoundError: If no objects exist under the configured prefix.
    """
    bucket = os.environ.get("MODEL_S3_BUCKET")
    key_prefix = os.environ.get("MODEL_S3_PREFIX")
    print(f"Bloom:176:LLM:bucket={bucket}::key={key_prefix}")
    if not bucket or key_prefix is None:
        raise ValueError(
            "MODEL_S3_BUCKET and MODEL_S3_PREFIX must be set in the container environment"
        )

    model_dir = "/tmp/model"
    os.makedirs(model_dir, exist_ok=True)

    # Paginate: a single list call returns at most 1000 keys and omits
    # "Contents" altogether when nothing matches the prefix.
    paginator = s3_client.get_paginator("list_objects_v2")
    s3_paths = [
        f"{bucket}/{obj['Key']}"
        for page in paginator.paginate(Bucket=bucket, Prefix=key_prefix)
        for obj in page.get("Contents", [])
        # Skip zero-byte "folder" placeholder keys; they are not files.
        if not obj["Key"].endswith("/")
    ]
    if not s3_paths:
        raise FileNotFoundError(f"No objects found under s3://{bucket}/{key_prefix}")

    print("Bloom:176:LLM:downloading files")
    download_files(s3_paths, model_dir)
    print("Bloom:176:LLM:downloading finished")

    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    print("Bloom:176:LLM:Tokenizer is initialized")
    model = AutoModelForCausalLM.from_pretrained(
        model_dir,
        device_map="auto",
        load_in_8bit=True,
        # Device 0 gets no weight budget; devices 1-7 get 26GIB each.
        max_memory={
            0: "0GIB",
            1: "26GIB",
            2: "26GIB",
            3: "26GIB",
            4: "26GIB",
            5: "26GIB",
            6: "26GIB",
            7: "26GIB",
        },
    )
    # Inference only: disable gradients and switch to eval mode.
    model.requires_grad_(False)
    model.eval()
    print("Bloom:176:LLM:Loaded the model")
    return model, tokenizer


def predict_fn(data, model_and_tokenizer):
    """Generate a continuation for the prompt carried by ``data``.

    Args:
        data: Either a dict with the prompt under ``"inputs"`` (JSON requests)
            or the raw prompt as ``str``/``bytes`` (any other content type).
        model_and_tokenizer: The ``(model, tokenizer)`` pair from ``model_fn``.

    Returns:
        str: The first generated sequence decoded without special tokens.

    Raises:
        ValueError: If no text prompt can be extracted from ``data``.
    """
    print("Bloom:176:LLM:predict_fn request received")
    model, tokenizer = model_and_tokenizer
    # input_fn hands over a dict for JSON bodies and the raw body otherwise,
    # so `.pop` must only be used on dicts.
    text = data.pop("inputs", data) if isinstance(data, dict) else data
    if isinstance(text, (bytes, bytearray)):
        text = text.decode("utf-8")
    if not isinstance(text, str):
        raise ValueError('Expected a text prompt, either raw or under the "inputs" key')
    print("Bloom:176:LLM:Input text is " + text)
    encoded_input = tokenizer(text, return_tensors='pt')
    # max_length is passed straight to generate(); other request fields are ignored.
    result_length = 50
    output_sequences = model.generate(input_ids=encoded_input["input_ids"], max_length=result_length)
    return tokenizer.decode(output_sequences[0], skip_special_tokens=True)


#Override out of the box input function
def input_fn(input_data, content_type):
    """Deserialize the request body.

    Args:
        input_data: Request body as ``str``, ``bytes`` or ``bytearray``.
        content_type: MIME type of the request.

    Returns:
        The parsed JSON document when ``content_type`` is
        ``"application/json"``, otherwise the body as text.
    """
    # Normalise binary bodies to text first: logging a bytes body through string
    # concatenation raised TypeError.
    if isinstance(input_data, (bytes, bytearray)):
        input_data = input_data.decode("utf-8")
    print(f"Bloom:176:LLM:Received the input {input_data}")
    print(f"Bloom:176:LLM:Content type {content_type}")
    if content_type == "application/json":
        return json.loads(input_data)
    return input_data


#Override out of the box output function
def output_fn(prediction, accept):
    """Wrap the prediction in the response envelope.

    Args:
        prediction: Value returned by ``predict_fn``.
        accept: Requested response MIME type; only logged.

    Returns:
        dict: ``{"outputs": prediction}``.
    """
    # f-strings keep the log lines from raising TypeError when `prediction`
    # is not a str or `accept` is None.
    print(f"Bloom:176:LLM:Returning the output {prediction}")
    print(f"Bloom:176:LLM:accept type {accept}")
    return {"outputs": prediction}

Overwriting djl/code/inference.py


In [44]:
import sagemaker, boto3

# SageMaker session plus the identifiers derived from it.
session = sagemaker.Session()
account = session.account_id()
region = session.boto_region_name

# ECR image reference; repository name and tag match the ones pushed by the shell cell above.
img = "djl_deepspeed"
fullname = "".join([account, ".dkr.ecr.", region, ".amazonaws.com/", img, ":latest"])

# S3 prefix in the session's default bucket where the model archive is uploaded.
bucket = session.default_bucket()
path = "".join(["s3://", bucket, "/DEMO-djl-big-model"])

#### Tar structure

In [46]:
%%sh

# Always start fresh; -f keeps a first run (no archive yet) from reporting an error.
rm -f gpt-j.tar.gz

# Archive the staged djl/ directory (model.py, serving.properties, code/).
tar -czvf gpt-j.tar.gz djl/


djl/
djl/model.py
djl/code/
djl/code/inference.py
djl/serving.properties


In [47]:
# Upload the archive to the S3 prefix held in `path`, without a KMS key.
model_s3_url = sagemaker.s3.S3Uploader.upload(
    "gpt-j.tar.gz", path, kms_key=None, sagemaker_session=session
)
# Show the resulting S3 URI; it is passed as ModelDataUrl to create_model below.
model_s3_url

's3://sagemaker-us-east-1-874199810560/DEMO-djl-big-model/gpt-j.tar.gz'

In [49]:
from datetime import datetime

sm_client = boto3.client("sagemaker")

# One timestamp is shared by the model, endpoint-config and endpoint names.
time_stamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
model_name = "gpt-j-" + time_stamp

# Register the model: the ECR image pushed earlier plus the uploaded archive.
create_model_response = sm_client.create_model(
    ModelName=model_name,
    # NOTE(review): uses the caller's identity ARN as the execution role - confirm it is an assumable role.
    ExecutionRoleArn=session.get_caller_identity_arn(),
    PrimaryContainer={
        "Image": fullname,
        "ModelDataUrl": model_s3_url,
        # Read by model.py through os.getenv("TENSOR_PARALLEL_DEGREE").
        "Environment": {"TENSOR_PARALLEL_DEGREE": "2"},
    },
)
create_model_response

{'ModelArn': 'arn:aws:sagemaker:us-east-1:874199810560:model/gpt-j-2022-10-03-15-38-57',
 'ResponseMetadata': {'RequestId': 'f38121b3-8a30-4129-9432-51f337f18f2d',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'f38121b3-8a30-4129-9432-51f337f18f2d',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '87',
   'date': 'Mon, 03 Oct 2022 15:38:58 GMT'},
  'RetryAttempts': 0}}

Now we create an endpoint configuration that SageMaker hosting services uses to deploy models. Note that we configured ModelDataDownloadTimeoutInSeconds and ContainerStartupHealthCheckTimeoutInSeconds to accommodate the large size of our model.

In [50]:
initial_instance_count = 1
# The model is created with TENSOR_PARALLEL_DEGREE="2", so the instance must expose
# at least two GPUs. ml.p3.2xlarge has a single GPU; ml.g5.12xlarge has four
# (ml.g5.48xlarge is a larger alternative).
instance_type = "ml.g5.12xlarge"
variant_name = "AllTraffic"
endpoint_config_name = "t-j-config-" + time_stamp

production_variants = [
    {
        "VariantName": variant_name,
        "ModelName": model_name,
        "InitialInstanceCount": initial_instance_count,
        "InstanceType": instance_type,
        # Generous limits: the large model takes long to download and load.
        "ModelDataDownloadTimeoutInSeconds": 1800,
        "ContainerStartupHealthCheckTimeoutInSeconds": 3600,
    }
]

endpoint_config = {
    "EndpointConfigName": endpoint_config_name,
    "ProductionVariants": production_variants,
}

ep_conf_res = sm_client.create_endpoint_config(**endpoint_config)
ep_conf_res

{'EndpointConfigArn': 'arn:aws:sagemaker:us-east-1:874199810560:endpoint-config/t-j-config-2022-10-03-15-38-57',
 'ResponseMetadata': {'RequestId': 'ca76b43d-b686-4d21-aa11-fc1146290fd0',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'ca76b43d-b686-4d21-aa11-fc1146290fd0',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '111',
   'date': 'Mon, 03 Oct 2022 15:39:06 GMT'},
  'RetryAttempts': 0}}

In [51]:
# NOTE(review): unlike model_name ("gpt-j-" + time_stamp) there is no hyphen before the timestamp here.
endpoint_name = "gpt-j" + time_stamp
# Start endpoint creation from the configuration defined above.
ep_res = sm_client.create_endpoint(
    EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name
)
ep_res

{'EndpointArn': 'arn:aws:sagemaker:us-east-1:874199810560:endpoint/gpt-j2022-10-03-15-38-57',
 'ResponseMetadata': {'RequestId': 'b3349fc1-f544-432d-b637-493d115aeabc',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'b3349fc1-f544-432d-b637-493d115aeabc',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '92',
   'date': 'Mon, 03 Oct 2022 15:39:08 GMT'},
  'RetryAttempts': 0}}

In [None]:
# Block until the endpoint reaches InService; the waiter raises if it fails or times out.
print("Waiting for {} endpoint to be in service...".format(endpoint_name))
waiter = sm_client.get_waiter("endpoint_in_service")
waiter.wait(EndpointName=endpoint_name)

print("Created {} endpoint is in Service and ready to invoke ...".format(endpoint_name))

Waiting for gpt-j2022-10-03-15-38-57 endpoint to be in service...


In [None]:
import json

client = boto3.client("sagemaker-runtime")

# model.py reads the request with get_as_string(), so the prompt is sent as plain text.
content_type = "text/plain"  # The MIME type of the input data in the request body.
payload = "Amazon.com is the best"  # Payload for inference.
response = client.invoke_endpoint(
    EndpointName=endpoint_name, ContentType=content_type, Body=payload
)
# Body is a stream; read() returns the raw response bytes.
print(response["Body"].read())

#### Clean up

In [None]:
# boto3 client methods accept keyword arguments only; the positional call raised TypeError.
sm_client.delete_endpoint(EndpointName=endpoint_name)
# Also remove the endpoint configuration and the model created above.
sm_client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
sm_client.delete_model(ModelName=model_name)