# 在SageMaker上使用LMI,HuggingFaceAccelerateModel部署BaiChuan2模型

syshen@amazon.com

---------------
repo_id="baichuan-inc/Baichuan2-7B-Base"

repo_id="baichuan-inc/Baichuan2-13B-Base"

repo_id="baichuan-inc/Baichuan2-7B-Chat"

repo_id="baichuan-inc/Baichuan2-7B-Chat-4bits"

repo_id="baichuan-inc/Baichuan2-13B-Chat"

repo_id="baichuan-inc/Baichuan2-13B-Chat-4bits"

时间:20~40分钟

1.环境设置,下载、上传模型

2.部署模型

3.推理测试

---------------

In [None]:
# Select the Hugging Face repository to deploy.
repo_id = "baichuan-inc/Baichuan2-7B-Base"  # change this to yours
# The text after the last "/" is the bare model name; it also names the handler script later on.
model_name = repo_id.rsplit("/", 1)[-1]
# The local download directory reuses the repo id as its relative path.
local_dir = repo_id

## 1.环境设置,下载、上传模型 ~4分钟

In [None]:
# Upgrade the sagemaker and huggingface_hub packages (-q: quiet output, -U: upgrade to latest).
!pip install -qU sagemaker
!pip install -qU huggingface_hub

In [None]:
# sagemaker environment setting
import sagemaker
import boto3
import os
import shutil
import sagemaker.huggingface

from sagemaker.djl_inference.model import DJLModel,DeepSpeedModel,HuggingFaceAccelerateModel,DJLPredictor
from sagemaker import Model, image_uris, serializers, deserializers
from huggingface_hub import snapshot_download

# Resolve the execution role, session, default bucket and region used by the later cells.
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
region = sagemaker_session.boto_region_name
# Session.account_id is a method: call it so account_id holds the id string
# rather than a bound-method object.
account_id = sagemaker_session.account_id()

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {bucket}")
print(f"sagemaker session region: {region}")

In [None]:
#download s5cmd
!curl -L https://github.com/peak/s5cmd/releases/download/v2.0.0/s5cmd_2.0.0_Linux-64bit.tar.gz | tar -xz
!chmod 777 s5cmd

In [None]:
#download BaiChuan2 model file from Hugging Face
model_download_path = snapshot_download(repo_id=repo_id,local_dir=local_dir,ignore_patterns=["*.msgpack","*.h5"])
print(model_download_path)
!ls $local_dir

In [None]:
# Sync the local model directory to s3://<bucket>/<repo_id>/ with s5cmd,
# then list the destination prefix to check the upload.
!./s5cmd sync $local_dir/ s3://$bucket/$repo_id/
!aws s3 ls s3://$bucket/$repo_id/

## 2.部署模型 ~10分钟

In [None]:
# Prepare an empty staging directory for the inference files.
source_dir = 'source_dir'
# The handler script is picked by model name, e.g. "Baichuan2-7B-Base.py".
entry_point = f'{model_name}.py'

# Remove any directory left over from an earlier run before recreating it.
if os.path.exists(source_dir):
    shutil.rmtree(source_dir)
# os.mkdir raises if the directory cannot be created, whereas a failing
# "!mkdir" would only print an error and let the notebook continue.
os.mkdir(source_dir)

In [None]:
# Copy the *.py and *.json files from the downloaded model directory into the staging directory.
!cp $local_dir/*.py $source_dir
!cp $local_dir/*.json $source_dir

In [None]:
%%writefile $source_dir/serving.properties
# Serving configuration written into the staging directory.
engine=python
option.enable_streaming = huggingface
# NOTE(review): the handler scripts in this notebook load weights as torch.bfloat16 and the deploy cell passes dtype "bf16" - confirm that fp16 here is intended.
option.dtype = fp16
option.batch_size = 1

In [None]:
%%writefile $source_dir/requirements.txt
# Python packages listed for the serving container; only transformers is pinned to an exact version.
# NOTE(review): the handler scripts in this notebook import only torch, transformers and djl_python directly - confirm which of the remaining packages are actually required.
numpy
transformers==4.33.1 
tokenizers
sentencepiece 
bitsandbytes 
deepspeed>=0.8.3
xformers 
accelerate
vllm
pandas
scipy
nvgpu
pynvml

In [None]:
%%writefile $source_dir/Baichuan2-7B-Base.py
# Inference entry point for Baichuan2-7B-Base: defines load_model() and handle().
import torch
from djl_python import Input, Output
import transformers
from transformers import LlamaForCausalLM, AutoTokenizer
from transformers import LlamaForCausalLM, LlamaTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig
# presumably provided by the *.py files copied from the downloaded model directory - TODO confirm
from tokenization_baichuan import BaichuanTokenizer

import logging
import os

# Module-level cache: handle() fills these on its first call so the model is loaded only once.
model = None
tokenizer = None
import os

# Import-time diagnostics: library versions and the contents of the working directory.
print(f"torch.__version__:{torch.__version__}")
print(f"transformers.__version__:{transformers.__version__}")
cwd = os.getcwd()
print(f"cwd:{cwd}")
dir_list = os.listdir(cwd)
print("Files and directories in '", cwd, "' :")
# prints all files
print(dir_list)

def load_model(properties):
    """Load the Baichuan2 tokenizer and model and cache them in module globals.

    Args:
        properties: Mapping of serving properties; only ``"model_id"`` is read
            and passed to the ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple; the same objects are also stored in
        the module-level ``model`` and ``tokenizer`` variables.

    Raises:
        KeyError: If ``properties`` has no ``"model_id"`` entry.
    """
    global model, tokenizer

    model_name = properties["model_id"]

    logging.info(f"Loading model: {model_name}")
    print(f'model_name:{model_name}')
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, trust_remote_code=True)
    # Weights are always requested as bfloat16; the "dtype" serving property is not consulted.
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True)

    return model, tokenizer

def handle(inputs: Input):
    """Serve one request by generating a continuation of the prompt.

    The request JSON must contain ``"inputs"`` (the prompt text). An optional
    ``"parameters"`` object may carry ``"max_new_tokens"``; when it is absent
    the limit stays at 64 new tokens. Other parameters are not applied.

    Args:
        inputs: The ``Input`` object for this call.

    Returns:
        ``None`` for an empty call, otherwise an ``Output`` holding
        ``{"BaiChuan2": <decoded text>}``.

    Raises:
        KeyError: If the request JSON has no ``"inputs"`` entry.
    """
    global model, tokenizer
    print('----inputs.get_properties():----')
    print(inputs.get_properties())

    # Load lazily on the first call and reuse the cached objects afterwards.
    if model is None:
        print("----no model----starting load_model-----")
        model, tokenizer = load_model(inputs.get_properties())

    if inputs.is_empty():
        # Model server makes an empty call to warmup the model on startup
        print("----return None-----")
        return None
    else:
        print(f"----return {inputs}-----")

    data = inputs.get_as_json()
    input_text = data["inputs"]
    # "parameters" is optional: a missing or null value falls back to the defaults.
    params = data.get("parameters") or {}
    max_new_tokens = int(params.get("max_new_tokens", 64))

    # Use a separate name so the request object is not shadowed by the token tensors.
    encoded = tokenizer(input_text, return_tensors='pt')
    encoded = encoded.to('cuda:0')
    pred = model.generate(**encoded, max_new_tokens=max_new_tokens, repetition_penalty=1.1)
    response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)
    print(response)

    result = {"BaiChuan2": response}

    return Output().add(result)

In [None]:
%%writefile $source_dir/Baichuan2-13B-Base.py
# Inference entry point for Baichuan2-13B-Base: defines load_model() and handle().
import torch
from djl_python import Input, Output
import transformers
from transformers import LlamaForCausalLM, AutoTokenizer
from transformers import LlamaForCausalLM, LlamaTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig
# presumably provided by the *.py files copied from the downloaded model directory - TODO confirm
from tokenization_baichuan import BaichuanTokenizer

import logging
import os

# Module-level cache: handle() fills these on its first call so the model is loaded only once.
model = None
tokenizer = None
import os

# Import-time diagnostics: library versions and the contents of the working directory.
print(f"torch.__version__:{torch.__version__}")
print(f"transformers.__version__:{transformers.__version__}")
cwd = os.getcwd()
print(f"cwd:{cwd}")
dir_list = os.listdir(cwd)
print("Files and directories in '", cwd, "' :")
# prints all files
print(dir_list)

def load_model(properties):
    """Load the Baichuan2 tokenizer and model and cache them in module globals.

    Args:
        properties: Mapping of serving properties; only ``"model_id"`` is read
            and passed to the ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple; the same objects are also stored in
        the module-level ``model`` and ``tokenizer`` variables.

    Raises:
        KeyError: If ``properties`` has no ``"model_id"`` entry.
    """
    global model, tokenizer

    model_name = properties["model_id"]

    logging.info(f"Loading model: {model_name}")
    print(f'model_name:{model_name}')
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, trust_remote_code=True)
    # Weights are always requested as bfloat16; the "dtype" serving property is not consulted.
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True)

    return model, tokenizer

def handle(inputs: Input):
    """Serve one request by generating a continuation of the prompt.

    The request JSON must contain ``"inputs"`` (the prompt text). An optional
    ``"parameters"`` object may carry ``"max_new_tokens"``; when it is absent
    the limit stays at 64 new tokens. Other parameters are not applied.

    Args:
        inputs: The ``Input`` object for this call.

    Returns:
        ``None`` for an empty call, otherwise an ``Output`` holding
        ``{"BaiChuan2": <decoded text>}``.

    Raises:
        KeyError: If the request JSON has no ``"inputs"`` entry.
    """
    global model, tokenizer
    print('----inputs.get_properties():----')
    print(inputs.get_properties())

    # Load lazily on the first call and reuse the cached objects afterwards.
    if model is None:
        print("----no model----starting load_model-----")
        model, tokenizer = load_model(inputs.get_properties())

    if inputs.is_empty():
        # Model server makes an empty call to warmup the model on startup
        print("----return None-----")
        return None
    else:
        print(f"----return {inputs}-----")

    data = inputs.get_as_json()
    input_text = data["inputs"]
    # "parameters" is optional: a missing or null value falls back to the defaults.
    params = data.get("parameters") or {}
    max_new_tokens = int(params.get("max_new_tokens", 64))

    # Use a separate name so the request object is not shadowed by the token tensors.
    encoded = tokenizer(input_text, return_tensors='pt')
    encoded = encoded.to('cuda:0')
    pred = model.generate(**encoded, max_new_tokens=max_new_tokens, repetition_penalty=1.1)
    response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)
    print(response)

    result = {"BaiChuan2": response}

    return Output().add(result)

In [None]:
%%writefile $source_dir/Baichuan2-7B-Chat.py
# Inference entry point for Baichuan2-7B-Chat: defines load_model() and handle().
import torch
from djl_python import Input, Output
import transformers
from transformers import LlamaForCausalLM, AutoTokenizer
from transformers import LlamaForCausalLM, LlamaTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig
# presumably provided by the *.py files copied from the downloaded model directory - TODO confirm
from tokenization_baichuan import BaichuanTokenizer

import logging
import os

# Module-level cache: handle() fills these on its first call so the model is loaded only once.
model = None
tokenizer = None
import os

# Import-time diagnostics: library versions and the contents of the working directory.
print(f"torch.__version__:{torch.__version__}")
print(f"transformers.__version__:{transformers.__version__}")
cwd = os.getcwd()
print(f"cwd:{cwd}")
dir_list = os.listdir(cwd)
print("Files and directories in '", cwd, "' :")
# prints all files
print(dir_list)

def load_model(properties):
    """Load the Baichuan2 chat tokenizer and model named by ``properties``.

    Args:
        properties: Mapping of serving properties; only ``"model_id"`` is read
            and passed to the ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple. The model's ``generation_config`` is
        replaced with the one loaded from the same ``model_id``.

    Raises:
        KeyError: If ``properties`` has no ``"model_id"`` entry.
    """
    model_name = properties["model_id"]

    logging.info(f"Loading model: {model_name}")
    print(f'model_name:{model_name}')
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, trust_remote_code=True)
    # Weights are always requested as bfloat16; the "dtype" serving property is not consulted.
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True)
    model.generation_config = GenerationConfig.from_pretrained(model_name)

    return model, tokenizer

def handle(inputs: Input):
    """Serve one request by sending the prompt to the model as a single user turn.

    The request JSON must contain ``"inputs"`` (the user message). A
    ``"parameters"`` entry may be present but is neither required nor applied.

    Args:
        inputs: The ``Input`` object for this call.

    Returns:
        ``None`` for an empty call, otherwise an ``Output`` holding
        ``{"BaiChuan2": <chat response>}``.

    Raises:
        KeyError: If the request JSON has no ``"inputs"`` entry.
    """
    global model, tokenizer

    print('----inputs.get_properties():----')
    print(inputs.get_properties())

    # Load lazily on the first call and reuse the cached objects afterwards.
    if model is None:
        print("----no model----starting load_model-----")
        model, tokenizer = load_model(inputs.get_properties())

    if inputs.is_empty():
        # Model server makes an empty call to warmup the model on startup
        print("----return None-----")
        return None
    else:
        print(f"----return {inputs}-----")

    data = inputs.get_as_json()
    input_text = data["inputs"]

    # Each request is a fresh conversation with one user message.
    messages = [{"role": "user", "content": input_text}]
    response = model.chat(tokenizer, messages)

    result = {"BaiChuan2": response}

    return Output().add(result)

In [None]:
%%writefile $source_dir/Baichuan2-7B-Chat-4bits.py
# Inference entry point for Baichuan2-7B-Chat-4bits: defines load_model() and handle().
import torch
from djl_python import Input, Output
import transformers
from transformers import LlamaForCausalLM, AutoTokenizer
from transformers import LlamaForCausalLM, LlamaTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig
# presumably provided by the *.py files copied from the downloaded model directory - TODO confirm
from tokenization_baichuan import BaichuanTokenizer

import logging
import os

# Module-level cache: handle() fills these on its first call so the model is loaded only once.
model = None
tokenizer = None
import os

# Import-time diagnostics: library versions and the contents of the working directory.
print(f"torch.__version__:{torch.__version__}")
print(f"transformers.__version__:{transformers.__version__}")
cwd = os.getcwd()
print(f"cwd:{cwd}")
dir_list = os.listdir(cwd)
print("Files and directories in '", cwd, "' :")
# prints all files
print(dir_list)

def load_model(properties):
    """Load the Baichuan2 chat tokenizer and model named by ``properties``.

    Args:
        properties: Mapping of serving properties; only ``"model_id"`` is read
            and passed to the ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple. The model's ``generation_config`` is
        replaced with the one loaded from the same ``model_id``.

    Raises:
        KeyError: If ``properties`` has no ``"model_id"`` entry.
    """
    model_name = properties["model_id"]

    logging.info(f"Loading model: {model_name}")
    print(f'model_name:{model_name}')
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, trust_remote_code=True)
    # Weights are always requested as bfloat16; the "dtype" serving property is not consulted.
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True)
    model.generation_config = GenerationConfig.from_pretrained(model_name)

    return model, tokenizer

def handle(inputs: Input):
    """Serve one request by sending the prompt to the model as a single user turn.

    The request JSON must contain ``"inputs"`` (the user message). A
    ``"parameters"`` entry may be present but is neither required nor applied.

    Args:
        inputs: The ``Input`` object for this call.

    Returns:
        ``None`` for an empty call, otherwise an ``Output`` holding
        ``{"BaiChuan2": <chat response>}``.

    Raises:
        KeyError: If the request JSON has no ``"inputs"`` entry.
    """
    global model, tokenizer

    print('----inputs.get_properties():----')
    print(inputs.get_properties())

    # Load lazily on the first call and reuse the cached objects afterwards.
    if model is None:
        print("----no model----starting load_model-----")
        model, tokenizer = load_model(inputs.get_properties())

    if inputs.is_empty():
        # Model server makes an empty call to warmup the model on startup
        print("----return None-----")
        return None
    else:
        print(f"----return {inputs}-----")

    data = inputs.get_as_json()
    input_text = data["inputs"]

    # Each request is a fresh conversation with one user message.
    messages = [{"role": "user", "content": input_text}]
    response = model.chat(tokenizer, messages)

    result = {"BaiChuan2": response}

    return Output().add(result)

In [None]:
%%writefile $source_dir/Baichuan2-13B-Chat.py
# Inference entry point for Baichuan2-13B-Chat: defines load_model() and handle().
import torch
from djl_python import Input, Output
import transformers
from transformers import LlamaForCausalLM, AutoTokenizer
from transformers import LlamaForCausalLM, LlamaTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig
# presumably provided by the *.py files copied from the downloaded model directory - TODO confirm
from tokenization_baichuan import BaichuanTokenizer

import logging
import os

# Module-level cache: handle() fills these on its first call so the model is loaded only once.
model = None
tokenizer = None
import os

# Import-time diagnostics: library versions and the contents of the working directory.
print(f"torch.__version__:{torch.__version__}")
print(f"transformers.__version__:{transformers.__version__}")
cwd = os.getcwd()
print(f"cwd:{cwd}")
dir_list = os.listdir(cwd)
print("Files and directories in '", cwd, "' :")
# prints all files
print(dir_list)

def load_model(properties):
    """Load the Baichuan2 chat tokenizer and model named by ``properties``.

    Args:
        properties: Mapping of serving properties; only ``"model_id"`` is read
            and passed to the ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple. The model's ``generation_config`` is
        replaced with the one loaded from the same ``model_id``.

    Raises:
        KeyError: If ``properties`` has no ``"model_id"`` entry.
    """
    model_name = properties["model_id"]

    logging.info(f"Loading model: {model_name}")
    print(f'model_name:{model_name}')
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, trust_remote_code=True)
    # Weights are always requested as bfloat16; the "dtype" serving property is not consulted.
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True)
    model.generation_config = GenerationConfig.from_pretrained(model_name)

    return model, tokenizer

def handle(inputs: Input):
    """Serve one request by sending the prompt to the model as a single user turn.

    The request JSON must contain ``"inputs"`` (the user message). A
    ``"parameters"`` entry may be present but is neither required nor applied.

    Args:
        inputs: The ``Input`` object for this call.

    Returns:
        ``None`` for an empty call, otherwise an ``Output`` holding
        ``{"BaiChuan2": <chat response>}``.

    Raises:
        KeyError: If the request JSON has no ``"inputs"`` entry.
    """
    global model, tokenizer

    print('----inputs.get_properties():----')
    print(inputs.get_properties())

    # Load lazily on the first call and reuse the cached objects afterwards.
    if model is None:
        print("----no model----starting load_model-----")
        model, tokenizer = load_model(inputs.get_properties())

    if inputs.is_empty():
        # Model server makes an empty call to warmup the model on startup
        print("----return None-----")
        return None
    else:
        print(f"----return {inputs}-----")

    data = inputs.get_as_json()
    input_text = data["inputs"]

    # Each request is a fresh conversation with one user message.
    messages = [{"role": "user", "content": input_text}]
    response = model.chat(tokenizer, messages)

    result = {"BaiChuan2": response}

    return Output().add(result)

In [None]:
%%writefile $source_dir/Baichuan2-13B-Chat-4bits.py
# Inference entry point for Baichuan2-13B-Chat-4bits: defines load_model() and handle().
import torch
from djl_python import Input, Output
import transformers
from transformers import LlamaForCausalLM, AutoTokenizer
from transformers import LlamaForCausalLM, LlamaTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig
# presumably provided by the *.py files copied from the downloaded model directory - TODO confirm
from tokenization_baichuan import BaichuanTokenizer

import logging
import os

# Module-level cache: handle() fills these on its first call so the model is loaded only once.
model = None
tokenizer = None
import os

# Import-time diagnostics: library versions and the contents of the working directory.
print(f"torch.__version__:{torch.__version__}")
print(f"transformers.__version__:{transformers.__version__}")
cwd = os.getcwd()
print(f"cwd:{cwd}")
dir_list = os.listdir(cwd)
print("Files and directories in '", cwd, "' :")
# prints all files
print(dir_list)

def load_model(properties):
    """Load the Baichuan2 chat tokenizer and model named by ``properties``.

    Args:
        properties: Mapping of serving properties; only ``"model_id"`` is read
            and passed to the ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple. The model's ``generation_config`` is
        replaced with the one loaded from the same ``model_id``.

    Raises:
        KeyError: If ``properties`` has no ``"model_id"`` entry.
    """
    model_name = properties["model_id"]

    logging.info(f"Loading model: {model_name}")
    print(f'model_name:{model_name}')
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, trust_remote_code=True)
    # Weights are always requested as bfloat16; the "dtype" serving property is not consulted.
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True)
    model.generation_config = GenerationConfig.from_pretrained(model_name)

    return model, tokenizer

def handle(inputs: Input):
    """Serve one request by sending the prompt to the model as a single user turn.

    The request JSON must contain ``"inputs"`` (the user message). A
    ``"parameters"`` entry may be present but is neither required nor applied.

    Args:
        inputs: The ``Input`` object for this call.

    Returns:
        ``None`` for an empty call, otherwise an ``Output`` holding
        ``{"BaiChuan2": <chat response>}``.

    Raises:
        KeyError: If the request JSON has no ``"inputs"`` entry.
    """
    global model, tokenizer

    print('----inputs.get_properties():----')
    print(inputs.get_properties())

    # Load lazily on the first call and reuse the cached objects afterwards.
    if model is None:
        print("----no model----starting load_model-----")
        model, tokenizer = load_model(inputs.get_properties())

    if inputs.is_empty():
        # Model server makes an empty call to warmup the model on startup
        print("----return None-----")
        return None
    else:
        print(f"----return {inputs}-----")

    data = inputs.get_as_json()
    input_text = data["inputs"]

    # Each request is a fresh conversation with one user message.
    messages = [{"role": "user", "content": input_text}]
    response = model.chat(tokenizer, messages)

    result = {"BaiChuan2": response}

    return Output().add(result)

In [None]:
# Deploy the uploaded model to a real-time SageMaker endpoint.
import time

# Endpoint name: repo id with "/" replaced by "-", a fixed tag, and a UTC timestamp.
endpoint_name = f"{repo_id.replace('/', '-')}-HFModel-" + time.strftime("%Y%m%d-%H%M%S", time.gmtime())
print(endpoint_name)
model_id = f"s3://{bucket}/{repo_id}/"

# Only the 13B repos that are not 4-bit variants get the larger instance and four
# partitions; every other repo id (7B, any "-Chat-4bits", or an unrecognised name)
# uses ml.g5.2xlarge with a single partition.
if repo_id.endswith(("-13B-Base", "-13B-Chat")):
    instance_type, number_of_partitions = "ml.g5.12xlarge", 4
else:
    instance_type, number_of_partitions = "ml.g5.2xlarge", 1
print(f"instance_type : {instance_type}")

model = HuggingFaceAccelerateModel(
    model_id,  # This can also be a HuggingFace Hub model id
    role,
    dtype="bf16",
    task="text-generation",
    number_of_partitions=number_of_partitions,  # number of gpus to partition the model across
    entry_point=entry_point,
    source_dir=source_dir,
)

# Timeouts are given in seconds: 5 minutes for the model download, 8 for the startup health check.
predictor = model.deploy(
    instance_type=instance_type,
    initial_instance_count=1,
    endpoint_name=endpoint_name,
    model_data_download_timeout=5 * 60,
    container_startup_health_check_timeout=8 * 60,
)


## 3.推理测试 ~1分钟

In [None]:
# Send one test prompt to the deployed endpoint and print the reply.
print(f'endpoint_name : {predictor.endpoint_name}')
print(
    predictor.predict(
        {
            "inputs": "写一篇关于气候变化对海洋生态的影响的文章,",
            "parameters": {"max_length": 500},
        }
    )
)

## Only for re-invoking an already-created endpoint

In [None]:
# Attach a predictor to an endpoint that already exists; nothing is deployed here.
import sagemaker
from sagemaker.djl_inference.model import DJLModel,DeepSpeedModel,HuggingFaceAccelerateModel,DJLPredictor
from sagemaker import Model, image_uris, serializers, deserializers

# Set this to the name of the endpoint to call.
endpoint_name = "baichuan-inc--Baichuan2-7B-Base-2023-10-01-05-27-31"
#endpoint_name = "djl-inference-2023-09-27-13-21-39-526"

sagemaker_session = sagemaker.Session()
# Requests and responses are both exchanged as JSON.
predictor = DJLPredictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sagemaker_session,
    serializer=serializers.JSONSerializer(),
    deserializer=deserializers.JSONDeserializer(),
)

# Send one prompt and print the reply.
print(
    predictor.predict(
        {
            "inputs": "写一篇中秋放假游玩的散文,500字",
            "parameters": {"max_length": 50},
        }
    )
)


## clean up

In [None]:
import sagemaker

# Placeholder values: set endpoint_name to the endpoint that should be deleted.
endpoint_name = "djl-inference-2023-09-27-13-21-39-526"
model_name = ""

sagemaker_session = sagemaker.Session()
sagemaker_session.delete_endpoint(endpoint_name)
# The endpoint configuration and the model are not deleted by this cell.
#sagemaker_session.delete_endpoint_config(endpoint_name)
#sagemaker_session.delete_model(endpoint_name)

# backup

In [None]:
# Look up the container image URI for framework "djl-deepspeed", version 0.23.0, in us-east-1.
inference_image_uri = image_uris.retrieve(framework="djl-deepspeed", region='us-east-1', version="0.23.0")
# Keep the value as the last expression so the notebook displays it.
inference_image_uri

In [None]:
# Scratch check of the suffix test: prints "True" only for the 7B Base/Chat repo ids.
repo_id = "baichuan-inc/Baichuan2-13B-Base"
print("True" if repo_id.endswith(("-7B-Base", "-7B-Chat")) else "False")

In [None]:
import io
from typing import Iterator


class StreamScanner:
    r"""
    A helper class for parsing the InvokeEndpointWithResponseStream event stream.

    The output of the model will be in the following format:
    ```
    b'{"outputs": [" a"]}\n'
    b'{"outputs": [" challenging"]}\n'
    b'{"outputs": [" problem"]}\n'
    ...
    ```

    While usually each PayloadPart event from the event stream will contain a byte array
    with a full json, this is not guaranteed and some of the json objects may be split across
    PayloadPart events. For example:
    ```
    {'PayloadPart': {'Bytes': b'{"outputs": '}}
    {'PayloadPart': {'Bytes': b'[" problem"]}\n'}}
    ```

    This class accounts for this by concatenating bytes written via the 'write' function
    and then exposing a method which will return lines (ending with a '\n' character) within
    the buffer via the 'readlines' function. It maintains the position of the last read
    position to ensure that previous bytes are not exposed again.
    """

    def __init__(self) -> None:
        # All bytes received so far; never truncated.
        self.buff = io.BytesIO()
        # Offset of the first byte that has not yet been returned by readlines().
        self.read_pos = 0

    def write(self, content: bytes) -> None:
        """Append ``content`` to the end of the buffer."""
        # readlines() moves the cursor, so always seek to the end before appending.
        self.buff.seek(0, io.SEEK_END)
        self.buff.write(content)

    def readlines(self) -> Iterator[bytes]:
        """Yield each complete, not-yet-returned line without its trailing newline.

        A trailing fragment that does not end in a newline stays in the buffer
        and is yielded by a later call, once the rest of the line has been written.
        """
        self.buff.seek(self.read_pos)
        for line in self.buff.readlines():
            # Indexing bytes gives an int, so compare with endswith() rather than
            # line[-1] != b"\n" (which is always true). Only the last line can be partial.
            if not line.endswith(b"\n"):
                break
            self.read_pos += len(line)
            yield line[:-1]

    def reset(self) -> None:
        """Rewind the read position to the start; buffered bytes are kept."""
        self.read_pos = 0


In [None]:
%%time
# Call an existing endpoint through the response-streaming API and print the text as it arrives.
# Uses boto3 and region from the environment-setup cell and the StreamScanner class defined in this notebook.
import json
import io
endpoint_name = "baichuan-inc-Baichuan2-7B-Base-HFModel-20231010-071324"
prompts = "What is Amazon? Be concise."
request_content_type = "application/json"
response_content_type = "application/json"
SAGEMAKER_RUNTIME_CLIENT = boto3.client("sagemaker-runtime", region_name=region)

# Request payload: the prompt plus generation parameters.
request_body = {
    "inputs": prompts,
    "parameters": {
        "max_new_tokens": 128,
        "do_sample": True,
        "temperature": 1.1,
        "top_p": 0.85,
    },
}

response = SAGEMAKER_RUNTIME_CLIENT.invoke_endpoint_with_response_stream(
    EndpointName=endpoint_name,
    Body=json.dumps(request_body),
    ContentType=request_content_type,
    Accept=response_content_type,
)

# Feed every payload chunk into the scanner, then parse whatever lines it returns as JSON.
# NOTE(review): the handler scripts written earlier in this notebook return {"BaiChuan2": ...};
# confirm the endpoint really emits lines with an "outputs" list before relying on this loop.
event_stream = response["Body"]
scanner = StreamScanner()
for event in event_stream:
    scanner.write(event["PayloadPart"]["Bytes"])
    for line in scanner.readlines():
        deserialized_line = json.loads(line)
        print(deserialized_line.get("outputs")[0], end="")
print("\n")