# bloom inference

For the AWS China regions (the package installs below use the Tsinghua PyPI mirror).

https://huggingface.co/bigscience/bloom-7b1 — 7B parameters, ~14GB in size

https://huggingface.co/bigscience/bloom — 176B parameters, ~360GB in size

In [None]:
# Upgrade the SageMaker SDK and the boto libraries to their latest versions,
# installing from the Tsinghua PyPI mirror (-q: quiet, -U: upgrade).
!pip install -qU sagemaker -i https://pypi.tuna.tsinghua.edu.cn/simple
!pip install -qU boto3 -i https://pypi.tuna.tsinghua.edu.cn/simple
!pip install -qU botocore -i https://pypi.tuna.tsinghua.edu.cn/simple

In [None]:
# sagemaker environment setting
import sagemaker
import boto3
import os
import shutil
import sagemaker.huggingface
from sagemaker.djl_inference.model import DJLModel,DeepSpeedModel,HuggingFaceAccelerateModel,DJLPredictor

# First session: only needed to look up the default bucket name.
sagemaker_session = sagemaker.Session()

# Bucket that receives uploaded data, models and logs. Leave as None to fall
# back to the session's default bucket (SageMaker creates it when missing).
sagemaker_session_bucket = None
if sagemaker_session is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sagemaker_session.default_bucket()

# Resolve the execution role; when get_execution_role() raises ValueError,
# fall back to the ARN of the IAM role named 'sagemaker_execution_role'.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(
        RoleName='sagemaker_execution_role'
    )['Role']['Arn']

# Rebuild the session bound to the chosen bucket and read back the values
# that the following cells rely on.
sagemaker_session = sagemaker.Session(default_bucket=sagemaker_session_bucket)
region = sagemaker_session.boto_region_name
bucket = sagemaker_session.default_bucket()

print(
    f"sagemaker role arn: {role}",
    f"sagemaker bucket: {bucket}",
    f"sagemaker session region: {region}",
    sep="\n",
)

## Download the model files from the Hugging Face Hub, then upload them to S3

In [None]:
# Install/upgrade huggingface_hub (provides snapshot_download) from the Tsinghua PyPI mirror.
!pip install -qU huggingface_hub -i https://pypi.tuna.tsinghua.edu.cn/simple

In [None]:
from huggingface_hub import snapshot_download
from pathlib import Path

# Download the checkpoint into a local Hugging Face cache rooted at ./tmp,
# relative to the directory the notebook is running in.
local_model_path = Path("tmp")
local_model_path.mkdir(exist_ok=True)

# Model to serve. Alternatives kept for reference:
# model_name = "bigscience/bloom"  # full model, ~360GB
# model_name = "microsoft/bloom-deepspeed-inference-int8"
# commit_hash = "aa00a6626f6484a2eef68e06d1e089e4e32aa571"
model_name = "bigscience/bloom-7b1"

# The repository stores its weights with Git LFS, so fetch it through
# snapshot_download. The call returns the folder of the snapshot it just
# downloaded; use that directly instead of globbing the cache directory,
# which could pick up an unrelated snapshot left behind by an earlier
# download of another model or revision.
model_snapshot_path = Path(
    snapshot_download(
        repo_id=model_name,
        cache_dir=local_model_path,
        allow_patterns=["*.json", "*.bin", "*.md", "*.pt"],
        ignore_patterns=["*.safetensors", "*.msgpack", "*.h5"],
    )
)

# S3 key prefix (folder) under which the checkpoint files will be uploaded.
s3_model_prefix = model_name

print(f'model_snapshot_path:{model_snapshot_path}')

In [None]:
#model_snapshot_path="tmp/models--microsoft--bloom-deepspeed-inference-int8/snapshots/aa00a6626f6484a2eef68e06d1e089e4e32aa571"
#s3_model_prefix = "microsoft/bloom-deepspeed-inference-int8"


In [None]:
# Make the s5cmd binary in the working directory runnable.
# NOTE(review): mode 777 also makes the binary world-writable; `chmod +x` looks sufficient — confirm.
!chmod 777 s5cmd

# Sync the downloaded snapshot folder to s3://<bucket>/<s3_model_prefix>/, then list what was uploaded.
!./s5cmd sync $model_snapshot_path/ s3://$bucket/$s3_model_prefix/
!aws s3 ls s3://$bucket/$s3_model_prefix/

In [None]:
# S3 URI of the uploaded checkpoint folder (note the trailing slash); the
# model constructors in the later cells receive it as their model id.
model_id = f"s3://{bucket}/{s3_model_prefix}/"
print(f"model_id:{model_id}")

In [None]:
# Local folder that is passed to the model constructors as `source_dir`, and
# the name of the optional custom entry-point script.
source_dir = 'source_dir'
entry_point = 'entry_point.py'

# Always start from an empty folder: drop whatever a previous run left behind,
# then recreate it. os.makedirs replaces the `!mkdir` shell-out so the step is
# portable and raises on failure instead of only printing a shell error.
if os.path.exists(source_dir):
    shutil.rmtree(source_dir)
os.makedirs(source_dir)

## LMI + Create a model using the DeepSpeed backend

In [None]:
%%writefile $source_dir/requirements.txt
# Start writing content here (remove this file if not needed)
-i https://pypi.tuna.tsinghua.edu.cn/simple
transformers==4.22.2
sentencepiece
protobuf

In [None]:
# Pick tensor_parallel_degree from the number of GPUs on the chosen instance type.
instance_type = "ml.g4dn.2xlarge"

# GPU count -> instance types with that many GPUs.
instance_types_by_gpu_count = {
    8: (
        "ml.p4d.24xlarge",
        "ml.p4de.24xlarge",
        "ml.p3.16xlarge",
        "ml.p3dn.24xlarge",
        "ml.g5.48xlarge",
    ),
    4: (
        "ml.p3.8xlarge",
        "ml.g5.24xlarge",
        "ml.g5.12xlarge",
        "ml.g4dn.12xlarge",
    ),
    2: (),  # no 2-GPU instance types listed
    1: (
        "ml.p3.2xlarge",
        "ml.g5.xlarge",
        "ml.g5.2xlarge",
        "ml.g5.4xlarge",
        "ml.g5.8xlarge",
        "ml.g5.16xlarge",
        "ml.g4dn.xlarge",
        "ml.g4dn.2xlarge",
        "ml.g4dn.4xlarge",
        "ml.g4dn.8xlarge",
        "ml.g4dn.16xlarge",
    ),
}

# Instance types missing from the table fall back to 0.
tensor_parallel_degree = next(
    (
        gpu_count
        for gpu_count, known_types in instance_types_by_gpu_count.items()
        if instance_type in known_types
    ),
    0,
)

print(f"instance_type:{instance_type} ; tensor_parallel_degree : {tensor_parallel_degree}")

In [None]:
# LMI: build a model that uses the DeepSpeed backend.
deepspeed_model = DeepSpeedModel(
    model_id,  # S3 location of the checkpoint; a HuggingFace Hub model id also works
    role,
    source_dir=source_dir,
    # entry_point=entry_point,
    tensor_parallel_degree=tensor_parallel_degree,  # GPUs to split the model across
    task="text-generation",
    dtype="fp16",
)

# Deploy to a SageMaker endpoint; the returned predictor is used to invoke it.
# Optional deploy() timeouts, left disabled:
#   model_data_download_timeout=10*60
#   container_startup_health_check_timeout=15*60
deepspeed_predictor = deepspeed_model.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
)

In [None]:
# Send a sample prompt to the DeepSpeed endpoint and print the response.
deepspeed_request = {
    "inputs": "Large model inference is",
    "parameters": {"max_length": 50},
}
print(deepspeed_predictor.predict(deepspeed_request))

## LMI + Create a model using the HuggingFace Accelerate backend

In [None]:
%%writefile $source_dir/requirements.txt
# Start writing content here (remove this file if not needed)
-i https://pypi.tuna.tsinghua.edu.cn/simple
transformers==4.22.2
sentencepiece
protobuf

In [None]:
%%time
# LMI: build a model that uses the HuggingFace Accelerate backend.
hf_accelerate_model = HuggingFaceAccelerateModel(
    model_id,  # S3 location of the checkpoint; a HuggingFace Hub model id also works
    role,
    source_dir=source_dir,
    # entry_point=entry_point,
    number_of_partitions=tensor_parallel_degree,  # GPUs to split the model across
    task="text-generation",
    dtype="fp16",
)

# Deploy to a SageMaker endpoint; the returned predictor is used to invoke it.
# Optional deploy() timeouts, left disabled:
#   model_data_download_timeout=5*60
#   container_startup_health_check_timeout=10*60
hf_accelerate_predictor = hf_accelerate_model.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
)


In [None]:
%%time
# Send a sample prompt to the Accelerate endpoint and print the response.
hf_accelerate_request = {
    "inputs": "Large model inference is",
    "parameters": {"max_length": 50},
}
print(hf_accelerate_predictor.predict(hf_accelerate_request))

## Only for re-invoking an already-created endpoint

In [None]:
#only for re-invoke already-created endpoint

from sagemaker import Model, image_uris, serializers, deserializers
# Name of an endpoint that already exists; replace it with your own.
endpoint_name = "djl-inference-2023-05-08-07-30-32-434"

# Attach a predictor to that endpoint, sending and receiving JSON.
predictor = DJLPredictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sagemaker_session,
    deserializer=deserializers.JSONDeserializer(),
    serializer=serializers.JSONSerializer(),
)

# Send a sample prompt and print the response.
reinvoke_request = {
    "inputs": "Large model inference is",
    "parameters": {"max_length": 50},
}
print(predictor.predict(reinvoke_request))


## clean up

In [None]:
# Clean-up is manual: fill in the names of the resources to remove, then
# uncomment the delete calls below. As written this cell deletes nothing; it
# only rebinds endpoint_name and model_name to empty strings.
endpoint_name = ""
model_name = ""
# NOTE(review): all three calls below pass endpoint_name and model_name is
# never used — confirm whether delete_model should receive model_name instead.
#sagemaker_session.delete_endpoint(endpoint_name)
#sagemaker_session.delete_endpoint_config(endpoint_name)
#sagemaker_session.delete_model(endpoint_name)