# Deploy RWKV Hugging Face models to Amazon SageMaker using LMI and DeepSpeed

References:

https://huggingface.co/RWKV

https://sagemaker.readthedocs.io/en/stable/frameworks/djl/using_djl.html

https://docs.aws.amazon.com/sagemaker/latest/dg/large-model-inference-dlc.html



## Settings

In [None]:
# Upgrade the SageMaker Python SDK and the AWS SDK packages (boto3, botocore)
# to their latest versions; -q keeps pip quiet, -U upgrades existing installs.
!pip install -qU sagemaker
!pip install -qU boto3
!pip install -qU botocore

In [None]:
# sagemaker environment setting
import sagemaker
import boto3
import os
import shutil
import sagemaker.huggingface
from sagemaker.djl_inference.model import DJLModel,DeepSpeedModel,HuggingFaceAccelerateModel,DJLPredictor

# Bucket used for uploading data, models and logs. Leave it as None to fall
# back to the session's default bucket (SageMaker creates that bucket
# automatically if it does not exist yet).
sagemaker_session_bucket = None

# A first session is only needed to look up the default bucket name.
sagemaker_session = sagemaker.Session()
if sagemaker_session_bucket is None:
    sagemaker_session_bucket = sagemaker_session.default_bucket()

# Resolve the IAM role that SageMaker will assume.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    # get_execution_role() could not determine a role (presumably because the
    # notebook is not running inside SageMaker); look it up by name via IAM.
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']

# Re-create the session bound to the chosen bucket and read back its settings.
sagemaker_session = sagemaker.Session(default_bucket=sagemaker_session_bucket)
region = sagemaker_session.boto_region_name
bucket = sagemaker_session.default_bucket()

# Summarize the resolved environment.
for label, value in (
    ("role arn", role),
    ("bucket", bucket),
    ("session region", region),
):
    print(f"sagemaker {label}: {value}")

In [None]:
# Local directory that is passed to the model as `source_dir`; the notebook
# writes requirements.txt into it.
source_dir = 'source_dir'
# exist_ok=True makes the cell safe to re-run and avoids the race between a
# separate existence check and the directory creation.
os.makedirs(source_dir, exist_ok=True)
#entry_point = 'entry_point.py'

### NOTE: Transformers has supported RWKV since v4.29.0, so the built-in Transformers (4.26.0) needs to be upgraded.

In [None]:
%%writefile $source_dir/requirements.txt
transformers==4.30.2
boto3
sagemaker
sentencepiece
nvgpu==0.9.0
pynvml==11.4.1

## Download model files from the Hugging Face Hub, then upload them to an S3 bucket

In [None]:
# Download the s5cmd v2.0.0 Linux 64-bit release tarball from GitHub
# (following redirects with -L) and extract it into the current directory.
!curl -L https://github.com/peak/s5cmd/releases/download/v2.0.0/s5cmd_2.0.0_Linux-64bit.tar.gz | tar -xz
#!mv s5cmd ./$source_dir/s5cmd

In [None]:
# Hugging Face Hub repository id of the model to download.
# Alternatives noted by the author:
#   "RWKV/rwkv-raven-14b", "RWKV/rwkv-4-169m-pile", "RWKV/rwkv-raven-7b"
model_id = "RWKV/rwkv-raven-3b"

In [None]:
# Install/upgrade huggingface_hub, which provides snapshot_download used below.
!pip install -qU huggingface_hub

In [None]:
from huggingface_hub import snapshot_download
from pathlib import Path

# Local directory used as the download cache; created if it does not exist.
local_model_path = Path("./model")
local_model_path.mkdir(exist_ok=True)
# Only download files matching these patterns (JSON/text files, .pt/.bin
# PyTorch checkpoints and .model files); everything else in the repo is skipped.
allow_patterns = ["*.json", "*.pt", "*.bin", "*.txt", "*.model"]
# Use huggingface_hub's snapshot_download to fetch the repository files, since
# the model is stored in the repository using Git LFS. The return value is
# kept as the local path of the downloaded snapshot.
model_download_path = snapshot_download(
    repo_id=model_id,
    cache_dir=local_model_path,
    allow_patterns=allow_patterns,
)

# S3 URL under which the model files are stored; the last path component is
# the part of model_id after the "/" (e.g. "rwkv-raven-3b").
# NOTE(review): the bucket name is hard-coded; users without write access to
# it presumably need to substitute their own bucket here and in the later
# model-creation cell — confirm.
pretrained_model_location = f"s3://internal-modelzoo-us-east-1/RWKV/{model_id.split('/')[1]}/"

#model_artifact = sess.upload_data(path=model_download_path, key_prefix=s3_model_prefix)
# Make the extracted s5cmd binary executable, then sync the downloaded
# snapshot directory to the S3 location.
!chmod +x ./s5cmd
!./s5cmd sync {model_download_path}/ {pretrained_model_location}

In [None]:
# Show the local snapshot path and the S3 location it was synced to.
print(model_download_path)
print(pretrained_model_location)
#!./s5cmd sync {model_download_path}/  {pretrained_model_location}
# Delete the local download directory (the sync to S3 is run in the previous cell).
!rm -fr {local_model_path}

## LMI: create a model using the DeepSpeed backend, then run inference

In [None]:
# LMI: build a model on the DeepSpeed backend from the model files in S3,
# then deploy it to a SageMaker endpoint.
model_id = "s3://internal-modelzoo-us-east-1/RWKV/rwkv-raven-3b/"
print(f"model_id:{model_id}")

deepspeed_model = DeepSpeedModel(
    model_id,  # a Hugging Face Hub model id can be used here as well
    role,
    source_dir=source_dir,
    # entry_point=entry_point,
    task="text-generation",
    dtype="fp16",
    # Number of GPUs to partition the model across using tensor parallelism.
    tensor_parallel_degree=1,
)

# Deploy the model to an Amazon SageMaker endpoint and get a predictor back.
print("Deploying..., please wait for 3-10 minutes!")
deepspeed_predictor = deepspeed_model.deploy(
    instance_type="ml.g5.2xlarge",
    initial_instance_count=1,
    model_data_download_timeout=600,  # seconds (10 minutes)
    container_startup_health_check_timeout=900,  # seconds (15 minutes)
)

# Keep the endpoint name so the endpoint can be re-attached or deleted later.
endpoint_name = deepspeed_predictor.endpoint_name
print()
print(f"endpoint_name:{endpoint_name}")

In [None]:
# Send a text-generation request to the deployed endpoint and print the reply.
request = {
    "inputs": "American election is",
    "parameters": {"max_length": 50},
}
response = deepspeed_predictor.predict(request)
print(response)

In [None]:

# # LMI + Create a model using the HuggingFace Accelerate backend
# model_id = "s3://internal-modelzoo-us-east-1/RWKV/rwkv-raven-3b/"
# print(f"model_id:{model_id}")

# hf_accelerate_model = HuggingFaceAccelerateModel(
#     model_id, # This can also be a HuggingFace Hub model id
#     role,
#     dtype="fp16",
#     task="text-generation",
#     number_of_partitions=1, # number of gpus to partition the model across
#     #entry_point = entry_point,
#     source_dir = source_dir
# )
# # Deploy the model to an Amazon SageMaker Endpoint and get a Predictor
# print(f"Deploying..., please wait for 3-10 minutes!")

# hf_accelerate_predictor = hf_accelerate_model.deploy("ml.g5.2xlarge",
#                                                      initial_instance_count=1,
#                                                      model_data_download_timeout=10*60,
#                                                      container_startup_health_check_timeout=15*60)
# #predict
# print(hf_accelerate_predictor.predict(
#     { 
#         "inputs" : "Large model inference is", 
#         "parameters": { "max_length": 50 },
#     }
# ))

## ONLY for re-invoking an already-created endpoint

In [None]:
# Only for re-invoking an already-created endpoint: attach a predictor to the
# endpoint by name instead of deploying the model again.
from sagemaker.djl_inference.model import DJLPredictor
from sagemaker import Model, image_uris, serializers, deserializers

# Name of the existing endpoint to attach to; replace with your own endpoint.
# (An earlier endpoint used by the author: "djl-inference-2023-06-18-14-37-50-264".)
endpoint_name = "djl-inference-2023-08-10-06-10-14-302"

# Requests are serialized to JSON and responses are parsed from JSON.
predictor = DJLPredictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sagemaker_session,
    serializer=serializers.JSONSerializer(),
    deserializer=deserializers.JSONDeserializer(),
)

# Send a text-generation request and print the reply.
print(predictor.predict(
    {
        "inputs": "Today is sunny,",
        "parameters": {"max_length": 50},
    }
))


## Clean up

In [None]:
#endpoint_name = ""
#model_name = ""
#sagemaker_session.delete_endpoint(endpoint_name)
#sagemaker_session.delete_endpoint_config(endpoint_name)
#sagemaker_session.delete_model(endpoint_name)