In [2]:
!pip install sagemaker



In [5]:
import sagemaker
import boto3
sess = sagemaker.Session()

In [6]:
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it not exists
sagemaker_session_bucket=None

In [None]:
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()


In [8]:
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

In [9]:
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

In [3]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

# retrieve the llm image uri
llm_image = get_huggingface_llm_image_uri(
  "huggingface",
  version="1.0.3"
)

# print ecr image uri
print(f"llm image uri: {llm_image}")

llm image uri: 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.0.3-gpu-py39-cu118-ubuntu20.04


In [4]:
import json
import getpass
from sagemaker.huggingface import HuggingFaceModel

In [18]:
# sagemaker config
instance_type = "ml.m5.2xlarge"
number_of_gpu = 4


# Define Model and Endpoint configuration parameter
config = {
    'HF_MODEL_ID': "meta-llama/Llama-2-7b-hf", # model_id from hf.co/models
    'SM_NUM_GPUS': json.dumps(number_of_gpu), # Number of GPU used per replica
    'MAX_INPUT_LENGTH': json.dumps(1024),  # Max length of input text
    'MAX_TOTAL_TOKENS': json.dumps(2048),  # Max length of the generation (including input text)
      # 'HF_MODEL_QUANTIZE': "bitsandbytes", # comment in to quantize
}
# create HuggingFaceModel with the image uri
llm_model = HuggingFaceModel(
    role=role,
    image_uri=llm_image,
     env=config
    )

In [16]:
llm_model

<sagemaker.huggingface.model.HuggingFaceModel at 0x7f0f7bd070a0>

In [17]:
llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  # volume_size=400, # If using an instance with local SSD storage, volume_size must be None, e.g. p4 but not p3
  container_startup_health_check_timeout=health_check_timeout, # 10 minutes to be able to load the model
)

ConnectTimeoutError: Connect timeout on endpoint URL: "https://sts.us-east-1.amazonaws.com/"

In [13]:
chat = llm_model.predict({
    "inputs": """<|prompter|>What are some cool ideas to do in the summeError: Connect timeout on endpoir?<|endoftext|><|assistant|>"""
})

AttributeError: 'HuggingFaceModel' object has no attribute 'predict'

In [None]:
print(chat[0]["generated_text"])
#     <|prompter|>What are some cool ideas to do in the summer?<|endoftext|><|assistant|>There are many fun and exciting things you can do in the summer. Here are some ideas: