# Deploy a Llama 2 model using Hugging Face and Amazon SageMaker
## Runs in AWS SageMaker.

In [23]:
import sagemaker
import boto3
import numpy
import scipy

In [24]:
# Module-level slot for the execution role ARN. setup_sagemaker_session() assigns it
# through a `global` statement, and the model-definition cell reads it directly.
sagemaker_execution_role = None

In [25]:
def setup_sagemaker_session(default_bucket=None, role_name="sagemaker_execution_role"):
    """Create a SageMaker session and resolve the execution role ARN.

    The role is first requested from ``sagemaker.get_execution_role()``. If that
    call raises ``ValueError``, the ARN is looked up in IAM by ``role_name``.

    Side effect: the resolved ARN is also written to the module-level
    ``sagemaker_execution_role`` variable, because other cells read that global
    directly instead of using the return value.

    Args:
        default_bucket: Forwarded unchanged to ``sagemaker.Session``.
        role_name: Name of the IAM role used by the fallback lookup. Defaults to
            ``"sagemaker_execution_role"``, the value that used to be hard-coded.

    Returns:
        tuple: ``(session, role_arn)``.
    """
    global sagemaker_execution_role
    session = sagemaker.Session(default_bucket=default_bucket)
    try:
        role_arn = sagemaker.get_execution_role()
    except ValueError:
        # Fallback: resolve the ARN through IAM using the configurable role name.
        role_arn = boto3.client("iam").get_role(RoleName=role_name)["Role"]["Arn"]
    # Publish to the global only after a role was resolved successfully.
    sagemaker_execution_role = role_arn
    return session, role_arn

In [26]:
def mask_account_id(account_id):
    """Return a run of asterisks exactly as long as ``account_id``."""
    width = len(account_id)
    return width * "*"

In [27]:
def main():
    """Set up the SageMaker session, then print the masked role ARN and the region.

    The account ID is taken from the fifth colon-separated field of the role ARN
    and replaced with asterisks before the ARN is printed.
    """
    session, role_arn = setup_sagemaker_session(default_bucket=None)
    arn_fields = role_arn.split(":")
    account = arn_fields[4]
    redacted_arn = role_arn.replace(account, mask_account_id(account))
    print(f"sagemaker role ARN : {redacted_arn}")
    print(f"sagemaker session region : {session.boto_region_name}")

In [28]:
# Run the session/role setup and print the masked role ARN and the session region.
main()

sagemaker role ARN : arn:aws:iam::************:role/service-role/AmazonSageMaker-ExecutionRole-20240712T185168
sagemaker session region : us-east-1


In [29]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

In [30]:
# Look up the container image URI for the "huggingface" LLM backend, pinned to 1.0.3.
llm_image = get_huggingface_llm_image_uri("huggingface", version="1.0.3")

# Show which ECR image the endpoint will run.
print(f"llm image uri: {llm_image}")

llm image uri: 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.0.3-gpu-py39-cu118-ubuntu20.04


In [33]:
import json
import os
from sagemaker.huggingface import HuggingFaceModel

# sagemaker config
instance_type = "ml.g5.12xlarge"
number_of_gpu = 4
health_check_timeout = 300  # seconds, passed to deploy() as the container startup health-check timeout

# Define Model and Endpoint configuration parameter.
# These entries are handed to the container as environment variables, so the
# numeric settings are serialised to strings with json.dumps.
config = {
  'HF_MODEL_ID': "meta-llama/Llama-2-7b-hf", # model_id from hf.co/models. we are using llama2 7b
  'SM_NUM_GPUS': json.dumps(number_of_gpu), # Number of GPU used per replica
  'MAX_INPUT_LENGTH': json.dumps(1024),  # Max length of input text
  'MAX_TOTAL_TOKENS': json.dumps(2048),  # Max length of the generation (including input text)
  # 'HF_MODEL_QUANTIZE': "bitsandbytes", # comment in to quantize
}

# meta-llama/Llama-2-7b-hf is a gated repository on the Hugging Face Hub, so the
# container needs an access token to download the weights. The token is read from
# the environment rather than hard-coded, and it is never printed.
hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
if hf_token:
    config['HUGGING_FACE_HUB_TOKEN'] = hf_token
else:
    print(
        "WARNING: HUGGING_FACE_HUB_TOKEN is not set; the endpoint will not be able "
        "to download the gated model meta-llama/Llama-2-7b-hf."
    )

# create HuggingFaceModel with the image uri
llm_model = HuggingFaceModel(
  role=sagemaker_execution_role,
  image_uri=llm_image,
  env=config
)


In [34]:
# Bare expression: the notebook echoes the model object's repr as the cell output.
llm_model

<sagemaker.huggingface.model.HuggingFaceModel at 0x7f7225259fc0>

In [38]:
# Create the endpoint and deploy the model onto it.
# API reference:
# https://sagemaker.readthedocs.io/en/stable/api/inference/model.html#sagemaker.model.Model.deploy
llm = llm_model.deploy(
  instance_type=instance_type,
  initial_instance_count=1,
  # Time allowed for the container to load the model and pass its startup health check.
  # health_check_timeout is 300 (seconds, i.e. 5 minutes); raise it if loading needs longer.
  container_startup_health_check_timeout=health_check_timeout,
  # volume_size=400, # If using an instance with local SSD storage, volume_size must be None, e.g. p4 but not p3
)

ResourceLimitExceeded: An error occurred (ResourceLimitExceeded) when calling the CreateEndpoint operation: The account-level service limit 'ml.g5.12xlarge for endpoint usage' is 0 Instances, with current utilization of 0 Instances and a request delta of 1 Instances. Please use AWS Service Quotas to request an increase for this quota. If AWS Service Quotas is not available, contact AWS support to request an increase for this quota.

In [None]:
# Send one prompt to the endpoint; the request body carries it under "inputs".
# NOTE(review): the <|prompter|>/<|endoftext|>/<|assistant|> markers look like a chat
# template from a different model family than the meta-llama/Llama-2-7b-hf base model
# configured for this endpoint. Confirm the intended prompt format.
chat = llm.predict(
    {"inputs": "<|prompter|>What are some cool ideas to do in the summer?<|endoftext|><|assistant|>"}
)

# The response is indexed as a list of dicts; print the text of the first entry.
print(chat[0]["generated_text"])
# Example output:
#     <|prompter|>What are some cool ideas to do in the summer?<|endoftext|><|assistant|>There are many fun and exciting things you can do in the summer. Here are some ideas:

## Using generation parameters

Now we will show how to use generation parameters in the `parameters` attribute of the payload. In addition to setting a custom `temperature`, `top_p`, etc., we also stop generation after the bot's turn.

In [None]:
# Prompt for the second request.
prompt = "<|prompter|>How can i stay more active during winter? Give me 3 tips.<|endoftext|><|assistant|>"

# Request body: the prompt plus the generation settings under "parameters".
payload = dict(
    inputs=prompt,
    parameters=dict(
        do_sample=True,
        top_p=0.7,
        temperature=0.7,
        top_k=50,
        max_new_tokens=256,
        repetition_penalty=1.03,
        stop=["<|endoftext|>"],  # stop sequence for generation
    ),
)

# Query the endpoint and print the generated text of the first result.
response = llm.predict(payload)
print(response[0]["generated_text"])