In [None]:
!pip install "sagemaker>=2.175.0" --upgrade --quiet


In [None]:
import sagemaker
import boto3
# Bootstrap session, used here to look up the default bucket.
sess = sagemaker.Session()

# S3 bucket used for uploading data, models and logs.
# SageMaker creates this bucket automatically if it does not exist yet.
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    # No bucket name given: fall back to the session's default bucket.
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role; if get_execution_role() raises ValueError,
# look the role up by name through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role = iam.get_role(RoleName="sagemaker_execution_role")["Role"]["Arn"]

# Re-create the session so it is bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Look up the container image URI for the "huggingface" backend, pinned to version 0.8.2.
llm_image = get_huggingface_llm_image_uri("huggingface", version="0.8.2")

# Show which ECR image will be used.
print(f"llm image uri: {llm_image}")

In [None]:
import json
from sagemaker.huggingface import HuggingFaceModel

import os

# sagemaker config
instance_type = "ml.g5.2xlarge"
number_of_gpu = 1
health_check_timeout = 200  # passed to deploy() as the container startup health-check timeout

# Placeholder that must be replaced before deploying. The token can either be
# exported as the HUGGING_FACE_HUB_TOKEN environment variable (preferred, keeps
# the secret out of the notebook) or substituted for the placeholder below.
HF_TOKEN_PLACEHOLDER = "Hugging_face_token"
hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN", HF_TOKEN_PLACEHOLDER)

# Define Model and Endpoint configuration parameter.
# Every value must be a string because the dict is passed as container environment
# variables; json.dumps is used only to stringify the numeric settings.
config = {
  'HF_MODEL_ID': "Model_Repository_id_in_Huggingface", # model id from huggingface.co/models
  'SM_NUM_GPUS': json.dumps(number_of_gpu), # Number of GPU used per replica
  'MAX_INPUT_LENGTH': json.dumps(2048),  # Max length of input text
  'MAX_TOTAL_TOKENS': json.dumps(4096),  # Max length of the generation (including input text)
  'MAX_BATCH_TOTAL_TOKENS': json.dumps(8192),  # Limits the number of tokens that can be processed in parallel during the generation
  # The token is passed as a plain string: json.dumps would wrap it in literal
  # double quotes, corrupting the token and defeating the placeholder check below.
  'HUGGING_FACE_HUB_TOKEN': hf_token, # Hugging Face Token
  # 'HF_MODEL_QUANTIZE': "bitsandbytes", # Quantization Config
}

# check if token is set
assert config['HUGGING_FACE_HUB_TOKEN'] != HF_TOKEN_PLACEHOLDER, "Please set the HUGGING_FACE_HUB_TOKEN in the config and run the cell again"

# create HuggingFaceModel with the image uri
llm_model = HuggingFaceModel(
  role=role,
  image_uri=llm_image,
  env=config
)

In [None]:
# Deploy the model to a single-instance endpoint.
llm = llm_model.deploy(
    instance_type=instance_type,
    initial_instance_count=1,
    # Time allowed for the container to load the model and pass its startup
    # health check; the value is health_check_timeout from the config cell.
    container_startup_health_check_timeout=health_check_timeout,
)

In [None]:
# Input Format :: <s>[INST] {human_text} [/INST] {assistant_text} {code_text}</s>

In [None]:
def build_llama2_prompt(messages):
  """Render a list of chat messages as one Llama 2 style prompt string.

  Args:
    messages: Sequence of dicts, each with a "role" and a "content" key.
      A "system" message is wrapped in <<SYS>> tags only when it is the
      first entry. "user" messages are inserted as-is. Any other message
      (including a "system" message at a later position) is treated as an
      assistant reply.

  Returns:
    The prompt string. It always starts with "<s>[INST] " and ends with
    " [/INST]". Each assistant reply closes the current instruction block,
    is terminated with "</s>", and opens a new "<s>[INST] " block so the
    next user message starts a fresh turn.

  Raises:
    KeyError: If a message lacks the "role" or "content" key.
  """
  start_prompt = '<s>[INST] '
  end_prompt = ' [/INST]'
  conversation = []
  for index, message in enumerate(messages):
    role, content = message["role"], message["content"]
    if role == "system" and index == 0:
      conversation.append(f"<<SYS>>\n{content}\n<</SYS>>\n")
    elif role == "user":
      conversation.append(f"{content}")
    else:
      # Close the open [INST] block before the reply and start a new turn
      # afterwards; without these delimiters the reply would be glued
      # directly onto the surrounding user text.
      conversation.append(f"{end_prompt} {content}</s>{start_prompt}")
  return start_prompt + "".join(conversation) + end_prompt


In [None]:
# The system message states the task; the user message supplies the concrete input.
messages = [{"role": "system", "content": "Write a Python code for Binary Search Algorithm"}]
instruction = "Consider the array [2,6,8,1,3,9] and find the number 8"
messages.append({"role": "user", "content": instruction})

# Render the conversation into a single prompt string and show it.
prompt = build_llama2_prompt(messages)
print(prompt)


In [None]:
# Send the rendered prompt to the deployed endpoint and print the response.
request_payload = {"inputs": prompt}
answer = llm.predict(request_payload)
print(answer)