In [1]:
!pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


**Importing the dependencies**

In [2]:
import os
import json

import boto3
import sagemaker
from sagemaker.huggingface import get_huggingface_llm_image_uri, HuggingFaceModel

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/xdg-ubuntu/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sid23/.config/sagemaker/config.yaml


**AWS configuration**

In [3]:
# Load the settings from the local JSON config file.
# The context manager closes the file handle as soon as parsing finishes,
# instead of leaving it open until the garbage collector reclaims it.
with open("config.json") as config_file:
    config_data = json.load(config_file)

In [4]:
# Inspect which settings the config file defines; only the key names are shown, not their values.
config_data.keys()

dict_keys(['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_DEFAULT_REGION', 'ROLE_NAME', 'HF_TOKEN'])

In [5]:
# Pull every setting out of the loaded config into its own module-level name.
AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_DEFAULT_REGION, ROLE_NAME, HF_TOKEN = (
    config_data["AWS_ACCESS_KEY_ID"],
    config_data["AWS_SECRET_ACCESS_KEY"],
    config_data["AWS_DEFAULT_REGION"],
    config_data["ROLE_NAME"],
    config_data["HF_TOKEN"],
)

In [6]:
# Expose the AWS credentials and region through environment variables.
os.environ.update(
    {
        "AWS_ACCESS_KEY_ID": AWS_ACCESS_KEY_ID,
        "AWS_SECRET_ACCESS_KEY": AWS_SECRET_ACCESS_KEY,
        "AWS_DEFAULT_REGION": AWS_DEFAULT_REGION,
    }
)

In [7]:
# Resolve the ARN of the role to use with SageMaker.
# ROLE_NAME example: sid-yt-sagemaker-role (Sagemaker execution Role)
try:
    role = sagemaker.get_execution_role()
except ValueError:
    # get_execution_role() raised ValueError: look the role up by name through IAM instead.
    iam = boto3.client("iam")
    role_description = iam.get_role(RoleName=ROLE_NAME)
    role = role_description["Role"]["Arn"]

Couldn't call 'get_role' to get Role ARN from role name arn:aws:iam::730335406979:root to get Role path.


**Instance Configuration**

In [8]:
# Fetch the Hugging Face LLM image URI, pinned to version 0.9.3.
llm_image = get_huggingface_llm_image_uri("huggingface", version="0.9.3")

In [9]:
# SageMaker hosting settings used by the model/endpoint cells.
endpoint_name = "llama-2-endpoint"
instance_type = "ml.g5.xlarge"
number_of_gpu = 1
health_check_timeout = 10 * 60  # 600, passed as the container startup health-check timeout

In [10]:
# Environment variables handed to the model container (model + endpoint parameters).
# Numeric values are serialized with json.dumps so every entry is a string.
config = {
    # Model id from hf.co/models
    "HF_MODEL_ID": "meta-llama/Llama-2-7b-chat-hf",
    # Number of GPUs used per replica
    "SM_NUM_GPUS": json.dumps(number_of_gpu),
    # Max length of the input text
    "MAX_INPUT_LENGTH": json.dumps(2048),
    # Max length of the generation (including the input text)
    "MAX_TOTAL_TOKENS": json.dumps(4096),
    # Limits the number of tokens that can be processed in parallel during generation
    "MAX_BATCH_TOTAL_TOKENS": json.dumps(8192),
    # Hugging Face access token read from the config file
    "HUGGING_FACE_HUB_TOKEN": HF_TOKEN,
    # Quantize with bitsandbytes; comment this entry out to disable quantization
    "HF_MODEL_QUANTIZE": "bitsandbytes",
}

In [11]:
# Build the HuggingFaceModel from the image URI, the role, and the container environment.
llm_model = HuggingFaceModel(
    name="llama-2-model",
    image_uri=llm_image,
    env=config,
    role=role,
)

In [12]:
# Deploy the model to a SageMaker endpoint and keep the returned handle in `llm`.
llm = llm_model.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    endpoint_name=endpoint_name,
    # 10 minutes to be able to load the model
    container_startup_health_check_timeout=health_check_timeout,
)

print("\nLLAMA 2 model deployed to Sagemaker")

-----------!
LLAMA 2 model deployed to Sagemaker
