In [None]:
import sagemaker
import boto3

# Default SageMaker session; also the source of the fallback bucket below.
sess = sagemaker.Session()

# S3 bucket used for uploading data, models and logs.
# Leave as None to fall back to the session's default bucket
# (SageMaker creates that bucket automatically if it does not exist).
sagemaker_session_bucket = None

if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

In [None]:
# Resolve the ARN of the IAM role used for the SageMaker calls below.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    # get_execution_role() raised ValueError: look the role up by name via IAM.
    iam = boto3.client('iam')
    role_response = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_response['Role']['Arn']

In [None]:
# Create the session with the chosen bucket and bind it to `sess`, the name the
# notebook actually reads (e.g. for the region below). Previously the configured
# session was stored only in `session`, so a custom bucket never took effect.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)
session = sess  # alias kept so any existing use of `session` keeps working

# Show the resolved role ARN and the region the session talks to.
print(f'sagemaker role:{role}')
print(f'{sess.boto_region_name}')

In [None]:
from sagemaker.huggingface.model import HuggingFaceModel

# Environment passed to the inference container.
hub = {
    # model id from hf.co/models
    'HF_MODEL_ID': 'distilbert-base-uncased-distilled-squad',
    # NLP task you want to use for predictions
    'HF_TASK': 'question-answering',
}

# Model definition: IAM role, container environment and framework versions.
huggingface_model = HuggingFaceModel(
    role=role,
    env=hub,
    py_version='py310',
    pytorch_version='1.13',
    transformers_version='4.26',
)

# Deploy the model to a SageMaker inference endpoint and keep the returned predictor.
# The keyword is `instance_type` (not `isinstance_type`), and SageMaker
# instance names start with `ml.` (letter L), not `m1.`.
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.xlarge'
)

# Example request: the payload always goes under the 'inputs' key.
qa_inputs = {
    'question': 'What I used to teach?',
    'context': 'My Name is kK and I live in Bangalore. I used to teach data science.'
}
data = {'inputs': qa_inputs}

# Send the request; as the cell's last expression, the result is displayed.
predictor.predict(data)

answer: {'data science'}

Compared to deploying regular Hugging Face models, we first need to retrieve the container URI and provide it to our HuggingFaceModel class
with an image_uri pointing to the image. To retrieve the new Hugging Face LLM Deep Learning Container (DLC) in Amazon SageMaker, we can use the get_huggingface_llm_image_uri method provided by the SageMaker SDK. This method allows us to retrieve the URI for the desired Hugging Face LLM DLC based on the specified backend, session, region, and version.

In [None]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Look up the container image URI of the Hugging Face LLM DLC for the
# 'huggingface' backend at the pinned version.
llm_image = get_huggingface_llm_image_uri(backend='huggingface', version='0.8.2')

print(f'llm image uri:{llm_image}')

In [None]:
import json
from sagemaker.huggingface import HuggingFaceModel

# SageMaker hosting configuration
number_of_gpu = 4
isinstance_type = 'ml.g5.12xlarge'

# TGI (container) configuration; values are passed as JSON-encoded strings.
config = {
    # model id from hf.co/models
    'HF_MODEL_ID': 'tiiuae/falcon-40b-instruct',
    # number of GPUs used per replica
    'SM_NUM_GPUS': json.dumps(number_of_gpu),
    # max length of input text
    'MAX_INPUT_LENGTH': json.dumps(1024),
    # max length of the generation (including input text)
    'MAX_TOTAL_TOKENS': json.dumps(2048),
    # uncomment to quantize:
    # 'HF_MODEL_QUANTIZE': 'bitsandbytes',
}

# Model definition backed by the LLM container image retrieved earlier.
llm_model = HuggingFaceModel(env=config, image_uri=llm_image, role=role)



In [None]:
# Prompt text sent to the model; it ends with "Falcon:" so the completion
# continues from there.
prompt = '''
You are helpful assistant, called Falcon. Knowing everything about AWS.
User: Can you tell me something about Amazon SageMaker?
Falcon:
'''

# Generation hyperparameters for the LLM.
payload = {
    'inputs': prompt,
    'parameters': {
        'do_sample': True,
        'top_p': 0.9,
        'temperature': 0.8,
        'max_new_tokens': 1024,
        'repetition_penalty': 1.03,
        'stop': ['\nUser:', '<|endoftext|>', '</s>']
    }
}

# HuggingFaceModel has no `prompt` method: requests go through the predictor
# returned by `deploy`, so the model has to be deployed first.
# `isinstance_type` is the instance-type variable from the config cell
# (the name is misspelled there; it is reused as-is here).
# NOTE: re-running this cell deploys again; delete the endpoint when finished.
llm = llm_model.deploy(
    initial_instance_count=1,
    instance_type=isinstance_type,
    # allow extra start-up time for loading the model before health checks fail
    container_startup_health_check_timeout=600,
)

# Send the request to the endpoint.
response = llm.predict(payload)

# Double-quoted f-string: reusing the same quote character inside an f-string
# is a SyntaxError on Python versions before 3.12.
for seq in response:
    print(f"Result: {seq['generated_text']}")