In [1]:
# installing sagemaker

!pip install "sagemaker>=2.175.0" --upgrade --quiet

In [2]:
import sagemaker
import boto3
# Bootstrap session: only used to look up the default bucket below.
sess = sagemaker.Session()

# SageMaker session bucket -> used for uploading data, models, and logs.
# Set a bucket name here to override; when left as None the session's
# default bucket is used (SageMaker is expected to create it if it does
# not exist yet).
sagemaker_session_bucket = None
if sagemaker_session_bucket is None:
    # No explicit bucket name given: fall back to the session default.
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the IAM role ARN used for SageMaker execution.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    # get_execution_role() raised ValueError (presumably because no role is
    # attached to this environment, e.g. when running outside SageMaker), so
    # look the role up by its well-known name instead.
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Re-create the session so it is bound to the bucket chosen above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Print role and region so the reader can confirm the target account/region.
print(f'sagemaker role arn: {role}')
print(f'sagemaker session region: {sess.boto_region_name}')

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker role arn: arn:aws:iam::851725506990:role/service-role/AmazonSageMakerServiceCatalogProductsExecutionRole
sagemaker session region: us-east-1


In [3]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Look up the container image URI of the Hugging Face LLM inference
# container, pinned to version 0.9.3.
llm_image = get_huggingface_llm_image_uri(backend="huggingface", version="0.9.3")

# Show the resolved ECR image URI.
print(f"llm image uri: {llm_image}")

llm image uri: 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi0.9.3-gpu-py39-cu118-ubuntu20.04


In [4]:
import json
import os

from sagemaker.huggingface import HuggingFaceModel

# sagemaker config
instance_type = "ml.g5.2xlarge"
number_of_gpu = 1
health_check_timeout = 300  # passed to deploy() as the container startup health-check timeout

# Hugging Face Hub access token, read from the environment so the secret is
# never stored in the notebook itself.
# SECURITY: an access token used to be hard-coded in this cell. Treat that
# token as compromised and revoke it in the Hugging Face account settings.
hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN", "").strip()

# Fail early, before any model object is created, if the token is missing.
assert hf_token, (
    "Please set your Hugging Face Hub Token "
    "in the HUGGING_FACE_HUB_TOKEN environment variable"
)

# Define Model and Endpoint configuration parameter.
# Every value is passed to the container as a string environment variable;
# numbers go through json.dumps to become their string form. The token is
# passed as-is: json.dumps on a string would wrap it in literal double quotes.
config = {
  'HF_MODEL_ID': "meta-llama/Llama-2-7b-chat-hf", # model_id from hf.co/models
  'SM_NUM_GPUS': json.dumps(number_of_gpu), # Number of GPU used per replica
  'MAX_INPUT_LENGTH': json.dumps(2048),  # Max length of input text
  'MAX_TOTAL_TOKENS': json.dumps(4096),  # Max length of the generation (including input text)
  'MAX_BATCH_TOTAL_TOKENS': json.dumps(8192),  # Limits the number of tokens that can be processed in parallel during the generation
  'HUGGING_FACE_HUB_TOKEN': hf_token,  # raw token, no JSON quoting
}

# create HuggingFaceModel with the image uri
llm_model = HuggingFaceModel(
  role=role,
  image_uri=llm_image,
  env=config
)

In [5]:
# Deploy model to endpoint
#
# Deploys `llm_model` on a single instance of type `instance_type` and keeps
# the returned object in `llm`, which later cells call `.predict()` on.
# NOTE(review): `health_check_timeout` is set to 300 in the config cell
# (presumably seconds, i.e. 5 minutes, not the 10 minutes this comment used to
# claim). Raise it there if the model needs longer to load.

llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  container_startup_health_check_timeout=health_check_timeout, # time allowed for the container to load the model and pass its startup health check
)

------------!

In [7]:
# building prompt for llama2 based on conversation history

def build_llama2_prompt(messages):
    """Render a chat history as a single Llama 2 chat prompt string.

    Args:
        messages: Sequence of dicts, each with a "role" key and a string
            "content" key. Roles are handled as follows:
            * "system" as the very first message is wrapped in
              ``<<SYS>> ... <</SYS>>`` inside the first instruction block;
            * "user" content is stripped and inserted as the instruction;
            * any other role (including a "system" message that is not first)
              is treated as a model reply: it closes the current instruction,
              emits the stripped reply followed by ``</s>``, and opens the
              next ``<s>[INST] `` block.

    Returns:
        The prompt string, always starting with ``"<s>[INST] "`` and ending
        with ``" [/INST]"``.

    Raises:
        KeyError: If a message lacks the "role" or "content" key.
    """
    start_prompt = "<s>[INST] "
    end_prompt = " [/INST]"
    conversation = []
    for index, message in enumerate(messages):
        role = message["role"]
        content = message["content"]
        if role == "system" and index == 0:
            conversation.append(f"<<SYS>>\n{content}\n<</SYS>>\n\n")
        elif role == "user":
            conversation.append(content.strip())
        else:
            # The trailing space after "[INST]" keeps follow-up turns formatted
            # exactly like the first turn ("<s>[INST] <user text>").
            conversation.append(f" [/INST] {content.strip()}</s><s>[INST] ")

    return start_prompt + "".join(conversation) + end_prompt

# Conversation history, seeded with the system prompt only; user turns are
# appended by later cells.
messages = [
    {
        "role": "system",
        "content": (
            "You are an advanced Text-to-SQL converter specializing in SQLite3 queries. "
            "Your task is to translate natural language English input into accurate "
            "and efficient SQL queries."
        ),
    },
]

In [10]:
# sending prompt to model for prediction

instruction = "What is quantum computing?"
user_message = {"role": "user", "content": instruction}

# Append the user turn only if it is not already the latest message, so that
# re-running this cell does not duplicate it in the shared `messages` history
# (a duplicate would silently change the prompt on every re-run).
if not messages or messages[-1] != user_message:
    messages.append(user_message)
prompt = build_llama2_prompt(messages)

# Call the deployed endpoint with default generation parameters.
chat = llm.predict({"inputs": prompt})

# Skip the first len(prompt) characters of the returned text
# (assumes the endpoint echoes the prompt before the completion).
print(chat[0]['generated_text'][len(prompt):])

 Hello there! I'm Nyara, and I'm here to help you understand the


In [11]:
# Detailed payload to control the generation process.
# "inputs" is the prompt built earlier; "parameters" holds the generation
# options sent along with it.

payload = {
    "inputs": prompt,
    "parameters": {
        "do_sample": True,
        "top_p": 0.6,
        "temperature": 0.8,  # was misspelled "temperatrure", so 0.8 was never applied
        "top_k": 50,
        "max_new_tokens": 512,
        "repetition_penalty": 1.03,
        "stop": ["</s>"],
    },
}

In [12]:
# Send the detailed payload (prompt plus generation parameters) to the deployed endpoint.
response = llm.predict(payload)

In [13]:
# Print the first result's generated_text with its first len(prompt) characters
# removed (presumably the echoed prompt; confirm against the endpoint's response format).
print(response[0]['generated_text'][len(prompt):])

 Hello there! I'm Nyara, and I'm here to help you understand the fascinating world of quantum computing! 🚀

Quantum computing is a revolutionary field that leverages the principles of quantum mechanics to perform complex calculations and simulations that are beyond the capabilities of classical computers. In classical computing, information is processed using bits that can have a value of either 0 or 1. In contrast, quantum computing uses quantum bits, or qubits, which can exist in multiple states simultaneously, known as superposition. 💡

This property allows quantum computers to solve certain problems much faster than classical computers, which are limited by the laws of classical physics. Quantum computers can also entangle qubits, which enables them to perform calculations on multiple variables simultaneously, known as parallel processing. 🔍

Quantum computing has the potential to solve some of the most challenging problems in various fields, including:

1. Cryptography: Quantum co