## Foundation Model - AlexTM (In-context Learning)

#### I. Imports 

In [2]:
from sagemaker.utils import name_from_base
from sagemaker.predictor import Predictor
from sagemaker import get_execution_role
from sagemaker import hyperparameters
from sagemaker.model import Model
from sagemaker import script_uris
from sagemaker import image_uris 
from sagemaker import model_uris
from datetime import datetime
import sagemaker
import logging
import boto3
import json

##### Setup logging

In [3]:
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies 

In [4]:
logger.info(f'[Using sageMaker version: {sagemaker.__version__}]')

[Using sageMaker version: 2.120.0]


#### II. Setup essentials 

In [5]:
MODEL_ID = 'pytorch-textgeneration1-alexa20b'  # this is hard-coded
MODEL_VERSION = '*'
INSTANCE_TYPE = 'ml.p3.8xlarge'
INSTANCE_COUNT = 1
IMAGE_SCOPE = 'inference'
MODEL_DATA_DOWNLOAD_TIMEOUT = 3600  # in seconds
CONTAINER_STARTUP_HEALTH_CHECK_TIMEOUT = 3600
EBS_VOLUME_SIZE = 256  # in GB

# set up roles and clients 
client = boto3.client('sagemaker-runtime')
ROLE = get_execution_role()
logger.info(f'Role => {ROLE}')

Role => arn:aws:iam::119174016168:role/service-role/AmazonSageMaker-ExecutionRole-20211014T093628


In [6]:
endpoint_name = name_from_base(f'js-{MODEL_ID}-ep')
logger.info(f'Endpoint name: {endpoint_name}')

Endpoint name: js-pytorch-textgeneration1-alexa20b-ep-2023-02-15-03-27-03-817


#### III. Retrieve artifacts 

In [7]:
deploy_image_uri = image_uris.retrieve(region=None, 
                                       framework=None, 
                                       image_scope=IMAGE_SCOPE, 
                                       model_id=MODEL_ID, 
                                       model_version=MODEL_VERSION, 
                                       instance_type=INSTANCE_TYPE)
logger.info(f'Deploy image URI => {deploy_image_uri}')

Deploy image URI => 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference:1.12.0-gpu-py38


In [8]:
model_uri = model_uris.retrieve(model_id=MODEL_ID, 
                                model_version=MODEL_VERSION, 
                                model_scope=IMAGE_SCOPE)
logger.info(f'Model URI => {model_uri}')

Model URI => s3://jumpstart-cache-prod-us-east-1/pytorch-infer/infer-pytorch-textgeneration1-alexa20b.tar.gz


In [9]:
env = {
    'SAGEMAKER_MODEL_SERVER_TIMEOUT': str(3600),
    'MODEL_CACHE_ROOT': '/opt/ml/model', 
    'SAGEMAKER_ENV': '1',
    'SAGEMAKER_SUBMIT_DIRECTORY': '/opt/ml/model/code/',
    'SAGEMAKER_PROGRAM': 'inference.py',
    'SAGEMAKER_MODEL_SERVER_WORKERS': '1', 
    'TS_DEFAULT_WORKERS_PER_MODEL': '1', 
}

In [10]:
model = Model(image_uri=deploy_image_uri, 
              model_data=model_uri, 
              role=ROLE, 
              predictor_cls=Predictor, 
              name=endpoint_name, 
              env=env)

#### IV. Deploy the model for real-time inference 

In [11]:
%%time

model_predictor = model.deploy(initial_instance_count=INSTANCE_COUNT, 
                               instance_type=INSTANCE_TYPE, 
                               endpoint_name=endpoint_name, 
                               volume_size=EBS_VOLUME_SIZE, 
                               model_data_download_timeout=MODEL_DATA_DOWNLOAD_TIMEOUT, 
                               container_startup_health_check_timeout=CONTAINER_STARTUP_HEALTH_CHECK_TIMEOUT)

Creating model with name: js-pytorch-textgeneration1-alexa20b-ep-2023-02-15-03-27-03-817
CreateModel request: {
    "ModelName": "js-pytorch-textgeneration1-alexa20b-ep-2023-02-15-03-27-03-817",
    "ExecutionRoleArn": "arn:aws:iam::119174016168:role/service-role/AmazonSageMaker-ExecutionRole-20211014T093628",
    "PrimaryContainer": {
        "Image": "763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference:1.12.0-gpu-py38",
        "Environment": {
            "SAGEMAKER_MODEL_SERVER_TIMEOUT": "3600",
            "MODEL_CACHE_ROOT": "/opt/ml/model",
            "SAGEMAKER_ENV": "1",
            "SAGEMAKER_SUBMIT_DIRECTORY": "/opt/ml/model/code/",
            "SAGEMAKER_PROGRAM": "inference.py",
            "SAGEMAKER_MODEL_SERVER_WORKERS": "1",
            "TS_DEFAULT_WORKERS_PER_MODEL": "1"
        },
        "ModelDataUrl": "s3://jumpstart-cache-prod-us-east-1/pytorch-infer/infer-pytorch-textgeneration1-alexa20b.tar.gz"
    },
    "Tags": [
        {
            "Key": "aws-

-------------------!CPU times: user 292 ms, sys: 35.6 ms, total: 328 ms
Wall time: 9min 33s


#### V. Invoke endpoint for real-time inference 

* max_length: Model generates text until the output length (which includes the input context length) reaches max_length. If specified, it must be a positive integer.
* num_return_sequences: Number of output sequences returned. If specified, it must be a positive integer.
* num_beams: Number of beams used for greedy search. If specified, it must be an integer greater than or equal to num_return_sequences.
* no_repeat_ngram_size: Model ensures that a sequence of words of no_repeat_ngram_size is not repeated in the output sequence. If specified, it must be a positive integer greater than 1.
* temperature: Controls the randomness in the output. Higher temperature results in output sequence with low-probability words and lower temperature results in output sequence with high-probability words. If temperature -> 0, it results in greedy decoding. If specified, it must be a positive float.
* early_stopping: If True, text generation is finished when all beam hypotheses reach the end of sentence token. If specified, it must be boolean.
* do_sample: If True, model samples the next word as per the likelyhood. If specified, it must be boolean.
* top_k: In each step of text generation, sample from only the top_k most likely words. If specified, it must be a positive integer.
* top_p: In each step of text generation, sample from the smallest possible set of words with cumulative probability top_p. If specified, it must be a float between 0 and 1.
* seed: Fix the randomized state for reproducibility. If specified, it must be an integer.

In [None]:
# endpoint_name = '<ENTER YOUR ENDPOINT NAMER HERE>'  # IF PREVIOUSLY DEPLOYED

In [12]:
def invoke(prompt, gen_config):
    payload = {'text_inputs': prompt}
    payload = json.dumps(payload).encode('utf-8')
    response = client.invoke_endpoint(EndpointName=endpoint_name, 
                                      Body=payload, 
                                      ContentType='application/json')
    return response

In [13]:
def parse_response(response):
    body = json.loads(response['Body'].read().decode())
    generated_texts = body['generated_texts'][0]
    return generated_texts

In [14]:
prompt = 'COVID-19 is a deadly'
logger.info(f'Prompt: {prompt}')

Prompt: COVID-19 is a deadly


In [15]:
%%time 

gen_config = {'num_beams': 5, 
              'seed': 123, 
              'no_repeat_ngram_size': 2} 
response = invoke(prompt, gen_config)
response = parse_response(response)
logger.info(f'Response: {response}')

Response: COVID-19 is a deadly virus


CPU times: user 47.5 ms, sys: 11.1 ms, total: 58.7 ms
Wall time: 3min 15s


## N-shot Learning via In-context Learning

### A. Zero-shot Learning 

#### 1. Extract Q&A

In [16]:
context = """Once, a cunning fox saw a crow with a piece of cheese in its beak sitting on a branch. The fox devised a plan and flattered the crow, causing the crow to caw with delight, dropping the cheese which the fox quickly snatched up and ran away. The crow learned a valuable lesson and never trusted the fox again."""

In [17]:
question = 'who got cheated?'
answer = 'crow'

In [18]:
prompt = f'[CLM]Context:{context}<br>Question:{question}<br>Answer:'
logger.info(f'Prompt: {prompt}')

Prompt: [CLM]Context:Once, a cunning fox saw a crow with a piece of cheese in its beak sitting on a branch. The fox devised a plan and flattered the crow, causing the crow to caw with delight, dropping the cheese which the fox quickly snatched up and ran away. The crow learned a valuable lesson and never trusted the fox again.<br>Question:who got cheated?<br>Answer:


In [19]:
%%time 

gen_config = {'do_sample': True, 'max_length': 50, 'top_k': 50}
response = invoke(prompt, gen_config)
response = parse_response(response)
response = response.split('<br>')[0]
logger.info(f'Response: {response}')

Response: The crow.


CPU times: user 5.55 ms, sys: 310 µs, total: 5.86 ms
Wall time: 1.59 s


### 2. Natural Language Inference (NLI)

In [20]:
review = 'I hated the movie. Thoroughly disappointing for a sequel.'
sentiment = 'Sentiment(Good, Bad)'

In [22]:
prompt = f'[CLM]Review:{review}\n{sentiment}:'
logger.info(f'Prompt: {prompt}')

Prompt: [CLM]Review:I hated the movie. Thoroughly disappointing for a sequel.
Sentiment(Good, Bad):


In [23]:
%%time 

gen_config = {'do_sample': True, 'max_length': 50, 'top_k': 50}
response = invoke(prompt, gen_config)
response = parse_response(response)
logger.info(f'Response: {response}')

Response: I hated the movie. Thoroughly disappointing for a sequel. Rating: 1


CPU times: user 5.5 ms, sys: 21 µs, total: 5.52 ms
Wall time: 1.52 s


### B. One-shot Learning 

#### 1. Text Summarization

In [24]:
train_article = 'I love apples especially the large juicy ones. Apples are a great source of vitamins and fiber. An apple a day keeps the doctor away!'
train_summary = 'I love apples. They are healthy!'

In [25]:
test_article = 'I hate oranges especially the bitter ones. They are high in citric acid and they give me heart burns.'
test_summary = 'I hate oranges. They are bad for my heart burn.'

In [26]:
prompt = f'[CLM]article: {train_article}\nsummary:{train_summary}\narticle: {test_article}\nsummary:'
logger.info(f'Prompt: {prompt}')

Prompt: [CLM]article: I love apples especially the large juicy ones. Apples are a great source of vitamins and fiber. An apple a day keeps the doctor away!
summary:I love apples. They are healthy!
article: I hate oranges especially the bitter ones. They are high in citric acid and they give me heart burns.
summary:


In [27]:
%%time 

gen_config = {'do_sample': True, 'max_length': 50, 'top_k': 50}
response = invoke(prompt, gen_config)
response = parse_response(response)
logger.info(f'Response: {response}')

Response: I hate oranges. They are high in citric acid. article: I love


CPU times: user 4.64 ms, sys: 1.13 ms, total: 5.77 ms
Wall time: 1.58 s


#### II. Natural Language Generation (NLG)

In [28]:
train_inp = 'name[The Punter], eat_type[Indian], price_range[cheap]'
train_out = 'The Punter provides Indian food in the cheap price range.'

In [29]:
test_inp = 'name[Blue Spice], eatType[coffee shop], price_range[expensive]'
test_out = 'Blue Spice is a coffee shop that is a bit expensive.'

In [30]:
prompt = (
    f"[CLM] {train_inp} ==> "
    f"sentence describing the place: {train_out} ; "
    f"{test_inp} ==> sentence describing the place:"
)

In [31]:
logger.info(f'Prompt: {prompt}')

Prompt: [CLM] name[The Punter], eat_type[Indian], price_range[cheap] ==> sentence describing the place: The Punter provides Indian food in the cheap price range. ; name[Blue Spice], eatType[coffee shop], price_range[expensive] ==> sentence describing the place:


In [32]:
%%time 

gen_config = {'do_sample': True, 'max_length': 100, 'top_k': 50}
response = invoke(prompt, gen_config)
response = parse_response(response)
response = response.split(';')[0].strip()
logger.info(f'Response: {response}')

Response: Blue Spice provides coffee shop food in the expensive price range.


CPU times: user 6.05 ms, sys: 1.12 ms, total: 7.17 ms
Wall time: 1.58 s


#### Let's flip the input and output formats

In [33]:
train_inp = 'The Punter provides Indian food in the cheap price range.'
train_out = 'name[The Punter], eat_type[Indian], price_range[cheap]'

In [34]:
test_inp = 'Blue Spice is a coffee shop that is a bit pricy.'
test_out = 'name[Blue Spice], eat_type[coffee shop], price_range[pricy]'

In [35]:
prompt = (
    f"[CLM] {train_inp} ==> {train_out}\n"
    f"{test_inp} ==>"
)

In [36]:
logger.info(f'Prompt: {prompt}')

Prompt: [CLM] The Punter provides Indian food in the cheap price range. ==> name[The Punter], eat_type[Indian], price_range[cheap]
Blue Spice is a coffee shop that is a bit pricy. ==>


In [37]:
%%time 

gen_config = {'do_sample': True, 'max_length': 50, 'top_k': 50}
response = invoke(prompt, gen_config)
response = parse_response(response)
logger.info(f'Response: {response}')

Response: name[Blue Spice], eat_type[Coffee], price_range[


CPU times: user 5.14 ms, sys: 626 µs, total: 5.76 ms
Wall time: 1.51 s


III. Machine Translation 

In [38]:
train_inp = 'Das Parlament erhebt sich zu einer Schweigeminute.'
train_out = "The House rose and observed a minute' s silence"

In [39]:
test_inp = 'Kleingärtner bewirtschaften den einstigen Grund von Bauern.'
test_out = 'Allotment holders cultivate the soil of former farmers.'

In [40]:
prompt = (
    f"[CLM] Sentence: {train_inp}; "
    f"Translation in English: {train_out}; "
    f"Sentence: {test_inp}; "
    "Translation in English:"
)

In [41]:
logger.info(f'Prompt: {prompt}')

Prompt: [CLM] Sentence: Das Parlament erhebt sich zu einer Schweigeminute.; Translation in English: The House rose and observed a minute' s silence; Sentence: Kleingärtner bewirtschaften den einstigen Grund von Bauern.; Translation in English:


In [42]:
%%time 

gen_config = {'do_sample': True, 'max_length': 50, 'top_k': 50}
response = invoke(prompt, gen_config)
response = parse_response(response)
response = response.split(';')[0]
logger.info(f'Response: {response}')

Response: Small farmers cultivate the land once owned by farmers


CPU times: user 5.61 ms, sys: 337 µs, total: 5.95 ms
Wall time: 1.51 s
