# Deploy Llama2 7b Chat HF to Inferentia2 and SageMaker

**Kernel**: Python 3 (PyTorch 1.13 Python 3.9 CPU Optimized)  
**Instance**: ml.t3.medium

### Run the following steps to get permission to download Llama2 pre-trained weights from Meta

#### Step 1 - HF Account
Go to (https://huggingface.co/join) and create an HF account if you don't have one. Then log in to the HF Hub.

#### Step 2 - Create an Access token
Follow the instructions from (https://huggingface.co/docs/hub/security-tokens) and create a new Access token. Copy the token.

#### Step 3 - Meta approval to download weights
Follow the instructions from (https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) to get approval from Meta to download and use the weights. It can take some time. Once approved, you'll see a message like: **Gated model You have been granted access to this model** at the top of the same page. Now you're ready to download and compile your model for Inferentia2.

## 1) Update SageMaker SDK

In [None]:
%pip install -U sagemaker

## 2) Initialize session

In [None]:
import os
import boto3
import sagemaker

print(sagemaker.__version__)
# Compare versions numerically: a plain string comparison orders "2.99.0"
# after "2.146.0" and would wrongly accept an outdated SDK.
MIN_SAGEMAKER_VERSION = (2, 146, 0)
installed_version = tuple(
    int(part) for part in sagemaker.__version__.split(".")[:3] if part.isdigit()
)
if installed_version < MIN_SAGEMAKER_VERSION:
    print("You need to upgrade or restart the kernel if you already upgraded")

# Session objects and identifiers reused by the cells below.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()
region = sess.boto_region_name

## ATTENTION: Copy your HF Access token to the following variable
HF_TOKEN=None

assert HF_TOKEN is not None, "Go to your HF account and get an access token. Set HF_TOKEN to your token"
# Directory that receives the files written by the %%writefile cells.
os.makedirs("src", exist_ok=True)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {bucket}")
print(f"sagemaker session region: {region}")

## 3) Install additional packages before compiling the model

In [None]:
%%writefile src/requirements.txt
transformers-neuronx==0.6.106

## 4) Now create the Python scripts for compiling and deploying the model

### 4.1) This script will download model weights from HF, split into multiple files and compile the model for a given number of cores

In [None]:
%%writefile src/split_llama2.py
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
import os
import sys
import time
import torch
import shutil
import argparse
import traceback

os.environ["NEURON_CC_FLAGS"] = "--logfile=/dev/null --model-type=transformer-inference"
from huggingface_hub import login
from transformers import LlamaForCausalLM, AutoTokenizer
from transformers_neuronx.module import save_pretrained_split
from transformers_neuronx.llama.model import LlamaForSampling

if __name__ == "__main__":
    # Script flow: parse arguments, download the HF checkpoint, split and save
    # it, save the tokenizer, copy inference.py, then compile for Neuron.
    arg_parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client arrive as command-line arguments.
    arg_parser.add_argument("--model_id", type=str, default="meta-llama/Llama-2-7b-chat-hf")
    arg_parser.add_argument("--hf_access_token", type=str, required=True)
    arg_parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])

    arg_parser.add_argument("--tp_degree", type=int, default=2)
    arg_parser.add_argument("--n_positions", type=int, default=1024)
    arg_parser.add_argument("--batch_size", type=int, default=1)
    arg_parser.add_argument("--dtype", type=str, default='bf16')

    # Unknown arguments are tolerated and discarded.
    args, _ = arg_parser.parse_known_args()
    os.environ["NEURONX_DUMP_TO"] = os.path.join(args.model_dir, "neuron_cache")
    split_dir = os.path.join(args.model_dir, "llama2-split")

    # --- 1) download the checkpoint from the HF hub -------------------------
    login(args.hf_access_token)
    print("Loading model...")
    started = time.time()
    model = LlamaForCausalLM.from_pretrained(args.model_id)
    print(f"Elapsed: {time.time()-started}s, Spliting and saving...")

    # --- 2) write the split checkpoint into the model directory -------------
    started = time.time()
    save_pretrained_split(model, split_dir)
    print(f"Elapsed: {time.time()-started}s, Done")

    # --- 3) store the tokenizer at the root of the model directory ----------
    print("Saving tokenizer...")
    started = time.time()
    tokenizer = AutoTokenizer.from_pretrained(args.model_id)
    tokenizer.save_pretrained(args.model_dir)
    print(f"Elapsed: {time.time()-started}s, Done")

    # --- 4) ship the inference handler under <model_dir>/code ---------------
    print("Copying inference.py")
    code_dir = os.path.join(args.model_dir, "code")
    os.makedirs(code_dir, exist_ok=True)
    shutil.copy("inference.py", os.path.join(code_dir, "inference.py"))

    # --- 5) compile and save the compiled artifacts --------------------------
    print("Compiling model...")
    started = time.time()
    # `model` is rebound on purpose so the HF model loaded above is not kept
    # referenced once the Neuron model has been created.
    model = LlamaForSampling.from_pretrained(
        split_dir,
        batch_size=args.batch_size,
        amp=args.dtype,
        tp_degree=args.tp_degree,
        n_positions=args.n_positions,
        unroll=None,
    )
    model.to_neuron()
    model._save_compiled_artifacts(os.path.join(args.model_dir, "artifacts"))
    print(f"Compilation time: {time.time()-started}")

### 4.2) Script used by SageMaker to load the model and invoke it as an API

In [None]:
%%writefile src/inference.py
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
import os
import time
import json
import torch
from filelock import Timeout, FileLock

# File lock that model_fn holds for the whole duration of model loading.
# NOTE(review): presumably this serializes loading across worker processes — confirm.
lock_path='/tmp/new_packages.lock'
lock = FileLock(lock_path)

# NEURON_RT_NUM_CORES follows TP_DEGREE, with the same fallback of 8 that
# model_fn uses. os.environ only accepts strings, so the fallback must be the
# string '8': assigning the int 8 raises TypeError when TP_DEGREE is not set.
os.environ['NEURON_RT_NUM_CORES'] = os.environ.get('TP_DEGREE', '8')
# if you're using NeuronSDK version 2.14+ use: --model-type=transformer
os.environ["NEURON_CC_FLAGS"] = "--logfile=/dev/null --model-type=transformer-inference"

from transformers import AutoTokenizer
from transformers_neuronx.llama.model import LlamaForSampling

def model_fn(model_dir, context=None):
    """Load the Neuron Llama model and its tokenizer from ``model_dir``.

    Settings are read from the environment: ``BATCH_SIZE`` (default 1),
    ``TP_DEGREE`` (default 8) and ``DTYPE`` (default ``'bf16'``).

    Args:
        model_dir: Directory holding the ``llama2-split`` checkpoint, the
            optional ``artifacts`` directory and the tokenizer files.
        context: Unused; accepted for handler-signature compatibility.

    Returns:
        A ``(model, tokenizer)`` tuple, as consumed by ``predict_fn``.
    """
    os.environ['NEURONX_DUMP_TO'] = os.path.join(model_dir, "neuron_cache")
    batch_size = int(os.environ.get('BATCH_SIZE', 1))
    tp_degree = int(os.environ.get('TP_DEGREE', 8))
    dtype = os.environ.get('DTYPE', 'bf16')
    artifacts_dir = os.path.join(model_dir, "artifacts")

    print("Waiting for the lock acquire...")
    # The context manager releases the lock even if loading raises; the manual
    # acquire()/release() pair left the lock held on any failure in between.
    with lock:
        t = time.time()
        print("Loading model...")
        model = LlamaForSampling.from_pretrained(
            os.path.join(model_dir, "llama2-split"),
            batch_size=batch_size,
            tp_degree=tp_degree,
            amp=dtype,
        )
        # Reuse the pre-compiled program when the compile job produced one.
        if os.path.isfile(os.path.join(artifacts_dir, "neuron-program.pkl")):
            print("Neuron program found. Loading")
            model._load_compiled_artifacts(artifacts_dir)
        model.to_neuron()
        print(f"Model loaded. Elapsed: {time.time()-t}s")
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    return model, tokenizer

def input_fn(input_data, content_type, context=None):
    """Parse a JSON request body into the tuple consumed by ``predict_fn``.

    Args:
        input_data: JSON document with a mandatory ``prompt`` key and the
            optional keys ``sequence_length`` (default 2048), ``top_k``
            (default 50), ``top_p`` (default 1.0) and ``temperature``
            (default 1.0).
        content_type: MIME type of the request; only ``application/json``
            is supported.
        context: Unused; accepted for handler-signature compatibility.

    Returns:
        ``(prompt, sequence_length, top_k, top_p, temperature)``.

    Raises:
        ValueError: If the prompt is missing or shorter than 3 characters.
        Exception: If ``content_type`` is not ``application/json``.
    """
    if content_type != 'application/json':
        raise Exception(f"Unsupported mime type: {content_type}. Supported: application/json. Expected keys: prompt,optional[sequence_length,top_k,top_p,temperature]")

    req = json.loads(input_data)
    prompt = req.get('prompt')
    if prompt is None or len(prompt) < 3:
        # The original `raise("...")` raised a plain str, which Python rejects
        # with a TypeError that hides the actual validation message.
        raise ValueError("Invalid prompt. Provide an input like: {'prompt': 'text text text'}")

    seq_len = req.get('sequence_length', 2048)
    top_k = req.get('top_k', 50)
    top_p = req.get('top_p', 1.0)
    temperature = req.get('temperature', 1.0)
    return prompt, seq_len, top_k, top_p, temperature

def predict_fn(input_object, model_tokenizer, context=None):
    """Generate text for one parsed request.

    Args:
        input_object: ``(prompt, sequence_length, top_k, top_p, temperature)``
            tuple as returned by ``input_fn``.
        model_tokenizer: ``(model, tokenizer)`` tuple as returned by ``model_fn``.
        context: Unused; accepted for handler-signature compatibility.

    Returns:
        A list with one decoded entry per generated sequence.
    """
    llm, tok = model_tokenizer
    prompt, max_len, k, p, temp = input_object
    token_ids = tok.encode(prompt, return_tensors="pt")
    # run inference with top-k sampling
    started = time.time()
    with torch.inference_mode():
        sequences = llm.sample(
            token_ids,
            sequence_length=max_len,
            top_k=k,
            top_p=p,
            temperature=temp,
        )
        decoded = []
        for sequence in sequences:
            decoded.append(tok.decode(sequence))
        print(f"Pred. elapsed: {time.time()-started}s")
    return decoded

## 5) SageMaker (training) Job that will download, split and compile the model

In [None]:
# Settings shared by the compile job and the endpoint:
#   tp_degree    - number of cores used by the model
#   dtype        - numeric type passed to the model
#   batch_size   - batch size the model is loaded with
#   sentence_len - value passed to the compile job as n_positions
tp_degree, dtype, batch_size, sentence_len = 2, 'bf16', 1, 2048
assert tp_degree in (2, 8), "2 = cheapest option with higher latency; 8 = more efficient with lower latency;"

In [None]:
import json
import logging
from sagemaker.pytorch import PyTorch

# The 32xlarge is only picked when the model uses more than 2 cores.
instance_type='ml.trn1.32xlarge' if tp_degree > 2 else 'ml.trn1.2xlarge'
print(f"Instance type: {instance_type}")
estimator = PyTorch(
    entry_point="split_llama2.py", # Specify your train script
    source_dir="src",
    role=role,
    sagemaker_session=sess,
    instance_count=1,
    instance_type=instance_type,
    output_path=f"s3://{bucket}/output",
    disable_profiler=True,
    disable_output_compression=True,

    image_uri=f"763104351884.dkr.ecr.{region}.amazonaws.com/pytorch-training-neuronx:1.13.1-neuronx-py310-sdk2.13.2-ubuntu20.04",

    volume_size = 128,
    hyperparameters={
        # NOTE(review): the token travels as a plain hyperparameter; consider a
        # secrets store for production use — confirm who can read job settings.
        "hf_access_token": HF_TOKEN,
        "model_id": "meta-llama/Llama-2-7b-chat-hf",
        "tp_degree": tp_degree,
        "n_positions": sentence_len,
        # Forward the same batch size and dtype that the endpoint receives via
        # BATCH_SIZE / DTYPE, so the compiled artifacts match the runtime
        # settings. Previously the script silently used its own defaults.
        "batch_size": batch_size,
        "dtype": dtype,
    }
)
estimator.framework_version = '1.13.1' # workaround when using image_uri

In [None]:
# Run the job configured on `estimator` (entry point: split_llama2.py).
# this takes ~21mins on a trn1.32xlarge and ~40mins on a trn1.2xlarge
estimator.fit()

## 6) Deploy the compiled model to a SageMaker endpoint on inf2
Depending on the size of the deployed instance and the number of cores used by the model (**tp_degree**), SageMaker can launch multiple workers. A worker is a standalone Python process that manages one copy of the model. SageMaker puts a load balancer on top of all these processes and distributes the load automatically for your clients. It means that you can increase throughput by launching multiple workers which serve different clients in parallel.

For instance, if you set **tp_degree** to 8 and deploy your model to a **ml.inf2.48xlarge**, SageMaker can launch 3 workers with 3 copies of the model. This instance has 24 cores and, in this scenario, each copy of the model uses 8 of them. You can then have 3 simultaneous clients invoking the endpoint and being served at the same time.

In [None]:
import logging
from sagemaker.utils import name_from_base
from sagemaker.pytorch.model import PyTorchModel

# The number of cores depends on the inf2 instance size. Every worker uses
# `tp_degree` cores, so the worker count is the instance's core count divided
# by `tp_degree`.

instance_type_idx = 0
## Attention: ml.inf2.xlarge doesn't have enough memory to work with llama7b
instance_types = ['ml.inf2.8xlarge', 'ml.inf2.24xlarge', 'ml.inf2.48xlarge']
num_cores = [2, 12, 24]  # same order as instance_types
selected_instance = instance_types[instance_type_idx]
num_workers = num_cores[instance_type_idx] // tp_degree
assert num_workers > 0, f"Instance {selected_instance} doesn't support tp_degree={tp_degree}"

print(f"Instance type: {selected_instance}. Num SM workers: {num_workers}")

# Environment read by inference.py (TP_DEGREE, BATCH_SIZE, DTYPE).
endpoint_env = {
    'SAGEMAKER_MODEL_SERVER_TIMEOUT' : '3600',
    'TP_DEGREE': str(tp_degree),
    'BATCH_SIZE': str(batch_size),
    'DTYPE': dtype,
}
inference_image = f"763104351884.dkr.ecr.{region}.amazonaws.com/pytorch-inference-neuronx:1.13.1-neuronx-py310-sdk2.13.2-ubuntu20.04"

pytorch_model = PyTorchModel(
    image_uri=inference_image,
    model_data=estimator.model_data,
    role=role,
    name=name_from_base('llama2-7b-chat'),
    sagemaker_session=sess,
    container_log_level=logging.DEBUG,
    model_server_workers=num_workers,
    framework_version="1.13.1",
    env=endpoint_env,
    # for production it is important to define vpc_config and use a vpc_endpoint
    #vpc_config={
    #    'Subnets': ['<SUBNET1>', '<SUBNET2>'],
    #    'SecurityGroupIds': ['<SECURITYGROUP1>', '<DEFAULTSECURITYGROUP>']
    #}
)
# Mark the model as already compiled (private SDK attribute).
pytorch_model._is_compiled_model = True

In [None]:
# Create the endpoint. Both timeouts are set to 600 because it takes some time
# to download all the artifacts and load the model.
startup_timeout = 600
predictor = pytorch_model.deploy(
    instance_type=instance_types[instance_type_idx],
    initial_instance_count=1,
    container_startup_health_check_timeout=startup_timeout,
    model_data_download_timeout=startup_timeout,
)

## 7) Run a simple test to check the endpoint

In [289]:
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer
# Exchange JSON with the endpoint: input_fn in inference.py only accepts
# the application/json content type.
predictor.serializer = JSONSerializer()
predictor.deserializer = JSONDeserializer()

In [303]:
import re
import time
# Send a single prompt and report a rough words-per-second figure.
text="[INST]Hi, my name is Adam. How are you?[/INST]"
t=time.time()
pred = predictor.predict({"prompt": text })[0]
elapsed = time.time()-t
# Keep what follows the last [/INST] and drop a trailing </s> if present.
# DOTALL lets the answer span several lines, and the end marker is optional
# because generation may stop before emitting it. Without both, re.match
# returned None and indexing it raised a TypeError.
match = re.match(r'.*\[/INST\]\s*(.*?)(?:</s>)?\s*$', pred, flags=re.DOTALL)
answer = match[1] if match else pred
# split() without arguments ignores repeated spaces and newlines.
num_words = len(answer.split())
print(f"Num Words: {num_words}, Words/sec: {num_words/elapsed:.04f}, Elapsed time: {elapsed:.04f}s\nAnswer: {answer}")

Num Words: 43, Words/sec: 23.8324, Elapsed time: 1.8043s
Answer: Hello Adam! I'm just an AI, so I don't have feelings or emotions like humans do, but I'm here to help you in any way I can. How can I assist you today? Is there anything you'd like to chat about or ask?


In [305]:
import time
from multiprocessing.pool import ThreadPool
# Send one request per worker concurrently and time the whole batch.
with ThreadPool(num_workers) as pool:
    started = time.time()
    payloads = [{"prompt": text}] * num_workers
    responses = pool.map(predictor.predict, payloads)
    print(responses)
    print(f"Elapsed time: {time.time()-started}")

[["<s> [INST]Hi, my name is Adam. How are you?[/INST]  Hello Adam! I'm just an AI, I don't have feelings or emotions, but I'm here to help you with any questions or tasks you may have. How can I assist you today?</s>"], ['<s> [INST]Hi, my name is Adam. How are you?[/INST]  Hello Adam! I\'m just an AI, I don\'t have feelings or emotions like a human, so I don\'t have a personal experience of being "good" or "bad." However, I\'m here to help you with any questions or tasks you may have, so please feel free to ask me anything!</s>'], ["<s> [INST]Hi, my name is Adam. How are you?[/INST]  Hello Adam! I'm just an AI, I don't have feelings or emotions like humans do, so I don't feel anything in response to your greeting. However, I'm here to help you with any questions or tasks you may have, so feel free to ask me anything!</s>"], ["<s> [INST]Hi, my name is Adam. How are you?[/INST]  Hello Adam! I'm just an AI, I don't have feelings or emotions like humans do, so I can't feel or respond to em