In [None]:
%pip install sagemaker boto3 botocore --quiet --upgrade

# Inferencing with Hugging Face’s Text Generation Inference (TGI)

In [None]:
import sagemaker, json
from sagemaker.huggingface import get_huggingface_llm_image_uri
from sagemaker.image_uris import retrieve as sagemaker_retrieve_image_uri

def select_container_image(container_selection, region_nm):
    """
    Picks the container image URI (TGI or LMI) based on the choice.

    Args:
        container_selection: "tgi" for the Hugging Face TGI image or "lmi"
            for the DJL LMI image.
        region_nm: AWS region name substituted into the ECR registry host, so
            the image is resolved in the caller's region instead of always
            pointing at us-east-1. Falls back to "us-east-1" when empty/None.

    Returns:
        The full ECR image URI as a string.

    Raises:
        ValueError: If container_selection is neither "tgi" nor "lmi".
    """
    # Repository name and tag for each supported container.
    image_paths = {
        "tgi": "huggingface-pytorch-tgi-inference:2.4.0-tgi3.0.1-gpu-py311-cu124-ubuntu22.04",
        "lmi": "djl-inference:0.31.0-lmi13.0.0-cu124",
    }
    # isinstance guard keeps the original ValueError for non-string choices
    # (an unhashable value would otherwise raise TypeError on the lookup).
    image_path = image_paths.get(container_selection) if isinstance(container_selection, str) else None
    if image_path is None:
        raise ValueError("Only 'tgi' or 'lmi' are allowed.")

    # NOTE(review): registry account 763104351884 is kept from the original
    # URIs; some opt-in, China and GovCloud regions may use a different
    # account or domain suffix — confirm before deploying there.
    region = region_nm or "us-east-1"
    return f"763104351884.dkr.ecr.{region}.amazonaws.com/{image_path}"

def determine_gpu_count(hosting_type):
    """
    Returns how many GPUs to allocate, based on the instance type.

    Args:
        hosting_type: SageMaker instance type string, e.g. "ml.g5.12xlarge".

    Returns:
        4 or 8 for the known multi-GPU instance types listed below, 1 for
        anything else.

    Raises:
        ValueError: If hosting_type is an Inferentia ("inf") instance type,
            which this GPU-oriented code path does not handle.
    """
    # SageMaker instance types carry an "ml." prefix ("ml.inf2.24xlarge"), so
    # checking only for a leading "inf" never matched; accept both spellings.
    if hosting_type.startswith(("ml.inf", "inf")):
        raise ValueError("Inference instance type not supported by this code.")

    four_gpu_types = {
        "ml.g5.12xlarge", "ml.g5.24xlarge", "ml.g6e.24xlarge", "ml.g6e.12xlarge",
        "ml.g6.12xlarge", "ml.g6.24xlarge", "ml.g4dn.12xlarge", "ml.g4ad.16xlarge",
    }
    eight_gpu_types = {
        "ml.p5.48xlarge", "ml.p5e.48xlarge", "ml.p5en.48xlarge",
        "ml.p4d.24xlarge", "ml.p4de.24xlarge", "ml.g6e.48xlarge", "ml.g6.48xlarge",
        "ml.g5.48xlarge",
    }
    if hosting_type in four_gpu_types:
        return 4
    if hosting_type in eight_gpu_types:
        return 8
    # Any instance type not listed above is treated as single-GPU.
    return 1

def create_sagemaker_model(img_uri, model_identifier, gpu_qty):
    """
    Creates a SageMaker Model object with environment variables for the container.

    Args:
        img_uri: Container image URI used for the model.
        model_identifier: Hugging Face model id (e.g. "org/model-name"); it is
            passed to the container as HF_MODEL_ID and its last path segment
            is used to derive the SageMaker model name.
        gpu_qty: Number of GPUs, JSON-encoded into the SM_NUM_GPUS variable.

    Returns:
        A sagemaker.Model built with the notebook's execution role and a new
        sagemaker.Session.
    """
    import re  # local import so the block does not rely on a file-level import

    # All values are strings; they are handed to the container unchanged.
    env_settings = {
        "HF_MODEL_ID": model_identifier,
        "OPTION_MAX_MODEL_LEN": "10000",
        "OPTION_GPU_MEMORY_UTILIZATION": "0.95",
        "OPTION_ENABLE_STREAMING": "false",
        "OPTION_ROLLING_BATCH": "auto",
        "OPTION_MODEL_LOADING_TIMEOUT": "3600",
        "OPTION_PAGED_ATTENTION": "false",
        "OPTION_DTYPE": "fp16",
        "MAX_CONCURRENT_REQUESTS": "10",
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
        "SM_NUM_GPUS": json.dumps(gpu_qty),
    }

    # SageMaker model names accept only alphanumerics and hyphens (max 63
    # characters, no leading/trailing hyphen). Model ids such as
    # "Llama-3.1-8B" contain "." or "_", so normalise before use.
    raw_name = model_identifier.split("/")[-1].lower()
    safe_name = re.sub(r"[^a-z0-9-]+", "-", raw_name)[:63].strip("-")

    return sagemaker.Model(
        image_uri=img_uri,
        env=env_settings,
        role=sagemaker.get_execution_role(),
        # Fall back to an SDK-generated name if nothing usable remains.
        name=safe_name or None,
        sagemaker_session=sagemaker.Session()
    )

def deploy_sagemaker_model(sm_model, endpoint_id, inst_type):
    """
    Deploys the model to a SageMaker endpoint using the given instance type.

    Args:
        sm_model: Model object exposing ``deploy(...)``.
        endpoint_id: Name of the endpoint to create.
        inst_type: Instance type for the single hosting instance.

    Returns:
        The predictor returned by ``sm_model.deploy``. When ``deploy`` returns
        None (a sagemaker.Model created without ``predictor_cls``), a JSON
        Predictor bound to ``endpoint_id`` is returned instead so callers can
        invoke ``predict`` with a dict payload.
    """
    predictor = sm_model.deploy(
        endpoint_name=endpoint_id,
        initial_instance_count=1,
        instance_type=inst_type,
        container_startup_health_check_timeout=600
    )
    if predictor is not None:
        return predictor

    # deploy() yielded no predictor; build one explicitly. Imports are local
    # so this branch is the only part that needs the SDK submodules.
    from sagemaker.predictor import Predictor
    from sagemaker.serializers import JSONSerializer
    from sagemaker.deserializers import JSONDeserializer

    return Predictor(
        endpoint_name=endpoint_id,
        # Reuse the model's session when it has one; None lets the SDK
        # create a default session.
        sagemaker_session=getattr(sm_model, "sagemaker_session", None),
        serializer=JSONSerializer(),
        deserializer=JSONDeserializer(),
    )

def test_sagemaker_endpoint(predictor_obj, user_input):
    """
    Sends a quick query to check the endpoint's response.
    """
    result = predictor_obj.predict({"inputs": user_input})
    print(result)


* select_container_image() picks either TGI or LMI for inference.
* determine_gpu_count() decides how many GPUs to use.
* create_sagemaker_model() sets environment variables for maximum tokens, rolling batch, and more.
* deploy_sagemaker_model() creates and deploys the endpoint.
* test_sagemaker_endpoint() sends a prompt and prints the model’s answer.

### 🛠️ Finally, Configure, Deploy, and Test

In [None]:
# Deployment choices for this run.
container_selection = "tgi"  # or "lmi"
hosting_type = "ml.g5.2xlarge"
my_model_id = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

# Resolve the container image for the current session's region and the GPU
# count for the chosen instance type.
region_name = sagemaker.Session().boto_session.region_name
container_uri = select_container_image(container_selection, region_name)
gpu_num = determine_gpu_count(hosting_type)

# Build the model object, then deploy it to an endpoint named after the model.
model_object = create_sagemaker_model(container_uri, my_model_id, gpu_num)
new_endpoint_name = f"{my_model_id.split('/')[-1]}-endpoint".lower()
endpoint_predictor = deploy_sagemaker_model(model_object, new_endpoint_name, hosting_type)

# Send one prompt to the freshly deployed endpoint and print the reply.
test_sagemaker_endpoint(endpoint_predictor, "What is the meaning of life?")


* container_selection decides TGI or LMI.
* hosting_type picks the instance size.
* gpu_num is the GPU count from determine_gpu_count().
* create_sagemaker_model() builds the SageMaker Model object from the container image and environment settings.
* deploy_sagemaker_model() deploys that model to the endpoint.
* test_sagemaker_endpoint() sends a prompt to confirm correctness.

# Inferentia 2 Deployment Code

For Inferentia 2, similar functions can be used, but change the container to djl-neuronx and the instance type to ml.inf2.xxx.

In [None]:
import boto3
import sagemaker, json
from sagemaker.image_uris import retrieve as sagemaker_retrieve_image_uri
from sagemaker.model import Model

# Helper: Get the inference container URI for Inferentia 2
def get_inferentia_image(region_value):
    """
    Look up the "djl-neuronx" container image URI for ``region_value``.

    Requests version "latest" from the SDK image-URI lookup and returns the
    result unchanged.
    """
    image_uri = sagemaker_retrieve_image_uri(
        framework="djl-neuronx",
        version="latest",
        region=region_value,
    )
    return image_uri

# Helper: Create a SageMaker model object with custom environment settings
def build_inferentia_model(image_uri, hf_model_str, config_dict):
    """
    Build a Model for ``image_uri`` with ``config_dict`` as its environment.

    The model name is the lower-cased last path segment of ``hf_model_str``
    and the role is the current SageMaker execution role.
    """
    model_kwargs = {
        "image_uri": image_uri,
        "env": config_dict,
        "role": sagemaker.get_execution_role(),
        "name": hf_model_str.split("/")[-1].lower(),
    }
    return Model(**model_kwargs)

# Helper: Deploy the model to a SageMaker endpoint
def launch_endpoint(model_obj, inst_type, suffix):
    """
    Deploy ``model_obj`` on one ``inst_type`` instance and return the endpoint name.

    The endpoint name is "<model_obj.name>-<suffix>". The name is printed once
    ``deploy`` returns; the value returned by ``deploy`` itself is discarded.
    """
    endpoint_id = "-".join((str(model_obj.name), str(suffix)))
    deploy_options = {
        "initial_instance_count": 1,
        "instance_type": inst_type,
        "container_startup_health_check_timeout": 1600,
        "endpoint_name": endpoint_id,
    }
    model_obj.deploy(**deploy_options)
    print(f"Deployed endpoint: {endpoint_id}")
    return endpoint_id

# Main execution: resolve the image for the session's region, then build and
# deploy the model on an Inferentia 2 instance.
session_obj = sagemaker.Session()
current_region = session_obj.boto_session.region_name
infer_img_uri = get_inferentia_image(current_region)
print(f"Using inference image: {infer_img_uri}")

hf_model_identifier = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

# Environment passed to the container; every value is a string.
vllm_settings = dict(
    HF_MODEL_ID=hf_model_identifier,
    OPTION_TENSOR_PARALLEL_DEGREE="max",
    HF_TOKEN="",  # left empty here; do not paste a real token into the notebook
    OPTION_ROLLING_BATCH="vllm",
    OPTION_OUTPUT_FORMATTER="json",
    OPTION_MAX_ROLLING_BATCH_SIZE="16",
    OPTION_MODEL_LOADING_TIMEOUT="1600",
)

infer_model = build_inferentia_model(infer_img_uri, hf_model_identifier, vllm_settings)
# endpoint_name is reused by the test cell further down.
endpoint_name = launch_endpoint(infer_model, "ml.inf2.24xlarge", "ep")


##  Test the Deployed Endpoint

In [None]:
import sagemaker

# Sample passage to summarize (ECTSum-style text; swap in your own if desired)
sample_text = """
The ECTSum dataset provides detailed articles for text summarization tasks.
It is used to benchmark large language models for generating concise summaries.
"""

# Chat-formatted prompt: a system instruction, the user request wrapping the
# sample text, and an open assistant turn for the model to complete
prompt_template = f"""
<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>
You are a summarization assistant.
<|eot_id|>

<|start_header_id|>user<|end_header_id|>
Summarize the following text:
{sample_text}
<|eot_id|>

<|start_header_id|>assistant<|end_header_id|>
"""

# JSON-in / JSON-out predictor bound to the endpoint deployed above
predictor = sagemaker.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=session_obj,
    serializer=sagemaker.serializers.JSONSerializer(),
    deserializer=sagemaker.deserializers.JSONDeserializer(),
)

# Sampling settings sent alongside the prompt
generation_params = {
    "do_sample": True,
    "max_new_tokens": 256,
    "top_p": 0.9,
    "temperature": 0.6,
}
request_payload = {"inputs": prompt_template, "parameters": generation_params}

# Invoke the endpoint and print the generated text (or a fallback message)
response = predictor.predict(request_payload)
generated_text = response.get("generated_text", "No output returned.")
print(generated_text)
