### Deploy model in SageMaker

In [1]:
import os
import boto3

# Select the AWS CLI profile that boto3 resolves credentials from.
os.environ['AWS_PROFILE'] = "default"

# Check the boto session: fail fast with a readable message when no
# credentials can be resolved. The key material itself is never printed.
boto_sess = boto3.Session()
credentials = boto_sess.get_credentials()
if credentials is None:
    raise RuntimeError(
        "No AWS credentials could be resolved for profile "
        f"{os.environ['AWS_PROFILE']!r}; configure them before continuing."
    )

# Create the clients
sm_client = boto3.client("sagemaker")
iam_client = boto3.client("iam")

# Replace with your actual role name
role_name = "SageMaker-smaker_cli"

# Look up the execution role and keep only its ARN; later cells pass
# `role` to SageMaker when creating the model.
role_response = iam_client.get_role(RoleName=role_name)
role = role_response["Role"]["Arn"]
print("Role ARN:", role)


# List SageMaker endpoints to verify connection
response = sm_client.list_endpoints()
print(response)

Role ARN: arn:aws:iam::673671551738:role/service-role/SageMaker-smaker_cli
{'Endpoints': [], 'ResponseMetadata': {'RequestId': 'cf94fe61-70ae-45b3-ad71-813554d77741', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'cf94fe61-70ae-45b3-ad71-813554d77741', 'content-type': 'application/x-amz-json-1.1', 'content-length': '16', 'date': 'Mon, 16 Jun 2025 16:29:10 GMT'}, 'RetryAttempts': 0}}


In [2]:
# Use Sagemaker SDK to create a session
import sagemaker

# Create the SageMaker session; `region` is reused later when resolving
# the serving container image.
sess = sagemaker.Session()
# Use the public attribute (the same one printed below) rather than the
# private `_region_name`, which is an implementation detail of the SDK.
region = sess.boto_region_name

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/xdg-ubuntu/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/nachiketa/.config/sagemaker/config.yaml
sagemaker role arn: arn:aws:iam::673671551738:role/service-role/SageMaker-smaker_cli
sagemaker bucket: sagemaker-us-east-1-673671551738
sagemaker session region: us-east-1


In [3]:
import tarfile
model_dir = "/home/nachiketa/Documents/Workspaces/checkpoints/deepseekmath/base"  # Path to your model directory
tarball_name = "deepseekmath.tar.gz"
model_bucket = "deepseek-math-7b"
s3_key = "models/deepseekmath.tar.gz"

# --- Step 1: Compress the model directory ---
# Store the directory contents at the archive root (arcname=".") instead of
# nesting them under the directory's basename: the archive is later handed
# to HuggingFaceModel as `model_data`, which expects config/weights at the
# top level of model.tar.gz.
with tarfile.open(tarball_name, "w:gz") as tar:
    tar.add(model_dir, arcname=".")

print(f"✅ Compressed {model_dir} into {tarball_name}")

✅ Compressed /home/nachiketa/Documents/Workspaces/checkpoints/deepseekmath/base into deepseekmath.tar.gz


In [4]:
# Push the compressed model archive to the model bucket.
s3 = boto3.client("s3")
s3.upload_file(
    Filename=tarball_name,
    Bucket=model_bucket,
    Key=s3_key,
)

print(f"🚀 Uploaded to s3://{model_bucket}/{s3_key}")

🚀 Uploaded to s3://deepseek-math-7b/models/deepseekmath.tar.gz


In [5]:
# Show where the session's default bucket would place the serving code.
bucket = sess.default_bucket()
print(bucket)
s3_location = "s3://{}/djl-serving/".format(bucket)
print("Sagemaker default bucket : {}".format(s3_location))

# The default is only shown for reference; the custom bucket below is the
# one the remaining cells actually use.
bucket = "deepseek-math-repo"
s3_location = "s3://{}/djl-serving/".format(bucket)
print("Sagemaker custom bucket : {}".format(s3_location))

sagemaker-us-east-1-673671551738
Sagemaker default bucket : s3://sagemaker-us-east-1-673671551738/djl-serving/
Sagemaker custom bucket : s3://deepseek-math-repo/djl-serving/


In [6]:
# Get the uri of the DJL-Deepspeed image
from sagemaker import image_uris

# Resolve the container image for the DJL DeepSpeed framework in the
# session's region.
img_uri = image_uris.retrieve(
    framework="djl-deepspeed",
    version="0.21.0",
    region=region,
)
print(img_uri)

763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.21.0-deepspeed0.8.3-cu117


In [7]:
from sagemaker.huggingface import HuggingFaceModel
# Import torch under its own name: the previous alias `tf` is the
# conventional alias for TensorFlow and misleads readers.
import torch
print(torch.__version__)
import transformers as trn
print(trn.__version__)

# Create the Hugging Face model object from the archive uploaded earlier.
# The versions below describe the serving container, not the local
# environment printed above.
huggingface_model = HuggingFaceModel(
   model_data="s3://deepseek-math-7b/models/deepseekmath.tar.gz",  # path to your trained sagemaker model
   role=role, # iam role with permissions to create an Endpoint
   transformers_version="4.26", # transformers version used
   pytorch_version="1.13", # pytorch version used
   py_version="py39", # python version of the DLC
)

2.5.1+cu121
4.51.3


  from .autonotebook import tqdm as notebook_tqdm


### Create the inference script that loads the model

In [8]:
%%writefile model.py

import os
from typing import Optional

import deepspeed
import torch
from djl_python import Input, Output
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# Module-level cache for the text-generation pipeline; handle() fills it on
# the first call via get_model() and reuses it on every later call.
predictor = None


def get_model(properties):
    """Build the DeepSpeed-accelerated text-generation pipeline.

    Args:
        properties: Mapping of serving properties passed in by the model
            server. ``tensor_parallel_degree`` is required; ``model_id`` is
            used as the model location when present, otherwise ``model_dir``.

    Returns:
        A ``transformers`` text-generation pipeline wrapping the DeepSpeed
        inference engine, placed on the device given by ``LOCAL_RANK``.
    """
    # from_pretrained() needs a local directory or a Hub model id; it cannot
    # read an "s3://" URI directly.
    # NOTE(review): relies on the serving container exposing "model_id" /
    # "model_dir" in the properties — confirm for the image in use.
    if "model_id" in properties:
        model_location = properties["model_id"]
    else:
        model_location = properties["model_dir"]

    # Properties arrive from a text file, so coerce the degree to int before
    # handing it to DeepSpeed as mp_size.
    tensor_parallel = int(properties["tensor_parallel_degree"])
    local_rank = int(os.getenv("LOCAL_RANK", "0"))

    model = AutoModelForCausalLM.from_pretrained(
        model_location, torch_dtype=torch.float32
    )
    tokenizer = AutoTokenizer.from_pretrained(model_location)

    # Shard the loaded torch model itself. The SageMaker SDK's
    # HuggingFaceModel (and `role`) are notebook-side objects that are not
    # defined inside this script and are not torch modules.
    model = deepspeed.init_inference(
        model,
        mp_size=tensor_parallel,
        dtype=model.dtype,
        replace_method="auto",
        replace_with_kernel_inject=True,
    )
    generator = pipeline(
        task="text-generation", model=model, tokenizer=tokenizer, device=local_rank
    )
    return generator


def handle(inputs: Input) -> Optional[Output]:
    """Serve one request from the model server.

    Args:
        inputs: The incoming request; its properties are used to build the
            pipeline on first use and its body is read as a string prompt.

    Returns:
        ``None`` for an empty (warm-up) request, otherwise an ``Output``
        carrying the pipeline's generation result.
    """
    global predictor
    # Build the pipeline once and cache it; compare against None explicitly
    # rather than relying on the pipeline object's truthiness.
    if predictor is None:
        predictor = get_model(inputs.get_properties())

    if inputs.is_empty():
        # Model server makes an empty call to warmup the model on startup
        return None

    data = inputs.get_as_string()
    result = predictor(data, do_sample=True, max_new_tokens=256)
    return Output().add(result)

Writing model.py


### Serving properties

In [9]:
%%writefile serving.properties
# Serving engine selection (presumably consumed by the DJL Serving container — confirm).
engine = DeepSpeed
# Read in model.py's get_model() as properties["tensor_parallel_degree"] and
# passed to deepspeed.init_inference as mp_size.
option.tensor_parallel_degree = 2

Writing serving.properties


### Compress the model script and serving properties

In [10]:
%%sh
# Always start from a fresh staging directory.
if [ -d dsk-r ]; then rm -r -d dsk-r; fi

# Stage the handler script and its properties, then archive the directory.
mkdir -p dsk-r
mv model.py serving.properties dsk-r
tar -c -z -v -f dsk-r.tar.gz dsk-r/
# The archive is uploaded to S3 from Python in a later cell.

dsk-r/
dsk-r/serving.properties
dsk-r/model.py


In [11]:
# Send the packaged serving code to the chosen S3 prefix and keep the URI
# that the uploader returns.
model_tar_url = sagemaker.s3.S3Uploader.upload(
    local_path="dsk-r.tar.gz",
    desired_s3_uri=s3_location,
)

### Create SageMaker endpoint

In [12]:
from datetime import datetime

# A timestamp suffix keeps resource names unique across runs; the same
# `time_stamp` is reused for the endpoint config and endpoint names.
time_stamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
model_name = "dsk-r-" + time_stamp

# `model_tar_url` is the URI returned by the S3 upload cell. It is used as-is
# instead of a hand-typed URL so that bucket, region and key always match
# what was actually uploaded. `sm_client` is the SageMaker client created in
# the first cell.
create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer={"Image": img_uri, "ModelDataUrl": model_tar_url},
)

### Create an endpoint configuration that SageMaker hosting services use to deploy models

In [13]:
# ModelDataDownloadTimeoutInSeconds and ContainerStartupHealthCheckTimeoutInSeconds
# are configured to accommodate the large size of our model.
variant_name = "AllTraffic"
instance_type = "ml.g5.12xlarge"
initial_instance_count = 1
endpoint_config_name = f"t-j-config-{time_stamp}"

# Single production variant that receives all traffic.
production_variants = [
    dict(
        VariantName=variant_name,
        ModelName=model_name,
        InitialInstanceCount=initial_instance_count,
        InstanceType=instance_type,
        ModelDataDownloadTimeoutInSeconds=900,
        ContainerStartupHealthCheckTimeoutInSeconds=1800,
    )
]

endpoint_config = dict(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=production_variants,
)

ep_conf_res = sm_client.create_endpoint_config(**endpoint_config)

### Create an endpoint using the model and the endpoint configuration created in the steps above

In [14]:
# The creation of the SageMaker endpoint might take a while.
endpoint_name = "dsk-r" + time_stamp
ep_res = sm_client.create_endpoint(
    EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name
)

# create_endpoint only starts the deployment. Block here until the endpoint
# reports InService so that the invocation cell works under "Run All"; the
# waiter raises if the endpoint ends up in a failed state.
sm_client.get_waiter("endpoint_in_service").wait(EndpointName=endpoint_name)

### Test the endpoint

In [11]:
import json

# Runtime client used for inference calls against the deployed endpoint.
client = boto3.client("sagemaker-runtime")

# Plain-text prompt sent as the request body, with its MIME type.
payload = "Amazon.com is the best"
content_type = "text/plain"

response = client.invoke_endpoint(
    Body=payload,
    ContentType=content_type,
    EndpointName=endpoint_name,
)
print(response["Body"].read())