## A. Deploy DeepSeek-R1 Distill Model on Amazon SageMaker via HuggingFaceModel API
> 1. Instance type : **ml.g5.12xlarge**

In [None]:
# %pip install "sagemaker>=2.163.0"

In [None]:
import sagemaker
from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri
import time

# Open a SageMaker session, then capture the region it is bound to and the
# execution role; both are reused by the cells below.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
region = sagemaker_session.boto_region_name

# Echo both values (one per line) so the notebook output records them.
print(region, role, sep="\n")

In [None]:
# Look up the Hugging Face LLM container image URI for the session's region
llm_image_lookup = {"backend": "huggingface", "region": region}
image_uri = get_huggingface_llm_image_uri(**llm_image_lookup)

image_uri

In [None]:
# Create the Hugging Face Model
# The name prefix mirrors the model that is actually deployed (the 8B Llama
# distill); the UTC timestamp suffix keeps repeated runs from colliding.
model_name = "deepseek-8b-" + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

# Environment variables handed to the serving container.
hub = {
    # Model repository to serve
    'HF_MODEL_ID': 'deepseek-ai/DeepSeek-R1-Distill-Llama-8B',
    # The model is a causal LM invoked for generation, not extractive QA
    'HF_TASK': 'text-generation',
    # Number of GPUs to use; ml.g5.12xlarge (used at deploy time) has 4
    'SM_NUM_GPUS': '4',
}

model = HuggingFaceModel(
    name=model_name,
    env=hub,
    role=role,
    image_uri=image_uri,
)

model_name

In [None]:
%%time
# Deploy the model to a SageMaker endpoint that reuses the model's name
endpoint_settings = dict(
    initial_instance_count=1,
    instance_type="ml.g5.12xlarge",
    endpoint_name=model_name,
)
predictor = model.deploy(**endpoint_settings)

predictor

In [None]:
%%time
# Sample inference through the predictor returned by model.deploy()
# Sampling settings sent along with the prompt
generation_params = dict(
    do_sample=True,
    top_p=0.7,
    temperature=0.6,
    max_new_tokens=1024,
)

# Assemble the request and send it; the result is the cell's output
sample_request = {"inputs": "What is DeepSeek R1?", "parameters": generation_params}
predictor.predict(sample_request)

## B. Invoke the deployed SageMaker endpoint for prediction

In [None]:
import boto3
import json

# Name of an already-deployed SageMaker endpoint. Replace the placeholder
# string with your endpoint's name (section A uses `model_name` for it).
endpoint_name = "<sagemaker-endpoint-name>"

# Low-level client for the SageMaker Runtime service (used to invoke endpoints)
sm_runtime = boto3.client("sagemaker-runtime")
sm_runtime

In [None]:
# Serialize the request (prompt plus sampling parameters) to a JSON string
sampling_options = dict(do_sample=True, top_p=0.9, temperature=0.6, max_new_tokens=1024)
request_payload = {
    "inputs": "How is DeepSeek R1 performance compared to Claude 3.5 Sonnet?",
    "parameters": sampling_options,
}
predict_body = json.dumps(request_payload)
predict_body

In [None]:
%%time

# Send the JSON request to the hosted endpoint, then read the response body,
# decode it to text and print it.
response = sm_runtime.invoke_endpoint(
    Body=predict_body,
    ContentType="application/json",
    EndpointName=endpoint_name,
)
response_str = response["Body"].read().decode("utf-8")
print(response_str)

In [None]:
# Cleaning up: delete the SageMaker model and the endpoint behind `predictor`.
# NOTE(review): keep this order — delete_model() presumably resolves the model
# through the still-existing endpoint; confirm against the SageMaker SDK docs.
predictor.delete_model()
predictor.delete_endpoint()