### Mixtral Deployment on Amazon SageMaker

Mixtral is an open LLM from Mistral AI. 

### Prerequisites

## Set up the development environment

In [4]:

## Requires sagemaker>=2.199.0. To upgrade, run: !pip install "sagemaker>=2.199.0" --upgrade --quiet
import sagemaker
import boto3

# Bootstrap session: only used here to look up the default bucket.
sess = sagemaker.Session()

# Bucket used for uploading data, models and logs.
# SageMaker will automatically create this bucket if it does not exist.
# Leave as None to fall back to the session's default bucket.
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Prefer the execution role attached to the current environment; when
# get_execution_role() raises ValueError, look the role up by name through IAM.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role_description = iam.get_role(RoleName="sagemaker_execution_role")
    role = role_description["Role"]["Arn"]

# Final session, pinned to the bucket chosen above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sess.boto_region_name}")



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker role arn: arn:aws:iam::762797705265:role/service-role/AmazonSageMaker-ExecutionRole-20230323T105883
sagemaker session region: us-west-2


## Retrieve the HF LLM DLC

In [5]:
# COMMENT IN WHEN PR (https://github.com/aws/sagemaker-python-sdk/pull/4314) IS MERGED
# from sagemaker.huggingface import get_huggingface_llm_image_uri

# # retrieve the llm image uri
# llm_image = get_huggingface_llm_image_uri(
#   "huggingface",
#   version="1.3.1"
# )

# AWS account that hosts the Hugging Face LLM container registry in each region.
region_mapping = {
    "af-south-1": "626614931356",
    "il-central-1": "780543022126",
    "ap-east-1": "871362719292",
    "ap-northeast-1": "763104351884",
    "ap-northeast-2": "763104351884",
    "ap-northeast-3": "364406365360",
    "ap-south-1": "763104351884",
    "ap-south-2": "772153158452",
    "ap-southeast-1": "763104351884",
    "ap-southeast-2": "763104351884",
    "ap-southeast-3": "907027046896",
    "ap-southeast-4": "457447274322",
    "ca-central-1": "763104351884",
    "cn-north-1": "727897471807",
    "cn-northwest-1": "727897471807",
    "eu-central-1": "763104351884",
    "eu-central-2": "380420809688",
    "eu-north-1": "763104351884",
    "eu-west-1": "763104351884",
    "eu-west-2": "763104351884",
    "eu-west-3": "763104351884",
    "eu-south-1": "692866216735",
    "eu-south-2": "503227376785",
    "me-south-1": "217643126080",
    "me-central-1": "914824155844",
    "sa-east-1": "763104351884",
    "us-east-1": "763104351884",
    "us-east-2": "763104351884",
    "us-gov-east-1": "446045086412",
    "us-gov-west-1": "442386744353",
    "us-iso-east-1": "886529160074",
    "us-isob-east-1": "094389454867",
    "us-west-1": "763104351884",
    "us-west-2": "763104351884",
}

# Registry hostnames end in a partition-specific DNS suffix. Regions whose name
# starts with one of these prefixes do not live under "amazonaws.com", so a
# hard-coded suffix would produce an unreachable image URI for them.
ecr_dns_suffix_by_region_prefix = {
    "cn-": "amazonaws.com.cn",
    "us-iso-": "c2s.ic.gov",
    "us-isob-": "sc2s.sgov.gov",
}

llm_image_repository = "huggingface-pytorch-tgi-inference"
llm_image_tag = "2.1.1-tgi1.3.1-gpu-py310-cu121-ubuntu20.04-v1.0"


def _build_llm_image_uri(region):
    """Return the ECR URI of the Hugging Face TGI container for ``region``.

    Args:
        region: AWS region name, e.g. ``"us-west-2"``.

    Returns:
        The fully qualified image URI (registry host, repository and tag).

    Raises:
        KeyError: if ``region`` has no entry in ``region_mapping``. The message
            lists the supported regions instead of echoing only the bad key.
    """
    if region not in region_mapping:
        raise KeyError(
            f"No Hugging Face LLM container registry is known for region '{region}'. "
            f"Supported regions: {', '.join(sorted(region_mapping))}"
        )
    dns_suffix = next(
        (
            suffix
            for prefix, suffix in ecr_dns_suffix_by_region_prefix.items()
            if region.startswith(prefix)
        ),
        "amazonaws.com",  # default suffix for every other listed region
    )
    registry_host = f"{region_mapping[region]}.dkr.ecr.{region}.{dns_suffix}"
    return f"{registry_host}/{llm_image_repository}:{llm_image_tag}"


llm_image = _build_llm_image_uri(sess.boto_region_name)

# print ecr image uri
print(f"llm image uri: {llm_image}")


llm image uri: 763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi1.3.1-gpu-py310-cu121-ubuntu20.04-v1.0


## Deploy Mixtral 8x7B to Amazon SageMaker

In [9]:
import json
from sagemaker.huggingface import HuggingFaceModel

# Hosting hardware and container start-up budget
instance_type = "ml.g5.48xlarge"
number_of_gpu = 8
health_check_timeout = 600  # seconds (10 minutes)

# Token budgets passed to the container; serialized to strings below.
token_limits = {
    "MAX_INPUT_LENGTH": 24000,  # Max length of input text
    "MAX_BATCH_PREFILL_TOKENS": 32000,  # Number of tokens for the prefill operation
    "MAX_TOTAL_TOKENS": 32000,  # Max length of the generation (including input text)
    "MAX_BATCH_TOTAL_TOKENS": 512000,  # Tokens that can be processed in parallel during generation
}

# Environment variables for the model container (every value is a string)
config = {
    "HF_MODEL_ID": "mistralai/Mixtral-8x7B-Instruct-v0.1",  # model_id from hf.co/models
    "SM_NUM_GPUS": json.dumps(number_of_gpu),  # Number of GPU used per replica
    **{name: json.dumps(limit) for name, limit in token_limits.items()},
    # "HF_MODEL_QUANTIZE": "awq",  # comment in to quantize not supported yet
}

# Model definition bound to the container image resolved earlier
llm_model = HuggingFaceModel(
    image_uri=llm_image,
    env=config,
    role=role,
)


In [10]:
# Deploy model to an endpoint
# https://sagemaker.readthedocs.io/en/stable/api/inference/model.html#sagemaker.model.Model.deploy
deploy_options = {
    "initial_instance_count": 1,
    "instance_type": instance_type,
    # 10 minutes to be able to load the model
    "container_startup_health_check_timeout": health_check_timeout,
}
llm = llm_model.deploy(**deploy_options)


----------------!

## Run an inference and have a conversation with the model

`mistralai/Mixtral-8x7B-Instruct-v0.1` is a conversational chat model, meaning we can chat with it using the following prompt format:

``<s> [INST] User Instruction 1 [/INST] Model answer 1</s> [INST] User instruction 2 [/INST]``

In [24]:
# Prompt to generate: a single user turn wrapped in the instruction tags
user_instruction = "What are some of the best activites at Big Bend National Park? List 10."
prompt = "<s> [INST] " + user_instruction + " [/INST] "

# Generation arguments
payload = dict(
    do_sample=True,
    top_p=0.6,
    temperature=0.9,
    top_k=50,
    max_new_tokens=1024,
    repetition_penalty=1.03,
    return_full_text=False,
    stop=["</s>"],
)


In [25]:
# Send the prompt together with its generation arguments to the endpoint
request_body = {"inputs": prompt, "parameters": payload}
chat = llm.predict(request_body)

# Show the text of the first returned generation
first_generation = chat[0]
print(first_generation["generated_text"])


1. Hiking: Big Bend National Park has over 150 miles of hiking trails, ranging from easy walks to strenuous backcountry treks. Some popular hikes include the Window Trail, the South Rim Trail, and the Lost Mine Trail.

2. Backpacking: For those looking for a more remote and rugged experience, Big Bend offers numerous backpacking opportunities. Permits are required for overnight trips, and rangers recommend that all backpackers be well-prepared and experienced.

3. Scenic Drives: The park has several scenic drives, including the Ross Maxwell Scenic Drive, which offers views of the Chisos Mountains, the Santa Elena Canyon, and the Rio Grande.

4. River Trips: The Rio Grande runs through the park, offering opportunities for rafting, canoeing, and kayaking. Several companies offer guided trips, or visitors can bring their own equipment and launch from one of the park's designated river access points.

5. Stargazing: Big Bend is known for its dark skies, making it an excellent spot for star

## Clean up

In [None]:
# Both calls are left commented out so that running this cell does not remove
# the model and endpoint behind `llm`. Uncomment and run them once you are
# finished with the endpoint.
# NOTE(review): presumably the endpoint keeps incurring charges until it is
# deleted - confirm against your account's billing.
# llm.delete_model()
# llm.delete_endpoint()

---------------------------------------------------------------------------
UnexpectedStatusException                 Traceback (most recent call last)
Cell In[7], line 3
      1 # Deploy model to an endpoint
      2 # https://sagemaker.readthedocs.io/en/stable/api/inference/model.html#sagemaker.model.Model.deploy
----> 3 llm = llm_model.deploy(
      4   initial_instance_count=1,
      5   instance_type=instance_type,
      6   container_startup_health_check_timeout=health_check_timeout, # 10 minutes to be able to load the model
      7 )

File /opt/conda/lib/python3.10/site-packages/sagemaker/huggingface/model.py:315, in HuggingFaceModel.deploy(self, initial_instance_count, instance_type, serializer, deserializer, accelerator_type, endpoint_name, tags, kms_key, wait, data_capture_config, async_inference_config, serverless_inference_config, volume_size, model_data_download_timeout, container_startup_health_check_timeout, inference_recommendation_id, explainer_config, **kwargs)
    308     inference_tool = "neuron" if instance_type.startswith("ml.inf1") else "neuronx"
    309     self.image_uri = self.serving_image_uri(
    310         region_name=self.sagemaker_session.boto_session.region_name,
    311         instance_type=instance_type,
    312         inference_tool=inference_tool,
    313     )
--> 315 return super(HuggingFaceModel, self).deploy(
    316     initial_instance_count,
    317     instance_type,
    318     serializer,
    319     deserializer,
    320     accelerator_type,
    321     endpoint_name,
    322     tags,
    323     kms_key,
    324     wait,
    325     data_capture_config,
    326     async_inference_config,
    327     serverless_inference_config,
    328     volume_size=volume_size,
    329     model_data_download_timeout=model_data_download_timeout,
    330     container_startup_health_check_timeout=container_startup_health_check_timeout,
    331     inference_recommendation_id=inference_recommendation_id,
    332     explainer_config=explainer_config,
    333     endpoint_logging=kwargs.get("endpoint_logging", False),
    334     endpoint_type=kwargs.get("endpoint_type", None),
    335     resources=kwargs.get("resources", None),
    336     managed_instance_scaling=kwargs.get("managed_instance_scaling", None),
    337 )

File /opt/conda/lib/python3.10/site-packages/sagemaker/model.py:1653, in Model.deploy(self, initial_instance_count, instance_type, serializer, deserializer, accelerator_type, endpoint_name, tags, kms_key, wait, data_capture_config, async_inference_config, serverless_inference_config, volume_size, model_data_download_timeout, container_startup_health_check_timeout, inference_recommendation_id, explainer_config, accept_eula, endpoint_logging, resources, endpoint_type, managed_instance_scaling, **kwargs)
   1650 if is_explainer_enabled:
   1651     explainer_config_dict = explainer_config._to_request_dict()
-> 1653 self.sagemaker_session.endpoint_from_production_variants(
   1654     name=self.endpoint_name,
   1655     production_variants=[production_variant],
   1656     tags=tags,
   1657     kms_key=kms_key,
   1658     wait=wait,
   1659     data_capture_config_dict=data_capture_config_dict,
   1660     explainer_config_dict=explainer_config_dict,
   1661     async_inference_config_dict=async_inference_config_dict,
   1662     live_logging=endpoint_logging,
   1663 )
   1665 if self.predictor_cls:
   1666     predictor = self.predictor_cls(self.endpoint_name, self.sagemaker_session)

File /opt/conda/lib/python3.10/site-packages/sagemaker/session.py:5331, in Session.endpoint_from_production_variants(self, name, production_variants, tags, kms_key, wait, data_capture_config_dict, async_inference_config_dict, explainer_config_dict, live_logging, vpc_config, enable_network_isolation, role)
   5328 LOGGER.info("Creating endpoint-config with name %s", name)
   5329 self.sagemaker_client.create_endpoint_config(**config_options)
-> 5331 return self.create_endpoint(
   5332     endpoint_name=name,
   5333     config_name=name,
   5334     tags=endpoint_tags,
   5335     wait=wait,
   5336     live_logging=live_logging,
   5337 )

File /opt/conda/lib/python3.10/site-packages/sagemaker/session.py:4242, in Session.create_endpoint(self, endpoint_name, config_name, tags, wait, live_logging)
   4238 self.sagemaker_client.create_endpoint(
   4239     EndpointName=endpoint_name, EndpointConfigName=config_name, Tags=tags
   4240 )
   4241 if wait:
-> 4242     self.wait_for_endpoint(endpoint_name, live_logging=live_logging)
   4243 return endpoint_name

File /opt/conda/lib/python3.10/site-packages/sagemaker/session.py:4974, in Session.wait_for_endpoint(self, endpoint, poll, live_logging)
   4968     if "CapacityError" in str(reason):
   4969         raise exceptions.CapacityError(
   4970             message=message,
   4971             allowed_statuses=["InService"],
   4972             actual_status=status,
   4973         )
-> 4974     raise exceptions.UnexpectedStatusException(
   4975         message=message,
   4976         allowed_statuses=["InService"],
   4977         actual_status=status,
   4978     )
   4979 return desc

UnexpectedStatusException: Error hosting endpoint huggingface-pytorch-tgi-inference-2023-12-14-21-18-04-022: Failed. Reason: The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint.. Try changing the instance type or reference the troubleshooting page https://docs.aws.amazon.com/sagemaker/latest/dg/async-inference-troubleshooting.html