# Deploy Models

In this notebook we deploy the [`GPT-J-6B`](https://huggingface.co/EleutherAI/gpt-j-6b) model to an Amazon SageMaker endpoint, where it serves as the LLM used for generating embeddings.

In [None]:
!pip install --upgrade sagemaker --quiet

In [None]:
import sys
import time
import logging
import sagemaker, boto3, json
from sagemaker.model import Model
from sagemaker.session import Session
from sagemaker.predictor import Predictor
from sagemaker.utils import name_from_base
from sagemaker import image_uris, model_uris, script_uris, hyperparameters

In [None]:
# Notebook-wide constants.
# APP_NAME is the prefix of every endpoint name generated in the deployment cell.
APP_NAME: str = "qa-w-rag"
# model_id of the config entry whose endpoint is looked up and stored as the
# embedding endpoint at the end of the notebook.
EMBEDDING_MODEL: str = "huggingface-textembedding-gpt-j-6b"

In [None]:
# Configure root logging once: comma-separated records at INFO level and above,
# written to stderr. Then keep a handle to the root logger for the cells below.
logging.basicConfig(
    stream=sys.stderr,
    level=logging.INFO,
    format='%(asctime)s,%(module)s,%(processName)s,%(levelname)s,%(message)s',
)
logger = logging.getLogger()

In [None]:
# A single SageMaker session is created here and shared under both names used
# in this notebook.
sagemaker_session = Session()
# ARN of the identity the session runs as; it is passed as `role` when the
# models are created in the deployment cell.
aws_role = sagemaker_session.get_caller_identity_arn()
aws_region = boto3.Session().region_name
# `sess` is kept for backward compatibility, but as an alias of the session
# above instead of a second, redundant Session() instance.
sess = sagemaker_session
# NOTE(review): not referenced by the cells visible here; the per-model version
# actually used for deployment comes from MODEL_CONFIG_LIST.
model_version = "*"
logger.info(f"aws_role={aws_role}, aws_region={aws_region}")

In [None]:
# One entry per model to deploy. The deployment cell reads every key below and
# later adds an "endpoint_name" key to each entry once it has been deployed.
MODEL_CONFIG_LIST = [
    {
        # Reference the constant instead of repeating the literal, so the entry
        # can never drift from the id used to look up the embedding endpoint.
        "model_id": EMBEDDING_MODEL,
        # presumably "*" selects the latest available version — confirm against the SDK docs
        "model_version": "*",
        "instance_type": "ml.g5.24xlarge",
        "instance_count": 1,
        # Environment variables handed to the model container.
        # NOTE(review): looks like a TorchServe workers-per-model setting — confirm.
        "env": {"TS_DEFAULT_WORKERS_PER_MODEL": "2"},
    }
]

In [None]:
# ANSI escape sequences used to highlight the success message in the log output.
newline, bold, unbold = "\n", "\033[1m", "\033[0m"

# Deploy every configured model to its own endpoint and record the endpoint
# name on the config entry so later cells can look it up.
# NOTE: each run generates fresh endpoint names, so re-running this cell
# deploys new endpoints rather than reusing existing ones.
for model in MODEL_CONFIG_LIST:
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments during a long deployment.
    start = time.perf_counter()
    endpoint_name = name_from_base(f"{APP_NAME}-{model['model_id']}")
    logger.info(f"going to deploy model={model}, endpoint_name={endpoint_name}")
    # Retrieve the inference container uri. This is the base HuggingFace container image for the default model above.
    # NOTE(review): region is left to the SDK default rather than aws_region — confirm this is intended.
    deploy_image_uri = image_uris.retrieve(
        region=None,
        framework=None,  # automatically inferred from model_id
        image_scope="inference",
        model_id=model['model_id'],
        model_version=model['model_version'],
        instance_type=model['instance_type'],
    )
    # Retrieve the model uri.
    model_uri = model_uris.retrieve(
        model_id=model['model_id'], model_version=model['model_version'], model_scope="inference"
    )
    logger.info(f"deploy_image_uri={deploy_image_uri}, model_uri={model_uri}")
    model_inference = Model(
        image_uri=deploy_image_uri,
        model_data=model_uri,
        role=aws_role,
        predictor_cls=model.get("predictor_cls"),
        # The model is registered under the same name as its endpoint.
        name=endpoint_name,
        # "env" is optional, like "predictor_cls": a config entry without it
        # must not abort the deployment with a KeyError.
        env=model.get("env"),
        # Reuse the notebook's session instead of letting the SDK build a new one.
        sagemaker_session=sagemaker_session,
    )
    model_predictor_inference = model_inference.deploy(
        initial_instance_count=model['instance_count'],
        instance_type=model['instance_type'],
        predictor_cls=model.get("predictor_cls"),
        endpoint_name=endpoint_name,
    )
    time_taken = time.perf_counter() - start
    logger.info(f"{bold}model={model['model_id']} has been deployed successfully at endpoint={endpoint_name}, took {time_taken:.1f} seconds{unbold}{newline}")
    # Remember where this model lives; the lookup cell below relies on this key.
    model["endpoint_name"] = endpoint_name

In [None]:
# Find the endpoint of the config entry whose model_id is EMBEDDING_MODEL.
# If several entries match, the last one wins.
embedding_model_endpoint_name = None
for model in MODEL_CONFIG_LIST:
    if model['model_id'] == EMBEDDING_MODEL:
        # .get(): the entry only carries "endpoint_name" after the deployment
        # cell has run for it; a missing key is reported below instead of
        # surfacing as a bare KeyError.
        embedding_model_endpoint_name = model.get('endpoint_name')
        logger.info(f"EMBEDDING_MODEL={EMBEDDING_MODEL},   embedding_model_endpoint_name={embedding_model_endpoint_name}")

# Fail fast rather than let a None endpoint name be persisted for later use.
if embedding_model_endpoint_name is None:
    raise ValueError(
        f"no deployed endpoint found for EMBEDDING_MODEL={EMBEDDING_MODEL}; "
        "run the deployment cell and check that MODEL_CONFIG_LIST contains this model_id"
    )


In [None]:
# Persist the endpoint name with IPython's %store magic so that another
# notebook/kernel can restore it with `%store -r embedding_model_endpoint_name`.
%store embedding_model_endpoint_name