# Real Time Inference Endpoint

In [None]:
!pip install -U pandas pandas-profiling scikit-learn sagemaker

## Create the inference script
- Since the model has been trained with good accuracy, we can deploy it.
- First we need to write the four inference handler functions (`model_fn`, `input_fn`, `predict_fn`, `output_fn`) in a .py script
- SageMaker API documentation: https://sagemaker.readthedocs.io/en/stable/api/index.html

In [None]:
%%writefile serve.py

import os
import joblib
import pandas as pd

def model_fn(model_dir):
    """Deserialize the fitted pipeline shipped with the model artifact.

    Args:
        model_dir: Directory that contains ``pipeline_model.joblib``.

    Returns:
        Whatever ``joblib.load`` produces for that file.
    """
    artifact_path = os.path.join(model_dir, "pipeline_model.joblib")
    return joblib.load(artifact_path)

def input_fn(request_body, request_content_type):
    """Parse a JSON Lines request payload into a DataFrame.

    Any additional input pre-processing can also be added in this function.

    Args:
        request_body: Payload as ``str`` or UTF-8 encoded ``bytes``/``bytearray``,
            holding one JSON record per line.
        request_content_type: Content type of the request. Only
            ``"application/json"`` is accepted.

    Returns:
        A ``pandas.DataFrame`` with one row per JSON line.

    Raises:
        ValueError: If ``request_content_type`` is not ``"application/json"``.
    """
    # Imported locally so the function does not depend on a module-level import
    from io import StringIO

    if request_content_type != "application/json":
        raise ValueError("Only application/json content type supported!")

    # Normalize raw bytes to text before handing the payload to pandas
    if isinstance(request_body, (bytes, bytearray)):
        request_body = request_body.decode("utf-8")

    # pandas deprecated passing a literal JSON string to read_json (2.1+);
    # a file-like wrapper works on both old and new versions
    return pd.read_json(StringIO(request_body), lines=True)

def predict_fn(input_object, pipeline_model):
    """Score the parsed input and package labels with per-class probabilities.

    Args:
        input_object: Features to score, as returned by ``input_fn``.
        pipeline_model: Fitted model exposing ``predict`` and ``predict_proba``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``prediction``,
        ``pred_prob_class0`` and ``pred_prob_class1``.
    """
    labels = pipeline_model.predict(input_object)
    probabilities = pipeline_model.predict_proba(input_object)

    # Only the first two probability columns are reported
    columns = {
        "prediction": labels.tolist(),
        "pred_prob_class0": probabilities[:, 0].tolist(),
        "pred_prob_class1": probabilities[:, 1].tolist(),
    }
    return pd.DataFrame(columns)

def output_fn(prediction_object, request_content_type):
    """Serialize the prediction frame as JSON Lines text.

    Args:
        prediction_object: DataFrame of predictions, as returned by ``predict_fn``.
        request_content_type: Not consulted; the output format is always the same.

    Returns:
        A string holding one JSON record per DataFrame row.
    """
    return prediction_object.to_json(orient="records", lines=True)

In [None]:
%%writefile requirements.txt
pandas
numpy

## Real Time Endpoint Deployment
- Supported machines and cost: https://aws.amazon.com/sagemaker/pricing/

In [None]:
# Create the deployment
from sagemaker.sklearn.model import SKLearnModel
from sagemaker import get_execution_role, Session

# The session is only used in this cell to look up the default S3 bucket
session = Session()
bucket = session.default_bucket()

training_job_name = "..." # TODO: Update with best TrainingJobName from hyperparameter tuning
# Assumes the training job wrote its artifact under <bucket>/<job name>/output/ - confirm against your training output path
model_artifact = f"s3://{bucket}/{training_job_name}/output/model.tar.gz"
# Passed as the model's name below; the deployed endpoint's name is read back from the predictor in a later cell
endpoint_name = "heart-disease-rfc-pipeline-real-time"

# serve.py supplies the inference handlers and model_data points at the trained artifact
model = SKLearnModel(
    name=endpoint_name,
    framework_version="1.0-1",
    entry_point="serve.py",
    dependencies=["requirements.txt"],
    model_data=model_artifact,
    role=get_execution_role(),
)

In [None]:
# Deploy the model to an endpoint backed by a single ml.t2.medium instance
predictor = model.deploy(instance_type="ml.t2.medium", initial_instance_count=1)

In [None]:
# Keep the name of the endpoint that was just created; the invoke and cleanup cells use it
endpoint_name = predictor.endpoint_name
print("Endpoint name:", endpoint_name, sep="\n")

## Invoke the model
- boto3 documentation: https://boto3.amazonaws.com/v1/documentation/api/latest/index.html

In [None]:
# Load some data that we want to make predictions on
import pandas as pd
test_df = pd.read_csv("...") # TODO: Paste the S3 path to your test.csv

# Separate the features from the "target" label column
X_test = test_df.drop("target", axis=1)
y_test = test_df["target"]

# Get two rows to make predictions on
# Serialized as one JSON record per line, the format serve.py's input_fn parses with lines=True
X_pred = X_test.head(2).to_json(orient="records", lines=True)
X_pred

In [None]:
# Submit to the endpoint
import boto3

# Invocations go through the "sagemaker-runtime" client (the cleanup cell uses the separate "sagemaker" client)
sm_runtime = boto3.client("sagemaker-runtime")

# ContentType must be "application/json": serve.py's input_fn rejects anything else
response = sm_runtime.invoke_endpoint(EndpointName=endpoint_name, 
                                      Body=X_pred, 
                                      ContentType="application/json", 
                                      Accept="application/json")

In [None]:
# Decode the response from the endpoint
from io import StringIO

# The body is a stream: read it once and decode the bytes to text
response_body = response['Body']
response_str = response_body.read().decode('utf-8')

# The endpoint returns one JSON record per line. The text is wrapped in StringIO
# because passing a literal JSON string to read_json is deprecated since pandas 2.1.
response_df = pd.read_json(StringIO(response_str), lines=True)

response_df

## Cleanup
- Delete the endpoint
- Delete the endpoint config
- Delete the model

In [None]:
import boto3

def cleanup(endpoint_name):
    """Delete an endpoint together with its endpoint config and model(s).

    The endpoint config is looked up through the endpoint itself, so this also
    works when the config does not share the endpoint's name. Every model
    referenced by the config's production variants is deleted.

    Args:
        endpoint_name: Name of the SageMaker endpoint to tear down.
    """
    sm_client = boto3.client("sagemaker")

    # Resolve the config attached to the endpoint instead of assuming it is
    # named like the endpoint
    endpoint = sm_client.describe_endpoint(EndpointName=endpoint_name)
    endpoint_config_name = endpoint["EndpointConfigName"]
    endpoint_config = sm_client.describe_endpoint_config(
        EndpointConfigName=endpoint_config_name
    )

    # Collect the model names up front (de-duplicated, order preserved);
    # the config is gone by the time the models are deleted
    model_names = list(
        dict.fromkeys(
            variant["ModelName"] for variant in endpoint_config["ProductionVariants"]
        )
    )

    # Delete the endpoint
    sm_client.delete_endpoint(EndpointName=endpoint_name)

    # Delete the endpoint config
    sm_client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)

    # Delete the model(s)
    for model_name in model_names:
        sm_client.delete_model(ModelName=model_name)

In [None]:
# Run the cleanup: removes the real-time endpoint, its endpoint config and its model
cleanup(endpoint_name)

# Multi-Model Endpoint
- You can deploy more than one model to the same physical machine to save on costs!
- Supported machines and cost: https://aws.amazon.com/sagemaker/pricing/

In [None]:
# Create the multi-model deployment
from sagemaker.sklearn.model import SKLearnModel
from sagemaker.multidatamodel import MultiDataModel
from sagemaker import Session, get_execution_role

session = Session()
bucket = session.default_bucket()

# One name serves as the multi-model's name and as the S3 prefix for its model artifacts
mme_name = "heart-disease-models"
mme_model_data_prefix = f"s3://{bucket}/{mme_name}"

# Define the base model
# No model_data here: it carries the framework version and serve.py only; artifacts are added in a later cell
base_model = SKLearnModel(
    framework_version="1.0-1",
    entry_point="serve.py",
    dependencies=["requirements.txt"],
    model_data=None,
    role=get_execution_role(),
    sagemaker_session = session
)

# Define the multi-model
# Wraps the base model and points it at the shared S3 prefix
mme = MultiDataModel(
    name = mme_name,
    model_data_prefix = mme_model_data_prefix,
    model = base_model,
    sagemaker_session = session
)

In [None]:
# Deploy the empty multi-model!
# No model artifacts have been added yet at this point; that happens in the next cell
predictor = mme.deploy(instance_type="ml.t2.large", initial_instance_count=1)

In [None]:
# Add the first model to the multi-model
training_job_name = "..." # TODO: Update with best TrainingJobName from hyperparameter tuning
model_artifact = f"s3://{bucket}/{training_job_name}/output/model.tar.gz"
# split('/')[-3] is the training job name segment of the artifact URI; the _1 suffix keeps the two entries distinct
model_name = f"{model_artifact.split('/')[-3]}_1.tar.gz"

# model_data_source is the existing artifact; model_data_path is the name it gets within the multi-model
mme.add_model(model_data_source=model_artifact, model_data_path=model_name)

# Add the second model to the multi-model
# Same steps as above with the _2 suffix; training_job_name, model_artifact and model_name are overwritten
training_job_name = "..." # TODO: Update with best TrainingJobName from hyperparameter tuning
model_artifact = f"s3://{bucket}/{training_job_name}/output/model.tar.gz"
model_name = f"{model_artifact.split('/')[-3]}_2.tar.gz"

mme.add_model(model_data_source=model_artifact, model_data_path=model_name)

In [None]:
# Check if all the models have been deployed
# Materialized with list() so the cell output shows the individual model entries
list(mme.list_models())

## Invoke the models
- boto3 documentation: https://boto3.amazonaws.com/v1/documentation/api/latest/index.html

In [None]:
# Load some data that we want to make predictions on
import pandas as pd

test_df = pd.read_csv("...") # TODO: Paste the S3 path to your test.csv

# Separate the features from the "target" label column
X_test = test_df.drop("target", axis=1)
y_test = test_df["target"]

# Get two rows to make predictions on
# Serialized as one JSON record per line, the format serve.py's input_fn parses with lines=True
X_pred = X_test.head(2).to_json(orient="records", lines=True)
X_pred

In [None]:
# Submit to the endpoint
from io import StringIO

import boto3

sm_runtime = boto3.client("sagemaker-runtime")
# Assumes the endpoint was created under the multi-model's name
# (the deploy cell passes no explicit endpoint name)
endpoint_name = mme_name

# Predictions from each model
# TargetModel selects which model behind the shared endpoint handles the request
for model_name in mme.list_models():
    response = sm_runtime.invoke_endpoint(EndpointName=endpoint_name,
                                          TargetModel=model_name,
                                          Body=X_pred,
                                          ContentType="application/json",
                                          Accept="application/json")

    # Decode the response from the endpoint
    response_body = response['Body']
    response_str = response_body.read().decode('utf-8')
    # Wrapped in StringIO because passing a literal JSON string to read_json
    # is deprecated since pandas 2.1
    response_df = pd.read_json(StringIO(response_str), lines=True)

    print(model_name)
    print(response_df)

## Cleanup
- Delete the endpoint
- Delete the endpoint config
- Delete the model

In [None]:
# Run the cleanup
# Uses the cleanup() helper defined in the real-time section; endpoint_name was set to mme_name in the invoke cell
cleanup(endpoint_name)

# Serverless
- Most cost effective option for real time inference
- Only runs when there is traffic, so the first prediction after an idle period has a small extra delay (cold start)

In [None]:
# Create the deployment
from sagemaker.sklearn.model import SKLearnModel
from sagemaker import Session, get_execution_role

# The session is only used in this cell to look up the default S3 bucket
session = Session()
bucket = session.default_bucket()

training_job_name = "..." # TODO: Update with best TrainingJobName from hyperparameter tuning
# Assumes the training job wrote its artifact under <bucket>/<job name>/output/ - confirm against your training output path
model_artifact = f"s3://{bucket}/{training_job_name}/output/model.tar.gz"
# Passed as the model's name below. It is named for the serverless deployment
# so it is not mislabeled as (and cannot collide with) the real-time model.
endpoint_name = "heart-disease-rfc-pipeline-serverless"

# serve.py supplies the inference handlers and model_data points at the trained artifact
model = SKLearnModel(
    name=endpoint_name,
    framework_version="1.0-1",
    entry_point="serve.py",
    dependencies=["requirements.txt"],
    model_data=model_artifact,
    role=get_execution_role(),
)

In [None]:
# NEW! Create a config for serverless inference
from sagemaker.serverless import ServerlessInferenceConfig
# 1024 MB of memory and at most 4 concurrent invocations for the serverless endpoint
serverless_config = ServerlessInferenceConfig(memory_size_in_mb=1024, max_concurrency=4)

In [None]:
# NEW! Deploy!
# The serverless config takes the place of the instance_type / initial_instance_count arguments used in the earlier deployments
predictor = model.deploy(serverless_inference_config=serverless_config)

In [None]:
# Keep the name of the endpoint that was just created; the invoke and cleanup cells use it
endpoint_name = predictor.endpoint_name
print("Endpoint name:", endpoint_name, sep="\n")

## Invoke the model
- boto3 documentation: https://boto3.amazonaws.com/v1/documentation/api/latest/index.html

In [None]:
# Load some data that we want to make predictions on
import pandas as pd

test_df = pd.read_csv("...") # TODO: Paste the S3 path to your test.csv

# Separate the features from the "target" label column
X_test = test_df.drop("target", axis=1)
y_test = test_df["target"]

# Get two rows to make predictions on
# Serialized as one JSON record per line, the format serve.py's input_fn parses with lines=True
X_pred = X_test.head(2).to_json(orient="records", lines=True)
X_pred

In [None]:
# Submit to the endpoint
import boto3

# Same invocation call as for the instance-backed endpoint
sm_runtime = boto3.client("sagemaker-runtime")

# ContentType must be "application/json": serve.py's input_fn rejects anything else
response = sm_runtime.invoke_endpoint(EndpointName=endpoint_name, 
                                      Body=X_pred, 
                                      ContentType="application/json", 
                                      Accept="application/json")

In [None]:
# Decode the response from the endpoint
from io import StringIO

# The body is a stream: read it once and decode the bytes to text
response_body = response['Body']
response_str = response_body.read().decode('utf-8')

# The endpoint returns one JSON record per line. The text is wrapped in StringIO
# because passing a literal JSON string to read_json is deprecated since pandas 2.1.
response_df = pd.read_json(StringIO(response_str), lines=True)

response_df

## Cleanup
- Delete the endpoint
- Delete the endpoint config
- Delete the model

In [None]:
# Run the cleanup
# Uses the cleanup() helper defined in the real-time section, here with the serverless endpoint's name
cleanup(endpoint_name)