In [None]:
!pip install "sagemaker>=2.123.0"

In [None]:
from datetime import datetime, timedelta
import pandas as pd
import time
import csv
import json
import boto3
import sagemaker

# SageMaker session and the AWS region of the default boto3 session.
sagemaker_session = sagemaker.session.Session()
region = boto3.Session().region_name

# Execution role and the session's default S3 bucket.
role = sagemaker.get_execution_role()
default_bucket = sagemaker_session.default_bucket()

# Clients exposed by the SageMaker session (control plane and runtime).
sagemaker_runtime_client = sagemaker_session.sagemaker_runtime_client
sagemaker_client = sagemaker_session.sagemaker_client

from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer

from sagemaker.clarify import (
    BiasConfig,
    DataConfig,
    ModelConfig,
    ModelPredictedLabelConfig,
    SHAPConfig,
)

from sagemaker.model_monitor import (
    BiasAnalysisConfig,
    CronExpressionGenerator,
    DataCaptureConfig,
    EndpointInput,
    ExplainabilityAnalysisConfig,
    ModelBiasMonitor,
    ModelExplainabilityMonitor,
    DefaultModelMonitor,
    ModelQualityMonitor,
)

from sagemaker.model_monitor.dataset_format import DatasetFormat

from sagemaker.s3 import S3Downloader, S3Uploader

## Configure data capture and generate synthetic traffic

Data quality monitoring automatically monitors machine learning (ML) models in production and notifies you when data quality issues arise. ML models in production have to make predictions on real-life data that is not carefully curated like most training datasets. If the statistical nature of the data that your model receives while in production drifts away from the nature of the baseline data it was trained on, the model begins to lose accuracy in its predictions. Amazon SageMaker Model Monitor uses rules to detect data drift and alerts you when it happens.

### Initialize SageMaker Predictor for real-time requests to previously deployed model endpoint

In [None]:
# Wrap the previously deployed endpoint in a Predictor that serializes request payloads as CSV.
# API reference: https://sagemaker.readthedocs.io/en/stable/api/inference/predictors.html
predictor = Predictor(
    serializer=CSVSerializer(),
    endpoint_name=endpoint_name,
)

In [None]:
# The endpoint was deployed in a prior lab with data capture already enabled, so SageMaker
# created a DataCaptureConfig for it automatically. The config below illustrates how to
# create a custom DataCaptureConfig with capture enabled and use it to update an existing endpoint.
data_capture_config = DataCaptureConfig(
    destination_s3_uri=s3_capture_upload_path,
    sampling_percentage=100,  # capture every request
    enable_capture=True,
)

In [None]:
# Update the existing endpoint with the custom DataCaptureConfig; its destination_s3_uri
# is s3_capture_upload_path, which later cells list to find the capture files.
predictor.update_data_capture_config(data_capture_config)

In [None]:
# Read in the training set (the CSV that includes column headers) for its schema
# and to compute feature attribution baselines.
train_df = pd.read_csv("train-headers.csv")

In [None]:
print("Sending test traffic to the endpoint {}. \nPlease wait...".format(endpoint_name))

# Load the header-less test samples; all rows are passed to the endpoint in one predict() call.
test_sample_df = pd.read_csv(
    "test-samples-no-header.csv",
    index_col=False,
    header=None,
)
response = predictor.predict(data=test_sample_df.to_numpy())

print("Done!")

In [None]:
# Poll the capture location once per second until at least one capture file is listed,
# giving up after CAPTURE_WAIT_SECONDS attempts.
CAPTURE_WAIT_SECONDS = 60

print(f"Waiting {CAPTURE_WAIT_SECONDS} seconds for captures to show up", end="")

for _ in range(CAPTURE_WAIT_SECONDS):
    capture_files = sorted(S3Downloader.list(s3_capture_upload_path))
    if capture_files:
        break
    print(".", end="", flush=True)
    time.sleep(1)
else:
    # The loop ended without finding anything. Fail here with an actionable message
    # rather than letting a later cell hit an IndexError on the empty list.
    raise RuntimeError(
        f"No capture files found under {s3_capture_upload_path} after "
        f"{CAPTURE_WAIT_SECONDS} seconds. Confirm that data capture is enabled on the "
        "endpoint and that traffic was sent, then re-run this cell."
    )

print("\nFound Capture Files:")
print("\n ".join(capture_files[-10:]))

In [None]:
# Keep up to the last ten non-empty lines of the most recent capture file. Filtering out
# blank lines (instead of always discarding the final split element) keeps the last record
# whether or not the file ends with a trailing newline.
capture_file = [
    line for line in S3Downloader.read_file(capture_files[-1]).split("\n") if line.strip()
][-10:]
print(capture_file[-1])

A single captured line is shown below as formatted JSON.

In [None]:
# Parse the last captured line as JSON and pretty-print it with a two-space indent.
print(json.dumps(json.loads(capture_file[-1]), indent=2))

In [None]:
import threading

class WorkerThread(threading.Thread):
    """Thread that calls ``do_run`` repeatedly until :meth:`terminate` is called.

    ``do_run`` is invoked with the thread's termination event as its only
    argument, so the callable can poll the event and return early.
    """

    def __init__(self, do_run, *args, **kwargs):
        """Store the callable; all remaining arguments go to ``threading.Thread``."""
        super().__init__(*args, **kwargs)
        self.__stop_requested = threading.Event()
        self.__work = do_run

    def terminate(self):
        """Ask the run loop to stop; does not wait for the thread to finish."""
        self.__stop_requested.set()

    def run(self):
        """Invoke the stored callable in a loop until termination is requested."""
        stop_requested = self.__stop_requested
        while True:
            if stop_requested.is_set():
                return
            self.__work(stop_requested)

In [None]:
def invoke_endpoint(terminate_event):
    """Send each row of the test CSV to the endpoint, pausing one second per request.

    Every row is sent as a ``text/csv`` payload, with its zero-based row number
    (as a string) as the ``InferenceId``. The pass over the file ends early once
    ``terminate_event`` is set.
    """
    with open("test-samples-no-header.csv", "r") as samples:
        for row_number, line in enumerate(samples):
            result = sagemaker_runtime_client.invoke_endpoint(
                EndpointName=endpoint_name,
                ContentType="text/csv",
                Body=line.rstrip("\n"),
                InferenceId=str(row_number),  # row number within this pass over the file
            )
            # Read the response body; the content itself is not used.
            result["Body"].read()
            time.sleep(1)
            if terminate_event.is_set():
                return


# Keep invoking the endpoint with test data from a background thread. The thread repeats
# invoke_endpoint until invoke_endpoint_thread.terminate() is called.
# NOTE(review): re-running this cell starts an additional thread and drops the reference
# to the previous one — call terminate() on the old thread first.
invoke_endpoint_thread = WorkerThread(do_run=invoke_endpoint)
invoke_endpoint_thread.start()

In [None]:
# Monitor object used below for the data quality baselining job and the monitoring schedule.
data_quality_monitor = DefaultModelMonitor(
    role=role,
    instance_type="ml.m5.xlarge",
    instance_count=1,
    volume_size_in_gb=20,
    max_runtime_in_seconds=3600,  # one hour
)

In [None]:
# The job name carries the current UTC time at minute resolution.
data_quality_baseline_job_name = f"DataQualityBaselineJob-{datetime.utcnow():%Y-%m-%d-%H%M}"

# Run a baselining job over the training set (CSV with a header row).
data_quality_baseline_job = data_quality_monitor.suggest_baseline(
    baseline_dataset="train-headers.csv",
    dataset_format=DatasetFormat.csv(header=True),
    job_name=data_quality_baseline_job_name,
)

# Wait for the job to finish, without streaming its logs.
data_quality_baseline_job.wait(logs=False)

In [None]:
# Flatten the "features" entries of the baseline statistics into a table and preview ten rows.
latest_data_quality_baseline_job = data_quality_monitor.latest_baselining_job
schema_df = pd.json_normalize(
    latest_data_quality_baseline_job.baseline_statistics().body_dict["features"]
)
schema_df.head(10)

In [None]:
# Flatten the "features" entries of the suggested constraints into a table and preview ten rows.
constraints_df = pd.json_normalize(
    latest_data_quality_baseline_job.suggested_constraints().body_dict["features"]
)
constraints_df.head(10)

In [None]:
# Create a data quality monitoring schedule name; the suffix is the current UTC time
# at minute resolution.
data_quality_monitor_schedule_name = (
    f"xgboost-dm-data-monitoring-schedule-{datetime.utcnow():%Y-%m-%d-%H%M}"
)

In [None]:
# Create the EndpointInput for the monitoring schedule: the endpoint to monitor and the
# destination path for its input data.
endpointInput = EndpointInput(
    destination="/opt/ml/processing/input_data",
    endpoint_name=predictor.endpoint_name,
)

In [None]:
# Specify where to write the data quality monitoring results report to.
data_quality_baseline_job_result_uri = f"{s3_baseline_results_path}/data_quality"

response = data_quality_monitor.create_monitoring_schedule(
    monitor_schedule_name=data_quality_monitor_schedule_name,
    endpoint_input=endpointInput,
    output_s3_uri=data_quality_baseline_job_result_uri,
    # Pass both artifacts of the baselining job: the statistics give the schedule the
    # baseline to compare captured data against, the constraints define the checks.
    statistics=latest_data_quality_baseline_job.baseline_statistics(),
    constraints=latest_data_quality_baseline_job.suggested_constraints(),
    # Create the monitoring schedule to execute every hour.
    schedule_cron_expression=CronExpressionGenerator.hourly(),
    enable_cloudwatch_metrics=True,
)

In [None]:
# Describe the monitoring schedule; you will see it in the 'Scheduled' status.
data_quality_monitor.describe_schedule()

In [None]:
# List the endpoint's monitors to check that the default model monitor was created.
predictor.list_monitors()

In [None]:
# Initially there will be no executions, since the first execution happens at the top of the hour.
# Note that it is common for an execution to launch up to 20 minutes after the hour.
executions = data_quality_monitor.list_executions()
executions[:5]

In [None]:
# Stop the background traffic thread. terminate() only sets the stop event; the thread
# exits after its in-flight request and one-second sleep complete.
invoke_endpoint_thread.terminate()

In [None]:
# Clean up: stop and then delete the monitoring schedule of every monitor on the endpoint.
model_monitors = predictor.list_monitors()

for monitor in model_monitors:
    monitor.stop_monitoring_schedule()
    monitor.delete_monitoring_schedule()