### This example shows how to use Model Monitor to manually kick off monitoring jobs
This is necessary when you don't have a SageMaker Endpoint, like when you use SageMaker Edge Manager + CaptureData.

Model Monitor by default only supports tabular data: https://docs.aws.amazon.com/sagemaker/latest/dg/model-monitor.html

However, if you want to create custom computer vision (CV) monitoring, you can bring your own container to Model Monitor: https://docs.aws.amazon.com/sagemaker/latest/dg/model-monitor-byoc-containers.html

In [None]:
# Fetch the gzipped wind turbine dataset and decompress it into data/dataset.csv.
!mkdir -p data
!curl https://spock.cloud/datasets/wind_turbine/dataset.csv.gz |gunzip > data/dataset.csv

In [None]:
import pandas as pd
import numpy as np
import sagemaker
from sagemaker.model_monitor import DefaultModelMonitor, ModelQualityMonitor
from sagemaker.model_monitor.dataset_format import DatasetFormat

# Execution role handed to the monitors created further down in this notebook.
role = sagemaker.get_execution_role()

# Session and default bucket; every S3 path in this notebook is built as
# s3://<bucket>/<prefix>/monitoring/...
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
prefix='windturbine'

# These names only label the synthetic capture paths and the environment of the
# manually created monitoring job (the intro explains there is no real endpoint).
endpoint_name = 'windturbine_no_endpoint'
monitoring_schedule_name = 'monitoring-schedule-windturbine'
variant_name = 'AllTraffic'

In [None]:
# Column names applied to the raw CSV. '_' is a leading column that is dropped
# right after loading.
cols = ['_', 'ts', 'freemem', 'rps', 'voltage',
        'qw', 'qx', 'qy', 'qz', 'gx', 'gy', 'gz', 'ax', 'ay', 'az', 
        'gearboxtemp', 'ambtemp', 'humidity', 'pressure', 'gas' ]

df = pd.read_csv('data/dataset.csv', sep=',', names=cols, low_memory=False)
df = df.drop(['_'], axis=1)
# Interpret the 'ts' values as millisecond durations.
df.ts = pd.to_timedelta(df.ts, unit='ms')
# The baseline file is written without 'ts' and 'freemem' (and without the index);
# `df` itself keeps both columns.
df.drop(['ts', 'freemem'], axis=1).to_csv('data/full_dataset.csv', index=None)
df.head()

### Computing the Data Quality Baseline

In [None]:
# Monitor used for the data quality baseline; the same object is later passed to
# process_monitoring_logs, which reuses the description of this baselining job.
endpoint_monitor = DefaultModelMonitor(
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    volume_size_in_gb=20,
    max_runtime_in_seconds=3600,
)
# Run the baselining job on the CSV written above (which has a header row) and
# block until it finishes. process_monitoring_logs later reads statistics.json and
# constraints.json from under this job's output location.
endpoint_monitor.suggest_baseline(
    baseline_dataset='data/full_dataset.csv',
    dataset_format=DatasetFormat.csv(header=True),
    output_s3_uri='s3://{}/{}/monitoring/data_quality/baseline'.format(bucket, prefix),
    wait=True,
    logs=False
)

### Computing the Model Quality Baseline

In [None]:
# Monitor used to compute the model quality baseline (job capped at 1800 seconds).
model_quality_monitor = ModelQualityMonitor(
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    volume_size_in_gb=20,
    max_runtime_in_seconds=1800,
    sagemaker_session=sagemaker_session
)
# NOTE(review): 'data/full_dataset.csv' as written by this notebook has no
# "prediction" or "label" column; the baseline dataset presumably needs both
# columns for this job to succeed - confirm before running.
model_quality_monitor.suggest_baseline(
    baseline_dataset='data/full_dataset.csv',
    dataset_format=DatasetFormat.csv(header=True),
    # Fixed: the original line read `output_s3_uri = ='s3://...`, a syntax error.
    output_s3_uri='s3://{}/{}/monitoring/model_quality/baseline'.format(bucket, prefix),
    problem_type='Regression',
    inference_attribute="prediction",  # The column in the dataset that contains predictions.
    #probability_attribute= "probability", # The column in the dataset that contains probabilities.
    ground_truth_attribute="label",  # The column in the dataset that contains ground truth labels.
    wait=True,
    logs=True
)

### Generating synthetic (JSON Lines) logs (simulating Data Capture on a real endpoint)

In [None]:
import datetime
import random
import json
import boto3
import io
from uuid import uuid4

# S3 client used by generate_log_file to upload the synthetic capture files.
s3_client = boto3.client('s3')
def generate_log_file(df, num_rows):
    """Build a synthetic capture log and upload it to S3 as a JSON Lines file.

    Each record pairs a randomly sampled row of ``df`` (rendered as one CSV
    line) as the input payload with a fixed placeholder string as the output
    payload. Rows are sampled with replacement.

    The object key is
    ``<prefix>/monitoring/<endpoint_name>/<variant_name>/YYYY/MM/DD/HH/MM-SS-mmm-<uuid>.jsonl``.

    Args:
        df: DataFrame to sample rows from. Must not be empty when
            ``num_rows`` > 0.
        num_rows: Number of records to write into the file.

    Relies on the notebook globals ``prefix``, ``endpoint_name``,
    ``variant_name``, ``bucket`` and ``s3_client``.
    """
    # NOTE(review): today() is local time while the 'Z' suffix used for
    # inferenceTime denotes UTC - the two agree only if the kernel runs in UTC.
    now = datetime.datetime.today()
    # Drop the last three digits of %f: microseconds -> milliseconds.
    suffix = now.strftime("%Y/%m/%d/%H/%M-%S-%f")[:-3]
    key = '%s/monitoring/%s/%s/%s-%s.jsonl' % (
        prefix, endpoint_name, variant_name, suffix, uuid4() )

    data = io.BytesIO()
    for _ in range(num_rows):
        # randrange excludes the upper bound. The previous randint(0, len(df))
        # could return len(df), which made df.iloc[idx] raise IndexError.
        idx = random.randrange(len(df))
        line = ",".join(df.iloc[idx].values.astype(str))
        row = {
            "captureData": {
                "endpointInput": {
                  "observedContentType": "text/csv",
                  "mode": "INPUT",
                  "data": line,
                  "encoding": "CSV"
                },
                "endpointOutput": {
                  "observedContentType": "text/csv; charset=utf-8",
                  "mode": "OUTPUT",
                  "data": "1,2,3,4,5",
                  "encoding": "CSV"
                }
            },
            "eventMetadata": {
                "eventId": str(uuid4()),
                "inferenceTime": now.strftime("%Y-%m-%dT%H:%M:%SZ")
            },
            "eventVersion": "0"
        }
        data.write(("%s\n" % json.dumps(row)).encode('utf-8'))
    # Rewind so the upload starts at the first byte.
    data.seek(0)

    s3_client.upload_fileobj(data, bucket, key)
# Upload one synthetic capture file containing 100 records sampled from df.
generate_log_file(df, 100)

In [None]:
import time
import datetime
import boto3

def process_monitoring_logs(endpoint_monitor):
    """Launch a monitoring processing job over the current hour's capture logs.

    The job is assembled by hand from the description of the monitor's latest
    baselining job (resources, app specification, role) and is pointed at the
    capture files under
    ``s3://<bucket>/<prefix>/monitoring/<endpoint_name>/<variant_name>/YYYY/MM/DD/HH``.
    The call blocks until the waiter reports the job completed or stopped
    (polling every 30 seconds, at most 20 attempts).

    Args:
        endpoint_monitor: Monitor object exposing
            ``describe_latest_baselining_job()``.

    Returns:
        Tuple ``(processing_job_name, output_s3_uri)``.

    Relies on the notebook globals ``endpoint_name``, ``variant_name``,
    ``bucket``, ``prefix`` and ``monitoring_schedule_name``.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"

    def _s3_input(input_name, s3_uri, local_path):
        # One fully replicated, uncompressed, file-mode S3 prefix input.
        return {
            'InputName': input_name,
            'S3Input': {
                'S3Uri': s3_uri,
                'LocalPath': local_path,
                'S3DataType': 'S3Prefix',
                'S3InputMode': 'File',
                'S3CompressionType': 'None',
                'S3DataDistributionType': 'FullyReplicated',
            },
        }

    sagemaker_client = boto3.client('sagemaker')

    # Monitoring window: the clock hour that contains "now".
    window_start = datetime.datetime.today().replace(minute=0, second=0, microsecond=0)
    window_end = window_start + datetime.timedelta(hours=1)
    hour_partition = window_start.strftime("%Y/%m/%d/%H")

    # Metadata of the baselining job: where its outputs live and how it was run.
    baselining_job = endpoint_monitor.describe_latest_baselining_job()
    baseline_uri = baselining_job['ProcessingOutputConfig']['Outputs'][0]['S3Output']['S3Uri']

    capture_subpath = "{}/{}/{}".format(endpoint_name, variant_name, hour_partition)
    # NOTE(review): the job output is written to the same S3 prefix the capture
    # logs are read from - confirm this is intended.
    capture_uri = 's3://{}/{}/monitoring/{}'.format(bucket, prefix, capture_subpath)

    processing_inputs = [
        _s3_input('input_1', capture_uri,
                  '/opt/ml/processing/input/endpoint/{}'.format(capture_subpath)),
        _s3_input('baseline', '%s/statistics.json' % baseline_uri,
                  '/opt/ml/processing/baseline/stats'),
        _s3_input('constraints', '%s/constraints.json' % baseline_uri,
                  '/opt/ml/processing/baseline/constraints'),
    ]
    result_output = {
        'OutputName': 'result',
        'S3Output': {
            "S3Uri": capture_uri,
            "LocalPath": "/opt/ml/processing/output",
            "S3UploadMode": "Continuous",
        },
    }
    environment = {
        'baseline_constraints': '/opt/ml/processing/baseline/constraints/constraints.json',
        'baseline_statistics': '/opt/ml/processing/baseline/stats/statistics.json',
        'dataset_format': '{"sagemakerCaptureJson":{"captureIndexNames":["endpointInput","endpointOutput"]}}',
        'dataset_source': '/opt/ml/processing/input/endpoint',
        'output_path': '/opt/ml/processing/output',
        'publish_cloudwatch_metrics': 'Enabled',
        'sagemaker_monitoring_schedule_name': monitoring_schedule_name,
        'sagemaker_endpoint_name': endpoint_name,
        'start_time': window_start.strftime(timestamp_format),
        'end_time': window_end.strftime(timestamp_format),
    }

    processing_job_name = 'model-monitoring-%s' % time.strftime("%Y%m%d%H%M%S")
    # Key order is kept so the printed payload matches the request being sent.
    job_params = {
        'ProcessingJobName': processing_job_name,
        'ProcessingInputs': processing_inputs,
        'ProcessingOutputConfig': {'Outputs': [result_output]},
        'ProcessingResources': baselining_job['ProcessingResources'],
        'AppSpecification': baselining_job['AppSpecification'],
        'RoleArn': baselining_job['RoleArn'],
        'Environment': environment,
    }
    print(job_params)

    sagemaker_client.create_processing_job(**job_params)
    sagemaker_client.get_waiter('processing_job_completed_or_stopped').wait(
        ProcessingJobName=processing_job_name,
        WaiterConfig={'Delay': 30, 'MaxAttempts': 20},
    )
    return processing_job_name, capture_uri

In [None]:
# The processing job takes roughly 5 minutes to run.
job_name, s3_output = process_monitoring_logs(endpoint_monitor)
# Split 's3://<bucket>/<key prefix>' into ['s3:', '', '<bucket>', '<key prefix>'].
tokens = s3_output.split('/', 3)
# NOTE(review): `df` is rebound here from the dataset to the violations table;
# re-run the data-loading cell before calling generate_log_file(df, ...) again.
df = pd.read_json(sagemaker_session.read_s3_file(tokens[2], '%s/constraint_violations.json' % tokens[3]))
# Flatten the `violations` entries into one column per field.
df = pd.json_normalize(df.violations)
df.head()