# Environment Setup

Imports & AWS Configuration

In [14]:
# imports
import os
import io
import json

import boto3
import pandas as pd

import sagemaker
from sagemaker.model_monitor import DefaultModelMonitor, ModelQualityMonitor
from sagemaker.model_monitor.dataset_format import DatasetFormat

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Initialize AWS clients
# The SageMaker session supplies the default bucket and region that every
# S3 URI and dashboard widget later in the notebook is built from.
sm_sess = sagemaker.Session()
BUCKET = sm_sess.default_bucket()
ROLE = sagemaker.get_execution_role()
REGION = sm_sess.boto_region_name
# Low-level boto3 clients:
#   sm -> describe_transform_job (infrastructure metrics)
#   cw -> put_metric_data / put_dashboard
#   s3 -> get_object / put_object / list_objects_v2
# NOTE(review): these clients use boto3's default configuration rather than
# sm_sess's boto session; confirm both resolve to the same region/credentials.
sm = boto3.client("sagemaker")
cw = boto3.client("cloudwatch")
s3 = boto3.client("s3")

# Echo the resolved region/bucket so a reader can confirm the target account.
print(f"Region: {REGION}")
print(f"Bucket: {BUCKET}")

Region: us-east-1
Bucket: sagemaker-us-east-1-767397858887


Define S3 Paths & Monitoring Configuration

In [2]:
# Monitoring Setup & Configuration (bucket, job, and local dirs)
# DATA_PREFIX is the key prefix under BUCKET that holds the project's data.
DATA_PREFIX = "student-anxiety-ml"
# The batch transform job to monitor; override with the BATCH_JOB_NAME env var,
# otherwise the hard-coded job name below is used.
JOB_NAME = os.environ.get("BATCH_JOB_NAME", "student-anxiety-ml-xgb-batch-1760782623")
# Local scratch directory, created if missing.
# NOTE(review): no other cell visible in this notebook reads LOCAL_DIR.
LOCAL_DIR = "local_artifacts"
os.makedirs(LOCAL_DIR, exist_ok=True)

# S3 URIs
TRAIN_XGB   = f"s3://{BUCKET}/{DATA_PREFIX}/xgb/train/train.csv"          # label-first, no header
BATCH_IN    = f"s3://{BUCKET}/{DATA_PREFIX}/batch-inference/input/"       # features-only, no header
BATCH_OUT   = f"s3://{BUCKET}/{DATA_PREFIX}/batch-predictions/xgb/"       # .out softprob files
PROD_XGB    = f"s3://{BUCKET}/{DATA_PREFIX}/xgb/prod/prod.csv"            # label-first, for GT

# Everything the monitoring steps write (baselines and reports) lives under
# s3://BUCKET/PREFIX/.
PREFIX      = "sm-batch-monitoring"
BASE_S3     = f"s3://{BUCKET}/{PREFIX}"
REPORTS     = f"{BASE_S3}/reports"
BASELINES   = f"{BASE_S3}/baseline"

print(f"Job Name: {JOB_NAME}")
print(f"Reports: {REPORTS}")

Job Name: student-anxiety-ml-xgb-batch-1760782623
Reports: s3://sagemaker-us-east-1-767397858887/sm-batch-monitoring/reports


# Function Definitions

Define S3 Helper Functions for CSV Processing

In [3]:
def s3_split(uri):
    """Split an S3 URI into bucket & key.

    Args:
        uri: String of the form ``s3://bucket/key`` or ``s3://bucket``.

    Returns:
        ``(bucket, key)``. The key is everything after the first ``/`` that
        follows the bucket name; it is ``""`` for a bucket-only URI.

    Raises:
        ValueError: If ``uri`` does not start with ``s3://`` or has no bucket.
    """
    scheme = "s3://"
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not uri.startswith(scheme):
        raise ValueError(f"Not an S3 URI: {uri!r}")

    # partition() never fails to unpack, so "s3://bucket" yields an empty key
    # instead of an opaque "not enough values to unpack" error.
    bucket, _, key = uri[len(scheme):].partition("/")
    if not bucket:
        raise ValueError(f"S3 URI has no bucket: {uri!r}")
    return bucket, key

def download_process_upload_csv(source_uri, dest_key, process_fn):
    """
    Round-trips a header-less CSV through a transform: S3 -> DataFrame -> S3.

    The object at `source_uri` is read fully into memory and parsed with no
    header row, handed to `process_fn`, and the DataFrame it returns is
    written (no header, no index) to `dest_key` in BUCKET.
    Shared by the baseline prep & GT upload steps.

    Returns the S3 URI of the uploaded object.
    """
    # Fetch the raw bytes of the source object
    bucket_name, object_key = s3_split(source_uri)
    s3_object = s3.get_object(Bucket=bucket_name, Key=object_key)
    raw_bytes = s3_object["Body"].read()

    # Parse, then apply the caller's transform
    transformed = process_fn(pd.read_csv(io.BytesIO(raw_bytes), header=None))

    # Serialize & write the result to the default bucket
    payload = transformed.to_csv(header=False, index=False).encode()
    s3.put_object(Bucket=BUCKET, Key=dest_key, Body=payload)

    return f"s3://{BUCKET}/{dest_key}"

Define Data Prep Functions (Baseline, Predictions, Ground Truth)

In [37]:
def ensure_features_only_baseline():
    """
    Builds the features-only dataset used as the data quality baseline.

    The batch input is features-only with no header, while the training CSV
    is label-first, so the baseline is the training data with col 0 removed.
    The stripped copy is written next to the training data in S3.

    Returns the S3 prefix (not the object URI) that holds the stripped CSV.
    """
    # Everything except the first (label) column
    keep_features = lambda frame: frame.iloc[:, 1:]

    download_process_upload_csv(
        TRAIN_XGB,
        f"{DATA_PREFIX}/xgb/train_features_only/train_features_only.csv",
        keep_features,
    )

    # The caller gets the folder, not the file
    return f"s3://{BUCKET}/{DATA_PREFIX}/xgb/train_features_only/"

def _softprob_text_to_labels(text):
    """Convert the text of one .out shard into a list of predicted class indices.

    Each non-blank line is parsed with json.loads and is expected to hold one
    row of per-class scores; the predicted class is the position of the
    largest score in that row. Returns [] when the text has no non-blank lines.
    """
    rows = [json.loads(line) for line in text.splitlines() if line.strip()]
    if not rows:
        # argmax over a 0-column array raises, so short-circuit empty shards.
        return []
    return pd.DataFrame(rows).values.argmax(axis=1).tolist()


def write_pred_labels_from_softprob():
    """
    TL;DR Converts batch transform output to predicted class labels.

    Lists every `.out` object under BATCH_OUT, processes them in sorted key
    order, takes the argmax of each row of class scores, and writes all
    predicted labels as a single one-column CSV (no header, no index) for the
    model quality step. Empty shards contribute no rows.

    NOTE: Make sure the batch job finished before running this, otherwise
    there are no .out files to read.

    Returns:
        S3 URI of the uploaded predicted-labels CSV.

    Raises:
        RuntimeError: If no `.out` files exist under BATCH_OUT, or if all of
            them are empty.
    """
    batch_bucket, batch_prefix = s3_split(BATCH_OUT)

    # list_objects_v2 is paginated; walk every page to collect all shards.
    keys = []
    for page in s3.get_paginator("list_objects_v2").paginate(Bucket=batch_bucket, Prefix=batch_prefix):
        keys += [o["Key"] for o in page.get("Contents", []) if o["Key"].endswith(".out")]

    if not keys:
        raise RuntimeError("No .out files found under BATCH_OUT.")
    # Sorted order keeps the concatenated predictions deterministic.
    keys.sort()

    labels = []
    for k in keys:
        body = s3.get_object(Bucket=batch_bucket, Key=k)["Body"].read().decode('utf-8')
        labels.extend(_softprob_text_to_labels(body))

    if not labels:
        raise RuntimeError("All .out files under BATCH_OUT were empty.")

    preds = pd.DataFrame(labels)

    out_key = f"{DATA_PREFIX}/batch-predictions/xgb_post/preds_labels.csv"
    s3.put_object(Bucket=BUCKET, Key=out_key, Body=preds.to_csv(header=False, index=False).encode())

    return f"s3://{BUCKET}/{out_key}"

def upload_ground_truth_from_prod():
    """
    tl;dr Publishes the ground truth labels used for model quality eval.

    prod.csv is label-first (same layout as the training data), so the
    ground truth is simply its col 0. That single column is written to its
    own CSV in S3.

    Returns the S3 URI of the uploaded ground truth file.
    """
    # Keep only the label column, as a one-column DataFrame
    labels_only = lambda frame: frame[[0]]

    ground_truth_uri = download_process_upload_csv(
        PROD_XGB,
        f"{DATA_PREFIX}/ground-truth/ground_truth.csv",
        labels_only,
    )
    return ground_truth_uri

Define CloudWatch Metrics & Dashboard Creation Functions

In [5]:
def monitor_infra(job_name):
    """Pushes job duration & success metrics to CloudWatch.

    Describes the transform job and publishes to the
    "SageMaker/Batch/Infrastructure" namespace, dimensioned by
    TransformJobName:

    - JobDuration (Seconds): end time minus start time. Only published when
      the job description carries both timestamps.
    - JobSuccess (Count): 1 when the job has an end time and its status is
      "Completed", otherwise 0. A job without an end time is recorded as 0.

    Args:
        job_name: Name of the SageMaker batch transform job.

    Returns:
        None.
    """
    namespace = "SageMaker/Batch/Infrastructure"
    dimensions = [{"Name": "TransformJobName", "Value": job_name}]

    # Get job details
    response = sm.describe_transform_job(TransformJobName=job_name)
    status = response["TransformJobStatus"]

    # Both timestamps are optional in the response, so read them defensively
    # instead of indexing (a missing start time used to raise KeyError).
    start_time = response.get("TransformStartTime")
    end_time = response.get("TransformEndTime")

    metric_data = []

    # Duration is only meaningful when both ends of the interval are known
    if start_time is not None and end_time is not None:
        metric_data.append({
            "MetricName": "JobDuration",
            "Dimensions": dimensions,
            "Value": (end_time - start_time).total_seconds(),
            "Unit": "Seconds"
        })

    # A job with no end time is recorded as not successful
    succeeded = end_time is not None and status == "Completed"
    metric_data.append({
        "MetricName": "JobSuccess",
        "Dimensions": dimensions,
        "Value": 1 if succeeded else 0,
        "Unit": "Count"
    })

    # Single publish call for whichever metrics were collected
    cw.put_metric_data(
        Namespace  = namespace,
        MetricData = metric_data
    )


def create_dashboard(name="SageMaker-Batch-Monitoring-Dashboard", job_name=None):
    """
    Creates CloudWatch dashboard for monitoring batch inference pipeline.

    Shows job duration, job success, and the S3 locations of the monitoring
    reports. We can view it in the CloudWatch console (after running this).

    Args:
        name: Dashboard name passed to CloudWatch put_dashboard.
        job_name: Transform job whose metrics the widgets chart. Defaults to
            the notebook-level JOB_NAME, resolved when the function is called.

    Returns:
        None.
    """
    if job_name is None:
        job_name = JOB_NAME

    def metric_widget(metric_name, stat, title):
        """Build one half-width widget for a SageMaker/Batch/Infrastructure metric."""
        return {
            "type": "metric",
            "width": 12,
            "height": 6,
            "properties": {
                "metrics": [[
                    "SageMaker/Batch/Infrastructure",
                    metric_name,
                    "TransformJobName",
                    job_name
                ]],
                "period": 300,
                "stat": stat,
                "region": REGION,
                "title": title
            }
        }

    # Dashboard configuration
    dashboard_body = {
        "widgets": [
            # Job Duration Widget
            metric_widget("JobDuration", "Average", "Batch Job Duration (s)"),
            # Job Success Widget
            metric_widget("JobSuccess", "Sum", "Batch Job Success"),
            # Reports Summary Widget
            {
                "type": "text",
                "width": 24,
                "height": 4,
                "properties": {
                    "markdown": (
                        f"### Reports\n"
                        f"- Data Quality: `{REPORTS}/data_quality_analysis`\n"
                        f"- Model Quality: `{REPORTS}/quality_analysis`"
                    )
                }
            }
        ]
    }

    cw.put_dashboard(
        DashboardName = name,
        DashboardBody = json.dumps(dashboard_body)
    )

# Monitoring Pipeline Execution

Infrastructure Monitoring - Publish Job Metrics to CloudWatch

In [6]:
# Infrastructure Monitoring - Track job performance
# Publishes JobSuccess (plus JobDuration when the job's timing is available)
# for JOB_NAME to the "SageMaker/Batch/Infrastructure" CloudWatch namespace.
monitor_infra(JOB_NAME)

 Data Quality Monitoring - Create Baseline & Detect Data Drift

In [38]:
# Data Quality Monitoring - Detect drift vs training data (baseline + run)

data_monitor = DefaultModelMonitor(
    role=ROLE,
    instance_count=1,
    instance_type="ml.m5.large",
    sagemaker_session=sm_sess
)

# Output locations, defined once so the jobs below and the downloads at the
# end of the cell cannot drift apart.
baseline_output_uri = f"{BASELINES}/data"
batch_output_uri = f"{REPORTS}/data_quality_analysis"

# Generate baseline statistics
data_monitor.suggest_baseline(
    baseline_dataset=ensure_features_only_baseline(),
    dataset_format=DatasetFormat.csv(header=False),
    output_s3_uri=baseline_output_uri,
    wait=True,
    logs=False
)

# Generate statistics for batch inference data
# (same suggest_baseline job, pointed at the batch input instead of training data)
data_monitor.suggest_baseline(
    baseline_dataset=BATCH_IN,
    dataset_format=DatasetFormat.csv(header=False),
    output_s3_uri=batch_output_uri,
    wait=True,
    logs=False
)

# Download & compare statistics
# Bucket/key are derived from the URIs the jobs wrote to instead of being
# re-typed by hand.
baseline_stats_bucket, baseline_stats_key = s3_split(f"{baseline_output_uri}/statistics.json")
batch_stats_bucket, batch_stats_key = s3_split(f"{batch_output_uri}/statistics.json")

baseline_stats = json.loads(s3.get_object(Bucket=baseline_stats_bucket, Key=baseline_stats_key)["Body"].read())
batch_stats = json.loads(s3.get_object(Bucket=batch_stats_bucket, Key=batch_stats_key)["Body"].read())


INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating processing-job with name baseline-suggestion-job-2025-10-21-07-56-15-318


...........................................................!

INFO:sagemaker:Creating processing-job with name baseline-suggestion-job-2025-10-21-08-01-18-149


...........................................................................!

In [51]:
# Simple drift detection: relative shift of each feature's mean vs the baseline.
# NOTE: a relative shift is unstable for features whose baseline mean is close
# to zero (a tiny absolute change becomes a large percentage).
DRIFT_THRESHOLD_PCT = 10  # flag a feature when its mean moves by more than this

# Index batch features by name once; setdefault keeps the first entry per name.
batch_features_by_name = {}
for batch_entry in batch_stats['features']:
    batch_features_by_name.setdefault(batch_entry['name'], batch_entry)

print("\nData Quality Summary:")
for baseline_feature in baseline_stats['features']:
    feature_name = baseline_feature['name']
    batch_feature = batch_features_by_name.get(feature_name)

    # A feature that vanished from the batch data is worth reporting, not skipping
    if not batch_feature:
        print(f"  {feature_name}: MISSING from batch statistics")
        continue

    # Get mean from numerical_statistics
    baseline_mean = baseline_feature.get('numerical_statistics', {}).get('mean')
    batch_mean = batch_feature.get('numerical_statistics', {}).get('mean')

    if baseline_mean is None or batch_mean is None:
        print(f"  {feature_name}: Non-numerical feature")
        continue

    shift = abs(batch_mean - baseline_mean)
    if baseline_mean != 0:
        drift_pct = shift / abs(baseline_mean) * 100
    else:
        # Relative drift is undefined for a zero baseline: any movement away
        # from zero is flagged instead of being silently reported as 0%.
        drift_pct = float("inf") if shift > 0 else 0.0

    status = "DRIFT" if drift_pct > DRIFT_THRESHOLD_PCT else "OK"
    print(f"  {feature_name}: {status} (baseline: {baseline_mean:.2f}, batch: {batch_mean:.2f}, drift: {drift_pct:.1f}%)")

print(f"\nReports available at:\n{REPORTS}/data_quality_analysis/")


Data Quality Summary:
  _c0: OK (baseline: 15.78, batch: 15.78, drift: 0.0%)
  _c1: DRIFT (baseline: -0.01, batch: -0.01, drift: 46.4%)
  _c2: OK (baseline: 0.48, batch: 0.49, drift: 1.3%)
  _c3: DRIFT (baseline: 0.17, batch: 0.20, drift: 15.6%)
  _c4: DRIFT (baseline: 0.25, batch: 0.27, drift: 11.5%)
  _c5: DRIFT (baseline: 0.25, batch: 0.29, drift: 13.9%)
  _c6: OK (baseline: 0.45, batch: 0.49, drift: 7.7%)
  _c7: OK (baseline: -0.05, batch: -0.05, drift: 0.2%)
  _c8: OK (baseline: 10.14, batch: 10.10, drift: 0.3%)
  _c9: DRIFT (baseline: 0.11, batch: 0.08, drift: 23.3%)
  _c10: OK (baseline: 0.10, batch: 0.09, drift: 5.9%)
  _c11: OK (baseline: 0.20, batch: 0.19, drift: 4.4%)
  _c12: OK (baseline: 13.91, batch: 13.96, drift: 0.4%)
  _c13: OK (baseline: 0.11, batch: 0.11, drift: 7.3%)
  _c14: OK (baseline: 0.06, batch: 0.07, drift: 4.2%)

Reports available at:
s3://sagemaker-us-east-1-767397858887/sm-batch-monitoring/reports/data_quality_analysis/


Model Quality Monitoring - Compare Predictions vs Ground Truth

Note:
Initially tried using `ModelQualityMonitor` directly, but it only works for real-time endpoints, not batch. So we used `sklearn` instead (and formatted the report to match the SageMaker Model Monitor output structure).
Reports stored in S3 following SageMaker conventions.

In [49]:
# Model Quality Monitoring: compare predicted labels (from softmax argmax) to ground truth.

# Process predictions
preds_uri = write_pred_labels_from_softprob()

# Upload ground truth
gt_uri    = upload_ground_truth_from_prod()

# Download predictions & ground truth for analysis
preds_bucket, preds_key = s3_split(preds_uri)
gt_bucket, gt_key = s3_split(gt_uri)

preds_body = s3.get_object(Bucket=preds_bucket, Key=preds_key)["Body"].read()
gt_body = s3.get_object(Bucket=gt_bucket, Key=gt_key)["Body"].read()

# Both files are single-column, header-less CSVs; column 0 holds the labels
y_pred = pd.read_csv(io.BytesIO(preds_body), header=None)[0].values
y_true = pd.read_csv(io.BytesIO(gt_body), header=None)[0].values

# Predictions and labels are matched purely by row position, so a row-count
# mismatch means the batch output and prod.csv are not the same dataset.
if len(y_pred) != len(y_true):
    raise ValueError(
        f"Row count mismatch: {len(y_pred)} predictions vs {len(y_true)} ground truth labels. "
        "Check that the batch output and prod.csv come from the same data."
    )

# Calculate metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
conf_matrix = confusion_matrix(y_true, y_pred)

# Create model quality report
model_quality_report = {
    "multiclass_classification_metrics": {
        "accuracy": {"value": float(accuracy)},
        "weighted_precision": {"value": float(precision)},
        "weighted_recall": {"value": float(recall)},
        "weighted_f1": {"value": float(f1)}
    },
    "confusion_matrix": conf_matrix.tolist(),
    # zero_division=0 matches the metrics above and avoids undefined-metric warnings
    "classification_report": classification_report(y_true, y_pred, output_dict=True, zero_division=0)
}

# Save report to S3
report_key = f"{PREFIX}/reports/quality_analysis/model_quality_metrics.json"
s3.put_object(
    Bucket=BUCKET,
    Key=report_key,
    Body=json.dumps(model_quality_report, indent=2).encode()
)

# Display results
print("\n Model Quality Metrics:")
print(f"  Accuracy:  {accuracy:.4f}")
print(f"  Precision: {precision:.4f}")
print(f"  Recall:    {recall:.4f}")
print(f"  F1 Score:  {f1:.4f}")
print("\n Confusion Matrix:")
print(conf_matrix)
print(f"\nModel quality report:\ns3://{BUCKET}/{report_key}")


 Model Quality Metrics:
  Accuracy:  0.4708
  Precision: 0.4700
  Recall:    0.4708
  F1 Score:  0.4704

Confusion Matrix:
[[1656  661  966]
 [ 627 1752 1012]
 [1003 1047 1321]]

Model quality report:
s3://sagemaker-us-east-1-767397858887/sm-batch-monitoring/reports/quality_analysis/model_quality_metrics.json


Create CloudWatch Dashboard for Visualization & Monitoring

In [50]:
# Final Dashboard Creation
# Creates/overwrites the CloudWatch dashboard (default name) that charts the
# JobDuration and JobSuccess metrics for JOB_NAME and lists the report paths.
create_dashboard()