# SageMaker Clarify - Get bias metrics and explanations

In [1]:
import sagemaker

sm_version = sagemaker.__version__
if sm_version[0] =="1":
    !pip install sagemaker==2.5.5
    import sagemaker
    
import os
import boto3

# Bucket and key prefix used to build the S3 paths in the cells below.
bucket = "2021-demos"
prefix = "sagemaker/german-data-xgb"

# SageMaker session, and the execution role later handed to the processing job.
session = sagemaker.Session()
role = sagemaker.get_execution_role()

## Get model and endpoint

In [2]:
# Endpoint and model identifiers; the model name is looked up with describe_model in the next cell.
xgb_endpoint_name = 'sm-clarify-german-xgb'
xgb_model_name = 'xgboost-german-model'
aws_region = session.boto_region_name

# Container image used for the analysis job.
# NOTE(review): the active URI is pinned to us-west-2 and does not use `aws_region`;
# confirm the session runs in that region. An alternative image URI is kept commented out below.
# smclarify_image_uri = f'678264136642.dkr.ecr.{aws_region}.amazonaws.com/sagemaker-xai-analyzer:latest'
# Plain string literal: the value has no placeholders, so no f-prefix is needed.
smclarify_image_uri = '306415355426.dkr.ecr.us-west-2.amazonaws.com/sagemaker-clarify-processing:1.0'

# Compute resources requested for the analysis job.
analyzer_instance_count = 1
analyzer_instance_type = 'ml.c5.4xlarge'

In [3]:
# Ask the SageMaker API for the description of the model named in `xgb_model_name`.
xgb_model_desc = session.sagemaker_client.describe_model(
    ModelName=xgb_model_name,
)

## Prepare the dataset

In [4]:
from io import StringIO
import numpy as np
import pandas as pd

# S3 location of the training set; it is also passed to the processing job as the dataset input.
train_data_s3_path = 's3://{}/{}/train/train_data.csv'.format(bucket, prefix)

# Download the raw CSV text. The text gets its own names so that `train_data`, `X` and `y`
# are only ever bound to parsed pandas objects.
train_data_csv = sagemaker.s3.S3Downloader.read_file(train_data_s3_path)
X_csv = sagemaker.s3.S3Downloader.read_file('s3://{}/{}/preprocessed_data/X.csv'.format(bucket, prefix))
y_csv = sagemaker.s3.S3Downloader.read_file('s3://{}/{}/preprocessed_data/y.csv'.format(bucket, prefix))

# All three files are parsed as header-less, comma-separated tables.
train_data = pd.read_csv(StringIO(train_data_csv), header=None, sep=',')
X = pd.read_csv(StringIO(X_csv), header=None, sep=',')
# `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0; squeezing the
# column axis after parsing is the documented replacement and yields a Series for one column.
y = pd.read_csv(StringIO(y_csv), header=None, sep=',').squeeze("columns")

## Clarify analysis

In [5]:
from sagemaker.processing import Processor, ProcessingInput, ProcessingJob, ProcessingOutput

# Processing job definition that runs the analysis container with the resources chosen above.
xai_analyzer = Processor(
    role=role,
    image_uri=smclarify_image_uri,
    instance_count=analyzer_instance_count,
    instance_type=analyzer_instance_type,
    volume_size_in_gb=100,
    max_runtime_in_seconds=1200,  # 20-minute cap on the job
    base_job_name='sm-clarify-analysis',
)

## Specify config and data inputs

In [6]:
# Upload the local analysis configuration file.
# NOTE: no bucket is passed, so the file goes to the session's default bucket, not `bucket`.
analysis_config_path = session.upload_data(
    path='analysis_config.json',
    key_prefix=prefix + "/config",
)
print(f'Config uploaded to: {analysis_config_path}')

# Input channel that places the analysis configuration under input/config in the container.
config_input = ProcessingInput(
    source=analysis_config_path,
    destination="/opt/ml/processing/input/config",
    input_name="analysis_config",
    s3_data_type="S3Prefix",
    s3_input_mode="File",
    s3_compression_type="None",
)

# Input channel that places the training CSV under input/data in the container.
data_input = ProcessingInput(
    source=train_data_s3_path,
    destination="/opt/ml/processing/input/data",
    input_name="dataset",
    s3_data_type="S3Prefix",
    s3_input_mode="File",
    s3_compression_type="None",
)

Config uploaded to: s3://sagemaker-us-west-2-921212210452/sagemaker/german-data-xgb/config/analysis_config.json


## Specify output

In [7]:
# S3 location that receives the analysis results.
analysis_result_path = f's3://{bucket}/{prefix}/german_analysis_result'

# Output channel: the container's output directory is uploaded once, when the job ends.
result_output = ProcessingOutput(
    output_name="analysis_result",
    source='/opt/ml/processing/output',
    destination=analysis_result_path,
    s3_upload_mode="EndOfJob",
)

In [8]:
# Start the processing job with both input channels and wait for it to finish.
xai_analyzer.run(
    inputs=[data_input, config_input],
    outputs=[result_output],
    wait=True,
)


Job Name:  sm-clarify-analysis-2021-02-06-02-12-13-039
Inputs:  [{'InputName': 'dataset', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://2021-demos/sagemaker/german-data-xgb/train/train_data.csv', 'LocalPath': '/opt/ml/processing/input/data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'analysis_config', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-921212210452/sagemaker/german-data-xgb/config/analysis_config.json', 'LocalPath': '/opt/ml/processing/input/config', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'analysis_result', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://2021-demos/sagemaker/german-data-xgb/german_analysis_result', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]
..........................[34mINFO:sagemaker-clarify-proce

## Inspect results

In [9]:
import json

# Read analysis.json from the job's output location, parse it, and pretty-print the result.
analysis_result_json = sagemaker.s3.S3Downloader.read_file(
    f'{analysis_result_path}/analysis.json'
)
analysis_result = json.loads(analysis_result_json)
print(json.dumps(analysis_result, indent=1))

{
 "version": "1.0",
 "post_training_bias_metrics": {
  "label": "Class1Good2Bad",
  "facets": {
   "ForeignWorker": [
    {
     "value_or_threshold": "0",
     "metrics": [
      {
       "name": "AD",
       "description": "Accuracy Difference (AD)",
       "value": -0.04681404421326396
      },
      {
       "name": "CDDPL",
       "description": "Conditional Demographic Disparity in Predicted Labels (CDDPL)",
       "value": -0.036106299295195335
      },
      {
       "name": "DAR",
       "description": "Difference in Acceptance Rates (DAR)",
       "value": -0.050909090909090904
      },
      {
       "name": "DCA",
       "description": "Difference in Conditional Acceptance (DCA)",
       "value": -0.036363636363636376
      },
      {
       "name": "DCR",
       "description": "Difference in Conditional Rejection (DCR)",
       "value": -0.091324200913242
      },
      {
       "name": "DI",
       "description": "Disparate Impact (DI)",
       "value": 1.262873900293255