# Clarify Demo

In [3]:
# Import the SageMaker SDK first so its version can be inspected before use.
import sagemaker

sm_version = sagemaker.__version__
# If a 1.x SDK is installed, install the pinned 2.x release; later cells use
# v2-style names such as sagemaker.inputs.TrainingInput and sagemaker.image_uris.
# NOTE(review): re-importing an already imported module normally does not reload
# it, so a kernel restart may be required after the install - confirm.
if sm_version[0] =="1":
    !pip install sagemaker==2.5.5
    import sagemaker
    
import os
import boto3
from io import StringIO
import pandas as pd

# Session object used below for S3 uploads and for the region lookup.
session = sagemaker.Session()
# Hard-coded execution role ARN; the commented-out line below is the alternative
# that resolves the role from the running environment.
role = 'arn:aws:iam::921212210452:role/service-role/AmazonSageMaker-ExecutionRole-20210202T101127'
# role = sagemaker.get_execution_role()
# S3 bucket and key prefix under which the data splits and model artifacts are stored.
bucket = "2021-demos"
prefix = "sagemaker/german-data-xgb"

Basic training/inference parameters

In [4]:
# Names passed to xgb.deploy() for the SageMaker model and its endpoint.
xgb_model_name = 'xgboost-german-model'
xgb_endpoint_name = 'sm-clarify-german-xgb'

# Training, real-time inference and batch transform all use a single instance
# of the same instance type.
train_instance_count = predictor_instance_count = batch_transform_instance_count = 1
train_instance_type = predictor_instance_type = batch_transform_instance_type = 'ml.c5.4xlarge'

In [5]:
import subprocess

# Make sure the local download directory exists (no error if it already does).
os.makedirs('Data', exist_ok=True)

local_data_path = './Data/german.data'
url="https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
# Fetch the raw file with wget. Arguments are passed as a list (no shell string
# to parse) and check=True raises CalledProcessError on a failed download,
# instead of silently continuing with a missing or empty file.
subprocess.run(['wget', '-O', local_data_path, url], check=True)

# The raw file is space-separated and has no header row.
data = pd.read_csv(local_data_path, header=None, sep=" ")
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 10)

# Details here: https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,4,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,2,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,3,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,4,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,4,A124,53,A143,A153,2,A173,2,A191,A201,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,A14,12,A32,A42,1736,A61,A74,3,A92,A101,4,A121,31,A143,A152,1,A172,1,A191,A201,1
996,A11,30,A32,A41,3857,A61,A73,4,A91,A101,4,A122,40,A143,A152,1,A174,1,A192,A201,1
997,A14,12,A32,A43,804,A61,A75,4,A93,A101,4,A123,38,A143,A152,1,A173,1,A191,A201,1
998,A11,45,A32,A43,1845,A61,A73,4,A93,A101,4,A124,23,A143,A153,1,A173,1,A192,A201,2


In [6]:
def _preprocess_data(german):
    german.columns=["CheckingAC_Status", "MaturityMonths", "CreditHistory", "Purpose", 
                    "LoanAmount", "SavingsAC", "Employment", "InstallmentPctOfIncome", 
                    "SexAndStatus", "OtherDebts", "PresentResidenceYears", "Property", 
                    "Age", "OtherInstallmentPlans", "Housing", "NumExistingLoans", "Job", 
                    "Dependents", "Telephone", "ForeignWorker", "Class1Good2Bad"]
    
    df = pd.DataFrame(2-german.Class1Good2Bad) # Conver to good=1, bad=0
    res = pd.get_dummies(german.CheckingAC_Status) # A11 A12 A13 A14
    df = pd.concat([df,res], axis=1, sort=False)
    df = pd.concat([df,german.MaturityMonths], axis=1, sort=False)
    res = pd.get_dummies(german.CreditHistory) # A30 A31 A32 A33 A34
    df = pd.concat([df,res], axis=1, sort=False)
    res = pd.get_dummies(german.Purpose)
    df = pd.concat([df,res], axis=1, sort=False)
    df = pd.concat([df,german.LoanAmount], axis=1, sort=False)
    res = pd.get_dummies(german.SavingsAC)
    df = pd.concat([df,res], axis=1, sort=False)
    res = pd.get_dummies(german.Employment)
    df = pd.concat([df,res], axis=1, sort=False)
    df = pd.concat([df,german.InstallmentPctOfIncome], axis=1, sort=False)
    res = pd.get_dummies(german.SexAndStatus)
    df = pd.concat([df,res], axis=1, sort=False)
    res = pd.get_dummies(german.OtherDebts)
    df = pd.concat([df,res], axis=1, sort=False)
    df = pd.concat([df,german.PresentResidenceYears], axis=1, sort=False)
    df = pd.concat([df,res], axis=1, sort=False)
    res = pd.get_dummies(german.Property)
    df = pd.concat([df,res], axis=1, sort=False)
    df = pd.concat([df,german.Age], axis=1, sort=False)
    res = pd.get_dummies(german.OtherInstallmentPlans)
    df = pd.concat([df,res], axis=1, sort=False)
    res = pd.get_dummies(german.Housing)
    df = pd.concat([df,res], axis=1, sort=False)
    df = pd.concat([df,german.NumExistingLoans], axis=1, sort=False)
    res = pd.get_dummies(german.Job)
    df = pd.concat([df,res], axis=1, sort=False)
    df = pd.concat([df,german.Dependents], axis=1, sort=False)
    res = pd.DataFrame({'Telephone': german.Telephone.str.slice(3,4).astype(int)-1})
    df = pd.concat([df,res], axis=1, sort=False)
    res = pd.DataFrame({'ForeignWorker': abs(german.ForeignWorker.str.slice(3,4).astype(int)-2)})
    df = pd.concat([df,res], axis=1, sort=False)
    
    print("DF shape {}".format(df.shape))
    print("DF columns: \n{}".format(df.columns))
    
    # Separate X and y in the dataset
    X = df.drop(['Class1Good2Bad'], axis=1)
    y = df.Class1Good2Bad
    print("X shape: {}, y shape: {}".format(X.shape, y.shape))
    
    return X, y, df

# Preprocess the raw table: X holds the features, y the 0/1 label, df both (label first).
X, y, df = _preprocess_data(data)

DF shape (1000, 63)
DF columns: 
Index(['Class1Good2Bad', 'A11', 'A12', 'A13', 'A14', 'MaturityMonths', 'A30',
       'A31', 'A32', 'A33', 'A34', 'A40', 'A41', 'A410', 'A42', 'A43', 'A44',
       'A45', 'A46', 'A48', 'A49', 'LoanAmount', 'A61', 'A62', 'A63', 'A64',
       'A65', 'A71', 'A72', 'A73', 'A74', 'A75', 'InstallmentPctOfIncome',
       'A91', 'A92', 'A93', 'A94', 'A101', 'A102', 'A103',
       'PresentResidenceYears', 'A101', 'A102', 'A103', 'A121', 'A122', 'A123',
       'A124', 'Age', 'A141', 'A142', 'A143', 'A151', 'A152', 'A153',
       'NumExistingLoans', 'A171', 'A172', 'A173', 'A174', 'Dependents',
       'Telephone', 'ForeignWorker'],
      dtype='object')
X shape: (1000, 62), y shape: (1000,)


In [7]:
# Display the preprocessed frame (label column first, then the feature columns).
df

Unnamed: 0,Class1Good2Bad,A11,A12,A13,A14,MaturityMonths,A30,A31,A32,A33,A34,A40,A41,A410,A42,A43,A44,A45,A46,A48,A49,LoanAmount,A61,A62,A63,A64,A65,A71,A72,A73,A74,A75,InstallmentPctOfIncome,A91,A92,A93,A94,A101,A102,A103,PresentResidenceYears,A101.1,A102.1,A103.1,A121,A122,A123,A124,Age,A141,A142,A143,A151,A152,A153,NumExistingLoans,A171,A172,A173,A174,Dependents,Telephone,ForeignWorker
0,1,1,0,0,0,6,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1169,0,0,0,0,1,0,0,0,0,1,4,0,0,1,0,1,0,0,4,1,0,0,1,0,0,0,67,0,0,1,0,1,0,2,0,0,1,0,1,1,1
1,0,0,1,0,0,48,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,5951,1,0,0,0,0,0,0,1,0,0,2,0,1,0,0,1,0,0,2,1,0,0,1,0,0,0,22,0,0,1,0,1,0,1,0,0,1,0,1,0,1
2,1,0,0,0,1,12,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,2096,1,0,0,0,0,0,0,0,1,0,2,0,0,1,0,1,0,0,3,1,0,0,1,0,0,0,49,0,0,1,0,1,0,1,0,1,0,0,2,0,1
3,1,1,0,0,0,42,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,7882,1,0,0,0,0,0,0,0,1,0,2,0,0,1,0,0,0,1,4,0,0,1,0,1,0,0,45,0,0,1,0,0,1,1,0,0,1,0,2,0,1
4,0,1,0,0,0,24,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,4870,1,0,0,0,0,0,0,1,0,0,3,0,0,1,0,1,0,0,4,1,0,0,0,0,0,1,53,0,0,1,0,0,1,2,0,0,1,0,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,0,0,0,1,12,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1736,1,0,0,0,0,0,0,0,1,0,3,0,1,0,0,1,0,0,4,1,0,0,1,0,0,0,31,0,0,1,0,1,0,1,0,1,0,0,1,0,1
996,1,1,0,0,0,30,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,3857,1,0,0,0,0,0,0,1,0,0,4,1,0,0,0,1,0,0,4,1,0,0,0,1,0,0,40,0,0,1,0,1,0,1,0,0,0,1,1,1,1
997,1,0,0,0,1,12,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,804,1,0,0,0,0,0,0,0,0,1,4,0,0,1,0,1,0,0,4,1,0,0,0,0,1,0,38,0,0,1,0,1,0,1,0,0,1,0,1,0,1
998,0,1,0,0,0,45,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1845,1,0,0,0,0,0,0,1,0,0,4,0,0,1,0,1,0,0,4,1,0,0,0,0,0,1,23,0,0,1,0,0,1,1,0,0,1,0,1,1,1


## Split the dataset

In [5]:
# 80% of the rows are sampled for training; the fixed seed makes the split repeatable.
train_data = df.sample(frac=0.8, random_state=200)
# Half of the remaining rows form the validation set. The other half (the
# would-be test split) is not retained.
validation_data = df.drop(train_data.index).sample(frac=0.5, random_state=200)

In [6]:
# Training split: write a headerless CSV (label in the first column) and upload it.
train_file = '/tmp/train_data.csv'
train_data.to_csv(train_file, header=False, index=False)
train_data_s3_path = session.upload_data(path=train_file, bucket=bucket, key_prefix=prefix + "/train")
print('Training data uploaded to: {}'.format(train_data_s3_path))

# Validation split: same format, stored under its own key prefix.
validation_file = '/tmp/validation_data.csv'
validation_data.to_csv(validation_file, header=False, index=False)
validation_data_s3_path = session.upload_data(path=validation_file, bucket=bucket, key_prefix=prefix + "/validation")
print('Validation data uploaded to: {}'.format(validation_data_s3_path))

# Full preprocessed features and labels, exported separately; both files are
# uploaded under the same "preprocessed_data" prefix.
preprocessed_data_file = "/tmp/X.csv"
preprocessed_label_file = "/tmp/y.csv"
X.to_csv(preprocessed_data_file, header=False, index=None)
y.to_csv(preprocessed_label_file, header=False, index=None)
session.upload_data(path=preprocessed_data_file, bucket=bucket, key_prefix=prefix + "/preprocessed_data")
session.upload_data(path=preprocessed_label_file, bucket=bucket, key_prefix=prefix + "/preprocessed_data")

Training data uploaded to: s3://2021-demos/sagemaker/german-data-xgb/train/train_data.csv
Validation data uploaded to: s3://2021-demos/sagemaker/german-data-xgb/validation/validation_data.csv


's3://2021-demos/sagemaker/german-data-xgb/preprocessed_data/y.csv'

## Train the model

In [7]:
from sagemaker.image_uris import retrieve

# Look up the XGBoost container image for the session's region.
xgb_image_uri = retrieve('xgboost', session.boto_region_name, version="latest")

# Estimator wrapping that image; model artifacts are written under <prefix>/xgb_model.
xgb = sagemaker.estimator.Estimator(
    xgb_image_uri,
    role,
    instance_count=train_instance_count,
    instance_type=train_instance_type,
    output_path='s3://{}/{}/{}'.format(bucket, prefix, 'xgb_model'),
    sagemaker_session=session,
)
xgb.set_hyperparameters(objective='binary:logistic', num_round=10, max_depth=7, eta=0.3)

# Training and validation channels point at the CSV objects under the bucket/prefix.
s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data='s3://{}/{}/train/train_data.csv'.format(bucket, prefix),
    content_type='csv',
)
s3_input_validation = sagemaker.inputs.TrainingInput(
    s3_data='s3://{}/{}/validation/validation_data.csv'.format(bucket, prefix),
    content_type='csv',
)

xgb.fit(dict(train=s3_input_train, validation=s3_input_validation))

2021-02-05 07:31:42 Starting - Starting the training job...
2021-02-05 07:32:06 Starting - Launching requested ML instancesProfilerReport-1612510302: InProgress
......
2021-02-05 07:33:07 Starting - Preparing the instances for training......
2021-02-05 07:34:13 Downloading - Downloading input data
2021-02-05 07:34:13 Training - Training image download completed. Training in progress.
2021-02-05 07:34:13 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2021-02-05:07:34:07:INFO] Running standalone xgboost training.[0m
[34m[2021-02-05:07:34:07:INFO] File size need to be processed in the node: 0.11mb. Available memory size in the node: 23234.3mb[0m
[34m[2021-02-05:07:34:07:INFO] Determined delimiter of CSV input is ','[0m
[34m[07:34:08] S3DistributionType set as FullyReplicated[0m
[34m[07:34:08] 800x62 matrix with 49600 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2021-02-05:07:34:08:INFO] Determined delim

## Deploy the model

In [8]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer

# Deploy the trained estimator behind an endpoint that sends requests as CSV
# and parses the responses as CSV.
xgb_predictor = xgb.deploy(initial_instance_count=predictor_instance_count,
                           instance_type=predictor_instance_type,
                           serializer=CSVSerializer(),
                           deserializer=CSVDeserializer(),
                           model_name=xgb_model_name,
                           endpoint_name=xgb_endpoint_name )
# f-string so the actual endpoint name is interpolated (the message previously
# printed the literal placeholder text).
print(f"\nModel is successfully deployed at the endpoint {xgb_endpoint_name}.")

-----------!
Model is successfully deployed at the endpoint {xgb_endpoint_name}.


## Explore the model

In [9]:
import numpy as np

# Fetch the full preprocessed feature CSV from S3 and score it with the endpoint.
train_dataset = sagemaker.s3.S3Downloader.read_file(
    's3://{}/{}/{}/X.csv'.format(bucket, prefix, 'preprocessed_data')
)
probs_list = xgb_predictor.predict(train_dataset)
# Only the first parsed CSV row is used.
# NOTE(review): presumably the endpoint returns every score on a single line - confirm.
probs = np.array(probs_list[0], dtype=float)

# Threshold the probabilities at 0.5 to obtain hard 0/1 predictions.
pred_labels = (probs >= 0.5).astype(dtype=np.int32)

# Ground-truth labels are stored one per line in y.csv.
labels_str = sagemaker.s3.S3Downloader.read_file(
    's3://{}/{}/{}/y.csv'.format(bucket, prefix, 'preprocessed_data')
)
labels = np.fromstring(labels_str, sep='\n')

In [10]:
def plot_roc_curve(labels, pred_labels):
    """Plot the ROC curve of ``pred_labels`` against ``labels`` and show it.

    Args:
        labels: ground-truth binary labels, passed to ``roc_curve`` as the
            first argument.
        pred_labels: values passed to ``roc_curve`` as the second argument.

    Returns:
        None. The figure is displayed with ``plt.show()``; the legend carries
        the area under the curve.
    """
    import matplotlib.pyplot as plt
    from sklearn.metrics import auc, roc_curve

    false_pos_rate, true_pos_rate, _ = roc_curve(labels, pred_labels)
    area = auc(false_pos_rate, true_pos_rate)
    line_width = 2

    plt.figure()
    plt.plot(
        false_pos_rate,
        true_pos_rate,
        color='darkorange',
        lw=line_width,
        label='ROC curve (area = %0.2f)' % area,
    )
    # Dashed diagonal drawn from (0, 0) to (1, 1) as a reference line.
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
from sklearn.metrics import accuracy_score

# NOTE: X.csv / y.csv hold every row of the dataset, including the rows that
# were sampled for training, so these figures are not a held-out estimate.
# accuracy_score takes (y_true, y_pred) and needs the thresholded 0/1 labels.
print("Accuracy score for online predictions", accuracy_score(labels, pred_labels))
# A ROC curve sweeps the decision threshold, so it is fed the raw probabilities;
# thresholded 0/1 labels would collapse it to a single operating point.
plot_roc_curve(labels, probs)

Accuracy score for online predictions 0.924


In [None]:
# Clean up: delete the endpoint behind xgb_predictor once the evaluation is done.
xgb_predictor.delete_endpoint()