# Clarify Demo

In [None]:
import sagemaker

# If a 1.x SageMaker SDK is installed, install the pinned 2.5.5 release instead.
sm_version = sagemaker.__version__
if sm_version[0] =="1":
    !pip install sagemaker==2.5.5
    # NOTE(review): `sagemaker` is already imported at this point, so this
    # statement re-binds the module object cached by the first import; a
    # kernel restart may be needed before the upgraded SDK is in use - confirm.
    import sagemaker
    
import os
import boto3
from io import StringIO
import pandas as pd

session = sagemaker.Session()

# IAM role passed to the Estimator in the training cell; it has to be defined
# here, otherwise that cell fails with a NameError.
# When running outside SageMaker (e.g. locally), replace the call below with
# the execution role ARN as a string:
# role = '<execution-role-arn>'
role = sagemaker.get_execution_role()

# S3 bucket and key prefix under which this notebook stores data and models.
bucket = "2021-demos"
prefix = "sagemaker/german-data-xgb"

Basic training/inference parameters

In [None]:
# Names used for the model and for the real-time endpoint at deploy time.
xgb_model_name = "xgboost-german-model"
xgb_endpoint_name = "sm-clarify-german-xgb"

# A single instance for each stage.
train_instance_count = predictor_instance_count = batch_transform_instance_count = 1

# The same instance type for training, hosting and batch transform.
train_instance_type = predictor_instance_type = batch_transform_instance_type = "ml.c5.4xlarge"

In [None]:
from urllib.request import urlretrieve

# The raw dataset is kept under ./Data; create the directory if it is missing.
os.makedirs('Data', exist_ok=True)

local_data_path = './Data/german.data'
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
# Download with the standard library rather than shelling out to `wget`:
# no external tool is required and a failed download raises here instead of
# being ignored (the exit status of os.system was never checked).
urlretrieve(url, local_data_path)

# The file is read without a header row, using a single space as separator.
data = pd.read_csv(local_data_path, header=None, sep=" ")
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 10)

# Details here: https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)
data

In [None]:
def _preprocess_data(german):
    german.columns=["CheckingAC_Status", "MaturityMonths", "CreditHistory", "Purpose", 
                    "LoanAmount", "SavingsAC", "Employment", "InstallmentPctOfIncome", 
                    "SexAndStatus", "OtherDebts", "PresentResidenceYears", "Property", 
                    "Age", "OtherInstallmentPlans", "Housing", "NumExistingLoans", "Job", 
                    "Dependents", "Telephone", "ForeignWorker", "Class1Good2Bad"]
    
    df = pd.DataFrame(2-german.Class1Good2Bad) # Conver to good=1, bad=0
    res = pd.get_dummies(german.CheckingAC_Status) # A11 A12 A13 A14
    df = pd.concat([df,res], axis=1, sort=False)
    df = pd.concat([df,german.MaturityMonths], axis=1, sort=False)
    res = pd.get_dummies(german.CreditHistory) # A30 A31 A32 A33 A34
    df = pd.concat([df,res], axis=1, sort=False)
    res = pd.get_dummies(german.Purpose)
    df = pd.concat([df,res], axis=1, sort=False)
    df = pd.concat([df,german.LoanAmount], axis=1, sort=False)
    res = pd.get_dummies(german.SavingsAC)
    df = pd.concat([df,res], axis=1, sort=False)
    res = pd.get_dummies(german.Employment)
    df = pd.concat([df,res], axis=1, sort=False)
    df = pd.concat([df,german.InstallmentPctOfIncome], axis=1, sort=False)
    res = pd.get_dummies(german.SexAndStatus)
    df = pd.concat([df,res], axis=1, sort=False)
    res = pd.get_dummies(german.OtherDebts)
    df = pd.concat([df,res], axis=1, sort=False)
    df = pd.concat([df,german.PresentResidenceYears], axis=1, sort=False)
    df = pd.concat([df,res], axis=1, sort=False)
    res = pd.get_dummies(german.Property)
    df = pd.concat([df,res], axis=1, sort=False)
    df = pd.concat([df,german.Age], axis=1, sort=False)
    res = pd.get_dummies(german.OtherInstallmentPlans)
    df = pd.concat([df,res], axis=1, sort=False)
    res = pd.get_dummies(german.Housing)
    df = pd.concat([df,res], axis=1, sort=False)
    df = pd.concat([df,german.NumExistingLoans], axis=1, sort=False)
    res = pd.get_dummies(german.Job)
    df = pd.concat([df,res], axis=1, sort=False)
    df = pd.concat([df,german.Dependents], axis=1, sort=False)
    res = pd.DataFrame({'Telephone': german.Telephone.str.slice(3,4).astype(int)-1})
    df = pd.concat([df,res], axis=1, sort=False)
    res = pd.DataFrame({'ForeignWorker': abs(german.ForeignWorker.str.slice(3,4).astype(int)-2)})
    df = pd.concat([df,res], axis=1, sort=False)
    
    print("DF shape {}".format(df.shape))
    print("DF columns: \n{}".format(df.columns))
    
    # Separate X and y in the dataset
    X = df.drop(['Class1Good2Bad'], axis=1)
    y = df.Class1Good2Bad
    print("X shape: {}, y shape: {}".format(X.shape, y.shape))
    
    return X, y, df

# Encode the raw data: X = features, y = labels, df = label column followed by the features.
X, y, df = _preprocess_data(data)

## Transformed dataset

In [None]:
# Display the encoded frame returned by _preprocess_data.
df

## Split the dataset

In [None]:
# 80 % of the rows are sampled for training, with a fixed seed.
train_data = df.sample(frac=0.8, random_state=200)

# Half of the rows that are left form the validation set. The other half is
# discarded, so the temporary hold-out frame is the only thing to clean up.
holdout = df.drop(train_data.index)
validation_data = holdout.sample(frac=0.5, random_state=200)
del holdout

In [None]:
# Training split: written as CSV without header or index, stored under <prefix>/train.
train_file = '/tmp/train_data.csv'
train_data.to_csv(train_file, header=False, index=False)
train_data_s3_path = session.upload_data(
    path=train_file,
    bucket=bucket,
    key_prefix=f"{prefix}/train",
)
print(f"Training data uploaded to: {train_data_s3_path}")

# Validation split: same format, stored under <prefix>/validation.
validation_file = '/tmp/validation_data.csv'
validation_data.to_csv(validation_file, header=False, index=False)
validation_data_s3_path = session.upload_data(
    path=validation_file,
    bucket=bucket,
    key_prefix=f"{prefix}/validation",
)
print(f"Validation data uploaded to: {validation_data_s3_path}")

# Full feature matrix and labels, stored side by side under <prefix>/preprocessed_data.
preprocessed_data_file, preprocessed_label_file = "/tmp/X.csv", "/tmp/y.csv"
X.to_csv(preprocessed_data_file, header=False, index=None)
y.to_csv(preprocessed_label_file, header=False, index=None)
session.upload_data(path=preprocessed_data_file, bucket=bucket, key_prefix=f"{prefix}/preprocessed_data")
session.upload_data(path=preprocessed_label_file, bucket=bucket, key_prefix=f"{prefix}/preprocessed_data")

 ## Train the model

In [None]:
from sagemaker.image_uris import retrieve

# Image URI of the 'xgboost' container for the region of the current session.
xgb_image_uri = retrieve('xgboost', session.boto_region_name, version="latest")

# Estimator whose output is written under <prefix>/xgb_model in the bucket.
xgb = sagemaker.estimator.Estimator(
    xgb_image_uri,
    role,
    instance_count=train_instance_count,
    instance_type=train_instance_type,
    output_path=f"s3://{bucket}/{prefix}/xgb_model",
    sagemaker_session=session,
)
xgb.set_hyperparameters(
    max_depth=7,
    eta=0.3,
    objective='binary:logistic',
    num_round=10,
)

# Both channels point at the CSV objects under <prefix>/train and <prefix>/validation.
s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data=f"s3://{bucket}/{prefix}/train/train_data.csv",
    content_type='csv',
)
s3_input_validation = sagemaker.inputs.TrainingInput(
    s3_data=f"s3://{bucket}/{prefix}/validation/validation_data.csv",
    content_type='csv',
)

xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

## Deploy the model

In [None]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer

# Deploy the trained estimator behind a real-time endpoint that accepts and
# returns CSV.
xgb_predictor = xgb.deploy(initial_instance_count=predictor_instance_count,
                           instance_type=predictor_instance_type,
                           serializer=CSVSerializer(),
                           deserializer=CSVDeserializer(),
                           model_name=xgb_model_name,
                           endpoint_name=xgb_endpoint_name )
# f-string so the endpoint name is interpolated instead of printing the
# literal "{xgb_endpoint_name}" placeholder.
print(f"\nModel is successfully deployed at the endpoint {xgb_endpoint_name}.")

## Explore the model

In [None]:
import numpy as np

# Send the uploaded feature matrix (X.csv) to the endpoint in one request.
train_dataset = sagemaker.s3.S3Downloader.read_file(
    f"s3://{bucket}/{prefix}/preprocessed_data/X.csv"
)
probs_list = xgb_predictor.predict(train_dataset)
# Only the first row of the deserialized response is used.
# NOTE(review): presumably the endpoint returns all scores on one CSV line - confirm.
probs = np.array(probs_list[0], dtype=float)

# Threshold the scores at 0.5 to get 0/1 predictions.
pred_labels = (probs >= 0.5).astype(np.int32)

# Labels come from the matching y.csv, one value per line.
labels_str = sagemaker.s3.S3Downloader.read_file(
    f"s3://{bucket}/{prefix}/preprocessed_data/y.csv"
)
labels = np.fromstring(labels_str, sep='\n')

In [None]:
def plot_roc_curve(labels, pred_labels):
    """Draw and show the ROC curve of ``pred_labels`` against ``labels``.

    Args:
        labels: ground-truth values, forwarded to ``roc_curve`` as its first
            argument.
        pred_labels: predictions, forwarded to ``roc_curve`` as its second
            argument.
    """
    from sklearn.metrics import roc_curve, auc
    import matplotlib.pyplot as plt

    false_pos_rate, true_pos_rate, _ = roc_curve(labels, pred_labels)
    area = auc(false_pos_rate, true_pos_rate)
    line_width = 2

    plt.figure()
    # The model's curve, with the area under it shown in the legend.
    plt.plot(
        false_pos_rate,
        true_pos_rate,
        color='darkorange',
        lw=line_width,
        label='ROC curve (area = %0.2f)' % area,
    )
    # Dashed diagonal from (0, 0) to (1, 1) as a reference line.
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy is measured on the thresholded 0/1 predictions; arguments are
# given in scikit-learn's (y_true, y_pred) order.
print("Accuracy score for online predictions", accuracy_score(labels, pred_labels))
# The ROC curve needs the continuous scores: with already-thresholded 0/1
# predictions it collapses to a single operating point and the reported
# area no longer reflects the model's ranking quality.
plot_roc_curve(labels, probs)

In [None]:
# Delete the endpoint behind the predictor returned by xgb.deploy.
xgb_predictor.delete_endpoint()