## XGBoost-based heart disease detector deployment using Amazon SageMaker

In [84]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image
from IPython.display import display
from time import gmtime, strftime
from sagemaker.predictor import csv_serializer

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
# IAM execution role of this notebook, handed to the estimator later on
role = get_execution_role()
# Key prefix under which this notebook keeps its S3 objects
prefix = 'sagemaker/DEMO-xgboost-dm'
# Region of the current boto3 session
my_region = boto3.session.Session().region_name

# Look up the URI of the XGBoost container image for this region
# (this only resolves the image URI; nothing is built here).
xgboost_container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=my_region,
    version="latest",
)

print(
    "Success - the MySageMakerInstance is in the " + my_region
    + " region. You will use the " + xgboost_container
    + " container for your SageMaker endpoint."
)

In [86]:
bucket_name = 'xgbsp-s3' # <--- CHANGE THIS VARIABLE TO A UNIQUE NAME FOR YOUR BUCKET
s3 = boto3.resource('s3')
try:
    if my_region == 'us-east-1':
        # us-east-1 buckets are created without a LocationConstraint
        s3.create_bucket(Bucket=bucket_name)
    else:
        s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={'LocationConstraint': my_region})
    print('S3 bucket created successfully')
except s3.meta.client.exceptions.BucketAlreadyOwnedByYou:
    # Re-running the notebook against a bucket we already own is not a failure
    print('S3 bucket already exists and is owned by you')
except Exception as e:
    # Best effort: report any other problem and let the notebook continue
    print('S3 error: ',e)

S3 bucket created successfully


In [100]:
# Load the heart-disease table from the CSV file next to this notebook
model_data = pd.read_csv('./heart.csv')
# Twelve feature names plus 'target' ('fbs' is left out).
# NOTE(review): this list is not referenced by any later cell visible in this notebook — confirm whether it is still needed.
useful_feature_target = ['cp', 'thalach', 'slope', 'restecg', 'trestbps', 'age', 'sex', 'thal', 'ca', 'oldpeak', 'exang', 'chol', 'target']
# Preview the first rows
model_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [101]:
# Shuffle every row reproducibly (fixed seed), then use the first 80% of the
# shuffled rows for training and the remainder for testing.
# Positional .iloc slicing produces the same partition as splitting the
# shuffled frame at that index, without passing a DataFrame through np.split
# (which relies on DataFrame.swapaxes, deprecated in pandas 2.x).
shuffled_data = model_data.sample(frac=1, random_state=20)
split_at = int(0.8 * len(model_data))
train_data = shuffled_data.iloc[:split_at]
test_data = shuffled_data.iloc[split_at:]
print(train_data.shape, test_data.shape)

(242, 14) (61, 14)


In [102]:
# Write the training rows with the label moved to the first column and with
# neither header nor index (the training job reads the file with label_column=0).
pd.concat([train_data['target'], train_data.drop(['target'], axis=1)], axis=1).to_csv('train.csv', index=False, header=False)
# S3 keys always use '/' as separator, so build the key with string formatting
# rather than os.path.join, whose separator depends on the local platform.
train_key = '{}/train/train.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(train_key).upload_file('train.csv')
# Training channel pointing at the S3 folder that now holds train.csv
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

In [104]:
# Session object the estimator uses for its SageMaker calls
sess = sagemaker.Session()

# Estimator for the XGBoost container: one ml.m4.xlarge training instance,
# model artifacts written under <prefix>/output in the notebook's bucket.
xgb = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path='s3://{}/{}/output'.format(bucket_name, prefix),
    sagemaker_session=sess,
)

# Hyperparameters for a binary classifier that outputs probabilities
xgb.set_hyperparameters(
    max_depth=4,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    silent=0,
    objective='binary:logistic',
    num_round=100,
)

In [105]:
# Start the training job; only a 'train' channel is supplied (no validation channel)
xgb.fit({'train': s3_input_train})

2022-10-18 09:49:09 Starting - Starting the training job...
2022-10-18 09:49:35 Starting - Preparing the instances for trainingProfilerReport-1666086549: InProgress
.........
2022-10-18 09:51:00 Downloading - Downloading input data...
2022-10-18 09:51:36 Training - Downloading the training image........[34mArguments: train[0m
[34m[2022-10-18:09:52:55:INFO] Running standalone xgboost training.[0m
[34m[2022-10-18:09:52:55:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2022-10-18:09:52:55:INFO] File size need to be processed in the node: 0.01mb. Available memory size in the node: 8812.0mb[0m
[34m[2022-10-18:09:52:55:INFO] Determined delimiter of CSV input is ','[0m
[34m[09:52:55] S3DistributionType set as FullyReplicated[0m
[34m[09:52:55] 242x13 matrix with 3146 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[09:52:55] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 2 pruned nodes, max_dept

In [106]:
# Deploy the trained model to an endpoint backed by one ml.m4.xlarge instance;
# the returned predictor is what later cells call .predict() on.
xgb_predictor = xgb.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')

------!

In [109]:
from sagemaker.serializers import CSVSerializer

# Feature matrix of the held-out rows (label column removed)
test_data_array = test_data.drop(['target'], axis=1).values
# Send the rows to the endpoint as CSV
xgb_predictor.serializer = CSVSerializer()
# The endpoint answers with bytes; decode them to text
predictions = xgb_predictor.predict(test_data_array).decode('utf-8')
# The response is a plain comma-separated list of scores with no leading
# delimiter (a single-row response parses directly with float()), so parse it
# in full; slicing off the first character would drop a digit of the first score.
predictions_array = np.fromstring(predictions, sep=',')
print(predictions_array.shape)

(61,)


In [117]:
# Show the true labels of the held-out rows
test_data['target'].values

array([1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0])

In [136]:
def predict_cls(predictions_array, threshold=0.5):
    """Convert predicted scores into hard 0/1 class labels.

    Args:
        predictions_array: Iterable of numeric scores (for example the
            probabilities returned by the endpoint).
        threshold: Decision cut-off. A score less than or equal to the
            threshold maps to 0; a score above it maps to 1. Defaults to 0.5.

    Returns:
        list: One integer label (0 or 1) per input score, in input order.
        An empty input yields an empty list.
    """
    return [0 if score <= threshold else 1 for score in predictions_array]

def output_fn(prediction, content_type):
    """Translate a predicted class into its display label.

    Args:
        prediction: Sequence whose first element is the predicted class;
            it is converted to an int and used as an index.
        content_type: Sequence of labels indexed by class.

    Returns:
        The entry of ``content_type`` at the predicted class index.
    """
    label_index = int(prediction[0])
    return content_type[label_index]

# Threshold the endpoint's scores for the held-out rows into 0/1 labels
y_pred = predict_cls(predictions_array)

In [137]:
# True labels of the held-out rows, as an array
y_test = test_data['target'].values

In [138]:
# Per-class precision / recall / F1 of the thresholded predictions against the true labels
print(classification_report(y_test, y_pred)) # y_pred is the plain list returned by predict_cls

              precision    recall  f1-score   support

           0       0.89      0.76      0.82        33
           1       0.76      0.89      0.82        28

    accuracy                           0.82        61
   macro avg       0.83      0.83      0.82        61
weighted avg       0.83      0.82      0.82        61



In [139]:
# Held-out features only (label column removed), kept as a DataFrame
x_test = test_data.drop(['target'], axis=1)

In [142]:
# Labels shown to the user, indexed by predicted class
content_type = ['No', 'Yes']

# Score only the first held-out row and parse the endpoint's reply as a float
pred_prob = float(
    xgb_predictor.predict(x_test.iloc[0].values).decode('utf-8')
)
print('The predicted probability is', pred_prob)

# Threshold the single score, then map the class to its label
pred_val = predict_cls([pred_prob])
print('Heart disease:', output_fn(pred_val, content_type))

The predicted probability is 0.8522401452064514
Heart disease: Yes


### Clean up: delete the endpoint and empty the S3 bucket; also delete any instances you no longer need

In [143]:
# Delete the endpoint together with its endpoint configuration
xgb_predictor.delete_endpoint(delete_endpoint_config=True)
# Delete every object stored in the bucket (uploaded data and training artifacts).
# NOTE(review): only the objects are removed here; the bucket itself is not deleted by this cell.
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': 'H2VKX8J5NM25VQYG',
   'HostId': 'TFKD9tiz1Sd5ns5//Ctmgyba5aLRpkG6XQJdNTjjxUDS/sUjk62isNuujKq77JOkE8SSXakX40o=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'TFKD9tiz1Sd5ns5//Ctmgyba5aLRpkG6XQJdNTjjxUDS/sUjk62isNuujKq77JOkE8SSXakX40o=',
    'x-amz-request-id': 'H2VKX8J5NM25VQYG',
    'date': 'Tue, 18 Oct 2022 11:06:45 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'sagemaker/DEMO-xgboost-dm/output/xgboost-2022-10-18-09-40-32-437/output/model.tar.gz'},
   {'Key': 'sagemaker/DEMO-xgboost-dm/output/xgboost-2022-10-18-09-49-09-303/rule-output/ProfilerReport-1666086549/profiler-output/profiler-reports/OverallSystemUsage.json'},
   {'Key': 'sagemaker/DEMO-xgboost-dm/output/xgboost-2022-10-18-09-49-09-303/rule-output/ProfilerReport-1666086549/profiler-output/profiler-reports/Dataloader.json'},
   {'Key': 'sagem