# Prepare the data

Import the required libraries and define the environment variables needed to prepare the data, train the model and deploy the model.

In [1]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker
from sagemaker import get_execution_role
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image
from IPython.display import display
from time import gmtime, strftime
from sagemaker.serializers import CSVSerializer

# define IAM role
role = get_execution_role()
prefix = "sagemaker/DEMO-xgboost-dm"
my_region = boto3.session.Session().region_name # set the region of the instance

# this line automatically looks for the XGBoost image URI and builds an XGBoost container.
xgboost_container = sagemaker.image_uris.retrieve("xgboost", my_region, "latest")

print("Success - the MySageMakerInstance is in the " + my_region + " region. You will use the " + xgboost_container + " container for your SageMaker endpoint.")

Success - the MySageMakerInstance is in the ap-south-1 region. You will use the 991648021394.dkr.ecr.ap-south-1.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


Create the S3 bucket to store the data.

In [2]:
bucket_name = "predicting-cardiovascular-disease-bucket"
s3 = boto3.resource("s3")

try:
    if  my_region == "us-east-1":
      s3.create_bucket(Bucket = bucket_name)
    else: 
      s3.create_bucket(Bucket = bucket_name, 
                       CreateBucketConfiguration = { "LocationConstraint": my_region })
    print("S3 bucket created successfully")
except Exception as e:
    print("S3 error: ",e)

S3 error:  An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


Download the data to the Sagemaker instance and load the data into a data frame.

In [3]:
names = ["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", 
         "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num"]

try:
  model_data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data",
                           names = names,
                           na_values = "?")
  print("Success: Data loaded into data frame.")
except Exception as e:
    print("Data load error: ", e)

Success: Data loaded into data frame.


In its current state, the target variable has 5 unique values. Pool together the values 1, 2, 3 and 4 into the single value 1, to denote the presence of heart disease. Leave the 0 values the way they are, to denote the absence of heart disease.

In [4]:
model_data["num"].value_counts()

0    164
1     55
2     36
3     35
4     13
Name: num, dtype: int64

In [5]:
model_data["num"] = model_data["num"].map({0: 0,
                                           1: 1,
                                           2: 1,
                                           3: 1,
                                           4: 1})

In [6]:
model_data["num"].value_counts()

0    164
1    139
Name: num, dtype: int64

One-hot encode the target variable.

In [7]:
model_data.loc[1:10, "num"]

1     1
2     1
3     0
4     0
5     0
6     1
7     0
8     1
9     1
10    0
Name: num, dtype: int64

In [8]:
model_data["num_0"] = model_data["num"].apply(lambda x: int(x == 0))
model_data["num_1"] = model_data["num"].apply(lambda x: int(x == 1))
model_data = model_data.drop(labels = "num",
                             axis = "columns")

In [9]:
model_data.loc[1:10, ["num_0", "num_1"]]

Unnamed: 0,num_0,num_1
1,0,1
2,0,1
3,1,0
4,1,0
5,1,0
6,0,1
7,1,0
8,0,1
9,0,1
10,1,0


Shuffle and split the data into training data and test data.

In [10]:
train_data, test_data = np.split(model_data.sample(frac = 1, random_state = 555), 
                                 [int(0.8 * len(model_data))])
print(train_data.shape, test_data.shape)

(242, 15) (61, 15)


# Train the model

Reformat the header and the first column of the training data. Load the data from the S3 bucket.

In [11]:
pd.concat([train_data["num_1"], 
          train_data.drop(labels = ["num_0", "num_1"], axis = "columns")], 
          axis = 1).to_csv("train.csv", index = False, header = False)
boto3.Session().resource("s3").Bucket(bucket_name).Object(os.path.join(prefix, "train/train.csv")).upload_file("train.csv")
s3_input_train = sagemaker.inputs.TrainingInput(s3_data = "s3://{}/{}/train".format(bucket_name, prefix), 
                                                content_type = "csv")

Set up the Amazon SageMaker session, create an instance of the XGBoost model and define the model's hyperparameters.

In [12]:
sess = sagemaker.Session()
xgb = sagemaker.estimator.Estimator(xgboost_container,role, 
                                    instance_count = 1, 
                                    instance_type = "ml.m4.xlarge", 
                                    output_path = "s3://{}/{}/output".format(bucket_name, prefix),
                                    sagemaker_session = sess)
xgb.set_hyperparameters(max_depth = 5,
                        eta = 0.2, 
                        gamma = 4, 
                        min_child_weight = 6, 
                        subsample = 0.8, 
                        silent = 0, 
                        objective = "binary:logistic", 
                        num_round = 100)

Start the training job.

In [13]:
xgb.fit({"train": s3_input_train})

2021-12-11 16:48:48 Starting - Starting the training job...
2021-12-11 16:48:50 Starting - Launching requested ML instancesProfilerReport-1639241328: InProgress
......
2021-12-11 16:50:06 Starting - Preparing the instances for training.........
2021-12-11 16:51:46 Downloading - Downloading input data
2021-12-11 16:51:46 Training - Downloading the training image...
2021-12-11 16:52:12 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2021-12-11:16:52:07:INFO] Running standalone xgboost training.[0m
[34m[2021-12-11:16:52:07:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2021-12-11:16:52:07:INFO] File size need to be processed in the node: 0.01mb. Available memory size in the node: 8351.62mb[0m
[34m[2021-12-11:16:52:07:INFO] Determined delimiter of CSV input is ','[0m
[34m[16:52:07] S3DistributionType set as FullyReplicated[0m
[34m[16:52:07] 242x13 matrix with 3145 entries loaded from /opt/ml/input/data/train?format=csv&label_column=

# Deploy the model

Deploy the model on a server and create a SageMaker endpoint that can be accessed.

In [14]:
xgb_predictor = xgb.deploy(initial_instance_count = 1,
                           instance_type = "ml.m4.xlarge")

-----!

Predict whether the given patients have cardiovascular disease or not.

In [15]:
test_data_array = test_data.drop(labels = ["num_0", "num_1"], axis = "columns").values # load the data into an array
xgb_predictor.serializer = CSVSerializer() # set the serializer type
predictions = xgb_predictor.predict(test_data_array).decode("utf-8") # predict!
predictions_array = np.fromstring(predictions[1:], sep=",") # and turn the prediction into an array
print(predictions_array.shape)

(61,)


# Evaluate model performance

Compare the actual vs predicted values in a confusion matrix.

In [16]:
cm = pd.crosstab(index = test_data["num_1"], 
                 columns = np.round(predictions_array), 
                 rownames = ["Observed"], 
                 colnames = ["Predicted"])
tn = cm.iloc[0, 0]
fn = cm.iloc[1, 0]
tp = cm.iloc[1, 1]
fp = cm.iloc[0, 1]
p = (tp + tn) / (tp + tn + fp + fn) * 100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", 
                                     p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", 
                                    "No Disease", 
                                    "Disease"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("No Disease", 
                                                           tn / (tn + fn) *100,
                                                           tn, 
                                                           fp / (tp + fp) * 100, 
                                                           fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Disease", 
                                                              fn / (tn + fn) * 100,
                                                              fn, 
                                                              tp / (tp + fp) *100, 
                                                              tp))


Overall Classification Rate: 82.0%

Predicted      No Disease      Disease
Observed
No Disease     86% (32)    25% (6)
Disease         14% (5)     75% (18) 



# Clean up

Delete the endpoint.

In [17]:
xgb_predictor.delete_endpoint(delete_endpoint_config = True)

Delete the training artifacts and the S3 bucket.

In [18]:
bucket_to_delete = boto3.resource("s3").Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': '9HRFRY97A1YAVBEA',
   'HostId': 'Cva3kxitQxjo4DNyoP9Yq+6r1ZU/R0o79nYI3EFFzcHSZGxn2N8MDuFzsDS966sgKQn0hXAx3YE=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'Cva3kxitQxjo4DNyoP9Yq+6r1ZU/R0o79nYI3EFFzcHSZGxn2N8MDuFzsDS966sgKQn0hXAx3YE=',
    'x-amz-request-id': '9HRFRY97A1YAVBEA',
    'date': 'Sat, 11 Dec 2021 16:58:27 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'sagemaker/DEMO-xgboost-dm/output/xgboost-2021-12-11-16-48-48-028/rule-output/ProfilerReport-1639241328/profiler-output/profiler-reports/LoadBalancing.json'},
   {'Key': 'sagemaker/DEMO-xgboost-dm/output/xgboost-2021-12-11-16-48-48-028/profiler-output/system/training_job_end.ts'},
   {'Key': 'sagemaker/DEMO-xgboost-dm/output/xgboost-2021-12-11-16-14-05-537/rule-output/ProfilerReport-1639239245/profiler-output/profiler-reports/StepOutlier.json'}