In [1]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image
from IPython.display import display
from time import gmtime, strftime
from sagemaker.predictor import csv_serializer

In [2]:
# Resolve the execution role, the S3 key prefix used throughout the notebook,
# and the AWS region of the current boto3 session.
role = get_execution_role()
prefix = 'sagemaker/DEMO-xgboost-dm'
my_region = boto3.session.Session().region_name

# Look up the URI of the XGBoost container image for that region
# (this only retrieves the URI; nothing is built here).
xgboost_container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=my_region,
    version="latest",
)

print(
    f"Success - the MySageMakerInstance is in the {my_region} region. "
    f"You will use the {xgboost_container} container for your SageMaker endpoint."
)

Success - the MySageMakerInstance is in the us-east-2 region. You will use the 825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


In [5]:
# Create the S3 bucket that stores the training data and model artifacts.
# The cell is safe to re-run: a bucket that already exists in this account is
# reported as reusable instead of being shown as a generic error.
bucket_name = 'sagemaker-demo-xgboost-rksnrc'
s3 = boto3.resource('s3')

# us-east-1 buckets are created without a LocationConstraint; every other
# region has to name itself explicitly.
create_bucket_kwargs = {'Bucket': bucket_name}
if my_region != 'us-east-1':
    create_bucket_kwargs['CreateBucketConfiguration'] = {
        'LocationConstraint': my_region
    }

try:
    s3.create_bucket(**create_bucket_kwargs)
    print('S3 bucket created successfully')
except s3.meta.client.exceptions.BucketAlreadyOwnedByYou:
    # Raised on a re-run outside us-east-1; the bucket is ours and usable.
    print('S3 bucket already exists and is owned by you - reusing it')
except Exception as e:
    # Best-effort, as before: report the problem and let the notebook go on.
    print('S3 error: ',e)

S3 bucket created successfully


In [9]:
# Location of the bank_clean CSV used by this walkthrough.
data_url = (
    "https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/"
    "bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv"
)

# Step 1: fetch the file into the working directory as bank_clean.csv.
try:
    urllib.request.urlretrieve(data_url, "bank_clean.csv")
except Exception as e:
    print('Data load error: ',e)
else:
    print('Success: downloaded bank_clean.csv.')

# Step 2: parse the local copy, using the first CSV column as the index.
try:
    model_data = pd.read_csv('./bank_clean.csv', index_col=0)
except Exception as e:
    print('Data load error: ',e)
else:
    print('Success: Data loaded into dataframe.')

Success: downloaded bank_clean.csv.
Success: Data loaded into dataframe.


In [10]:
# Shuffle every row reproducibly (fixed random_state), then cut at the 70%
# mark: the first 70% of the shuffled rows become the training set and the
# remaining 30% the held-out test set.
#
# Positional slicing is used instead of np.split(): on a DataFrame np.split
# goes through DataFrame.swapaxes, which recent pandas versions deprecate.
# The resulting frames are the same.
shuffled_data = model_data.sample(frac=1, random_state=1729)
split_at = int(0.7 * len(shuffled_data))

train_data = shuffled_data.iloc[:split_at]
test_data = shuffled_data.iloc[split_at:]

print(train_data.shape, test_data.shape)

(28831, 61) (12357, 61)


In [13]:
# Write the training set as a header-less, index-less CSV with the label
# ('y_yes') in the first column, followed by the features; both one-hot
# label columns are removed from the feature part.
pd.concat(
    [train_data['y_yes'],
    train_data.drop(['y_no', 'y_yes'], axis=1)], axis=1
).to_csv('train.csv', index=False, header=False)

# Upload the file to <prefix>/train/train.csv. S3 keys always use '/' as the
# separator, so the key is built with string formatting rather than
# os.path.join, which would insert the local OS separator.
my_bucket = boto3.Session().resource('s3').Bucket(bucket_name)
my_bucket.Object('{}/train/train.csv'.format(prefix)).upload_file('train.csv')

# Describe the uploaded folder as a CSV training input for the estimator.
s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data='s3://{}/{}/train'.format(bucket_name, prefix),
    content_type='csv'
)

In [14]:
sess = sagemaker.Session()

# Estimator for the XGBoost container: one ml.m4.xlarge training instance,
# with output written under <prefix>/output in the demo bucket.
xgb = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=f's3://{bucket_name}/{prefix}/output',
    sagemaker_session=sess,
)

# Training hyperparameters, collected in one place so they are easy to tune.
xgb_hyperparameters = {
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'silent': 0,
    'objective': 'binary:logistic',
    'num_round': 100,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

In [15]:
# Start training, passing the uploaded CSV data as the 'train' channel.
training_channels = {'train': s3_input_train}
xgb.fit(training_channels)

2022-11-15 19:35:39 Starting - Starting the training job...
2022-11-15 19:36:04 Starting - Preparing the instances for trainingProfilerReport-1668540939: InProgress
............
2022-11-15 19:38:02 Downloading - Downloading input data...
2022-11-15 19:38:23 Training - Downloading the training image...
2022-11-15 19:39:09 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2022-11-15:19:38:55:INFO] Running standalone xgboost training.[0m
[34m[2022-11-15:19:38:55:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2022-11-15:19:38:55:INFO] File size need to be processed in the node: 3.38mb. Available memory size in the node: 8629.3mb[0m
[34m[2022-11-15:19:38:55:INFO] Determined delimiter of CSV input is ','[0m
[34m[19:38:55] S3DistributionType set as FullyReplicated[0m
[34m[19:38:55] 28831x59 matrix with 1701029 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[19:38:56] src/tree/updater_prune.cc:

In [17]:
# Deploy the trained model to an endpoint backed by a single ml.m4.xlarge
# instance and keep the returned predictor for inference.
endpoint_settings = {
    'initial_instance_count': 1,
    'instance_type': 'ml.m4.xlarge',
}
xgb_predictor = xgb.deploy(**endpoint_settings)

------!

In [18]:
from sagemaker.serializers import CSVSerializer

# Feature matrix for the held-out rows (both one-hot label columns removed).
test_data_array = test_data.drop(['y_no', 'y_yes'], axis=1).values

# Send the rows to the endpoint as CSV text.
xgb_predictor.serializer = CSVSerializer()

# Predict: the response is bytes, decoded here into a comma-separated string.
predictions = xgb_predictor.predict(test_data_array).decode('utf-8')

# Parse the scores into a float array. Only a leading comma (if any) is
# stripped: slicing with predictions[1:] dropped the first character
# unconditionally, i.e. the leading digit of the first score.
predictions_array = np.fromstring(predictions.lstrip(','), sep=',')

print(predictions_array.shape)

(12357,)


In [24]:
# Confusion matrix of observed labels (rows) against predicted scores rounded
# to 0/1 (columns). The reindex pins the table to a fixed 2x2 layout so the
# positional lookups below stay valid even if a class is never observed or
# never predicted; absent combinations count as 0.
cm = pd.crosstab(
    index=test_data['y_yes'],
    columns=np.round(predictions_array),
    rownames=['Observed'],
    colnames=['Predicted']
).reindex(index=[0, 1], columns=[0.0, 1.0], fill_value=0)

# Layout: rows = observed (0, 1), columns = predicted (0.0, 1.0).
tn = cm.iloc[0,0]  # observed 0, predicted 0
fn = cm.iloc[1,0]  # observed 1, predicted 0
tp = cm.iloc[1,1]  # observed 1, predicted 1
fp = cm.iloc[0,1]  # observed 0, predicted 1

# Share of correctly classified rows, as a percentage.
p = (tp+tn)/(tp+tn+fp+fn)*100

print(f"Overall Classification Rate: {p}")

cm

Overall Classification Rate: 89.47964716355102


Predicted,0.0,1.0
Observed,Unnamed: 1_level_1,Unnamed: 2_level_1
0,10769,167
1,1133,288


In [25]:
# Tear down the hosted endpoint together with its endpoint configuration.
xgb_predictor.delete_endpoint(delete_endpoint_config=True)

# Delete every object stored in the demo bucket (the bucket itself is not
# removed here). The delete() result is the cell's displayed output.
s3_resource = boto3.resource('s3')
bucket_to_delete = s3_resource.Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': 'SSQV3KBZG2PAVMTS',
   'HostId': 'WNS/5EOgfhRJObrqNs5upLDvq9YzZhFrpDjDgaLCXjW/SXJFtvz2EPkYfyO+Y5A4QjtS2G4VdVk=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'WNS/5EOgfhRJObrqNs5upLDvq9YzZhFrpDjDgaLCXjW/SXJFtvz2EPkYfyO+Y5A4QjtS2G4VdVk=',
    'x-amz-request-id': 'SSQV3KBZG2PAVMTS',
    'date': 'Tue, 15 Nov 2022 19:58:27 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'sagemaker/DEMO-xgboost-dm/output/xgboost-2022-11-15-19-35-39-480/rule-output/ProfilerReport-1668540939/profiler-output/profiler-reports/CPUBottleneck.json'},
   {'Key': 'sagemaker/DEMO-xgboost-dm/output/xgboost-2022-11-15-19-35-39-480/rule-output/ProfilerReport-1668540939/profiler-output/profiler-reports/BatchSize.json'},
   {'Key': 'sagemaker/DEMO-xgboost-dm/output/xgboost-2022-11-15-19-35-39-480/rule-output/ProfilerReport-1668540939/profiler