# Project setup: libraries, S3 bucket, and model output path

In [36]:
# import all required libraries
import sagemaker # Amazon Sagemaker Python SDK (needed for using prebuilt Sagemaker models/algos)
import boto3     # AWS Python SDK(helps us read data from S3 buckets, just like pandas helps us read from CSVs, etc.)
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import s3_input, Session

In [6]:
# name of the S3 bucket used by this application (access can be IAM-restricted later on)
bucket_name = "rzapplication1"

# detect the region of the current boto3 session; it is used when the bucket is created
aws_session = boto3.session.Session()
my_region = aws_session.region_name
print(my_region)

us-west-2


In [7]:
# create the S3 bucket programmatically, in whichever region the session detected above
s3 = boto3.resource("s3")
try:
    if my_region == "us-east-1":
        # us-east-1 is S3's default region: the bucket must be created WITHOUT a LocationConstraint
        s3.create_bucket(Bucket=bucket_name)
    else:
        # every other region has to be named explicitly (previously only us-west-2 was handled,
        # and any other region silently skipped bucket creation)
        s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration={'LocationConstraint': my_region})
    print("S3 bucket '",  bucket_name,"' created successfully!")
except Exception as e:
    # best effort: report the failure and let the notebook continue
    print("S3 error: ", e)

S3 bucket ' rzapplication1 ' created successfully!


In [8]:
# S3 layout: everything for this model lives under one key prefix,
# and the training output is written to <bucket>/<prefix>/output
model_name_prefix = "sagemaker_xgboost_model"
model_write_path = f"s3://{bucket_name}/{model_name_prefix}/output"
print(model_write_path)

s3://rzapplication1/sagemaker_xgboost_model/output


# Pull and ready data for training/testing our model

In [37]:
# load additional libraries
import pandas as pd
import urllib 

In [10]:
# download the dataset via specified URL
# `import urllib` alone does not guarantee that the `urllib.request` submodule is loaded,
# so import it explicitly before using it
import urllib.request

dataset_url = "https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv"
try:
    urllib.request.urlretrieve(dataset_url, "bank_clean.csv")
    print("Success: downloaded bank_clean.csv.")
except Exception as e:
    print("Data download error: ", e)
    # the following cells need bank_clean.csv, so stop here instead of failing later on
    raise

Success: downloaded bank_clean.csv.


In [11]:
# read the downloaded CSV into a pandas dataframe
try:
    model_data = pd.read_csv("./bank_clean.csv")
    print("Success: downloaded file successfully loaded as a pandas dataframe!")
except Exception as e:
    print("Pandas load error: ", e)
    # without this re-raise `model_data` stays undefined and the next cell fails with a NameError
    raise

Success: downloaded file successfully loaded as a pandas dataframe!


In [14]:
# do the train-test split of our downloaded dataset, into a 70-30 ratio
import numpy as np  # kept in this cell: later cells use `np`

# shuffle all rows reproducibly (fixed random_state), then cut at the 70% mark;
# positional slicing replaces np.split, which goes through the deprecated DataFrame.swapaxes
shuffled_data = model_data.sample(frac = 1, random_state = 101)
split_index = int(0.7 * len(model_data))
train_data = shuffled_data.iloc[:split_index]
test_data = shuffled_data.iloc[split_index:]

print("Shape of train data: ", train_data.shape)
print("Shape of test data: ", test_data.shape)

Shape of train data:  (28831, 62)
Shape of test data:  (12357, 62)


In [18]:
# save the train data into S3 bucket
import os  # not needed by this cell any more; retained in case other cells rely on `os`

# convert into the data format required by Sagemaker: response column first, then the predictors,
# written without index or header row
train_ordered = pd.concat([train_data['y_yes'], train_data.drop(['y_no', 'y_yes'], axis=1)], axis=1)
train_ordered.to_csv('train.csv', index=False, header=False)

# upload to the S3 bucket we created above; S3 keys always use "/" as separator,
# so the key is built explicitly instead of with the platform-dependent os.path.join
train_key = '{}/train/train.csv'.format(model_name_prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(train_key).upload_file('train.csv')

# load the object with reference to this newly uploaded train dataset
s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/train'.format(bucket_name, model_name_prefix), content_type='csv')

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [19]:
# save the test data into S3 bucket
import os  # not needed by this cell any more; retained in case other cells rely on `os`

# convert into the data format required by Sagemaker: response column first, then the predictors,
# written without index or header row
test_ordered = pd.concat([test_data['y_yes'], test_data.drop(['y_no', 'y_yes'], axis=1)], axis=1)
test_ordered.to_csv('test.csv', index=False, header=False)

# upload to the S3 bucket we created above; S3 keys always use "/" as separator,
# so the key is built explicitly instead of with the platform-dependent os.path.join
test_key = '{}/test/test.csv'.format(model_name_prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(test_key).upload_file('test.csv')

# load the object with reference to this newly uploaded test dataset
s3_input_test = sagemaker.s3_input(s3_data='s3://{}/{}/test'.format(bucket_name, model_name_prefix), content_type='csv')

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


# Build XGBoost model using Sagemaker algo and data from created S3 bucket

In [24]:
# the Sagemaker pre-built algos are provided as containers; look up the XGBoost image URI
# for the region of the current session
container = get_image_uri(
    boto3.Session().region_name,
    'xgboost',
    repo_version="1.0-1",  # requests the "1.0-1" version of the image rather than an unspecified one
)

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


In [31]:
# build the estimator (training job configuration) around the XGBoost container
training_role = sagemaker.get_execution_role()  # AWS IAM role (either name or full ARN)
training_session = sagemaker.Session()          # manages interactions with the SageMaker APIs and other AWS services

xgb_model = sagemaker.estimator.Estimator(
    image_name=container,                 # container image to use for training
    role=training_role,
    train_instance_count=1,               # number of Amazon EC2 instances to use for training
    train_instance_type='ml.m5.2xlarge',  # type of EC2 instance to use for training
    train_volume_size = 5,                # size (GB) of the EBS volume that stores input data during training
    output_path=model_write_path,         # S3 location for the training result (model artifacts and output files)
    sagemaker_session=training_session,
    train_use_spot_instances=True,        # use SageMaker Managed Spot instances for training
    train_max_run=300,                    # training timeout in seconds; SageMaker terminates the job after this
    train_max_wait=600,                   # timeout in seconds when waiting for spot training instances
)

# the last three arguments (spot instances and the two timeouts) were set to reduce the AWS bill :)

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [32]:
# custom hyperparameters for the XGBoost training job, collected in one place
xgb_hyperparameters = {
    "max_depth": 5,
    "eta": .2,
    "gamma": 4,
    "min_child_weight": 6,
    # with 0.7, XGBoost randomly samples 70% of the training data before growing trees
    # (this helps prevent overfitting); the subsampling happens once per boosting iteration
    "subsample": 0.7,
    "silent": 0,
    "objective": "binary:logistic",
    "num_round": 50,
}
xgb_model.set_hyperparameters(**xgb_hyperparameters)

In [33]:
# fit the model: each channel name maps to the S3 input holding that dataset
# (the held-out test split is supplied as the 'validation' channel)
data_channels = {'train': s3_input_train, 'validation': s3_input_test}
xgb_model.fit(data_channels)

2020-10-11 21:43:59 Starting - Starting the training job...
2020-10-11 21:44:01 Starting - Launching requested ML instances......
2020-10-11 21:45:03 Starting - Preparing the instances for training......
2020-10-11 21:46:29 Downloading - Downloading input data
2020-10-11 21:46:29 Training - Training image download completed. Training in progress..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m[21:46:31] 28831x60 matrix with 1729860 entries loaded 

# Model deployment

In [39]:
# deploy the trained model behind an endpoint and keep the returned predictor handle
xgb_predictor = xgb_model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
)

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


---------------!

In [40]:
# since the endpoint will be accepting data in a typical tabular format, we set up things for that
from sagemaker.predictor import csv_serializer

In [41]:
# keep only the predictor columns (both label columns removed) and take the raw value array
test_features = test_data.drop(columns=['y_no', 'y_yes'])
input_data = test_features.values

In [42]:
# set the content type of the data sent to the endpoint: requests are declared as CSV text
xgb_predictor.content_type = 'text/csv' 

In [43]:
# similarly, set the serializer for the endpoint: csv_serializer is used to encode the input passed to predict()
xgb_predictor.serializer = csv_serializer

In [45]:
# send the test features to the endpoint, then decode the response into text
raw_response = xgb_predictor.predict(input_data)
predictions = raw_response.decode('utf-8')

In [46]:
# bring the output in proper format: parse the comma-separated response text into a NumPy array
# NOTE(review): predictions[1:] discards the first character of the response before parsing.
# Confirm what that character is; if it is a digit of the first prediction, that value is altered.
predictions_array = np.fromstring(predictions[1:], sep=',') 

In [49]:
# inspect the parsed predictions (the array repr is abbreviated with "..." for long arrays)
predictions_array

array([0.09114921, 0.05947307, 0.07003157, ..., 0.03261118, 0.01985421,
       0.05698879])

In [50]:
# confusion matrix: observed labels (rows) against predictions rounded to 0/1 (columns)
cm = pd.crosstab(index=test_data['y_yes'], columns=np.round(predictions_array), rownames=['Observed'], colnames=['Predicted'])

# crosstab leaves out a row/column for any class that never occurs (e.g. a model that predicts
# only "no purchase"); force the full 2x2 layout so the positional lookups below cannot fail
cm = cm.reindex(index=[0, 1], columns=[0, 1], fill_value=0)

tn = cm.iloc[0,0]  # observed 0, predicted 0
fn = cm.iloc[1,0]  # observed 1, predicted 0
tp = cm.iloc[1,1]  # observed 1, predicted 1
fp = cm.iloc[0,1]  # observed 0, predicted 1
p = (tp+tn)/(tp+tn+fp+fn)*100  # overall share of correct predictions, in percent

# report: the percentages are relative to each predicted column
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("No Purchase", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Purchase", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))


Overall Classification Rate: 89.9%

Predicted      No Purchase    Purchase
Observed
No Purchase    91% (10726)    34% (198)
Purchase        9% (1054)     66% (379) 



In [None]:
# Delete the endpoint and the bucket contents to prevent additional billing (TODO: convert this cell to markdown)

In [51]:
# tear down the deployed endpoint
cleanup_session = sagemaker.Session()
cleanup_session.delete_endpoint(xgb_predictor.endpoint)

# remove every object stored in the bucket (the bucket itself is not deleted by this cell);
# the delete() result is the cell's last expression, so it is shown as the cell output
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': '54271468171E6E19',
   'HostId': '0MGQHrfuA+pRnuUusKcuCyt/fIHXh9X/veBOVPit15wQastx72ZDMHlHAhSEFbtXvcgukDkReIo=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': '0MGQHrfuA+pRnuUusKcuCyt/fIHXh9X/veBOVPit15wQastx72ZDMHlHAhSEFbtXvcgukDkReIo=',
    'x-amz-request-id': '54271468171E6E19',
    'date': 'Mon, 12 Oct 2020 06:05:57 GMT',
    'connection': 'close',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'sagemaker_xgboost_model/output/sagemaker-xgboost-2020-10-11-21-43-59-350/output/model.tar.gz'},
   {'Key': 'sagemaker_xgboost_model/train/train.csv'},
   {'Key': 'sagemaker_xgboost_model/test/test.csv'},
   {'Key': 'sagemaker_xgboost_model/output/sagemaker-xgboost-2020-10-05-05-31-11-801/output/model.tar.gz'}]}]