In [52]:
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image
from IPython.display import display
from time import gmtime, strftime
from sagemaker.predictor import csv_serializer

### Preparing the environment

In [53]:
# IAM execution role obtained through the SageMaker SDK; handed to the Estimator in the training cell.
role = get_execution_role()
# S3 key prefix under which this notebook stores its training data and model output.
prefix = 'sagemaker/DEMO-xgboost-dm'

In [54]:
# AWS account that hosts the XGBoost image in each supported region.
_xgboost_accounts = {
    'us-west-2': '433757028032',
    'us-east-1': '811284229777',
    'us-east-2': '825641698319',
    'eu-west-1': '685385470294',
}

# Region name -> full ECR image URI of the XGBoost training/inference container.
containers = {
    region: '{}.dkr.ecr.{}.amazonaws.com/xgboost:latest'.format(account, region)
    for region, account in _xgboost_accounts.items()
}

In [55]:
# Region of the default boto3 session; it selects the matching XGBoost image below.
my_region = boto3.session.Session().region_name

# Fail early with a readable message instead of an opaque TypeError/KeyError
# when no region is configured or the region has no entry in `containers`.
if my_region not in containers:
    raise ValueError(
        'No XGBoost container is configured for region {!r}. Supported regions: {}'.format(
            my_region, ', '.join(sorted(containers))
        )
    )

print('Your Sagmaker instance is in the ' + my_region + ' region. You will use the container: ' + containers[my_region])

Your Sagmaker instance is in the eu-west-1 region. You will use the container: 685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest


In [56]:
# Name of the S3 bucket used for the training data and model artifacts.
# S3 bucket names are globally unique, so change this to a name you own.
bucket = 'bank-marketing-202010'
# Resource-level S3 client used to create the bucket.
s3 = boto3.resource('s3')

In [57]:
# Create the working bucket in the notebook's region. Errors (including
# "bucket already owned by you" on re-runs) are reported, not raised, so the
# notebook can be executed repeatedly.
try:
    if my_region == 'us-east-1':
        # us-east-1 is the S3 default region: it must be created WITHOUT a
        # LocationConstraint, otherwise S3 rejects the request.
        s3.create_bucket(Bucket=bucket)
    else:
        s3.create_bucket(Bucket=bucket, CreateBucketConfiguration={'LocationConstraint': my_region})

    print('S3 bucket created successfully')

except Exception as e:
    print('S3 error: ', e)

S3 error:  An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


### Loading and formatting the data

In [58]:
# Source of the pre-cleaned bank marketing dataset and where it is saved locally.
data_url = "https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv"
local_csv = "bank_clean.csv"

# Step 1: fetch the CSV next to the notebook; a failure is reported, not raised.
try:
    urllib.request.urlretrieve(data_url, local_csv)
    print('Success: downloaded bank_clean.csv.')
except Exception as e:
    print('Data load error: ', e)

# Step 2: read the file into a DataFrame, using the first CSV column as the index.
try:
    data = pd.read_csv(local_csv, index_col=0)
    print('Success: Data loaded into dataframe.')
except Exception as e:
    print('Data load error: ', e)

Success: downloaded bank_clean.csv.
Success: Data loaded into dataframe.


In [59]:
# Peek at the first three rows to confirm the file was parsed as expected.
data.head(3)

Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_no,y_yes
0,56,1,999,0,1,0,0,0,0,1,...,0,1,0,0,0,0,1,0,1,0
1,57,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
2,37,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0


In [60]:
# Shuffle all rows reproducibly (fixed seed), then cut 70% / 30% into train / test.
# Plain positional slicing replaces np.split(DataFrame, ...), which relies on the
# deprecated DataFrame.swapaxes path; the resulting frames are identical.
shuffled = data.sample(frac=1, random_state=1729)
split_at = int(0.7 * len(data))
train, test = shuffled.iloc[:split_at], shuffled.iloc[split_at:]
print(train.shape, test.shape)

(28831, 61) (12357, 61)


In [61]:
# The XGBoost container expects CSV training data with the label (target) in the
# first column and no header row, so put 'y_yes' first and drop both one-hot
# target columns from the features.
pd.concat([train['y_yes'], train.drop(['y_no', 'y_yes'], axis=1)], axis=1).to_csv('train.csv', index=False, header=False)

# Upload the formatted training data to S3. S3 keys always use '/' as the
# separator, so build the key explicitly instead of with the OS-dependent
# os.path.join.
train_key = '{}/train/train.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket).Object(train_key).upload_file('train.csv')

# Training channel pointing at the uploaded folder, declared as CSV.
s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/train'.format(bucket, prefix), content_type='csv')

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


### Training

In [62]:
# SageMaker session shared by the estimator.
sess = sagemaker.Session()

# Trained model artifacts are written under <bucket>/<prefix>/output.
model_output_path = 's3://{}/{}/output'.format(bucket, prefix)

# Estimator running the region-specific XGBoost image on a single ml.m4.xlarge instance.
xgb = sagemaker.estimator.Estimator(
    containers[my_region],
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=model_output_path,
    sagemaker_session=sess,
)

# XGBoost hyperparameters for a binary classifier trained for 100 boosting rounds.
xgb_hyperparameters = {
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'silent': 0,
    'objective': 'binary:logistic',
    'num_round': 100,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [63]:
# Launch the training job. Only a 'train' channel is supplied; no validation
# channel is passed to the estimator here.
xgb.fit({'train': s3_input_train})

2020-10-10 10:13:52 Starting - Starting the training job...
2020-10-10 10:13:54 Starting - Launching requested ML instances......
2020-10-10 10:14:54 Starting - Preparing the instances for training...
2020-10-10 10:15:44 Downloading - Downloading input data...
2020-10-10 10:16:05 Training - Downloading the training image.[34mArguments: train[0m
[34m[2020-10-10:10:16:25:INFO] Running standalone xgboost training.[0m
[34m[2020-10-10:10:16:25:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2020-10-10:10:16:25:INFO] File size need to be processed in the node: 3.38mb. Available memory size in the node: 8499.14mb[0m
[34m[2020-10-10:10:16:25:INFO] Determined delimiter of CSV input is ','[0m
[34m[10:16:25] S3DistributionType set as FullyReplicated[0m
[34m[10:16:25] 28831x59 matrix with 1701029 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[10:16:25] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes

### Deploy the model

In [64]:
# deploy() hosts the trained model on one ml.m4.xlarge instance and returns a
# predictor object; later cells call .predict() on it.
# NOTE(review): no cleanup cell is visible in this section; delete the
# endpoint when finished so it does not keep running.
xgb_predictor = xgb.deploy(initial_instance_count=1,
                          instance_type='ml.m4.xlarge')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


-------------!

### Test the model

In [66]:
# Feature matrix for inference: the test rows without the two one-hot target columns.
test_data_array = test.drop(['y_no', 'y_yes'], axis=1).values

# Set the datatype for an inference input: requests are serialized and sent as CSV.
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = csv_serializer

# Send the whole test set in one request and decode the raw byte response to text.
predictions = xgb_predictor.predict(test_data_array).decode('utf-8')
# Parse the comma-separated scores into a float array.
# NOTE(review): predictions[1:] drops the first character of the response
# before parsing; presumably this skips a leading delimiter. Confirm against
# the container's response format, since it would otherwise clip the first score.
predictions_array = np.fromstring(predictions[1:], sep=',')

In [67]:
# Sanity check: there should be one prediction per test row.
print(predictions_array.shape)

(12357,)


In [68]:
# Preview the raw predicted scores (NumPy abbreviates the middle of long arrays).
predictions_array

array([0.06022352, 0.08928269, 0.05913398, ..., 0.0573156 , 0.02460817,
       0.03605176])

### Evaluate model performance

In [69]:
# Turn the predicted scores into 0/1 class labels by rounding to the nearest integer.
predicted_labels = np.round(predictions_array)

# Confusion matrix: actual 'y_yes' values as rows, predicted labels as columns.
matrix = pd.crosstab(
    index=test['y_yes'],
    columns=predicted_labels,
    rownames=['Actual'],
    colnames=['Predicted'],
)
matrix

Predicted,0.0,1.0
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,10785,151
1,1143,278


In [71]:

# Unpack the 2x2 confusion matrix by position (rows = actual, columns = predicted).
tn = matrix.iloc[0,0]
fn = matrix.iloc[1,0]
tp = matrix.iloc[1,1]
fp = matrix.iloc[0,1]
# Overall accuracy as a percentage.
p = (tp+tn) / (tp+tn+fp+fn) * 100

# The label is 27 characters long, so the field must be wider than that;
# the previous width of 20 printed "Overall Classification Rate89.5%" with no gap.
print('\n{0:<28}{1:<4.1f}%\n'.format('Overall Classification Rate', p))
# Columns are the predicted class; each percentage is the share of that
# predicted column falling in the row's actual class (counts in parentheses).
print('{0:<15}{1:<15}{2:>8}'.format('Predicted', 'No Purchase', 'Purchase'))
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("No Purchase", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Purchase", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))


Overall Classification Rate89.5%

Predicted      No Purchase    Purchase
No Purchase    90% (10785)    35% (151)
Purchase        10% (1143)     65% (278) 

