In [68]:
import numpy as np
import pandas as pd

import boto3
import re

import sagemaker
from sagemaker import get_execution_role

In [69]:
# Establish a session with AWS
sess = sagemaker.Session()

In [70]:
role = get_execution_role()

In [19]:
# This role contains the permissions needed to train, deploy models
# SageMaker Service is trusted to assume this role
print(role)

arn:aws:iam::750253866451:role/sage_role


In [71]:
bucket_name = 'saurav-ml-sagemaker'

training_folder = r'churnmodel/training/'
validation_folder = r'churnmodel/validation/'
test_folder = r'churnmodel/test/'

s3_model_output_location = r's3://{0}/churnmodel/model'.format(bucket_name)
s3_training_file_location = r's3://{0}/{1}'.format(bucket_name,training_folder)
s3_validation_file_location = r's3://{0}/{1}'.format(bucket_name,validation_folder)
s3_test_file_location = r's3://{0}/{1}'.format(bucket_name,test_folder)

In [72]:
print(s3_model_output_location)
print(s3_training_file_location)
print(s3_validation_file_location)
print(s3_test_file_location)

s3://saurav-ml-sagemaker/churnmodel/model
s3://saurav-ml-sagemaker/churnmodel/training/
s3://saurav-ml-sagemaker/churnmodel/validation/
s3://saurav-ml-sagemaker/churnmodel/test/


In [6]:
'''In the past, we had to maintain the algorithm containers mapping:


containers = {'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
              'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
              'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
              'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'}

Reference:
https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
'''

"In the past, we had to maintain the algorithm containers mapping:\n\n\ncontainers = {'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',\n              'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',\n              'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',\n              'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'}\n\nReference:\nhttps://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html\n"

In [73]:
# Sagemaker API now maintains the algorithm container mapping for us
# Specify the region, algorithm and version
container = sagemaker.amazon.amazon_estimator.get_image_uri(
    sess.boto_region_name,
    "xgboost", 
    "latest")

print('Using SageMaker XGBoost container:\n{} ({})'.format(container, sess.boto_region_name))

	get_image_uri(region, 'xgboost', '0.90-1').


Using SageMaker XGBoost container:
811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest (us-east-1)


In [74]:
# Configure the training job
# Specify type and number of instances to use
# S3 location where final artifacts needs to be stored

#   Reference: http://sagemaker.readthedocs.io/en/latest/estimators.html

estimator = sagemaker.estimator.Estimator(
    container,
    role, 
    train_instance_count=1, 
    train_instance_type='ml.m4.xlarge',
    output_path=s3_model_output_location,
    sagemaker_session=sess,
    base_job_name ='xgboost-churn-v1')

In [75]:
# Specify hyper parameters that appropriate for the training algorithm
# XGBoost Training Parameter Reference: 
#   https://github.com/dmlc/xgboost/blob/master/doc/parameter.md

# max_depth=5,eta=0.1,subsample=0.7,num_round=150
estimator.set_hyperparameters(max_depth=5,
                              objective="binary:logistic",
                              eta=0.1,
                              num_round=150)

In [76]:
estimator.hyperparameters()

{'max_depth': 5, 'objective': 'binary:logistic', 'eta': 0.1, 'num_round': 150}

In [77]:
# content type can be libsvm or csv for XGBoost
training_input_config = sagemaker.session.s3_input(
    s3_data=s3_training_file_location,
    content_type='csv',
    s3_data_type='S3Prefix')

validation_input_config = sagemaker.session.s3_input(
    s3_data=s3_validation_file_location,
    content_type='csv',
    s3_data_type='S3Prefix'
)

data_channels = {'train': training_input_config, 'validation': validation_input_config}

In [78]:
print(training_input_config.config)
print(validation_input_config.config)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://saurav-ml-sagemaker/churnmodel/training/', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://saurav-ml-sagemaker/churnmodel/validation/', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}


In [79]:
# XGBoost supports "train", "validation" channels
# Reference: Supported channels by algorithm
#   https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
estimator.fit(data_channels)

2020-03-27 10:05:09 Starting - Starting the training job...
2020-03-27 10:05:10 Starting - Launching requested ML instances.........
2020-03-27 10:07:10 Starting - Preparing the instances for training......
2020-03-27 10:08:05 Downloading - Downloading input data...
2020-03-27 10:08:23 Training - Downloading the training image.[34mArguments: train[0m
[34m[2020-03-27:10:08:44:INFO] Running standalone xgboost training.[0m
[34m[2020-03-27:10:08:44:INFO] File size need to be processed in the node: 0.55mb. Available memory size in the node: 8520.96mb[0m
[34m[2020-03-27:10:08:44:INFO] Determined delimiter of CSV input is ','[0m
[34m[10:08:44] S3DistributionType set as FullyReplicated[0m
[34m[10:08:44] 7000x13 matrix with 91000 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-03-27:10:08:44:INFO] Determined delimiter of CSV input is ','[0m
[34m[10:08:44] S3DistributionType set as FullyReplicated[0m
[34m[10:08:44] 1500x13 matrix w


2020-03-27 10:08:56 Uploading - Uploading generated training model
2020-03-27 10:08:56 Completed - Training job completed
Training seconds: 51
Billable seconds: 51


In [29]:
xgb_predictor = estimator.deploy(initial_instance_count=1,
                                content_type='text/csv',
                                instance_type='ml.t2.medium',
                                endpoint_name = 'xgboost-churn-v1'
                                )

-----------------!

In [33]:
endpoint_name = 'xgboost-churn-v1'
predictor = sagemaker.predictor.RealTimePredictor(endpoint=endpoint_name)

In [34]:
from sagemaker.predictor import csv_serializer, json_deserializer

predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
predictor.deserializer = None

In [35]:
df_all = pd.read_csv('test.csv')

In [36]:
df_all


Unnamed: 0,0.0,0.0.1,1.0,1.0.1,0.0.2,608.0,41.0,1.0.2,83807.86,1.0.3,0.0.3,1.0.4,112542.58,0.0.4
0,0.0,1.0,0.0,0.0,1.0,653.0,58.0,1.0,132602.88,1.0,1.0,0.0,5097.67,1.0
1,1.0,0.0,0.0,1.0,0.0,726.0,24.0,6.0,0.00,2.0,1.0,1.0,54724.03,0.0
2,0.0,0.0,1.0,1.0,0.0,591.0,39.0,3.0,0.00,3.0,1.0,0.0,140469.38,1.0
3,0.0,1.0,0.0,0.0,1.0,582.0,41.0,6.0,70349.48,2.0,0.0,1.0,178074.04,0.0
4,0.0,0.0,1.0,1.0,0.0,660.0,61.0,5.0,155931.11,1.0,1.0,1.0,158338.39,0.0
5,0.0,1.0,0.0,0.0,1.0,550.0,38.0,2.0,103391.38,1.0,0.0,1.0,90878.13,0.0
6,1.0,0.0,0.0,1.0,0.0,788.0,33.0,5.0,0.00,2.0,0.0,0.0,116978.19,0.0
7,0.0,1.0,0.0,1.0,0.0,604.0,25.0,5.0,157780.84,2.0,1.0,1.0,58426.81,0.0
8,1.0,0.0,0.0,1.0,0.0,735.0,21.0,1.0,178718.19,2.0,1.0,0.0,22388.00,0.0
9,0.0,1.0,0.0,1.0,0.0,416.0,41.0,10.0,122189.66,2.0,1.0,0.0,98301.61,0.0


In [37]:
arr_test = df_all[df_all.columns[:-1]].values

In [38]:
arr_test.shape

(1499, 13)

In [39]:
type(arr_test)

numpy.ndarray

In [40]:
arr_test.shape

(1499, 13)

In [41]:
result = predictor.predict(arr_test[:2])

In [42]:
result

b'0.000610451214015,0.000663901038934'

In [49]:
predictions = []
for arr in np.array_split(arr_test,10):
    result = predictor.predict(arr)
    result = result.decode("utf-8")
    result = result.split(',')
    print (arr.shape)
    predictions += [float(r) for r in result]

(150, 13)
(150, 13)
(150, 13)
(150, 13)
(150, 13)
(150, 13)
(150, 13)
(150, 13)
(150, 13)
(149, 13)


In [55]:
final_results = []
for i in range(0, len(predictions)):
    if predictions[i] >0.5:
        
        final_results.append(1)
    else:
        
        final_results.append(0)

In [56]:
final_results

[0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,


In [59]:
len(final_results)

1499

In [63]:
y_true = df_all.iloc[:,-1]

In [60]:
from sklearn.metrics import confusion_matrix


In [64]:
cm = confusion_matrix(y_true, final_results)

In [65]:
cm

array([[885, 299],
       [249,  66]])