In [36]:
import sklearn
import pandas as pd
import numpy as np
import boto3
import pprint
import os
import time


import re

import sagemaker
from sagemaker import get_execution_role

In [37]:
sess = sagemaker.Session()

In [38]:
role = get_execution_role()
print(role)

arn:aws:iam::750253866451:role/sage_role


In [39]:
bucket_name = 'saurav-ml-sagemaker'

training_folder = r'ChurnModel/training/'
test_folder = r'ChurnModel/test/'
validation_folder = r'ChurnModel/validation/'

In [40]:
s3_model_output_location = r's3://{}/ChurnModel/model'.format(bucket_name)
s3_training_file_location = r's3://{0}/{1}'.format(bucket_name,training_folder)
s3_test_file_location = r's3://{0}/{1}'.format(bucket_name,test_folder)
s3_validation_file_location = r's3://{0}/{1}'.format(bucket_name, validation_folder)

In [41]:
print(s3_model_output_location)
print(s3_training_file_location)
print(s3_validation_file_location)
print(s3_test_file_location)

s3://saurav-ml-sagemaker/ChurnModel/model
s3://saurav-ml-sagemaker/ChurnModel/training/
s3://saurav-ml-sagemaker/ChurnModel/validation/
s3://saurav-ml-sagemaker/ChurnModel/test/


In [42]:
container = sagemaker.amazon.amazon_estimator.get_image_uri(sess.boto_region_name,
                                                            "xgboost",
                                                           "latest")

print('Using SageMaker XGBoost container:\n{} ({})'.format(container, sess.boto_region_name))

	get_image_uri(region, 'xgboost', '0.90-1').


Using SageMaker XGBoost container:
811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest (us-east-1)


In [43]:
estimator = sagemaker.estimator.Estimator(  container,
                                            role, 
                                            train_instance_count=1, 
                                            train_instance_type='ml.m4.xlarge',
                                            output_path=s3_model_output_location,
                                            sagemaker_session=sess,
                                            base_job_name ='xgboost-churn-v1')

In [44]:
estimator.set_hyperparameters(max_depth = 5,
                             objective = "binary:logistic",
                             eta = 0.1,
                             num_round = 150)

In [45]:
estimator.hyperparameters()

{'max_depth': 5, 'objective': 'binary:logistic', 'eta': 0.1, 'num_round': 150}

In [46]:
train_channel = sagemaker.session.s3_input(s3_data = s3_training_file_location,
                                          content_type = 'csv',
                                          s3_data_type = 'S3Prefix')

In [47]:
validation_channel = sagemaker.session.s3_input(s3_data = s3_validation_file_location,
                                          content_type = 'csv',
                                          s3_data_type = 'S3Prefix')

In [48]:
data_channels = {'train': train_channel, 'validation': validation_channel}

In [49]:
print(train_channel.config)
print(validation_channel.config)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://saurav-ml-sagemaker/ChurnModel/training/', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://saurav-ml-sagemaker/ChurnModel/validation/', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'csv'}


In [50]:
estimator.fit(data_channels)

2020-03-29 07:57:28 Starting - Starting the training job...
2020-03-29 07:57:29 Starting - Launching requested ML instances.........
2020-03-29 07:59:11 Starting - Preparing the instances for training......
2020-03-29 08:00:09 Downloading - Downloading input data...
2020-03-29 08:00:44 Training - Downloading the training image..
2020-03-29 08:01:16 Uploading - Uploading generated training model
2020-03-29 08:01:16 Completed - Training job completed
[34mArguments: train[0m
[34m[2020-03-29:08:01:04:INFO] Running standalone xgboost training.[0m
[34m[2020-03-29:08:01:04:INFO] File size need to be processed in the node: 2.06mb. Available memory size in the node: 8517.89mb[0m
[34m[2020-03-29:08:01:04:INFO] Determined delimiter of CSV input is ','[0m
[34m[08:01:04] S3DistributionType set as FullyReplicated[0m
[34m[08:01:04] 7000x13 matrix with 91000 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-03-29:08:01:04:INFO] Determined deli

Training seconds: 67
Billable seconds: 67


In [None]:
xgb_predictor = estimator.deploy(initial_instance_count=1,
                                content_type='text/csv',
                                instance_type='ml.t2.medium',
                                endpoint_name = 'xgboost-churn-v1'
                                )

In [None]:
endpoint_name = 'xgboost-churn-v1'
predictor = sagemaker.predictor.RealTimePredictor(endpoint=endpoint_name)

In [None]:
from sagemaker.predictor import csv_serializer, json_deserializer

predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
predictor.deserializer = None

In [23]:
df_all = pd.read_csv('test.csv')

In [24]:
df_all.head()

Unnamed: 0,-1.0028039309990795,-0.5787359118285802,1.7427397119690895,-0.9124191495230279,0.912419149523028,-0.057205253550936634,0.4842246042993514,1.0329077647974714,0.5973285944410451,0.8077365626180174,0.6460916681531842,-1.030670113400118,0.8636502870218121,1.0
0,0.997204,-0.578736,-0.573809,-0.912419,0.912419,-1.267778,-0.755372,0.341352,0.409185,0.807737,-1.547768,-1.03067,-0.3462,0.0
1,0.997204,-0.578736,-0.573809,1.095988,-1.095988,-1.050496,-1.327494,-0.004426,-1.225848,0.807737,-1.547768,-1.03067,1.578357,0.0
2,-1.002804,-0.578736,1.74274,1.095988,-1.095988,-0.160673,-0.373958,0.68713,-1.225848,0.807737,0.646092,0.970243,-0.593636,0.0
3,-1.002804,1.727904,-0.573809,-0.912419,0.912419,0.025569,1.819175,-1.387538,0.899393,-0.911583,0.646092,-1.03067,-1.651826,1.0
4,0.997204,-0.578736,-0.573809,-0.912419,0.912419,0.842964,0.198164,1.032908,-1.225848,0.807737,0.646092,0.970243,1.23107,0.0


In [None]:
arr_test = df_all[df_all.columns[:-1]].values

In [None]:
arr_test.shape

In [None]:
type(arr_test)

In [None]:
arr_test.shape

In [None]:
result = predictor.predict(arr_test[:2])

In [None]:
result

In [None]:
predictions = []
for arr in np.array_split(arr_test,10):
    result = predictor.predict(arr)
    result = result.decode("utf-8")
    result = result.split(',')
    print (arr.shape)
    predictions += [float(r) for r in result]

In [None]:
final_results = []
for i in range(0, len(predictions)):
    if predictions[i] >0.5:
        
        final_results.append(1)
    else:
        
        final_results.append(0)

In [None]:
final_results

In [None]:
len(final_results)

In [None]:
y_true = df_all.iloc[:,-1]

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true, final_results)
cm