Importing SageMaker libs

In [1]:
import sagemaker
import boto3
from sagemaker import get_execution_role
from sagemaker.predictor import csv_serializer

region = boto3.Session().region_name

session = sagemaker.Session()

bucket = session.default_bucket()
prefix = 'sagemaker/autopilot-churn'

role = get_execution_role()

sm = boto3.Session().client(service_name='sagemaker',region_name=region)

Importing other libs

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import os
import sys
import time
import json
from IPython.display import display
from time import strftime, gmtime, sleep


Download dataset

In [3]:
#!wget

Take a look at the dataset

In [4]:
churn = pd.read_csv('./data/churn.csv')
pd.set_option('display.max_columns', 500)
churn

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,OH,107,area_code_415,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.70,1,no
1,NJ,137,area_code_415,no,no,0,243.4,114,41.38,121.2,110,10.30,162.6,104,7.32,12.2,5,3.29,0,no
2,OH,84,area_code_408,yes,no,0,299.4,71,50.90,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,no
3,OK,75,area_code_415,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,no
4,MA,121,area_code_510,no,yes,24,218.2,88,37.09,348.5,108,29.62,212.6,118,9.57,7.5,7,2.03,3,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4245,MT,83,area_code_415,no,no,0,188.3,70,32.01,243.8,88,20.72,213.7,79,9.62,10.3,6,2.78,0,no
4246,WV,73,area_code_408,no,no,0,177.9,89,30.24,131.2,82,11.15,186.2,89,8.38,11.5,6,3.11,3,no
4247,NC,75,area_code_408,no,no,0,170.7,101,29.02,193.1,126,16.41,129.1,104,5.81,6.9,7,1.86,1,no
4248,HI,50,area_code_408,no,yes,40,235.7,127,40.07,223.0,126,18.96,297.5,116,13.39,9.9,5,2.67,2,no


Upload file to S3

In [5]:
#train_data = #TODO
train_data_s3_path = session.upload_data(path='./data/train.csv', key_prefix=prefix + "/train")
print('Train data uploaded to: ' + train_data_s3_path)

test_data_s3_path = session.upload_data(path='./data/test.csv', key_prefix=prefix + "/test")
print('Test data uploaded to: ' + test_data_s3_path)

Train data uploaded to: s3://sagemaker-us-east-1-907833590179/sagemaker/autopilot-churn/train/train.csv
Test data uploaded to: s3://sagemaker-us-east-1-907833590179/sagemaker/autopilot-churn/test/test.csv


Initiate the AutoPilot (AutoML) training

In [6]:
input_data_config = [{
      'DataSource': {
        'S3DataSource': {
          'S3DataType': 'S3Prefix',
          'S3Uri': 's3://{}/{}/train'.format(bucket,prefix)
        }
      },
      'TargetAttributeName': 'churn'
    }
  ]

output_data_config = {
    'S3OutputPath': 's3://{}/{}/output'.format(bucket,prefix)
  }


Launch the job

In [7]:
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())

autopilot_job_name = 'autopilot-churn-' + timestamp_suffix
print('Job Name: ' + autopilot_job_name)
%store autopilot_job_name

sm.create_auto_ml_job(AutoMLJobName=autopilot_job_name,
                      InputDataConfig=input_data_config,
                      OutputDataConfig=output_data_config,
                      AutoMLJobConfig={'CompletionCriteria': {'MaxCandidates': 20}
                                      },
                      RoleArn=role)


Job Name: autopilot-churn-18-11-08-18
Stored 'autopilot_job_name' (str)


{'AutoMLJobArn': 'arn:aws:sagemaker:us-east-1:907833590179:automl-job/autopilot-churn-18-11-08-18',
 'ResponseMetadata': {'RequestId': '048d7ef3-ead1-4db4-9e9c-70d6f6a45f1e',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '048d7ef3-ead1-4db4-9e9c-70d6f6a45f1e',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '98',
   'date': 'Sun, 18 Apr 2021 11:08:19 GMT'},
  'RetryAttempts': 0}}

Tracking job progress

In [8]:
print ('Overall Status - Secondary Status')
print('------------------------------')


describe_response = sm.describe_auto_ml_job(AutoMLJobName=autopilot_job_name)
print (describe_response['AutoMLJobStatus'] + " - " + describe_response['AutoMLJobSecondaryStatus'])
job_run_status = describe_response['AutoMLJobStatus']
    
while job_run_status not in ('Failed', 'Completed', 'Stopped'):
    describe_response = sm.describe_auto_ml_job(AutoMLJobName=autopilot_job_name)
    job_run_status = describe_response['AutoMLJobStatus']
    
    print (describe_response['AutoMLJobStatus'] + " - " + describe_response['AutoMLJobSecondaryStatus'])
    sleep(30)


Overall Status - Secondary Status
------------------------------
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
I

KeyboardInterrupt: 

If you're using SageMaker Studio, try look around at **Experiments and trials** for visualization of AutoPilot job status

Optionally, explore Auto-generated codes

In [9]:
#print(describe_response['AutoMLJobArtifacts']['CandidateDefinitionNotebookLocation'])
#print(describe_response['AutoMLJobArtifacts']['DataExplorationNotebookLocation'])

#candidate_nbk = describe_response['AutoMLJobArtifacts']['CandidateDefinitionNotebookLocation']
#data_explore_nbk = describe_response['AutoMLJobArtifacts']['DataExplorationNotebookLocation']

#def split_s3_path(s3_path):
#    path_parts=s3_path.replace("s3://","").split("/")
#    bucket=path_parts.pop(0)
#    key="/".join(path_parts)
#    return bucket, key

#s3_bucket, candidate_nbk_key = split_s3_path(candidate_nbk)
#_, data_explore_nbk_key = split_s3_path(data_explore_nbk)

#session.download_data(path='./', bucket=s3_bucket, 
                                 key_prefix = candidate_nbk_key)

#session.download_data(path='./', bucket=s3_bucket, 
                                 key_prefix = data_explore_nbk_key)

s3://sagemaker-us-east-1-907833590179/sagemaker/autopilot-churn/output/autopilot-churn-18-11-08-18/sagemaker-automl-candidates/pr-1-5965b5e4bdc94b13876db648287347362eaaf7cc659642a2aa143d3bd5/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb
s3://sagemaker-us-east-1-907833590179/sagemaker/autopilot-churn/output/autopilot-churn-18-11-08-18/sagemaker-automl-candidates/pr-1-5965b5e4bdc94b13876db648287347362eaaf7cc659642a2aa143d3bd5/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb
sagemaker-us-east-1-907833590179 sagemaker/autopilot-churn/output/autopilot-churn-18-11-08-18/sagemaker-automl-candidates/pr-1-5965b5e4bdc94b13876db648287347362eaaf7cc659642a2aa143d3bd5/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb sagemaker/autopilot-churn/output/autopilot-churn-18-11-08-18/sagemaker-automl-candidates/pr-1-5965b5e4bdc94b13876db648287347362eaaf7cc659642a2aa143d3bd5/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb


Now we're playing the wait game, and get the best candidate result.

In [12]:
best_candidate = sm.describe_auto_ml_job(AutoMLJobName=autopilot_job_name)['BestCandidate']
best_candidate_name = best_candidate['CandidateName']
print(best_candidate)
print('\n')
print("CandidateName: " + best_candidate_name)
print("FinalAutoMLJobObjectiveMetricName: " + best_candidate['FinalAutoMLJobObjectiveMetric']['MetricName'])
print("FinalAutoMLJobObjectiveMetricValue: " + str(best_candidate['FinalAutoMLJobObjectiveMetric']['Value']))

{'CandidateName': 'tuning-job-1-921d9aba2f6d444091-015-77d60aea', 'FinalAutoMLJobObjectiveMetric': {'MetricName': 'validation:f1', 'Value': 0.8737199902534485}, 'ObjectiveStatus': 'Succeeded', 'CandidateSteps': [{'CandidateStepType': 'AWS::SageMaker::ProcessingJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:907833590179:processing-job/db-1-4bbf5ba730474c149fb9db115f729bead646aedc94a143d0a4412ffc21', 'CandidateStepName': 'db-1-4bbf5ba730474c149fb9db115f729bead646aedc94a143d0a4412ffc21'}, {'CandidateStepType': 'AWS::SageMaker::TrainingJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:907833590179:training-job/autopilot--dpp8-1-19658e6927f840d3b0498080a165adcad5a264e4783f4', 'CandidateStepName': 'autopilot--dpp8-1-19658e6927f840d3b0498080a165adcad5a264e4783f4'}, {'CandidateStepType': 'AWS::SageMaker::TransformJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:907833590179:transform-job/autopilot--dpp8-rpb-1-0cbd7a634cb14ec88b7331ebf9112be4524fe9f01', 'CandidateStepName': '

Now deploy the best model

In [15]:
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())
model_name = best_candidate_name + timestamp_suffix + "-model"
model_arn = sm.create_model(Containers=best_candidate['InferenceContainers'],
                            ModelName=model_name,
                            ExecutionRoleArn=role)

epc_name = best_candidate_name + timestamp_suffix + "-epc"
ep_config = sm.create_endpoint_config(EndpointConfigName = epc_name,
                                      ProductionVariants=[{'InstanceType': 'ml.m5.2xlarge',
                                                           'InitialInstanceCount': 1,
                                                           'ModelName': model_name,
                                                           'VariantName': 'main'}])

ep_name = best_candidate_name + timestamp_suffix + "-ep"
create_endpoint_response = sm.create_endpoint(EndpointName=ep_name,
                                              EndpointConfigName=epc_name)

print('Endpoint: ' + create_endpoint_response['EndpointArn'])
%store create_endpoint_response['EndpointArn']

Endpoint: {'EndpointArn': 'arn:aws:sagemaker:us-east-1:907833590179:endpoint/tuning-job-1-921d9aba2f6d444091-015-77d60aea18-11-51-39-ep', 'ResponseMetadata': {'RequestId': 'ddc2eaf0-4955-45d5-88a3-790a88f678cd', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'ddc2eaf0-4955-45d5-88a3-790a88f678cd', 'content-type': 'application/x-amz-json-1.1', 'content-length': '126', 'date': 'Sun, 18 Apr 2021 11:51:40 GMT'}, 'RetryAttempts': 0}}


And optinally, evaluate it

In [16]:
test_data = pd.read_csv('./data/test.csv')
test_data_inference = test_data.drop('churn', axis=1)

if (sagemaker.__version__ < '2'):
    from sagemaker.predictor import RealTimePredictor
    from sagemaker.content_types import CONTENT_TYPE_CSV
    predictor = RealTimePredictor(
        endpoint=ep_name,
        sagemaker_session=session,
        content_type=CONTENT_TYPE_CSV,
        accept=CONTENT_TYPE_CSV)

    prediction = predictor.predict(test_data_inference.to_csv(sep=',', header=False, index=False)).decode('utf-8')

else:
    from sagemaker.predictor import Predictor
    from sagemaker.serializers import CSVSerializer
    from sagemaker.deserializers import CSVDeserializer
    predictor = Predictor(
        endpoint_name=ep_name,
        sagemaker_session=session,
        serializer=CSVSerializer(),
        deserializer=CSVDeserializer())
    
    prediction = predictor.predict(test_data_inference.to_csv(sep=',', header=False, index=False))


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
accuracy = accuracy_score(test_data.reset_index()['churn'], prediction_df[0])
precision = precision_score(test_data.reset_index()['churn'], prediction_df[0], pos_label='True.')
recall = recall_score(test_data.reset_index()['churn'], prediction_df[0], pos_label='True.', average='binary')
f1 = f1_score(test_data.reset_index()['churn'], prediction_df[0], pos_label='True.')

print('Accuracy: {}'.format(accuracy))
print('Precision: {}'.format(precision))
print('Recall: {}'.format(recall))
print('F1: {}'.format(f1))

KeyError: "['churn'] not found in axis"

Also, when you're done, remember to delete the endpoints

In [None]:
#sm.delete_endpoint(EndpointName=ep_name)
#sm.delete_endpoint_config(EndpointConfigName=epc_name)
#sm.delete_model(ModelName=model_name)
