Importing SageMaker libs

In [None]:
import sagemaker
import boto3
from sagemaker import get_execution_role

# `csv_serializer` exists only in SageMaker Python SDK v1; SDK v2 replaced it
# with `sagemaker.serializers.CSVSerializer`. Guard the import so this cell
# runs on both major versions (the evaluation cell already branches on the
# SDK version and does not rely on this name under v2).
try:
    from sagemaker.predictor import csv_serializer
except ImportError:
    csv_serializer = None

# Region configured for the default boto3 session.
region = boto3.Session().region_name

# SageMaker SDK session, used below for the default bucket and S3 uploads.
session = sagemaker.Session()

# S3 location (bucket + key prefix) under which this notebook keeps its data.
bucket = session.default_bucket()
prefix = 'sagemaker/autopilot-churn'

# IAM role passed to the SageMaker API calls made later in the notebook.
role = get_execution_role()

# Low-level SageMaker client used for the AutoML job, model and endpoint calls.
sm = boto3.Session().client(service_name='sagemaker',region_name=region)

Importing other libs

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import os
import sys
import time
import json
from IPython.display import display
from time import strftime, gmtime, sleep


Download dataset

In [None]:
#https://www.kaggle.com/c/customer-churn-prediction-2020/data?select=train.csv

Take a look at the dataset

In [None]:
# Raise the column display limit (to 500) before rendering the frame.
pd.set_option('display.max_columns', 500)

# Load the local churn CSV and show it as the cell output.
churn = pd.read_csv('./data/churn.csv')
churn

Upload file to S3

In [None]:
# Upload the local train split under "<prefix>/train" and report its S3 path.
train_data_s3_path = session.upload_data(
    path='./data/train.csv',
    key_prefix=prefix + "/train",
)
print('Train data uploaded to: ' + train_data_s3_path)

# Upload the local test split under "<prefix>/test" and report its S3 path.
test_data_s3_path = session.upload_data(
    path='./data/test.csv',
    key_prefix=prefix + "/test",
)
print('Test data uploaded to: ' + test_data_s3_path)

Initiate the AutoPilot (AutoML) training

In [None]:
# Input channel for the AutoML job: every object under the train prefix,
# with the `churn` column as the prediction target.
input_data_config = [
    {
        'DataSource': {
            'S3DataSource': {
                'S3DataType': 'S3Prefix',
                'S3Uri': f's3://{bucket}/{prefix}/train',
            },
        },
        'TargetAttributeName': 'churn',
    },
]

# S3 location handed to the job as its output path.
output_data_config = {'S3OutputPath': f's3://{bucket}/{prefix}/output'}


Launch the job

In [None]:
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())

autopilot_job_name = 'autopilot-churn-' + timestamp_suffix
print('Job Name: ' + autopilot_job_name)
%store autopilot_job_name

sm.create_auto_ml_job(AutoMLJobName=autopilot_job_name,
                      InputDataConfig=input_data_config,
                      OutputDataConfig=output_data_config,
                      AutoMLJobConfig={'CompletionCriteria': {'MaxCandidates': 20}
                                      },
                      RoleArn=role)


Tracking job progress

In [None]:
# Poll the AutoML job until it reaches a terminal state, printing
# "<overall status> - <secondary status>" once per poll.
print('Overall Status - Secondary Status')
print('------------------------------')

TERMINAL_JOB_STATUSES = ('Failed', 'Completed', 'Stopped')
POLL_INTERVAL_SECONDS = 30

while True:
    describe_response = sm.describe_auto_ml_job(AutoMLJobName=autopilot_job_name)
    job_run_status = describe_response['AutoMLJobStatus']
    print(job_run_status + " - " + describe_response['AutoMLJobSecondaryStatus'])

    # Check before sleeping so the cell returns as soon as the job is done
    # instead of waiting one extra interval after the final status.
    if job_run_status in TERMINAL_JOB_STATUSES:
        break
    sleep(POLL_INTERVAL_SECONDS)

# Surface the failure reason (when the service reports one) so that a failed
# or stopped job is noticed here rather than in a later cell.
if job_run_status != 'Completed':
    print('Job ended with status ' + job_run_status + ': '
          + str(describe_response.get('FailureReason', 'no failure reason reported')))


If you're using SageMaker Studio, try looking around **Experiments and trials** to visualize the Autopilot job status.

Optionally, explore the auto-generated notebooks (candidate definition and data exploration)

In [None]:
# Optional: uncomment every line below to print the S3 locations of the
# notebooks generated by the AutoML job and download them into the current
# directory. The whole cell is kept commented out (including the continuation
# lines of the download calls) so that it is a no-op on "Run All".

#print(describe_response['AutoMLJobArtifacts']['CandidateDefinitionNotebookLocation'])
#print(describe_response['AutoMLJobArtifacts']['DataExplorationNotebookLocation'])

#candidate_nbk = describe_response['AutoMLJobArtifacts']['CandidateDefinitionNotebookLocation']
#data_explore_nbk = describe_response['AutoMLJobArtifacts']['DataExplorationNotebookLocation']

#def split_s3_path(s3_path):
#    path_parts=s3_path.replace("s3://","").split("/")
#    bucket=path_parts.pop(0)
#    key="/".join(path_parts)
#    return bucket, key

#s3_bucket, candidate_nbk_key = split_s3_path(candidate_nbk)
#_, data_explore_nbk_key = split_s3_path(data_explore_nbk)

#session.download_data(path='./', bucket=s3_bucket,
#                                 key_prefix = candidate_nbk_key)

#session.download_data(path='./', bucket=s3_bucket,
#                                 key_prefix = data_explore_nbk_key)

Now we play the waiting game; once the job has completed, get the best candidate result.

In [None]:
# Fetch the job description again and pull out its best candidate.
automl_job_description = sm.describe_auto_ml_job(AutoMLJobName=autopilot_job_name)
best_candidate = automl_job_description['BestCandidate']
best_candidate_name = best_candidate['CandidateName']

# Dump the raw candidate record, then a short summary of its name and
# final objective metric.
print(best_candidate)
print('\n')
print("CandidateName: " + best_candidate_name)

objective_metric = best_candidate['FinalAutoMLJobObjectiveMetric']
print("FinalAutoMLJobObjectiveMetricName: " + objective_metric['MetricName'])
print("FinalAutoMLJobObjectiveMetricValue: " + str(objective_metric['Value']))

Now deploy the best model

In [None]:
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())
model_name = best_candidate_name + timestamp_suffix + "-model"
model_arn = sm.create_model(Containers=best_candidate['InferenceContainers'],
                            ModelName=model_name,
                            ExecutionRoleArn=role)

epc_name = best_candidate_name + timestamp_suffix + "-epc"
ep_config = sm.create_endpoint_config(EndpointConfigName = epc_name,
                                      ProductionVariants=[{'InstanceType': 'ml.m5.2xlarge',
                                                           'InitialInstanceCount': 1,
                                                           'ModelName': model_name,
                                                           'VariantName': 'main'}])

ep_name = best_candidate_name + timestamp_suffix + "-ep"
create_endpoint_response = sm.create_endpoint(EndpointName=ep_name,
                                              EndpointConfigName=epc_name)

print('Endpoint: ' + create_endpoint_response['EndpointArn'])
%store create_endpoint_response['EndpointArn']

And optionally, evaluate it

In [None]:
# Score the local test split against the deployed endpoint and report
# accuracy, precision, recall and F1.
test_data = pd.read_csv('./data/test.csv')
test_data_inference = test_data.drop('churn', axis=1)

# Header-less CSV payload of the feature columns; both SDK branches send it.
inference_payload = test_data_inference.to_csv(sep=',', header=False, index=False)

# Compare the major version numerically: a plain string comparison would
# misorder versions such as '10.0' and '2.0'.
sdk_major_version = int(sagemaker.__version__.split('.')[0])

if sdk_major_version < 2:
    from sagemaker.predictor import RealTimePredictor
    from sagemaker.content_types import CONTENT_TYPE_CSV
    predictor = RealTimePredictor(
        endpoint=ep_name,
        sagemaker_session=session,
        content_type=CONTENT_TYPE_CSV,
        accept=CONTENT_TYPE_CSV)

    # The response is decoded to CSV text, then parsed into one row per
    # prediction. Values are kept as strings to match the other branch.
    prediction = predictor.predict(inference_payload).decode('utf-8')
    prediction_df = pd.read_csv(io.StringIO(prediction), header=None, dtype=str)

else:
    from sagemaker.predictor import Predictor
    from sagemaker.serializers import CSVSerializer
    from sagemaker.deserializers import CSVDeserializer
    predictor = Predictor(
        endpoint_name=ep_name,
        sagemaker_session=session,
        serializer=CSVSerializer(),
        deserializer=CSVDeserializer())

    # CSVDeserializer yields the parsed CSV rows; wrap them in a frame so the
    # predicted label is available as column 0.
    prediction = predictor.predict(inference_payload)
    prediction_df = pd.DataFrame(prediction)

# Compare labels as strings, since the endpoint returns text.
y_true = test_data['churn'].astype(str)
y_pred = prediction_df[0].astype(str)
assert len(y_pred) == len(y_true), (
    'Expected {} predictions, got {}'.format(len(y_true), len(y_pred)))

# NOTE(review): 'True.' must be the positive class label exactly as it appears
# in the `churn` column of the data — confirm against the dataset.
POSITIVE_LABEL = 'True.'

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, pos_label=POSITIVE_LABEL)
recall = recall_score(y_true, y_pred, pos_label=POSITIVE_LABEL, average='binary')
f1 = f1_score(y_true, y_pred, pos_label=POSITIVE_LABEL)

print('Accuracy: {}'.format(accuracy))
print('Precision: {}'.format(precision))
print('Recall: {}'.format(recall))
print('F1: {}'.format(f1))

Also, when you're done, remember to delete the endpoints

In [None]:
# Uncomment to delete the endpoint, endpoint configuration and model that were
# created by the deployment cell (identified by ep_name, epc_name, model_name).
#sm.delete_endpoint(EndpointName=ep_name)
#sm.delete_endpoint_config(EndpointConfigName=epc_name)
#sm.delete_model(ModelName=model_name)
