# Direct Marketing with XGBoost and Amazon SageMaker
Featuring SageMaker AutoPilot, SageMaker Processing, SageMaker Experiments, and SageMaker Model Monitoring

In [None]:
!pip install sagemaker smdebug --upgrade

In [None]:
!pip install botocore --upgrade
!pip install boto3 --upgrade

In [15]:
import botocore
botocore.__version__
region

'us-west-2'

In [2]:
import boto3
import sagemaker
import os, sys

print (sagemaker.__version__)

sess   = sagemaker.Session()
bucket = sess.default_bucket()                     
prefix = 'sagemaker/DEMO-automl-dm'
region = boto3.Session().region_name

# Role when working on a notebook instance
role = sagemaker.get_execution_role()

1.45.1


In [3]:
sm = boto3.Session().client(service_name='sagemaker',region_name=region)
sm_rt = boto3.Session().client('runtime.sagemaker', region_name=region)

In [4]:
import numpy as np 
import pandas as pd

In [5]:
!wget -N https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip
!unzip -o bank-additional.zip

--2019-12-09 04:32:55--  https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 444572 (434K) [application/x-httpd-php]
Saving to: ‘bank-additional.zip’


2019-12-09 04:32:56 (2.43 MB/s) - ‘bank-additional.zip’ saved [444572/444572]

Archive:  bank-additional.zip
   creating: bank-additional/
  inflating: bank-additional/.DS_Store  
   creating: __MACOSX/
   creating: __MACOSX/bank-additional/
  inflating: __MACOSX/bank-additional/._.DS_Store  
  inflating: bank-additional/.Rhistory  
  inflating: bank-additional/bank-additional-full.csv  
  inflating: bank-additional/bank-additional-names.txt  
  inflating: bank-additional/bank-additional.csv  
  inflating: __MACOSX/._bank-additional  


Let's read the CSV file into a Pandas data frame and take a look at the first few lines.

In [6]:
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
data = pd.read_csv('./bank-additional/bank-additional-full.csv', sep=';')
pd.set_option('display.max_columns', 500)     # Make sure we can see all of the columns
pd.set_option('display.max_rows', 50)         # Keep the output on one page
data[:10] # Show the first 10 lines

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
5,45,services,married,basic.9y,unknown,no,no,telephone,may,mon,198,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
6,59,admin.,married,professional.course,no,no,no,telephone,may,mon,139,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
7,41,blue-collar,married,unknown,unknown,no,no,telephone,may,mon,217,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
8,24,technician,single,professional.course,no,yes,no,telephone,may,mon,380,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
9,25,services,single,high.school,no,yes,no,telephone,may,mon,50,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [7]:
data.shape # (number of lines, number of columns)

(41188, 21)

## Splitting the dataset

We split the dataset into training (95%) and test (5%) datasets. We will use the training dataset for AutoML, where it will be automatically split again for training and validation.
 
Once the model has been deployed, we'll use the test dataset to evaluate its performance.

In [8]:
# Set the seed to 123 for reproductibility
# https://pandas.pydata.org/pandas-docs/version/0.25/generated/pandas.DataFrame.sample.html
# https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.split.html
train_data, test_data, _ = np.split(data.sample(frac=1, random_state=123), 
                                                  [int(0.95 * len(data)), int(len(data))])  

# Save to CSV files
train_data.to_csv('automl-train.csv', index=False, header=True, sep=',') # Need to keep column names
test_data.to_csv('automl-test.csv', index=False, header=True, sep=',')

In [9]:
!ls -l automl*.csv

-rw-rw-r-- 1 ec2-user ec2-user  257339 Dec  9 04:33 automl-test.csv
-rw-rw-r-- 1 ec2-user ec2-user 4889516 Dec  9 04:33 automl-train.csv


**No preprocessing needed!** AutoML will take care of this, so let's just copy the training set to S3.

In [10]:
sess.upload_data(path="automl-train.csv", key_prefix=prefix + "/input")

's3://sagemaker-us-west-2-453691756499/sagemaker/DEMO-automl-dm/input/automl-train.csv'

## Setting up the Amazon SageMaker AutoPilot job

After uploading the dataset to S3, we can invoke SageMaker AutoPilot to find the best ML pipeline to train a model on this dataset. 

The required inputs for invoking a SageMaker AutoML job are the dataset location in S3, the name of the column of the dataset you want to predict (`y` in this case) and an IAM role.

In [11]:
job_config = {
    'CompletionCriteria': {
      'MaxRuntimePerTrainingJobInSeconds': 600,
      'MaxCandidates': 10,
      'MaxAutoMLJobRuntimeInSeconds': 3600
    },
}

input_data_config = [{
      'DataSource': {
        'S3DataSource': {
          'S3DataType': 'S3Prefix',
          'S3Uri': 's3://{}/{}/input'.format(bucket,prefix)
        }
      },
      'TargetAttributeName': 'y'  # the column we want to predict
    }
]

output_data_config = {
    'S3OutputPath': 's3://{}/{}/output'.format(bucket,prefix)
}

## Launching the Amazon SageMaker AutoPilot job

We can now launch the job by calling the `create_auto_ml_job` API.

In [12]:
from time import gmtime, strftime, sleep
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())

auto_ml_job_name = 'automl-dm-' + timestamp_suffix
print('AutoMLJobName: ' + auto_ml_job_name)

sm.create_auto_ml_job(AutoMLJobName=auto_ml_job_name,
                      InputDataConfig=input_data_config,
                      OutputDataConfig=output_data_config,
                      AutoMLJobConfig=job_config,
                      RoleArn=role)

AutoMLJobName: automl-dm-09-04-34-29


{'AutoMLJobArn': 'arn:aws:sagemaker:us-west-2:453691756499:automl-job/automl-dm-09-04-34-29',
 'ResponseMetadata': {'RequestId': '1adaa2fe-156e-4468-b108-a03de17b2ba2',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '1adaa2fe-156e-4468-b108-a03de17b2ba2',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '92',
   'date': 'Mon, 09 Dec 2019 04:34:29 GMT'},
  'RetryAttempts': 0}}

### Tracking the progress of the Amazon SageMaker AutoPilot job
SageMaker AutoPilot job consists of four high-level steps : 
* Data Preprocessing, where the dataset is split into train and validation sets.
* Recommending Pipelines, where the dataset is analyzed and SageMaker AutoPilot comes up with a list of ML pipelines that should be tried out on the dataset.
* Automatic Feature Engineering, where SageMaker AutoPilot performs feature transformation on individual features of the dataset as well as at an aggregate level.
* ML pipeline selection and hyperparameter tuning, where the top performing pipeline is selected along with the optimal hyperparameters for the training algorithm (the last stage of the pipeline). 

As you can guess, several of these steps are powered by **Amazon SageMaker Processing**.

In [13]:
%%time
job_run_status = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)['AutoMLJobStatus']

print(job_run_status)

while job_run_status not in ('Failed', 'Completed', 'Stopped'):
    describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    job_run_status = describe_response['AutoMLJobStatus']
    
    print (describe_response['AutoMLJobStatus'] + " - " + describe_response['AutoMLJobSecondaryStatus'])
    sleep(60)

InProgress
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
InProgress - ModelTuning
Completed - MaxCandidatesReached
CPU times: user 413 ms, sys: 10.8 ms, total: 424 ms
Wall time: 29min 4s


### Inspecting the SageMaker Autopilot job with Amazon SageMaker Experiments

In [14]:
from sagemaker.analytics import ExperimentAnalytics

analytics = ExperimentAnalytics(
    sagemaker_session=sess, 
    experiment_name=auto_ml_job_name+'-aws-auto-ml-job'
)

df = analytics.dataframe()
df

Unnamed: 0,TrialComponentName,DisplayName,SourceArn,SageMaker.ImageUri,SageMaker.InstanceCount,SageMaker.InstanceType,SageMaker.VolumeSizeInGB,_tuning_objective_metric,alpha,colsample_bytree,eta,gamma,lambda,max_depth,min_child_weight,num_round,objective,subsample,ObjectiveMetric - Min,ObjectiveMetric - Max,ObjectiveMetric - Avg,ObjectiveMetric - StdDev,ObjectiveMetric - Last,ObjectiveMetric - Count,validation:error - Min,validation:error - Max,validation:error - Avg,validation:error - StdDev,validation:error - Last,validation:error - Count,validation:accuracy - Min,validation:accuracy - Max,validation:accuracy - Avg,validation:accuracy - StdDev,validation:accuracy - Last,validation:accuracy - Count,train:error - Min,train:error - Max,train:error - Avg,train:error - StdDev,train:error - Last,train:error - Count,train:accuracy - Min,train:accuracy - Max,train:accuracy - Avg,train:accuracy - StdDev,train:accuracy - Last,train:accuracy - Count,binary_classifier_model_selection_criteria,l1,learning_rate,loss,mini_batch_size,num_models,positive_example_weight_mult,predictor_type,wd,processor_module,sagemaker_program,sagemaker_submit_directory,input_channel_mode,job_name,label_col
0,automl-dm-09-tuning-job-1-3a268-009-6c889f29-a...,automl-dm-09-tuning-job-1-3a268-009-6c889f29-a...,arn:aws:sagemaker:us-west-2:453691756499:train...,246618743249.dkr.ecr.us-west-2.amazonaws.com/s...,1.0,ml.m5.4xlarge,50.0,validation:accuracy,3e-06,0.676956,0.001582,0.006241,7.8e-05,3.0,0.012706,270.0,binary:hinge,0.83269,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,automl-dm-09-tuning-job-1-3a268-007-c1ae9a71-a...,automl-dm-09-tuning-job-1-3a268-007-c1ae9a71-a...,arn:aws:sagemaker:us-west-2:453691756499:train...,246618743249.dkr.ecr.us-west-2.amazonaws.com/s...,1.0,ml.m5.4xlarge,50.0,validation:accuracy,3e-06,0.676956,0.001582,0.006241,7.8e-05,3.0,0.012706,270.0,binary:hinge,0.83269,0.112588,0.112588,0.112588,0.0,0.112588,43.0,0.887412,0.887412,0.887412,0.0,0.887412,43.0,0.112588,0.112588,0.112588,0.0,0.112588,43.0,0.887519,0.887519,0.887519,0.0,0.887519,43.0,0.112481,0.112481,0.112481,0.0,0.112481,43.0,,,,,,,,,,,,,,,
2,automl-dm-09-tuning-job-1-3a268-006-057361d1-a...,automl-dm-09-tuning-job-1-3a268-006-057361d1-a...,arn:aws:sagemaker:us-west-2:453691756499:train...,246618743249.dkr.ecr.us-west-2.amazonaws.com/s...,1.0,ml.m5.4xlarge,50.0,validation:accuracy,3e-06,0.676956,0.001582,0.006241,7.8e-05,3.0,0.012706,270.0,binary:hinge,0.83269,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,automl-dm-09-tuning-job-1-3a268-005-1ab5f8a3-a...,automl-dm-09-tuning-job-1-3a268-005-1ab5f8a3-a...,arn:aws:sagemaker:us-west-2:453691756499:train...,246618743249.dkr.ecr.us-west-2.amazonaws.com/s...,1.0,ml.m5.4xlarge,50.0,validation:accuracy,3e-06,0.676956,0.001582,0.006241,7.8e-05,3.0,0.012706,270.0,binary:hinge,0.83269,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,automl-dm-09-tuning-job-1-3a268-008-e26ef964-a...,automl-dm-09-tuning-job-1-3a268-008-e26ef964-a...,arn:aws:sagemaker:us-west-2:453691756499:train...,246618743249.dkr.ecr.us-west-2.amazonaws.com/s...,1.0,ml.m5.4xlarge,50.0,validation:accuracy,3e-06,0.676956,0.001582,0.006241,7.8e-05,3.0,0.012706,270.0,binary:hinge,0.83269,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,automl-dm-09-tuning-job-1-3a268-003-ed833680-a...,automl-dm-09-tuning-job-1-3a268-003-ed833680-a...,arn:aws:sagemaker:us-west-2:453691756499:train...,246618743249.dkr.ecr.us-west-2.amazonaws.com/s...,1.0,ml.m5.4xlarge,50.0,validation:accuracy,1.6e-05,0.393186,0.025153,0.000119,2.7e-05,30.0,0.035558,187.0,binary:hinge,0.660072,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,automl-dm-09-tuning-job-1-3a268-010-94204782-a...,automl-dm-09-tuning-job-1-3a268-010-94204782-a...,arn:aws:sagemaker:us-west-2:453691756499:train...,174872318107.dkr.ecr.us-west-2.amazonaws.com/l...,1.0,ml.m5.4xlarge,50.0,validation:binary_classification_accuracy,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,loss_function,2.801905e-07,0.004927,logistic,1000.0,1.0,0.01,binary_classifier,0.000253,,,,,,
7,automl-dm-09-tuning-job-1-3a268-001-f7db5792-a...,automl-dm-09-tuning-job-1-3a268-001-f7db5792-a...,arn:aws:sagemaker:us-west-2:453691756499:train...,246618743249.dkr.ecr.us-west-2.amazonaws.com/s...,1.0,ml.m5.4xlarge,50.0,validation:accuracy,1.6e-05,0.393186,0.025153,0.000119,2.7e-05,30.0,0.035558,187.0,binary:hinge,0.660072,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,automl-dm-09-tuning-job-1-3a268-002-8aa752d3-a...,automl-dm-09-tuning-job-1-3a268-002-8aa752d3-a...,arn:aws:sagemaker:us-west-2:453691756499:train...,246618743249.dkr.ecr.us-west-2.amazonaws.com/s...,1.0,ml.m5.4xlarge,50.0,validation:accuracy,1.6e-05,0.393186,0.025153,0.000119,2.7e-05,30.0,0.035558,187.0,binary:hinge,0.660072,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,automl-dm-09-tuning-job-1-3a268-004-5a614c2a-a...,automl-dm-09-tuning-job-1-3a268-004-5a614c2a-a...,arn:aws:sagemaker:us-west-2:453691756499:train...,246618743249.dkr.ecr.us-west-2.amazonaws.com/s...,1.0,ml.m5.4xlarge,50.0,validation:accuracy,3e-06,0.676956,0.001582,0.006241,7.8e-05,3.0,0.012706,270.0,binary:hinge,0.83269,0.112588,0.112588,0.112588,0.0,0.112588,44.0,0.887412,0.887412,0.887412,0.0,0.887412,44.0,0.112588,0.112588,0.112588,0.0,0.112588,44.0,0.887519,0.887519,0.887519,0.0,0.887519,44.0,0.112481,0.112481,0.112481,0.0,0.112481,44.0,,,,,,,,,,,,,,,


### Listing all candidates explored by Amazon SageMaker AutoPilot
You can view all the candidates (pipeline evaluations with different hyperparameter combinations) that were explored by AutoML and sort them by their final performance metric.

In [16]:
candidates = sm.list_candidates_for_auto_ml_job(AutoMLJobName=auto_ml_job_name, 
                                                SortBy='FinalObjectiveMetricValue')['Candidates']
index = 1
for candidate in candidates:
  print (str(index) + "  " 
         + candidate['CandidateName'] + "  " 
         + str(candidate['FinalAutoMLJobObjectiveMetric']['Value']))
  index += 1

1  automl-dm-09-tuning-job-1-3a268-003-ed833680  0.9100319743156433
2  automl-dm-09-tuning-job-1-3a268-001-f7db5792  0.9095209836959839
3  automl-dm-09-tuning-job-1-3a268-002-8aa752d3  0.9088820219039917
4  automl-dm-09-tuning-job-1-3a268-010-94204782  0.8874121308326721
5  automl-dm-09-tuning-job-1-3a268-004-5a614c2a  0.11258800327777863
6  automl-dm-09-tuning-job-1-3a268-008-e26ef964  0.11258800327777863
7  automl-dm-09-tuning-job-1-3a268-006-057361d1  0.11258800327777863
8  automl-dm-09-tuning-job-1-3a268-007-c1ae9a71  0.11258800327777863
9  automl-dm-09-tuning-job-1-3a268-005-1ab5f8a3  0.11258800327777863
10  automl-dm-09-tuning-job-1-3a268-009-6c889f29  0.11258800327777863


In [17]:
best_candidate = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)['BestCandidate']
best_candidate_name = best_candidate['CandidateName']

print("Candidate name: " + best_candidate_name)

Candidate name: automl-dm-09-tuning-job-1-3a268-003-ed833680


We can also see the containers and models composing the Inference Pipeline.

In [18]:
for container in best_candidate['InferenceContainers']:
    print(container['Image'])
    print(container['ModelDataUrl'])
    print('-')

246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-sklearn-automl:0.1.0-cpu-py3
s3://sagemaker-us-west-2-453691756499/sagemaker/DEMO-automl-dm/output/automl-dm-09-04-34-29/data-processor-models/automl-dm-09-04-34-29-automl-dm--dpp0-1-2a614a10de4c4cc0b8ce7f1/output/model.tar.gz
-
246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-xgboost:0.90-1-cpu-py3
s3://sagemaker-us-west-2-453691756499/sagemaker/DEMO-automl-dm/output/automl-dm-09-04-34-29/tuning/automl-dm--dpp0-xgb/automl-dm-09-tuning-job-1-3a268-003-ed833680/output/model.tar.gz
-
246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-sklearn-automl:0.1.0-cpu-py3
s3://sagemaker-us-west-2-453691756499/sagemaker/DEMO-automl-dm/output/automl-dm-09-04-34-29/data-processor-models/automl-dm-09-04-34-29-automl-dm--dpp0-1-2a614a10de4c4cc0b8ce7f1/output/model.tar.gz
-


### Fetching the auto-generated notebooks

SageMaker AutoPilot also generates two notebooks: 
* Data exploration,
* Candidate definition.

In [19]:
job = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
job_candidate_notebook = job['AutoMLJobArtifacts']['CandidateDefinitionNotebookLocation']
job_data_notebook = job['AutoMLJobArtifacts']['DataExplorationNotebookLocation']

print(job_candidate_notebook)
print(job_data_notebook)

s3://sagemaker-us-west-2-453691756499/sagemaker/DEMO-automl-dm/output/automl-dm-09-04-34-29/sagemaker-automl-candidates/automl-dm-09-04-34-29-pr-1-17ddde359c11472192b705dff544e14f8626/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb
s3://sagemaker-us-west-2-453691756499/sagemaker/DEMO-automl-dm/output/automl-dm-09-04-34-29/sagemaker-automl-candidates/automl-dm-09-04-34-29-pr-1-17ddde359c11472192b705dff544e14f8626/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb


Let's copy these two notebooks.

In [20]:
%%sh -s $job_candidate_notebook $job_data_notebook
aws s3 cp $1 .
aws s3 cp $2 .

download: s3://sagemaker-us-west-2-453691756499/sagemaker/DEMO-automl-dm/output/automl-dm-09-04-34-29/sagemaker-automl-candidates/automl-dm-09-04-34-29-pr-1-17ddde359c11472192b705dff544e14f8626/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb to ./SageMakerAutopilotCandidateDefinitionNotebook.ipynb
download: s3://sagemaker-us-west-2-453691756499/sagemaker/DEMO-automl-dm/output/automl-dm-09-04-34-29/sagemaker-automl-candidates/automl-dm-09-04-34-29-pr-1-17ddde359c11472192b705dff544e14f8626/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb to ./SageMakerAutopilotDataExplorationNotebook.ipynb


## Deploying the best candidate, with Amazon SageMaker Model Monitor
Now that we have successfully completed the AutoML job on our dataset and visualized the trials, we can create a model from any of the trials with a single API call and then deploy that model for online or batch prediction using [Inference Pipelines](https://docs.aws.amazon.com/sagemaker/latest/dg/inference-pipelines.html).

Let's create a SageMaker model for this Inference Pipeline.

In [21]:
model_name = 'automl-dm-model-' + timestamp_suffix

model_arn = sm.create_model(Containers=best_candidate['InferenceContainers'],
                            ModelName=model_name,
                            ExecutionRoleArn=role)

print('Model ARN: ', model_arn['ModelArn'])

Model ARN:  arn:aws:sagemaker:us-west-2:453691756499:model/automl-dm-model-09-04-34-29


Let's configure data capture.

In [22]:
s3_capture_path = 's3://reinvent-recap-demos/' + model_name + '/'

print(s3_capture_path)

s3://reinvent-recap-demos/automl-dm-model-09-04-34-29/


In [23]:
data_capture_configuration = {
    "EnableCapture": True, # flag turns data capture on and off
    "DestinationS3Uri": s3_capture_path, # s3 location where captured data is saved
    "InitialSamplingPercentage": 100, # sampling rate to capture data. max is 100%
    "CaptureOptions": [
       {
            "CaptureMode": "Output" # The type of capture this option enables. Values can be: [Output/Input]
        },
        {
            "CaptureMode": "Input" # The type of capture this option enables. Values can be: [Output/Input]
        }
    ],
    "CaptureContentTypeHeader": {
       "CsvContentTypes": ["text/csv"], # headers which should signal to decode the payload into CSV format 
       "JsonContentTypes": ["application/json"] # headers which should signal to decode the payload into JSON format 
    }
}

As usual, we first create the endpoint configuration, and then the endpoint.

In [24]:
# Endpoint configuration name
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())
epc_name = 'automl-dm-epc-' + timestamp_suffix
print('Endpoint configuration name:', epc_name)

ep_config = sm.create_endpoint_config(EndpointConfigName = epc_name,
                                      ProductionVariants=[{'InstanceType':'ml.m4.xlarge',
                                                           'InitialInstanceCount':1,
                                                           'ModelName':model_name,
                                                           'VariantName': 'AllTraffic'}],
                                      DataCaptureConfig = data_capture_configuration)

Endpoint configuration name: automl-dm-epc-09-05-26-23


In [25]:
# Endpoint name
ep_name = 'automl-dm-ep-' + timestamp_suffix
variant_name = 'automl-dm-variant-' + timestamp_suffix
print('Endpoint name:', ep_name)

create_endpoint_response = sm.create_endpoint(EndpointName=ep_name,
                                              EndpointConfigName=epc_name)

Endpoint name: automl-dm-ep-09-05-26-23


In [26]:
%%time
sm.get_waiter('endpoint_in_service').wait(EndpointName=ep_name)

resp = sm.describe_endpoint(EndpointName=ep_name)
status = resp['EndpointStatus']

print("Endpoint ARN   : " + resp['EndpointArn'])
print("Endpoint status: " + status)

Endpoint ARN   : arn:aws:sagemaker:us-west-2:453691756499:endpoint/automl-dm-ep-09-05-26-23
Endpoint status: InService
CPU times: user 240 ms, sys: 9.02 ms, total: 249 ms
Wall time: 9min 31s


## Scoring the best candidate

Let's predict and score the test set. We'll compute metrics ourselves just for fun.

In [27]:
tp = tn = fp = fn = count = 0

with open('automl-test.csv') as f:
    lines = f.readlines()
    for l in lines[1:]:   # Skip header
        l = l.split(',')  # Split CSV line into features
        label = l[-1]     # Store 'yes'/'no' label
        l = l[:-1]        # Remove label
        l = ','.join(l)   # Rebuild CSV line without label
                
        response = sm_rt.invoke_endpoint(EndpointName=ep_name, ContentType='text/csv', Accept='text/csv', Body=l)

        response = response['Body'].read().decode("utf-8")
        #print ("label %s response %s" %(label,response))

        if 'yes' in label:
            # Sample is positive
            if 'yes' in response:
                # True positive
                tp=tp+1
            else:
                # False negative
                fn=fn+1
        else:
            # Sample is negative
            if 'no' in response:
                # True negative
                tn=tn+1
            else:
                # False positive
                fp=fp+1
        count = count+1
        if (count % 100 == 0):   
            sys.stdout.write(str(count)+' ')
            
print ("Done")

100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 Done


In [28]:
# Confusion matrix
print ("%d %d" % (tn, fp))
print ("%d %d" % (fn, tp))

accuracy  = (tp+tn)/(tp+tn+fp+fn)
precision = tp/(tp+fp)
recall    = tn/(tn+fn)
f1        = (2*precision*recall)/(precision+recall)

print ("Accuracy: %.4f, Precision: %.4f, Recall: %.4f, F1: %.4f" % (accuracy, precision, recall, f1))

1761 61
149 89
Accuracy: 0.8981, Precision: 0.5933, Recall: 0.9220, F1: 0.7220


Let's check that we captured data.

In [37]:
!aws s3 ls --recursive {s3_capture_path}

2019-12-09 05:37:49     969830 automl-dm-model-09-04-34-29/automl-dm-ep-09-05-26-23/AllTraffic/2019/12/09/05/36-03-862-e7a0ce60-5843-49fb-aae8-c94862db161d.jsonl


In [38]:
!aws s3 cp --recursive {s3_capture_path} .

download: s3://reinvent-recap-demos/automl-dm-model-09-04-34-29/automl-dm-ep-09-05-26-23/AllTraffic/2019/12/09/05/36-03-862-e7a0ce60-5843-49fb-aae8-c94862db161d.jsonl to automl-dm-ep-09-05-26-23/AllTraffic/2019/12/09/05/36-03-862-e7a0ce60-5843-49fb-aae8-c94862db161d.jsonl


In [42]:
!head <file_name.jsonl>

{"captureData":{"endpointInput":{"observedContentType":"text/csv","mode":"INPUT","data":"35,technician,single,professional.course,no,yes,no,cellular,aug,wed,156,2,999,0,nonexistent,1.4,93.444,-36.1,4.965,5228.1","encoding":"CSV"},"endpointOutput":{"observedContentType":"text/csv; charset=utf-8","mode":"OUTPUT","data":"no\n","encoding":"CSV"}},"eventMetadata":{"eventId":"1577c6d8-a2d8-4134-8577-e7bbb5661708","inferenceTime":"2019-12-09T05:36:03Z"},"eventVersion":"0"}
{"captureData":{"endpointInput":{"observedContentType":"text/csv","mode":"INPUT","data":"43,blue-collar,single,basic.4y,unknown,yes,no,telephone,may,mon,225,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0","encoding":"CSV"},"endpointOutput":{"observedContentType":"text/csv; charset=utf-8","mode":"OUTPUT","data":"no\n","encoding":"CSV"}},"eventMetadata":{"eventId":"5f127d44-e349-4820-a66d-8d0772ed932c","inferenceTime":"2019-12-09T05:36:04Z"},"eventVersion":"0"}
{"captureData":{"endpointInput":{"observedContentType":"text/c

## Deleting the endpoint
Once that we're done predicting, we can delete the endpoint (and stop paying for it).

In [43]:
# Uncomment to delete
sm.delete_endpoint(EndpointName=ep_name)

{'ResponseMetadata': {'RequestId': '511f7315-d60d-48bb-849c-aa89e58293ba',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '511f7315-d60d-48bb-849c-aa89e58293ba',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Mon, 09 Dec 2019 05:48:57 GMT'},
  'RetryAttempts': 0}}