# 3 Train a model with Amazon SageMaker Autopilot


In [5]:
# An Autopilot job works through three phases:
# 1. Data analysis: the data is summarized and analyzed to determine which feature engineering techniques, hyper-parameters, and models to explore.
# 2. Feature engineering: the data is scrubbed, balanced, combined, and split into train and validation sets.
# 3. Model training and tuning: the top-performing features, hyper-parameters, and models are selected and trained.

!pip install --disable-pip-version-check -q sagemaker==2.35.0
import boto3, sagemaker, pandas as pd, numpy as np, botocore, time, json, matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format='retina'

# Append a custom suffix to the user agent of every AWS API call made below.
config = botocore.config.Config(user_agent_extra='dlai-pds/c1/w3')

# Low-level boto3 clients, both built with the shared botocore config.
sm = boto3.client(service_name='sagemaker', config=config)
sm_runtime = boto3.client('sagemaker-runtime', config=config)

# SageMaker session wrapping the two clients above.
sess = sagemaker.Session(sagemaker_client=sm, sagemaker_runtime_client=sm_runtime)

# Values reused by the following cells: default S3 bucket, execution role, region.
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = sess.boto_region_name

[0m

## Review transformed dataset


In [12]:
# transform the dataset into a format that Autopilot recognizes. Specifically, a comma-separated file of label,features
#Autopilot requires that the target variable is first and the set of features come next.

#download, extract, 
!aws s3 cp 's3://dlai-practical-data-science/data/balanced/womens_clothing_ecommerce_reviews_balanced.csv' ./c1w3
path = './c1w3/womens_clothing_ecommerce_reviews_balanced.csv'

#extract features and save
df = pd.read_csv(path, delimiter=',')
path_autopilot = './c1w3/womens_clothing_ecommerce_reviews_balanced_for_autopilot.csv'
df[['sentiment', 'review_body']].to_csv(path_autopilot, sep=',', index=False)

#upload data to s3
autopilot_train_s3_uri = sess.upload_data(bucket=bucket, key_prefix='autopilot/data', path=path_autopilot)
!aws s3 ls $autopilot_train_s3_uri

download: s3://dlai-practical-data-science/data/balanced/womens_clothing_ecommerce_reviews_balanced.csv to c1w3/womens_clothing_ecommerce_reviews_balanced.csv
2022-07-25 13:28:08    2253749 womens_clothing_ecommerce_reviews_balanced_for_autopilot.csv


In [16]:
# CONFIGURE AND RUN AUTOPILOT JOB

# `display`/`HTML` are imported from IPython.display; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import HTML, display

# S3 output path for the Autopilot outputs.
model_output_s3_uri = 's3://{}/autopilot'.format(bucket)

# Autopilot job name, suffixed with the current Unix timestamp.
# (`time` is already imported in the setup cell.)
timestamp = int(time.time())
auto_ml_job_name = 'automl-dm-{}'.format(timestamp)

automl = sagemaker.automl.automl.AutoML(
    target_attribute_name="sentiment",  # the name of the target attribute for predictions
    base_job_name=auto_ml_job_name,  # Autopilot job name
    output_path=model_output_s3_uri,  # output data path
    max_candidates=3,  # maximum number of model candidates
    sagemaker_session=sess,
    role=role,
    max_runtime_per_training_job_in_seconds=1200,
    total_job_runtime_in_seconds=7200,
)

# Start the Autopilot job without blocking the notebook (wait=False, logs=False);
# the following cells poll the job status instead.
automl.fit(
    autopilot_train_s3_uri,  # input data path
    job_name=auto_ml_job_name,  # Autopilot job name
    wait=False,
    logs=False,
)

# Link to the processing jobs in the AWS console; target="_blank" opens a new tab.
display(HTML('<b>Review <a target="_blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/processing-jobs/">processing jobs</a></b>'.format(region)))

## Track the Autopilot job status

In [18]:
# TRACK AUTOPILOT JOBS
# describe_auto_ml_job returns the information about the AutoML job as a dictionary.
# Response syntax and elements:
# https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_DescribeAutoMLJob.html
#
# Two response elements track progress: AutoMLJobStatus (primary: Completed |
# InProgress | Failed | Stopped | Stopping) and AutoMLJobSecondaryStatus
# (secondary: AnalyzingData | FeatureEngineering | ModelTuning etc.).
# The job counts as started once BOTH elements exist in the response.
job_description_response = automl.describe_auto_ml_job(job_name=auto_ml_job_name)

# Keep polling while either status element is still missing; the next cells read
# both of them, so leaving the loop with only one present would fail later.
while ('AutoMLJobStatus' not in job_description_response
       or 'AutoMLJobSecondaryStatus' not in job_description_response):
    job_description_response = automl.describe_auto_ml_job(job_name=auto_ml_job_name)
    print('[INFO] Autopilot job has not yet started. Please wait. ')
    # `json.dumps` encodes the response as a JSON string for printing.
    print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))
    print('[INFO] Waiting for Autopilot job to start...')
    # Only the `time` module is imported, so a bare `sleep` would raise NameError.
    time.sleep(15)

print('[OK] AutoML job started.')

[OK] AutoML job started.


In [20]:
# The Autopilot creates required SageMaker processing jobs during the run:
# First processing job (data splitter) checks the data sanity, performs stratified shuffling and splits the data into training and validation.
# Second processing job (candidate generator) first streams through the data to compute statistics for the dataset. Then, uses these statistics to identify the problem type, and possible types of every column-predictor: numeric, categorical, natural language, etc.
#to check the completion of the data analysis step:
%%time

job_status = job_description_response['AutoMLJobStatus']
job_sec_status = job_description_response['AutoMLJobSecondaryStatus']

if job_status not in ('Stopped', 'Failed'):
    while job_status in ('InProgress') and job_sec_status in ('Starting', 'AnalyzingData'):
        job_description_response = automl.describe_auto_ml_job(job_name=auto_ml_job_name)
        job_status = job_description_response['AutoMLJobStatus']
        job_sec_status = job_description_response['AutoMLJobSecondaryStatus']
        print(job_status, job_sec_status)
        time.sleep(15)
    print('[OK] Data analysis phase completed.\n')
    
print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))

In [22]:
# Once data analysis is complete, SageMaker Autopilot generates two notebooks:
# Data exploration and Candidate definition.
# This cell waits until the job description contains the AutoMLJobArtifacts element.

job_description_response = automl.describe_auto_ml_job(job_name=auto_ml_job_name)
while "AutoMLJobArtifacts" not in job_description_response:
    # Refresh the job description on every iteration. The original placeholder
    # assigned None here, which made the loop condition fail with AttributeError.
    job_description_response = automl.describe_auto_ml_job(job_name=auto_ml_job_name)
    print('[INFO] Autopilot job has not yet generated the artifacts. Please wait. ')
    print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))
    print('[INFO] Waiting for AutoMLJobArtifacts...')
    time.sleep(15)

print('[OK] AutoMLJobArtifacts generated.')

[OK] AutoMLJobArtifacts generated.


In [24]:
# Wait until the job artifacts list the data exploration notebook location.
job_description_response = automl.describe_auto_ml_job(job_name=auto_ml_job_name)

while True:
    # Stop polling as soon as the notebook location shows up in the artifacts.
    if "DataExplorationNotebookLocation" in job_description_response['AutoMLJobArtifacts']:
        break

    # Not there yet: fetch a fresh job description, report it, then pause 15 seconds.
    job_description_response = automl.describe_auto_ml_job(job_name=auto_ml_job_name)

    print('[INFO] Autopilot job has not yet generated the notebooks. Please wait. ')
    print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))
    print('[INFO] Waiting for DataExplorationNotebookLocation...')
    time.sleep(15)

print('[OK] DataExplorationNotebookLocation found.')

[OK] DataExplorationNotebookLocation found.


In [25]:
# Review the generated resources in S3 directly.

# `display`/`HTML` are imported from IPython.display; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import HTML, display

# S3 location of the generated data exploration notebook.
generated_resources = job_description_response['AutoMLJobArtifacts']['DataExplorationNotebookLocation']
# Strip the fixed notebook suffix; the last remaining path segment is used as the job id.
download_path = generated_resources.rsplit('/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb')[0]
job_id = download_path.rsplit('/', 1)[-1]

if not job_id:
    print('No AutoMLJobArtifacts found.')
else:
    # target="_blank" opens the S3 console in a new tab.
    display(HTML('<b>Review <a target="_blank" href="https://s3.console.aws.amazon.com/s3/buckets/{}/autopilot/{}/sagemaker-automl-candidates/{}/">generated notebooks</a> in S3 bucket</b>'.format(bucket, auto_ml_job_name, job_id)))

In [None]:
# Wait for the feature engineering step of the Autopilot job to finish.

# Read the current primary and secondary status of the job and show them.
job_description_response = automl.describe_auto_ml_job(job_name=auto_ml_job_name)
job_status = job_description_response['AutoMLJobStatus']
job_sec_status = job_description_response['AutoMLJobSecondaryStatus']
print(job_status, job_sec_status, sep='\n')

if not (job_status == 'Stopped' or job_status == 'Failed'):
    # Poll every 5 seconds for as long as the job reports the feature engineering phase.
    while (job_status, job_sec_status) == ('InProgress', 'FeatureEngineering'):
        job_description_response = automl.describe_auto_ml_job(job_name=auto_ml_job_name)
        job_status = job_description_response['AutoMLJobStatus']
        job_sec_status = job_description_response['AutoMLJobSecondaryStatus']
        print(job_status, job_sec_status)
        time.sleep(5)
    print('[OK] Feature engineering phase completed.\n')

# Dump the most recent job description for inspection.
print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))

In [28]:
# Link to the hyper-parameter tuning jobs in the AWS console.
# `display`/`HTML` are imported from IPython.display; importing `display` from
# IPython.core.display is deprecated. target="_blank" opens the link in a new tab.
from IPython.display import HTML, display
display(HTML('<b>Review <a target="_blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/hyper-tuning-jobs/">hyper-parameter tuning jobs</a></b>'.format(region)))


In [None]:
# Wait for the model tuning step of the Autopilot job to finish.

# Read the current primary and secondary status of the job and show them.
job_description_response = automl.describe_auto_ml_job(job_name=auto_ml_job_name)
job_status = job_description_response['AutoMLJobStatus']
job_sec_status = job_description_response['AutoMLJobSecondaryStatus']
print(job_status, job_sec_status, sep='\n')

if not (job_status == 'Stopped' or job_status == 'Failed'):
    # Poll every 5 seconds for as long as the job reports the model tuning phase.
    while (job_status, job_sec_status) == ("InProgress", "ModelTuning"):
        job_description_response = automl.describe_auto_ml_job(job_name=auto_ml_job_name)
        job_status = job_description_response['AutoMLJobStatus']
        job_sec_status = job_description_response['AutoMLJobSecondaryStatus']
        print(job_status, job_sec_status)
        time.sleep(5)
    print('[OK] Model tuning phase completed.\n')

# Dump the most recent job description for inspection.
print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))

In [None]:
# Check the completion of the Autopilot job by waiting for a terminal job status.

from pprint import pprint

job_description_response = automl.describe_auto_ml_job(job_name=auto_ml_job_name)
pprint(job_description_response)
job_status = job_description_response['AutoMLJobStatus']
job_sec_status = job_description_response['AutoMLJobSecondaryStatus']
print('Job status:  {}'.format(job_status))
print('Secondary job status:  {}'.format(job_sec_status))

# Poll until the job reaches a terminal state. 'Failed' and 'Stopped' end the loop
# too; waiting only for 'Completed' would spin forever if the job fails or is
# stopped while this cell is running.
while job_status not in ('Completed', 'Failed', 'Stopped'):
    # The description was fetched just above, so pause before asking again.
    time.sleep(10)
    job_description_response = automl.describe_auto_ml_job(job_name=auto_ml_job_name)
    job_status = job_description_response['AutoMLJobStatus']
    job_sec_status = job_description_response['AutoMLJobSecondaryStatus']
    print('Job status:  {}'.format(job_status))
    print('Secondary job status:  {}'.format(job_sec_status))

if job_status == 'Completed':
    print('[OK] Autopilot job completed.\n')
else:
    # Report both statuses; the secondary line previously printed the primary status.
    print('Job status: {}'.format(job_status))
    print('Secondary job status: {}'.format(job_sec_status))

In [36]:
# Once model tuning is complete, you can view all the candidates (pipeline evaluations
# with different hyperparameter combinations) that were explored by AutoML and sort
# them by their final performance metric.

# CANDIDATE EXISTENCE CHECK:
# Fetch first and only wait when the list is still empty, so no "generating" message
# or 10-second pause happens once candidates are already available.
candidates = automl.list_candidates(job_name=auto_ml_job_name)
while not candidates:
    print('[INFO] Autopilot job is generating the candidates. Please wait.')
    time.sleep(10)
    candidates = automl.list_candidates(job_name=auto_ml_job_name)

print('[OK] Candidates generated.')

# Response syntax and elements of a candidate:
# https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_AutoMLCandidate.html

# List the candidates sorted by the final objective metric, highest first.
# The sort order is passed explicitly instead of relying on the service default.
candidates = automl.list_candidates(
    job_name=auto_ml_job_name,  # Autopilot job name
    sort_by="FinalObjectiveMetricValue",  # metric value field to sort on
    sort_order="Descending",  # highest metric value first
)


[INFO] Autopilot job is generating the candidates. Please wait.
[OK] Candidates generated.


In [None]:
# CandidateName contains the candidate name and the FinalAutoMLJobObjectiveMetric
# element contains the metric information, which can be used to identify the best
# candidate later. This cell checks that both were generated.

# Wait for CandidateName on the first candidate.
while 'CandidateName' not in candidates[0]:
    candidates = automl.list_candidates(job_name=auto_ml_job_name)
    print('[INFO] Autopilot job is generating CandidateName. Please wait. ')
    # Only the `time` module is imported, so a bare `sleep` would raise NameError.
    time.sleep(10)
print('[OK] CandidateName generated.')

# Wait for FinalAutoMLJobObjectiveMetric on the first candidate.
while 'FinalAutoMLJobObjectiveMetric' not in candidates[0]:
    candidates = automl.list_candidates(job_name=auto_ml_job_name)
    print('[INFO] Autopilot job is generating FinalAutoMLJobObjectiveMetric. Please wait. ')
    time.sleep(10)
print('[OK] FinalAutoMLJobObjectiveMetric generated.')

# Print the full candidate descriptions, then the metric name taken from the first one.
print(json.dumps(candidates, indent=4, sort_keys=True, default=str))
print("metric " + str(candidates[0]['FinalAutoMLJobObjectiveMetric']['MetricName']))

# One line per candidate: index, candidate name, metric value.
for index, candidate in enumerate(candidates):
    print(str(index) + "  "
        + candidate['CandidateName'] + "  "
        + str(candidate['FinalAutoMLJobObjectiveMetric']['Value']))

## Review best candidate

In [None]:
# Now that the Autopilot job has run on the dataset, get the information about the
# best candidate model and review it.

# `display`/`HTML` are imported from IPython.display; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import HTML, display

candidates = automl.list_candidates(job_name=auto_ml_job_name)

# Fail with a clear message when there is nothing to pick from. Previously an empty
# list skipped the assignment of `best_candidate` and the lines below crashed with
# a NameError instead.
if not candidates:
    raise RuntimeError('No candidates found for Autopilot job {}.'.format(auto_ml_job_name))

best_candidate = automl.best_candidate(job_name=auto_ml_job_name)
print(json.dumps(best_candidate, indent=4, sort_keys=True, default=str))

best_candidate_identifier = best_candidate['CandidateName']
print("Candidate name: " + best_candidate_identifier)
print("Metric name: " + best_candidate['FinalAutoMLJobObjectiveMetric']['MetricName'])
print("Metric value: " + str(best_candidate['FinalAutoMLJobObjectiveMetric']['Value']))

# Link to all job output in the S3 console; target="_blank" opens a new tab.
display(HTML('<b>Review all <a target="_blank" href="https://s3.console.aws.amazon.com/s3/buckets/{}?region={}&prefix=autopilot/{}/">output in S3</a></b>'.format(bucket, region, auto_ml_job_name)))

## Deploy and test best candidate model

In [46]:
# While batch transformations are supported, this example deploys the model as a
# REST endpoint.

# `display`/`HTML` are imported from IPython.display; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import HTML, display

# Customize the inference response: ask for these two keys.
inference_response_keys = ['predicted_label', 'probability']

# Create a SageMaker endpoint for the best candidate.
autopilot_model = automl.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
    candidate=best_candidate,
    inference_response_keys=inference_response_keys,
    predictor_cls=sagemaker.predictor.Predictor,
    serializer=sagemaker.serializers.JSONSerializer(),
    deserializer=sagemaker.deserializers.JSONDeserializer(),
)
print('\nEndpoint name:  {}'.format(autopilot_model.endpoint_name))

# Link to the endpoint in the AWS console; target="_blank" opens a new tab.
display(HTML('<b>Review <a target="_blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/endpoints/{}">SageMaker REST endpoint</a></b>'.format(region, autopilot_model.endpoint_name)))

---------------!
Endpoint name:  sagemaker-sklearn-automl-2022-07-25-15-42-57-738


In [51]:
# Test the deployed model by sending a few sample reviews to the endpoint through
# the `sm_runtime` client created in the setup cell.

review_list = ['This product is never been good!', 'OK, but not great.', 'This is not the right product.']

for review in review_list:
    # Remove commas from the review since the input is passed as CSV.
    review = review.replace(",", "")

    response = sm_runtime.invoke_endpoint(
        EndpointName=autopilot_model.endpoint_name,  # endpoint name
        ContentType='text/csv',  # type of input data
        Accept='text/csv',  # type of the inference in the response
        Body=review)  # review text

    # The response body is comma-separated; its first field is printed as the class.
    response_body = response['Body'].read().decode('utf-8').strip().split(',')

    # Label fixed from the misspelled "Predicated class".
    print('Review: ', review, ' Predicted class: {}'.format(response_body[0]))

print("(-1 = Negative, 0=Neutral, 1=Positive)")

Review:  This product is never been good!  Predicated class: -1
Review:  OK but not great.  Predicated class: 0
Review:  This is not the right product.  Predicated class: -1
(-1 = Negative, 0=Neutral, 1=Positive)
