# Train a model with Amazon SageMaker Autopilot

In [28]:
!pip install --disable-pip-version-check -q sagemaker==2.35.0

[0m

In [29]:
import boto3
import sagemaker
import pandas as pd
import numpy as np
import time
import json

In [30]:
sm = boto3.client(service_name = 'sagemaker')
sm_runtime = boto3.client('sagemaker-runtime')

sess = sagemaker.Session(sagemaker_client=sm,
                         sagemaker_runtime_client=sm_runtime)
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = sess.boto_region_name

In [31]:
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackeend.figure_format = 'retina'

## 1. Review transformed dataset

In [61]:
!aws s3 cp 's3://sagemaker-us-east-1-740250289965/data-kalyan/bias/balanced/womens_clothing_ecommerce_reviews_balanced_.csv' ./

download: s3://sagemaker-us-east-1-740250289965/data-kalyan/bias/balanced/womens_clothing_ecommerce_reviews_balanced_.csv to ./womens_clothing_ecommerce_reviews_balanced_.csv


In [72]:
path = './womens_clothing_ecommerce_reviews_balanced_.csv'

df = pd.read_csv(path, delimiter=',')
df.head()

Unnamed: 0,sentiment,review_body,product_category
0,-1,This suit did nothing for me. the top has zero...,Swim
1,-1,Like other reviewers i saw this dress on the ...,Dresses
2,-1,I wish i had read the reviews before purchasin...,Knits
3,-1,I ordered these pants in my usual size (xl) an...,Legwear
4,-1,I noticed this top on one of the sales associa...,Knits


In [73]:
path_autopilot = './womens_clothing_ecommerce_reviews_balanced_for_autopilot.csv'

df[['sentiment', 'review_body']].to_csv(path_autopilot, 
                                        sep=',', 
                                        index=False)

## 2. Configure the Autopilot job

In [74]:
autopilot_train_s3_uri = sess.upload_data(bucket=bucket, key_prefix='data-kalyan/autopilot/data', path=path_autopilot)
autopilot_train_s3_uri

's3://sagemaker-us-east-1-740250289965/data-kalyan/autopilot/data/womens_clothing_ecommerce_reviews_balanced_for_autopilot.csv'

In [75]:
!aws s3 ls $autopilot_train_s3_uri

2022-09-19 14:54:53    2253749 womens_clothing_ecommerce_reviews_balanced_for_autopilot.csv


### Need tp provide a S3 output for generated assets

In [76]:
model_output_s3_uri = 's3://{}/data-kalyan/autopilot'.format(bucket)

print(model_output_s3_uri)

s3://sagemaker-us-east-1-740250289965/data-kalyan/autopilot


### Configure the Autopilot job

In [77]:
import time
timestamp = int(time.time())
auto_ml_job_name = 'automl-dm-{}'.format(timestamp)

In [78]:
max_candidates = 3

automl = sagemaker.automl.automl.AutoML(
    target_attribute_name = 'sentiment',
    base_job_name = auto_ml_job_name,
    output_path = model_output_s3_uri,
    max_candidates = max_candidates,
    sagemaker_session = sess,
    role = role,
    max_runtime_per_training_job_in_seconds = 1200,
    total_job_runtime_in_seconds = 7200
)

#  Launch the Autopilot job

In [79]:
automl.fit(
    autopilot_train_s3_uri,
    job_name=auto_ml_job_name, 
    wait=False, 
    logs=False
)


# 4. Track Autopilot job progress

Once the Autopilot job has been launched, you can track the job progress directly from the notebook using the SDK capabilities.

### 4.1. Autopilot job description

Function `describe_auto_ml_job` of the Amazon SageMaker service returns the information about the AutoML job in dictionary format. You can review the response syntax and response elements in the [**documentation**](https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_DescribeAutoMLJob.html).

In [80]:
job_description_response = automl.describe_auto_ml_job(job_name=auto_ml_job_name)

### 4.2. Autopilot job status

To track the job progress you can use two response elements: `AutoMLJobStatus` and `AutoMLJobSecondaryStatus`, which correspond to the primary (Completed | InProgress | Failed | Stopped | Stopping) and secondary (AnalyzingData | FeatureEngineering | ModelTuning etc.) job states respectively. To see if the AutoML job has started, you can check the existence of the `AutoMLJobStatus` and `AutoMLJobSecondaryStatus` elements in the job description response.


In [81]:
while 'AutoMLJobStatus' not in job_description_response.keys() and 'AutoMLJobSecondaryStatus' not in job_description_response.keys():
    job_description_response = automl.describe_auto_ml_job(job_name=auto_ml_job_name)
    print('[INFO] Autopilot job has not yet started. Please wait. ')
    # function `json.dumps` encodes JSON string for printing.
    print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))
    print('[INFO] Waiting for Autopilot job to start...')
    sleep(15)

print('[OK] AutoML job started.')

[OK] AutoML job started.


### 4.3. Review the SageMaker processing jobs

The Autopilot creates required SageMaker processing jobs during the run:

* First processing job (data splitter) checks the data sanity, performs stratified shuffling and splits the data into training and validation. 
* Second processing job (candidate generator) first streams through the data to compute statistics for the dataset. Then, uses these statistics to identify the problem type, and possible types of every column-predictor: numeric, categorical, natural language, etc.

In [82]:
from IPython.core.display import display, HTML

display(HTML('<b>Review <a target="blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/processing-jobs/">processing jobs</a></b>'.format(region)))


### 4.4. Wait for the data analysis step to finish

Here you will use the same scheme as above to check the completion of the data analysis step. This step can be identified with the (primary) job status value `InProgress` and secondary job status values `Starting` and then `AnalyzingData`.
### _This cell will take approximately 10 minutes to run._

In [83]:
%%time

job_status = job_description_response['AutoMLJobStatus']
job_sec_status = job_description_response['AutoMLJobSecondaryStatus']

if job_status not in ('Stopped', 'Failed'):
    while job_status in ('InProgress') and job_sec_status in ('Starting', 'AnalyzingData'):
        job_description_response = automl.describe_auto_ml_job(job_name=auto_ml_job_name)
        job_status = job_description_response['AutoMLJobStatus']
        job_sec_status = job_description_response['AutoMLJobSecondaryStatus']
        print(job_status, job_sec_status)
        time.sleep(15)
    print('[OK] Data analysis phase completed.\n')
    
print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))

InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress AnalyzingData
InProgress FeatureEngineering
[OK] Data analysis phase completed.

{
    "AutoMLJobArn": "arn:aws:sagemaker:us-east-1:740250289965:automl-job/automl-dm-1663599311",
    "AutoMLJobArtifacts": {
        "CandidateDefinitionNotebookLocation": "s3://sagemaker-us-east-1-740250289965/data-kalyan/autopilot/automl-dm-1663599311/sagemaker-automl-candidates/automl-dm-1663599311-pr-1-a85dd33bb4044c0bb2a42211666033cbcf8bb/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb

### 4.5. View generated notebooks
Once data analysis is complete, SageMaker AutoPilot generates two notebooks: 
* Data exploration
* Candidate definition

Notebooks are included in the AutoML job artifacts generated during the run. Before checking the existence of the notebooks, you can check if the artifacts have been generated.

In [84]:
job_description_response = automl.describe_auto_ml_job(job_name=auto_ml_job_name) # Replace None

# keep in the while loop until the Autopilot job artifacts will be generated
while 'AutoMLJobArtifacts' not in job_description_response.keys(): # Replace all None
    # update the information about the running Autopilot job
    job_description_response = automl.describe_auto_ml_job(job_name=auto_ml_job_name) # Replace None
    print('[INFO] Autopilot job has not yet generated the artifacts. Please wait. ')
    print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))
    print('[INFO] Waiting for AutoMLJobArtifacts...')
    time.sleep(15)

print('[OK] AutoMLJobArtifacts generated.')

[OK] AutoMLJobArtifacts generated.


In [85]:
job_description_response['AutoMLJobArtifacts']

{'CandidateDefinitionNotebookLocation': 's3://sagemaker-us-east-1-740250289965/data-kalyan/autopilot/automl-dm-1663599311/sagemaker-automl-candidates/automl-dm-1663599311-pr-1-a85dd33bb4044c0bb2a42211666033cbcf8bb/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb',
 'DataExplorationNotebookLocation': 's3://sagemaker-us-east-1-740250289965/data-kalyan/autopilot/automl-dm-1663599311/sagemaker-automl-candidates/automl-dm-1663599311-pr-1-a85dd33bb4044c0bb2a42211666033cbcf8bb/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb'}

In [91]:
%%time

job_description_response = automl.describe_auto_ml_job(job_name=auto_ml_job_name)
job_status = job_description_response['AutoMLJobStatus']
job_sec_status = job_description_response['AutoMLJobSecondaryStatus']
print(job_status)
print(job_sec_status)
if job_status not in ('Stopped', 'Failed'):
    while job_status in ('InProgress') and job_sec_status in ('FeatureEngineering'): # Replace all None

        job_description_response = automl.describe_auto_ml_job(job_name=auto_ml_job_name)
        job_status = job_description_response['AutoMLJobStatus']
        job_sec_status = job_description_response['AutoMLJobSecondaryStatus']
        print(job_status, job_sec_status)
        time.sleep(5)
    print('[OK] Feature engineering phase completed.\n')
    
print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))

Completed
Completed
[OK] Feature engineering phase completed.

{
    "AutoMLJobArn": "arn:aws:sagemaker:us-east-1:740250289965:automl-job/automl-dm-1663599311",
    "AutoMLJobArtifacts": {
        "CandidateDefinitionNotebookLocation": "s3://sagemaker-us-east-1-740250289965/data-kalyan/autopilot/automl-dm-1663599311/sagemaker-automl-candidates/automl-dm-1663599311-pr-1-a85dd33bb4044c0bb2a42211666033cbcf8bb/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb",
        "DataExplorationNotebookLocation": "s3://sagemaker-us-east-1-740250289965/data-kalyan/autopilot/automl-dm-1663599311/sagemaker-automl-candidates/automl-dm-1663599311-pr-1-a85dd33bb4044c0bb2a42211666033cbcf8bb/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb"
    },
    "AutoMLJobConfig": {
        "CompletionCriteria": {
            "MaxAutoMLJobRuntimeInSeconds": 7200,
            "MaxCandidates": 3,
            "MaxRuntimePerTrainingJobInSeconds": 1200
        },
        "SecurityConfig": {
           

In [92]:
%%time

job_description_response = automl.describe_auto_ml_job(job_name=auto_ml_job_name)
job_status = job_description_response['AutoMLJobStatus']
job_sec_status = job_description_response['AutoMLJobSecondaryStatus']
print(job_status)
print(job_sec_status)
if job_status not in ('Stopped', 'Failed'):
    ### BEGIN SOLUTION - DO NOT delete this comment for grading purposes
    while job_status in ('InProgress') and job_sec_status in ('ModelTuning'): # Replace all None
    ### END SOLUTION - DO NOT delete this comment for grading purposes
        job_description_response = automl.describe_auto_ml_job(job_name=auto_ml_job_name)
        job_status = job_description_response['AutoMLJobStatus']
        job_sec_status = job_description_response['AutoMLJobSecondaryStatus']
        print(job_status, job_sec_status)
        time.sleep(5)
    print('[OK] Model tuning phase completed.\n')
    
print(json.dumps(job_description_response, indent=4, sort_keys=True, default=str))

Completed
Completed
[OK] Model tuning phase completed.

{
    "AutoMLJobArn": "arn:aws:sagemaker:us-east-1:740250289965:automl-job/automl-dm-1663599311",
    "AutoMLJobArtifacts": {
        "CandidateDefinitionNotebookLocation": "s3://sagemaker-us-east-1-740250289965/data-kalyan/autopilot/automl-dm-1663599311/sagemaker-automl-candidates/automl-dm-1663599311-pr-1-a85dd33bb4044c0bb2a42211666033cbcf8bb/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb",
        "DataExplorationNotebookLocation": "s3://sagemaker-us-east-1-740250289965/data-kalyan/autopilot/automl-dm-1663599311/sagemaker-automl-candidates/automl-dm-1663599311-pr-1-a85dd33bb4044c0bb2a42211666033cbcf8bb/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb"
    },
    "AutoMLJobConfig": {
        "CompletionCriteria": {
            "MaxAutoMLJobRuntimeInSeconds": 7200,
            "MaxCandidates": 3,
            "MaxRuntimePerTrainingJobInSeconds": 1200
        },
        "SecurityConfig": {
            "Enabl

### 6.2. Compare model candidates
Once model tuning is complete, you can view all the candidates (pipeline evaluations with different hyperparameter combinations) that were explored by AutoML and sort them by their final performance metric.

In [93]:
candidates = automl.list_candidates(
    ### BEGIN SOLUTION - DO NOT delete this comment for grading purposes
    job_name=auto_ml_job_name # Replace None
    #sort_by='FinalObjectiveMetric' # Replace None
    ### END SOLUTION - DO NOT delete this comment for grading purposes
)

In [94]:
while candidates == []:
    candidates = automl.list_candidates(job_name=auto_ml_job_name)
    print('[INFO] Autopilot job is generating the candidates. Please wait.')
    time.sleep(10)

print('[OK] Candidates generated.') 

[OK] Candidates generated.


In [95]:
print(candidates[0].keys())

dict_keys(['CandidateName', 'FinalAutoMLJobObjectiveMetric', 'ObjectiveStatus', 'CandidateSteps', 'CandidateStatus', 'InferenceContainers', 'CreationTime', 'EndTime', 'LastModifiedTime', 'CandidateProperties'])


In [97]:
print(candidates[0].values())

dict_values(['automl-dm-1663599311mUdONubaqhhN-003-a2aaf670', {'MetricName': 'validation:accuracy', 'Value': 0.5974699854850769}, 'Succeeded', [{'CandidateStepType': 'AWS::SageMaker::ProcessingJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:740250289965:processing-job/automl-dm-1663599311-db-1-6e0667df70da42c4be65301c87ae089a691ca', 'CandidateStepName': 'automl-dm-1663599311-db-1-6e0667df70da42c4be65301c87ae089a691ca'}, {'CandidateStepType': 'AWS::SageMaker::TrainingJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:740250289965:training-job/automl-dm-1663599311-dpp0-1-be32e56140754bc29c5e921d6075668d16a', 'CandidateStepName': 'automl-dm-1663599311-dpp0-1-be32e56140754bc29c5e921d6075668d16a'}, {'CandidateStepType': 'AWS::SageMaker::TransformJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:740250289965:transform-job/automl-dm-1663599311-dpp0-rpb-1-a4f107b904d74a84b9b91643b08559c', 'CandidateStepName': 'automl-dm-1663599311-dpp0-rpb-1-a4f107b904d74a84b9b91643b08559c'}, {

In [98]:
while 'CandidateName' not in candidates[0]:
    candidates = automl.list_candidates(job_name=auto_ml_job_name)
    print('[INFO] Autopilot job is generating CandidateName. Please wait. ')
    sleep(10)

print('[OK] CandidateName generated.')

[OK] CandidateName generated.


In [99]:
while 'FinalAutoMLJobObjectiveMetric' not in candidates[0]:
    candidates = automl.list_candidates(job_name=auto_ml_job_name)
    print('[INFO] Autopilot job is generating FinalAutoMLJobObjectiveMetric. Please wait. ')
    sleep(10)

print('[OK] FinalAutoMLJobObjectiveMetric generated.')

[OK] FinalAutoMLJobObjectiveMetric generated.


In [100]:
print(json.dumps(candidates, indent=4, sort_keys=True, default=str))


[
    {
        "CandidateName": "automl-dm-1663599311mUdONubaqhhN-003-a2aaf670",
        "CandidateProperties": {
            "CandidateMetrics": [
                {
                    "MetricName": "Accuracy",
                    "Set": "Validation",
                    "StandardMetricName": "Accuracy",
                    "Value": 0.5974699854850769
                },
                {
                    "MetricName": "PrecisionMacro",
                    "Set": "Validation",
                    "StandardMetricName": "PrecisionMacro",
                    "Value": 0.595770001411438
                },
                {
                    "MetricName": "BalancedAccuracy",
                    "Set": "Validation",
                    "StandardMetricName": "BalancedAccuracy",
                    "Value": 0.5974699854850769
                },
                {
                    "MetricName": "F1macro",
                    "Set": "Validation",
                    "StandardMetricName": 

In [101]:
print("metric " + str(candidates[0]['FinalAutoMLJobObjectiveMetric']['MetricName']))

for index, candidate in enumerate(candidates):
    print(str(index) + "  " 
        + candidate['CandidateName'] + "  " 
        + str(candidate['FinalAutoMLJobObjectiveMetric']['Value']))

metric validation:accuracy
0  automl-dm-1663599311mUdONubaqhhN-003-a2aaf670  0.5974699854850769
1  automl-dm-1663599311mUdONubaqhhN-002-1e0d9069  0.5922600030899048
2  automl-dm-1663599311mUdONubaqhhN-001-709bc605  0.6151900291442871


### 6.3. Review best candidate

Now that you have successfully completed the Autopilot job on the dataset and visualized the trials, you can get the information about the best candidate model and review it.

In [102]:
candidates = automl.list_candidates(job_name=auto_ml_job_name)

if candidates != []:
    best_candidate = automl.best_candidate(
        job_name=auto_ml_job_name # Replace None
    )
    print(json.dumps(best_candidate, indent=4, sort_keys=True, default=str))

{
    "CandidateName": "automl-dm-1663599311mUdONubaqhhN-001-709bc605",
    "CandidateProperties": {
        "CandidateArtifactLocations": {
            "Explainability": "s3://sagemaker-us-east-1-740250289965/data-kalyan/autopilot/automl-dm-1663599311/documentation/explainability/output",
            "ModelInsights": "s3://sagemaker-us-east-1-740250289965/data-kalyan/autopilot/automl-dm-1663599311/documentation/model_monitor/output"
        },
        "CandidateMetrics": [
            {
                "MetricName": "Accuracy",
                "Set": "Validation",
                "StandardMetricName": "Accuracy",
                "Value": 0.6151900291442871
            },
            {
                "MetricName": "PrecisionMacro",
                "Set": "Validation",
                "StandardMetricName": "PrecisionMacro",
                "Value": 0.6137400269508362
            },
            {
                "MetricName": "BalancedAccuracy",
                "Set": "Validation",
    

In [103]:
while 'CandidateName' not in best_candidate:
    best_candidate = automl.best_candidate(job_name=auto_ml_job_name)
    print('[INFO] Autopilot Job is generating BestCandidate CandidateName. Please wait. ')
    print(json.dumps(best_candidate, indent=4, sort_keys=True, default=str))
    sleep(10)

print('[OK] BestCandidate CandidateName generated.') 

[OK] BestCandidate CandidateName generated.


In [104]:
while 'FinalAutoMLJobObjectiveMetric' not in best_candidate:
    best_candidate = automl.best_candidate(job_name=auto_ml_job_name)
    print('[INFO] Autopilot Job is generating BestCandidate FinalAutoMLJobObjectiveMetric. Please wait. ')
    print(json.dumps(best_candidate, indent=4, sort_keys=True, default=str))
    sleep(10)

print('[OK] BestCandidate FinalAutoMLJobObjectiveMetric generated.')  

[OK] BestCandidate FinalAutoMLJobObjectiveMetric generated.


In [105]:
best_candidate_identifier = best_candidate['CandidateName']
print("Candidate name: " + best_candidate_identifier)
print("Metric name: " + best_candidate['FinalAutoMLJobObjectiveMetric']['MetricName'])
print("Metric value: " + str(best_candidate['FinalAutoMLJobObjectiveMetric']['Value']))

Candidate name: automl-dm-1663599311mUdONubaqhhN-001-709bc605
Metric name: validation:accuracy
Metric value: 0.6151900291442871


# 8. Deploy and test best candidate model

In [107]:
inference_response_keys = ['predicted_label', 'probability']

In [108]:
autopilot_model = automl.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
    candidate=best_candidate,
    inference_response_keys=inference_response_keys,
    predictor_cls=sagemaker.predictor.Predictor,
    serializer=sagemaker.serializers.JSONSerializer(),
    deserializer=sagemaker.deserializers.JSONDeserializer()
)

print('\nEndpoint name:  {}'.format(autopilot_model.endpoint_name))

-----------------!
Endpoint name:  sagemaker-sklearn-automl-2022-09-19-15-56-53-630


### 8.2. Test the model

In [110]:
review_list = ['This product is great!',
               'OK, but not great.',
               'This is not the right product.']

for review in review_list:
    
    # remove commas from the review since we're passing the inputs as a CSV
    review = review.replace(",", "")

    response = sm_runtime.invoke_endpoint(
        EndpointName=autopilot_model.endpoint_name, # endpoint name
        ContentType='text/csv', # type of input data
        Accept='text/csv', # type of the inference in the response
        Body=review # review text
        )

    response_body=response['Body'].read().decode('utf-8').strip().split(',')

    print('Review: ', review, ' Predicated class: {}'.format(response_body[0]))

print("(-1 = Negative, 0=Neutral, 1=Positive)")

Review:  This product is great!  Predicated class: 1
Review:  OK but not great.  Predicated class: 0
Review:  This is not the right product.  Predicated class: -1
(-1 = Negative, 0=Neutral, 1=Positive)
