# Optimize Models using Automatic Model Tuning

<img src="img/hpt.png" width="90%" align="left">

In [1]:
import boto3
import sagemaker
import pandas as pd

sess   = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

sm = boto3.Session().client(service_name='sagemaker', region_name=region)

# PRE-REQUISITE: _You need to have succesfully run the notebooks in the `PREPARE`section before you continue with this notebook._ 

# Specify the S3 Location of the Features

In [2]:
%store -r processed_train_data_s3_uri

In [3]:
try:
    processed_train_data_s3_uri
    print('[OK]')
except NameError:
    print('+++++++++++++++++++++++++++++++')
    print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')
    print('+++++++++++++++++++++++++++++++')

[OK]


In [4]:
print(processed_train_data_s3_uri)

s3://sagemaker-us-west-2-085964654406/sagemaker-scikit-learn-2020-09-26-17-44-12-987/output/bert-train


In [5]:
%store -r processed_validation_data_s3_uri

In [6]:
try:
    processed_validation_data_s3_uri
    print('[OK]')    
except NameError:
    print('+++++++++++++++++++++++++++++++')
    print('[ERROR] Please run the notebooks in the previous sections before you continue.')
    print('+++++++++++++++++++++++++++++++')

[OK]


In [7]:
print(processed_validation_data_s3_uri)

s3://sagemaker-us-west-2-085964654406/sagemaker-scikit-learn-2020-09-26-17-44-12-987/output/bert-validation


In [8]:
%store -r processed_test_data_s3_uri

In [9]:
try:
    processed_test_data_s3_uri
    print('[OK]')    
except NameError:
    print('+++++++++++++++++++++++++++++++')
    print('[ERROR] Please run the notebooks in the previous sections before you continue.')
    print('+++++++++++++++++++++++++++++++')

[OK]


In [10]:
print(processed_test_data_s3_uri)

s3://sagemaker-us-west-2-085964654406/sagemaker-scikit-learn-2020-09-26-17-44-12-987/output/bert-test


In [11]:
print(processed_train_data_s3_uri)
!aws s3 ls $processed_train_data_s3_uri/

s3://sagemaker-us-west-2-085964654406/sagemaker-scikit-learn-2020-09-26-17-44-12-987/output/bert-train
2020-09-26 17:50:36     352881 part-algo-1-amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tfrecord
2020-09-26 17:50:36      11912 part-algo-1-amazon_reviews_us_Digital_Video_Games_v1_00.tfrecord
2020-09-26 17:49:11      10766 part-algo-2-amazon_reviews_us_Digital_Software_v1_00.tfrecord


In [12]:
print(processed_validation_data_s3_uri)
!aws s3 ls $processed_validation_data_s3_uri/

s3://sagemaker-us-west-2-085964654406/sagemaker-scikit-learn-2020-09-26-17-44-12-987/output/bert-validation
2020-09-26 17:50:36      19944 part-algo-1-amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tfrecord
2020-09-26 17:50:36        699 part-algo-1-amazon_reviews_us_Digital_Video_Games_v1_00.tfrecord
2020-09-26 17:49:11        716 part-algo-2-amazon_reviews_us_Digital_Software_v1_00.tfrecord


In [13]:
print(processed_test_data_s3_uri)
!aws s3 ls $processed_test_data_s3_uri/

s3://sagemaker-us-west-2-085964654406/sagemaker-scikit-learn-2020-09-26-17-44-12-987/output/bert-test
2020-09-26 17:50:36      19970 part-algo-1-amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tfrecord
2020-09-26 17:50:36        710 part-algo-1-amazon_reviews_us_Digital_Video_Games_v1_00.tfrecord
2020-09-26 17:49:11        650 part-algo-2-amazon_reviews_us_Digital_Software_v1_00.tfrecord


In [14]:
!pip list

Package                            Version
---------------------------------- -------------------
absl-py                            0.10.0
alabaster                          0.7.12
anaconda-client                    1.7.2
anaconda-project                   0.8.3
argh                               0.26.2
asn1crypto                         1.3.0
astor                              0.8.1
astroid                            2.4.2
astropy                            4.0.1.post1
atomicwrites                       1.3.0
attrs                              19.3.0
Automat                            20.2.0
autopep8                           1.4.4
autovizwidget                      0.15.0
awscli                             1.18.137
awswrangler                        1.9.3
Babel                              2.8.0
backcall                           0.1.0
backports.shutil-get-terminal-size 1.0.0
bcrypt                             3.2.0
beautifulsoup4                     4.8.2
bit

typed-ast                          1.4.1
typing-extensions                  3.7.4.1
ujson                              1.35
unicodecsv                         0.14.1
urllib3                            1.25.8
watchdog                           0.10.2
wcwidth                            0.1.9
webencodings                       0.5.1
websocket-client                   0.57.0
Werkzeug                           1.0.1
wheel                              0.34.2
widgetsnbextension                 3.5.1
wrapt                              1.12.1
wurlitzer                          2.0.0
xlrd                               1.2.0
XlsxWriter                         1.2.8
xlwt                               1.3.0
yapf                               0.28.0
zict                               2.0.0
zipp                               2.2.0


In [15]:
from sagemaker.inputs import TrainingInput

s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, 
                                    distribution='ShardedByS3Key') 
s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, 
                                         distribution='ShardedByS3Key')
s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, 
                                   distribution='ShardedByS3Key')

print(s3_input_train_data.config)
print(s3_input_validation_data.config)
print(s3_input_test_data.config)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-west-2-085964654406/sagemaker-scikit-learn-2020-09-26-17-44-12-987/output/bert-train', 'S3DataDistributionType': 'ShardedByS3Key'}}}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-west-2-085964654406/sagemaker-scikit-learn-2020-09-26-17-44-12-987/output/bert-validation', 'S3DataDistributionType': 'ShardedByS3Key'}}}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-west-2-085964654406/sagemaker-scikit-learn-2020-09-26-17-44-12-987/output/bert-test', 'S3DataDistributionType': 'ShardedByS3Key'}}}


In [16]:
!cat src/tf_bert_reviews.py

import time
import random
import pandas as pd
from glob import glob
import pprint
import argparse
import json
import subprocess
import sys
import os
import tensorflow as tf
#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow==2.1.0'])
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'transformers==2.8.0'])
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sagemaker-tensorflow==2.1.0.1.0.0'])
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'smdebug==0.9.3'])
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.23.1'])
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib==3.2.1'])
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
from transformers import TextClassificationPipeline
from transformers.configuration_distilbert import DistilBertConfig
from tensorflow.keras.callbacks import ModelCheckp

# Setup Static Hyper-Parameters for Classification Layer
First, retrieve `max_seq_length` from the prepare phase.

In [17]:
%store -r max_seq_length

In [18]:
try:
    max_seq_length
    print('[OK]')
except NameError:
    print('+++++++++++++++++++++++++++++++')
    print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')
    print('+++++++++++++++++++++++++++++++')

[OK]


In [19]:
print(max_seq_length)

64


In [20]:
epsilon=0.00000001
validation_batch_size=128
test_batch_size=128
validation_steps=100
test_steps=100
train_instance_count=1
#train_instance_type='ml.m5.4xlarge' #bur
train_instance_type='ml.c5.4xlarge' #evt
train_volume_size=1024
use_xla=True
use_amp=True
enable_sagemaker_debugger=False
enable_checkpointing=False
enable_tensorboard=False
input_mode='Pipe'
run_validation=True
run_test=True
run_sample_predictions=True

# Track the Optimizations Within our Experiment

In [21]:
%store -r experiment_name

In [22]:
try:
    experiment_name
    print('[OK]')
except NameError:
    print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
    print('[ERROR] Please run the notebooks in the TRAIN section before you continue.')
    print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')

[OK]


In [23]:
print(experiment_name)

Amazon-Customer-Reviews-BERT-Experiment-1601145857


In [24]:
%store -r trial_name

In [25]:
try:
    trial_name
    print('[OK]')    
except NameError:
    print('+++++++++++++++++++++++++++++++')
    print('[ERROR] Please run the notebooks in the previous TRAIN section before you continue.')
    print('+++++++++++++++++++++++++++++++')

[OK]


In [26]:
print(trial_name)

trial-1601145857


In [27]:
import time
from smexperiments.trial import Trial

timestamp = '{}'.format(int(time.time()))

trial = Trial.load(trial_name=trial_name)
print(trial)

Trial(sagemaker_boto_client=<botocore.client.SageMaker object at 0x7fd6287a00f0>,trial_name='trial-1601145857',trial_arn='arn:aws:sagemaker:us-west-2:085964654406:experiment-trial/trial-1601145857',display_name='trial-1601145857',experiment_name='Amazon-Customer-Reviews-BERT-Experiment-1601145857',creation_time=datetime.datetime(2020, 9, 26, 18, 44, 17, 852000, tzinfo=tzlocal()),created_by={},last_modified_time=datetime.datetime(2020, 9, 26, 19, 19, 24, 267000, tzinfo=tzlocal()),last_modified_by={},response_metadata={'RequestId': 'fc5c4d8d-0235-465d-8be8-3261e8341f91', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'fc5c4d8d-0235-465d-8be8-3261e8341f91', 'content-type': 'application/x-amz-json-1.1', 'content-length': '326', 'date': 'Sat, 26 Sep 2020 19:43:47 GMT'}, 'RetryAttempts': 0})


In [28]:
from smexperiments.tracker import Tracker

tracker_optimize = Tracker.create(display_name='optimize-1', 
                                  sagemaker_boto_client=sm)

optimize_trial_component_name = tracker_optimize.trial_component.trial_component_name
print('Optimize trial component name {}'.format(optimize_trial_component_name))

Optimize trial component name TrialComponent-2020-09-26-194347-gwoe


# Attach the `deploy` Trial Component and Tracker as a Component to the Trial

In [29]:
trial.add_trial_component(tracker_optimize.trial_component)

# Setup Dynamic Hyper-Parameter Ranges to Explore


In [30]:
from sagemaker.tuner import IntegerParameter
from sagemaker.tuner import ContinuousParameter
from sagemaker.tuner import CategoricalParameter
from sagemaker.tuner import HyperparameterTuner
                                                
hyperparameter_ranges = {
    'epochs': CategoricalParameter([1, 2]),
    'learning_rate': ContinuousParameter(0.00001, 0.00005, scaling_type='Linear'),
    'train_batch_size': CategoricalParameter([64, 128]),
    'train_steps_per_epoch': CategoricalParameter([100, 200]),
    'freeze_bert_layer': CategoricalParameter([True, False])
}

# Setup Metrics

In [31]:
metrics_definitions = [
     {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\.]+)'},
     {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\.]+)'},
     {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\.]+)'},
     {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\.]+)'},
]

In [32]:
from sagemaker.tensorflow import TensorFlow

estimator = TensorFlow(entry_point='tf_bert_reviews.py',
                       source_dir='src',
                       role=role,
                       instance_count=train_instance_count,
                       instance_type=train_instance_type,
                       volume_size=train_volume_size,
                       py_version='py3',
                       framework_version='2.1.0',
                       hyperparameters={
                                        'epsilon': epsilon,
                                        'validation_batch_size': validation_batch_size,
                                        'test_batch_size': test_batch_size,                                                                                 
                                        'validation_steps': validation_steps,
                                        'test_steps': test_steps,
                                        'use_xla': use_xla,
                                        'use_amp': use_amp,
                                        'max_seq_length': max_seq_length,
                                        'enable_sagemaker_debugger': enable_sagemaker_debugger,                                        
                                        'enable_checkpointing': enable_checkpointing,
                                        'enable_tensorboard': enable_tensorboard,                                        
                                        'run_validation': run_validation,
                                        'run_test': run_test,
                                        'run_sample_predictions': run_sample_predictions},
                       input_mode=input_mode,
                       metric_definitions=metrics_definitions,
#                       max_run=7200 # max 2 hours * 60 minutes seconds per hour * 60 seconds per minute
                      )

# Setup HyperparameterTuner with Estimator and Hyper-Parameter Ranges

In [33]:
objective_metric_name = 'train:accuracy'

tuner = HyperparameterTuner(
    estimator=estimator,
    objective_type='Maximize',
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metrics_definitions,
    max_jobs=2,
    max_parallel_jobs=2,
    strategy='Random',
    early_stopping_type='Off'
)

# Start Tuning Job

In [34]:
tuner.fit(inputs={'train': s3_input_train_data, 
                  'validation': s3_input_validation_data,
                  'test': s3_input_test_data
          }, 
          include_cls_metadata=False,
          wait=False)

INFO:sagemaker:Creating hyperparameter tuning job with name: tensorflow-training-200926-1943


# Check Tuning Job Status
Re-run this cell to track the status.

In [35]:
from pprint import pprint

tuning_job_name = tuner.latest_tuning_job.job_name

job_description = sm.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)

status = job_description['HyperParameterTuningJobStatus']

pprint(job_description)

if status != 'Completed':
    job_count = job_description['TrainingJobStatusCounters']['Completed']
    print('Not yet complete, but {} jobs have completed.'.format(job_count))
    
    if job_description.get('BestTrainingJob', None):
        print("Best candidate:")
        pprint(job_description['BestTrainingJob']['TrainingJobName'])
        pprint(job_description['BestTrainingJob']['FinalHyperParameterTuningJobObjectiveMetric'])
    else:
        print("No training jobs have reported results yet.")    

{'CreationTime': datetime.datetime(2020, 9, 26, 19, 43, 51, 662000, tzinfo=tzlocal()),
 'HyperParameterTuningJobArn': 'arn:aws:sagemaker:us-west-2:085964654406:hyper-parameter-tuning-job/tensorflow-training-200926-1943',
 'HyperParameterTuningJobConfig': {'HyperParameterTuningJobObjective': {'MetricName': 'train:accuracy',
                                                                        'Type': 'Maximize'},
                                   'ParameterRanges': {'CategoricalParameterRanges': [{'Name': 'epochs',
                                                                                       'Values': ['"1"',
                                                                                                  '"2"']},
                                                                                      {'Name': 'train_batch_size',
                                                                                       'Values': ['"64"',
                                            

In [36]:
from IPython.core.display import display, HTML
    
display(HTML('<b>Review <a target="blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/hyper-tuning-jobs/{}">Hyper-Parameter Tuning Job</a></b>'.format(region, tuning_job_name)))

# _Please Wait for the ^^ Tuning Job ^^ to Complete Above_

In [37]:
%%time

tuner.wait()

..............................................................................................................................................................................................................................................................................................................................................................!
CPU times: user 1.24 s, sys: 98.5 ms, total: 1.34 s
Wall time: 29min 27s


# [INFO] _Feel free to continue to the next workshop section while this notebook is running._

# Show the Tuning Job
### _Note:  This will fail at first.  Please wait about 15-30 seconds and re-run._

In [38]:
from sagemaker.analytics import HyperparameterTuningJobAnalytics

hp_results = HyperparameterTuningJobAnalytics(
    sagemaker_session=sess, 
    hyperparameter_tuning_job_name=tuning_job_name
)

df_results = hp_results.dataframe()
df_results.shape

(2, 11)

In [39]:
df_results.sort_values('FinalObjectiveValue', ascending=0)

Unnamed: 0,epochs,freeze_bert_layer,learning_rate,train_batch_size,train_steps_per_epoch,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,"""1""","""False""",1.4e-05,"""128""","""200""",tensorflow-training-200926-1943-002-75de2dd3,Completed,0.4674,2020-09-26 19:45:42+00:00,2020-09-26 20:11:55+00:00,1573.0
1,"""1""","""True""",1.3e-05,"""128""","""100""",tensorflow-training-200926-1943-001-1a7cfa05,Completed,0.1611,2020-09-26 19:45:54+00:00,2020-09-26 20:03:04+00:00,1030.0


# Show the Best Candidate

In [40]:
df_results.sort_values('FinalObjectiveValue', ascending=0).head(1)

Unnamed: 0,epochs,freeze_bert_layer,learning_rate,train_batch_size,train_steps_per_epoch,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,"""1""","""False""",1.4e-05,"""128""","""200""",tensorflow-training-200926-1943-002-75de2dd3,Completed,0.4674,2020-09-26 19:45:42+00:00,2020-09-26 20:11:55+00:00,1573.0


# Log the Best Hyper-Parameter and Objective Metric in the Experiment

Logging `learning_rate` parameter and `accuracy` metric

In [41]:
best_learning_rate = df_results.sort_values('FinalObjectiveValue', ascending=0).head(1)['learning_rate']
print(best_learning_rate)

0    0.000014
Name: learning_rate, dtype: float64


In [42]:
best_accuracy = df_results.sort_values('FinalObjectiveValue', ascending=0).head(1)['FinalObjectiveValue']
print(best_accuracy)

0    0.4674
Name: FinalObjectiveValue, dtype: float64


In [43]:
tracker_optimize.log_parameters({
    'learning_rate': float(best_learning_rate)
})

# must save after logging
tracker_optimize.trial_component.save()

TrialComponent(sagemaker_boto_client=<botocore.client.SageMaker object at 0x7fd628d8b0f0>,trial_component_name='TrialComponent-2020-09-26-194347-gwoe',display_name='optimize-1',tags=None,trial_component_arn='arn:aws:sagemaker:us-west-2:085964654406:experiment-trial-component/trialcomponent-2020-09-26-194347-gwoe',response_metadata={'RequestId': 'd6290de2-008c-461c-b6f2-7d264c812366', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'd6290de2-008c-461c-b6f2-7d264c812366', 'content-type': 'application/x-amz-json-1.1', 'content-length': '129', 'date': 'Sat, 26 Sep 2020 20:13:21 GMT'}, 'RetryAttempts': 0},parameters={'learning_rate': 1.432303027860436e-05},input_artifacts={},output_artifacts={})

In [44]:
tracker_optimize.log_metric('accuracy', float(best_accuracy))

# must save after logging
tracker_optimize.trial_component.save()

TrialComponent(sagemaker_boto_client=<botocore.client.SageMaker object at 0x7fd628d8b0f0>,trial_component_name='TrialComponent-2020-09-26-194347-gwoe',display_name='optimize-1',tags=None,trial_component_arn='arn:aws:sagemaker:us-west-2:085964654406:experiment-trial-component/trialcomponent-2020-09-26-194347-gwoe',response_metadata={'RequestId': '448d460f-bf2d-488e-ae90-61bd81d78348', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '448d460f-bf2d-488e-ae90-61bd81d78348', 'content-type': 'application/x-amz-json-1.1', 'content-length': '129', 'date': 'Sat, 26 Sep 2020 20:13:21 GMT'}, 'RetryAttempts': 0},parameters={'learning_rate': 1.432303027860436e-05},input_artifacts={},output_artifacts={})

# Show Experiment Analytics

In [45]:
from sagemaker.analytics import ExperimentAnalytics

lineage_table = ExperimentAnalytics(
    sagemaker_session=sess,
    experiment_name=experiment_name,
    metric_names=['validation:accuracy'],
    sort_by="CreationTime",
    sort_order="Ascending",
)

lineage_df = lineage_table.dataframe()
lineage_df.shape

(3, 64)

In [46]:
lineage_df

Unnamed: 0,TrialComponentName,DisplayName,balance_dataset,max_seq_length,test_split_percentage,train_split_percentage,validation_split_percentage,raw_data_s3_uri - MediaType,raw_data_s3_uri - Value,test_data_s3_uri - MediaType,...,train - MediaType,train - Value,validation - MediaType,validation - Value,SageMaker.Checkpoints - MediaType,SageMaker.Checkpoints - Value,SageMaker.DebugHookOutput - MediaType,SageMaker.DebugHookOutput - Value,SageMaker.ModelArtifact - MediaType,SageMaker.ModelArtifact - Value
0,TrialComponent-2020-09-26-184417-oahc,prepare,True,64.0,0.05,0.9,0.05,s3/uri,s3://sagemaker-us-west-2-085964654406/amazon-r...,s3/uri,...,,,,,,,,,,
1,tensorflow-training-2020-09-26-18-44-25-975-aw...,train,,64.0,,,,,,,...,,s3://sagemaker-us-west-2-085964654406/sagemake...,,s3://sagemaker-us-west-2-085964654406/sagemake...,,s3://sagemaker-us-west-2-085964654406/checkpoi...,,s3://sagemaker-us-west-2-085964654406/,,s3://sagemaker-us-west-2-085964654406/tensorfl...
2,TrialComponent-2020-09-26-194347-gwoe,optimize-1,,,,,,,,,...,,,,,,,,,,


# Pass `tuning_job_name` to the Next Notebook

In [47]:
print(tuning_job_name)

tensorflow-training-200926-1943


In [48]:
%store tuning_job_name

Stored 'tuning_job_name' (str)


In [49]:
%store

Stored variables and their in-db values:
auto_ml_job_name                                      -> 'automl-dm-26-16-00-25'
autopilot_endpoint_name                               -> 'automl-dm-ep-26-16-21-49'
autopilot_train_s3_uri                                -> 's3://sagemaker-us-west-2-085964654406/data/amazon
balance_dataset                                       -> True
experiment_name                                       -> 'Amazon-Customer-Reviews-BERT-Experiment-160114585
ingest_create_athena_db_passed                        -> True
ingest_create_athena_table_parquet_passed             -> True
ingest_create_athena_table_tsv_passed                 -> True
max_seq_length                                        -> 64
prepare_trial_component_name                          -> 'TrialComponent-2020-09-26-184417-oahc'
processed_test_data_s3_uri                            -> 's3://sagemaker-us-west-2-085964654406/sagemaker-s
processed_train_data_s3_uri                           -> 's3://sa

In [None]:
%%javascript
Jupyter.notebook.save_checkpoint();
Jupyter.notebook.session.delete();