### Scenario Description

    In this notebook we are doing the following
    - Using training & test data present in csv format
    - doing binary classification
    - Using a pre-built amazon container for xgboost
    - hyperparamters tuning
    - distributed training
    - specifying debugging configurations

In [1]:
import sagemaker
import boto3
from sagemaker.session import s3_input

session = sagemaker.Session()
sm = boto3.Session().client('sagemaker')
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

print ('Role-',role)
print ('Region-',region)

Role- arn:aws:iam::951135073253:role/service-role/AmazonSageMaker-ExecutionRole-20200722T234773
Region- eu-west-1


In [2]:
import time

BUCKET_NAME = 'snowflake-getting-started'
BASE_PREFIX = 'bank-marketing'

EXPERIMENTS_OUTPUT_LOC = 's3://'+BUCKET_NAME+'/'+BASE_PREFIX+'/experiments-xboost-distributedtraining'
print ('Experiment metadata would be published at -',EXPERIMENTS_OUTPUT_LOC)

EXP_CHECKPOINT=EXPERIMENTS_OUTPUT_LOC+'/checkpoint'
EXP_DEBUGGING_OUTPUTS=EXPERIMENTS_OUTPUT_LOC+'/debugging'
EXP_TRAINED_MODELS=EXPERIMENTS_OUTPUT_LOC+'/trained_models'
EXP_SOURCE_CODE= EXPERIMENTS_OUTPUT_LOC+'/code'
EXP_ESTIMATOR= BASE_PREFIX+'/experiments-xboost-distributedtraining/estimator'

print ('Experiment debugging data available at -',EXP_DEBUGGING_OUTPUTS)
print ('Experiment trained models available at -',EXP_TRAINED_MODELS)
print ('Experiment checkpoints available at -',EXP_CHECKPOINT)
print ('Experiment code available at -',EXP_SOURCE_CODE)
print ('Experiment estimator available at -',EXP_ESTIMATOR)

Experiment metadata would be published at - s3://snowflake-getting-started/bank-marketing/experiments-xboost-distributedtraining
Experiment debugging data available at - s3://snowflake-getting-started/bank-marketing/experiments-xboost-distributedtraining/debugging
Experiment trained models available at - s3://snowflake-getting-started/bank-marketing/experiments-xboost-distributedtraining/trained_models
Experiment checkpoints available at - s3://snowflake-getting-started/bank-marketing/experiments-xboost-distributedtraining/checkpoint
Experiment code available at - s3://snowflake-getting-started/bank-marketing/experiments-xboost-distributedtraining/code
Experiment estimator available at - bank-marketing/experiments-xboost-distributedtraining/estimator


In [3]:
from sagemaker.estimator import Estimator
from sagemaker.debugger import rule_configs, Rule, DebuggerHookConfig, CollectionConfig
from sagemaker.amazon.amazon_estimator import get_image_uri

# we use the Hyperparameter Tuner
from sagemaker.tuner import IntegerParameter
from sagemaker.tuner import ContinuousParameter
from sagemaker.tuner import HyperparameterTuner

def train_xgboost(instances, mode):
    # define the data type and paths to the training and validation datasets
    content_type = "text/csv"
    train_input = s3_input("s3://{}/{}/{}".format(BUCKET_NAME, BASE_PREFIX, 'train/train_data.csv'), content_type=content_type,distribution=mode)
    validation_input = s3_input("s3://{}/{}/{}".format(BUCKET_NAME, BASE_PREFIX, 'test/test_data.csv'), content_type=content_type)

    save_interval ='1'
    container = get_image_uri(region, 'xgboost',repo_version='1.0-1')

    algorithm_mode_default_estimator = Estimator(container,
                                                  train_instance_type='ml.m4.xlarge',
                                                  train_instance_count=instances,
                                                  sagemaker_session = session,
                                                  role = role,
                                                  input_mode='File',
                                                  enable_network_isolation = True, #disallow internet connection,
                                                  checkpoint_s3_uri = EXP_CHECKPOINT,
                                                  enable_sagemaker_metrics=True,
                                                  debugger_hook_config=DebuggerHookConfig(
                                                              s3_output_path=EXP_DEBUGGING_OUTPUTS, 
                                                              hook_parameters={
                                                                'save_interval': '1'
                                                              },
                                                              # Required - See https://github.com/awslabs/sagemaker-debugger/blob/master/docs/api.md#built-in-collections for supported collections
                                                              collection_configs=[ 
                                                                  CollectionConfig( name="metrics"), 
                                                                  CollectionConfig( name="feature_importance"), 
                                                                  CollectionConfig( name="full_shap"), 
                                                                  CollectionConfig( name="average_shap"), 
                                                              ],
                                                            ),
                                                  rules=[ 
                                                      Rule.sagemaker( 
                                                          rule_configs.loss_not_decreasing(), 
                                                          rule_parameters={ "collection_names": "metrics", "num_steps": str(save_interval * 2), }, 
                                                      ), 
                                                  ],
                                                  output_path = EXP_TRAINED_MODELS
                                            )

    # Define exploration boundaries (default suggested values from Amazon SageMaker Documentation)
    hyperparameter_ranges = {
        'alpha': ContinuousParameter(0, 1000, scaling_type="Auto"),
        'eta': ContinuousParameter(0.1, 0.5, scaling_type='Logarithmic'),
        'max_depth': IntegerParameter(5,10,scaling_type='Auto'),
        'min_child_weight': ContinuousParameter(0,10,scaling_type='Auto'),
        'num_round': IntegerParameter(1,4000,scaling_type='Auto'),
        'subsample': ContinuousParameter(0.5,1,scaling_type='Logarithmic')}

    objective_metric_name = 'validation:auc'

    algorithm_mode_hyper_tuning_estimator = HyperparameterTuner(
                                                                algorithm_mode_default_estimator,
                                                                objective_metric_name,
                                                                hyperparameter_ranges,
                                                                max_jobs=3,
                                                                max_parallel_jobs=3,
                                                                strategy='Bayesian'
                                                            )

    algorithm_mode_hyper_tuning_estimator.fit(
                                                inputs={'train': train_input, 'validation': validation_input},    
                                                logs=True,
                                                # This is a fire and forget event. By setting wait=False, you just submit the job to run in the background.
                                                # Amazon SageMaker starts one training job and release control to next cells in the notebook.
                                                # Follow this notebook to see status of the training job.
                                                wait=False
                                            )
    return algorithm_mode_hyper_tuning_estimator

### Compare the model training runs for an experiment

Now we will use the analytics capabilities of Python SDK to query and compare the training runs for identifying the best model produced by our experiment. You can retrieve trial components by using a search expression.

In [4]:
import time

def monitor_training_job(estimator, instances, mode,results_df):

    # Retrieve analytics object
    algorithm_mode_hyper_tuning_estimator_analytics = estimator.analytics()

    t0 = time.time()
    while (1):        
        status = estimator.describe()['HyperParameterTuningJobStatus']
        if (status=='Completed' or status=='Failed'):
            print ('Hyper parameter tuning job completed - displaying results')
            t1 = time.time()
            break
        else:
            print ('Training in progress')
        time.sleep(60)
        
    timetaken = t1-t0

    results_df=results_df.append({'Instances':instances,
                                  'Time':timetaken/60,
                                  'Mode':mode
                                 },
                                 ignore_index=True)       

    # Look at summary of associated training jobs
    tuner_dataframe = algorithm_mode_hyper_tuning_estimator_analytics.dataframe()

    tuner_dataframe
    
    return results_df

In [5]:
import pandas as pd

results_df = pd.DataFrame(columns=['Instances','Time','Mode'])

In [6]:
estimator = train_xgboost(3, 'FullyReplicated')
results_df = monitor_training_job(estimator, 2, 'FullyReplicated',results_df)
print (results_df)

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


Training in progress
Training in progress
Training in progress
Training in progress
Training in progress
Training in progress
Hyper parameter tuning job completed - displaying results
  Instances      Time             Mode
0         2  6.011087  FullyReplicated


In [7]:
estimator = train_xgboost(3, 'FullyReplicated')
results_df = monitor_training_job(estimator, 3, 'FullyReplicated',results_df)
print (results_df)



Training in progress
Training in progress
Training in progress
Training in progress
Training in progress
Training in progress
Training in progress
Hyper parameter tuning job completed - displaying results
  Instances      Time             Mode
0         2  6.011087  FullyReplicated
1         3  7.012280  FullyReplicated


In [8]:
estimator = train_xgboost(3, 'ShardedByS3Key')
results_df = monitor_training_job(estimator,2,'ShardedByS3Key',results_df)
print (results_df)



Training in progress
Training in progress
Training in progress
Training in progress
Training in progress
Training in progress


KeyboardInterrupt: 

In [None]:
estimator = train_xgboost(3, 'ShardedByS3Key')
results_df = monitor_training_job(estimator,3,'ShardedByS3Key',results_df)
print (results_df)

In [None]:
estimator = train_xgboost(1, 'FullyReplicated')
results_df = monitor_training_job(estimator, 1, 'FullyReplicated',results_df)
print (results_df)