### Scenario Description

    In this notebook we are doing the following:
    - Using training & test data stored in CSV format
    - Doing binary classification
    - Using a pre-built Amazon container for XGBoost
    - Using basic hyperparameters (no tuning!)
    - Specifying debugging configurations

In [1]:
import sagemaker
import boto3
from sagemaker.session import s3_input

# SageMaker session used by the estimator, plus a low-level boto3 SageMaker client.
session = sagemaker.Session()
sm = boto3.Session().client('sagemaker')

# IAM role the training job runs under, and the region of the default boto3 session.
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

print(f'Role- {role}')
print(f'Region- {region}')

Role- arn:aws:iam::951135073253:role/service-role/AmazonSageMaker-ExecutionRole-20200722T234773
Region- eu-west-1


In [2]:
import time

# Bucket and key prefix under which all data and experiment outputs live.
BUCKET_NAME = 'snowflake-getting-started'
BASE_PREFIX = 'bank-marketing'

# Root S3 location for everything this experiment writes.
EXPERIMENTS_OUTPUT_LOC = f's3://{BUCKET_NAME}/{BASE_PREFIX}/experiments-xboost'
print(f'Experiment metadata would be published at - {EXPERIMENTS_OUTPUT_LOC}')

# Sub-locations for Debugger output and for trained model artifacts.
EXP_DEBUGGING_OUTPUTS = f'{EXPERIMENTS_OUTPUT_LOC}/debugging'
EXP_TRAINED_MODELS = f'{EXPERIMENTS_OUTPUT_LOC}/trained_models'

print(f'Experiment debugging data available at - {EXP_DEBUGGING_OUTPUTS}')
print(f'Experiment trained moddels available at - {EXP_TRAINED_MODELS}')

Experiment metadata would be published at - s3://snowflake-getting-started/bank-marketing/experiments-xboost
Experiment debugging data available at - s3://snowflake-getting-started/bank-marketing/experiments-xboost/debugging
Experiment trained moddels available at - s3://snowflake-getting-started/bank-marketing/experiments-xboost/trained_models


In [3]:
# Both channels are CSV files stored under the shared bucket/prefix.
content_type = "text/csv"

# Training channel: train/train_data.csv
train_input = s3_input(
    f"s3://{BUCKET_NAME}/{BASE_PREFIX}/train/train_data.csv",
    content_type=content_type,
)

# Validation channel: reads test/test_data.csv (the test split doubles as validation data here).
validation_input = s3_input(
    f"s3://{BUCKET_NAME}/{BASE_PREFIX}/test/test_data.csv",
    content_type=content_type,
)

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [4]:
from sagemaker.estimator import Estimator
from sagemaker.debugger import rule_configs, Rule, DebuggerHookConfig, CollectionConfig
from sagemaker.amazon.amazon_estimator import get_image_uri

# Debugger snapshot interval. It is kept as a string because it is passed
# verbatim as a hook parameter; convert with int() before doing arithmetic on it
# (multiplying the string itself would repeat it: '1' * 2 == '11').
save_interval = '1'

# URI of the pre-built XGBoost training image for this region.
container = get_image_uri(region, 'xgboost', repo_version='1.0-1')
print(container)

# Debugger hook: where to write captured data, how often, and which collections to save.
# See https://github.com/awslabs/sagemaker-debugger/blob/master/docs/api.md#built-in-collections
# for the supported collection names.
debugger_hook_config = DebuggerHookConfig(
    s3_output_path=EXP_DEBUGGING_OUTPUTS,
    hook_parameters={
        'save_interval': save_interval
    },
    collection_configs=[
        CollectionConfig(name="metrics"),
        CollectionConfig(name="feature_importance"),
        CollectionConfig(name="full_shap"),
        CollectionConfig(name="average_shap"),
    ],
)

# Built-in rule that watches the "metrics" collection for a loss that stops decreasing.
# num_steps is twice the save interval, computed numerically and then passed as a string.
loss_not_decreasing_rule = Rule.sagemaker(
    rule_configs.loss_not_decreasing(),
    rule_parameters={
        "collection_names": "metrics",
        "num_steps": str(int(save_interval) * 2),
    },
)

algorithm_mode_default_estimator = Estimator(container,
                                              train_instance_type='ml.m4.xlarge',
                                              train_instance_count=1,
                                              sagemaker_session = session,
                                              role = role,
                                              hyperparameters = {
                                                  'num_round':100,
                                                  'max_depth':3,
                                                  'eta':0.2,
                                                  'subsample':0.8,
                                                  "objective":"binary:logistic"
                                              },
                                              input_mode='File',
                                              enable_sagemaker_metrics=True,
                                              debugger_hook_config=debugger_hook_config,
                                              rules=[loss_not_decreasing_rule],
                                              output_path = EXP_TRAINED_MODELS
                                        )


algorithm_mode_default_estimator.fit(
    inputs={'train': train_input, 'validation': validation_input},
    logs=True,
    # Fire and forget: with wait=False the job is only submitted and control
    # returns to the notebook immediately. The polling cell that follows
    # reports the status of the training job and of the rule evaluation.
    wait=False
)

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


141502667606.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3


In [5]:
import time

# The job name and the SageMaker client do not change while polling, so look them up once.
job_name = algorithm_mode_default_estimator.latest_training_job.name
client = algorithm_mode_default_estimator.sagemaker_session.sagemaker_client

# Poll every 10 seconds, for at most 360 iterations (about one hour).
for _ in range(360):
    description = client.describe_training_job(TrainingJobName=job_name)
    training_job_status = description["TrainingJobStatus"]

    # Only the first rule's status is reported; the estimator is configured with a single rule.
    rule_job_summary = algorithm_mode_default_estimator.latest_training_job.rule_job_summary()
    rule_evaluation_status = rule_job_summary[0]["RuleEvaluationStatus"]
    print("Training job status: {}, Rule Evaluation Status: {}".format(training_job_status, rule_evaluation_status))

    # Stop once the rule evaluation has reached a final state. "Error" is final too:
    # without it a failed rule evaluation would keep this loop polling until it times out.
    if rule_evaluation_status in ["Stopped", "IssuesFound", "NoIssuesFound", "Error"]:
        break

    time.sleep(10)

Training job status: InProgress, Rule Evaluation Status: InProgress
Training job status: InProgress, Rule Evaluation Status: InProgress
Training job status: InProgress, Rule Evaluation Status: InProgress
Training job status: InProgress, Rule Evaluation Status: InProgress
Training job status: InProgress, Rule Evaluation Status: InProgress
Training job status: InProgress, Rule Evaluation Status: InProgress
Training job status: InProgress, Rule Evaluation Status: InProgress
Training job status: InProgress, Rule Evaluation Status: InProgress
Training job status: InProgress, Rule Evaluation Status: InProgress
Training job status: InProgress, Rule Evaluation Status: InProgress
Training job status: InProgress, Rule Evaluation Status: InProgress
Training job status: InProgress, Rule Evaluation Status: InProgress
Training job status: InProgress, Rule Evaluation Status: InProgress
Training job status: InProgress, Rule Evaluation Status: InProgress
Training job status: InProgress, Rule Evaluation

KeyboardInterrupt: 

In [6]:
# Describe the most recent entry in the estimator's job list; the returned
# description is shown as the cell output.
algorithm_mode_default_estimator.jobs[-1].describe()

{'TrainingJobName': 'sagemaker-xgboost-2020-07-30-19-42-26-952',
 'TrainingJobArn': 'arn:aws:sagemaker:eu-west-1:951135073253:training-job/sagemaker-xgboost-2020-07-30-19-42-26-952',
 'ModelArtifacts': {'S3ModelArtifacts': 's3://snowflake-getting-started/bank-marketing/experiments-xboost/trained_models/sagemaker-xgboost-2020-07-30-19-42-26-952/output/model.tar.gz'},
 'TrainingJobStatus': 'Completed',
 'SecondaryStatus': 'Completed',
 'HyperParameters': {'eta': '0.2',
  'max_depth': '3',
  'num_round': '100',
  'objective': 'binary:logistic',
  'subsample': '0.8'},
 'AlgorithmSpecification': {'TrainingImage': '141502667606.dkr.ecr.eu-west-1.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3',
  'TrainingInputMode': 'File',
  'MetricDefinitions': [{'Name': 'train:mae',
    'Regex': '.*\\[[0-9]+\\].*#011train-mae:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'},
   {'Name': 'validation:aucpr',
    'Regex': '.*\\[[0-9]+\\].*#011validation-aucpr:([-+]?[0-9]*\\.?[0-9]+(?:[eE][-+]?[0-9]+)?).*'},
  

### Visualizing and analyzing the debugger output

In [None]:
# Show the rule job summary for the latest training job (one entry per rule,
# each carrying a "RuleEvaluationStatus").
algorithm_mode_default_estimator.latest_training_job.rule_job_summary()

In [None]:
!pip install smdebug

In [None]:
from smdebug.trials import create_trial

# S3 path of the Debugger artifacts written by the latest training job.
s3_output_path = algorithm_mode_default_estimator.latest_job_debugger_artifacts_path()
# Open those artifacts as an smdebug trial; the cells below query it for tensors.
trial = create_trial(s3_output_path)

In [None]:
# Display the tensor names the trial reports.
trial.tensor_names()

#### Plot the feature_importance

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import re


def get_data(trial, tname):
    """
    Collect every recorded value of one tensor.

    Looks up the tensor called `tname` in `trial`, asks it for the steps
    it has data for, and reads the value at each of those steps.

    Returns a `(steps, values)` pair, where `values` is a list holding
    one value per entry of `steps`, in the same order.
    """
    recorded = trial.tensor(tname)
    step_ids = recorded.steps()
    values = []
    for step in step_ids:
        values.append(recorded.value(step))
    return step_ids, values

def plot_collection(trial, collection_name, regex='.*', figsize=(20, 20)):
    """
    Draw one line per tensor of a collection on a single figure.

    Tensors of `collection_name` in `trial` are visited in sorted name
    order; only those whose name matches `regex` (anchored at the start,
    as with `re.match`) are plotted, each labelled with its tensor name.
    The legend is placed to the right of the axes and the x axis is
    labelled 'Iteration'. `figsize` is forwarded to `plt.subplots`.
    """
    fig, ax = plt.subplots(figsize=figsize)
    sns.despine()

    all_names = sorted(trial.collection(collection_name).tensor_names)
    # Lazy filter: each name is matched just before it is plotted.
    selected = filter(lambda name: re.match(regex, name), all_names)

    for name in selected:
        x_steps, y_values = get_data(trial, name)
        ax.plot(x_steps, y_values, label=name)

    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    ax.set_xlabel('Iteration')

In [None]:
# Plot every tensor in the "metrics" collection (the default regex matches all names).
plot_collection(trial, "metrics")

In [None]:
def plot_feature_importance(trial, importance_type="weight"):
    """
    Plot the feature-importance tensors of one importance type.

    `importance_type` must be one of "weight", "gain", "cover",
    "total_gain" or "total_cover"; any other value raises ValueError.
    The plot is produced by `plot_collection` on the
    "feature_importance" collection, restricted to tensor names that
    start with "feature_importance/<importance_type>/".
    """
    allowed_types = ("weight", "gain", "cover", "total_gain", "total_cover")
    if importance_type in allowed_types:
        name_pattern = f"feature_importance/{importance_type}/.*"
        plot_collection(trial, "feature_importance", regex=name_pattern)
        return
    raise ValueError(f"{importance_type} is not one of the supported importance types.")

In [None]:
# Plot feature importance using the default importance type, "weight".
plot_feature_importance(trial)

### SHAP 

In [None]:
# Plot every tensor in the "average_shap" collection (the default regex matches all names).
plot_collection(trial,"average_shap")