In [1]:
# !pip install -q --upgrade pip
# !pip install -q sagemaker smdebug awscli sagemaker-experiments --upgrade

In [2]:
import os
import time

import boto3
import matplotlib.pyplot as plt
import numpy as np
import sagemaker
from sagemaker.debugger import Rule, DebuggerHookConfig, TensorBoardOutputConfig, CollectionConfig, rule_configs
from sagemaker.session import s3_input
from sagemaker.tensorflow import TensorFlow
from smdebug.trials import create_trial

In [3]:
# S3 layout used by the rest of the notebook: one bucket, with separate
# folders for training-job artifacts and for the input datasets.
bucket_name = 'sagemaker-jobs-studio'
job_folder = 'jobs'
dataset_folder = 'datasets'

# SageMaker session plus the execution role that is later handed to the estimator.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

In [4]:
# Hyperparameters handed to the TensorFlow estimator (via its
# `hyperparameters` argument) for the training entry point.
hyperparams = {
    'epochs': 100,
    'learning-rate': 1e-5,
    'batch-size': 256,
    'weight-decay': 2e-4,
    'momentum': 0.9,
    'optimizer': 'adam',
}

In [5]:
# Built-in SageMaker Debugger rules to evaluate against the training job.
# The rule_job_summary() output recorded in this notebook lists four rules
# (VanishingGradient, LossNotDecreasing, Overfit, ExplodingTensor), so all four
# are configured here to keep the code in agreement with the saved output.
# NOTE(review): each rule runs as its own evaluation job — drop any that are not wanted.
rules = [
    Rule.sagemaker(rule_configs.vanishing_gradient()),
    Rule.sagemaker(rule_configs.loss_not_decreasing()),
    Rule.sagemaker(rule_configs.overfit()),
    Rule.sagemaker(rule_configs.exploding_tensor()),
]

In [6]:
# Job name with a UTC timestamp suffix so that repeated runs get distinct names.
# The trailing %j field is the day of the year (e.g. 079).
job_name = f'tensorflow-debugger-{time.strftime("%Y-%m-%d-%H-%M-%S-%j", time.gmtime())}'

# Build the artifact prefix from the shared `job_folder` setting instead of
# repeating the 'jobs' literal, so the folder is configured in one place only.
output_path = f's3://{bucket_name}/{job_folder}'

train_instance_type = 'ml.p3.2xlarge'

# Script-mode TensorFlow estimator with the Debugger rules attached.
# Note that `output_path` is passed with a trailing slash while
# `code_location` is passed without one.
tf_estimator = TensorFlow(
    entry_point='cifar10-training-sagemaker.py',
    source_dir='code',
    output_path=output_path + '/',
    code_location=output_path,
    role=role,
    train_instance_count=1,
    train_instance_type=train_instance_type,
    framework_version='1.15',
    py_version='py3',
    script_mode=True,
    hyperparameters=hyperparams,
    rules=rules,
)

In [7]:
# S3 prefixes of the three dataset splits under the shared dataset folder.
train_path = f's3://{bucket_name}/{dataset_folder}/cifar10-dataset/train'
val_path = f's3://{bucket_name}/{dataset_folder}/cifar10-dataset/validation'
eval_path = f's3://{bucket_name}/{dataset_folder}/cifar10-dataset/eval'

# Input channel name -> S3 location for that channel.
input_channels = {
    'training': train_path,
    'validation': val_path,
    'eval': eval_path,
}

# Start the training job under the explicit job name; wait=False asks fit()
# not to wait for the job, so the following cells can inspect it while it runs.
tf_estimator.fit(input_channels, job_name=job_name, wait=False)

In [8]:
# Re-read the job name from the estimator itself and fetch the job's
# description through the estimator session's low-level SageMaker client.
latest_job = tf_estimator.latest_training_job
job_name = latest_job.name

client = tf_estimator.sagemaker_session.sagemaker_client
description = client.describe_training_job(TrainingJobName=job_name)

In [9]:
# Display the per-rule summary for the latest training job: one entry per
# Debugger rule with its configuration name, evaluation status and last-modified time.
tf_estimator.latest_training_job.rule_job_summary()

[{'RuleConfigurationName': 'VanishingGradient',
  'RuleEvaluationStatus': 'InProgress',
  'LastModifiedTime': datetime.datetime(2020, 3, 19, 10, 37, 0, 748000, tzinfo=tzlocal())},
 {'RuleConfigurationName': 'LossNotDecreasing',
  'RuleEvaluationStatus': 'InProgress',
  'LastModifiedTime': datetime.datetime(2020, 3, 19, 10, 37, 0, 748000, tzinfo=tzlocal())},
 {'RuleConfigurationName': 'Overfit',
  'RuleEvaluationStatus': 'InProgress',
  'LastModifiedTime': datetime.datetime(2020, 3, 19, 10, 37, 0, 748000, tzinfo=tzlocal())},
 {'RuleConfigurationName': 'ExplodingTensor',
  'RuleEvaluationStatus': 'InProgress',
  'LastModifiedTime': datetime.datetime(2020, 3, 19, 10, 37, 0, 748000, tzinfo=tzlocal())}]

In [10]:
# Location of the debugger output for this job:
# <hook S3OutputPath>/<job name>/debug-output/
# The hook prefix is normalised with rstrip('/') so the result is well-formed
# whether or not S3OutputPath was configured with a trailing slash (plain
# concatenation would otherwise glue the job name onto the last path segment).
debug_prefix = description["DebugHookConfig"]["S3OutputPath"].rstrip('/')
debug_output = f'{debug_prefix}/{job_name}/debug-output/'
print(debug_output)

s3://sagemaker-jobs-studio/jobs/tensorflow-debugger-2020-03-19-10-37-00-079/debug-output/


In [12]:
# Open an smdebug trial on the debugger artifacts path reported by the
# estimator for its latest job (`create_trial` is imported in the top import cell).
trial = create_trial(tf_estimator.latest_job_debugger_artifacts_path())

[2020-03-19 10:51:56.883 a6c6594265ef:639 INFO s3_trial.py:42] Loading trial debug-output at path s3://sagemaker-jobs-studio/jobs/tensorflow-debugger-2020-03-19-10-37-00-079/debug-output


In [None]:
# Display what the loaded trial reports from collections()
# (presumably the tensor collections saved during training — confirm against the smdebug docs).
trial.collections()

In [None]:
# Display what the loaded trial reports from tensor_names()
# (presumably the names of the saved tensors; 'acc' is looked up by name in a later cell).
trial.tensor_names()

In [14]:
# Display the modes present in the trial; the recorded output shows TRAIN and EVAL.
trial.modes()

dict_keys([<ModeKeys.TRAIN: 1>, <ModeKeys.EVAL: 2>])

In [None]:
# Look up the tensor named 'acc' in the trial; its values are read in the next cell.
t_acc = trial.tensor('acc')

In [None]:
# Fetch the recorded values of the 'acc' tensor. The plotting cell iterates the
# result with .items() and indexes each value with [0], so this is a mapping
# whose values are indexable (keys presumably step numbers — confirm against the smdebug docs).
t_acc_val = t_acc.values()

In [None]:
# Scatter plot of the recorded 'acc' tensor: one point per entry of t_acc_val,
# with the entry's key on the x axis and the first element of its value on the y axis.
# matplotlib (`plt`) and numpy come from the notebook's top import cell.
fig, ax = plt.subplots(num=1, figsize=(8, 8), dpi=60,
                       facecolor='w', edgecolor='k')

# Collect all points first and draw them with a single scatter call instead of
# creating one scatter artist per point.
steps = list(t_acc_val.keys())
acc_values = [value[0] for value in t_acc_val.values()]
ax.scatter(steps, acc_values, c='b', marker='.')

# Label the figure so it can be read on its own.
# NOTE(review): x label assumes the keys of t_acc_val are step numbers — confirm.
ax.set(title="Recorded 'acc' tensor", xlabel='step', ylabel='acc')
plt.show()