In [None]:
# Uncomment the lines below to install or upgrade the packages this notebook uses.
# !pip install -q --upgrade pip
# !pip install -q sagemaker smdebug awscli sagemaker-experiments --upgrade

In [1]:
import os
import time

import boto3
import matplotlib.pyplot as plt
import numpy as np
import sagemaker
from sagemaker.debugger import Rule, DebuggerHookConfig, TensorBoardOutputConfig, CollectionConfig, rule_configs
from sagemaker.session import s3_input
from sagemaker.tensorflow import TensorFlow
from smdebug.trials import create_trial

In [9]:
# Create a SageMaker session and look up the execution role of this environment.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# Bucket and top-level folder names from which the notebook builds its S3 paths.
bucket_name, job_folder, dataset_folder = 'sagemaker-jobs-studio', 'jobs', 'datasets'

In [10]:
# Hyperparameters passed to the estimator (and from there to the training script).
hyperparams = {
    'epochs': 1,
    'learning-rate': 0.01,
    'batch-size': 256,
    'weight-decay': 2e-4,
    'momentum': 0.9,
    'optimizer': 'adam',
}

In [11]:
# Built-in Debugger rules to attach to the training job: each rule-config
# factory is called and wrapped with Rule.sagemaker, in the order listed.
rules = [
    Rule.sagemaker(make_rule_config())
    for make_rule_config in (
        rule_configs.vanishing_gradient,
        rule_configs.loss_not_decreasing,
        rule_configs.overfit,
    )
]

In [12]:
# Training-job name: fixed prefix plus a UTC timestamp (the trailing %j adds the day of the year).
job_name = f'tensorflow-debugger-{time.strftime("%Y-%m-%d-%H-%M-%S-%j", time.gmtime())}'
# S3 prefix for job output. It is built from `job_folder` so that the folder
# configured next to the bucket name is the single source of truth, rather
# than a second hard-coded copy of the same string.
output_path = f's3://{bucket_name}/{job_folder}'

# Metric name and the regex paired with it, handed to the estimator as metric_definitions.
metric_definitions = [{'Name': 'val_acc', 'Regex': 'val_acc: ([0-9\\.]+)'}]
# Instance type for training; the channel-path selection further down branches on this value.
train_instance_type = 'ml.p3.2xlarge'

# Configure the TensorFlow estimator that runs code/cifar10-training-sagemaker.py.
# Note that output_path is passed with a trailing slash while code_location is not.
tf_estimator = TensorFlow(
    entry_point='cifar10-training-sagemaker.py',
    source_dir='code',
    output_path=f'{output_path}/',
    code_location=output_path,
    role=role,
    train_instance_count=1,
    train_instance_type=train_instance_type,
    framework_version='1.14',
    py_version='py3',
    script_mode=True,
    metric_definitions=metric_definitions,
    hyperparameters=hyperparams,
    rules=rules,
)

# Choose where the three data channels are read from. SageMaker local mode is
# requested with either 'local' (CPU) or 'local_gpu', so both values use the
# on-disk copy of the dataset; any other instance type reads from S3.
if train_instance_type in ('local', 'local_gpu'):
    dataset_root = 'file://../dataset'
else:
    dataset_root = f's3://{bucket_name}/{dataset_folder}/cifar10-dataset'

# The sub-folder names are the same for the local and the S3 layout.
train_path = f'{dataset_root}/train'
val_path = f'{dataset_root}/validation'
eval_path = f'{dataset_root}/eval'

In [13]:
# Start the training job on the three data channels. wait=False is passed, so
# this cell is not expected to block until the job completes.
input_channels = {
    'training': train_path,
    'validation': val_path,
    'eval': eval_path,
}
tf_estimator.fit(input_channels, job_name=job_name, wait=False)

In [None]:
# Fetch the description of the estimator's latest training job through the
# low-level SageMaker client held by the estimator's session.
client = tf_estimator.sagemaker_session.sagemaker_client
job_name = tf_estimator.latest_training_job.name

description = client.describe_training_job(TrainingJobName=job_name)

In [None]:
# Display the rule-job summary of the latest training job (the cell's last expression is its output).
tf_estimator.latest_training_job.rule_job_summary()

In [None]:
# S3 prefix of this job's debugger output. Any trailing slash on the configured
# root is stripped before joining, so the result is well-formed whether or not
# S3OutputPath ends with '/'.
debug_root = description["DebugHookConfig"]["S3OutputPath"].rstrip('/')
debug_output = f"{debug_root}/{job_name}/debug-output/"

In [None]:
# Open the debugger artifacts of the latest training job as an smdebug trial.
# `create_trial` is imported in the notebook's top import cell.
trial = create_trial(tf_estimator.latest_job_debugger_artifacts_path())

In [None]:
# Display the collections reported by the trial.
trial.collections()

In [None]:
# Display the tensor names reported by the trial.
trial.tensor_names()

In [None]:
# Handle to the tensor named 'acc' (presumably the training accuracy — confirm against the training script).
t_acc = trial.tensor('acc')

In [None]:
# Recorded values of the 'acc' tensor; the plotting cell iterates this with .items(),
# so it is expected to be a mapping (keys look like step numbers — TODO confirm).
t_acc_val = t_acc.values()

In [None]:
# Scatter-plot the first element of every recorded 'acc' value against its key.
# `plt` comes from the notebook's top import cell.
plt.figure(num=1, figsize=(8, 8), dpi=60, facecolor='w', edgecolor='k')

# Collect the whole series first and draw it with a single scatter call,
# instead of creating one scatter artist per point.
steps = list(t_acc_val)
acc_values = [value[0] for value in t_acc_val.values()]
plt.scatter(steps, acc_values, c='b', marker='.')

# Label the figure so it can be read on its own.
plt.xlabel('step')  # keys of t_acc_val; assumed to be step numbers — TODO confirm
plt.ylabel('acc')
plt.title('acc per step')
plt.show()