In [None]:
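# Optional one-time setup: uncomment to install/upgrade the notebook's dependencies.
# NOTE: the cells below use SageMaker Python SDK v1 names (`s3_input`,
# `train_instance_count`); an unpinned --upgrade may install SDK v2, where these
# were renamed, so pin `sagemaker<2` if you rely on this cell.
# !pip install -q --upgrade pip
# !pip install -q sagemaker smdebug awscli sagemaker-experiments --upgrade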

In [1]:
import os
import numpy as np
import time
import sagemaker
import boto3
from sagemaker.session import s3_input
from sagemaker.debugger import Rule, DebuggerHookConfig, TensorBoardOutputConfig, CollectionConfig, rule_configs
from sagemaker.tensorflow import TensorFlow

from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

In [2]:
# Names and S3 layout shared by the rest of the notebook.
experiment_name = 'sagemaker-debugger-cifar10-experiment'
bucket_name = 'sagemaker-jobs-studio'
dataset_folder = 'datasets'
job_folder = 'jobs'

# A single boto3 session backs both the SageMaker SDK session and the
# low-level 'sagemaker' client handed to the experiments API.
sess = boto3.Session()
sagemaker_session = sagemaker.Session(boto_session=sess)
sm = sess.client('sagemaker')

# IAM role the training job will run under.
role = sagemaker.get_execution_role()

In [3]:
# Get-or-create the experiment named by `experiment_name`.
# list_experiments is a paginated API: a single call only returns the first
# page, so walk every page before deciding the experiment does not exist
# (otherwise Experiment.create would be attempted on an existing name).
experiment_pages = sm.get_paginator('list_experiments').paginate()
exp_exists = any(
    summary['ExperimentName'] == experiment_name
    for page in experiment_pages
    for summary in page['ExperimentSummaries']
)

if exp_exists:
    # Reuse the existing experiment, going through the same client as above.
    debugger_experiment = Experiment.load(
        experiment_name=experiment_name,
        sagemaker_boto_client=sm,
    )
else:
    debugger_experiment = Experiment.create(
        experiment_name=experiment_name,
        description="Experiment to track cifar10 debugger trials",
        sagemaker_boto_client=sm,
    )

In [5]:
# Trial names must be unique, so stamp the name with a full UTC timestamp
# (year through second, plus day-of-year). The earlier month-day-second
# format repeated every minute and could collide within the same day.
trial_timestamp = time.strftime("%Y-%m-%d-%H-%M-%S-%j", time.gmtime())
trial_name = f'tf-smdebug-trial-{trial_timestamp}'

# Register the trial under the experiment loaded/created above.
cifar10_trial = Trial.create(
    trial_name=trial_name,
    experiment_name=debugger_experiment.experiment_name,
    sagemaker_boto_client=sm,
)

In [6]:
# Hyperparameters handed to the training script through the estimator.
hyperparams = {
    'epochs': 100,
    'learning-rate': 0.01,
    'batch-size': 256,
    'weight-decay': 2e-4,
    'momentum': 0.9,
    'optimizer': 'adam',
}

In [7]:
# Built-in Debugger rules to attach to the training job, one Rule per
# rule-config factory, in the order listed.
debug_rules = [
    Rule.sagemaker(make_rule_config())
    for make_rule_config in (
        rule_configs.loss_not_decreasing,
        rule_configs.overtraining,
    )
]

In [8]:
# Unique training job name: fixed prefix plus a UTC timestamp.
job_name = f'tensorflow-debugger-{time.strftime("%Y-%m-%d-%H-%M-%S-%j", time.gmtime())}'

# Build the job prefix from the configured `job_folder` instead of repeating
# the literal folder name, so the S3 layout is controlled from one place.
output_path = f's3://{bucket_name}/{job_folder}'

train_instance_type = 'ml.p3.2xlarge'

# NOTE(review): output_path is passed with a trailing slash while code_location
# is not; a later cell appends the job name directly to the hook's
# S3OutputPath, which presumably depends on that slash — confirm before changing.
tf_estimator = TensorFlow(entry_point               = 'cifar10-training-sagemaker.py', 
                          source_dir                = 'code',
                          output_path               = output_path + '/',
                          code_location             = output_path,
                          role                      = role,
                          train_instance_count      = 1, 
                          train_instance_type       = train_instance_type,
                          framework_version         = '1.15', 
                          py_version                = 'py3',
                          script_mode               = True,
                          sagemaker_session         = sagemaker_session,
                          hyperparameters           = hyperparams,
                          rules                     = debug_rules)

In [9]:
# All three dataset splits live under one S3 prefix.
dataset_root = f's3://{bucket_name}/{dataset_folder}/cifar10-dataset'
train_path = f'{dataset_root}/train'
val_path = f'{dataset_root}/validation'
eval_path = f'{dataset_root}/eval'

# Channel name -> S3 location passed to the training job.
input_channels = {
    'training': train_path,
    'validation': val_path,
    'eval': eval_path,
}

# Associates the training job with the experiment and trial created earlier.
experiment_config = {
    "ExperimentName": debugger_experiment.experiment_name,
    "TrialName": cifar10_trial.trial_name,
    "TrialComponentDisplayName": job_name,
}

# wait=False: submit the job and return without blocking on completion.
tf_estimator.fit(
    input_channels,
    job_name=job_name,
    wait=False,
    experiment_config=experiment_config,
)

INFO:sagemaker:Creating training-job with name: tensorflow-debugger-2020-03-20-05-42-32-080


In [None]:
# Look up the job that the estimator launched most recently and fetch its
# full description through the estimator's own SageMaker client.
training_job = tf_estimator.latest_training_job
job_name = training_job.name

client = tf_estimator.sagemaker_session.sagemaker_client
description = client.describe_training_job(TrainingJobName=job_name)

In [None]:
# Display the rule job summary reported for the latest training job
# (the estimator was configured with the rules in `debug_rules`).
tf_estimator.latest_training_job.rule_job_summary()

In [None]:
# Build the S3 prefix holding this job's debugger output:
#   <hook S3OutputPath>/<job name>/debug-output/
# Normalise the base path first so the result is well-formed whether or not
# the configured S3OutputPath ends with a slash (plain concatenation would
# otherwise glue the job name onto the last path segment).
debug_output_base = description["DebugHookConfig"]["S3OutputPath"].rstrip('/')
debug_output = debug_output_base + '/' + job_name + '/' + 'debug-output/'
print(debug_output)

In [None]:
from smdebug.trials import create_trial

# Open an smdebug trial over the debugger artifacts written by the latest job.
debugger_artifacts_path = tf_estimator.latest_job_debugger_artifacts_path()
trial = create_trial(debugger_artifacts_path)

In [None]:
# Inspect which collections the debugger trial exposes.
trial.collections()

In [None]:
# List the tensor names available in the trial (a later cell looks up 'acc').
trial.tensor_names()

In [None]:
# Show the modes reported by the trial.
trial.modes()

In [None]:
# Handle to the tensor saved under the name 'acc'
# (presumably the training accuracy metric — confirm against trial.tensor_names()).
t_acc = trial.tensor('acc')

In [None]:
# All recorded values of the 'acc' tensor; consumed below via .items(),
# so this is a mapping (presumably step -> value — confirm).
t_acc_val = t_acc.values()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Plot the first element of every recorded 'acc' value against its key.
fig = plt.figure(num=1, figsize=(8, 8), dpi=60,
                 facecolor='w', edgecolor='k')
ax = fig.gca()

# Gather all points first and draw them with a single scatter call; calling
# scatter once per point creates one artist per step and gets slow for long runs.
steps = list(t_acc_val)
first_values = [t_acc_val[step][0] for step in steps]
ax.scatter(steps, first_values, c='b', marker='.')

# Label the figure so it can be read on its own.
# NOTE(review): keys are assumed to be step numbers — confirm against smdebug.
ax.set_xlabel('step')
ax.set_ylabel('acc')
ax.set_title("'acc' tensor values")
plt.show()