In [23]:
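# Optional setup: uncomment the lines below to install the notebook's dependencies.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# %pip install -q --upgrade pip
# %pip install -q sagemaker smdebug awscli sagemaker-experiments --upgrade
# %pip install matplotlib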

In [24]:
import os
import numpy as np
import time
import sys
import sagemaker
import boto3
import matplotlib.pyplot as plt
import pandas as pd

from sagemaker.session import s3_input
from sagemaker.debugger import Rule, DebuggerHookConfig, TensorBoardOutputConfig, CollectionConfig, rule_configs
from sagemaker.tensorflow import TensorFlow

from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

# The low-level SageMaker client (`sm`) and the SDK session are both derived
# from the same boto3 session.
sess = boto3.Session()
sagemaker_session = sagemaker.Session(boto_session=sess)
sm = sess.client('sagemaker')

# Role handed to the estimator further down.
role = sagemaker.get_execution_role()

In [25]:
# Experiment name plus the S3 bucket and the prefixes used inside it.
experiment_name = 'sagemaker-debugger-cifar10-experiment'
bucket_name, job_folder, dataset_folder = 'sagemaker-jobs-studio', 'jobs', 'datasets'

# The three dataset splits sit side by side under the same dataset prefix.
train_path, val_path, eval_path = (
    f's3://{bucket_name}/{dataset_folder}/cifar10-dataset/{split}'
    for split in ('train', 'validation', 'eval')
)

In [26]:
# Load the experiment when one named `experiment_name` is already listed,
# otherwise create it. The same SageMaker client (`sm`) is passed to list, load
# and create so all three calls go through the one client, and the name comes
# from `experiment_name` in every branch instead of being repeated as a literal.
exp_exists = any(
    exp.experiment_name == experiment_name
    for exp in Experiment.list(sagemaker_boto_client=sm)
)

if exp_exists:
    debugger_experiment = Experiment.load(experiment_name,
                                          sagemaker_boto_client=sm)
else:
    debugger_experiment = Experiment.create(
                                experiment_name = experiment_name,
                                description     = "Experiment to track cifar10 debugger trials",
                                sagemaker_boto_client=sm)

In [27]:
# Hyperparameters handed to the estimator's `hyperparameters` argument.
# Keys use hyphens, so they are spelled as string literals.
hyperparams = {
    'epochs': 200,
    'learning-rate': 0.01,
    'batch-size': 256,
    'weight-decay': 2e-4,
    'momentum': 0.9,
    'optimizer': 'adam',
    'model-type': 'custom',  # also embedded in the training job name
}

In [28]:
# Two rules taken from the SDK's built-in rule configurations.
builtin_rules = [
    Rule.sagemaker(make_rule_config())
    for make_rule_config in (rule_configs.overtraining, rule_configs.overfit)
]

# Custom rule: invokes `CustomGradientRule` from rules/my_custom_rule.py.
# NOTE(review): the evaluator image URI is pinned to us-east-2 -- confirm it
# matches the region the job runs in.
custom_gradient_rule = Rule.custom(
    name='MyCustomRule',
    image_uri='840043622174.dkr.ecr.us-east-2.amazonaws.com/sagemaker-debugger-rule-evaluator:latest',
    instance_type='ml.t3.medium',
    source='rules/my_custom_rule.py',
    rule_to_invoke='CustomGradientRule',
    volume_size_in_gb=30,
    rule_parameters={"threshold": "20.0"},
)

debug_rules = [*builtin_rules, custom_gradient_rule]

In [29]:
# Names of the tensor collections requested from the debugger hook.
saved_collections = ("losses", "weights", "gradients", "biases")

# `save_interval` is passed as the string '100', as in the hook parameters dict.
debugger_hook_config = DebuggerHookConfig(
    hook_parameters={"save_interval": '100'},
    collection_configs=[CollectionConfig(collection) for collection in saved_collections],
)

List of built-in collections:<br>
https://github.com/awslabs/sagemaker-debugger/blob/master/docs/api.md#collection

In [30]:
# Job output and uploaded code go under the bucket's job prefix. The prefix comes
# from `job_folder` (defined with the other path settings) rather than a second
# hard-coded 'jobs' literal, so changing it in one place is enough.
output_path = f's3://{bucket_name}/{job_folder}'
train_instance_type = 'ml.p3.2xlarge'

# Script-mode TensorFlow estimator with the debugger hook and rules attached.
# NOTE(review): `output_path` is given a trailing slash while `code_location`
# is not; kept exactly as before -- confirm whether the difference is intended.
tf_estimator = TensorFlow(entry_point          = 'cifar10-training-sagemaker.py',
                          source_dir           = 'code',
                          output_path          = output_path + '/',
                          code_location        = output_path,
                          role                 = role,
                          train_instance_count = 1,
                          train_instance_type  = train_instance_type,
                          framework_version    = '1.15',
                          py_version           = 'py3',
                          script_mode          = True,
                          sagemaker_session    = sagemaker_session,
                          hyperparameters      = hyperparams,
                          debugger_hook_config = debugger_hook_config,
                          rules                = debug_rules)

In [31]:
# One UTC timestamp is shared by the trial name and the job name.
time_append = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
trial_name = '2-GTC2020-trial-' + time_append

# Register a new trial under the experiment loaded/created earlier.
cifar10_trial = Trial.create(trial_name=trial_name,
                             experiment_name=debugger_experiment.experiment_name,
                             sagemaker_boto_client=sm)

# The job name embeds the model type taken from the hyperparameters.
job_name = '2-GTC2020-{}-{}'.format(hyperparams["model-type"], time_append)

# Experiment/trial association passed to `fit`.
experiment_config = dict(
    ExperimentName=debugger_experiment.experiment_name,
    TrialName=cifar10_trial.trial_name,
    TrialComponentDisplayName=job_name,
)

In [32]:
# Channel name -> S3 prefix for each dataset split.
input_channels = {
    'training': train_path,
    'validation': val_path,
    'eval': eval_path,
}

# `wait=False` is passed here; the job status is polled separately afterwards.
tf_estimator.fit(
    input_channels,
    job_name=job_name,
    wait=False,
    experiment_config=experiment_config,
)

INFO:sagemaker:Creating training-job with name: 2-GTC2020-custom-2020-03-22-22-25-33


In [33]:
# Fetch the current description of the most recently launched training job.
client = tf_estimator.sagemaker_session.sagemaker_client
job_name = tf_estimator.latest_training_job.name
description = client.describe_training_job(TrainingJobName=job_name)
print(f'Training job name: {job_name}')

def print_same_line(s):
    """Write a UTC-timestamped message over the current output line.

    The line is rewritten in place with a leading carriage return. Because a
    carriage return does not erase what was already printed, the new text is
    padded with spaces to the width of the previous message so that a shorter
    message leaves no trailing characters from a longer one.

    Args:
        s: Message to show after the ``HH:MM:SS: `` style timestamp prefix.
    """
    line = f'{time.strftime("%X", time.gmtime())}: {s}'
    # Width of the previous call's text, remembered on the function itself.
    previous_width = getattr(print_same_line, '_last_width', 0)
    sys.stdout.write('\r' + line.ljust(previous_width))
    sys.stdout.flush()
    print_same_line._last_width = len(line)

# Poll until the job has reached the training phase or has ended.
# Ending includes 'Failed' and 'Stopped': without them a job that dies before
# it starts training would keep this loop running forever.
terminal_job_statuses = {'Completed', 'Failed', 'Stopped'}
ready_secondary_statuses = {'Training', 'Completed'}

while (description['TrainingJobStatus'] not in terminal_job_statuses
       and description['SecondaryStatus'] not in ready_secondary_statuses):
    description = client.describe_training_job(TrainingJobName=job_name)
    primary_status = description['TrainingJobStatus']
    secondary_status = description['SecondaryStatus']
    print_same_line(f'Current job status: [PrimaryStatus: {primary_status}, SecondaryStatus: {secondary_status}]')
    # Stop right away once the awaited state is seen instead of sleeping first.
    if primary_status in terminal_job_statuses or secondary_status in ready_secondary_statuses:
        break
    time.sleep(15)

# Surface an unsuccessful end state instead of finishing silently.
if description['TrainingJobStatus'] in {'Failed', 'Stopped'}:
    failure_reason = description.get('FailureReason', 'no failure reason reported')
    print(f"\nTraining job {job_name} ended with status "
          f"{description['TrainingJobStatus']}: {failure_reason}")

Training job name: 2-GTC2020-custom-2020-03-22-22-25-33
22:28:19: Current job status: [PrimaryStatus: InProgress, SecondaryStatus: Training]ng]

In [38]:
# Summary of the rule jobs attached to the latest training job; shown as the cell output.
rule_summary = tf_estimator.latest_training_job.rule_job_summary()
rule_summary

#### Analyze Debug Data

In [39]:
from smdebug.trials import create_trial

# Open an smdebug trial on the debugger artifacts written by the latest job.
debug_artifacts_path = tf_estimator.latest_job_debugger_artifacts_path()
trial = create_trial(debug_artifacts_path)

In [40]:
# Tensor names available in the trial, with no collection filter applied.
saved_tensor_names = trial.tensor_names(collection=None)
saved_tensor_names

In [41]:
# Value of the first conv layer's kernel tensor at step 0, narrowed to the
# first entry along its two leading indices.
kernel_tensor = trial.tensor('conv2d_1/weights/conv2d_1/kernel:0')
kernel_at_step0 = kernel_tensor.value(step_num=0)
kernel_at_step0[0][0]