In [None]:
# Optional one-time setup: uncomment the lines below to upgrade pip and
# install/upgrade the SageMaker-related packages before running the notebook.
# !pip install -q --upgrade pip
# !pip install -q sagemaker smdebug awscli sagemaker-experiments --upgrade

In [1]:
import os
import numpy as np
import time
import sys
import sagemaker
import boto3
import matplotlib.pyplot as plt
import pandas as pd

from sagemaker.session import s3_input
from sagemaker.debugger import Rule, DebuggerHookConfig, TensorBoardOutputConfig, CollectionConfig, rule_configs
from sagemaker.tensorflow import TensorFlow

from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

# A single boto3 session backs both the low-level SageMaker client (`sm`)
# and the high-level SageMaker SDK session (`sagemaker_session`).
sess = boto3.Session()
sagemaker_session = sagemaker.Session(boto_session=sess)
sm = sess.client('sagemaker')

# Execution role; it is handed to the estimator further down.
role = sagemaker.get_execution_role()

#### Experiment name and dataset path

In [2]:
# Names used throughout the notebook.
experiment_name = 'sagemaker-debugger-cifar10-experiment'
bucket_name = 'sagemaker-jobs-studio'
job_folder = 'jobs'
dataset_folder = 'datasets'

# The three input channels share one CIFAR-10 prefix and differ only in the
# final path component.
train_path, val_path, eval_path = [
    f's3://{bucket_name}/{dataset_folder}/cifar10-dataset/{split}'
    for split in ('train', 'validation', 'eval')
]

#### Create a new experiment, if one doesn't exist
Experiments -> Trials -> Trial Components

In [3]:
# Reuse the experiment named `experiment_name` when it already exists and
# create it otherwise. The name comes from the config cell in every branch
# (the create call previously repeated it as a literal, so editing the config
# would have made lookup and creation disagree), and the same SageMaker
# client `sm` is passed to list/load/create so all three calls go through
# the same boto3 session.
exp_exists = any(
    exp.experiment_name == experiment_name
    for exp in Experiment.list(sagemaker_boto_client=sm)
)

if exp_exists:
    debugger_experiment = Experiment.load(
        experiment_name=experiment_name,
        sagemaker_boto_client=sm,
    )
else:
    debugger_experiment = Experiment.create(
        experiment_name=experiment_name,
        description="Experiment to track cifar10 debugger trials",
        sagemaker_boto_client=sm,
    )

#### Specify hyperparameters

In [4]:
# Hyperparameters handed to the estimator via its `hyperparameters` argument.
hyperparams = {
    'epochs': 200,
    'learning-rate': 0.01,
    'batch-size': 256,
    'weight-decay': 2e-4,
    'momentum': 0.9,
    'optimizer': 'adam',
    'model-type': 'resnet',
}

#### Specify SageMaker Debugger Rules
https://docs.aws.amazon.com/sagemaker/latest/dg/debugger-built-in-rules.html

In [5]:
# Built-in Debugger rules attached to the training job, each with its
# default configuration.
debug_rules = [
    Rule.sagemaker(make_rule_config())
    for make_rule_config in (
        rule_configs.overtraining,
        rule_configs.overfit,
        rule_configs.loss_not_decreasing,
    )
]

#### Create a SageMaker TensorFlow estimator

In [6]:
# S3 prefix for job output and the uploaded source bundle. It is built from
# `job_folder` (config cell) instead of repeating the literal 'jobs', so the
# config value is the single place to change the destination.
output_path = f's3://{bucket_name}/{job_folder}'
train_instance_type = 'ml.p3.2xlarge'

# NOTE(review): `train_instance_count`, `train_instance_type` and
# `script_mode` look like SageMaker Python SDK v1 argument names -- confirm
# the installed SDK version still accepts them.
tf_estimator = TensorFlow(
    entry_point='cifar10-training-sagemaker.py',
    source_dir='code',
    output_path=output_path + '/',
    code_location=output_path,
    role=role,
    train_instance_count=1,
    train_instance_type=train_instance_type,
    framework_version='1.15',
    py_version='py3',
    script_mode=True,
    sagemaker_session=sagemaker_session,
    hyperparameters=hyperparams,
    rules=debug_rules,  # Debugger rules evaluated alongside the training job
)

#### Create a Trial, and associate it with your Experiment
An Experiment can have multiple trials

In [7]:
# One UTC timestamp is shared by the trial name and the job name.
time_append = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
trial_name = f'1-GTC2020-trial-{time_append}'

# Register a new trial under the experiment.
cifar10_trial = Trial.create(
    experiment_name=debugger_experiment.experiment_name,
    trial_name=trial_name,
    sagemaker_boto_client=sm,
)

# The job name doubles as the display name of the trial component.
job_name = f'1-GTC2020-{hyperparams["model-type"]}-{time_append}'
experiment_config = dict(
    ExperimentName=debugger_experiment.experiment_name,
    TrialName=cifar10_trial.trial_name,
    TrialComponentDisplayName=job_name,
)

In [8]:
# Launch the training job without blocking the notebook (wait=False); the
# job is tied to the experiment/trial through `experiment_config`.
tf_estimator.fit(
    inputs=dict(training=train_path, validation=val_path, eval=eval_path),
    job_name=job_name,
    wait=False,
    experiment_config=experiment_config,
)

INFO:sagemaker:Creating training-job with name: 1-GTC2020-resnet-2020-03-22-22-24-58


In [9]:
# Look up the most recently launched training job and fetch its current
# description through the estimator's SageMaker client.
client = tf_estimator.sagemaker_session.sagemaker_client
job_name = tf_estimator.latest_training_job.name
description = client.describe_training_job(TrainingJobName=job_name)
print(f'Training job name: {job_name}')

def print_same_line(s):
    """Write a UTC-timestamped status message over the current output line.

    The message is written after a carriage return so that successive calls
    overwrite each other. When the new message is shorter than the previous
    one, it is padded with spaces up to the previous length; without that,
    the tail of the longer message stays visible (e.g. "Training]ng]").

    Args:
        s: Text to show after the "HH:MM:SS: " prefix.
    """
    line = f'{time.strftime("%X", time.gmtime())}: {s}'
    # Length of the previous message is remembered on the function object.
    previous_len = getattr(print_same_line, '_last_len', 0)
    padding = ' ' * max(0, previous_len - len(line))
    sys.stdout.write(f'\r{line}{padding}')
    sys.stdout.flush()
    print_same_line._last_len = len(line)

# Poll every 15 seconds until the job has reached the 'Training' phase or
# has finished. 'Failed' and 'Stopped' also end the loop: a job that dies
# before training starts never reports 'Training'/'Completed', so waiting
# only for those two secondary statuses would spin forever.
while True:
    primary_status = description['TrainingJobStatus']
    secondary_status = description['SecondaryStatus']
    print_same_line(f'Current job status: [PrimaryStatus: {primary_status}, SecondaryStatus: {secondary_status}]')
    if primary_status in {'Completed', 'Failed', 'Stopped'} or secondary_status in {'Training', 'Completed'}:
        break
    # Sleep only when another poll is needed, then refresh the description.
    time.sleep(15)
    description = client.describe_training_job(TrainingJobName=job_name)

# Surface an unsuccessful job here instead of letting later cells fail on
# missing debugger output.
if primary_status in {'Failed', 'Stopped'}:
    failure_reason = description.get('FailureReason', 'no failure reason reported')
    raise RuntimeError(f'Training job {job_name} ended with status {primary_status}: {failure_reason}')

Training job name: 1-GTC2020-resnet-2020-03-22-22-24-58
22:27:26: Current job status: [PrimaryStatus: InProgress, SecondaryStatus: Training]ng]

In [24]:
# Display the rule job summary reported for the latest training job
# (the rules are the ones passed to the estimator via `rules=debug_rules`).
tf_estimator.latest_training_job.rule_job_summary()

In [25]:
from smdebug.trials import create_trial
# Open the debugger artifacts of the latest training job as an smdebug trial.
trial = create_trial(tf_estimator.latest_job_debugger_artifacts_path())
# Alternative kept for reference: open the debug output of an earlier job
# directly from its S3 path.
# # trial = create_trial("s3://sagemaker-jobs-studio/jobs/tf-smdebug-job-2020-03-22-10-56-39/debug-output")

In [26]:
# Show the tensor names available in the trial; 'loss' and 'val_loss' are
# the ones read in the cells below.
trial.tensor_names()

In [17]:
def tensor_df(tname):
    """Return the recorded values of one tensor as a two-column DataFrame.

    Reads the tensor from the notebook-level `trial` object.

    Args:
        tname: Name of the tensor to look up in `trial`.

    Returns:
        DataFrame with a 'steps' column (the keys of the tensor's values
        mapping) and a column named after `tname` holding the values.
    """
    values_by_step = trial.tensor(tname).values()
    return (
        pd.DataFrame.from_dict(values_by_step, orient='index', columns=[tname])
        .reset_index()
        .rename(columns={'index': 'steps'})
    )

In [27]:
# Pull validation and training loss into per-step DataFrames.
df_val_loss, df_loss = [tensor_df(tensor_name) for tensor_name in ('val_loss', 'loss')]

In [21]:
# Number of leading rows dropped from each curve before plotting.
# NOTE(review): presumably this hides the initial loss spike so it does not
# dominate the y-axis -- confirm.
index_to_start = 5

# Single axes; the figure size is set at creation (sharey is meaningless on a
# 1x1 grid, so it is not passed).
fig, ax = plt.subplots(figsize=(10, 5))

df_val_loss.iloc[index_to_start:].plot(x='steps', y='val_loss', ax=ax, style='-')
df_loss.iloc[index_to_start:].plot(x='steps', y='loss', ax=ax, style='.')

# Label the figure so it can be read on its own.
ax.set(xlabel='steps', ylabel='loss', title='Training vs. validation loss');

# Optional: fill in the step at which each Debugger rule fired and uncomment
# to mark it on the plot.
# overfit_rule = 
# ax.axvline(x=overfit_rule, c='#1f77b4', ls='--')
# ax.text(overfit_rule+2e2,1.6,'Overfit Rule')

# overtraining_rule = 
# ax.axvline(x=overtraining_rule, c='r', alpha=0.6, ls='--')
# _ = ax.text(overtraining_rule+2e2,1.6,'Overtraining Rule')

Overfitting rule<br>https://docs.aws.amazon.com/sagemaker/latest/dg/overfit.html

Overtraining rule<br>https://docs.aws.amazon.com/sagemaker/latest/dg/overtraining.html