In [None]:
# %pip install -q --upgrade sagemaker smdebug awscli sagemaker-experiments

In [None]:
import time, os, sys
import sagemaker, boto3
import numpy as np
import pandas as pd
import itertools
from pprint import pprint

# A single boto3 session backs both the low-level SageMaker client (`sm`, handed to the
# smexperiments calls below) and the SageMaker SDK session (`sagemaker_session`).
sess = boto3.Session()
sm   = sess.client('sagemaker')
# Execution role; passed to the TensorFlow estimator as `role` further down.
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session(boto_session=sess)

# Download cifar10 dataset and upload to Amazon S3

In [None]:
# Run the local generator script with ./cifar10 as its data directory
# (presumably writes the CIFAR-10 TFRecord files there -- confirm against the script).
!python generate_cifar10_tfrecords.py --data-dir cifar10;
# Upload the local ./cifar10 directory under the 'datasets/cifar10-dataset' key prefix.
# `datasets` is later used as the S3 input for every training channel.
datasets = sagemaker_session.upload_data(path='cifar10', key_prefix='datasets/cifar10-dataset')

In [None]:
# Default bucket of the SageMaker session; used below for tracker artifacts,
# training output/code locations and the debugger output path.
bucket_name = sagemaker_session.default_bucket()
# If the dataset already exists in S3 (e.g. from an earlier run), skip the upload
# cell above and point `datasets` at it directly:
# datasets = f's3://{bucket_name}/datasets/cifar10-dataset'

# Create an experiment to track training trials

In [None]:
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

In [None]:
# One experiment per notebook run: the epoch-seconds suffix makes the name unique.
# The description records the hypothesis that the trials are meant to test.
training_experiment = Experiment.create(
    experiment_name=f"cifar10-training-experiment-{int(time.time())}",
    description="Hypothesis: Custom model architecture delivers higher validation accuracy for classification compared to ResNet50 and VGG on the CIFAR10 dataset",
    sagemaker_boto_client=sm,
)

# SageMaker training 

In [None]:
# Hyperparameters that stay fixed across all trials; each trial merges its own
# model/optimizer/epochs choice on top of these.
static_hyperparams = {
    'batch-size': 128,
    'learning-rate': 0.001,
    'weight-decay': 1e-6,
    'momentum': 0.9,
}

In [None]:
# Values to sweep for each trial-specific hyperparameter.
hyperparam_options = {
    'model': ['resnet', 'custom'],
    'optimizer': ['adam', 'sgd', 'rmsprop'],
    'epochs': [30, 60, 120],
}

# Split the options into parallel tuples of names and candidate-value lists ...
hypnames = tuple(hyperparam_options)
hypvalues = tuple(hyperparam_options.values())

# ... and expand them into the full grid: one dict per combination (2 * 3 * 3 = 18).
trial_hyperparameter_set = [
    dict(zip(hypnames, combination))
    for combination in itertools.product(*hypvalues)
]
trial_hyperparameter_set

In [None]:
# Experiment-level tracker: records the information shared by every trial.
# Its trial component (`exp_tracker.trial_component`) is attached to each Trial
# created in the training loop below.
with Tracker.create(display_name="experiment-metadata", 
                    artifact_bucket=bucket_name,
                    artifact_prefix=training_experiment.experiment_name,
                    sagemaker_boto_client=sm) as exp_tracker:
    # S3 location of the dataset used by all training jobs
    exp_tracker.log_input(name="cifar10-dataset", media_type="s3/uri", value=datasets)
    # Fixed hyperparameters first, then the per-trial option lists
    exp_tracker.log_parameters(static_hyperparams)
    exp_tracker.log_parameters(hyperparam_options)
    # Attach the dataset generator script as an artifact of the experiment
    exp_tracker.log_artifact(file_path='generate_cifar10_tfrecords.py')

In [None]:
from sagemaker.tensorflow import TensorFlow

# Metric definitions are identical for every trial, so build the list once
# instead of on every loop iteration.
# NOTE(review): the unanchored 'loss: ' / 'acc: ' patterns are substrings of the
# 'val_' and 'test_' ones -- confirm against the training script's log format that
# they cannot capture the wrong value.
metric_definitions = [{'Name': 'loss', 'Regex': 'loss: ([0-9\\.]+)'},
                      {'Name': 'acc', 'Regex': 'acc: ([0-9\\.]+)'},
                      {'Name': 'val_loss', 'Regex': 'val_loss: ([0-9\\.]+)'},
                      {'Name': 'val_acc', 'Regex': 'val_acc: ([0-9\\.]+)'},
                      {'Name': 'test_acc', 'Regex': 'test_acc: ([0-9\\.]+)'},
                      {'Name': 'test_loss', 'Regex': 'test_loss: ([0-9\\.]+)'}]

# Launch one training job per hyperparameter combination, each tracked as its own Trial.
for trial_hyp in trial_hyperparameter_set:
    # Combine static hyperparameters and trial specific hyperparameters
    hyperparams = {**static_hyperparams, **trial_hyp}

    # Create unique job name with hyperparameter and time
    time_append = int(time.time())
    hyp_append = "-".join([str(elm) for elm in trial_hyp.values()])
    job_name = f'cifar10-training-{hyp_append}-{time_append}'

    # Create a Tracker to track Trial specific hyperparameters
    with Tracker.create(display_name=f"trial-metadata-{time_append}",
                        artifact_bucket=bucket_name,
                        artifact_prefix=f"{training_experiment.experiment_name}/{job_name}",
                        sagemaker_boto_client=sm) as trial_tracker:
        trial_tracker.log_parameters(hyperparams)

    # Create a new Trial and associate both Trackers (experiment-level and
    # trial-level) with it
    tf_trial = Trial.create(
        trial_name=f'trial-{hyp_append}-{time_append}',
        experiment_name=training_experiment.experiment_name,
        sagemaker_boto_client=sm)
    tf_trial.add_trial_component(exp_tracker.trial_component)
    time.sleep(2) #To prevent ThrottlingException
    tf_trial.add_trial_component(trial_tracker.trial_component)

    # Create an experiment config that associates training job to the Trial
    experiment_config = {"ExperimentName": training_experiment.experiment_name,
                         "TrialName": tf_trial.trial_name,
                         "TrialComponentDisplayName": job_name}

    # Create a TensorFlow Estimator with the Trial specific hyperparameters
    # NOTE(review): train_instance_count / train_instance_type / script_mode look like
    # SageMaker Python SDK v1 argument names -- confirm the installed SDK version
    # still accepts them.
    tf_estimator = TensorFlow(entry_point='cifar10-training-sagemaker.py',
                              source_dir='code',
                              output_path=f's3://{bucket_name}/{training_experiment.experiment_name}/',
                              code_location=f's3://{bucket_name}/{training_experiment.experiment_name}',
                              role=role,
                              train_instance_count=1,
                              train_instance_type='ml.p3.2xlarge',
                              framework_version='1.15',
                              py_version='py3',
                              script_mode=True,
                              metric_definitions=metric_definitions,
                              sagemaker_session=sagemaker_session,
                              hyperparameters=hyperparams,
                              enable_sagemaker_metrics=True)

    # Launch a training job without blocking (wait=False) so all trials run
    # concurrently; every channel points at the same uploaded dataset prefix.
    tf_estimator.fit({'training': datasets,
                      'validation': datasets,
                      'eval': datasets},
                     job_name=job_name,
                     wait=False,
                     experiment_config=experiment_config)

    time.sleep(3) #To prevent ThrottlingException

In [None]:
from sagemaker.analytics import ExperimentAnalytics

experiment_name = training_experiment.experiment_name

# Pull every trial component of the experiment into one DataFrame.
trial_component_analytics = ExperimentAnalytics(
    sagemaker_session=sagemaker_session,
    experiment_name=experiment_name,
)
trial_comp_ds = trial_component_analytics.dataframe()

# Keep only the rows that report a final test accuracy (rows without one, e.g.
# presumably the Tracker components or unfinished jobs, are dropped), best first.
idx_jobs = ~trial_comp_ds['test_acc - Last'].isna()
trial_comp_ds_jobs = (
    trial_comp_ds.loc[idx_jobs]
    .sort_values('test_acc - Last', ascending=False)
    .copy()
)

# Bar label "<model>-<optimizer>"; the double quotes present in the reported
# hyperparameter values are stripped.
trial_comp_ds_jobs['col_names'] = (
    trial_comp_ds_jobs['model'] + '-' + trial_comp_ds_jobs['optimizer']
).str.replace('"', '', regex=False)

# Let pandas create the 15x10 inch figure itself so that no pyplot import is needed.
ax = trial_comp_ds_jobs.plot.bar('col_names', 'test_acc - Last', figsize=(15, 10))
fig = ax.get_figure()
trial_comp_ds_jobs[['TrialComponentName', 'test_acc - Last', 'model', 'batch-size', 'epochs', 'learning-rate', 'optimizer']]

In [None]:
from smdebug.trials import create_trial

def tensor_df(tname, smd_trial=None):
    """Return the recorded values of one debugger tensor as a two-column DataFrame.

    Args:
        tname: Name of the tensor to read; also used as the name of the value column.
        smd_trial: Object exposing ``tensor(name).values()`` (for example the result
            of ``create_trial``). When omitted, the module-level variable ``trial``
            is used, which was the only behaviour before this parameter existed;
            that global must then be defined or a ``NameError`` is raised.

    Returns:
        DataFrame with a ``steps`` column holding the keys of ``values()`` and a
        column named ``tname`` holding the corresponding values.
    """
    # Only fall back to the global when no trial object was passed explicitly.
    source = trial if smd_trial is None else smd_trial
    step_values = source.tensor(tname).values()
    df = pd.DataFrame.from_dict(step_values, orient='index', columns=[tname])
    # Move the keys out of the index into a regular 'steps' column.
    return df.reset_index().rename(columns={'index': 'steps'})

def trial_perf_curves(job_name, tname, experiment_name):
    """Load one tensor of one training job from its debugger output in S3.

    The debugger output is read from
    ``s3://<bucket_name>/<experiment_name>/<job_name>/debug-output``, where
    ``bucket_name`` is the module-level bucket variable.

    Returns:
        DataFrame indexed by the keys of the tensor's ``values()`` mapping, with a
        single column named ``tname``.
    """
    debug_output_uri = f's3://{bucket_name}/{experiment_name}/{job_name}/debug-output'
    step_values = create_trial(debug_output_uri).tensor(tname).values()
    return pd.DataFrame.from_dict(step_values, orient='index', columns=[tname])

def get_metric_dataframe(metric, trial_comp_ds, experiment_name):
    """Collect one metric curve per training job into a single wide DataFrame.

    Args:
        metric: Name of the debugger tensor to load for every job (e.g. 'val_acc').
        trial_comp_ds: DataFrame with a 'DisplayName' column; each value is used as
            the job name to load and as the resulting column name.
        experiment_name: Experiment name, forwarded to ``trial_perf_curves``.

    Returns:
        DataFrame with one column per job, aligned on the union of the per-job
        indexes. An empty DataFrame is returned when ``trial_comp_ds`` has no rows.
    """
    curves = []
    for tc_name in trial_comp_ds['DisplayName']:
        print(f'\nLoading training job: {tc_name}')
        print(f'--------------------------------\n')
        trial_perf = trial_perf_curves(tc_name, metric, experiment_name)
        trial_perf.columns = [tc_name]
        curves.append(trial_perf)

    # pd.concat rejects an empty list, so keep the "no jobs" result explicit.
    if not curves:
        return pd.DataFrame()
    # Concatenate once instead of re-copying a growing frame on every iteration.
    return pd.concat(curves, axis=1)

# Validation-accuracy curves of every job that reported a test accuracy.
val_acc_df = get_metric_dataframe('val_acc', trial_comp_ds_jobs, experiment_name)

# Replace the Trial names with the ones you want to plot, or leave the list empty to plot all jobs
jobs_to_plot = [
    'cifar10-training-adam-custom-120-1594536575',
    'cifar10-training-adam-custom-60-1594536571',
    'cifar10-training-rmsprop-custom-30-1594536622',
]

# Job names embed a launch timestamp, so names copied from an earlier run do not
# exist in a fresh experiment; skip those instead of failing with a KeyError.
missing_jobs = [name for name in jobs_to_plot if name not in val_acc_df.columns]
if missing_jobs:
    print(f'Skipping jobs not found in this experiment: {missing_jobs}')
selected_jobs = [name for name in jobs_to_plot if name in val_acc_df.columns]

# Fall back to all jobs when none of the requested names is available.
curves_to_plot = val_acc_df[selected_jobs] if selected_jobs else val_acc_df

# Let pandas create the 15x10 inch figure itself so that no pyplot import is needed.
ax = curves_to_plot.plot(style='-', figsize=(15, 10))
fig = ax.get_figure()