In [1]:
# Execute the companion notebook in this kernel before anything else.
# NOTE(review): presumably it removes leftover experiments so the Experiment.create
# call later in this notebook does not hit an already-existing name — confirm.
%run ./clear_experiments.ipynb

Current experiments:


In [2]:
import time, os, sys
import sagemaker, boto3
import numpy as np
import pandas as pd
import itertools
from pprint import pprint

# One boto3 session is shared by everything below so that all clients use the
# same credentials/region configuration.
sess = boto3.Session()

# High-level SageMaker SDK session (S3 helpers, estimators) bound to that boto3 session.
sagemaker_session = sagemaker.Session(boto_session=sess)

# Low-level SageMaker API client; handed to the smexperiments objects.
sm = sess.client('sagemaker')

# IAM role the training jobs will run under.
role = sagemaker.get_execution_role()

# Download the CIFAR-10 dataset and upload it to Amazon S3

In [3]:
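# One-time setup: uncomment and run the two lines below to generate the CIFAR-10 data
# locally and upload it under the 'datasets/cifar10-dataset' prefix of the default bucket.
# !python generate_cifar10_tfrecords.py --data-dir cifar10;
# datasets = sagemaker_session.upload_data(path='cifar10', key_prefix='datasets/cifar10-dataset')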

In [4]:
# Resolve the session's default S3 bucket and point `datasets` at the CIFAR-10 prefix.
# The upload that would populate this prefix is commented out in the previous cell, so
# the data is assumed to already be in S3 — TODO confirm before running on a fresh account.
bucket_name = sagemaker_session.default_bucket()
datasets = f's3://{bucket_name}/datasets/cifar10-dataset'

# Create an experiment to track training trials

In [5]:
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

In [6]:
# Create the experiment under which every training trial in this notebook is grouped.
training_experiment = Experiment.create(
    experiment_name="cifar10-training-experiment",
    description="Experiment to track cifar10 training trials",
    sagemaker_boto_client=sm,
)

# SageMaker training 

In [7]:
# Hyperparameters held constant across all trials; merged with the per-trial
# values before each training job is launched.
static_hyperparams = {
    'epochs': 30,
    'learning-rate': 0.01,
    'weight-decay': 2e-4,
    'momentum': 0.9,
}

In [8]:
# Search space: each key maps to the candidate values tried for that hyperparameter.
hyperparam_options = {
    'batch-size': [64, 256],
    'optimizer': ['adam', 'sgd'],
    'model': ['resnet', 'vgg'],
}

# Names and value lists kept in matching order (dict insertion order).
hypnames = tuple(hyperparam_options.keys())
hypvalues = tuple(hyperparam_options.values())

# Full-factorial grid: one {name: value} dict per combination of candidate values.
trial_hyperparameter_set = [
    dict(zip(hypnames, combination))
    for combination in itertools.product(*hypvalues)
]
trial_hyperparameter_set

[{'batch-size': 64, 'optimizer': 'adam', 'model': 'resnet'},
 {'batch-size': 64, 'optimizer': 'adam', 'model': 'vgg'},
 {'batch-size': 64, 'optimizer': 'sgd', 'model': 'resnet'},
 {'batch-size': 64, 'optimizer': 'sgd', 'model': 'vgg'},
 {'batch-size': 256, 'optimizer': 'adam', 'model': 'resnet'},
 {'batch-size': 256, 'optimizer': 'adam', 'model': 'vgg'},
 {'batch-size': 256, 'optimizer': 'sgd', 'model': 'resnet'},
 {'batch-size': 256, 'optimizer': 'sgd', 'model': 'vgg'}]

In [9]:
# Record metadata shared by every trial in one tracker; its trial component is
# attached to each trial created in the training loop.
with Tracker.create(
    display_name="experiment-metadata",
    artifact_bucket=bucket_name,
    artifact_prefix=training_experiment.experiment_name,
    sagemaker_boto_client=sm,
) as exp_tracker:
    # S3 location of the dataset.
    exp_tracker.log_input(name="cifar10-dataset", media_type="s3/uri", value=datasets)

    # First the fixed hyperparameters, then the search space the trials are drawn from.
    for shared_params in (static_hyperparams, hyperparam_options):
        exp_tracker.log_parameters(shared_params)

    # Attach the companion notebook file as an artifact of the experiment.
    exp_tracker.log_artifact(file_path='clear_experiments.ipynb')

In [10]:
from sagemaker.tensorflow import TensorFlow

# Launch one SageMaker training job per hyperparameter combination (only the first
# two combinations of the grid), each tracked as its own trial in the experiment.
for trial_hyp in trial_hyperparameter_set[:2]:
    # Per-trial values are layered on top of the shared static hyperparameters.
    hyperparams = dict(static_hyperparams)
    hyperparams.update(trial_hyp)

    # The hyperparameter values plus a Unix timestamp name the job, trial and tracker.
    time_append = int(time.time())
    hyp_append = "-".join(map(str, trial_hyp.values()))
    job_name = f'training-{hyp_append}-{time_append}'

    # Trial-level metadata: the complete hyperparameter set used by this job.
    with Tracker.create(
        display_name=f"trial-metadata-{time_append}",
        artifact_bucket=bucket_name,
        artifact_prefix=f"{training_experiment.experiment_name}/{job_name}",
        sagemaker_boto_client=sm,
    ) as trial_tracker:
        trial_tracker.log_parameters(hyperparams)

    # Create the trial and attach both the experiment-wide and the trial-specific
    # tracker components to it.
    tf_trial = Trial.create(
        trial_name=f'trial-{hyp_append}-{time_append}',
        experiment_name=training_experiment.experiment_name,
        sagemaker_boto_client=sm,
    )
    for component in (exp_tracker.trial_component, trial_tracker.trial_component):
        tf_trial.add_trial_component(component)

    # Associates the training job started below with this experiment and trial.
    experiment_config = {
        "ExperimentName": training_experiment.experiment_name,
        "TrialName": tf_trial.trial_name,
        "TrialComponentDisplayName": job_name,
    }

    tf_estimator = TensorFlow(
        entry_point='cifar10-training-sagemaker.py',
        source_dir='code',
        output_path=f's3://{bucket_name}/{training_experiment.experiment_name}/',
        code_location=f's3://{bucket_name}/{training_experiment.experiment_name}',
        role=role,
        train_instance_count=1,
        train_instance_type='ml.p3.2xlarge',
        framework_version='1.15',
        py_version='py3',
        script_mode=True,
        # Metric extracted from the training log via the regex below.
        metric_definitions=[{'Name': 'val_acc', 'Regex': 'val_acc:([0-9\\.]+)'}],
        sagemaker_session=sagemaker_session,
        hyperparameters=hyperparams,
    )

    # All three input channels point at the same S3 dataset prefix.
    data_channels = {channel: datasets for channel in ('training', 'validation', 'eval')}

    # wait=False: submit the job and move on without blocking on its completion.
    tf_estimator.fit(
        data_channels,
        job_name=job_name,
        wait=False,
        experiment_config=experiment_config,
    )

    # NOTE(review): presumably spaces out submissions so consecutive int(time.time())
    # stamps (and therefore job/trial names) differ — confirm.
    time.sleep(2)

INFO:sagemaker:Creating training-job with name: training-64-adam-resnet-1593565145
INFO:sagemaker:Creating training-job with name: training-64-adam-vgg-1593565150


In [11]:
# [trial_component.trial_component_name for trial_component in tf_trial.list_trial_components()]

In [12]:
# from sagemaker.analytics import ExperimentAnalytics

# trial_component_analytics = ExperimentAnalytics(
#     sagemaker_session=sagemaker_session, 
#     experiment_name=training_experiment.experiment_name,
#     parameter_names=['model', 'batch-size', 'epochs', 'learning-rate', 'optimizer']
# )