# ***Demo: TensorFlow + Amazon SageMaker***
* Train model on a single GPU
* Model hosting
* Large-scale distributed training
* Large-scale hyperparameter optimization

In [1]:
import time, os, sys
import sagemaker, boto3
import numpy as np
import pandas as pd

# A single boto3 session backs both the SageMaker SDK session and the
# low-level SageMaker client used below.
sess = boto3.Session()
sagemaker_session = sagemaker.Session(boto_session=sess)

# Low-level client; handed to the Experiment/Trial helpers later on.
sm = sess.client('sagemaker')

# IAM role passed to the estimators as their `role` argument.
role = sagemaker.get_execution_role()

# Download the CIFAR-10 dataset and upload it to Amazon S3

In [2]:
# Run the helper script with `cifar10` as its data directory. Per the cell
# output it downloads the CIFAR-10 archive and writes train/validation/eval
# TFRecord files under ./cifar10.
!python generate_cifar10_tfrecords.py --data-dir cifar10;
# Upload the local `cifar10` directory under the given key prefix. The return
# value is kept in `datasets` and later used as the common prefix of the
# train/validation/eval channel paths (presumably the S3 URI of the upload --
# confirm against the SageMaker SDK docs).
datasets = sagemaker_session.upload_data(path='cifar10', key_prefix='datasets/cifar10-dataset')




Download from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz and extract.
Successfully downloaded cifar-10-python.tar.gz 170498071 bytes.
Generating cifar10/train/train.tfrecords
Generating cifar10/validation/validation.tfrecords
Generating cifar10/eval/eval.tfrecords
Done!


In [3]:
# Folder names and the session's default bucket.
dataset_folder = 'datasets'
job_folder = 'jobs'
bucket_name = sagemaker_session.default_bucket()

# One input location per channel, all rooted at the prefix stored in
# `datasets` by the upload cell.
train_path, val_path, eval_path = (
    f'{datasets}/train',
    f'{datasets}/validation',
    f'{datasets}/eval',
)

# Create an experiment to track training trials

In [4]:
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent

# Experiment names must be unique, so an unconditional Experiment.create()
# fails the second time this notebook is run. Load the experiment when it
# already exists and only create it on the first run, which keeps the cell
# safe under "Restart Kernel -> Run All".
experiment_name = "sagemaker-training-experiments"
try:
    training_experiment = Experiment.load(
        experiment_name=experiment_name,
        sagemaker_boto_client=sm)
except sm.exceptions.ResourceNotFound:
    # First run: the experiment does not exist yet.
    training_experiment = Experiment.create(
        experiment_name=experiment_name,
        description="Experiment to track cifar10 training trials",
        sagemaker_boto_client=sm)

# SageMaker training on a single GPU instance

In [5]:
# Trial names must be unique, so re-running an unconditional Trial.create()
# raises once the trial exists. Re-use the existing trial when there is one.
single_gpu_trial_name = 'sagemaker-single-gpu-training'
try:
    single_gpu_trial = Trial.load(
        trial_name=single_gpu_trial_name,
        sagemaker_boto_client=sm,
    )
except sm.exceptions.ResourceNotFound:
    # First run: create the trial inside the tracking experiment.
    single_gpu_trial = Trial.create(
        trial_name=single_gpu_trial_name,
        experiment_name=training_experiment.experiment_name,
        sagemaker_boto_client=sm,
    )

# Passed to estimator.fit() so the training job is recorded as a trial
# component of this trial.
trial_comp_name = 'single-gpu-training-job'
experiment_config = {"ExperimentName": training_experiment.experiment_name,
                     "TrialName": single_gpu_trial.trial_name,
                     "TrialComponentDisplayName": trial_comp_name}

In [10]:
from sagemaker.tensorflow import TensorFlow

# S3 prefix shared by the job output and the uploaded training code.
output_path = f's3://{bucket_name}/jobs'

# Name/regex pair handed to the estimator so that `val_acc` is captured as a
# job metric.
metric_definitions = [{'Name': 'val_acc', 'Regex': 'val_acc: ([0-9\\.]+)'}]

# Hyperparameters forwarded to the training script.
hyperparams = {
    'epochs': 10,
    'learning-rate': 0.01,
    'batch-size': 256,
    'weight-decay': 2e-4,
    'momentum': 0.9,
    'optimizer': 'sgd',
}

# Single-instance, single-GPU TensorFlow estimator in script mode.
estimator_args = dict(
    entry_point='cifar10-training-sagemaker.py',
    source_dir='training_scripts',
    output_path=f'{output_path}/',
    code_location=output_path,
    role=role,
    train_instance_count=1,
    train_instance_type='ml.p3.2xlarge',
    framework_version='1.15.2',
    py_version='py3',
    script_mode=True,
    metric_definitions=metric_definitions,
    sagemaker_session=sagemaker_session,
    hyperparameters=hyperparams,
)
tf_estimator = TensorFlow(**estimator_args)

# A UTC timestamp suffix gives every run a distinct job name.
timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
job_name = f'tensorflow-single-gpu-{timestamp}'

# One input channel per dataset split.
input_channels = {
    'training': train_path,
    'validation': val_path,
    'eval': eval_path,
}
tf_estimator.fit(input_channels,
                 job_name=job_name,
                 experiment_config=experiment_config)

INFO:sagemaker:Creating training-job with name: tensorflow-single-gpu-2020-06-17-19-27-43


2020-06-17 19:27:44 Starting - Starting the training job...
2020-06-17 19:27:47 Starting - Launching requested ML instances.........
2020-06-17 19:29:24 Starting - Preparing the instances for training......
2020-06-17 19:30:45 Downloading - Downloading input data
2020-06-17 19:30:45 Training - Downloading the training image......
[0m
[34m2020-06-17 19:31:33,385 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2020-06-17 19:31:33,779 sagemaker-containers INFO     Invoking user script
[0m
[34mTraining Env:
[0m
[34m{
    "additional_framework_parameters": {},
    "channel_input_dirs": {
        "eval": "/opt/ml/input/data/eval",
        "training": "/opt/ml/input/data/training",
        "validation": "/opt/ml/input/data/validation"
    },
    "current_host": "algo-1",
    "framework_module": "sagemaker_tensorflow_container.training:main",
    "hosts": [
        "algo-1"
    ],
    "hyperparameters": {
        "momentum": 0.9,
        

In [11]:
# Host the trained model on one CPU instance with an Elastic Inference
# accelerator attached.
# NOTE(review): no cell visible here deletes this endpoint; confirm it is
# cleaned up elsewhere, since a running endpoint keeps incurring cost.
predictor = tf_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    accelerator_type='ml.eia1.medium',
)

INFO:sagemaker:Creating model with name: tensorflow-single-gpu-2020-06-17-19-27-43
INFO:sagemaker:Creating endpoint with name tensorflow-single-gpu-2020-06-17-19-27-43


-------------!

### Blog post: A quick guide to using Spot Instances
https://towardsdatascience.com/a-quick-guide-to-using-spot-instances-with-amazon-sagemaker-b9cfb3a44a68

### Blog post: A quick guide to debugging models with SageMaker Debugger
https://towardsdatascience.com/how-to-debug-machine-learning-models-to-catch-issues-early-and-often-5663f2b4383b

# Distributed training

In [None]:
# Trial names must be unique, so re-running an unconditional Trial.create()
# raises once the trial exists. Re-use the existing trial when there is one.
distributed_trial_name = 'sagemaker-distributed-training'
try:
    distributed_training_trial = Trial.load(
        trial_name=distributed_trial_name,
        sagemaker_boto_client=sm,
    )
except sm.exceptions.ResourceNotFound:
    # First run: create the trial inside the tracking experiment.
    distributed_training_trial = Trial.create(
        trial_name=distributed_trial_name,
        experiment_name=training_experiment.experiment_name,
        sagemaker_boto_client=sm,
    )

# Passed to estimator.fit() so the distributed job is recorded as a trial
# component of this trial.
trial_comp_name = 'distributed-training-job'
experiment_config = {"ExperimentName": training_experiment.experiment_name,
                     "TrialName": distributed_training_trial.trial_name,
                     "TrialComponentDisplayName": trial_comp_name}

In [None]:
# Cluster shape: 2 instances x 8 processes per host = 16 worker processes
# (one per GPU, per the original author's note).
hvd_instance_count = 2
hvd_processes_per_host = 8
hvd_instance_type = 'ml.p3.16xlarge'

# Hyperparameters forwarded to the distributed training script.
hyperparams = {
    'epochs': 100,
    'learning-rate': 0.001,
    'momentum': 0.9,
    'weight-decay': 2e-4,
    'optimizer': 'adam',
    'batch-size': 256,
}

# MPI launch settings handed to the estimator via `distributions`.
mpi_settings = {
    'enabled': True,
    'processes_per_host': hvd_processes_per_host,
    'custom_mpi_options': '-verbose --NCCL_DEBUG=INFO -x OMPI_MCA_btl_vader_single_copy_mechanism=none',
}
distributions = {'mpi': mpi_settings}

# Multi-instance TensorFlow estimator in script mode.
hvd_estimator_args = dict(
    entry_point='cifar10-distributed-training-sagemaker.py',
    source_dir='training_scripts',
    output_path=f'{output_path}/',
    code_location=output_path,
    role=role,
    train_instance_count=hvd_instance_count,
    train_instance_type=hvd_instance_type,
    train_volume_size=50,
    framework_version='1.15',
    py_version='py3',
    script_mode=True,
    metric_definitions=metric_definitions,
    sagemaker_session=sagemaker_session,
    hyperparameters=hyperparams,
    distributions=distributions,
)
hvd_estimator = TensorFlow(**hvd_estimator_args)

# One input channel per dataset split; no explicit job name is supplied here.
hvd_input_channels = {
    'training': train_path,
    'validation': val_path,
    'eval': eval_path,
}
hvd_estimator.fit(hvd_input_channels, experiment_config=experiment_config)

### Blog post: A quick guide to distributed training with TensorFlow and Horovod on Amazon SageMaker
https://towardsdatascience.com/a-quick-guide-to-distributed-training-with-tensorflow-and-horovod-on-amazon-sagemaker-dae18371ef6e

# Hyperparameter optimization

In [None]:
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

# Search space explored by the tuner. Categorical values are given as strings.
hyperparameter_ranges = {
    'epochs'        : IntegerParameter(5, 30),
    'learning-rate' : ContinuousParameter(0.001, 0.1, scaling_type='Logarithmic'),
    'batch-size'    : CategoricalParameter(['128', '256', '512']),
    'momentum'      : ContinuousParameter(0.9, 0.99),
    'optimizer'     : CategoricalParameter(['sgd', 'adam'])
}

# The tuner maximizes the validation accuracy extracted by this regex.
objective_metric_name = 'val_acc'
objective_type = 'Maximize'
metric_definitions = [{'Name': 'val_acc', 'Regex': 'val_acc: ([0-9\\.]+)'}]

# Only `weight-decay` is not tuned, so it is the only fixed hyperparameter the
# estimator needs. It is spelled out here (same 2e-4 value used by the earlier
# training cells) rather than re-using the notebook-wide `hyperparams`
# variable, whose contents depend on which earlier cell ran last.
static_hyperparams = {'weight-decay': 2e-4}

tf_estimator = TensorFlow(entry_point          = 'cifar10-training-sagemaker.py',
                          source_dir           = 'training_scripts',
                          output_path          = f'{output_path}/',
                          code_location        = output_path,
                          role                 = role,
                          train_instance_count = 1,
                          train_instance_type  = 'ml.p3.2xlarge',
                          framework_version    = '1.15',
                          py_version           = 'py3',
                          script_mode          = True,
                          metric_definitions   = metric_definitions,
                          sagemaker_session    = sagemaker_session,
                          hyperparameters      = static_hyperparams)

# The parallel limit cannot meaningfully exceed the total job budget. The
# original values (max_jobs=8, max_parallel_jobs=16) appear transposed; with
# 16 jobs run 8 at a time, the second wave can build on the first wave's
# results.
tuner = HyperparameterTuner(estimator             = tf_estimator,
                            objective_metric_name = objective_metric_name,
                            hyperparameter_ranges = hyperparameter_ranges,
                            metric_definitions    = metric_definitions,
                            max_jobs              = 16,
                            max_parallel_jobs     = 8,
                            objective_type        = objective_type)

# Same three input channels as the single-GPU training job.
tuner.fit({'training'  : train_path,
           'validation': val_path,
           'eval'      : eval_path})