In [1]:
%matplotlib inline
import os
import time
import numpy as np
import sagemaker
from sagemaker.session import s3_input

# S3 bucket that the later cells use for the dataset channels and job output.
bucket_name = "tfworld19-distributed-training"

# SageMaker session handle, plus the execution role that is later passed to
# the estimator as `role`.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

In [2]:
from sagemaker.tensorflow import TensorFlow

# Cluster shape for the Horovod job: instance type, number of instances, and
# MPI processes started on each instance.
hvd_instance_type = 'ml.p3.2xlarge'
hvd_instance_count = 2
hvd_processes_per_host = 1

# Bucket root, given to the estimator as `output_path`.
output_path = 's3://{bucket}/'.format(bucket=bucket_name)

# MPI settings handed to the estimator through `distributions`.
mpi_settings = {
    'enabled': True,
    'processes_per_host': hvd_processes_per_host,
    'custom_mpi_options': '-verbose --NCCL_DEBUG=INFO -x OMPI_MCA_btl_vader_single_copy_mechanism=none',
}
distributions = {'mpi': mpi_settings}

# Hyperparameters given to the estimator.
hyperparameters = {
    'epochs': 100,
    'batch-size': 256,
}

# Key/Value tags attached to the job.
job_tags = [
    {'Key': tag_key, 'Value': tag_value}
    for tag_key, tag_value in (('Project', 'cifar10'), ('TensorBoard', 'dist'))
]

# Single metric definition: a name and the regex registered for it.
val_acc_metric = {'Name': 'val_acc', 'Regex': 'val_acc: ([0-9\\.]+)'}

estimator_hvd = TensorFlow(
    entry_point='cifar10-multi-gpu-horovod-sagemaker.py',
    source_dir='code',
    base_job_name='hvd-cifar10-tf',
    role=role,
    framework_version='1.14',
    py_version='py3',
    train_instance_type=hvd_instance_type,
    train_instance_count=hvd_instance_count,
    hyperparameters=hyperparameters,
    distributions=distributions,
    metric_definitions=[val_acc_metric],
    tags=job_tags,
    output_path=output_path,
)

In [3]:
# S3 prefixes for the three input channels handed to fit() below.
train_path = 's3://{}/cifar10-dataset/train'.format(bucket_name)
val_path = 's3://{}/cifar10-dataset/validation'.format(bucket_name)
eval_path = 's3://{}/cifar10-dataset/eval/'.format(bucket_name)

# Job name = fixed prefix + UTC timestamp. The '-' before the timestamp keeps
# the prefix from running straight into the year (the name used to read
# 'sm-dist-training-job2019-...').
job_name = 'sm-dist-training-job-' + time.strftime('%Y-%m-%d-%H-%M-%S-%j', time.gmtime())

# wait=False: the call is asked not to block while the job runs, so the
# following cells can be executed while training is still in progress.
estimator_hvd.fit(
    {'train': train_path, 'validation': val_path, 'eval': eval_path},
    job_name=job_name,
    wait=False,
)

Launch TensorBoard with the cell below, then open the following link in a new browser tab to visualize training progress.

In [4]:
# Start TensorBoard against the model/ prefix of this job in the output bucket.
# {bucket_name} and {job_name} are filled in by IPython from the notebook
# variables defined in the earlier cells.
# The command stays in the foreground until interrupted (its startup banner
# reads "Press CTRL+C to quit") and serves on port 6006.
# NOTE(review): S3_REGION is presumably read by TensorFlow's S3 file reader;
# confirm us-west-2 is the region of the bucket.
!S3_REGION=us-west-2 tensorboard --logdir s3://{bucket_name}/{job_name}/model/

https://tfworld19-dist-training-workshop.notebook.us-west-2.sagemaker.aws/proxy/6006/


In [None]:
https://tf-world-instance.notebook.us-west-2.sagemaker.aws/proxy/6006/

TensorBoard 1.14.0 at http://ip-172-16-11-53:6006/ (Press CTRL+C to quit)


In [None]:
import matplotlib.pyplot as plt
def plot_training_curves(history):
    """Plot training vs. validation accuracy and loss in two side-by-side panels.

    Parameters
    ----------
    history : mapping
        Must provide the keys 'acc', 'val_acc', 'loss' and 'val_loss'. Each
        value is passed directly to ``Axes.plot`` (one point per entry).

    The left panel shows accuracy and the right panel shows loss; both share
    the x axis, labelled 'epoch'. Nothing is returned.
    """
    # (title, y-axis label, training key, validation key) for each panel.
    panel_specs = (
        ('model accuracy', 'accuracy', 'acc', 'val_acc'),
        ('model loss', 'loss', 'loss', 'val_loss'),
    )

    fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharex=True)
    for panel, (panel_title, y_label, train_key, val_key) in zip(axes, panel_specs):
        panel.plot(history[train_key], label='train')
        panel.plot(history[val_key], label='validation')
        panel.set(title=panel_title, ylabel=y_label, xlabel='epoch')
        panel.legend()
    fig.tight_layout()

In [None]:
import json
!aws s3 cp {estimator_hvd.model_data} ./hvd_model/model.tar.gz
!tar -xzf ./hvd_model/model.tar.gz -C ./hvd_model

with open('./hvd_model/hvd_history.p', "r") as f:
    hvd_history = json.load(f)
    
plot_training_curves(hvd_history)