In [48]:
import os
import time
import numpy as np
import sagemaker
import tqdm

# S3 bucket and key prefix shared by the cells below.
bucket_name = 'cloud-csce678-dml-bucket'
jobs_folder = 'jobs'

# SageMaker session handle, plus the execution role later passed to the estimator.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

In [49]:
# Horovod cluster layout: instance type, number of instances, and MPI
# processes launched on each instance.
hvd_instance_type = 'ml.m4.xlarge'
hvd_instance_count = 2
hvd_processes_per_host = 2

# NOTE(review): the second line is worded "GPU(s) per instance" but the value
# printed is hvd_processes_per_host; confirm the wording fits the chosen
# instance type.
print(
    f'Distributed training with a total of {hvd_processes_per_host*hvd_instance_count} workers: {hvd_instance_count} instances of {hvd_instance_type}',
    f'{hvd_processes_per_host} GPU(s) per instance',
    sep='\n',
)

Distributed training with a total of 4 workers: 2 instances of ml.m4.xlarge
2 GPU(s) per instance


In [50]:
# Job name: cluster layout plus a UTC timestamp (the trailing %j is the day of the year).
job_name = f'tf-horovod-resnet50-{hvd_instance_count}x{hvd_processes_per_host}-workers-{time.strftime("%Y-%m-%d-%H-%M-%S-%j", time.gmtime())}'

# S3 destinations: the shared jobs prefix and this job's TensorBoard log prefix.
output_path = f's3://{bucket_name}/{jobs_folder}'
tboard_logs = f's3://{bucket_name}/tensorboard_logs/{job_name}'

# Metric definition handed to the estimator: a name plus the regex meant to
# capture `val_acc` values.
metric_definitions = [
    {
        'Name': 'val_acc',
        'Regex': 'val_acc: ([0-9\\.]+)',
    },
]

# The TensorBoard location is merged into the hyperparameters below.
sm_config = {'tensorboard_logs': tboard_logs}

hyperparameters = {
    'epochs': 100,
    'learning-rate': 0.001,
    'momentum': 0.9,
    'weight-decay': 2e-4,
    'optimizer': 'adam',
    'batch-size': 256,
}
hyperparameters.update(sm_config)

In [51]:
# Show the merged hyperparameters (training settings plus the TensorBoard log
# location) before they are handed to the estimator.
print(hyperparameters)

{'epochs': 100, 'learning-rate': 0.001, 'momentum': 0.9, 'weight-decay': 0.0002, 'optimizer': 'adam', 'batch-size': 256, 'tensorboard_logs': 's3://cloud-csce678-dml-bucket/tensorboard_logs/tf-horovod-resnet50-2x2-workers-2020-04-25-03-47-07-116'}


In [52]:

# MPI launch settings passed to the estimator's `distributions` argument.
distributions = dict(
    mpi=dict(
        enabled=True,
        processes_per_host=hvd_processes_per_host,
        custom_mpi_options='-verbose --NCCL_DEBUG=INFO -x OMPI_MCA_btl_vader_single_copy_mechanism=none',
    ),
)


In [53]:
from sagemaker.tensorflow import TensorFlow
# TensorFlow estimator for the distributed job: training script, S3 output
# locations, cluster size, framework version, and the MPI settings.
hvd_estimator = TensorFlow(
    # Training script and the local folder it lives in.
    entry_point='csce678_project.py',
    source_dir='code',
    # Both locations sit under the shared jobs prefix; output_path carries a
    # trailing slash, code_location does not.
    output_path=output_path + '/',
    code_location=output_path,
    role=role,
    # Cluster shape and per-instance volume size.
    train_instance_count=hvd_instance_count,
    train_instance_type=hvd_instance_type,
    train_volume_size=50,
    # Framework / runtime selection.
    framework_version='1.15',
    py_version='py3',
    script_mode=True,
    # Metrics, hyperparameters and distribution settings defined in earlier cells.
    metric_definitions=metric_definitions,
    hyperparameters=hyperparameters,
    distributions=distributions,
)

In [54]:
# S3 prefixes of the three dataset splits.
train_path = f's3://{bucket_name}/fruit_dataset/train'
val_path = f's3://{bucket_name}/fruit_dataset/validation'
eval_path = f's3://{bucket_name}/fruit_dataset/eval'

# Start training under the name built earlier, one input channel per split.
# wait=False is passed so the call is not expected to block until the job ends.
hvd_estimator.fit(
    {
        'train': train_path,
        'validation': val_path,
        'eval': eval_path,
    },
    job_name=job_name,
    wait=False,
)

In [11]:
# Display where the estimator's source bundle was uploaded.
# NOTE(review): this cell's execution count and recorded output refer to an
# earlier job than the one configured above; re-run it before relying on the
# displayed location.
hvd_estimator.uploaded_code

UserCode(s3_prefix='s3://cloud-csce678-dml-bucket/jobs/tf-horovod-resnet50-1x2-workers-2020-04-23-03-15-56-114/source/sourcedir.tar.gz', script_name='csce678_project.py')

In [55]:
# Serve TensorBoard over everything under the bucket's tensorboard_logs/ prefix.
# The command stays in the foreground until it is interrupted, so no later cell
# runs while it is active.
# NOTE(review): the region is hard-coded here; confirm it matches the bucket.
!S3_REGION=us-east-2 tensorboard --logdir s3://{bucket_name}/tensorboard_logs/


TensorBoard 1.15.0 at http://ip-172-16-59-129:6006/ (Press CTRL+C to quit)
W0425 04:08:08.489317 139914187458304 plugin_event_multiplexer.py:250] Deleting accumulator 'tf-horovod-resnet50-1x2-workers-2020-04-23-03-07-35-114'
W0425 04:08:08.490341 139914187458304 plugin_event_multiplexer.py:250] Deleting accumulator 'tf-horovod-resnet50-2x2-workers-2020-04-25-03-24-32-116'
W0425 04:08:08.490427 139914187458304 plugin_event_multiplexer.py:250] Deleting accumulator 'tf-horovod-resnet50-2x2-workers-2020-04-25-02-51-35-116'
W0425 04:08:08.490502 139914187458304 plugin_event_multiplexer.py:250] Deleting accumulator 'tf-horovod-resnet50-1x2-workers-2020-04-23-03-15-56-114'
W0425 04:08:08.490571 139914187458304 plugin_event_multiplexer.py:250] Deleting accumulator 'tf-horovod-resnet50-2x2-workers-2020-04-23-03-27-05-114'
W0425 04:08:08.490638 139914187458304 plugin_event_multiplexer.py:250] Deleting accumulator 'tf-horovod-resnet50-2x2-workers-2020-04-25-03-05-42-116'
^C


In [None]:
import os 

In [None]:
# Relative directories to scan for each fruit class. Only the Apple
# directories are listed; the Guava and Kiwi lists are still empty.
apple_dir_names = ['Apple/A', 'Apple/B', 'Apple/C']
guava_dir_names = []
kiwi_dir_names = []

def get_filenames_from_directory_list(directory_list, include_directory=False):
    """Collect the entry names found in each directory of ``directory_list``.

    Directories are visited in the order given. ``os.listdir`` returns entries
    in arbitrary order, so each directory's entries are sorted to keep the
    result reproducible across runs and machines.

    Args:
        directory_list: Iterable of directory paths to scan (non-recursively).
        include_directory: When True, every entry is joined to the directory
            it came from, so same-named files in different directories stay
            distinguishable and can be opened directly. Defaults to False,
            which returns bare entry names.

    Returns:
        A flat list of entry names (or joined paths), grouped by directory in
        the order given; empty when ``directory_list`` is empty.

    Raises:
        OSError: Propagated from ``os.listdir``, e.g. ``FileNotFoundError``
            when a directory does not exist.
    """
    paths = []
    for directory in directory_list:
        # Sort per directory so grouping by directory is preserved.
        entries = sorted(os.listdir(directory))
        if include_directory:
            entries = [os.path.join(directory, entry) for entry in entries]
        paths.extend(entries)
    return paths

# Gather the directory listings for each fruit class, in the same order as before:
# Apple, then Guava, then Kiwi.
apple_paths, guava_paths, kiwi_paths = (
    get_filenames_from_directory_list(dir_names)
    for dir_names in (apple_dir_names, guava_dir_names, kiwi_dir_names)
)