From 372343b5efe87bc9ea59095ead43ee9b4d551d0f Mon Sep 17 00:00:00 2001
From: rafael
Date: Thu, 20 Dec 2018 12:54:14 +0100
Subject: [PATCH 1/2] add tf+horovod test

---
 .../apps/tensorflow/tf_horovod_check.py | 54 +++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 cscs-checks/apps/tensorflow/tf_horovod_check.py

diff --git a/cscs-checks/apps/tensorflow/tf_horovod_check.py b/cscs-checks/apps/tensorflow/tf_horovod_check.py
new file mode 100644
index 0000000000..9387ad11a8
--- /dev/null
+++ b/cscs-checks/apps/tensorflow/tf_horovod_check.py
@@ -0,0 +1,54 @@
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+
+@rfm.simple_test
+class TensorFlowHorovodTest(rfm.RunOnlyRegressionTest):
+    def __init__(self):
+        super().__init__()
+        self.descr = 'Distributed training with TensorFlow and Horovod'
+        self.valid_systems = ['daint:gpu', 'dom:gpu']
+        self.valid_prog_environs = ['PrgEnv-gnu']
+        tfshortver = '1.11'
+        self.sourcesdir = 'https://github.com/tensorflow/benchmarks'
+        self.modules = ['Horovod/0.15.0-CrayGNU-18.08-tf-%s.0' % tfshortver]
+        self.reference = {
+            'dom:gpu': {
+                'img_sec': (1133.6, None, 0.05),
+            },
+            'daint:gpu': {
+                'img_sec': (4403.0, None, 0.05)
+            },
+        }
+        self.perf_patterns = {
+            'img_sec': sn.avg(sn.extractall(
+                r'total images/sec:\s+(?P<img_sec>\S+)',
+                self.stdout, 'img_sec', float))
+        }
+        self.sanity_patterns = sn.assert_found(
+            r'[\S+\s+] INFO NET\/IB : Using interface ipogif0'
+            r' for sideband communication', self.stdout)
+        self.num_tasks_per_node = 1
+        if self.current_system.name == 'dom':
+            self.num_tasks = 8
+
+        if self.current_system.name == 'daint':
+            self.num_tasks = 32
+
+        self.pre_run = ['git checkout cnn_tf_v%s_compatible' % tfshortver]
+        self.variables = {
+            'NCCL_DEBUG': 'INFO',
+            'NCCL_IB_HCA': 'ipogif0',
+            'NCCL_IB_CUDA_SUPPORT': '1',
+            'OMP_NUM_THREADS': '$SLURM_CPUS_PER_TASK',
+        }
+        self.executable = ('python scripts/tf_cnn_benchmarks/'
+                           'tf_cnn_benchmarks.py')
+        self.executable_opts = [
+            '--model inception3',
+            '--batch_size 64',
+            '--variable_update horovod',
+            '--log_dir ./logs',
+            '--train_dir ./checkpoints']
+        self.tags = {'production'}
+        self.maintainers = ['MS', 'RS']

From 0d1c5459530f7b5b5df8464761b5c5721d272e26 Mon Sep 17 00:00:00 2001
From: rafael
Date: Fri, 11 Jan 2019 11:58:31 +0100
Subject: [PATCH 2/2] fix comments

---
 .../apps/tensorflow/tf_horovod_check.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/cscs-checks/apps/tensorflow/tf_horovod_check.py b/cscs-checks/apps/tensorflow/tf_horovod_check.py
index 9387ad11a8..27b8b77e38 100644
--- a/cscs-checks/apps/tensorflow/tf_horovod_check.py
+++ b/cscs-checks/apps/tensorflow/tf_horovod_check.py
@@ -2,6 +2,7 @@
 import reframe.utility.sanity as sn
 
 
+@rfm.required_version('>=2.16-dev0')
 @rfm.simple_test
 class TensorFlowHorovodTest(rfm.RunOnlyRegressionTest):
     def __init__(self):
@@ -14,16 +15,16 @@ def __init__(self):
         self.modules = ['Horovod/0.15.0-CrayGNU-18.08-tf-%s.0' % tfshortver]
         self.reference = {
             'dom:gpu': {
-                'img_sec': (1133.6, None, 0.05),
+                'throughput': (1133.6, None, 0.05, 'images/s'),
             },
             'daint:gpu': {
-                'img_sec': (4403.0, None, 0.05)
+                'throughput': (4403.0, None, 0.05, 'images/s')
             },
         }
         self.perf_patterns = {
-            'img_sec': sn.avg(sn.extractall(
-                r'total images/sec:\s+(?P<img_sec>\S+)',
-                self.stdout, 'img_sec', float))
+            'throughput': sn.avg(sn.extractall(
+                r'total images/sec:\s+(?P<throughput>\S+)',
+                self.stdout, 'throughput', float))
         }
         self.sanity_patterns = sn.assert_found(
             r'[\S+\s+] INFO NET\/IB : Using interface ipogif0'
@@ -31,8 +32,7 @@ def __init__(self):
         self.num_tasks_per_node = 1
         if self.current_system.name == 'dom':
             self.num_tasks = 8
-
-        if self.current_system.name == 'daint':
+        elif self.current_system.name == 'daint':
             self.num_tasks = 32
 
         self.pre_run = ['git checkout cnn_tf_v%s_compatible' % tfshortver]
@@ -42,9 +42,9 @@ def __init__(self):
             'NCCL_IB_CUDA_SUPPORT': '1',
             'OMP_NUM_THREADS': '$SLURM_CPUS_PER_TASK',
         }
-        self.executable = ('python scripts/tf_cnn_benchmarks/'
-                           'tf_cnn_benchmarks.py')
+        self.executable = ('python')
         self.executable_opts = [
+            'scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py',
             '--model inception3',
             '--batch_size 64',
             '--variable_update horovod',