From 79deac5f82129a279f8095e61f6f7ab10e8eb654 Mon Sep 17 00:00:00 2001 From: rafael Date: Thu, 4 Jun 2020 09:59:04 +0200 Subject: [PATCH 1/4] add ipcmagic check --- cscs-checks/apps/jupyter/check_ipcmagic.py | 62 +++++++++++++++++++ .../jupyter/src/tf-hvd-sgd-ipc-tf-1.14.py | 42 +++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 cscs-checks/apps/jupyter/check_ipcmagic.py create mode 100644 cscs-checks/apps/jupyter/src/tf-hvd-sgd-ipc-tf-1.14.py diff --git a/cscs-checks/apps/jupyter/check_ipcmagic.py b/cscs-checks/apps/jupyter/check_ipcmagic.py new file mode 100644 index 0000000000..118ec92c0e --- /dev/null +++ b/cscs-checks/apps/jupyter/check_ipcmagic.py @@ -0,0 +1,62 @@ +import reframe as rfm +import reframe.utility.sanity as sn +from reframe.core.backends import getlauncher + + +@rfm.simple_test +class IPCMagicCheck(rfm.RunOnlyRegressionTest): + def __init__(self): + self.descr = 'Distributed training with TensorFlow using ipyparallel' + self.valid_systems = ['daint:gpu', 'dom:gpu'] + self.valid_prog_environs = ['PrgEnv-gnu'] + self.modules = ['ipcmagic/0.1-CrayGNU-19.10'] + self.pre_run = [ + 'module unload dask', + 'module load Horovod/0.16.4-CrayGNU-19.10-tf-1.14.0'] + self.num_tasks = 2 + self.num_tasks_per_node = 1 + self.executable = 'ipython' + self.executable_opts = ['tf-hvd-sgd-ipc-tf-1.14.py'] + nids = sn.extractall(r'nid(?P\d+)', + self.stdout, 'nid', str) + self.sanity_patterns = sn.all([ + sn.assert_ne(nids, []), + sn.assert_ne(nids[0], nids[1]) + ]) + self.reference = { + 'daint:gpu': { + 'slope': (2.0, -0.1, 0.1, ''), + 'offset': (0.0, -0.1, 0.1, ''), + 'retries': (0, None, None, ''), + 'time': (10, None, None, 'seconds'), + }, + 'dom:gpu': { + 'slope': (2.0, -0.1, 0.1, ''), + 'offset': (0.0, -0.1, 0.1, ''), + 'retries': (0, None, None, ''), + 'time': (10, None, None, 'seconds'), + } + } + self.perf_patterns = { + 'slope': sn.extractsingle(r'slope=(?P\S+)', + self.stdout, 'slope', float), + 'offset': sn.extractsingle(r'offset=(?P\S+)', + self.stdout, 'offset', float), + 'retries': self.retries(), + 'time': sn.extractsingle(r'IPCluster is ready\!\s+' + r'\((?P