From 5d15c59d67e461aea6112c5e0af4f4ea6b9a80d7 Mon Sep 17 00:00:00 2001 From: kraushm Date: Thu, 26 Aug 2021 08:00:08 +0200 Subject: [PATCH 1/5] [bugfix] correct typo in module version --- cscs-checks/apps/jupyter/check_ipcmagic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cscs-checks/apps/jupyter/check_ipcmagic.py b/cscs-checks/apps/jupyter/check_ipcmagic.py index 1596c54ad6..1ecf5d1351 100644 --- a/cscs-checks/apps/jupyter/check_ipcmagic.py +++ b/cscs-checks/apps/jupyter/check_ipcmagic.py @@ -17,7 +17,7 @@ def __init__(self): self.valid_prog_environs = ['PrgEnv-gnu'] self.modules = [ # FIXME: Use the default ipcmagic version when fixed - f'ipcmagic/0.1-CrayGNU-{osext.cray_cdt_version()}', + f'ipcmagic/1.0.1-CrayGNU-{osext.cray_cdt_version()}', f'Horovod/0.21.0-CrayGNU-{osext.cray_cdt_version()}-tf-2.4.0' ] self.num_tasks = 2 From fbfab887d7c8947119762c386dbda3566e1fce3a Mon Sep 17 00:00:00 2001 From: kraushm Date: Fri, 27 Aug 2021 10:58:30 +0200 Subject: [PATCH 2/5] Change to using the default ipcmagic --- cscs-checks/apps/jupyter/check_ipcmagic.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cscs-checks/apps/jupyter/check_ipcmagic.py b/cscs-checks/apps/jupyter/check_ipcmagic.py index 1ecf5d1351..4d5b423028 100644 --- a/cscs-checks/apps/jupyter/check_ipcmagic.py +++ b/cscs-checks/apps/jupyter/check_ipcmagic.py @@ -16,8 +16,7 @@ def __init__(self): self.valid_systems = ['daint:gpu', 'dom:gpu'] self.valid_prog_environs = ['PrgEnv-gnu'] self.modules = [ - # FIXME: Use the default ipcmagic version when fixed - f'ipcmagic/1.0.1-CrayGNU-{osext.cray_cdt_version()}', + f'ipcmagic', f'Horovod/0.21.0-CrayGNU-{osext.cray_cdt_version()}-tf-2.4.0' ] self.num_tasks = 2 From 1022be5a207b65e167937816375f686d2a860363 Mon Sep 17 00:00:00 2001 From: Theofilos Manitaras Date: Fri, 27 Aug 2021 16:09:59 +0200 Subject: [PATCH 3/5] Modernize Jupyter IPCMagic test --- cscs-checks/apps/jupyter/check_ipcmagic.py | 99 ++++++++++++---------- 1 
file changed, 54 insertions(+), 45 deletions(-) diff --git a/cscs-checks/apps/jupyter/check_ipcmagic.py b/cscs-checks/apps/jupyter/check_ipcmagic.py index 4d5b423028..42b4c9ac84 100644 --- a/cscs-checks/apps/jupyter/check_ipcmagic.py +++ b/cscs-checks/apps/jupyter/check_ipcmagic.py @@ -11,54 +11,63 @@ @rfm.simple_test class IPCMagicCheck(rfm.RunOnlyRegressionTest): - def __init__(self): - self.descr = 'Distributed training with TensorFlow using ipyparallel' - self.valid_systems = ['daint:gpu', 'dom:gpu'] - self.valid_prog_environs = ['PrgEnv-gnu'] - self.modules = [ - f'ipcmagic', - f'Horovod/0.21.0-CrayGNU-{osext.cray_cdt_version()}-tf-2.4.0' - ] - self.num_tasks = 2 - self.num_tasks_per_node = 1 - self.executable = 'ipython' - self.executable_opts = ['tf-hvd-sgd-ipc-tf2.py'] - nids = sn.extractall(r'nid(?P<nid>\d+)', - self.stdout, 'nid', str) - self.sanity_patterns = sn.all([ - sn.assert_ne(nids, []), - sn.assert_ne(nids[0], nids[1]) - ]) - self.reference = { - 'daint:gpu': { - 'slope': (2.0, -0.1, 0.1, None), - 'offset': (0.0, -0.1, 0.1, None), - 'retries': (0, None, None, None), - 'time': (10, None, None, 's'), - }, - 'dom:gpu': { - 'slope': (2.0, -0.1, 0.1, None), - 'offset': (0.0, -0.1, 0.1, None), - 'retries': (0, None, None, None), - 'time': (10, None, None, 's'), - } + descr = 'Distributed training with TensorFlow using ipyparallel' + valid_systems = ['daint:gpu', 'dom:gpu'] + valid_prog_environs = ['PrgEnv-gnu'] + modules = [ + f'ipcmagic', f'jupyterlab', + f'Horovod/0.21.0-CrayGNU-{osext.cray_cdt_version()}-tf-2.4.0' + ] + num_tasks = 2 + num_tasks_per_node = 1 + executable = 'ipython' + executable_opts = ['tf-hvd-sgd-ipc-tf2.py'] + reference = { + 'daint:gpu': { + 'slope': (2.0, -0.1, 0.1, None), + 'offset': (0.0, -0.1, 0.1, None), + 'retries': (0, None, None, None), + 'time': (10, None, None, 's'), + }, + 'dom:gpu': { + 'slope': (2.0, -0.1, 0.1, None), + 'offset': (0.0, -0.1, 0.1, None), + 'retries': (0, None, None, None), + 'time': (10, None, None, 's'), 
} - self.perf_patterns = { - 'slope': sn.extractsingle(r'slope=(?P<slope>\S+)', - self.stdout, 'slope', float), - 'offset': sn.extractsingle(r'offset=(?P<offset>\S+)', - self.stdout, 'offset', float), - 'retries': 4 - sn.count(sn.findall(r'IPCluster is already running', - self.stdout)), - 'time': sn.extractsingle(r'IPCluster is ready\!\s+' - r'\((?P