From 0e10a0d48472ae46dbd083ecd19f6aa4e973cda1 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Wed, 3 Apr 2019 14:53:57 +0200 Subject: [PATCH 1/2] Enable small scale application tests also on Daint --- cscs-checks/apps/amber/amber_check.py | 61 +++++----- cscs-checks/apps/cp2k/cp2k_check.py | 114 +++++++++--------- cscs-checks/apps/cpmd/cpmd_check.py | 50 ++++---- cscs-checks/apps/espresso/espresso_check.py | 36 +++--- cscs-checks/apps/gromacs/gromacs_check.py | 114 +++++++++--------- cscs-checks/apps/lammps/lammps_check.py | 97 ++++++++------- cscs-checks/apps/namd/namd_check.py | 81 ++++++------- .../apps/tensorflow/tf_horovod_check.py | 43 ++++--- cscs-checks/microbenchmarks/osu/osu_tests.py | 46 ++++--- 9 files changed, 347 insertions(+), 295 deletions(-) diff --git a/cscs-checks/apps/amber/amber_check.py b/cscs-checks/apps/amber/amber_check.py index 081497355a..4018e923ce 100644 --- a/cscs-checks/apps/amber/amber_check.py +++ b/cscs-checks/apps/amber/amber_check.py @@ -41,49 +41,54 @@ def __init__(self, input_file, output_file): self.tags = {'scs'} -@rfm.parameterized_test(*([variant, arch] - for variant in ['prod', 'maint'] - for arch in ['CPU', 'GPU'])) +@rfm.required_version('>=2.16') +@rfm.parameterized_test(*( + [variant, arch, scale] + for variant in ['prod', 'maint'] + for arch in ['CPU', 'GPU'] + for scale in ['small', 'large'] + if (not (scale, arch) == ('large', 'GPU') and + not (variant, arch) == ('maint', 'CPU')) +)) class AmberCheck(AmberBaseCheck): - def __init__(self, variant, arch): + def __init__(self, variant, arch, scale): super().__init__('mdin.%s' % arch, 'amber.out') + self.descr = 'Amber parallel %s %s check (%s)' % (scale, arch, variant) + self.tags |= {'maintenance' if variant == 'maint' else 'production'} if arch == 'GPU': self.valid_systems = ['daint:gpu', 'dom:gpu'] self.executable = 'pmemd.cuda.MPI' self.reference = { 'dom:gpu': { - 'perf': (30.0, -0.05, None) + 'perf': (30.0, -0.05, None, 'ns/day') }, 'daint:gpu': { - 
'perf': (30.0, -0.05, None) + 'perf': (30.0, -0.05, None, 'ns/day') }, } - if variant == 'prod': - self.descr = 'Amber parallel GPU production check' - self.tags |= {'production'} - elif variant == 'maint': - self.descr = 'Amber parallel GPU maintenance check' - self.tags |= {'maintenance'} elif arch == 'CPU': - self.valid_systems = ['daint:mc', 'dom:mc'] - if variant == 'prod': - self.descr = 'Amber parallel CPU production check' - self.tags |= {'production'} - self.executable = 'pmemd.MPI' - self.strict_check = False - if self.current_system.name == 'dom': - self.num_tasks = 216 - self.num_tasks_per_node = 36 - else: - self.num_tasks = 576 - self.num_tasks_per_node = 36 + self.valid_systems = ['daint:mc'] + if scale == 'small': + self.valid_systems += ['dom:mc'] + self.executable = 'pmemd.MPI' + self.strict_check = False + if scale == 'small': + self.num_tasks = 216 + self.num_tasks_per_node = 36 self.reference = { 'dom:mc': { - 'perf': (8.0, -0.05, None) + 'perf': (8.0, -0.05, None, 'ns/day') }, 'daint:mc': { - 'perf': (10.7, -0.25, None) - }, + 'perf': (7.6, -0.05, None, 'ns/day') + } + } + else: + self.num_tasks = 576 + self.num_tasks_per_node = 36 + self.reference = { + 'daint:mc': { + 'perf': (10.7, -0.25, None, 'ns/day') + } } - diff --git a/cscs-checks/apps/cp2k/cp2k_check.py b/cscs-checks/apps/cp2k/cp2k_check.py index 47610f9f4b..feda362e7a 100644 --- a/cscs-checks/apps/cp2k/cp2k_check.py +++ b/cscs-checks/apps/cp2k/cp2k_check.py @@ -4,11 +4,9 @@ class Cp2kCheck(rfm.RunOnlyRegressionTest): - def __init__(self, check_name, check_descr): - super().__init__(check_name, os.path.dirname(__file__)) - self.descr = check_descr + def __init__(self): + super().__init__() self.valid_prog_environs = ['PrgEnv-gnu'] - self.executable = 'cp2k.psmp' self.executable_opts = ['H2O-256.inp'] @@ -26,7 +24,7 @@ def __init__(self, check_name, check_descr): ]) self.perf_patterns = { - 'perf': sn.extractsingle(r'^ CP2K(\s+[\d\.]+){4}\s+(?P<perf>\S+)', + 'time': sn.extractsingle(r'^ 
CP2K(\s+[\d\.]+){4}\s+(?P<perf>\S+)', self.stdout, 'perf', float) } @@ -41,75 +39,83 @@ def __init__(self, check_name, check_descr): } -@rfm.parameterized_test(['prod'], ['maint']) +@rfm.parameterized_test(*([s, v] + for s in ['small', 'large'] + for v in ['maint', 'prod'])) class Cp2kCpuCheck(Cp2kCheck): - def __init__(self, variant): - super().__init__('cp2k_cpu_%s_check' % variant, - 'CP2K check CPU') - self.valid_systems = ['daint:mc', 'dom:mc'] - self.num_gpus_per_node = 0 - if self.current_system.name == 'dom': + def __init__(self, size, variant): + super().__init__() + self.descr = 'CP2K CPU check (version: %s, %s)' % (size, variant) + self.valid_systems = ['daint:mc'] + if size == 'small': + self.valid_systems += ['dom:mc'] self.num_tasks = 216 else: self.num_tasks = 576 self.num_tasks_per_node = 36 - - if variant == 'maint': - self.tags |= {'maintenance'} - self.reference = { - 'dom:mc': { - 'perf': (182.6, None, 0.05) + references = { + 'maint': { + 'small': { + 'dom:mc': {'time': (182.6, None, 0.05, 's')}, + 'daint:mc': {'time': (214.5, None, 0.15, 's')} }, - 'daint:mc': { - 'perf': (141.0, None, 0.05) - }, - } - else: - self.tags |= {'production'} - self.reference = { - 'dom:mc': { - 'perf': (174.5, None, 0.05) - }, - 'daint:mc': { - 'perf': (113.0, None, 0.25) + 'large': { + 'daint:mc': {'time': (141.0, None, 0.05, 's')} + } + }, + 'prod': { + 'small': { + 'dom:mc': {'time': (174.5, None, 0.05, 's')}, + 'daint:mc': {'time': (214.5, None, 0.15, 's')} }, + 'large': { + 'daint:mc': {'time': (113.0, None, 0.05, 's')} + } } + } + self.reference = references[variant][size] + self.tags |= {'maintenance' if variant == 'maint' else 'production'} -@rfm.parameterized_test(['prod'], ['maint']) + +@rfm.parameterized_test(*([s, v] + for s in ['small', 'large'] + for v in ['maint', 'prod'])) class Cp2kGpuCheck(Cp2kCheck): - def __init__(self, variant): - super().__init__('cp2k_gpu_%s_check' % variant, - 'CP2K check GPU') - self.valid_systems = ['daint:gpu', 'dom:gpu'] + 
def __init__(self, size, variant): + super().__init__() + self.descr = 'CP2K GPU check (version: %s, %s)' % (size, variant) + self.valid_systems = ['daint:gpu'] self.variables = {'CRAY_CUDA_MPS': '1'} self.modules = ['CP2K'] self.num_gpus_per_node = 1 - if self.current_system.name == 'dom': + if size == 'small': + self.valid_systems += ['dom:gpu'] self.num_tasks = 72 else: self.num_tasks = 192 self.num_tasks_per_node = 12 - - if variant == 'maint': - self.tags |= {'maintenance'} - self.reference = { - 'dom:gpu': { - 'perf': (251.8, None, 0.15) + references = { + 'maint': { + 'small': { + 'dom:gpu': {'time': (251.8, None, 0.15, 's')}, + 'daint:gpu': {'time': (262.6, None, 0.10, 's')} }, - 'daint:gpu': { - 'perf': (222.6, None, 0.05) - }, - } - else: - self.tags |= {'production'} - self.reference = { - 'dom:gpu': { - 'perf': (240.0, None, 0.05) - }, - 'daint:gpu': { - 'perf': (222.6, None, 0.05) + 'large': { + 'daint:gpu': {'time': (222.6, None, 0.05, 's')} + } + }, + 'prod': { + 'small': { + 'dom:gpu': {'time': (240.0, None, 0.05, 's')}, + 'daint:gpu': {'time': (262.6, None, 0.10, 's')} }, + 'large': { + 'daint:gpu': {'time': (222.6, None, 0.05, 's')} + } } + } + self.reference = references[variant][size] + self.tags |= {'maintenance' if variant == 'maint' else 'production'} diff --git a/cscs-checks/apps/cpmd/cpmd_check.py b/cscs-checks/apps/cpmd/cpmd_check.py index bff3b86887..e2a0dcdac0 100644 --- a/cscs-checks/apps/cpmd/cpmd_check.py +++ b/cscs-checks/apps/cpmd/cpmd_check.py @@ -1,28 +1,30 @@ -import os - import reframe as rfm import reframe.utility.sanity as sn -@rfm.simple_test +@rfm.required_version('>=2.16') +@rfm.parameterized_test(['small'], ['large']) class CPMDCheck(rfm.RunOnlyRegressionTest): - def __init__(self): + def __init__(self, scale): super().__init__() self.descr = 'CPMD check (C4H6 metadynamics)' self.maintainers = ['AJ', 'LM'] self.tags = {'production'} - self.valid_systems = ['daint:gpu', 'dom:gpu'] + self.valid_systems = ['daint:gpu'] + if 
scale == 'small': + self.num_tasks = 9 + self.valid_systems += ['dom:gpu'] + else: + self.num_tasks = 16 + self.time_limit = (0, 20, 0) + + self.num_tasks_per_node = 1 self.valid_prog_environs = ['PrgEnv-intel'] self.modules = ['CPMD'] self.executable = 'cpmd.x' self.executable_opts = ['ana_c4h6.in > stdout.txt'] self.readonly_files = ['ana_c4h6.in', 'C_MT_BLYP', 'H_MT_BLYP'] - if self.current_system.name == 'dom': - self.num_tasks = 9 - else: - self.num_tasks = 16 - self.num_tasks_per_node = 1 self.use_multithreading = True self.strict_check = False self.extra_resources = { @@ -30,6 +32,7 @@ def __init__(self): 'num_switches': 1 } } + # OpenMP version of CPMD segfaults # self.variables = { 'OMP_NUM_THREADS' : '8' } energy = sn.extractsingle( @@ -39,16 +42,21 @@ def __init__(self): energy_diff = sn.abs(energy - energy_reference) self.sanity_patterns = sn.assert_lt(energy_diff, 0.26) self.perf_patterns = { - 'perf': sn.extractsingle(r'^ cpmd(\s+[\d\.]+){3}\s+(?P<perf>\S+)', + 'time': sn.extractsingle(r'^ cpmd(\s+[\d\.]+){3}\s+(?P<perf>\S+)', 'stdout.txt', 'perf', float) } - - self.reference = { - 'daint:gpu': { - 'perf': (245.0, None, 0.59) # (225.0, None, 0.15) - }, - 'dom:gpu': { - 'perf': (332.0, None, 0.15) - }, - } - + if scale == 'small': + self.reference = { + 'daint:gpu': { + 'time': (285.5, None, 0.20, 's') + }, + 'dom:gpu': { + 'time': (332.0, None, 0.15, 's') + } + } + else: + self.reference = { + 'daint:gpu': { + 'time': (245.0, None, 0.59, 's') + } + } diff --git a/cscs-checks/apps/espresso/espresso_check.py b/cscs-checks/apps/espresso/espresso_check.py index 2b4b25670e..b339e3b2ef 100644 --- a/cscs-checks/apps/espresso/espresso_check.py +++ b/cscs-checks/apps/espresso/espresso_check.py @@ -4,9 +4,10 @@ import reframe.utility.sanity as sn -@rfm.simple_test +@rfm.required_version('>=2.16') +@rfm.parameterized_test(['small'], ['large']) class QECheck(rfm.RunOnlyRegressionTest): - def __init__(self): + def __init__(self, size): super().__init__() self.descr = 'Quantum 
Espresso CPU check' self.maintainers = ['AK', 'LM'] @@ -14,17 +15,31 @@ def __init__(self): self.sourcesdir = os.path.join(self.current_system.resourcesdir, 'Espresso') - self.valid_systems = ['daint:mc', 'dom:mc'] + self.valid_systems = ['daint:mc'] self.valid_prog_environs = ['PrgEnv-intel'] self.modules = ['QuantumESPRESSO'] self.executable = 'pw.x' self.executable_opts = ['-in', 'ausurf.in'] - if self.current_system.name == 'dom': + if size == 'small': + self.valid_systems += ['dom:mc'] self.num_tasks = 216 self.num_tasks_per_node = 36 + self.reference = { + 'dom:mc': { + 'time': (159.0, None, 0.05, 's'), + }, + 'daint:mc': { + 'time': (151.6, None, 0.05, 's') + }, + } else: self.num_tasks = 576 self.num_tasks_per_node = 36 + self.reference = { + 'daint:mc': { + 'time': (157.0, None, 0.40, 's') + }, + } self.use_multithreading = True self.extra_resources = { @@ -41,15 +56,6 @@ def __init__(self): sn.assert_reference(energy, -11427.09017162, -1e-10, 1e-10) ]) self.perf_patterns = { - 'sec': sn.extractsingle(r'electrons :\s+(?P<sec>\S+)s CPU ', - self.stdout, 'sec', float) + 'time': sn.extractsingle(r'electrons :\s+(?P<sec>\S+)s CPU ', + self.stdout, 'sec', float) } - self.reference = { - 'dom:mc': { - 'sec': (159.0, None, 0.05), - }, - 'daint:mc': { - 'sec': (157.0, None, 0.40) - }, - } - diff --git a/cscs-checks/apps/gromacs/gromacs_check.py b/cscs-checks/apps/gromacs/gromacs_check.py index 03f9ea2d8f..157b51226d 100644 --- a/cscs-checks/apps/gromacs/gromacs_check.py +++ b/cscs-checks/apps/gromacs/gromacs_check.py @@ -8,7 +8,6 @@ class GromacsBaseCheck(rfm.RunOnlyRegressionTest): def __init__(self, output_file): super().__init__() - self.valid_prog_environs = ['PrgEnv-gnu'] self.executable = 'gmx_mpi' @@ -43,90 +42,97 @@ def __init__(self, output_file): 'num_switches': 1 } } + self.tags = {'scs'} +@rfm.required_version('>=2.16') +@rfm.parameterized_test(*([s, v] + for s in ['small', 'large'] + for v in ['prod', 'maint'])) class GromacsGPUCheck(GromacsBaseCheck): - def 
__init__(self, variant): + def __init__(self, size, variant): super().__init__('md.log') - - self.valid_systems = ['daint:gpu', 'dom:gpu'] + self.valid_systems = ['daint:gpu'] self.descr = 'GROMACS GPU check' - self.name = 'gromacs_gpu_%s_check' % variant - self.executable_opts = ('mdrun -dlb yes -ntomp 1 -npme 0 ' - '-s herflat.tpr ').split() + self.executable_opts = ['mdrun', '-dlb yes', '-ntomp 1', '-npme 0', + '-s herflat.tpr'] self.variables = {'CRAY_CUDA_MPS': '1'} - self.tags = {'scs'} self.num_gpus_per_node = 1 - - if self.current_system.name == 'dom': + if size == 'small': + self.valid_systems += ['dom:gpu'] self.num_tasks = 72 self.num_tasks_per_node = 12 else: self.num_tasks = 192 self.num_tasks_per_node = 12 - -@rfm.simple_test -class GromacsGPUMaintCheck(GromacsGPUCheck): - def __init__(self): - super().__init__('maint') - self.tags |= {'maintenance'} - self.reference = { - 'dom:gpu': { - 'perf': (29.3, -0.05, None) + references = { + 'maint': { + 'small': { + 'dom:gpu': {'perf': (29.3, -0.05, None, 'ns/day')}, + 'daint:gpu': {'perf': (30.3, -0.10, None, 'ns/day')} + }, + 'large': { + 'daint:gpu': {'perf': (42.0, -0.10, None, 'ns/day')} + } }, - 'daint:gpu': { - 'perf': (42.0, -0.10, None) - }, - } - - -@rfm.simple_test -class GromacsGPUProdCheck(GromacsGPUCheck): - def __init__(self): - super().__init__('prod') - self.tags |= {'production'} - self.reference = { - 'dom:gpu': { - 'perf': (29.3, -0.05, None) - }, - 'daint:gpu': { - 'perf': (42.0, -0.20, None) + 'prod': { + 'small': { + 'dom:gpu': {'perf': (29.3, -0.05, None, 'ns/day')}, + 'daint:gpu': {'perf': (30.3, -0.10, None, 'ns/day')} + }, + 'large': { + 'daint:gpu': {'perf': (42.0, -0.20, None, 'ns/day')} + } }, } + self.reference = references[variant][size] + self.tags |= {'maintenance' if variant == 'maint' else 'production'} +@rfm.required_version('>=2.16') +@rfm.parameterized_test(*([s, v] + for s in ['small', 'large'] + for v in ['prod'])) class GromacsCPUCheck(GromacsBaseCheck): - def 
__init__(self, variant): + def __init__(self, size, variant): super().__init__('md.log') - - self.valid_systems = ['daint:mc', 'dom:mc'] + self.valid_systems = ['daint:mc'] self.descr = 'GROMACS CPU check' - self.name = 'gromacs_cpu_%s_check' % variant - self.executable_opts = ('mdrun -dlb yes -ntomp 1 -npme -1 ' - '-nb cpu -s herflat.tpr ').split() + self.executable_opts = ['mdrun', '-dlb yes', '-ntomp 1', '-npme -1', + '-nb cpu', '-s herflat.tpr'] - if self.current_system.name == 'dom': + if size == 'small': + self.valid_systems += ['dom:mc'] self.num_tasks = 216 self.num_tasks_per_node = 36 else: self.num_tasks = 576 self.num_tasks_per_node = 36 - -@rfm.simple_test -class GromacsCPUProdCheck(GromacsCPUCheck): - def __init__(self): - super().__init__('prod') - self.tags |= {'production'} - self.reference = { - 'dom:mc': { - 'perf': (42.7, -0.05, None) + references = { + 'maint': { + 'small': { + 'dom:mc': {'perf': (0.0, None, None, 'ns/day')}, + # FIXME: numbers may need update + 'daint:mc': {'perf': (38.8, -0.10, None, 'ns/day')} + }, + 'large': { + 'daint:mc': {'perf': (0.0, None, None, 'ns/day')} + } }, - 'daint:mc': { - 'perf': (70.4, -0.20, None) + 'prod': { + 'small': { + 'dom:mc': {'perf': (42.7, -0.05, None, 'ns/day')}, + 'daint:mc': {'perf': (38.8, -0.10, None, 'ns/day')} + }, + 'large': { + 'daint:mc': {'perf': (70.4, -0.20, None, 'ns/day')} + } }, } + self.reference = references[variant][size] + self.tags |= {'maintenance' if variant == 'maint' else 'production'} # FIXME: This test is obsolete; it is kept only for reference. 
diff --git a/cscs-checks/apps/lammps/lammps_check.py b/cscs-checks/apps/lammps/lammps_check.py index f68e067098..08e0e08fc3 100644 --- a/cscs-checks/apps/lammps/lammps_check.py +++ b/cscs-checks/apps/lammps/lammps_check.py @@ -37,79 +37,78 @@ def __init__(self): self.maintainers = ['TR', 'VH'] +@rfm.required_version('>=2.16') +@rfm.parameterized_test(*([s, v] + for s in ['small', 'large'] + for v in ['prod', 'maint'])) class LAMMPSGPUCheck(LAMMPSBaseCheck): - def __init__(self): + def __init__(self, size, variant): super().__init__() - self.valid_systems = ['daint:gpu', 'dom:gpu'] + self.valid_systems = ['daint:gpu'] self.executable = 'lmp_mpi' - self.executable_opts = '-sf gpu -pk gpu 1 -in in.lj.gpu'.split() + self.executable_opts = ['-sf gpu', '-pk gpu 1', '-in in.lj.gpu'] self.variables = {'CRAY_CUDA_MPS': '1'} self.num_gpus_per_node = 1 - if self.current_system.name == 'dom': + if size == 'small': + self.valid_systems += ['dom:gpu'] self.num_tasks = 12 self.num_tasks_per_node = 2 else: self.num_tasks = 32 self.num_tasks_per_node = 2 - -@rfm.simple_test -class LAMMPSGPUMaintCheck(LAMMPSGPUCheck): - def __init__(self): - super().__init__() - self.reference = { - 'dom:gpu': { - 'perf': (3457.0, -0.10, None) + references = { + 'maint': { + 'small': { + 'dom:gpu': {'perf': (3457, -0.10, None, 'timesteps/s')}, + 'daint:gpu': {'perf': (2524, -0.10, None, 'timesteps/s')} + }, + 'large': { + 'daint:gpu': {'perf': (3832, -0.05, None, 'timesteps/s')} + } }, - 'daint:gpu': { - 'perf': (3832.0, -0.05, None) + 'prod': { + 'small': { + 'dom:gpu': {'perf': (3132, -0.05, None, 'timesteps/s')}, + 'daint:gpu': {'perf': (2524, -0.10, None, 'timesteps/s')} + }, + 'large': { + 'daint:gpu': {'perf': (2382, -0.50, None, 'timesteps/s')} + } }, } - - self.tags |= {'maintenance'} - - -@rfm.simple_test -class LAMMPSGPUProdCheck(LAMMPSGPUCheck): - def __init__(self): - super().__init__() - self.reference = { - 'dom:gpu': { - 'perf': (3132.0, -0.05, None) - }, - 'daint:gpu': { - 'perf': 
(2382.0, -0.50, None) - }, - } - - self.tags |= {'production'} + self.reference = references[variant][size] + self.tags |= {'maintenance' if variant == 'maint' else 'production'} +@rfm.required_version('>=2.16') +@rfm.parameterized_test(*([s, v] + for s in ['small', 'large'] + for v in ['prod'])) class LAMMPSCPUCheck(LAMMPSBaseCheck): - def __init__(self): + def __init__(self, size, variant): super().__init__() - self.valid_systems = ['daint:mc', 'dom:mc'] + self.valid_systems = ['daint:mc'] self.executable = 'lmp_omp' - self.executable_opts = '-sf omp -pk omp 1 -in in.lj.cpu'.split() - if self.current_system.name == 'dom': + self.executable_opts = ['-sf omp', '-pk omp 1', '-in in.lj.cpu'] + if size == 'small': + self.valid_systems += ['dom:mc'] self.num_tasks = 216 self.num_tasks_per_node = 36 else: self.num_tasks_per_node = 36 self.num_tasks = 576 - -@rfm.simple_test -class LAMMPSCPUProdCheck(LAMMPSCPUCheck): - def __init__(self): - super().__init__() - self.reference = { - 'dom:mc': { - 'perf': (4394.0, -0.05, None) - }, - 'daint:mc': { - 'perf': (5310.0, -0.65, None) + references = { + 'prod': { + 'small': { + 'dom:mc': {'perf': (4394, -0.05, None, 'timesteps/s')}, + 'daint:mc': {'perf': (3824, -0.10, None, 'timesteps/s')} + }, + 'large': { + 'daint:mc': {'perf': (5310, -0.65, None, 'timesteps/s')} + } }, } - - self.tags |= {'production'} + self.reference = references[variant][size] + self.tags |= {'maintenance' if variant == 'maint' else 'production'} diff --git a/cscs-checks/apps/namd/namd_check.py b/cscs-checks/apps/namd/namd_check.py index 278af610a4..fa4904475f 100644 --- a/cscs-checks/apps/namd/namd_check.py +++ b/cscs-checks/apps/namd/namd_check.py @@ -5,24 +5,20 @@ class NamdBaseCheck(rfm.RunOnlyRegressionTest): - def __init__(self, version, variant): + def __init__(self, arch, size, variant): super().__init__() - self.name = 'namd_%s_%s_check' % (version, variant) - self.descr = 'NAMD check (%s, %s)' % (version, variant) - + self.descr = 'NAMD check 
(%s, %s)' % (arch, variant) self.valid_prog_environs = ['PrgEnv-intel'] - self.modules = ['NAMD'] # Reset sources dir relative to the SCS apps prefix self.sourcesdir = os.path.join(self.current_system.resourcesdir, 'NAMD', 'prod') self.executable = 'namd2' - self.use_multithreading = True self.num_tasks_per_core = 2 - if self.current_system.name == 'dom': + if size == 'small': self.num_tasks = 6 self.num_tasks_per_node = 1 else: @@ -30,7 +26,7 @@ def __init__(self, version, variant): self.num_tasks_per_node = 1 energy = sn.avg(sn.extractall(r'ENERGY:(\s+\S+){10}\s+(?P<energy>\S+)', - self.stdout, 'energy', float)) + self.stdout, 'energy', float)) energy_reference = -2451359.5 energy_diff = sn.abs(energy - energy_reference) self.sanity_patterns = sn.all([ @@ -57,46 +53,49 @@ def __init__(self, version, variant): } -@rfm.parameterized_test(['maint'], ['prod']) +@rfm.required_version('>=2.16') +@rfm.parameterized_test(*([s, v] + for s in ['small', 'large'] + for v in ['maint', 'prod'])) class NamdGPUCheck(NamdBaseCheck): - def __init__(self, variant): - super().__init__('gpu', variant) - self.valid_systems = ['daint:gpu', 'dom:gpu'] - self.executable_opts = '+idlepoll +ppn 23 stmv.namd'.split() + def __init__(self, size, variant): + super().__init__('gpu', size, variant) + self.valid_systems = ['daint:gpu'] + self.executable_opts = ['+idlepoll', '+ppn 23', 'stmv.namd'] self.num_cpus_per_task = 24 self.num_gpus_per_node = 1 - if variant == 'prod': - self.tags |= {'production'} + self.tags |= {'maintenance' if variant == 'maint' else 'production'} + if size == 'small': + self.valid_systems += ['dom:gpu'] + self.reference = { + 'dom:gpu': {'days_ns': (0.18, None, 0.05, 'days/ns')}, + 'daint:gpu': {'days_ns': (0.18, None, 0.05, 'days/ns')} + } else: - self.tags |= {'maintenance'} - - self.reference = { - 'dom:gpu': { - 'days_ns': (0.18, None, 0.05), - }, - 'daint:gpu': { - 'days_ns': (0.11, None, 0.05), - }, - } + self.reference = { + 'daint:gpu': {'days_ns': (0.11, None, 0.05, 
'days/ns')} + } -@rfm.parameterized_test(['maint'], ['prod']) +@rfm.required_version('>=2.16') +@rfm.parameterized_test(*([s, v] + for s in ['small', 'large'] + for v in ['maint', 'prod'])) class NamdCPUCheck(NamdBaseCheck): - def __init__(self, variant): - super().__init__('cpu', variant) - self.valid_systems = ['daint:mc', 'dom:mc'] - self.executable_opts = '+idlepoll +ppn 71 stmv.namd'.split() + def __init__(self, size, variant): + super().__init__('cpu', size, variant) + self.valid_systems = ['daint:mc'] + self.executable_opts = ['+idlepoll', '+ppn 71', 'stmv.namd'] self.num_cpus_per_task = 72 - if variant == 'prod': - self.tags |= {'production'} + if size == 'small': + self.valid_systems += ['dom:mc'] + self.reference = { + 'dom:mc': {'days_ns': (0.57, None, 0.05, 'days/ns')}, + 'daint:mc': {'days_ns': (0.56, None, 0.05, 'days/ns')} + } else: - self.tags |= {'maintenance'} + self.reference = { + 'daint:mc': {'days_ns': (0.38, None, 0.05, 'days/ns')} + } - self.reference = { - 'dom:mc': { - 'days_ns': (0.57, None, 0.05), - }, - 'daint:mc': { - 'days_ns': (0.38, None, 0.05), - }, - } + self.tags |= {'maintenance' if variant == 'maint' else 'production'} diff --git a/cscs-checks/apps/tensorflow/tf_horovod_check.py b/cscs-checks/apps/tensorflow/tf_horovod_check.py index 27b8b77e38..1b51310afb 100644 --- a/cscs-checks/apps/tensorflow/tf_horovod_check.py +++ b/cscs-checks/apps/tensorflow/tf_horovod_check.py @@ -2,25 +2,37 @@ import reframe.utility.sanity as sn -@rfm.required_version('>=2.16-dev0') -@rfm.simple_test +@rfm.required_version('>=2.16') +@rfm.parameterized_test(['small'], ['large']) class TensorFlowHorovodTest(rfm.RunOnlyRegressionTest): - def __init__(self): + def __init__(self, variant): super().__init__() self.descr = 'Distributed training with TensorFlow and Horovod' - self.valid_systems = ['daint:gpu', 'dom:gpu'] + self.valid_systems = ['daint:gpu'] self.valid_prog_environs = ['PrgEnv-gnu'] tfshortver = '1.11' self.sourcesdir = 
'https://github.com/tensorflow/benchmarks' self.modules = ['Horovod/0.15.0-CrayGNU-18.08-tf-%s.0' % tfshortver] - self.reference = { - 'dom:gpu': { - 'throughput': (1133.6, None, 0.05, 'images/s'), - }, - 'daint:gpu': { - 'throughput': (4403.0, None, 0.05, 'images/s') - }, - } + if variant == 'small': + self.valid_systems += ['dom:gpu'] + self.num_tasks = 8 + self.reference = { + 'dom:gpu': { + 'throughput': (1133.6, None, 0.05, 'images/s'), + }, + 'daint:gpu': { + 'throughput': (1134.8, None, 0.05, 'images/s') + }, + } + else: + self.num_tasks = 32 + self.reference = { + 'daint:gpu': { + 'throughput': (4403.0, None, 0.05, 'images/s') + }, + } + + self.num_tasks_per_node = 1 self.perf_patterns = { 'throughput': sn.avg(sn.extractall( r'total images/sec:\s+(?P\S+)', @@ -29,11 +41,6 @@ def __init__(self): self.sanity_patterns = sn.assert_found( r'[\S+\s+] INFO NET\/IB : Using interface ipogif0' r' for sideband communication', self.stdout) - self.num_tasks_per_node = 1 - if self.current_system.name == 'dom': - self.num_tasks = 8 - elif self.current_system.name == 'daint': - self.num_tasks = 32 self.pre_run = ['git checkout cnn_tf_v%s_compatible' % tfshortver] self.variables = { @@ -42,7 +49,7 @@ def __init__(self): 'NCCL_IB_CUDA_SUPPORT': '1', 'OMP_NUM_THREADS': '$SLURM_CPUS_PER_TASK', } - self.executable = ('python') + self.executable = 'python' self.executable_opts = [ 'scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py', '--model inception3', diff --git a/cscs-checks/microbenchmarks/osu/osu_tests.py b/cscs-checks/microbenchmarks/osu/osu_tests.py index f9d006edef..6e3edda4bc 100644 --- a/cscs-checks/microbenchmarks/osu/osu_tests.py +++ b/cscs-checks/microbenchmarks/osu/osu_tests.py @@ -73,12 +73,15 @@ def __init__(self): @rfm.required_version('>=2.16') -@rfm.simple_test +@rfm.parameterized_test(['small'], ['large']) class AllreduceTest(rfm.RegressionTest): - def __init__(self): + def __init__(self, variant): super().__init__() self.strict_check = False - 
self.valid_systems = ['daint:gpu', 'dom:gpu'] + self.valid_systems = ['daint:gpu', 'daint:mc'] + if variant == 'small': + self.valid_systems += ['dom:gpu', 'dom:mc'] + self.descr = 'Allreduce OSU microbenchmark' self.build_system = 'Make' self.build_system.makefile = 'Makefile_allreduce' @@ -94,21 +97,34 @@ def __init__(self): self.stdout, 'latency', float) } self.tags = {'production'} - self.reference = { - 'dom:gpu': { - 'latency': (6.0, None, 0.1, 'us') - }, - 'daint:gpu': { - 'latency': (20.5, None, 2.0, 'us') - }, - } - self.num_tasks_per_node = 1 - self.num_gpus_per_node = 1 - if self.current_system.name == 'dom': + if variant == 'small': self.num_tasks = 6 - elif self.current_system.name == 'daint': + self.reference = { + 'dom:gpu': { + 'latency': (6.0, None, 0.10, 'us') + }, + 'daint:gpu': { + 'latency': (7.81, None, 0.25, 'us') + }, + 'daint:mc': { + 'latency': (8.79, None, 0.25, 'us') + } + } + else: self.num_tasks = 16 + self.reference = { + 'daint:gpu': { + 'latency': (16.87, None, 0.40, 'us') + }, + 'daint:mc': { + 'latency': (10.85, None, 0.20, 'us') + } + } + # Allow test to run on new systems without errors + self.reference['*:latency'] = (0, None, None, 'us') + self.num_tasks_per_node = 1 + self.num_gpus_per_node = 1 self.extra_resources = { 'switches': { 'num_switches': 1 From 9d827a6b323f3af420294d2ee0a264de22743ffe Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Fri, 5 Apr 2019 18:19:20 +0200 Subject: [PATCH 2/2] Address PR comments --- cscs-checks/apps/cp2k/cp2k_check.py | 16 ++++++++-------- cscs-checks/apps/espresso/espresso_check.py | 4 ++-- cscs-checks/apps/gromacs/gromacs_check.py | 12 ++++++------ cscs-checks/apps/lammps/lammps_check.py | 12 ++++++------ cscs-checks/apps/namd/namd_check.py | 16 ++++++++-------- 5 files changed, 30 insertions(+), 30 deletions(-) diff --git a/cscs-checks/apps/cp2k/cp2k_check.py b/cscs-checks/apps/cp2k/cp2k_check.py index feda362e7a..3b4269754f 100644 --- a/cscs-checks/apps/cp2k/cp2k_check.py +++ 
b/cscs-checks/apps/cp2k/cp2k_check.py @@ -43,11 +43,11 @@ def __init__(self): for s in ['small', 'large'] for v in ['maint', 'prod'])) class Cp2kCpuCheck(Cp2kCheck): - def __init__(self, size, variant): + def __init__(self, scale, variant): super().__init__() - self.descr = 'CP2K CPU check (version: %s, %s)' % (size, variant) + self.descr = 'CP2K CPU check (version: %s, %s)' % (scale, variant) self.valid_systems = ['daint:mc'] - if size == 'small': + if scale == 'small': self.valid_systems += ['dom:mc'] self.num_tasks = 216 else: @@ -75,7 +75,7 @@ def __init__(self, size, variant): } } - self.reference = references[variant][size] + self.reference = references[variant][scale] self.tags |= {'maintenance' if variant == 'maint' else 'production'} @@ -83,14 +83,14 @@ def __init__(self, size, variant): for s in ['small', 'large'] for v in ['maint', 'prod'])) class Cp2kGpuCheck(Cp2kCheck): - def __init__(self, size, variant): + def __init__(self, scale, variant): super().__init__() - self.descr = 'CP2K GPU check (version: %s, %s)' % (size, variant) + self.descr = 'CP2K GPU check (version: %s, %s)' % (scale, variant) self.valid_systems = ['daint:gpu'] self.variables = {'CRAY_CUDA_MPS': '1'} self.modules = ['CP2K'] self.num_gpus_per_node = 1 - if size == 'small': + if scale == 'small': self.valid_systems += ['dom:gpu'] self.num_tasks = 72 else: @@ -117,5 +117,5 @@ def __init__(self, size, variant): } } } - self.reference = references[variant][size] + self.reference = references[variant][scale] self.tags |= {'maintenance' if variant == 'maint' else 'production'} diff --git a/cscs-checks/apps/espresso/espresso_check.py b/cscs-checks/apps/espresso/espresso_check.py index b339e3b2ef..4cacf2934f 100644 --- a/cscs-checks/apps/espresso/espresso_check.py +++ b/cscs-checks/apps/espresso/espresso_check.py @@ -7,7 +7,7 @@ @rfm.required_version('>=2.16') @rfm.parameterized_test(['small'], ['large']) class QECheck(rfm.RunOnlyRegressionTest): - def __init__(self, size): + def 
__init__(self, scale): super().__init__() self.descr = 'Quantum Espresso CPU check' self.maintainers = ['AK', 'LM'] @@ -20,7 +20,7 @@ def __init__(self, size): self.modules = ['QuantumESPRESSO'] self.executable = 'pw.x' self.executable_opts = ['-in', 'ausurf.in'] - if size == 'small': + if scale == 'small': self.valid_systems += ['dom:mc'] self.num_tasks = 216 self.num_tasks_per_node = 36 diff --git a/cscs-checks/apps/gromacs/gromacs_check.py b/cscs-checks/apps/gromacs/gromacs_check.py index 157b51226d..b7291ab011 100644 --- a/cscs-checks/apps/gromacs/gromacs_check.py +++ b/cscs-checks/apps/gromacs/gromacs_check.py @@ -50,7 +50,7 @@ def __init__(self, output_file): for s in ['small', 'large'] for v in ['prod', 'maint'])) class GromacsGPUCheck(GromacsBaseCheck): - def __init__(self, size, variant): + def __init__(self, scale, variant): super().__init__('md.log') self.valid_systems = ['daint:gpu'] self.descr = 'GROMACS GPU check' @@ -58,7 +58,7 @@ def __init__(self, size, variant): '-s herflat.tpr'] self.variables = {'CRAY_CUDA_MPS': '1'} self.num_gpus_per_node = 1 - if size == 'small': + if scale == 'small': self.valid_systems += ['dom:gpu'] self.num_tasks = 72 self.num_tasks_per_node = 12 @@ -86,7 +86,7 @@ def __init__(self, size, variant): } }, } - self.reference = references[variant][size] + self.reference = references[variant][scale] self.tags |= {'maintenance' if variant == 'maint' else 'production'} @@ -95,14 +95,14 @@ def __init__(self, size, variant): for s in ['small', 'large'] for v in ['prod'])) class GromacsCPUCheck(GromacsBaseCheck): - def __init__(self, size, variant): + def __init__(self, scale, variant): super().__init__('md.log') self.valid_systems = ['daint:mc'] self.descr = 'GROMACS CPU check' self.executable_opts = ['mdrun', '-dlb yes', '-ntomp 1', '-npme -1', '-nb cpu', '-s herflat.tpr'] - if size == 'small': + if scale == 'small': self.valid_systems += ['dom:mc'] self.num_tasks = 216 self.num_tasks_per_node = 36 @@ -131,7 +131,7 @@ def 
__init__(self, size, variant): } }, } - self.reference = references[variant][size] + self.reference = references[variant][scale] self.tags |= {'maintenance' if variant == 'maint' else 'production'} diff --git a/cscs-checks/apps/lammps/lammps_check.py b/cscs-checks/apps/lammps/lammps_check.py index 08e0e08fc3..5321767f08 100644 --- a/cscs-checks/apps/lammps/lammps_check.py +++ b/cscs-checks/apps/lammps/lammps_check.py @@ -42,14 +42,14 @@ def __init__(self): for s in ['small', 'large'] for v in ['prod', 'maint'])) class LAMMPSGPUCheck(LAMMPSBaseCheck): - def __init__(self, size, variant): + def __init__(self, scale, variant): super().__init__() self.valid_systems = ['daint:gpu'] self.executable = 'lmp_mpi' self.executable_opts = ['-sf gpu', '-pk gpu 1', '-in in.lj.gpu'] self.variables = {'CRAY_CUDA_MPS': '1'} self.num_gpus_per_node = 1 - if size == 'small': + if scale == 'small': self.valid_systems += ['dom:gpu'] self.num_tasks = 12 self.num_tasks_per_node = 2 @@ -77,7 +77,7 @@ def __init__(self, size, variant): } }, } - self.reference = references[variant][size] + self.reference = references[variant][scale] self.tags |= {'maintenance' if variant == 'maint' else 'production'} @@ -86,12 +86,12 @@ def __init__(self, size, variant): for s in ['small', 'large'] for v in ['prod'])) class LAMMPSCPUCheck(LAMMPSBaseCheck): - def __init__(self, size, variant): + def __init__(self, scale, variant): super().__init__() self.valid_systems = ['daint:mc'] self.executable = 'lmp_omp' self.executable_opts = ['-sf omp', '-pk omp 1', '-in in.lj.cpu'] - if size == 'small': + if scale == 'small': self.valid_systems += ['dom:mc'] self.num_tasks = 216 self.num_tasks_per_node = 36 @@ -110,5 +110,5 @@ def __init__(self, size, variant): } }, } - self.reference = references[variant][size] + self.reference = references[variant][scale] self.tags |= {'maintenance' if variant == 'maint' else 'production'} diff --git a/cscs-checks/apps/namd/namd_check.py b/cscs-checks/apps/namd/namd_check.py index 
fa4904475f..2a75ac0efd 100644 --- a/cscs-checks/apps/namd/namd_check.py +++ b/cscs-checks/apps/namd/namd_check.py @@ -5,7 +5,7 @@ class NamdBaseCheck(rfm.RunOnlyRegressionTest): - def __init__(self, arch, size, variant): + def __init__(self, arch, scale, variant): super().__init__() self.descr = 'NAMD check (%s, %s)' % (arch, variant) self.valid_prog_environs = ['PrgEnv-intel'] @@ -18,7 +18,7 @@ def __init__(self, arch, size, variant): self.use_multithreading = True self.num_tasks_per_core = 2 - if size == 'small': + if scale == 'small': self.num_tasks = 6 self.num_tasks_per_node = 1 else: @@ -58,14 +58,14 @@ def __init__(self, arch, size, variant): for s in ['small', 'large'] for v in ['maint', 'prod'])) class NamdGPUCheck(NamdBaseCheck): - def __init__(self, size, variant): - super().__init__('gpu', size, variant) + def __init__(self, scale, variant): + super().__init__('gpu', scale, variant) self.valid_systems = ['daint:gpu'] self.executable_opts = ['+idlepoll', '+ppn 23', 'stmv.namd'] self.num_cpus_per_task = 24 self.num_gpus_per_node = 1 self.tags |= {'maintenance' if variant == 'maint' else 'production'} - if size == 'small': + if scale == 'small': self.valid_systems += ['dom:gpu'] self.reference = { 'dom:gpu': {'days_ns': (0.18, None, 0.05, 'days/ns')}, @@ -82,12 +82,12 @@ def __init__(self, size, variant): for s in ['small', 'large'] for v in ['maint', 'prod'])) class NamdCPUCheck(NamdBaseCheck): - def __init__(self, size, variant): - super().__init__('cpu', size, variant) + def __init__(self, scale, variant): + super().__init__('cpu', scale, variant) self.valid_systems = ['daint:mc'] self.executable_opts = ['+idlepoll', '+ppn 71', 'stmv.namd'] self.num_cpus_per_task = 72 - if size == 'small': + if scale == 'small': self.valid_systems += ['dom:mc'] self.reference = { 'dom:mc': {'days_ns': (0.57, None, 0.05, 'days/ns')},