From e971591d68325bff35c74e9557f3ad6a85ec0aed Mon Sep 17 00:00:00 2001 From: ajocksch Date: Mon, 26 Aug 2019 17:42:06 +0200 Subject: [PATCH 1/5] WIP: flexible alltoall OSU benchmark --- cscs-checks/microbenchmarks/osu/osu_tests.py | 118 ++++++++----------- 1 file changed, 49 insertions(+), 69 deletions(-) diff --git a/cscs-checks/microbenchmarks/osu/osu_tests.py b/cscs-checks/microbenchmarks/osu/osu_tests.py index 04b414283f..dfcb51e2cd 100644 --- a/cscs-checks/microbenchmarks/osu/osu_tests.py +++ b/cscs-checks/microbenchmarks/osu/osu_tests.py @@ -3,9 +3,9 @@ @rfm.required_version('>=2.16') -@rfm.parameterized_test(['production']) +@rfm.simple_test class AlltoallTest(rfm.RegressionTest): - def __init__(self, variant): + def __init__(self): super().__init__() self.strict_check = False self.valid_systems = ['daint:gpu', 'dom:gpu'] @@ -25,13 +25,13 @@ def __init__(self, variant): 'latency': sn.extractsingle(r'^8\s+(?P\S+)', self.stdout, 'latency', float) } - self.tags = {variant, 'benchmark'} + self.tags = {'production', 'benchmark'} self.reference = { 'dom:gpu': { - 'latency': (8.23, None, 0.1, 'us') + 'latency': (1.31, None, 0.1, 'us') }, 'daint:gpu': { - 'latency': (20.73, None, 2.0, 'us') + 'latency': (1.31, None, 2.0, 'us') }, '*': { 'latency': (0, None, None, 'us') @@ -39,10 +39,7 @@ def __init__(self, variant): } self.num_tasks_per_node = 1 self.num_gpus_per_node = 1 - if self.current_system.name == 'daint': - self.num_tasks = 16 - else: - self.num_tasks = 6 + self.num_tasks = 0 self.extra_resources = { 'switches': { @@ -51,6 +48,21 @@ def __init__(self, variant): } + @property + @sn.sanity_function + def num_tasks_assigned(self): + return self.job.num_tasks + + def setup(self, partition, environ, **job_opts): + num_nodes = self.num_tasks_assigned / self.num_tasks_per_node + self.perf_patterns = { + 'latency': sn.extractsingle(r'^8\s+(?P\S+)', + self.stdout, 'latency', float) / num_nodes + } + + super().setup(partition, environ, **job_opts) + + @rfm.simple_test class FlexAlltoallTest(rfm.RegressionTest): def __init__(self): @@ -76,14 +88,12 @@ def __init__(self): @rfm.required_version('>=2.16') -@rfm.parameterized_test(['small'], ['large']) +@rfm.simple_test class AllreduceTest(rfm.RegressionTest): - def __init__(self, variant): + def __init__(self): super().__init__() self.strict_check = False - self.valid_systems = ['daint:gpu', 'daint:mc'] - if variant == 'small': - self.valid_systems += ['dom:gpu', 'dom:mc'] + self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc'] self.descr = 'Allreduce OSU microbenchmark' self.build_system = 'Make' @@ -95,40 +105,22 @@ def __init__(self, variant): self.valid_prog_environs = ['PrgEnv-gnu'] self.maintainers = ['RS', 'VK'] self.sanity_patterns = sn.assert_found(r'^8', self.stdout) - self.perf_patterns = { - 'latency': sn.extractsingle(r'^8\s+(?P\S+)', - self.stdout, 'latency', float) - } self.tags = {'production', 'benchmark'} - if variant == 'small': - self.num_tasks = 6 - self.reference = { - 'dom:gpu': { - 'latency': (6.0, None, 0.10, 'us') - }, - 'daint:gpu': { - 'latency': (7.81, None, 0.25, 'us') - }, - 'daint:mc': { - 'latency': (8.79, None, 0.25, 'us') - }, - '*': { - 'latency': (0, None, None, 'us') - } - } - else: - self.num_tasks = 16 - self.reference = { - 'daint:gpu': { - 'latency': (16.87, None, 0.40, 'us') - }, - 'daint:mc': { - 'latency': (10.85, None, 0.20, 'us') - }, - '*': { - 'latency': (0, None, None, 'us') - } + self.num_tasks = 0 + self.reference = { + 'dom:gpu': { + 'latency': (1.0, None, 0.10, 'us') + }, + 'daint:gpu': { + 'latency': (1.302, None, 0.40, 'us') + }, + 'daint:mc': { + 'latency': (1.456, None, 0.20, 'us') + }, + '*': { + 'latency': (0, None, None, 'us') } + } self.num_tasks_per_node = 1 self.num_gpus_per_node = 1 @@ -138,32 +130,20 @@ def __init__(self, variant): } } + @property + @sn.sanity_function + def num_tasks_assigned(self): + return self.job.num_tasks -# FIXME: This test is obsolete; it is kept only for reference. -@rfm.parameterized_test(*({'num_tasks': i} for i in range(2, 10, 2))) -class AlltoallMonchAcceptanceTest(AlltoallTest): - def __init__(self, num_tasks): - super().__init__('monch_acceptance') - self.valid_systems = ['monch:compute'] - self.num_tasks = num_tasks - reference_by_node = { - 2: { - 'perf': (2.71, None, 0.1) - }, - 4: { - 'perf': (3.75, None, 0.1) - }, - 6: { - 'perf': (6.28, None, 0.1) - }, - 8: { - 'perf': (8.15, None, 0.1) - }, - } - self.reference = { - 'monch:compute': reference_by_node[self.num_tasks] + def setup(self, partition, environ, **job_opts): + num_nodes = self.num_tasks_assigned / self.num_tasks_per_node + self.perf_patterns = { + 'latency': sn.extractsingle(r'^8\s+(?P\S+)', + self.stdout, 'latency', float) / num_nodes } + super().setup(partition, environ, **job_opts) + class P2PBaseTest(rfm.RegressionTest): def __init__(self): From f98fe29b9f4847d6b2c563f46a90ee226bc08ab3 Mon Sep 17 00:00:00 2001 From: ajocksch Date: Mon, 26 Aug 2019 17:52:00 +0200 Subject: [PATCH 2/5] pep8 --- cscs-checks/microbenchmarks/osu/osu_tests.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cscs-checks/microbenchmarks/osu/osu_tests.py b/cscs-checks/microbenchmarks/osu/osu_tests.py index dfcb51e2cd..8da0ee9b14 100644 --- a/cscs-checks/microbenchmarks/osu/osu_tests.py +++ b/cscs-checks/microbenchmarks/osu/osu_tests.py @@ -47,7 +47,6 @@ def __init__(self): } } - @property @sn.sanity_function def num_tasks_assigned(self): @@ -57,7 +56,7 @@ def setup(self, partition, environ, **job_opts): num_nodes = self.num_tasks_assigned / self.num_tasks_per_node self.perf_patterns = { 'latency': sn.extractsingle(r'^8\s+(?P\S+)', - self.stdout, 'latency', float) / num_nodes + self.stdout, 'latency', float) / num_nodes } super().setup(partition, environ, **job_opts) @@ -139,7 +138,7 @@ def setup(self, partition, environ, **job_opts): num_nodes = self.num_tasks_assigned / self.num_tasks_per_node self.perf_patterns = { 'latency': sn.extractsingle(r'^8\s+(?P\S+)', - self.stdout, 'latency', float) / num_nodes + self.stdout, 'latency', float) / num_nodes } super().setup(partition, environ, **job_opts) From df57a82a5bfdec61660fb2ef2da43cebe6d0f830 Mon Sep 17 00:00:00 2001 From: ajocksch Date: Fri, 6 Sep 2019 11:22:27 +0200 Subject: [PATCH 3/5] perf_patterns in flexible alltoall --- cscs-checks/microbenchmarks/osu/osu_tests.py | 127 ++++++++++++------- 1 file changed, 79 insertions(+), 48 deletions(-) diff --git a/cscs-checks/microbenchmarks/osu/osu_tests.py b/cscs-checks/microbenchmarks/osu/osu_tests.py index 8da0ee9b14..f0991fd627 100644 --- a/cscs-checks/microbenchmarks/osu/osu_tests.py +++ b/cscs-checks/microbenchmarks/osu/osu_tests.py @@ -3,9 +3,9 @@ @rfm.required_version('>=2.16') -@rfm.simple_test +@rfm.parameterized_test(['production']) class AlltoallTest(rfm.RegressionTest): - def __init__(self): + def __init__(self, variant): super().__init__() self.strict_check = False self.valid_systems = ['daint:gpu', 'dom:gpu'] @@ -25,13 +25,13 @@ def __init__(self): 'latency': sn.extractsingle(r'^8\s+(?P\S+)', self.stdout, 'latency', float) } - self.tags = {'production', 'benchmark'} + self.tags = {variant, 'benchmark'} self.reference = { 'dom:gpu': { - 'latency': (1.31, None, 0.1, 'us') + 'latency': (8.23, None, 0.1, 'us') }, 'daint:gpu': { - 'latency': (1.31, None, 2.0, 'us') + 'latency': (20.73, None, 2.0, 'us') }, '*': { 'latency': (0, None, None, 'us') @@ -39,7 +39,10 @@ def __init__(self): } self.num_tasks_per_node = 1 self.num_gpus_per_node = 1 - self.num_tasks = 0 + if self.current_system.name == 'daint': + self.num_tasks = 16 + else: + self.num_tasks = 6 self.extra_resources = { 'switches': { @@ -47,21 +50,8 @@ def __init__(self): } } - @property - @sn.sanity_function - def num_tasks_assigned(self): - return self.job.num_tasks - - def setup(self, partition, environ, **job_opts): - num_nodes = self.num_tasks_assigned / self.num_tasks_per_node - self.perf_patterns = { - 'latency': sn.extractsingle(r'^8\s+(?P\S+)', - self.stdout, 'latency', float) / num_nodes - } - - super().setup(partition, environ, **job_opts) - +@rfm.required_version('>=2.18') @rfm.simple_test class FlexAlltoallTest(rfm.RegressionTest): def __init__(self): @@ -83,16 +73,27 @@ def __init__(self): self.num_tasks_per_node = 1 self.num_tasks = 0 self.sanity_patterns = sn.assert_found(r'^1048576', self.stdout) + self.perf_patterns = { + 'latency': sn.extractsingle(r'^8\s+(?P\S+)', + self.stdout, 'latency', float) + } + self.reference = { + '*': { + 'latency': (0, None, None, 'us') + }, + } self.tags = {'diagnostic', 'ops', 'benchmark'} @rfm.required_version('>=2.16') -@rfm.simple_test +@rfm.parameterized_test(['small'], ['large']) class AllreduceTest(rfm.RegressionTest): - def __init__(self): + def __init__(self, variant): super().__init__() self.strict_check = False - self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc'] + self.valid_systems = ['daint:gpu', 'daint:mc'] + if variant == 'small': + self.valid_systems += ['dom:gpu', 'dom:mc'] self.descr = 'Allreduce OSU microbenchmark' self.build_system = 'Make' @@ -104,22 +105,40 @@ def __init__(self): self.valid_prog_environs = ['PrgEnv-gnu'] self.maintainers = ['RS', 'VK'] self.sanity_patterns = sn.assert_found(r'^8', self.stdout) + self.perf_patterns = { + 'latency': sn.extractsingle(r'^8\s+(?P\S+)', + self.stdout, 'latency', float) + } self.tags = {'production', 'benchmark'} - self.num_tasks = 0 - self.reference = { - 'dom:gpu': { - 'latency': (1.0, None, 0.10, 'us') - }, - 'daint:gpu': { - 'latency': (1.302, None, 0.40, 'us') - }, - 'daint:mc': { - 'latency': (1.456, None, 0.20, 'us') - }, - '*': { - 'latency': (0, None, None, 'us') + if variant == 'small': + self.num_tasks = 6 + self.reference = { + 'dom:gpu': { + 'latency': (6.0, None, 0.10, 'us') + }, + 'daint:gpu': { + 'latency': (7.81, None, 0.25, 'us') + }, + 'daint:mc': { + 'latency': (8.79, None, 0.25, 'us') + }, + '*': { + 'latency': (0, None, None, 'us') + } + } + else: + self.num_tasks = 16 + self.reference = { + 'daint:gpu': { + 'latency': (16.87, None, 0.40, 'us') + }, + 'daint:mc': { + 'latency': (10.85, None, 0.20, 'us') + }, + '*': { + 'latency': (0, None, None, 'us') + } } - } self.num_tasks_per_node = 1 self.num_gpus_per_node = 1 @@ -129,19 +148,31 @@ def __init__(self): } } - @property - @sn.sanity_function - def num_tasks_assigned(self): - return self.job.num_tasks - def setup(self, partition, environ, **job_opts): - num_nodes = self.num_tasks_assigned / self.num_tasks_per_node - self.perf_patterns = { - 'latency': sn.extractsingle(r'^8\s+(?P\S+)', - self.stdout, 'latency', float) / num_nodes +# FIXME: This test is obsolete; it is kept only for reference. +@rfm.parameterized_test(*({'num_tasks': i} for i in range(2, 10, 2))) +class AlltoallMonchAcceptanceTest(AlltoallTest): + def __init__(self, num_tasks): + super().__init__('monch_acceptance') + self.valid_systems = ['monch:compute'] + self.num_tasks = num_tasks + reference_by_node = { + 2: { + 'perf': (2.71, None, 0.1) + }, + 4: { + 'perf': (3.75, None, 0.1) + }, + 6: { + 'perf': (6.28, None, 0.1) + }, + 8: { + 'perf': (8.15, None, 0.1) + }, + } + self.reference = { + 'monch:compute': reference_by_node[self.num_tasks] } - - super().setup(partition, environ, **job_opts) class P2PBaseTest(rfm.RegressionTest): From d25e722cc9687c714ba0ec6b8b94fba6370792af Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Sat, 14 Sep 2019 13:47:13 +0200 Subject: [PATCH 4/5] Minor fixes --- cscs-checks/microbenchmarks/osu/osu_tests.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cscs-checks/microbenchmarks/osu/osu_tests.py b/cscs-checks/microbenchmarks/osu/osu_tests.py index f0991fd627..f455671c3d 100644 --- a/cscs-checks/microbenchmarks/osu/osu_tests.py +++ b/cscs-checks/microbenchmarks/osu/osu_tests.py @@ -51,11 +51,10 @@ def __init__(self, variant): } -@rfm.required_version('>=2.18') +@rfm.required_version('>=2.19') @rfm.simple_test class FlexAlltoallTest(rfm.RegressionTest): def __init__(self): - super().__init__() self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc', 'kesch:cn', 'kesch:pn', 'leone:normal'] From c064c745cede7878d66171008ee4357ab59f51ad Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Sat, 14 Sep 2019 14:21:13 +0200 Subject: [PATCH 5/5] Update syntax of tests --- cscs-checks/microbenchmarks/osu/osu_tests.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/cscs-checks/microbenchmarks/osu/osu_tests.py b/cscs-checks/microbenchmarks/osu/osu_tests.py index f455671c3d..287353c3d5 100644 --- a/cscs-checks/microbenchmarks/osu/osu_tests.py +++ b/cscs-checks/microbenchmarks/osu/osu_tests.py @@ -2,11 +2,10 @@ import reframe.utility.sanity as sn -@rfm.required_version('>=2.16') +@rfm.required_version('>=2.19') @rfm.parameterized_test(['production']) class AlltoallTest(rfm.RegressionTest): def __init__(self, variant): - super().__init__() self.strict_check = False self.valid_systems = ['daint:gpu', 'dom:gpu'] self.descr = 'Alltoall OSU microbenchmark' @@ -84,11 +83,10 @@ def __init__(self): self.tags = {'diagnostic', 'ops', 'benchmark'} -@rfm.required_version('>=2.16') +@rfm.required_version('>=2.19') @rfm.parameterized_test(['small'], ['large']) class AllreduceTest(rfm.RegressionTest): def __init__(self, variant): - super().__init__() self.strict_check = False self.valid_systems = ['daint:gpu', 'daint:mc'] if variant == 'small': @@ -176,7 +174,6 @@ def __init__(self, num_tasks): class P2PBaseTest(rfm.RegressionTest): def __init__(self): - super().__init__() self.exclusive_access = True self.strict_check = False self.num_tasks = 2 @@ -201,7 +198,7 @@ def __init__(self): } -@rfm.required_version('>=2.16') +@rfm.required_version('>=2.19') @rfm.simple_test class P2PCPUBandwidthTest(P2PBaseTest): def __init__(self): @@ -241,7 +238,7 @@ def __init__(self): self.tags |= {'monch_acceptance'} -@rfm.required_version('>=2.16') +@rfm.required_version('>=2.19') @rfm.simple_test class P2PCPULatencyTest(P2PBaseTest): def __init__(self): @@ -281,7 +278,7 @@ def __init__(self): self.tags |= {'monch_acceptance'} -@rfm.required_version('>=2.16') +@rfm.required_version('>=2.19') @rfm.simple_test class G2GBandwidthTest(P2PBaseTest): def __init__(self): @@ -321,7 +318,7 @@ def __init__(self): self.build_system.cppflags = ['-D_ENABLE_CUDA_'] -@rfm.required_version('>=2.16') +@rfm.required_version('>=2.19') @rfm.simple_test class G2GLatencyTest(P2PBaseTest): def __init__(self):