From aec66e3485b5f459c7c69ab137069c90426ea346 Mon Sep 17 00:00:00 2001 From: jgp Date: Sat, 12 Dec 2020 18:11:07 +0100 Subject: [PATCH 1/5] adding slowest node --- .../gpu/gpu_burn/gpu_burn_test.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/cscs-checks/microbenchmarks/gpu/gpu_burn/gpu_burn_test.py b/cscs-checks/microbenchmarks/gpu/gpu_burn/gpu_burn_test.py index 654a230b35..2428fdea0e 100644 --- a/cscs-checks/microbenchmarks/gpu/gpu_burn/gpu_burn_test.py +++ b/cscs-checks/microbenchmarks/gpu/gpu_burn/gpu_burn_test.py @@ -3,6 +3,8 @@ # # SPDX-License-Identifier: BSD-3-Clause +import os + import reframe as rfm import reframe.utility.sanity as sn @@ -133,3 +135,27 @@ def set_gpus_per_node(self): self.num_gpus_per_node = 3 else: self.num_gpus_per_node = 1 + + @rfm.run_before('performance') + def report_smallest_node(self): + regex = r'\[(\S+)\] GPU\s+\d\(OK\): (\d+) GF/s' + rptf = os.path.join(self.stagedir, sn.evaluate(self.stdout)) + self.nids = sn.extractall(regex, rptf, 1) + self.flops = sn.extractall(regex, rptf, 2, float) + index = -1 + flops_min = sn.min(self.flops) + for ii in range(len(sn.evaluate(self.flops))): + if self.flops[ii] == flops_min: + index = ii + break + + self.unit = f'GF/s ({self.nids[index]})' + self.perf_patterns['smallest_flops'] = flops_min + self.reference['dom:gpu:smallest_flops'] = (0, None, None, self.unit) + self.reference['daint:gpu'] = (0, None, None, self.unit) + self.reference['arolla:cn'] = (0, None, None, self.unit) + self.reference['tsa:cn'] = (0, None, None, self.unit) + self.reference['ault:amda100'] = (0, None, None, self.unit) + self.reference['ault:amdv100'] = (0, None, None, self.unit) + self.reference['ault:intelv100'] = (0, None, None, self.unit) + self.reference['ault:amdvega'] = (0, None, None, self.unit) From e3f9de083ed935a54ecb97665e9fb01de976cded Mon Sep 17 00:00:00 2001 From: jgp Date: Tue, 15 Dec 2020 08:22:56 +0100 Subject: [PATCH 2/5] fix for review --- 
.../gpu/gpu_burn/gpu_burn_test.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/cscs-checks/microbenchmarks/gpu/gpu_burn/gpu_burn_test.py b/cscs-checks/microbenchmarks/gpu/gpu_burn/gpu_burn_test.py index 2428fdea0e..37d9c54d54 100644 --- a/cscs-checks/microbenchmarks/gpu/gpu_burn/gpu_burn_test.py +++ b/cscs-checks/microbenchmarks/gpu/gpu_burn/gpu_burn_test.py @@ -4,6 +4,7 @@ # SPDX-License-Identifier: BSD-3-Clause import os +import sys import reframe as rfm import reframe.utility.sanity as sn @@ -142,20 +143,19 @@ def report_smallest_node(self): rptf = os.path.join(self.stagedir, sn.evaluate(self.stdout)) self.nids = sn.extractall(regex, rptf, 1) self.flops = sn.extractall(regex, rptf, 2, float) - index = -1 - flops_min = sn.min(self.flops) - for ii in range(len(sn.evaluate(self.flops))): - if self.flops[ii] == flops_min: - index = ii - break + flops_min, index = sys.float_info.max, None + for i, flops in enumerate(self.flops): + if flops < flops_min: + flops_min, index = flops, i self.unit = f'GF/s ({self.nids[index]})' + unit = (0, None, None, self.unit) self.perf_patterns['smallest_flops'] = flops_min - self.reference['dom:gpu:smallest_flops'] = (0, None, None, self.unit) - self.reference['daint:gpu'] = (0, None, None, self.unit) - self.reference['arolla:cn'] = (0, None, None, self.unit) - self.reference['tsa:cn'] = (0, None, None, self.unit) - self.reference['ault:amda100'] = (0, None, None, self.unit) - self.reference['ault:amdv100'] = (0, None, None, self.unit) - self.reference['ault:intelv100'] = (0, None, None, self.unit) - self.reference['ault:amdvega'] = (0, None, None, self.unit) + self.reference['dom:gpu:smallest_flops'] = unit + self.reference['daint:gpu:smallest_flops'] = unit + self.reference['arolla:cn:smallest_flops'] = unit + self.reference['tsa:cn:smallest_flops'] = unit + self.reference['ault:amda100:smallest_flops'] = unit + self.reference['ault:amdv100:smallest_flops'] = unit + 
self.reference['ault:intelv100:smallest_flops'] = unit + self.reference['ault:amdvega:smallest_flops'] = unit From c0b6cca68579b51e9495287c20525085e4d1817c Mon Sep 17 00:00:00 2001 From: jgp Date: Tue, 15 Dec 2020 10:29:43 +0100 Subject: [PATCH 3/5] fix for review --- .../gpu/gpu_burn/gpu_burn_test.py | 43 ++++++++----------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/cscs-checks/microbenchmarks/gpu/gpu_burn/gpu_burn_test.py b/cscs-checks/microbenchmarks/gpu/gpu_burn/gpu_burn_test.py index 37d9c54d54..cbad85374e 100644 --- a/cscs-checks/microbenchmarks/gpu/gpu_burn/gpu_burn_test.py +++ b/cscs-checks/microbenchmarks/gpu/gpu_burn/gpu_burn_test.py @@ -38,35 +38,35 @@ def __init__(self): self.reference = { 'dom:gpu': { - 'perf': (4115, -0.10, None, 'Gflop/s'), + # 'perf': (4115, -0.10, None, 'Gflop/s'), 'max_temp': (0, None, None, 'Celsius') }, 'daint:gpu': { - 'perf': (4115, -0.10, None, 'Gflop/s'), + # 'perf': (4115, -0.10, None, 'Gflop/s'), 'max_temp': (0, None, None, 'Celsius') }, 'arolla:cn': { - 'perf': (5861, -0.10, None, 'Gflop/s'), + # 'perf': (5861, -0.10, None, 'Gflop/s'), 'max_temp': (0, None, None, 'Celsius') }, 'tsa:cn': { - 'perf': (5861, -0.10, None, 'Gflop/s'), + # 'perf': (5861, -0.10, None, 'Gflop/s'), 'max_temp': (0, None, None, 'Celsius') }, 'ault:amda100': { - 'perf': (15000, -0.10, None, 'Gflop/s'), + # 'perf': (15000, -0.10, None, 'Gflop/s'), 'max_temp': (0, None, None, 'Celsius') }, 'ault:amdv100': { - 'perf': (5500, -0.10, None, 'Gflop/s'), + # 'perf': (5500, -0.10, None, 'Gflop/s'), 'max_temp': (0, None, None, 'Celsius') }, 'ault:intelv100': { - 'perf': (5500, -0.10, None, 'Gflop/s'), + # 'perf': (5500, -0.10, None, 'Gflop/s'), 'max_temp': (0, None, None, 'Celsius') }, 'ault:amdvega': { - 'perf': (3450, -0.10, None, 'Gflop/s'), + # 'perf': (3450, -0.10, None, 'Gflop/s'), 'max_temp': (0, None, None, 'Celsius') }, } @@ -138,24 +138,19 @@ def set_gpus_per_node(self): self.num_gpus_per_node = 1 @rfm.run_before('performance') - 
def report_smallest_node(self): + def report_nid_with_smallest_flops(self): regex = r'\[(\S+)\] GPU\s+\d\(OK\): (\d+) GF/s' rptf = os.path.join(self.stagedir, sn.evaluate(self.stdout)) self.nids = sn.extractall(regex, rptf, 1) self.flops = sn.extractall(regex, rptf, 2, float) - flops_min, index = sys.float_info.max, None - for i, flops in enumerate(self.flops): - if flops < flops_min: - flops_min, index = flops, i - + # find index of smallest flops: + index = self.flops.evaluate().index(min(self.flops)) self.unit = f'GF/s ({self.nids[index]})' - unit = (0, None, None, self.unit) - self.perf_patterns['smallest_flops'] = flops_min - self.reference['dom:gpu:smallest_flops'] = unit - self.reference['daint:gpu:smallest_flops'] = unit - self.reference['arolla:cn:smallest_flops'] = unit - self.reference['tsa:cn:smallest_flops'] = unit - self.reference['ault:amda100:smallest_flops'] = unit - self.reference['ault:amdv100:smallest_flops'] = unit - self.reference['ault:intelv100:smallest_flops'] = unit - self.reference['ault:amdvega:smallest_flops'] = unit + self.reference['dom:gpu:perf'] = (4115, -0.10, None, self.unit) + self.reference['daint:gpu:perf'] = (4115, -0.10, None, self.unit) + self.reference['arolla:cn:perf'] = (5861, -0.10, None, self.unit) + self.reference['tsa:cn:perf'] = (5861, -0.10, None, self.unit) + self.reference['ault:amda100:perf'] = (15000, -0.10, None, self.unit) + self.reference['ault:amdv100:perf'] = (5500, -0.10, None, self.unit) + self.reference['ault:intelv100:perf'] = (5500, -0.10, None, self.unit) + self.reference['ault:amdvega:perf'] = (3450, -0.10, None, self.unit) From 538dbbf789241086f6472beaccef0476811043b8 Mon Sep 17 00:00:00 2001 From: jgp Date: Tue, 15 Dec 2020 10:42:19 +0100 Subject: [PATCH 4/5] removing unused import --- cscs-checks/microbenchmarks/gpu/gpu_burn/gpu_burn_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cscs-checks/microbenchmarks/gpu/gpu_burn/gpu_burn_test.py 
b/cscs-checks/microbenchmarks/gpu/gpu_burn/gpu_burn_test.py index cbad85374e..42fa7c5d26 100644 --- a/cscs-checks/microbenchmarks/gpu/gpu_burn/gpu_burn_test.py +++ b/cscs-checks/microbenchmarks/gpu/gpu_burn/gpu_burn_test.py @@ -4,7 +4,6 @@ # SPDX-License-Identifier: BSD-3-Clause import os -import sys import reframe as rfm import reframe.utility.sanity as sn From e7749c2484799decf02172298dccb4c2802ed085 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Wed, 16 Dec 2020 22:30:37 +0100 Subject: [PATCH 5/5] Enhance the GPU burn test --- .../gpu/gpu_burn/gpu_burn_test.py | 43 ++++++++----------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/cscs-checks/microbenchmarks/gpu/gpu_burn/gpu_burn_test.py b/cscs-checks/microbenchmarks/gpu/gpu_burn/gpu_burn_test.py index 42fa7c5d26..e638f318d5 100644 --- a/cscs-checks/microbenchmarks/gpu/gpu_burn/gpu_burn_test.py +++ b/cscs-checks/microbenchmarks/gpu/gpu_burn/gpu_burn_test.py @@ -33,41 +33,35 @@ def __init__(self): r'(?P<temp>\S*) Celsius') self.perf_patterns = { 'perf': sn.min(sn.extractall(patt, self.stdout, 'perf', float)), + 'temp': sn.max(sn.extractall(patt, self.stdout, 'temp', float)), } self.reference = { 'dom:gpu': { - # 'perf': (4115, -0.10, None, 'Gflop/s'), - 'max_temp': (0, None, None, 'Celsius') + 'perf': (4115, -0.10, None, 'Gflop/s'), }, 'daint:gpu': { - # 'perf': (4115, -0.10, None, 'Gflop/s'), - 'max_temp': (0, None, None, 'Celsius') + 'perf': (4115, -0.10, None, 'Gflop/s'), }, 'arolla:cn': { - # 'perf': (5861, -0.10, None, 'Gflop/s'), - 'max_temp': (0, None, None, 'Celsius') + 'perf': (5861, -0.10, None, 'Gflop/s'), }, 'tsa:cn': { - # 'perf': (5861, -0.10, None, 'Gflop/s'), - 'max_temp': (0, None, None, 'Celsius') + 'perf': (5861, -0.10, None, 'Gflop/s'), }, 'ault:amda100': { - # 'perf': (15000, -0.10, None, 'Gflop/s'), - 'max_temp': (0, None, None, 'Celsius') + 'perf': (15000, -0.10, None, 'Gflop/s'), }, 'ault:amdv100': { - # 'perf': (5500, -0.10, None, 'Gflop/s'), - 'max_temp': (0, None, 
None, 'Celsius') + 'perf': (5500, -0.10, None, 'Gflop/s'), }, 'ault:intelv100': { - # 'perf': (5500, -0.10, None, 'Gflop/s'), - 'max_temp': (0, None, None, 'Celsius') + 'perf': (5500, -0.10, None, 'Gflop/s'), }, 'ault:amdvega': { - # 'perf': (3450, -0.10, None, 'Gflop/s'), - 'max_temp': (0, None, None, 'Celsius') + 'perf': (3450, -0.10, None, 'Gflop/s'), }, + '*': {'temp': (0, None, None, 'degC')} } self.maintainers = ['AJ', 'TM'] @@ -142,14 +136,11 @@ def report_nid_with_smallest_flops(self): rptf = os.path.join(self.stagedir, sn.evaluate(self.stdout)) self.nids = sn.extractall(regex, rptf, 1) self.flops = sn.extractall(regex, rptf, 2, float) - # find index of smallest flops: + + # Find index of smallest flops and update reference dictionary to + # include our patched units index = self.flops.evaluate().index(min(self.flops)) - self.unit = f'GF/s ({self.nids[index]})' - self.reference['dom:gpu:perf'] = (4115, -0.10, None, self.unit) - self.reference['daint:gpu:perf'] = (4115, -0.10, None, self.unit) - self.reference['arolla:cn:perf'] = (5861, -0.10, None, self.unit) - self.reference['tsa:cn:perf'] = (5861, -0.10, None, self.unit) - self.reference['ault:amda100:perf'] = (15000, -0.10, None, self.unit) - self.reference['ault:amdv100:perf'] = (5500, -0.10, None, self.unit) - self.reference['ault:intelv100:perf'] = (5500, -0.10, None, self.unit) - self.reference['ault:amdvega:perf'] = (3450, -0.10, None, self.unit) + unit = f'GF/s ({self.nids[index]})' + for key, ref in self.reference.items(): + if not key.endswith(':temp'): + self.reference[key] = (*ref[:3], unit)