From f90edc6f9810ee62a6c3f062f1241e0726a6a9bb Mon Sep 17 00:00:00 2001 From: jgp Date: Mon, 29 Apr 2019 11:17:08 +0200 Subject: [PATCH 1/8] ert --- .../berkeleylab-roofline.py | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py diff --git a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py new file mode 100644 index 0000000000..90f9206c4f --- /dev/null +++ b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py @@ -0,0 +1,119 @@ +import os + +import reframe as rfm +import reframe.utility.sanity as sn + + +class ErtTestBase(rfm.RegressionTest): + """ + The Empirical Roofline Tool, ERT, automatically generates roofline data. + https://bitbucket.org/berkeleylab/cs-roofline-toolkit/ + """ + def __init__(self): + super().__init__() + self.descr = 'Empirical Roofline Toolkit' + self.sourcesdir = os.path.join(self.current_system.resourcesdir, + 'roofline', 'ert') + self.build_system = 'SingleSource' + self.sourcepath = 'kernel1.c driver1.c' + self.executable = 'ert.exe' + self.build_system.ldflags = ['-O3 -fopenmp'] + self.sourcesdir = os.path.join(self.current_system.resourcesdir, + 'roofline', 'ert') + self.rpt = '%s.rpt' % self.executable + self.maintainers = ['JG'] + self.tags = {'scs'} + + def setup(self, partition, environ, **job_opts): + super().setup(partition, environ, **job_opts) + # self.job.launcher.options = ['--cpu-bind=verbose,none'] + + +@rfm.parameterized_test(*[[mpitask, flop] + for mpitask in [36, 18, 12, 9, 6, 4, 3, 2, 1] + for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, + 1024]]) +class ErtBroadwellTest(ErtTestBase): + def __init__(self, mpitask, flop): + super().__init__() + ompthread = int(36/mpitask) + self.valid_systems = ['daint:mc', 'dom:mc'] + self.valid_prog_environs = ['PrgEnv-gnu'] + self.build_system.cppflags = [ + '-DERT_ALIGN=32', + 
'-DERT_MEMORY_MAX=1073741824', + '-DERT_MPI=True', + '-DERT_OPENMP=True', + '-DERT_TRIALS_MIN=1', + '-DERT_WORKING_SET_MIN=1', + '-DERT_FLOP=%s' % flop, + ] + self.name = 'ert_FLOPS.' + '{:03d}'.format(flop) + \ + '_MPI.' + '{:03d}'.format(mpitask) + \ + '_OpenMP.' + '{:03d}'.format(ompthread) + self.time_limit = (0, 10, 0) + self.exclusive = True + self.num_tasks = mpitask + self.num_tasks_per_node = mpitask + self.num_cpus_per_task = int(ompthread) + self.num_tasks_per_core = 1 + self.use_multithreading = False + self.variables = { + 'CRAYPE_LINK_TYPE': 'dynamic', + 'OMP_NUM_THREADS': str(self.num_cpus_per_task) + } + # Reference roofline boundaries for Intel Broadwell CPU (E5-2695 v4): + GFLOPs = 469.0 + L1bw = 1788.0 + L2bw = 855.0 + L3bw = 547.0 + DRAMbw = 70.5 + # slowest job: + mpitaskm1 = 1 + flopm1 = 1024 + self.roofline_rpt = 'rpt' + if mpitask == mpitaskm1 and flop == flopm1: + self.post_run = [ + 'cat *_job.out | python2 preprocess.py > pre', + 'python2 maximum.py < pre > max', + 'python2 summary.py < max > sum', + # give enough time for all the dependent jobs to collect data + 'sleep 60', + 'cat ../ert_FLOPS*/sum |python2 roofline.py > rpt', + ] + self.sanity_patterns = sn.all([ + # --- check data type: + sn.assert_eq(sn.extractsingle( + r'^\s+(?P<prec>\w+) \* __restrict__ buf = \(\w+ \*\)' + r'malloc\(PSIZE\);', 'driver1.c', 'prec'), 'double'), + # --- check ert's roofline results: + # check GFLOPS: + sn.assert_reference(sn.extractsingle( + r'(?P<GFLOPs>\d+.\d+)\sGFLOPs EMP', self.roofline_rpt, + 'GFLOPs', float), GFLOPs, -0.1, 0.3), + # check L1 bandwidth: + sn.assert_reference(sn.extractsingle( + r'(?P<L1bw>\d+.\d+)\sL1 EMP', self.roofline_rpt, + 'L1bw', float), L1bw, -0.1, 0.3), + # check L2 bandwidth: + sn.assert_reference(sn.extractsingle( + r'(?P<L2bw>\d+.\d+)\sL2 EMP', self.roofline_rpt, + 'L2bw', float), L2bw, -0.1, 0.3), + # check L3 bandwidth: + sn.assert_reference(sn.extractsingle( + r'(?P<L3bw>\d+.\d+)\sL3 EMP', self.roofline_rpt, + 'L3bw', float), L3bw, -0.1, 0.3), + #
check DRAM bandwidth: + sn.assert_reference(sn.extractsingle( + r'(?P<DRAMbw>\d+.\d+) DRAM EMP', self.roofline_rpt, + 'DRAMbw', float), DRAMbw, -0.1, 0.3), + ]) + else: + self.post_run = [ + 'cat *_job.out | python2 preprocess.py > pre', + 'python2 maximum.py < pre > max', + 'python2 summary.py < max > sum', + ] + self.sanity_patterns = sn.assert_found('GFLOPs', 'sum') + if not mpitask == 36: + self.job.launcher.options = ['--cpu-bind=verbose,none'] From 8390374d0303dbc09a4eec80c8bb2a4c54416d5c Mon Sep 17 00:00:00 2001 From: jgp Date: Mon, 29 Apr 2019 11:22:08 +0200 Subject: [PATCH 2/8] typo --- .../tools/profiling_and_debugging/berkeleylab-roofline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py index 90f9206c4f..286a20baa0 100644 --- a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py +++ b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py @@ -107,7 +107,7 @@ def __init__(self, mpitask, flop): sn.assert_reference(sn.extractsingle( r'(?P<DRAMbw>\d+.\d+) DRAM EMP', self.roofline_rpt, 'DRAMbw', float), DRAMbw, -0.1, 0.3), - ]) + ]) else: self.post_run = [ 'cat *_job.out | python2 preprocess.py > pre', From 61d3c4703bb95b4f162397baec23922956c8dbbd Mon Sep 17 00:00:00 2001 From: jgp Date: Wed, 1 May 2019 15:45:41 +0200 Subject: [PATCH 3/8] typo --- .../tools/profiling_and_debugging/berkeleylab-roofline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py index 286a20baa0..ad78e51eed 100644 --- a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py +++ b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py @@ -26,7 +26,6 @@ def __init__(self): def setup(self, partition, environ, **job_opts): super().setup(partition, environ,
**job_opts) - # self.job.launcher.options = ['--cpu-bind=verbose,none'] @rfm.parameterized_test(*[[mpitask, flop] @@ -115,5 +114,6 @@ def __init__(self, mpitask, flop): 'python2 summary.py < max > sum', ] self.sanity_patterns = sn.assert_found('GFLOPs', 'sum') + if not mpitask == 36: self.job.launcher.options = ['--cpu-bind=verbose,none'] From aa6aacf3bac9eede7d0409eb20cc571a27f45266 Mon Sep 17 00:00:00 2001 From: jgp Date: Fri, 3 May 2019 09:02:42 +0200 Subject: [PATCH 4/8] fix for review --- .../profiling_and_debugging/berkeleylab-roofline.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py index ad78e51eed..30ad1bd36a 100644 --- a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py +++ b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py @@ -35,7 +35,7 @@ def setup(self, partition, environ, **job_opts): class ErtBroadwellTest(ErtTestBase): def __init__(self, mpitask, flop): super().__init__() - ompthread = int(36/mpitask) + ompthread = 36 // mpitask self.valid_systems = ['daint:mc', 'dom:mc'] self.valid_prog_environs = ['PrgEnv-gnu'] self.build_system.cppflags = [ @@ -47,14 +47,12 @@ def __init__(self, mpitask, flop): '-DERT_WORKING_SET_MIN=1', '-DERT_FLOP=%s' % flop, ] - self.name = 'ert_FLOPS.' + '{:03d}'.format(flop) + \ - '_MPI.' + '{:03d}'.format(mpitask) + \ - '_OpenMP.' 
+ '{:03d}'.format(ompthread) - self.time_limit = (0, 10, 0) + self.name = 'ert_FLOPS.{:04d}_MPI.{:03d}_OpenMP.{:03d}'.format( + flop, mpitask, ompthread) self.exclusive = True self.num_tasks = mpitask self.num_tasks_per_node = mpitask - self.num_cpus_per_task = int(ompthread) + self.num_cpus_per_task = ompthread self.num_tasks_per_core = 1 self.use_multithreading = False self.variables = { @@ -115,5 +113,5 @@ def __init__(self, mpitask, flop): ] self.sanity_patterns = sn.assert_found('GFLOPs', 'sum') - if not mpitask == 36: + if mpitask != 36: self.job.launcher.options = ['--cpu-bind=verbose,none'] From 2af3829ec7df5107c9b3b0c83d63331c6469eacd Mon Sep 17 00:00:00 2001 From: jgp Date: Fri, 3 May 2019 10:09:17 +0200 Subject: [PATCH 5/8] fix for review --- .../berkeleylab-roofline.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py index 30ad1bd36a..970f4e6c76 100644 --- a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py +++ b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py @@ -26,12 +26,14 @@ def __init__(self): def setup(self, partition, environ, **job_opts): super().setup(partition, environ, **job_opts) + if self.num_tasks != 36: + self.job.launcher.options = ['--cpu-bind=verbose,none'] -@rfm.parameterized_test(*[[mpitask, flop] - for mpitask in [36, 18, 12, 9, 6, 4, 3, 2, 1] - for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, - 1024]]) +@rfm.parameterized_test( + *[[mpitask, flop] + for mpitask in [36, 18, 12, 9, 6, 4, 3, 2, 1] + for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]]) class ErtBroadwellTest(ErtTestBase): def __init__(self, mpitask, flop): super().__init__() @@ -60,7 +62,7 @@ def __init__(self, mpitask, flop): 'OMP_NUM_THREADS': str(self.num_cpus_per_task) } # Reference roofline boundaries for Intel Broadwell CPU (E5-2695 v4): - GFLOPs = 
469.0 + GFLOPs = 945.0 L1bw = 1788.0 L2bw = 855.0 L3bw = 547.0 @@ -87,7 +89,7 @@ def __init__(self, mpitask, flop): # check GFLOPS: sn.assert_reference(sn.extractsingle( r'(?P<GFLOPs>\d+.\d+)\sGFLOPs EMP', self.roofline_rpt, - 'GFLOPs', float), GFLOPs, -0.1, 0.3), + 'GFLOPs', float), GFLOPs, -0.1, 0.5), # check L1 bandwidth: sn.assert_reference(sn.extractsingle( r'(?P<L1bw>\d+.\d+)\sL1 EMP', self.roofline_rpt, @@ -112,6 +114,3 @@ def __init__(self, mpitask, flop): 'python2 summary.py < max > sum', ] self.sanity_patterns = sn.assert_found('GFLOPs', 'sum') - - if mpitask != 36: - self.job.launcher.options = ['--cpu-bind=verbose,none'] From 1e342c417fbb66f7bb1e2f7f745ffb49cbea2c35 Mon Sep 17 00:00:00 2001 From: jgp Date: Mon, 6 May 2019 18:11:19 +0200 Subject: [PATCH 6/8] fix for review --- .../berkeleylab-roofline.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py index 970f4e6c76..0c4f9cbeb8 100644 --- a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py +++ b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py @@ -17,7 +17,7 @@ def __init__(self): self.build_system = 'SingleSource' self.sourcepath = 'kernel1.c driver1.c' self.executable = 'ert.exe' - self.build_system.ldflags = ['-O3 -fopenmp'] + self.build_system.ldflags = ['-O3', '-fopenmp'] self.sourcesdir = os.path.join(self.current_system.resourcesdir, 'roofline', 'ert') self.rpt = '%s.rpt' % self.executable @@ -31,13 +31,13 @@ def setup(self, partition, environ, **job_opts): @rfm.parameterized_test( - *[[mpitask, flop] - for mpitask in [36, 18, 12, 9, 6, 4, 3, 2, 1] + *[[num_ranks, flop] + for num_ranks in [36, 18, 12, 9, 6, 4, 3, 2, 1] for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]]) class ErtBroadwellTest(ErtTestBase): - def __init__(self, mpitask, flop): + def __init__(self, num_ranks, flop):
super().__init__() - ompthread = 36 // mpitask + ompthread = 36 // num_ranks self.valid_systems = ['daint:mc', 'dom:mc'] self.valid_prog_environs = ['PrgEnv-gnu'] self.build_system.cppflags = [ @@ -50,10 +50,10 @@ def __init__(self, mpitask, flop): '-DERT_FLOP=%s' % flop, ] self.name = 'ert_FLOPS.{:04d}_MPI.{:03d}_OpenMP.{:03d}'.format( - flop, mpitask, ompthread) + flop, num_ranks, ompthread) self.exclusive = True - self.num_tasks = mpitask - self.num_tasks_per_node = mpitask + self.num_tasks = num_ranks + self.num_tasks_per_node = num_ranks self.num_cpus_per_task = ompthread self.num_tasks_per_core = 1 self.use_multithreading = False @@ -68,10 +68,10 @@ def __init__(self, mpitask, flop): L3bw = 547.0 DRAMbw = 70.5 # slowest job: - mpitaskm1 = 1 - flopm1 = 1024 + num_ranks_min = 1 + flop_min = 1024 self.roofline_rpt = 'rpt' - if mpitask == mpitaskm1 and flop == flopm1: + if num_ranks == num_ranks_min and flop == flop_min: self.post_run = [ 'cat *_job.out | python2 preprocess.py > pre', 'python2 maximum.py < pre > max', From 8cc8590a726b9108b18f59b532d1f8b1ba44750d Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Mon, 6 May 2019 21:26:43 +0200 Subject: [PATCH 7/8] Fix PEP8 complaints --- .../tools/profiling_and_debugging/berkeleylab-roofline.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py index 0c4f9cbeb8..5c52bbc6e9 100644 --- a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py +++ b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py @@ -9,6 +9,7 @@ class ErtTestBase(rfm.RegressionTest): The Empirical Roofline Tool, ERT, automatically generates roofline data. 
https://bitbucket.org/berkeleylab/cs-roofline-toolkit/ """ + def __init__(self): super().__init__() self.descr = 'Empirical Roofline Toolkit' @@ -32,8 +33,8 @@ def setup(self, partition, environ, **job_opts): @rfm.parameterized_test( *[[num_ranks, flop] - for num_ranks in [36, 18, 12, 9, 6, 4, 3, 2, 1] - for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]]) + for num_ranks in [36, 18, 12, 9, 6, 4, 3, 2, 1] + for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]]) class ErtBroadwellTest(ErtTestBase): def __init__(self, num_ranks, flop): super().__init__() From 1c4067df6dd55185e352cc5800a3f0453b8dc484 Mon Sep 17 00:00:00 2001 From: jgp Date: Tue, 7 May 2019 13:24:39 +0200 Subject: [PATCH 8/8] fix for review --- .../{berkeleylab-roofline.py => berkeley-ert.py} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename cscs-checks/tools/profiling_and_debugging/{berkeleylab-roofline.py => berkeley-ert.py} (98%) diff --git a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py b/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py similarity index 98% rename from cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py rename to cscs-checks/tools/profiling_and_debugging/berkeley-ert.py index 0c4f9cbeb8..66aab43c61 100644 --- a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py +++ b/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py @@ -41,13 +41,13 @@ def __init__(self, num_ranks, flop): self.valid_systems = ['daint:mc', 'dom:mc'] self.valid_prog_environs = ['PrgEnv-gnu'] self.build_system.cppflags = [ + '-DERT_FLOP=%s' % flop, '-DERT_ALIGN=32', '-DERT_MEMORY_MAX=1073741824', '-DERT_MPI=True', '-DERT_OPENMP=True', '-DERT_TRIALS_MIN=1', '-DERT_WORKING_SET_MIN=1', - '-DERT_FLOP=%s' % flop, ] self.name = 'ert_FLOPS.{:04d}_MPI.{:03d}_OpenMP.{:03d}'.format( flop, num_ranks, ompthread) @@ -78,7 +78,7 @@ def __init__(self, num_ranks, flop): 'python2 summary.py < max > sum', # give enough time for all the dependent 
jobs to collect data 'sleep 60', - 'cat ../ert_FLOPS*/sum |python2 roofline.py > rpt', + 'cat ../ert_FLOPS*/sum | python2 roofline.py > rpt', ] self.sanity_patterns = sn.all([ # --- check data type: