From f90edc6f9810ee62a6c3f062f1241e0726a6a9bb Mon Sep 17 00:00:00 2001
From: jgp <jgp@cscs.ch>
Date: Mon, 29 Apr 2019 11:17:08 +0200
Subject: [PATCH 1/8] Add Empirical Roofline Toolkit (ERT) regression check

---
 .../berkeleylab-roofline.py                   | 119 ++++++++++++++++++
 1 file changed, 119 insertions(+)
 create mode 100644 cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py
diff --git a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py
new file mode 100644
index 0000000000..90f9206c4f
--- /dev/null
+++ b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py
@@ -0,0 +1,119 @@
+import os
+
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+
+class ErtTestBase(rfm.RegressionTest):
+    """
+    The Empirical Roofline Tool, ERT, automatically generates roofline data.
+    https://bitbucket.org/berkeleylab/cs-roofline-toolkit/
+    """
+    def __init__(self):
+        super().__init__()
+        self.descr = 'Empirical Roofline Toolkit'
+        self.sourcesdir = os.path.join(self.current_system.resourcesdir,
+                                       'roofline', 'ert')
+        self.build_system = 'SingleSource'
+        self.sourcepath = 'kernel1.c driver1.c'
+        self.executable = 'ert.exe'
+        self.build_system.ldflags = ['-O3 -fopenmp']
+        self.sourcesdir = os.path.join(self.current_system.resourcesdir,
+                                       'roofline', 'ert')
+        self.rpt = '%s.rpt' % self.executable
+        self.maintainers = ['JG']
+        self.tags = {'scs'}
+
+    def setup(self, partition, environ, **job_opts):
+        super().setup(partition, environ, **job_opts)
+        # self.job.launcher.options = ['--cpu-bind=verbose,none']
+
+
+@rfm.parameterized_test(*[[mpitask, flop]
+                        for mpitask in [36, 18, 12, 9, 6, 4, 3, 2, 1]
+                        for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512,
+                                     1024]])
+class ErtBroadwellTest(ErtTestBase):
+    def __init__(self, mpitask, flop):
+        super().__init__()
+        ompthread = int(36/mpitask)
+        self.valid_systems = ['daint:mc', 'dom:mc']
+        self.valid_prog_environs = ['PrgEnv-gnu']
+        self.build_system.cppflags = [
+            '-DERT_ALIGN=32',
+            '-DERT_MEMORY_MAX=1073741824',
+            '-DERT_MPI=True',
+            '-DERT_OPENMP=True',
+            '-DERT_TRIALS_MIN=1',
+            '-DERT_WORKING_SET_MIN=1',
+            '-DERT_FLOP=%s' % flop,
+        ]
+        self.name = 'ert_FLOPS.' + '{:03d}'.format(flop) + \
+                    '_MPI.' + '{:03d}'.format(mpitask) + \
+                    '_OpenMP.' + '{:03d}'.format(ompthread)
+        self.time_limit = (0, 10, 0)
+        self.exclusive = True
+        self.num_tasks = mpitask
+        self.num_tasks_per_node = mpitask
+        self.num_cpus_per_task = int(ompthread)
+        self.num_tasks_per_core = 1
+        self.use_multithreading = False
+        self.variables = {
+            'CRAYPE_LINK_TYPE': 'dynamic',
+            'OMP_NUM_THREADS': str(self.num_cpus_per_task)
+        }
+        # Reference roofline boundaries for Intel Broadwell CPU (E5-2695 v4):
+        GFLOPs = 469.0
+        L1bw = 1788.0
+        L2bw = 855.0
+        L3bw = 547.0
+        DRAMbw = 70.5
+        # slowest job:
+        mpitaskm1 = 1
+        flopm1 = 1024
+        self.roofline_rpt = 'rpt'
+        if mpitask == mpitaskm1 and flop == flopm1:
+            self.post_run = [
+                'cat *_job.out | python2 preprocess.py > pre',
+                'python2 maximum.py < pre > max',
+                'python2 summary.py < max > sum',
+                # give enough time for all the dependent jobs to collect data
+                'sleep 60',
+                'cat ../ert_FLOPS*/sum |python2 roofline.py > rpt',
+            ]
+            self.sanity_patterns = sn.all([
+                # --- check data type:
+                sn.assert_eq(sn.extractsingle(
+                    r'^\s+(?P<prec>\w+) \* __restrict__ buf = \(\w+ \*\)'
+                    r'malloc\(PSIZE\);', 'driver1.c', 'prec'), 'double'),
+                # --- check ert's roofline results:
+                # check GFLOPS:
+                sn.assert_reference(sn.extractsingle(
+                    r'(?P<GFLOPs>\d+.\d+)\sGFLOPs EMP', self.roofline_rpt,
+                    'GFLOPs', float), GFLOPs, -0.1, 0.3),
+                # check L1 bandwidth:
+                sn.assert_reference(sn.extractsingle(
+                    r'(?P<L1bw>\d+.\d+)\sL1 EMP', self.roofline_rpt,
+                    'L1bw', float), L1bw, -0.1, 0.3),
+                # check L2 bandwidth:
+                sn.assert_reference(sn.extractsingle(
+                    r'(?P<L2bw>\d+.\d+)\sL2 EMP', self.roofline_rpt,
+                    'L2bw', float), L2bw, -0.1, 0.3),
+                # check L3 bandwidth:
+                sn.assert_reference(sn.extractsingle(
+                    r'(?P<L3bw>\d+.\d+)\sL3 EMP', self.roofline_rpt,
+                    'L3bw', float), L3bw, -0.1, 0.3),
+                # check DRAM bandwidth:
+                sn.assert_reference(sn.extractsingle(
+                    r'(?P<DRAMbw>\d+.\d+) DRAM EMP', self.roofline_rpt,
+                    'DRAMbw', float), DRAMbw, -0.1, 0.3),
+                ])
+        else:
+            self.post_run = [
+                'cat *_job.out | python2 preprocess.py > pre',
+                'python2 maximum.py < pre > max',
+                'python2 summary.py < max > sum',
+            ]
+            self.sanity_patterns = sn.assert_found('GFLOPs', 'sum')
+        if not mpitask == 36:
+            self.job.launcher.options = ['--cpu-bind=verbose,none']

From 8390374d0303dbc09a4eec80c8bb2a4c54416d5c Mon Sep 17 00:00:00 2001
From: jgp <jgp@cscs.ch>
Date: Mon, 29 Apr 2019 11:22:08 +0200
Subject: [PATCH 2/8] Fix indentation of closing bracket in sanity_patterns

---
 .../tools/profiling_and_debugging/berkeleylab-roofline.py       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py
index 90f9206c4f..286a20baa0 100644
--- a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py
+++ b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py
@@ -107,7 +107,7 @@ def __init__(self, mpitask, flop):
                 sn.assert_reference(sn.extractsingle(
                     r'(?P<DRAMbw>\d+.\d+) DRAM EMP', self.roofline_rpt,
                     'DRAMbw', float), DRAMbw, -0.1, 0.3),
-                ])
+            ])
         else:
             self.post_run = [
                 'cat *_job.out | python2 preprocess.py > pre',

From 61d3c4703bb95b4f162397baec23922956c8dbbd Mon Sep 17 00:00:00 2001
From: jgp <jgp@cscs.ch>
Date: Wed, 1 May 2019 15:45:41 +0200
Subject: [PATCH 3/8] Remove commented-out launcher option from setup()

---
 .../tools/profiling_and_debugging/berkeleylab-roofline.py       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py
index 286a20baa0..ad78e51eed 100644
--- a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py
+++ b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py
@@ -26,7 +26,6 @@ def __init__(self):
 
     def setup(self, partition, environ, **job_opts):
         super().setup(partition, environ, **job_opts)
-        # self.job.launcher.options = ['--cpu-bind=verbose,none']
 
 
 @rfm.parameterized_test(*[[mpitask, flop]
@@ -115,5 +114,6 @@ def __init__(self, mpitask, flop):
                 'python2 summary.py < max > sum',
             ]
             self.sanity_patterns = sn.assert_found('GFLOPs', 'sum')
+
         if not mpitask == 36:
             self.job.launcher.options = ['--cpu-bind=verbose,none']

From aa6aacf3bac9eede7d0409eb20cc571a27f45266 Mon Sep 17 00:00:00 2001
From: jgp <jgp@cscs.ch>
Date: Fri, 3 May 2019 09:02:42 +0200
Subject: [PATCH 4/8] Address review: integer division, simpler name format, drop time_limit

---
 .../profiling_and_debugging/berkeleylab-roofline.py  | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py
index ad78e51eed..30ad1bd36a 100644
--- a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py
+++ b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py
@@ -35,7 +35,7 @@ def setup(self, partition, environ, **job_opts):
 class ErtBroadwellTest(ErtTestBase):
     def __init__(self, mpitask, flop):
         super().__init__()
-        ompthread = int(36/mpitask)
+        ompthread = 36 // mpitask
         self.valid_systems = ['daint:mc', 'dom:mc']
         self.valid_prog_environs = ['PrgEnv-gnu']
         self.build_system.cppflags = [
@@ -47,14 +47,12 @@ def __init__(self, mpitask, flop):
             '-DERT_WORKING_SET_MIN=1',
             '-DERT_FLOP=%s' % flop,
         ]
-        self.name = 'ert_FLOPS.' + '{:03d}'.format(flop) + \
-                    '_MPI.' + '{:03d}'.format(mpitask) + \
-                    '_OpenMP.' + '{:03d}'.format(ompthread)
-        self.time_limit = (0, 10, 0)
+        self.name = 'ert_FLOPS.{:04d}_MPI.{:03d}_OpenMP.{:03d}'.format(
+            flop, mpitask, ompthread)
         self.exclusive = True
         self.num_tasks = mpitask
         self.num_tasks_per_node = mpitask
-        self.num_cpus_per_task = int(ompthread)
+        self.num_cpus_per_task = ompthread
         self.num_tasks_per_core = 1
         self.use_multithreading = False
         self.variables = {
@@ -115,5 +113,5 @@ def __init__(self, mpitask, flop):
             ]
             self.sanity_patterns = sn.assert_found('GFLOPs', 'sum')
 
-        if not mpitask == 36:
+        if mpitask != 36:
             self.job.launcher.options = ['--cpu-bind=verbose,none']

From 2af3829ec7df5107c9b3b0c83d63331c6469eacd Mon Sep 17 00:00:00 2001
From: jgp <jgp@cscs.ch>
Date: Fri, 3 May 2019 10:09:17 +0200
Subject: [PATCH 5/8] Address review: move cpu-bind option to setup(), update GFLOPs reference

---
 .../berkeleylab-roofline.py                     | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py
index 30ad1bd36a..970f4e6c76 100644
--- a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py
+++ b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py
@@ -26,12 +26,14 @@ def __init__(self):
 
     def setup(self, partition, environ, **job_opts):
         super().setup(partition, environ, **job_opts)
+        if self.num_tasks != 36:
+            self.job.launcher.options = ['--cpu-bind=verbose,none']
 
 
-@rfm.parameterized_test(*[[mpitask, flop]
-                        for mpitask in [36, 18, 12, 9, 6, 4, 3, 2, 1]
-                        for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512,
-                                     1024]])
+@rfm.parameterized_test(
+    *[[mpitask, flop]
+    for mpitask in [36, 18, 12, 9, 6, 4, 3, 2, 1]
+    for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]])
 class ErtBroadwellTest(ErtTestBase):
     def __init__(self, mpitask, flop):
         super().__init__()
@@ -60,7 +62,7 @@ def __init__(self, mpitask, flop):
             'OMP_NUM_THREADS': str(self.num_cpus_per_task)
         }
         # Reference roofline boundaries for Intel Broadwell CPU (E5-2695 v4):
-        GFLOPs = 469.0
+        GFLOPs = 945.0
         L1bw = 1788.0
         L2bw = 855.0
         L3bw = 547.0
@@ -87,7 +89,7 @@ def __init__(self, mpitask, flop):
                 # check GFLOPS:
                 sn.assert_reference(sn.extractsingle(
                     r'(?P<GFLOPs>\d+.\d+)\sGFLOPs EMP', self.roofline_rpt,
-                    'GFLOPs', float), GFLOPs, -0.1, 0.3),
+                    'GFLOPs', float), GFLOPs, -0.1, 0.5),
                 # check L1 bandwidth:
                 sn.assert_reference(sn.extractsingle(
                     r'(?P<L1bw>\d+.\d+)\sL1 EMP', self.roofline_rpt,
@@ -112,6 +114,3 @@ def __init__(self, mpitask, flop):
                 'python2 summary.py < max > sum',
             ]
             self.sanity_patterns = sn.assert_found('GFLOPs', 'sum')
-
-        if mpitask != 36:
-            self.job.launcher.options = ['--cpu-bind=verbose,none']

From 1e342c417fbb66f7bb1e2f7f745ffb49cbea2c35 Mon Sep 17 00:00:00 2001
From: jgp <jgp@cscs.ch>
Date: Mon, 6 May 2019 18:11:19 +0200
Subject: [PATCH 6/8] Address review: split ldflags, rename mpitask to num_ranks

---
 .../berkeleylab-roofline.py                   | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py
index 970f4e6c76..0c4f9cbeb8 100644
--- a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py
+++ b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py
@@ -17,7 +17,7 @@ def __init__(self):
         self.build_system = 'SingleSource'
         self.sourcepath = 'kernel1.c driver1.c'
         self.executable = 'ert.exe'
-        self.build_system.ldflags = ['-O3 -fopenmp']
+        self.build_system.ldflags = ['-O3', '-fopenmp']
         self.sourcesdir = os.path.join(self.current_system.resourcesdir,
                                        'roofline', 'ert')
         self.rpt = '%s.rpt' % self.executable
@@ -31,13 +31,13 @@ def setup(self, partition, environ, **job_opts):
 
 
 @rfm.parameterized_test(
-    *[[mpitask, flop]
-    for mpitask in [36, 18, 12, 9, 6, 4, 3, 2, 1]
+    *[[num_ranks, flop]
+    for num_ranks in [36, 18, 12, 9, 6, 4, 3, 2, 1]
     for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]])
 class ErtBroadwellTest(ErtTestBase):
-    def __init__(self, mpitask, flop):
+    def __init__(self, num_ranks, flop):
         super().__init__()
-        ompthread = 36 // mpitask
+        ompthread = 36 // num_ranks
         self.valid_systems = ['daint:mc', 'dom:mc']
         self.valid_prog_environs = ['PrgEnv-gnu']
         self.build_system.cppflags = [
@@ -50,10 +50,10 @@ def __init__(self, mpitask, flop):
             '-DERT_FLOP=%s' % flop,
         ]
         self.name = 'ert_FLOPS.{:04d}_MPI.{:03d}_OpenMP.{:03d}'.format(
-            flop, mpitask, ompthread)
+            flop, num_ranks, ompthread)
         self.exclusive = True
-        self.num_tasks = mpitask
-        self.num_tasks_per_node = mpitask
+        self.num_tasks = num_ranks
+        self.num_tasks_per_node = num_ranks
         self.num_cpus_per_task = ompthread
         self.num_tasks_per_core = 1
         self.use_multithreading = False
@@ -68,10 +68,10 @@ def __init__(self, mpitask, flop):
         L3bw = 547.0
         DRAMbw = 70.5
         # slowest job:
-        mpitaskm1 = 1
-        flopm1 = 1024
+        num_ranks_min = 1
+        flop_min = 1024
         self.roofline_rpt = 'rpt'
-        if mpitask == mpitaskm1 and flop == flopm1:
+        if num_ranks == num_ranks_min and flop == flop_min:
             self.post_run = [
                 'cat *_job.out | python2 preprocess.py > pre',
                 'python2 maximum.py < pre > max',

From 8cc8590a726b9108b18f59b532d1f8b1ba44750d Mon Sep 17 00:00:00 2001
From: Vasileios Karakasis <karakasis@cscs.ch>
Date: Mon, 6 May 2019 21:26:43 +0200
Subject: [PATCH 7/8] Fix PEP8 complaints

---
 .../tools/profiling_and_debugging/berkeleylab-roofline.py    | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py
index 0c4f9cbeb8..5c52bbc6e9 100644
--- a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py
+++ b/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py
@@ -9,6 +9,7 @@ class ErtTestBase(rfm.RegressionTest):
     The Empirical Roofline Tool, ERT, automatically generates roofline data.
     https://bitbucket.org/berkeleylab/cs-roofline-toolkit/
     """
+
     def __init__(self):
         super().__init__()
         self.descr = 'Empirical Roofline Toolkit'
@@ -32,8 +33,8 @@ def setup(self, partition, environ, **job_opts):
 
 @rfm.parameterized_test(
     *[[num_ranks, flop]
-    for num_ranks in [36, 18, 12, 9, 6, 4, 3, 2, 1]
-    for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]])
+      for num_ranks in [36, 18, 12, 9, 6, 4, 3, 2, 1]
+      for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]])
 class ErtBroadwellTest(ErtTestBase):
     def __init__(self, num_ranks, flop):
         super().__init__()

From 1c4067df6dd55185e352cc5800a3f0453b8dc484 Mon Sep 17 00:00:00 2001
From: jgp <jgp@cscs.ch>
Date: Tue, 7 May 2019 13:24:39 +0200
Subject: [PATCH 8/8] Address review: rename check to berkeley-ert.py, reorder cppflags

---
 .../{berkeleylab-roofline.py => berkeley-ert.py}              | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 rename cscs-checks/tools/profiling_and_debugging/{berkeleylab-roofline.py => berkeley-ert.py} (98%)

diff --git a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py b/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py
similarity index 98%
rename from cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py
rename to cscs-checks/tools/profiling_and_debugging/berkeley-ert.py
index 0c4f9cbeb8..66aab43c61 100644
--- a/cscs-checks/tools/profiling_and_debugging/berkeleylab-roofline.py
+++ b/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py
@@ -41,13 +41,13 @@ def __init__(self, num_ranks, flop):
         self.valid_systems = ['daint:mc', 'dom:mc']
         self.valid_prog_environs = ['PrgEnv-gnu']
         self.build_system.cppflags = [
+            '-DERT_FLOP=%s' % flop,
             '-DERT_ALIGN=32',
             '-DERT_MEMORY_MAX=1073741824',
             '-DERT_MPI=True',
             '-DERT_OPENMP=True',
             '-DERT_TRIALS_MIN=1',
             '-DERT_WORKING_SET_MIN=1',
-            '-DERT_FLOP=%s' % flop,
         ]
         self.name = 'ert_FLOPS.{:04d}_MPI.{:03d}_OpenMP.{:03d}'.format(
             flop, num_ranks, ompthread)
@@ -78,7 +78,7 @@ def __init__(self, num_ranks, flop):
                 'python2 summary.py < max > sum',
                 # give enough time for all the dependent jobs to collect data
                 'sleep 60',
-                'cat ../ert_FLOPS*/sum |python2 roofline.py > rpt',
+                'cat ../ert_FLOPS*/sum | python2 roofline.py > rpt',
             ]
             self.sanity_patterns = sn.all([
                 # --- check data type: