From 66a15e5b674b059738b443779b987e9a829fa8de Mon Sep 17 00:00:00 2001
From: jgp <jgp@cscs.ch>
Date: Fri, 11 Oct 2019 09:33:57 +0200
Subject: [PATCH 1/6] tag1: add 'resources' tag to checks

---
 cscs-checks/apps/amber/amber_check.py         |   2 +-
 cscs-checks/apps/espresso/espresso_check.py   |   2 +-
 cscs-checks/apps/gromacs/gromacs_check.py     |   2 +-
 cscs-checks/apps/icon/rrtmgp_check.py         |   1 +
 cscs-checks/apps/lammps/lammps_check.py       |   2 +-
 cscs-checks/apps/namd/namd_check.py           |   2 +-
 cscs-checks/apps/openfoam/check_openfoam.py   |   2 +-
 .../apps/openfoam/check_openfoam_extend.py    |   2 +-
 cscs-checks/cuda/cuda_checks.py               |   2 +-
 cscs-checks/cuda/multi_gpu.py                 |   2 +-
 cscs-checks/cuda/nvml_check.py                |   2 +-
 .../libraries/io/netcdf_compile_run.py        |   2 +-
 .../libraries/math/scalapack_compile_run.py   |  58 +++---
 cscs-checks/mch/fieldextra_check.py           |   2 +-
 .../microbenchmarks/hpcg/hpcg_benchmark.py    |   2 +-
 .../microbenchmarks/spec-accel/spec.py        |   2 +-
 cscs-checks/system/io/ior_check.py            |   2 +-
 cscs-checks/tools/io/cdo.py                   |   2 +-
 cscs-checks/tools/io/nco.py                   |   2 +-
 .../berkeley-ert-nvprof.py                    |   2 +-
 .../berkeley-ert-serial.py                    | 170 ++++++++++++++++
 .../profiling_and_debugging/berkeley-ert.py   |   2 +-
 .../intel_advisor_roofline.py                 |   2 +-
 .../intel_sde_berkeley_stream.py              |   2 +-
 .../intel_sde_roofline.py                     |   2 +-
 .../intel_vtune_roofline.py                   |   2 +-
 .../likwid_roofline.py                        | 181 ++++++++++++++++++
 27 files changed, 404 insertions(+), 52 deletions(-)
 create mode 100644 cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py
 create mode 100644 cscs-checks/tools/profiling_and_debugging/likwid_roofline.py

diff --git a/cscs-checks/apps/amber/amber_check.py b/cscs-checks/apps/amber/amber_check.py
index 4018e923ce..4ddce619d2 100644
--- a/cscs-checks/apps/amber/amber_check.py
+++ b/cscs-checks/apps/amber/amber_check.py
@@ -38,7 +38,7 @@ def __init__(self, input_file, output_file):
                                      output_file, 'perf', float, item=1)
         }
         self.maintainers = ['SO', 'VH']
-        self.tags = {'scs'}
+        self.tags = {'scs', 'resources'}
 
 
 @rfm.required_version('>=2.16')
diff --git a/cscs-checks/apps/espresso/espresso_check.py b/cscs-checks/apps/espresso/espresso_check.py
index 4cacf2934f..876ae49753 100644
--- a/cscs-checks/apps/espresso/espresso_check.py
+++ b/cscs-checks/apps/espresso/espresso_check.py
@@ -11,7 +11,7 @@ def __init__(self, scale):
         super().__init__()
         self.descr = 'Quantum Espresso CPU check'
         self.maintainers = ['AK', 'LM']
-        self.tags = {'scs', 'production'}
+        self.tags = {'scs', 'production', 'resources'}
         self.sourcesdir = os.path.join(self.current_system.resourcesdir,
                                        'Espresso')
 
diff --git a/cscs-checks/apps/gromacs/gromacs_check.py b/cscs-checks/apps/gromacs/gromacs_check.py
index 05f776383c..81bc39d767 100644
--- a/cscs-checks/apps/gromacs/gromacs_check.py
+++ b/cscs-checks/apps/gromacs/gromacs_check.py
@@ -41,7 +41,7 @@ def __init__(self, output_file):
                 'num_switches': 1
             }
         }
-        self.tags = {'scs'}
+        self.tags = {'scs', 'resources'}
 
 
 @rfm.required_version('>=2.19')
diff --git a/cscs-checks/apps/icon/rrtmgp_check.py b/cscs-checks/apps/icon/rrtmgp_check.py
index 69a48430d9..9d157e7ea9 100644
--- a/cscs-checks/apps/icon/rrtmgp_check.py
+++ b/cscs-checks/apps/icon/rrtmgp_check.py
@@ -14,6 +14,7 @@ def __init__(self):
         self.valid_prog_environs = ['PrgEnv-pgi']
         self.sourcesdir = os.path.join(self.current_system.resourcesdir,
                                        'RRTMGP')
+        self.tags = {'resources'}
         self.prebuild_cmd = ['cp build/Makefile.conf.dom build/Makefile.conf']
         self.executable = 'python'
         self.executable_opts = [
diff --git a/cscs-checks/apps/lammps/lammps_check.py b/cscs-checks/apps/lammps/lammps_check.py
index 5321767f08..23a69cff68 100644
--- a/cscs-checks/apps/lammps/lammps_check.py
+++ b/cscs-checks/apps/lammps/lammps_check.py
@@ -33,7 +33,7 @@ def __init__(self):
             }
         }
 
-        self.tags = {'scs'}
+        self.tags = {'scs', 'resources'}
         self.maintainers = ['TR', 'VH']
 
 
diff --git a/cscs-checks/apps/namd/namd_check.py b/cscs-checks/apps/namd/namd_check.py
index 2a75ac0efd..eb91d82520 100644
--- a/cscs-checks/apps/namd/namd_check.py
+++ b/cscs-checks/apps/namd/namd_check.py
@@ -44,7 +44,7 @@ def __init__(self, arch, scale, variant):
         }
 
         self.maintainers = ['CB', 'LM']
-        self.tags = {'scs'}
+        self.tags = {'scs', 'resources'}
         self.strict_check = False
         self.extra_resources = {
             'switches': {
diff --git a/cscs-checks/apps/openfoam/check_openfoam.py b/cscs-checks/apps/openfoam/check_openfoam.py
index ade32256d7..dff3d6de8e 100644
--- a/cscs-checks/apps/openfoam/check_openfoam.py
+++ b/cscs-checks/apps/openfoam/check_openfoam.py
@@ -22,7 +22,7 @@ def __init__(self):
         self.num_cpus_per_task  = 1
 
         self.maintainers = ['MKr']
-        self.tags = {'scs', 'production'}
+        self.tags = {'scs', 'production', 'resources'}
 
         self.pre_run = ['source $FOAM_BASH']
 
diff --git a/cscs-checks/apps/openfoam/check_openfoam_extend.py b/cscs-checks/apps/openfoam/check_openfoam_extend.py
index 68a9a440de..d6b7d7194d 100644
--- a/cscs-checks/apps/openfoam/check_openfoam_extend.py
+++ b/cscs-checks/apps/openfoam/check_openfoam_extend.py
@@ -28,7 +28,7 @@ def __init__(self):
                 r'Finalising parallel run', self.stdout)
 
         self.maintainers = ['MKr']
-        self.tags = {'scs', 'production'}
+        self.tags = {'scs', 'production', 'resources'}
         self.pre_run = ['source $FOAM_INST_DIR/foam-extend-4.0/etc/bashrc']
 
 
diff --git a/cscs-checks/cuda/cuda_checks.py b/cscs-checks/cuda/cuda_checks.py
index b5c26887c9..73b84ca02b 100644
--- a/cscs-checks/cuda/cuda_checks.py
+++ b/cscs-checks/cuda/cuda_checks.py
@@ -27,7 +27,7 @@ def __init__(self):
             self.nvidia_sm = '37'
 
         self.maintainers = ['AJ', 'VK']
-        self.tags = {'production'}
+        self.tags = {'production', 'resources'}
 
 
 @rfm.required_version('>=2.14')
diff --git a/cscs-checks/cuda/multi_gpu.py b/cscs-checks/cuda/multi_gpu.py
index a56f408a8f..1392ea1df4 100644
--- a/cscs-checks/cuda/multi_gpu.py
+++ b/cscs-checks/cuda/multi_gpu.py
@@ -61,7 +61,7 @@ def __init__(self):
             'kesch:cn:d2h':   (7584, -0.1, None, 'MB/s'),
             'kesch:cn:d2d': (137408, -0.1, None, 'MB/s')
         }
-        self.tags = {'diagnostic', 'mch'}
+        self.tags = {'diagnostic', 'mch', 'resources'}
         self.maintainers = ['AJ', 'VK']
 
     def _xfer_pattern(self, xfer_kind, devno, nodename):
diff --git a/cscs-checks/cuda/nvml_check.py b/cscs-checks/cuda/nvml_check.py
index 20e88fd10e..f27d3f5a9b 100644
--- a/cscs-checks/cuda/nvml_check.py
+++ b/cscs-checks/cuda/nvml_check.py
@@ -22,4 +22,4 @@ def __init__(self):
             r"compute\s+mode\s+'Exclusive Process'", self.stdout)
 
         self.maintainers = ['AJ', 'VK']
-        self.tags = {'production'}
+        self.tags = {'production', 'resources'}
diff --git a/cscs-checks/libraries/io/netcdf_compile_run.py b/cscs-checks/libraries/io/netcdf_compile_run.py
index 6160e32531..37a3d2df7f 100644
--- a/cscs-checks/libraries/io/netcdf_compile_run.py
+++ b/cscs-checks/libraries/io/netcdf_compile_run.py
@@ -40,7 +40,7 @@ def __init__(self, lang, linkage):
         self.num_tasks_per_node = 1
         self.sanity_patterns = sn.assert_found(r'SUCCESS', self.stdout)
         self.maintainers = ['AJ', 'VK']
-        self.tags = {'production'}
+        self.tags = {'production', 'resources'}
 
     def setup(self, partition, environ, **job_opts):
         if self.current_system.name == 'kesch':
diff --git a/cscs-checks/libraries/math/scalapack_compile_run.py b/cscs-checks/libraries/math/scalapack_compile_run.py
index 9a9bfbe886..62ad1d7b79 100644
--- a/cscs-checks/libraries/math/scalapack_compile_run.py
+++ b/cscs-checks/libraries/math/scalapack_compile_run.py
@@ -27,7 +27,7 @@ def __init__(self, linkage):
         self.build_system = 'SingleSource'
         self.build_system.fflags = ['-O3']
         self.maintainers = ['CB', 'LM', 'MKr']
-        self.tags = {'production'}
+        self.tags = {'production', 'resources'}
 
 
 @rfm.required_version('>=2.14')
@@ -68,31 +68,31 @@ def scalapack_sanity(number1, number2, expected_value):
         ])
 
 
-# FIXME: This test is obsolete; it is kept only for reference.
-# NOTE:  The test case is very small, but larger cases did not succeed!
-@rfm.required_version('>=2.14')
-@rfm.parameterized_test(['dynamic'])
-class ScaLAPACKPerf(ScaLAPACKTest):
-    def __init__(self, linkage):
-        super().__init__(linkage)
-
-        self.tags |= {'monch_acceptance'}
-        self.sourcepath = 'scalapack_performance_compile_run.f'
-        self.valid_systems = ['monch:compute']
-        self.valid_prog_environs = ['PrgEnv-gnu']
-        self.num_tasks = 64
-        self.num_tasks_per_node = 16
-
-        self.sanity_patterns = sn.assert_found(r'Run', self.stdout)
-        self.perf_patterns = {
-            'perf': sn.max(
-                sn.extractall(r'GFLOPS/s:\s+(?P<gflops>\S+)',
-                              self.stdout, 'gflops', float)
-            )
-        }
-
-        self.reference = {
-            'monch:compute': {
-                'perf': (24., -0.1, None)
-            }
-        }
+# # FIXME: This test is obsolete; it is kept only for reference.
+# # NOTE:  The test case is very small, but larger cases did not succeed!
+# @rfm.required_version('>=2.14')
+# @rfm.parameterized_test(['dynamic'])
+# class ScaLAPACKPerf(ScaLAPACKTest):
+#     def __init__(self, linkage):
+#         super().__init__(linkage)
+# 
+#         self.tags |= {'monch_acceptance'}
+#         self.sourcepath = 'scalapack_performance_compile_run.f'
+#         self.valid_systems = ['monch:compute']
+#         self.valid_prog_environs = ['PrgEnv-gnu']
+#         self.num_tasks = 64
+#         self.num_tasks_per_node = 16
+# 
+#         self.sanity_patterns = sn.assert_found(r'Run', self.stdout)
+#         self.perf_patterns = {
+#             'perf': sn.max(
+#                 sn.extractall(r'GFLOPS/s:\s+(?P<gflops>\S+)',
+#                               self.stdout, 'gflops', float)
+#             )
+#         }
+# 
+#         self.reference = {
+#             'monch:compute': {
+#                 'perf': (24., -0.1, None)
+#             }
+#         }
diff --git a/cscs-checks/mch/fieldextra_check.py b/cscs-checks/mch/fieldextra_check.py
index b45acd15d6..b10df5bf65 100644
--- a/cscs-checks/mch/fieldextra_check.py
+++ b/cscs-checks/mch/fieldextra_check.py
@@ -8,7 +8,7 @@ class FieldextraTestBase(rfm.RunOnlyRegressionTest):
     def __init__(self):
         super().__init__()
         self.maintainers = ['MKr']
-        self.tags = {'mch'}
+        self.tags = {'mch', 'resources'}
 
         self.valid_systems = ['kesch:cn']
         self.valid_prog_environs = ['PrgEnv-gnu-nompi']
diff --git a/cscs-checks/microbenchmarks/hpcg/hpcg_benchmark.py b/cscs-checks/microbenchmarks/hpcg/hpcg_benchmark.py
index 20a4d8dcd8..65548472ac 100644
--- a/cscs-checks/microbenchmarks/hpcg/hpcg_benchmark.py
+++ b/cscs-checks/microbenchmarks/hpcg/hpcg_benchmark.py
@@ -52,7 +52,7 @@ def __init__(self):
         }
 
         self.maintainers = ['SK']
-        self.tags = {'diagnostic', 'benchmark'}
+        self.tags = {'diagnostic', 'benchmark', 'resources'}
 
     @property
     @sn.sanity_function
diff --git a/cscs-checks/microbenchmarks/spec-accel/spec.py b/cscs-checks/microbenchmarks/spec-accel/spec.py
index 2fa4f68edd..fc219d9e14 100644
--- a/cscs-checks/microbenchmarks/spec-accel/spec.py
+++ b/cscs-checks/microbenchmarks/spec-accel/spec.py
@@ -53,7 +53,7 @@ def __init__(self, prg_envs):
         }
 
         self.maintainers = ['SK']
-        self.tags = {'diagnostic'}
+        self.tags = {'diagnostic', 'resources'}
 
     def setup(self, partition, environ, **job_opts):
         self.pre_run = ['source ./shrc', 'mv %s config' %
diff --git a/cscs-checks/system/io/ior_check.py b/cscs-checks/system/io/ior_check.py
index a26b6e97c5..757d1493a5 100644
--- a/cscs-checks/system/io/ior_check.py
+++ b/cscs-checks/system/io/ior_check.py
@@ -108,7 +108,7 @@ def __init__(self, base_dir):
 
         systems_to_test = ['dom', 'daint']
         if self.current_system.name in systems_to_test:
-            self.tags |= {'production'}
+            self.tags |= {'production', 'resources'}
 
     def setup(self, partition, environ, **job_opts):
         super().setup(partition, environ, **job_opts)
diff --git a/cscs-checks/tools/io/cdo.py b/cscs-checks/tools/io/cdo.py
index a002df9bce..400d29d0fc 100644
--- a/cscs-checks/tools/io/cdo.py
+++ b/cscs-checks/tools/io/cdo.py
@@ -40,7 +40,7 @@ def __init__(self):
             self.modules = ['CDO']
 
         self.maintainers = ['SO']
-        self.tags = {'production', 'mch'}
+        self.tags = {'production', 'mch', 'resources'}
 
 
 # Check that the netCDF loaded by the CDO module supports the nc4 filetype
diff --git a/cscs-checks/tools/io/nco.py b/cscs-checks/tools/io/nco.py
index 504ebe928c..9997f90ecf 100644
--- a/cscs-checks/tools/io/nco.py
+++ b/cscs-checks/tools/io/nco.py
@@ -33,7 +33,7 @@ def __init__(self):
             self.modules = ['NCO']
 
         self.maintainers = ['SO']
-        self.tags = {'production', 'mch'}
+        self.tags = {'production', 'mch', 'resources'}
 
 
 # Check that the netCDF loaded by the NCO module supports the nc4 filetype
diff --git a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py
index bc3d9ba36b..933ffab571 100644
--- a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py
+++ b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py
@@ -72,7 +72,7 @@ def __init__(self, gpudims, flop, repeat):
         ]
         self.build_system.ldflags = ['-O3']
         self.maintainers = ['JG']
-        self.tags = {'scs'}
+        self.tags = {'scs', 'resources'}
         gpu_blocks, gpu_threads = gpudims
         self.name = 'ertgpu_Run.{}_FLOPS.{}_GPUBlocks.{}_GPUThreads.{}'.format(
             repeat, flop, gpu_blocks, gpu_threads)
diff --git a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py
new file mode 100644
index 0000000000..1dee82959c
--- /dev/null
+++ b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py
@@ -0,0 +1,170 @@
+import os
+
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+
+#{{{ base
+class ErtTestBase(rfm.RegressionTest):
+    """
+    The Empirical Roofline Tool, ERT, automatically generates roofline data.
+    https://bitbucket.org/berkeleylab/cs-roofline-toolkit/
+    """
+
+    def __init__(self):
+        self.descr = 'Empirical Roofline Toolkit'
+        self.sourcesdir = os.path.join(self.current_system.resourcesdir,
+                                       'roofline', 'ert')
+        self.build_system = 'SingleSource'
+        self.sourcepath = 'kernel1.c driver1.c'
+        self.executable = 'ert.exe'
+        self.build_system.ldflags = ['-O3', '-fopenmp']
+        self.sourcesdir = os.path.join(self.current_system.resourcesdir,
+                                       'roofline', 'ert')
+        self.rpt = '%s.rpt' % self.executable
+        self.maintainers = ['JG']
+        self.tags = {'scs', 'resources'}
+
+    def setup(self, partition, environ, **job_opts):
+        super().setup(partition, environ, **job_opts)
+        if self.num_tasks != 36:
+            self.job.launcher.options = ['--cpu-bind=verbose,none']
+#}}}
+
+#{{{ test
+@rfm.parameterized_test(
+    *[[num_ranks, flop]
+      for num_ranks in [1]
+      for flop in [256, 512, 1024]])
+      #for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]])
+class ErtBroadwellTest(ErtTestBase):
+    def __init__(self, num_ranks, flop):
+        super().__init__()
+        ompthread = 1
+        self.valid_systems = ['daint:mc', 'dom:mc']
+        self.valid_prog_environs = ['PrgEnv-gnu']
+        self.build_system.cppflags = [
+            '-DERT_FLOP=%s' % flop,
+            '-DERT_ALIGN=32',
+            '-DERT_MEMORY_MAX=1073741824',
+            '-DERT_MPI=True',
+            '-DERT_OPENMP=True',
+            '-DERT_TRIALS_MIN=1',
+            '-DERT_WORKING_SET_MIN=1',
+        ]
+        self.name = 'ert_serial_FLOPS.{:04d}_MPI.{:03d}_OpenMP.{:03d}'.format(
+            flop, num_ranks, ompthread)
+        self.exclusive = True
+        self.num_tasks = num_ranks
+        self.num_tasks_per_node = num_ranks
+        self.num_cpus_per_task = ompthread
+        self.num_tasks_per_core = 1
+        self.use_multithreading = False
+        self.variables = {
+            'CRAYPE_LINK_TYPE': 'dynamic',
+            'OMP_NUM_THREADS': str(self.num_cpus_per_task)
+        }
+
+        # take the "slowest" job, make it sleep after it has ended and hope the
+        # other jobs have ended too
+        # TODO: find a better way to wait for the other jobs to end
+        num_ranks_min = 1
+        flop_min = 1024
+        self.roofline_rpt = 'rpt'
+        if num_ranks == num_ranks_min and flop == flop_min:
+            self.post_run = [
+                'cat *_job.out | python2 preprocess.py > pre',
+                'python2 maximum.py < pre > max',
+                'python2 summary.py < max > sum',
+                # give enough time for all the dependent jobs to collect data:
+                'sleep 60',
+                'cat ../ert_serial_FLOPS*/sum | python2 roofline.py > rpt',
+            ]
+
+        else:
+            self.post_run = [
+                'cat *_job.out | python2 preprocess.py > pre',
+                'python2 maximum.py < pre > max',
+                'python2 summary.py < max > sum',
+            ]
+
+        # --- Sanity check:
+        regex_datatype = (r'^\s+(?P<type>\w+) \* __restrict__ buf = '
+                          r'\(\w+ \*\)malloc\(PSIZE\);')
+        datatype = sn.extractsingle(regex_datatype, 'driver1.c', 'type')
+        self.sanity_patterns = sn.all([
+            sn.assert_found('GFLOPs', 'sum'),
+            sn.assert_eq(datatype, 'double'),
+        ])
+
+        # --- Performance check:
+        if num_ranks == num_ranks_min and flop == flop_min:
+            # Reference roofline boundaries for Intel BroadwellCPU (E5-2695v4):
+            ref_GFLOPs = 945.0
+            ref_L1bw = 1788.0
+            ref_L2bw = 855.0
+            ref_L3bw = 547.0
+            ref_DRAMbw = 70.5
+
+            # Typical performance report looks like:
+            # --------------------------------------
+            # ert_FLOPS.1024_MPI.001_OpenMP.036/rpt
+            #    908.43 GFLOPs EMP
+            #    ******
+            # META_DATA
+            # OPENMP_THREADS 1
+            # FLOPS          8
+            # MPI_PROCS      36
+            # 
+            #   5647.33 L1 EMP
+            #   *******
+            #   3203.86 L2 EMP
+            #   *******
+            #   1773.58 L3 EMP
+            #   *******
+            #    139.56 L4 EMP
+            #    103.50 DRAM EMP
+            #    ******
+            # META_DATA
+            # FLOPS          2
+            # OPENMP_THREADS 1
+            # MPI_PROCS      36
+            regex_gflops = r'(?P<GFLOPs>\d+.\d+)\sGFLOPs EMP'
+            regex_L1bw = r'(?P<L1bw>\d+.\d+)\sL1 EMP'
+            regex_L2bw = r'(?P<L2bw>\d+.\d+)\sL2 EMP'
+            regex_L3bw = r'(?P<L3bw>\d+.\d+)\sL3 EMP'
+            regex_DRAMbw = r'(?P<DRAMbw>\d+.\d+) DRAM EMP'
+
+            gflops = sn.extractsingle(regex_gflops, self.roofline_rpt,
+                                      'GFLOPs', float)
+            L1bw = sn.extractsingle(regex_L1bw, self.roofline_rpt,
+                                      'L1bw', float)
+            L2bw = sn.extractsingle(regex_L2bw, self.roofline_rpt,
+                                      'L2bw', float)
+            L3bw = sn.extractsingle(regex_L3bw, self.roofline_rpt,
+                                      'L3bw', float)
+            DRAMbw = sn.extractsingle(regex_DRAMbw, self.roofline_rpt,
+                                      'DRAMbw', float)
+
+            # --performance-report:
+            self.perf_patterns = {
+                'gflops': gflops,
+                'L1bw': L1bw,
+                'L2bw': L2bw,
+                'L3bw': L3bw,
+                'DRAMbw': DRAMbw,
+            }
+
+            self.reference = {
+                '*': {
+                    'gflops': (ref_GFLOPs, -0.1, 0.5, 'GF/s'),
+                    'L1bw': (ref_L1bw, -0.1, 0.3, 'GB/s'),
+                    'L2bw': (ref_L2bw, -0.1, 0.3, 'GB/s'),
+                    'L3bw': (ref_L3bw, -0.1, 0.3, 'GB/s'),
+                    'DRAMbw': (ref_DRAMbw, -0.1, 0.3, 'GB/s'),
+                }
+            }
+
+        # else:
+
+#}}}
diff --git a/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py b/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py
index 505905d0f9..580aff48e8 100644
--- a/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py
+++ b/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py
@@ -23,7 +23,7 @@ def __init__(self):
                                        'roofline', 'ert')
         self.rpt = '%s.rpt' % self.executable
         self.maintainers = ['JG']
-        self.tags = {'scs'}
+        self.tags = {'scs', 'resources'}
 
     def setup(self, partition, environ, **job_opts):
         super().setup(partition, environ, **job_opts)
diff --git a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py b/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py
index 977f2c2ab2..4172410021 100644
--- a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py
+++ b/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py
@@ -238,7 +238,7 @@ def __init__(self, repeat, toolversion, datalayout):
         }
 
         self.maintainers = ['JG']
-        self.tags = {'production'}
+        self.tags = {'production', 'resources'}
 
     def setup(self, partition, environ, **job_opts):
         super().setup(partition, environ, **job_opts)
diff --git a/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py b/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py
index 0c410e43ad..489f4ee9dd 100644
--- a/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py
+++ b/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py
@@ -35,7 +35,7 @@ def __init__(self):
         self.sanity_patterns = sn.assert_found('Total FLOPs =', self.rpt)
         self.post_run = ['SDE/parse-sde.sh %s.* &> %s' % (self.sde, self.rpt)]
         self.maintainers = ['JG']
-        self.tags = {'scs'}
+        self.tags = {'scs', 'resources'}
 
     @property
     @sn.sanity_function
diff --git a/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py b/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py
index f31cb542f7..b6b3015d06 100644
--- a/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py
+++ b/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py
@@ -68,7 +68,7 @@ def __init__(self, repeat, toolsversion, datalayout):
         self.sanity_patterns = sn.assert_found('Total FLOPs =', self.rpt)
         self.post_run = ['SDE/parse-sde.sh %s.* &> %s' % (self.sde, self.rpt)]
         self.maintainers = ['JG']
-        self.tags = {'scs'}
+        self.tags = {'scs', 'resources'}
         self.sanity_patterns = sn.all([
             sn.assert_eq(sn.extractsingle(
                 r'^Intel\(R\) Software Development Emulator\.  Version:  '
diff --git a/cscs-checks/tools/profiling_and_debugging/intel_vtune_roofline.py b/cscs-checks/tools/profiling_and_debugging/intel_vtune_roofline.py
index 0ae40b9f51..46e173e8a5 100644
--- a/cscs-checks/tools/profiling_and_debugging/intel_vtune_roofline.py
+++ b/cscs-checks/tools/profiling_and_debugging/intel_vtune_roofline.py
@@ -76,7 +76,7 @@ def __init__(self, repeat, toolsversion, datalayout):
         # NOTE: -allow-multiple-runs requires to install vtune drivers
         # TODO: -collect memory-access
         self.maintainers = ['JG']
-        self.tags = {'scs'}
+        self.tags = {'scs', 'resources'}
         self.sanity_patterns = sn.all([
             sn.assert_found('loop complete.', self.stdout),
             sn.assert_eq(sn.extractsingle(
diff --git a/cscs-checks/tools/profiling_and_debugging/likwid_roofline.py b/cscs-checks/tools/profiling_and_debugging/likwid_roofline.py
new file mode 100644
index 0000000000..8a8ab08c84
--- /dev/null
+++ b/cscs-checks/tools/profiling_and_debugging/likwid_roofline.py
@@ -0,0 +1,181 @@
+import os
+
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+
+@rfm.parameterized_test(*[[repeat, toolsversion, datalayout]
+                          for repeat in ['600000']
+                          for toolsversion in ['4.3.3']
+                          # for datalayout in ['G3_AOS_SCALAR']
+                          for datalayout in ['G3_AOS_SCALAR', 'G3_SOA_SCALAR',
+                                             'G3_AOS_VECTOR', 'G3_SOA_VECTOR']
+                          ])
+class IntelRooflineLikwidTest(rfm.RegressionTest):
+    '''This test checks the values reported by RRZE likwid roofline model:
+
+G3_AOS_SCALAR DP Mflops/sec = 3280.32 L2 bandwidth [MBytes/s] = 39441.3 0.0831697
+G3_AOS_VECTOR DP Mflops/sec = 6432.24 L2 bandwidth [MBytes/s] = 76914 0.083629
+G3_SOA_SCALAR DP Mflops/sec = 3288.39 L2 bandwidth [MBytes/s] = 9.98179 329.439
+G3_SOA_VECTOR DP Mflops/sec = 21126.6 L2 bandwidth [MBytes/s] = 9.6529 2188.63 2.3F/B
+                              10GF                              60000  0.18
+
+        > https://crd.lbl.gov/assets/Uploads/ECP18-Roofline-3-LIKWID.pdf
+        > likwid-perfctr -g CACHES -H
+
+        > Get group definition with (identical result):
+        > cat $EBROOTLIKWID/share/likwid/perfgroups/broadwell/FLOPS_DP.txt
+        > srun -Cmc,perf -n1 -t1 likwid-perfctr -g FLOPS_DP -H
+        DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2 +
+                              FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +
+                              FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)
+                              /runtime
+
+        > srun -Cmc,perf -n1 -t1 likwid-perfctr -g MEM -H
+        Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0) +
+                                               SUM(MBOXxC1))*64.0/runtime
+        Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0) +
+                                               SUM(MBOXxC1))*64.0
+
+       > srun -Cmc,perf -t1 -n1 likwid-perfctr -g L2 -H
+       L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT + L2_TRANS_L1D_WB +
+                                          ICACHE_MISSES)*64.0/time
+       L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT + L2_TRANS_L1D_WB +
+                                          ICACHE_MISSES)*64.0
+
+       > srun -Cmc,perf -t1 -n1 likwid-perfctr -g L3 -H
+       L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL +
+                                          L2_LINES_OUT_DEMAND_DIRTY)*64/time
+       L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL +
+                                          L2_LINES_OUT_DEMAND_DIRTY)*64
+
+       > srun -Cmc,perf -t1 -n1 likwid-perfctr -g CACHES -H
+       Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD) +
+                                              SUM(CAS_COUNT_WR))*64.0/time
+       Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD) +
+                                              SUM(CAS_COUNT_WR))*64.0
+    '''
+    def __init__(self, repeat, toolsversion, datalayout):
+        super().__init__()
+        self.descr = 'Roofline Analysis test with Likwid:'
+        self.valid_systems = ['dom:mc']
+        # Reporting MFLOPS is not available on Intel Haswell cpus, see
+        # https://www.intel.fr/content/dam/www/public/us/en/documents/manuals/
+        # 64-ia-32-architectures-software-developer-vol-1-manual.pdf
+        self.valid_prog_environs = ['PrgEnv-intel']
+        self.modules = ['likwid']
+        # likwid/4.3.3-perf_event
+        self.sourcesdir = os.path.join(self.current_system.resourcesdir,
+                                       'roofline', 'intel_advisor')
+        self.build_system = 'SingleSource'
+        self.sourcepath = '_roofline.cpp'
+        self.executable = 'likwid-perfctr'
+        self.target_executable = './roof.exe'
+        self.build_system.cppflags = ['-D_LIKWID', '-DLIKWID_PERFMON',
+                                      '-I$EBROOTLIKWID/include']
+        self.prgenv_flags = {
+            'PrgEnv-intel': ['-g', '-O2', '-std=c++11', '-restrict'],
+            # '-qopt-streaming-stores', 'always',
+        }
+        self.build_system.ldflags = ['-L$EBROOTLIKWID/lib', '-llikwid']
+        self.prebuild_cmd = [
+            'patch -s < LIKWID/roofline_template.patch',
+            'sed -e "s-XXXX-%s-" -e "s-YYYY-%s-" %s &> %s' %
+            (repeat, datalayout, 'roofline_template.cpp', '_roofline.cpp')
+        ]
+        self.exclusive = True
+        self.num_tasks = 1
+        self.num_tasks_per_node = 1
+        self.num_cpus_per_task = 1
+        self.num_tasks_per_core = 1
+        self.use_multithreading = False
+        self.variables = {
+            'OMP_NUM_THREADS': str(self.num_cpus_per_task),
+            'CRAYPE_LINK_TYPE': 'dynamic',
+        }
+        self.pre_run = [
+            'mv %s %s' % (self.executable, self.target_executable),
+        ]
+        self.tool_flags = ['-C 0 -g FLOPS_DP -m %s ' % self.target_executable]
+        # -C 0 : sets processor id(s) to pin threads and measure
+        # -g   : sets performance group
+        # -m   : use likwid API
+        self.executable_opts = self.tool_flags
+        self.maintainers = ['JG']
+        self.tags = {'scs', 'resources'}
+        # self.rpt = '%s.rpt' % self.target_executable
+        self.sanity_patterns = sn.all([
+            sn.assert_found('loop complete.', self.stdout),
+            sn.assert_eq(sn.extractsingle(
+                r'^likwid-perfctr -- Version (?P<toolsversion>\d.\d.\d)',
+                self.stdout, 'toolsversion'), toolsversion),
+        ])
+        # References for Intel Broadwell CPU (E5-2695 v4):
+        references = {
+            'G3_AOS_SCALAR': {
+                'dom:mc': {
+                    'gflops': (0.596, -0.1, 0.3, 'Gflop/s'),
+                    'ai': (0.16, -0.05, 0.05, 'flop/byte')
+                }
+            },
+            'G3_SOA_SCALAR': {
+                'dom:mc': {
+                    'gflops': (0.612, -0.1, 0.3, 'Gflop/s'),
+                    'ai': (0.16, -0.05, 0.05, 'flop/byte')
+                }
+            },
+            'G3_AOS_VECTOR': {
+                'dom:mc': {
+                    'gflops': (1.152, -0.1, 0.3, 'Gflop/s'),
+                    'ai': (0.125, -0.05, 0.05, 'flop/byte')
+                }
+            },
+            'G3_SOA_VECTOR': {
+                'dom:mc': {
+                    'gflops': (1.125, -0.1, 0.3, 'Gflop/s'),
+                    'ai': (0.16, -0.05, 0.05, 'flop/byte')
+                }
+            },
+        }
+        self.reference = references[datalayout]
+        self.perf_patterns = {
+            'gflops': self.gflops,
+            'ai': self.arithmetic_intensity,
+        }
+
+    def setup(self, partition, environ, **job_opts):
+        super().setup(partition, environ, **job_opts)
+        run_cmd = ' '.join(self.job.launcher.command(self.job))
+        self.post_run = ['%s -v' % self.executable]
+        # self.perf_group = ['L2', 'L3']
+        self.perf_group = ['L2', 'L3', 'CACHES', 'DATA',
+                           'MEM', 'MEM_DP', 'MEM_SP']
+        for perf_group in self.perf_group:
+            self.post_run += ['%s %s -C 0 -g %s -m %s' %
+                              (run_cmd, self.executable, perf_group,
+                               self.target_executable)]
+        partitiontype = partition.fullname.split(':')[1]
+        if partitiontype == 'gpu':
+            self.job.options = ['--constraint="gpu&perf"']
+        elif partitiontype == 'mc':
+            self.job.options = ['--constraint="mc&perf"']
+
+    @property
+    @sn.sanity_function
+    def arithmetic_intensity(self):
+        flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)',
+                                 self.rpt, 'flops', int)
+        bytes = sn.extractsingle(r'^--->Total Bytes = (?P<bytes>\d+)',
+                                 self.rpt, 'bytes', int)
+        # debug: print('ai={}'.format(flops/bytes))
+        return flops/bytes
+
+    @property
+    @sn.sanity_function
+    def gflops(self):
+        flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)',
+                                 self.rpt, 'flops', int)
+        msec = sn.extractsingle(r'^elapsed time: (?P<msec>\d+)ms', self.stdout,
+                                'msec', float)
+        # debug: print('gflops={}'.format(flops/((msec/1000)*10**6)))
+        return (flops/((msec/1000))/10**9)

From e81f6623bf1e7eb26b8ec11b648c1995bb0549da Mon Sep 17 00:00:00 2001
From: jgp <jgp@cscs.ch>
Date: Fri, 11 Oct 2019 09:42:42 +0200
Subject: [PATCH 2/6] removing unwanted committed files

---
 .../berkeley-ert-serial.py                    | 170 ----------------
 .../likwid_roofline.py                        | 181 ------------------
 2 files changed, 351 deletions(-)
 delete mode 100644 cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py
 delete mode 100644 cscs-checks/tools/profiling_and_debugging/likwid_roofline.py

diff --git a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py
deleted file mode 100644
index 1dee82959c..0000000000
--- a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py
+++ /dev/null
@@ -1,170 +0,0 @@
-import os
-
-import reframe as rfm
-import reframe.utility.sanity as sn
-
-
-#{{{ base
-class ErtTestBase(rfm.RegressionTest):
-    """
-    The Empirical Roofline Tool, ERT, automatically generates roofline data.
-    https://bitbucket.org/berkeleylab/cs-roofline-toolkit/
-    """
-
-    def __init__(self):
-        self.descr = 'Empirical Roofline Toolkit'
-        self.sourcesdir = os.path.join(self.current_system.resourcesdir,
-                                       'roofline', 'ert')
-        self.build_system = 'SingleSource'
-        self.sourcepath = 'kernel1.c driver1.c'
-        self.executable = 'ert.exe'
-        self.build_system.ldflags = ['-O3', '-fopenmp']
-        self.sourcesdir = os.path.join(self.current_system.resourcesdir,
-                                       'roofline', 'ert')
-        self.rpt = '%s.rpt' % self.executable
-        self.maintainers = ['JG']
-        self.tags = {'scs', 'resources'}
-
-    def setup(self, partition, environ, **job_opts):
-        super().setup(partition, environ, **job_opts)
-        if self.num_tasks != 36:
-            self.job.launcher.options = ['--cpu-bind=verbose,none']
-#}}}
-
-#{{{ test
-@rfm.parameterized_test(
-    *[[num_ranks, flop]
-      for num_ranks in [1]
-      for flop in [256, 512, 1024]])
-      #for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]])
-class ErtBroadwellTest(ErtTestBase):
-    def __init__(self, num_ranks, flop):
-        super().__init__()
-        ompthread = 1
-        self.valid_systems = ['daint:mc', 'dom:mc']
-        self.valid_prog_environs = ['PrgEnv-gnu']
-        self.build_system.cppflags = [
-            '-DERT_FLOP=%s' % flop,
-            '-DERT_ALIGN=32',
-            '-DERT_MEMORY_MAX=1073741824',
-            '-DERT_MPI=True',
-            '-DERT_OPENMP=True',
-            '-DERT_TRIALS_MIN=1',
-            '-DERT_WORKING_SET_MIN=1',
-        ]
-        self.name = 'ert_serial_FLOPS.{:04d}_MPI.{:03d}_OpenMP.{:03d}'.format(
-            flop, num_ranks, ompthread)
-        self.exclusive = True
-        self.num_tasks = num_ranks
-        self.num_tasks_per_node = num_ranks
-        self.num_cpus_per_task = ompthread
-        self.num_tasks_per_core = 1
-        self.use_multithreading = False
-        self.variables = {
-            'CRAYPE_LINK_TYPE': 'dynamic',
-            'OMP_NUM_THREADS': str(self.num_cpus_per_task)
-        }
-
-        # take the "slowest" job, make it sleep after it has ended and hope the
-        # other jobs have ended too
-        # TODO: find a better way to wait for the other jobs to end
-        num_ranks_min = 1
-        flop_min = 1024
-        self.roofline_rpt = 'rpt'
-        if num_ranks == num_ranks_min and flop == flop_min:
-            self.post_run = [
-                'cat *_job.out | python2 preprocess.py > pre',
-                'python2 maximum.py < pre > max',
-                'python2 summary.py < max > sum',
-                # give enough time for all the dependent jobs to collect data:
-                'sleep 60',
-                'cat ../ert_serial_FLOPS*/sum | python2 roofline.py > rpt',
-            ]
-
-        else:
-            self.post_run = [
-                'cat *_job.out | python2 preprocess.py > pre',
-                'python2 maximum.py < pre > max',
-                'python2 summary.py < max > sum',
-            ]
-
-        # --- Sanity check:
-        regex_datatype = (r'^\s+(?P<type>\w+) \* __restrict__ buf = '
-                          r'\(\w+ \*\)malloc\(PSIZE\);')
-        datatype = sn.extractsingle(regex_datatype, 'driver1.c', 'type')
-        self.sanity_patterns = sn.all([
-            sn.assert_found('GFLOPs', 'sum'),
-            sn.assert_eq(datatype, 'double'),
-        ])
-
-        # --- Performance check:
-        if num_ranks == num_ranks_min and flop == flop_min:
-            # Reference roofline boundaries for Intel BroadwellCPU (E5-2695v4):
-            ref_GFLOPs = 945.0
-            ref_L1bw = 1788.0
-            ref_L2bw = 855.0
-            ref_L3bw = 547.0
-            ref_DRAMbw = 70.5
-
-            # Typical performance report looks like:
-            # --------------------------------------
-            # ert_FLOPS.1024_MPI.001_OpenMP.036/rpt
-            #    908.43 GFLOPs EMP
-            #    ******
-            # META_DATA
-            # OPENMP_THREADS 1
-            # FLOPS          8
-            # MPI_PROCS      36
-            # 
-            #   5647.33 L1 EMP
-            #   *******
-            #   3203.86 L2 EMP
-            #   *******
-            #   1773.58 L3 EMP
-            #   *******
-            #    139.56 L4 EMP
-            #    103.50 DRAM EMP
-            #    ******
-            # META_DATA
-            # FLOPS          2
-            # OPENMP_THREADS 1
-            # MPI_PROCS      36
-            regex_gflops = r'(?P<GFLOPs>\d+.\d+)\sGFLOPs EMP'
-            regex_L1bw = r'(?P<L1bw>\d+.\d+)\sL1 EMP'
-            regex_L2bw = r'(?P<L2bw>\d+.\d+)\sL2 EMP'
-            regex_L3bw = r'(?P<L3bw>\d+.\d+)\sL3 EMP'
-            regex_DRAMbw = r'(?P<DRAMbw>\d+.\d+) DRAM EMP'
-
-            gflops = sn.extractsingle(regex_gflops, self.roofline_rpt,
-                                      'GFLOPs', float)
-            L1bw = sn.extractsingle(regex_L1bw, self.roofline_rpt,
-                                      'L1bw', float)
-            L2bw = sn.extractsingle(regex_L2bw, self.roofline_rpt,
-                                      'L2bw', float)
-            L3bw = sn.extractsingle(regex_L3bw, self.roofline_rpt,
-                                      'L3bw', float)
-            DRAMbw = sn.extractsingle(regex_DRAMbw, self.roofline_rpt,
-                                      'DRAMbw', float)
-
-            # --performance-report:
-            self.perf_patterns = {
-                'gflops': gflops,
-                'L1bw': L1bw,
-                'L2bw': L2bw,
-                'L3bw': L3bw,
-                'DRAMbw': DRAMbw,
-            }
-
-            self.reference = {
-                '*': {
-                    'gflops': (ref_GFLOPs, -0.1, 0.5, 'GF/s'),
-                    'L1bw': (ref_L1bw, -0.1, 0.3, 'GB/s'),
-                    'L2bw': (ref_L2bw, -0.1, 0.3, 'GB/s'),
-                    'L3bw': (ref_L3bw, -0.1, 0.3, 'GB/s'),
-                    'DRAMbw': (ref_DRAMbw, -0.1, 0.3, 'GB/s'),
-                }
-            }
-
-        # else:
-
-#}}}
diff --git a/cscs-checks/tools/profiling_and_debugging/likwid_roofline.py b/cscs-checks/tools/profiling_and_debugging/likwid_roofline.py
deleted file mode 100644
index 8a8ab08c84..0000000000
--- a/cscs-checks/tools/profiling_and_debugging/likwid_roofline.py
+++ /dev/null
@@ -1,181 +0,0 @@
-import os
-
-import reframe as rfm
-import reframe.utility.sanity as sn
-
-
-@rfm.parameterized_test(*[[repeat, toolsversion, datalayout]
-                          for repeat in ['600000']
-                          for toolsversion in ['4.3.3']
-                          # for datalayout in ['G3_AOS_SCALAR']
-                          for datalayout in ['G3_AOS_SCALAR', 'G3_SOA_SCALAR',
-                                             'G3_AOS_VECTOR', 'G3_SOA_VECTOR']
-                          ])
-class IntelRooflineLikwidTest(rfm.RegressionTest):
-    '''This test checks the values reported by RRZE likwid roofline model:
-
-G3_AOS_SCALAR DP Mflops/sec = 3280.32 L2 bandwidth [MBytes/s] = 39441.3 0.0831697
-G3_AOS_VECTOR DP Mflops/sec = 6432.24 L2 bandwidth [MBytes/s] = 76914 0.083629
-G3_SOA_SCALAR DP Mflops/sec = 3288.39 L2 bandwidth [MBytes/s] = 9.98179 329.439
-G3_SOA_VECTOR DP Mflops/sec = 21126.6 L2 bandwidth [MBytes/s] = 9.6529 2188.63 2.3F/B
-                              10GF                              60000  0.18
-
-        > https://crd.lbl.gov/assets/Uploads/ECP18-Roofline-3-LIKWID.pdf
-        > likwid-perfctr -g CACHES -H
-
-        > Get group definition with (identical result):
-        > cat $EBROOTLIKWID/share/likwid/perfgroups/broadwell/FLOPS_DP.txt
-        > srun -Cmc,perf -n1 -t1 likwid-perfctr -g FLOPS_DP -H
-        DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2 +
-                              FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +
-                              FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)
-                              /runtime
-
-        > srun -Cmc,perf -n1 -t1 likwid-perfctr -g MEM -H
-        Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0) +
-                                               SUM(MBOXxC1))*64.0/runtime
-        Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0) +
-                                               SUM(MBOXxC1))*64.0
-
-       > srun -Cmc,perf -t1 -n1 likwid-perfctr -g L2 -H
-       L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT + L2_TRANS_L1D_WB +
-                                          ICACHE_MISSES)*64.0/time
-       L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT + L2_TRANS_L1D_WB +
-                                          ICACHE_MISSES)*64.0
-
-       > srun -Cmc,perf -t1 -n1 likwid-perfctr -g L3 -H
-       L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL +
-                                          L2_LINES_OUT_DEMAND_DIRTY)*64/time
-       L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL +
-                                          L2_LINES_OUT_DEMAND_DIRTY)*64
-
-       > srun -Cmc,perf -t1 -n1 likwid-perfctr -g CACHES -H
-       Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD) +
-                                              SUM(CAS_COUNT_WR))*64.0/time
-       Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD) +
-                                              SUM(CAS_COUNT_WR))*64.0
-    '''
-    def __init__(self, repeat, toolsversion, datalayout):
-        super().__init__()
-        self.descr = 'Roofline Analysis test with Likwid:'
-        self.valid_systems = ['dom:mc']
-        # Reporting MFLOPS is not available on Intel Haswell cpus, see
-        # https://www.intel.fr/content/dam/www/public/us/en/documents/manuals/
-        # 64-ia-32-architectures-software-developer-vol-1-manual.pdf
-        self.valid_prog_environs = ['PrgEnv-intel']
-        self.modules = ['likwid']
-        # likwid/4.3.3-perf_event
-        self.sourcesdir = os.path.join(self.current_system.resourcesdir,
-                                       'roofline', 'intel_advisor')
-        self.build_system = 'SingleSource'
-        self.sourcepath = '_roofline.cpp'
-        self.executable = 'likwid-perfctr'
-        self.target_executable = './roof.exe'
-        self.build_system.cppflags = ['-D_LIKWID', '-DLIKWID_PERFMON',
-                                      '-I$EBROOTLIKWID/include']
-        self.prgenv_flags = {
-            'PrgEnv-intel': ['-g', '-O2', '-std=c++11', '-restrict'],
-            # '-qopt-streaming-stores', 'always',
-        }
-        self.build_system.ldflags = ['-L$EBROOTLIKWID/lib', '-llikwid']
-        self.prebuild_cmd = [
-            'patch -s < LIKWID/roofline_template.patch',
-            'sed -e "s-XXXX-%s-" -e "s-YYYY-%s-" %s &> %s' %
-            (repeat, datalayout, 'roofline_template.cpp', '_roofline.cpp')
-        ]
-        self.exclusive = True
-        self.num_tasks = 1
-        self.num_tasks_per_node = 1
-        self.num_cpus_per_task = 1
-        self.num_tasks_per_core = 1
-        self.use_multithreading = False
-        self.variables = {
-            'OMP_NUM_THREADS': str(self.num_cpus_per_task),
-            'CRAYPE_LINK_TYPE': 'dynamic',
-        }
-        self.pre_run = [
-            'mv %s %s' % (self.executable, self.target_executable),
-        ]
-        self.tool_flags = ['-C 0 -g FLOPS_DP -m %s ' % self.target_executable]
-        # -C 0 : sets processor id(s) to pin threads and measure
-        # -g   : sets performance group
-        # -m   : use likwid API
-        self.executable_opts = self.tool_flags
-        self.maintainers = ['JG']
-        self.tags = {'scs', 'resources'}
-        # self.rpt = '%s.rpt' % self.target_executable
-        self.sanity_patterns = sn.all([
-            sn.assert_found('loop complete.', self.stdout),
-            sn.assert_eq(sn.extractsingle(
-                r'^likwid-perfctr -- Version (?P<toolsversion>\d.\d.\d)',
-                self.stdout, 'toolsversion'), toolsversion),
-        ])
-        # References for Intel Broadwell CPU (E5-2695 v4):
-        references = {
-            'G3_AOS_SCALAR': {
-                'dom:mc': {
-                    'gflops': (0.596, -0.1, 0.3, 'Gflop/s'),
-                    'ai': (0.16, -0.05, 0.05, 'flop/byte')
-                }
-            },
-            'G3_SOA_SCALAR': {
-                'dom:mc': {
-                    'gflops': (0.612, -0.1, 0.3, 'Gflop/s'),
-                    'ai': (0.16, -0.05, 0.05, 'flop/byte')
-                }
-            },
-            'G3_AOS_VECTOR': {
-                'dom:mc': {
-                    'gflops': (1.152, -0.1, 0.3, 'Gflop/s'),
-                    'ai': (0.125, -0.05, 0.05, 'flop/byte')
-                }
-            },
-            'G3_SOA_VECTOR': {
-                'dom:mc': {
-                    'gflops': (1.125, -0.1, 0.3, 'Gflop/s'),
-                    'ai': (0.16, -0.05, 0.05, 'flop/byte')
-                }
-            },
-        }
-        self.reference = references[datalayout]
-        self.perf_patterns = {
-            'gflops': self.gflops,
-            'ai': self.arithmetic_intensity,
-        }
-
-    def setup(self, partition, environ, **job_opts):
-        super().setup(partition, environ, **job_opts)
-        run_cmd = ' '.join(self.job.launcher.command(self.job))
-        self.post_run = ['%s -v' % self.executable]
-        # self.perf_group = ['L2', 'L3']
-        self.perf_group = ['L2', 'L3', 'CACHES', 'DATA',
-                           'MEM', 'MEM_DP', 'MEM_SP']
-        for perf_group in self.perf_group:
-            self.post_run += ['%s %s -C 0 -g %s -m %s' %
-                              (run_cmd, self.executable, perf_group,
-                               self.target_executable)]
-        partitiontype = partition.fullname.split(':')[1]
-        if partitiontype == 'gpu':
-            self.job.options = ['--constraint="gpu&perf"']
-        elif partitiontype == 'mc':
-            self.job.options = ['--constraint="mc&perf"']
-
-    @property
-    @sn.sanity_function
-    def arithmetic_intensity(self):
-        flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)',
-                                 self.rpt, 'flops', int)
-        bytes = sn.extractsingle(r'^--->Total Bytes = (?P<bytes>\d+)',
-                                 self.rpt, 'bytes', int)
-        # debug: print('ai={}'.format(flops/bytes))
-        return flops/bytes
-
-    @property
-    @sn.sanity_function
-    def gflops(self):
-        flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)',
-                                 self.rpt, 'flops', int)
-        msec = sn.extractsingle(r'^elapsed time: (?P<msec>\d+)ms', self.stdout,
-                                'msec', float)
-        # debug: print('gflops={}'.format(flops/((msec/1000)*10**6)))
-        return (flops/((msec/1000))/10**9)

From bfd0c52a76da6d224655c393bba655af6f15b30a Mon Sep 17 00:00:00 2001
From: jgp <jgp@cscs.ch>
Date: Fri, 11 Oct 2019 09:44:21 +0200
Subject: [PATCH 3/6] pep8

---
 cscs-checks/libraries/math/scalapack_compile_run.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cscs-checks/libraries/math/scalapack_compile_run.py b/cscs-checks/libraries/math/scalapack_compile_run.py
index 62ad1d7b79..b9ff658909 100644
--- a/cscs-checks/libraries/math/scalapack_compile_run.py
+++ b/cscs-checks/libraries/math/scalapack_compile_run.py
@@ -75,14 +75,14 @@ def scalapack_sanity(number1, number2, expected_value):
 # class ScaLAPACKPerf(ScaLAPACKTest):
 #     def __init__(self, linkage):
 #         super().__init__(linkage)
-# 
+#
 #         self.tags |= {'monch_acceptance'}
 #         self.sourcepath = 'scalapack_performance_compile_run.f'
 #         self.valid_systems = ['monch:compute']
 #         self.valid_prog_environs = ['PrgEnv-gnu']
 #         self.num_tasks = 64
 #         self.num_tasks_per_node = 16
-# 
+#
 #         self.sanity_patterns = sn.assert_found(r'Run', self.stdout)
 #         self.perf_patterns = {
 #             'perf': sn.max(
@@ -90,7 +90,7 @@ def scalapack_sanity(number1, number2, expected_value):
 #                               self.stdout, 'gflops', float)
 #             )
 #         }
-# 
+#
 #         self.reference = {
 #             'monch:compute': {
 #                 'perf': (24., -0.1, None)

From 620725146850f7368e3d6b25b1da5c23301e83d3 Mon Sep 17 00:00:00 2001
From: jgp <jgp@cscs.ch>
Date: Tue, 15 Oct 2019 12:42:55 +0200
Subject: [PATCH 4/6] change tagname

---
 cscs-checks/apps/amber/amber_check.py         |   2 +-
 cscs-checks/apps/espresso/espresso_check.py   |   2 +-
 cscs-checks/apps/gromacs/gromacs_check.py     |   2 +-
 cscs-checks/apps/icon/rrtmgp_check.py         |   2 +-
 cscs-checks/apps/lammps/lammps_check.py       |   2 +-
 cscs-checks/apps/namd/namd_check.py           |   2 +-
 cscs-checks/apps/openfoam/check_openfoam.py   |   2 +-
 .../apps/openfoam/check_openfoam_extend.py    |   2 +-
 cscs-checks/cuda/cuda_checks.py               |   2 +-
 cscs-checks/cuda/multi_gpu.py                 |   2 +-
 cscs-checks/cuda/nvml_check.py                |   2 +-
 .../libraries/io/netcdf_compile_run.py        |   2 +-
 .../libraries/math/scalapack_compile_run.py   |   2 +-
 cscs-checks/mch/fieldextra_check.py           |   2 +-
 .../microbenchmarks/hpcg/hpcg_benchmark.py    |   2 +-
 .../microbenchmarks/spec-accel/spec.py        |   2 +-
 cscs-checks/system/io/ior_check.py            |   2 +-
 cscs-checks/tools/io/cdo.py                   |   2 +-
 cscs-checks/tools/io/nco.py                   |   2 +-
 .../berkeley-ert-nvprof.py                    |   2 +-
 .../berkeley-ert-serial.py                    | 170 ++++++++++++++++
 .../profiling_and_debugging/berkeley-ert.py   |   2 +-
 .../intel_advisor_roofline.py                 |   2 +-
 .../intel_sde_berkeley_stream.py              |   2 +-
 .../intel_sde_roofline.py                     |   2 +-
 .../intel_vtune_roofline.py                   |   2 +-
 .../likwid_roofline.py                        | 181 ++++++++++++++++++
 27 files changed, 376 insertions(+), 25 deletions(-)
 create mode 100644 cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py
 create mode 100644 cscs-checks/tools/profiling_and_debugging/likwid_roofline.py

diff --git a/cscs-checks/apps/amber/amber_check.py b/cscs-checks/apps/amber/amber_check.py
index 4ddce619d2..8594d30394 100644
--- a/cscs-checks/apps/amber/amber_check.py
+++ b/cscs-checks/apps/amber/amber_check.py
@@ -38,7 +38,7 @@ def __init__(self, input_file, output_file):
                                      output_file, 'perf', float, item=1)
         }
         self.maintainers = ['SO', 'VH']
-        self.tags = {'scs', 'resources'}
+        self.tags = {'scs', 'external-resources'}
 
 
 @rfm.required_version('>=2.16')
diff --git a/cscs-checks/apps/espresso/espresso_check.py b/cscs-checks/apps/espresso/espresso_check.py
index 876ae49753..c21c900f65 100644
--- a/cscs-checks/apps/espresso/espresso_check.py
+++ b/cscs-checks/apps/espresso/espresso_check.py
@@ -11,7 +11,7 @@ def __init__(self, scale):
         super().__init__()
         self.descr = 'Quantum Espresso CPU check'
         self.maintainers = ['AK', 'LM']
-        self.tags = {'scs', 'production', 'resources'}
+        self.tags = {'scs', 'production', 'external-resources'}
         self.sourcesdir = os.path.join(self.current_system.resourcesdir,
                                        'Espresso')
 
diff --git a/cscs-checks/apps/gromacs/gromacs_check.py b/cscs-checks/apps/gromacs/gromacs_check.py
index 81bc39d767..213db9531c 100644
--- a/cscs-checks/apps/gromacs/gromacs_check.py
+++ b/cscs-checks/apps/gromacs/gromacs_check.py
@@ -41,7 +41,7 @@ def __init__(self, output_file):
                 'num_switches': 1
             }
         }
-        self.tags = {'scs', 'resources'}
+        self.tags = {'scs', 'external-resources'}
 
 
 @rfm.required_version('>=2.19')
diff --git a/cscs-checks/apps/icon/rrtmgp_check.py b/cscs-checks/apps/icon/rrtmgp_check.py
index 9d157e7ea9..e6e332c674 100644
--- a/cscs-checks/apps/icon/rrtmgp_check.py
+++ b/cscs-checks/apps/icon/rrtmgp_check.py
@@ -14,7 +14,7 @@ def __init__(self):
         self.valid_prog_environs = ['PrgEnv-pgi']
         self.sourcesdir = os.path.join(self.current_system.resourcesdir,
                                        'RRTMGP')
-        self.tags = {'resources'}
+        self.tags = {'external-resources'}
         self.prebuild_cmd = ['cp build/Makefile.conf.dom build/Makefile.conf']
         self.executable = 'python'
         self.executable_opts = [
diff --git a/cscs-checks/apps/lammps/lammps_check.py b/cscs-checks/apps/lammps/lammps_check.py
index 23a69cff68..017dd0b067 100644
--- a/cscs-checks/apps/lammps/lammps_check.py
+++ b/cscs-checks/apps/lammps/lammps_check.py
@@ -33,7 +33,7 @@ def __init__(self):
             }
         }
 
-        self.tags = {'scs', 'resources'}
+        self.tags = {'scs', 'external-resources'}
         self.maintainers = ['TR', 'VH']
 
 
diff --git a/cscs-checks/apps/namd/namd_check.py b/cscs-checks/apps/namd/namd_check.py
index eb91d82520..31b3095c5c 100644
--- a/cscs-checks/apps/namd/namd_check.py
+++ b/cscs-checks/apps/namd/namd_check.py
@@ -44,7 +44,7 @@ def __init__(self, arch, scale, variant):
         }
 
         self.maintainers = ['CB', 'LM']
-        self.tags = {'scs', 'resources'}
+        self.tags = {'scs', 'external-resources'}
         self.strict_check = False
         self.extra_resources = {
             'switches': {
diff --git a/cscs-checks/apps/openfoam/check_openfoam.py b/cscs-checks/apps/openfoam/check_openfoam.py
index dff3d6de8e..48dd58b4ba 100644
--- a/cscs-checks/apps/openfoam/check_openfoam.py
+++ b/cscs-checks/apps/openfoam/check_openfoam.py
@@ -22,7 +22,7 @@ def __init__(self):
         self.num_cpus_per_task  = 1
 
         self.maintainers = ['MKr']
-        self.tags = {'scs', 'production', 'resources'}
+        self.tags = {'scs', 'production', 'external-resources'}
 
         self.pre_run = ['source $FOAM_BASH']
 
diff --git a/cscs-checks/apps/openfoam/check_openfoam_extend.py b/cscs-checks/apps/openfoam/check_openfoam_extend.py
index d6b7d7194d..9ec5285f6d 100644
--- a/cscs-checks/apps/openfoam/check_openfoam_extend.py
+++ b/cscs-checks/apps/openfoam/check_openfoam_extend.py
@@ -28,7 +28,7 @@ def __init__(self):
                 r'Finalising parallel run', self.stdout)
 
         self.maintainers = ['MKr']
-        self.tags = {'scs', 'production', 'resources'}
+        self.tags = {'scs', 'production', 'external-resources'}
         self.pre_run = ['source $FOAM_INST_DIR/foam-extend-4.0/etc/bashrc']
 
 
diff --git a/cscs-checks/cuda/cuda_checks.py b/cscs-checks/cuda/cuda_checks.py
index 73b84ca02b..aa6ad63e22 100644
--- a/cscs-checks/cuda/cuda_checks.py
+++ b/cscs-checks/cuda/cuda_checks.py
@@ -27,7 +27,7 @@ def __init__(self):
             self.nvidia_sm = '37'
 
         self.maintainers = ['AJ', 'VK']
-        self.tags = {'production', 'resources'}
+        self.tags = {'production', 'external-resources'}
 
 
 @rfm.required_version('>=2.14')
diff --git a/cscs-checks/cuda/multi_gpu.py b/cscs-checks/cuda/multi_gpu.py
index 1392ea1df4..f2b94f7def 100644
--- a/cscs-checks/cuda/multi_gpu.py
+++ b/cscs-checks/cuda/multi_gpu.py
@@ -61,7 +61,7 @@ def __init__(self):
             'kesch:cn:d2h':   (7584, -0.1, None, 'MB/s'),
             'kesch:cn:d2d': (137408, -0.1, None, 'MB/s')
         }
-        self.tags = {'diagnostic', 'mch', 'resources'}
+        self.tags = {'diagnostic', 'mch', 'external-resources'}
         self.maintainers = ['AJ', 'VK']
 
     def _xfer_pattern(self, xfer_kind, devno, nodename):
diff --git a/cscs-checks/cuda/nvml_check.py b/cscs-checks/cuda/nvml_check.py
index f27d3f5a9b..44553c66c2 100644
--- a/cscs-checks/cuda/nvml_check.py
+++ b/cscs-checks/cuda/nvml_check.py
@@ -22,4 +22,4 @@ def __init__(self):
             r"compute\s+mode\s+'Exclusive Process'", self.stdout)
 
         self.maintainers = ['AJ', 'VK']
-        self.tags = {'production', 'resources'}
+        self.tags = {'production', 'external-resources'}
diff --git a/cscs-checks/libraries/io/netcdf_compile_run.py b/cscs-checks/libraries/io/netcdf_compile_run.py
index 37a3d2df7f..ee77b005c2 100644
--- a/cscs-checks/libraries/io/netcdf_compile_run.py
+++ b/cscs-checks/libraries/io/netcdf_compile_run.py
@@ -40,7 +40,7 @@ def __init__(self, lang, linkage):
         self.num_tasks_per_node = 1
         self.sanity_patterns = sn.assert_found(r'SUCCESS', self.stdout)
         self.maintainers = ['AJ', 'VK']
-        self.tags = {'production', 'resources'}
+        self.tags = {'production', 'external-resources'}
 
     def setup(self, partition, environ, **job_opts):
         if self.current_system.name == 'kesch':
diff --git a/cscs-checks/libraries/math/scalapack_compile_run.py b/cscs-checks/libraries/math/scalapack_compile_run.py
index b9ff658909..be040662d9 100644
--- a/cscs-checks/libraries/math/scalapack_compile_run.py
+++ b/cscs-checks/libraries/math/scalapack_compile_run.py
@@ -27,7 +27,7 @@ def __init__(self, linkage):
         self.build_system = 'SingleSource'
         self.build_system.fflags = ['-O3']
         self.maintainers = ['CB', 'LM', 'MKr']
-        self.tags = {'production', 'resources'}
+        self.tags = {'production', 'external-resources'}
 
 
 @rfm.required_version('>=2.14')
diff --git a/cscs-checks/mch/fieldextra_check.py b/cscs-checks/mch/fieldextra_check.py
index b10df5bf65..60129d7887 100644
--- a/cscs-checks/mch/fieldextra_check.py
+++ b/cscs-checks/mch/fieldextra_check.py
@@ -8,7 +8,7 @@ class FieldextraTestBase(rfm.RunOnlyRegressionTest):
     def __init__(self):
         super().__init__()
         self.maintainers = ['MKr']
-        self.tags = {'mch', 'resources'}
+        self.tags = {'mch', 'external-resources'}
 
         self.valid_systems = ['kesch:cn']
         self.valid_prog_environs = ['PrgEnv-gnu-nompi']
diff --git a/cscs-checks/microbenchmarks/hpcg/hpcg_benchmark.py b/cscs-checks/microbenchmarks/hpcg/hpcg_benchmark.py
index 65548472ac..939298704c 100644
--- a/cscs-checks/microbenchmarks/hpcg/hpcg_benchmark.py
+++ b/cscs-checks/microbenchmarks/hpcg/hpcg_benchmark.py
@@ -52,7 +52,7 @@ def __init__(self):
         }
 
         self.maintainers = ['SK']
-        self.tags = {'diagnostic', 'benchmark', 'resources'}
+        self.tags = {'diagnostic', 'benchmark', 'external-resources'}
 
     @property
     @sn.sanity_function
diff --git a/cscs-checks/microbenchmarks/spec-accel/spec.py b/cscs-checks/microbenchmarks/spec-accel/spec.py
index fc219d9e14..e6cac960bd 100644
--- a/cscs-checks/microbenchmarks/spec-accel/spec.py
+++ b/cscs-checks/microbenchmarks/spec-accel/spec.py
@@ -53,7 +53,7 @@ def __init__(self, prg_envs):
         }
 
         self.maintainers = ['SK']
-        self.tags = {'diagnostic', 'resources'}
+        self.tags = {'diagnostic', 'external-resources'}
 
     def setup(self, partition, environ, **job_opts):
         self.pre_run = ['source ./shrc', 'mv %s config' %
diff --git a/cscs-checks/system/io/ior_check.py b/cscs-checks/system/io/ior_check.py
index 757d1493a5..874113b61e 100644
--- a/cscs-checks/system/io/ior_check.py
+++ b/cscs-checks/system/io/ior_check.py
@@ -108,7 +108,7 @@ def __init__(self, base_dir):
 
         systems_to_test = ['dom', 'daint']
         if self.current_system.name in systems_to_test:
-            self.tags |= {'production', 'resources'}
+            self.tags |= {'production', 'external-resources'}
 
     def setup(self, partition, environ, **job_opts):
         super().setup(partition, environ, **job_opts)
diff --git a/cscs-checks/tools/io/cdo.py b/cscs-checks/tools/io/cdo.py
index 400d29d0fc..62e659ee05 100644
--- a/cscs-checks/tools/io/cdo.py
+++ b/cscs-checks/tools/io/cdo.py
@@ -40,7 +40,7 @@ def __init__(self):
             self.modules = ['CDO']
 
         self.maintainers = ['SO']
-        self.tags = {'production', 'mch', 'resources'}
+        self.tags = {'production', 'mch', 'external-resources'}
 
 
 # Check that the netCDF loaded by the CDO module supports the nc4 filetype
diff --git a/cscs-checks/tools/io/nco.py b/cscs-checks/tools/io/nco.py
index 9997f90ecf..8a6a3f2e98 100644
--- a/cscs-checks/tools/io/nco.py
+++ b/cscs-checks/tools/io/nco.py
@@ -33,7 +33,7 @@ def __init__(self):
             self.modules = ['NCO']
 
         self.maintainers = ['SO']
-        self.tags = {'production', 'mch', 'resources'}
+        self.tags = {'production', 'mch', 'external-resources'}
 
 
 # Check that the netCDF loaded by the NCO module supports the nc4 filetype
diff --git a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py
index 933ffab571..7bfc7c64fa 100644
--- a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py
+++ b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py
@@ -72,7 +72,7 @@ def __init__(self, gpudims, flop, repeat):
         ]
         self.build_system.ldflags = ['-O3']
         self.maintainers = ['JG']
-        self.tags = {'scs', 'resources'}
+        self.tags = {'scs', 'external-resources'}
         gpu_blocks, gpu_threads = gpudims
         self.name = 'ertgpu_Run.{}_FLOPS.{}_GPUBlocks.{}_GPUThreads.{}'.format(
             repeat, flop, gpu_blocks, gpu_threads)
diff --git a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py
new file mode 100644
index 0000000000..68a30c8bb4
--- /dev/null
+++ b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py
@@ -0,0 +1,170 @@
+import os
+
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+
+#{{{ base
+class ErtTestBase(rfm.RegressionTest):
+    """
+    The Empirical Roofline Tool, ERT, automatically generates roofline data.
+    https://bitbucket.org/berkeleylab/cs-roofline-toolkit/
+    """
+
+    def __init__(self):
+        self.descr = 'Empirical Roofline Toolkit'
+        self.sourcesdir = os.path.join(self.current_system.external-external-resourcesdir,
+                                       'roofline', 'ert')
+        self.build_system = 'SingleSource'
+        self.sourcepath = 'kernel1.c driver1.c'
+        self.executable = 'ert.exe'
+        self.build_system.ldflags = ['-O3', '-fopenmp']
+        self.sourcesdir = os.path.join(self.current_system.external-external-resourcesdir,
+                                       'roofline', 'ert')
+        self.rpt = '%s.rpt' % self.executable
+        self.maintainers = ['JG']
+        self.tags = {'scs', 'external-external-resources'}
+
+    def setup(self, partition, environ, **job_opts):
+        super().setup(partition, environ, **job_opts)
+        if self.num_tasks != 36:
+            self.job.launcher.options = ['--cpu-bind=verbose,none']
+#}}}
+
+#{{{ test
+@rfm.parameterized_test(
+    *[[num_ranks, flop]
+      for num_ranks in [1]
+      for flop in [256, 512, 1024]])
+      #for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]])
+class ErtBroadwellTest(ErtTestBase):
+    def __init__(self, num_ranks, flop):
+        super().__init__()
+        ompthread = 1
+        self.valid_systems = ['daint:mc', 'dom:mc']
+        self.valid_prog_environs = ['PrgEnv-gnu']
+        self.build_system.cppflags = [
+            '-DERT_FLOP=%s' % flop,
+            '-DERT_ALIGN=32',
+            '-DERT_MEMORY_MAX=1073741824',
+            '-DERT_MPI=True',
+            '-DERT_OPENMP=True',
+            '-DERT_TRIALS_MIN=1',
+            '-DERT_WORKING_SET_MIN=1',
+        ]
+        self.name = 'ert_serial_FLOPS.{:04d}_MPI.{:03d}_OpenMP.{:03d}'.format(
+            flop, num_ranks, ompthread)
+        self.exclusive = True
+        self.num_tasks = num_ranks
+        self.num_tasks_per_node = num_ranks
+        self.num_cpus_per_task = ompthread
+        self.num_tasks_per_core = 1
+        self.use_multithreading = False
+        self.variables = {
+            'CRAYPE_LINK_TYPE': 'dynamic',
+            'OMP_NUM_THREADS': str(self.num_cpus_per_task)
+        }
+
+        # take the "slowest" job, make it sleep after it has ended and hope the
+        # other jobs have ended too
+        # TODO: find a better way to wait for the other jobs to end
+        num_ranks_min = 1
+        flop_min = 1024
+        self.roofline_rpt = 'rpt'
+        if num_ranks == num_ranks_min and flop == flop_min:
+            self.post_run = [
+                'cat *_job.out | python2 preprocess.py > pre',
+                'python2 maximum.py < pre > max',
+                'python2 summary.py < max > sum',
+                # give enough time for all the dependent jobs to collect data:
+                'sleep 60',
+                'cat ../ert_serial_FLOPS*/sum | python2 roofline.py > rpt',
+            ]
+
+        else:
+            self.post_run = [
+                'cat *_job.out | python2 preprocess.py > pre',
+                'python2 maximum.py < pre > max',
+                'python2 summary.py < max > sum',
+            ]
+
+        # --- Sanity check:
+        regex_datatype = (r'^\s+(?P<type>\w+) \* __restrict__ buf = '
+                          r'\(\w+ \*\)malloc\(PSIZE\);')
+        datatype = sn.extractsingle(regex_datatype, 'driver1.c', 'type')
+        self.sanity_patterns = sn.all([
+            sn.assert_found('GFLOPs', 'sum'),
+            sn.assert_eq(datatype, 'double'),
+        ])
+
+        # --- Performance check:
+        if num_ranks == num_ranks_min and flop == flop_min:
+            # Reference roofline boundaries for Intel BroadwellCPU (E5-2695v4):
+            ref_GFLOPs = 945.0
+            ref_L1bw = 1788.0
+            ref_L2bw = 855.0
+            ref_L3bw = 547.0
+            ref_DRAMbw = 70.5
+
+            # Typical performance report looks like:
+            # --------------------------------------
+            # ert_FLOPS.1024_MPI.001_OpenMP.036/rpt
+            #    908.43 GFLOPs EMP
+            #    ******
+            # META_DATA
+            # OPENMP_THREADS 1
+            # FLOPS          8
+            # MPI_PROCS      36
+            # 
+            #   5647.33 L1 EMP
+            #   *******
+            #   3203.86 L2 EMP
+            #   *******
+            #   1773.58 L3 EMP
+            #   *******
+            #    139.56 L4 EMP
+            #    103.50 DRAM EMP
+            #    ******
+            # META_DATA
+            # FLOPS          2
+            # OPENMP_THREADS 1
+            # MPI_PROCS      36
+            regex_gflops = r'(?P<GFLOPs>\d+.\d+)\sGFLOPs EMP'
+            regex_L1bw = r'(?P<L1bw>\d+.\d+)\sL1 EMP'
+            regex_L2bw = r'(?P<L2bw>\d+.\d+)\sL2 EMP'
+            regex_L3bw = r'(?P<L3bw>\d+.\d+)\sL3 EMP'
+            regex_DRAMbw = r'(?P<DRAMbw>\d+.\d+) DRAM EMP'
+
+            gflops = sn.extractsingle(regex_gflops, self.roofline_rpt,
+                                      'GFLOPs', float)
+            L1bw = sn.extractsingle(regex_L1bw, self.roofline_rpt,
+                                      'L1bw', float)
+            L2bw = sn.extractsingle(regex_L2bw, self.roofline_rpt,
+                                      'L2bw', float)
+            L3bw = sn.extractsingle(regex_L3bw, self.roofline_rpt,
+                                      'L3bw', float)
+            DRAMbw = sn.extractsingle(regex_DRAMbw, self.roofline_rpt,
+                                      'DRAMbw', float)
+
+            # --performance-report:
+            self.perf_patterns = {
+                'gflops': gflops,
+                'L1bw': L1bw,
+                'L2bw': L2bw,
+                'L3bw': L3bw,
+                'DRAMbw': DRAMbw,
+            }
+
+            self.reference = {
+                '*': {
+                    'gflops': (ref_GFLOPs, -0.1, 0.5, 'GF/s'),
+                    'L1bw': (ref_L1bw, -0.1, 0.3, 'GB/s'),
+                    'L2bw': (ref_L2bw, -0.1, 0.3, 'GB/s'),
+                    'L3bw': (ref_L3bw, -0.1, 0.3, 'GB/s'),
+                    'DRAMbw': (ref_DRAMbw, -0.1, 0.3, 'GB/s'),
+                }
+            }
+
+        # else:
+
+#}}}
diff --git a/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py b/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py
index 580aff48e8..e9cef9be20 100644
--- a/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py
+++ b/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py
@@ -23,7 +23,7 @@ def __init__(self):
                                        'roofline', 'ert')
         self.rpt = '%s.rpt' % self.executable
         self.maintainers = ['JG']
-        self.tags = {'scs', 'resources'}
+        self.tags = {'scs', 'external-resources'}
 
     def setup(self, partition, environ, **job_opts):
         super().setup(partition, environ, **job_opts)
diff --git a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py b/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py
index 4172410021..6c260b9e74 100644
--- a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py
+++ b/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py
@@ -238,7 +238,7 @@ def __init__(self, repeat, toolversion, datalayout):
         }
 
         self.maintainers = ['JG']
-        self.tags = {'production', 'resources'}
+        self.tags = {'production', 'external-resources'}
 
     def setup(self, partition, environ, **job_opts):
         super().setup(partition, environ, **job_opts)
diff --git a/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py b/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py
index 489f4ee9dd..b3ff46d5b4 100644
--- a/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py
+++ b/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py
@@ -35,7 +35,7 @@ def __init__(self):
         self.sanity_patterns = sn.assert_found('Total FLOPs =', self.rpt)
         self.post_run = ['SDE/parse-sde.sh %s.* &> %s' % (self.sde, self.rpt)]
         self.maintainers = ['JG']
-        self.tags = {'scs', 'resources'}
+        self.tags = {'scs', 'external-resources'}
 
     @property
     @sn.sanity_function
diff --git a/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py b/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py
index b6b3015d06..ab98a6cefb 100644
--- a/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py
+++ b/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py
@@ -68,7 +68,7 @@ def __init__(self, repeat, toolsversion, datalayout):
         self.sanity_patterns = sn.assert_found('Total FLOPs =', self.rpt)
         self.post_run = ['SDE/parse-sde.sh %s.* &> %s' % (self.sde, self.rpt)]
         self.maintainers = ['JG']
-        self.tags = {'scs', 'resources'}
+        self.tags = {'scs', 'external-resources'}
         self.sanity_patterns = sn.all([
             sn.assert_eq(sn.extractsingle(
                 r'^Intel\(R\) Software Development Emulator\.  Version:  '
diff --git a/cscs-checks/tools/profiling_and_debugging/intel_vtune_roofline.py b/cscs-checks/tools/profiling_and_debugging/intel_vtune_roofline.py
index 46e173e8a5..ad745a924b 100644
--- a/cscs-checks/tools/profiling_and_debugging/intel_vtune_roofline.py
+++ b/cscs-checks/tools/profiling_and_debugging/intel_vtune_roofline.py
@@ -76,7 +76,7 @@ def __init__(self, repeat, toolsversion, datalayout):
         # NOTE: -allow-multiple-runs requires to install vtune drivers
         # TODO: -collect memory-access
         self.maintainers = ['JG']
-        self.tags = {'scs', 'resources'}
+        self.tags = {'scs', 'external-resources'}
         self.sanity_patterns = sn.all([
             sn.assert_found('loop complete.', self.stdout),
             sn.assert_eq(sn.extractsingle(
diff --git a/cscs-checks/tools/profiling_and_debugging/likwid_roofline.py b/cscs-checks/tools/profiling_and_debugging/likwid_roofline.py
new file mode 100644
index 0000000000..9eda059c13
--- /dev/null
+++ b/cscs-checks/tools/profiling_and_debugging/likwid_roofline.py
@@ -0,0 +1,181 @@
+import os
+
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+
+@rfm.parameterized_test(*[[repeat, toolsversion, datalayout]
+                          for repeat in ['600000']
+                          for toolsversion in ['4.3.3']
+                          # for datalayout in ['G3_AOS_SCALAR']
+                          for datalayout in ['G3_AOS_SCALAR', 'G3_SOA_SCALAR',
+                                             'G3_AOS_VECTOR', 'G3_SOA_VECTOR']
+                          ])
+class IntelRooflineLikwidTest(rfm.RegressionTest):
+    '''This test checks the values reported by RRZE likwid roofline model:
+
+G3_AOS_SCALAR DP Mflops/sec = 3280.32 L2 bandwidth [MBytes/s] = 39441.3 0.0831697
+G3_AOS_VECTOR DP Mflops/sec = 6432.24 L2 bandwidth [MBytes/s] = 76914 0.083629
+G3_SOA_SCALAR DP Mflops/sec = 3288.39 L2 bandwidth [MBytes/s] = 9.98179 329.439
+G3_SOA_VECTOR DP Mflops/sec = 21126.6 L2 bandwidth [MBytes/s] = 9.6529 2188.63 2.3F/B
+                              10GF                              60000  0.18
+
+        > https://crd.lbl.gov/assets/Uploads/ECP18-Roofline-3-LIKWID.pdf
+        > likwid-perfctr -g CACHES -H
+
+        > Get group definition with (identical result):
+        > cat $EBROOTLIKWID/share/likwid/perfgroups/broadwell/FLOPS_DP.txt
+        > srun -Cmc,perf -n1 -t1 likwid-perfctr -g FLOPS_DP -H
+        DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2 +
+                              FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +
+                              FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)
+                              /runtime
+
+        > srun -Cmc,perf -n1 -t1 likwid-perfctr -g MEM -H
+        Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0) +
+                                               SUM(MBOXxC1))*64.0/runtime
+        Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0) +
+                                               SUM(MBOXxC1))*64.0
+
+       > srun -Cmc,perf -t1 -n1 likwid-perfctr -g L2 -H
+       L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT + L2_TRANS_L1D_WB +
+                                          ICACHE_MISSES)*64.0/time
+       L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT + L2_TRANS_L1D_WB +
+                                          ICACHE_MISSES)*64.0
+
+       > srun -Cmc,perf -t1 -n1 likwid-perfctr -g L3 -H
+       L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL +
+                                          L2_LINES_OUT_DEMAND_DIRTY)*64/time
+       L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL +
+                                          L2_LINES_OUT_DEMAND_DIRTY)*64
+
+       > srun -Cmc,perf -t1 -n1 likwid-perfctr -g CACHES -H
+       Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD) +
+                                              SUM(CAS_COUNT_WR))*64.0/time
+       Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD) +
+                                              SUM(CAS_COUNT_WR))*64.0
+    '''
+    def __init__(self, repeat, toolsversion, datalayout):
+        super().__init__()
+        self.descr = 'Roofline Analysis test with Likwid:'
+        self.valid_systems = ['dom:mc']
+        # Reporting MFLOPS is not available on Intel Haswell cpus, see
+        # https://www.intel.fr/content/dam/www/public/us/en/documents/manuals/
+        # 64-ia-32-architectures-software-developer-vol-1-manual.pdf
+        self.valid_prog_environs = ['PrgEnv-intel']
+        self.modules = ['likwid']
+        # likwid/4.3.3-perf_event
+        self.sourcesdir = os.path.join(self.current_system.external-external-resourcesdir,
+                                       'roofline', 'intel_advisor')
+        self.build_system = 'SingleSource'
+        self.sourcepath = '_roofline.cpp'
+        self.executable = 'likwid-perfctr'
+        self.target_executable = './roof.exe'
+        self.build_system.cppflags = ['-D_LIKWID', '-DLIKWID_PERFMON',
+                                      '-I$EBROOTLIKWID/include']
+        self.prgenv_flags = {
+            'PrgEnv-intel': ['-g', '-O2', '-std=c++11', '-restrict'],
+            # '-qopt-streaming-stores', 'always',
+        }
+        self.build_system.ldflags = ['-L$EBROOTLIKWID/lib', '-llikwid']
+        self.prebuild_cmd = [
+            'patch -s < LIKWID/roofline_template.patch',
+            'sed -e "s-XXXX-%s-" -e "s-YYYY-%s-" %s &> %s' %
+            (repeat, datalayout, 'roofline_template.cpp', '_roofline.cpp')
+        ]
+        self.exclusive = True
+        self.num_tasks = 1
+        self.num_tasks_per_node = 1
+        self.num_cpus_per_task = 1
+        self.num_tasks_per_core = 1
+        self.use_multithreading = False
+        self.variables = {
+            'OMP_NUM_THREADS': str(self.num_cpus_per_task),
+            'CRAYPE_LINK_TYPE': 'dynamic',
+        }
+        self.pre_run = [
+            'mv %s %s' % (self.executable, self.target_executable),
+        ]
+        self.tool_flags = ['-C 0 -g FLOPS_DP -m %s ' % self.target_executable]
+        # -C 0 : sets processor id(s) to pin threads and measure
+        # -g   : sets performance group
+        # -m   : use likwid API
+        self.executable_opts = self.tool_flags
+        self.maintainers = ['JG']
+        self.tags = {'scs', 'external-external-resources'}
+        # self.rpt = '%s.rpt' % self.target_executable
+        self.sanity_patterns = sn.all([
+            sn.assert_found('loop complete.', self.stdout),
+            sn.assert_eq(sn.extractsingle(
+                r'^likwid-perfctr -- Version (?P<toolsversion>\d.\d.\d)',
+                self.stdout, 'toolsversion'), toolsversion),
+        ])
+        # References for Intel Broadwell CPU (E5-2695 v4):
+        references = {
+            'G3_AOS_SCALAR': {
+                'dom:mc': {
+                    'gflops': (0.596, -0.1, 0.3, 'Gflop/s'),
+                    'ai': (0.16, -0.05, 0.05, 'flop/byte')
+                }
+            },
+            'G3_SOA_SCALAR': {
+                'dom:mc': {
+                    'gflops': (0.612, -0.1, 0.3, 'Gflop/s'),
+                    'ai': (0.16, -0.05, 0.05, 'flop/byte')
+                }
+            },
+            'G3_AOS_VECTOR': {
+                'dom:mc': {
+                    'gflops': (1.152, -0.1, 0.3, 'Gflop/s'),
+                    'ai': (0.125, -0.05, 0.05, 'flop/byte')
+                }
+            },
+            'G3_SOA_VECTOR': {
+                'dom:mc': {
+                    'gflops': (1.125, -0.1, 0.3, 'Gflop/s'),
+                    'ai': (0.16, -0.05, 0.05, 'flop/byte')
+                }
+            },
+        }
+        self.reference = references[datalayout]
+        self.perf_patterns = {
+            'gflops': self.gflops,
+            'ai': self.arithmetic_intensity,
+        }
+
+    def setup(self, partition, environ, **job_opts):
+        super().setup(partition, environ, **job_opts)
+        run_cmd = ' '.join(self.job.launcher.command(self.job))
+        self.post_run = ['%s -v' % self.executable]
+        # self.perf_group = ['L2', 'L3']
+        self.perf_group = ['L2', 'L3', 'CACHES', 'DATA',
+                           'MEM', 'MEM_DP', 'MEM_SP']
+        for perf_group in self.perf_group:
+            self.post_run += ['%s %s -C 0 -g %s -m %s' %
+                              (run_cmd, self.executable, perf_group,
+                               self.target_executable)]
+        partitiontype = partition.fullname.split(':')[1]
+        if partitiontype == 'gpu':
+            self.job.options = ['--constraint="gpu&perf"']
+        elif partitiontype == 'mc':
+            self.job.options = ['--constraint="mc&perf"']
+
+    @property
+    @sn.sanity_function
+    def arithmetic_intensity(self):
+        flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)',
+                                 self.rpt, 'flops', int)
+        bytes = sn.extractsingle(r'^--->Total Bytes = (?P<bytes>\d+)',
+                                 self.rpt, 'bytes', int)
+        # debug: print('ai={}'.format(flops/bytes))
+        return flops/bytes
+
+    @property
+    @sn.sanity_function
+    def gflops(self):
+        flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)',
+                                 self.rpt, 'flops', int)
+        msec = sn.extractsingle(r'^elapsed time: (?P<msec>\d+)ms', self.stdout,
+                                'msec', float)
+        # debug: print('gflops={}'.format(flops/((msec/1000)*10**6)))
+        return (flops/((msec/1000))/10**9)

From cb8f0877f5f985842d2c083e5aeb4fa83b4246ec Mon Sep 17 00:00:00 2001
From: jgp <jgp@cscs.ch>
Date: Tue, 15 Oct 2019 12:43:25 +0200
Subject: [PATCH 5/6] clean

---
 .../berkeley-ert-serial.py                    | 170 ----------------
 .../likwid_roofline.py                        | 181 ------------------
 2 files changed, 351 deletions(-)
 delete mode 100644 cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py
 delete mode 100644 cscs-checks/tools/profiling_and_debugging/likwid_roofline.py

diff --git a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py
deleted file mode 100644
index 68a30c8bb4..0000000000
--- a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py
+++ /dev/null
@@ -1,170 +0,0 @@
-import os
-
-import reframe as rfm
-import reframe.utility.sanity as sn
-
-
-#{{{ base
-class ErtTestBase(rfm.RegressionTest):
-    """
-    The Empirical Roofline Tool, ERT, automatically generates roofline data.
-    https://bitbucket.org/berkeleylab/cs-roofline-toolkit/
-    """
-
-    def __init__(self):
-        self.descr = 'Empirical Roofline Toolkit'
-        self.sourcesdir = os.path.join(self.current_system.external-external-resourcesdir,
-                                       'roofline', 'ert')
-        self.build_system = 'SingleSource'
-        self.sourcepath = 'kernel1.c driver1.c'
-        self.executable = 'ert.exe'
-        self.build_system.ldflags = ['-O3', '-fopenmp']
-        self.sourcesdir = os.path.join(self.current_system.external-external-resourcesdir,
-                                       'roofline', 'ert')
-        self.rpt = '%s.rpt' % self.executable
-        self.maintainers = ['JG']
-        self.tags = {'scs', 'external-external-resources'}
-
-    def setup(self, partition, environ, **job_opts):
-        super().setup(partition, environ, **job_opts)
-        if self.num_tasks != 36:
-            self.job.launcher.options = ['--cpu-bind=verbose,none']
-#}}}
-
-#{{{ test
-@rfm.parameterized_test(
-    *[[num_ranks, flop]
-      for num_ranks in [1]
-      for flop in [256, 512, 1024]])
-      #for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]])
-class ErtBroadwellTest(ErtTestBase):
-    def __init__(self, num_ranks, flop):
-        super().__init__()
-        ompthread = 1
-        self.valid_systems = ['daint:mc', 'dom:mc']
-        self.valid_prog_environs = ['PrgEnv-gnu']
-        self.build_system.cppflags = [
-            '-DERT_FLOP=%s' % flop,
-            '-DERT_ALIGN=32',
-            '-DERT_MEMORY_MAX=1073741824',
-            '-DERT_MPI=True',
-            '-DERT_OPENMP=True',
-            '-DERT_TRIALS_MIN=1',
-            '-DERT_WORKING_SET_MIN=1',
-        ]
-        self.name = 'ert_serial_FLOPS.{:04d}_MPI.{:03d}_OpenMP.{:03d}'.format(
-            flop, num_ranks, ompthread)
-        self.exclusive = True
-        self.num_tasks = num_ranks
-        self.num_tasks_per_node = num_ranks
-        self.num_cpus_per_task = ompthread
-        self.num_tasks_per_core = 1
-        self.use_multithreading = False
-        self.variables = {
-            'CRAYPE_LINK_TYPE': 'dynamic',
-            'OMP_NUM_THREADS': str(self.num_cpus_per_task)
-        }
-
-        # take the "slowest" job, make it sleep after it has ended and hope the
-        # other jobs have ended too
-        # TODO: find a better way to wait for the other jobs to end
-        num_ranks_min = 1
-        flop_min = 1024
-        self.roofline_rpt = 'rpt'
-        if num_ranks == num_ranks_min and flop == flop_min:
-            self.post_run = [
-                'cat *_job.out | python2 preprocess.py > pre',
-                'python2 maximum.py < pre > max',
-                'python2 summary.py < max > sum',
-                # give enough time for all the dependent jobs to collect data:
-                'sleep 60',
-                'cat ../ert_serial_FLOPS*/sum | python2 roofline.py > rpt',
-            ]
-
-        else:
-            self.post_run = [
-                'cat *_job.out | python2 preprocess.py > pre',
-                'python2 maximum.py < pre > max',
-                'python2 summary.py < max > sum',
-            ]
-
-        # --- Sanity check:
-        regex_datatype = (r'^\s+(?P<type>\w+) \* __restrict__ buf = '
-                          r'\(\w+ \*\)malloc\(PSIZE\);')
-        datatype = sn.extractsingle(regex_datatype, 'driver1.c', 'type')
-        self.sanity_patterns = sn.all([
-            sn.assert_found('GFLOPs', 'sum'),
-            sn.assert_eq(datatype, 'double'),
-        ])
-
-        # --- Performance check:
-        if num_ranks == num_ranks_min and flop == flop_min:
-            # Reference roofline boundaries for Intel BroadwellCPU (E5-2695v4):
-            ref_GFLOPs = 945.0
-            ref_L1bw = 1788.0
-            ref_L2bw = 855.0
-            ref_L3bw = 547.0
-            ref_DRAMbw = 70.5
-
-            # Typical performance report looks like:
-            # --------------------------------------
-            # ert_FLOPS.1024_MPI.001_OpenMP.036/rpt
-            #    908.43 GFLOPs EMP
-            #    ******
-            # META_DATA
-            # OPENMP_THREADS 1
-            # FLOPS          8
-            # MPI_PROCS      36
-            # 
-            #   5647.33 L1 EMP
-            #   *******
-            #   3203.86 L2 EMP
-            #   *******
-            #   1773.58 L3 EMP
-            #   *******
-            #    139.56 L4 EMP
-            #    103.50 DRAM EMP
-            #    ******
-            # META_DATA
-            # FLOPS          2
-            # OPENMP_THREADS 1
-            # MPI_PROCS      36
-            regex_gflops = r'(?P<GFLOPs>\d+.\d+)\sGFLOPs EMP'
-            regex_L1bw = r'(?P<L1bw>\d+.\d+)\sL1 EMP'
-            regex_L2bw = r'(?P<L2bw>\d+.\d+)\sL2 EMP'
-            regex_L3bw = r'(?P<L3bw>\d+.\d+)\sL3 EMP'
-            regex_DRAMbw = r'(?P<DRAMbw>\d+.\d+) DRAM EMP'
-
-            gflops = sn.extractsingle(regex_gflops, self.roofline_rpt,
-                                      'GFLOPs', float)
-            L1bw = sn.extractsingle(regex_L1bw, self.roofline_rpt,
-                                      'L1bw', float)
-            L2bw = sn.extractsingle(regex_L2bw, self.roofline_rpt,
-                                      'L2bw', float)
-            L3bw = sn.extractsingle(regex_L3bw, self.roofline_rpt,
-                                      'L3bw', float)
-            DRAMbw = sn.extractsingle(regex_DRAMbw, self.roofline_rpt,
-                                      'DRAMbw', float)
-
-            # --performance-report:
-            self.perf_patterns = {
-                'gflops': gflops,
-                'L1bw': L1bw,
-                'L2bw': L2bw,
-                'L3bw': L3bw,
-                'DRAMbw': DRAMbw,
-            }
-
-            self.reference = {
-                '*': {
-                    'gflops': (ref_GFLOPs, -0.1, 0.5, 'GF/s'),
-                    'L1bw': (ref_L1bw, -0.1, 0.3, 'GB/s'),
-                    'L2bw': (ref_L2bw, -0.1, 0.3, 'GB/s'),
-                    'L3bw': (ref_L3bw, -0.1, 0.3, 'GB/s'),
-                    'DRAMbw': (ref_DRAMbw, -0.1, 0.3, 'GB/s'),
-                }
-            }
-
-        # else:
-
-#}}}
diff --git a/cscs-checks/tools/profiling_and_debugging/likwid_roofline.py b/cscs-checks/tools/profiling_and_debugging/likwid_roofline.py
deleted file mode 100644
index 9eda059c13..0000000000
--- a/cscs-checks/tools/profiling_and_debugging/likwid_roofline.py
+++ /dev/null
@@ -1,181 +0,0 @@
-import os
-
-import reframe as rfm
-import reframe.utility.sanity as sn
-
-
-@rfm.parameterized_test(*[[repeat, toolsversion, datalayout]
-                          for repeat in ['600000']
-                          for toolsversion in ['4.3.3']
-                          # for datalayout in ['G3_AOS_SCALAR']
-                          for datalayout in ['G3_AOS_SCALAR', 'G3_SOA_SCALAR',
-                                             'G3_AOS_VECTOR', 'G3_SOA_VECTOR']
-                          ])
-class IntelRooflineLikwidTest(rfm.RegressionTest):
-    '''This test checks the values reported by RRZE likwid roofline model:
-
-G3_AOS_SCALAR DP Mflops/sec = 3280.32 L2 bandwidth [MBytes/s] = 39441.3 0.0831697
-G3_AOS_VECTOR DP Mflops/sec = 6432.24 L2 bandwidth [MBytes/s] = 76914 0.083629
-G3_SOA_SCALAR DP Mflops/sec = 3288.39 L2 bandwidth [MBytes/s] = 9.98179 329.439
-G3_SOA_VECTOR DP Mflops/sec = 21126.6 L2 bandwidth [MBytes/s] = 9.6529 2188.63 2.3F/B
-                              10GF                              60000  0.18
-
-        > https://crd.lbl.gov/assets/Uploads/ECP18-Roofline-3-LIKWID.pdf
-        > likwid-perfctr -g CACHES -H
-
-        > Get group definition with (identical result):
-        > cat $EBROOTLIKWID/share/likwid/perfgroups/broadwell/FLOPS_DP.txt
-        > srun -Cmc,perf -n1 -t1 likwid-perfctr -g FLOPS_DP -H
-        DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2 +
-                              FP_ARITH_INST_RETIRED_SCALAR_DOUBLE +
-                              FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)
-                              /runtime
-
-        > srun -Cmc,perf -n1 -t1 likwid-perfctr -g MEM -H
-        Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0) +
-                                               SUM(MBOXxC1))*64.0/runtime
-        Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0) +
-                                               SUM(MBOXxC1))*64.0
-
-       > srun -Cmc,perf -t1 -n1 likwid-perfctr -g L2 -H
-       L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT + L2_TRANS_L1D_WB +
-                                          ICACHE_MISSES)*64.0/time
-       L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT + L2_TRANS_L1D_WB +
-                                          ICACHE_MISSES)*64.0
-
-       > srun -Cmc,perf -t1 -n1 likwid-perfctr -g L3 -H
-       L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL +
-                                          L2_LINES_OUT_DEMAND_DIRTY)*64/time
-       L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL +
-                                          L2_LINES_OUT_DEMAND_DIRTY)*64
-
-       > srun -Cmc,perf -t1 -n1 likwid-perfctr -g CACHES -H
-       Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD) +
-                                              SUM(CAS_COUNT_WR))*64.0/time
-       Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD) +
-                                              SUM(CAS_COUNT_WR))*64.0
-    '''
-    def __init__(self, repeat, toolsversion, datalayout):
-        super().__init__()
-        self.descr = 'Roofline Analysis test with Likwid:'
-        self.valid_systems = ['dom:mc']
-        # Reporting MFLOPS is not available on Intel Haswell cpus, see
-        # https://www.intel.fr/content/dam/www/public/us/en/documents/manuals/
-        # 64-ia-32-architectures-software-developer-vol-1-manual.pdf
-        self.valid_prog_environs = ['PrgEnv-intel']
-        self.modules = ['likwid']
-        # likwid/4.3.3-perf_event
-        self.sourcesdir = os.path.join(self.current_system.external-external-resourcesdir,
-                                       'roofline', 'intel_advisor')
-        self.build_system = 'SingleSource'
-        self.sourcepath = '_roofline.cpp'
-        self.executable = 'likwid-perfctr'
-        self.target_executable = './roof.exe'
-        self.build_system.cppflags = ['-D_LIKWID', '-DLIKWID_PERFMON',
-                                      '-I$EBROOTLIKWID/include']
-        self.prgenv_flags = {
-            'PrgEnv-intel': ['-g', '-O2', '-std=c++11', '-restrict'],
-            # '-qopt-streaming-stores', 'always',
-        }
-        self.build_system.ldflags = ['-L$EBROOTLIKWID/lib', '-llikwid']
-        self.prebuild_cmd = [
-            'patch -s < LIKWID/roofline_template.patch',
-            'sed -e "s-XXXX-%s-" -e "s-YYYY-%s-" %s &> %s' %
-            (repeat, datalayout, 'roofline_template.cpp', '_roofline.cpp')
-        ]
-        self.exclusive = True
-        self.num_tasks = 1
-        self.num_tasks_per_node = 1
-        self.num_cpus_per_task = 1
-        self.num_tasks_per_core = 1
-        self.use_multithreading = False
-        self.variables = {
-            'OMP_NUM_THREADS': str(self.num_cpus_per_task),
-            'CRAYPE_LINK_TYPE': 'dynamic',
-        }
-        self.pre_run = [
-            'mv %s %s' % (self.executable, self.target_executable),
-        ]
-        self.tool_flags = ['-C 0 -g FLOPS_DP -m %s ' % self.target_executable]
-        # -C 0 : sets processor id(s) to pin threads and measure
-        # -g   : sets performance group
-        # -m   : use likwid API
-        self.executable_opts = self.tool_flags
-        self.maintainers = ['JG']
-        self.tags = {'scs', 'external-external-resources'}
-        # self.rpt = '%s.rpt' % self.target_executable
-        self.sanity_patterns = sn.all([
-            sn.assert_found('loop complete.', self.stdout),
-            sn.assert_eq(sn.extractsingle(
-                r'^likwid-perfctr -- Version (?P<toolsversion>\d.\d.\d)',
-                self.stdout, 'toolsversion'), toolsversion),
-        ])
-        # References for Intel Broadwell CPU (E5-2695 v4):
-        references = {
-            'G3_AOS_SCALAR': {
-                'dom:mc': {
-                    'gflops': (0.596, -0.1, 0.3, 'Gflop/s'),
-                    'ai': (0.16, -0.05, 0.05, 'flop/byte')
-                }
-            },
-            'G3_SOA_SCALAR': {
-                'dom:mc': {
-                    'gflops': (0.612, -0.1, 0.3, 'Gflop/s'),
-                    'ai': (0.16, -0.05, 0.05, 'flop/byte')
-                }
-            },
-            'G3_AOS_VECTOR': {
-                'dom:mc': {
-                    'gflops': (1.152, -0.1, 0.3, 'Gflop/s'),
-                    'ai': (0.125, -0.05, 0.05, 'flop/byte')
-                }
-            },
-            'G3_SOA_VECTOR': {
-                'dom:mc': {
-                    'gflops': (1.125, -0.1, 0.3, 'Gflop/s'),
-                    'ai': (0.16, -0.05, 0.05, 'flop/byte')
-                }
-            },
-        }
-        self.reference = references[datalayout]
-        self.perf_patterns = {
-            'gflops': self.gflops,
-            'ai': self.arithmetic_intensity,
-        }
-
-    def setup(self, partition, environ, **job_opts):
-        super().setup(partition, environ, **job_opts)
-        run_cmd = ' '.join(self.job.launcher.command(self.job))
-        self.post_run = ['%s -v' % self.executable]
-        # self.perf_group = ['L2', 'L3']
-        self.perf_group = ['L2', 'L3', 'CACHES', 'DATA',
-                           'MEM', 'MEM_DP', 'MEM_SP']
-        for perf_group in self.perf_group:
-            self.post_run += ['%s %s -C 0 -g %s -m %s' %
-                              (run_cmd, self.executable, perf_group,
-                               self.target_executable)]
-        partitiontype = partition.fullname.split(':')[1]
-        if partitiontype == 'gpu':
-            self.job.options = ['--constraint="gpu&perf"']
-        elif partitiontype == 'mc':
-            self.job.options = ['--constraint="mc&perf"']
-
-    @property
-    @sn.sanity_function
-    def arithmetic_intensity(self):
-        flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)',
-                                 self.rpt, 'flops', int)
-        bytes = sn.extractsingle(r'^--->Total Bytes = (?P<bytes>\d+)',
-                                 self.rpt, 'bytes', int)
-        # debug: print('ai={}'.format(flops/bytes))
-        return flops/bytes
-
-    @property
-    @sn.sanity_function
-    def gflops(self):
-        flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)',
-                                 self.rpt, 'flops', int)
-        msec = sn.extractsingle(r'^elapsed time: (?P<msec>\d+)ms', self.stdout,
-                                'msec', float)
-        # debug: print('gflops={}'.format(flops/((msec/1000)*10**6)))
-        return (flops/((msec/1000))/10**9)

From 3978d5f851aeeb6054425c3fde17e765cf7abb00 Mon Sep 17 00:00:00 2001
From: Vasileios Karakasis <karakasis@cscs.ch>
Date: Wed, 16 Oct 2019 16:30:49 +0200
Subject: [PATCH 6/6] Remove completely obsolete Scalapack test

---
 .../libraries/math/scalapack_compile_run.py   | 30 -------------------
 1 file changed, 30 deletions(-)

diff --git a/cscs-checks/libraries/math/scalapack_compile_run.py b/cscs-checks/libraries/math/scalapack_compile_run.py
index be040662d9..624b1a9770 100644
--- a/cscs-checks/libraries/math/scalapack_compile_run.py
+++ b/cscs-checks/libraries/math/scalapack_compile_run.py
@@ -66,33 +66,3 @@ def scalapack_sanity(number1, number2, expected_value):
             scalapack_sanity(4, 3, 0.2483911184660867),
             scalapack_sanity(4, 4, 0.1701907253504270)
         ])
-
-
-# # FIXME: This test is obsolete; it is kept only for reference.
-# # NOTE:  The test case is very small, but larger cases did not succeed!
-# @rfm.required_version('>=2.14')
-# @rfm.parameterized_test(['dynamic'])
-# class ScaLAPACKPerf(ScaLAPACKTest):
-#     def __init__(self, linkage):
-#         super().__init__(linkage)
-#
-#         self.tags |= {'monch_acceptance'}
-#         self.sourcepath = 'scalapack_performance_compile_run.f'
-#         self.valid_systems = ['monch:compute']
-#         self.valid_prog_environs = ['PrgEnv-gnu']
-#         self.num_tasks = 64
-#         self.num_tasks_per_node = 16
-#
-#         self.sanity_patterns = sn.assert_found(r'Run', self.stdout)
-#         self.perf_patterns = {
-#             'perf': sn.max(
-#                 sn.extractall(r'GFLOPS/s:\s+(?P<gflops>\S+)',
-#                               self.stdout, 'gflops', float)
-#             )
-#         }
-#
-#         self.reference = {
-#             'monch:compute': {
-#                 'perf': (24., -0.1, None)
-#             }
-#         }