From 71473c63dfca677f1087f291997a960e73c14fdb Mon Sep 17 00:00:00 2001
From: jgp <jgp@cscs.ch>
Date: Tue, 11 Feb 2020 22:11:43 +0100
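Subject: [PATCH 1/3] Reorganize profiling_and_debugging checks: add README, move roofline checks to roofline/, remove intel_inspector and scorep_mpi_omp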

---
 .../tools/profiling_and_debugging/README.md   |  19 ++
 .../intel_inspector.py                        |  95 ----------
 .../{ => roofline}/berkeley-ert-nvprof.py     |   0
 .../roofline/berkeley-ert-serial.py           | 170 ++++++++++++++++++
 .../{ => roofline}/berkeley-ert.py            |   0
 .../{ => roofline}/gpp_nvprof_roofline.py     |   0
 .../{ => roofline}/intel_advisor_roofline.py  |   2 +-
 .../{ => roofline}/intel_sde_roofline.py      |   0
 .../{ => roofline}/intel_vtune_roofline.py    |   0
 .../profiling_and_debugging/scorep_mpi_omp.py |  83 ---------
 10 files changed, 190 insertions(+), 179 deletions(-)
 create mode 100644 cscs-checks/tools/profiling_and_debugging/README.md
 delete mode 100644 cscs-checks/tools/profiling_and_debugging/intel_inspector.py
 rename cscs-checks/tools/profiling_and_debugging/{ => roofline}/berkeley-ert-nvprof.py (100%)
 create mode 100644 cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-serial.py
 rename cscs-checks/tools/profiling_and_debugging/{ => roofline}/berkeley-ert.py (100%)
 rename cscs-checks/tools/profiling_and_debugging/{ => roofline}/gpp_nvprof_roofline.py (100%)
 rename cscs-checks/tools/profiling_and_debugging/{ => roofline}/intel_advisor_roofline.py (99%)
 rename cscs-checks/tools/profiling_and_debugging/{ => roofline}/intel_sde_roofline.py (100%)
 rename cscs-checks/tools/profiling_and_debugging/{ => roofline}/intel_vtune_roofline.py (100%)
 delete mode 100644 cscs-checks/tools/profiling_and_debugging/scorep_mpi_omp.py

diff --git a/cscs-checks/tools/profiling_and_debugging/README.md b/cscs-checks/tools/profiling_and_debugging/README.md
new file mode 100644
index 0000000000..1a9e7e42c6
--- /dev/null
+++ b/cscs-checks/tools/profiling_and_debugging/README.md
@@ -0,0 +1,19 @@
+This directory contains scripts to test HPC debugging and performance tools on
+Piz Daint using ReFrame. More checks are available here:
+https://github.com/eth-cscs/hpctools
+
+* nvidia/nsys_cuda.py
+
+* notool/internal_timers_mpi.py
+
+* intel/intel_inspector.py
+* intel/intel_vtune.py
+* intel/intel_advisor.py
+
+* scorep/scorep_sampling_profiling.py
+* scorep/scorep_sampling_tracing.py
+
+* scalasca/scalasca_sampling_tracing.py
+* scalasca/scalasca_sampling_profiling.py
+
+* extrae/extrae.py
diff --git a/cscs-checks/tools/profiling_and_debugging/intel_inspector.py b/cscs-checks/tools/profiling_and_debugging/intel_inspector.py
deleted file mode 100644
index e2e7a9b7b4..0000000000
--- a/cscs-checks/tools/profiling_and_debugging/intel_inspector.py
+++ /dev/null
@@ -1,95 +0,0 @@
-import os
-
-import reframe as rfm
-import reframe.utility.sanity as sn
-
-
-@rfm.parameterized_test(['C++'], ['F90'])
-class IntelInspectorTest(rfm.RegressionTest):
-    '''This test checks Intel Inspector:
-    https://software.intel.com/en-us/inspector
-    '''
-
-    def __init__(self, lang):
-        super().__init__()
-        self.name = 'Intel_Inspector_%s' % lang.replace('+', 'p')
-        self.descr = self.name
-        self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc']
-        self.valid_prog_environs = ['PrgEnv-intel']
-        self.modules = ['inspector']
-        self.sourcesdir = os.path.join('src', lang)
-        self.build_system = 'Make'
-        if lang == 'F90':
-            self.build_system.max_concurrency = 1
-
-        self.executable = 'inspxe-cl'
-        self.target_executable = './jacobi'
-        self.prgenv_flags = {
-            'PrgEnv-gnu': ['-g', '-O2', '-fopenmp'],
-            'PrgEnv-cray': ['-g', '-O2', '-homp'],
-            'PrgEnv-intel': ['-g', '-O2', '-qopenmp'],
-            'PrgEnv-pgi': ['-g', '-O2', '-mp']
-        }
-        self.executable_opts = ['-collect mi1 %s' % self.target_executable]
-        self.exclusive = True
-        self.num_tasks = 3
-        self.num_tasks_per_node = 3
-        self.num_cpus_per_task = 4
-        self.num_tasks_per_core = 1
-        self.use_multithreading = False
-        num_iterations = 10
-        self.variables = {
-            'OMP_NUM_THREADS': str(self.num_cpus_per_task),
-            'ITERATIONS': str(num_iterations),
-            'OMP_PROC_BIND': 'true',
-            'CRAYPE_LINK_TYPE': 'dynamic',
-        }
-        self.version_rpt = 'version.rpt'
-        self.problems_rpt = 'problems.rpt'
-        self.summary_rpt = 'summary.rpt'
-        self.observations_rpt = 'observations.rpt'
-        self.pre_run = [
-            'mv %s %s' % (self.executable, self.target_executable),
-            '%s --version &> %s' % (self.executable, self.version_rpt),
-        ]
-        self.post_run = [
-            '%s -V &> %s' % (self.executable, self.version_rpt),
-            '%s -report=summary &> %s' % (self.executable, self.summary_rpt),
-            '%s -report=problems &> %s' % (self.executable, self.problems_rpt),
-            '%s -report=observations &> %s' %
-            (self.executable, self.observations_rpt),
-        ]
-        self.maintainers = ['JG', 'MKr']
-        self.tags = {'production'}
-
-    def setup(self, partition, environ, **job_opts):
-        super().setup(partition, environ, **job_opts)
-        environ_name = self.current_environ.name
-        prgenv_flags = self.prgenv_flags[environ_name]
-        self.build_system.cflags = prgenv_flags
-        self.build_system.cxxflags = prgenv_flags
-        self.build_system.fflags = prgenv_flags
-        regexversion = (r'^Intel\(R\)\sInspector\s\d+\sUpdate\s\d+\s\(build'
-                        r'\s(?P<toolsversion>\d+)')
-        system_default_toolversion = {
-            'daint': '597413',  # 2019 Update 4
-            'dom': '597413',    # 2019 Update 4
-        }
-        toolsversion = system_default_toolversion[self.current_system.name]
-        self.sanity_patterns = sn.all([
-            # check the job:
-            sn.assert_found('SUCCESS', self.stdout),
-            # check the tool's version:
-            sn.assert_eq(sn.extractsingle(regexversion, self.version_rpt,
-                                          'toolsversion'), toolsversion),
-            # check the reports:
-            sn.assert_found(r'1 Memory leak problem\(s\) detected',
-                            self.summary_rpt),
-            sn.assert_found(r'1 Memory not deallocated problem\(s\) detected',
-                            self.summary_rpt),
-            sn.assert_found(r'_main.\w+\(\d+\): Warning X\d+: P\d: '
-                            r'Memory not deallocated:',
-                            self.observations_rpt),
-            sn.assert_found(r'_main.\w+\(\d+\): Warning X\d+:',
-                            self.problems_rpt),
-        ])
diff --git a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py b/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-nvprof.py
similarity index 100%
rename from cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py
rename to cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-nvprof.py
diff --git a/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-serial.py b/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-serial.py
new file mode 100644
index 0000000000..ba1b0ad455
--- /dev/null
+++ b/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-serial.py
@@ -0,0 +1,170 @@
+import os
+
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+
+#{{{ base
+class ErtTestBase(rfm.RegressionTest):
+    """
+    The Empirical Roofline Tool, ERT, automatically generates roofline data.
+    https://bitbucket.org/berkeleylab/cs-roofline-toolkit/
+    """
+
+    def __init__(self):
+        self.descr = 'Empirical Roofline Toolkit'
+        self.sourcesdir = os.path.join(self.current_system.resourcesdir,
+                                       'roofline', 'ert')
+        self.build_system = 'SingleSource'
+        self.sourcepath = 'kernel1.c driver1.c'
+        self.executable = 'ert.exe'
+        self.build_system.ldflags = ['-O3', '-fopenmp']
+        self.sourcesdir = os.path.join(self.current_system.resourcesdir,
+                                       'roofline', 'ert')
+        self.rpt = '%s.rpt' % self.executable
+        self.maintainers = ['JG']
+        self.tags = {'scs', 'external-resources'}
+
+    def setup(self, partition, environ, **job_opts):
+        super().setup(partition, environ, **job_opts)
+        if self.num_tasks != 36:
+            self.job.launcher.options = ['--cpu-bind=verbose,none']
+#}}}
+
+#{{{ test
+@rfm.parameterized_test(
+    *[[num_ranks, flop]
+      for num_ranks in [1]
+      for flop in [256, 512, 1024]])
+      #for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]])
+class ErtBroadwellTest(ErtTestBase):
+    def __init__(self, num_ranks, flop):
+        super().__init__()
+        ompthread = 1
+        self.valid_systems = ['daint:mc', 'dom:mc']
+        self.valid_prog_environs = ['PrgEnv-gnu']
+        self.build_system.cppflags = [
+            '-DERT_FLOP=%s' % flop,
+            '-DERT_ALIGN=32',
+            '-DERT_MEMORY_MAX=1073741824',
+            '-DERT_MPI=True',
+            '-DERT_OPENMP=True',
+            '-DERT_TRIALS_MIN=1',
+            '-DERT_WORKING_SET_MIN=1',
+        ]
+        self.name = 'ert_serial_FLOPS.{:04d}_MPI.{:03d}_OpenMP.{:03d}'.format(
+            flop, num_ranks, ompthread)
+        self.exclusive = True
+        self.num_tasks = num_ranks
+        self.num_tasks_per_node = num_ranks
+        self.num_cpus_per_task = ompthread
+        self.num_tasks_per_core = 1
+        self.use_multithreading = False
+        self.variables = {
+            'CRAYPE_LINK_TYPE': 'dynamic',
+            'OMP_NUM_THREADS': str(self.num_cpus_per_task)
+        }
+
+        # take the "slowest" job, make it sleep after it has ended and hope the
+        # other jobs have ended too
+        # TODO: find a better way to wait for the other jobs to end
+        num_ranks_min = 1
+        flop_min = 1024
+        self.roofline_rpt = 'rpt'
+        if num_ranks == num_ranks_min and flop == flop_min:
+            self.post_run = [
+                'cat *_job.out | python2 preprocess.py > pre',
+                'python2 maximum.py < pre > max',
+                'python2 summary.py < max > sum',
+                # give enough time for all the dependent jobs to collect data:
+                'sleep 60',
+                'cat ../ert_serial_FLOPS*/sum | python2 roofline.py > rpt',
+            ]
+
+        else:
+            self.post_run = [
+                'cat *_job.out | python2 preprocess.py > pre',
+                'python2 maximum.py < pre > max',
+                'python2 summary.py < max > sum',
+            ]
+
+        # --- Sanity check:
+        regex_datatype = (r'^\s+(?P<type>\w+) \* __restrict__ buf = '
+                          r'\(\w+ \*\)malloc\(PSIZE\);')
+        datatype = sn.extractsingle(regex_datatype, 'driver1.c', 'type')
+        self.sanity_patterns = sn.all([
+            sn.assert_found('GFLOPs', 'sum'),
+            sn.assert_eq(datatype, 'double'),
+        ])
+
+        # --- Performance check:
+        if num_ranks == num_ranks_min and flop == flop_min:
+            # Reference roofline boundaries for Intel BroadwellCPU (E5-2695v4):
+            ref_GFLOPs = 945.0
+            ref_L1bw = 1788.0
+            ref_L2bw = 855.0
+            ref_L3bw = 547.0
+            ref_DRAMbw = 70.5
+
+            # Typical performance report looks like:
+            # --------------------------------------
+            # ert_FLOPS.1024_MPI.001_OpenMP.036/rpt
+            #    908.43 GFLOPs EMP
+            #    ******
+            # META_DATA
+            # OPENMP_THREADS 1
+            # FLOPS          8
+            # MPI_PROCS      36
+            # 
+            #   5647.33 L1 EMP
+            #   *******
+            #   3203.86 L2 EMP
+            #   *******
+            #   1773.58 L3 EMP
+            #   *******
+            #    139.56 L4 EMP
+            #    103.50 DRAM EMP
+            #    ******
+            # META_DATA
+            # FLOPS          2
+            # OPENMP_THREADS 1
+            # MPI_PROCS      36
+            regex_gflops = r'(?P<GFLOPs>\d+.\d+)\sGFLOPs EMP'
+            regex_L1bw = r'(?P<L1bw>\d+.\d+)\sL1 EMP'
+            regex_L2bw = r'(?P<L2bw>\d+.\d+)\sL2 EMP'
+            regex_L3bw = r'(?P<L3bw>\d+.\d+)\sL3 EMP'
+            regex_DRAMbw = r'(?P<DRAMbw>\d+.\d+) DRAM EMP'
+
+            gflops = sn.extractsingle(regex_gflops, self.roofline_rpt,
+                                      'GFLOPs', float)
+            L1bw = sn.extractsingle(regex_L1bw, self.roofline_rpt,
+                                      'L1bw', float)
+            L2bw = sn.extractsingle(regex_L2bw, self.roofline_rpt,
+                                      'L2bw', float)
+            L3bw = sn.extractsingle(regex_L3bw, self.roofline_rpt,
+                                      'L3bw', float)
+            DRAMbw = sn.extractsingle(regex_DRAMbw, self.roofline_rpt,
+                                      'DRAMbw', float)
+
+            # --performance-report:
+            self.perf_patterns = {
+                'gflops': gflops,
+                'L1bw': L1bw,
+                'L2bw': L2bw,
+                'L3bw': L3bw,
+                'DRAMbw': DRAMbw,
+            }
+
+            self.reference = {
+                '*': {
+                    'gflops': (ref_GFLOPs, -0.1, 0.5, 'GF/s'),
+                    'L1bw': (ref_L1bw, -0.1, 0.3, 'GB/s'),
+                    'L2bw': (ref_L2bw, -0.1, 0.3, 'GB/s'),
+                    'L3bw': (ref_L3bw, -0.1, 0.3, 'GB/s'),
+                    'DRAMbw': (ref_DRAMbw, -0.1, 0.3, 'GB/s'),
+                }
+            }
+
+        # else:
+
+#}}}
diff --git a/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py b/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert.py
similarity index 100%
rename from cscs-checks/tools/profiling_and_debugging/berkeley-ert.py
rename to cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert.py
diff --git a/cscs-checks/tools/profiling_and_debugging/gpp_nvprof_roofline.py b/cscs-checks/tools/profiling_and_debugging/roofline/gpp_nvprof_roofline.py
similarity index 100%
rename from cscs-checks/tools/profiling_and_debugging/gpp_nvprof_roofline.py
rename to cscs-checks/tools/profiling_and_debugging/roofline/gpp_nvprof_roofline.py
diff --git a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py b/cscs-checks/tools/profiling_and_debugging/roofline/intel_advisor_roofline.py
similarity index 99%
rename from cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py
rename to cscs-checks/tools/profiling_and_debugging/roofline/intel_advisor_roofline.py
index 797d61121a..5eb3e82580 100644
--- a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py
+++ b/cscs-checks/tools/profiling_and_debugging/roofline/intel_advisor_roofline.py
@@ -239,7 +239,7 @@ def __init__(self, repeat, toolversion, datalayout):
         }
 
         self.maintainers = ['JG', 'MKr']
-        self.tags = {'production', 'external-resources'}
+        self.tags = {'external-resources'}
 
     def setup(self, partition, environ, **job_opts):
         super().setup(partition, environ, **job_opts)
diff --git a/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py b/cscs-checks/tools/profiling_and_debugging/roofline/intel_sde_roofline.py
similarity index 100%
rename from cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py
rename to cscs-checks/tools/profiling_and_debugging/roofline/intel_sde_roofline.py
diff --git a/cscs-checks/tools/profiling_and_debugging/intel_vtune_roofline.py b/cscs-checks/tools/profiling_and_debugging/roofline/intel_vtune_roofline.py
similarity index 100%
rename from cscs-checks/tools/profiling_and_debugging/intel_vtune_roofline.py
rename to cscs-checks/tools/profiling_and_debugging/roofline/intel_vtune_roofline.py
diff --git a/cscs-checks/tools/profiling_and_debugging/scorep_mpi_omp.py b/cscs-checks/tools/profiling_and_debugging/scorep_mpi_omp.py
deleted file mode 100644
index 677cdfe160..0000000000
--- a/cscs-checks/tools/profiling_and_debugging/scorep_mpi_omp.py
+++ /dev/null
@@ -1,83 +0,0 @@
-import os
-
-import reframe as rfm
-import reframe.utility.sanity as sn
-
-
-@rfm.required_version('>=2.14')
-@rfm.parameterized_test(['C++'], ['F90'])
-class ScorepHybrid(rfm.RegressionTest):
-    def __init__(self, lang):
-        super().__init__()
-        self.name = 'scorep_mpi_omp_%s' % lang.replace('+', 'p')
-        self.descr = 'SCORE-P %s check' % lang
-        self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc']
-
-        # Score-P fails with latest clang based cce and pgi compilers:
-        # src/measurement/thread/fork_join/scorep_thread_fork_join_omp.c:402:
-        # Fatal: Bug 'TPD == 0': Invalid OpenMP thread specific data object.
-        # -> removing cce from supported compiler for now.
-        self.valid_prog_environs = ['PrgEnv-gnu', 'PrgEnv-intel']
-        self.prgenv_flags = {
-            'PrgEnv-gnu': ['-g', '-fopenmp'],
-            'PrgEnv-intel': ['-g', '-openmp'],
-        }
-        self.sourcesdir = os.path.join('src', lang)
-        self.executable = 'jacobi'
-        self.build_system = 'Make'
-        self.build_system.makefile = 'Makefile_scorep_mpi_omp'
-        # NOTE: Restrict concurrency to allow creation of Fortran modules
-        if lang == 'F90':
-            self.build_system.max_concurrency = 1
-
-        self.num_tasks = 3
-        self.num_tasks_per_node = 3
-        self.num_cpus_per_task = 4
-        self.num_iterations = 200
-        self.variables = {
-            'OMP_NUM_THREADS': str(self.num_cpus_per_task),
-            'ITERATIONS': str(self.num_iterations),
-            'SCOREP_ENABLE_PROFILING': 'false',
-            'SCOREP_ENABLE_TRACING': 'true',
-            'OMP_PROC_BIND': 'true',
-            'SCOREP_TIMER': 'clock_gettime'
-        }
-        cpu_count = self.num_cpus_per_task * self.num_tasks_per_node
-        self.otf2_file = 'otf2.txt'
-        self.sanity_patterns = sn.all([
-            sn.assert_found('SUCCESS', self.stdout),
-            sn.assert_eq(sn.count(sn.extractall(
-                r'(?P<line>LEAVE.*omp\s+\S+\s+\@_jacobi)', self.otf2_file,
-                'line')), 4 * self.num_iterations * cpu_count),
-            sn.assert_not_found('warning|WARNING', self.stderr)
-        ])
-        self.maintainers = ['MKr', 'JG']
-        self.tags = {'production'}
-        # additional program call in order to generate the tracing output for
-        # the sanity check
-        self.post_run = [
-            'otf2-print scorep-*/traces.otf2 > %s' % self.otf2_file
-        ]
-
-    def setup(self, partition, environ, **job_opts):
-        scorep_ver = '6.0'
-        tc_ver = '19.10'
-        cu_ver = '10.1'
-        self.scorep_modules = {
-            'PrgEnv-gnu': ['Score-P/%s-CrayGNU-%s' % (scorep_ver, tc_ver)],
-            'PrgEnv-intel': ['Score-P/%s-CrayIntel-%s' % (scorep_ver, tc_ver)],
-            'PrgEnv-pgi': ['Score-P/%s-CrayPGI-%s' % (scorep_ver, tc_ver)],
-        }
-        if partition.fullname in ['daint:gpu', 'dom:gpu']:
-            self.scorep_modules['PrgEnv-gnu'] = [
-                'Score-P/%s-CrayGNU-%s-cuda-%s' % (scorep_ver, tc_ver, cu_ver)
-            ]
-
-        self.modules = self.scorep_modules[environ.name]
-        super().setup(partition, environ, **job_opts)
-        prgenv_flags = self.prgenv_flags[self.current_environ.name]
-        self.build_system.cflags = prgenv_flags
-        self.build_system.cxxflags = prgenv_flags
-        self.build_system.fflags = prgenv_flags
-        self.build_system.ldflags = ['-lm']
-        self.build_system.options = ["PREP='scorep --mpp=mpi --thread=omp'"]

From 837b8fc60f3f2dabc1c4df85b8df9532bfea664e Mon Sep 17 00:00:00 2001
From: jgp <jgp@cscs.ch>
Date: Tue, 11 Feb 2020 22:21:07 +0100
Subject: [PATCH 2/3] Fix PEP 8 style issues in roofline/berkeley-ert-serial.py

---
 .../roofline/berkeley-ert-serial.py           | 28 +++++++++----------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-serial.py b/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-serial.py
index ba1b0ad455..5ad27930fc 100644
--- a/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-serial.py
+++ b/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-serial.py
@@ -1,10 +1,9 @@
 import os
-
 import reframe as rfm
 import reframe.utility.sanity as sn
 
 
-#{{{ base
+# {{{ base
 class ErtTestBase(rfm.RegressionTest):
     """
     The Empirical Roofline Tool, ERT, automatically generates roofline data.
@@ -29,14 +28,13 @@ def setup(self, partition, environ, **job_opts):
         super().setup(partition, environ, **job_opts)
         if self.num_tasks != 36:
             self.job.launcher.options = ['--cpu-bind=verbose,none']
-#}}}
-
-#{{{ test
-@rfm.parameterized_test(
-    *[[num_ranks, flop]
-      for num_ranks in [1]
-      for flop in [256, 512, 1024]])
-      #for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]])
+# }}}
+
+
+# {{{ test
+@rfm.parameterized_test(*[[num_ranks, flop]
+                          for num_ranks in [1]
+                          for flop in [256, 512, 1024]])
 class ErtBroadwellTest(ErtTestBase):
     def __init__(self, num_ranks, flop):
         super().__init__()
@@ -115,7 +113,7 @@ def __init__(self, num_ranks, flop):
             # OPENMP_THREADS 1
             # FLOPS          8
             # MPI_PROCS      36
-            # 
+            #
             #   5647.33 L1 EMP
             #   *******
             #   3203.86 L2 EMP
@@ -138,11 +136,11 @@ def __init__(self, num_ranks, flop):
             gflops = sn.extractsingle(regex_gflops, self.roofline_rpt,
                                       'GFLOPs', float)
             L1bw = sn.extractsingle(regex_L1bw, self.roofline_rpt,
-                                      'L1bw', float)
+                                    'L1bw', float)
             L2bw = sn.extractsingle(regex_L2bw, self.roofline_rpt,
-                                      'L2bw', float)
+                                    'L2bw', float)
             L3bw = sn.extractsingle(regex_L3bw, self.roofline_rpt,
-                                      'L3bw', float)
+                                    'L3bw', float)
             DRAMbw = sn.extractsingle(regex_DRAMbw, self.roofline_rpt,
                                       'DRAMbw', float)
 
@@ -167,4 +165,4 @@ def __init__(self, num_ranks, flop):
 
         # else:
 
-#}}}
+# }}}

From 4723c528804e71fcc47053972e79ffdaab1a3995 Mon Sep 17 00:00:00 2001
From: jgp <jgp@cscs.ch>
Date: Tue, 3 Mar 2020 12:46:57 +0100
Subject: [PATCH 3/3] Remove the roofline checks from profiling_and_debugging

---
 .../roofline/berkeley-ert-nvprof.py           | 148 ---------
 .../roofline/berkeley-ert-serial.py           | 168 ----------
 .../roofline/berkeley-ert.py                  | 164 ----------
 .../roofline/gpp_nvprof_roofline.py           | 135 --------
 .../roofline/intel_advisor_roofline.py        | 276 ----------------
 .../roofline/intel_sde_roofline.py            | 129 --------
 .../roofline/intel_vtune_roofline.py          | 298 ------------------
 7 files changed, 1318 deletions(-)
 delete mode 100644 cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-nvprof.py
 delete mode 100644 cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-serial.py
 delete mode 100644 cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert.py
 delete mode 100644 cscs-checks/tools/profiling_and_debugging/roofline/gpp_nvprof_roofline.py
 delete mode 100644 cscs-checks/tools/profiling_and_debugging/roofline/intel_advisor_roofline.py
 delete mode 100644 cscs-checks/tools/profiling_and_debugging/roofline/intel_sde_roofline.py
 delete mode 100644 cscs-checks/tools/profiling_and_debugging/roofline/intel_vtune_roofline.py

diff --git a/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-nvprof.py b/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-nvprof.py
deleted file mode 100644
index 7bfc7c64fa..0000000000
--- a/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-nvprof.py
+++ /dev/null
@@ -1,148 +0,0 @@
-import os
-
-import reframe as rfm
-import reframe.utility.sanity as sn
-
-
-@rfm.parameterized_test(
-    *[[gpudims, flop, repeat]
-      # gpudims sets (gpu_blocks, gpu_threads):
-      for gpudims in [(112, 1024), (224, 512), (448, 256), (896, 128),
-                      (1792, 64), (3584, 32)]
-      for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
-      # self.repeat replaces '-DERT_NUM_EXPERIMENTS=2':
-      for repeat in [1, 2]])
-class ErtP100Test(rfm.RegressionTest):
-    '''
-    The Empirical Roofline Tool, ERT, empirically generates roofline data:
-    https://bitbucket.org/berkeleylab/cs-roofline-toolkit/
-
-    This test checks the ERT tool with NVIDIA Tesla P100-PCIE-16GB:
-    Device 0: "Tesla P100-PCIE-16GB"
-      CUDA Driver Version / Runtime Version     10.1 / 10.0
-      CUDA Capability Major/Minor version number:    6.0
-      (56) Multiprocessors, (64) CUDA Cores/MP:      3584 CUDA Cores
-      GPU Max Clock rate:                            1329 MHz (1.33 GHz)
-      Theoretical peak performance per GPU:          4761 Gflop/s
-      Maximum number of threads per multiprocessor:  2048
-      Peak number of threads:                        114688 threads <---------
-      Maximum number of threads per block:           1024           <---------
-    NVRM version: NVIDIA UNIX x86_64 Kernel Module  418.39
-
-    # The following python code can help for a parameter space study:
-    # (use --exec-policy=async)
-    max_threads_per_block = 1024
-    max_threads = 114688
-    gpu_threads = max_threads_per_block * 2
-    while gpu_threads > 32:
-        gpu_threads = gpu_threads // 2
-        gpu_blocks = max_threads // gpu_threads
-        nth = gpu_threads * gpu_blocks
-        print('{} {} {} {}'.format(gpu_blocks, gpu_threads, nth, max_threads))
-    '''
-    def __init__(self, gpudims, flop, repeat):
-        super().__init__()
-        max_gpu_blocks = 3584
-        max_flops = 1024
-        max_repeat = 2
-        self.descr = 'Empirical Roofline Toolkit'
-        self.valid_systems = ['dom:gpu']
-        self.valid_prog_environs = ['PrgEnv-gnu']
-        self.modules = ['craype-accel-nvidia60']
-        self.sourcesdir = os.path.join(self.current_system.resourcesdir,
-                                       'roofline', 'ert')
-        # A single input file is required for nvcc to work:
-        self.build_system = 'SingleSource'
-        self.prebuild_cmd = [
-            'cat kernel1.c driver1.c | sed "s-^#if ERT-#ifdef ERT-g" > '
-            '_gpu.cu']
-        self.sourcepath = '_gpu.cu'
-        self.executable = 'ert.exe'
-        self.build_system.cppflags = [
-            # ERT_FLOPS = -DERT_FLOP !
-            '-DERT_FLOP=%s' % flop,
-            '-DERT_ALIGN=32',
-            # 1G = 1024^3 = 1073741824:
-            '-DERT_MEMORY_MAX=1073741824',
-            # ERT_GPU True:
-            '-DERT_GPU',
-            '-DERT_TRIALS_MIN=1',
-            '-DERT_WORKING_SET_MIN=128',
-            # '-x cu' explicitly sets the language (cuda) for the src files.
-        ]
-        self.build_system.ldflags = ['-O3']
-        self.maintainers = ['JG']
-        self.tags = {'scs', 'external-resources'}
-        gpu_blocks, gpu_threads = gpudims
-        self.name = 'ertgpu_Run.{}_FLOPS.{}_GPUBlocks.{}_GPUThreads.{}'.format(
-            repeat, flop, gpu_blocks, gpu_threads)
-        self.exclusive = True
-        self.num_tasks = 1
-        self.num_tasks_per_node = 1
-        self.num_cpus_per_task = 1
-        self.num_tasks_per_core = 1
-        self.use_multithreading = False
-        self.variables = {
-            'CRAYPE_LINK_TYPE': 'dynamic',
-            'OMP_NUM_THREADS': str(self.num_cpus_per_task)
-        }
-        self.executable_opts = [str(gpu_blocks), str(gpu_threads)]
-        self.rpt = '%s.rpt' % self.executable
-        # Reference roofline boundaries for NVIDIA Tesla P100-PCIE-16GB:
-        GFLOPs = 4355.0
-        # Keeping for future reference:
-        # L1bw = 1724.0
-        # L2bw = 855.0
-        # L3bw = 547.0
-        DRAMbw = 521.0
-        self.roofline_rpt = 'rpt'
-        # use the latest job to generate the roofline rpt:
-        if (gpu_blocks == max_gpu_blocks and flop == max_flops and
-           repeat == max_repeat):
-            self.post_run = [
-                'cat *_job.out | python2 preprocess.py > pre',
-                'python2 maximum.py < pre > max',
-                'python2 summary.py < max > sum',
-                # give enough time for all the dependent jobs to collect data:
-                'sleep 60',
-                'cat ../ertgpu_Run*/sum | python2 roofline.py > rpt',
-            ]
-            self.sanity_patterns = sn.all([
-                # --- check data type:
-                sn.assert_eq(sn.extractsingle(
-                    r'^\s+(?P<prec>\w+) \*\s+buf = \(\w+ \*\)'
-                    r'_mm_malloc\(PSIZE, ERT_ALIGN\);', 'driver1.c', 'prec'),
-                    'double'),
-                # --- check ert's roofline results. Typical output is:
-                #   4355.20 GFLOPs EMP
-                # META_DATA
-                # GPU_BLOCKS     1792
-                # FLOPS          1024
-                # GPU_THREADS    64
-                #
-                #   1723.95 L1 EMP
-                #    521.29 DRAM EMP
-                #
-                # check GFLOPS:
-                sn.assert_reference(sn.extractsingle(
-                    r'(?P<GFLOPs>\d+.\d+)\sGFLOPs EMP', self.roofline_rpt,
-                    'GFLOPs', float), GFLOPs, -0.1, 0.5),
-                # check L1 bandwidth:
-                # https://cug.org/proceedings/protected/cug2019_proceedings/
-                # includes/files/pap103s2-file1.pdf:
-                #   "ERT fails to identify the L1 cache"
-                # sn.assert_reference(sn.extractsingle(
-                #     r'(?P<L1bw>\d+.\d+)\sL1 EMP', self.roofline_rpt,
-                #     'L1bw', float), L1bw, -0.1, 0.3),
-                # check DRAM bandwidth:
-                sn.assert_reference(sn.extractsingle(
-                    r'(?P<DRAMbw>\d+.\d+) DRAM EMP', self.roofline_rpt,
-                    'DRAMbw', float), DRAMbw, -0.1, 0.3),
-            ])
-        else:
-            self.post_run = [
-                'cat *_job.out | python2 preprocess.py > pre',
-                'python2 maximum.py < pre > max',
-                'python2 summary.py < max > sum',
-            ]
-            self.sanity_patterns = sn.assert_found('GFLOPs', 'sum')
diff --git a/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-serial.py b/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-serial.py
deleted file mode 100644
index 5ad27930fc..0000000000
--- a/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-serial.py
+++ /dev/null
@@ -1,168 +0,0 @@
-import os
-import reframe as rfm
-import reframe.utility.sanity as sn
-
-
-# {{{ base
-class ErtTestBase(rfm.RegressionTest):
-    """
-    The Empirical Roofline Tool, ERT, automatically generates roofline data.
-    https://bitbucket.org/berkeleylab/cs-roofline-toolkit/
-    """
-
-    def __init__(self):
-        self.descr = 'Empirical Roofline Toolkit'
-        self.sourcesdir = os.path.join(self.current_system.resourcesdir,
-                                       'roofline', 'ert')
-        self.build_system = 'SingleSource'
-        self.sourcepath = 'kernel1.c driver1.c'
-        self.executable = 'ert.exe'
-        self.build_system.ldflags = ['-O3', '-fopenmp']
-        self.sourcesdir = os.path.join(self.current_system.resourcesdir,
-                                       'roofline', 'ert')
-        self.rpt = '%s.rpt' % self.executable
-        self.maintainers = ['JG']
-        self.tags = {'scs', 'external-resources'}
-
-    def setup(self, partition, environ, **job_opts):
-        super().setup(partition, environ, **job_opts)
-        if self.num_tasks != 36:
-            self.job.launcher.options = ['--cpu-bind=verbose,none']
-# }}}
-
-
-# {{{ test
-@rfm.parameterized_test(*[[num_ranks, flop]
-                          for num_ranks in [1]
-                          for flop in [256, 512, 1024]])
-class ErtBroadwellTest(ErtTestBase):
-    def __init__(self, num_ranks, flop):
-        super().__init__()
-        ompthread = 1
-        self.valid_systems = ['daint:mc', 'dom:mc']
-        self.valid_prog_environs = ['PrgEnv-gnu']
-        self.build_system.cppflags = [
-            '-DERT_FLOP=%s' % flop,
-            '-DERT_ALIGN=32',
-            '-DERT_MEMORY_MAX=1073741824',
-            '-DERT_MPI=True',
-            '-DERT_OPENMP=True',
-            '-DERT_TRIALS_MIN=1',
-            '-DERT_WORKING_SET_MIN=1',
-        ]
-        self.name = 'ert_serial_FLOPS.{:04d}_MPI.{:03d}_OpenMP.{:03d}'.format(
-            flop, num_ranks, ompthread)
-        self.exclusive = True
-        self.num_tasks = num_ranks
-        self.num_tasks_per_node = num_ranks
-        self.num_cpus_per_task = ompthread
-        self.num_tasks_per_core = 1
-        self.use_multithreading = False
-        self.variables = {
-            'CRAYPE_LINK_TYPE': 'dynamic',
-            'OMP_NUM_THREADS': str(self.num_cpus_per_task)
-        }
-
-        # take the "slowest" job, make it sleep after it has ended and hope the
-        # other jobs have ended too
-        # TODO: find a better way to wait for the other jobs to end
-        num_ranks_min = 1
-        flop_min = 1024
-        self.roofline_rpt = 'rpt'
-        if num_ranks == num_ranks_min and flop == flop_min:
-            self.post_run = [
-                'cat *_job.out | python2 preprocess.py > pre',
-                'python2 maximum.py < pre > max',
-                'python2 summary.py < max > sum',
-                # give enough time for all the dependent jobs to collect data:
-                'sleep 60',
-                'cat ../ert_serial_FLOPS*/sum | python2 roofline.py > rpt',
-            ]
-
-        else:
-            self.post_run = [
-                'cat *_job.out | python2 preprocess.py > pre',
-                'python2 maximum.py < pre > max',
-                'python2 summary.py < max > sum',
-            ]
-
-        # --- Sanity check:
-        regex_datatype = (r'^\s+(?P<type>\w+) \* __restrict__ buf = '
-                          r'\(\w+ \*\)malloc\(PSIZE\);')
-        datatype = sn.extractsingle(regex_datatype, 'driver1.c', 'type')
-        self.sanity_patterns = sn.all([
-            sn.assert_found('GFLOPs', 'sum'),
-            sn.assert_eq(datatype, 'double'),
-        ])
-
-        # --- Performance check:
-        if num_ranks == num_ranks_min and flop == flop_min:
-            # Reference roofline boundaries for Intel BroadwellCPU (E5-2695v4):
-            ref_GFLOPs = 945.0
-            ref_L1bw = 1788.0
-            ref_L2bw = 855.0
-            ref_L3bw = 547.0
-            ref_DRAMbw = 70.5
-
-            # Typical performance report looks like:
-            # --------------------------------------
-            # ert_FLOPS.1024_MPI.001_OpenMP.036/rpt
-            #    908.43 GFLOPs EMP
-            #    ******
-            # META_DATA
-            # OPENMP_THREADS 1
-            # FLOPS          8
-            # MPI_PROCS      36
-            #
-            #   5647.33 L1 EMP
-            #   *******
-            #   3203.86 L2 EMP
-            #   *******
-            #   1773.58 L3 EMP
-            #   *******
-            #    139.56 L4 EMP
-            #    103.50 DRAM EMP
-            #    ******
-            # META_DATA
-            # FLOPS          2
-            # OPENMP_THREADS 1
-            # MPI_PROCS      36
-            regex_gflops = r'(?P<GFLOPs>\d+.\d+)\sGFLOPs EMP'
-            regex_L1bw = r'(?P<L1bw>\d+.\d+)\sL1 EMP'
-            regex_L2bw = r'(?P<L2bw>\d+.\d+)\sL2 EMP'
-            regex_L3bw = r'(?P<L3bw>\d+.\d+)\sL3 EMP'
-            regex_DRAMbw = r'(?P<DRAMbw>\d+.\d+) DRAM EMP'
-
-            gflops = sn.extractsingle(regex_gflops, self.roofline_rpt,
-                                      'GFLOPs', float)
-            L1bw = sn.extractsingle(regex_L1bw, self.roofline_rpt,
-                                    'L1bw', float)
-            L2bw = sn.extractsingle(regex_L2bw, self.roofline_rpt,
-                                    'L2bw', float)
-            L3bw = sn.extractsingle(regex_L3bw, self.roofline_rpt,
-                                    'L3bw', float)
-            DRAMbw = sn.extractsingle(regex_DRAMbw, self.roofline_rpt,
-                                      'DRAMbw', float)
-
-            # --performance-report:
-            self.perf_patterns = {
-                'gflops': gflops,
-                'L1bw': L1bw,
-                'L2bw': L2bw,
-                'L3bw': L3bw,
-                'DRAMbw': DRAMbw,
-            }
-
-            self.reference = {
-                '*': {
-                    'gflops': (ref_GFLOPs, -0.1, 0.5, 'GF/s'),
-                    'L1bw': (ref_L1bw, -0.1, 0.3, 'GB/s'),
-                    'L2bw': (ref_L2bw, -0.1, 0.3, 'GB/s'),
-                    'L3bw': (ref_L3bw, -0.1, 0.3, 'GB/s'),
-                    'DRAMbw': (ref_DRAMbw, -0.1, 0.3, 'GB/s'),
-                }
-            }
-
-        # else:
-
-# }}}
diff --git a/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert.py b/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert.py
deleted file mode 100644
index e9cef9be20..0000000000
--- a/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert.py
+++ /dev/null
@@ -1,164 +0,0 @@
-import os
-
-import reframe as rfm
-import reframe.utility.sanity as sn
-
-
-class ErtTestBase(rfm.RegressionTest):
-    '''
-    The Empirical Roofline Tool, ERT, automatically generates roofline data.
-    https://bitbucket.org/berkeleylab/cs-roofline-toolkit/
-    '''
-
-    def __init__(self):
-        super().__init__()
-        self.descr = 'Empirical Roofline Toolkit'
-        self.sourcesdir = os.path.join(self.current_system.resourcesdir,
-                                       'roofline', 'ert')
-        self.build_system = 'SingleSource'
-        self.sourcepath = 'kernel1.c driver1.c'
-        self.executable = 'ert.exe'
-        self.build_system.ldflags = ['-O3', '-fopenmp']
-        self.sourcesdir = os.path.join(self.current_system.resourcesdir,
-                                       'roofline', 'ert')
-        self.rpt = '%s.rpt' % self.executable
-        self.maintainers = ['JG']
-        self.tags = {'scs', 'external-resources'}
-
-    def setup(self, partition, environ, **job_opts):
-        super().setup(partition, environ, **job_opts)
-        if self.num_tasks != 36:
-            self.job.launcher.options = ['--cpu-bind=verbose,none']
-
-
-@rfm.parameterized_test(
-    *[[num_ranks, flop]
-      for num_ranks in [36, 18, 12, 9, 6, 4, 3, 2, 1]
-      for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]])
-class ErtBroadwellTest(ErtTestBase):
-    def __init__(self, num_ranks, flop):
-        super().__init__()
-        ompthread = 36 // num_ranks
-        self.valid_systems = ['daint:mc', 'dom:mc']
-        self.valid_prog_environs = ['PrgEnv-gnu']
-        self.build_system.cppflags = [
-            '-DERT_FLOP=%s' % flop,
-            '-DERT_ALIGN=32',
-            '-DERT_MEMORY_MAX=1073741824',
-            '-DERT_MPI=True',
-            '-DERT_OPENMP=True',
-            '-DERT_TRIALS_MIN=1',
-            '-DERT_WORKING_SET_MIN=1',
-        ]
-        self.name = 'ert_FLOPS.{:04d}_MPI.{:03d}_OpenMP.{:03d}'.format(
-            flop, num_ranks, ompthread)
-        self.exclusive = True
-        self.num_tasks = num_ranks
-        self.num_tasks_per_node = num_ranks
-        self.num_cpus_per_task = ompthread
-        self.num_tasks_per_core = 1
-        self.use_multithreading = False
-        self.variables = {
-            'CRAYPE_LINK_TYPE': 'dynamic',
-            'OMP_NUM_THREADS': str(self.num_cpus_per_task)
-        }
-
-        # take the "slowest" job, make it sleep after it has ended and hope the
-        # other jobs have ended too
-        # TODO: find a better way to wait for the other jobs to end
-        num_ranks_min = 1
-        flop_min = 1024
-        self.roofline_rpt = 'rpt'
-        if num_ranks == num_ranks_min and flop == flop_min:
-            self.post_run = [
-                'cat *_job.out | python2 preprocess.py > pre',
-                'python2 maximum.py < pre > max',
-                'python2 summary.py < max > sum',
-                # give enough time for all the dependent jobs to collect data:
-                'sleep 60',
-                'cat ../ert_FLOPS*/sum | python2 roofline.py > rpt',
-            ]
-
-        else:
-            self.post_run = [
-                'cat *_job.out | python2 preprocess.py > pre',
-                'python2 maximum.py < pre > max',
-                'python2 summary.py < max > sum',
-            ]
-
-        # --- Sanity check:
-        regex_datatype = (r'^\s+(?P<type>\w+) \* __restrict__ buf = '
-                          r'\(\w+ \*\)malloc\(PSIZE\);')
-        datatype = sn.extractsingle(regex_datatype, 'driver1.c', 'type')
-        self.sanity_patterns = sn.all([
-            sn.assert_found('GFLOPs', 'sum'),
-            sn.assert_eq(datatype, 'double'),
-        ])
-
-        # --- Performance check:
-        if num_ranks == num_ranks_min and flop == flop_min:
-            # Reference roofline boundaries for Intel BroadwellCPU (E5-2695v4):
-            ref_GFLOPs = 945.0
-            ref_L1bw = 1788.0
-            ref_L2bw = 855.0
-            ref_L3bw = 547.0
-            ref_DRAMbw = 70.5
-
-            # Typical performance report looks like:
-            # --------------------------------------
-            # ert_FLOPS.1024_MPI.001_OpenMP.036/rpt
-            #    908.43 GFLOPs EMP
-            #    ******
-            # META_DATA
-            # OPENMP_THREADS 1
-            # FLOPS          8
-            # MPI_PROCS      36
-            #
-            #   5647.33 L1 EMP
-            #   *******
-            #   3203.86 L2 EMP
-            #   *******
-            #   1773.58 L3 EMP
-            #   *******
-            #    139.56 L4 EMP
-            #    103.50 DRAM EMP
-            #    ******
-            # META_DATA
-            # FLOPS          2
-            # OPENMP_THREADS 1
-            # MPI_PROCS      36
-            regex_gflops = r'(?P<GFLOPs>\d+.\d+)\sGFLOPs EMP'
-            regex_L1bw = r'(?P<L1bw>\d+.\d+)\sL1 EMP'
-            regex_L2bw = r'(?P<L2bw>\d+.\d+)\sL2 EMP'
-            regex_L3bw = r'(?P<L3bw>\d+.\d+)\sL3 EMP'
-            regex_DRAMbw = r'(?P<DRAMbw>\d+.\d+) DRAM EMP'
-
-            gflops = sn.extractsingle(regex_gflops, self.roofline_rpt,
-                                      'GFLOPs', float)
-            L1bw = sn.extractsingle(regex_L1bw, self.roofline_rpt,
-                                    'L1bw', float)
-            L2bw = sn.extractsingle(regex_L2bw, self.roofline_rpt,
-                                    'L2bw', float)
-            L3bw = sn.extractsingle(regex_L3bw, self.roofline_rpt,
-                                    'L3bw', float)
-            DRAMbw = sn.extractsingle(regex_DRAMbw, self.roofline_rpt,
-                                      'DRAMbw', float)
-
-            # --performance-report:
-            self.perf_patterns = {
-                'gflops': gflops,
-                'L1bw': L1bw,
-                'L2bw': L2bw,
-                'L3bw': L3bw,
-                'DRAMbw': DRAMbw,
-            }
-
-            self.reference = {
-                '*': {
-                    'gflops': (ref_GFLOPs, -0.1, 0.5, 'GF/s'),
-                    'L1bw': (ref_L1bw, -0.1, 0.3, 'GB/s'),
-                    'L2bw': (ref_L2bw, -0.1, 0.3, 'GB/s'),
-                    'L3bw': (ref_L3bw, -0.1, 0.3, 'GB/s'),
-                    'DRAMbw': (ref_DRAMbw, -0.1, 0.3, 'GB/s'),
-                }
-            }
diff --git a/cscs-checks/tools/profiling_and_debugging/roofline/gpp_nvprof_roofline.py b/cscs-checks/tools/profiling_and_debugging/roofline/gpp_nvprof_roofline.py
deleted file mode 100644
index 305f49a9bd..0000000000
--- a/cscs-checks/tools/profiling_and_debugging/roofline/gpp_nvprof_roofline.py
+++ /dev/null
@@ -1,135 +0,0 @@
-import reframe as rfm
-import reframe.utility.sanity as sn
-
-
-class GPPBaseTest(rfm.RegressionTest):
-    '''This test checks the values reported by NVIDIA nvprof for roofline
-       modeling:
-       - https://github.com/cyanguwa/nersc-roofline/tree/master/GPP
-            (compile.survey and run.survey)
-       - https://cug.org/proceedings/protected/cug2019_proceedings/includes/
-            files/pap103s2-file1.pdf
-    '''
-    def __init__(self):
-        super().__init__()
-        self.descr = 'Roofline Analysis of the GPP code using NVIDIA nvprof'
-        self.sourcesdir = 'https://github.com/cyanguwa/nersc-roofline.git'
-        self.build_system = 'Make'
-        self.build_system.cxx = 'nvcc'
-        self.maintainers = ['JG']
-        self.tags = {'scs'}
-
-    @property
-    @sn.sanity_function
-    def flops(self):
-        flop_count_dp_avg = sn.extractsingle(
-            r'^.*flop_count_dp\s+Floating Point Operations\(Double Precision\)'
-            r'\s+.*(?P<x>\d\.\d+e\+\d+)$', self.stderr, 'x', float)
-        # print("#debug: flop_count_dp_avg={}".format(flop_count_dp_avg))
-        return flop_count_dp_avg
-
-    @property
-    @sn.sanity_function
-    def gflops_per_seconds(self):
-        sec = sn.extractsingle(
-            r'^\*+\sKernel Time Taken\s\*+=\s(?P<sec>\d+.\d+)\ssecs',
-            self.stdout, 'sec', float)
-        # print("#debug: sec={}".format(sec))
-        # print("#debug: flops={}".format(self.flops))
-        # print("#debug: gflops_per_seconds={}".format(self.flops/(sec*10**9)))
-        return (self.flops / (sec*10**9))
-
-    @property
-    @sn.sanity_function
-    def hbm_bytes(self):
-        dram_read_transactions_avg = sn.extractsingle(
-            r'^.*dram_read_transactions\s+Device Memory Read Transactions\s+.*'
-            r'(?P<x>\d\.\d+e\+\d+)$', self.stderr, 'x', float)
-        dram_write_transactions_avg = sn.extractsingle(
-            r'^.*dram_write_transactions\s+Device Memory Write Transactions\s+'
-            r'\d+\s+\d+\s+(?P<x>\d+)$', self.stderr, 'x', float)
-        transactions_size = 32.0
-        bytes = dram_read_transactions_avg + dram_write_transactions_avg
-        bytes = bytes * transactions_size
-        # print("#debug: dram_read_avg={}".format(dram_read_transactions_avg))
-        # print("#debug: dram_wr_avg={}".format(dram_write_transactions_avg))
-        # print("#debug: hbm_bytes={}".format(bytes))
-        return bytes
-
-    @property
-    @sn.sanity_function
-    def arithmetic_intensity(self):
-        # print("#debug: ai={}".format(self.flops/self.hbm_bytes))
-        return (self.flops / self.hbm_bytes)
-
-
-@rfm.parameterized_test(*[[iw, repeat, cache]
-                        for iw in [6]
-                        for repeat in [1, 2]
-                        for cache in ['HBM']])
-# To reproduce published results (on V100):
-#                        for iw in [1, 2, 3, 4, 5, 6]
-#                        for repeat in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-#                        for cache in ['L1', 'L2', 'HBM']])
-class P100Test(GPPBaseTest):
-    ''' Counters for Pascal P100 GPU:
-        userguide = 'https://docs.nvidia.com/cuda/profiler-users-guide'
-        metrics = '%s/index.html#metrics-reference-6x' % userguide
-    '''
-    def __init__(self, iw, repeat, cache):
-        super().__init__()
-        self.name = 'roofline_gpp_P100_iw{}_repeat{}_{}cache'.format(
-                    iw, repeat, cache)
-        self.valid_systems = ['dom:gpu']
-        self.valid_prog_environs = ['PrgEnv-gnu']
-        self.modules = ['craype-accel-nvidia60']
-        self.prebuild_cmd = [
-            'cd GPP/Volta',
-            # Pascal P100 GPU:
-            'sed -i "s-sm_70-sm_60-" Makefile',
-            # fma (fmad=true) vs nofma (fmad=false):
-            'sed -i "s/fmad=.*/fmad=true/g" Makefile',
-            # iw (loop size):
-            'sed -i "s/#define nend.*/#define nend %s/g" GPUComplex.h' % iw,
-        ]
-        self.executable = './fma_iw{}_rep{}_{}.exe'.format(iw, repeat, cache)
-        self.build_system.options = ['EXE=../../%s' % self.executable]
-        # 1: <n_bands> 2: <n_valence_bands> 3: <n_plane_waves>
-        # 4: <nodes_per_mpi_group> 5: <stride>
-        self.executable_opts = ['512', '2', '32768', '20', '0']
-        self.exclusive = True
-        self.num_tasks = 1
-        self.num_tasks_per_node = 1
-        self.num_cpus_per_task = 1
-        self.num_tasks_per_core = 1
-        self.use_multithreading = False
-        self.variables = {
-            'CRAYPE_LINK_TYPE': 'dynamic',
-            'OMP_NUM_THREADS': str(self.num_cpus_per_task)
-        }
-        self.nvprof_metrics = {
-            'L1': ['flop_count_dp', 'gld_transactions', 'gst_transactions',
-                   'atomic_transactions', 'local_load_transactions',
-                   'local_store_transactions', 'shared_load_transactions',
-                   'shared_store_transactions'],
-            'L2': ['flop_count_dp', 'l2_read_transactions',
-                   'l2_write_transactions'],
-            'HBM': ['flop_count_dp', 'dram_read_transactions',
-                    'dram_write_transactions'],
-            'PCIe/NVLINK': ['flop_count_dp', 'system_read_transactions',
-                            'system_write_transactions']
-        }
-        sep = ' --metrics '
-        nvmetrics = sep.join(self.nvprof_metrics[cache])
-        self.post_run = [
-            'nvprof --kernels "NumBandNgpown_kernel" --metrics %s %s %s' %
-            (nvmetrics, self.executable, ' '.join(self.executable_opts))
-        ]
-        # References for Nvidia P100 (HBM, iw=6):
-        gflops = 2796.6
-        ai = 13.6
-        self.sanity_patterns = sn.all([
-            sn.assert_found('P100-PCIE-16GB', self.stderr),
-            sn.assert_reference(self.gflops_per_seconds, gflops, -0.5, 0.5),
-            sn.assert_reference(self.arithmetic_intensity, ai, -0.5, 0.5),
-        ])
diff --git a/cscs-checks/tools/profiling_and_debugging/roofline/intel_advisor_roofline.py b/cscs-checks/tools/profiling_and_debugging/roofline/intel_advisor_roofline.py
deleted file mode 100644
index 5eb3e82580..0000000000
--- a/cscs-checks/tools/profiling_and_debugging/roofline/intel_advisor_roofline.py
+++ /dev/null
@@ -1,276 +0,0 @@
-import os
-
-import reframe as rfm
-import reframe.utility.sanity as sn
-
-
-@rfm.parameterized_test(*[[repeat, toolversion, datalayout]
-                          for repeat in ['100000']
-                          for toolversion in ['597843']
-                          for datalayout in ['G3_AOS_SCALAR', 'G3_SOA_SCALAR',
-                                             'G3_AOS_VECTOR', 'G3_SOA_VECTOR']
-                          ])
-class IntelRooflineAdvisorTest(rfm.RegressionTest):
-    '''This test checks the values reported by Intel Advisor's roofline model:
-    https://software.intel.com/en-us/intel-advisor-xe
-
-    The roofline model is based on GFLOPS and Arithmetic Intensity (AI):
-      "Self GFLOPS" = "Self GFLOP" / "Self Elapsed Time"
-      "Self GB/s" = "Self Memory GB" / "Self Elapsed Time"
-      "Self AI" = "Self GFLOPS" / "Self GB/s"
-
-    While a roofline analysis flag exists ('advixe-cl -collect roofline'), it
-    may not be used to collect data on MPI applications; in that case, the
-    survey and flops analysis must be collected separately: first run a survey
-    analysis ('advixe-cl -collect survey') and then run a tripcounts+flops
-    analysis ('advixe-cl -collect tripcounts -flop') using the same project
-    directory for both steps.
-
-    Example result on 1 core of Intel Broadwell CPU (E5-2695 v4):
-        G3_AOS_SCALAR: gflops,  2.79 arithmetic_intensity', 0.166 380ms <- slow
-        G3_AOS_VECTOR: gflops,  3.79 arithmetic_intensity', 0.125 143ms
-        G3_SOA_SCALAR: gflops,  2.79 arithmetic_intensity', 0.166 351ms
-        G3_SOA_VECTOR: gflops, 10.62 arithmetic_intensity', 0.166  57ms <- fast
-    '''
-
-    def __init__(self, repeat, toolversion, datalayout):
-        self.descr = 'Roofline Analysis test with Intel Advisor'
-        # for reference: advisor/2019 was failing on dom with:
-        # "Exceeded job memory limit" (webrt#36087)
-        self.valid_systems = ['daint:mc', 'dom:mc']
-        # Reporting MFLOPS is not available on Intel Haswell cpus, see
-        # https://www.intel.fr/content/dam/www/public/us/en/documents/manuals/
-        # 64-ia-32-architectures-software-developer-vol-1-manual.pdf
-        self.valid_prog_environs = ['PrgEnv-intel']
-        self.modules = ['advisor/2019_update4']
-        # Testing with advisor/2018 (build 551025) fails with:
-        #    roof.dir/nid00753.000/trc000/trc000.advixe
-        #    Application exit code: 139
-        self.sourcesdir = os.path.join(self.current_system.resourcesdir,
-                                       'roofline', 'intel_advisor')
-        self.build_system = 'SingleSource'
-        self.sourcepath = '_roofline.cpp'
-        self.executable = 'advixe-cl'
-        self.target_executable = './roof.exe'
-        self.build_system.cppflags = ['-D_ADVISOR',
-                                      '-I$ADVISOR_2019_DIR/include']
-        self.prgenv_flags = {
-            'PrgEnv-intel': ['-g', '-O2', '-std=c++11', '-restrict'],
-        }
-        self.build_system.ldflags = ['-L$ADVISOR_2019_DIR/lib64 -littnotify']
-        self.roofline_rpt = '%s.rpt' % self.target_executable
-        self.version_rpt = 'Intel_Advisor_version.rpt'
-        self.roofline_ref = 'Intel_Advisor_roofline_reference.rpt'
-        self.prebuild_cmd = [
-            'patch -s < ADVISOR/roofline_template.patch',
-            'sed -e "s-XXXX-%s-" -e "s-YYYY-%s-" %s &> %s' %
-            (repeat, datalayout, 'roofline_template.cpp', '_roofline.cpp')
-        ]
-        self.exclusive = True
-        self.num_tasks = 1
-        self.num_tasks_per_node = 1
-        self.num_cpus_per_task = 1
-        self.num_tasks_per_core = 1
-        self.use_multithreading = False
-        self.variables = {
-            'OMP_NUM_THREADS': str(self.num_cpus_per_task),
-            'CRAYPE_LINK_TYPE': 'dynamic',
-        }
-        self.pre_run = [
-            'mv %s %s' % (self.executable, self.target_executable),
-            'advixe-cl -help collect | head -20',
-        ]
-        self.roofdir = './roof.dir'
-        self.executable_opts = [
-            '--collect survey --project-dir=%s --search-dir src:rp=. '
-            '--data-limit=0 --no-auto-finalize --trace-mpi -- %s ' %
-            (self.roofdir, self.target_executable)
-        ]
-        # - Reference roofline boundaries for Intel Broadwell CPU (E5-2695 v4):
-        L1bw = 293  # *1024**3
-        L2bw = 79   # *1024**3
-        L3bw = 33   # *1024**3
-        DPfmabw = 45*1024**3
-        DPaddbw = 12*1024**3
-        ScalarAddbw = 3*1024**3
-        # --- roofline (memory) boundaries from the tool:
-        # DRAM Bandwidth (single node)             	   63206331080	 memory
-        # DRAM Bandwidth                           	  125993278750	 memory
-        # DRAM Bandwidth (single-threaded)         	   12715570803	 memory
-        # L1 Bandwidth                             	11360856466728	 memory
-        # Scalar L1 Bandwidth                      	 2648216636280	 memory
-        # L1 bandwidth (single-threaded)           	  315579346298	 memory
-        #                                                 ************
-        # Scalar L1 bandwidth (single-threaded)    	   73561573230	 memory
-        # L2 Bandwidth                             	 3102773429268	 memory
-        # Scalar L2 Bandwidth                      	  921316779936	 memory
-        # L2 bandwidth (single-threaded)           	   86188150813	 memory
-        #                                                  ***********
-        # Scalar L2 bandwidth (single-threaded)    	   25592132776	 memory
-        # L3 Bandwidth                             	 1269637300440	 memory
-        # Scalar L3 Bandwidth                      	  845928498744	 memory
-        # L3 bandwidth (single-threaded)           	   35267702790	 memory
-        #                                                  ***********
-        # Scalar L3 bandwidth (single-threaded)    	   23498013854	 memory
-        regex_roof_L1 = (r'^L1\sbandwidth\s\(single-threaded\)\s+(?P<L1bw>\d+)'
-                         r'\s+memory$')
-        regex_roof_L2 = (r'^L2\sbandwidth\s\(single-threaded\)\s+(?P<L2bw>\d+)'
-                         r'\s+memory$')
-        regex_roof_L3 = (r'^L3\sbandwidth\s\(single-threaded\)\s+(?P<L3bw>\d+)'
-                         r'\s+memory$')
-        roof_L1 = sn.round(sn.extractsingle(regex_roof_L1, self.roofline_ref,
-                                            'L1bw', int) / 1024**3, 2)
-        roof_L2 = sn.round(sn.extractsingle(regex_roof_L2, self.roofline_ref,
-                                            'L2bw', int) / 1024**3, 3)
-        roof_L3 = sn.round(sn.extractsingle(regex_roof_L3, self.roofline_ref,
-                                            'L3bw', int) / 1024**3, 3)
-
-        # --- roofline (compute) boundaries from the tool:
-        # SP Vector FMA Peak                       	 2759741518342	compute
-        # SP Vector FMA Peak (single-threaded)     	   98956234406	compute
-        # DP Vector FMA Peak                       	 1379752337990	compute
-        # DP Vector FMA Peak (single-threaded)     	   49563336304	compute
-        #                                                  ***********
-        # Scalar Add Peak                          	   93438527464	compute
-        # Scalar Add Peak (single-threaded)        	    3289577753	compute
-        #                                                   **********
-        # SP Vector Add Peak                       	  689944922272	compute
-        # SP Vector Add Peak (single-threaded)     	   24691445241	compute
-        # DP Vector Add Peak                       	  344978547363	compute
-        # DP Vector Add Peak (single-threaded)     	   12385333008	compute
-        #                                                  ***********
-        # Integer Scalar Add Peak                  	  228677310757	compute
-        # Integer Scalar Add Peak (single-threaded)	    8055287031	compute
-        # Int64 Vector Add Peak                    	  747457604632	compute
-        # Int64 Vector Add Peak (single-threaded)  	   26300241032	compute
-        # Int32 Vector Add Peak                    	 1494880413924	compute
-        # Int32 Vector Add Peak (single-threaded)  	   52738180380	compute
-        regex_roof_dpfma = (r'^DP Vector FMA Peak\s\(single-threaded\)\s+'
-                            r'(?P<DPfmabw>\d+)\s+compute$')
-        regex_roof_dpadd = (r'^DP Vector Add Peak\s\(single-threaded\)\s+'
-                            r'(?P<DPaddbw>\d+)\s+compute$')
-        regex_roof_scalaradd = (r'^Scalar Add Peak\s\(single-threaded\)\s+'
-                                r'(?P<ScalarAddbw>\d+)\s+compute$')
-        roof_dpfma = sn.extractsingle(regex_roof_dpfma, self.roofline_ref,
-                                      'DPfmabw', int)
-        roof_dpadd = sn.extractsingle(regex_roof_dpadd, self.roofline_ref,
-                                      'DPaddbw', int)
-        roof_scalaradd = sn.extractsingle(regex_roof_scalaradd,
-                                          self.roofline_ref, 'ScalarAddbw',
-                                          int)
-
-        # - API output:
-        # ('self_elapsed_time', 0.1)
-        # ('self_memory_gb', 4.2496)
-        # ('self_gb_s', 42.496)
-        # ('self_gflop', 0.5312)
-        # ('self_gflops', 5.312)
-        # ('self_arithmetic_intensity', 0.125)
-        # ('_self_gb_s', 42.495999999999995, 42.496)
-        # ('_self_gflops', 5.311999999999999, 5.312)
-        # ('_self_arithmetic_intensity', 0.125, 0.125)
-        # ('gap _self_gb_s', -7.105427357601002e-15)
-        # ('gap _self_gflops', -8.881784197001252e-16)
-        # ('gap _self_arithmetic_intensity', 0.0)
-        # returned AI gap = 0.0000000000000000
-        # returned GFLOPS gap = -0.0000000000000009
-        regex_ai_gap = r'^returned\sAI\sgap\s=\s(?P<Intensity>.*)'
-        regex_ai_gflops = r'^returned\sGFLOPS\sgap\s=\s(?P<Flops>.*)'
-        ai_gap = sn.extractsingle(regex_ai_gap, self.roofline_rpt, 'Intensity',
-                                  float)
-        ai_gflops = sn.extractsingle(regex_ai_gflops, self.roofline_rpt,
-                                     'Flops', float)
-
-        regex_toolversion = r'I*.\(build\s(?P<version>\d+)\s*.'
-        found_toolversion = sn.extractsingle(regex_toolversion,
-                                             self.version_rpt, 'version')
-        self.sanity_patterns = sn.all([
-            # check the job status:
-            sn.assert_found('loop complete.', self.stdout),
-            # check the tool's version (2019=591264, 2018=551025):
-            sn.assert_eq(found_toolversion, toolversion),
-            # --- roofline boundaries:
-            # check --report=roofs (L1, L2 and L3 bandwidth):
-            # sn.assert_reference(roof_L1, L1bw, -0.12, 0.08),
-            # sn.assert_reference(roof_L2, L2bw, -0.12, 0.08),
-            # sn.assert_reference(roof_L3, L3bw, -0.12, 0.08),
-            # check --report=roofs (DP FMA, DP Add and Scalar Add):
-            sn.assert_reference(roof_dpfma, DPfmabw, -0.12, 0.08),
-            sn.assert_reference(roof_dpadd, DPaddbw, -0.12, 0.08),
-            sn.assert_reference(roof_scalaradd, ScalarAddbw, -0.12, 0.08),
-            # --- check Arithmetic_intensity:
-            sn.assert_reference(ai_gap, 0.0, -0.01, 0.01),
-            # --- check GFLOPS:
-            sn.assert_reference(ai_gflops, 0.0, -0.01, 0.01),
-        ])
-
-        # --performance-report:
-        regex_mseconds = r'elapsed time: (?P<msec>\d+)ms'
-        regex_ai = r'^\(\'self_arithmetic_intensity\', (?P<AI>\d+.\d+)\)'
-        regex_gbs = r'^\(\'self_gb_s\', (?P<gbs>\d+.\d+)\)'
-        regex_gflops = r'^\(\'self_gflops\', (?P<gflops>\d+.\d+)\)'
-        mseconds = sn.extractsingle(regex_mseconds, self.stdout,
-                                    'msec', int)
-        arithmetic_intensity = sn.extractsingle(regex_ai, self.roofline_rpt,
-                                                'AI', float)
-        bandwidth = sn.extractsingle(regex_gbs, self.roofline_rpt,
-                                     'gbs', float)
-        gflops = sn.extractsingle(regex_gflops, self.roofline_rpt,
-                                  'gflops', float)
-        self.perf_patterns = {
-            'Elapsed': mseconds,
-            'ArithmeticIntensity': arithmetic_intensity,
-            'GFlops': gflops,
-            'Bandwidth': bandwidth,
-            'roof_L1': roof_L1,
-            'roof_L2': roof_L2,
-            'roof_L3': roof_L3,
-        }
-        self.reference = {
-            '*': {
-                'Elapsed': (0, None, None, 'ms'),
-                'ArithmeticIntensity': (0, None, None, ''),
-                'GFlops': (0, None, None, 'GFLOPs/s'),
-                'Bandwidth': (0, None, None, 'GB/s'),
-                'roof_L1': (L1bw, -0.12, 0.08, 'GB/s'),
-                'roof_L2': (L2bw, -0.12, 0.08, 'GB/s'),
-                'roof_L3': (L3bw, -0.12, 0.08, 'GB/s'),
-            }
-        }
-
-        self.maintainers = ['JG', 'MKr']
-        self.tags = {'external-resources'}
-
-    def setup(self, partition, environ, **job_opts):
-        super().setup(partition, environ, **job_opts)
-        environ_name = self.current_environ.name
-        prgenv_flags = self.prgenv_flags[environ_name]
-        self.build_system.cxxflags = prgenv_flags
-        launcher_cmd = ' '.join(self.job.launcher.command(self.job))
-        self.post_run = [
-            # --- collecting the performance data for the roofline model is a 2
-            # steps process:
-            '%s %s --collect tripcounts --flop --project-dir=%s '
-            '--search-dir src:rp=. --data-limit=0 --no-auto-finalize '
-            '--trace-mpi -- %s' %
-            (launcher_cmd, self.executable, self.roofdir,
-             self.target_executable),
-            # --- check tool's version:
-            'advixe-cl -V &> %s' % self.version_rpt,
-            # "advixe-cl --report" looks for e000/ in the output directory;
-            # if not found, it will fail with:
-            # IOError: Survey result cannot be loaded
-            'cd %s;ln -s nid* e000;cd -' % self.roofdir,
-            # --- report reference values/boundaries (roofline_ref):
-            'advixe-cl --report=roofs --project-dir=%s &> %s' %
-            (self.roofdir, self.roofline_ref),
-            'python2 API/cscs.py %s &> %s' % (self.roofdir, self.roofline_rpt),
-            'touch the_end',
-            # 'advixe-cl --format=csv' seems to be not working (empty report),
-            # keeping as reference for a future check:
-            #   'advixe-cl --show-all-columns -csv-delimiter=";"'
-            #   ' --report=tripcounts --format=csv --project-dir=%s &> %s'
-            # This can be used instead (see advisor/config/report/roofs.tmpl):
-            #   'advixe-cl --report custom --report-template ./TEMPL/cscs.tmpl'
-            #   ' --project-dir=%s &> %s'
-        ]
diff --git a/cscs-checks/tools/profiling_and_debugging/roofline/intel_sde_roofline.py b/cscs-checks/tools/profiling_and_debugging/roofline/intel_sde_roofline.py
deleted file mode 100644
index ab98a6cefb..0000000000
--- a/cscs-checks/tools/profiling_and_debugging/roofline/intel_sde_roofline.py
+++ /dev/null
@@ -1,129 +0,0 @@
-import os
-
-import reframe as rfm
-import reframe.utility.sanity as sn
-
-
-@rfm.parameterized_test(*[[repeat, toolsversion, datalayout]
-                          for repeat in ['100000']
-                          for toolsversion in ['8.35.0']
-                          for datalayout in ['G3_AOS_SCALAR', 'G3_SOA_SCALAR',
-                                             'G3_AOS_VECTOR', 'G3_SOA_VECTOR']
-                          ])
-class IntelRooflineSdeTest(rfm.RegressionTest):
-    '''This test checks the values reported by Intel SDE for roofline modeling:
-       - https://software.intel.com/en-us/articles/
-            intel-software-development-emulator
-       - https://bitbucket.org/dwdoerf/stream-ai-example/src/master/
-       - https://www.nersc.gov/
-            users/application-performance/measuring-arithmetic-intensity
-    '''
-    def __init__(self, repeat, toolsversion, datalayout):
-        super().__init__()
-        self.descr = 'Roofline Analysis test with Intel SDE'
-        self.valid_systems = ['dom:mc']
-        # Reporting MFLOPS is not available on Intel Haswell cpus, see
-        # https://www.intel.fr/content/dam/www/public/us/en/documents/manuals/
-        # 64-ia-32-architectures-software-developer-vol-1-manual.pdf
-        self.valid_prog_environs = ['PrgEnv-intel']
-        self.sourcesdir = os.path.join(self.current_system.resourcesdir,
-                                       'roofline', 'intel_advisor')
-        self.build_system = 'SingleSource'
-        self.sourcepath = '_roofline.cpp'
-        self.executable = 'sde'
-        self.target_executable = './roof.exe'
-        self.sde = '%s.sde' % self.target_executable
-        self.rpt = '%s.rpt' % self.target_executable
-        self.prebuild_cmd = [
-            'patch < SDE/roofline_template.patch',
-            'sed -e "s-XXXX-%s-" -e "s-YYYY-%s-" %s &> %s' %
-            (repeat, datalayout, 'roofline_template.cpp', '_roofline.cpp')
-        ]
-        self.build_system.cppflags = ['-D_SDE']
-        self.build_system.ldflags = ['-g', '-O3', '-qopenmp', '-restrict',
-                                     '-qopt-streaming-stores', 'always',
-                                     '-std=c++11']
-        self.exclusive = True
-        self.num_tasks = 1
-        self.num_tasks_per_node = 1
-        self.num_cpus_per_task = 1
-        self.num_tasks_per_core = 1
-        self.use_multithreading = False
-        self.variables = {
-            'CRAYPE_LINK_TYPE': 'dynamic',
-            'OMP_NUM_THREADS': str(self.num_cpus_per_task),
-        }
-        exp = '/apps/dom/UES/jenkins/7.0.UP00/mc/easybuild/experimental'
-        self.pre_run = [
-            'mv %s %s' % (self.executable, self.target_executable),
-            'module use %s/modules/all' % exp,
-            'module load sde',
-            'sde -help'
-        ]
-        self.sdeflags = ['%s -d -iform 1 -omix %s -i -top_blocks 500 '
-                         '-global_region -start_ssc_mark 111:repeat '
-                         '-stop_ssc_mark 222:repeat -- %s' %
-                         ('-bdw', self.sde, self.target_executable)]
-        self.executable_opts = self.sdeflags
-        self.sanity_patterns = sn.assert_found('Total FLOPs =', self.rpt)
-        self.post_run = ['SDE/parse-sde.sh %s.* &> %s' % (self.sde, self.rpt)]
-        self.maintainers = ['JG']
-        self.tags = {'scs', 'external-resources'}
-        self.sanity_patterns = sn.all([
-            sn.assert_eq(sn.extractsingle(
-                r'^Intel\(R\) Software Development Emulator\.  Version:  '
-                r'(?P<toolsversion>\d+\.\d+\.\d+)', self.stdout,
-                'toolsversion'), toolsversion),
-        ])
-        # References for Intel Broadwell CPU (E5-2695 v4):
-        references = {
-            'G3_AOS_SCALAR': {
-                'dom:mc': {
-                    'gflops': (0.596, -0.1, 0.3, 'Gflop/s'),
-                    'ai': (0.16, -0.05, 0.05, 'flop/byte')
-                }
-            },
-            'G3_SOA_SCALAR': {
-                'dom:mc': {
-                    'gflops': (0.612, -0.1, 0.3, 'Gflop/s'),
-                    'ai': (0.16, -0.05, 0.05, 'flop/byte')
-                }
-            },
-            'G3_AOS_VECTOR': {
-                'dom:mc': {
-                    'gflops': (1.152, -0.1, 0.3, 'Gflop/s'),
-                    'ai': (0.125, -0.05, 0.05, 'flop/byte')
-                }
-            },
-            'G3_SOA_VECTOR': {
-                'dom:mc': {
-                    'gflops': (1.125, -0.1, 0.3, 'Gflop/s'),
-                    'ai': (0.16, -0.05, 0.05, 'flop/byte')
-                }
-            },
-        }
-        self.reference = references[datalayout]
-        self.perf_patterns = {
-            'gflops': self.gflops,
-            'ai': self.arithmetic_intensity,
-        }
-
-    @property
-    @sn.sanity_function
-    def arithmetic_intensity(self):
-        flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)',
-                                 self.rpt, 'flops', int)
-        bytes = sn.extractsingle(r'^--->Total Bytes = (?P<bytes>\d+)',
-                                 self.rpt, 'bytes', int)
-        # debug: print('ai={}'.format(flops/bytes))
-        return flops/bytes
-
-    @property
-    @sn.sanity_function
-    def gflops(self):
-        flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)',
-                                 self.rpt, 'flops', int)
-        msec = sn.extractsingle(r'^elapsed time: (?P<msec>\d+)ms', self.stdout,
-                                'msec', float)
-        # debug: print('gflops={}'.format(flops/((msec/1000)*10**6)))
-        return (flops/((msec/1000))/10**9)
diff --git a/cscs-checks/tools/profiling_and_debugging/roofline/intel_vtune_roofline.py b/cscs-checks/tools/profiling_and_debugging/roofline/intel_vtune_roofline.py
deleted file mode 100644
index ad745a924b..0000000000
--- a/cscs-checks/tools/profiling_and_debugging/roofline/intel_vtune_roofline.py
+++ /dev/null
@@ -1,298 +0,0 @@
-import os
-
-import reframe as rfm
-import reframe.utility.sanity as sn
-
-
-@rfm.parameterized_test(*[[repeat, toolsversion, datalayout]
-                          for repeat in ['500000']
-                          for toolsversion in ['597835']
-                          for datalayout in ['G3_AOS_SCALAR', 'G3_SOA_SCALAR',
-                                             'G3_AOS_VECTOR', 'G3_SOA_VECTOR']
-                          ])
-class IntelRooflineVtuneTest(rfm.RegressionTest):
-    '''This test checks the values reported by Vtune for roofline modeling:
-       https://docs.nersc.gov/programming/performance-debugging-tools/roofline/
-
-    Example result on 1 core of Intel Broadwell CPU (E5-2695 v4):
-        G3_AOS_SCALAR: DP GFLOPS:  3.162 Time: 0.854s <-- slow
-        G3_AOS_VECTOR: DP GFLOPS:  5.731 Time: 0.440s
-        G3_SOA_SCALAR: DP GFLOPS:  3.183 Time: 0.848s
-        G3_SOA_VECTOR: DP GFLOPS: 21.423 Time: 0.134s <-- fast
-    '''
-    def __init__(self, repeat, toolsversion, datalayout):
-        super().__init__()
-        self.descr = 'Roofline Analysis test with Intel Vtune'
-        self.debug = False
-        self.valid_systems = ['dom:mc']
-        # Reporting MFLOPS is not available on Intel Haswell cpus, see
-        # https://www.intel.fr/content/dam/www/public/us/en/documents/manuals/
-        # 64-ia-32-architectures-software-developer-vol-1-manual.pdf
-        self.valid_prog_environs = ['PrgEnv-intel']
-        self.modules = ['vtune_amplifier']
-        self.sourcesdir = os.path.join(self.current_system.resourcesdir,
-                                       'roofline', 'intel_advisor')
-        self.build_system = 'SingleSource'
-        self.sourcepath = '_roofline.cpp'
-        self.executable = 'amplxe-cl'
-        self.target_executable = './roof.exe'
-        self.build_system.cppflags = ['-D_ADVISOR',
-                                      '-I$VTUNE_AMPLIFIER_2019_DIR/include']
-        self.prgenv_flags = {
-            'PrgEnv-intel': ['-g', '-O2', '-std=c++11', '-restrict'],
-            # TODO: evaluate '-qopt-streaming-stores', 'always',
-        }
-        self.build_system.ldflags = ['-L$VTUNE_AMPLIFIER_2019_DIR/lib64',
-                                     '-littnotify']
-        self.roofline_rpt = '%s.rpt' % self.target_executable
-        self.version_rpt = 'version.rpt'
-        self.roofline_ref = 'reference.rpt'
-        self.prebuild_cmd = [
-            'patch -s < ADVISOR/roofline_template.patch',
-            'sed -e "s-XXXX-%s-" -e "s-YYYY-%s-" %s &> %s' %
-            (repeat, datalayout, 'roofline_template.cpp', '_roofline.cpp')
-        ]
-        self.exclusive = True
-        self.num_tasks = 1
-        self.num_tasks_per_node = 1
-        self.num_cpus_per_task = 1
-        self.num_tasks_per_core = 1
-        self.use_multithreading = False
-        self.variables = {
-            'OMP_NUM_THREADS': str(self.num_cpus_per_task),
-            'CRAYPE_LINK_TYPE': 'dynamic',
-        }
-        self.pre_run = [
-            'mv %s %s' % (self.executable, self.target_executable),
-            '%s --version &> %s' % (self.executable, self.version_rpt),
-            '%s -help | head -20' % self.executable,
-        ]
-        self.roofdir = './roof.dir'
-        self.executable_opts = [
-            '-start-paused -r %s -collect hpc-performance -data-limit=0 '
-            '--search-dir src:rp=. --trace-mpi -- %s' %
-            (self.roofdir, self.target_executable)
-        ]
-        # NOTE: -allow-multiple-runs requires to install vtune drivers
-        # TODO: -collect memory-access
-        self.maintainers = ['JG']
-        self.tags = {'scs', 'external-resources'}
-        self.sanity_patterns = sn.all([
-            sn.assert_found('loop complete.', self.stdout),
-            sn.assert_eq(sn.extractsingle(
-                r'I*.\(build\s(?P<toolsversion>\d+)\s*.',
-                self.version_rpt, 'toolsversion'), toolsversion),
-        ])
-        # References for Intel Broadwell CPU (E5-2695 v4):
-        references = {
-            'G3_AOS_SCALAR': {
-                'dom:mc': {
-                    'gflops': (3.1, -0.1, None, 'Gflop/s'),
-                    'compare_sec': (0, -0.1, 0.1, 'seconds'),
-                    'compare_gflops': (0, -0.2, 0.2, 'Gflop/s'),
-                }
-            },
-            'G3_AOS_VECTOR': {
-                'dom:mc': {
-                    'gflops': (5.7, -0.1, None, 'Gflop/s'),
-                    'compare_sec': (0, -0.1, 0.1, 'seconds'),
-                    'compare_gflops': (0, -0.2, 0.2, 'Gflop/s'),
-                }
-            },
-            'G3_SOA_SCALAR': {
-                'dom:mc': {
-                    'gflops': (3.1, -0.1, None, 'Gflop/s'),
-                    'compare_sec': (0, -0.1, 0.1, 'seconds'),
-                    'compare_gflops': (0, -0.2, 0.2, 'Gflop/s'),
-                }
-            },
-            'G3_SOA_VECTOR': {
-                'dom:mc': {
-                    'gflops': (21.0, -0.1, None, 'Gflop/s'),
-                    'compare_sec': (0, -0.1, 0.1, 'seconds'),
-                    'compare_gflops': (0, -0.2, 0.2, 'Gflop/s'),
-                }
-            },
-        }
-        self.reference = references[datalayout]
-        self.perf_patterns = {
-            'gflops': self.gflops_reported,
-            'compare_sec': self.runtime_diff,
-            'compare_gflops': self.gflops_diff,
-            # TODO: 'ai': self.arithmetic_intensity,
-        }
-
-    def setup(self, partition, environ, **job_opts):
-        super().setup(partition, environ, **job_opts)
-        run_cmd = ' '.join(self.job.launcher.command(self.job))
-        self.clk_rpt = '%s_CLK.rpt' % self.target_executable
-        self.DPscalar_rpt = '%s_DP_scalar.rpt' % self.target_executable
-        self.DP128B_rpt = '%s_DP_128B.rpt' % self.target_executable
-        self.DP256B_rpt = '%s_DP_256B.rpt' % self.target_executable
-        perf_metrics = [
-            ('CPU_CLK_UNHALTED.THREAD', self.clk_rpt),
-            ('FP_ARITH_INST_RETIRED.SCALAR_DOUBLE', self.DPscalar_rpt),
-            ('FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE', self.DP128B_rpt),
-            ('FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE', self.DP256B_rpt)]
-        self.post_run = []
-        for perf_metric, perf_rpt in perf_metrics:
-            self.post_run += [
-                '%s %s -report hw-events -group-by=package -r %s.* -column=%s '
-                '&> %s' %
-                (run_cmd, self.executable, self.roofdir, perf_metric, perf_rpt)
-            ]
-        partitiontype = partition.fullname.split(':')[1]
-        if partitiontype == 'gpu':
-            self.job.options = ['--constraint="gpu&perf"']
-        elif partitiontype == 'mc':
-            self.job.options = ['--constraint="mc&perf"']
-
-    # --- Elapsed Time:
-    @property
-    @sn.sanity_function
-    def runtime_reported(self):
-        sec = sn.extractsingle(r'^Elapsed Time: (?P<sec>\S+)s', self.stdout,
-                               'sec', float)
-        if self.debug:
-            print('sec1={}'.format(sec))
-
-        return sec
-
-    @property
-    @sn.sanity_function
-    def runtime_metric(self):
-        # CPU_CLK_UNHALTED.THREAD:
-        mclk = sn.extractsingle(r'^package_0\s+(?P<clk>\d+)',
-                                self.clk_rpt, 'clk', float)
-        # GHz:
-        ghz = sn.extractsingle(r'^\s+Average CPU Frequency: (?P<ghz>\S+) GHz',
-                               self.stdout, 'ghz', float)
-        # 1 Hz = 1 cycle / 1 second
-        sec = (mclk * 10**6) / (ghz * 10**9)
-        if self.debug:
-            print('sec2={}'.format(sec))
-        return sec
-
-    @property
-    @sn.sanity_function
-    def runtime_diff(self):
-        sec = self.runtime_reported - self.runtime_metric
-        if self.debug:
-            print('sec3={}'.format(sec))
-        return sec
-
-    # --- GFLOPS/sec:
-    @property
-    @sn.sanity_function
-    def gflops_reported(self):
-        gflops = sn.extractsingle(r'^\s+DP GFLOPS: (?P<gflops>\S+)',
-                                  self.stdout, 'gflops', float)
-        if self.debug:
-            print('gflops1={}'.format(gflops))
-        return gflops
-
-    @property
-    @sn.sanity_function
-    def gflops_metric(self):
-        # > srun -Cmc,perf -n1 -t1 likwid-perfctr -g FLOPS_DP -H
-        # DP MFLOP/s = 1.0E-06*(x*2 + y + z*4)/runtime where:
-        #  x = FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*
-        #  y = FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
-        #  z = FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
-        #  TODO: check units with:
-        #       ^.*Hardware Event Count.*\((?P<unit>\S+)\)
-        # amplxe-cl -report hw-events -r roof.dir.nid00406/ -column=?
-
-        # FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE:
-        DP128B = sn.extractsingle(r'^package_0\s+(?P<M>\d+)',
-                                  self.DP128B_rpt, 'M', float)
-        # FP_ARITH_INST_RETIRED.SCALAR_DOUBLE:
-        DPscalar = sn.extractsingle(r'^package_0\s+(?P<M>\d+)',
-                                    self.DPscalar_rpt, 'M', float)
-        # FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE:
-        DP256B = sn.extractsingle(r'^package_0\s+(?P<M>\d+)',
-                                  self.DP256B_rpt, 'M', float)
-
-        mflops = (DP128B*2 + DPscalar + DP256B*4) / self.runtime_reported
-        gflops = mflops / 10**3
-        if self.debug:
-            print('DP128B={}'.format(DP128B))
-            print('DPscalar={}'.format(DPscalar))
-            print('DP256B={}'.format(DP256B))
-            print('runtime={}'.format(self.runtime_reported))
-            print('gflops2={}'.format(gflops))
-        return gflops
-
-    @property
-    @sn.sanity_function
-    def gflops_diff(self):
-        gflops = self.gflops_reported - self.gflops_metric
-        if self.debug:
-            print('gflops3={}'.format(gflops))
-        return gflops
-
-    # NOTE: Bandwidth data is missing for a full roofline model.
-    # Other tools (advisor, likwid, sde) may help:
-    #  > srun -Cmc,perf -n1 -t1 likwid-perfctr -g MEM -H
-    #  Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0) +
-    #                                         SUM(MBOXxC1))*64.0/runtime
-    #  Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0) +
-    #                                         SUM(MBOXxC1))*64.0
-    #
-    # > srun -Cmc,perf -t1 -n1 likwid-perfctr -g L2 -H
-    # L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT + L2_TRANS_L1D_WB +
-    #                                    ICACHE_MISSES)*64.0/time
-    # L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT + L2_TRANS_L1D_WB +
-    #                                    ICACHE_MISSES)*64.0
-    #
-    # > srun -Cmc,perf -t1 -n1 likwid-perfctr -g L3 -H
-    # L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL +
-    #                                    L2_LINES_OUT_DEMAND_DIRTY)*64/time
-    # L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL +
-    #                                    L2_LINES_OUT_DEMAND_DIRTY)*64
-    #
-    # > srun -Cmc,perf -t1 -n1 likwid-perfctr -g CACHES -H
-    # Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD) +
-    #                                        SUM(CAS_COUNT_WR))*64.0/time
-    # Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD) +
-    # Vtune supported hw-events:
-    # -------
-    # Hardware Event Count:CPU_CLK_UNHALTED.THREAD (K)
-    # Hardware Event Count:CPU_CLK_UNHALTED.REF_TSC (K)
-    # Hardware Event Count:INST_RETIRED.ANY (K)
-    # Hardware Event Count:CYCLE_ACTIVITY.STALLS_L1D_MISS (K)
-    # Hardware Event Count:CPU_CLK_UNHALTED.REF_XCLK (K)
-    # Hardware Event Count:CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE (K)
-    # Hardware Event Count:CYCLE_ACTIVITY.STALLS_L2_MISS (K)
-    # Hardware Event Count:CYCLE_ACTIVITY.STALLS_MEM_ANY (K)
-    # Hardware Event Count:CYCLE_ACTIVITY.STALLS_TOTAL (K)
-    # Hardware Event Count:IDQ_UOPS_NOT_DELIVERED.CORE (K)
-    # Hardware Event Count:INT_MISC.RECOVERY_CYCLES (K)
-    # Hardware Event Count:MEM_LOAD_UOPS_RETIRED.L3_HIT_PS (K)
-    # Hardware Event Count:MEM_LOAD_UOPS_RETIRED.L3_MISS_PS (K)
-    # Hardware Event Count:RESOURCE_STALLS.SB (K)
-    # Hardware Event Count:RS_EVENTS.EMPTY_CYCLES (K)
-    # Hardware Event Count:UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC (K)
-    # Hardware Event Count:UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC (K)
-    # Hardware Event Count:UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC (K)
-    # Hardware Event Count:UOPS_EXECUTED.CORE:cmask=1 (K)
-    # Hardware Event Count:UOPS_EXECUTED.CORE:cmask=2 (K)
-    # Hardware Event Count:UOPS_EXECUTED.CORE:cmask=3 (K)
-    # Hardware Event Count:UOPS_ISSUED.ANY (K)
-    # Hardware Event Count:UOPS_RETIRED.RETIRE_SLOTS (K)
-    # Hardware Event Count:IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE (K)
-    # Hardware Event Count:OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD:cmask=4 (K)
-    # Hardware Event Count:OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD (K)
-    # Hardware Event Count:MEM_UOPS_RETIRED.ALL_LOADS_PS (K)
-    # Hardware Event Count:MEM_UOPS_RETIRED.ALL_STORES_PS (K)
-    # Hardware Event Count:MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM_PS (K)
-    # Hardware Event Count:MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM_PS (K)
-    # Hardware Event Count:MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM_PS (K)
-    # Hardware Event Count:MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD_PS (K)
-    # Hardware Event Count:FP_ARITH_INST_RETIRED.SCALAR_SINGLE (K)
-    # Hardware Event Count:FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE (K)
-    # Hardware Event Count:FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE (K)
-    # Hardware Event Count:FP_ARITH_INST_RETIRED.SCALAR_DOUBLE (K)
-    # Hardware Event Count:FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE (K)
-    # Hardware Event Count:FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE (K)
-    # Hardware Event Count:INST_RETIRED.X87 (K)