From e2cdf41b1ab46f933763f9930f1b8faa9d757ec9 Mon Sep 17 00:00:00 2001 From: jgp Date: Tue, 7 May 2019 15:26:39 +0200 Subject: [PATCH 1/3] UES-213_nvprof --- .../berkeley-ert-nvprof.py | 146 ++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py diff --git a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py new file mode 100644 index 0000000000..4189845d72 --- /dev/null +++ b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py @@ -0,0 +1,146 @@ +import os + +import reframe as rfm +import reframe.utility.sanity as sn + + +@rfm.parameterized_test( + *[[gpudims, flop, repeat] + # gpudims sets (gpu_blocks, gpu_threads): + for gpudims in [(112, 1024), (224, 512), (448, 256), (896, 128), + (1792, 64), (3584, 32)] + for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024] + # self.repeat replaces '-DERT_NUM_EXPERIMENTS=2': + for repeat in [1, 2]]) +class ErtP100Test(rfm.RegressionTest): + """ + The Empirical Roofline Tool, ERT, empirically generates roofline data: + https://bitbucket.org/berkeleylab/cs-roofline-toolkit/ + + This test checks the ERT tool with NVIDIA Tesla P100-PCIE-16GB: + Device 0: "Tesla P100-PCIE-16GB" + CUDA Driver Version / Runtime Version 10.1 / 10.0 + CUDA Capability Major/Minor version number: 6.0 + (56) Multiprocessors, ( 64) CUDA Cores/MP: 3584 CUDA Cores + GPU Max Clock rate: 1329 MHz (1.33 GHz) + Theoretical peak performance per GPU: 4761 Gflop/s + Maximum number of threads per multiprocessor: 2048 + Peak number of threads: 114688 threads <--------- + Maximum number of threads per block: 1024 <--------- + NVRM version: NVIDIA UNIX x86_64 Kernel Module 418.39 + + # The following python code can help for a parameter space study: + # (use --exec-policy=async) + max_threads_per_block = 1024 + max_threads = 114688 + gpu_threads = max_threads_per_block * 2 + while gpu_threads 
> 32: + gpu_threads = gpu_threads // 2 + gpu_blocks = max_threads // gpu_threads + nth = gpu_threads * gpu_blocks + print('{} {} {} {}'.format(gpu_blocks, gpu_threads, nth, max_threads)) + """ + def __init__(self, gpudims, flop, repeat): + super().__init__() + max_gpu_blocks = 3584 + max_flops = 1024 + max_repeat = 2 + self.descr = 'Empirical Roofline Toolkit' + self.valid_systems = ['dom:gpu'] + self.valid_prog_environs = ['PrgEnv-gnu'] + self.modules = ['craype-accel-nvidia60'] + self.sourcesdir = os.path.join(self.current_system.resourcesdir, + 'roofline', 'ert') + # A single input file is required for nvcc to work: + self.build_system = 'SingleSource' + self.prebuild_cmd = [ + 'cat kernel1.c driver1.c | sed "s-^#if ERT-#ifdef ERT-g" > ' + '_gpu.cu'] + self.sourcepath = '_gpu.cu' + self.executable = 'ert.exe' + self.build_system.cppflags = [ + # ERT_FLOPS = -DERT_FLOP ! + '-DERT_FLOP=%s' % str(flop), + '-DERT_ALIGN=32', + # 1G = 1024^3 = 1073741824: + '-DERT_MEMORY_MAX=1073741824', + # ERT_GPU True: + '-DERT_GPU', + '-DERT_TRIALS_MIN=1', + '-DERT_WORKING_SET_MIN=128', + # '-x cu' explicitly sets the language (cuda) for the src files. 
+        ]
+        self.build_system.ldflags = ['-O3']
+        self.maintainers = ['JG']
+        self.tags = {'scs'}
+        gpu_blocks = gpudims[0]
+        gpu_threads = gpudims[1]
+        self.name = 'ertgpu_Run.{}_FLOPS.{}_GPUBlocks.{}_GPUThreads.{}'.format(
+            repeat, flop, gpu_blocks, gpu_threads)
+        self.exclusive = True
+        self.num_tasks = 1
+        self.num_tasks_per_node = 1
+        self.num_cpus_per_task = 1
+        self.num_tasks_per_core = 1
+        self.use_multithreading = False
+        self.variables = {
+            'CRAYPE_LINK_TYPE': 'dynamic',
+            'OMP_NUM_THREADS': str(self.num_cpus_per_task)
+        }
+        self.executable_opts = [str(gpu_blocks), str(gpu_threads)]
+        self.rpt = '%s.rpt' % self.executable
+        # Reference roofline boundaries for NVIDIA Tesla P100-PCIE-16GB:
+        GFLOPs = 4355.0
+        L1bw = 1724.0
+        # Keeping for future reference:
+        # L2bw = 855.0
+        # L3bw = 547.0
+        DRAMbw = 521.0
+        self.roofline_rpt = 'rpt'
+        # use the latest job to generate the roofline rpt:
+        if (gpu_blocks == max_gpu_blocks and flop == max_flops and
+            repeat == max_repeat):
+            self.post_run = [
+                'cat *_job.out | python2 preprocess.py > pre',
+                'python2 maximum.py < pre > max',
+                'python2 summary.py < max > sum',
+                # give enough time for all the dependent jobs to collect data:
+                'sleep 60',
+                'cat ../ertgpu_Run*/sum | python2 roofline.py > rpt',
+            ]
+            self.sanity_patterns = sn.all([
+                # --- check data type:
+                sn.assert_eq(sn.extractsingle(
+                    r'^\s+(?P<prec>\w+) \*\s+buf = \(\w+ \*\)'
+                    r'_mm_malloc\(PSIZE, ERT_ALIGN\);', 'driver1.c', 'prec'),
+                    'double'),
+                # --- check ert's roofline results. 
Typical output is:
+                # 4355.20 GFLOPs EMP
+                # META_DATA
+                # GPU_BLOCKS 1792
+                # FLOPS 1024
+                # GPU_THREADS 64
+                #
+                # 1723.95 L1 EMP
+                # 521.29 DRAM EMP
+                #
+                # check GFLOPS:
+                sn.assert_reference(sn.extractsingle(
+                    r'(?P<GFLOPs>\d+.\d+)\sGFLOPs EMP', self.roofline_rpt,
+                    'GFLOPs', float), GFLOPs, -0.1, 0.5),
+                # check L1 bandwidth:
+                sn.assert_reference(sn.extractsingle(
+                    r'(?P<L1bw>\d+.\d+)\sL1 EMP', self.roofline_rpt,
+                    'L1bw', float), L1bw, -0.1, 0.3),
+                # check DRAM bandwidth:
+                sn.assert_reference(sn.extractsingle(
+                    r'(?P<DRAMbw>\d+.\d+) DRAM EMPv', self.roofline_rpt,
+                    'DRAMbw', float), DRAMbw, -0.1, 0.3),
+            ])
+        else:
+            self.post_run = [
+                'cat *_job.out | python2 preprocess.py > pre',
+                'python2 maximum.py < pre > max',
+                'python2 summary.py < max > sum',
+            ]
+            self.sanity_patterns = sn.assert_found('GFLOPs', 'sum')

From beb7d7a1514c413fc1d3d0b7f113ed659fe1d7a2 Mon Sep 17 00:00:00 2001
From: jgp
Date: Tue, 7 May 2019 16:53:37 +0200
Subject: [PATCH 2/3] fix for review

---
 .../berkeley-ert-nvprof.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py
index 4189845d72..788aca6ceb 100644
--- a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py
+++ b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py
@@ -21,7 +21,7 @@ class ErtP100Test(rfm.RegressionTest):
     Device 0: "Tesla P100-PCIE-16GB"
     CUDA Driver Version / Runtime Version 10.1 / 10.0
     CUDA Capability Major/Minor version number: 6.0
-    (56) Multiprocessors, ( 64) CUDA Cores/MP: 3584 CUDA Cores
+    (56) Multiprocessors, (64) CUDA Cores/MP: 3584 CUDA Cores
     GPU Max Clock rate: 1329 MHz (1.33 GHz)
     Theoretical peak performance per GPU: 4761 Gflop/s
     Maximum number of threads per multiprocessor: 2048
@@ -91,8 +91,8 @@ def __init__(self, gpudims, flop, repeat):
         self.rpt = '%s.rpt' % self.executable
         # Reference roofline boundaries for NVIDIA Tesla 
P100-PCIE-16GB:
         GFLOPs = 4355.0
-        L1bw = 1724.0
         # Keeping for future reference:
+        # L1bw = 1724.0
         # L2bw = 855.0
         # L3bw = 547.0
         DRAMbw = 521.0
@@ -129,12 +129,15 @@ def __init__(self, gpudims, flop, repeat):
                     r'(?P<GFLOPs>\d+.\d+)\sGFLOPs EMP', self.roofline_rpt,
                     'GFLOPs', float), GFLOPs, -0.1, 0.5),
                 # check L1 bandwidth:
-                sn.assert_reference(sn.extractsingle(
-                    r'(?P<L1bw>\d+.\d+)\sL1 EMP', self.roofline_rpt,
-                    'L1bw', float), L1bw, -0.1, 0.3),
+                # https://cug.org/proceedings/protected/cug2019_proceedings/
+                # includes/files/pap103s2-file1.pdf:
+                # "ERT fails to identify the L1 cache"
+                # sn.assert_reference(sn.extractsingle(
+                #     r'(?P<L1bw>\d+.\d+)\sL1 EMP', self.roofline_rpt,
+                #     'L1bw', float), L1bw, -0.1, 0.3),
                 # check DRAM bandwidth:
                 sn.assert_reference(sn.extractsingle(
-                    r'(?P<DRAMbw>\d+.\d+) DRAM EMPv', self.roofline_rpt,
+                    r'(?P<DRAMbw>\d+.\d+) DRAM EMP', self.roofline_rpt,
                     'DRAMbw', float), DRAMbw, -0.1, 0.3),
             ])
         else:

From bffa1460eea395deda897dc7c4aa651c0e7ec9bc Mon Sep 17 00:00:00 2001
From: jgp
Date: Tue, 7 May 2019 21:14:43 +0200
Subject: [PATCH 3/3] fix for review

---
 .../tools/profiling_and_debugging/berkeley-ert-nvprof.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py
index 788aca6ceb..56ec1aa3bd 100644
--- a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py
+++ b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py
@@ -60,7 +60,7 @@ def __init__(self, gpudims, flop, repeat):
         self.executable = 'ert.exe'
         self.build_system.cppflags = [
             # ERT_FLOPS = -DERT_FLOP !
-            '-DERT_FLOP=%s' % str(flop),
+            '-DERT_FLOP=%s' % flop,
             '-DERT_ALIGN=32',
             # 1G = 1024^3 = 1073741824:
             '-DERT_MEMORY_MAX=1073741824',
@@ -73,8 +73,7 @@ def __init__(self, gpudims, flop, repeat):
         self.build_system.ldflags = ['-O3']
         self.maintainers = ['JG']
         self.tags = {'scs'}
-        gpu_blocks = gpudims[0]
-        gpu_threads = gpudims[1]
+        gpu_blocks, gpu_threads = gpudims
         self.name = 'ertgpu_Run.{}_FLOPS.{}_GPUBlocks.{}_GPUThreads.{}'.format(
             repeat, flop, gpu_blocks, gpu_threads)
         self.exclusive = True
@@ -130,7 +129,7 @@ def __init__(self, gpudims, flop, repeat):
             'GFLOPs', float), GFLOPs, -0.1, 0.5),
         # check L1 bandwidth:
         # https://cug.org/proceedings/protected/cug2019_proceedings/
-        # includes/files/pap103s2-file1.pdf:
+        # includes/files/pap103s2-file1.pdf:
         # "ERT fails to identify the L1 cache"
         # sn.assert_reference(sn.extractsingle(
         #     r'(?P<L1bw>\d+.\d+)\sL1 EMP',