From 31b3370dafa2fe997669e1e0c76242a49d729c9c Mon Sep 17 00:00:00 2001
From: jgp <jgp@cscs.ch>
Date: Fri, 30 Aug 2019 14:53:48 +0200
Subject: [PATCH] report for ert

---
 .../profiling_and_debugging/berkeley-ert.py   | 119 ++++++++++++------
 1 file changed, 83 insertions(+), 36 deletions(-)
diff --git a/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py b/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py
index f870b7e344..9a7cb32904 100644
--- a/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py
+++ b/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py
@@ -62,13 +62,10 @@ def __init__(self, num_ranks, flop):
             'CRAYPE_LINK_TYPE': 'dynamic',
             'OMP_NUM_THREADS': str(self.num_cpus_per_task)
         }
-        # Reference roofline boundaries for Intel Broadwell CPU (E5-2695 v4):
-        GFLOPs = 945.0
-        L1bw = 1788.0
-        L2bw = 855.0
-        L3bw = 547.0
-        DRAMbw = 70.5
-        # slowest job:
+
+        # take the "slowest" job, make it sleep after it has ended and hope the
+        # other jobs have ended too
+        # TODO: find a better way to wait for the other jobs to end
         num_ranks_min = 1
         flop_min = 1024
         self.roofline_rpt = 'rpt'
@@ -77,41 +74,91 @@ def __init__(self, num_ranks, flop):
                 'cat *_job.out | python2 preprocess.py > pre',
                 'python2 maximum.py < pre > max',
                 'python2 summary.py < max > sum',
-                # give enough time for all the dependent jobs to collect data
+                # give enough time for all the dependent jobs to collect data:
                 'sleep 60',
                 'cat ../ert_FLOPS*/sum | python2 roofline.py > rpt',
             ]
-            self.sanity_patterns = sn.all([
-                # --- check data type:
-                sn.assert_eq(sn.extractsingle(
-                    r'^\s+(?P<prec>\w+) \* __restrict__ buf = \(\w+ \*\)'
-                    r'malloc\(PSIZE\);', 'driver1.c', 'prec'), 'double'),
-                # --- check ert's roofline results:
-                # check GFLOPS:
-                sn.assert_reference(sn.extractsingle(
-                    r'(?P<GFLOPs>\d+.\d+)\sGFLOPs EMP', self.roofline_rpt,
-                    'GFLOPs', float), GFLOPs, -0.1, 0.5),
-                # check L1 bandwidth:
-                sn.assert_reference(sn.extractsingle(
-                    r'(?P<L1bw>\d+.\d+)\sL1 EMP', self.roofline_rpt,
-                    'L1bw', float), L1bw, -0.1, 0.3),
-                # check L2 bandwidth:
-                sn.assert_reference(sn.extractsingle(
-                    r'(?P<L2bw>\d+.\d+)\sL2 EMP', self.roofline_rpt,
-                    'L2bw', float), L2bw, -0.1, 0.3),
-                # check L3 bandwidth:
-                sn.assert_reference(sn.extractsingle(
-                    r'(?P<L3bw>\d+.\d+)\sL3 EMP', self.roofline_rpt,
-                    'L3bw', float), L3bw, -0.1, 0.3),
-                # check DRAM bandwidth:
-                sn.assert_reference(sn.extractsingle(
-                    r'(?P<DRAMbw>\d+.\d+) DRAM EMP', self.roofline_rpt,
-                    'DRAMbw', float), DRAMbw, -0.1, 0.3),
-            ])
+
         else:
             self.post_run = [
                 'cat *_job.out | python2 preprocess.py > pre',
                 'python2 maximum.py < pre > max',
                 'python2 summary.py < max > sum',
             ]
-            self.sanity_patterns = sn.assert_found('GFLOPs', 'sum')
+
+        # --- Sanity check:
+        regex_datatype = (r'^\s+(?P<type>\w+) \* __restrict__ buf = '
+                          r'\(\w+ \*\)malloc\(PSIZE\);')
+        datatype = sn.extractsingle(regex_datatype, 'driver1.c', 'type')
+        self.sanity_patterns = sn.all([
+            sn.assert_found('GFLOPs', 'sum'),
+            sn.assert_eq(datatype, 'double'),
+        ])
+
+        # --- Performance check:
+        if num_ranks == num_ranks_min and flop == flop_min:
+            # Reference roofline boundaries for Intel BroadwellCPU (E5-2695v4):
+            ref_GFLOPs = 945.0
+            ref_L1bw = 1788.0
+            ref_L2bw = 855.0
+            ref_L3bw = 547.0
+            ref_DRAMbw = 70.5
+
+            # Typical performance report looks like:
+            # --------------------------------------
+            # ert_FLOPS.1024_MPI.001_OpenMP.036/rpt
+            #    908.43 GFLOPs EMP
+            #    ******
+            # META_DATA
+            # OPENMP_THREADS 1
+            # FLOPS          8
+            # MPI_PROCS      36
+            #
+            #   5647.33 L1 EMP
+            #   *******
+            #   3203.86 L2 EMP
+            #   *******
+            #   1773.58 L3 EMP
+            #   *******
+            #    139.56 L4 EMP
+            #    103.50 DRAM EMP
+            #    ******
+            # META_DATA
+            # FLOPS          2
+            # OPENMP_THREADS 1
+            # MPI_PROCS      36
+            regex_gflops = r'(?P<GFLOPs>\d+.\d+)\sGFLOPs EMP'
+            regex_L1bw = r'(?P<L1bw>\d+.\d+)\sL1 EMP'
+            regex_L2bw = r'(?P<L2bw>\d+.\d+)\sL2 EMP'
+            regex_L3bw = r'(?P<L3bw>\d+.\d+)\sL3 EMP'
+            regex_DRAMbw = r'(?P<DRAMbw>\d+.\d+) DRAM EMP'
+
+            gflops = sn.extractsingle(regex_gflops, self.roofline_rpt,
+                                      'GFLOPs', float)
+            L1bw = sn.extractsingle(regex_L1bw, self.roofline_rpt,
+                                    'L1bw', float)
+            L2bw = sn.extractsingle(regex_L2bw, self.roofline_rpt,
+                                    'L2bw', float)
+            L3bw = sn.extractsingle(regex_L3bw, self.roofline_rpt,
+                                    'L3bw', float)
+            DRAMbw = sn.extractsingle(regex_DRAMbw, self.roofline_rpt,
+                                      'DRAMbw', float)
+
+            # --performance-report:
+            self.perf_patterns = {
+                'gflops': gflops,
+                'L1bw': L1bw,
+                'L2bw': L2bw,
+                'L3bw': L3bw,
+                'DRAMbw': DRAMbw,
+            }
+
+            self.reference = {
+                '*': {
+                    'gflops': (ref_GFLOPs, -0.1, 0.5, 'GF/s'),
+                    'L1bw': (ref_L1bw, -0.1, 0.3, 'GB/s'),
+                    'L2bw': (ref_L2bw, -0.1, 0.3, 'GB/s'),
+                    'L3bw': (ref_L3bw, -0.1, 0.3, 'GB/s'),
+                    'DRAMbw': (ref_DRAMbw, -0.1, 0.3, 'GB/s'),
+                }
+            }