From 31b3370dafa2fe997669e1e0c76242a49d729c9c Mon Sep 17 00:00:00 2001 From: jgp Date: Fri, 30 Aug 2019 14:53:48 +0200 Subject: [PATCH] report for ert --- .../profiling_and_debugging/berkeley-ert.py | 119 ++++++++++++------ 1 file changed, 83 insertions(+), 36 deletions(-) diff --git a/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py b/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py index f870b7e344..9a7cb32904 100644 --- a/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py +++ b/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py @@ -62,13 +62,10 @@ def __init__(self, num_ranks, flop): 'CRAYPE_LINK_TYPE': 'dynamic', 'OMP_NUM_THREADS': str(self.num_cpus_per_task) } - # Reference roofline boundaries for Intel Broadwell CPU (E5-2695 v4): - GFLOPs = 945.0 - L1bw = 1788.0 - L2bw = 855.0 - L3bw = 547.0 - DRAMbw = 70.5 - # slowest job: + + # take the "slowest" job, make it sleep after it has ended and hope the + # other jobs have ended too + # TODO: find a better way to wait for the other jobs to end num_ranks_min = 1 flop_min = 1024 self.roofline_rpt = 'rpt' @@ -77,41 +74,91 @@ def __init__(self, num_ranks, flop): 'cat *_job.out | python2 preprocess.py > pre', 'python2 maximum.py < pre > max', 'python2 summary.py < max > sum', - # give enough time for all the dependent jobs to collect data + # give enough time for all the dependent jobs to collect data: 'sleep 60', 'cat ../ert_FLOPS*/sum | python2 roofline.py > rpt', ] - self.sanity_patterns = sn.all([ - # --- check data type: - sn.assert_eq(sn.extractsingle( - r'^\s+(?P\w+) \* __restrict__ buf = \(\w+ \*\)' - r'malloc\(PSIZE\);', 'driver1.c', 'prec'), 'double'), - # --- check ert's roofline results: - # check GFLOPS: - sn.assert_reference(sn.extractsingle( - r'(?P\d+.\d+)\sGFLOPs EMP', self.roofline_rpt, - 'GFLOPs', float), GFLOPs, -0.1, 0.5), - # check L1 bandwidth: - sn.assert_reference(sn.extractsingle( - r'(?P\d+.\d+)\sL1 EMP', self.roofline_rpt, - 'L1bw', float), L1bw, -0.1, 0.3), - # check L2 bandwidth: - sn.assert_reference(sn.extractsingle( - r'(?P\d+.\d+)\sL2 EMP', self.roofline_rpt, - 'L2bw', float), L2bw, -0.1, 0.3), - # check L3 bandwidth: - sn.assert_reference(sn.extractsingle( - r'(?P\d+.\d+)\sL3 EMP', self.roofline_rpt, - 'L3bw', float), L3bw, -0.1, 0.3), - # check DRAM bandwidth: - sn.assert_reference(sn.extractsingle( - r'(?P\d+.\d+) DRAM EMP', self.roofline_rpt, - 'DRAMbw', float), DRAMbw, -0.1, 0.3), - ]) + else: self.post_run = [ 'cat *_job.out | python2 preprocess.py > pre', 'python2 maximum.py < pre > max', 'python2 summary.py < max > sum', ] - self.sanity_patterns = sn.assert_found('GFLOPs', 'sum') + + # --- Sanity check: + regex_datatype = (r'^\s+(?P\w+) \* __restrict__ buf = ' + r'\(\w+ \*\)malloc\(PSIZE\);') + datatype = sn.extractsingle(regex_datatype, 'driver1.c', 'type') + self.sanity_patterns = sn.all([ + sn.assert_found('GFLOPs', 'sum'), + sn.assert_eq(datatype, 'double'), + ]) + + # --- Performance check: + if num_ranks == num_ranks_min and flop == flop_min: + # Reference roofline boundaries for Intel BroadwellCPU (E5-2695v4): + ref_GFLOPs = 945.0 + ref_L1bw = 1788.0 + ref_L2bw = 855.0 + ref_L3bw = 547.0 + ref_DRAMbw = 70.5 + + # Typical performance report looks like: + # -------------------------------------- + # ert_FLOPS.1024_MPI.001_OpenMP.036/rpt + # 908.43 GFLOPs EMP + # ****** + # META_DATA + # OPENMP_THREADS 1 + # FLOPS 8 + # MPI_PROCS 36 + # + # 5647.33 L1 EMP + # ******* + # 3203.86 L2 EMP + # ******* + # 1773.58 L3 EMP + # ******* + # 139.56 L4 EMP + # 103.50 DRAM EMP + # ****** + # META_DATA + # FLOPS 2 + # OPENMP_THREADS 1 + # MPI_PROCS 36 + regex_gflops = r'(?P\d+.\d+)\sGFLOPs EMP' + regex_L1bw = r'(?P\d+.\d+)\sL1 EMP' + regex_L2bw = r'(?P\d+.\d+)\sL2 EMP' + regex_L3bw = r'(?P\d+.\d+)\sL3 EMP' + regex_DRAMbw = r'(?P\d+.\d+) DRAM EMP' + + gflops = sn.extractsingle(regex_gflops, self.roofline_rpt, + 'GFLOPs', float) + L1bw = sn.extractsingle(regex_L1bw, self.roofline_rpt, + 'L1bw', float) + L2bw = sn.extractsingle(regex_L2bw, self.roofline_rpt, + 'L2bw', float) + L3bw = sn.extractsingle(regex_L3bw, self.roofline_rpt, + 'L3bw', float) + DRAMbw = sn.extractsingle(regex_DRAMbw, self.roofline_rpt, + 'DRAMbw', float) + + # --performance-report: + self.perf_patterns = { + 'gflops': gflops, + 'L1bw': L1bw, + 'L2bw': L2bw, + 'L3bw': L3bw, + 'DRAMbw': DRAMbw, + } + + self.reference = { + '*': { + 'gflops': (ref_GFLOPs, -0.1, 0.5, 'GF/s'), + 'L1bw': (ref_L1bw, -0.1, 0.3, 'GB/s'), + 'L2bw': (ref_L2bw, -0.1, 0.3, 'GB/s'), + 'L3bw': (ref_L3bw, -0.1, 0.3, 'GB/s'), + 'DRAMbw': (ref_DRAMbw, -0.1, 0.3, 'GB/s'), + } + }