From 5a3fc73525cb705bbff676c08bf79a46ac181471 Mon Sep 17 00:00:00 2001 From: jgp Date: Thu, 29 Aug 2019 19:09:37 +0200 Subject: [PATCH 1/2] perf_report --- .../intel_advisor_roofline.py | 190 +++++++++++++----- 1 file changed, 144 insertions(+), 46 deletions(-) diff --git a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py b/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py index 42f5563a34..373493901c 100644 --- a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py +++ b/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py @@ -4,9 +4,9 @@ import reframe.utility.sanity as sn -@rfm.parameterized_test(*[[repeat, toolsversion, datalayout] +@rfm.parameterized_test(*[[repeat, toolversion, datalayout] for repeat in ['100000'] - for toolsversion in ['597843'] + for toolversion in ['597843'] for datalayout in ['G3_AOS_SCALAR', 'G3_SOA_SCALAR', 'G3_AOS_VECTOR', 'G3_SOA_VECTOR'] ]) @@ -32,7 +32,7 @@ class IntelRooflineAdvisorTest(rfm.RegressionTest): G3_SOA_SCALAR: gflops, 2.79 arithmetic_intensity', 0.166 351ms G3_SOA_VECTOR: gflops, 10.62 arithmetic_intensity', 0.166 57ms <- fast ''' - def __init__(self, repeat, toolsversion, datalayout): + def __init__(self, repeat, toolversion, datalayout): super().__init__() self.descr = 'Roofline Analysis test with Intel Advisor' # for reference: advisor/2019 was failing on dom with: @@ -86,60 +86,158 @@ def __init__(self, repeat, toolsversion, datalayout): '--data-limit=0 --no-auto-finalize --trace-mpi -- %s ' % (self.roofdir, self.target_executable) ] - # Reference roofline boundaries for Intel Broadwell CPU (E5-2695 v4): - L1bw = 293*1024**3 - L2bw = 79*1024**3 - L3bw = 33*1024**3 + # - Reference roofline boundaries for Intel Broadwell CPU (E5-2695 v4): + L1bw = 293 # *1024**3 + L2bw = 79 # *1024**3 + L3bw = 33 # *1024**3 DPfmabw = 45*1024**3 DPaddbw = 12*1024**3 ScalarAddbw = 3*1024**3 + # --- roofline (memory) boundaries from the tool: + # DRAM Bandwidth (single node) 63206331080 memory + # DRAM Bandwidth 125993278750 memory + # DRAM Bandwidth (single-threaded) 12715570803 memory + # L1 Bandwidth 11360856466728 memory + # Scalar L1 Bandwidth 2648216636280 memory + # L1 bandwidth (single-threaded) 315579346298 memory + # ************ + # Scalar L1 bandwidth (single-threaded) 73561573230 memory + # L2 Bandwidth 3102773429268 memory + # Scalar L2 Bandwidth 921316779936 memory + # L2 bandwidth (single-threaded) 86188150813 memory + # *********** + # Scalar L2 bandwidth (single-threaded) 25592132776 memory + # L3 Bandwidth 1269637300440 memory + # Scalar L3 Bandwidth 845928498744 memory + # L3 bandwidth (single-threaded) 35267702790 memory + # *********** + # Scalar L3 bandwidth (single-threaded) 23498013854 memory + regex_roof_L1 = (r'^L1\sbandwidth\s\(single-threaded\)\s+(?P\d+)' + r'\s+memory$') + regex_roof_L2 = (r'^L2\sbandwidth\s\(single-threaded\)\s+(?P\d+)' + r'\s+memory$') + regex_roof_L3 = (r'^L3\sbandwidth\s\(single-threaded\)\s+(?P\d+)' + r'\s+memory$') + roof_L1 = sn.round(sn.extractsingle(regex_roof_L1, self.roofline_ref, + 'L1bw', int) / 1024**3, 2) + roof_L2 = sn.round(sn.extractsingle(regex_roof_L2, self.roofline_ref, + 'L2bw', int) / 1024**3, 3) + roof_L3 = sn.round(sn.extractsingle(regex_roof_L3, self.roofline_ref, + 'L3bw', int) / 1024**3, 3) + + # --- roofline (compute) boundaries from the tool: + # SP Vector FMA Peak 2759741518342 compute + # SP Vector FMA Peak (single-threaded) 98956234406 compute + # DP Vector FMA Peak 1379752337990 compute + # DP Vector FMA Peak (single-threaded) 49563336304 compute + # *********** + # Scalar Add Peak 93438527464 compute + # Scalar Add Peak (single-threaded) 3289577753 compute + # ********** + # SP Vector Add Peak 689944922272 compute + # SP Vector Add Peak (single-threaded) 24691445241 compute + # DP Vector Add Peak 344978547363 compute + # DP Vector Add Peak (single-threaded) 12385333008 compute + # *********** + # Integer Scalar Add Peak 228677310757 compute + # Integer Scalar Add Peak (single-threaded) 8055287031 compute + # Int64 Vector Add Peak 747457604632 compute + # Int64 Vector Add Peak (single-threaded) 26300241032 compute + # Int32 Vector Add Peak 1494880413924 compute + # Int32 Vector Add Peak (single-threaded) 52738180380 compute + regex_roof_dpfma = (r'^DP Vector FMA Peak\s\(single-threaded\)\s+' + r'(?P\d+)\s+compute$') + regex_roof_dpadd = (r'^DP Vector Add Peak\s\(single-threaded\)\s+' + r'(?P\d+)\s+compute$') + regex_roof_scalaradd = (r'^Scalar Add Peak\s\(single-threaded\)\s+' + r'(?P\d+)\s+compute$') + roof_dpfma = sn.extractsingle(regex_roof_dpfma, self.roofline_ref, + 'DPfmabw', int) + roof_dpadd = sn.extractsingle(regex_roof_dpadd, self.roofline_ref, + 'DPaddbw', int) + roof_scalaradd = sn.extractsingle(regex_roof_scalaradd, + self.roofline_ref, 'ScalarAddbw', + int) + + # - API output: + # ('self_elapsed_time', 0.1) + # ('self_memory_gb', 4.2496) + # ('self_gb_s', 42.496) + # ('self_gflop', 0.5312) + # ('self_gflops', 5.312) + # ('self_arithmetic_intensity', 0.125) + # ('_self_gb_s', 42.495999999999995, 42.496) + # ('_self_gflops', 5.311999999999999, 5.312) + # ('_self_arithmetic_intensity', 0.125, 0.125) + # ('gap _self_gb_s', -7.105427357601002e-15) + # ('gap _self_gflops', -8.881784197001252e-16) + # ('gap _self_arithmetic_intensity', 0.0) + # returned AI gap = 0.0000000000000000 + # returned GFLOPS gap = -0.0000000000000009 + regex_ai_gap = r'^returned\sAI\sgap\s=\s(?P.*)' + regex_ai_gflops = r'^returned\sGFLOPS\sgap\s=\s(?P.*)' + ai_gap = sn.extractsingle(regex_ai_gap, self.roofline_rpt, 'Intensity', + float) + ai_gflops = sn.extractsingle(regex_ai_gflops, self.roofline_rpt, + 'Flops', float) + + regex_toolversion = r'I*.\(build\s(?P\d+)\s*.' + found_toolversion = sn.extractsingle(regex_toolversion, + self.version_rpt, 'version') self.sanity_patterns = sn.all([ # check the job status: sn.assert_found('loop complete.', self.stdout), # check the tool's version (2019=591264, 2018=551025): - sn.assert_eq(sn.extractsingle( - r'I*.\(build\s(?P\d+)\s*.', - self.version_rpt, 'toolsversion'), toolsversion), + sn.assert_eq(found_toolversion, toolversion), # --- roofline boundaries: - # check --report=roofs (L1 bandwidth): - sn.assert_reference(sn.extractsingle( - r'^L1\sbandwidth\s\(single-threaded\)\s+(?P\d+)\s+' - r'memory$', self.roofline_ref, 'L1bw', int), - L1bw, -0.12, 0.08), - # check --report=roofs (L2 bandwidth): - sn.assert_reference(sn.extractsingle( - r'^L2\sbandwidth\s\(single-threaded\)\s+(?P\d+)\s+' - r'memory$', self.roofline_ref, 'L2bw', int), - L2bw, -0.12, 0.08), - # check --report=roofs (L3 bandwidth): - sn.assert_reference(sn.extractsingle( - r'^L3\sbandwidth\s\(single-threaded\)\s+(?P\d+)\s+' - r'memory$', self.roofline_ref, 'L3bw', int), - L3bw, -0.12, 0.08), - # check --report=roofs (DP FMA): - sn.assert_reference(sn.extractsingle( - r'^DP Vector FMA Peak\s\(single-threaded\)\s+' - r'(?P\d+)\s+compute$', self.roofline_ref, - 'DPfmabw', int), DPfmabw, -0.12, 0.08), - # check --report=roofs (DP Add): - sn.assert_reference(sn.extractsingle( - r'^DP Vector Add Peak\s\(single-threaded\)\s+' - r'(?P\d+)\s+compute$', self.roofline_ref, - 'DPaddbw', int), DPaddbw, -0.12, 0.08), - # check --report=roofs (Scalar Add): - sn.assert_reference(sn.extractsingle( - r'^Scalar Add Peak\s\(single-threaded\)\s+' - r'(?P\d+)\s+compute$', self.roofline_ref, - 'ScalarAddbw', int), ScalarAddbw, -0.12, 0.08), + # check --report=roofs (L1, L2 and L3 bandwidth): + # sn.assert_reference(roof_L1, L1bw, -0.12, 0.08), + # sn.assert_reference(roof_L2, L2bw, -0.12, 0.08), + # sn.assert_reference(roof_L3, L3bw, -0.12, 0.08), + # check --report=roofs (DP FMA, DP Add and Scalar Add): + sn.assert_reference(roof_dpfma, DPfmabw, -0.12, 0.08), + sn.assert_reference(roof_dpadd, DPaddbw, -0.12, 0.08), + sn.assert_reference(roof_scalaradd, ScalarAddbw, -0.12, 0.08), # --- check Arithmetic_intensity: - sn.assert_reference(sn.extractsingle( - r'^returned\sAI\sgap\s=\s(?P.*)', self.roofline_rpt, - 'Intensity', float), 0.0, -0.01, 0.01), + sn.assert_reference(ai_gap, 0.0, -0.01, 0.01), # --- check GFLOPS: - sn.assert_reference(sn.extractsingle( - r'^returned\sGFLOPS\sgap\s=\s(?P.*)', self.roofline_rpt, - 'Flops', float), 0.0, -0.01, 0.01), + sn.assert_reference(ai_gflops, 0.0, -0.01, 0.01), ]) + + # --performance-report: + regex_mseconds = r'elapsed time: (?P\d+)ms' + regex_ai = r'^\(\'self_arithmetic_intensity\', (?P\d+.\d+)\)' + regex_gbs = r'^\(\'self_gb_s\', (?P\d+.\d+)\)' + regex_gflops = r'^\(\'self_gflops\', (?P\d+.\d+)\)' + mseconds = sn.extractsingle(regex_mseconds, self.stdout, + 'msec', int) + arithmetic_intensity = sn.extractsingle(regex_ai, self.roofline_rpt, + 'AI', float) + bandwidth = sn.extractsingle(regex_gbs, self.roofline_rpt, + 'gbs', float) + gflops = sn.extractsingle(regex_gflops, self.roofline_rpt, + 'gflops', float) + self.perf_patterns = { + 'Elapsed': mseconds, + 'ArithmeticIntensity': arithmetic_intensity, + 'GFlops': gflops, + 'Bandwidth': bandwidth, + 'roof_L1': roof_L1, + 'roof_L2': roof_L2, + 'roof_L3': roof_L3, + } + self.reference = { + '*': { + 'Elapsed': (0, None, None, 'ms'), + 'ArithmeticIntensity': (0, None, None, ''), + 'GFlops': (0, None, None, 'GFLOPs/s'), + 'Bandwidth': (0, None, None, 'GB/s'), + 'roof_L1': (L1bw, -0.12, 0.08, 'GB/s'), + 'roof_L2': (L2bw, -0.12, 0.08, 'GB/s'), + 'roof_L3': (L3bw, -0.12, 0.08, 'GB/s'), + } + } + self.maintainers = ['JG'] self.tags = {'production'} From c0fb386ddea93421a060fe751b1d05b857c82e43 Mon Sep 17 00:00:00 2001 From: jgp Date: Fri, 30 Aug 2019 09:25:00 +0200 Subject: [PATCH 2/2] fix for review --- .../tools/profiling_and_debugging/intel_advisor_roofline.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py b/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py index 373493901c..977f2c2ab2 100644 --- a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py +++ b/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py @@ -33,7 +33,6 @@ class IntelRooflineAdvisorTest(rfm.RegressionTest): G3_SOA_VECTOR: gflops, 10.62 arithmetic_intensity', 0.166 57ms <- fast ''' def __init__(self, repeat, toolversion, datalayout): - super().__init__() self.descr = 'Roofline Analysis test with Intel Advisor' # for reference: advisor/2019 was failing on dom with: # "Exceeded job memory limit" (webrt#36087)