Skip to content

Commit 9aa14d4

Browse files
author
Vasileios Karakasis
authored
Merge pull request #916 from jgphpc/UES-509_advisor
[test] Add performance report to the Intel Advisor test
2 parents 224dc12 + 75e2461 commit 9aa14d4

File tree

1 file changed

+144
-47
lines changed

1 file changed

+144
-47
lines changed

cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py

Lines changed: 144 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -4,9 +4,9 @@
44
import reframe.utility.sanity as sn
55

66

7-
@rfm.parameterized_test(*[[repeat, toolsversion, datalayout]
7+
@rfm.parameterized_test(*[[repeat, toolversion, datalayout]
88
for repeat in ['100000']
9-
for toolsversion in ['597843']
9+
for toolversion in ['597843']
1010
for datalayout in ['G3_AOS_SCALAR', 'G3_SOA_SCALAR',
1111
'G3_AOS_VECTOR', 'G3_SOA_VECTOR']
1212
])
@@ -32,8 +32,7 @@ class IntelRooflineAdvisorTest(rfm.RegressionTest):
3232
G3_SOA_SCALAR: gflops, 2.79 arithmetic_intensity', 0.166 351ms
3333
G3_SOA_VECTOR: gflops, 10.62 arithmetic_intensity', 0.166 57ms <- fast
3434
'''
35-
def __init__(self, repeat, toolsversion, datalayout):
36-
super().__init__()
35+
def __init__(self, repeat, toolversion, datalayout):
3736
self.descr = 'Roofline Analysis test with Intel Advisor'
3837
# for reference: advisor/2019 was failing on dom with:
3938
# "Exceeded job memory limit" (webrt#36087)
@@ -86,60 +85,158 @@ def __init__(self, repeat, toolsversion, datalayout):
8685
'--data-limit=0 --no-auto-finalize --trace-mpi -- %s ' %
8786
(self.roofdir, self.target_executable)
8887
]
89-
# Reference roofline boundaries for Intel Broadwell CPU (E5-2695 v4):
90-
L1bw = 293*1024**3
91-
L2bw = 79*1024**3
92-
L3bw = 33*1024**3
88+
# - Reference roofline boundaries for Intel Broadwell CPU (E5-2695 v4):
89+
L1bw = 293 # *1024**3
90+
L2bw = 79 # *1024**3
91+
L3bw = 33 # *1024**3
9392
DPfmabw = 45*1024**3
9493
DPaddbw = 12*1024**3
9594
ScalarAddbw = 3*1024**3
95+
# --- roofline (memory) boundaries from the tool:
96+
# DRAM Bandwidth (single node) 63206331080 memory
97+
# DRAM Bandwidth 125993278750 memory
98+
# DRAM Bandwidth (single-threaded) 12715570803 memory
99+
# L1 Bandwidth 11360856466728 memory
100+
# Scalar L1 Bandwidth 2648216636280 memory
101+
# L1 bandwidth (single-threaded) 315579346298 memory
102+
# ************
103+
# Scalar L1 bandwidth (single-threaded) 73561573230 memory
104+
# L2 Bandwidth 3102773429268 memory
105+
# Scalar L2 Bandwidth 921316779936 memory
106+
# L2 bandwidth (single-threaded) 86188150813 memory
107+
# ***********
108+
# Scalar L2 bandwidth (single-threaded) 25592132776 memory
109+
# L3 Bandwidth 1269637300440 memory
110+
# Scalar L3 Bandwidth 845928498744 memory
111+
# L3 bandwidth (single-threaded) 35267702790 memory
112+
# ***********
113+
# Scalar L3 bandwidth (single-threaded) 23498013854 memory
114+
regex_roof_L1 = (r'^L1\sbandwidth\s\(single-threaded\)\s+(?P<L1bw>\d+)'
115+
r'\s+memory$')
116+
regex_roof_L2 = (r'^L2\sbandwidth\s\(single-threaded\)\s+(?P<L2bw>\d+)'
117+
r'\s+memory$')
118+
regex_roof_L3 = (r'^L3\sbandwidth\s\(single-threaded\)\s+(?P<L3bw>\d+)'
119+
r'\s+memory$')
120+
roof_L1 = sn.round(sn.extractsingle(regex_roof_L1, self.roofline_ref,
121+
'L1bw', int) / 1024**3, 2)
122+
roof_L2 = sn.round(sn.extractsingle(regex_roof_L2, self.roofline_ref,
123+
'L2bw', int) / 1024**3, 3)
124+
roof_L3 = sn.round(sn.extractsingle(regex_roof_L3, self.roofline_ref,
125+
'L3bw', int) / 1024**3, 3)
126+
127+
# --- roofline (compute) boundaries from the tool:
128+
# SP Vector FMA Peak 2759741518342 compute
129+
# SP Vector FMA Peak (single-threaded) 98956234406 compute
130+
# DP Vector FMA Peak 1379752337990 compute
131+
# DP Vector FMA Peak (single-threaded) 49563336304 compute
132+
# ***********
133+
# Scalar Add Peak 93438527464 compute
134+
# Scalar Add Peak (single-threaded) 3289577753 compute
135+
# **********
136+
# SP Vector Add Peak 689944922272 compute
137+
# SP Vector Add Peak (single-threaded) 24691445241 compute
138+
# DP Vector Add Peak 344978547363 compute
139+
# DP Vector Add Peak (single-threaded) 12385333008 compute
140+
# ***********
141+
# Integer Scalar Add Peak 228677310757 compute
142+
# Integer Scalar Add Peak (single-threaded) 8055287031 compute
143+
# Int64 Vector Add Peak 747457604632 compute
144+
# Int64 Vector Add Peak (single-threaded) 26300241032 compute
145+
# Int32 Vector Add Peak 1494880413924 compute
146+
# Int32 Vector Add Peak (single-threaded) 52738180380 compute
147+
regex_roof_dpfma = (r'^DP Vector FMA Peak\s\(single-threaded\)\s+'
148+
r'(?P<DPfmabw>\d+)\s+compute$')
149+
regex_roof_dpadd = (r'^DP Vector Add Peak\s\(single-threaded\)\s+'
150+
r'(?P<DPaddbw>\d+)\s+compute$')
151+
regex_roof_scalaradd = (r'^Scalar Add Peak\s\(single-threaded\)\s+'
152+
r'(?P<ScalarAddbw>\d+)\s+compute$')
153+
roof_dpfma = sn.extractsingle(regex_roof_dpfma, self.roofline_ref,
154+
'DPfmabw', int)
155+
roof_dpadd = sn.extractsingle(regex_roof_dpadd, self.roofline_ref,
156+
'DPaddbw', int)
157+
roof_scalaradd = sn.extractsingle(regex_roof_scalaradd,
158+
self.roofline_ref, 'ScalarAddbw',
159+
int)
160+
161+
# - API output:
162+
# ('self_elapsed_time', 0.1)
163+
# ('self_memory_gb', 4.2496)
164+
# ('self_gb_s', 42.496)
165+
# ('self_gflop', 0.5312)
166+
# ('self_gflops', 5.312)
167+
# ('self_arithmetic_intensity', 0.125)
168+
# ('_self_gb_s', 42.495999999999995, 42.496)
169+
# ('_self_gflops', 5.311999999999999, 5.312)
170+
# ('_self_arithmetic_intensity', 0.125, 0.125)
171+
# ('gap _self_gb_s', -7.105427357601002e-15)
172+
# ('gap _self_gflops', -8.881784197001252e-16)
173+
# ('gap _self_arithmetic_intensity', 0.0)
174+
# returned AI gap = 0.0000000000000000
175+
# returned GFLOPS gap = -0.0000000000000009
176+
regex_ai_gap = r'^returned\sAI\sgap\s=\s(?P<Intensity>.*)'
177+
regex_ai_gflops = r'^returned\sGFLOPS\sgap\s=\s(?P<Flops>.*)'
178+
ai_gap = sn.extractsingle(regex_ai_gap, self.roofline_rpt, 'Intensity',
179+
float)
180+
ai_gflops = sn.extractsingle(regex_ai_gflops, self.roofline_rpt,
181+
'Flops', float)
182+
183+
regex_toolversion = r'I*.\(build\s(?P<version>\d+)\s*.'
184+
found_toolversion = sn.extractsingle(regex_toolversion,
185+
self.version_rpt, 'version')
96186
self.sanity_patterns = sn.all([
97187
# check the job status:
98188
sn.assert_found('loop complete.', self.stdout),
99189
# check the tool's version (2019=591264, 2018=551025):
100-
sn.assert_eq(sn.extractsingle(
101-
r'I*.\(build\s(?P<toolsversion>\d+)\s*.',
102-
self.version_rpt, 'toolsversion'), toolsversion),
190+
sn.assert_eq(found_toolversion, toolversion),
103191
# --- roofline boundaries:
104-
# check --report=roofs (L1 bandwidth):
105-
sn.assert_reference(sn.extractsingle(
106-
r'^L1\sbandwidth\s\(single-threaded\)\s+(?P<L1bw>\d+)\s+'
107-
r'memory$', self.roofline_ref, 'L1bw', int),
108-
L1bw, -0.12, 0.08),
109-
# check --report=roofs (L2 bandwidth):
110-
sn.assert_reference(sn.extractsingle(
111-
r'^L2\sbandwidth\s\(single-threaded\)\s+(?P<L2bw>\d+)\s+'
112-
r'memory$', self.roofline_ref, 'L2bw', int),
113-
L2bw, -0.12, 0.08),
114-
# check --report=roofs (L3 bandwidth):
115-
sn.assert_reference(sn.extractsingle(
116-
r'^L3\sbandwidth\s\(single-threaded\)\s+(?P<L3bw>\d+)\s+'
117-
r'memory$', self.roofline_ref, 'L3bw', int),
118-
L3bw, -0.12, 0.08),
119-
# check --report=roofs (DP FMA):
120-
sn.assert_reference(sn.extractsingle(
121-
r'^DP Vector FMA Peak\s\(single-threaded\)\s+'
122-
r'(?P<DPfmabw>\d+)\s+compute$', self.roofline_ref,
123-
'DPfmabw', int), DPfmabw, -0.12, 0.08),
124-
# check --report=roofs (DP Add):
125-
sn.assert_reference(sn.extractsingle(
126-
r'^DP Vector Add Peak\s\(single-threaded\)\s+'
127-
r'(?P<DPaddbw>\d+)\s+compute$', self.roofline_ref,
128-
'DPaddbw', int), DPaddbw, -0.12, 0.08),
129-
# check --report=roofs (Scalar Add):
130-
sn.assert_reference(sn.extractsingle(
131-
r'^Scalar Add Peak\s\(single-threaded\)\s+'
132-
r'(?P<ScalarAddbw>\d+)\s+compute$', self.roofline_ref,
133-
'ScalarAddbw', int), ScalarAddbw, -0.12, 0.08),
192+
# check --report=roofs (L1, L2 and L3 bandwidth):
193+
# sn.assert_reference(roof_L1, L1bw, -0.12, 0.08),
194+
# sn.assert_reference(roof_L2, L2bw, -0.12, 0.08),
195+
# sn.assert_reference(roof_L3, L3bw, -0.12, 0.08),
196+
# check --report=roofs (DP FMA, DP Add and Scalar Add):
197+
sn.assert_reference(roof_dpfma, DPfmabw, -0.12, 0.08),
198+
sn.assert_reference(roof_dpadd, DPaddbw, -0.12, 0.08),
199+
sn.assert_reference(roof_scalaradd, ScalarAddbw, -0.12, 0.08),
134200
# --- check Arithmetic_intensity:
135-
sn.assert_reference(sn.extractsingle(
136-
r'^returned\sAI\sgap\s=\s(?P<Intensity>.*)', self.roofline_rpt,
137-
'Intensity', float), 0.0, -0.01, 0.01),
201+
sn.assert_reference(ai_gap, 0.0, -0.01, 0.01),
138202
# --- check GFLOPS:
139-
sn.assert_reference(sn.extractsingle(
140-
r'^returned\sGFLOPS\sgap\s=\s(?P<Flops>.*)', self.roofline_rpt,
141-
'Flops', float), 0.0, -0.01, 0.01),
203+
sn.assert_reference(ai_gflops, 0.0, -0.01, 0.01),
142204
])
205+
206+
# --performance-report:
207+
regex_mseconds = r'elapsed time: (?P<msec>\d+)ms'
208+
regex_ai = r'^\(\'self_arithmetic_intensity\', (?P<AI>\d+.\d+)\)'
209+
regex_gbs = r'^\(\'self_gb_s\', (?P<gbs>\d+.\d+)\)'
210+
regex_gflops = r'^\(\'self_gflops\', (?P<gflops>\d+.\d+)\)'
211+
mseconds = sn.extractsingle(regex_mseconds, self.stdout,
212+
'msec', int)
213+
arithmetic_intensity = sn.extractsingle(regex_ai, self.roofline_rpt,
214+
'AI', float)
215+
bandwidth = sn.extractsingle(regex_gbs, self.roofline_rpt,
216+
'gbs', float)
217+
gflops = sn.extractsingle(regex_gflops, self.roofline_rpt,
218+
'gflops', float)
219+
self.perf_patterns = {
220+
'Elapsed': mseconds,
221+
'ArithmeticIntensity': arithmetic_intensity,
222+
'GFlops': gflops,
223+
'Bandwidth': bandwidth,
224+
'roof_L1': roof_L1,
225+
'roof_L2': roof_L2,
226+
'roof_L3': roof_L3,
227+
}
228+
self.reference = {
229+
'*': {
230+
'Elapsed': (0, None, None, 'ms'),
231+
'ArithmeticIntensity': (0, None, None, ''),
232+
'GFlops': (0, None, None, 'GFLOPs/s'),
233+
'Bandwidth': (0, None, None, 'GB/s'),
234+
'roof_L1': (L1bw, -0.12, 0.08, 'GB/s'),
235+
'roof_L2': (L2bw, -0.12, 0.08, 'GB/s'),
236+
'roof_L3': (L3bw, -0.12, 0.08, 'GB/s'),
237+
}
238+
}
239+
143240
self.maintainers = ['JG']
144241
self.tags = {'production'}
145242

0 commit comments

Comments
 (0)