44import reframe .utility .sanity as sn
55
66
7- @rfm .parameterized_test (* [[repeat , toolsversion , datalayout ]
7+ @rfm .parameterized_test (* [[repeat , toolversion , datalayout ]
88 for repeat in ['100000' ]
9- for toolsversion in ['597843' ]
9+ for toolversion in ['597843' ]
1010 for datalayout in ['G3_AOS_SCALAR' , 'G3_SOA_SCALAR' ,
1111 'G3_AOS_VECTOR' , 'G3_SOA_VECTOR' ]
1212 ])
@@ -32,8 +32,7 @@ class IntelRooflineAdvisorTest(rfm.RegressionTest):
3232 G3_SOA_SCALAR: gflops, 2.79 arithmetic_intensity', 0.166 351ms
3333 G3_SOA_VECTOR: gflops, 10.62 arithmetic_intensity', 0.166 57ms <- fast
3434 '''
35- def __init__ (self , repeat , toolsversion , datalayout ):
36- super ().__init__ ()
35+ def __init__ (self , repeat , toolversion , datalayout ):
3736 self .descr = 'Roofline Analysis test with Intel Advisor'
3837 # for reference: advisor/2019 was failing on dom with:
3938 # "Exceeded job memory limit" (webrt#36087)
@@ -86,60 +85,158 @@ def __init__(self, repeat, toolsversion, datalayout):
8685 '--data-limit=0 --no-auto-finalize --trace-mpi -- %s ' %
8786 (self .roofdir , self .target_executable )
8887 ]
89- # Reference roofline boundaries for Intel Broadwell CPU (E5-2695 v4):
90- L1bw = 293 * 1024 ** 3
91- L2bw = 79 * 1024 ** 3
92- L3bw = 33 * 1024 ** 3
88+ # - Reference roofline boundaries for Intel Broadwell CPU (E5-2695 v4); L1/L2/L3 bandwidths are in GB/s (tool value / 1024**3), compute peaks keep the 1024**3 factor:
89+ L1bw = 293 # *1024**3
90+ L2bw = 79 # *1024**3
91+ L3bw = 33 # *1024**3
9392 DPfmabw = 45 * 1024 ** 3
9493 DPaddbw = 12 * 1024 ** 3
9594 ScalarAddbw = 3 * 1024 ** 3
95+ # --- roofline (memory) boundaries from the tool:
96+ # DRAM Bandwidth (single node) 63206331080 memory
97+ # DRAM Bandwidth 125993278750 memory
98+ # DRAM Bandwidth (single-threaded) 12715570803 memory
99+ # L1 Bandwidth 11360856466728 memory
100+ # Scalar L1 Bandwidth 2648216636280 memory
101+ # L1 bandwidth (single-threaded) 315579346298 memory
102+ # ************
103+ # Scalar L1 bandwidth (single-threaded) 73561573230 memory
104+ # L2 Bandwidth 3102773429268 memory
105+ # Scalar L2 Bandwidth 921316779936 memory
106+ # L2 bandwidth (single-threaded) 86188150813 memory
107+ # ***********
108+ # Scalar L2 bandwidth (single-threaded) 25592132776 memory
109+ # L3 Bandwidth 1269637300440 memory
110+ # Scalar L3 Bandwidth 845928498744 memory
111+ # L3 bandwidth (single-threaded) 35267702790 memory
112+ # ***********
113+ # Scalar L3 bandwidth (single-threaded) 23498013854 memory
114+ regex_roof_L1 = (r'^L1\sbandwidth\s\(single-threaded\)\s+(?P<L1bw>\d+)'
115+ r'\s+memory$' )
116+ regex_roof_L2 = (r'^L2\sbandwidth\s\(single-threaded\)\s+(?P<L2bw>\d+)'
117+ r'\s+memory$' )
118+ regex_roof_L3 = (r'^L3\sbandwidth\s\(single-threaded\)\s+(?P<L3bw>\d+)'
119+ r'\s+memory$' )
120+ roof_L1 = sn .round (sn .extractsingle (regex_roof_L1 , self .roofline_ref ,
121+ 'L1bw' , int ) / 1024 ** 3 , 2 )
122+ roof_L2 = sn .round (sn .extractsingle (regex_roof_L2 , self .roofline_ref ,
123+ 'L2bw' , int ) / 1024 ** 3 , 3 )
124+ roof_L3 = sn .round (sn .extractsingle (regex_roof_L3 , self .roofline_ref ,
125+ 'L3bw' , int ) / 1024 ** 3 , 3 )
126+
127+ # --- roofline (compute) boundaries from the tool:
128+ # SP Vector FMA Peak 2759741518342 compute
129+ # SP Vector FMA Peak (single-threaded) 98956234406 compute
130+ # DP Vector FMA Peak 1379752337990 compute
131+ # DP Vector FMA Peak (single-threaded) 49563336304 compute
132+ # ***********
133+ # Scalar Add Peak 93438527464 compute
134+ # Scalar Add Peak (single-threaded) 3289577753 compute
135+ # **********
136+ # SP Vector Add Peak 689944922272 compute
137+ # SP Vector Add Peak (single-threaded) 24691445241 compute
138+ # DP Vector Add Peak 344978547363 compute
139+ # DP Vector Add Peak (single-threaded) 12385333008 compute
140+ # ***********
141+ # Integer Scalar Add Peak 228677310757 compute
142+ # Integer Scalar Add Peak (single-threaded) 8055287031 compute
143+ # Int64 Vector Add Peak 747457604632 compute
144+ # Int64 Vector Add Peak (single-threaded) 26300241032 compute
145+ # Int32 Vector Add Peak 1494880413924 compute
146+ # Int32 Vector Add Peak (single-threaded) 52738180380 compute
147+ regex_roof_dpfma = (r'^DP Vector FMA Peak\s\(single-threaded\)\s+'
148+ r'(?P<DPfmabw>\d+)\s+compute$' )
149+ regex_roof_dpadd = (r'^DP Vector Add Peak\s\(single-threaded\)\s+'
150+ r'(?P<DPaddbw>\d+)\s+compute$' )
151+ regex_roof_scalaradd = (r'^Scalar Add Peak\s\(single-threaded\)\s+'
152+ r'(?P<ScalarAddbw>\d+)\s+compute$' )
153+ roof_dpfma = sn .extractsingle (regex_roof_dpfma , self .roofline_ref ,
154+ 'DPfmabw' , int )
155+ roof_dpadd = sn .extractsingle (regex_roof_dpadd , self .roofline_ref ,
156+ 'DPaddbw' , int )
157+ roof_scalaradd = sn .extractsingle (regex_roof_scalaradd ,
158+ self .roofline_ref , 'ScalarAddbw' ,
159+ int )
160+
161+ # - API output:
162+ # ('self_elapsed_time', 0.1)
163+ # ('self_memory_gb', 4.2496)
164+ # ('self_gb_s', 42.496)
165+ # ('self_gflop', 0.5312)
166+ # ('self_gflops', 5.312)
167+ # ('self_arithmetic_intensity', 0.125)
168+ # ('_self_gb_s', 42.495999999999995, 42.496)
169+ # ('_self_gflops', 5.311999999999999, 5.312)
170+ # ('_self_arithmetic_intensity', 0.125, 0.125)
171+ # ('gap _self_gb_s', -7.105427357601002e-15)
172+ # ('gap _self_gflops', -8.881784197001252e-16)
173+ # ('gap _self_arithmetic_intensity', 0.0)
174+ # returned AI gap = 0.0000000000000000
175+ # returned GFLOPS gap = -0.0000000000000009
176+ regex_ai_gap = r'^returned\sAI\sgap\s=\s(?P<Intensity>.*)'
177+ regex_ai_gflops = r'^returned\sGFLOPS\sgap\s=\s(?P<Flops>.*)'
178+ ai_gap = sn .extractsingle (regex_ai_gap , self .roofline_rpt , 'Intensity' ,
179+ float )
180+ ai_gflops = sn .extractsingle (regex_ai_gflops , self .roofline_rpt ,
181+ 'Flops' , float )
182+
183+ regex_toolversion = r'I*.\(build\s(?P<version>\d+)\s*.'
184+ found_toolversion = sn .extractsingle (regex_toolversion ,
185+ self .version_rpt , 'version' )
96186 self .sanity_patterns = sn .all ([
97187 # check the job status:
98188 sn .assert_found ('loop complete.' , self .stdout ),
99189 # check the tool's version (2019=591264, 2018=551025):
100- sn .assert_eq (sn .extractsingle (
101- r'I*.\(build\s(?P<toolsversion>\d+)\s*.' ,
102- self .version_rpt , 'toolsversion' ), toolsversion ),
190+ sn .assert_eq (found_toolversion , toolversion ),
103191 # --- roofline boundaries:
104- # check --report=roofs (L1 bandwidth):
105- sn .assert_reference (sn .extractsingle (
106- r'^L1\sbandwidth\s\(single-threaded\)\s+(?P<L1bw>\d+)\s+'
107- r'memory$' , self .roofline_ref , 'L1bw' , int ),
108- L1bw , - 0.12 , 0.08 ),
109- # check --report=roofs (L2 bandwidth):
110- sn .assert_reference (sn .extractsingle (
111- r'^L2\sbandwidth\s\(single-threaded\)\s+(?P<L2bw>\d+)\s+'
112- r'memory$' , self .roofline_ref , 'L2bw' , int ),
113- L2bw , - 0.12 , 0.08 ),
114- # check --report=roofs (L3 bandwidth):
115- sn .assert_reference (sn .extractsingle (
116- r'^L3\sbandwidth\s\(single-threaded\)\s+(?P<L3bw>\d+)\s+'
117- r'memory$' , self .roofline_ref , 'L3bw' , int ),
118- L3bw , - 0.12 , 0.08 ),
119- # check --report=roofs (DP FMA):
120- sn .assert_reference (sn .extractsingle (
121- r'^DP Vector FMA Peak\s\(single-threaded\)\s+'
122- r'(?P<DPfmabw>\d+)\s+compute$' , self .roofline_ref ,
123- 'DPfmabw' , int ), DPfmabw , - 0.12 , 0.08 ),
124- # check --report=roofs (DP Add):
125- sn .assert_reference (sn .extractsingle (
126- r'^DP Vector Add Peak\s\(single-threaded\)\s+'
127- r'(?P<DPaddbw>\d+)\s+compute$' , self .roofline_ref ,
128- 'DPaddbw' , int ), DPaddbw , - 0.12 , 0.08 ),
129- # check --report=roofs (Scalar Add):
130- sn .assert_reference (sn .extractsingle (
131- r'^Scalar Add Peak\s\(single-threaded\)\s+'
132- r'(?P<ScalarAddbw>\d+)\s+compute$' , self .roofline_ref ,
133- 'ScalarAddbw' , int ), ScalarAddbw , - 0.12 , 0.08 ),
192+ # --report=roofs (L1, L2 and L3 bandwidth): no longer asserted here, checked through perf_patterns/reference (roof_L1, roof_L2, roof_L3) instead:
193+ # sn.assert_reference(roof_L1, L1bw, -0.12, 0.08),
194+ # sn.assert_reference(roof_L2, L2bw, -0.12, 0.08),
195+ # sn.assert_reference(roof_L3, L3bw, -0.12, 0.08),
196+ # check --report=roofs (DP FMA, DP Add and Scalar Add):
197+ sn .assert_reference (roof_dpfma , DPfmabw , - 0.12 , 0.08 ),
198+ sn .assert_reference (roof_dpadd , DPaddbw , - 0.12 , 0.08 ),
199+ sn .assert_reference (roof_scalaradd , ScalarAddbw , - 0.12 , 0.08 ),
134200 # --- check Arithmetic_intensity:
135- sn .assert_reference (sn .extractsingle (
136- r'^returned\sAI\sgap\s=\s(?P<Intensity>.*)' , self .roofline_rpt ,
137- 'Intensity' , float ), 0.0 , - 0.01 , 0.01 ),
201+ sn .assert_reference (ai_gap , 0.0 , - 0.01 , 0.01 ),
138202 # --- check GFLOPS:
139- sn .assert_reference (sn .extractsingle (
140- r'^returned\sGFLOPS\sgap\s=\s(?P<Flops>.*)' , self .roofline_rpt ,
141- 'Flops' , float ), 0.0 , - 0.01 , 0.01 ),
203+ sn .assert_reference (ai_gflops , 0.0 , - 0.01 , 0.01 ),
142204 ])
205+
206+ # --performance-report:
207+ regex_mseconds = r'elapsed time: (?P<msec>\d+)ms'
208+ regex_ai = r'^\(\'self_arithmetic_intensity\', (?P<AI>\d+.\d+)\)'
209+ regex_gbs = r'^\(\'self_gb_s\', (?P<gbs>\d+.\d+)\)'
210+ regex_gflops = r'^\(\'self_gflops\', (?P<gflops>\d+.\d+)\)'
211+ mseconds = sn .extractsingle (regex_mseconds , self .stdout ,
212+ 'msec' , int )
213+ arithmetic_intensity = sn .extractsingle (regex_ai , self .roofline_rpt ,
214+ 'AI' , float )
215+ bandwidth = sn .extractsingle (regex_gbs , self .roofline_rpt ,
216+ 'gbs' , float )
217+ gflops = sn .extractsingle (regex_gflops , self .roofline_rpt ,
218+ 'gflops' , float )
219+ self .perf_patterns = {
220+ 'Elapsed' : mseconds ,
221+ 'ArithmeticIntensity' : arithmetic_intensity ,
222+ 'GFlops' : gflops ,
223+ 'Bandwidth' : bandwidth ,
224+ 'roof_L1' : roof_L1 ,
225+ 'roof_L2' : roof_L2 ,
226+ 'roof_L3' : roof_L3 ,
227+ }
228+ self .reference = {
229+ '*' : {
230+ 'Elapsed' : (0 , None , None , 'ms' ),
231+ 'ArithmeticIntensity' : (0 , None , None , '' ),
232+ 'GFlops' : (0 , None , None , 'GFLOPs/s' ),
233+ 'Bandwidth' : (0 , None , None , 'GB/s' ),
234+ 'roof_L1' : (L1bw , - 0.12 , 0.08 , 'GB/s' ),
235+ 'roof_L2' : (L2bw , - 0.12 , 0.08 , 'GB/s' ),
236+ 'roof_L3' : (L3bw , - 0.12 , 0.08 , 'GB/s' ),
237+ }
238+ }
239+
143240 self .maintainers = ['JG' ]
144241 self .tags = {'production' }
145242
0 commit comments