Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 83 additions & 36 deletions cscs-checks/tools/profiling_and_debugging/berkeley-ert.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,10 @@ def __init__(self, num_ranks, flop):
'CRAYPE_LINK_TYPE': 'dynamic',
'OMP_NUM_THREADS': str(self.num_cpus_per_task)
}
# Reference roofline boundaries for Intel Broadwell CPU (E5-2695 v4):
GFLOPs = 945.0
L1bw = 1788.0
L2bw = 855.0
L3bw = 547.0
DRAMbw = 70.5
# slowest job:

# take the "slowest" job, make it sleep after it has ended and hope the
# other jobs have ended too
# TODO: find a better way to wait for the other jobs to end
num_ranks_min = 1
flop_min = 1024
self.roofline_rpt = 'rpt'
Expand All @@ -77,41 +74,91 @@ def __init__(self, num_ranks, flop):
'cat *_job.out | python2 preprocess.py > pre',
'python2 maximum.py < pre > max',
'python2 summary.py < max > sum',
# give enough time for all the dependent jobs to collect data
# give enough time for all the dependent jobs to collect data:
'sleep 60',
'cat ../ert_FLOPS*/sum | python2 roofline.py > rpt',
]
self.sanity_patterns = sn.all([
# --- check data type:
sn.assert_eq(sn.extractsingle(
r'^\s+(?P<prec>\w+) \* __restrict__ buf = \(\w+ \*\)'
r'malloc\(PSIZE\);', 'driver1.c', 'prec'), 'double'),
# --- check ert's roofline results:
# check GFLOPS:
sn.assert_reference(sn.extractsingle(
r'(?P<GFLOPs>\d+.\d+)\sGFLOPs EMP', self.roofline_rpt,
'GFLOPs', float), GFLOPs, -0.1, 0.5),
# check L1 bandwidth:
sn.assert_reference(sn.extractsingle(
r'(?P<L1bw>\d+.\d+)\sL1 EMP', self.roofline_rpt,
'L1bw', float), L1bw, -0.1, 0.3),
# check L2 bandwidth:
sn.assert_reference(sn.extractsingle(
r'(?P<L2bw>\d+.\d+)\sL2 EMP', self.roofline_rpt,
'L2bw', float), L2bw, -0.1, 0.3),
# check L3 bandwidth:
sn.assert_reference(sn.extractsingle(
r'(?P<L3bw>\d+.\d+)\sL3 EMP', self.roofline_rpt,
'L3bw', float), L3bw, -0.1, 0.3),
# check DRAM bandwidth:
sn.assert_reference(sn.extractsingle(
r'(?P<DRAMbw>\d+.\d+) DRAM EMP', self.roofline_rpt,
'DRAMbw', float), DRAMbw, -0.1, 0.3),
])

else:
self.post_run = [
'cat *_job.out | python2 preprocess.py > pre',
'python2 maximum.py < pre > max',
'python2 summary.py < max > sum',
]
self.sanity_patterns = sn.assert_found('GFLOPs', 'sum')

# --- Sanity check:
regex_datatype = (r'^\s+(?P<type>\w+) \* __restrict__ buf = '
r'\(\w+ \*\)malloc\(PSIZE\);')
datatype = sn.extractsingle(regex_datatype, 'driver1.c', 'type')
self.sanity_patterns = sn.all([
sn.assert_found('GFLOPs', 'sum'),
sn.assert_eq(datatype, 'double'),
])

# --- Performance check:
if num_ranks == num_ranks_min and flop == flop_min:
# Reference roofline boundaries for Intel BroadwellCPU (E5-2695v4):
ref_GFLOPs = 945.0
ref_L1bw = 1788.0
ref_L2bw = 855.0
ref_L3bw = 547.0
ref_DRAMbw = 70.5

# Typical performance report looks like:
# --------------------------------------
# ert_FLOPS.1024_MPI.001_OpenMP.036/rpt
# 908.43 GFLOPs EMP
# ******
# META_DATA
# OPENMP_THREADS 1
# FLOPS 8
# MPI_PROCS 36
#
# 5647.33 L1 EMP
# *******
# 3203.86 L2 EMP
# *******
# 1773.58 L3 EMP
# *******
# 139.56 L4 EMP
# 103.50 DRAM EMP
# ******
# META_DATA
# FLOPS 2
# OPENMP_THREADS 1
# MPI_PROCS 36
regex_gflops = r'(?P<GFLOPs>\d+.\d+)\sGFLOPs EMP'
regex_L1bw = r'(?P<L1bw>\d+.\d+)\sL1 EMP'
regex_L2bw = r'(?P<L2bw>\d+.\d+)\sL2 EMP'
regex_L3bw = r'(?P<L3bw>\d+.\d+)\sL3 EMP'
regex_DRAMbw = r'(?P<DRAMbw>\d+.\d+) DRAM EMP'

gflops = sn.extractsingle(regex_gflops, self.roofline_rpt,
'GFLOPs', float)
L1bw = sn.extractsingle(regex_L1bw, self.roofline_rpt,
'L1bw', float)
L2bw = sn.extractsingle(regex_L2bw, self.roofline_rpt,
'L2bw', float)
L3bw = sn.extractsingle(regex_L3bw, self.roofline_rpt,
'L3bw', float)
DRAMbw = sn.extractsingle(regex_DRAMbw, self.roofline_rpt,
'DRAMbw', float)

# --performance-report:
self.perf_patterns = {
'gflops': gflops,
'L1bw': L1bw,
'L2bw': L2bw,
'L3bw': L3bw,
'DRAMbw': DRAMbw,
}

self.reference = {
'*': {
'gflops': (ref_GFLOPs, -0.1, 0.5, 'GF/s'),
'L1bw': (ref_L1bw, -0.1, 0.3, 'GB/s'),
'L2bw': (ref_L2bw, -0.1, 0.3, 'GB/s'),
'L3bw': (ref_L3bw, -0.1, 0.3, 'GB/s'),
'DRAMbw': (ref_DRAMbw, -0.1, 0.3, 'GB/s'),
}
}