Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
151 changes: 151 additions & 0 deletions cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
import os

import reframe as rfm
import reframe.utility.sanity as sn


class SdeBaseTest(rfm.RegressionTest):
    '''Shared setup for the STREAM roofline checks run under Intel SDE.

    The benchmark is built as ``sde``, renamed to ``./stream.exe`` before
    the run and then launched through the ``sde`` tool.  After the run,
    ``SDE/parse-sde.sh`` writes a report from which the FLOP and byte
    totals are extracted.

    Subclasses are expected to set ``self.sdeflags`` before :meth:`setup`
    is called, since it is used there as the executable options.

    Background material:
      - https://software.intel.com/en-us/articles/
        intel-software-development-emulator
      - https://bitbucket.org/dwdoerf/stream-ai-example/src/master/
      - https://www.nersc.gov/
        users/application-performance/measuring-arithmetic-intensity
    '''

    def __init__(self):
        super().__init__()
        self.descr = 'Roofline Analysis test with Intel SDE'
        resources = self.current_system.resourcesdir
        self.sourcesdir = os.path.join(resources, 'roofline', 'sde')

        # Build settings
        self.build_system = 'SingleSource'
        self.sourcepath = 'stream_mpi.c'
        self.build_system.ldflags = [
            '-g', '-O3', '-qopenmp', '-restrict',
            '-qopt-streaming-stores', 'always',
        ]

        # The job runs `sde`; the benchmark itself is the target binary
        self.executable = 'sde'
        self.target_executable = './stream.exe'
        self.sde = '{}.sde'.format(self.target_executable)
        self.rpt = '{}.rpt'.format(self.target_executable)

        easybuild_root = (
            '/apps/dom/UES/jenkins/7.0.UP00/mc/easybuild/experimental'
        )
        self.pre_run = [
            'mv {} {}'.format(self.executable, self.target_executable),
            'module use {}/modules/all'.format(easybuild_root),
            'module load sde',
            'sde -help',
        ]
        self.post_run = [
            'SDE/parse-sde.sh {}.* &> {}'.format(self.sde, self.rpt)
        ]
        self.sanity_patterns = sn.assert_found('Total FLOPs =', self.rpt)
        self.maintainers = ['JG']
        self.tags = {'scs'}

    @property
    @sn.sanity_function
    def arithmetic_intensity(self):
        '''Ratio of total FLOPs to total bytes, both read from the report.'''
        total_flops = sn.extractsingle(
            r'^--->Total FLOPs = (?P<flops>\d+)', self.rpt, 'flops', int
        )
        total_bytes = sn.extractsingle(
            r'^--->Total Bytes = (?P<bytes>\d+)', self.rpt, 'bytes', int
        )
        return total_flops / total_bytes

    @property
    @sn.sanity_function
    def gflops(self):
        '''Total FLOPs from the report over (Triad time * kernel steps * 1e9).

        The time and the step count are both parsed from the job's standard
        output.
        '''
        total_flops = sn.extractsingle(
            r'^--->Total FLOPs = (?P<flops>\d+)', self.rpt, 'flops', int
        )
        triad_time = sn.extractsingle(
            r'^Triad:\s+\d+\.\d+\s+(?P<avgtime>\d+\.\d+)',
            self.stdout, 'avgtime', float
        )
        num_steps = sn.extractsingle(
            r'^Each kernel will be executed (?P<step>\d+)',
            self.stdout, 'step', int
        )
        elapsed = triad_time * num_steps
        return total_flops / (elapsed * 10**9)

    def setup(self, partition, environ, **job_opts):
        '''Pass the SDE flags to the executable and pick the CPU binding.'''
        self.executable_opts = self.sdeflags
        super().setup(partition, environ, **job_opts)
        # Only the 36-task configuration keeps the default binding; every
        # other configuration disables it with `none`
        binding = 'verbose' if self.num_tasks == 36 else 'verbose,none'
        self.job.options = ['--cpu-bind=' + binding]


# For a parameter space study, you may want to use:
#   num_ranks in [36, 18, 12, 9, 6, 4, 3, 2, 1]
#   arraysize in [400000000, 200000000, 100000000]
@rfm.parameterized_test(*[[ranks, size]
                          for ranks in [2]
                          for size in [100000000]])
class SdeBroadwellJ1Test(SdeBaseTest):
    '''STREAM under SDE on Broadwell with one task per core.

    :arg num_ranks: number of MPI tasks, all placed on a single node; the
        OpenMP thread count per task is ``36 // num_ranks``.
    :arg arraysize: value passed to the build as ``STREAM_ARRAY_SIZE``.
    '''

    def __init__(self, num_ranks, arraysize):
        super().__init__()
        threads_per_rank = 36 // num_ranks
        tasks_per_core = 1

        self.valid_systems = ['dom:mc']
        self.valid_prog_environs = ['PrgEnv-intel']
        self.build_system.cppflags = [
            '-D_SDE',
            '-DSTREAM_ARRAY_SIZE={}'.format(arraysize),
            '-DNTIMES=50',
        ]

        # Job geometry
        self.exclusive = True
        self.num_tasks = num_ranks
        self.num_tasks_per_node = num_ranks
        self.num_cpus_per_task = threads_per_rank
        self.num_tasks_per_core = tasks_per_core
        self.use_multithreading = False

        name_template = 'sde_n.{:010d}_MPI.{:03d}_OpenMP.{:03d}_j.{:01d}'
        self.name = name_template.format(arraysize, num_ranks,
                                         threads_per_rank, tasks_per_core)
        self.variables = {
            'CRAYPE_LINK_TYPE': 'dynamic',
            'OMP_NUM_THREADS': str(threads_per_rank),
        }

        sde_cmdline = ('{arch} -d -iform 1 -omix {mix} -i -top_blocks 500 '
                       '-global_region -start_ssc_mark 111:repeat '
                       '-stop_ssc_mark 222:repeat -- {exe}')
        self.sdeflags = [sde_cmdline.format(arch='-bdw', mix=self.sde,
                                            exe=self.target_executable)]

        # References for Intel Broadwell CPU (E5-2695 v4):
        ref_ai = 0.0825
        ref_gflops = 9.773
        self.sanity_patterns = sn.all([
            sn.assert_reference(self.gflops, ref_gflops, -0.1, 0.3),
            sn.assert_reference(self.arithmetic_intensity, ref_ai, -0.1, 0.3),
        ])


# For a parameter space study, you may want to use:
#   num_ranks in [72, 36, 24, 18, 12, 9, 8, 6, 4, 3, 2, 1]
#   arraysize in [400000000, 200000000, 100000000]
@rfm.parameterized_test(*[[ranks, size]
                          for ranks in [2]
                          for size in [100000000]])
class SdeBroadwellJ2Test(SdeBaseTest):
    '''STREAM under SDE on Broadwell with two tasks per core.

    :arg num_ranks: number of MPI tasks, all placed on a single node; the
        OpenMP thread count per task is ``72 // num_ranks``.
    :arg arraysize: value passed to the build as ``STREAM_ARRAY_SIZE``.
    '''

    def __init__(self, num_ranks, arraysize):
        super().__init__()
        threads_per_rank = 72 // num_ranks
        tasks_per_core = 2

        self.valid_systems = ['dom:mc']
        self.valid_prog_environs = ['PrgEnv-intel']
        self.build_system.cppflags = [
            '-D_SDE',
            '-DSTREAM_ARRAY_SIZE={}'.format(arraysize),
            '-DNTIMES=50',
        ]

        # Job geometry
        self.exclusive = True
        self.num_tasks = num_ranks
        self.num_tasks_per_node = num_ranks
        self.num_cpus_per_task = threads_per_rank
        self.num_tasks_per_core = tasks_per_core
        self.use_multithreading = True

        name_template = 'sde_n.{:010d}_MPI.{:03d}_OpenMP.{:03d}_j.{:01d}'
        self.name = name_template.format(arraysize, num_ranks,
                                         threads_per_rank, tasks_per_core)
        self.variables = {
            'CRAYPE_LINK_TYPE': 'dynamic',
            'OMP_NUM_THREADS': str(threads_per_rank),
        }

        sde_cmdline = ('{arch} -d -iform 1 -omix {mix} -i -top_blocks 500 '
                       '-global_region -start_ssc_mark 111:repeat '
                       '-stop_ssc_mark 222:repeat -- {exe}')
        self.sdeflags = [sde_cmdline.format(arch='-bdw', mix=self.sde,
                                            exe=self.target_executable)]

        # References for Intel Broadwell CPU (E5-2695 v4):
        ref_ai = 0.0822
        ref_gflops = 9.602
        self.sanity_patterns = sn.all([
            sn.assert_reference(self.gflops, ref_gflops, -0.1, 0.3),
            sn.assert_reference(self.arithmetic_intensity, ref_ai, -0.1, 0.3),
        ])
129 changes: 129 additions & 0 deletions cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import os

import reframe as rfm
import reframe.utility.sanity as sn


@rfm.parameterized_test(*[[repeat, toolsversion, datalayout]
                          for repeat in ['100000']
                          for toolsversion in ['8.35.0']
                          for datalayout in ['G3_AOS_SCALAR', 'G3_SOA_SCALAR',
                                             'G3_AOS_VECTOR', 'G3_SOA_VECTOR']
                          ])
class IntelRooflineSdeTest(rfm.RegressionTest):
    '''This test checks the values reported by Intel SDE for roofline modeling:
    - https://software.intel.com/en-us/articles/
      intel-software-development-emulator
    - https://bitbucket.org/dwdoerf/stream-ai-example/src/master/
    - https://www.nersc.gov/
      users/application-performance/measuring-arithmetic-intensity

    The source is generated from ``roofline_template.cpp`` before the build,
    the resulting binary is renamed and launched through ``sde``, and
    ``SDE/parse-sde.sh`` post-processes the SDE output into a report.

    :arg repeat: string substituted for ``XXXX`` in the source template.
    :arg toolsversion: SDE version expected in the job's standard output.
    :arg datalayout: string substituted for ``YYYY`` in the source template;
        it also selects the performance references.
    '''

    def __init__(self, repeat, toolsversion, datalayout):
        super().__init__()
        self.descr = 'Roofline Analysis test with Intel SDE'
        self.valid_systems = ['dom:mc']
        # Reporting MFLOPS is not available on Intel Haswell cpus, see
        # https://www.intel.fr/content/dam/www/public/us/en/documents/manuals/
        # 64-ia-32-architectures-software-developer-vol-1-manual.pdf
        self.valid_prog_environs = ['PrgEnv-intel']
        self.sourcesdir = os.path.join(self.current_system.resourcesdir,
                                       'roofline', 'intel_advisor')
        self.build_system = 'SingleSource'
        self.sourcepath = '_roofline.cpp'
        # The job launches `sde`; the benchmark is the target executable
        self.executable = 'sde'
        self.target_executable = './roof.exe'
        self.sde = '%s.sde' % self.target_executable
        self.rpt = '%s.rpt' % self.target_executable
        # Patch the template, then instantiate it for this parameter set
        self.prebuild_cmd = [
            'patch < SDE/roofline_template.patch',
            'sed -e "s-XXXX-%s-" -e "s-YYYY-%s-" %s &> %s' %
            (repeat, datalayout, 'roofline_template.cpp', '_roofline.cpp')
        ]
        self.build_system.cppflags = ['-D_SDE']
        self.build_system.ldflags = ['-g', '-O3', '-qopenmp', '-restrict',
                                     '-qopt-streaming-stores', 'always',
                                     '-std=c++11']
        self.exclusive = True
        self.num_tasks = 1
        self.num_tasks_per_node = 1
        self.num_cpus_per_task = 1
        self.num_tasks_per_core = 1
        self.use_multithreading = False
        self.variables = {
            'CRAYPE_LINK_TYPE': 'dynamic',
            'OMP_NUM_THREADS': str(self.num_cpus_per_task),
        }
        exp = '/apps/dom/UES/jenkins/7.0.UP00/mc/easybuild/experimental'
        self.pre_run = [
            'mv %s %s' % (self.executable, self.target_executable),
            'module use %s/modules/all' % exp,
            'module load sde',
            'sde -help'
        ]
        self.sdeflags = ['%s -d -iform 1 -omix %s -i -top_blocks 500 '
                         '-global_region -start_ssc_mark 111:repeat '
                         '-stop_ssc_mark 222:repeat -- %s' %
                         ('-bdw', self.sde, self.target_executable)]
        self.executable_opts = self.sdeflags
        self.post_run = ['SDE/parse-sde.sh %s.* &> %s' % (self.sde, self.rpt)]
        self.maintainers = ['JG']
        self.tags = {'scs'}
        # Both checks live in a single expression: assigning
        # `sanity_patterns` twice would silently drop the first check.
        self.sanity_patterns = sn.all([
            # the post-processed report must contain the FLOP total
            sn.assert_found('Total FLOPs =', self.rpt),
            # the tool must report the expected version
            sn.assert_eq(sn.extractsingle(
                r'^Intel\(R\) Software Development Emulator\. Version: '
                r'(?P<toolsversion>\d+\.\d+\.\d+)', self.stdout,
                'toolsversion'), toolsversion),
        ])
        # References for Intel Broadwell CPU (E5-2695 v4):
        references = {
            'G3_AOS_SCALAR': {
                'dom:mc': {
                    'gflops': (0.596, -0.1, 0.3, 'Gflop/s'),
                    'ai': (0.16, -0.05, 0.05, 'flop/byte')
                }
            },
            'G3_SOA_SCALAR': {
                'dom:mc': {
                    'gflops': (0.612, -0.1, 0.3, 'Gflop/s'),
                    'ai': (0.16, -0.05, 0.05, 'flop/byte')
                }
            },
            'G3_AOS_VECTOR': {
                'dom:mc': {
                    'gflops': (1.152, -0.1, 0.3, 'Gflop/s'),
                    'ai': (0.125, -0.05, 0.05, 'flop/byte')
                }
            },
            'G3_SOA_VECTOR': {
                'dom:mc': {
                    'gflops': (1.125, -0.1, 0.3, 'Gflop/s'),
                    'ai': (0.16, -0.05, 0.05, 'flop/byte')
                }
            },
        }
        self.reference = references[datalayout]
        self.perf_patterns = {
            'gflops': self.gflops,
            'ai': self.arithmetic_intensity,
        }

    @property
    @sn.sanity_function
    def arithmetic_intensity(self):
        '''Total FLOPs divided by total bytes, both read from the report.'''
        flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)',
                                 self.rpt, 'flops', int)
        # `nbytes` rather than `bytes` to avoid shadowing the builtin
        nbytes = sn.extractsingle(r'^--->Total Bytes = (?P<bytes>\d+)',
                                  self.rpt, 'bytes', int)
        return flops / nbytes

    @property
    @sn.sanity_function
    def gflops(self):
        '''Total FLOPs from the report per second of elapsed time, in 1e9.

        The elapsed time is parsed from the ``elapsed time: <N>ms`` line of
        the job's standard output and converted from milliseconds.
        '''
        flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)',
                                 self.rpt, 'flops', int)
        msec = sn.extractsingle(r'^elapsed time: (?P<msec>\d+)ms', self.stdout,
                                'msec', float)
        seconds = msec / 1000
        return flops / seconds / 10**9