Skip to content

Commit

Permalink
Merge pull request #774 from jgphpc/UES-213_run
Browse files Browse the repository at this point in the history
[test] Add sde roofline check
  • Loading branch information
Vasileios Karakasis committed May 6, 2019
2 parents aca387d + 42bd5a0 commit 7998250
Show file tree
Hide file tree
Showing 2 changed files with 280 additions and 0 deletions.
151 changes: 151 additions & 0 deletions cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
import os

import reframe as rfm
import reframe.utility.sanity as sn


class SdeBaseTest(rfm.RegressionTest):
    '''Common part of the roofline checks running STREAM under Intel SDE.

    The check compiles ``stream_mpi.c``, launches it through ``sde`` and
    post-processes the SDE output with ``SDE/parse-sde.sh`` into a report
    file (``self.rpt``) from which the metrics below are extracted.

    Subclasses must define ``self.sdeflags`` before :meth:`setup` is called,
    because :meth:`setup` copies it into ``executable_opts``.

    Background material:
    - https://software.intel.com/en-us/articles/
      intel-software-development-emulator
    - https://bitbucket.org/dwdoerf/stream-ai-example/src/master/
    - https://www.nersc.gov/
      users/application-performance/measuring-arithmetic-intensity
    '''

    def __init__(self):
        super().__init__()
        self.descr = 'Roofline Analysis test with Intel SDE'
        self.sourcesdir = os.path.join(self.current_system.resourcesdir,
                                       'roofline', 'sde')
        self.build_system = 'SingleSource'
        self.sourcepath = 'stream_mpi.c'
        # The job launches `sde`; the first pre_run command below renames
        # the file called `sde` in the stage directory to the real target
        # name (presumably the freshly built binary -- TODO confirm).
        self.executable = 'sde'
        self.target_executable = './stream.exe'
        # Prefix of the SDE output files and name of the parsed report.
        self.sde = '%s.sde' % self.target_executable
        self.rpt = '%s.rpt' % self.target_executable
        self.build_system.ldflags = ['-g', '-O3', '-qopenmp', '-restrict',
                                     '-qopt-streaming-stores', 'always']
        exp = '/apps/dom/UES/jenkins/7.0.UP00/mc/easybuild/experimental'
        self.pre_run = [
            'mv %s %s' % (self.executable, self.target_executable),
            'module use %s/modules/all' % exp,
            'module load sde',
            'sde -help'
        ]
        self.sanity_patterns = sn.assert_found('Total FLOPs =', self.rpt)
        # Turn the raw SDE output files into the report parsed below.
        self.post_run = ['SDE/parse-sde.sh %s.* &> %s' % (self.sde, self.rpt)]
        self.maintainers = ['JG']
        self.tags = {'scs'}

    def _total_flops(self):
        '''Return the expression extracting "Total FLOPs" from the report.'''
        return sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)',
                                self.rpt, 'flops', int)

    @property
    @sn.sanity_function
    def arithmetic_intensity(self):
        '''Ratio of total FLOPs to total bytes, both read from the report.'''
        total_bytes = sn.extractsingle(r'^--->Total Bytes = (?P<bytes>\d+)',
                                       self.rpt, 'bytes', int)
        return self._total_flops()/total_bytes

    @property
    @sn.sanity_function
    def gflops(self):
        '''Total FLOPs divided by (Triad time * kernel repetitions * 1e9).

        The FLOP count comes from the report; the Triad time and the
        repetition count are parsed from the job's standard output.
        '''
        flops = self._total_flops()
        # Second numeric column of the "Triad:" line of the STREAM output.
        sec = sn.extractsingle(r'^Triad:\s+\d+\.\d+\s+(?P<avgtime>\d+\.\d+)',
                               self.stdout, 'avgtime', float)
        step = sn.extractsingle(r'^Each kernel will be executed (?P<step>\d+)',
                                self.stdout, 'step', int)
        return flops/(sec*step*10**9)

    def setup(self, partition, environ, **job_opts):
        '''Pass the SDE flags to the executable and set the CPU binding.

        ``executable_opts`` is assigned before the parent ``setup`` and the
        job options after it, since ``self.job`` is only used once the
        parent call has returned.
        '''
        self.executable_opts = self.sdeflags
        super().setup(partition, environ, **job_opts)
        if self.num_tasks != 36:
            self.job.options = ['--cpu-bind=verbose,none']
        else:
            self.job.options = ['--cpu-bind=verbose']


# For a parameter space study, you may want to use instead:
#     for num_ranks in [36, 18, 12, 9, 6, 4, 3, 2, 1]
#     for arraysize in [400000000, 200000000, 100000000]
@rfm.parameterized_test(*[[ranks, size]
                          for ranks in [2]
                          for size in [100000000]])
class SdeBroadwellJ1Test(SdeBaseTest):
    '''STREAM roofline check on dom:mc with one task per core.

    :arg num_ranks: number of MPI ranks, all placed on a single node.
    :arg arraysize: value passed to the build as ``STREAM_ARRAY_SIZE``.
    '''

    def __init__(self, num_ranks, arraysize):
        super().__init__()
        self.valid_systems = ['dom:mc']
        self.valid_prog_environs = ['PrgEnv-intel']

        # Build configuration
        array_size_flag = '-DSTREAM_ARRAY_SIZE=%s' % arraysize
        self.build_system.cppflags = ['-D_SDE', array_size_flag,
                                      '-DNTIMES=50']

        # Job layout: the 36 hardware contexts used here are split evenly
        # among the ranks (presumably the physical cores of a node).
        threads_per_rank = 36 // num_ranks
        self.exclusive = True
        self.num_tasks = num_ranks
        self.num_tasks_per_node = num_ranks
        self.num_cpus_per_task = threads_per_rank
        self.num_tasks_per_core = 1
        self.use_multithreading = False

        name_template = 'sde_n.{:010d}_MPI.{:03d}_OpenMP.{:03d}_j.{:01d}'
        self.name = name_template.format(arraysize, num_ranks,
                                         threads_per_rank,
                                         self.num_tasks_per_core)
        self.variables = {
            'CRAYPE_LINK_TYPE': 'dynamic',
            'OMP_NUM_THREADS': str(self.num_cpus_per_task)
        }

        # Command line handed to `sde`; everything after `--` is the
        # program that runs under the emulator.
        sde_cmdline = ('%s -d -iform 1 -omix %s -i -top_blocks 500 '
                       '-global_region -start_ssc_mark 111:repeat '
                       '-stop_ssc_mark 222:repeat -- %s' %
                       ('-bdw', self.sde, self.target_executable))
        self.sdeflags = [sde_cmdline]

        # References for Intel Broadwell CPU (E5-2695 v4):
        ref_ai = 0.0825
        ref_gflops = 9.773
        gflops_check = sn.assert_reference(self.gflops, ref_gflops,
                                           -0.1, 0.3)
        ai_check = sn.assert_reference(self.arithmetic_intensity, ref_ai,
                                       -0.1, 0.3)
        self.sanity_patterns = sn.all([gflops_check, ai_check])


# For a parameter space study, you may want to use instead:
#     for num_ranks in [72, 36, 24, 18, 12, 9, 8, 6, 4, 3, 2, 1]
#     for arraysize in [400000000, 200000000, 100000000]
@rfm.parameterized_test(*[[ranks, size]
                          for ranks in [2]
                          for size in [100000000]])
class SdeBroadwellJ2Test(SdeBaseTest):
    '''STREAM roofline check on dom:mc with two tasks per core.

    :arg num_ranks: number of MPI ranks, all placed on a single node.
    :arg arraysize: value passed to the build as ``STREAM_ARRAY_SIZE``.
    '''

    def __init__(self, num_ranks, arraysize):
        super().__init__()
        self.valid_systems = ['dom:mc']
        self.valid_prog_environs = ['PrgEnv-intel']

        # Build configuration
        array_size_flag = '-DSTREAM_ARRAY_SIZE=%s' % arraysize
        self.build_system.cppflags = ['-D_SDE', array_size_flag,
                                      '-DNTIMES=50']

        # Job layout: multithreading is enabled, and 72 hardware contexts
        # are split evenly among the ranks (presumably two per physical
        # core of a node).
        threads_per_rank = 72 // num_ranks
        self.exclusive = True
        self.num_tasks = num_ranks
        self.num_tasks_per_node = num_ranks
        self.num_cpus_per_task = threads_per_rank
        self.num_tasks_per_core = 2
        self.use_multithreading = True

        name_template = 'sde_n.{:010d}_MPI.{:03d}_OpenMP.{:03d}_j.{:01d}'
        self.name = name_template.format(arraysize, num_ranks,
                                         threads_per_rank,
                                         self.num_tasks_per_core)
        self.variables = {
            'CRAYPE_LINK_TYPE': 'dynamic',
            'OMP_NUM_THREADS': str(self.num_cpus_per_task)
        }

        # Command line handed to `sde`; everything after `--` is the
        # program that runs under the emulator.
        sde_cmdline = ('%s -d -iform 1 -omix %s -i -top_blocks 500 '
                       '-global_region -start_ssc_mark 111:repeat '
                       '-stop_ssc_mark 222:repeat -- %s' %
                       ('-bdw', self.sde, self.target_executable))
        self.sdeflags = [sde_cmdline]

        # References for Intel Broadwell CPU (E5-2695 v4):
        ref_ai = 0.0822
        ref_gflops = 9.602
        gflops_check = sn.assert_reference(self.gflops, ref_gflops,
                                           -0.1, 0.3)
        ai_check = sn.assert_reference(self.arithmetic_intensity, ref_ai,
                                       -0.1, 0.3)
        self.sanity_patterns = sn.all([gflops_check, ai_check])
129 changes: 129 additions & 0 deletions cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import os

import reframe as rfm
import reframe.utility.sanity as sn


@rfm.parameterized_test(*[[repeat, toolsversion, datalayout]
                          for repeat in ['100000']
                          for toolsversion in ['8.35.0']
                          for datalayout in ['G3_AOS_SCALAR', 'G3_SOA_SCALAR',
                                             'G3_AOS_VECTOR', 'G3_SOA_VECTOR']
                          ])
class IntelRooflineSdeTest(rfm.RegressionTest):
    '''Check the values reported by Intel SDE for roofline modeling.

    The source is generated from ``roofline_template.cpp`` by substituting
    the repeat count and the data layout, built, and run through ``sde``.
    The SDE output is post-processed by ``SDE/parse-sde.sh`` into a report
    (``self.rpt``) from which the performance metrics are extracted.

    :arg repeat: text substituted for ``XXXX`` in the template source.
    :arg toolsversion: SDE version expected in the job's standard output.
    :arg datalayout: text substituted for ``YYYY`` in the template source;
        also selects the set of performance references.

    Background material:
    - https://software.intel.com/en-us/articles/
      intel-software-development-emulator
    - https://bitbucket.org/dwdoerf/stream-ai-example/src/master/
    - https://www.nersc.gov/
      users/application-performance/measuring-arithmetic-intensity
    '''

    def __init__(self, repeat, toolsversion, datalayout):
        super().__init__()
        self.descr = 'Roofline Analysis test with Intel SDE'
        self.valid_systems = ['dom:mc']
        # Reporting MFLOPS is not available on Intel Haswell cpus, see
        # https://www.intel.fr/content/dam/www/public/us/en/documents/manuals/
        # 64-ia-32-architectures-software-developer-vol-1-manual.pdf
        self.valid_prog_environs = ['PrgEnv-intel']
        self.sourcesdir = os.path.join(self.current_system.resourcesdir,
                                       'roofline', 'intel_advisor')
        self.build_system = 'SingleSource'
        self.sourcepath = '_roofline.cpp'
        # The job launches `sde`; the first pre_run command below renames
        # the file called `sde` in the stage directory to the real target
        # name (presumably the freshly built binary -- TODO confirm).
        self.executable = 'sde'
        self.target_executable = './roof.exe'
        # Prefix of the SDE output files and name of the parsed report.
        self.sde = '%s.sde' % self.target_executable
        self.rpt = '%s.rpt' % self.target_executable
        # Patch the template, then instantiate it for this parameter set.
        self.prebuild_cmd = [
            'patch < SDE/roofline_template.patch',
            'sed -e "s-XXXX-%s-" -e "s-YYYY-%s-" %s &> %s' %
            (repeat, datalayout, 'roofline_template.cpp', '_roofline.cpp')
        ]
        self.build_system.cppflags = ['-D_SDE']
        self.build_system.ldflags = ['-g', '-O3', '-qopenmp', '-restrict',
                                     '-qopt-streaming-stores', 'always',
                                     '-std=c++11']
        self.exclusive = True
        self.num_tasks = 1
        self.num_tasks_per_node = 1
        self.num_cpus_per_task = 1
        self.num_tasks_per_core = 1
        self.use_multithreading = False
        self.variables = {
            'CRAYPE_LINK_TYPE': 'dynamic',
            'OMP_NUM_THREADS': str(self.num_cpus_per_task),
        }
        exp = '/apps/dom/UES/jenkins/7.0.UP00/mc/easybuild/experimental'
        self.pre_run = [
            'mv %s %s' % (self.executable, self.target_executable),
            'module use %s/modules/all' % exp,
            'module load sde',
            'sde -help'
        ]
        # Command line handed to `sde`; everything after `--` is the
        # program that runs under the emulator.
        self.sdeflags = ['%s -d -iform 1 -omix %s -i -top_blocks 500 '
                         '-global_region -start_ssc_mark 111:repeat '
                         '-stop_ssc_mark 222:repeat -- %s' %
                         ('-bdw', self.sde, self.target_executable)]
        self.executable_opts = self.sdeflags
        # Turn the raw SDE output files into the report parsed below.
        self.post_run = ['SDE/parse-sde.sh %s.* &> %s' % (self.sde, self.rpt)]
        self.maintainers = ['JG']
        self.tags = {'scs'}
        # Both conditions must hold. The report check used to be assigned to
        # sanity_patterns and then immediately overwritten by the version
        # check, so it never took effect.
        self.sanity_patterns = sn.all([
            sn.assert_found('Total FLOPs =', self.rpt),
            sn.assert_eq(sn.extractsingle(
                r'^Intel\(R\) Software Development Emulator\. Version: '
                r'(?P<toolsversion>\d+\.\d+\.\d+)', self.stdout,
                'toolsversion'), toolsversion),
        ])
        # References for Intel Broadwell CPU (E5-2695 v4):
        references = {
            'G3_AOS_SCALAR': {
                'dom:mc': {
                    'gflops': (0.596, -0.1, 0.3, 'Gflop/s'),
                    'ai': (0.16, -0.05, 0.05, 'flop/byte')
                }
            },
            'G3_SOA_SCALAR': {
                'dom:mc': {
                    'gflops': (0.612, -0.1, 0.3, 'Gflop/s'),
                    'ai': (0.16, -0.05, 0.05, 'flop/byte')
                }
            },
            'G3_AOS_VECTOR': {
                'dom:mc': {
                    'gflops': (1.152, -0.1, 0.3, 'Gflop/s'),
                    'ai': (0.125, -0.05, 0.05, 'flop/byte')
                }
            },
            'G3_SOA_VECTOR': {
                'dom:mc': {
                    'gflops': (1.125, -0.1, 0.3, 'Gflop/s'),
                    'ai': (0.16, -0.05, 0.05, 'flop/byte')
                }
            },
        }
        self.reference = references[datalayout]
        self.perf_patterns = {
            'gflops': self.gflops,
            'ai': self.arithmetic_intensity,
        }

    def _total_flops(self):
        '''Return the expression extracting "Total FLOPs" from the report.'''
        return sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)',
                                self.rpt, 'flops', int)

    @property
    @sn.sanity_function
    def arithmetic_intensity(self):
        '''Ratio of total FLOPs to total bytes, both read from the report.'''
        total_bytes = sn.extractsingle(r'^--->Total Bytes = (?P<bytes>\d+)',
                                       self.rpt, 'bytes', int)
        return self._total_flops()/total_bytes

    @property
    @sn.sanity_function
    def gflops(self):
        '''Total FLOPs per second of elapsed time, scaled by 1e-9.

        The FLOP count comes from the report; the elapsed time is parsed in
        milliseconds from the job's standard output.
        '''
        flops = self._total_flops()
        msec = sn.extractsingle(r'^elapsed time: (?P<msec>\d+)ms', self.stdout,
                                'msec', float)
        # msec/1000 converts the elapsed time to seconds.
        return flops/(msec/1000)/10**9

0 comments on commit 7998250

Please sign in to comment.