From 6dcbaffe71c6137485a3aeba61719feea25e8e34 Mon Sep 17 00:00:00 2001 From: jgp Date: Wed, 1 May 2019 13:10:05 +0200 Subject: [PATCH 1/8] sde --- .../intel_sde_berkeley_stream.py | 175 ++++++++++++++++++ .../intel_sde_roofline.py | 129 +++++++++++++ 2 files changed, 304 insertions(+) create mode 100644 cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py create mode 100644 cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py diff --git a/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py b/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py new file mode 100644 index 0000000000..7fea54116f --- /dev/null +++ b/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py @@ -0,0 +1,175 @@ +import os + +import reframe as rfm +import reframe.utility.sanity as sn + + +class SdeBaseTest(rfm.RegressionTest): + '''This test checks the values reported by Intel SDE for roofline modeling: + - https://software.intel.com/en-us/articles/ + intel-software-development-emulator + - https://bitbucket.org/dwdoerf/stream-ai-example/src/master/ + - https://www.nersc.gov/ + users/application-performance/measuring-arithmetic-intensity + ''' + def __init__(self): + super().__init__() + self.descr = 'Roofline Analysis test with Intel SDE' + self.sourcesdir = os.path.join(self.current_system.resourcesdir, + 'roofline', 'sde') + self.build_system = 'SingleSource' + self.sourcepath = 'stream_mpi.c' + self.executable = 'sde' + self.target_executable = './stream.exe' + self.sde = '%s.sde' % self.target_executable + self.rpt = '%s.rpt' % self.target_executable + self.build_system.ldflags = ['-g', '-O3', '-qopenmp', '-restrict', + '-qopt-streaming-stores', 'always'] + self.pre_run = [ + 'mv %s %s' % (self.executable, self.target_executable), + 'module use /apps/dom/UES/jenkins/7.0.UP00/mc/easybuild/' + 'experimental/modules/all', + 'module load sde', + 'sde -help' + ] + self.sanity_patterns = sn.assert_found('Total 
FLOPs =', self.rpt) + self.post_run = ['./parse-sde.sh %s.* &> %s' % (self.sde, self.rpt)] + self.maintainers = ['JG'] + self.tags = {'scs'} + + def setup(self, partition, environ, **job_opts): + self.executable_opts = self.sdeflags + super().setup(partition, environ, **job_opts) + if not self.num_tasks == 36: + self.job.options = ['--cpu-bind=verbose,none'] + else: + self.job.options = ['--cpu-bind=verbose'] + + +@rfm.parameterized_test(*[[mpitask, arraysize] + for mpitask in [2] + for arraysize in [100000000]]) +# for mpitask in [36, 18, 12, 9, 6, 4, 3, 2, 1] +# for arraysize in [400000000, 200000000, 100000000]]) +class SdeBroadwellJ1Test(SdeBaseTest): + def __init__(self, mpitask, arraysize): + super().__init__() + ompthread = int(36/mpitask) + self.valid_systems = ['daint:mc', 'dom:mc'] + self.valid_prog_environs = ['PrgEnv-intel'] + self.build_system.cppflags = [ + '-D_SDE', + '-DSTREAM_ARRAY_SIZE=%s' % arraysize, + '-DNTIMES=50' + ] + self.time_limit = (0, 10, 0) + self.exclusive = True + self.num_tasks = mpitask + self.num_tasks_per_node = mpitask + self.num_cpus_per_task = int(ompthread) + self.num_tasks_per_core = 1 + self.use_multithreading = False + self.name = 'sde_n.' + '{:010d}'.format(arraysize) + \ + '_MPI.' + '{:03d}'.format(mpitask) + \ + '_OpenMP.' 
+ '{:03d}'.format(ompthread) + \ + '_j.%s' % self.num_tasks_per_core + self.variables = { + 'CRAYPE_LINK_TYPE': 'dynamic', + 'OMP_NUM_THREADS': str(self.num_cpus_per_task) + } + self.sdeflags = ['%s -d -iform 1 -omix %s -i -top_blocks 500 ' + '-global_region -start_ssc_mark 111:repeat ' + '-stop_ssc_mark 222:repeat -- %s' % + ('-bdw', self.sde, self.target_executable)] + # References for Intel Broadwell CPU (E5-2695 v4): + ai = 0.0825 + gflops = 9773.0 + self.sanity_patterns = sn.all([ + sn.assert_reference(self.gflops, gflops, -0.1, 0.3), + sn.assert_reference(self.arithmetic_intensity, ai, -0.1, 0.3), + ]) + + @property + @sn.sanity_function + def arithmetic_intensity(self): + flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)', + self.rpt, 'flops', int) + byts = sn.extractsingle(r'^--->Total Bytes = (?P<byts>\d+)', + self.rpt, 'byts', int) + return flops/byts + + @property + @sn.sanity_function + def gflops(self): + flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)', + self.rpt, 'flops', int) + sec = sn.extractsingle(r'^Triad:\s+\d+\.\d+\s+(?P<avgtime>\d+\.\d+)', + self.stdout, 'avgtime', float) + step = sn.extractsingle(r'^Each kernel will be executed (?P<step>\d+)', + self.stdout, 'step', int) + return flops/(sec*step*10**6) + + +@rfm.parameterized_test(*[[mpitask, arraysize] + for mpitask in [2] + for arraysize in [100000000]]) +# for mpitask in [72, 36, 24, 18, 12, 9, 8, 6, 4, 3, 2, +# 1] +# for arraysize in [400000000, 200000000, 100000000]]) +class SdeBroadwellJ2Test(SdeBaseTest): + def __init__(self, mpitask, arraysize): + super().__init__() + ompthread = int(72/mpitask) + self.valid_systems = ['daint:mc', 'dom:mc'] + self.valid_prog_environs = ['PrgEnv-intel'] + self.build_system.cppflags = [ + '-D_SDE', + '-DSTREAM_ARRAY_SIZE=%s' % arraysize, + '-DNTIMES=50' + ] + self.time_limit = (0, 10, 0) + self.exclusive = True + self.num_tasks = mpitask + self.num_tasks_per_node = mpitask + self.num_cpus_per_task = int(ompthread) + self.num_tasks_per_core = 2 +
self.use_multithreading = True + self.name = 'sde_n.' + '{:010d}'.format(arraysize) + \ + '_MPI.' + '{:03d}'.format(mpitask) + \ + '_OpenMP.' + '{:03d}'.format(ompthread) + \ + '_j.%s' % self.num_tasks_per_core + self.variables = { + 'CRAYPE_LINK_TYPE': 'dynamic', + 'OMP_NUM_THREADS': str(self.num_cpus_per_task) + } + self.sdeflags = ['%s -d -iform 1 -omix %s -i -top_blocks 500 ' + '-global_region -start_ssc_mark 111:repeat ' + '-stop_ssc_mark 222:repeat -- %s' % + ('-bdw', self.sde, self.target_executable)] + # References for Intel Broadwell CPU (E5-2695 v4): + ai = 0.0822 + gflops = 9602.0 + self.sanity_patterns = sn.all([ + sn.assert_reference(self.gflops, gflops, -0.1, 0.3), + sn.assert_reference(self.arithmetic_intensity, ai, -0.1, 0.3), + ]) + + @property + @sn.sanity_function + def arithmetic_intensity(self): + flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)', + self.rpt, 'flops', int) + byts = sn.extractsingle(r'^--->Total Bytes = (?P<byts>\d+)', + self.rpt, 'byts', int) + return flops/byts + + @property + @sn.sanity_function + def gflops(self): + flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)', + self.rpt, 'flops', int) + sec = sn.extractsingle(r'^Triad:\s+\d+\.\d+\s+(?P<avgtime>\d+\.\d+)', + self.stdout, 'avgtime', float) + step = sn.extractsingle(r'^Each kernel will be executed (?P<step>\d+)', + self.stdout, 'step', int) + return flops/(sec*step*10**6) diff --git a/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py b/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py new file mode 100644 index 0000000000..9e79c20313 --- /dev/null +++ b/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py @@ -0,0 +1,129 @@ +import os + +import reframe as rfm +import reframe.utility.sanity as sn + + +@rfm.parameterized_test(*[[repeat, toolsversion, datalayout] + for repeat in ['100000'] + for toolsversion in ['8.35.0'] + for datalayout in ['G3_AOS_SCALAR', 'G3_SOA_SCALAR', + 'G3_AOS_VECTOR', 'G3_SOA_VECTOR'] + ]) +class
IntelRooflineSdeTest(rfm.RegressionTest): + '''This test checks the values reported by Intel SDE for roofline modeling: + - https://software.intel.com/en-us/articles/ + intel-software-development-emulator + - https://bitbucket.org/dwdoerf/stream-ai-example/src/master/ + - https://www.nersc.gov/ + users/application-performance/measuring-arithmetic-intensity + ''' + def __init__(self, repeat, toolsversion, datalayout): + super().__init__() + self.descr = 'Roofline Analysis test with Intel SDE' + self.valid_systems = ['dom:mc'] + # Reporting MFLOPS is not available on Intel Haswell cpus, see + # https://www.intel.fr/content/dam/www/public/us/en/documents/manuals/ + # 64-ia-32-architectures-software-developer-vol-1-manual.pdf + self.valid_prog_environs = ['PrgEnv-intel'] + self.sourcesdir = os.path.join(self.current_system.resourcesdir, + 'roofline', 'intel_advisor') + self.build_system = 'SingleSource' + self.sourcepath = '_roofline.cpp' + self.prebuild_cmd = [ + 'patch < SDE/roofline_template.patch', + 'sed -e "s-XXXX-%s-" -e "s-YYYY-%s-" %s &> %s' % + (repeat, datalayout, 'roofline_template.cpp', '_roofline.cpp') + ] + self.build_system.cppflags = ['-D_SDE'] + self.build_system.ldflags = ['-g', '-O3', '-qopenmp', '-restrict', + '-qopt-streaming-stores', 'always', + '-std=c++11'] + self.exclusive = True + self.num_tasks = 1 + self.num_tasks_per_node = 1 + self.num_cpus_per_task = 1 + self.num_tasks_per_core = 1 + self.use_multithreading = False + self.variables = { + 'OMP_NUM_THREADS': str(self.num_cpus_per_task), + 'CRAYPE_LINK_TYPE': 'dynamic', + } + self.executable = 'sde' + self.target_executable = './roof.exe' + self.sde = '%s.sde' % self.target_executable + self.rpt = '%s.rpt' % self.target_executable + self.pre_run = [ + 'mv %s %s' % (self.executable, self.target_executable), + 'module use /apps/dom/UES/jenkins/7.0.UP00/mc/easybuild/' + 'experimental/modules/all', + 'module load sde', + 'sde -help' + ] + self.sdeflags = ['%s -d -iform 1 -omix %s -i -top_blocks 
500 ' + '-global_region -start_ssc_mark 111:repeat ' + '-stop_ssc_mark 222:repeat -- %s' % + ('-bdw', self.sde, self.target_executable)] + self.executable_opts = self.sdeflags + self.sanity_patterns = sn.assert_found('Total FLOPs =', self.rpt) + self.post_run = ['SDE/parse-sde.sh %s.* &> %s' % (self.sde, self.rpt)] + self.maintainers = ['JG'] + self.tags = {'production'} + self.sanity_patterns = sn.all([ + sn.assert_eq(sn.extractsingle( + r'^Intel\(R\) Software Development Emulator\. Version: ' + r'(?P<toolsversion>\d+\.\d+\.\d+)', self.stdout, + 'toolsversion'), toolsversion), + ]) + # References for Intel Broadwell CPU (E5-2695 v4): + references = { + 'G3_AOS_SCALAR': { + 'dom:mc': { + 'gflops': (596, -0.1, 0.3, 'Gflop/s'), + 'ai': (0.16, -0.05, 0.05, 'flop/byte') + } + }, + 'G3_SOA_SCALAR': { + 'dom:mc': { + 'gflops': (612, -0.1, 0.3, 'Gflop/s'), + 'ai': (0.16, -0.05, 0.05, 'flop/byte') + } + }, + 'G3_AOS_VECTOR': { + 'dom:mc': { + 'gflops': (1152, -0.1, 0.3, 'Gflop/s'), + 'ai': (0.125, -0.05, 0.05, 'flop/byte') + } + }, + 'G3_SOA_VECTOR': { + 'dom:mc': { + 'gflops': (1125, -0.1, 0.3, 'Gflop/s'), + 'ai': (0.16, -0.05, 0.05, 'flop/byte') + } + }, + } + self.reference = references[datalayout] + self.perf_patterns = { + 'gflops': self.gflops, + 'ai': self.arithmetic_intensity, + } + + @property + @sn.sanity_function + def arithmetic_intensity(self): + flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)', + self.rpt, 'flops', int) + byts = sn.extractsingle(r'^--->Total Bytes = (?P<byts>\d+)', + self.rpt, 'byts', int) + # debug: print('ai={}'.format(flops/byts)) + return flops/byts + + @property + @sn.sanity_function + def gflops(self): + flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)', + self.rpt, 'flops', int) + msec = sn.extractsingle(r'^elapsed time: (?P<msec>\d+)ms', self.stdout, + 'msec', float) + # debug: print('gflops={}'.format(flops/((msec/1000)*10**6))) + return flops/((msec/1000)*10**6) From 1a9c3ac6d5073f59066983b65b4ec4d2404b61c6 Mon Sep 17 00:00:00 2001 From: jgp Date:
Wed, 1 May 2019 17:39:46 +0200 Subject: [PATCH 2/8] gflops --- .../intel_sde_berkeley_stream.py | 4 ++-- .../profiling_and_debugging/intel_sde_roofline.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py b/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py index 7fea54116f..82e2271bbe 100644 --- a/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py +++ b/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py @@ -148,7 +148,7 @@ def __init__(self, mpitask, arraysize): ('-bdw', self.sde, self.target_executable)] # References for Intel Broadwell CPU (E5-2695 v4): ai = 0.0822 - gflops = 9602.0 + gflops = 9.602 self.sanity_patterns = sn.all([ sn.assert_reference(self.gflops, gflops, -0.1, 0.3), sn.assert_reference(self.arithmetic_intensity, ai, -0.1, 0.3), @@ -172,4 +172,4 @@ def gflops(self): self.stdout, 'avgtime', float) step = sn.extractsingle(r'^Each kernel will be executed (?P<step>\d+)', self.stdout, 'step', int) - return flops/(sec*step*10**6) + return ((flops/(sec*step))/10**9) diff --git a/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py b/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py index 9e79c20313..66ea9b232f 100644 --- a/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py +++ b/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py @@ -79,25 +79,25 @@ def __init__(self, repeat, toolsversion, datalayout): references = { 'G3_AOS_SCALAR': { 'dom:mc': { - 'gflops': (596, -0.1, 0.3, 'Gflop/s'), + 'gflops': (0.596, -0.1, 0.3, 'Gflop/s'), 'ai': (0.16, -0.05, 0.05, 'flop/byte') } }, 'G3_SOA_SCALAR': { 'dom:mc': { - 'gflops': (612, -0.1, 0.3, 'Gflop/s'), + 'gflops': (0.612, -0.1, 0.3, 'Gflop/s'), 'ai': (0.16, -0.05, 0.05, 'flop/byte') } }, 'G3_AOS_VECTOR': { 'dom:mc': { - 'gflops': (1152, -0.1, 0.3, 'Gflop/s'), + 'gflops': (1.152, -0.1, 0.3, 'Gflop/s'), 'ai':
(0.125, -0.05, 0.05, 'flop/byte') } }, 'G3_SOA_VECTOR': { 'dom:mc': { - 'gflops': (1125, -0.1, 0.3, 'Gflop/s'), + 'gflops': (1.125, -0.1, 0.3, 'Gflop/s'), 'ai': (0.16, -0.05, 0.05, 'flop/byte') } }, @@ -126,4 +126,4 @@ def gflops(self): msec = sn.extractsingle(r'^elapsed time: (?P<msec>\d+)ms', self.stdout, 'msec', float) # debug: print('gflops={}'.format(flops/((msec/1000)*10**6))) - return flops/((msec/1000)*10**6) + return (flops/((msec/1000))/10**9) From 75976bdbb4befc66fe40005ea4102fc0215f2fd0 Mon Sep 17 00:00:00 2001 From: jgp Date: Wed, 1 May 2019 17:44:21 +0200 Subject: [PATCH 3/8] gflops --- .../profiling_and_debugging/intel_sde_berkeley_stream.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py b/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py index 82e2271bbe..01987f69fc 100644 --- a/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py +++ b/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py @@ -83,7 +83,7 @@ def __init__(self, mpitask, arraysize): ('-bdw', self.sde, self.target_executable)] # References for Intel Broadwell CPU (E5-2695 v4): ai = 0.0825 - gflops = 9773.0 + gflops = 9.773 self.sanity_patterns = sn.all([ sn.assert_reference(self.gflops, gflops, -0.1, 0.3), sn.assert_reference(self.arithmetic_intensity, ai, -0.1, 0.3), @@ -107,7 +107,7 @@ def gflops(self): self.stdout, 'avgtime', float) step = sn.extractsingle(r'^Each kernel will be executed (?P<step>\d+)', self.stdout, 'step', int) - return flops/(sec*step*10**6) + return flops/(sec*step*10**9) @rfm.parameterized_test(*[[mpitask, arraysize] From 09ad4a6d973d810cd8236a6a70c1a1c9f262f092 Mon Sep 17 00:00:00 2001 From: jgp Date: Fri, 3 May 2019 11:24:46 +0200 Subject: [PATCH 4/8] fix for review --- .../intel_sde_berkeley_stream.py | 64 +++++++------------ 1 file changed, 23 insertions(+), 41 deletions(-) diff --git
a/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py b/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py index 01987f69fc..2f064967e3 100644 --- a/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py +++ b/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py @@ -37,10 +37,30 @@ def __init__(self): self.maintainers = ['JG'] self.tags = {'scs'} + @property + @sn.sanity_function + def arithmetic_intensity(self): + flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)', + self.rpt, 'flops', int) + byts = sn.extractsingle(r'^--->Total Bytes = (?P<byts>\d+)', + self.rpt, 'byts', int) + return flops/byts + + @property + @sn.sanity_function + def gflops(self): + flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)', + self.rpt, 'flops', int) + sec = sn.extractsingle(r'^Triad:\s+\d+\.\d+\s+(?P<avgtime>\d+\.\d+)', + self.stdout, 'avgtime', float) + step = sn.extractsingle(r'^Each kernel will be executed (?P<step>\d+)', + self.stdout, 'step', int) + return flops/(sec*step*10**9) + def setup(self, partition, environ, **job_opts): self.executable_opts = self.sdeflags super().setup(partition, environ, **job_opts) - if not self.num_tasks == 36: + if self.num_tasks != 36: self.job.options = ['--cpu-bind=verbose,none'] else: self.job.options = ['--cpu-bind=verbose'] @@ -49,6 +69,7 @@ def setup(self, partition, environ, **job_opts): @rfm.parameterized_test(*[[mpitask, arraysize] for mpitask in [2] for arraysize in [100000000]]) +# For parameter space study, you may want to use: # for mpitask in [36, 18, 12, 9, 6, 4, 3, 2, 1] # for arraysize in [400000000, 200000000, 100000000]]) class SdeBroadwellJ1Test(SdeBaseTest): @@ -89,30 +110,11 @@ def __init__(self, mpitask, arraysize): sn.assert_reference(self.arithmetic_intensity, ai, -0.1, 0.3), ]) - @property - @sn.sanity_function - def arithmetic_intensity(self): - flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)', - self.rpt, 'flops', int) - byts =
sn.extractsingle(r'^--->Total Bytes = (?P<byts>\d+)', - self.rpt, 'byts', int) - return flops/byts - - @property - @sn.sanity_function - def gflops(self): - flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)', - self.rpt, 'flops', int) - sec = sn.extractsingle(r'^Triad:\s+\d+\.\d+\s+(?P<avgtime>\d+\.\d+)', - self.stdout, 'avgtime', float) - step = sn.extractsingle(r'^Each kernel will be executed (?P<step>\d+)', - self.stdout, 'step', int) - return flops/(sec*step*10**9) - @rfm.parameterized_test(*[[mpitask, arraysize] for mpitask in [2] for arraysize in [100000000]]) +# For parameter space study, you may want to use: # for mpitask in [72, 36, 24, 18, 12, 9, 8, 6, 4, 3, 2, # 1] # for arraysize in [400000000, 200000000, 100000000]]) @@ -153,23 +155,3 @@ def __init__(self, mpitask, arraysize): sn.assert_reference(self.gflops, gflops, -0.1, 0.3), sn.assert_reference(self.arithmetic_intensity, ai, -0.1, 0.3), ]) - - @property - @sn.sanity_function - def arithmetic_intensity(self): - flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)', - self.rpt, 'flops', int) - byts = sn.extractsingle(r'^--->Total Bytes = (?P<byts>\d+)', - self.rpt, 'byts', int) - return flops/byts - - @property - @sn.sanity_function - def gflops(self): - flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)', - self.rpt, 'flops', int) - sec = sn.extractsingle(r'^Triad:\s+\d+\.\d+\s+(?P<avgtime>\d+\.\d+)', - self.stdout, 'avgtime', float) - step = sn.extractsingle(r'^Each kernel will be executed (?P<step>\d+)', - self.stdout, 'step', int) - return ((flops/(sec*step))/10**9) From 9a64708a17bdee1f8b089af59233e724b4b950ea Mon Sep 17 00:00:00 2001 From: jgp Date: Fri, 3 May 2019 14:28:07 +0200 Subject: [PATCH 5/8] fix for review --- .../profiling_and_debugging/intel_sde_berkeley_stream.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py b/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py index 2f064967e3..e03c89646e 100644 ---
a/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py +++ b/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py @@ -75,7 +75,7 @@ def setup(self, partition, environ, **job_opts): class SdeBroadwellJ1Test(SdeBaseTest): def __init__(self, mpitask, arraysize): super().__init__() - ompthread = int(36/mpitask) + ompthread = 36 // mpitask self.valid_systems = ['daint:mc', 'dom:mc'] self.valid_prog_environs = ['PrgEnv-intel'] self.build_system.cppflags = [ @@ -87,7 +87,7 @@ def __init__(self, mpitask, arraysize): self.exclusive = True self.num_tasks = mpitask self.num_tasks_per_node = mpitask - self.num_cpus_per_task = int(ompthread) + self.num_cpus_per_task = ompthread self.num_tasks_per_core = 1 self.use_multithreading = False self.name = 'sde_n.' + '{:010d}'.format(arraysize) + \ @@ -121,7 +121,7 @@ def __init__(self, mpitask, arraysize): class SdeBroadwellJ2Test(SdeBaseTest): def __init__(self, mpitask, arraysize): super().__init__() - ompthread = int(72/mpitask) + ompthread = 72 // mpitask self.valid_systems = ['daint:mc', 'dom:mc'] self.valid_prog_environs = ['PrgEnv-intel'] self.build_system.cppflags = [ @@ -133,7 +133,7 @@ def __init__(self, mpitask, arraysize): self.exclusive = True self.num_tasks = mpitask self.num_tasks_per_node = mpitask - self.num_cpus_per_task = int(ompthread) + self.num_cpus_per_task = ompthread self.num_tasks_per_core = 2 self.use_multithreading = True self.name = 'sde_n.' 
+ '{:010d}'.format(arraysize) + \ From 4d957d750b6f6f36c95e065013da4be78b76fd04 Mon Sep 17 00:00:00 2001 From: jgp Date: Fri, 3 May 2019 14:51:49 +0200 Subject: [PATCH 6/8] fix for review --- .../intel_sde_berkeley_stream.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py b/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py index e03c89646e..15ae97e477 100644 --- a/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py +++ b/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py @@ -90,10 +90,8 @@ def __init__(self, mpitask, arraysize): self.num_cpus_per_task = ompthread self.num_tasks_per_core = 1 self.use_multithreading = False - self.name = 'sde_n.' + '{:010d}'.format(arraysize) + \ - '_MPI.' + '{:03d}'.format(mpitask) + \ - '_OpenMP.' + '{:03d}'.format(ompthread) + \ - '_j.%s' % self.num_tasks_per_core + self.name = 'sde_n.{:010d}_MPI.{:03d}_OpenMP.{:03d}_j.{:01d}'.format( + arraysize, mpitask, ompthread, self.num_tasks_per_core) self.variables = { 'CRAYPE_LINK_TYPE': 'dynamic', 'OMP_NUM_THREADS': str(self.num_cpus_per_task) @@ -136,10 +134,8 @@ def __init__(self, mpitask, arraysize): self.num_cpus_per_task = ompthread self.num_tasks_per_core = 2 self.use_multithreading = True - self.name = 'sde_n.' + '{:010d}'.format(arraysize) + \ - '_MPI.' + '{:03d}'.format(mpitask) + \ - '_OpenMP.' 
+ '{:03d}'.format(ompthread) + \ - '_j.%s' % self.num_tasks_per_core + self.name = 'sde_n.{:010d}_MPI.{:03d}_OpenMP.{:03d}_j.{:01d}'.format( + arraysize, mpitask, ompthread, self.num_tasks_per_core) self.variables = { 'CRAYPE_LINK_TYPE': 'dynamic', 'OMP_NUM_THREADS': str(self.num_cpus_per_task) From dfb7fcd3e3d224dfc5be2d806d7c812b59043ebf Mon Sep 17 00:00:00 2001 From: jgp Date: Mon, 6 May 2019 14:05:31 +0200 Subject: [PATCH 7/8] fix for review --- .../intel_sde_berkeley_stream.py | 50 +++++++++---------- .../intel_sde_roofline.py | 24 ++++----- 2 files changed, 36 insertions(+), 38 deletions(-) diff --git a/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py b/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py index 15ae97e477..4557f7d198 100644 --- a/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py +++ b/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py @@ -25,15 +25,15 @@ def __init__(self): self.rpt = '%s.rpt' % self.target_executable self.build_system.ldflags = ['-g', '-O3', '-qopenmp', '-restrict', '-qopt-streaming-stores', 'always'] + exp = '/apps/dom/UES/jenkins/7.0.UP00/mc/easybuild/experimental' self.pre_run = [ 'mv %s %s' % (self.executable, self.target_executable), - 'module use /apps/dom/UES/jenkins/7.0.UP00/mc/easybuild/' - 'experimental/modules/all', + 'module use %s/modules/all' % exp, 'module load sde', 'sde -help' ] self.sanity_patterns = sn.assert_found('Total FLOPs =', self.rpt) - self.post_run = ['./parse-sde.sh %s.* &> %s' % (self.sde, self.rpt)] + self.post_run = ['SDE/parse-sde.sh %s.* &> %s' % (self.sde, self.rpt)] self.maintainers = ['JG'] self.tags = {'scs'} @@ -42,9 +42,9 @@ def __init__(self): def arithmetic_intensity(self): flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)', self.rpt, 'flops', int) - byts = sn.extractsingle(r'^--->Total Bytes = (?P<byts>\d+)', - self.rpt, 'byts', int) - return flops/byts + bytes = sn.extractsingle(r'^--->Total Bytes =
(?P<bytes>\d+)', + self.rpt, 'bytes', int) + return flops/bytes @property @sn.sanity_function @@ -66,32 +66,31 @@ def setup(self, partition, environ, **job_opts): self.job.options = ['--cpu-bind=verbose'] -@rfm.parameterized_test(*[[mpitask, arraysize] - for mpitask in [2] +@rfm.parameterized_test(*[[num_ranks, arraysize] + for num_ranks in [2] for arraysize in [100000000]]) # For parameter space study, you may want to use: -# for mpitask in [36, 18, 12, 9, 6, 4, 3, 2, 1] +# for num_ranks in [36, 18, 12, 9, 6, 4, 3, 2, 1] # for arraysize in [400000000, 200000000, 100000000]]) class SdeBroadwellJ1Test(SdeBaseTest): - def __init__(self, mpitask, arraysize): + def __init__(self, num_ranks, arraysize): super().__init__() - ompthread = 36 // mpitask - self.valid_systems = ['daint:mc', 'dom:mc'] + ompthread = 36 // num_ranks + self.valid_systems = ['dom:mc'] self.valid_prog_environs = ['PrgEnv-intel'] self.build_system.cppflags = [ '-D_SDE', '-DSTREAM_ARRAY_SIZE=%s' % arraysize, '-DNTIMES=50' ] - self.time_limit = (0, 10, 0) self.exclusive = True - self.num_tasks = mpitask - self.num_tasks_per_node = mpitask + self.num_tasks = num_ranks + self.num_tasks_per_node = num_ranks self.num_cpus_per_task = ompthread self.num_tasks_per_core = 1 self.use_multithreading = False self.name = 'sde_n.{:010d}_MPI.{:03d}_OpenMP.{:03d}_j.{:01d}'.format( - arraysize, mpitask, ompthread, self.num_tasks_per_core) + arraysize, num_ranks, ompthread, self.num_tasks_per_core) self.variables = { 'CRAYPE_LINK_TYPE': 'dynamic', 'OMP_NUM_THREADS': str(self.num_cpus_per_task) @@ -109,33 +108,32 @@ def __init__(self, mpitask, arraysize): ]) -@rfm.parameterized_test(*[[mpitask, arraysize] - for mpitask in [2] +@rfm.parameterized_test(*[[num_ranks, arraysize] + for num_ranks in [2] for arraysize in [100000000]]) # For parameter space study, you may want to use: -# for mpitask in [72, 36, 24, 18, 12, 9, 8, 6, 4, 3, 2, +# for num_ranks in [72, 36, 24, 18, 12, 9, 8, 6, 4, 3, 2, # 1] # for arraysize in [400000000,
200000000, 100000000]]) class SdeBroadwellJ2Test(SdeBaseTest): - def __init__(self, mpitask, arraysize): + def __init__(self, num_ranks, arraysize): super().__init__() - ompthread = 72 // mpitask - self.valid_systems = ['daint:mc', 'dom:mc'] + ompthread = 72 // num_ranks + self.valid_systems = ['dom:mc'] self.valid_prog_environs = ['PrgEnv-intel'] self.build_system.cppflags = [ '-D_SDE', '-DSTREAM_ARRAY_SIZE=%s' % arraysize, '-DNTIMES=50' ] - self.time_limit = (0, 10, 0) self.exclusive = True - self.num_tasks = mpitask - self.num_tasks_per_node = mpitask + self.num_tasks = num_ranks + self.num_tasks_per_node = num_ranks self.num_cpus_per_task = ompthread self.num_tasks_per_core = 2 self.use_multithreading = True self.name = 'sde_n.{:010d}_MPI.{:03d}_OpenMP.{:03d}_j.{:01d}'.format( - arraysize, mpitask, ompthread, self.num_tasks_per_core) + arraysize, num_ranks, ompthread, self.num_tasks_per_core) self.variables = { 'CRAYPE_LINK_TYPE': 'dynamic', 'OMP_NUM_THREADS': str(self.num_cpus_per_task) diff --git a/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py b/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py index 66ea9b232f..e2ed04153f 100644 --- a/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py +++ b/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py @@ -30,6 +30,10 @@ def __init__(self, repeat, toolsversion, datalayout): 'roofline', 'intel_advisor') self.build_system = 'SingleSource' self.sourcepath = '_roofline.cpp' + self.executable = 'sde' + self.target_executable = './roof.exe' + self.sde = '%s.sde' % self.target_executable + self.rpt = '%s.rpt' % self.target_executable self.prebuild_cmd = [ 'patch < SDE/roofline_template.patch', 'sed -e "s-XXXX-%s-" -e "s-YYYY-%s-" %s &> %s' % @@ -46,17 +50,13 @@ def __init__(self, repeat, toolsversion, datalayout): self.num_tasks_per_core = 1 self.use_multithreading = False self.variables = { - 'OMP_NUM_THREADS': str(self.num_cpus_per_task), 'CRAYPE_LINK_TYPE': 
'dynamic', + 'OMP_NUM_THREADS': str(self.num_cpus_per_task), } - self.executable = 'sde' - self.target_executable = './roof.exe' - self.sde = '%s.sde' % self.target_executable - self.rpt = '%s.rpt' % self.target_executable + exp = '/apps/dom/UES/jenkins/7.0.UP00/mc/easybuild/experimental' self.pre_run = [ 'mv %s %s' % (self.executable, self.target_executable), - 'module use /apps/dom/UES/jenkins/7.0.UP00/mc/easybuild/' - 'experimental/modules/all', + 'module use %s/modules/all' % exp, 'module load sde', 'sde -help' ] @@ -68,7 +68,7 @@ def __init__(self, repeat, toolsversion, datalayout): self.sanity_patterns = sn.assert_found('Total FLOPs =', self.rpt) self.post_run = ['SDE/parse-sde.sh %s.* &> %s' % (self.sde, self.rpt)] self.maintainers = ['JG'] - self.tags = {'production'} + self.tags = {'scs'} self.sanity_patterns = sn.all([ sn.assert_eq(sn.extractsingle( r'^Intel\(R\) Software Development Emulator\. Version: ' @@ -113,10 +113,10 @@ def __init__(self, repeat, toolsversion, datalayout): def arithmetic_intensity(self): flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)', self.rpt, 'flops', int) - byts = sn.extractsingle(r'^--->Total Bytes = (?P<byts>\d+)', - self.rpt, 'byts', int) - # debug: print('ai={}'.format(flops/byts)) - return flops/byts + bytes = sn.extractsingle(r'^--->Total Bytes = (?P<bytes>\d+)', + self.rpt, 'bytes', int) + # debug: print('ai={}'.format(flops/bytes)) + return flops/bytes @property @sn.sanity_function From 017bc2a276d40e4662c3940d8db18849f91491c7 Mon Sep 17 00:00:00 2001 From: jgp Date: Mon, 6 May 2019 16:54:36 +0200 Subject: [PATCH 8/8] typo --- .../tools/profiling_and_debugging/intel_sde_berkeley_stream.py | 2 +- cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py b/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py index 4557f7d198..0c410e43ad 100644 ---
a/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py +++ b/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py @@ -43,7 +43,7 @@ def arithmetic_intensity(self): flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)', self.rpt, 'flops', int) bytes = sn.extractsingle(r'^--->Total Bytes = (?P<bytes>\d+)', - self.rpt, 'bytes', int) + self.rpt, 'bytes', int) return flops/bytes @property diff --git a/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py b/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py index e2ed04153f..f31cb542f7 100644 --- a/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py +++ b/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py @@ -114,7 +114,7 @@ def arithmetic_intensity(self): flops = sn.extractsingle(r'^--->Total FLOPs = (?P<flops>\d+)', self.rpt, 'flops', int) bytes = sn.extractsingle(r'^--->Total Bytes = (?P<bytes>\d+)', - self.rpt, 'bytes', int) + self.rpt, 'bytes', int) # debug: print('ai={}'.format(flops/bytes)) return flops/bytes