From 71473c63dfca677f1087f291997a960e73c14fdb Mon Sep 17 00:00:00 2001 From: jgp Date: Tue, 11 Feb 2020 22:11:43 +0100 Subject: [PATCH 1/3] clean --- .../tools/profiling_and_debugging/README.md | 19 ++ .../intel_inspector.py | 95 ---------- .../{ => roofline}/berkeley-ert-nvprof.py | 0 .../roofline/berkeley-ert-serial.py | 170 ++++++++++++++++++ .../{ => roofline}/berkeley-ert.py | 0 .../{ => roofline}/gpp_nvprof_roofline.py | 0 .../{ => roofline}/intel_advisor_roofline.py | 2 +- .../{ => roofline}/intel_sde_roofline.py | 0 .../{ => roofline}/intel_vtune_roofline.py | 0 .../profiling_and_debugging/scorep_mpi_omp.py | 83 --------- 10 files changed, 190 insertions(+), 179 deletions(-) create mode 100644 cscs-checks/tools/profiling_and_debugging/README.md delete mode 100644 cscs-checks/tools/profiling_and_debugging/intel_inspector.py rename cscs-checks/tools/profiling_and_debugging/{ => roofline}/berkeley-ert-nvprof.py (100%) create mode 100644 cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-serial.py rename cscs-checks/tools/profiling_and_debugging/{ => roofline}/berkeley-ert.py (100%) rename cscs-checks/tools/profiling_and_debugging/{ => roofline}/gpp_nvprof_roofline.py (100%) rename cscs-checks/tools/profiling_and_debugging/{ => roofline}/intel_advisor_roofline.py (99%) rename cscs-checks/tools/profiling_and_debugging/{ => roofline}/intel_sde_roofline.py (100%) rename cscs-checks/tools/profiling_and_debugging/{ => roofline}/intel_vtune_roofline.py (100%) delete mode 100644 cscs-checks/tools/profiling_and_debugging/scorep_mpi_omp.py diff --git a/cscs-checks/tools/profiling_and_debugging/README.md b/cscs-checks/tools/profiling_and_debugging/README.md new file mode 100644 index 0000000000..1a9e7e42c6 --- /dev/null +++ b/cscs-checks/tools/profiling_and_debugging/README.md @@ -0,0 +1,19 @@ +This directory contains scripts to test HPC debugging and performance tools on +Piz Daint using ReFrame. More checks are available here: +http://github.com/eth-cscs/hpctools + +* nvidia/nsys_cuda.py + +* notool/internal_timers_mpi.py + +* intel/intel_inspector.py +* intel/intel_vtune.py +* intel/intel_advisor.py + +* scorep/scorep_sampling_profiling.py +* scorep/scorep_sampling_tracing.py + +* scalasca/scalasca_sampling_tracing.py +* scalasca/scalasca_sampling_profiling.py + +* extrae/extrae.py diff --git a/cscs-checks/tools/profiling_and_debugging/intel_inspector.py b/cscs-checks/tools/profiling_and_debugging/intel_inspector.py deleted file mode 100644 index e2e7a9b7b4..0000000000 --- a/cscs-checks/tools/profiling_and_debugging/intel_inspector.py +++ /dev/null @@ -1,95 +0,0 @@ -import os - -import reframe as rfm -import reframe.utility.sanity as sn - - -@rfm.parameterized_test(['C++'], ['F90']) -class IntelInspectorTest(rfm.RegressionTest): - '''This test checks Intel Inspector: - https://software.intel.com/en-us/inspector - ''' - - def __init__(self, lang): - super().__init__() - self.name = 'Intel_Inspector_%s' % lang.replace('+', 'p') - self.descr = self.name - self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc'] - self.valid_prog_environs = ['PrgEnv-intel'] - self.modules = ['inspector'] - self.sourcesdir = os.path.join('src', lang) - self.build_system = 'Make' - if lang == 'F90': - self.build_system.max_concurrency = 1 - - self.executable = 'inspxe-cl' - self.target_executable = './jacobi' - self.prgenv_flags = { - 'PrgEnv-gnu': ['-g', '-O2', '-fopenmp'], - 'PrgEnv-cray': ['-g', '-O2', '-homp'], - 'PrgEnv-intel': ['-g', '-O2', '-qopenmp'], - 'PrgEnv-pgi': ['-g', '-O2', '-mp'] - } - self.executable_opts = ['-collect mi1 %s' % self.target_executable] - self.exclusive = True - self.num_tasks = 3 - self.num_tasks_per_node = 3 - self.num_cpus_per_task = 4 - self.num_tasks_per_core = 1 - self.use_multithreading = False - num_iterations = 10 - self.variables = { - 'OMP_NUM_THREADS': str(self.num_cpus_per_task), - 'ITERATIONS': str(num_iterations), - 'OMP_PROC_BIND': 'true', - 'CRAYPE_LINK_TYPE': 'dynamic', - } - self.version_rpt = 'version.rpt' - self.problems_rpt = 'problems.rpt' - self.summary_rpt = 'summary.rpt' - self.observations_rpt = 'observations.rpt' - self.pre_run = [ - 'mv %s %s' % (self.executable, self.target_executable), - '%s --version &> %s' % (self.executable, self.version_rpt), - ] - self.post_run = [ - '%s -V &> %s' % (self.executable, self.version_rpt), - '%s -report=summary &> %s' % (self.executable, self.summary_rpt), - '%s -report=problems &> %s' % (self.executable, self.problems_rpt), - '%s -report=observations &> %s' % - (self.executable, self.observations_rpt), - ] - self.maintainers = ['JG', 'MKr'] - self.tags = {'production'} - - def setup(self, partition, environ, **job_opts): - super().setup(partition, environ, **job_opts) - environ_name = self.current_environ.name - prgenv_flags = self.prgenv_flags[environ_name] - self.build_system.cflags = prgenv_flags - self.build_system.cxxflags = prgenv_flags - self.build_system.fflags = prgenv_flags - regexversion = (r'^Intel\(R\)\sInspector\s\d+\sUpdate\s\d+\s\(build' - r'\s(?P\d+)') - system_default_toolversion = { - 'daint': '597413', # 2019 Update 4 - 'dom': '597413', # 2019 Update 4 - } - toolsversion = system_default_toolversion[self.current_system.name] - self.sanity_patterns = sn.all([ - # check the job: - sn.assert_found('SUCCESS', self.stdout), - # check the tool's version: - sn.assert_eq(sn.extractsingle(regexversion, self.version_rpt, - 'toolsversion'), toolsversion), - # check the reports: - sn.assert_found(r'1 Memory leak problem\(s\) detected', - self.summary_rpt), - sn.assert_found(r'1 Memory not deallocated problem\(s\) detected', - self.summary_rpt), - sn.assert_found(r'_main.\w+\(\d+\): Warning X\d+: P\d: ' - r'Memory not deallocated:', - self.observations_rpt), - sn.assert_found(r'_main.\w+\(\d+\): Warning X\d+:', - self.problems_rpt), - ]) diff --git a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py b/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-nvprof.py similarity index 100% rename from cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py rename to cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-nvprof.py diff --git a/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-serial.py b/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-serial.py new file mode 100644 index 0000000000..ba1b0ad455 --- /dev/null +++ b/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-serial.py @@ -0,0 +1,170 @@ +import os + +import reframe as rfm +import reframe.utility.sanity as sn + + +#{{{ base +class ErtTestBase(rfm.RegressionTest): + """ + The Empirical Roofline Tool, ERT, automatically generates roofline data. + https://bitbucket.org/berkeleylab/cs-roofline-toolkit/ + """ + + def __init__(self): + self.descr = 'Empirical Roofline Toolkit' + self.sourcesdir = os.path.join(self.current_system.resourcesdir, + 'roofline', 'ert') + self.build_system = 'SingleSource' + self.sourcepath = 'kernel1.c driver1.c' + self.executable = 'ert.exe' + self.build_system.ldflags = ['-O3', '-fopenmp'] + self.sourcesdir = os.path.join(self.current_system.resourcesdir, + 'roofline', 'ert') + self.rpt = '%s.rpt' % self.executable + self.maintainers = ['JG'] + self.tags = {'scs', 'external-resources'} + + def setup(self, partition, environ, **job_opts): + super().setup(partition, environ, **job_opts) + if self.num_tasks != 36: + self.job.launcher.options = ['--cpu-bind=verbose,none'] +#}}} + +#{{{ test +@rfm.parameterized_test( + *[[num_ranks, flop] + for num_ranks in [1] + for flop in [256, 512, 1024]]) + #for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]]) +class ErtBroadwellTest(ErtTestBase): + def __init__(self, num_ranks, flop): + super().__init__() + ompthread = 1 + self.valid_systems = ['daint:mc', 'dom:mc'] + self.valid_prog_environs = ['PrgEnv-gnu'] + self.build_system.cppflags = [ + '-DERT_FLOP=%s' % flop, + '-DERT_ALIGN=32', + '-DERT_MEMORY_MAX=1073741824', + '-DERT_MPI=True', + '-DERT_OPENMP=True', + '-DERT_TRIALS_MIN=1', + '-DERT_WORKING_SET_MIN=1', + ] + self.name = 'ert_serial_FLOPS.{:04d}_MPI.{:03d}_OpenMP.{:03d}'.format( + flop, num_ranks, ompthread) + self.exclusive = True + self.num_tasks = num_ranks + self.num_tasks_per_node = num_ranks + self.num_cpus_per_task = ompthread + self.num_tasks_per_core = 1 + self.use_multithreading = False + self.variables = { + 'CRAYPE_LINK_TYPE': 'dynamic', + 'OMP_NUM_THREADS': str(self.num_cpus_per_task) + } + + # take the "slowest" job, make it sleep after it has ended and hope the + # other jobs have ended too + # TODO: find a better way to wait for the other jobs to end + num_ranks_min = 1 + flop_min = 1024 + self.roofline_rpt = 'rpt' + if num_ranks == num_ranks_min and flop == flop_min: + self.post_run = [ + 'cat *_job.out | python2 preprocess.py > pre', + 'python2 maximum.py < pre > max', + 'python2 summary.py < max > sum', + # give enough time for all the dependent jobs to collect data: + 'sleep 60', + 'cat ../ert_serial_FLOPS*/sum | python2 roofline.py > rpt', + ] + + else: + self.post_run = [ + 'cat *_job.out | python2 preprocess.py > pre', + 'python2 maximum.py < pre > max', + 'python2 summary.py < max > sum', + ] + + # --- Sanity check: + regex_datatype = (r'^\s+(?P\w+) \* __restrict__ buf = ' + r'\(\w+ \*\)malloc\(PSIZE\);') + datatype = sn.extractsingle(regex_datatype, 'driver1.c', 'type') + self.sanity_patterns = sn.all([ + sn.assert_found('GFLOPs', 'sum'), + sn.assert_eq(datatype, 'double'), + ]) + + # --- Performance check: + if num_ranks == num_ranks_min and flop == flop_min: + # Reference roofline boundaries for Intel BroadwellCPU (E5-2695v4): + ref_GFLOPs = 945.0 + ref_L1bw = 1788.0 + ref_L2bw = 855.0 + ref_L3bw = 547.0 + ref_DRAMbw = 70.5 + + # Typical performance report looks like: + # -------------------------------------- + # ert_FLOPS.1024_MPI.001_OpenMP.036/rpt + # 908.43 GFLOPs EMP + # ****** + # META_DATA + # OPENMP_THREADS 1 + # FLOPS 8 + # MPI_PROCS 36 + # + # 5647.33 L1 EMP + # ******* + # 3203.86 L2 EMP + # ******* + # 1773.58 L3 EMP + # ******* + # 139.56 L4 EMP + # 103.50 DRAM EMP + # ****** + # META_DATA + # FLOPS 2 + # OPENMP_THREADS 1 + # MPI_PROCS 36 + regex_gflops = r'(?P\d+.\d+)\sGFLOPs EMP' + regex_L1bw = r'(?P\d+.\d+)\sL1 EMP' + regex_L2bw = r'(?P\d+.\d+)\sL2 EMP' + regex_L3bw = r'(?P\d+.\d+)\sL3 EMP' + regex_DRAMbw = r'(?P\d+.\d+) DRAM EMP' + + gflops = sn.extractsingle(regex_gflops, self.roofline_rpt, + 'GFLOPs', float) + L1bw = sn.extractsingle(regex_L1bw, self.roofline_rpt, + 'L1bw', float) + L2bw = sn.extractsingle(regex_L2bw, self.roofline_rpt, + 'L2bw', float) + L3bw = sn.extractsingle(regex_L3bw, self.roofline_rpt, + 'L3bw', float) + DRAMbw = sn.extractsingle(regex_DRAMbw, self.roofline_rpt, + 'DRAMbw', float) + + # --performance-report: + self.perf_patterns = { + 'gflops': gflops, + 'L1bw': L1bw, + 'L2bw': L2bw, + 'L3bw': L3bw, + 'DRAMbw': DRAMbw, + } + + self.reference = { + '*': { + 'gflops': (ref_GFLOPs, -0.1, 0.5, 'GF/s'), + 'L1bw': (ref_L1bw, -0.1, 0.3, 'GB/s'), + 'L2bw': (ref_L2bw, -0.1, 0.3, 'GB/s'), + 'L3bw': (ref_L3bw, -0.1, 0.3, 'GB/s'), + 'DRAMbw': (ref_DRAMbw, -0.1, 0.3, 'GB/s'), + } + } + + # else: + +#}}} diff --git a/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py b/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert.py similarity index 100% rename from cscs-checks/tools/profiling_and_debugging/berkeley-ert.py rename to cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert.py diff --git a/cscs-checks/tools/profiling_and_debugging/gpp_nvprof_roofline.py b/cscs-checks/tools/profiling_and_debugging/roofline/gpp_nvprof_roofline.py similarity index 100% rename from cscs-checks/tools/profiling_and_debugging/gpp_nvprof_roofline.py rename to cscs-checks/tools/profiling_and_debugging/roofline/gpp_nvprof_roofline.py diff --git a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py b/cscs-checks/tools/profiling_and_debugging/roofline/intel_advisor_roofline.py similarity index 99% rename from cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py rename to cscs-checks/tools/profiling_and_debugging/roofline/intel_advisor_roofline.py index 797d61121a..5eb3e82580 100644 --- a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py +++ b/cscs-checks/tools/profiling_and_debugging/roofline/intel_advisor_roofline.py @@ -239,7 +239,7 @@ def __init__(self, repeat, toolversion, datalayout): } self.maintainers = ['JG', 'MKr'] - self.tags = {'production', 'external-resources'} + self.tags = {'external-resources'} def setup(self, partition, environ, **job_opts): super().setup(partition, environ, **job_opts) diff --git a/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py b/cscs-checks/tools/profiling_and_debugging/roofline/intel_sde_roofline.py similarity index 100% rename from cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py rename to cscs-checks/tools/profiling_and_debugging/roofline/intel_sde_roofline.py diff --git a/cscs-checks/tools/profiling_and_debugging/intel_vtune_roofline.py b/cscs-checks/tools/profiling_and_debugging/roofline/intel_vtune_roofline.py similarity index 100% rename from cscs-checks/tools/profiling_and_debugging/intel_vtune_roofline.py rename to cscs-checks/tools/profiling_and_debugging/roofline/intel_vtune_roofline.py diff --git a/cscs-checks/tools/profiling_and_debugging/scorep_mpi_omp.py b/cscs-checks/tools/profiling_and_debugging/scorep_mpi_omp.py deleted file mode 100644 index 677cdfe160..0000000000 --- a/cscs-checks/tools/profiling_and_debugging/scorep_mpi_omp.py +++ /dev/null @@ -1,83 +0,0 @@ -import os - -import reframe as rfm -import reframe.utility.sanity as sn - - -@rfm.required_version('>=2.14') -@rfm.parameterized_test(['C++'], ['F90']) -class ScorepHybrid(rfm.RegressionTest): - def __init__(self, lang): - super().__init__() - self.name = 'scorep_mpi_omp_%s' % lang.replace('+', 'p') - self.descr = 'SCORE-P %s check' % lang - self.valid_systems = ['daint:gpu', 'daint:mc', 'dom:gpu', 'dom:mc'] - - # Score-P fails with latest clang based cce and pgi compilers: - # src/measurement/thread/fork_join/scorep_thread_fork_join_omp.c:402: - # Fatal: Bug 'TPD == 0': Invalid OpenMP thread specific data object. - # -> removing cce from supported compiler for now. - self.valid_prog_environs = ['PrgEnv-gnu', 'PrgEnv-intel'] - self.prgenv_flags = { - 'PrgEnv-gnu': ['-g', '-fopenmp'], - 'PrgEnv-intel': ['-g', '-openmp'], - } - self.sourcesdir = os.path.join('src', lang) - self.executable = 'jacobi' - self.build_system = 'Make' - self.build_system.makefile = 'Makefile_scorep_mpi_omp' - # NOTE: Restrict concurrency to allow creation of Fortran modules - if lang == 'F90': - self.build_system.max_concurrency = 1 - - self.num_tasks = 3 - self.num_tasks_per_node = 3 - self.num_cpus_per_task = 4 - self.num_iterations = 200 - self.variables = { - 'OMP_NUM_THREADS': str(self.num_cpus_per_task), - 'ITERATIONS': str(self.num_iterations), - 'SCOREP_ENABLE_PROFILING': 'false', - 'SCOREP_ENABLE_TRACING': 'true', - 'OMP_PROC_BIND': 'true', - 'SCOREP_TIMER': 'clock_gettime' - } - cpu_count = self.num_cpus_per_task * self.num_tasks_per_node - self.otf2_file = 'otf2.txt' - self.sanity_patterns = sn.all([ - sn.assert_found('SUCCESS', self.stdout), - sn.assert_eq(sn.count(sn.extractall( - r'(?PLEAVE.*omp\s+\S+\s+\@_jacobi)', self.otf2_file, - 'line')), 4 * self.num_iterations * cpu_count), - sn.assert_not_found('warning|WARNING', self.stderr) - ]) - self.maintainers = ['MKr', 'JG'] - self.tags = {'production'} - # additional program call in order to generate the tracing output for - # the sanity check - self.post_run = [ - 'otf2-print scorep-*/traces.otf2 > %s' % self.otf2_file - ] - - def setup(self, partition, environ, **job_opts): - scorep_ver = '6.0' - tc_ver = '19.10' - cu_ver = '10.1' - self.scorep_modules = { - 'PrgEnv-gnu': ['Score-P/%s-CrayGNU-%s' % (scorep_ver, tc_ver)], - 'PrgEnv-intel': ['Score-P/%s-CrayIntel-%s' % (scorep_ver, tc_ver)], - 'PrgEnv-pgi': ['Score-P/%s-CrayPGI-%s' % (scorep_ver, tc_ver)], - } - if partition.fullname in ['daint:gpu', 'dom:gpu']: - self.scorep_modules['PrgEnv-gnu'] = [ - 'Score-P/%s-CrayGNU-%s-cuda-%s' % (scorep_ver, tc_ver, cu_ver) - ] - - self.modules = self.scorep_modules[environ.name] - super().setup(partition, environ, **job_opts) - prgenv_flags = self.prgenv_flags[self.current_environ.name] - self.build_system.cflags = prgenv_flags - self.build_system.cxxflags = prgenv_flags - self.build_system.fflags = prgenv_flags - self.build_system.ldflags = ['-lm'] - self.build_system.options = ["PREP='scorep --mpp=mpi --thread=omp'"] From 837b8fc60f3f2dabc1c4df85b8df9532bfea664e Mon Sep 17 00:00:00 2001 From: jgp Date: Tue, 11 Feb 2020 22:21:07 +0100 Subject: [PATCH 2/3] pep8 --- .../roofline/berkeley-ert-serial.py | 28 +++++++++---------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-serial.py b/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-serial.py index ba1b0ad455..5ad27930fc 100644 --- a/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-serial.py +++ b/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-serial.py @@ -1,10 +1,9 @@ import os - import reframe as rfm import reframe.utility.sanity as sn -#{{{ base +# {{{ base class ErtTestBase(rfm.RegressionTest): """ The Empirical Roofline Tool, ERT, automatically generates roofline data. @@ -29,14 +28,13 @@ def setup(self, partition, environ, **job_opts): super().setup(partition, environ, **job_opts) if self.num_tasks != 36: self.job.launcher.options = ['--cpu-bind=verbose,none'] -#}}} - -#{{{ test -@rfm.parameterized_test( - *[[num_ranks, flop] - for num_ranks in [1] - for flop in [256, 512, 1024]]) - #for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]]) +# }}} + + +# {{{ test +@rfm.parameterized_test(*[[num_ranks, flop] + for num_ranks in [1] + for flop in [256, 512, 1024]]) class ErtBroadwellTest(ErtTestBase): def __init__(self, num_ranks, flop): super().__init__() @@ -115,7 +113,7 @@ def __init__(self, num_ranks, flop): # OPENMP_THREADS 1 # FLOPS 8 # MPI_PROCS 36 - # + # # 5647.33 L1 EMP # ******* # 3203.86 L2 EMP @@ -138,11 +136,11 @@ def __init__(self, num_ranks, flop): gflops = sn.extractsingle(regex_gflops, self.roofline_rpt, 'GFLOPs', float) L1bw = sn.extractsingle(regex_L1bw, self.roofline_rpt, - 'L1bw', float) + 'L1bw', float) L2bw = sn.extractsingle(regex_L2bw, self.roofline_rpt, - 'L2bw', float) + 'L2bw', float) L3bw = sn.extractsingle(regex_L3bw, self.roofline_rpt, - 'L3bw', float) + 'L3bw', float) DRAMbw = sn.extractsingle(regex_DRAMbw, self.roofline_rpt, 'DRAMbw', float) @@ -167,4 +165,4 @@ def __init__(self, num_ranks, flop): # else: -#}}} +# }}} From 4723c528804e71fcc47053972e79ffdaab1a3995 Mon Sep 17 00:00:00 2001 From: jgp Date: Tue, 3 Mar 2020 12:46:57 +0100 Subject: [PATCH 3/3] cleaning --- .../roofline/berkeley-ert-nvprof.py | 148 --------- .../roofline/berkeley-ert-serial.py | 168 ---------- .../roofline/berkeley-ert.py | 164 ---------- .../roofline/gpp_nvprof_roofline.py | 135 -------- .../roofline/intel_advisor_roofline.py | 276 ---------------- .../roofline/intel_sde_roofline.py | 129 -------- .../roofline/intel_vtune_roofline.py | 298 ------------------ 7 files changed, 1318 deletions(-) delete mode 100644 cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-nvprof.py delete mode 100644 cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-serial.py delete mode 100644 cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert.py delete mode 100644 cscs-checks/tools/profiling_and_debugging/roofline/gpp_nvprof_roofline.py delete mode 100644 cscs-checks/tools/profiling_and_debugging/roofline/intel_advisor_roofline.py delete mode 100644 cscs-checks/tools/profiling_and_debugging/roofline/intel_sde_roofline.py delete mode 100644 cscs-checks/tools/profiling_and_debugging/roofline/intel_vtune_roofline.py diff --git a/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-nvprof.py b/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-nvprof.py deleted file mode 100644 index 7bfc7c64fa..0000000000 --- a/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-nvprof.py +++ /dev/null @@ -1,148 +0,0 @@ -import os - -import reframe as rfm -import reframe.utility.sanity as sn - - -@rfm.parameterized_test( - *[[gpudims, flop, repeat] - # gpudims sets (gpu_blocks, gpu_threads): - for gpudims in [(112, 1024), (224, 512), (448, 256), (896, 128), - (1792, 64), (3584, 32)] - for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024] - # self.repeat replaces '-DERT_NUM_EXPERIMENTS=2': - for repeat in [1, 2]]) -class ErtP100Test(rfm.RegressionTest): - ''' - The Empirical Roofline Tool, ERT, empirically generates roofline data: - https://bitbucket.org/berkeleylab/cs-roofline-toolkit/ - - This test checks the ERT tool with NVIDIA Tesla P100-PCIE-16GB: - Device 0: "Tesla P100-PCIE-16GB" - CUDA Driver Version / Runtime Version 10.1 / 10.0 - CUDA Capability Major/Minor version number: 6.0 - (56) Multiprocessors, (64) CUDA Cores/MP: 3584 CUDA Cores - GPU Max Clock rate: 1329 MHz (1.33 GHz) - Theoretical peak performance per GPU: 4761 Gflop/s - Maximum number of threads per multiprocessor: 2048 - Peak number of threads: 114688 threads <--------- - Maximum number of threads per block: 1024 <--------- - NVRM version: NVIDIA UNIX x86_64 Kernel Module 418.39 - - # The following python code can help for a parameter space study: - # (use --exec-policy=async) - max_threads_per_block = 1024 - max_threads = 114688 - gpu_threads = max_threads_per_block * 2 - while gpu_threads > 32: - gpu_threads = gpu_threads // 2 - gpu_blocks = max_threads // gpu_threads - nth = gpu_threads * gpu_blocks - print('{} {} {} {}'.format(gpu_blocks, gpu_threads, nth, max_threads)) - ''' - def __init__(self, gpudims, flop, repeat): - super().__init__() - max_gpu_blocks = 3584 - max_flops = 1024 - max_repeat = 2 - self.descr = 'Empirical Roofline Toolkit' - self.valid_systems = ['dom:gpu'] - self.valid_prog_environs = ['PrgEnv-gnu'] - self.modules = ['craype-accel-nvidia60'] - self.sourcesdir = os.path.join(self.current_system.resourcesdir, - 'roofline', 'ert') - # A single input file is required for nvcc to work: - self.build_system = 'SingleSource' - self.prebuild_cmd = [ - 'cat kernel1.c driver1.c | sed "s-^#if ERT-#ifdef ERT-g" > ' - '_gpu.cu'] - self.sourcepath = '_gpu.cu' - self.executable = 'ert.exe' - self.build_system.cppflags = [ - # ERT_FLOPS = -DERT_FLOP ! - '-DERT_FLOP=%s' % flop, - '-DERT_ALIGN=32', - # 1G = 1024^3 = 1073741824: - '-DERT_MEMORY_MAX=1073741824', - # ERT_GPU True: - '-DERT_GPU', - '-DERT_TRIALS_MIN=1', - '-DERT_WORKING_SET_MIN=128', - # '-x cu' explicitly sets the language (cuda) for the src files. - ] - self.build_system.ldflags = ['-O3'] - self.maintainers = ['JG'] - self.tags = {'scs', 'external-resources'} - gpu_blocks, gpu_threads = gpudims - self.name = 'ertgpu_Run.{}_FLOPS.{}_GPUBlocks.{}_GPUThreads.{}'.format( - repeat, flop, gpu_blocks, gpu_threads) - self.exclusive = True - self.num_tasks = 1 - self.num_tasks_per_node = 1 - self.num_cpus_per_task = 1 - self.num_tasks_per_core = 1 - self.use_multithreading = False - self.variables = { - 'CRAYPE_LINK_TYPE': 'dynamic', - 'OMP_NUM_THREADS': str(self.num_cpus_per_task) - } - self.executable_opts = [str(gpu_blocks), str(gpu_threads)] - self.rpt = '%s.rpt' % self.executable - # Reference roofline boundaries for NVIDIA Tesla P100-PCIE-16GB: - GFLOPs = 4355.0 - # Keeping for future reference: - # L1bw = 1724.0 - # L2bw = 855.0 - # L3bw = 547.0 - DRAMbw = 521.0 - self.roofline_rpt = 'rpt' - # use the latest job to generate the roofline rpt: - if (gpu_blocks == max_gpu_blocks and flop == max_flops and - repeat == max_repeat): - self.post_run = [ - 'cat *_job.out | python2 preprocess.py > pre', - 'python2 maximum.py < pre > max', - 'python2 summary.py < max > sum', - # give enough time for all the dependent jobs to collect data: - 'sleep 60', - 'cat ../ertgpu_Run*/sum | python2 roofline.py > rpt', - ] - self.sanity_patterns = sn.all([ - # --- check data type: - sn.assert_eq(sn.extractsingle( - r'^\s+(?P\w+) \*\s+buf = \(\w+ \*\)' - r'_mm_malloc\(PSIZE, ERT_ALIGN\);', 'driver1.c', 'prec'), - 'double'), - # --- check ert's roofline results. Typical output is: - # 4355.20 GFLOPs EMP - # META_DATA - # GPU_BLOCKS 1792 - # FLOPS 1024 - # GPU_THREADS 64 - # - # 1723.95 L1 EMP - # 521.29 DRAM EMP - # - # check GFLOPS: - sn.assert_reference(sn.extractsingle( - r'(?P\d+.\d+)\sGFLOPs EMP', self.roofline_rpt, - 'GFLOPs', float), GFLOPs, -0.1, 0.5), - # check L1 bandwidth: - # https://cug.org/proceedings/protected/cug2019_proceedings/ - # includes/files/pap103s2-file1.pdf: - # "ERT fails to identify the L1 cache" - # sn.assert_reference(sn.extractsingle( - # r'(?P\d+.\d+)\sL1 EMP', self.roofline_rpt, - # 'L1bw', float), L1bw, -0.1, 0.3), - # check DRAM bandwidth: - sn.assert_reference(sn.extractsingle( - r'(?P\d+.\d+) DRAM EMP', self.roofline_rpt, - 'DRAMbw', float), DRAMbw, -0.1, 0.3), - ]) - else: - self.post_run = [ - 'cat *_job.out | python2 preprocess.py > pre', - 'python2 maximum.py < pre > max', - 'python2 summary.py < max > sum', - ] - self.sanity_patterns = sn.assert_found('GFLOPs', 'sum') diff --git a/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-serial.py b/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-serial.py deleted file mode 100644 index 5ad27930fc..0000000000 --- a/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert-serial.py +++ /dev/null @@ -1,168 +0,0 @@ -import os -import reframe as rfm -import reframe.utility.sanity as sn - - -# {{{ base -class ErtTestBase(rfm.RegressionTest): - """ - The Empirical Roofline Tool, ERT, automatically generates roofline data. - https://bitbucket.org/berkeleylab/cs-roofline-toolkit/ - """ - - def __init__(self): - self.descr = 'Empirical Roofline Toolkit' - self.sourcesdir = os.path.join(self.current_system.resourcesdir, - 'roofline', 'ert') - self.build_system = 'SingleSource' - self.sourcepath = 'kernel1.c driver1.c' - self.executable = 'ert.exe' - self.build_system.ldflags = ['-O3', '-fopenmp'] - self.sourcesdir = os.path.join(self.current_system.resourcesdir, - 'roofline', 'ert') - self.rpt = '%s.rpt' % self.executable - self.maintainers = ['JG'] - self.tags = {'scs', 'external-resources'} - - def setup(self, partition, environ, **job_opts): - super().setup(partition, environ, **job_opts) - if self.num_tasks != 36: - self.job.launcher.options = ['--cpu-bind=verbose,none'] -# }}} - - -# {{{ test -@rfm.parameterized_test(*[[num_ranks, flop] - for num_ranks in [1] - for flop in [256, 512, 1024]]) -class ErtBroadwellTest(ErtTestBase): - def __init__(self, num_ranks, flop): - super().__init__() - ompthread = 1 - self.valid_systems = ['daint:mc', 'dom:mc'] - self.valid_prog_environs = ['PrgEnv-gnu'] - self.build_system.cppflags = [ - '-DERT_FLOP=%s' % flop, - '-DERT_ALIGN=32', - '-DERT_MEMORY_MAX=1073741824', - '-DERT_MPI=True', - '-DERT_OPENMP=True', - '-DERT_TRIALS_MIN=1', - '-DERT_WORKING_SET_MIN=1', - ] - self.name = 'ert_serial_FLOPS.{:04d}_MPI.{:03d}_OpenMP.{:03d}'.format( - flop, num_ranks, ompthread) - self.exclusive = True - self.num_tasks = num_ranks - self.num_tasks_per_node = num_ranks - self.num_cpus_per_task = ompthread - self.num_tasks_per_core = 1 - self.use_multithreading = False - self.variables = { - 'CRAYPE_LINK_TYPE': 'dynamic', - 'OMP_NUM_THREADS': str(self.num_cpus_per_task) - } - - # take the "slowest" job, make it sleep after it has ended and hope the - # other jobs have ended too - # TODO: find a better way to wait for the other jobs to end - num_ranks_min = 1 - flop_min = 1024 - self.roofline_rpt = 'rpt' - if num_ranks == num_ranks_min and flop == flop_min: - self.post_run = [ - 'cat *_job.out | python2 preprocess.py > pre', - 'python2 maximum.py < pre > max', - 'python2 summary.py < max > sum', - # give enough time for all the dependent jobs to collect data: - 'sleep 60', - 'cat ../ert_serial_FLOPS*/sum | python2 roofline.py > rpt', - ] - - else: - self.post_run = [ - 'cat *_job.out | python2 preprocess.py > pre', - 'python2 maximum.py < pre > max', - 'python2 summary.py < max > sum', - ] - - # --- Sanity check: - regex_datatype = (r'^\s+(?P\w+) \* __restrict__ buf = ' - r'\(\w+ \*\)malloc\(PSIZE\);') - datatype = sn.extractsingle(regex_datatype, 'driver1.c', 'type') - self.sanity_patterns = sn.all([ - sn.assert_found('GFLOPs', 'sum'), - sn.assert_eq(datatype, 'double'), - ]) - - # --- Performance check: - if num_ranks == num_ranks_min and flop == flop_min: - # Reference roofline boundaries for Intel BroadwellCPU (E5-2695v4): - ref_GFLOPs = 945.0 - ref_L1bw = 1788.0 - ref_L2bw = 855.0 - ref_L3bw = 547.0 - ref_DRAMbw = 70.5 - - # Typical performance report looks like: - # -------------------------------------- - # ert_FLOPS.1024_MPI.001_OpenMP.036/rpt - # 908.43 GFLOPs EMP - # ****** - # META_DATA - # OPENMP_THREADS 1 - # FLOPS 8 - # MPI_PROCS 36 - # - # 5647.33 L1 EMP - # ******* - # 3203.86 L2 EMP - # ******* - # 1773.58 L3 EMP - # ******* - # 139.56 L4 EMP - # 103.50 DRAM EMP - # ****** - # META_DATA - # FLOPS 2 - # OPENMP_THREADS 1 - # MPI_PROCS 36 - regex_gflops = r'(?P\d+.\d+)\sGFLOPs EMP' - regex_L1bw = r'(?P\d+.\d+)\sL1 EMP' - regex_L2bw = r'(?P\d+.\d+)\sL2 EMP' - regex_L3bw = r'(?P\d+.\d+)\sL3 EMP' - regex_DRAMbw = r'(?P\d+.\d+) DRAM EMP' - - gflops = sn.extractsingle(regex_gflops, self.roofline_rpt, - 'GFLOPs', float) - L1bw = sn.extractsingle(regex_L1bw, self.roofline_rpt, - 'L1bw', float) - L2bw = sn.extractsingle(regex_L2bw, self.roofline_rpt, - 'L2bw', float) - L3bw = sn.extractsingle(regex_L3bw, self.roofline_rpt, - 'L3bw', float) - DRAMbw = sn.extractsingle(regex_DRAMbw, self.roofline_rpt, - 'DRAMbw', float) - - # --performance-report: - self.perf_patterns = { - 'gflops': gflops, - 'L1bw': L1bw, - 'L2bw': L2bw, - 'L3bw': L3bw, - 'DRAMbw': DRAMbw, - } - - self.reference = { - '*': { - 'gflops': (ref_GFLOPs, -0.1, 0.5, 'GF/s'), - 'L1bw': (ref_L1bw, -0.1, 0.3, 'GB/s'), - 'L2bw': (ref_L2bw, -0.1, 0.3, 'GB/s'), - 'L3bw': (ref_L3bw, -0.1, 0.3, 'GB/s'), - 'DRAMbw': (ref_DRAMbw, -0.1, 0.3, 'GB/s'), - } - } - - # else: - -# }}} diff --git a/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert.py b/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert.py deleted file mode 100644 index e9cef9be20..0000000000 --- a/cscs-checks/tools/profiling_and_debugging/roofline/berkeley-ert.py +++ /dev/null @@ -1,164 +0,0 @@ -import os - -import reframe as rfm -import reframe.utility.sanity as sn - - -class ErtTestBase(rfm.RegressionTest): - ''' - The Empirical Roofline Tool, ERT, automatically generates roofline data. - https://bitbucket.org/berkeleylab/cs-roofline-toolkit/ - ''' - - def __init__(self): - super().__init__() - self.descr = 'Empirical Roofline Toolkit' - self.sourcesdir = os.path.join(self.current_system.resourcesdir, - 'roofline', 'ert') - self.build_system = 'SingleSource' - self.sourcepath = 'kernel1.c driver1.c' - self.executable = 'ert.exe' - self.build_system.ldflags = ['-O3', '-fopenmp'] - self.sourcesdir = os.path.join(self.current_system.resourcesdir, - 'roofline', 'ert') - self.rpt = '%s.rpt' % self.executable - self.maintainers = ['JG'] - self.tags = {'scs', 'external-resources'} - - def setup(self, partition, environ, **job_opts): - super().setup(partition, environ, **job_opts) - if self.num_tasks != 36: - self.job.launcher.options = ['--cpu-bind=verbose,none'] - - -@rfm.parameterized_test( - *[[num_ranks, flop] - for num_ranks in [36, 18, 12, 9, 6, 4, 3, 2, 1] - for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]]) -class ErtBroadwellTest(ErtTestBase): - def __init__(self, num_ranks, flop): - super().__init__() - ompthread = 36 // num_ranks - self.valid_systems = ['daint:mc', 'dom:mc'] - self.valid_prog_environs = ['PrgEnv-gnu'] - self.build_system.cppflags = [ - '-DERT_FLOP=%s' % flop, - '-DERT_ALIGN=32', - '-DERT_MEMORY_MAX=1073741824', - '-DERT_MPI=True', - '-DERT_OPENMP=True', - '-DERT_TRIALS_MIN=1', - '-DERT_WORKING_SET_MIN=1', - ] - self.name = 'ert_FLOPS.{:04d}_MPI.{:03d}_OpenMP.{:03d}'.format( - flop, num_ranks, ompthread) - self.exclusive = True - self.num_tasks = num_ranks - self.num_tasks_per_node = num_ranks - self.num_cpus_per_task = ompthread - self.num_tasks_per_core = 1 - self.use_multithreading = False - self.variables = { - 'CRAYPE_LINK_TYPE': 'dynamic', - 'OMP_NUM_THREADS': str(self.num_cpus_per_task) - } - - # take the "slowest" job, make it sleep after it has ended and hope the - # other jobs have ended too - # TODO: find a better way to wait for the other jobs to end - num_ranks_min = 1 - flop_min = 1024 - self.roofline_rpt = 'rpt' - if num_ranks == num_ranks_min and flop == flop_min: - self.post_run = [ - 'cat *_job.out | python2 preprocess.py > pre', - 'python2 maximum.py < pre > max', - 'python2 summary.py < max > sum', - # give enough time for all the dependent jobs to collect data: - 'sleep 60', - 'cat ../ert_FLOPS*/sum | python2 roofline.py > rpt', - ] - - else: - self.post_run = [ - 'cat *_job.out | python2 preprocess.py > pre', - 'python2 maximum.py < pre > max', - 'python2 summary.py < max > sum', - ] - - # --- Sanity check: - regex_datatype = (r'^\s+(?P\w+) \* __restrict__ buf = ' - r'\(\w+ \*\)malloc\(PSIZE\);') - datatype = sn.extractsingle(regex_datatype, 'driver1.c', 'type') - self.sanity_patterns = sn.all([ - sn.assert_found('GFLOPs', 'sum'), - sn.assert_eq(datatype, 'double'), - ]) - - # --- Performance check: - if num_ranks == num_ranks_min and flop == flop_min: - # Reference roofline boundaries for Intel BroadwellCPU (E5-2695v4): - ref_GFLOPs = 945.0 - ref_L1bw = 1788.0 - ref_L2bw = 855.0 - ref_L3bw = 547.0 - ref_DRAMbw = 70.5 - - # Typical performance report looks like: - # -------------------------------------- - # ert_FLOPS.1024_MPI.001_OpenMP.036/rpt - # 908.43 GFLOPs EMP - # ****** - # META_DATA - # OPENMP_THREADS 1 - # FLOPS 8 - # MPI_PROCS 36 - # - # 5647.33 L1 EMP - # ******* - # 3203.86 L2 EMP - # ******* - # 1773.58 L3 EMP - # ******* - # 139.56 L4 EMP - # 103.50 DRAM EMP - # ****** - # META_DATA - # FLOPS 2 - # OPENMP_THREADS 1 - # MPI_PROCS 36 - regex_gflops = r'(?P\d+.\d+)\sGFLOPs EMP' - regex_L1bw = r'(?P\d+.\d+)\sL1 EMP' - regex_L2bw = r'(?P\d+.\d+)\sL2 EMP' - regex_L3bw = r'(?P\d+.\d+)\sL3 EMP' - regex_DRAMbw = r'(?P\d+.\d+) DRAM EMP' - - gflops = sn.extractsingle(regex_gflops, self.roofline_rpt, - 'GFLOPs', float) - L1bw = sn.extractsingle(regex_L1bw, self.roofline_rpt, - 'L1bw', float) - L2bw = sn.extractsingle(regex_L2bw, self.roofline_rpt, - 'L2bw', float) - L3bw = sn.extractsingle(regex_L3bw, self.roofline_rpt, - 'L3bw', float) - DRAMbw = sn.extractsingle(regex_DRAMbw, self.roofline_rpt, - 'DRAMbw', float) - - # --performance-report: - self.perf_patterns = { - 'gflops': gflops, - 'L1bw': L1bw, - 'L2bw': L2bw, - 'L3bw': L3bw, - 'DRAMbw': DRAMbw, - } - - self.reference = { - '*': { - 'gflops': (ref_GFLOPs, -0.1, 0.5, 'GF/s'), - 'L1bw': (ref_L1bw, -0.1, 0.3, 'GB/s'), - 'L2bw': (ref_L2bw, -0.1, 0.3, 'GB/s'), - 'L3bw': (ref_L3bw, -0.1, 0.3, 'GB/s'), - 'DRAMbw': (ref_DRAMbw, -0.1, 0.3, 'GB/s'), - } - } diff --git a/cscs-checks/tools/profiling_and_debugging/roofline/gpp_nvprof_roofline.py b/cscs-checks/tools/profiling_and_debugging/roofline/gpp_nvprof_roofline.py deleted file mode 100644 index 305f49a9bd..0000000000 --- a/cscs-checks/tools/profiling_and_debugging/roofline/gpp_nvprof_roofline.py +++ /dev/null @@ -1,135 +0,0 @@ -import reframe as rfm -import reframe.utility.sanity as sn - - -class GPPBaseTest(rfm.RegressionTest): - '''This test checks the values reported by NVIDIA nvprof for roofline - modeling: - - https://github.com/cyanguwa/nersc-roofline/tree/master/GPP - (compile.survey and run.survey) - - https://cug.org/proceedings/protected/cug2019_proceedings/includes/ - files/pap103s2-file1.pdf - ''' - def __init__(self): - super().__init__() - self.descr = 'Roofline Analysis of the GPP code using NVIDIA nvprof' - self.sourcesdir = 'https://github.com/cyanguwa/nersc-roofline.git' - self.build_system = 'Make' - self.build_system.cxx = 'nvcc' - self.maintainers = ['JG'] - self.tags = {'scs'} - - @property - @sn.sanity_function - def flops(self): - flop_count_dp_avg = sn.extractsingle( - r'^.*flop_count_dp\s+Floating Point Operations\(Double Precision\)' - r'\s+.*(?P\d\.\d+e\+\d+)$', self.stderr, 'x', float) - # print("#debug: flop_count_dp_avg={}".format(flop_count_dp_avg)) - return flop_count_dp_avg - - @property - @sn.sanity_function - def gflops_per_seconds(self): - sec = sn.extractsingle( - r'^\*+\sKernel Time Taken\s\*+=\s(?P\d+.\d+)\ssecs', - self.stdout, 'sec', float) - # print("#debug: sec={}".format(sec)) - # print("#debug: flops={}".format(self.flops)) - # print("#debug: gflops_per_seconds={}".format(self.flops/(sec*10**9))) - return (self.flops / (sec*10**9)) - - @property - @sn.sanity_function - def hbm_bytes(self): - dram_read_transactions_avg = sn.extractsingle( - r'^.*dram_read_transactions\s+Device Memory Read Transactions\s+.*' - r'(?P\d\.\d+e\+\d+)$', self.stderr, 'x', float) - dram_write_transactions_avg = sn.extractsingle( - r'^.*dram_write_transactions\s+Device Memory Write Transactions\s+' - r'\d+\s+\d+\s+(?P\d+)$', self.stderr, 'x', float) - transactions_size = 32.0 - bytes = dram_read_transactions_avg + dram_write_transactions_avg - bytes = bytes * transactions_size - # print("#debug: dram_read_avg={}".format(dram_read_transactions_avg)) - # print("#debug: dram_wr_avg={}".format(dram_write_transactions_avg)) - # print("#debug: hbm_bytes={}".format(bytes)) - return bytes - - @property - @sn.sanity_function - def arithmetic_intensity(self): - # print("#debug: ai={}".format(self.flops/self.hbm_bytes)) - return (self.flops / self.hbm_bytes) - - -@rfm.parameterized_test(*[[iw, repeat, cache] - for iw in [6] - for repeat in [1, 2] - for cache in ['HBM']]) -# To reproduce published results (on V100): -# for iw in [1, 2, 3, 4, 5, 6] -# for repeat in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] -# for cache in ['L1', 'L2', 'HBM']]) -class P100Test(GPPBaseTest): - ''' Counters for Pascal P100 GPU: - userguide = 'https://docs.nvidia.com/cuda/profiler-users-guide' - metrics = '%s/index.html#metrics-reference-6x' % userguide - ''' - def __init__(self, iw, repeat, cache): - super().__init__() - self.name = 'roofline_gpp_P100_iw{}_repeat{}_{}cache'.format( - iw, repeat, cache) - self.valid_systems = ['dom:gpu'] - self.valid_prog_environs = ['PrgEnv-gnu'] - self.modules = ['craype-accel-nvidia60'] - self.prebuild_cmd = [ - 'cd GPP/Volta', - # Pascal P100 GPU: - 'sed -i "s-sm_70-sm_60-" Makefile', - # fma (fmad=true) vs nofma (fmad=false): - 'sed -i "s/fmad=.*/fmad=true/g" Makefile', - # iw (loop size): - 'sed -i "s/#define nend.*/#define nend %s/g" GPUComplex.h' % iw, - ] - self.executable = './fma_iw{}_rep{}_{}.exe'.format(iw, repeat, cache) - self.build_system.options = ['EXE=../../%s' % self.executable] - # 1: 2: 3: - # 4: 5: - self.executable_opts = ['512', '2', '32768', '20', '0'] - self.exclusive = True - self.num_tasks = 1 - self.num_tasks_per_node = 1 - self.num_cpus_per_task = 1 - self.num_tasks_per_core = 1 - self.use_multithreading = False - self.variables = { - 'CRAYPE_LINK_TYPE': 'dynamic', - 'OMP_NUM_THREADS': str(self.num_cpus_per_task) - } - self.nvprof_metrics = { - 'L1': ['flop_count_dp', 'gld_transactions', 'gst_transactions', - 'atomic_transactions', 'local_load_transactions', - 'local_store_transactions', 'shared_load_transactions', - 'shared_store_transactions'], - 'L2': ['flop_count_dp', 'l2_read_transactions', - 'l2_write_transactions'], - 'HBM': ['flop_count_dp', 'dram_read_transactions', - 'dram_write_transactions'], - 'PCIe/NVLINK': ['flop_count_dp', 'system_read_transactions', - 'system_write_transactions'] - } - sep = ' --metrics ' - nvmetrics = sep.join(self.nvprof_metrics[cache]) - self.post_run = [ - 'nvprof --kernels "NumBandNgpown_kernel" --metrics %s %s %s' % - (nvmetrics, self.executable, ' '.join(self.executable_opts)) - ] - # References for Nvidia P100 (HBM, iw=6): - gflops = 2796.6 - ai = 13.6 - self.sanity_patterns = sn.all([ - sn.assert_found('P100-PCIE-16GB', self.stderr), - sn.assert_reference(self.gflops_per_seconds, gflops, -0.5, 0.5), - sn.assert_reference(self.arithmetic_intensity, ai, -0.5, 0.5), - ]) diff --git a/cscs-checks/tools/profiling_and_debugging/roofline/intel_advisor_roofline.py b/cscs-checks/tools/profiling_and_debugging/roofline/intel_advisor_roofline.py deleted file mode 100644 index 5eb3e82580..0000000000 --- a/cscs-checks/tools/profiling_and_debugging/roofline/intel_advisor_roofline.py +++ /dev/null @@ -1,276 +0,0 @@ -import os - -import reframe as rfm -import reframe.utility.sanity as sn - - -@rfm.parameterized_test(*[[repeat, toolversion, datalayout] - for repeat in ['100000'] - for toolversion in ['597843'] - for datalayout in ['G3_AOS_SCALAR', 'G3_SOA_SCALAR', - 'G3_AOS_VECTOR', 'G3_SOA_VECTOR'] - ]) -class IntelRooflineAdvisorTest(rfm.RegressionTest): - '''This test checks the values reported by Intel Advisor's roofline model: - https://software.intel.com/en-us/intel-advisor-xe - - The roofline model is based on GFLOPS and Arithmetic Intensity (AI): - "Self GFLOPS" = "Self GFLOP" / "Self Elapsed Time" - "Self GB/s" = "Self Memory GB" / "Self Elapsed Time" - "Self AI" = "Self GFLOPS" / "Self GB/s" - - While a roofline analysis flag exists ('advixe-cl -collect roofline'), it - may not be used to collect data on MPI applications; in that case, the - survey and flops analysis must be collected separately: first run a survey - analysis ('advixe-cl -collect survey') and then run a tripcounts+flops - analysis ('advixe-cl -collect tripcounts -flop') using the same project - directory for both steps. - - Example result on 1 core of Intel Broadwell CPU (E5-2695 v4): - G3_AOS_SCALAR: gflops, 2.79 arithmetic_intensity', 0.166 380ms <- slow - G3_AOS_VECTOR: gflops, 3.79 arithmetic_intensity', 0.125 143ms - G3_SOA_SCALAR: gflops, 2.79 arithmetic_intensity', 0.166 351ms - G3_SOA_VECTOR: gflops, 10.62 arithmetic_intensity', 0.166 57ms <- fast - ''' - - def __init__(self, repeat, toolversion, datalayout): - self.descr = 'Roofline Analysis test with Intel Advisor' - # for reference: advisor/2019 was failing on dom with: - # "Exceeded job memory limit" (webrt#36087) - self.valid_systems = ['daint:mc', 'dom:mc'] - # Reporting MFLOPS is not available on Intel Haswell cpus, see - # https://www.intel.fr/content/dam/www/public/us/en/documents/manuals/ - # 64-ia-32-architectures-software-developer-vol-1-manual.pdf - self.valid_prog_environs = ['PrgEnv-intel'] - self.modules = ['advisor/2019_update4'] - # Testing with advisor/2018 (build 551025) fails with: - # roof.dir/nid00753.000/trc000/trc000.advixe - # Application exit code: 139 - self.sourcesdir = os.path.join(self.current_system.resourcesdir, - 'roofline', 'intel_advisor') - self.build_system = 'SingleSource' - self.sourcepath = '_roofline.cpp' - self.executable = 'advixe-cl' - self.target_executable = './roof.exe' - self.build_system.cppflags = ['-D_ADVISOR', - '-I$ADVISOR_2019_DIR/include'] - self.prgenv_flags = { - 'PrgEnv-intel': ['-g', '-O2', '-std=c++11', '-restrict'], - } - self.build_system.ldflags = ['-L$ADVISOR_2019_DIR/lib64 -littnotify'] - self.roofline_rpt = '%s.rpt' % self.target_executable - self.version_rpt = 'Intel_Advisor_version.rpt' - self.roofline_ref = 'Intel_Advisor_roofline_reference.rpt' - self.prebuild_cmd = [ - 'patch -s < ADVISOR/roofline_template.patch', - 'sed -e "s-XXXX-%s-" -e "s-YYYY-%s-" %s &> %s' % - (repeat, datalayout, 'roofline_template.cpp', '_roofline.cpp') - ] - self.exclusive = True - self.num_tasks = 1 - self.num_tasks_per_node = 1 - self.num_cpus_per_task = 1 - self.num_tasks_per_core = 1 - self.use_multithreading = False - self.variables = { - 'OMP_NUM_THREADS': str(self.num_cpus_per_task), - 'CRAYPE_LINK_TYPE': 'dynamic', - } - self.pre_run = [ - 'mv %s %s' % (self.executable, self.target_executable), - 'advixe-cl -help collect | head -20', - ] - self.roofdir = './roof.dir' - self.executable_opts = [ - '--collect survey --project-dir=%s --search-dir src:rp=. ' - '--data-limit=0 --no-auto-finalize --trace-mpi -- %s ' % - (self.roofdir, self.target_executable) - ] - # - Reference roofline boundaries for Intel Broadwell CPU (E5-2695 v4): - L1bw = 293 # *1024**3 - L2bw = 79 # *1024**3 - L3bw = 33 # *1024**3 - DPfmabw = 45*1024**3 - DPaddbw = 12*1024**3 - ScalarAddbw = 3*1024**3 - # --- roofline (memory) boundaries from the tool: - # DRAM Bandwidth (single node) 63206331080 memory - # DRAM Bandwidth 125993278750 memory - # DRAM Bandwidth (single-threaded) 12715570803 memory - # L1 Bandwidth 11360856466728 memory - # Scalar L1 Bandwidth 2648216636280 memory - # L1 bandwidth (single-threaded) 315579346298 memory - # ************ - # Scalar L1 bandwidth (single-threaded) 73561573230 memory - # L2 Bandwidth 3102773429268 memory - # Scalar L2 Bandwidth 921316779936 memory - # L2 bandwidth (single-threaded) 86188150813 memory - # *********** - # Scalar L2 bandwidth (single-threaded) 25592132776 memory - # L3 Bandwidth 1269637300440 memory - # Scalar L3 Bandwidth 845928498744 memory - # L3 bandwidth (single-threaded) 35267702790 memory - # *********** - # Scalar L3 bandwidth (single-threaded) 23498013854 memory - regex_roof_L1 = (r'^L1\sbandwidth\s\(single-threaded\)\s+(?P\d+)' - r'\s+memory$') - regex_roof_L2 = (r'^L2\sbandwidth\s\(single-threaded\)\s+(?P\d+)' - r'\s+memory$') - regex_roof_L3 = (r'^L3\sbandwidth\s\(single-threaded\)\s+(?P\d+)' - r'\s+memory$') - roof_L1 = sn.round(sn.extractsingle(regex_roof_L1, self.roofline_ref, - 'L1bw', int) / 1024**3, 2) - roof_L2 = sn.round(sn.extractsingle(regex_roof_L2, self.roofline_ref, - 'L2bw', int) / 1024**3, 3) - roof_L3 = sn.round(sn.extractsingle(regex_roof_L3, self.roofline_ref, - 'L3bw', int) / 1024**3, 3) - - # --- roofline (compute) boundaries from the tool: - # SP Vector FMA Peak 2759741518342 compute - # SP Vector FMA Peak (single-threaded) 98956234406 compute - # DP Vector FMA Peak 1379752337990 compute - # DP Vector FMA Peak (single-threaded) 49563336304 compute - # *********** - # Scalar Add Peak 93438527464 compute - # Scalar Add Peak (single-threaded) 3289577753 compute - # ********** - # SP Vector Add Peak 689944922272 compute - # SP Vector Add Peak (single-threaded) 24691445241 compute - # DP Vector Add Peak 344978547363 compute - # DP Vector Add Peak (single-threaded) 12385333008 compute - # *********** - # Integer Scalar Add Peak 228677310757 compute - # Integer Scalar Add Peak (single-threaded) 8055287031 compute - # Int64 Vector Add Peak 747457604632 compute - # Int64 Vector Add Peak (single-threaded) 26300241032 compute - # Int32 Vector Add Peak 1494880413924 compute - # Int32 Vector Add Peak (single-threaded) 52738180380 compute - regex_roof_dpfma = (r'^DP Vector FMA Peak\s\(single-threaded\)\s+' - r'(?P\d+)\s+compute$') - regex_roof_dpadd = (r'^DP Vector Add Peak\s\(single-threaded\)\s+' - r'(?P\d+)\s+compute$') - regex_roof_scalaradd = (r'^Scalar Add Peak\s\(single-threaded\)\s+' - r'(?P\d+)\s+compute$') - roof_dpfma = sn.extractsingle(regex_roof_dpfma, self.roofline_ref, - 'DPfmabw', int) - roof_dpadd = sn.extractsingle(regex_roof_dpadd, self.roofline_ref, - 'DPaddbw', int) - roof_scalaradd = sn.extractsingle(regex_roof_scalaradd, - self.roofline_ref, 'ScalarAddbw', - int) - - # - API output: - # ('self_elapsed_time', 0.1) - # ('self_memory_gb', 4.2496) - # ('self_gb_s', 42.496) - # ('self_gflop', 0.5312) - # ('self_gflops', 5.312) - # ('self_arithmetic_intensity', 0.125) - # ('_self_gb_s', 42.495999999999995, 42.496) - # ('_self_gflops', 5.311999999999999, 5.312) - # ('_self_arithmetic_intensity', 0.125, 0.125) - # ('gap _self_gb_s', -7.105427357601002e-15) - # ('gap _self_gflops', -8.881784197001252e-16) - # ('gap _self_arithmetic_intensity', 0.0) - # returned AI gap = 0.0000000000000000 - # returned GFLOPS gap = -0.0000000000000009 - regex_ai_gap = r'^returned\sAI\sgap\s=\s(?P.*)' - regex_ai_gflops = r'^returned\sGFLOPS\sgap\s=\s(?P.*)' - ai_gap = sn.extractsingle(regex_ai_gap, self.roofline_rpt, 'Intensity', - float) - ai_gflops = sn.extractsingle(regex_ai_gflops, self.roofline_rpt, - 'Flops', float) - - regex_toolversion = r'I*.\(build\s(?P\d+)\s*.' - found_toolversion = sn.extractsingle(regex_toolversion, - self.version_rpt, 'version') - self.sanity_patterns = sn.all([ - # check the job status: - sn.assert_found('loop complete.', self.stdout), - # check the tool's version (2019=591264, 2018=551025): - sn.assert_eq(found_toolversion, toolversion), - # --- roofline boundaries: - # check --report=roofs (L1, L2 and L3 bandwidth): - # sn.assert_reference(roof_L1, L1bw, -0.12, 0.08), - # sn.assert_reference(roof_L2, L2bw, -0.12, 0.08), - # sn.assert_reference(roof_L3, L3bw, -0.12, 0.08), - # check --report=roofs (DP FMA, DP Add and Scalar Add): - sn.assert_reference(roof_dpfma, DPfmabw, -0.12, 0.08), - sn.assert_reference(roof_dpadd, DPaddbw, -0.12, 0.08), - sn.assert_reference(roof_scalaradd, ScalarAddbw, -0.12, 0.08), - # --- check Arithmetic_intensity: - sn.assert_reference(ai_gap, 0.0, -0.01, 0.01), - # --- check GFLOPS: - sn.assert_reference(ai_gflops, 0.0, -0.01, 0.01), - ]) - - # --performance-report: - regex_mseconds = r'elapsed time: (?P\d+)ms' - regex_ai = r'^\(\'self_arithmetic_intensity\', (?P\d+.\d+)\)' - regex_gbs = r'^\(\'self_gb_s\', (?P\d+.\d+)\)' - regex_gflops = r'^\(\'self_gflops\', (?P\d+.\d+)\)' - mseconds = sn.extractsingle(regex_mseconds, self.stdout, - 'msec', int) - arithmetic_intensity = sn.extractsingle(regex_ai, self.roofline_rpt, - 'AI', float) - bandwidth = sn.extractsingle(regex_gbs, self.roofline_rpt, - 'gbs', float) - gflops = sn.extractsingle(regex_gflops, self.roofline_rpt, - 'gflops', float) - self.perf_patterns = { - 'Elapsed': mseconds, - 'ArithmeticIntensity': arithmetic_intensity, - 'GFlops': gflops, - 'Bandwidth': bandwidth, - 'roof_L1': roof_L1, - 'roof_L2': roof_L2, - 'roof_L3': roof_L3, - } - self.reference = { - '*': { - 'Elapsed': (0, None, None, 'ms'), - 'ArithmeticIntensity': (0, None, None, ''), - 'GFlops': (0, None, None, 'GFLOPs/s'), - 'Bandwidth': (0, None, None, 'GB/s'), - 'roof_L1': (L1bw, -0.12, 0.08, 'GB/s'), - 'roof_L2': (L2bw, -0.12, 0.08, 'GB/s'), - 'roof_L3': (L3bw, -0.12, 0.08, 'GB/s'), - } - } - - self.maintainers = ['JG', 'MKr'] - self.tags = {'external-resources'} - - def setup(self, partition, environ, **job_opts): - super().setup(partition, environ, **job_opts) - environ_name = self.current_environ.name - prgenv_flags = self.prgenv_flags[environ_name] - self.build_system.cxxflags = prgenv_flags - launcher_cmd = ' '.join(self.job.launcher.command(self.job)) - self.post_run = [ - # --- collecting the performance data for the roofline model is a 2 - # steps process: - '%s %s --collect tripcounts --flop --project-dir=%s ' - '--search-dir src:rp=. --data-limit=0 --no-auto-finalize ' - '--trace-mpi -- %s' % - (launcher_cmd, self.executable, self.roofdir, - self.target_executable), - # --- check tool's version: - 'advixe-cl -V &> %s' % self.version_rpt, - # "advixe-cl --report" looks for e000/ in the output directory; - # if not found, it will fail with: - # IOError: Survey result cannot be loaded - 'cd %s;ln -s nid* e000;cd -' % self.roofdir, - # --- report reference values/boundaries (roofline_ref): - 'advixe-cl --report=roofs --project-dir=%s &> %s' % - (self.roofdir, self.roofline_ref), - 'python2 API/cscs.py %s &> %s' % (self.roofdir, self.roofline_rpt), - 'touch the_end', - # 'advixe-cl --format=csv' seems to be not working (empty report), - # keeping as reference for a future check: - # 'advixe-cl --show-all-columns -csv-delimiter=";"' - # ' --report=tripcounts --format=csv --project-dir=%s &> %s' - # This can be used instead (see advisor/config/report/roofs.tmpl): - # 'advixe-cl --report custom --report-template ./TEMPL/cscs.tmpl' - # ' --project-dir=%s &> %s' - ] diff --git a/cscs-checks/tools/profiling_and_debugging/roofline/intel_sde_roofline.py b/cscs-checks/tools/profiling_and_debugging/roofline/intel_sde_roofline.py deleted file mode 100644 index ab98a6cefb..0000000000 --- a/cscs-checks/tools/profiling_and_debugging/roofline/intel_sde_roofline.py +++ /dev/null @@ -1,129 +0,0 @@ -import os - -import reframe as rfm -import reframe.utility.sanity as sn - - -@rfm.parameterized_test(*[[repeat, toolsversion, datalayout] - for repeat in ['100000'] - for toolsversion in ['8.35.0'] - for datalayout in ['G3_AOS_SCALAR', 'G3_SOA_SCALAR', - 'G3_AOS_VECTOR', 'G3_SOA_VECTOR'] - ]) -class IntelRooflineSdeTest(rfm.RegressionTest): - '''This test checks the values reported by Intel SDE for roofline modeling: - - https://software.intel.com/en-us/articles/ - intel-software-development-emulator - - https://bitbucket.org/dwdoerf/stream-ai-example/src/master/ - - https://www.nersc.gov/ - users/application-performance/measuring-arithmetic-intensity - ''' - def __init__(self, repeat, toolsversion, datalayout): - super().__init__() - self.descr = 'Roofline Analysis test with Intel SDE' - self.valid_systems = ['dom:mc'] - # Reporting MFLOPS is not available on Intel Haswell cpus, see - # https://www.intel.fr/content/dam/www/public/us/en/documents/manuals/ - # 64-ia-32-architectures-software-developer-vol-1-manual.pdf - self.valid_prog_environs = ['PrgEnv-intel'] - self.sourcesdir = os.path.join(self.current_system.resourcesdir, - 'roofline', 'intel_advisor') - self.build_system = 'SingleSource' - self.sourcepath = '_roofline.cpp' - self.executable = 'sde' - self.target_executable = './roof.exe' - self.sde = '%s.sde' % self.target_executable - self.rpt = '%s.rpt' % self.target_executable - self.prebuild_cmd = [ - 'patch < SDE/roofline_template.patch', - 'sed -e "s-XXXX-%s-" -e "s-YYYY-%s-" %s &> %s' % - (repeat, datalayout, 'roofline_template.cpp', '_roofline.cpp') - ] - self.build_system.cppflags = ['-D_SDE'] - self.build_system.ldflags = ['-g', '-O3', '-qopenmp', '-restrict', - '-qopt-streaming-stores', 'always', - '-std=c++11'] - self.exclusive = True - self.num_tasks = 1 - self.num_tasks_per_node = 1 - self.num_cpus_per_task = 1 - self.num_tasks_per_core = 1 - self.use_multithreading = False - self.variables = { - 'CRAYPE_LINK_TYPE': 'dynamic', - 'OMP_NUM_THREADS': str(self.num_cpus_per_task), - } - exp = '/apps/dom/UES/jenkins/7.0.UP00/mc/easybuild/experimental' - self.pre_run = [ - 'mv %s %s' % (self.executable, self.target_executable), - 'module use %s/modules/all' % exp, - 'module load sde', - 'sde -help' - ] - self.sdeflags = ['%s -d -iform 1 -omix %s -i -top_blocks 500 ' - '-global_region -start_ssc_mark 111:repeat ' - '-stop_ssc_mark 222:repeat -- %s' % - ('-bdw', self.sde, self.target_executable)] - self.executable_opts = self.sdeflags - self.sanity_patterns = sn.assert_found('Total FLOPs =', self.rpt) - self.post_run = ['SDE/parse-sde.sh %s.* &> %s' % (self.sde, self.rpt)] - self.maintainers = ['JG'] - self.tags = {'scs', 'external-resources'} - self.sanity_patterns = sn.all([ - sn.assert_eq(sn.extractsingle( - r'^Intel\(R\) Software Development Emulator\. Version: ' - r'(?P\d+\.\d+\.\d+)', self.stdout, - 'toolsversion'), toolsversion), - ]) - # References for Intel Broadwell CPU (E5-2695 v4): - references = { - 'G3_AOS_SCALAR': { - 'dom:mc': { - 'gflops': (0.596, -0.1, 0.3, 'Gflop/s'), - 'ai': (0.16, -0.05, 0.05, 'flop/byte') - } - }, - 'G3_SOA_SCALAR': { - 'dom:mc': { - 'gflops': (0.612, -0.1, 0.3, 'Gflop/s'), - 'ai': (0.16, -0.05, 0.05, 'flop/byte') - } - }, - 'G3_AOS_VECTOR': { - 'dom:mc': { - 'gflops': (1.152, -0.1, 0.3, 'Gflop/s'), - 'ai': (0.125, -0.05, 0.05, 'flop/byte') - } - }, - 'G3_SOA_VECTOR': { - 'dom:mc': { - 'gflops': (1.125, -0.1, 0.3, 'Gflop/s'), - 'ai': (0.16, -0.05, 0.05, 'flop/byte') - } - }, - } - self.reference = references[datalayout] - self.perf_patterns = { - 'gflops': self.gflops, - 'ai': self.arithmetic_intensity, - } - - @property - @sn.sanity_function - def arithmetic_intensity(self): - flops = sn.extractsingle(r'^--->Total FLOPs = (?P\d+)', - self.rpt, 'flops', int) - bytes = sn.extractsingle(r'^--->Total Bytes = (?P\d+)', - self.rpt, 'bytes', int) - # debug: print('ai={}'.format(flops/bytes)) - return flops/bytes - - @property - @sn.sanity_function - def gflops(self): - flops = sn.extractsingle(r'^--->Total FLOPs = (?P\d+)', - self.rpt, 'flops', int) - msec = sn.extractsingle(r'^elapsed time: (?P\d+)ms', self.stdout, - 'msec', float) - # debug: print('gflops={}'.format(flops/((msec/1000)*10**6))) - return (flops/((msec/1000))/10**9) diff --git a/cscs-checks/tools/profiling_and_debugging/roofline/intel_vtune_roofline.py b/cscs-checks/tools/profiling_and_debugging/roofline/intel_vtune_roofline.py deleted file mode 100644 index ad745a924b..0000000000 --- a/cscs-checks/tools/profiling_and_debugging/roofline/intel_vtune_roofline.py +++ /dev/null @@ -1,298 +0,0 @@ -import os - -import reframe as rfm -import reframe.utility.sanity as sn - - -@rfm.parameterized_test(*[[repeat, toolsversion, datalayout] - for repeat in ['500000'] - for toolsversion in ['597835'] - for datalayout in ['G3_AOS_SCALAR', 'G3_SOA_SCALAR', - 'G3_AOS_VECTOR', 'G3_SOA_VECTOR'] - ]) -class IntelRooflineVtuneTest(rfm.RegressionTest): - '''This test checks the values reported by Vtune for roofline modeling: - https://docs.nersc.gov/programming/performance-debugging-tools/roofline/ - - Example result on 1 core of Intel Broadwell CPU (E5-2695 v4): - G3_AOS_SCALAR: DP GFLOPS: 3.162 Time: 0.854s <-- slow - G3_AOS_VECTOR: DP GFLOPS: 5.731 Time: 0.440s - G3_SOA_SCALAR: DP GFLOPS: 3.183 Time: 0.848s - G3_SOA_VECTOR: DP GFLOPS: 21.423 Time: 0.134s <-- fast - ''' - def __init__(self, repeat, toolsversion, datalayout): - super().__init__() - self.descr = 'Roofline Analysis test with Intel Vtune' - self.debug = False - self.valid_systems = ['dom:mc'] - # Reporting MFLOPS is not available on Intel Haswell cpus, see - # https://www.intel.fr/content/dam/www/public/us/en/documents/manuals/ - # 64-ia-32-architectures-software-developer-vol-1-manual.pdf - self.valid_prog_environs = ['PrgEnv-intel'] - self.modules = ['vtune_amplifier'] - self.sourcesdir = os.path.join(self.current_system.resourcesdir, - 'roofline', 'intel_advisor') - self.build_system = 'SingleSource' - self.sourcepath = '_roofline.cpp' - self.executable = 'amplxe-cl' - self.target_executable = './roof.exe' - self.build_system.cppflags = ['-D_ADVISOR', - '-I$VTUNE_AMPLIFIER_2019_DIR/include'] - self.prgenv_flags = { - 'PrgEnv-intel': ['-g', '-O2', '-std=c++11', '-restrict'], - # TODO: evaluate '-qopt-streaming-stores', 'always', - } - self.build_system.ldflags = ['-L$VTUNE_AMPLIFIER_2019_DIR/lib64', - '-littnotify'] - self.roofline_rpt = '%s.rpt' % self.target_executable - self.version_rpt = 'version.rpt' - self.roofline_ref = 'reference.rpt' - self.prebuild_cmd = [ - 'patch -s < ADVISOR/roofline_template.patch', - 'sed -e "s-XXXX-%s-" -e "s-YYYY-%s-" %s &> %s' % - (repeat, datalayout, 'roofline_template.cpp', '_roofline.cpp') - ] - self.exclusive = True - self.num_tasks = 1 - self.num_tasks_per_node = 1 - self.num_cpus_per_task = 1 - self.num_tasks_per_core = 1 - self.use_multithreading = False - self.variables = { - 'OMP_NUM_THREADS': str(self.num_cpus_per_task), - 'CRAYPE_LINK_TYPE': 'dynamic', - } - self.pre_run = [ - 'mv %s %s' % (self.executable, self.target_executable), - '%s --version &> %s' % (self.executable, self.version_rpt), - '%s -help | head -20' % self.executable, - ] - self.roofdir = './roof.dir' - self.executable_opts = [ - '-start-paused -r %s -collect hpc-performance -data-limit=0 ' - '--search-dir src:rp=. --trace-mpi -- %s' % - (self.roofdir, self.target_executable) - ] - # NOTE: -allow-multiple-runs requires to install vtune drivers - # TODO: -collect memory-access - self.maintainers = ['JG'] - self.tags = {'scs', 'external-resources'} - self.sanity_patterns = sn.all([ - sn.assert_found('loop complete.', self.stdout), - sn.assert_eq(sn.extractsingle( - r'I*.\(build\s(?P\d+)\s*.', - self.version_rpt, 'toolsversion'), toolsversion), - ]) - # References for Intel Broadwell CPU (E5-2695 v4): - references = { - 'G3_AOS_SCALAR': { - 'dom:mc': { - 'gflops': (3.1, -0.1, None, 'Gflop/s'), - 'compare_sec': (0, -0.1, 0.1, 'seconds'), - 'compare_gflops': (0, -0.2, 0.2, 'Gflop/s'), - } - }, - 'G3_AOS_VECTOR': { - 'dom:mc': { - 'gflops': (5.7, -0.1, None, 'Gflop/s'), - 'compare_sec': (0, -0.1, 0.1, 'seconds'), - 'compare_gflops': (0, -0.2, 0.2, 'Gflop/s'), - } - }, - 'G3_SOA_SCALAR': { - 'dom:mc': { - 'gflops': (3.1, -0.1, None, 'Gflop/s'), - 'compare_sec': (0, -0.1, 0.1, 'seconds'), - 'compare_gflops': (0, -0.2, 0.2, 'Gflop/s'), - } - }, - 'G3_SOA_VECTOR': { - 'dom:mc': { - 'gflops': (21.0, -0.1, None, 'Gflop/s'), - 'compare_sec': (0, -0.1, 0.1, 'seconds'), - 'compare_gflops': (0, -0.2, 0.2, 'Gflop/s'), - } - }, - } - self.reference = references[datalayout] - self.perf_patterns = { - 'gflops': self.gflops_reported, - 'compare_sec': self.runtime_diff, - 'compare_gflops': self.gflops_diff, - # TODO: 'ai': self.arithmetic_intensity, - } - - def setup(self, partition, environ, **job_opts): - super().setup(partition, environ, **job_opts) - run_cmd = ' '.join(self.job.launcher.command(self.job)) - self.clk_rpt = '%s_CLK.rpt' % self.target_executable - self.DPscalar_rpt = '%s_DP_scalar.rpt' % self.target_executable - self.DP128B_rpt = '%s_DP_128B.rpt' % self.target_executable - self.DP256B_rpt = '%s_DP_256B.rpt' % self.target_executable - perf_metrics = [ - ('CPU_CLK_UNHALTED.THREAD', self.clk_rpt), - ('FP_ARITH_INST_RETIRED.SCALAR_DOUBLE', self.DPscalar_rpt), - ('FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE', self.DP128B_rpt), - ('FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE', self.DP256B_rpt)] - self.post_run = [] - for perf_metric, perf_rpt in perf_metrics: - self.post_run += [ - '%s %s -report hw-events -group-by=package -r %s.* -column=%s ' - '&> %s' % - (run_cmd, self.executable, self.roofdir, perf_metric, perf_rpt) - ] - partitiontype = partition.fullname.split(':')[1] - if partitiontype == 'gpu': - self.job.options = ['--constraint="gpu&perf"'] - elif partitiontype == 'mc': - self.job.options = ['--constraint="mc&perf"'] - - # --- Elapsed Time: - @property - @sn.sanity_function - def runtime_reported(self): - sec = sn.extractsingle(r'^Elapsed Time: (?P\S+)s', self.stdout, - 'sec', float) - if self.debug: - print('sec1={}'.format(sec)) - - return sec - - @property - @sn.sanity_function - def runtime_metric(self): - # CPU_CLK_UNHALTED.THREAD: - mclk = sn.extractsingle(r'^package_0\s+(?P\d+)', - self.clk_rpt, 'clk', float) - # GHz: - ghz = sn.extractsingle(r'^\s+Average CPU Frequency: (?P\S+) GHz', - self.stdout, 'ghz', float) - # 1 Hz = 1 cycle / 1 second - sec = (mclk * 10**6) / (ghz * 10**9) - if self.debug: - print('sec2={}'.format(sec)) - return sec - - @property - @sn.sanity_function - def runtime_diff(self): - sec = self.runtime_reported - self.runtime_metric - if self.debug: - print('sec3={}'.format(sec)) - return sec - - # --- GFLOPS/sec: - @property - @sn.sanity_function - def gflops_reported(self): - gflops = sn.extractsingle(r'^\s+DP GFLOPS: (?P\S+)', - self.stdout, 'gflops', float) - if self.debug: - print('gflops1={}'.format(gflops)) - return gflops - - @property - @sn.sanity_function - def gflops_metric(self): - # > srun -Cmc,perf -n1 -t1 likwid-perfctr -g FLOPS_DP -H - # DP MFLOP/s = 1.0E-06*(x*2 + y + z*4)/runtime where: - # x = FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE* - # y = FP_ARITH_INST_RETIRED_SCALAR_DOUBLE - # z = FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE - # TODO: check units with: - # ^.*Hardware Event Count.*\((?P\S+)\) - # amplxe-cl -report hw-events -r roof.dir.nid00406/ -column=? - - # FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE: - DP128B = sn.extractsingle(r'^package_0\s+(?P\d+)', - self.DP128B_rpt, 'M', float) - # FP_ARITH_INST_RETIRED.SCALAR_DOUBLE: - DPscalar = sn.extractsingle(r'^package_0\s+(?P\d+)', - self.DPscalar_rpt, 'M', float) - # FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE: - DP256B = sn.extractsingle(r'^package_0\s+(?P\d+)', - self.DP256B_rpt, 'M', float) - - mflops = (DP128B*2 + DPscalar + DP256B*4) / self.runtime_reported - gflops = mflops / 10**3 - if self.debug: - print('DP128B={}'.format(DP128B)) - print('DPscalar={}'.format(DPscalar)) - print('DP256B={}'.format(DP256B)) - print('runtime={}'.format(self.runtime_reported)) - print('gflops2={}'.format(gflops)) - return gflops - - @property - @sn.sanity_function - def gflops_diff(self): - gflops = self.gflops_reported - self.gflops_metric - if self.debug: - print('gflops3={}'.format(gflops)) - return gflops - - # NOTE: Bandwidth data is missing for a full roofline model. - # Other tools (advisor, likwid, sde) may help: - # > srun -Cmc,perf -n1 -t1 likwid-perfctr -g MEM -H - # Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0) + - # SUM(MBOXxC1))*64.0/runtime - # Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0) + - # SUM(MBOXxC1))*64.0 - # - # > srun -Cmc,perf -t1 -n1 likwid-perfctr -g L2 -H - # L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT + L2_TRANS_L1D_WB + - # ICACHE_MISSES)*64.0/time - # L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT + L2_TRANS_L1D_WB + - # ICACHE_MISSES)*64.0 - # - # > srun -Cmc,perf -t1 -n1 likwid-perfctr -g L3 -H - # L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL + - # L2_LINES_OUT_DEMAND_DIRTY)*64/time - # L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL + - # L2_LINES_OUT_DEMAND_DIRTY)*64 - # - # > srun -Cmc,perf -t1 -n1 likwid-perfctr -g CACHES -H - # Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD) + - # SUM(CAS_COUNT_WR))*64.0/time - # Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD) + - # Vtune supported hw-events: - # ------- - # Hardware Event Count:CPU_CLK_UNHALTED.THREAD (K) - # Hardware Event Count:CPU_CLK_UNHALTED.REF_TSC (K) - # Hardware Event Count:INST_RETIRED.ANY (K) - # Hardware Event Count:CYCLE_ACTIVITY.STALLS_L1D_MISS (K) - # Hardware Event Count:CPU_CLK_UNHALTED.REF_XCLK (K) - # Hardware Event Count:CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE (K) - # Hardware Event Count:CYCLE_ACTIVITY.STALLS_L2_MISS (K) - # Hardware Event Count:CYCLE_ACTIVITY.STALLS_MEM_ANY (K) - # Hardware Event Count:CYCLE_ACTIVITY.STALLS_TOTAL (K) - # Hardware Event Count:IDQ_UOPS_NOT_DELIVERED.CORE (K) - # Hardware Event Count:INT_MISC.RECOVERY_CYCLES (K) - # Hardware Event Count:MEM_LOAD_UOPS_RETIRED.L3_HIT_PS (K) - # Hardware Event Count:MEM_LOAD_UOPS_RETIRED.L3_MISS_PS (K) - # Hardware Event Count:RESOURCE_STALLS.SB (K) - # Hardware Event Count:RS_EVENTS.EMPTY_CYCLES (K) - # Hardware Event Count:UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC (K) - # Hardware Event Count:UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC (K) - # Hardware Event Count:UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC (K) - # Hardware Event Count:UOPS_EXECUTED.CORE:cmask=1 (K) - # Hardware Event Count:UOPS_EXECUTED.CORE:cmask=2 (K) - # Hardware Event Count:UOPS_EXECUTED.CORE:cmask=3 (K) - # Hardware Event Count:UOPS_ISSUED.ANY (K) - # Hardware Event Count:UOPS_RETIRED.RETIRE_SLOTS (K) - # Hardware Event Count:IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE (K) - # Hardware Event Count:OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD:cmask=4 (K) - # Hardware Event Count:OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD (K) - # Hardware Event Count:MEM_UOPS_RETIRED.ALL_LOADS_PS (K) - # Hardware Event Count:MEM_UOPS_RETIRED.ALL_STORES_PS (K) - # Hardware Event Count:MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM_PS (K) - # Hardware Event Count:MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM_PS (K) - # Hardware Event Count:MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM_PS (K) - # Hardware Event Count:MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD_PS (K) - # Hardware Event Count:FP_ARITH_INST_RETIRED.SCALAR_SINGLE (K) - # Hardware Event Count:FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE (K) - # Hardware Event Count:FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE (K) - # Hardware Event Count:FP_ARITH_INST_RETIRED.SCALAR_DOUBLE (K) - # Hardware Event Count:FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE (K) - # Hardware Event Count:FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE (K) - # Hardware Event Count:INST_RETIRED.X87 (K)