From 66a15e5b674b059738b443779b987e9a829fa8de Mon Sep 17 00:00:00 2001 From: jgp Date: Fri, 11 Oct 2019 09:33:57 +0200 Subject: [PATCH 1/6] tag1 --- cscs-checks/apps/amber/amber_check.py | 2 +- cscs-checks/apps/espresso/espresso_check.py | 2 +- cscs-checks/apps/gromacs/gromacs_check.py | 2 +- cscs-checks/apps/icon/rrtmgp_check.py | 1 + cscs-checks/apps/lammps/lammps_check.py | 2 +- cscs-checks/apps/namd/namd_check.py | 2 +- cscs-checks/apps/openfoam/check_openfoam.py | 2 +- .../apps/openfoam/check_openfoam_extend.py | 2 +- cscs-checks/cuda/cuda_checks.py | 2 +- cscs-checks/cuda/multi_gpu.py | 2 +- cscs-checks/cuda/nvml_check.py | 2 +- .../libraries/io/netcdf_compile_run.py | 2 +- .../libraries/math/scalapack_compile_run.py | 58 +++--- cscs-checks/mch/fieldextra_check.py | 2 +- .../microbenchmarks/hpcg/hpcg_benchmark.py | 2 +- .../microbenchmarks/spec-accel/spec.py | 2 +- cscs-checks/system/io/ior_check.py | 2 +- cscs-checks/tools/io/cdo.py | 2 +- cscs-checks/tools/io/nco.py | 2 +- .../berkeley-ert-nvprof.py | 2 +- .../berkeley-ert-serial.py | 170 ++++++++++++++++ .../profiling_and_debugging/berkeley-ert.py | 2 +- .../intel_advisor_roofline.py | 2 +- .../intel_sde_berkeley_stream.py | 2 +- .../intel_sde_roofline.py | 2 +- .../intel_vtune_roofline.py | 2 +- .../likwid_roofline.py | 181 ++++++++++++++++++ 27 files changed, 404 insertions(+), 52 deletions(-) create mode 100644 cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py create mode 100644 cscs-checks/tools/profiling_and_debugging/likwid_roofline.py diff --git a/cscs-checks/apps/amber/amber_check.py b/cscs-checks/apps/amber/amber_check.py index 4018e923ce..4ddce619d2 100644 --- a/cscs-checks/apps/amber/amber_check.py +++ b/cscs-checks/apps/amber/amber_check.py @@ -38,7 +38,7 @@ def __init__(self, input_file, output_file): output_file, 'perf', float, item=1) } self.maintainers = ['SO', 'VH'] - self.tags = {'scs'} + self.tags = {'scs', 'resources'} @rfm.required_version('>=2.16') diff --git a/cscs-checks/apps/espresso/espresso_check.py b/cscs-checks/apps/espresso/espresso_check.py index 4cacf2934f..876ae49753 100644 --- a/cscs-checks/apps/espresso/espresso_check.py +++ b/cscs-checks/apps/espresso/espresso_check.py @@ -11,7 +11,7 @@ def __init__(self, scale): super().__init__() self.descr = 'Quantum Espresso CPU check' self.maintainers = ['AK', 'LM'] - self.tags = {'scs', 'production'} + self.tags = {'scs', 'production', 'resources'} self.sourcesdir = os.path.join(self.current_system.resourcesdir, 'Espresso') diff --git a/cscs-checks/apps/gromacs/gromacs_check.py b/cscs-checks/apps/gromacs/gromacs_check.py index 05f776383c..81bc39d767 100644 --- a/cscs-checks/apps/gromacs/gromacs_check.py +++ b/cscs-checks/apps/gromacs/gromacs_check.py @@ -41,7 +41,7 @@ def __init__(self, output_file): 'num_switches': 1 } } - self.tags = {'scs'} + self.tags = {'scs', 'resources'} @rfm.required_version('>=2.19') diff --git a/cscs-checks/apps/icon/rrtmgp_check.py b/cscs-checks/apps/icon/rrtmgp_check.py index 69a48430d9..9d157e7ea9 100644 --- a/cscs-checks/apps/icon/rrtmgp_check.py +++ b/cscs-checks/apps/icon/rrtmgp_check.py @@ -14,6 +14,7 @@ def __init__(self): self.valid_prog_environs = ['PrgEnv-pgi'] self.sourcesdir = os.path.join(self.current_system.resourcesdir, 'RRTMGP') + self.tags = {'resources'} self.prebuild_cmd = ['cp build/Makefile.conf.dom build/Makefile.conf'] self.executable = 'python' self.executable_opts = [ diff --git a/cscs-checks/apps/lammps/lammps_check.py b/cscs-checks/apps/lammps/lammps_check.py index 5321767f08..23a69cff68 100644 --- a/cscs-checks/apps/lammps/lammps_check.py +++ b/cscs-checks/apps/lammps/lammps_check.py @@ -33,7 +33,7 @@ def __init__(self): } } - self.tags = {'scs'} + self.tags = {'scs', 'resources'} self.maintainers = ['TR', 'VH'] diff --git a/cscs-checks/apps/namd/namd_check.py b/cscs-checks/apps/namd/namd_check.py index 2a75ac0efd..eb91d82520 100644 --- a/cscs-checks/apps/namd/namd_check.py +++ b/cscs-checks/apps/namd/namd_check.py @@ -44,7 +44,7 @@ def __init__(self, arch, scale, variant): } self.maintainers = ['CB', 'LM'] - self.tags = {'scs'} + self.tags = {'scs', 'resources'} self.strict_check = False self.extra_resources = { 'switches': { diff --git a/cscs-checks/apps/openfoam/check_openfoam.py b/cscs-checks/apps/openfoam/check_openfoam.py index ade32256d7..dff3d6de8e 100644 --- a/cscs-checks/apps/openfoam/check_openfoam.py +++ b/cscs-checks/apps/openfoam/check_openfoam.py @@ -22,7 +22,7 @@ def __init__(self): self.num_cpus_per_task = 1 self.maintainers = ['MKr'] - self.tags = {'scs', 'production'} + self.tags = {'scs', 'production', 'resources'} self.pre_run = ['source $FOAM_BASH'] diff --git a/cscs-checks/apps/openfoam/check_openfoam_extend.py b/cscs-checks/apps/openfoam/check_openfoam_extend.py index 68a9a440de..d6b7d7194d 100644 --- a/cscs-checks/apps/openfoam/check_openfoam_extend.py +++ b/cscs-checks/apps/openfoam/check_openfoam_extend.py @@ -28,7 +28,7 @@ def __init__(self): r'Finalising parallel run', self.stdout) self.maintainers = ['MKr'] - self.tags = {'scs', 'production'} + self.tags = {'scs', 'production', 'resources'} self.pre_run = ['source $FOAM_INST_DIR/foam-extend-4.0/etc/bashrc'] diff --git a/cscs-checks/cuda/cuda_checks.py b/cscs-checks/cuda/cuda_checks.py index b5c26887c9..73b84ca02b 100644 --- a/cscs-checks/cuda/cuda_checks.py +++ b/cscs-checks/cuda/cuda_checks.py @@ -27,7 +27,7 @@ def __init__(self): self.nvidia_sm = '37' self.maintainers = ['AJ', 'VK'] - self.tags = {'production'} + self.tags = {'production', 'resources'} @rfm.required_version('>=2.14') diff --git a/cscs-checks/cuda/multi_gpu.py b/cscs-checks/cuda/multi_gpu.py index a56f408a8f..1392ea1df4 100644 --- a/cscs-checks/cuda/multi_gpu.py +++ b/cscs-checks/cuda/multi_gpu.py @@ -61,7 +61,7 @@ def __init__(self): 'kesch:cn:d2h': (7584, -0.1, None, 'MB/s'), 'kesch:cn:d2d': (137408, -0.1, None, 'MB/s') } - self.tags = {'diagnostic', 'mch'} + self.tags = {'diagnostic', 'mch', 'resources'} self.maintainers = ['AJ', 'VK'] def _xfer_pattern(self, xfer_kind, devno, nodename): diff --git a/cscs-checks/cuda/nvml_check.py b/cscs-checks/cuda/nvml_check.py index 20e88fd10e..f27d3f5a9b 100644 --- a/cscs-checks/cuda/nvml_check.py +++ b/cscs-checks/cuda/nvml_check.py @@ -22,4 +22,4 @@ def __init__(self): r"compute\s+mode\s+'Exclusive Process'", self.stdout) self.maintainers = ['AJ', 'VK'] - self.tags = {'production'} + self.tags = {'production', 'resources'} diff --git a/cscs-checks/libraries/io/netcdf_compile_run.py b/cscs-checks/libraries/io/netcdf_compile_run.py index 6160e32531..37a3d2df7f 100644 --- a/cscs-checks/libraries/io/netcdf_compile_run.py +++ b/cscs-checks/libraries/io/netcdf_compile_run.py @@ -40,7 +40,7 @@ def __init__(self, lang, linkage): self.num_tasks_per_node = 1 self.sanity_patterns = sn.assert_found(r'SUCCESS', self.stdout) self.maintainers = ['AJ', 'VK'] - self.tags = {'production'} + self.tags = {'production', 'resources'} def setup(self, partition, environ, **job_opts): if self.current_system.name == 'kesch': diff --git a/cscs-checks/libraries/math/scalapack_compile_run.py b/cscs-checks/libraries/math/scalapack_compile_run.py index 9a9bfbe886..62ad1d7b79 100644 --- a/cscs-checks/libraries/math/scalapack_compile_run.py +++ b/cscs-checks/libraries/math/scalapack_compile_run.py @@ -27,7 +27,7 @@ def __init__(self, linkage): self.build_system = 'SingleSource' self.build_system.fflags = ['-O3'] self.maintainers = ['CB', 'LM', 'MKr'] - self.tags = {'production'} + self.tags = {'production', 'resources'} @rfm.required_version('>=2.14') @@ -68,31 +68,31 @@ def scalapack_sanity(number1, number2, expected_value): ]) -# FIXME: This test is obsolete; it is kept only for reference. -# NOTE: The test case is very small, but larger cases did not succeed! -@rfm.required_version('>=2.14') -@rfm.parameterized_test(['dynamic']) -class ScaLAPACKPerf(ScaLAPACKTest): - def __init__(self, linkage): - super().__init__(linkage) - - self.tags |= {'monch_acceptance'} - self.sourcepath = 'scalapack_performance_compile_run.f' - self.valid_systems = ['monch:compute'] - self.valid_prog_environs = ['PrgEnv-gnu'] - self.num_tasks = 64 - self.num_tasks_per_node = 16 - - self.sanity_patterns = sn.assert_found(r'Run', self.stdout) - self.perf_patterns = { - 'perf': sn.max( - sn.extractall(r'GFLOPS/s:\s+(?P\S+)', - self.stdout, 'gflops', float) - ) - } - - self.reference = { - 'monch:compute': { - 'perf': (24., -0.1, None) - } - } +# # FIXME: This test is obsolete; it is kept only for reference. +# # NOTE: The test case is very small, but larger cases did not succeed! +# @rfm.required_version('>=2.14') +# @rfm.parameterized_test(['dynamic']) +# class ScaLAPACKPerf(ScaLAPACKTest): +# def __init__(self, linkage): +# super().__init__(linkage) +# +# self.tags |= {'monch_acceptance'} +# self.sourcepath = 'scalapack_performance_compile_run.f' +# self.valid_systems = ['monch:compute'] +# self.valid_prog_environs = ['PrgEnv-gnu'] +# self.num_tasks = 64 +# self.num_tasks_per_node = 16 +# +# self.sanity_patterns = sn.assert_found(r'Run', self.stdout) +# self.perf_patterns = { +# 'perf': sn.max( +# sn.extractall(r'GFLOPS/s:\s+(?P\S+)', +# self.stdout, 'gflops', float) +# ) +# } +# +# self.reference = { +# 'monch:compute': { +# 'perf': (24., -0.1, None) +# } +# } diff --git a/cscs-checks/mch/fieldextra_check.py b/cscs-checks/mch/fieldextra_check.py index b45acd15d6..b10df5bf65 100644 --- a/cscs-checks/mch/fieldextra_check.py +++ b/cscs-checks/mch/fieldextra_check.py @@ -8,7 +8,7 @@ class FieldextraTestBase(rfm.RunOnlyRegressionTest): def __init__(self): super().__init__() self.maintainers = ['MKr'] - self.tags = {'mch'} + self.tags = {'mch', 'resources'} self.valid_systems = ['kesch:cn'] self.valid_prog_environs = ['PrgEnv-gnu-nompi'] diff --git a/cscs-checks/microbenchmarks/hpcg/hpcg_benchmark.py b/cscs-checks/microbenchmarks/hpcg/hpcg_benchmark.py index 20a4d8dcd8..65548472ac 100644 --- a/cscs-checks/microbenchmarks/hpcg/hpcg_benchmark.py +++ b/cscs-checks/microbenchmarks/hpcg/hpcg_benchmark.py @@ -52,7 +52,7 @@ def __init__(self): } self.maintainers = ['SK'] - self.tags = {'diagnostic', 'benchmark'} + self.tags = {'diagnostic', 'benchmark', 'resources'} @property @sn.sanity_function diff --git a/cscs-checks/microbenchmarks/spec-accel/spec.py b/cscs-checks/microbenchmarks/spec-accel/spec.py index 2fa4f68edd..fc219d9e14 100644 --- a/cscs-checks/microbenchmarks/spec-accel/spec.py +++ b/cscs-checks/microbenchmarks/spec-accel/spec.py @@ -53,7 +53,7 @@ def __init__(self, prg_envs): } self.maintainers = ['SK'] - self.tags = {'diagnostic'} + self.tags = {'diagnostic', 'resources'} def setup(self, partition, environ, **job_opts): self.pre_run = ['source ./shrc', 'mv %s config' % diff --git a/cscs-checks/system/io/ior_check.py b/cscs-checks/system/io/ior_check.py index a26b6e97c5..757d1493a5 100644 --- a/cscs-checks/system/io/ior_check.py +++ b/cscs-checks/system/io/ior_check.py @@ -108,7 +108,7 @@ def __init__(self, base_dir): systems_to_test = ['dom', 'daint'] if self.current_system.name in systems_to_test: - self.tags |= {'production'} + self.tags |= {'production', 'resources'} def setup(self, partition, environ, **job_opts): super().setup(partition, environ, **job_opts) diff --git a/cscs-checks/tools/io/cdo.py b/cscs-checks/tools/io/cdo.py index a002df9bce..400d29d0fc 100644 --- a/cscs-checks/tools/io/cdo.py +++ b/cscs-checks/tools/io/cdo.py @@ -40,7 +40,7 @@ def __init__(self): self.modules = ['CDO'] self.maintainers = ['SO'] - self.tags = {'production', 'mch'} + self.tags = {'production', 'mch', 'resources'} # Check that the netCDF loaded by the CDO module supports the nc4 filetype diff --git a/cscs-checks/tools/io/nco.py b/cscs-checks/tools/io/nco.py index 504ebe928c..9997f90ecf 100644 --- a/cscs-checks/tools/io/nco.py +++ b/cscs-checks/tools/io/nco.py @@ -33,7 +33,7 @@ def __init__(self): self.modules = ['NCO'] self.maintainers = ['SO'] - self.tags = {'production', 'mch'} + self.tags = {'production', 'mch', 'resources'} # Check that the netCDF loaded by the NCO module supports the nc4 filetype diff --git a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py index bc3d9ba36b..933ffab571 100644 --- a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py +++ b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py @@ -72,7 +72,7 @@ def __init__(self, gpudims, flop, repeat): ] self.build_system.ldflags = ['-O3'] self.maintainers = ['JG'] - self.tags = {'scs'} + self.tags = {'scs', 'resources'} gpu_blocks, gpu_threads = gpudims self.name = 'ertgpu_Run.{}_FLOPS.{}_GPUBlocks.{}_GPUThreads.{}'.format( repeat, flop, gpu_blocks, gpu_threads) diff --git a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py new file mode 100644 index 0000000000..1dee82959c --- /dev/null +++ b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py @@ -0,0 +1,170 @@ +import os + +import reframe as rfm +import reframe.utility.sanity as sn + + +#{{{ base +class ErtTestBase(rfm.RegressionTest): + """ + The Empirical Roofline Tool, ERT, automatically generates roofline data. + https://bitbucket.org/berkeleylab/cs-roofline-toolkit/ + """ + + def __init__(self): + self.descr = 'Empirical Roofline Toolkit' + self.sourcesdir = os.path.join(self.current_system.resourcesdir, + 'roofline', 'ert') + self.build_system = 'SingleSource' + self.sourcepath = 'kernel1.c driver1.c' + self.executable = 'ert.exe' + self.build_system.ldflags = ['-O3', '-fopenmp'] + self.sourcesdir = os.path.join(self.current_system.resourcesdir, + 'roofline', 'ert') + self.rpt = '%s.rpt' % self.executable + self.maintainers = ['JG'] + self.tags = {'scs', 'resources'} + + def setup(self, partition, environ, **job_opts): + super().setup(partition, environ, **job_opts) + if self.num_tasks != 36: + self.job.launcher.options = ['--cpu-bind=verbose,none'] +#}}} + +#{{{ test +@rfm.parameterized_test( + *[[num_ranks, flop] + for num_ranks in [1] + for flop in [256, 512, 1024]]) + #for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]]) +class ErtBroadwellTest(ErtTestBase): + def __init__(self, num_ranks, flop): + super().__init__() + ompthread = 1 + self.valid_systems = ['daint:mc', 'dom:mc'] + self.valid_prog_environs = ['PrgEnv-gnu'] + self.build_system.cppflags = [ + '-DERT_FLOP=%s' % flop, + '-DERT_ALIGN=32', + '-DERT_MEMORY_MAX=1073741824', + '-DERT_MPI=True', + '-DERT_OPENMP=True', + '-DERT_TRIALS_MIN=1', + '-DERT_WORKING_SET_MIN=1', + ] + self.name = 'ert_serial_FLOPS.{:04d}_MPI.{:03d}_OpenMP.{:03d}'.format( + flop, num_ranks, ompthread) + self.exclusive = True + self.num_tasks = num_ranks + self.num_tasks_per_node = num_ranks + self.num_cpus_per_task = ompthread + self.num_tasks_per_core = 1 + self.use_multithreading = False + self.variables = { + 'CRAYPE_LINK_TYPE': 'dynamic', + 'OMP_NUM_THREADS': str(self.num_cpus_per_task) + } + + # take the "slowest" job, make it sleep after it has ended and hope the + # other jobs have ended too + # TODO: find a better way to wait for the other jobs to end + num_ranks_min = 1 + flop_min = 1024 + self.roofline_rpt = 'rpt' + if num_ranks == num_ranks_min and flop == flop_min: + self.post_run = [ + 'cat *_job.out | python2 preprocess.py > pre', + 'python2 maximum.py < pre > max', + 'python2 summary.py < max > sum', + # give enough time for all the dependent jobs to collect data: + 'sleep 60', + 'cat ../ert_serial_FLOPS*/sum | python2 roofline.py > rpt', + ] + + else: + self.post_run = [ + 'cat *_job.out | python2 preprocess.py > pre', + 'python2 maximum.py < pre > max', + 'python2 summary.py < max > sum', + ] + + # --- Sanity check: + regex_datatype = (r'^\s+(?P\w+) \* __restrict__ buf = ' + r'\(\w+ \*\)malloc\(PSIZE\);') + datatype = sn.extractsingle(regex_datatype, 'driver1.c', 'type') + self.sanity_patterns = sn.all([ + sn.assert_found('GFLOPs', 'sum'), + sn.assert_eq(datatype, 'double'), + ]) + + # --- Performance check: + if num_ranks == num_ranks_min and flop == flop_min: + # Reference roofline boundaries for Intel BroadwellCPU (E5-2695v4): + ref_GFLOPs = 945.0 + ref_L1bw = 1788.0 + ref_L2bw = 855.0 + ref_L3bw = 547.0 + ref_DRAMbw = 70.5 + + # Typical performance report looks like: + # -------------------------------------- + # ert_FLOPS.1024_MPI.001_OpenMP.036/rpt + # 908.43 GFLOPs EMP + # ****** + # META_DATA + # OPENMP_THREADS 1 + # FLOPS 8 + # MPI_PROCS 36 + # + # 5647.33 L1 EMP + # ******* + # 3203.86 L2 EMP + # ******* + # 1773.58 L3 EMP + # ******* + # 139.56 L4 EMP + # 103.50 DRAM EMP + # ****** + # META_DATA + # FLOPS 2 + # OPENMP_THREADS 1 + # MPI_PROCS 36 + regex_gflops = r'(?P\d+.\d+)\sGFLOPs EMP' + regex_L1bw = r'(?P\d+.\d+)\sL1 EMP' + regex_L2bw = r'(?P\d+.\d+)\sL2 EMP' + regex_L3bw = r'(?P\d+.\d+)\sL3 EMP' + regex_DRAMbw = r'(?P\d+.\d+) DRAM EMP' + + gflops = sn.extractsingle(regex_gflops, self.roofline_rpt, + 'GFLOPs', float) + L1bw = sn.extractsingle(regex_L1bw, self.roofline_rpt, + 'L1bw', float) + L2bw = sn.extractsingle(regex_L2bw, self.roofline_rpt, + 'L2bw', float) + L3bw = sn.extractsingle(regex_L3bw, self.roofline_rpt, + 'L3bw', float) + DRAMbw = sn.extractsingle(regex_DRAMbw, self.roofline_rpt, + 'DRAMbw', float) + + # --performance-report: + self.perf_patterns = { + 'gflops': gflops, + 'L1bw': L1bw, + 'L2bw': L2bw, + 'L3bw': L3bw, + 'DRAMbw': DRAMbw, + } + + self.reference = { + '*': { + 'gflops': (ref_GFLOPs, -0.1, 0.5, 'GF/s'), + 'L1bw': (ref_L1bw, -0.1, 0.3, 'GB/s'), + 'L2bw': (ref_L2bw, -0.1, 0.3, 'GB/s'), + 'L3bw': (ref_L3bw, -0.1, 0.3, 'GB/s'), + 'DRAMbw': (ref_DRAMbw, -0.1, 0.3, 'GB/s'), + } + } + + # else: + +#}}} diff --git a/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py b/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py index 505905d0f9..580aff48e8 100644 --- a/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py +++ b/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py @@ -23,7 +23,7 @@ def __init__(self): 'roofline', 'ert') self.rpt = '%s.rpt' % self.executable self.maintainers = ['JG'] - self.tags = {'scs'} + self.tags = {'scs', 'resources'} def setup(self, partition, environ, **job_opts): super().setup(partition, environ, **job_opts) diff --git a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py b/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py index 977f2c2ab2..4172410021 100644 --- a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py +++ b/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py @@ -238,7 +238,7 @@ def __init__(self, repeat, toolversion, datalayout): } self.maintainers = ['JG'] - self.tags = {'production'} + self.tags = {'production', 'resources'} def setup(self, partition, environ, **job_opts): super().setup(partition, environ, **job_opts) diff --git a/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py b/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py index 0c410e43ad..489f4ee9dd 100644 --- a/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py +++ b/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py @@ -35,7 +35,7 @@ def __init__(self): self.sanity_patterns = sn.assert_found('Total FLOPs =', self.rpt) self.post_run = ['SDE/parse-sde.sh %s.* &> %s' % (self.sde, self.rpt)] self.maintainers = ['JG'] - self.tags = {'scs'} + self.tags = {'scs', 'resources'} @property @sn.sanity_function diff --git a/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py b/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py index f31cb542f7..b6b3015d06 100644 --- a/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py +++ b/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py @@ -68,7 +68,7 @@ def __init__(self, repeat, toolsversion, datalayout): self.sanity_patterns = sn.assert_found('Total FLOPs =', self.rpt) self.post_run = ['SDE/parse-sde.sh %s.* &> %s' % (self.sde, self.rpt)] self.maintainers = ['JG'] - self.tags = {'scs'} + self.tags = {'scs', 'resources'} self.sanity_patterns = sn.all([ sn.assert_eq(sn.extractsingle( r'^Intel\(R\) Software Development Emulator\. Version: ' diff --git a/cscs-checks/tools/profiling_and_debugging/intel_vtune_roofline.py b/cscs-checks/tools/profiling_and_debugging/intel_vtune_roofline.py index 0ae40b9f51..46e173e8a5 100644 --- a/cscs-checks/tools/profiling_and_debugging/intel_vtune_roofline.py +++ b/cscs-checks/tools/profiling_and_debugging/intel_vtune_roofline.py @@ -76,7 +76,7 @@ def __init__(self, repeat, toolsversion, datalayout): # NOTE: -allow-multiple-runs requires to install vtune drivers # TODO: -collect memory-access self.maintainers = ['JG'] - self.tags = {'scs'} + self.tags = {'scs', 'resources'} self.sanity_patterns = sn.all([ sn.assert_found('loop complete.', self.stdout), sn.assert_eq(sn.extractsingle( diff --git a/cscs-checks/tools/profiling_and_debugging/likwid_roofline.py b/cscs-checks/tools/profiling_and_debugging/likwid_roofline.py new file mode 100644 index 0000000000..8a8ab08c84 --- /dev/null +++ b/cscs-checks/tools/profiling_and_debugging/likwid_roofline.py @@ -0,0 +1,181 @@ +import os + +import reframe as rfm +import reframe.utility.sanity as sn + + +@rfm.parameterized_test(*[[repeat, toolsversion, datalayout] + for repeat in ['600000'] + for toolsversion in ['4.3.3'] + # for datalayout in ['G3_AOS_SCALAR'] + for datalayout in ['G3_AOS_SCALAR', 'G3_SOA_SCALAR', + 'G3_AOS_VECTOR', 'G3_SOA_VECTOR'] + ]) +class IntelRooflineLikwidTest(rfm.RegressionTest): + '''This test checks the values reported by RRZE likwid roofline model: + +G3_AOS_SCALAR DP Mflops/sec = 3280.32 L2 bandwidth [MBytes/s] = 39441.3 0.0831697 +G3_AOS_VECTOR DP Mflops/sec = 6432.24 L2 bandwidth [MBytes/s] = 76914 0.083629 +G3_SOA_SCALAR DP Mflops/sec = 3288.39 L2 bandwidth [MBytes/s] = 9.98179 329.439 +G3_SOA_VECTOR DP Mflops/sec = 21126.6 L2 bandwidth [MBytes/s] = 9.6529 2188.63 2.3F/B + 10GF 60000 0.18 + + > https://crd.lbl.gov/assets/Uploads/ECP18-Roofline-3-LIKWID.pdf + > likwid-perfctr -g CACHES -H + + > Get group definition with (identical result): + > cat $EBROOTLIKWID/share/likwid/perfgroups/broadwell/FLOPS_DP.txt + > srun -Cmc,perf -n1 -t1 likwid-perfctr -g FLOPS_DP -H + DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2 + + FP_ARITH_INST_RETIRED_SCALAR_DOUBLE + + FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4) + /runtime + + > srun -Cmc,perf -n1 -t1 likwid-perfctr -g MEM -H + Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0) + + SUM(MBOXxC1))*64.0/runtime + Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0) + + SUM(MBOXxC1))*64.0 + + > srun -Cmc,perf -t1 -n1 likwid-perfctr -g L2 -H + L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT + L2_TRANS_L1D_WB + + ICACHE_MISSES)*64.0/time + L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT + L2_TRANS_L1D_WB + + ICACHE_MISSES)*64.0 + + > srun -Cmc,perf -t1 -n1 likwid-perfctr -g L3 -H + L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL + + L2_LINES_OUT_DEMAND_DIRTY)*64/time + L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL + + L2_LINES_OUT_DEMAND_DIRTY)*64 + + > srun -Cmc,perf -t1 -n1 likwid-perfctr -g CACHES -H + Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD) + + SUM(CAS_COUNT_WR))*64.0/time + Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD) + + SUM(CAS_COUNT_WR))*64.0 + ''' + def __init__(self, repeat, toolsversion, datalayout): + super().__init__() + self.descr = 'Roofline Analysis test with Likwid:' + self.valid_systems = ['dom:mc'] + # Reporting MFLOPS is not available on Intel Haswell cpus, see + # https://www.intel.fr/content/dam/www/public/us/en/documents/manuals/ + # 64-ia-32-architectures-software-developer-vol-1-manual.pdf + self.valid_prog_environs = ['PrgEnv-intel'] + self.modules = ['likwid'] + # likwid/4.3.3-perf_event + self.sourcesdir = os.path.join(self.current_system.resourcesdir, + 'roofline', 'intel_advisor') + self.build_system = 'SingleSource' + self.sourcepath = '_roofline.cpp' + self.executable = 'likwid-perfctr' + self.target_executable = './roof.exe' + self.build_system.cppflags = ['-D_LIKWID', '-DLIKWID_PERFMON', + '-I$EBROOTLIKWID/include'] + self.prgenv_flags = { + 'PrgEnv-intel': ['-g', '-O2', '-std=c++11', '-restrict'], + # '-qopt-streaming-stores', 'always', + } + self.build_system.ldflags = ['-L$EBROOTLIKWID/lib', '-llikwid'] + self.prebuild_cmd = [ + 'patch -s < LIKWID/roofline_template.patch', + 'sed -e "s-XXXX-%s-" -e "s-YYYY-%s-" %s &> %s' % + (repeat, datalayout, 'roofline_template.cpp', '_roofline.cpp') + ] + self.exclusive = True + self.num_tasks = 1 + self.num_tasks_per_node = 1 + self.num_cpus_per_task = 1 + self.num_tasks_per_core = 1 + self.use_multithreading = False + self.variables = { + 'OMP_NUM_THREADS': str(self.num_cpus_per_task), + 'CRAYPE_LINK_TYPE': 'dynamic', + } + self.pre_run = [ + 'mv %s %s' % (self.executable, self.target_executable), + ] + self.tool_flags = ['-C 0 -g FLOPS_DP -m %s ' % self.target_executable] + # -C 0 : sets processor id(s) to pin threads and measure + # -g : sets performance group + # -m : use likwid API + self.executable_opts = self.tool_flags + self.maintainers = ['JG'] + self.tags = {'scs', 'resources'} + # self.rpt = '%s.rpt' % self.target_executable + self.sanity_patterns = sn.all([ + sn.assert_found('loop complete.', self.stdout), + sn.assert_eq(sn.extractsingle( + r'^likwid-perfctr -- Version (?P\d.\d.\d)', + self.stdout, 'toolsversion'), toolsversion), + ]) + # References for Intel Broadwell CPU (E5-2695 v4): + references = { + 'G3_AOS_SCALAR': { + 'dom:mc': { + 'gflops': (0.596, -0.1, 0.3, 'Gflop/s'), + 'ai': (0.16, -0.05, 0.05, 'flop/byte') + } + }, + 'G3_SOA_SCALAR': { + 'dom:mc': { + 'gflops': (0.612, -0.1, 0.3, 'Gflop/s'), + 'ai': (0.16, -0.05, 0.05, 'flop/byte') + } + }, + 'G3_AOS_VECTOR': { + 'dom:mc': { + 'gflops': (1.152, -0.1, 0.3, 'Gflop/s'), + 'ai': (0.125, -0.05, 0.05, 'flop/byte') + } + }, + 'G3_SOA_VECTOR': { + 'dom:mc': { + 'gflops': (1.125, -0.1, 0.3, 'Gflop/s'), + 'ai': (0.16, -0.05, 0.05, 'flop/byte') + } + }, + } + self.reference = references[datalayout] + self.perf_patterns = { + 'gflops': self.gflops, + 'ai': self.arithmetic_intensity, + } + + def setup(self, partition, environ, **job_opts): + super().setup(partition, environ, **job_opts) + run_cmd = ' '.join(self.job.launcher.command(self.job)) + self.post_run = ['%s -v' % self.executable] + # self.perf_group = ['L2', 'L3'] + self.perf_group = ['L2', 'L3', 'CACHES', 'DATA', + 'MEM', 'MEM_DP', 'MEM_SP'] + for perf_group in self.perf_group: + self.post_run += ['%s %s -C 0 -g %s -m %s' % + (run_cmd, self.executable, perf_group, + self.target_executable)] + partitiontype = partition.fullname.split(':')[1] + if partitiontype == 'gpu': + self.job.options = ['--constraint="gpu&perf"'] + elif partitiontype == 'mc': + self.job.options = ['--constraint="mc&perf"'] + + @property + @sn.sanity_function + def arithmetic_intensity(self): + flops = sn.extractsingle(r'^--->Total FLOPs = (?P\d+)', + self.rpt, 'flops', int) + bytes = sn.extractsingle(r'^--->Total Bytes = (?P\d+)', + self.rpt, 'bytes', int) + # debug: print('ai={}'.format(flops/bytes)) + return flops/bytes + + @property + @sn.sanity_function + def gflops(self): + flops = sn.extractsingle(r'^--->Total FLOPs = (?P\d+)', + self.rpt, 'flops', int) + msec = sn.extractsingle(r'^elapsed time: (?P\d+)ms', self.stdout, + 'msec', float) + # debug: print('gflops={}'.format(flops/((msec/1000)*10**6))) + return (flops/((msec/1000))/10**9) From e81f6623bf1e7eb26b8ec11b648c1995bb0549da Mon Sep 17 00:00:00 2001 From: jgp Date: Fri, 11 Oct 2019 09:42:42 +0200 Subject: [PATCH 2/6] removing unwanted commited files --- .../berkeley-ert-serial.py | 170 ---------------- .../likwid_roofline.py | 181 ------------------ 2 files changed, 351 deletions(-) delete mode 100644 cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py delete mode 100644 cscs-checks/tools/profiling_and_debugging/likwid_roofline.py diff --git a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py deleted file mode 100644 index 1dee82959c..0000000000 --- a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py +++ /dev/null @@ -1,170 +0,0 @@ -import os - -import reframe as rfm -import reframe.utility.sanity as sn - - -#{{{ base -class ErtTestBase(rfm.RegressionTest): - """ - The Empirical Roofline Tool, ERT, automatically generates roofline data. - https://bitbucket.org/berkeleylab/cs-roofline-toolkit/ - """ - - def __init__(self): - self.descr = 'Empirical Roofline Toolkit' - self.sourcesdir = os.path.join(self.current_system.resourcesdir, - 'roofline', 'ert') - self.build_system = 'SingleSource' - self.sourcepath = 'kernel1.c driver1.c' - self.executable = 'ert.exe' - self.build_system.ldflags = ['-O3', '-fopenmp'] - self.sourcesdir = os.path.join(self.current_system.resourcesdir, - 'roofline', 'ert') - self.rpt = '%s.rpt' % self.executable - self.maintainers = ['JG'] - self.tags = {'scs', 'resources'} - - def setup(self, partition, environ, **job_opts): - super().setup(partition, environ, **job_opts) - if self.num_tasks != 36: - self.job.launcher.options = ['--cpu-bind=verbose,none'] -#}}} - -#{{{ test -@rfm.parameterized_test( - *[[num_ranks, flop] - for num_ranks in [1] - for flop in [256, 512, 1024]]) - #for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]]) -class ErtBroadwellTest(ErtTestBase): - def __init__(self, num_ranks, flop): - super().__init__() - ompthread = 1 - self.valid_systems = ['daint:mc', 'dom:mc'] - self.valid_prog_environs = ['PrgEnv-gnu'] - self.build_system.cppflags = [ - '-DERT_FLOP=%s' % flop, - '-DERT_ALIGN=32', - '-DERT_MEMORY_MAX=1073741824', - '-DERT_MPI=True', - '-DERT_OPENMP=True', - '-DERT_TRIALS_MIN=1', - '-DERT_WORKING_SET_MIN=1', - ] - self.name = 'ert_serial_FLOPS.{:04d}_MPI.{:03d}_OpenMP.{:03d}'.format( - flop, num_ranks, ompthread) - self.exclusive = True - self.num_tasks = num_ranks - self.num_tasks_per_node = num_ranks - self.num_cpus_per_task = ompthread - self.num_tasks_per_core = 1 - self.use_multithreading = False - self.variables = { - 'CRAYPE_LINK_TYPE': 'dynamic', - 'OMP_NUM_THREADS': str(self.num_cpus_per_task) - } - - # take the "slowest" job, make it sleep after it has ended and hope the - # other jobs have ended too - # TODO: find a better way to wait for the other jobs to end - num_ranks_min = 1 - flop_min = 1024 - self.roofline_rpt = 'rpt' - if num_ranks == num_ranks_min and flop == flop_min: - self.post_run = [ - 'cat *_job.out | python2 preprocess.py > pre', - 'python2 maximum.py < pre > max', - 'python2 summary.py < max > sum', - # give enough time for all the dependent jobs to collect data: - 'sleep 60', - 'cat ../ert_serial_FLOPS*/sum | python2 roofline.py > rpt', - ] - - else: - self.post_run = [ - 'cat *_job.out | python2 preprocess.py > pre', - 'python2 maximum.py < pre > max', - 'python2 summary.py < max > sum', - ] - - # --- Sanity check: - regex_datatype = (r'^\s+(?P\w+) \* __restrict__ buf = ' - r'\(\w+ \*\)malloc\(PSIZE\);') - datatype = sn.extractsingle(regex_datatype, 'driver1.c', 'type') - self.sanity_patterns = sn.all([ - sn.assert_found('GFLOPs', 'sum'), - sn.assert_eq(datatype, 'double'), - ]) - - # --- Performance check: - if num_ranks == num_ranks_min and flop == flop_min: - # Reference roofline boundaries for Intel BroadwellCPU (E5-2695v4): - ref_GFLOPs = 945.0 - ref_L1bw = 1788.0 - ref_L2bw = 855.0 - ref_L3bw = 547.0 - ref_DRAMbw = 70.5 - - # Typical performance report looks like: - # -------------------------------------- - # ert_FLOPS.1024_MPI.001_OpenMP.036/rpt - # 908.43 GFLOPs EMP - # ****** - # META_DATA - # OPENMP_THREADS 1 - # FLOPS 8 - # MPI_PROCS 36 - # - # 5647.33 L1 EMP - # ******* - # 3203.86 L2 EMP - # ******* - # 1773.58 L3 EMP - # ******* - # 139.56 L4 EMP - # 103.50 DRAM EMP - # ****** - # META_DATA - # FLOPS 2 - # OPENMP_THREADS 1 - # MPI_PROCS 36 - regex_gflops = r'(?P\d+.\d+)\sGFLOPs EMP' - regex_L1bw = r'(?P\d+.\d+)\sL1 EMP' - regex_L2bw = r'(?P\d+.\d+)\sL2 EMP' - regex_L3bw = r'(?P\d+.\d+)\sL3 EMP' - regex_DRAMbw = r'(?P\d+.\d+) DRAM EMP' - - gflops = sn.extractsingle(regex_gflops, self.roofline_rpt, - 'GFLOPs', float) - L1bw = sn.extractsingle(regex_L1bw, self.roofline_rpt, - 'L1bw', float) - L2bw = sn.extractsingle(regex_L2bw, self.roofline_rpt, - 'L2bw', float) - L3bw = sn.extractsingle(regex_L3bw, self.roofline_rpt, - 'L3bw', float) - DRAMbw = sn.extractsingle(regex_DRAMbw, self.roofline_rpt, - 'DRAMbw', float) - - # --performance-report: - self.perf_patterns = { - 'gflops': gflops, - 'L1bw': L1bw, - 'L2bw': L2bw, - 'L3bw': L3bw, - 'DRAMbw': DRAMbw, - } - - self.reference = { - '*': { - 'gflops': (ref_GFLOPs, -0.1, 0.5, 'GF/s'), - 'L1bw': (ref_L1bw, -0.1, 0.3, 'GB/s'), - 'L2bw': (ref_L2bw, -0.1, 0.3, 'GB/s'), - 'L3bw': (ref_L3bw, -0.1, 0.3, 'GB/s'), - 'DRAMbw': (ref_DRAMbw, -0.1, 0.3, 'GB/s'), - } - } - - # else: - -#}}} diff --git a/cscs-checks/tools/profiling_and_debugging/likwid_roofline.py b/cscs-checks/tools/profiling_and_debugging/likwid_roofline.py deleted file mode 100644 index 8a8ab08c84..0000000000 --- a/cscs-checks/tools/profiling_and_debugging/likwid_roofline.py +++ /dev/null @@ -1,181 +0,0 @@ -import os - -import reframe as rfm -import reframe.utility.sanity as sn - - -@rfm.parameterized_test(*[[repeat, toolsversion, datalayout] - for repeat in ['600000'] - for toolsversion in ['4.3.3'] - # for datalayout in ['G3_AOS_SCALAR'] - for datalayout in ['G3_AOS_SCALAR', 'G3_SOA_SCALAR', - 'G3_AOS_VECTOR', 'G3_SOA_VECTOR'] - ]) -class IntelRooflineLikwidTest(rfm.RegressionTest): - '''This test checks the values reported by RRZE likwid roofline model: - -G3_AOS_SCALAR DP Mflops/sec = 3280.32 L2 bandwidth [MBytes/s] = 39441.3 0.0831697 -G3_AOS_VECTOR DP Mflops/sec = 6432.24 L2 bandwidth [MBytes/s] = 76914 0.083629 -G3_SOA_SCALAR DP Mflops/sec = 3288.39 L2 bandwidth [MBytes/s] = 9.98179 329.439 -G3_SOA_VECTOR DP Mflops/sec = 21126.6 L2 bandwidth [MBytes/s] = 9.6529 2188.63 2.3F/B - 10GF 60000 0.18 - - > https://crd.lbl.gov/assets/Uploads/ECP18-Roofline-3-LIKWID.pdf - > likwid-perfctr -g CACHES -H - - > Get group definition with (identical result): - > cat $EBROOTLIKWID/share/likwid/perfgroups/broadwell/FLOPS_DP.txt - > srun -Cmc,perf -n1 -t1 likwid-perfctr -g FLOPS_DP -H - DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2 + - FP_ARITH_INST_RETIRED_SCALAR_DOUBLE + - FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4) - /runtime - - > srun -Cmc,perf -n1 -t1 likwid-perfctr -g MEM -H - Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0) + - SUM(MBOXxC1))*64.0/runtime - Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0) + - SUM(MBOXxC1))*64.0 - - > srun -Cmc,perf -t1 -n1 likwid-perfctr -g L2 -H - L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT + L2_TRANS_L1D_WB + - ICACHE_MISSES)*64.0/time - L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT + L2_TRANS_L1D_WB + - ICACHE_MISSES)*64.0 - - > srun -Cmc,perf -t1 -n1 likwid-perfctr -g L3 -H - L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL + - L2_LINES_OUT_DEMAND_DIRTY)*64/time - L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL + - L2_LINES_OUT_DEMAND_DIRTY)*64 - - > srun -Cmc,perf -t1 -n1 likwid-perfctr -g CACHES -H - Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD) + - SUM(CAS_COUNT_WR))*64.0/time - Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD) + - SUM(CAS_COUNT_WR))*64.0 - ''' - def __init__(self, repeat, toolsversion, datalayout): - super().__init__() - self.descr = 'Roofline Analysis test with Likwid:' - self.valid_systems = ['dom:mc'] - # Reporting MFLOPS is not available on Intel Haswell cpus, see - # https://www.intel.fr/content/dam/www/public/us/en/documents/manuals/ - # 64-ia-32-architectures-software-developer-vol-1-manual.pdf - self.valid_prog_environs = ['PrgEnv-intel'] - self.modules = ['likwid'] - # likwid/4.3.3-perf_event - self.sourcesdir = os.path.join(self.current_system.resourcesdir, - 'roofline', 'intel_advisor') - self.build_system = 'SingleSource' - self.sourcepath = '_roofline.cpp' - self.executable = 'likwid-perfctr' - self.target_executable = './roof.exe' - self.build_system.cppflags = ['-D_LIKWID', '-DLIKWID_PERFMON', - '-I$EBROOTLIKWID/include'] - self.prgenv_flags = { - 'PrgEnv-intel': ['-g', '-O2', '-std=c++11', '-restrict'], - # '-qopt-streaming-stores', 'always', - } - self.build_system.ldflags = ['-L$EBROOTLIKWID/lib', '-llikwid'] - self.prebuild_cmd = [ - 'patch -s < LIKWID/roofline_template.patch', - 'sed -e "s-XXXX-%s-" -e "s-YYYY-%s-" %s &> %s' % - (repeat, datalayout, 'roofline_template.cpp', '_roofline.cpp') - ] - self.exclusive = True - self.num_tasks = 1 - self.num_tasks_per_node = 1 - self.num_cpus_per_task = 1 - self.num_tasks_per_core = 1 - self.use_multithreading = False - self.variables = { - 'OMP_NUM_THREADS': str(self.num_cpus_per_task), - 'CRAYPE_LINK_TYPE': 'dynamic', - } - self.pre_run = [ - 'mv %s %s' % (self.executable, self.target_executable), - ] - self.tool_flags = ['-C 0 -g FLOPS_DP -m %s ' % self.target_executable] - # -C 0 : sets processor id(s) to pin threads and measure - # -g : sets performance group - # -m : use likwid API - self.executable_opts = self.tool_flags - self.maintainers = ['JG'] - self.tags = {'scs', 'resources'} - # self.rpt = '%s.rpt' % self.target_executable - self.sanity_patterns = sn.all([ - sn.assert_found('loop complete.', self.stdout), - sn.assert_eq(sn.extractsingle( - r'^likwid-perfctr -- Version (?P\d.\d.\d)', - self.stdout, 'toolsversion'), toolsversion), - ]) - # References for Intel Broadwell CPU (E5-2695 v4): - references = { - 'G3_AOS_SCALAR': { - 'dom:mc': { - 'gflops': (0.596, -0.1, 0.3, 'Gflop/s'), - 'ai': (0.16, -0.05, 0.05, 'flop/byte') - } - }, - 'G3_SOA_SCALAR': { - 'dom:mc': { - 'gflops': (0.612, -0.1, 0.3, 'Gflop/s'), - 'ai': (0.16, -0.05, 0.05, 'flop/byte') - } - }, - 'G3_AOS_VECTOR': { - 'dom:mc': { - 'gflops': (1.152, -0.1, 0.3, 'Gflop/s'), - 'ai': (0.125, -0.05, 0.05, 'flop/byte') - } - }, - 'G3_SOA_VECTOR': { - 'dom:mc': { - 'gflops': (1.125, -0.1, 0.3, 'Gflop/s'), - 'ai': (0.16, -0.05, 0.05, 'flop/byte') - } - }, - } - self.reference = references[datalayout] - self.perf_patterns = { - 'gflops': self.gflops, - 'ai': self.arithmetic_intensity, - } - - def setup(self, partition, environ, **job_opts): - super().setup(partition, environ, **job_opts) - run_cmd = ' '.join(self.job.launcher.command(self.job)) - self.post_run = ['%s -v' % self.executable] - # self.perf_group = ['L2', 'L3'] - self.perf_group = ['L2', 'L3', 'CACHES', 'DATA', - 'MEM', 'MEM_DP', 'MEM_SP'] - for perf_group in self.perf_group: - self.post_run += ['%s %s -C 0 -g %s -m %s' % - (run_cmd, self.executable, perf_group, - self.target_executable)] - partitiontype = partition.fullname.split(':')[1] - if partitiontype == 'gpu': - self.job.options = ['--constraint="gpu&perf"'] - elif partitiontype == 'mc': - self.job.options = ['--constraint="mc&perf"'] - - @property - @sn.sanity_function - def arithmetic_intensity(self): - flops = sn.extractsingle(r'^--->Total FLOPs = (?P\d+)', - self.rpt, 'flops', int) - bytes = sn.extractsingle(r'^--->Total Bytes = (?P\d+)', - self.rpt, 'bytes', int) - # debug: print('ai={}'.format(flops/bytes)) - return flops/bytes - - @property - @sn.sanity_function - def gflops(self): - flops = sn.extractsingle(r'^--->Total FLOPs = (?P\d+)', - self.rpt, 'flops', int) - msec = sn.extractsingle(r'^elapsed time: (?P\d+)ms', self.stdout, - 'msec', float) - # debug: print('gflops={}'.format(flops/((msec/1000)*10**6))) - return (flops/((msec/1000))/10**9) From bfd0c52a76da6d224655c393bba655af6f15b30a Mon Sep 17 00:00:00 2001 From: jgp Date: Fri, 11 Oct 2019 09:44:21 +0200 Subject: [PATCH 3/6] pep8 --- cscs-checks/libraries/math/scalapack_compile_run.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cscs-checks/libraries/math/scalapack_compile_run.py b/cscs-checks/libraries/math/scalapack_compile_run.py index 62ad1d7b79..b9ff658909 100644 --- a/cscs-checks/libraries/math/scalapack_compile_run.py +++ b/cscs-checks/libraries/math/scalapack_compile_run.py @@ -75,14 +75,14 @@ def scalapack_sanity(number1, number2, expected_value): # class ScaLAPACKPerf(ScaLAPACKTest): # def __init__(self, linkage): # super().__init__(linkage) -# +# # self.tags |= {'monch_acceptance'} # self.sourcepath = 'scalapack_performance_compile_run.f' # self.valid_systems = ['monch:compute'] # self.valid_prog_environs = ['PrgEnv-gnu'] # self.num_tasks = 64 # self.num_tasks_per_node = 16 -# +# # self.sanity_patterns = sn.assert_found(r'Run', self.stdout) # self.perf_patterns = { # 'perf': sn.max( @@ -90,7 +90,7 @@ def scalapack_sanity(number1, number2, expected_value): # self.stdout, 'gflops', float) # ) # } -# +# # self.reference = { # 'monch:compute': { # 'perf': (24., -0.1, None) From 620725146850f7368e3d6b25b1da5c23301e83d3 Mon Sep 17 00:00:00 2001 From: jgp Date: Tue, 15 Oct 2019 12:42:55 +0200 Subject: [PATCH 4/6] change tagname --- cscs-checks/apps/amber/amber_check.py | 2 +- cscs-checks/apps/espresso/espresso_check.py | 2 +- cscs-checks/apps/gromacs/gromacs_check.py | 2 +- cscs-checks/apps/icon/rrtmgp_check.py | 2 +- cscs-checks/apps/lammps/lammps_check.py | 2 +- cscs-checks/apps/namd/namd_check.py | 2 +- cscs-checks/apps/openfoam/check_openfoam.py | 2 +- .../apps/openfoam/check_openfoam_extend.py | 2 +- cscs-checks/cuda/cuda_checks.py | 2 +- cscs-checks/cuda/multi_gpu.py | 2 +- cscs-checks/cuda/nvml_check.py | 2 +- .../libraries/io/netcdf_compile_run.py | 2 +- .../libraries/math/scalapack_compile_run.py | 2 +- cscs-checks/mch/fieldextra_check.py | 2 +- .../microbenchmarks/hpcg/hpcg_benchmark.py | 2 +- .../microbenchmarks/spec-accel/spec.py | 2 +- cscs-checks/system/io/ior_check.py | 2 +- cscs-checks/tools/io/cdo.py | 2 +- cscs-checks/tools/io/nco.py | 2 +- .../berkeley-ert-nvprof.py | 2 +- .../berkeley-ert-serial.py | 170 ++++++++++++++++ .../profiling_and_debugging/berkeley-ert.py | 2 +- .../intel_advisor_roofline.py | 2 +- .../intel_sde_berkeley_stream.py | 2 +- .../intel_sde_roofline.py | 2 +- .../intel_vtune_roofline.py | 2 +- .../likwid_roofline.py | 181 ++++++++++++++++++ 27 files changed, 376 insertions(+), 25 deletions(-) create mode 100644 cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py create mode 100644 cscs-checks/tools/profiling_and_debugging/likwid_roofline.py diff --git a/cscs-checks/apps/amber/amber_check.py b/cscs-checks/apps/amber/amber_check.py index 4ddce619d2..8594d30394 100644 --- a/cscs-checks/apps/amber/amber_check.py +++ b/cscs-checks/apps/amber/amber_check.py @@ -38,7 +38,7 @@ def __init__(self, input_file, output_file): output_file, 'perf', float, item=1) } self.maintainers = ['SO', 'VH'] - self.tags = {'scs', 'resources'} + self.tags = {'scs', 'external-resources'} @rfm.required_version('>=2.16') diff --git a/cscs-checks/apps/espresso/espresso_check.py b/cscs-checks/apps/espresso/espresso_check.py index 876ae49753..c21c900f65 100644 --- a/cscs-checks/apps/espresso/espresso_check.py +++ b/cscs-checks/apps/espresso/espresso_check.py @@ -11,7 +11,7 @@ def __init__(self, scale): super().__init__() self.descr = 'Quantum Espresso CPU check' self.maintainers = ['AK', 'LM'] - self.tags = {'scs', 'production', 'resources'} + self.tags = {'scs', 'production', 'external-resources'} self.sourcesdir = os.path.join(self.current_system.resourcesdir, 'Espresso') diff --git a/cscs-checks/apps/gromacs/gromacs_check.py b/cscs-checks/apps/gromacs/gromacs_check.py index 81bc39d767..213db9531c 100644 --- a/cscs-checks/apps/gromacs/gromacs_check.py +++ b/cscs-checks/apps/gromacs/gromacs_check.py @@ -41,7 +41,7 @@ def __init__(self, output_file): 'num_switches': 1 } } - self.tags = {'scs', 'resources'} + self.tags = {'scs', 'external-resources'} @rfm.required_version('>=2.19') diff --git a/cscs-checks/apps/icon/rrtmgp_check.py b/cscs-checks/apps/icon/rrtmgp_check.py index 9d157e7ea9..e6e332c674 100644 --- a/cscs-checks/apps/icon/rrtmgp_check.py +++ b/cscs-checks/apps/icon/rrtmgp_check.py @@ -14,7 +14,7 @@ def __init__(self): self.valid_prog_environs = ['PrgEnv-pgi'] self.sourcesdir = os.path.join(self.current_system.resourcesdir, 'RRTMGP') - self.tags = {'resources'} + self.tags = {'external-resources'} self.prebuild_cmd = ['cp build/Makefile.conf.dom build/Makefile.conf'] self.executable = 'python' self.executable_opts = [ diff --git a/cscs-checks/apps/lammps/lammps_check.py b/cscs-checks/apps/lammps/lammps_check.py index 23a69cff68..017dd0b067 100644 --- a/cscs-checks/apps/lammps/lammps_check.py +++ b/cscs-checks/apps/lammps/lammps_check.py @@ -33,7 +33,7 @@ def __init__(self): } } - self.tags = {'scs', 'resources'} + self.tags = {'scs', 'external-resources'} self.maintainers = ['TR', 'VH'] diff --git a/cscs-checks/apps/namd/namd_check.py b/cscs-checks/apps/namd/namd_check.py index eb91d82520..31b3095c5c 100644 --- a/cscs-checks/apps/namd/namd_check.py +++ b/cscs-checks/apps/namd/namd_check.py @@ -44,7 +44,7 @@ def __init__(self, arch, scale, variant): } self.maintainers = ['CB', 'LM'] - self.tags = {'scs', 'resources'} + self.tags = {'scs', 'external-resources'} self.strict_check = False self.extra_resources = { 'switches': { diff --git a/cscs-checks/apps/openfoam/check_openfoam.py b/cscs-checks/apps/openfoam/check_openfoam.py index dff3d6de8e..48dd58b4ba 100644 --- a/cscs-checks/apps/openfoam/check_openfoam.py +++ b/cscs-checks/apps/openfoam/check_openfoam.py @@ -22,7 +22,7 @@ def __init__(self): self.num_cpus_per_task = 1 self.maintainers = ['MKr'] - self.tags = {'scs', 'production', 'resources'} + self.tags = {'scs', 'production', 'external-resources'} self.pre_run = ['source $FOAM_BASH'] diff --git a/cscs-checks/apps/openfoam/check_openfoam_extend.py b/cscs-checks/apps/openfoam/check_openfoam_extend.py index d6b7d7194d..9ec5285f6d 100644 --- a/cscs-checks/apps/openfoam/check_openfoam_extend.py +++ b/cscs-checks/apps/openfoam/check_openfoam_extend.py @@ -28,7 +28,7 @@ def __init__(self): r'Finalising parallel run', self.stdout) self.maintainers = ['MKr'] - self.tags = {'scs', 'production', 'resources'} + self.tags = {'scs', 'production', 'external-resources'} self.pre_run = ['source $FOAM_INST_DIR/foam-extend-4.0/etc/bashrc'] diff --git a/cscs-checks/cuda/cuda_checks.py b/cscs-checks/cuda/cuda_checks.py index 73b84ca02b..aa6ad63e22 100644 --- a/cscs-checks/cuda/cuda_checks.py +++ b/cscs-checks/cuda/cuda_checks.py @@ -27,7 +27,7 @@ def __init__(self): self.nvidia_sm = '37' self.maintainers = ['AJ', 'VK'] - self.tags = {'production', 'resources'} + self.tags = {'production', 'external-resources'} @rfm.required_version('>=2.14') diff --git a/cscs-checks/cuda/multi_gpu.py b/cscs-checks/cuda/multi_gpu.py index 1392ea1df4..f2b94f7def 100644 --- a/cscs-checks/cuda/multi_gpu.py +++ b/cscs-checks/cuda/multi_gpu.py @@ -61,7 +61,7 @@ def __init__(self): 'kesch:cn:d2h': (7584, -0.1, None, 'MB/s'), 'kesch:cn:d2d': (137408, -0.1, None, 'MB/s') } - self.tags = {'diagnostic', 'mch', 'resources'} + self.tags = {'diagnostic', 'mch', 'external-resources'} self.maintainers = ['AJ', 'VK'] def _xfer_pattern(self, xfer_kind, devno, nodename): diff --git a/cscs-checks/cuda/nvml_check.py b/cscs-checks/cuda/nvml_check.py index f27d3f5a9b..44553c66c2 100644 --- a/cscs-checks/cuda/nvml_check.py +++ b/cscs-checks/cuda/nvml_check.py @@ -22,4 +22,4 @@ def __init__(self): r"compute\s+mode\s+'Exclusive Process'", self.stdout) self.maintainers = ['AJ', 'VK'] - self.tags = {'production', 'resources'} + self.tags = {'production', 'external-resources'} diff --git a/cscs-checks/libraries/io/netcdf_compile_run.py b/cscs-checks/libraries/io/netcdf_compile_run.py index 37a3d2df7f..ee77b005c2 100644 --- a/cscs-checks/libraries/io/netcdf_compile_run.py +++ b/cscs-checks/libraries/io/netcdf_compile_run.py @@ -40,7 +40,7 @@ def __init__(self, lang, linkage): self.num_tasks_per_node = 1 self.sanity_patterns = sn.assert_found(r'SUCCESS', self.stdout) self.maintainers = ['AJ', 'VK'] - self.tags = {'production', 'resources'} + self.tags = {'production', 'external-resources'} def setup(self, partition, environ, **job_opts): if self.current_system.name == 'kesch': diff --git a/cscs-checks/libraries/math/scalapack_compile_run.py b/cscs-checks/libraries/math/scalapack_compile_run.py index b9ff658909..be040662d9 100644 --- a/cscs-checks/libraries/math/scalapack_compile_run.py +++ b/cscs-checks/libraries/math/scalapack_compile_run.py @@ -27,7 +27,7 @@ def __init__(self, linkage): self.build_system = 'SingleSource' self.build_system.fflags = ['-O3'] self.maintainers = ['CB', 'LM', 'MKr'] - self.tags = {'production', 'resources'} + self.tags = {'production', 'external-resources'} @rfm.required_version('>=2.14') diff --git a/cscs-checks/mch/fieldextra_check.py b/cscs-checks/mch/fieldextra_check.py index b10df5bf65..60129d7887 100644 --- a/cscs-checks/mch/fieldextra_check.py +++ b/cscs-checks/mch/fieldextra_check.py @@ -8,7 +8,7 @@ class FieldextraTestBase(rfm.RunOnlyRegressionTest): def __init__(self): super().__init__() self.maintainers = ['MKr'] - self.tags = {'mch', 'resources'} + self.tags = {'mch', 'external-resources'} self.valid_systems = ['kesch:cn'] self.valid_prog_environs = ['PrgEnv-gnu-nompi'] diff --git a/cscs-checks/microbenchmarks/hpcg/hpcg_benchmark.py b/cscs-checks/microbenchmarks/hpcg/hpcg_benchmark.py index 65548472ac..939298704c 100644 --- a/cscs-checks/microbenchmarks/hpcg/hpcg_benchmark.py +++ b/cscs-checks/microbenchmarks/hpcg/hpcg_benchmark.py @@ -52,7 +52,7 @@ def __init__(self): } self.maintainers = ['SK'] - self.tags = {'diagnostic', 'benchmark', 'resources'} + self.tags = {'diagnostic', 'benchmark', 'external-resources'} @property @sn.sanity_function diff --git a/cscs-checks/microbenchmarks/spec-accel/spec.py b/cscs-checks/microbenchmarks/spec-accel/spec.py index fc219d9e14..e6cac960bd 100644 --- a/cscs-checks/microbenchmarks/spec-accel/spec.py +++ b/cscs-checks/microbenchmarks/spec-accel/spec.py @@ -53,7 +53,7 @@ def __init__(self, prg_envs): } self.maintainers = ['SK'] - self.tags = {'diagnostic', 'resources'} + self.tags = {'diagnostic', 'external-resources'} def setup(self, partition, environ, **job_opts): self.pre_run = ['source ./shrc', 'mv %s config' % diff --git a/cscs-checks/system/io/ior_check.py b/cscs-checks/system/io/ior_check.py index 757d1493a5..874113b61e 100644 --- a/cscs-checks/system/io/ior_check.py +++ b/cscs-checks/system/io/ior_check.py @@ -108,7 +108,7 @@ def __init__(self, base_dir): systems_to_test = ['dom', 'daint'] if self.current_system.name in systems_to_test: - self.tags |= {'production', 'resources'} + self.tags |= {'production', 'external-resources'} def setup(self, partition, environ, **job_opts): super().setup(partition, environ, **job_opts) diff --git a/cscs-checks/tools/io/cdo.py b/cscs-checks/tools/io/cdo.py index 400d29d0fc..62e659ee05 100644 --- a/cscs-checks/tools/io/cdo.py +++ b/cscs-checks/tools/io/cdo.py @@ -40,7 +40,7 @@ def __init__(self): self.modules = ['CDO'] self.maintainers = ['SO'] - self.tags = {'production', 'mch', 'resources'} + self.tags = {'production', 'mch', 'external-resources'} # Check that the netCDF loaded by the CDO module supports the nc4 filetype diff --git a/cscs-checks/tools/io/nco.py b/cscs-checks/tools/io/nco.py index 9997f90ecf..8a6a3f2e98 100644 --- a/cscs-checks/tools/io/nco.py +++ b/cscs-checks/tools/io/nco.py @@ -33,7 +33,7 @@ def __init__(self): self.modules = ['NCO'] self.maintainers = ['SO'] - self.tags = {'production', 'mch', 'resources'} + self.tags = {'production', 'mch', 'external-resources'} # Check that the netCDF loaded by the NCO module supports the nc4 filetype diff --git a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py index 933ffab571..7bfc7c64fa 100644 --- a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py +++ b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py @@ -72,7 +72,7 @@ def __init__(self, gpudims, flop, repeat): ] self.build_system.ldflags = ['-O3'] self.maintainers = ['JG'] - self.tags = {'scs', 'resources'} + self.tags = {'scs', 'external-resources'} gpu_blocks, gpu_threads = gpudims self.name = 'ertgpu_Run.{}_FLOPS.{}_GPUBlocks.{}_GPUThreads.{}'.format( repeat, flop, gpu_blocks, gpu_threads) diff --git a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py new file mode 100644 index 0000000000..68a30c8bb4 --- /dev/null +++ b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py @@ -0,0 +1,170 @@ +import os + +import reframe as rfm +import reframe.utility.sanity as sn + + +#{{{ base +class ErtTestBase(rfm.RegressionTest): + """ + The Empirical Roofline Tool, ERT, automatically generates roofline data. + https://bitbucket.org/berkeleylab/cs-roofline-toolkit/ + """ + + def __init__(self): + self.descr = 'Empirical Roofline Toolkit' + self.sourcesdir = os.path.join(self.current_system.external-external-resourcesdir, + 'roofline', 'ert') + self.build_system = 'SingleSource' + self.sourcepath = 'kernel1.c driver1.c' + self.executable = 'ert.exe' + self.build_system.ldflags = ['-O3', '-fopenmp'] + self.sourcesdir = os.path.join(self.current_system.external-external-resourcesdir, + 'roofline', 'ert') + self.rpt = '%s.rpt' % self.executable + self.maintainers = ['JG'] + self.tags = {'scs', 'external-external-resources'} + + def setup(self, partition, environ, **job_opts): + super().setup(partition, environ, **job_opts) + if self.num_tasks != 36: + self.job.launcher.options = ['--cpu-bind=verbose,none'] +#}}} + +#{{{ test +@rfm.parameterized_test( + *[[num_ranks, flop] + for num_ranks in [1] + for flop in [256, 512, 1024]]) + #for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]]) +class ErtBroadwellTest(ErtTestBase): + def __init__(self, num_ranks, flop): + super().__init__() + ompthread = 1 + self.valid_systems = ['daint:mc', 'dom:mc'] + self.valid_prog_environs = ['PrgEnv-gnu'] + self.build_system.cppflags = [ + '-DERT_FLOP=%s' % flop, + '-DERT_ALIGN=32', + '-DERT_MEMORY_MAX=1073741824', + '-DERT_MPI=True', + '-DERT_OPENMP=True', + '-DERT_TRIALS_MIN=1', + '-DERT_WORKING_SET_MIN=1', + ] + self.name = 'ert_serial_FLOPS.{:04d}_MPI.{:03d}_OpenMP.{:03d}'.format( + flop, num_ranks, ompthread) + self.exclusive = True + self.num_tasks = num_ranks + self.num_tasks_per_node = num_ranks + self.num_cpus_per_task = ompthread + self.num_tasks_per_core = 1 + self.use_multithreading = False + self.variables = { + 'CRAYPE_LINK_TYPE': 'dynamic', + 'OMP_NUM_THREADS': str(self.num_cpus_per_task) + } + + # take the "slowest" job, make it sleep after it has ended and hope the + # other jobs have ended too + # TODO: find a better way to wait for the other jobs to end + num_ranks_min = 1 + flop_min = 1024 + self.roofline_rpt = 'rpt' + if num_ranks == num_ranks_min and flop == flop_min: + self.post_run = [ + 'cat *_job.out | python2 preprocess.py > pre', + 'python2 maximum.py < pre > max', + 'python2 summary.py < max > sum', + # give enough time for all the dependent jobs to collect data: + 'sleep 60', + 'cat ../ert_serial_FLOPS*/sum | python2 roofline.py > rpt', + ] + + else: + self.post_run = [ + 'cat *_job.out | python2 preprocess.py > pre', + 'python2 maximum.py < pre > max', + 'python2 summary.py < max > sum', + ] + + # --- Sanity check: + regex_datatype = (r'^\s+(?P\w+) \* __restrict__ buf = ' + r'\(\w+ \*\)malloc\(PSIZE\);') + datatype = sn.extractsingle(regex_datatype, 'driver1.c', 'type') + self.sanity_patterns = sn.all([ + sn.assert_found('GFLOPs', 'sum'), + sn.assert_eq(datatype, 'double'), + ]) + + # --- Performance check: + if num_ranks == num_ranks_min and flop == flop_min: + # Reference roofline boundaries for Intel BroadwellCPU (E5-2695v4): + ref_GFLOPs = 945.0 + ref_L1bw = 1788.0 + ref_L2bw = 855.0 + ref_L3bw = 547.0 + ref_DRAMbw = 70.5 + + # Typical performance report looks like: + # -------------------------------------- + # ert_FLOPS.1024_MPI.001_OpenMP.036/rpt + # 908.43 GFLOPs EMP + # ****** + # META_DATA + # OPENMP_THREADS 1 + # FLOPS 8 + # MPI_PROCS 36 + # + # 5647.33 L1 EMP + # ******* + # 3203.86 L2 EMP + # ******* + # 1773.58 L3 EMP + # ******* + # 139.56 L4 EMP + # 103.50 DRAM EMP + # ****** + # META_DATA + # FLOPS 2 + # OPENMP_THREADS 1 + # MPI_PROCS 36 + regex_gflops = r'(?P\d+.\d+)\sGFLOPs EMP' + regex_L1bw = r'(?P\d+.\d+)\sL1 EMP' + regex_L2bw = r'(?P\d+.\d+)\sL2 EMP' + regex_L3bw = r'(?P\d+.\d+)\sL3 EMP' + regex_DRAMbw = r'(?P\d+.\d+) DRAM EMP' + + gflops = sn.extractsingle(regex_gflops, self.roofline_rpt, + 'GFLOPs', float) + L1bw = sn.extractsingle(regex_L1bw, self.roofline_rpt, + 'L1bw', float) + L2bw = sn.extractsingle(regex_L2bw, self.roofline_rpt, + 'L2bw', float) + L3bw = sn.extractsingle(regex_L3bw, self.roofline_rpt, + 'L3bw', float) + DRAMbw = sn.extractsingle(regex_DRAMbw, self.roofline_rpt, + 'DRAMbw', float) + + # --performance-report: + self.perf_patterns = { + 'gflops': gflops, + 'L1bw': L1bw, + 'L2bw': L2bw, + 'L3bw': L3bw, + 'DRAMbw': DRAMbw, + } + + self.reference = { + '*': { + 'gflops': (ref_GFLOPs, -0.1, 0.5, 'GF/s'), + 'L1bw': (ref_L1bw, -0.1, 0.3, 'GB/s'), + 'L2bw': (ref_L2bw, -0.1, 0.3, 'GB/s'), + 'L3bw': (ref_L3bw, -0.1, 0.3, 'GB/s'), + 'DRAMbw': (ref_DRAMbw, -0.1, 0.3, 'GB/s'), + } + } + + # else: + +#}}} diff --git a/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py b/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py index 580aff48e8..e9cef9be20 100644 --- a/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py +++ b/cscs-checks/tools/profiling_and_debugging/berkeley-ert.py @@ -23,7 +23,7 @@ def __init__(self): 'roofline', 'ert') self.rpt = '%s.rpt' % self.executable self.maintainers = ['JG'] - self.tags = {'scs', 'resources'} + self.tags = {'scs', 'external-resources'} def setup(self, partition, environ, **job_opts): super().setup(partition, environ, **job_opts) diff --git a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py b/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py index 4172410021..6c260b9e74 100644 --- a/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py +++ b/cscs-checks/tools/profiling_and_debugging/intel_advisor_roofline.py @@ -238,7 +238,7 @@ def __init__(self, repeat, toolversion, datalayout): } self.maintainers = ['JG'] - self.tags = {'production', 'resources'} + self.tags = {'production', 'external-resources'} def setup(self, partition, environ, **job_opts): super().setup(partition, environ, **job_opts) diff --git a/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py b/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py index 489f4ee9dd..b3ff46d5b4 100644 --- a/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py +++ b/cscs-checks/tools/profiling_and_debugging/intel_sde_berkeley_stream.py @@ -35,7 +35,7 @@ def __init__(self): self.sanity_patterns = sn.assert_found('Total FLOPs =', self.rpt) self.post_run = ['SDE/parse-sde.sh %s.* &> %s' % (self.sde, self.rpt)] self.maintainers = ['JG'] - self.tags = {'scs', 'resources'} + self.tags = {'scs', 'external-resources'} @property @sn.sanity_function diff --git a/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py b/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py index b6b3015d06..ab98a6cefb 100644 --- a/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py +++ b/cscs-checks/tools/profiling_and_debugging/intel_sde_roofline.py @@ -68,7 +68,7 @@ def __init__(self, repeat, toolsversion, datalayout): self.sanity_patterns = sn.assert_found('Total FLOPs =', self.rpt) self.post_run = ['SDE/parse-sde.sh %s.* &> %s' % (self.sde, self.rpt)] self.maintainers = ['JG'] - self.tags = {'scs', 'resources'} + self.tags = {'scs', 'external-resources'} self.sanity_patterns = sn.all([ sn.assert_eq(sn.extractsingle( r'^Intel\(R\) Software Development Emulator\. Version: ' diff --git a/cscs-checks/tools/profiling_and_debugging/intel_vtune_roofline.py b/cscs-checks/tools/profiling_and_debugging/intel_vtune_roofline.py index 46e173e8a5..ad745a924b 100644 --- a/cscs-checks/tools/profiling_and_debugging/intel_vtune_roofline.py +++ b/cscs-checks/tools/profiling_and_debugging/intel_vtune_roofline.py @@ -76,7 +76,7 @@ def __init__(self, repeat, toolsversion, datalayout): # NOTE: -allow-multiple-runs requires to install vtune drivers # TODO: -collect memory-access self.maintainers = ['JG'] - self.tags = {'scs', 'resources'} + self.tags = {'scs', 'external-resources'} self.sanity_patterns = sn.all([ sn.assert_found('loop complete.', self.stdout), sn.assert_eq(sn.extractsingle( diff --git a/cscs-checks/tools/profiling_and_debugging/likwid_roofline.py b/cscs-checks/tools/profiling_and_debugging/likwid_roofline.py new file mode 100644 index 0000000000..9eda059c13 --- /dev/null +++ b/cscs-checks/tools/profiling_and_debugging/likwid_roofline.py @@ -0,0 +1,181 @@ +import os + +import reframe as rfm +import reframe.utility.sanity as sn + + +@rfm.parameterized_test(*[[repeat, toolsversion, datalayout] + for repeat in ['600000'] + for toolsversion in ['4.3.3'] + # for datalayout in ['G3_AOS_SCALAR'] + for datalayout in ['G3_AOS_SCALAR', 'G3_SOA_SCALAR', + 'G3_AOS_VECTOR', 'G3_SOA_VECTOR'] + ]) +class IntelRooflineLikwidTest(rfm.RegressionTest): + '''This test checks the values reported by RRZE likwid roofline model: + +G3_AOS_SCALAR DP Mflops/sec = 3280.32 L2 bandwidth [MBytes/s] = 39441.3 0.0831697 +G3_AOS_VECTOR DP Mflops/sec = 6432.24 L2 bandwidth [MBytes/s] = 76914 0.083629 +G3_SOA_SCALAR DP Mflops/sec = 3288.39 L2 bandwidth [MBytes/s] = 9.98179 329.439 +G3_SOA_VECTOR DP Mflops/sec = 21126.6 L2 bandwidth [MBytes/s] = 9.6529 2188.63 2.3F/B + 10GF 60000 0.18 + + > https://crd.lbl.gov/assets/Uploads/ECP18-Roofline-3-LIKWID.pdf + > likwid-perfctr -g CACHES -H + + > Get group definition with (identical result): + > cat $EBROOTLIKWID/share/likwid/perfgroups/broadwell/FLOPS_DP.txt + > srun -Cmc,perf -n1 -t1 likwid-perfctr -g FLOPS_DP -H + DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2 + + FP_ARITH_INST_RETIRED_SCALAR_DOUBLE + + FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4) + /runtime + + > srun -Cmc,perf -n1 -t1 likwid-perfctr -g MEM -H + Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0) + + SUM(MBOXxC1))*64.0/runtime + Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0) + + SUM(MBOXxC1))*64.0 + + > srun -Cmc,perf -t1 -n1 likwid-perfctr -g L2 -H + L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT + L2_TRANS_L1D_WB + + ICACHE_MISSES)*64.0/time + L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT + L2_TRANS_L1D_WB + + ICACHE_MISSES)*64.0 + + > srun -Cmc,perf -t1 -n1 likwid-perfctr -g L3 -H + L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL + + L2_LINES_OUT_DEMAND_DIRTY)*64/time + L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL + + L2_LINES_OUT_DEMAND_DIRTY)*64 + + > srun -Cmc,perf -t1 -n1 likwid-perfctr -g CACHES -H + Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD) + + SUM(CAS_COUNT_WR))*64.0/time + Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD) + + SUM(CAS_COUNT_WR))*64.0 + ''' + def __init__(self, repeat, toolsversion, datalayout): + super().__init__() + self.descr = 'Roofline Analysis test with Likwid:' + self.valid_systems = ['dom:mc'] + # Reporting MFLOPS is not available on Intel Haswell cpus, see + # https://www.intel.fr/content/dam/www/public/us/en/documents/manuals/ + # 64-ia-32-architectures-software-developer-vol-1-manual.pdf + self.valid_prog_environs = ['PrgEnv-intel'] + self.modules = ['likwid'] + # likwid/4.3.3-perf_event + self.sourcesdir = os.path.join(self.current_system.external-external-resourcesdir, + 'roofline', 'intel_advisor') + self.build_system = 'SingleSource' + self.sourcepath = '_roofline.cpp' + self.executable = 'likwid-perfctr' + self.target_executable = './roof.exe' + self.build_system.cppflags = ['-D_LIKWID', '-DLIKWID_PERFMON', + '-I$EBROOTLIKWID/include'] + self.prgenv_flags = { + 'PrgEnv-intel': ['-g', '-O2', '-std=c++11', '-restrict'], + # '-qopt-streaming-stores', 'always', + } + self.build_system.ldflags = ['-L$EBROOTLIKWID/lib', '-llikwid'] + self.prebuild_cmd = [ + 'patch -s < LIKWID/roofline_template.patch', + 'sed -e "s-XXXX-%s-" -e "s-YYYY-%s-" %s &> %s' % + (repeat, datalayout, 'roofline_template.cpp', '_roofline.cpp') + ] + self.exclusive = True + self.num_tasks = 1 + self.num_tasks_per_node = 1 + self.num_cpus_per_task = 1 + self.num_tasks_per_core = 1 + self.use_multithreading = False + self.variables = { + 'OMP_NUM_THREADS': str(self.num_cpus_per_task), + 'CRAYPE_LINK_TYPE': 'dynamic', + } + self.pre_run = [ + 'mv %s %s' % (self.executable, self.target_executable), + ] + self.tool_flags = ['-C 0 -g FLOPS_DP -m %s ' % self.target_executable] + # -C 0 : sets processor id(s) to pin threads and measure + # -g : sets performance group + # -m : use likwid API + self.executable_opts = self.tool_flags + self.maintainers = ['JG'] + self.tags = {'scs', 'external-external-resources'} + # self.rpt = '%s.rpt' % self.target_executable + self.sanity_patterns = sn.all([ + sn.assert_found('loop complete.', self.stdout), + sn.assert_eq(sn.extractsingle( + r'^likwid-perfctr -- Version (?P\d.\d.\d)', + self.stdout, 'toolsversion'), toolsversion), + ]) + # References for Intel Broadwell CPU (E5-2695 v4): + references = { + 'G3_AOS_SCALAR': { + 'dom:mc': { + 'gflops': (0.596, -0.1, 0.3, 'Gflop/s'), + 'ai': (0.16, -0.05, 0.05, 'flop/byte') + } + }, + 'G3_SOA_SCALAR': { + 'dom:mc': { + 'gflops': (0.612, -0.1, 0.3, 'Gflop/s'), + 'ai': (0.16, -0.05, 0.05, 'flop/byte') + } + }, + 'G3_AOS_VECTOR': { + 'dom:mc': { + 'gflops': (1.152, -0.1, 0.3, 'Gflop/s'), + 'ai': (0.125, -0.05, 0.05, 'flop/byte') + } + }, + 'G3_SOA_VECTOR': { + 'dom:mc': { + 'gflops': (1.125, -0.1, 0.3, 'Gflop/s'), + 'ai': (0.16, -0.05, 0.05, 'flop/byte') + } + }, + } + self.reference = references[datalayout] + self.perf_patterns = { + 'gflops': self.gflops, + 'ai': self.arithmetic_intensity, + } + + def setup(self, partition, environ, **job_opts): + super().setup(partition, environ, **job_opts) + run_cmd = ' '.join(self.job.launcher.command(self.job)) + self.post_run = ['%s -v' % self.executable] + # self.perf_group = ['L2', 'L3'] + self.perf_group = ['L2', 'L3', 'CACHES', 'DATA', + 'MEM', 'MEM_DP', 'MEM_SP'] + for perf_group in self.perf_group: + self.post_run += ['%s %s -C 0 -g %s -m %s' % + (run_cmd, self.executable, perf_group, + self.target_executable)] + partitiontype = partition.fullname.split(':')[1] + if partitiontype == 'gpu': + self.job.options = ['--constraint="gpu&perf"'] + elif partitiontype == 'mc': + self.job.options = ['--constraint="mc&perf"'] + + @property + @sn.sanity_function + def arithmetic_intensity(self): + flops = sn.extractsingle(r'^--->Total FLOPs = (?P\d+)', + self.rpt, 'flops', int) + bytes = sn.extractsingle(r'^--->Total Bytes = (?P\d+)', + self.rpt, 'bytes', int) + # debug: print('ai={}'.format(flops/bytes)) + return flops/bytes + + @property + @sn.sanity_function + def gflops(self): + flops = sn.extractsingle(r'^--->Total FLOPs = (?P\d+)', + self.rpt, 'flops', int) + msec = sn.extractsingle(r'^elapsed time: (?P\d+)ms', self.stdout, + 'msec', float) + # debug: print('gflops={}'.format(flops/((msec/1000)*10**6))) + return (flops/((msec/1000))/10**9) From cb8f0877f5f985842d2c083e5aeb4fa83b4246ec Mon Sep 17 00:00:00 2001 From: jgp Date: Tue, 15 Oct 2019 12:43:25 +0200 Subject: [PATCH 5/6] clean --- .../berkeley-ert-serial.py | 170 ---------------- .../likwid_roofline.py | 181 ------------------ 2 files changed, 351 deletions(-) delete mode 100644 cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py delete mode 100644 cscs-checks/tools/profiling_and_debugging/likwid_roofline.py diff --git a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py b/cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py deleted file mode 100644 index 68a30c8bb4..0000000000 --- a/cscs-checks/tools/profiling_and_debugging/berkeley-ert-serial.py +++ /dev/null @@ -1,170 +0,0 @@ -import os - -import reframe as rfm -import reframe.utility.sanity as sn - - -#{{{ base -class ErtTestBase(rfm.RegressionTest): - """ - The Empirical Roofline Tool, ERT, automatically generates roofline data. - https://bitbucket.org/berkeleylab/cs-roofline-toolkit/ - """ - - def __init__(self): - self.descr = 'Empirical Roofline Toolkit' - self.sourcesdir = os.path.join(self.current_system.external-external-resourcesdir, - 'roofline', 'ert') - self.build_system = 'SingleSource' - self.sourcepath = 'kernel1.c driver1.c' - self.executable = 'ert.exe' - self.build_system.ldflags = ['-O3', '-fopenmp'] - self.sourcesdir = os.path.join(self.current_system.external-external-resourcesdir, - 'roofline', 'ert') - self.rpt = '%s.rpt' % self.executable - self.maintainers = ['JG'] - self.tags = {'scs', 'external-external-resources'} - - def setup(self, partition, environ, **job_opts): - super().setup(partition, environ, **job_opts) - if self.num_tasks != 36: - self.job.launcher.options = ['--cpu-bind=verbose,none'] -#}}} - -#{{{ test -@rfm.parameterized_test( - *[[num_ranks, flop] - for num_ranks in [1] - for flop in [256, 512, 1024]]) - #for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]]) -class ErtBroadwellTest(ErtTestBase): - def __init__(self, num_ranks, flop): - super().__init__() - ompthread = 1 - self.valid_systems = ['daint:mc', 'dom:mc'] - self.valid_prog_environs = ['PrgEnv-gnu'] - self.build_system.cppflags = [ - '-DERT_FLOP=%s' % flop, - '-DERT_ALIGN=32', - '-DERT_MEMORY_MAX=1073741824', - '-DERT_MPI=True', - '-DERT_OPENMP=True', - '-DERT_TRIALS_MIN=1', - '-DERT_WORKING_SET_MIN=1', - ] - self.name = 'ert_serial_FLOPS.{:04d}_MPI.{:03d}_OpenMP.{:03d}'.format( - flop, num_ranks, ompthread) - self.exclusive = True - self.num_tasks = num_ranks - self.num_tasks_per_node = num_ranks - self.num_cpus_per_task = ompthread - self.num_tasks_per_core = 1 - self.use_multithreading = False - self.variables = { - 'CRAYPE_LINK_TYPE': 'dynamic', - 'OMP_NUM_THREADS': str(self.num_cpus_per_task) - } - - # take the "slowest" job, make it sleep after it has ended and hope the - # other jobs have ended too - # TODO: find a better way to wait for the other jobs to end - num_ranks_min = 1 - flop_min = 1024 - self.roofline_rpt = 'rpt' - if num_ranks == num_ranks_min and flop == flop_min: - self.post_run = [ - 'cat *_job.out | python2 preprocess.py > pre', - 'python2 maximum.py < pre > max', - 'python2 summary.py < max > sum', - # give enough time for all the dependent jobs to collect data: - 'sleep 60', - 'cat ../ert_serial_FLOPS*/sum | python2 roofline.py > rpt', - ] - - else: - self.post_run = [ - 'cat *_job.out | python2 preprocess.py > pre', - 'python2 maximum.py < pre > max', - 'python2 summary.py < max > sum', - ] - - # --- Sanity check: - regex_datatype = (r'^\s+(?P\w+) \* __restrict__ buf = ' - r'\(\w+ \*\)malloc\(PSIZE\);') - datatype = sn.extractsingle(regex_datatype, 'driver1.c', 'type') - self.sanity_patterns = sn.all([ - sn.assert_found('GFLOPs', 'sum'), - sn.assert_eq(datatype, 'double'), - ]) - - # --- Performance check: - if num_ranks == num_ranks_min and flop == flop_min: - # Reference roofline boundaries for Intel BroadwellCPU (E5-2695v4): - ref_GFLOPs = 945.0 - ref_L1bw = 1788.0 - ref_L2bw = 855.0 - ref_L3bw = 547.0 - ref_DRAMbw = 70.5 - - # Typical performance report looks like: - # -------------------------------------- - # ert_FLOPS.1024_MPI.001_OpenMP.036/rpt - # 908.43 GFLOPs EMP - # ****** - # META_DATA - # OPENMP_THREADS 1 - # FLOPS 8 - # MPI_PROCS 36 - # - # 5647.33 L1 EMP - # ******* - # 3203.86 L2 EMP - # ******* - # 1773.58 L3 EMP - # ******* - # 139.56 L4 EMP - # 103.50 DRAM EMP - # ****** - # META_DATA - # FLOPS 2 - # OPENMP_THREADS 1 - # MPI_PROCS 36 - regex_gflops = r'(?P\d+.\d+)\sGFLOPs EMP' - regex_L1bw = r'(?P\d+.\d+)\sL1 EMP' - regex_L2bw = r'(?P\d+.\d+)\sL2 EMP' - regex_L3bw = r'(?P\d+.\d+)\sL3 EMP' - regex_DRAMbw = r'(?P\d+.\d+) DRAM EMP' - - gflops = sn.extractsingle(regex_gflops, self.roofline_rpt, - 'GFLOPs', float) - L1bw = sn.extractsingle(regex_L1bw, self.roofline_rpt, - 'L1bw', float) - L2bw = sn.extractsingle(regex_L2bw, self.roofline_rpt, - 'L2bw', float) - L3bw = sn.extractsingle(regex_L3bw, self.roofline_rpt, - 'L3bw', float) - DRAMbw = sn.extractsingle(regex_DRAMbw, self.roofline_rpt, - 'DRAMbw', float) - - # --performance-report: - self.perf_patterns = { - 'gflops': gflops, - 'L1bw': L1bw, - 'L2bw': L2bw, - 'L3bw': L3bw, - 'DRAMbw': DRAMbw, - } - - self.reference = { - '*': { - 'gflops': (ref_GFLOPs, -0.1, 0.5, 'GF/s'), - 'L1bw': (ref_L1bw, -0.1, 0.3, 'GB/s'), - 'L2bw': (ref_L2bw, -0.1, 0.3, 'GB/s'), - 'L3bw': (ref_L3bw, -0.1, 0.3, 'GB/s'), - 'DRAMbw': (ref_DRAMbw, -0.1, 0.3, 'GB/s'), - } - } - - # else: - -#}}} diff --git a/cscs-checks/tools/profiling_and_debugging/likwid_roofline.py b/cscs-checks/tools/profiling_and_debugging/likwid_roofline.py deleted file mode 100644 index 9eda059c13..0000000000 --- a/cscs-checks/tools/profiling_and_debugging/likwid_roofline.py +++ /dev/null @@ -1,181 +0,0 @@ -import os - -import reframe as rfm -import reframe.utility.sanity as sn - - -@rfm.parameterized_test(*[[repeat, toolsversion, datalayout] - for repeat in ['600000'] - for toolsversion in ['4.3.3'] - # for datalayout in ['G3_AOS_SCALAR'] - for datalayout in ['G3_AOS_SCALAR', 'G3_SOA_SCALAR', - 'G3_AOS_VECTOR', 'G3_SOA_VECTOR'] - ]) -class IntelRooflineLikwidTest(rfm.RegressionTest): - '''This test checks the values reported by RRZE likwid roofline model: - -G3_AOS_SCALAR DP Mflops/sec = 3280.32 L2 bandwidth [MBytes/s] = 39441.3 0.0831697 -G3_AOS_VECTOR DP Mflops/sec = 6432.24 L2 bandwidth [MBytes/s] = 76914 0.083629 -G3_SOA_SCALAR DP Mflops/sec = 3288.39 L2 bandwidth [MBytes/s] = 9.98179 329.439 -G3_SOA_VECTOR DP Mflops/sec = 21126.6 L2 bandwidth [MBytes/s] = 9.6529 2188.63 2.3F/B - 10GF 60000 0.18 - - > https://crd.lbl.gov/assets/Uploads/ECP18-Roofline-3-LIKWID.pdf - > likwid-perfctr -g CACHES -H - - > Get group definition with (identical result): - > cat $EBROOTLIKWID/share/likwid/perfgroups/broadwell/FLOPS_DP.txt - > srun -Cmc,perf -n1 -t1 likwid-perfctr -g FLOPS_DP -H - DP MFLOP/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2 + - FP_ARITH_INST_RETIRED_SCALAR_DOUBLE + - FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4) - /runtime - - > srun -Cmc,perf -n1 -t1 likwid-perfctr -g MEM -H - Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0) + - SUM(MBOXxC1))*64.0/runtime - Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0) + - SUM(MBOXxC1))*64.0 - - > srun -Cmc,perf -t1 -n1 likwid-perfctr -g L2 -H - L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT + L2_TRANS_L1D_WB + - ICACHE_MISSES)*64.0/time - L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT + L2_TRANS_L1D_WB + - ICACHE_MISSES)*64.0 - - > srun -Cmc,perf -t1 -n1 likwid-perfctr -g L3 -H - L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL + - L2_LINES_OUT_DEMAND_DIRTY)*64/time - L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL + - L2_LINES_OUT_DEMAND_DIRTY)*64 - - > srun -Cmc,perf -t1 -n1 likwid-perfctr -g CACHES -H - Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD) + - SUM(CAS_COUNT_WR))*64.0/time - Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD) + - SUM(CAS_COUNT_WR))*64.0 - ''' - def __init__(self, repeat, toolsversion, datalayout): - super().__init__() - self.descr = 'Roofline Analysis test with Likwid:' - self.valid_systems = ['dom:mc'] - # Reporting MFLOPS is not available on Intel Haswell cpus, see - # https://www.intel.fr/content/dam/www/public/us/en/documents/manuals/ - # 64-ia-32-architectures-software-developer-vol-1-manual.pdf - self.valid_prog_environs = ['PrgEnv-intel'] - self.modules = ['likwid'] - # likwid/4.3.3-perf_event - self.sourcesdir = os.path.join(self.current_system.external-external-resourcesdir, - 'roofline', 'intel_advisor') - self.build_system = 'SingleSource' - self.sourcepath = '_roofline.cpp' - self.executable = 'likwid-perfctr' - self.target_executable = './roof.exe' - self.build_system.cppflags = ['-D_LIKWID', '-DLIKWID_PERFMON', - '-I$EBROOTLIKWID/include'] - self.prgenv_flags = { - 'PrgEnv-intel': ['-g', '-O2', '-std=c++11', '-restrict'], - # '-qopt-streaming-stores', 'always', - } - self.build_system.ldflags = ['-L$EBROOTLIKWID/lib', '-llikwid'] - self.prebuild_cmd = [ - 'patch -s < LIKWID/roofline_template.patch', - 'sed -e "s-XXXX-%s-" -e "s-YYYY-%s-" %s &> %s' % - (repeat, datalayout, 'roofline_template.cpp', '_roofline.cpp') - ] - self.exclusive = True - self.num_tasks = 1 - self.num_tasks_per_node = 1 - self.num_cpus_per_task = 1 - self.num_tasks_per_core = 1 - self.use_multithreading = False - self.variables = { - 'OMP_NUM_THREADS': str(self.num_cpus_per_task), - 'CRAYPE_LINK_TYPE': 'dynamic', - } - self.pre_run = [ - 'mv %s %s' % (self.executable, self.target_executable), - ] - self.tool_flags = ['-C 0 -g FLOPS_DP -m %s ' % self.target_executable] - # -C 0 : sets processor id(s) to pin threads and measure - # -g : sets performance group - # -m : use likwid API - self.executable_opts = self.tool_flags - self.maintainers = ['JG'] - self.tags = {'scs', 'external-external-resources'} - # self.rpt = '%s.rpt' % self.target_executable - self.sanity_patterns = sn.all([ - sn.assert_found('loop complete.', self.stdout), - sn.assert_eq(sn.extractsingle( - r'^likwid-perfctr -- Version (?P\d.\d.\d)', - self.stdout, 'toolsversion'), toolsversion), - ]) - # References for Intel Broadwell CPU (E5-2695 v4): - references = { - 'G3_AOS_SCALAR': { - 'dom:mc': { - 'gflops': (0.596, -0.1, 0.3, 'Gflop/s'), - 'ai': (0.16, -0.05, 0.05, 'flop/byte') - } - }, - 'G3_SOA_SCALAR': { - 'dom:mc': { - 'gflops': (0.612, -0.1, 0.3, 'Gflop/s'), - 'ai': (0.16, -0.05, 0.05, 'flop/byte') - } - }, - 'G3_AOS_VECTOR': { - 'dom:mc': { - 'gflops': (1.152, -0.1, 0.3, 'Gflop/s'), - 'ai': (0.125, -0.05, 0.05, 'flop/byte') - } - }, - 'G3_SOA_VECTOR': { - 'dom:mc': { - 'gflops': (1.125, -0.1, 0.3, 'Gflop/s'), - 'ai': (0.16, -0.05, 0.05, 'flop/byte') - } - }, - } - self.reference = references[datalayout] - self.perf_patterns = { - 'gflops': self.gflops, - 'ai': self.arithmetic_intensity, - } - - def setup(self, partition, environ, **job_opts): - super().setup(partition, environ, **job_opts) - run_cmd = ' '.join(self.job.launcher.command(self.job)) - self.post_run = ['%s -v' % self.executable] - # self.perf_group = ['L2', 'L3'] - self.perf_group = ['L2', 'L3', 'CACHES', 'DATA', - 'MEM', 'MEM_DP', 'MEM_SP'] - for perf_group in self.perf_group: - self.post_run += ['%s %s -C 0 -g %s -m %s' % - (run_cmd, self.executable, perf_group, - self.target_executable)] - partitiontype = partition.fullname.split(':')[1] - if partitiontype == 'gpu': - self.job.options = ['--constraint="gpu&perf"'] - elif partitiontype == 'mc': - self.job.options = ['--constraint="mc&perf"'] - - @property - @sn.sanity_function - def arithmetic_intensity(self): - flops = sn.extractsingle(r'^--->Total FLOPs = (?P\d+)', - self.rpt, 'flops', int) - bytes = sn.extractsingle(r'^--->Total Bytes = (?P\d+)', - self.rpt, 'bytes', int) - # debug: print('ai={}'.format(flops/bytes)) - return flops/bytes - - @property - @sn.sanity_function - def gflops(self): - flops = sn.extractsingle(r'^--->Total FLOPs = (?P\d+)', - self.rpt, 'flops', int) - msec = sn.extractsingle(r'^elapsed time: (?P\d+)ms', self.stdout, - 'msec', float) - # debug: print('gflops={}'.format(flops/((msec/1000)*10**6))) - return (flops/((msec/1000))/10**9) From 3978d5f851aeeb6054425c3fde17e765cf7abb00 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Wed, 16 Oct 2019 16:30:49 +0200 Subject: [PATCH 6/6] Remove completely obsolete Scalapack test --- .../libraries/math/scalapack_compile_run.py | 30 ------------------- 1 file changed, 30 deletions(-) diff --git a/cscs-checks/libraries/math/scalapack_compile_run.py b/cscs-checks/libraries/math/scalapack_compile_run.py index be040662d9..624b1a9770 100644 --- a/cscs-checks/libraries/math/scalapack_compile_run.py +++ b/cscs-checks/libraries/math/scalapack_compile_run.py @@ -66,33 +66,3 @@ def scalapack_sanity(number1, number2, expected_value): scalapack_sanity(4, 3, 0.2483911184660867), scalapack_sanity(4, 4, 0.1701907253504270) ]) - - -# # FIXME: This test is obsolete; it is kept only for reference. -# # NOTE: The test case is very small, but larger cases did not succeed! -# @rfm.required_version('>=2.14') -# @rfm.parameterized_test(['dynamic']) -# class ScaLAPACKPerf(ScaLAPACKTest): -# def __init__(self, linkage): -# super().__init__(linkage) -# -# self.tags |= {'monch_acceptance'} -# self.sourcepath = 'scalapack_performance_compile_run.f' -# self.valid_systems = ['monch:compute'] -# self.valid_prog_environs = ['PrgEnv-gnu'] -# self.num_tasks = 64 -# self.num_tasks_per_node = 16 -# -# self.sanity_patterns = sn.assert_found(r'Run', self.stdout) -# self.perf_patterns = { -# 'perf': sn.max( -# sn.extractall(r'GFLOPS/s:\s+(?P\S+)', -# self.stdout, 'gflops', float) -# ) -# } -# -# self.reference = { -# 'monch:compute': { -# 'perf': (24., -0.1, None) -# } -# }