-
Notifications
You must be signed in to change notification settings - Fork 98
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #775 from jgphpc/UES-213_nvprof
[test] Add ert roofline check (gpu)
- Loading branch information
Showing
1 changed file
with
148 additions
and
0 deletions.
There are no files selected for viewing
148 changes: 148 additions & 0 deletions
148
cscs-checks/tools/profiling_and_debugging/berkeley-ert-nvprof.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,148 @@ | ||
import os | ||
|
||
import reframe as rfm | ||
import reframe.utility.sanity as sn | ||
|
||
|
||
@rfm.parameterized_test(
    *[[gpudims, flop, repeat]
      # gpudims sets (gpu_blocks, gpu_threads):
      for gpudims in [(112, 1024), (224, 512), (448, 256), (896, 128),
                      (1792, 64), (3584, 32)]
      for flop in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
      # self.repeat replaces '-DERT_NUM_EXPERIMENTS=2':
      for repeat in [1, 2]])
class ErtP100Test(rfm.RegressionTest):
    """Check the Empirical Roofline Tool (ERT) on an NVIDIA Tesla P100.

    The Empirical Roofline Tool, ERT, empirically generates roofline data:
    https://bitbucket.org/berkeleylab/cs-roofline-toolkit/

    This test checks the ERT tool with NVIDIA Tesla P100-PCIE-16GB:

        Device 0: "Tesla P100-PCIE-16GB"
        CUDA Driver Version / Runtime Version 10.1 / 10.0
        CUDA Capability Major/Minor version number: 6.0
        (56) Multiprocessors, (64) CUDA Cores/MP: 3584 CUDA Cores
        GPU Max Clock rate: 1329 MHz (1.33 GHz)
        Theoretical peak performance per GPU: 4761 Gflop/s
        Maximum number of threads per multiprocessor: 2048
        Peak number of threads: 114688 threads <---------
        Maximum number of threads per block: 1024 <---------
        NVRM version: NVIDIA UNIX x86_64 Kernel Module 418.39

    One test is generated per (gpudims, flop, repeat) combination. Every
    test builds and runs ERT and summarizes its own output into ``sum``.
    Only the test with the last parameter combination additionally merges
    the ``sum`` files of all ``ertgpu_Run*`` tests into the roofline report
    and checks it against the reference values.

    The following python code can help for a parameter space study
    (use --exec-policy=async):

        max_threads_per_block = 1024
        max_threads = 114688
        gpu_threads = max_threads_per_block * 2
        while gpu_threads > 32:
            gpu_threads = gpu_threads // 2
            gpu_blocks = max_threads // gpu_threads
            nth = gpu_threads * gpu_blocks
            print('{} {} {} {}'.format(gpu_blocks, gpu_threads, nth,
                                       max_threads))
    """

    def __init__(self, gpudims, flop, repeat):
        """Configure one ERT build/run for the given parameter combination.

        :arg gpudims: ``(gpu_blocks, gpu_threads)`` pair, passed to the
            executable as its two command line arguments.
        :arg flop: value compiled in through ``-DERT_FLOP``.
        :arg repeat: run index; it is part of the test name and, together
            with ``gpudims`` and ``flop``, identifies the last test.
        """
        super().__init__()
        # Parameter values that identify the last test of the parameter
        # space; they must match the last entries of the decorator lists.
        max_gpu_blocks = 3584
        max_flops = 1024
        max_repeat = 2
        gpu_blocks, gpu_threads = gpudims

        self.descr = 'Empirical Roofline Toolkit'
        self.name = 'ertgpu_Run.{}_FLOPS.{}_GPUBlocks.{}_GPUThreads.{}'.format(
            repeat, flop, gpu_blocks, gpu_threads)
        self.maintainers = ['JG']
        self.tags = {'scs'}
        self.valid_systems = ['dom:gpu']
        self.valid_prog_environs = ['PrgEnv-gnu']
        self.modules = ['craype-accel-nvidia60']
        self.sourcesdir = os.path.join(self.current_system.resourcesdir,
                                       'roofline', 'ert')

        # --- build
        # A single input file is required for nvcc to work:
        self.build_system = 'SingleSource'
        self.prebuild_cmd = [
            'cat kernel1.c driver1.c | sed "s-^#if ERT-#ifdef ERT-g" > '
            '_gpu.cu']
        self.sourcepath = '_gpu.cu'
        self.executable = 'ert.exe'
        self.build_system.cppflags = [
            # ERT_FLOPS = -DERT_FLOP !
            '-DERT_FLOP={}'.format(flop),
            '-DERT_ALIGN=32',
            # 1G = 1024^3 = 1073741824:
            '-DERT_MEMORY_MAX=1073741824',
            # ERT_GPU True:
            '-DERT_GPU',
            '-DERT_TRIALS_MIN=1',
            '-DERT_WORKING_SET_MIN=128',
            # '-x cu' explicitly sets the language (cuda) for the src files.
        ]
        self.build_system.ldflags = ['-O3']

        # --- run
        self.exclusive = True
        self.num_tasks = 1
        self.num_tasks_per_node = 1
        self.num_cpus_per_task = 1
        self.num_tasks_per_core = 1
        self.use_multithreading = False
        self.variables = {
            'CRAYPE_LINK_TYPE': 'dynamic',
            'OMP_NUM_THREADS': str(self.num_cpus_per_task)
        }
        self.executable_opts = [str(gpu_blocks), str(gpu_threads)]
        self.rpt = '{}.rpt'.format(self.executable)
        self.roofline_rpt = 'rpt'

        # Reference roofline boundaries for NVIDIA Tesla P100-PCIE-16GB:
        gflops_ref = 4355.0
        # Keeping for future reference:
        # L1bw = 1724.0
        # L2bw = 855.0
        # L3bw = 547.0
        dram_bw_ref = 521.0

        # Every test summarizes its own job output into 'sum':
        summary_cmds = [
            'cat *_job.out | python2 preprocess.py > pre',
            'python2 maximum.py < pre > max',
            'python2 summary.py < max > sum',
        ]
        is_last_job = (gpu_blocks == max_gpu_blocks and
                       flop == max_flops and
                       repeat == max_repeat)
        if not is_last_job:
            self.post_run = summary_cmds
            self.sanity_patterns = sn.assert_found('GFLOPs', 'sum')
            return

        # use the latest job to generate the roofline rpt:
        self.post_run = summary_cmds + [
            # give enough time for all the dependent jobs to collect data:
            # NOTE(review): this is a fixed delay, not a real dependency;
            # presumably jobs still running after it are silently left out
            # of the report -- confirm this is acceptable.
            'sleep 60',
            'cat ../ertgpu_Run*/sum | python2 roofline.py > rpt',
        ]
        self.sanity_patterns = sn.all([
            # --- check data type:
            sn.assert_eq(sn.extractsingle(
                r'^\s+(?P<prec>\w+) \*\s+buf = \(\w+ \*\)'
                r'_mm_malloc\(PSIZE, ERT_ALIGN\);', 'driver1.c', 'prec'),
                'double'),
            # --- check ert's roofline results. Typical output is:
            # 4355.20 GFLOPs EMP
            # META_DATA
            # GPU_BLOCKS 1792
            # FLOPS 1024
            # GPU_THREADS 64
            #
            # 1723.95 L1 EMP
            # 521.29 DRAM EMP
            #
            # The decimal point is escaped so that only well formed
            # floating point numbers are extracted from the report.
            # check GFLOPS (accepted range: -10% / +50% of the reference):
            sn.assert_reference(sn.extractsingle(
                r'(?P<GFLOPs>\d+\.\d+)\sGFLOPs EMP', self.roofline_rpt,
                'GFLOPs', float), gflops_ref, -0.1, 0.5),
            # check L1 bandwidth:
            # https://cug.org/proceedings/protected/cug2019_proceedings/
            # includes/files/pap103s2-file1.pdf:
            # "ERT fails to identify the L1 cache"
            # sn.assert_reference(sn.extractsingle(
            #     r'(?P<L1bw>\d+\.\d+)\sL1 EMP', self.roofline_rpt,
            #     'L1bw', float), L1bw, -0.1, 0.3),
            # check DRAM bandwidth (accepted range: -10% / +30%):
            sn.assert_reference(sn.extractsingle(
                r'(?P<DRAMbw>\d+\.\d+) DRAM EMP', self.roofline_rpt,
                'DRAMbw', float), dram_bw_ref, -0.1, 0.3),
        ])