Skip to content
160 changes: 160 additions & 0 deletions cscs-checks/microbenchmarks/likwid/likwid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
import reframe as rfm
import reframe.utility.sanity as sn


class MemBandwidthTest(rfm.RunOnlyRegressionTest):
def __init__(self):
super().__init__()

self.modules = ['likwid']
self.valid_prog_environs = ['PrgEnv-gnu']
self.sourcesdir = None

self.executable = 'likwid-bench'

self.num_tasks = 0
self.num_tasks_per_core = 2
self.system_num_cpus = {
'daint:mc': 72,
'daint:gpu': 24,
'dom:mc': 72,
'dom:gpu': 24,
}
self.system_numa_domains = {
'daint:mc': ['S0', 'S1'],
'daint:gpu': ['S0'],
'dom:mc': ['S0', 'S1'],
'dom:gpu': ['S0'],
}

# Test each level at half capacity times nthreads per domain
self.system_cache_sizes = {
'daint:mc': {'L1': '288kB', 'L2': '2304kB', 'L3': '23MB',
'memory': '1800MB'},
'daint:gpu': {'L1': '192kB', 'L2': '1536kB', 'L3': '15MB',
'memory': '1200MB'},
'dom:mc': {'L1': '288kB', 'L2': '2304kB', 'L3': '23MB',
'memory': '1800MB'},
'dom:gpu': {'L1': '192kB', 'L2': '1536kB', 'L3': '15MB',
'memory': '1200MB'},
}

self.maintainers = ['SK']
self.tags = {'benchmark', 'diagnostic'}

bw_pattern = sn.extractsingle(r'MByte/s:\s*(?P<bw>\S+)',
self.stdout, 'bw', float)

self.sanity_patterns = sn.assert_ge(bw_pattern, 0.0)
self.perf_patterns = {
'bandwidth': bw_pattern
}


@rfm.required_version('>=2.16-dev0')
@rfm.parameterized_test(*[[l, k] for l in ['L1', 'L2', 'L3']
for k in ['load_avx', 'store_avx']],
['memory', 'load_avx'],
['memory', 'store_mem_avx'])
class CPUBandwidth(MemBandwidthTest):
def __init__(self, mem_level, kernel_name):
super().__init__()

self.descr = 'CPU <- %s %s benchmark' % (mem_level, kernel_name)
self.valid_systems = ['daint:mc', 'daint:gpu', 'dom:gpu', 'dom:mc']

# the kernel to run in likwid
self.kernel_name = kernel_name
self.mem_level = mem_level

self.refs = {
'mc': {
'load_avx': {'L1': 5100000, 'L2': 2000000, 'L3': 900000,
'memory': 130000},
'store_avx': {'L1': 2800000, 'L2': 900000, 'L3': 480000},
'store_mem_avx': {'memory': 85000},
},
'gpu': {
'load_avx': {'L1': 2100000, 'L2': 850000, 'L3': 360000,
'memory': 65000},
'store_avx': {'L1': 1200000, 'L2': 340000, 'L3': 210000},
'store_mem_avx': {'memory': 42500},
}
}
ref_proxy = {part: self.refs[part][kernel_name][mem_level]
for part in self.refs.keys()}

self.reference = {
'daint:gpu': {
'bandwidth': (ref_proxy['gpu'], -0.1, None, 'MB/s')
},
'daint:mc': {
'bandwidth': (ref_proxy['mc'], -0.1, None, 'MB/s')
},
'dom:gpu': {
'bandwidth': (ref_proxy['gpu'], -0.1, None, 'MB/s')
},
'dom:mc': {
'bandwidth': (ref_proxy['mc'], -0.1, None, 'MB/s')
},
}

def setup(self, partition, environ, **job_opts):
pfn = parition.fullname
self.data_size = self.system_cache_sizes[pfn][self.mem_level]
self.num_cpus_per_task = self.system_num_cpus[partition.fullname]
numa_domains = self.system_numa_domains[partition.fullname]
num_cpu_domain = self.num_cpus_per_task / (len(numa_domains) *
self.num_tasks_per_core)
# result for daint:mc: '-w S0:100MB:18:1:2 -w S1:100MB:18:1:2'
# format: -w domain:data_size:nthreads:chunk_size:stride
# chunk_size and stride affect which cpus from <domain> are selected
workgroups = ['-w %s:%s:%d:1:2' %
(dom, self.data_size, num_cpu_domain)
for dom in numa_domains]

self.executable_opts = ['-t %s' % self.kernel_name] + workgroups

super().setup(partition, environ, **job_opts)


@rfm.required_version('>=2.16-dev0')
@rfm.simple_test
class CPUBandwidthCrossSocket(MemBandwidthTest):
def __init__(self):
super().__init__()

self.descr = ("CPU S0 <- main memory S1 read " +
"CPU S1 <- main memory S0 read")

self.valid_systems = ['daint:mc', 'dom:mc']
self.kernel_name = 'load_avx'
self.reference = {
'daint:mc': {
'bandwidth': (56000, -0.1, None, 'MB/s')
},
'dom:mc': {
'bandwidth': (56000, -0.1, None, 'MB/s')
},
}

def setup(self, partition, environ, **job_opts):

self.num_cpus_per_task = self.system_num_cpus[partition.fullname]
numa_domains = self.system_numa_domains[partition.fullname]

num_cpu_domain = (self.num_cpus_per_task /
(len(numa_domains) * self.num_tasks_per_core))

# daint:mc: '-w S0:100MB:18:1:2-0:S1 -w S1:100MB:18:1:2-0:S0'
# format:
# -w domain:data_size:nthreads:chunk_size:stride-stream_nr:mem_domain
# chunk_size and stride affect which cpus from <domain> are selected
workgroups = ['-w %s:100MB:%d:1:2-0:%s' %
(dom_cpu, num_cpu_domain, dom_mem)
for dom_cpu, dom_mem in
zip(numa_domains[:2], reversed(numa_domains[:2]))]

self.executable_opts = ['-t %s' % self.kernel_name] + workgroups

super().setup(partition, environ, **job_opts)