Merged
44 commits
b77f5ba
Preliminary work on the perf patterns
jjotero Jun 28, 2021
c2fdb86
Merge branch 'feat/sanity' into feat/perf-syntax
jjotero Jun 30, 2021
53eb6cb
Implement performance decorators
jjotero Jun 30, 2021
9444ddd
Merge branch 'master' of github.com:eth-cscs/reframe into feat/perf-s…
jjotero Jul 1, 2021
4b5e889
Merge branch 'bugfix/hook-override' into feat/perf-syntax
jjotero Jul 1, 2021
cf65b72
Update syntax compat logic
jjotero Jul 1, 2021
94916a6
Implement new perf syntax
jjotero Jul 2, 2021
d974764
Port jobreport to new syntax
jjotero Jul 2, 2021
98432db
Fix reference types
jjotero Jul 5, 2021
4b89a8d
Merge branch 'master' into feat/perf-syntax
jjotero Jul 19, 2021
20633fa
Add perf_key argument to @perf_function
jjotero Jul 19, 2021
0fc17dd
Add make_performance_function method
jjotero Jul 19, 2021
c5fa688
Make unit type-checking more robust
jjotero Jul 19, 2021
7607dea
Add signature checking on perf fns
jjotero Jul 19, 2021
e0b3ab8
Remove unused imports
jjotero Jul 19, 2021
277e73d
Add trivially callable util
jjotero Jul 20, 2021
e4f4092
Remove unused imports
jjotero Jul 20, 2021
cabacdd
Address PR comments
jjotero Jul 21, 2021
fb41d6a
Merge branch 'master' into feat/perf-syntax
jjotero Jul 26, 2021
cdf69cc
Allow units in references
jjotero Jul 26, 2021
98d43d2
Rename performance_report to performance_variables
jjotero Jul 30, 2021
17a2881
Cleanup the metaclass
jjotero Jul 30, 2021
b719629
Merge branch 'master' into feat/perf-syntax
jjotero Jul 30, 2021
01a1b78
Update check syntax
jjotero Jul 30, 2021
6f8694d
Transpose nested loop in meta
jjotero Aug 2, 2021
84deeb0
Prototype test syntax
jjotero Aug 3, 2021
e560cbd
Make perf_variables a variable
jjotero Aug 3, 2021
e0db6c5
Give perf_variables a default value
jjotero Aug 4, 2021
164c59e
Raise warning if any perf var fails
jjotero Aug 4, 2021
0d5f4cb
Address PR comments
jjotero Aug 4, 2021
de2ea43
Merge branch 'master' into feat/perf-other
jjotero Aug 4, 2021
bfdd758
Remove unused imports
jjotero Aug 4, 2021
7499b4d
Add unit tests
jjotero Aug 6, 2021
6463f80
Fix PEP complaints
jjotero Aug 6, 2021
3bcc349
move make_performance_function into sn module
jjotero Aug 9, 2021
6ce8bec
Merge branch 'master' into feat/perf-syntax
jjotero Aug 9, 2021
439ced6
Address PR comments
jjotero Aug 19, 2021
87f4583
Update docs
jjotero Aug 19, 2021
fa071f3
Merge branch 'master' into local/perf-syntax
jjotero Aug 19, 2021
00dfb53
Fix PEP complaints
jjotero Aug 19, 2021
5b19511
Update docs with PR comments
jjotero Aug 23, 2021
b7f926d
Update perf_variables docstring
jjotero Aug 23, 2021
fa2c92a
Merge branch 'master' into feat/perf-syntax
jjotero Aug 23, 2021
06813e9
Docs fine tuning
Aug 23, 2021
53 changes: 31 additions & 22 deletions cscs-checks/microbenchmarks/gpu/gpu_burn/gpu_burn_test.py
@@ -3,10 +3,11 @@
#
# SPDX-License-Identifier: BSD-3-Clause

-import os

import reframe as rfm
import reframe.utility.sanity as sn
+import reframe.utility.osext as osext
+from reframe.core.exceptions import SanityError

from hpctestlib.microbenchmarks.gpu.gpu_burn import GpuBurn
import cscstests.microbenchmarks.gpu.hooks as hooks
@@ -24,30 +25,29 @@ class gpu_burn_check(GpuBurn):
    num_tasks = 0
    reference = {
        'dom:gpu': {
-            'perf': (4115, -0.10, None, 'Gflop/s'),
+            'min_perf': (4115, -0.10, None, 'Gflop/s'),
        },
        'daint:gpu': {
-            'perf': (4115, -0.10, None, 'Gflop/s'),
+            'min_perf': (4115, -0.10, None, 'Gflop/s'),
        },
        'arolla:cn': {
-            'perf': (5861, -0.10, None, 'Gflop/s'),
+            'min_perf': (5861, -0.10, None, 'Gflop/s'),
        },
        'tsa:cn': {
-            'perf': (5861, -0.10, None, 'Gflop/s'),
+            'min_perf': (5861, -0.10, None, 'Gflop/s'),
        },
        'ault:amda100': {
-            'perf': (15000, -0.10, None, 'Gflop/s'),
+            'min_perf': (15000, -0.10, None, 'Gflop/s'),
        },
        'ault:amdv100': {
-            'perf': (5500, -0.10, None, 'Gflop/s'),
+            'min_perf': (5500, -0.10, None, 'Gflop/s'),
        },
        'ault:intelv100': {
-            'perf': (5500, -0.10, None, 'Gflop/s'),
+            'min_perf': (5500, -0.10, None, 'Gflop/s'),
        },
        'ault:amdvega': {
-            'perf': (3450, -0.10, None, 'Gflop/s'),
+            'min_perf': (3450, -0.10, None, 'Gflop/s'),
        },
+        '*': {'temp': (0, None, None, 'degC')}
    }

    maintainers = ['AJ', 'TM']
@@ -63,16 +63,25 @@ def set_num_gpus_per_node(self):
        hooks.set_num_gpus_per_node(self)

    @run_before('performance')
-    def report_nid_with_smallest_flops(self):
-        regex = r'\[(\S+)\] GPU\s+\d\(OK\): (\d+) GF/s'
-        rptf = os.path.join(self.stagedir, sn.evaluate(self.stdout))
-        self.nids = sn.extractall(regex, rptf, 1)
-        self.flops = sn.extractall(regex, rptf, 2, float)
-
-        # Find index of smallest flops and update reference dictionary to
-        # include our patched units
-        index = self.flops.evaluate().index(min(self.flops))
-        unit = f'GF/s ({self.nids[index]})'
-        for key, ref in self.reference.items():
-            if not key.endswith(':temp'):
-                self.reference[key] = (*ref[:3], unit)
+    def report_slow_nodes(self):
+        '''Report the base perf metrics and also all the slow nodes.'''
+
+        # Only report the nodes that don't meet the perf reference
+        with osext.change_dir(self.stagedir):
+            key = f'{self.current_partition.fullname}:min_perf'
+            if key in self.reference:
+                regex = r'\[(\S+)\] GPU\s+\d\(OK\): (\d+) GF/s'
+                nids = set(sn.extractall(regex, self.stdout, 1))
+
+                # Get the references
+                ref, lt, ut, *_ = self.reference[key]
+
+                # Flag the slow nodes
+                for nid in nids:
+                    try:
+                        node_perf = self.min_perf(nid)
+                        val = node_perf.evaluate(cache=True)
+                        sn.assert_reference(val, ref, lt, ut).evaluate()
+                    except SanityError:
+                        self.perf_variables[nid] = node_perf
12 changes: 8 additions & 4 deletions cscs-checks/system/jobreport/gpu_report.py
@@ -93,8 +93,12 @@ def gpu_usage_sanity(self):
            sn.assert_ge(sn.min(time_reported), self.burn_time)
        ])

+    @performance_function('nodes')
+    def total_nodes_reported(self):
+        return sn.count(self.nodes_reported)
+
    @run_before('performance')
-    def set_perf_patterns(self):
+    def set_perf_variables(self):
        '''The number of reported nodes can be used as a perf metric.

        For now, the low limit can go to zero, but this can be set to a more
@@ -103,9 +107,9 @@ def set_perf_patterns(self):

        self.reference = {
            '*': {
-                'nodes_reported': (self.num_tasks, self.perf_floor, 0, 'nodes')
+                'nodes_reported': (self.num_tasks, self.perf_floor, 0)
            },
        }
-        self.perf_patterns = {
-            'nodes_reported': sn.count(self.nodes_reported)
+        self.perf_variables = {
+            'nodes_reported': self.total_nodes_reported()
        }
29 changes: 27 additions & 2 deletions docs/deferrable_functions_reference.rst
@@ -7,10 +7,20 @@ Deferrable Functions Reference
*Deferrable functions* are functions whose execution may be postponed to a later time after they are called.
The key characteristic of these functions is that they store their arguments when they are called, and the execution itself does not occur until the function is evaluated, either explicitly or implicitly.

+ReFrame provides an ample set of deferrable utilities and it also allows users to write their own deferrable functions when needed.
+Please refer to ":doc:`deferrables`" for a hands-on explanation of how deferrable functions work and how to create custom deferrable functions.
+
+
+Explicit evaluation of deferrable functions
+-------------------------------------------
+
+Deferrable functions may be evaluated at any time by calling :func:`evaluate` on their return value or by passing the deferred function itself to the :func:`~reframe.utility.sanity.evaluate()` free function.
+These :func:`evaluate` functions take an optional :class:`bool` argument ``cache``, which enables caching of the evaluation result.
+Hence, once caching is enabled on a given deferrable function, any subsequent calls to :func:`evaluate` will simply return the previously cached result.
+
+.. versionchanged:: 3.8.0
+   Support for cached evaluation added.
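
As a quick illustration of the cached evaluation described above, here is a minimal sketch; the ``count_lines`` deferrable and the ``output.txt`` file are hypothetical:

.. code-block:: python

   import reframe.utility.sanity as sn

   @sn.deferrable
   def count_lines(filename):
       # Runs only when the deferred expression is evaluated
       with open(filename) as fp:
           return sum(1 for _ in fp)

   expr = count_lines('output.txt')  # nothing is executed yet
   n = expr.evaluate(cache=True)     # executes the function and caches the result
   m = expr.evaluate()               # returns the cached result without re-reading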


Implicit evaluation of deferrable functions
-------------------------------------------
@@ -48,9 +58,24 @@ Currently ReFrame provides three broad categories of deferrable functions:
  They include, but are not limited to, functions to iterate over regex matches in a file, extract and convert values from regex matches, compute statistical information on series of data etc.


-Users can write their own deferrable functions as well.
-The page ":doc:`deferrables`" explains in detail how deferrable functions work and how users can write their own.
+.. _deferrable-performance-functions:
+
+
+--------------------------------
+Deferrable performance functions
+--------------------------------
+
+.. versionadded:: 3.8.0
+
+Deferrable performance functions are a special type of deferrable function intended to measure a given quantity.
+Hence, each of these functions has an associated unit that can be used to interpret its return values.
+The unit of a deferrable performance function can be accessed through its public :attr:`unit` attribute.
+Regular deferrable functions can be promoted to deferrable performance functions with the :func:`~reframe.utility.sanity.make_performance_function` utility, which can also create performance functions directly from any callable.
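
For illustration, a minimal sketch of both promotion paths; the regex, file name and units are made up:

.. code-block:: python

   import reframe.utility.sanity as sn

   # Promote an existing deferred expression, attaching a unit to it
   flops = sn.extractsingle(r'perf:\s+(\S+)', 'out.txt', 1, float)
   perf_fn = sn.make_performance_function(flops, 'Gflop/s')
   print(perf_fn.unit)  # prints 'Gflop/s'

   # Create a performance function directly from a callable
   perf_fn2 = sn.make_performance_function(lambda: 42.0, 'MB/s')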


List of deferrable functions and utilities
------------------------------------------

.. py:decorator:: reframe.utility.sanity.deferrable(func)

15 changes: 15 additions & 0 deletions docs/regression_test_api.rst
@@ -322,6 +322,21 @@ Built-in functions

   .. versionadded:: 3.7.0

+.. py:decorator:: RegressionMixin.performance_function(unit, *, perf_key=None)
+
+   Decorate a member function as a performance function of the test.
+
+   This decorator converts the decorated method into a performance deferrable function (see ":ref:`deferrable-performance-functions`" for more details) whose evaluation is deferred to the performance stage of the regression test.
+   The decorated function must take a single argument without a default value (i.e. ``self``) and any number of arguments with default values.
+   A test may decorate multiple member functions as performance functions, and each decorated function must be given the unit of the performance quantity it extracts from the test.
+   These performance units must be of type :class:`str`.
+   Any performance function may be overridden in a derived class and multiple bases may define their own performance functions.
+   In the event of a name conflict, the derived class will follow Python's `MRO <https://docs.python.org/3/library/stdtypes.html#class.__mro__>`_ to choose the appropriate performance function.
+   However, defining more than one performance function with the same name in the same class is disallowed.
+
+   The full set of performance functions of a regression test is stored under :attr:`~reframe.core.pipeline.RegressionTest.perf_variables` as key-value pairs, where, by default, the key is the name of the decorated member function and the value is the deferred performance function itself.
+   Optionally, the key under which a performance function is stored in :attr:`~reframe.core.pipeline.RegressionTest.perf_variables` can be customised by passing the desired key as the ``perf_key`` argument to this decorator.
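
A minimal sketch of a test defining two performance functions follows; the test body, regexes and units are illustrative only:

.. code-block:: python

   import reframe as rfm
   import reframe.utility.sanity as sn


   class example_test(rfm.RegressionTest):
       # ...executable, sanity patterns, etc. omitted...

       @performance_function('Gflop/s', perf_key='peak_flops')
       def extract_flops(self):
           # Stored in perf_variables under 'peak_flops'
           return sn.extractsingle(r'flops:\s+(\S+)', self.stdout, 1, float)

       @performance_function('s')
       def elapsed_time(self):
           # Stored under its default key, 'elapsed_time'
           return sn.extractsingle(r'time:\s+(\S+)', self.stdout, 1, float)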

.. py:decorator:: RegressionMixin.deferrable(func)

Converts the decorated method into a deferrable function.
41 changes: 22 additions & 19 deletions hpctestlib/microbenchmarks/gpu/gpu_burn/__init__.py
@@ -51,12 +51,6 @@ class GpuBurn(rfm.RegressionTest, pin_prefix=True):
    build_system = 'Make'
    executable = './gpu_burn.x'
    num_tasks_per_node = 1
-    reference = {
-        '*': {
-            'perf': (0, None, None, 'Gflop/s'),
-            'temp': (0, None, None, 'degC')
-        }
-    }

    @run_before('compile')
    def set_gpu_build(self):
@@ -83,7 +77,6 @@ def set_gpu_build(self):
            raise ValueError('unknown gpu_build option')

    @property
-    @deferrable
    def num_tasks_assigned(self):
        '''Total number of times the gpu burn will run.

@@ -103,17 +96,27 @@ def count_successful_burns(self):
            r'^\s*\[[^\]]*\]\s*GPU\s*\d+\(OK\)', self.stdout)
        ), self.num_tasks_assigned)

-    @run_before('performance')
-    def set_perf_patterns(self):
-        '''Extract the minimum performance and maximum temperature recorded.
-
-        The performance and temperature data are reported in Gflops/s and
-        deg. Celsius respectively.
-        '''
-
-        patt = (r'^\s*\[[^\]]*\]\s*GPU\s+\d+\(\S*\):\s+(?P<perf>\S*)\s+GF\/s'
-                r'\s+(?P<temp>\S*)\s+Celsius')
-        self.perf_patterns = {
-            'perf': sn.min(sn.extractall(patt, self.stdout, 'perf', float)),
-            'temp': sn.max(sn.extractall(patt, self.stdout, 'temp', float)),
-        }
+    def _extract_perf_metric(self, metric, nid=None):
+        '''Utility to extract performance metrics.'''
+
+        if metric not in {'perf', 'temp'}:
+            raise ValueError(
+                f"unsupported value in 'metric' argument: {metric!r}"
+            )
+
+        if nid is None:
+            nid = r'[^\]]*'
+
+        patt = (rf'^\s*\[{nid}\]\s*GPU\s+\d+\(\S*\):\s+(?P<perf>\S*)\s+GF\/s'
+                rf'\s+(?P<temp>\S*)\s+Celsius')
+        return sn.extractall(patt, self.stdout, metric, float)
+
+    @performance_function('Gflop/s')
+    def min_perf(self, nid=None):
+        '''Lowest performance recorded.'''
+        return sn.min(self._extract_perf_metric('perf', nid))
+
+    @performance_function('degC')
+    def max_temp(self, nid=None):
+        '''Maximum temperature recorded.'''
+        return sn.max(self._extract_perf_metric('temp', nid))
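
With the two decorated functions above, min_perf and max_temp are registered automatically in the test's perf_variables, so derived tests only need to provide references for the keys they care about. A hypothetical minimal consumer (system name and reference values invented):

class my_gpu_check(GpuBurn):
    valid_systems = ['sys:gpu']          # hypothetical partition
    valid_prog_environs = ['builtin']
    num_tasks = 0
    reference = {
        'sys:gpu': {
            # Keys default to the performance function names
            'min_perf': (4000, -0.10, None, 'Gflop/s'),
            'max_temp': (0, None, None, 'degC')
        }
    }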
56 changes: 46 additions & 10 deletions reframe/core/deferrable.py
@@ -44,31 +44,36 @@ def __init__(self, fn, *args, **kwargs):

        # We cache the value of the last evaluation inside a tuple.
        # We don't cache the value directly, because it can be any.
-
-        # NOTE: The cache for the moment is only used by
-        # `__rfm_json_encode__`. Enabling caching in the evaluation is a
-        # reasonable optimization, but might break compatibility, so it needs
-        # to be thought thoroughly and communicated properly in the
-        # documentation.
        self._cached = ()
+        self._return_cached = False
+
+    def evaluate(self, cache=False):
+        # Return the cached value (if any)
+        if self._return_cached and not cache:
+            return self._cached[0]
+        elif cache:
+            self._return_cached = cache

-    def evaluate(self):
        fn_args = []
        for arg in self._args:
            fn_args.append(
-                arg.evaluate() if isinstance(arg, type(self)) else arg
+                arg.evaluate() if isinstance(arg, _DeferredExpression) else arg
            )

        fn_kwargs = {}
        for k, v in self._kwargs.items():
            fn_kwargs[k] = (
-                v.evaluate() if isinstance(v, type(self)) else v
+                v.evaluate() if isinstance(v, _DeferredExpression) else v
            )

        ret = self._fn(*fn_args, **fn_kwargs)
-        if isinstance(ret, type(self)):
+
+        # Evaluate the return for as long as a deferred expression returns
+        # another deferred expression.
+        while isinstance(ret, _DeferredExpression):
            ret = ret.evaluate()

+        # Cache the results for any subsequent evaluate calls.
+        self._cached = (ret,)
        return ret

@@ -355,3 +360,34 @@ def __abs__(a):
    @deferrable
    def __invert__(a):
        return ~a
+
+
+class _DeferredPerformanceExpression(_DeferredExpression):
+    '''Represents a performance function whose evaluation has been deferred.
+
+    It extends the :class:`_DeferredExpression` class by adding the ``unit``
+    attribute. This attribute represents the unit of the performance
+    metric to be extracted by the performance function.
+    '''
+
+    def __init__(self, fn, unit, *args, **kwargs):
+        super().__init__(fn, *args, **kwargs)
+
+        if not isinstance(unit, str):
+            raise TypeError(
+                'performance units must be a string'
+            )
+
+        self._unit = unit
+
+    @classmethod
+    def construct_from_deferred_expr(cls, expr, unit):
+        if not isinstance(expr, _DeferredExpression):
+            raise TypeError("'expr' argument is not an instance of the "
+                            "_DeferredExpression class")
+
+        return cls(expr._fn, unit, *(expr._args), **(expr._kwargs))
+
+    @property
+    def unit(self):
+        return self._unit
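
A rough sketch of how these internals compose, for review purposes; the regex and file name are made up, and the leading underscores mark this as internal API:

from reframe.core.deferrable import _DeferredPerformanceExpression
import reframe.utility.sanity as sn

# Attach a unit to an existing deferred expression
expr = sn.extractsingle(r'bw:\s+(\S+)', 'out.txt', 1, float)
perf_expr = _DeferredPerformanceExpression.construct_from_deferred_expr(expr, 'GB/s')
assert perf_expr.unit == 'GB/s'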
33 changes: 14 additions & 19 deletions reframe/core/hooks.py
@@ -128,25 +128,6 @@ def __repr__(self):
class HookRegistry:
    '''Global hook registry.'''

-    @classmethod
-    def create(cls, namespace):
-        '''Create a hook registry from a class namespace.
-
-        Hook functions have an `_rfm_attach` attribute that specify the stages
-        of the pipeline where they must be attached. Dependencies will be
-        resolved first in the post-setup phase if not assigned elsewhere.
-        '''
-
-        local_hooks = util.OrderedSet()
-        for v in namespace.values():
-            if hasattr(v, '_rfm_attach'):
-                local_hooks.add(Hook(v))
-            elif hasattr(v, '_rfm_resolve_deps'):
-                v._rfm_attach = ['post_setup']
-                local_hooks.add(Hook(v))
-
-        return cls(local_hooks)
-
    def __init__(self, hooks=None):
        self.__hooks = util.OrderedSet()
        if hooks is not None:
@@ -161,6 +142,20 @@ def __getattr__(self, name):

    def __iter__(self):
        return iter(self.__hooks)

+    def add(self, v):
+        '''Add a value to the hook registry if it meets the conditions.
+
+        Hook functions have an `_rfm_attach` attribute that specifies the
+        stages of the pipeline where they must be attached. Dependencies will
+        be resolved first in the post-setup phase if not assigned elsewhere.
+        '''
+
+        if hasattr(v, '_rfm_attach'):
+            self.__hooks.add(Hook(v))
+        elif hasattr(v, '_rfm_resolve_deps'):
+            v._rfm_attach = ['post_setup']
+            self.__hooks.add(Hook(v))
+
    def update(self, hooks, *, denied_hooks=None):
        '''Update the hook registry with the hooks from another hook registry.
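
Net effect of this refactoring: instead of building the registry in one shot from a class namespace via the removed create() classmethod, callers now feed candidate values in incrementally. A rough sketch of the new call pattern, where namespace stands for the metaclass namespace:

registry = HookRegistry()
for v in namespace.values():
    registry.add(v)   # replaces HookRegistry.create(namespace)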