Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 21 additions & 6 deletions cscs-checks/cuda/nvml_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,37 @@
import reframe.utility.sanity as sn


@rfm.required_version('>=2.14')
@rfm.simple_test
class NvmlCheck(rfm.RegressionTest):
    '''Check the GPU compute mode through NVML.

    The test compiles the ``example.c`` program found under
    ``$CUDATOOLKIT_HOME/nvml/example`` (after applying
    ``nvml_example.patch``), links it against ``libnvidia-ml`` and
    inspects its standard output.

    * COMPUTE MODE:
        result = nvmlDeviceGetComputeMode(device, &compute_mode);

    * GPU OPERATION MODE (not supported since K20s, keeping as reminder):
        result = nvmlDeviceGetGpuOperationMode(device, &gom_mode_current,
                                               &gom_mode_pending);
        NVML_GOM_ALL_ON   Everything is enabled and running at full speed.
        NVML_GOM_COMPUTE  Designed for running only compute tasks.
                          Graphics operations are not allowed.
        NVML_GOM_LOW_DP   Designed for running graphics applications that
                          do not require high bandwidth double precision.
    '''

    def __init__(self):
        super().__init__()
        self.descr = 'check GPU compute mode'
        self.valid_systems = ['daint:gpu', 'dom:gpu']
        self.valid_prog_environs = ['PrgEnv-gnu']
        self.modules = ['craype-accel-nvidia60']
        self.build_system = 'SingleSource'
        self.sourcepath = 'example.c'
        # The source is not shipped with the test: it is copied from the
        # CUDA toolkit at build time and then patched.
        # NOTE(review): no custom `sourcesdir` is set, so the patch file is
        # presumably picked up from the test's default sources directory
        # (`src/`) -- confirm.
        self.prebuild_cmd = [
            'cp $CUDATOOLKIT_HOME/nvml/example/example.c .',
            'patch -i ./nvml_example.patch'
        ]
        self.build_system.ldflags = ['-lnvidia-ml']
        # The test passes only if the program reports that it is changing
        # the compute mode away from 'Exclusive Process'; `.` in `device.s`
        # stands in for the apostrophe of "device's".
        self.sanity_patterns = sn.assert_found(
            r"\s+Changing device.s compute mode from 'Exclusive Process' to ",
            self.stdout)

        self.maintainers = ['AJ', 'VK']
        self.tags = {'production', 'craype'}
10 changes: 10 additions & 0 deletions cscs-checks/cuda/src/nvml_example.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
--- example.c.ori 2019-10-11 11:57:51.000000000 +0200
+++ example.c 2019-10-11 11:58:24.000000000 +0200
@@ -63,6 +63,7 @@
unsigned int device_count, i;

// First initialize NVML library
+ printf("NVML_API_VERSION=%d\n", NVML_API_VERSION);
result = nvmlInit();
if (NVML_SUCCESS != result)
{