Skip to content

Commit

Permalink
[nvml] add compute_running_process (processes running in gpu) (Data…
Browse files Browse the repository at this point in the history
…Dog#1580)

* add process reporting

* add metadata

* fix failing test

* update version

* fix styling issues

* sync config.yaml.example
  • Loading branch information
BeLeap authored and DannyBrock committed Dec 27, 2022
1 parent 508e86b commit d650dbe
Show file tree
Hide file tree
Showing 6 changed files with 23 additions and 2 deletions.
4 changes: 4 additions & 0 deletions nvml/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# CHANGELOG - nvml

## 1.0.6

* Add the `compute_running_process` metric (GPU memory used by each running compute process, tagged by pid).

## 1.0.5

* Add fan speed to monitored metrics.
Expand Down
2 changes: 1 addition & 1 deletion nvml/datadog_checks/nvml/__about__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# (C) Datadog, Inc. 2020-present
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)
__version__ = '1.0.5'
__version__ = '1.0.6'
2 changes: 1 addition & 1 deletion nvml/datadog_checks/nvml/data/conf.yaml.example
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ init_config:
#
# service: <SERVICE>

## Every instance is scheduled independent of the others.
## Every instance is scheduled independently of the others.
#
instances:

Expand Down
9 changes: 9 additions & 0 deletions nvml/datadog_checks/nvml/nvml.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,15 @@ def gather_gpu(self, handle, tags):
fan_speed = NvmlCheck.N.nvmlDeviceGetFanSpeed(handle)
self.gauge('fan_speed', fan_speed, tags=tags)

with NvmlCall("compute_running_processes", self.log):
compute_running_processes = NvmlCheck.N.nvmlDeviceGetComputeRunningProcesses_v2(handle)
for compute_running_process in compute_running_processes:
self.gauge(
'compute_running_process',
compute_running_process.usedGpuMemory,
tags=tags + [f"pid:{compute_running_process.pid}"],
)

def _start_discovery(self):
"""Start daemon thread to discover which k8s pod is assigned to a GPU"""
# type: () -> None
Expand Down
1 change: 1 addition & 0 deletions nvml/metadata.csv
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ nvml.pcie_tx_throughput,gauge,,kibibyte,second,PCIe TX utilization,0,nvml,TX_uti
nvml.pcie_rx_throughput,gauge,,kibibyte,second,PCIe RX utilization,0,nvml,RX_utilization,
nvml.temperature,gauge,,,,Current temperature for this GPU in degrees Celsius,0,nvml,temperature,
nvml.fan_speed,gauge,,percent,,The current utilization for the fan,0,nvml,fan_speed,
nvml.compute_running_process,gauge,,byte,,GPU memory used by each compute process running on the device (tagged by pid),0,nvml,compute_running_process,
7 changes: 7 additions & 0 deletions nvml/tests/test_nvml.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# (C) Datadog, Inc. 2020-present
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)
from collections import namedtuple
from types import SimpleNamespace

import mock
Expand Down Expand Up @@ -74,6 +75,11 @@ def nvmlDeviceGetTemperature(h, b):
def nvmlDeviceGetFanSpeed(h):
return 14

@staticmethod
def nvmlDeviceGetComputeRunningProcesses_v2(h):
    """Return one fake compute process record for the given device handle.

    The handle ``h`` is ignored. The single record exposes the two
    attributes the check reads: ``pid`` (1) and ``usedGpuMemory`` (11).
    """
    process_record = namedtuple('Mock', ['pid', 'usedGpuMemory'])
    fake_process = process_record(1, 11)
    return [fake_process]


@pytest.mark.unit
def test_check(aggregator, instance):
Expand All @@ -95,5 +101,6 @@ def test_check(aggregator, instance):
aggregator.assert_metric('nvml.power_usage', tags=expected_tags, count=1)
aggregator.assert_metric('nvml.temperature', tags=expected_tags, count=1)
aggregator.assert_metric('nvml.fan_speed', tags=expected_tags, count=1)
aggregator.assert_metric('nvml.compute_running_process', tags=expected_tags + ["pid:1"], count=1)

aggregator.assert_all_metrics_covered()

0 comments on commit d650dbe

Please sign in to comment.