[nvml] add compute_running_process (processes running in gpu) (DataDog#1580)

* add process reporting
* add metadata
* fix failing test
* update version
* fix styling issues
* sync config.yaml.example
rajeevthefiddler · Dec 27, 2022 · d650dbe · d650dbe
1 parent 508e86b
commit d650dbe
Show file tree

Hide file tree

Showing 6 changed files with 23 additions and 2 deletions.
diff --git a/nvml/CHANGELOG.md b/nvml/CHANGELOG.md
@@ -1,5 +1,9 @@
 # CHANGELOG - nvml
 
+## 1.0.6
+
+* Add compute_running_process metrics.
+
 ## 1.0.5
 
 * Add fan speed to monitored metrics.

diff --git a/nvml/datadog_checks/nvml/__about__.py b/nvml/datadog_checks/nvml/__about__.py
@@ -1,4 +1,4 @@
 # (C) Datadog, Inc. 2020-present
 # All rights reserved
 # Licensed under a 3-clause BSD style license (see LICENSE)
-__version__ = '1.0.5'
+__version__ = '1.0.6'
diff --git a/nvml/datadog_checks/nvml/data/conf.yaml.example b/nvml/datadog_checks/nvml/data/conf.yaml.example
@@ -9,7 +9,7 @@ init_config:
     #
     # service: <SERVICE>
 
-## Every instance is scheduled independent of the others.
+## Every instance is scheduled independently of the others.
 #
 instances:
 

diff --git a/nvml/datadog_checks/nvml/nvml.py b/nvml/datadog_checks/nvml/nvml.py
@@ -171,6 +171,15 @@ def gather_gpu(self, handle, tags):
             fan_speed = NvmlCheck.N.nvmlDeviceGetFanSpeed(handle)
             self.gauge('fan_speed', fan_speed, tags=tags)
 
+        with NvmlCall("compute_running_processes", self.log):
+            compute_running_processes = NvmlCheck.N.nvmlDeviceGetComputeRunningProcesses_v2(handle)
+            for compute_running_process in compute_running_processes:
+                self.gauge(
+                    'compute_running_process',
+                    compute_running_process.usedGpuMemory,
+                    tags=tags + [f"pid:{compute_running_process.pid}"],
+                )
+
     def _start_discovery(self):
         """Start daemon thread to discover which k8s pod is assigned to a GPU"""
         # type: () -> None

diff --git a/nvml/metadata.csv b/nvml/metadata.csv
@@ -13,3 +13,4 @@ nvml.pcie_tx_throughput,gauge,,kibibyte,second,PCIe TX utilization,0,nvml,TX_uti
 nvml.pcie_rx_throughput,gauge,,kibibyte,second,PCIe RX utilization,0,nvml,RX_utilization,
 nvml.temperature,gauge,,,,Current temperature for this GPU in degrees celsius,0,nvml,temperature,
 nvml.fan_speed,gauge,,percent,,The current utilization for the fan,0,nvml,fan_speed,
+nvml.compute_running_process,gauge,,byte,,The current usage of gpu memory by process,0,nvml,compute_running_process,
diff --git a/nvml/tests/test_nvml.py b/nvml/tests/test_nvml.py
@@ -1,6 +1,7 @@
 # (C) Datadog, Inc. 2020-present
 # All rights reserved
 # Licensed under a 3-clause BSD style license (see LICENSE)
+from collections import namedtuple
 from types import SimpleNamespace
 
 import mock
@@ -74,6 +75,11 @@ def nvmlDeviceGetTemperature(h, b):
     def nvmlDeviceGetFanSpeed(h):
         return 14
 
+    @staticmethod
+    def nvmlDeviceGetComputeRunningProcesses_v2(h):
+        Mock = namedtuple('Mock', ['pid', 'usedGpuMemory'])
+        return [Mock(pid=1, usedGpuMemory=11)]
+
 
 @pytest.mark.unit
 def test_check(aggregator, instance):
@@ -95,5 +101,6 @@ def test_check(aggregator, instance):
     aggregator.assert_metric('nvml.power_usage', tags=expected_tags, count=1)
     aggregator.assert_metric('nvml.temperature', tags=expected_tags, count=1)
     aggregator.assert_metric('nvml.fan_speed', tags=expected_tags, count=1)
+    aggregator.assert_metric('nvml.compute_running_process', tags=expected_tags + ["pid:1"], count=1)
 
     aggregator.assert_all_metrics_covered()