Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added support for Nvidia MIG virtual devices #32

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 55 additions & 18 deletions nvgpu/__init__.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,75 @@
# nvgpu __init__.py

import re
import six
import subprocess
# import six # Unused import?


def gpu_info():
    """Parse `nvidia-smi` CLI output to collect Nvidia GPU information.

    When MIG mode is reported as ``Enabled`` for the first GPU, only the
    virtual MIG devices are listed (physical GPUs are skipped); otherwise
    the physical GPUs are listed.

    Returns:
        A list with one dict per device, with keys ``index``, ``type`` and
        ``uuid`` (strings captured from ``nvidia-smi -L``) plus ``mem_used``
        and ``mem_total`` (ints, MiB) and ``mem_used_percent`` (float).
        An empty list is returned when ``nvidia-smi`` cannot be executed,
        exits with an error, or lists no recognisable device.

    Raises:
        IndexError, ValueError: if the ``nvidia-smi`` table does not have
            the layout the hard-coded row offsets below expect.
    """
    # Ensure that `nvidia-smi` is available and finds GPUs before proceeding.
    try:
        gpus = [line.strip() for line in _run_cmd(["nvidia-smi", "-L"]) if line]
    except (FileNotFoundError, subprocess.CalledProcessError):
        return []

    # Manage output according to whether Nvidia MIG devices are enabled or not.
    # A failing query (presumably drivers that do not know the field) is
    # treated as "MIG disabled" instead of crashing.
    try:
        mig_lines = _run_cmd(
            ["nvidia-smi", "--query-gpu=mig.mode.current", "--format=csv,noheader"]
        )
    except subprocess.CalledProcessError:
        mig_lines = []
    # Only the first reported GPU decides the mode; strip stray whitespace
    # (e.g. a trailing '\r') so the comparison is exact.
    mig_enabled = bool(mig_lines) and mig_lines[0].strip() == "Enabled"

    if mig_enabled:
        # Match strings like
        # 'MIG 1g.5gb Device 0: (UUID: MIG-GPU-6482a92e-d06b-dc68-c272-e3d8f7ecabbf/7/0)'
        re_gpu = re.compile(r"(.+?)\s+Device\s+([0-9]+): \(UUID: ([^)]+)\)")
        re_match_order = ("type", "index", "uuid")
        # Avoid physical GPUs (only count virtual MIG devices).
        gpus = [line for line in gpus if not line.startswith("GPU")]
    else:
        # Match strings like
        # 'GPU 0: NVIDIA A100 80GB PCIe (UUID: GPU-84ccface-663f-f5fd-8e8e-109d0f78bd2f)'
        re_gpu = re.compile(r"GPU ([0-9]+): (.+?) \(UUID: ([^)]+)\)")
        re_match_order = ("index", "type", "uuid")

    matches = (re_gpu.match(gpu) for gpu in gpus)
    gpu_infos = [
        dict(zip(re_match_order, match.groups()))
        for match in matches
        if match is not None
    ]
    # Count only the lines that actually parsed, so the loop below can never
    # index past the end of `gpu_infos`.
    gpu_count = len(gpu_infos)
    if not gpu_count:
        return []

    lines = _run_cmd(["nvidia-smi"])
    cuda_version = float(lines[2].split("CUDA Version: ")[1].split(" ")[0])

    # NOTE(review): the row offsets and strides are hard-coded to the table
    # layout printed by `nvidia-smi`; presumably they match the driver
    # versions this was written against -- verify on new drivers.
    if cuda_version < 11:
        first_row, line_distance = 7, 3
    elif mig_enabled:
        first_row, line_distance = 19, 2
    else:
        first_row, line_distance = 8, 4
    selected_lines = lines[first_row : first_row + line_distance * gpu_count]

    for i, info in enumerate(gpu_infos):
        # The third '|'-separated cell holds '<used>MiB / <total>MiB'.
        usage_cell = selected_lines[line_distance * i + 1].split("|")[2]
        mem_used, mem_total = [
            int(m.strip().replace("MiB", "")) for m in usage_cell.strip().split("/")
        ]
        info["mem_used"] = mem_used
        info["mem_total"] = mem_total
        # Guard against a reported total of 0 MiB.
        info["mem_used_percent"] = 100.0 * mem_used / mem_total if mem_total else 0.0

    return gpu_infos


def _run_cmd(cmd):
output = subprocess.check_output(cmd)
if isinstance(output, bytes):
output = output.decode('UTF-8')
return output.split('\n')
output = output.decode("UTF-8")
return output.split("\n")


def available_gpus(max_used_percent=20.0):
    """Return the indices of the devices whose memory is mostly free.

    Args:
        max_used_percent: Upper bound (inclusive) on a device's
            ``mem_used_percent`` for it to count as available.

    Returns:
        A list of the ``index`` values from ``gpu_info()`` for every device
        at or below the threshold; empty when no device qualifies or
        ``gpu_info()`` reports none.
    """
    return [
        gpu["index"]
        for gpu in gpu_info()
        if gpu["mem_used_percent"] <= max_used_percent
    ]