# diagnose_gpu.py
"""
Diagnostic functions for GPU information, failings, memory usage, etc.
"""
from __future__ import annotations
from typing import Optional, Union, List, TextIO
import os
import sys
import gc
import subprocess
import torch
from returnn.util.better_exchook import better_exchook
from returnn.util.basic import human_bytes_size
def print_available_devices(*, file: Optional[TextIO] = None):
    """
    Print available devices, GPU (CUDA or other), etc.

    :param file: where to print to. stdout by default
    """
    if file is None:
        file = sys.stdout
    cuda_visible_devs = None
    if "CUDA_VISIBLE_DEVICES" in os.environ:
        print("CUDA_VISIBLE_DEVICES is set to %r." % os.environ["CUDA_VISIBLE_DEVICES"], file=file)
        # Map logical (visible) device position -> physical device index.
        cuda_visible_devs = dict(enumerate([int(d) for d in os.environ["CUDA_VISIBLE_DEVICES"].split(",") if d]))
    else:
        if torch.cuda.is_available():
            print("CUDA_VISIBLE_DEVICES is not set.", file=file)
    if torch.cuda.is_available():
        # Fix: was missing file=file, so this line leaked to stdout even with an explicit file.
        print("Available CUDA devices:", file=file)
        count = torch.cuda.device_count()
        if cuda_visible_devs is not None and len(cuda_visible_devs) != count:
            print(
                f"(Mismatch between CUDA device count {count}"
                f" and CUDA_VISIBLE_DEVICES {cuda_visible_devs} count {len(cuda_visible_devs)}?)",
                file=file,
            )
        for i in range(count):
            print(f" {i + 1}/{count}: cuda:{i}", file=file)
            props = torch.cuda.get_device_properties(i)
            print(f" name: {props.name}", file=file)
            print(f" total_memory: {human_bytes_size(props.total_memory)}", file=file)
            print(f" capability: {props.major}.{props.minor}", file=file)
            if cuda_visible_devs is not None:
                if len(cuda_visible_devs) == count:
                    dev_idx_s = cuda_visible_devs[i]
                else:
                    # Count mismatch (see warning above): cannot reliably map back.
                    dev_idx_s = "?"
            else:
                dev_idx_s = i
            print(f" device_index: {dev_idx_s}", file=file)
        if not count:
            # Fix: was missing file=file.
            print(" (None)", file=file)
    else:
        # Fix: was missing file=file.
        print("(CUDA not available)", file=file)
def print_using_cuda_device_report(dev: Union[str, torch.device], *, file: Optional[TextIO] = None):
    """
    Theano and TensorFlow print sth like: Using gpu device 2: GeForce GTX 980 (...)
    Print in a similar format so that some scripts which grep our stdout work just as before.

    :param dev: CUDA device (or device string) to report about
    :param file: where to print to. stdout by default
    """
    file = sys.stdout if file is None else file
    if isinstance(dev, str):
        dev = torch.device(dev)
    assert dev.type == "cuda", f"expected CUDA device, got {dev}"
    # Resolve the device index; fall back to the currently selected CUDA device.
    idx = dev.index if dev.index is not None else torch.cuda.current_device()
    if "CUDA_VISIBLE_DEVICES" in os.environ:
        # Map the logical (visible) index back to the physical device index.
        visible = {pos: int(d) for pos, d in enumerate(v for v in os.environ["CUDA_VISIBLE_DEVICES"].split(",") if v)}
        idx_s = visible.get(idx, torch.cuda.device_count() + idx)
    else:
        idx_s = idx
    print(f"Using gpu device {idx_s}:", torch.cuda.get_device_name(idx), file=file)
    free, total = torch.cuda.mem_get_info(idx)
    print(f"Total GPU {idx_s} memory {human_bytes_size(total)}, free {human_bytes_size(free)}", file=file)
def diagnose_no_gpu() -> List[str]:
    """
    Diagnose why we have no GPU.
    Print to stdout, but also prepare summary strings.

    :return: summary strings
    """
    # Currently we assume Nvidia CUDA here, but once we support other backends (e.g. ROCm),
    # first check which backend is most reasonable here.
    res = []
    print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES", None))
    print("LD_LIBRARY_PATH:", os.environ.get("LD_LIBRARY_PATH", None))
    try:
        torch.cuda.init()
    except Exception as exc:
        print("torch.cuda.init() failed:", exc)
        better_exchook(*sys.exc_info(), debugshell=False)
        res.append(f"torch.cuda.init() failed: {type(exc).__name__} {exc}")
    try:
        subprocess.check_call(["nvidia-smi"])
    except Exception as exc:
        print("nvidia-smi failed:", exc)
        better_exchook(*sys.exc_info(), debugshell=False)
        # Fix: was a placeholder-less f-string; include the exception details,
        # consistent with the torch.cuda.init() summary above.
        res.append(f"nvidia-smi failed: {type(exc).__name__} {exc}")
    return res
def garbage_collect():
    """
    Perform garbage collection, including any special logic for GPU.

    Also see:
    https://github.com/pytorch/pytorch/issues/18853
    https://github.com/pytorch/pytorch/issues/27600
    https://pytorch.org/docs/stable/notes/faq.html#my-out-of-memory-exception-handler-can-t-allocate-memory
    https://github.com/Lightning-AI/pytorch-lightning/blob/7a4b0fc4331633cdf00b88776689e8a84ef96cb4/src/lightning/pytorch/utilities/memory.py#L83
    """
    gc.collect()
    # Only touch CUDA if it was already initialized; calling into CUDA here
    # would otherwise initialize it as a side effect.
    if torch.cuda.is_initialized():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        stats = [
            f"alloc {human_bytes_size(torch.cuda.memory_allocated())}",
            f"reserved {human_bytes_size(torch.cuda.memory_reserved())}",
        ]
        # Fix: dropped the needless f-prefix (no placeholders in the literal).
        print("CUDA memory usage after triggered GC:", " ".join(stats))