def device_count():
    """Returns the number of GPUs available."""
    if is_available():
        _lazy_init()
        return torch._C._cuda_getDeviceCount()
    else:
        return 0
device_count does not return the number of GPUs that are available. It returns the number of GPUs on the machine, and some of those GPUs may be unavailable because another process is already using them.
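For example (a minimal sketch; the count shown is hypothetical):

import torch

# device_count() reports every GPU the driver can see, even ones that are
# fully occupied by another process.
print(torch.cuda.device_count())  # e.g. 4 on a 4-GPU machine, busy or idle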
Will torch support, in the future, querying for GPUs that have no processes running on them?
In the meantime, I use the method below to build a list of GPUs with more than 98% free memory, so that I only use GPUs that have no other processes running on them.
import ctypes
import logging

logger = logging.getLogger(__name__)


def cuda_devices():
    """
    Checks all CUDA devices for free memory.

    Returns:
        (list [int]) the CUDA device IDs that appear to be idle
    """
    # Find the CUDA driver library.
    cuda = None
    for libname in ('libcuda.so', 'libcuda.dylib', 'cuda.dll'):
        try:
            cuda = ctypes.CDLL(libname)
        except OSError:
            continue
        else:
            break

    # Constants taken from cuda.h
    CUDA_SUCCESS = 0

    num_gpu = ctypes.c_int()
    error = ctypes.c_char_p()
    free_memory = ctypes.c_size_t()
    total_memory = ctypes.c_size_t()
    context = ctypes.c_void_p()
    device = ctypes.c_int()
    ret = []  # Device IDs that are not in use.

    def run(func, *args):
        """Call a CUDA driver API function and log any error it reports."""
        nonlocal error
        result = func(*args)
        if result != CUDA_SUCCESS:
            cuda.cuGetErrorString(result, ctypes.byref(error))
            logger.warning('%s failed with error code %d: %s', func.__name__, result,
                           error.value.decode())
            return False
        return True

    # Bail out if the CUDA driver library was not found.
    if cuda is None:
        return ret
    if not run(cuda.cuInit, 0):
        return ret
    # Get the number of GPUs.
    if not run(cuda.cuDeviceGetCount, ctypes.byref(num_gpu)):
        return ret
    for i in range(num_gpu.value):
        if (not run(cuda.cuDeviceGet, ctypes.byref(device), i) or
                not run(cuda.cuCtxCreate, ctypes.byref(context), 0, device) or
                not run(cuda.cuMemGetInfo,
                        ctypes.byref(free_memory), ctypes.byref(total_memory))):
            continue
        percent_free_memory = float(free_memory.value) / total_memory.value
        logger.info('CUDA device %d has %f free memory [%d MiB of %d MiB]', i, percent_free_memory,
                    free_memory.value / 1024 ** 2, total_memory.value / 1024 ** 2)
        if percent_free_memory > 0.98:
            logger.info('CUDA device %d is available', i)
            ret.append(i)
        cuda.cuCtxDetach(context)
    return ret
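One way to consume the result (a hedged sketch; the variable names are mine, and restricting visibility via CUDA_VISIBLE_DEVICES only works if it is set before the first CUDA call in the process):

import os

free_gpus = cuda_devices()
if free_gpus:
    # Pin this process to the first idle GPU. This must happen before torch
    # (or anything else) initialises a CUDA context.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(free_gpus[0])
else:
    logger.warning('No idle CUDA devices found; falling back to CPU')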