`torch.cuda.device_count` is misleading

```
def device_count():
    """Returns the number of GPUs available."""
    if is_available():
        _lazy_init()
        return torch._C._cuda_getDeviceCount()
    else:
        return 0
```

Device count does not return the number of GPUs available. It returns the number of GPUs. Those GPUs may not be available as they are being used by another process.

Will torch in the future support getting the GPUs that have no processes running on them? 

For the mean time, I use this method to determine a list of GPUs with >98% free memory. This allows me to only use the GPUs that do not have processes running on them.
```
def cuda_devices():
    """
    Checks for all CUDA devices with free memory.

    Returns:
        (list [int]) the CUDA devices available
    """

    # Find Cuda
    cuda = None
    for libname in ('libcuda.so', 'libcuda.dylib', 'cuda.dll'):
        try:
            cuda = ctypes.CDLL(libname)
        except OSError:
            continue
        else:
            break

    # Constants taken from cuda.h
    CUDA_SUCCESS = 0

    num_gpu = ctypes.c_int()
    error = ctypes.c_char_p()
    free_memory = ctypes.c_size_t()
    total_memory = ctypes.c_size_t()
    context = ctypes.c_void_p()
    device = ctypes.c_int()
    ret = []  # Device IDs that are not used.

    def run(result, func, *args):
        nonlocal error
        result = func(*args)
        if result != CUDA_SUCCESS:
            cuda.cuGetErrorString(result, ctypes.byref(error))
            logger.warn("%s failed with error code %d: %s", func.__name__, result,
                        error.value.decode())
            return False
        return True

    # Check if Cuda is available
    if not cuda:
        return ret

    result = cuda.cuInit(0)

    # Get number of GPU
    if not run(result, cuda.cuDeviceGetCount, ctypes.byref(num_gpu)):
        return ret

    for i in range(num_gpu.value):
        if (not run(result, cuda.cuDeviceGet, ctypes.byref(device), i) or
                not run(result, cuda.cuDeviceGet, ctypes.byref(device), i) or
                not run(result, cuda.cuCtxCreate, ctypes.byref(context), 0, device) or
                not run(result, cuda.cuMemGetInfo,
                        ctypes.byref(free_memory), ctypes.byref(total_memory))):
            continue

        percent_free_memory = float(free_memory.value) / total_memory.value
        logger.info('CUDA device %d has %f free memory [%d MiB of %d MiB]', i, percent_free_memory,
                    free_memory.value / 1024**2, total_memory.value / 1024**2)
        if percent_free_memory > 0.98:
            logger.info('CUDA device %d is available', i)
            ret.append(i)

        cuda.cuCtxDetach(context)

    return ret
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

`torch.cuda.device_count` is misleading #2379

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

torch.cuda.device_count is misleading #2379

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions

`torch.cuda.device_count` is misleading #2379