
How to solve dist.init_process_group from hanging (or deadlocks) with DGX A100? #53395

@brando90

🐛 Bug

DDP deadlocks on a new DGX A100 machine with 8 GPUs.

To Reproduce

Run this self-contained code:

"""
For code used in distributed training.
"""
from typing import Tuple

import torch
import torch.distributed as dist

import os

from torch import Tensor

import torch.multiprocessing as mp

def set_sharing_strategy(new_strategy=None):
    """
    https://pytorch.org/docs/stable/multiprocessing.html
    https://discuss.pytorch.org/t/how-does-one-setp-up-the-set-sharing-strategy-strategy-for-multiprocessing/113302
    https://stackoverflow.com/questions/66426199/how-does-one-setup-the-set-sharing-strategy-strategy-for-multiprocessing-in-pyto
    """
    from sys import platform

    if new_strategy is not None:
        mp.set_sharing_strategy(new_strategy=new_strategy)
    else:
        if platform == 'darwin':  # OS X
            # 'file_system' is the only sharing strategy available on OS X
            mp.set_sharing_strategy('file_system')
        else:
            # may require raising the fd limit: ulimit -n 32767 or ulimit -n unlimited (perhaps later add a try/except to raise it automatically)
            mp.set_sharing_strategy('file_descriptor')

def use_file_system_sharing_strategy():
    """
    Use when the "too many open file descriptors" error happens.

    https://discuss.pytorch.org/t/how-does-one-setp-up-the-set-sharing-strategy-strategy-for-multiprocessing/113302
    """
    import torch.multiprocessing
    torch.multiprocessing.set_sharing_strategy('file_system')

def find_free_port():
    """ https://stackoverflow.com/questions/1365265/on-localhost-how-do-i-pick-a-free-port-number """
    import socket
    from contextlib import closing

    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.bind(('', 0))
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        return str(s.getsockname()[1])

def setup_process(rank, world_size, backend='gloo'):
    """
    Initialize the distributed environment (for each process).

    gloo is a collective communications library (https://github.com/facebookincubator/gloo). My understanding is that
    it's a library/API for processes to communicate/coordinate with each other and with the master node. It's a backend library.

    export NCCL_SOCKET_IFNAME=eth0
    export NCCL_IB_DISABLE=1

    https://stackoverflow.com/questions/61075390/about-pytorch-nccl-error-unhandled-system-error-nccl-version-2-4-8

    https://pytorch.org/docs/stable/distributed.html#common-environment-variables
    """
    import torch.distributed as dist
    import os
    import torch

    if rank != -1:  # -1 rank indicates serial code
        print(f'setting up rank={rank} (with world_size={world_size})')
        # MASTER_ADDR = 'localhost'
        MASTER_ADDR = '127.0.0.1'
        MASTER_PORT = find_free_port()
        # set up the master's ip address so this child process can coordinate
        os.environ['MASTER_ADDR'] = MASTER_ADDR
        print(f"{MASTER_ADDR=}")
        os.environ['MASTER_PORT'] = MASTER_PORT
        print(f"{MASTER_PORT}")

        # - use NCCL if you are using gpus: https://pytorch.org/tutorials/intermediate/dist_tuto.html#communication-backends
        if torch.cuda.is_available():
            # unsure if this is really needed
            # os.environ['NCCL_SOCKET_IFNAME'] = 'eth0'
            # os.environ['NCCL_IB_DISABLE'] = '1'
            backend = 'nccl'
        print(f'{backend=}')
        # Initializes the default distributed process group, and this will also initialize the distributed package.
        dist.init_process_group(backend, rank=rank, world_size=world_size)
        # dist.init_process_group(backend, rank=rank, world_size=world_size)
        # dist.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=rank)
        print(f'--> done setting up rank={rank}')

def cleanup(rank):
    """ Destroy a given process group, and deinitialize the distributed package """
    # only destroy the process distributed group if the code is not running serially
    if rank != -1:  # -1 rank indicates serial code
        dist.destroy_process_group()

def get_batch(batch: Tuple[Tensor, Tensor], rank) -> Tuple[Tensor, Tensor]:
    x, y = batch
    if torch.cuda.is_available():
        x, y = x.to(rank), y.to(rank)
    else:
        # I don't think this is needed...
        # x, y = x.share_memory_(), y.share_memory_()
        pass
    return x, y

def test_setup():
    print('test_setup')
    world_size = 4
    mp.spawn(setup_process, args=(world_size,), nprocs=4)
    dist.destroy_process_group()
    print('successful test_setup!')


if __name__ == '__main__':
    test_setup()

Error message

Traceback (most recent call last):
  File "/home/miranda9/miniconda3/envs/metalearning/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/miranda9/miniconda3/envs/metalearning/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/miranda9/miniconda3/envs/metalearning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
    fn(i, *args)
KeyboardInterrupt
Process SpawnProcess-3:
Traceback (most recent call last):
  File "/home/miranda9/miniconda3/envs/metalearning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
    fn(i, *args)
  File "/home/miranda9/ML4Coq/ml4coq-proj/embeddings_zoo/tree_nns/main_brando.py", line 252, in train
    setup_process(rank, world_size=opts.world_size)
  File "/home/miranda9/ML4Coq/ml4coq-proj/embeddings_zoo/distributed.py", line 85, in setup_process
    dist.init_process_group(backend, rank=rank, world_size=world_size)
  File "/home/miranda9/miniconda3/envs/metalearning/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 436, in init_process_group
    store, rank, world_size = next(rendezvous_iterator)
  File "/home/miranda9/miniconda3/envs/metalearning/lib/python3.8/site-packages/torch/distributed/rendezvous.py", line 179, in _env_rendezvous_handler
    store = TCPStore(master_addr, master_port, world_size, start_daemon, timeout)
RuntimeError: connect() timed out.

During handling of the above exception, another exception occurred:
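
For reference, the env:// rendezvous (the default init_method) builds the TCPStore shown in the traceback: rank 0 hosts the store at MASTER_ADDR:MASTER_PORT and every other rank connect()s to it, so "connect() timed out" is what appears when the non-zero ranks cannot reach rank 0's store. A tiny sketch of the environment variables it consumes (the values here are examples, not taken from my run):

# env:// reads these variables in every rank; all ranks must agree on them
# https://pytorch.org/docs/stable/distributed.html#environment-variable-initialization
import os
os.environ['MASTER_ADDR'] = '127.0.0.1'  # address every rank connects to
os.environ['MASTER_PORT'] = '29500'      # must be the SAME value in every rank
# rank 0 hosts the TCPStore; the other ranks connect() to MASTER_ADDR:MASTER_PORT,
# and the timeout above is what you see when they cannot reach it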

Expected behavior

This should set up the processes and then tear them down with no issue.
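
One thing I am unsure about (an assumption on my part, not a verified diagnosis): find_free_port() is called inside setup_process, so each spawned rank picks its own MASTER_PORT, and the ranks may then never rendezvous on a single store. Below is a minimal sketch of the variant I would expect to behave as described above, choosing the port once in the parent and passing it to every worker (the worker function and the set_device call are illustrative additions, not part of the original reproduction):

import os
import socket
from contextlib import closing

import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def find_free_port() -> str:
    """Pick a free port once, in the parent, so every rank sees the same value."""
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.bind(('', 0))
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        return str(s.getsockname()[1])

def worker(rank: int, world_size: int, master_port: str):
    # every rank receives the SAME address/port via args instead of re-sampling a port
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = master_port
    backend = 'nccl' if torch.cuda.is_available() else 'gloo'
    if backend == 'nccl':
        torch.cuda.set_device(rank)  # one GPU per rank
    dist.init_process_group(backend, rank=rank, world_size=world_size)
    dist.barrier()  # sanity check: all ranks actually rendezvoused
    print(f'rank {rank}: init ok')
    dist.destroy_process_group()  # each rank tears down its own group

if __name__ == '__main__':
    world_size = 4
    port = find_free_port()  # chosen once, shared by all ranks
    mp.spawn(worker, args=(world_size, port), nprocs=world_size, join=True)
    print('successful test_setup!')

The only change of substance is that the port is sampled once, before mp.spawn, instead of once per rank.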

Environment

Please copy and paste the output from our
environment collection script
(or fill out the checklist below manually).

You can get the script and run it with:

(metalearning) $ python collect_env.py
Collecting environment information...
PyTorch version: 1.7.1
Is debug build: False
CUDA used to build PyTorch: 11.0
ROCM used to build PyTorch: N/A

OS: NVIDIA DGX Server (x86_64)
GCC version: (GCC) 5.2.0
Clang version: Could not collect
CMake version: Could not collect

Python version: 3.8 (64-bit runtime)
Is CUDA available: True
CUDA runtime version: 11.0.221
GPU models and configuration: 
GPU 0: A100-SXM4-40GB
GPU 1: A100-SXM4-40GB
GPU 2: A100-SXM4-40GB
GPU 3: A100-SXM4-40GB
GPU 4: A100-SXM4-40GB
GPU 5: A100-SXM4-40GB
GPU 6: A100-SXM4-40GB
GPU 7: A100-SXM4-40GB

Nvidia driver version: 450.102.04
cuDNN version: Probably one of the following:
/usr/lib64/libcudnn.so.7.6.3
/usr/lib64/libcudnn.so.8.0.5
/usr/lib64/libcudnn_adv_infer.so.8.0.5
/usr/lib64/libcudnn_adv_train.so.8.0.5
/usr/lib64/libcudnn_cnn_infer.so.8.0.5
/usr/lib64/libcudnn_cnn_train.so.8.0.5
/usr/lib64/libcudnn_ops_infer.so.8.0.5
/usr/lib64/libcudnn_ops_train.so.8.0.5
HIP runtime version: N/A
MIOpen runtime version: N/A

Versions of relevant libraries:
[pip3] numpy==1.19.2
[pip3] torch==1.7.1
[pip3] torchaudio==0.7.0a0+a853dff
[pip3] torchmeta==1.6.1
[pip3] torchvision==0.8.2
[conda] blas                      1.0                         mkl  
[conda] cudatoolkit               11.0.221             h6bb024c_0  
[conda] mkl                       2020.2                      256  
[conda] mkl-service               2.3.0            py38he904b0f_0  
[conda] mkl_fft                   1.3.0            py38h54f3939_0  
[conda] mkl_random                1.1.1            py38h0573a6f_0  
[conda] numpy                     1.19.2           py38h54aff64_0  
[conda] numpy-base                1.19.2           py38hfa32c7d_0  
[conda] pytorch                   1.7.1           py3.8_cuda11.0.221_cudnn8.0.5_0    pytorch
[conda] torchaudio                0.7.2                      py38    pytorch
[conda] torchmeta                 1.6.1                    pypi_0    pypi
[conda] torchvision               0.8.2                py38_cu110    pytorch

  • PyTorch Version (e.g., 1.0): pytorch 1.7.1 py3.8_cuda11.0.221_cudnn8.0.5_0 pytorch

  • OS (e.g., Linux): $ uname -a
    Linux dgx 3.10.0-1160.11.1.el7.x86_64 #1 SMP Fri Dec 18 16:34:56 UTC 2020 x86_64 x86_64 x86_64 GNU/Linux

  • How you installed PyTorch (conda, pip, source): conda, conda install pytorch torchvision torchaudio cudatoolkit=11.0 -c pytorch

  • Build command you used (if compiling from source): conda install pytorch torchvision torchaudio cudatoolkit=11.0 -c pytorch

  • Python version: Python 3.8.2 (default, Mar 26 2020, 15:53:00)

  • CUDA/cuDNN version: $ nvcc --version
    nvcc: NVIDIA (R) Cuda compiler driver
    Copyright (c) 2005-2020 NVIDIA Corporation
    Built on Wed_Jul_22_19:09:09_PDT_2020
    Cuda compilation tools, release 11.0, V11.0.221
    Build cuda_11.0_bu.TC445_37.28845127_0

  • GPU models and configuration: DGX A100 (8x A100-SXM4-40GB); see the output of nvidia-smi below

  • Any other relevant information:

$ nvidia-smi
Fri Mar  5 13:12:44 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.102.04   Driver Version: 450.102.04   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  A100-SXM4-40GB      On   | 00000000:07:00.0 Off |                    0 |
| N/A   26C    P0    51W / 400W |      3MiB / 40537MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  A100-SXM4-40GB      On   | 00000000:0F:00.0 Off |                    0 |
| N/A   25C    P0    52W / 400W |      3MiB / 40537MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   2  A100-SXM4-40GB      On   | 00000000:47:00.0 Off |                    0 |
| N/A   25C    P0    51W / 400W |      3MiB / 40537MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   3  A100-SXM4-40GB      On   | 00000000:4E:00.0 Off |                    0 |
| N/A   25C    P0    51W / 400W |      3MiB / 40537MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   4  A100-SXM4-40GB      On   | 00000000:87:00.0 Off |                    0 |
| N/A   30C    P0    52W / 400W |      3MiB / 40537MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   5  A100-SXM4-40GB      On   | 00000000:90:00.0 Off |                    0 |
| N/A   29C    P0    53W / 400W |      3MiB / 40537MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   6  A100-SXM4-40GB      On   | 00000000:B7:00.0 Off |                    0 |
| N/A   29C    P0    52W / 400W |      3MiB / 40537MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   7  A100-SXM4-40GB      On   | 00000000:BD:00.0 Off |                    0 |
| N/A   45C    P0   211W / 400W |   7500MiB / 40537MiB |     99%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|    7   N/A  N/A    147243      C   python                           7497MiB |
+-----------------------------------------------------------------------------+

Additional context

I successfully installed apex, as suggested in other threads, but that didn't help.

More context

Cross-posted with fewer details: https://stackoverflow.com/questions/66498045/how-to-solve-dist-init-process-group-from-hanging-or-deadlocks

related:

cc @pietern @mrshenli @pritamdamania87 @zhaojuanmao @satgera @rohan-varma @gqchen @aazzolini @osalpekar @jiayisuse @agolynski @SciPioneer @H-Huang @mrzzd @cbalioglu
