
How to solve dist.init_process_group from hanging (or deadlocks) with DGX A100? #53395

@brando90

🐛 Bug

DDP deadlocks on a new DGX A100 machine with 8 GPUs.

To Reproduce

Run this self-contained code:

"""
For code used in distributed training.
"""
from typing import Tuple

import torch
import torch.distributed as dist

import os

from torch import Tensor

import torch.multiprocessing as mp

def set_sharing_strategy(new_strategy=None):
    """
    https://pytorch.org/docs/stable/multiprocessing.html
    https://discuss.pytorch.org/t/how-does-one-setp-up-the-set-sharing-strategy-strategy-for-multiprocessing/113302
    https://stackoverflow.com/questions/66426199/how-does-one-setup-the-set-sharing-strategy-strategy-for-multiprocessing-in-pyto
    """
    from sys import platform

    if new_strategy is not None:
        mp.set_sharing_strategy(new_strategy=new_strategy)
    else:
        if platform == 'darwin':  # OS X
            # 'file_system' is the only sharing strategy available on OS X
            mp.set_sharing_strategy('file_system')
        else:
            # may require raising the fd limit: ulimit -n 32767 or ulimit -n unlimited (perhaps later add a try/except to raise it automatically)
            mp.set_sharing_strategy('file_descriptor')

def use_file_system_sharing_strategy():
    """
    Use when the "too many open file descriptors" error happens.

    https://discuss.pytorch.org/t/how-does-one-setp-up-the-set-sharing-strategy-strategy-for-multiprocessing/113302
    """
    import torch.multiprocessing
    torch.multiprocessing.set_sharing_strategy('file_system')

def find_free_port():
    """ https://stackoverflow.com/questions/1365265/on-localhost-how-do-i-pick-a-free-port-number """
    import socket
    from contextlib import closing

    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.bind(('', 0))
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        return str(s.getsockname()[1])

def setup_process(rank, world_size, backend='gloo'):
    """
    Initialize the distributed environment (for each process).

    gloo is a collective communications library (https://github.com/facebookincubator/gloo). My understanding is that
    it's a library/API for processes to communicate/coordinate with each other and with the master node. It's a backend library.

    export NCCL_SOCKET_IFNAME=eth0
    export NCCL_IB_DISABLE=1

    https://stackoverflow.com/questions/61075390/about-pytorch-nccl-error-unhandled-system-error-nccl-version-2-4-8

    https://pytorch.org/docs/stable/distributed.html#common-environment-variables
    """
    import torch.distributed as dist
    import os
    import torch

    if rank != -1:  # -1 rank indicates serial code
        print(f'setting up rank={rank} (with world_size={world_size})')
        # MASTER_ADDR = 'localhost'
        MASTER_ADDR = '127.0.0.1'
        MASTER_PORT = find_free_port()
        # set up the master's ip address so this child process can coordinate
        os.environ['MASTER_ADDR'] = MASTER_ADDR
        print(f"{MASTER_ADDR=}")
        os.environ['MASTER_PORT'] = MASTER_PORT
        print(f"{MASTER_PORT}")

        # - use NCCL if you are using gpus: https://pytorch.org/tutorials/intermediate/dist_tuto.html#communication-backends
        if torch.cuda.is_available():
            # unsure if this is really needed
            # os.environ['NCCL_SOCKET_IFNAME'] = 'eth0'
            # os.environ['NCCL_IB_DISABLE'] = '1'
            backend = 'nccl'
        print(f'{backend=}')
        # Initializes the default distributed process group, and this will also initialize the distributed package.
        dist.init_process_group(backend, rank=rank, world_size=world_size)
        # dist.init_process_group(backend, rank=rank, world_size=world_size)
        # dist.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=rank)
        print(f'--> done setting up rank={rank}')

def cleanup(rank):
    """ Destroy a given process group, and deinitialize the distributed package """
    # only destroy the process distributed group if the code is not running serially
    if rank != -1:  # -1 rank indicates serial code
        dist.destroy_process_group()

def get_batch(batch: Tuple[Tensor, Tensor], rank) -> Tuple[Tensor, Tensor]:
    x, y = batch
    if torch.cuda.is_available():
        x, y = x.to(rank), y.to(rank)
    else:
        # I don't think this is needed...
        # x, y = x.share_memory_(), y.share_memory_()
        pass
    return x, y

def test_setup():
    print('test_setup')
    world_size = 4
    mp.spawn(setup_process, args=(world_size,), nprocs=4)
    dist.destroy_process_group()
    print('successful test_setup!')


if __name__ == '__main__':
    test_setup()

Error message

Traceback (most recent call last):
  File "/home/miranda9/miniconda3/envs/metalearning/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/miranda9/miniconda3/envs/metalearning/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/miranda9/miniconda3/envs/metalearning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
    fn(i, *args)
KeyboardInterrupt
Process SpawnProcess-3:
Traceback (most recent call last):
  File "/home/miranda9/miniconda3/envs/metalearning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
    fn(i, *args)
  File "/home/miranda9/ML4Coq/ml4coq-proj/embeddings_zoo/tree_nns/main_brando.py", line 252, in train
    setup_process(rank, world_size=opts.world_size)
  File "/home/miranda9/ML4Coq/ml4coq-proj/embeddings_zoo/distributed.py", line 85, in setup_process
    dist.init_process_group(backend, rank=rank, world_size=world_size)
  File "/home/miranda9/miniconda3/envs/metalearning/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 436, in init_process_group
    store, rank, world_size = next(rendezvous_iterator)
  File "/home/miranda9/miniconda3/envs/metalearning/lib/python3.8/site-packages/torch/distributed/rendezvous.py", line 179, in _env_rendezvous_handler
    store = TCPStore(master_addr, master_port, world_size, start_daemon, timeout)
RuntimeError: connect() timed out.

During handling of the above exception, another exception occurred:
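
For reference, the env:// rendezvous (the default init_method) builds the TCPStore shown in the traceback: rank 0 hosts the store at MASTER_ADDR:MASTER_PORT and every other rank connect()s to it, so "connect() timed out" is what appears when the non-zero ranks cannot reach rank 0's store. A tiny sketch of the environment variables it consumes (the values here are examples, not taken from my run):

# env:// reads these variables in every rank; all ranks must agree on them
# https://pytorch.org/docs/stable/distributed.html#environment-variable-initialization
import os
os.environ['MASTER_ADDR'] = '127.0.0.1'  # address every rank connects to
os.environ['MASTER_PORT'] = '29500'      # must be the SAME value in every rank
# rank 0 hosts the TCPStore; the other ranks connect() to MASTER_ADDR:MASTER_PORT,
# and the timeout above is what you see when they cannot reach it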

Expected behavior

This should set up the processes and then tear them down with no issue.
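
One thing I am unsure about (an assumption on my part, not a verified diagnosis): find_free_port() is called inside setup_process, so each spawned rank picks its own MASTER_PORT, and the ranks may then never rendezvous on a single store. Below is a minimal sketch of the variant I would expect to behave as described above, choosing the port once in the parent and passing it to every worker (the worker function and the set_device call are illustrative additions, not part of the original reproduction):

import os
import socket
from contextlib import closing

import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def find_free_port() -> str:
    """Pick a free port once, in the parent, so every rank sees the same value."""
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.bind(('', 0))
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        return str(s.getsockname()[1])

def worker(rank: int, world_size: int, master_port: str):
    # every rank receives the SAME address/port via args instead of re-sampling a port
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = master_port
    backend = 'nccl' if torch.cuda.is_available() else 'gloo'
    if backend == 'nccl':
        torch.cuda.set_device(rank)  # one GPU per rank
    dist.init_process_group(backend, rank=rank, world_size=world_size)
    dist.barrier()  # sanity check: all ranks actually rendezvoused
    print(f'rank {rank}: init ok')
    dist.destroy_process_group()  # each rank tears down its own group

if __name__ == '__main__':
    world_size = 4
    port = find_free_port()  # chosen once, shared by all ranks
    mp.spawn(worker, args=(world_size, port), nprocs=world_size, join=True)
    print('successful test_setup!')

The only change of substance is that the port is sampled once, before mp.spawn, instead of once per rank.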

Environment

Please copy and paste the output from our
environment collection script
(or fill out the checklist below manually).

You can get the script and run it with:

(metalearning) $ python collect_env.py
Collecting environment information...
PyTorch version: 1.7.1
Is debug build: False
CUDA used to build PyTorch: 11.0
ROCM used to build PyTorch: N/A

OS: NVIDIA DGX Server (x86_64)
GCC version: (GCC) 5.2.0
Clang version: Could not collect
CMake version: Could not collect

Python version: 3.8 (64-bit runtime)
Is CUDA available: True
CUDA runtime version: 11.0.221
GPU models and configuration: 
GPU 0: A100-SXM4-40GB
GPU 1: A100-SXM4-40GB
GPU 2: A100-SXM4-40GB
GPU 3: A100-SXM4-40GB
GPU 4: A100-SXM4-40GB
GPU 5: A100-SXM4-40GB
GPU 6: A100-SXM4-40GB
GPU 7: A100-SXM4-40GB

Nvidia driver version: 450.102.04
cuDNN version: Probably one of the following:
/usr/lib64/libcudnn.so.7.6.3
/usr/lib64/libcudnn.so.8.0.5
/usr/lib64/libcudnn_adv_infer.so.8.0.5
/usr/lib64/libcudnn_adv_train.so.8.0.5
/usr/lib64/libcudnn_cnn_infer.so.8.0.5
/usr/lib64/libcudnn_cnn_train.so.8.0.5
/usr/lib64/libcudnn_ops_infer.so.8.0.5
/usr/lib64/libcudnn_ops_train.so.8.0.5
HIP runtime version: N/A
MIOpen runtime version: N/A

Versions of relevant libraries:
[pip3] numpy==1.19.2
[pip3] torch==1.7.1
[pip3] torchaudio==0.7.0a0+a853dff
[pip3] torchmeta==1.6.1
[pip3] torchvision==0.8.2
[conda] blas                      1.0                         mkl  
[conda] cudatoolkit               11.0.221             h6bb024c_0  
[conda] mkl                       2020.2                      256  
[conda] mkl-service               2.3.0            py38he904b0f_0  
[conda] mkl_fft                   1.3.0            py38h54f3939_0  
[conda] mkl_random                1.1.1            py38h0573a6f_0  
[conda] numpy                     1.19.2           py38h54aff64_0  
[conda] numpy-base                1.19.2           py38hfa32c7d_0  
[conda] pytorch                   1.7.1           py3.8_cuda11.0.221_cudnn8.0.5_0    pytorch
[conda] torchaudio                0.7.2                      py38    pytorch
[conda] torchmeta                 1.6.1                    pypi_0    pypi
[conda] torchvision               0.8.2                py38_cu110    pytorch

  • PyTorch Version (e.g., 1.0): pytorch 1.7.1 py3.8_cuda11.0.221_cudnn8.0.5_0 pytorch

  • OS (e.g., Linux): $ uname -a
    Linux dgx 3.10.0-1160.11.1.el7.x86_64 #1 SMP Fri Dec 18 16:34:56 UTC 2020 x86_64 x86_64 x86_64 GNU/Linux

  • How you installed PyTorch (conda, pip, source): conda, conda install pytorch torchvision torchaudio cudatoolkit=11.0 -c pytorch

  • Build command you used (if compiling from source): conda install pytorch torchvision torchaudio cudatoolkit=11.0 -c pytorch

  • Python version: Python 3.8.2 (default, Mar 26 2020, 15:53:00)

  • CUDA/cuDNN version: $ nvcc --version
    nvcc: NVIDIA (R) Cuda compiler driver
    Copyright (c) 2005-2020 NVIDIA Corporation
    Built on Wed_Jul_22_19:09:09_PDT_2020
    Cuda compilation tools, release 11.0, V11.0.221
    Build cuda_11.0_bu.TC445_37.28845127_0

  • GPU models and configuration: DGX A100 (8x A100-SXM4-40GB); see the output of nvidia-smi below

  • Any other relevant information:

$ nvidia-smi
Fri Mar  5 13:12:44 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.102.04   Driver Version: 450.102.04   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  A100-SXM4-40GB      On   | 00000000:07:00.0 Off |                    0 |
| N/A   26C    P0    51W / 400W |      3MiB / 40537MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  A100-SXM4-40GB      On   | 00000000:0F:00.0 Off |                    0 |
| N/A   25C    P0    52W / 400W |      3MiB / 40537MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   2  A100-SXM4-40GB      On   | 00000000:47:00.0 Off |                    0 |
| N/A   25C    P0    51W / 400W |      3MiB / 40537MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   3  A100-SXM4-40GB      On   | 00000000:4E:00.0 Off |                    0 |
| N/A   25C    P0    51W / 400W |      3MiB / 40537MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   4  A100-SXM4-40GB      On   | 00000000:87:00.0 Off |                    0 |
| N/A   30C    P0    52W / 400W |      3MiB / 40537MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   5  A100-SXM4-40GB      On   | 00000000:90:00.0 Off |                    0 |
| N/A   29C    P0    53W / 400W |      3MiB / 40537MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   6  A100-SXM4-40GB      On   | 00000000:B7:00.0 Off |                    0 |
| N/A   29C    P0    52W / 400W |      3MiB / 40537MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   7  A100-SXM4-40GB      On   | 00000000:BD:00.0 Off |                    0 |
| N/A   45C    P0   211W / 400W |   7500MiB / 40537MiB |     99%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|    7   N/A  N/A    147243      C   python                           7497MiB |
+-----------------------------------------------------------------------------+

Additional context

I successfully installed apex, as suggested in other threads, but that didn't help.

More context

Cross-posted with fewer details: https://stackoverflow.com/questions/66498045/how-to-solve-dist-init-process-group-from-hanging-or-deadlocks

related:

cc @pietern @mrshenli @pritamdamania87 @zhaojuanmao @satgera @rohan-varma @gqchen @aazzolini @osalpekar @jiayisuse @agolynski @SciPioneer @H-Huang @mrzzd @cbalioglu
