🐛 Bug
DDP deadlocks on a new DGX A100 machine with 8 GPUs
To Reproduce
Run this self-contained code:
"""
For code used in distributed training.
"""
from typing import Tuple
import torch
import torch.distributed as dist
import os
from torch import Tensor
import torch.multiprocessing as mp
def set_sharing_strategy(new_strategy=None):
"""
https://pytorch.org/docs/stable/multiprocessing.html
https://discuss.pytorch.org/t/how-does-one-setp-up-the-set-sharing-strategy-strategy-for-multiprocessing/113302
https://stackoverflow.com/questions/66426199/how-does-one-setup-the-set-sharing-strategy-strategy-for-multiprocessing-in-pyto
"""
from sys import platform
if new_strategy is not None:
mp.set_sharing_strategy(new_strategy=new_strategy)
else:
if platform == 'darwin': # OS X
# only sharing strategy available at OS X
mp.set_sharing_strategy('file_system')
else:
# ulimit -n 32767 or ulimit -n unlimited (perhaps later do try catch to execute this increase fd limit)
mp.set_sharing_strategy('file_descriptor')
def use_file_system_sharing_strategy():
"""
when to many file descriptor error happens
https://discuss.pytorch.org/t/how-does-one-setp-up-the-set-sharing-strategy-strategy-for-multiprocessing/113302
"""
import torch.multiprocessing
torch.multiprocessing.set_sharing_strategy('file_system')
def find_free_port():
""" https://stackoverflow.com/questions/1365265/on-localhost-how-do-i-pick-a-free-port-number """
import socket
from contextlib import closing
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
s.bind(('', 0))
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
return str(s.getsockname()[1])
def setup_process(rank, world_size, backend='gloo'):
"""
Initialize the distributed environment (for each process).
gloo: is a collective communications library (https://github.com/facebookincubator/gloo). My understanding is that
it's a library/API for process to communicate/coordinate with each other/master. It's a backend library.
export NCCL_SOCKET_IFNAME=eth0
export NCCL_IB_DISABLE=1
https://stackoverflow.com/questions/61075390/about-pytorch-nccl-error-unhandled-system-error-nccl-version-2-4-8
https://pytorch.org/docs/stable/distributed.html#common-environment-variables
"""
import torch.distributed as dist
import os
import torch
if rank != -1: # -1 rank indicates serial code
print(f'setting up rank={rank} (with world_size={world_size})')
# MASTER_ADDR = 'localhost'
MASTER_ADDR = '127.0.0.1'
MASTER_PORT = find_free_port()
# set up the master's ip address so this child process can coordinate
os.environ['MASTER_ADDR'] = MASTER_ADDR
print(f"{MASTER_ADDR=}")
os.environ['MASTER_PORT'] = MASTER_PORT
print(f"{MASTER_PORT}")
# - use NCCL if you are using gpus: https://pytorch.org/tutorials/intermediate/dist_tuto.html#communication-backends
if torch.cuda.is_available():
# unsure if this is really needed
# os.environ['NCCL_SOCKET_IFNAME'] = 'eth0'
# os.environ['NCCL_IB_DISABLE'] = '1'
backend = 'nccl'
print(f'{backend=}')
# Initializes the default distributed process group, and this will also initialize the distributed package.
dist.init_process_group(backend, rank=rank, world_size=world_size)
# dist.init_process_group(backend, rank=rank, world_size=world_size)
# dist.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=rank)
print(f'--> done setting up rank={rank}')
def cleanup(rank):
""" Destroy a given process group, and deinitialize the distributed package """
# only destroy the process distributed group if the code is not running serially
if rank != -1: # -1 rank indicates serial code
dist.destroy_process_group()
def get_batch(batch: Tuple[Tensor, Tensor], rank) -> Tuple[Tensor, Tensor]:
x, y = batch
if torch.cuda.is_available():
x, y = x.to(rank), y.to(rank)
else:
# I don't think this is needed...
# x, y = x.share_memory_(), y.share_memory_()
pass
return x, y
def test_setup():
print('test_setup')
world_size = 4
mp.spawn(setup_process, args=(world_size,), nprocs=4)
dist.destroy_process_group()
print('successful test_setup!')
if __name__ == '__main__':
test_setup()
Error message:
Traceback (most recent call last):
File "/home/miranda9/miniconda3/envs/metalearning/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/home/miranda9/miniconda3/envs/metalearning/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/miranda9/miniconda3/envs/metalearning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
fn(i, *args)
KeyboardInterrupt
Process SpawnProcess-3:
Traceback (most recent call last):
File "/home/miranda9/miniconda3/envs/metalearning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
fn(i, *args)
File "/home/miranda9/ML4Coq/ml4coq-proj/embeddings_zoo/tree_nns/main_brando.py", line 252, in train
setup_process(rank, world_size=opts.world_size)
File "/home/miranda9/ML4Coq/ml4coq-proj/embeddings_zoo/distributed.py", line 85, in setup_process
dist.init_process_group(backend, rank=rank, world_size=world_size)
File "/home/miranda9/miniconda3/envs/metalearning/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 436, in init_process_group
store, rank, world_size = next(rendezvous_iterator)
File "/home/miranda9/miniconda3/envs/metalearning/lib/python3.8/site-packages/torch/distributed/rendezvous.py", line 179, in _env_rendezvous_handler
store = TCPStore(master_addr, master_port, world_size, start_daemon, timeout)
RuntimeError: connect() timed out.
During handling of the above exception, another exception occurred:
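The traceback shows the hang inside the TCPStore rendezvous (connect() timed out), i.e. before any NCCL work starts. Below is a minimal sketch to check whether a bare TCPStore master/client pair can connect at all on this machine; it uses torch.distributed.TCPStore, the same class the traceback fails in. The port 29500, the 2-process layout, and the _tcpstore_check name are illustrative choices, not part of the original repro.

from datetime import timedelta

import torch.distributed as dist
import torch.multiprocessing as mp


def _tcpstore_check(rank: int):
    # rank 0 hosts the store, rank 1 connects to it as a client
    store = dist.TCPStore("127.0.0.1", 29500, 2, rank == 0, timedelta(seconds=30))
    store.set(f"key{rank}", f"hello from rank {rank}")
    # get() blocks until the other rank has set its key (or the timeout expires)
    print(rank, store.get(f"key{1 - rank}"))


if __name__ == "__main__":
    mp.spawn(_tcpstore_check, nprocs=2)

If this small check also times out, the problem is in the TCP rendezvous itself rather than in DDP or NCCL.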
Expected behavior
This should set up the process group in each worker and then tear the workers down without issue.
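For what it is worth, below is a variant of the repro where the free port is picked once in the parent and passed to every spawned process, so all ranks rendezvous on the same MASTER_ADDR/MASTER_PORT. This is only a guess at why the original hangs (each rank calling find_free_port() gets a different port); setup_process_shared_port is a made-up name, not part of the original code.

import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def find_free_port() -> str:
    """ Same helper as in the repro above, but called once in the parent. """
    import socket
    from contextlib import closing
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.bind(('', 0))
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        return str(s.getsockname()[1])


def setup_process_shared_port(rank, world_size, master_port, backend='gloo'):
    # every rank sees the same address/port because the parent chose them
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = master_port
    if torch.cuda.is_available():
        backend = 'nccl'
    dist.init_process_group(backend, rank=rank, world_size=world_size)
    print(f'rank={rank} initialized with backend={backend}')
    dist.destroy_process_group()


if __name__ == '__main__':
    world_size = 4
    master_port = find_free_port()  # picked once, before spawning
    mp.spawn(setup_process_shared_port, args=(world_size, master_port), nprocs=world_size)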
Environment
Please copy and paste the output from our environment collection script (or fill out the checklist below manually). You can get the script and run it with:
(metalearning) $ python collect_env.py
Collecting environment information...
PyTorch version: 1.7.1
Is debug build: False
CUDA used to build PyTorch: 11.0
ROCM used to build PyTorch: N/A
OS: NVIDIA DGX Server (x86_64)
GCC version: (GCC) 5.2.0
Clang version: Could not collect
CMake version: Could not collect
Python version: 3.8 (64-bit runtime)
Is CUDA available: True
CUDA runtime version: 11.0.221
GPU models and configuration:
GPU 0: A100-SXM4-40GB
GPU 1: A100-SXM4-40GB
GPU 2: A100-SXM4-40GB
GPU 3: A100-SXM4-40GB
GPU 4: A100-SXM4-40GB
GPU 5: A100-SXM4-40GB
GPU 6: A100-SXM4-40GB
GPU 7: A100-SXM4-40GB
Nvidia driver version: 450.102.04
cuDNN version: Probably one of the following:
/usr/lib64/libcudnn.so.7.6.3
/usr/lib64/libcudnn.so.8.0.5
/usr/lib64/libcudnn_adv_infer.so.8.0.5
/usr/lib64/libcudnn_adv_train.so.8.0.5
/usr/lib64/libcudnn_cnn_infer.so.8.0.5
/usr/lib64/libcudnn_cnn_train.so.8.0.5
/usr/lib64/libcudnn_ops_infer.so.8.0.5
/usr/lib64/libcudnn_ops_train.so.8.0.5
HIP runtime version: N/A
MIOpen runtime version: N/A
Versions of relevant libraries:
[pip3] numpy==1.19.2
[pip3] torch==1.7.1
[pip3] torchaudio==0.7.0a0+a853dff
[pip3] torchmeta==1.6.1
[pip3] torchvision==0.8.2
[conda] blas 1.0 mkl
[conda] cudatoolkit 11.0.221 h6bb024c_0
[conda] mkl 2020.2 256
[conda] mkl-service 2.3.0 py38he904b0f_0
[conda] mkl_fft 1.3.0 py38h54f3939_0
[conda] mkl_random 1.1.1 py38h0573a6f_0
[conda] numpy 1.19.2 py38h54aff64_0
[conda] numpy-base 1.19.2 py38hfa32c7d_0
[conda] pytorch 1.7.1 py3.8_cuda11.0.221_cudnn8.0.5_0 pytorch
[conda] torchaudio 0.7.2 py38 pytorch
[conda] torchmeta 1.6.1 pypi_0 pypi
[conda] torchvision 0.8.2 py38_cu110 pytorch
- PyTorch Version (e.g., 1.0): pytorch 1.7.1 py3.8_cuda11.0.221_cudnn8.0.5_0 pytorch
- OS (e.g., Linux): $ uname -a
  Linux dgx 3.10.0-1160.11.1.el7.x86_64 #1 SMP Fri Dec 18 16:34:56 UTC 2020 x86_64 x86_64 x86_64 GNU/Linux
- How you installed PyTorch (conda, pip, source): conda, conda install pytorch torchvision torchaudio cudatoolkit=11.0 -c pytorch
- Build command you used (if compiling from source): conda install pytorch torchvision torchaudio cudatoolkit=11.0 -c pytorch
- Python version: Python 3.8.2 (default, Mar 26 2020, 15:53:00)
- CUDA/cuDNN version: $ nvcc --version
  nvcc: NVIDIA (R) Cuda compiler driver
  Copyright (c) 2005-2020 NVIDIA Corporation
  Built on Wed_Jul_22_19:09:09_PDT_2020
  Cuda compilation tools, release 11.0, V11.0.221
  Build cuda_11.0_bu.TC445_37.28845127_0
- GPU models and configuration: DGX A100, 8x A100-SXM4-40GB (see output of nvidia-smi below)
- Any other relevant information:
$ nvidia-smi
Fri Mar 5 13:12:44 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.102.04 Driver Version: 450.102.04 CUDA Version: 11.0 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 A100-SXM4-40GB On | 00000000:07:00.0 Off | 0 |
| N/A 26C P0 51W / 400W | 3MiB / 40537MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 1 A100-SXM4-40GB On | 00000000:0F:00.0 Off | 0 |
| N/A 25C P0 52W / 400W | 3MiB / 40537MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 2 A100-SXM4-40GB On | 00000000:47:00.0 Off | 0 |
| N/A 25C P0 51W / 400W | 3MiB / 40537MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 3 A100-SXM4-40GB On | 00000000:4E:00.0 Off | 0 |
| N/A 25C P0 51W / 400W | 3MiB / 40537MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 4 A100-SXM4-40GB On | 00000000:87:00.0 Off | 0 |
| N/A 30C P0 52W / 400W | 3MiB / 40537MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 5 A100-SXM4-40GB On | 00000000:90:00.0 Off | 0 |
| N/A 29C P0 53W / 400W | 3MiB / 40537MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 6 A100-SXM4-40GB On | 00000000:B7:00.0 Off | 0 |
| N/A 29C P0 52W / 400W | 3MiB / 40537MiB | 0% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
| 7 A100-SXM4-40GB On | 00000000:BD:00.0 Off | 0 |
| N/A 45C P0 211W / 400W | 7500MiB / 40537MiB | 99% Default |
| | | Disabled |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 7 N/A N/A 147243 C python 7497MiB |
+-----------------------------------------------------------------------------+
Additional context
I successfully installed apex, as suggested in other threads, but that didn't help.
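One more thing that might give more signal when reproducing (I have not verified it changes the outcome on this machine): turning on NCCL's debug logging before calling dist.init_process_group, and optionally pinning the socket interface / disabling InfiniBand as the docstring in the repro already hints. NCCL_DEBUG, NCCL_SOCKET_IFNAME and NCCL_IB_DISABLE are standard NCCL environment variables; 'eth0' below is just a placeholder interface name.

import os

# Verbose NCCL logs to see where the rendezvous/communicator init stalls.
os.environ['NCCL_DEBUG'] = 'INFO'
# Placeholder interface name; check `ip addr` on the DGX for the real one.
os.environ['NCCL_SOCKET_IFNAME'] = 'eth0'
# Rule out InfiniBand transport problems.
os.environ['NCCL_IB_DISABLE'] = '1'
# ...then run the repro / call dist.init_process_group(...) as above.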
Additional context (continued)
Cross-posted with fewer details: https://stackoverflow.com/questions/66498045/how-to-solve-dist-init-process-group-from-hanging-or-deadlocks
Related:
- Possible deadlock in dist.init_process_group #9696
- init_process_group() sometimes hangs (not stable) with pytorch 1.0.0 #15638
cc @pietern @mrshenli @pritamdamania87 @zhaojuanmao @satgera @rohan-varma @gqchen @aazzolini @osalpekar @jiayisuse @agolynski @SciPioneer @H-Huang @mrzzd @cbalioglu