Skip to content

Commit

Permalink
Update on "[NCCL] Add Error log when ProcessGroupNCCL takes down process upon timeout/error"
Browse files Browse the repository at this point in the history

The new NCCL async error handling feature throws an exception from the
workCleanup Thread if one of the NCCL operations encounters an error or times
out. This PR adds an error log to make it clearer to the user why the
training process crashed.

Differential Revision: [D23794801](https://our.internmc.facebook.com/intern/diff/D23794801/)

[ghstack-poisoned]
  • Loading branch information
osalpekar committed Oct 9, 2020
2 parents 384f1d2 + f8b3af2 commit 4217efa
Show file tree
Hide file tree
Showing 120 changed files with 4,254 additions and 1,288 deletions.
26 changes: 18 additions & 8 deletions .circleci/cimodel/data/pytorch_build_data.py
Expand Up @@ -18,7 +18,11 @@
("clang", [
("5", [
("3.6", [
("asan", [XImportant(True)]),
("asan", [
(True, [
("shard_test", [XImportant(True)]),
]),
]),
]),
]),
("7", [
Expand All @@ -45,14 +49,14 @@
]),
("10.2", [
("3.6", [
("important", [X(True)]),
("shard_test", [XImportant(True)]),
("libtorch", [X(True)]),
]),
]),
("11.0", [
("3.8", [
X(True),
("libtorch", [XImportant(True)])
("libtorch", [XImportant(True)]),
]),
]),
]),
Expand Down Expand Up @@ -158,6 +162,7 @@ def child_constructor(self):
"libtorch": LibTorchConfigNode,
"important": ImportantConfigNode,
"build_only": BuildOnlyConfigNode,
"shard_test": ShardTestConfigNode,
"cuda_gcc_override": CudaGccOverrideConfigNode,
"coverage": CoverageConfigNode,
"pure_torch": PureTorchConfigNode,
Expand Down Expand Up @@ -195,7 +200,7 @@ def init2(self, node_name):
self.props["is_asan"] = node_name

def child_constructor(self):
return ImportantConfigNode
return ExperimentalFeatureConfigNode


class ONNXConfigNode(TreeConfigNode):
Expand Down Expand Up @@ -260,17 +265,24 @@ def init2(self, node_name):
def child_constructor(self):
return ExperimentalFeatureConfigNode

class BuildOnlyConfigNode(TreeConfigNode):

# Config-tree node that stores its value under the "build_only" property.
class BuildOnlyConfigNode(TreeConfigNode):
# Store this node's name/value in props under the "build_only" key.
def init2(self, node_name):
self.props["build_only"] = node_name

# Returns the ExperimentalFeatureConfigNode class; presumably the base class
# uses it to construct this node's children -- confirm against TreeConfigNode.
def child_constructor(self):
return ExperimentalFeatureConfigNode


class CoverageConfigNode(TreeConfigNode):
# Config-tree node that stores its value under the "shard_test" property.
class ShardTestConfigNode(TreeConfigNode):
# Store this node's name/value in props under the "shard_test" key.
def init2(self, node_name):
self.props["shard_test"] = node_name

# Returns the ImportantConfigNode class; presumably the base class uses it
# to construct this node's children -- confirm against TreeConfigNode.
def child_constructor(self):
return ImportantConfigNode


class CoverageConfigNode(TreeConfigNode):
def init2(self, node_name):
self.props["is_coverage"] = node_name

Expand All @@ -290,7 +302,6 @@ def get_children(self):


class XenialCompilerConfigNode(TreeConfigNode):

def modify_label(self, label):
return label or "<unspecified>"

Expand All @@ -304,7 +315,6 @@ def child_constructor(self):


class BionicCompilerConfigNode(TreeConfigNode):

def modify_label(self, label):
return label or "<unspecified>"

Expand Down
6 changes: 4 additions & 2 deletions .circleci/cimodel/data/pytorch_build_definitions.py
Expand Up @@ -288,7 +288,6 @@ def instantiate_configs():
rocm_version = None
if compiler_name == "cuda":
cuda_version = fc.find_prop("compiler_version")
restrict_phases = ["build", "test1", "test2"]

elif compiler_name == "rocm":
rocm_version = fc.find_prop("compiler_version")
Expand All @@ -311,7 +310,6 @@ def instantiate_configs():
parms_list.append("asan")
python_version = fc.find_prop("pyver")
parms_list[0] = fc.find_prop("abbreviated_pyver")
restrict_phases = ["build", "test1", "test2"]

if is_onnx:
parms_list.append("onnx")
Expand All @@ -328,7 +326,11 @@ def instantiate_configs():
parallel_backend = fc.find_prop("parallel_backend") or None
build_only = fc.find_prop("build_only") or False
is_coverage = fc.find_prop("is_coverage") or False
shard_test = fc.find_prop("shard_test") or False
# TODO: fix pure_torch python test packaging issue.
if shard_test:
restrict_phases = ["build"] if restrict_phases is None else restrict_phases
restrict_phases.extend(["test1", "test2"])
if build_only or is_pure_torch:
restrict_phases = ["build"]
if is_coverage and restrict_phases is None:
Expand Down
68 changes: 9 additions & 59 deletions .circleci/config.yml
Expand Up @@ -142,7 +142,7 @@ commands:
name: (Optional) Merge target branch
no_output_timeout: "10m"
command: |
if [ -n "$CIRCLE_PULL_REQUEST" ]; then
if [[ -n "$CIRCLE_PULL_REQUEST" && "$CIRCLE_BRANCH" != "nightly" ]]; then
PR_NUM=$(basename $CIRCLE_PULL_REQUEST)
CIRCLE_PR_BASE_BRANCH=$(curl -s https://api.github.com/repos/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME/pulls/$PR_NUM | jq -r '.base.ref')
if [[ "${BUILD_ENVIRONMENT}" == *"xla"* || "${BUILD_ENVIRONMENT}" == *"gcc5"* ]] ; then
Expand Down Expand Up @@ -6668,7 +6668,7 @@ workflows:
build_environment: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
- pytorch_linux_test:
name: pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test1
name: pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test
requires:
- pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build
filters:
Expand All @@ -6677,21 +6677,7 @@ workflows:
- master
- /ci-all\/.*/
- /release\/.*/
build_environment: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-test1"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- pytorch_linux_test:
name: pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test2
requires:
- pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_build
filters:
branches:
only:
- master
- /ci-all\/.*/
- /release\/.*/
build_environment: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-test2"
build_environment: "pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7-test"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7"
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
Expand Down Expand Up @@ -6802,21 +6788,7 @@ workflows:
build_environment: "pytorch-libtorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
- pytorch_linux_test:
name: pytorch_libtorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_test1
requires:
- pytorch_libtorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build
filters:
branches:
only:
- master
- /ci-all\/.*/
- /release\/.*/
build_environment: "pytorch-libtorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7-test1"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- pytorch_linux_test:
name: pytorch_libtorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_test2
name: pytorch_libtorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_test
requires:
- pytorch_libtorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build
filters:
Expand All @@ -6825,7 +6797,7 @@ workflows:
- master
- /ci-all\/.*/
- /release\/.*/
build_environment: "pytorch-libtorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7-test2"
build_environment: "pytorch-libtorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7-test"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
Expand All @@ -6842,7 +6814,7 @@ workflows:
build_environment: "pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7"
- pytorch_linux_test:
name: pytorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_test1
name: pytorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_test
requires:
- pytorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_build
filters:
Expand All @@ -6851,21 +6823,7 @@ workflows:
- master
- /ci-all\/.*/
- /release\/.*/
build_environment: "pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7-test1"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7"
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- pytorch_linux_test:
name: pytorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_test2
requires:
- pytorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_build
filters:
branches:
only:
- master
- /ci-all\/.*/
- /release\/.*/
build_environment: "pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7-test2"
build_environment: "pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7-test"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7"
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
Expand All @@ -6876,18 +6834,10 @@ workflows:
build_environment: "pytorch-libtorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7"
- pytorch_linux_test:
name: pytorch_libtorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_test1
requires:
- pytorch_libtorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_build
build_environment: "pytorch-libtorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7-test1"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7"
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- pytorch_linux_test:
name: pytorch_libtorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_test2
name: pytorch_libtorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_test
requires:
- pytorch_libtorch_linux_xenial_cuda11_0_cudnn8_py3_gcc7_build
build_environment: "pytorch-libtorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7-test2"
build_environment: "pytorch-libtorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7-test"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7"
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
Expand Down
2 changes: 1 addition & 1 deletion .circleci/verbatim-sources/commands.yml
Expand Up @@ -103,7 +103,7 @@ commands:
name: (Optional) Merge target branch
no_output_timeout: "10m"
command: |
if [ -n "$CIRCLE_PULL_REQUEST" ]; then
if [[ -n "$CIRCLE_PULL_REQUEST" && "$CIRCLE_BRANCH" != "nightly" ]]; then
PR_NUM=$(basename $CIRCLE_PULL_REQUEST)
CIRCLE_PR_BASE_BRANCH=$(curl -s https://api.github.com/repos/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME/pulls/$PR_NUM | jq -r '.base.ref')
if [[ "${BUILD_ENVIRONMENT}" == *"xla"* || "${BUILD_ENVIRONMENT}" == *"gcc5"* ]] ; then
Expand Down
6 changes: 2 additions & 4 deletions .clang-tidy
@@ -1,7 +1,6 @@
---
# NOTE there must be no spaces before the '-', so put the comma last.
InheritParentConfig: true
Checks: '
Checks: '-*,
bugprone-*,
-bugprone-forward-declaration-namespace,
-bugprone-macro-parentheses,
Expand All @@ -18,7 +17,6 @@ cppcoreguidelines-*,
-cppcoreguidelines-pro-type-union-access,
-cppcoreguidelines-pro-type-vararg,
-cppcoreguidelines-special-member-functions,
-facebook-hte-RelativeInclude,
hicpp-exception-baseclass,
hicpp-avoid-goto,
modernize-*,
Expand All @@ -29,7 +27,7 @@ modernize-*,
-modernize-use-trailing-return-type,
performance-*,
-performance-noexcept-move-constructor,
'
'
HeaderFilterRegex: 'torch/csrc/.*'
AnalyzeTemporaryDtors: false
CheckOptions:
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/lint.yml
Expand Up @@ -110,10 +110,10 @@ jobs:
# Install dependencies
pip install pyyaml
wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
sudo apt-add-repository "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-8 main"
sudo apt-add-repository "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-11 main"
sudo apt-get update
sudo apt-get install -y clang-tidy-8
sudo update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-8 1000
sudo apt-get install -y clang-tidy-11
sudo update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-11 1000
- name: Run clang-tidy
run: |
set -eux
Expand Down
2 changes: 1 addition & 1 deletion .jenkins/pytorch/win-test-helpers/build_pytorch.bat
Expand Up @@ -103,7 +103,7 @@ if "%USE_CUDA%"=="1" (
:: in PATH, and then pass the arguments to it.
:: Currently, randomtemp is placed before sccache (%TMP_DIR_WIN%\bin\nvcc)
:: so we are actually pretending sccache instead of nvcc itself.
curl -kL https://github.com/peterjc123/randomtemp-rust/releases/download/v0.2/randomtemp.exe --output %TMP_DIR_WIN%\bin\randomtemp.exe
curl -kL https://github.com/peterjc123/randomtemp-rust/releases/download/v0.3/randomtemp.exe --output %TMP_DIR_WIN%\bin\randomtemp.exe
set RANDOMTEMP_EXECUTABLE=%TMP_DIR_WIN%\bin\nvcc.exe
set CUDA_NVCC_EXECUTABLE=%TMP_DIR_WIN%\bin\randomtemp.exe
set RANDOMTEMP_BASEDIR=%TMP_DIR_WIN%\bin
Expand Down
Expand Up @@ -12,7 +12,7 @@ call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Minic
if "%REBUILD%"=="" (
call conda install -y -q python=%PYTHON_VERSION% numpy cffi pyyaml boto3
call conda install -y -q -c conda-forge cmake
call conda install -y -q -c rdonnelly libuv
call conda install -y -q -c conda-forge libuv=1.39
)

:: Get installed libuv path
Expand Down
5 changes: 5 additions & 0 deletions aten/src/ATen/core/NamedRegistrations.cpp
Expand Up @@ -317,6 +317,11 @@ TORCH_LIBRARY_IMPL(aten, Named, m) {
m.impl("median.dim_values", CppFunction::makeFallthrough());
m.impl("median.names_dim", CppFunction::makeFallthrough());
m.impl("median.names_dim_values", CppFunction::makeFallthrough());
m.impl("nanmedian", CppFunction::makeFallthrough());
m.impl("nanmedian.dim", CppFunction::makeFallthrough());
m.impl("nanmedian.dim_values", CppFunction::makeFallthrough());
m.impl("nanmedian.names_dim", CppFunction::makeFallthrough());
m.impl("nanmedian.names_dim_values", CppFunction::makeFallthrough());
m.impl("min", CppFunction::makeFallthrough());
m.impl("min.dim", CppFunction::makeFallthrough());
m.impl("min.dim_min", CppFunction::makeFallthrough());
Expand Down
3 changes: 2 additions & 1 deletion aten/src/ATen/core/aten_interned_strings.h
Expand Up @@ -137,7 +137,6 @@ _(aten, _th_baddbmm) \
_(aten, _th_bmm) \
_(aten, _th_get_device) \
_(aten, _th_kthvalue) \
_(aten, _th_median) \
_(aten, _th_mode) \
_(aten, _th_prod) \
_(aten, _th_sigmoid) \
Expand Down Expand Up @@ -463,6 +462,7 @@ _(aten, max_unpool3d_forward) \
_(aten, max_values) \
_(aten, mean) \
_(aten, median) \
_(aten, nanmedian) \
_(aten, meshgrid) \
_(aten, min) \
_(aten, min_values) \
Expand Down Expand Up @@ -902,6 +902,7 @@ _(attr, maxnorm) \
_(attr, maximum) \
_(attr, mean) \
_(attr, median) \
_(attr, nanmedian) \
_(attr, min) \
_(attr, min_indices) \
_(attr, min_val) \
Expand Down
34 changes: 18 additions & 16 deletions aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h
Expand Up @@ -77,22 +77,24 @@ namespace at { namespace cuda {
#define HIPOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR cuOccupancyMaxActiveBlocksPerMultiprocessor
#endif

#define AT_FORALL_NVRTC(_) \
_(nvrtcVersion) \
_(nvrtcCreateProgram) \
_(nvrtcDestroyProgram) \
_(nvrtcGetPTXSize) \
_(nvrtcGetPTX) \
_(cuModuleLoadData) \
_(cuModuleGetFunction) \
_(HIPOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR)\
_(nvrtcGetErrorString) \
_(nvrtcGetProgramLogSize) \
_(nvrtcGetProgramLog) \
_(cuLaunchKernel) \
_(nvrtcCompileProgram) \
_(cuCtxGetCurrent) \
_(cuModuleUnload) \
#define AT_FORALL_NVRTC(_) \
_(nvrtcVersion) \
_(nvrtcCreateProgram) \
_(nvrtcAddNameExpression) \
_(nvrtcDestroyProgram) \
_(nvrtcGetPTXSize) \
_(nvrtcGetPTX) \
_(cuModuleLoadData) \
_(cuModuleGetFunction) \
_(HIPOCCUPANCYMAXACTIVEBLOCKSPERMULTIPROCESSOR) \
_(nvrtcGetErrorString) \
_(nvrtcGetProgramLogSize) \
_(nvrtcGetProgramLog) \
_(cuLaunchKernel) \
_(nvrtcCompileProgram) \
_(cuCtxGetCurrent) \
_(nvrtcGetLoweredName) \
_(cuModuleUnload) \
_(cuDevicePrimaryCtxGetState)

#endif
Expand Down

0 comments on commit 4217efa

Please sign in to comment.