Skip to content

Commit

Permalink
Update on "[NCCL] Add Error log when ProcessGroupNCCL takes down process upon timeout/error"
Browse files Browse the repository at this point in the history

timeout/error

timeout/error

The new NCCL async error handling feature throws an exception from the
workCleanup thread if one of the NCCL operations encounters an error or times
out. This PR adds an error log to make it clearer to the user why the
training process crashed.

Differential Revision: [D23794801](https://our.internmc.facebook.com/intern/diff/D23794801/)

[ghstack-poisoned]
  • Loading branch information
osalpekar committed Oct 8, 2020
2 parents 060f2d9 + acca11b commit 384f1d2
Show file tree
Hide file tree
Showing 1,678 changed files with 69,863 additions and 25,276 deletions.
6 changes: 3 additions & 3 deletions .circleci/cimodel/data/binary_build_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def get_processor_arch_name(gpu_version):
)),
# Skip CUDA-9.2 builds on Windows
windows=(
[v for v in dimensions.GPU_VERSIONS if v not in ['cuda92', "rocm3.7"]],
[v for v in dimensions.GPU_VERSIONS if v not in ['cuda92'] + dimensions.ROCM_VERSION_LABELS],
OrderedDict(
wheel=dimensions.STANDARD_PYTHON_VERSIONS,
conda=dimensions.STANDARD_PYTHON_VERSIONS,
Expand Down Expand Up @@ -142,11 +142,11 @@ def get_children(self):

# XXX disabling conda rocm build since docker images are not there
if self.find_prop("package_format") == 'conda':
gpu_versions = filter(lambda x: x != "rocm3.7", gpu_versions)
gpu_versions = filter(lambda x: x not in dimensions.ROCM_VERSION_LABELS, gpu_versions)

# XXX libtorch rocm build is temporarily disabled
if self.find_prop("package_format") == 'libtorch':
gpu_versions = filter(lambda x: x != "rocm3.7", gpu_versions)
gpu_versions = filter(lambda x: x not in dimensions.ROCM_VERSION_LABELS, gpu_versions)

return [ArchConfigNode(self, v) for v in gpu_versions]

Expand Down
5 changes: 4 additions & 1 deletion .circleci/cimodel/data/dimensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,12 @@

ROCM_VERSIONS = [
"3.7",
"3.8",
]

GPU_VERSIONS = [None] + ["cuda" + v for v in CUDA_VERSIONS] + ["rocm" + v for v in ROCM_VERSIONS]
ROCM_VERSION_LABELS = ["rocm" + v for v in ROCM_VERSIONS]

GPU_VERSIONS = [None] + ["cuda" + v for v in CUDA_VERSIONS] + ROCM_VERSION_LABELS

STANDARD_PYTHON_VERSIONS = [
"3.6",
Expand Down
26 changes: 20 additions & 6 deletions .circleci/cimodel/data/pytorch_build_definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import cimodel.lib.conf_tree as conf_tree
import cimodel.lib.miniutils as miniutils
from cimodel.data.pytorch_build_data import CONFIG_TREE_DATA, TopLevelNode
from cimodel.data.simple.util.branch_filters import gen_filter_dict
from cimodel.data.simple.util.branch_filters import gen_filter_dict, RC_PATTERN
from cimodel.data.simple.util.docker_constants import gen_docker_image


Expand Down Expand Up @@ -110,6 +110,8 @@ def gen_workflow_params(self, phase):
parameters["resource_class"] = resource_class
if phase == "build" and self.rocm_version is not None:
parameters["resource_class"] = "xlarge"
if hasattr(self, 'filters'):
parameters['filters'] = self.filters
return parameters

def gen_workflow_job(self, phase):
Expand Down Expand Up @@ -139,14 +141,16 @@ def gen_workflow_job(self, phase):

# TODO This is a hack to special case some configs just for the workflow list
class HiddenConf(object):
def __init__(self, name, parent_build=None):
def __init__(self, name, parent_build=None, filters=None):
self.name = name
self.parent_build = parent_build
self.filters = filters

def gen_workflow_job(self, phase):
return {
self.gen_build_name(phase): {
"requires": [self.parent_build.gen_build_name("build")]
"requires": [self.parent_build.gen_build_name("build")],
"filters": self.filters,
}
}

Expand All @@ -166,7 +170,8 @@ def gen_workflow_job(self, phase):
"branch": self.branch,
"requires": [self.parent_build],
"context": "org-member",
"filters": gen_filter_dict(branches_list=["nightly"])
"filters": gen_filter_dict(branches_list=["nightly"],
tags_list=RC_PATTERN)
}
}

Expand Down Expand Up @@ -205,7 +210,9 @@ def gen_docs_configs(xenial_parent_config):
configs.append(
HiddenConf(
"pytorch_python_doc_build",
parent_build=xenial_parent_config
parent_build=xenial_parent_config,
filters=gen_filter_dict(branches_list=r"/.*/",
tags_list=RC_PATTERN),
)
)
configs.append(
Expand All @@ -219,7 +226,9 @@ def gen_docs_configs(xenial_parent_config):
configs.append(
HiddenConf(
"pytorch_cpp_doc_build",
parent_build=xenial_parent_config
parent_build=xenial_parent_config,
filters=gen_filter_dict(branches_list=r"/.*/",
tags_list=RC_PATTERN),
)
)
configs.append(
Expand Down Expand Up @@ -279,6 +288,7 @@ def instantiate_configs():
rocm_version = None
if compiler_name == "cuda":
cuda_version = fc.find_prop("compiler_version")
restrict_phases = ["build", "test1", "test2"]

elif compiler_name == "rocm":
rocm_version = fc.find_prop("compiler_version")
Expand Down Expand Up @@ -348,6 +358,8 @@ def instantiate_configs():

# run docs builds on "pytorch-linux-xenial-py3.6-gcc5.4". Docs builds
# should run on a CPU-only build that runs on all PRs.
# XXX should this be updated to a more modern build? Projects are
# beginning to drop python3.6
if (
distro_name == "xenial"
and fc.find_prop("pyver") == "3.6"
Expand All @@ -358,6 +370,8 @@ def instantiate_configs():
and compiler_name == "gcc"
and fc.find_prop("compiler_version") == "5.4"
):
c.filters = gen_filter_dict(branches_list=r"/.*/",
tags_list=RC_PATTERN)
c.dependent_tests = gen_docs_configs(c)

if cuda_version == "10.2" and python_version == "3.6" and not is_libtorch:
Expand Down
30 changes: 18 additions & 12 deletions .circleci/cimodel/data/simple/docker_definitions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from collections import OrderedDict

from cimodel.lib.miniutils import quote
from cimodel.data.simple.util.branch_filters import gen_filter_dict, RC_PATTERN


# TODO: make this generated from a matrix rather than just a static list
Expand All @@ -24,25 +25,30 @@
"pytorch-linux-xenial-py3.8",
"pytorch-linux-xenial-py3.6-clang7",
"pytorch-linux-xenial-py3.6-gcc4.8",
"pytorch-linux-xenial-py3.6-gcc5.4",
"pytorch-linux-xenial-py3.6-gcc5.4", # this one is used in doc builds
"pytorch-linux-xenial-py3.6-gcc7.2",
"pytorch-linux-xenial-py3.6-gcc7",
"pytorch-linux-bionic-rocm3.7-py3.6",
"pytorch-linux-bionic-rocm3.8-py3.6",
]


def get_workflow_jobs():
"""Generates a list of docker image build definitions"""
return [
OrderedDict(
ret = []
for image_name in IMAGE_NAMES:
parameters = OrderedDict({
"name": quote(f"docker-{image_name}"),
"image_name": quote(image_name),
})
if image_name == "pytorch-linux-xenial-py3.6-gcc5.4":
# pushing documentation on tags requires CircleCI to also
# build all the dependencies on tags, including this docker image
parameters['filters'] = gen_filter_dict(branches_list=r"/.*/",
tags_list=RC_PATTERN)
ret.append(OrderedDict(
{
"docker_build_job": OrderedDict(
{
"name": quote(f"docker-{image_name}"),
"image_name": quote(image_name),
}
)
"docker_build_job": parameters
}
)
for image_name in IMAGE_NAMES
]
))
return ret
24 changes: 4 additions & 20 deletions .circleci/cimodel/data/simple/ge_config_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,41 +61,25 @@ def gen_tree(self):
MultiPartVersion([3, 6], "py"),
MultiPartVersion([5, 4], "gcc"),
None,
["ge_config_legacy", "test"],
["jit_legacy", "test"],
["pytorch_linux_xenial_py3_6_gcc5_4_build"]),
GeConfigTestJob(
MultiPartVersion([3, 6], "py"),
MultiPartVersion([5, 4], "gcc"),
None,
["ge_config_profiling", "test"],
["pytorch_linux_xenial_py3_6_gcc5_4_build"]),
GeConfigTestJob(
MultiPartVersion([3, 6], "py"),
MultiPartVersion([5, 4], "gcc"),
None,
["ge_config_simple", "test"],
["jit_simple", "test"],
["pytorch_linux_xenial_py3_6_gcc5_4_build"],
),
GeConfigTestJob(
None,
None,
CudaVersion(10, 2),
["cudnn7", "py3", "ge_config_legacy", "test"],
["pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build"],
use_cuda_docker=True,
# TODO Why does the build environment specify cuda10.1, while the
# job name is cuda10_2?
build_env_override="pytorch-linux-xenial-cuda10.1-cudnn7-ge_config_legacy-test"),
GeConfigTestJob(
None,
None,
CudaVersion(10, 2),
["cudnn7", "py3", "ge_config_profiling", "test"],
["cudnn7", "py3", "jit_legacy", "test"],
["pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build"],
use_cuda_docker=True,
# TODO Why does the build environment specify cuda10.1, while the
# job name is cuda10_2?
build_env_override="pytorch-linux-xenial-cuda10.1-cudnn7-ge_config_profiling-test"),
build_env_override="pytorch-linux-xenial-cuda10.1-cudnn7-jit_legacy-test"),
]


Expand Down
6 changes: 3 additions & 3 deletions .circleci/cimodel/data/simple/ios_definitions.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from cimodel.data.simple.util.versions import MultiPartVersion


IOS_VERSION = MultiPartVersion([11, 2, 1])
IOS_VERSION = MultiPartVersion([12, 0, 0])


class ArchVariant:
Expand Down Expand Up @@ -62,8 +62,8 @@ def gen_tree(self):

WORKFLOW_DATA = [
IOSJob(IOS_VERSION, ArchVariant("x86_64"), is_org_member_context=False),
# IOSJob(IOS_VERSION, ArchVariant("arm64")),
# IOSJob(IOS_VERSION, ArchVariant("arm64", True), extra_props={"op_list": "mobilenetv2.yaml"}),
IOSJob(IOS_VERSION, ArchVariant("arm64")),
IOSJob(IOS_VERSION, ArchVariant("arm64", True), extra_props={"op_list": "mobilenetv2.yaml"}),
]


Expand Down
3 changes: 1 addition & 2 deletions .circleci/cimodel/data/windows_build_definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,7 @@ def FalsePred(_):
def TruePred(_):
return True

# MKLDNN compilation fails with VC-19.27
_VC2019 = VcSpec(2019, ["14", "26"], hide_version=True)
_VC2019 = VcSpec(2019)

WORKFLOW_DATA = [
# VS2019 CUDA-10.1
Expand Down

0 comments on commit 384f1d2

Please sign in to comment.