Skip to content

Commit

Permalink
Update on "Support device map for distributed autograd while using TensorPipe."
Browse files Browse the repository at this point in the history

TensorPipe's `set_device_map` option was applied during the forward
pass. However, if we ran the backward pass for the graph, we would not
automatically pick up the reverse device mapping.

As a result, users had to specify both the forward and backward device
mappings, which is very tedious to do.

This PR adds that functionality, so that TensorPipe automatically
picks up the reverse device mapping during the backward pass. This is done by
storing the appropriate device mapping in the "recv" autograd function for
distributed autograd.

#Closes: #44170

Differential Revision: [D23751975](https://our.internmc.facebook.com/intern/diff/D23751975/)

**NOTE FOR REVIEWERS**: This PR has internal Facebook specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D23751975/)!

[ghstack-poisoned]
  • Loading branch information
pritamdamania committed Dec 24, 2020
2 parents 59fd8eb + 55b431b commit 6dbfacc
Show file tree
Hide file tree
Showing 3,228 changed files with 233,874 additions and 79,064 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
8 changes: 4 additions & 4 deletions .circleci/cimodel/data/binary_build_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,12 @@ def get_processor_arch_name(gpu_version):
"cu" + gpu_version.strip("cuda") if gpu_version.startswith("cuda") else gpu_version
)


LINUX_PACKAGE_VARIANTS = OrderedDict(
manywheel=[
"3.6m",
"3.7m",
"3.8m",
"3.9m"
],
conda=dimensions.STANDARD_PYTHON_VERSIONS,
libtorch=[
Expand All @@ -54,7 +54,7 @@ def get_processor_arch_name(gpu_version):
)),
# Skip CUDA-9.2 builds on Windows
windows=(
[v for v in dimensions.GPU_VERSIONS if v not in ['cuda92', "rocm3.7"]],
[v for v in dimensions.GPU_VERSIONS if v not in ['cuda92'] + dimensions.ROCM_VERSION_LABELS],
OrderedDict(
wheel=dimensions.STANDARD_PYTHON_VERSIONS,
conda=dimensions.STANDARD_PYTHON_VERSIONS,
Expand Down Expand Up @@ -142,11 +142,11 @@ def get_children(self):

# XXX disabling conda rocm build since docker images are not there
if self.find_prop("package_format") == 'conda':
gpu_versions = filter(lambda x: x != "rocm3.7", gpu_versions)
gpu_versions = filter(lambda x: x not in dimensions.ROCM_VERSION_LABELS, gpu_versions)

# XXX libtorch rocm build is temporarily disabled
if self.find_prop("package_format") == 'libtorch':
gpu_versions = filter(lambda x: x != "rocm3.7", gpu_versions)
gpu_versions = filter(lambda x: x not in dimensions.ROCM_VERSION_LABELS, gpu_versions)

return [ArchConfigNode(self, v) for v in gpu_versions]

Expand Down
8 changes: 6 additions & 2 deletions .circleci/cimodel/data/dimensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,17 @@
]

ROCM_VERSIONS = [
"3.7",
"3.10",
"4.0",
]

GPU_VERSIONS = [None] + ["cuda" + v for v in CUDA_VERSIONS] + ["rocm" + v for v in ROCM_VERSIONS]
ROCM_VERSION_LABELS = ["rocm" + v for v in ROCM_VERSIONS]

GPU_VERSIONS = [None] + ["cuda" + v for v in CUDA_VERSIONS] + ROCM_VERSION_LABELS

STANDARD_PYTHON_VERSIONS = [
"3.6",
"3.7",
"3.8",
"3.9"
]
48 changes: 35 additions & 13 deletions .circleci/cimodel/data/pytorch_build_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,11 @@
("clang", [
("5", [
("3.6", [
("asan", [XImportant(True)]),
("asan", [
(True, [
("shard_test", [XImportant(True)]),
]),
]),
]),
]),
("7", [
Expand All @@ -45,14 +49,22 @@
]),
("10.2", [
("3.6", [
("important", [X(True)]),
("libtorch", [X(True)]),
("shard_test", [XImportant(True)]),
("libtorch", [
(True, [
('build_only', [X(True)]),
]),
]),
]),
]),
("11.0", [
("11.1", [
("3.8", [
X(True),
("libtorch", [XImportant(True)])
("libtorch", [
(True, [
('build_only', [XImportant(True)]),
]),
]),
]),
]),
]),
Expand All @@ -72,12 +84,16 @@
("gcc", [
("9", [
("3.8", [
("coverage", [XImportant(True)]),
("coverage", [
(True, [
("shard_test", [XImportant(True)]),
]),
]),
]),
]),
]),
("rocm", [
("3.7", [
("3.9", [
("3.6", [
('build_only', [XImportant(True)]),
]),
Expand Down Expand Up @@ -158,6 +174,7 @@ def child_constructor(self):
"libtorch": LibTorchConfigNode,
"important": ImportantConfigNode,
"build_only": BuildOnlyConfigNode,
"shard_test": ShardTestConfigNode,
"cuda_gcc_override": CudaGccOverrideConfigNode,
"coverage": CoverageConfigNode,
"pure_torch": PureTorchConfigNode,
Expand Down Expand Up @@ -195,7 +212,7 @@ def init2(self, node_name):
self.props["is_asan"] = node_name

def child_constructor(self):
return ImportantConfigNode
return ExperimentalFeatureConfigNode


class ONNXConfigNode(TreeConfigNode):
Expand Down Expand Up @@ -250,7 +267,7 @@ def init2(self, node_name):
self.props["is_libtorch"] = node_name

def child_constructor(self):
return ImportantConfigNode
return ExperimentalFeatureConfigNode


class CudaGccOverrideConfigNode(TreeConfigNode):
Expand All @@ -260,17 +277,24 @@ def init2(self, node_name):
def child_constructor(self):
return ExperimentalFeatureConfigNode

class BuildOnlyConfigNode(TreeConfigNode):

class BuildOnlyConfigNode(TreeConfigNode):
def init2(self, node_name):
self.props["build_only"] = node_name

def child_constructor(self):
return ExperimentalFeatureConfigNode


class CoverageConfigNode(TreeConfigNode):
class ShardTestConfigNode(TreeConfigNode):
def init2(self, node_name):
self.props["shard_test"] = node_name

def child_constructor(self):
return ImportantConfigNode


class CoverageConfigNode(TreeConfigNode):
def init2(self, node_name):
self.props["is_coverage"] = node_name

Expand All @@ -290,7 +314,6 @@ def get_children(self):


class XenialCompilerConfigNode(TreeConfigNode):

def modify_label(self, label):
return label or "<unspecified>"

Expand All @@ -304,7 +327,6 @@ def child_constructor(self):


class BionicCompilerConfigNode(TreeConfigNode):

def modify_label(self, label):
return label or "<unspecified>"

Expand Down
39 changes: 28 additions & 11 deletions .circleci/cimodel/data/pytorch_build_definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import cimodel.lib.conf_tree as conf_tree
import cimodel.lib.miniutils as miniutils
from cimodel.data.pytorch_build_data import CONFIG_TREE_DATA, TopLevelNode
from cimodel.data.simple.util.branch_filters import gen_filter_dict
from cimodel.data.simple.util.branch_filters import gen_filter_dict, RC_PATTERN
from cimodel.data.simple.util.docker_constants import gen_docker_image


Expand Down Expand Up @@ -110,6 +110,8 @@ def gen_workflow_params(self, phase):
parameters["resource_class"] = resource_class
if phase == "build" and self.rocm_version is not None:
parameters["resource_class"] = "xlarge"
if hasattr(self, 'filters'):
parameters['filters'] = self.filters
return parameters

def gen_workflow_job(self, phase):
Expand Down Expand Up @@ -139,14 +141,16 @@ def gen_workflow_job(self, phase):

# TODO This is a hack to special case some configs just for the workflow list
class HiddenConf(object):
def __init__(self, name, parent_build=None):
def __init__(self, name, parent_build=None, filters=None):
self.name = name
self.parent_build = parent_build
self.filters = filters

def gen_workflow_job(self, phase):
return {
self.gen_build_name(phase): {
"requires": [self.parent_build.gen_build_name("build")]
"requires": [self.parent_build.gen_build_name("build")],
"filters": self.filters,
}
}

Expand All @@ -166,7 +170,8 @@ def gen_workflow_job(self, phase):
"branch": self.branch,
"requires": [self.parent_build],
"context": "org-member",
"filters": gen_filter_dict(branches_list=["nightly"])
"filters": gen_filter_dict(branches_list=["nightly"],
tags_list=RC_PATTERN)
}
}

Expand Down Expand Up @@ -205,7 +210,9 @@ def gen_docs_configs(xenial_parent_config):
configs.append(
HiddenConf(
"pytorch_python_doc_build",
parent_build=xenial_parent_config
parent_build=xenial_parent_config,
filters=gen_filter_dict(branches_list=r"/.*/",
tags_list=RC_PATTERN),
)
)
configs.append(
Expand All @@ -219,7 +226,9 @@ def gen_docs_configs(xenial_parent_config):
configs.append(
HiddenConf(
"pytorch_cpp_doc_build",
parent_build=xenial_parent_config
parent_build=xenial_parent_config,
filters=gen_filter_dict(branches_list=r"/.*/",
tags_list=RC_PATTERN),
)
)
configs.append(
Expand Down Expand Up @@ -263,6 +272,7 @@ def instantiate_configs():
compiler_version = fc.find_prop("compiler_version")
is_xla = fc.find_prop("is_xla") or False
is_asan = fc.find_prop("is_asan") or False
is_coverage = fc.find_prop("is_coverage") or False
is_onnx = fc.find_prop("is_onnx") or False
is_pure_torch = fc.find_prop("is_pure_torch") or False
is_vulkan = fc.find_prop("is_vulkan") or False
Expand Down Expand Up @@ -301,7 +311,10 @@ def instantiate_configs():
parms_list.append("asan")
python_version = fc.find_prop("pyver")
parms_list[0] = fc.find_prop("abbreviated_pyver")
restrict_phases = ["build", "test1", "test2"]

if is_coverage:
parms_list_ignored_for_docker_image.append("coverage")
python_version = fc.find_prop("pyver")

if is_onnx:
parms_list.append("onnx")
Expand All @@ -317,13 +330,13 @@ def instantiate_configs():
is_important = fc.find_prop("is_important") or False
parallel_backend = fc.find_prop("parallel_backend") or None
build_only = fc.find_prop("build_only") or False
is_coverage = fc.find_prop("is_coverage") or False
shard_test = fc.find_prop("shard_test") or False
# TODO: fix pure_torch python test packaging issue.
if shard_test:
restrict_phases = ["build"] if restrict_phases is None else restrict_phases
restrict_phases.extend(["test1", "test2"])
if build_only or is_pure_torch:
restrict_phases = ["build"]
if is_coverage and restrict_phases is None:
restrict_phases = ["build", "coverage_test"]


gpu_resource = None
if cuda_version and cuda_version != "10":
Expand All @@ -348,6 +361,8 @@ def instantiate_configs():

# run docs builds on "pytorch-linux-xenial-py3.6-gcc5.4". Docs builds
# should run on a CPU-only build that runs on all PRs.
# XXX should this be updated to a more modern build? Projects are
# beginning to drop python3.6
if (
distro_name == "xenial"
and fc.find_prop("pyver") == "3.6"
Expand All @@ -358,6 +373,8 @@ def instantiate_configs():
and compiler_name == "gcc"
and fc.find_prop("compiler_version") == "5.4"
):
c.filters = gen_filter_dict(branches_list=r"/.*/",
tags_list=RC_PATTERN)
c.dependent_tests = gen_docs_configs(c)

if cuda_version == "10.2" and python_version == "3.6" and not is_libtorch:
Expand Down
37 changes: 22 additions & 15 deletions .circleci/cimodel/data/simple/docker_definitions.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,55 @@
from collections import OrderedDict

from cimodel.lib.miniutils import quote
from cimodel.data.simple.util.branch_filters import gen_filter_dict, RC_PATTERN


# TODO: make this generated from a matrix rather than just a static list
IMAGE_NAMES = [
"pytorch-linux-bionic-cuda11.1-cudnn8-py3.6-gcc9",
"pytorch-linux-bionic-cuda11.1-cudnn8-py3.8-gcc9",
"pytorch-linux-bionic-cuda11.0-cudnn8-py3.6-gcc9",
"pytorch-linux-bionic-cuda11.0-cudnn8-py3.8-gcc9",
"pytorch-linux-bionic-cuda10.2-cudnn7-py3.8-gcc9",
"pytorch-linux-bionic-py3.6-clang9",
"pytorch-linux-bionic-cuda10.2-cudnn7-py3.6-clang9",
"pytorch-linux-bionic-py3.8-gcc9",
"pytorch-linux-bionic-rocm3.5.1-py3.6",
"pytorch-linux-xenial-cuda10-cudnn7-py3-gcc7",
"pytorch-linux-xenial-cuda10.1-cudnn7-py3-gcc7",
"pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7",
"pytorch-linux-xenial-cuda11.0-cudnn8-py3-gcc7",
"pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7",
"pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc5.4",
"pytorch-linux-xenial-cuda9.2-cudnn7-py3-gcc7",
"pytorch-linux-xenial-py3-clang5-android-ndk-r19c",
"pytorch-linux-xenial-py3-clang5-asan",
"pytorch-linux-xenial-py3-clang7-onnx",
"pytorch-linux-xenial-py3.8",
"pytorch-linux-xenial-py3.6-clang7",
"pytorch-linux-xenial-py3.6-gcc4.8",
"pytorch-linux-xenial-py3.6-gcc5.4",
"pytorch-linux-xenial-py3.6-gcc5.4", # this one is used in doc builds
"pytorch-linux-xenial-py3.6-gcc7.2",
"pytorch-linux-xenial-py3.6-gcc7",
"pytorch-linux-bionic-rocm3.7-py3.6",
"pytorch-linux-bionic-rocm3.9-py3.6",
"pytorch-linux-bionic-rocm3.10-py3.6",
]


def get_workflow_jobs():
"""Generates a list of docker image build definitions"""
return [
OrderedDict(
ret = []
for image_name in IMAGE_NAMES:
parameters = OrderedDict({
"name": quote(f"docker-{image_name}"),
"image_name": quote(image_name),
})
if image_name == "pytorch-linux-xenial-py3.6-gcc5.4":
# pushing documentation on tags requires CircleCI to also
# build all the dependencies on tags, including this docker image
parameters['filters'] = gen_filter_dict(branches_list=r"/.*/",
tags_list=RC_PATTERN)
ret.append(OrderedDict(
{
"docker_build_job": OrderedDict(
{
"name": quote(f"docker-{image_name}"),
"image_name": quote(image_name),
}
)
"docker_build_job": parameters
}
)
for image_name in IMAGE_NAMES
]
))
return ret

0 comments on commit 6dbfacc

Please sign in to comment.