Update on "[NCCL][Test Only] no change"
Differential Revision: [D23922690](https://our.internmc.facebook.com/intern/diff/D23922690/)

[ghstack-poisoned]
mingzhe0908 committed Sep 26, 2020
2 parents e24a3ac + 675bccd · commit 74424f1
Showing 88 changed files with 2,714 additions and 1,644 deletions.
6 changes: 3 additions & 3 deletions .circleci/cimodel/data/binary_build_data.py
@@ -54,7 +54,7 @@ def get_processor_arch_name(gpu_version):
)),
# Skip CUDA-9.2 builds on Windows
windows=(
-        [v for v in dimensions.GPU_VERSIONS if v not in ['cuda92', "rocm3.7"]],
+        [v for v in dimensions.GPU_VERSIONS if v not in ['cuda92'] + dimensions.ROCM_VERSION_LABELS],
OrderedDict(
wheel=dimensions.STANDARD_PYTHON_VERSIONS,
conda=dimensions.STANDARD_PYTHON_VERSIONS,
@@ -142,11 +142,11 @@ def get_children(self):

# XXX disabling conda rocm build since docker images are not there
if self.find_prop("package_format") == 'conda':
-            gpu_versions = filter(lambda x: x != "rocm3.7", gpu_versions)
+            gpu_versions = filter(lambda x: x not in dimensions.ROCM_VERSION_LABELS, gpu_versions)

# XXX libtorch rocm build is temporarily disabled
if self.find_prop("package_format") == 'libtorch':
-            gpu_versions = filter(lambda x: x != "rocm3.7", gpu_versions)
+            gpu_versions = filter(lambda x: x not in dimensions.ROCM_VERSION_LABELS, gpu_versions)

return [ArchConfigNode(self, v) for v in gpu_versions]

5 changes: 4 additions & 1 deletion .circleci/cimodel/data/dimensions.py
@@ -9,9 +9,12 @@

ROCM_VERSIONS = [
"3.7",
"3.8",
]

-GPU_VERSIONS = [None] + ["cuda" + v for v in CUDA_VERSIONS] + ["rocm" + v for v in ROCM_VERSIONS]
+ROCM_VERSION_LABELS = ["rocm" + v for v in ROCM_VERSIONS]
+
+GPU_VERSIONS = [None] + ["cuda" + v for v in CUDA_VERSIONS] + ROCM_VERSION_LABELS

STANDARD_PYTHON_VERSIONS = [
"3.6",
1 change: 1 addition & 0 deletions .circleci/cimodel/data/simple/docker_definitions.py
@@ -28,6 +28,7 @@
"pytorch-linux-xenial-py3.6-gcc7.2",
"pytorch-linux-xenial-py3.6-gcc7",
"pytorch-linux-bionic-rocm3.7-py3.6",
"pytorch-linux-bionic-rocm3.8-py3.6",
]


159 changes: 159 additions & 0 deletions .circleci/config.yml
@@ -2130,6 +2130,39 @@ workflows:
only:
- /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
docker_image: "pytorch/manylinux-rocm:3.7"
- binary_linux_build:
name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_build
build_environment: "manywheel 3.6m rocm3.8 devtoolset7"
filters:
branches:
only:
- /.*/
tags:
only:
- /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
docker_image: "pytorch/manylinux-rocm:3.8"
- binary_linux_build:
name: binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_build
build_environment: "manywheel 3.7m rocm3.8 devtoolset7"
filters:
branches:
only:
- /.*/
tags:
only:
- /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
docker_image: "pytorch/manylinux-rocm:3.8"
- binary_linux_build:
name: binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_build
build_environment: "manywheel 3.8m rocm3.8 devtoolset7"
filters:
branches:
only:
- /.*/
tags:
only:
- /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
docker_image: "pytorch/manylinux-rocm:3.8"
- binary_linux_build:
name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_build
build_environment: "conda 3.6 cpu devtoolset7"
@@ -3429,6 +3462,51 @@ workflows:
docker_image: "pytorch/manylinux-rocm:3.7"
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- binary_linux_test:
name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_test
build_environment: "manywheel 3.6m rocm3.8 devtoolset7"
filters:
branches:
only:
- /.*/
tags:
only:
- /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
requires:
- binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_build
docker_image: "pytorch/manylinux-rocm:3.8"
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- binary_linux_test:
name: binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_test
build_environment: "manywheel 3.7m rocm3.8 devtoolset7"
filters:
branches:
only:
- /.*/
tags:
only:
- /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
requires:
- binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_build
docker_image: "pytorch/manylinux-rocm:3.8"
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- binary_linux_test:
name: binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_test
build_environment: "manywheel 3.8m rocm3.8 devtoolset7"
filters:
branches:
only:
- /.*/
tags:
only:
- /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
requires:
- binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_build
docker_image: "pytorch/manylinux-rocm:3.8"
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- binary_linux_test:
name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_test
build_environment: "conda 3.6 cpu devtoolset7"
@@ -4932,6 +5010,48 @@ workflows:
- /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
package_type: manywheel
upload_subfolder: rocm3.7
- binary_upload:
name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_upload
context: org-member
requires:
- binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_test
filters:
branches:
only:
- nightly
tags:
only:
- /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
package_type: manywheel
upload_subfolder: rocm3.8
- binary_upload:
name: binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_upload
context: org-member
requires:
- binary_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly_test
filters:
branches:
only:
- nightly
tags:
only:
- /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
package_type: manywheel
upload_subfolder: rocm3.8
- binary_upload:
name: binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_upload
context: org-member
requires:
- binary_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly_test
filters:
branches:
only:
- nightly
tags:
only:
- /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
package_type: manywheel
upload_subfolder: rocm3.8
- binary_upload:
name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_upload
context: org-member
@@ -6320,6 +6440,9 @@ workflows:
- docker_build_job:
name: "docker-pytorch-linux-bionic-rocm3.7-py3.6"
image_name: "pytorch-linux-bionic-rocm3.7-py3.6"
- docker_build_job:
name: "docker-pytorch-linux-bionic-rocm3.8-py3.6"
image_name: "pytorch-linux-bionic-rocm3.8-py3.6"
- pytorch_linux_build:
name: pytorch_linux_xenial_py3_6_gcc5_4_build
requires:
@@ -7455,6 +7578,42 @@ workflows:
docker_image: "pytorch/manylinux-rocm:3.7"
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- smoke_linux_test:
name: smoke_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly
build_environment: "manywheel 3.6m rocm3.8 devtoolset7"
requires:
- update_s3_htmls
filters:
branches:
only:
- postnightly
docker_image: "pytorch/manylinux-rocm:3.8"
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- smoke_linux_test:
name: smoke_linux_manywheel_3_7m_rocm3_8_devtoolset7_nightly
build_environment: "manywheel 3.7m rocm3.8 devtoolset7"
requires:
- update_s3_htmls
filters:
branches:
only:
- postnightly
docker_image: "pytorch/manylinux-rocm:3.8"
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- smoke_linux_test:
name: smoke_linux_manywheel_3_8m_rocm3_8_devtoolset7_nightly
build_environment: "manywheel 3.8m rocm3.8 devtoolset7"
requires:
- update_s3_htmls
filters:
branches:
only:
- postnightly
docker_image: "pytorch/manylinux-rocm:3.8"
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- smoke_linux_test:
name: smoke_linux_conda_3_6_cpu_devtoolset7_nightly
build_environment: "conda 3.6 cpu devtoolset7"
7 changes: 7 additions & 0 deletions .circleci/docker/build.sh
@@ -262,6 +262,13 @@ case "$image" in
VISION=yes
ROCM_VERSION=3.7
;;
pytorch-linux-bionic-rocm3.8-py3.6)
ANACONDA_PYTHON_VERSION=3.6
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=3.8
;;
*)
# Catch-all for builds that are not hardcoded.
PROTOBUF=yes
2 changes: 1 addition & 1 deletion .circleci/docker/common/install_base.sh
@@ -118,7 +118,7 @@ esac

# Install Valgrind separately since the apt-get version is too old.
mkdir valgrind_build && cd valgrind_build
-VALGRIND_VERSION=3.15.0
+VALGRIND_VERSION=3.16.1
if ! wget http://valgrind.org/downloads/valgrind-${VALGRIND_VERSION}.tar.bz2
then
wget https://sourceware.org/ftp/valgrind/valgrind-${VALGRIND_VERSION}.tar.bz2
23 changes: 23 additions & 0 deletions aten/src/ATen/Context.cpp
@@ -230,4 +230,27 @@ Allocator* getCPUAllocator() {
return getTHDefaultAllocator();
}

// override_allow_tf32_flag = true
// means the allow_tf32 flags are overridden and TF32 is force-disabled
// override_allow_tf32_flag = false
// means the original allow_tf32 flags are followed
thread_local bool override_allow_tf32_flag = false;

NoTF32Guard::NoTF32Guard() {
if (!override_allow_tf32_flag) {
changed = true;
override_allow_tf32_flag = true;
}
}

NoTF32Guard::~NoTF32Guard() {
if (changed) {
override_allow_tf32_flag = false;
}
}

bool NoTF32Guard::should_disable_tf32() {
return override_allow_tf32_flag;
}

} // namespace at
16 changes: 16 additions & 0 deletions aten/src/ATen/Context.h
@@ -327,4 +327,20 @@ static inline void manual_seed(uint64_t seed) {
}
}

// When the global flag `allow_tf32` is set to true, cuBLAS handles are
// automatically configured to use math mode CUBLAS_TF32_TENSOR_OP_MATH.
// For some operators, such as addmv, TF32 offers no performance improvement
// but causes precision loss. To handle this case, this class implements
// a RAII guard that can be used to quickly disable TF32 within its scope.
//
// Usage:
// NoTF32Guard disable_tf32;
struct TORCH_API NoTF32Guard {
NoTF32Guard();
~NoTF32Guard();
static bool should_disable_tf32();
private:
bool changed = false;
};

} // namespace at
1 change: 1 addition & 0 deletions aten/src/ATen/core/boxing/KernelFunction.cpp
@@ -22,6 +22,7 @@ void fallthrough_kernel(OperatorKernel*, const OperatorHandle&, Stack*) {
void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, Stack*) {
TORCH_INTERNAL_ASSERT(0,
op.operator_name(), " has kernels registered to both Math and a backend mapped to AutogradOther. "
"This makes the backend kernel unreachable (see Note [Ambiguity in AutogradOther kernel]). "
"If it's intended to override Math kernel behavior, please open an issue to request a dedicated "
"Autograd dispatch key for the backend.");
}
Expand Down
13 changes: 8 additions & 5 deletions aten/src/ATen/core/dispatch/OperatorEntry.cpp
@@ -157,10 +157,9 @@ const KernelFunction& OperatorEntry::computeDispatchTableEntry(const c10::Dispat
}

bool OperatorEntry::hasKernelForDispatchKeySet(DispatchKeySet ks) const {
-  for (auto k : ks) {
-    if (kernels_.find(k) != kernels_.end()) {
-      return true;
-    }
+  TORCH_INTERNAL_ASSERT(kernels_.find(DispatchKey::Undefined) == kernels_.end());
+  for (auto& kv : kernels_) {
+    if (ks.has(kv.first)) return true;
}
return false;
}
@@ -196,6 +195,9 @@ std::pair<const AnnotatedKernel&, const char*> OperatorEntry::computeDispatchTab
// In the past we directly call into backends(filled with catchAll) after BackendSelect.
// Now that we first call Autograd backend keys after BackendSelect, we should fill those
// with catchAll as well.
+  // The implementation of (2.1) & (2.3) relies on the invariant that for a given backend,
+  // `computeDispatchTableEntryWithDebug()` will be called for that backend's autograd key after the
+  // backend key. See Note [Refresh Runtime Autograd entries in dispatchTable_]
// (3) Use fallthrough kernels that are registered as fallbacks.
// (4) Use catchAll kernel if available
// Alias Key Precedence:
Expand Down Expand Up @@ -272,7 +274,8 @@ void OperatorEntry::updateDispatchTable_(const c10::Dispatcher& dispatcher, Disp
for (auto k : c10::getRuntimeDispatchKeySet(dispatch_key)) {
updateDispatchTableEntry_(dispatcher, k);
}
-  // Registering to backend key might affect computed entry at its Autograd backend key due to 2.2.
+  // Note [Refresh Runtime Autograd entries in dispatchTable_]
+  // Registering to backend key might affect computed entry at its Autograd backend key due to (2.1) & (2.3).
DispatchKey autograd_key = getAutogradKeyFromBackend(dispatch_key);
updateDispatchTableEntry_(dispatcher, autograd_key);
}
2 changes: 2 additions & 0 deletions aten/src/ATen/core/interned_strings.h
@@ -59,6 +59,8 @@ namespace c10 {
_(prim, Store) \
_(prim, AutogradZero) \
_(prim, AutogradAnyNonZero) \
_(prim, AutogradAllNonZero) \
_(prim, AutogradAllZero) \
_(prim, Starred) \
_(prim, TupleConstruct) \
_(prim, TupleUnpack) \