Update on "Fix Pipe + DDP for unused parameters, static graph"
Pipe + DDP has a few issues:

1) With static graph, gradients are not synchronized on the first backward pass (i.e. the delayed allreduce is never run). Broken since #55248.
2) With find_unused_parameters=True, gradients are likewise not synchronized. Broken since #57081.


The root cause of both is that calling `DDPSink.apply(output_tensor)` does not invoke the custom `backward` of `DDPSink` when `output_tensor` is actually an `OwnerRRef`, which is the case when DDP runs inside `Pipe`. We call `backward` on `rref.local_value()`, which does not carry that autograd recording.

To fix, we unwrap the RRef and reconstruct it as needed, similar to the fix in #49908.
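A minimal sketch of the unwrap/re-wrap idea, for illustration only (the helper name and the way the sink is passed in are hypothetical stand-ins, not the actual torch.distributed internals):

```python
import torch.distributed.rpc as rpc

def _run_ddp_sink_on_output(output, ddp_sink_apply):
    # When Pipe hands back an OwnerRRef, the autograd graph hangs off the
    # wrapped local value, so applying the sink to the RRef itself would
    # never trigger DDPSink's custom backward. Unwrap, apply the sink to the
    # local value, then re-wrap so callers still see an RRef.
    if isinstance(output, rpc.RRef):
        sunk = ddp_sink_apply(output.local_value())
        return rpc.RRef(sunk)
    return ddp_sink_apply(output)
```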

To test: all tests in pipe_with_ddp_test pass.
These tests did not catch the errors earlier because all ranks received the same model inputs; if gradient synchronization did not occur, grads would still match, since the model is identical on all ranks (guaranteed by DDP). The tests now use different inputs across ranks.
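For illustration, a hedged sketch of what the per-rank input change exercises (not the exact pipe_with_ddp_test code; the helper below is hypothetical):

```python
import torch
import torch.distributed as dist

def assert_grads_synced(model, rank, world_size):
    # Each rank feeds a different input, so local gradients differ unless
    # DDP actually allreduces them during the backward pass.
    inp = torch.full((4, 8), float(rank))
    model(inp).sum().backward()
    for p in model.parameters():
        if p.grad is None:
            continue
        gathered = [torch.empty_like(p.grad) for _ in range(world_size)]
        dist.all_gather(gathered, p.grad)
        for g in gathered:
            assert torch.allclose(g, gathered[0]), "gradients were not synchronized"
```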

Differential Revision: [D29167283](https://our.internmc.facebook.com/intern/diff/D29167283/)

[ghstack-poisoned]
rohan-varma committed Jun 17, 2021
2 parents 5db4162 + 86635e0 commit 1d9a066
Showing 70 changed files with 1,052 additions and 595 deletions.
6 changes: 0 additions & 6 deletions .circleci/cimodel/data/windows_build_definitions.py
@@ -147,14 +147,8 @@ def render(self):
WORKFLOW_DATA = [
# VS2019 CUDA-10.1
WindowsJob(None, _VC2019, CudaVersion(10, 1), master_only=True),
WindowsJob(1, _VC2019, CudaVersion(10, 1), master_only=True),
WindowsJob(2, _VC2019, CudaVersion(10, 1), master_only=True),
# VS2019 CUDA-10.1 force on cpu
WindowsJob(1, _VC2019, CudaVersion(10, 1), force_on_cpu=True, master_only=True),
# VS2019 CUDA-11.1
WindowsJob(None, _VC2019, CudaVersion(11, 1)),
WindowsJob(1, _VC2019, CudaVersion(11, 1), master_only=True),
WindowsJob(2, _VC2019, CudaVersion(11, 1), master_only=True),

# TODO: This test is disabled due to https://github.com/pytorch/pytorch/issues/59724
# WindowsJob('_azure_multi_gpu', _VC2019, CudaVersion(11, 1), multi_gpu=True, master_and_nightly=True),
146 changes: 0 additions & 146 deletions .circleci/config.yml
@@ -7637,44 +7637,6 @@ workflows:
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
cuda_version: "10.1"
executor: windows-with-nvidia-gpu
filters:
branches:
only:
- master
- /ci-all\/.*/
- /release\/.*/
name: pytorch_windows_vs2019_py38_cuda10.1_test1
python_version: "3.8"
requires:
- pytorch_windows_vs2019_py38_cuda10.1_build
test_name: pytorch-windows-test1
use_cuda: "1"
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
cuda_version: "10.1"
executor: windows-with-nvidia-gpu
filters:
branches:
only:
- master
- /ci-all\/.*/
- /release\/.*/
name: pytorch_windows_vs2019_py38_cuda10.1_test2
python_version: "3.8"
requires:
- pytorch_windows_vs2019_py38_cuda10.1_build
test_name: pytorch-windows-test2
use_cuda: "1"
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
cuda_version: "10.1"
@@ -7693,53 +7655,6 @@
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_build:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
cuda_version: "11.1"
name: pytorch_windows_vs2019_py38_cuda11.1_build
python_version: "3.8"
use_cuda: "1"
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
cuda_version: "11.1"
executor: windows-with-nvidia-gpu
filters:
branches:
only:
- master
- /ci-all\/.*/
- /release\/.*/
name: pytorch_windows_vs2019_py38_cuda11.1_test1
python_version: "3.8"
requires:
- pytorch_windows_vs2019_py38_cuda11.1_build
test_name: pytorch-windows-test1
use_cuda: "1"
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
cuda_version: "11.1"
executor: windows-with-nvidia-gpu
filters:
branches:
only:
- master
- /ci-all\/.*/
- /release\/.*/
name: pytorch_windows_vs2019_py38_cuda11.1_test2
python_version: "3.8"
requires:
- pytorch_windows_vs2019_py38_cuda11.1_build
test_name: pytorch-windows-test2
use_cuda: "1"
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- update_s3_htmls:
context: org-member
filters:
@@ -9278,32 +9193,6 @@ workflows:
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
cuda_version: "10.1"
executor: windows-with-nvidia-gpu
name: pytorch_windows_vs2019_py38_cuda10.1_test1
python_version: "3.8"
requires:
- pytorch_windows_vs2019_py38_cuda10.1_build
test_name: pytorch-windows-test1
use_cuda: "1"
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
cuda_version: "10.1"
executor: windows-with-nvidia-gpu
name: pytorch_windows_vs2019_py38_cuda10.1_test2
python_version: "3.8"
requires:
- pytorch_windows_vs2019_py38_cuda10.1_build
test_name: pytorch-windows-test2
use_cuda: "1"
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
cuda_version: "10.1"
@@ -9316,41 +9205,6 @@
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_build:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
cuda_version: "11.1"
name: pytorch_windows_vs2019_py38_cuda11.1_build
python_version: "3.8"
use_cuda: "1"
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
cuda_version: "11.1"
executor: windows-with-nvidia-gpu
name: pytorch_windows_vs2019_py38_cuda11.1_test1
python_version: "3.8"
requires:
- pytorch_windows_vs2019_py38_cuda11.1_build
test_name: pytorch-windows-test1
use_cuda: "1"
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
cuda_version: "11.1"
executor: windows-with-nvidia-gpu
name: pytorch_windows_vs2019_py38_cuda11.1_test2
python_version: "3.8"
requires:
- pytorch_windows_vs2019_py38_cuda11.1_build
test_name: pytorch-windows-test2
use_cuda: "1"
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
when: << pipeline.parameters.run_master_build >>
slow_gradcheck_build:
jobs:
49 changes: 26 additions & 23 deletions .github/workflows/lint.yml
@@ -270,7 +270,7 @@ jobs:
runs-on: ubuntu-18.04
container:
# ubuntu18.04-cuda10.2-py3.6-tidy11
image: ghcr.io/pytorch/cilint-clang-tidy:e2cfc57ce4fa3a257a4b78fdfdc2b065c167b9c5
image: ghcr.io/pytorch/cilint-clang-tidy:52a8ad78d49fc9f40241fee7988db48c920499df
steps:
- name: Checkout PyTorch
uses: actions/checkout@v2
@@ -312,12 +312,14 @@ jobs:
fi
- name: Run clang-tidy
env:
BASE_SHA: ${{ github.event.pull_request.base.sha }}
HEAD_SHA: ${{ github.event.pull_request.head.sha }}
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
cd "${GITHUB_WORKSPACE}"
set -eux
wget -O pr.diff "https://patch-diff.githubusercontent.com/raw/pytorch/pytorch/pull/$PR_NUMBER.diff"
# Run Clang-Tidy
# The negative filters below are to exclude files that include onnx_pb.h or
# caffe2_pb.h, otherwise we'd have to build protos as part of this CI job.
@@ -326,27 +328,28 @@
# /torch/csrc/generic/*.cpp is excluded because those files aren't actually built.
# deploy/interpreter files are excluded due to using macros and other techniques
# that are not easily converted to accepted c++
python3 tools/clang_tidy.py \
--verbose \
--paths torch/csrc/ \
--diff "$BASE_SHA" \
-g"-torch/csrc/jit/passes/onnx/helper.cpp" \
-g"-torch/csrc/jit/passes/onnx/shape_type_inference.cpp"\
-g"-torch/csrc/jit/serialization/onnx.cpp" \
-g"-torch/csrc/jit/serialization/export.cpp" \
-g"-torch/csrc/jit/serialization/import.cpp" \
-g"-torch/csrc/jit/serialization/import_legacy.cpp" \
-g"-torch/csrc/onnx/init.cpp" \
-g"-torch/csrc/cuda/nccl.*" \
-g"-torch/csrc/cuda/python_nccl.cpp" \
-g"-torch/csrc/autograd/FunctionsManual.cpp" \
-g"-torch/csrc/generic/*.cpp" \
-g"-torch/csrc/jit/codegen/cuda/runtime/*" \
-g"-torch/csrc/deploy/interpreter/interpreter.cpp" \
-g"-torch/csrc/deploy/interpreter/interpreter.h" \
-g"-torch/csrc/deploy/interpreter/interpreter_impl.h" \
-g"-torch/csrc/deploy/interpreter/test_main.cpp" \
"$@" > "${GITHUB_WORKSPACE}"/clang-tidy-output.txt
python3 tools/clang_tidy.py \
--verbose \
--paths torch/csrc/ \
--diff-file pr.diff \
-g"-torch/csrc/jit/passes/onnx/helper.cpp" \
-g"-torch/csrc/jit/passes/onnx/shape_type_inference.cpp" \
-g"-torch/csrc/jit/serialization/onnx.cpp" \
-g"-torch/csrc/jit/serialization/export.cpp" \
-g"-torch/csrc/jit/serialization/import.cpp" \
-g"-torch/csrc/jit/serialization/import_legacy.cpp" \
-g"-torch/csrc/onnx/init.cpp" \
-g"-torch/csrc/cuda/nccl.*" \
-g"-torch/csrc/cuda/python_nccl.cpp" \
-g"-torch/csrc/autograd/FunctionsManual.cpp" \
-g"-torch/csrc/generic/*.cpp" \
-g"-torch/csrc/jit/codegen/cuda/runtime/*" \
-g"-torch/csrc/deploy/interpreter/interpreter.cpp" \
-g"-torch/csrc/deploy/interpreter/interpreter.h" \
-g"-torch/csrc/deploy/interpreter/interpreter_impl.h" \
-g"-torch/csrc/deploy/interpreter/test_main.cpp" \
"$@" >"${GITHUB_WORKSPACE}"/clang-tidy-output.txt
cat "${GITHUB_WORKSPACE}"/clang-tidy-output.txt
3 changes: 2 additions & 1 deletion .jenkins/pytorch/macos-test.sh
@@ -53,7 +53,8 @@ test_python_all() {

# Try to pull value from CIRCLE_PULL_REQUEST first then GITHUB_HEAD_REF second
# CIRCLE_PULL_REQUEST comes from CircleCI
# GITHUB_HEAD_REF comes from Github Actions
# NOTE: file_diff_from_base is currently bugged for GHA due to an issue finding a merge base for ghstack PRs
# see https://github.com/pytorch/pytorch/issues/60111
IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-${GITHUB_HEAD_REF:-}}
if [ -n "$IN_PULL_REQUEST" ]; then
DETERMINE_FROM=$(mktemp)
3 changes: 2 additions & 1 deletion .jenkins/pytorch/test.sh
@@ -124,7 +124,8 @@ fi

# Try to pull value from CIRCLE_PULL_REQUEST first then GITHUB_HEAD_REF second
# CIRCLE_PULL_REQUEST comes from CircleCI
# GITHUB_HEAD_REF comes from Github Actions
# NOTE: file_diff_from_base is currently bugged for GHA due to an issue finding a merge base for ghstack PRs
# see https://github.com/pytorch/pytorch/issues/60111
IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-}
if [ -n "$IN_PULL_REQUEST" ] && [[ "$BUILD_ENVIRONMENT" != *coverage* ]]; then
DETERMINE_FROM=$(mktemp)
2 changes: 2 additions & 0 deletions .jenkins/pytorch/win-test.sh
@@ -43,6 +43,8 @@ fi
export SCRIPT_HELPERS_DIR=$SCRIPT_PARENT_DIR/win-test-helpers

# Try to pull value from CIRCLE_PULL_REQUEST
# NOTE: file_diff_from_base is currently bugged for GHA due to an issue finding a merge base for ghstack PRs
# see https://github.com/pytorch/pytorch/issues/60111
IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-}
if [ -n "$IN_PULL_REQUEST" ]; then
DETERMINE_FROM="${TMP_DIR}/determine_from"
1 change: 0 additions & 1 deletion aten/src/ATen/autocast_mode.cpp
@@ -368,7 +368,6 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) {
KERNEL(ADD_NS(pow), "pow.Tensor_Tensor", Tensor (const Tensor &, const Tensor &), fp32)
KERNEL(ADD_NS(pow), "pow.Scalar", Tensor (const Scalar&, const Tensor &), fp32)
KERNEL(ADD_NS(softplus), "softplus", Tensor (const Tensor &, const Scalar&, const Scalar&), fp32)
KERNEL(ADD_NS(gelu), "gelu", Tensor (const Tensor &), fp32)
KERNEL(ADD_NS(layer_norm), "layer_norm", Tensor (const Tensor &, IntArrayRef, const c10::optional<Tensor>&, const c10::optional<Tensor>&, double, bool), fp32)
// The macro doesn't like this one (I think it chokes on commas inside <>) so write it manually
m.impl(TORCH_SELECTIVE_NAME("aten::native_layer_norm"),
4 changes: 2 additions & 2 deletions aten/src/ATen/cpu/vec/vec256/vec256_base.h
@@ -213,14 +213,14 @@ struct Vectorized {
}
return vec;
}
Vectorized<T> map(T (*f)(T)) const {
Vectorized<T> map(T (*const f)(T)) const {
Vectorized<T> ret;
for (int64_t i = 0; i != size(); i++) {
ret[i] = f(values[i]);
}
return ret;
}
Vectorized<T> map(T (*f)(const T &)) const {
Vectorized<T> map(T (*const f)(const T &)) const {
Vectorized<T> ret;
for (int64_t i = 0; i != size(); i++) {
ret[i] = f(values[i]);
20 changes: 10 additions & 10 deletions aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h
@@ -197,19 +197,19 @@ template <> class Vectorized<BFloat16> {
}
return b;
}
Vectorized<BFloat16> map(const __m256 (*vop)(__m256)) const {
Vectorized<BFloat16> map(const __m256 (*const vop)(__m256)) const {
__m256 lo, hi;
cvtbf16_fp32(values, lo, hi);
auto o1 = vop(lo);
auto o2 = vop(hi);
const auto o1 = vop(lo);
const auto o2 = vop(hi);
return cvtfp32_bf16(o1, o2);
}
Vectorized<BFloat16> abs() const {
__m256 lo, hi;
cvtbf16_fp32(values, lo, hi);
auto mask = _mm256_set1_ps(-0.f);
auto o1 = _mm256_andnot_ps(mask, lo);
auto o2 = _mm256_andnot_ps(mask, hi);
const auto mask = _mm256_set1_ps(-0.f);
const auto o1 = _mm256_andnot_ps(mask, lo);
const auto o2 = _mm256_andnot_ps(mask, hi);
return cvtfp32_bf16(o1, o2);
}
Vectorized<BFloat16> angle() const {
@@ -328,17 +328,17 @@ template <> class Vectorized<BFloat16> {
Vectorized<BFloat16> i0e() const {
__m256 lo, hi;
cvtbf16_fp32(values, lo, hi);
auto sz = size();
constexpr auto sz = size();
__at_align32__ float tmp1[sz / 2], tmp2[sz / 2];
_mm256_storeu_ps(reinterpret_cast<float*>(tmp1), lo);
_mm256_storeu_ps(reinterpret_cast<float*>(tmp2), hi);

for (decltype(sz) i = 0; i < sz / 2; i++) {
for (auto i = decltype(sz){0}; i < sz / 2; i++) {
tmp1[i] = calc_i0e(tmp1[i]);
tmp2[i] = calc_i0e(tmp2[i]);
}
auto o1 = _mm256_loadu_ps(tmp1);
auto o2 = _mm256_loadu_ps(tmp2);
const auto o1 = _mm256_loadu_ps(tmp1);
const auto o2 = _mm256_loadu_ps(tmp2);
return cvtfp32_bf16(o1, o2);
}
Vectorized<BFloat16> igamma(const Vectorized<BFloat16> &x) const {
2 changes: 1 addition & 1 deletion aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h
@@ -105,7 +105,7 @@ template <> class Vectorized<c10::complex<double>> {
}
const c10::complex<double>& operator[](int idx) const = delete;
c10::complex<double>& operator[](int idx) = delete;
Vectorized<c10::complex<double>> map(c10::complex<double> (*f)(const c10::complex<double> &)) const {
Vectorized<c10::complex<double>> map(c10::complex<double> (*const f)(const c10::complex<double> &)) const {
__at_align32__ c10::complex<double> tmp[size()];
store(tmp);
for (int i = 0; i < size(); i++) {
2 changes: 1 addition & 1 deletion aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h
@@ -141,7 +141,7 @@ template <> class Vectorized<c10::complex<float>> {
}
const c10::complex<float>& operator[](int idx) const = delete;
c10::complex<float>& operator[](int idx) = delete;
Vectorized<c10::complex<float>> map(c10::complex<float> (*f)(const c10::complex<float> &)) const {
Vectorized<c10::complex<float>> map(c10::complex<float> (*const f)(const c10::complex<float> &)) const {
__at_align32__ c10::complex<float> tmp[size()];
store(tmp);
for (int i = 0; i < size(); i++) {
2 changes: 1 addition & 1 deletion aten/src/ATen/cpu/vec/vec256/vec256_double.h
@@ -99,7 +99,7 @@ template <> class Vectorized<double> {
Vectorized<double> isnan() const {
return _mm256_cmp_pd(values, _mm256_set1_pd(0.0), _CMP_UNORD_Q);
}
Vectorized<double> map(double (*f)(double)) const {
Vectorized<double> map(double (*const f)(double)) const {
__at_align32__ double tmp[size()];
store(tmp);
for (int64_t i = 0; i < size(); i++) {
