Update on "Fix Pipe + DDP for unused parameters, static graph"
Pipe + DDP has a few issues:

1) With static graph, gradients are not synchronized on the first backward pass (i.e. the delayed allreduce is never run). Broken since #55248.
2) With find_unused_parameters=True, gradients are likewise not synchronized. Broken since #57081.


The root cause of both is that calling `DDPSink.apply(output_tensor)` does not invoke the custom `backward` of `DDPSink` when `output_tensor` is actually an `OwnerRRef`, which is the case when DDP runs inside `Pipe`. We call `backward` on `rref.local_value()`, which does not carry that autograd recording.

To fix, we unwrap the RRef and reconstruct it as needed, similar to the fix in #49908.
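A minimal sketch of the unwrap/re-wrap idea, for illustration only (the helper name and the way the sink is passed in are hypothetical stand-ins, not the actual torch.distributed internals):

```python
import torch.distributed.rpc as rpc

def _run_ddp_sink_on_output(output, ddp_sink_apply):
    # When Pipe hands back an OwnerRRef, the autograd graph hangs off the
    # wrapped local value, so applying the sink to the RRef itself would
    # never trigger DDPSink's custom backward. Unwrap, apply the sink to the
    # local value, then re-wrap so callers still see an RRef.
    if isinstance(output, rpc.RRef):
        sunk = ddp_sink_apply(output.local_value())
        return rpc.RRef(sunk)
    return ddp_sink_apply(output)
```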

To test: all tests in pipe_with_ddp_test pass.
These tests did not catch the errors earlier because all ranks received the same model inputs; if gradient synchronization did not occur, grads would still match, since the model is identical on all ranks (guaranteed by DDP). The tests now use different inputs across ranks.
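For illustration, a hedged sketch of what the per-rank input change exercises (not the exact pipe_with_ddp_test code; the helper below is hypothetical):

```python
import torch
import torch.distributed as dist

def assert_grads_synced(model, rank, world_size):
    # Each rank feeds a different input, so local gradients differ unless
    # DDP actually allreduces them during the backward pass.
    inp = torch.full((4, 8), float(rank))
    model(inp).sum().backward()
    for p in model.parameters():
        if p.grad is None:
            continue
        gathered = [torch.empty_like(p.grad) for _ in range(world_size)]
        dist.all_gather(gathered, p.grad)
        for g in gathered:
            assert torch.allclose(g, gathered[0]), "gradients were not synchronized"
```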

Differential Revision: [D29167283](https://our.internmc.facebook.com/intern/diff/D29167283/)

[ghstack-poisoned]
rohan-varma committed Jun 17, 2021
2 parents 5db4162 + 86635e0 commit 1d9a066
Showing 70 changed files with 1,052 additions and 595 deletions.
6 changes: 0 additions & 6 deletions .circleci/cimodel/data/windows_build_definitions.py
@@ -147,14 +147,8 @@ def render(self):
WORKFLOW_DATA = [
# VS2019 CUDA-10.1
WindowsJob(None, _VC2019, CudaVersion(10, 1), master_only=True),
WindowsJob(1, _VC2019, CudaVersion(10, 1), master_only=True),
WindowsJob(2, _VC2019, CudaVersion(10, 1), master_only=True),
# VS2019 CUDA-10.1 force on cpu
WindowsJob(1, _VC2019, CudaVersion(10, 1), force_on_cpu=True, master_only=True),
# VS2019 CUDA-11.1
WindowsJob(None, _VC2019, CudaVersion(11, 1)),
WindowsJob(1, _VC2019, CudaVersion(11, 1), master_only=True),
WindowsJob(2, _VC2019, CudaVersion(11, 1), master_only=True),

# TODO: This test is disabled due to https://github.com/pytorch/pytorch/issues/59724
# WindowsJob('_azure_multi_gpu', _VC2019, CudaVersion(11, 1), multi_gpu=True, master_and_nightly=True),
146 changes: 0 additions & 146 deletions .circleci/config.yml
@@ -7637,44 +7637,6 @@ workflows:
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
cuda_version: "10.1"
executor: windows-with-nvidia-gpu
filters:
branches:
only:
- master
- /ci-all\/.*/
- /release\/.*/
name: pytorch_windows_vs2019_py38_cuda10.1_test1
python_version: "3.8"
requires:
- pytorch_windows_vs2019_py38_cuda10.1_build
test_name: pytorch-windows-test1
use_cuda: "1"
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
cuda_version: "10.1"
executor: windows-with-nvidia-gpu
filters:
branches:
only:
- master
- /ci-all\/.*/
- /release\/.*/
name: pytorch_windows_vs2019_py38_cuda10.1_test2
python_version: "3.8"
requires:
- pytorch_windows_vs2019_py38_cuda10.1_build
test_name: pytorch-windows-test2
use_cuda: "1"
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
cuda_version: "10.1"
@@ -7693,53 +7655,6 @@
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_build:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
cuda_version: "11.1"
name: pytorch_windows_vs2019_py38_cuda11.1_build
python_version: "3.8"
use_cuda: "1"
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
cuda_version: "11.1"
executor: windows-with-nvidia-gpu
filters:
branches:
only:
- master
- /ci-all\/.*/
- /release\/.*/
name: pytorch_windows_vs2019_py38_cuda11.1_test1
python_version: "3.8"
requires:
- pytorch_windows_vs2019_py38_cuda11.1_build
test_name: pytorch-windows-test1
use_cuda: "1"
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
cuda_version: "11.1"
executor: windows-with-nvidia-gpu
filters:
branches:
only:
- master
- /ci-all\/.*/
- /release\/.*/
name: pytorch_windows_vs2019_py38_cuda11.1_test2
python_version: "3.8"
requires:
- pytorch_windows_vs2019_py38_cuda11.1_build
test_name: pytorch-windows-test2
use_cuda: "1"
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- update_s3_htmls:
context: org-member
filters:
@@ -9278,32 +9193,6 @@ workflows:
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
cuda_version: "10.1"
executor: windows-with-nvidia-gpu
name: pytorch_windows_vs2019_py38_cuda10.1_test1
python_version: "3.8"
requires:
- pytorch_windows_vs2019_py38_cuda10.1_build
test_name: pytorch-windows-test1
use_cuda: "1"
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
cuda_version: "10.1"
executor: windows-with-nvidia-gpu
name: pytorch_windows_vs2019_py38_cuda10.1_test2
python_version: "3.8"
requires:
- pytorch_windows_vs2019_py38_cuda10.1_build
test_name: pytorch-windows-test2
use_cuda: "1"
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
cuda_version: "10.1"
@@ -9316,41 +9205,6 @@
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_build:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
cuda_version: "11.1"
name: pytorch_windows_vs2019_py38_cuda11.1_build
python_version: "3.8"
use_cuda: "1"
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
cuda_version: "11.1"
executor: windows-with-nvidia-gpu
name: pytorch_windows_vs2019_py38_cuda11.1_test1
python_version: "3.8"
requires:
- pytorch_windows_vs2019_py38_cuda11.1_build
test_name: pytorch-windows-test1
use_cuda: "1"
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
cuda_version: "11.1"
executor: windows-with-nvidia-gpu
name: pytorch_windows_vs2019_py38_cuda11.1_test2
python_version: "3.8"
requires:
- pytorch_windows_vs2019_py38_cuda11.1_build
test_name: pytorch-windows-test2
use_cuda: "1"
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
when: << pipeline.parameters.run_master_build >>
slow_gradcheck_build:
jobs:
49 changes: 26 additions & 23 deletions .github/workflows/lint.yml
@@ -270,7 +270,7 @@ jobs:
runs-on: ubuntu-18.04
container:
# ubuntu18.04-cuda10.2-py3.6-tidy11
image: ghcr.io/pytorch/cilint-clang-tidy:e2cfc57ce4fa3a257a4b78fdfdc2b065c167b9c5
image: ghcr.io/pytorch/cilint-clang-tidy:52a8ad78d49fc9f40241fee7988db48c920499df
steps:
- name: Checkout PyTorch
uses: actions/checkout@v2
@@ -312,12 +312,14 @@ jobs:
fi
- name: Run clang-tidy
env:
BASE_SHA: ${{ github.event.pull_request.base.sha }}
HEAD_SHA: ${{ github.event.pull_request.head.sha }}
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
cd "${GITHUB_WORKSPACE}"
set -eux
wget -O pr.diff "https://patch-diff.githubusercontent.com/raw/pytorch/pytorch/pull/$PR_NUMBER.diff"
# Run Clang-Tidy
# The negative filters below are to exclude files that include onnx_pb.h or
# caffe2_pb.h, otherwise we'd have to build protos as part of this CI job.
@@ -326,27 +328,28 @@
# /torch/csrc/generic/*.cpp is excluded because those files aren't actually built.
# deploy/interpreter files are excluded due to using macros and other techniques
# that are not easily converted to accepted c++
python3 tools/clang_tidy.py \
--verbose \
--paths torch/csrc/ \
--diff "$BASE_SHA" \
-g"-torch/csrc/jit/passes/onnx/helper.cpp" \
-g"-torch/csrc/jit/passes/onnx/shape_type_inference.cpp"\
-g"-torch/csrc/jit/serialization/onnx.cpp" \
-g"-torch/csrc/jit/serialization/export.cpp" \
-g"-torch/csrc/jit/serialization/import.cpp" \
-g"-torch/csrc/jit/serialization/import_legacy.cpp" \
-g"-torch/csrc/onnx/init.cpp" \
-g"-torch/csrc/cuda/nccl.*" \
-g"-torch/csrc/cuda/python_nccl.cpp" \
-g"-torch/csrc/autograd/FunctionsManual.cpp" \
-g"-torch/csrc/generic/*.cpp" \
-g"-torch/csrc/jit/codegen/cuda/runtime/*" \
-g"-torch/csrc/deploy/interpreter/interpreter.cpp" \
-g"-torch/csrc/deploy/interpreter/interpreter.h" \
-g"-torch/csrc/deploy/interpreter/interpreter_impl.h" \
-g"-torch/csrc/deploy/interpreter/test_main.cpp" \
"$@" > "${GITHUB_WORKSPACE}"/clang-tidy-output.txt
python3 tools/clang_tidy.py \
--verbose \
--paths torch/csrc/ \
--diff-file pr.diff \
-g"-torch/csrc/jit/passes/onnx/helper.cpp" \
-g"-torch/csrc/jit/passes/onnx/shape_type_inference.cpp" \
-g"-torch/csrc/jit/serialization/onnx.cpp" \
-g"-torch/csrc/jit/serialization/export.cpp" \
-g"-torch/csrc/jit/serialization/import.cpp" \
-g"-torch/csrc/jit/serialization/import_legacy.cpp" \
-g"-torch/csrc/onnx/init.cpp" \
-g"-torch/csrc/cuda/nccl.*" \
-g"-torch/csrc/cuda/python_nccl.cpp" \
-g"-torch/csrc/autograd/FunctionsManual.cpp" \
-g"-torch/csrc/generic/*.cpp" \
-g"-torch/csrc/jit/codegen/cuda/runtime/*" \
-g"-torch/csrc/deploy/interpreter/interpreter.cpp" \
-g"-torch/csrc/deploy/interpreter/interpreter.h" \
-g"-torch/csrc/deploy/interpreter/interpreter_impl.h" \
-g"-torch/csrc/deploy/interpreter/test_main.cpp" \
"$@" >"${GITHUB_WORKSPACE}"/clang-tidy-output.txt
cat "${GITHUB_WORKSPACE}"/clang-tidy-output.txt
3 changes: 2 additions & 1 deletion .jenkins/pytorch/macos-test.sh
@@ -53,7 +53,8 @@ test_python_all() {

# Try to pull value from CIRCLE_PULL_REQUEST first then GITHUB_HEAD_REF second
# CIRCLE_PULL_REQUEST comes from CircleCI
# GITHUB_HEAD_REF comes from Github Actions
# NOTE: file_diff_from_base is currently bugged for GHA due to an issue finding a merge base for ghstack PRs
# see https://github.com/pytorch/pytorch/issues/60111
IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-${GITHUB_HEAD_REF:-}}
if [ -n "$IN_PULL_REQUEST" ]; then
DETERMINE_FROM=$(mktemp)
3 changes: 2 additions & 1 deletion .jenkins/pytorch/test.sh
@@ -124,7 +124,8 @@ fi

# Try to pull value from CIRCLE_PULL_REQUEST first then GITHUB_HEAD_REF second
# CIRCLE_PULL_REQUEST comes from CircleCI
# GITHUB_HEAD_REF comes from Github Actions
# NOTE: file_diff_from_base is currently bugged for GHA due to an issue finding a merge base for ghstack PRs
# see https://github.com/pytorch/pytorch/issues/60111
IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-}
if [ -n "$IN_PULL_REQUEST" ] && [[ "$BUILD_ENVIRONMENT" != *coverage* ]]; then
DETERMINE_FROM=$(mktemp)
2 changes: 2 additions & 0 deletions .jenkins/pytorch/win-test.sh
@@ -43,6 +43,8 @@ fi
export SCRIPT_HELPERS_DIR=$SCRIPT_PARENT_DIR/win-test-helpers

# Try to pull value from CIRCLE_PULL_REQUEST
# NOTE: file_diff_from_base is currently bugged for GHA due to an issue finding a merge base for ghstack PRs
# see https://github.com/pytorch/pytorch/issues/60111
IN_PULL_REQUEST=${CIRCLE_PULL_REQUEST:-}
if [ -n "$IN_PULL_REQUEST" ]; then
DETERMINE_FROM="${TMP_DIR}/determine_from"
1 change: 0 additions & 1 deletion aten/src/ATen/autocast_mode.cpp
@@ -368,7 +368,6 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) {
KERNEL(ADD_NS(pow), "pow.Tensor_Tensor", Tensor (const Tensor &, const Tensor &), fp32)
KERNEL(ADD_NS(pow), "pow.Scalar", Tensor (const Scalar&, const Tensor &), fp32)
KERNEL(ADD_NS(softplus), "softplus", Tensor (const Tensor &, const Scalar&, const Scalar&), fp32)
KERNEL(ADD_NS(gelu), "gelu", Tensor (const Tensor &), fp32)
KERNEL(ADD_NS(layer_norm), "layer_norm", Tensor (const Tensor &, IntArrayRef, const c10::optional<Tensor>&, const c10::optional<Tensor>&, double, bool), fp32)
// The macro doesn't like this one (I think it chokes on commas inside <>) so write it manually
m.impl(TORCH_SELECTIVE_NAME("aten::native_layer_norm"),
4 changes: 2 additions & 2 deletions aten/src/ATen/cpu/vec/vec256/vec256_base.h
@@ -213,14 +213,14 @@ struct Vectorized {
}
return vec;
}
Vectorized<T> map(T (*f)(T)) const {
Vectorized<T> map(T (*const f)(T)) const {
Vectorized<T> ret;
for (int64_t i = 0; i != size(); i++) {
ret[i] = f(values[i]);
}
return ret;
}
Vectorized<T> map(T (*f)(const T &)) const {
Vectorized<T> map(T (*const f)(const T &)) const {
Vectorized<T> ret;
for (int64_t i = 0; i != size(); i++) {
ret[i] = f(values[i]);
20 changes: 10 additions & 10 deletions aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h
@@ -197,19 +197,19 @@ template <> class Vectorized<BFloat16> {
}
return b;
}
Vectorized<BFloat16> map(const __m256 (*vop)(__m256)) const {
Vectorized<BFloat16> map(const __m256 (*const vop)(__m256)) const {
__m256 lo, hi;
cvtbf16_fp32(values, lo, hi);
auto o1 = vop(lo);
auto o2 = vop(hi);
const auto o1 = vop(lo);
const auto o2 = vop(hi);
return cvtfp32_bf16(o1, o2);
}
Vectorized<BFloat16> abs() const {
__m256 lo, hi;
cvtbf16_fp32(values, lo, hi);
auto mask = _mm256_set1_ps(-0.f);
auto o1 = _mm256_andnot_ps(mask, lo);
auto o2 = _mm256_andnot_ps(mask, hi);
const auto mask = _mm256_set1_ps(-0.f);
const auto o1 = _mm256_andnot_ps(mask, lo);
const auto o2 = _mm256_andnot_ps(mask, hi);
return cvtfp32_bf16(o1, o2);
}
Vectorized<BFloat16> angle() const {
@@ -328,17 +328,17 @@ template <> class Vectorized<BFloat16> {
Vectorized<BFloat16> i0e() const {
__m256 lo, hi;
cvtbf16_fp32(values, lo, hi);
auto sz = size();
constexpr auto sz = size();
__at_align32__ float tmp1[sz / 2], tmp2[sz / 2];
_mm256_storeu_ps(reinterpret_cast<float*>(tmp1), lo);
_mm256_storeu_ps(reinterpret_cast<float*>(tmp2), hi);

for (decltype(sz) i = 0; i < sz / 2; i++) {
for (auto i = decltype(sz){0}; i < sz / 2; i++) {
tmp1[i] = calc_i0e(tmp1[i]);
tmp2[i] = calc_i0e(tmp2[i]);
}
auto o1 = _mm256_loadu_ps(tmp1);
auto o2 = _mm256_loadu_ps(tmp2);
const auto o1 = _mm256_loadu_ps(tmp1);
const auto o2 = _mm256_loadu_ps(tmp2);
return cvtfp32_bf16(o1, o2);
}
Vectorized<BFloat16> igamma(const Vectorized<BFloat16> &x) const {
2 changes: 1 addition & 1 deletion aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h
@@ -105,7 +105,7 @@ template <> class Vectorized<c10::complex<double>> {
}
const c10::complex<double>& operator[](int idx) const = delete;
c10::complex<double>& operator[](int idx) = delete;
Vectorized<c10::complex<double>> map(c10::complex<double> (*f)(const c10::complex<double> &)) const {
Vectorized<c10::complex<double>> map(c10::complex<double> (*const f)(const c10::complex<double> &)) const {
__at_align32__ c10::complex<double> tmp[size()];
store(tmp);
for (int i = 0; i < size(); i++) {
2 changes: 1 addition & 1 deletion aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h
@@ -141,7 +141,7 @@ template <> class Vectorized<c10::complex<float>> {
}
const c10::complex<float>& operator[](int idx) const = delete;
c10::complex<float>& operator[](int idx) = delete;
Vectorized<c10::complex<float>> map(c10::complex<float> (*f)(const c10::complex<float> &)) const {
Vectorized<c10::complex<float>> map(c10::complex<float> (*const f)(const c10::complex<float> &)) const {
__at_align32__ c10::complex<float> tmp[size()];
store(tmp);
for (int i = 0; i < size(); i++) {
2 changes: 1 addition & 1 deletion aten/src/ATen/cpu/vec/vec256/vec256_double.h
@@ -99,7 +99,7 @@ template <> class Vectorized<double> {
Vectorized<double> isnan() const {
return _mm256_cmp_pd(values, _mm256_set1_pd(0.0), _CMP_UNORD_Q);
}
Vectorized<double> map(double (*f)(double)) const {
Vectorized<double> map(double (*const f)(double)) const {
__at_align32__ double tmp[size()];
store(tmp);
for (int64_t i = 0; i < size(); i++) {
