Skip to content

Commit

Permalink
Update on "Enhance new_group doc to mention using NCCL concurrently."
Browse files Browse the repository at this point in the history
Using NCCL communicators concurrently is not safe and this is
documented in NCCL docs.

However, this is not documented in PyTorch and we should add documentation for
ProcessGroupNCCL so that users are aware of this limitation.

Differential Revision: [D25351778](https://our.internmc.facebook.com/intern/diff/D25351778/)

[ghstack-poisoned]
  • Loading branch information
pritamdamania committed Dec 9, 2020
2 parents 50bdd72 + c29f516 commit 411ba6d
Show file tree
Hide file tree
Showing 308 changed files with 9,565 additions and 5,284 deletions.
2 changes: 1 addition & 1 deletion .circleci/cimodel/data/dimensions.py
Expand Up @@ -8,8 +8,8 @@
]

ROCM_VERSIONS = [
"3.8",
"3.9",
"3.10",
]

ROCM_VERSION_LABELS = ["rocm" + v for v in ROCM_VERSIONS]
Expand Down
2 changes: 1 addition & 1 deletion .circleci/cimodel/data/simple/docker_definitions.py
Expand Up @@ -29,8 +29,8 @@
"pytorch-linux-xenial-py3.6-gcc5.4", # this one is used in doc builds
"pytorch-linux-xenial-py3.6-gcc7.2",
"pytorch-linux-xenial-py3.6-gcc7",
"pytorch-linux-bionic-rocm3.8-py3.6",
"pytorch-linux-bionic-rocm3.9-py3.6",
"pytorch-linux-bionic-rocm3.10-py3.6",
]


Expand Down
214 changes: 107 additions & 107 deletions .circleci/config.yml

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions .circleci/docker/build.sh
Expand Up @@ -274,19 +274,19 @@ case "$image" in
VISION=yes
KATEX=yes
;;
pytorch-linux-bionic-rocm3.8-py3.6)
pytorch-linux-bionic-rocm3.9-py3.6)
ANACONDA_PYTHON_VERSION=3.6
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=3.8
ROCM_VERSION=3.9
;;
pytorch-linux-bionic-rocm3.9-py3.6)
pytorch-linux-bionic-rocm3.10-py3.6)
ANACONDA_PYTHON_VERSION=3.6
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=3.9
ROCM_VERSION=3.10
;;
*)
# Catch-all for builds that are not hardcoded.
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/lint.yml
Expand Up @@ -175,6 +175,7 @@ jobs:
-g"-torch/csrc/cuda/python_nccl.cpp" \
-g"-torch/csrc/autograd/FunctionsManual.cpp" \
-g"-torch/csrc/generic/*.cpp" \
-g"-torch/csrc/jit/codegen/cuda/runtime/*" \
"$@" > ${GITHUB_WORKSPACE}/clang-tidy-output.txt
cat ${GITHUB_WORKSPACE}/clang-tidy-output.txt
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Expand Up @@ -76,6 +76,7 @@ torch/lib/*.exe*
torch/lib/*.dylib*
torch/lib/*.h
torch/lib/*.lib
torch/lib/*.pdb
torch/lib/*.so*
torch/lib/protobuf*.pc
torch/lib/build
Expand Down Expand Up @@ -192,6 +193,7 @@ build_ios
/build_*
.build_debug/*
.build_release/*
.build_profile/*
distribute/*
*.testbin
*.bin
Expand Down
2 changes: 2 additions & 0 deletions .jenkins/pytorch/codegen-test.sh
Expand Up @@ -38,6 +38,8 @@ mkdir -p "$OUT"/pyi/torch/_C
mkdir -p "$OUT"/pyi/torch/nn
python -m tools.pyi.gen_pyi \
--declarations-path "$OUT"/torch/share/ATen/Declarations.yaml \
--native-functions-path aten/src/ATen/native/native_functions.yaml \
--deprecated-functions-path tools/autograd/deprecated.yaml \
--out "$OUT"/pyi

# autograd codegen (called by torch codegen but can run independently)
Expand Down
1 change: 1 addition & 0 deletions .jenkins/pytorch/multigpu-test.sh
Expand Up @@ -17,6 +17,7 @@ fi

python tools/download_mnist.py --quiet -d test/cpp/api/mnist
OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="test/cpp/api/mnist" build/bin/test_api
time python test/run_test.py --verbose -i distributed/test_jit_c10d
time python test/run_test.py --verbose -i distributed/test_distributed_fork
time python test/run_test.py --verbose -i distributed/test_c10d
time python test/run_test.py --verbose -i distributed/test_c10d_spawn
Expand Down
Expand Up @@ -7,4 +7,4 @@ if "%REBUILD%"=="" (
7z x -aoa %TMP_DIR_WIN%\mkl.7z -o%TMP_DIR_WIN%\mkl
)
set CMAKE_INCLUDE_PATH=%TMP_DIR_WIN%\mkl\include
set LIB=%TMP_DIR_WIN%\mkl\lib;%LIB
set LIB=%TMP_DIR_WIN%\mkl\lib;%LIB%
1 change: 1 addition & 0 deletions BUILD.bazel
Expand Up @@ -458,6 +458,7 @@ filegroup(
name = "aten_srcs_cu",
srcs = [
"aten/src/ATen/cuda/detail/IndexUtils.cu.cc",
"aten/src/ATen/cuda/detail/CUDAGraphsUtils.cu.cc",
"aten/src/ATen/native/cuda/Activation.cu.cc",
"aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu.cc",
"aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu.cc",
Expand Down
2 changes: 1 addition & 1 deletion README.md
Expand Up @@ -176,7 +176,7 @@ conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_ex
On Linux
```bash
# Add LAPACK support for the GPU if needed
conda install -c pytorch magma-cuda102 # or [ magma-cuda101 | magma-cuda100 | magma-cuda92 ] depending on your cuda version
conda install -c pytorch magma-cuda110 # or the magma-cuda* that matches your CUDA version from https://anaconda.org/pytorch/repo
```

On MacOS
Expand Down
20 changes: 10 additions & 10 deletions android/test_app/app/build.gradle
Expand Up @@ -60,20 +60,20 @@ android {
//}
flavorDimensions "model", "build", "activity"
productFlavors {
mbq {
mnet {
dimension "model"
applicationIdSuffix ".mbq"
buildConfigField("String", "MODULE_ASSET_NAME", "\"mobilenet2q.pt\"")
addManifestPlaceholders([APP_NAME: "MBQ"])
buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mbq\"")
applicationIdSuffix ".mnet"
buildConfigField("String", "MODULE_ASSET_NAME", "\"mnet.pt\"")
addManifestPlaceholders([APP_NAME: "MNET"])
buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mnet\"")
}
mbvulkan {
mnetVulkan {
dimension "model"
applicationIdSuffix ".mbvulkan"
buildConfigField("String", "MODULE_ASSET_NAME", "\"mobilenet2-vulkan.pt\"")
applicationIdSuffix ".mnet_vulkan"
buildConfigField("String", "MODULE_ASSET_NAME", "\"mnet_vulkan.pt\"")
buildConfigField("boolean", "USE_VULKAN_DEVICE", 'true')
addManifestPlaceholders([APP_NAME: "MBQ"])
buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mbvulkan\"")
addManifestPlaceholders([APP_NAME: "MNET_VULKAN"])
buildConfigField("String", "LOGCAT_TAG", "\"pytorch-mnet-vulkan\"")
}
resnet18 {
dimension "model"
Expand Down
121 changes: 121 additions & 0 deletions aten/src/ATen/CUDAGeneratorImpl.h
Expand Up @@ -2,10 +2,122 @@

#include <c10/core/GeneratorImpl.h>
#include <ATen/core/Generator.h>
#include <ATen/Tensor.h>
#include <ATen/Context.h>
#include <limits>

// TODO: this file should be in ATen/cuda, not top level

namespace at {
/**
* Note [CUDA Graph-safe RNG states]
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*
* Strategy:
* ~~~~~~~~~
* A CUDA graph containing multiple RNG ops behaves like a
* single giant kernel from the perspective of ops external
* to the graph. During graph capture, logic below records
* the total of all offset increments that occur in the graphed
* region, and records the final total as the offset for the
* entire graph.
*
* When the graph reruns, the logic that reruns it
* increments this device's CUDA generator's offset
* by that total.
*
* Meanwhile, within the graph, at capture time, instead of
* populating PhiloxCudaStates with the uint64_t offset pulled
* directly from the global state, PhiloxCudaState instead
* holds a pointer to one-element stream-local int64_t device tensor
* holding an initial offset value, and a uint64_t holding an
* intra-graph offset. (The intra-graph offset starts from zero
* when capture begins.) In each consumer kernel,
* at::cuda::philox::unpack computes the offset to use for this kernel
* as intra-graph offset + *initial offset.
*
* When the graph reruns, the logic that reruns it first
* fill_s the initial offset tensor with this device's
* CUDA generator's current offset.
*
* The control flow above ensures graphed execution is bitwise
* identical to eager execution as long as RNG ops are enqueued
* from a single thread, even if RNG ops and graphs containing
* RNG ops are enqueued and run simultaneously on multiple streams.
*
* Usage:
* ~~~~~~
* PhiloxCudaState in this file, and unpack() in
* cuda/CUDAGraphsUtils.cuh allow non-divergent use of
* CUDAGeneratorImpl whether graph capture is underway or not.
*
* Each PhiloxCudaState instance should be used for one and only one
* consumer kernel.
*
* Example (see e.g. native/cuda/Dropout.cu):
*
* #include <ATen/cuda/CUDAGeneratorImpl.h>
* #include <ATen/cuda/CUDAGraphsUtils.cuh>
*
* __global__ void kernel(..., PhiloxCudaState philox_args) {
* auto seeds = at::cuda::philox::unpack(philox_args);
* IndexType idx = blockIdx.x * blockDim.x + threadIdx.x;
* curandStatePhilox4_32_10_t state;
* curand_init(std::get<0>(seeds), // seed
* idx, // per-thread subsequence
* std::get<1>(seeds), // offset in subsequence
* &state);
* ...
* }
*
* host_caller(...) {
* PhiloxCudaState rng_engine_inputs;
* {
* // See Note [Acquire lock when using random generators]
* std::lock_guard<std::mutex> lock(gen->mutex_);
*
* // gen could be HostState or DevState here! No divergent code needed!
* rng_engine_inputs = gen->philox_cuda_state(offset_increment);
* }
* kernel<<<...>>>(..., rng_engine_inputs);
* }
*
*/


// Stores state values. Passed as a kernel argument. See "Usage:" above.
struct PhiloxCudaState {
PhiloxCudaState() = default;
PhiloxCudaState(const PhiloxCudaState&) = default;
// Called if graph capture is not underway
PhiloxCudaState(uint64_t seed,
uint64_t offset) {
seed_ = seed;
offset_.val = offset;
}
// Called if graph capture is underway
PhiloxCudaState(uint64_t seed,
int64_t* offset_extragraph,
uint32_t offset_intragraph) {
seed_ = seed;
offset_.ptr = offset_extragraph;
offset_intragraph_ = offset_intragraph;
captured_ = true;
}

// Public members, directly accessible by at::cuda::philox::unpack.
// If we made them private with getters/setters, the getters/setters
// would have to be __device__, and we can't declare __device__ in ATen.
union Payload {
uint64_t val;
int64_t* ptr;
};

uint64_t seed_;
Payload offset_;
uint32_t offset_intragraph_;
bool captured_ = false;
};

struct TORCH_CUDA_API CUDAGeneratorImpl : public c10::GeneratorImpl {
// Constructors
Expand All @@ -19,13 +131,22 @@ struct TORCH_CUDA_API CUDAGeneratorImpl : public c10::GeneratorImpl {
uint64_t seed() override;
void set_philox_offset_per_thread(uint64_t offset);
uint64_t philox_offset_per_thread();
void graph_prologue(int64_t* offset_extragraph);
uint64_t graph_epilogue();
PhiloxCudaState philox_cuda_state(uint64_t increment);

// Temporarily accommodates call sites that use philox_engine_inputs.
// Allows incremental refactor of call sites to use philox_cuda_state.
std::pair<uint64_t, uint64_t> philox_engine_inputs(uint64_t increment);

static DeviceType device_type();

private:
CUDAGeneratorImpl* clone_impl() const override;
uint64_t seed_ = default_rng_seed_val;
uint64_t philox_offset_per_thread_ = 0;
int64_t* offset_extragraph_;
uint32_t offset_intragraph_ = 0;
};

namespace cuda {
Expand Down
39 changes: 37 additions & 2 deletions aten/src/ATen/OpaqueTensorImpl.h
Expand Up @@ -86,14 +86,34 @@ struct CAFFE2_API OpaqueTensorImpl : public TensorImpl {
auto impl = c10::make_intrusive<OpaqueTensorImpl<OpaqueHandle>>(
key_set(), dtype(), device(), opaque_handle_, sizes_);
copy_tensor_metadata(
/*src_impl=*/this,
/*dest_impl=*/impl.get(),
/*src_opaque_impl=*/this,
/*dest_opaque_impl=*/impl.get(),
/*version_counter=*/version_counter,
/*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
impl->refresh_numel();
return impl;
}

/**
* Return a TensorImpl that is a shallow-copy of this TensorImpl.
*
* For usage of `version_counter` and `allow_tensor_metadata_change`,
* see NOTE [ TensorImpl Shallow-Copying ].
*/
c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
c10::VariableVersion&& version_counter,
bool allow_tensor_metadata_change) const override {
auto impl = c10::make_intrusive<OpaqueTensorImpl<OpaqueHandle>>(
key_set(), dtype(), device(), opaque_handle_, sizes_);
copy_tensor_metadata(
/*src_opaque_impl=*/this,
/*dest_opaque_impl=*/impl.get(),
/*version_counter=*/std::move(version_counter),
/*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
impl->refresh_numel();
return impl;
}

/**
* Shallow-copies data from another TensorImpl into this TensorImpl.
*
Expand Down Expand Up @@ -143,6 +163,21 @@ struct CAFFE2_API OpaqueTensorImpl : public TensorImpl {
dest_opaque_impl->opaque_handle_ = src_opaque_impl->opaque_handle_;
}

static void copy_tensor_metadata(
const OpaqueTensorImpl<OpaqueHandle>* src_opaque_impl,
OpaqueTensorImpl<OpaqueHandle>* dest_opaque_impl,
c10::VariableVersion&& version_counter,
bool allow_tensor_metadata_change) {
TensorImpl::copy_tensor_metadata(
src_opaque_impl,
dest_opaque_impl,
std::move(version_counter),
allow_tensor_metadata_change);

// OpaqueTensorImpl-specific fields.
dest_opaque_impl->opaque_handle_ = src_opaque_impl->opaque_handle_;
}

private:
OpaqueHandle opaque_handle_;
};
Expand Down
9 changes: 0 additions & 9 deletions aten/src/ATen/Parallel.h
@@ -1,18 +1,9 @@
#pragma once
#include <ATen/ATen.h>
#include <ATen/Config.h>
#include <ATen/core/ivalue.h>
#include <c10/macros/Macros.h>

namespace at {
namespace internal {
// This parameter is heuristically chosen to determine the minimum number of
// work that warrants parallelism. For example, when summing an array, it is
// deemed inefficient to parallelise over arrays shorter than 32768. Further,
// no parallel algorithm (such as parallel_reduce) should split work into
// smaller than GRAIN_SIZE chunks.
constexpr int64_t GRAIN_SIZE = 32768;
} // namespace internal

inline int64_t divup(int64_t x, int64_t y) {
return (x + y - 1) / y;
Expand Down
1 change: 0 additions & 1 deletion aten/src/ATen/ParallelOpenMP.h
@@ -1,5 +1,4 @@
#pragma once
#include <ATen/ATen.h>

#include <cstddef>
#include <exception>
Expand Down
19 changes: 19 additions & 0 deletions aten/src/ATen/SparseTensorImpl.h
Expand Up @@ -200,6 +200,25 @@ struct CAFFE2_API SparseTensorImpl : public TensorImpl {
return impl;
}

/**
* Return a TensorImpl that is a shallow-copy of this TensorImpl.
*
* For usage of `version_counter` and `allow_tensor_metadata_change`,
* see NOTE [ TensorImpl Shallow-Copying ].
*/
c10::intrusive_ptr<TensorImpl> shallow_copy_and_detach(
c10::VariableVersion&& version_counter,
bool allow_tensor_metadata_change) const override {
auto impl = c10::make_intrusive<SparseTensorImpl>(key_set(), dtype());
copy_tensor_metadata(
/*src_impl=*/this,
/*dest_impl=*/impl.get(),
/*version_counter=*/std::move(version_counter),
/*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
impl->refresh_numel();
return impl;
}

/**
* Shallow-copies data from another TensorImpl into this TensorImpl.
*
Expand Down

0 comments on commit 411ba6d

Please sign in to comment.