Skip to content

Commit

Permalink
Update on "Address clang-tidy warning for push_back in ProcessGroupNCCL"
Browse files Browse the repository at this point in the history
Noticed that in the internal diff for
#49069 there was a clang-tidy warning to
use emplace instead of push_back. This should yield a small performance win because the object is constructed directly inside the container, rather than constructing a temporary and then moving it into the container.

Differential Revision: [D25800134](https://our.internmc.facebook.com/intern/diff/D25800134/)

[ghstack-poisoned]
  • Loading branch information
rohan-varma committed Jan 7, 2021
2 parents afca239 + 321b988 commit 01feb0c
Show file tree
Hide file tree
Showing 172 changed files with 1,967 additions and 3,579 deletions.
4 changes: 4 additions & 0 deletions .circleci/config.yml
Expand Up @@ -11,6 +11,9 @@ parameters:
run_binary_tests:
type: boolean
default: false
run_build:
type: boolean
default: true

docker_config_defaults: &docker_config_defaults
user: jenkins
Expand Down Expand Up @@ -9762,6 +9765,7 @@ workflows:
only:
- postnightly
executor: windows-with-nvidia-gpu
when: << pipeline.parameters.run_build >>
ecr_gc:
triggers:
- schedule:
Expand Down
5 changes: 4 additions & 1 deletion .circleci/generate_config_yml.py
Expand Up @@ -112,7 +112,10 @@ def gen_build_workflows_tree():
"when": r"<< pipeline.parameters.run_binary_tests >>",
"jobs": [f() for f in binary_build_functions],
},
"build": {"jobs": [f() for f in build_workflows_functions]},
"build": {
"when": r"<< pipeline.parameters.run_build >>",
"jobs": [f() for f in build_workflows_functions]
},
}
}

Expand Down
3 changes: 3 additions & 0 deletions .circleci/verbatim-sources/header-section.yml
Expand Up @@ -11,6 +11,9 @@ parameters:
run_binary_tests:
type: boolean
default: false
run_build:
type: boolean
default: true

docker_config_defaults: &docker_config_defaults
user: jenkins
Expand Down
2 changes: 2 additions & 0 deletions .github/pytorch-circleci-labels.yml
Expand Up @@ -9,3 +9,5 @@ labels_to_circle_params:
- release/.*
tags:
- v[0-9]+(\.[0-9]+)*-rc[0-9]+
set_to_false:
- run_build
7 changes: 0 additions & 7 deletions .jenkins/pytorch/codegen-test.sh
Expand Up @@ -48,13 +48,6 @@ python -m tools.autograd.gen_autograd \
"$OUT"/autograd \
tools/autograd

# unboxing_wrappers codegen (called by torch codegen but can run independently)
mkdir -p "$OUT"/unboxing_wrappers
python -m tools.jit.gen_unboxing_wrappers \
"$OUT"/torch/share/ATen/Declarations.yaml \
"$OUT"/unboxing_wrappers \
tools/jit/templates

# annotated_fn_args codegen (called by torch codegen but can run independently)
mkdir -p "$OUT"/annotated_fn_args
python -m tools.autograd.gen_annotated_fn_args \
Expand Down
5 changes: 0 additions & 5 deletions .jenkins/pytorch/macos-test.sh
Expand Up @@ -9,11 +9,6 @@ pip install -q hypothesis "librosa>=0.6.2" "numba<=0.49.1" psutil
# TODO move this to docker
pip install unittest-xml-reporting pytest

# faulthandler become built-in since 3.3
if [[ ! $(python -c "import sys; print(int(sys.version_info >= (3, 3)))") == "1" ]]; then
pip install -q faulthandler
fi

if [ -z "${IN_CI}" ]; then
rm -rf ${WORKSPACE_DIR}/miniconda3/lib/python3.6/site-packages/torch*
fi
Expand Down
2 changes: 0 additions & 2 deletions .jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat
Expand Up @@ -41,8 +41,6 @@ popd
:: The version is fixed to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136
pip install "ninja==1.10.0.post1" future "hypothesis==4.53.2" "librosa>=0.6.2" psutil pillow unittest-xml-reporting pytest coverage
if %errorlevel% neq 0 ( exit /b %errorlevel% )
:: No need to install faulthandler since we only test Python >= 3.6 on Windows
:: faulthandler is builtin since Python 3.3

set DISTUTILS_USE_SDK=1

Expand Down
3 changes: 0 additions & 3 deletions BUILD.bazel
Expand Up @@ -193,9 +193,6 @@ libtorch_cpp_generated_sources = [
"torch/csrc/autograd/generated/Functions.h",
"torch/csrc/autograd/generated/Functions.cpp",
"torch/csrc/autograd/generated/variable_factories.h",
"torch/csrc/jit/generated/generated_unboxing_wrappers_0.cpp",
"torch/csrc/jit/generated/generated_unboxing_wrappers_1.cpp",
"torch/csrc/jit/generated/generated_unboxing_wrappers_2.cpp",
]

libtorch_python_generated_sources = [
Expand Down
2 changes: 2 additions & 0 deletions CMakeLists.txt
Expand Up @@ -173,6 +173,8 @@ option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(
USE_NCCL "Use NCCL" ON
"USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON
USE_NCCL OFF)
cmake_dependent_option(
USE_STATIC_NCCL "Use static NCCL" OFF
"USE_NCCL" OFF)
Expand Down
6 changes: 6 additions & 0 deletions android/test_app/app/src/main/AndroidManifest.xml
Expand Up @@ -18,4 +18,10 @@
</application>

<uses-permission android:name="android.permission.CAMERA" />

<!--
Permissions required by the Snapdragon Profiler to collect GPU metrics.
-->
<uses-permission android:name="android.permission.INTERNET" />
<uses-permission android:name="android.permission.WRITE_EXTERNAL_STORAGE" />
</manifest>
2 changes: 1 addition & 1 deletion aten/conda/meta.yaml
Expand Up @@ -24,7 +24,7 @@ requirements:
- mkl # [not osx]

about:
home: https://github.com/zdevito/ATen
home: https://github.com/pytorch/pytorch
license: BSD
summary: A TENsor library for C++14

Expand Down
2 changes: 1 addition & 1 deletion aten/src/ATen/BatchingRegistrations.cpp
Expand Up @@ -1015,7 +1015,7 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) {
m.impl("_add_batch_dim", native::_add_batch_dim);
m.impl("_remove_batch_dim", native::_remove_batch_dim);

m.impl_UNBOXED("sum.dim_IntList", sum_batching_rule);
m.impl("sum.dim_IntList", sum_batching_rule);
m.impl("is_complex", native::is_complex);
m.impl("conj", native::conj);

Expand Down
2 changes: 1 addition & 1 deletion aten/src/ATen/CMakeLists.txt
Expand Up @@ -72,7 +72,7 @@ file(GLOB metal_h "metal/*.h")
file(GLOB metal_cpp "metal/*.cpp")
file(GLOB_RECURSE native_metal_h "native/metal/*.h")
file(GLOB metal_test_srcs "native/metal/mpscnn/tests/*.mm")
file(GLOB_RECURSE native_metal_srcs "native/metal/*.mm", "native/metal/*.cpp")
file(GLOB_RECURSE native_metal_srcs "native/metal/*.mm" "native/metal/*.cpp")
EXCLUDE(native_metal_srcs "${native_metal_srcs}" ${metal_test_srcs})
file(GLOB metal_prepack_h "native/metal/MetalPrepackOpContext.h")
file(GLOB metal_prepack_cpp "native/metal/MetalPrepackOpRegister.cpp")
Expand Down
160 changes: 160 additions & 0 deletions aten/src/ATen/CPUGeneratorImpl.cpp
@@ -1,11 +1,49 @@
#include <ATen/CPUGeneratorImpl.h>
#include <ATen/Utils.h>
#include <ATen/core/MT19937RNGEngine.h>
#include <c10/util/C++17.h>
#include <algorithm>

namespace at {

namespace detail {

/**
 * CPUGeneratorImplStateLegacy is a POD class needed for memcpys
 * in torch.get_rng_state() and torch.set_rng_state().
 * It is a legacy class and even though it is replaced with
 * at::CPUGeneratorImpl, we need this class and some of its fields
 * to support backward compatibility on loading checkpoints.
 *
 * NOTE: set_state() identifies a legacy checkpoint purely by
 * sizeof(CPUGeneratorImplStateLegacy) and then reinterprets the raw
 * tensor bytes as this struct, so the field order, types, and padding
 * must never change.
 */
struct CPUGeneratorImplStateLegacy {
  /* The initial seed. */
  uint64_t the_initial_seed;
  int left; /* = 1; */
  int seeded; /* = 0; */
  uint64_t next;
  uint64_t state[at::MERSENNE_STATE_N]; /* the array for the state vector */

  /********************************/

  /* For normal distribution */
  double normal_x;
  double normal_y;
  double normal_rho;
  int normal_is_valid; /* = 0; */
};

/**
 * CPUGeneratorImplState is a POD class containing
 * new data introduced in at::CPUGeneratorImpl and the legacy state. It is used
 * as a helper for torch.get_rng_state() and torch.set_rng_state()
 * functions.
 *
 * NOTE: like the legacy struct above, this layout is memcpy'd to/from a
 * byte tensor and is distinguished from the legacy format only by its
 * size, so do not reorder or resize its members.
 */
struct CPUGeneratorImplState {
  // The legacy mt19937 state plus double-precision normal cache.
  CPUGeneratorImplStateLegacy legacy_pod;
  // Cached extra normal sample in float precision (new in CPUGeneratorImpl).
  float next_float_normal_sample;
  // Whether next_float_normal_sample holds a valid cached sample.
  bool is_next_float_normal_sample_valid;
};

/**
* PyTorch maintains a collection of default generators that get
* initialized once. The purpose of these default generators is to
Expand Down Expand Up @@ -75,6 +113,128 @@ uint64_t CPUGeneratorImpl::seed() {
return random;
}

/**
 * Sets the internal state of CPUGeneratorImpl. The new internal state
 * must be a strided CPU byte tensor and of the same size as either
 * CPUGeneratorImplStateLegacy (for legacy CPU generator state) or
 * CPUGeneratorImplState (for new state).
 *
 * The format is detected solely from the byte size of `new_state`;
 * any other size raises an error.
 *
 * FIXME: Remove support of the legacy state in the future?
 */
void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
  using detail::CPUGeneratorImplState;
  using detail::CPUGeneratorImplStateLegacy;

  // Both layouts are reinterpreted from raw tensor bytes below, so they
  // must remain PODs.
  static_assert(std::is_pod<CPUGeneratorImplStateLegacy>::value, "CPUGeneratorImplStateLegacy is not a PODType");
  static_assert(std::is_pod<CPUGeneratorImplState>::value, "CPUGeneratorImplState is not a PODType");

  static const size_t size_legacy = sizeof(CPUGeneratorImplStateLegacy);
  static const size_t size_current = sizeof(CPUGeneratorImplState);
  // Size is the only discriminator between the two formats; if the sizes
  // ever collided we could not tell them apart.
  static_assert(size_legacy != size_current, "CPUGeneratorImplStateLegacy and CPUGeneratorImplState can't be of the same size");

  detail::check_rng_state(new_state);

  at::mt19937 engine;
  auto float_normal_sample = c10::optional<float>();
  auto double_normal_sample = c10::optional<double>();

  // Construct the state of at::CPUGeneratorImpl based on input byte tensor size.
  CPUGeneratorImplStateLegacy* legacy_pod;
  auto new_state_size = new_state.numel();
  if (new_state_size == size_legacy) {
    legacy_pod = (CPUGeneratorImplStateLegacy*)new_state.data();
    // Note that in CPUGeneratorImplStateLegacy, we didn't have float version
    // of normal sample and hence we leave the c10::optional<float> as is

    // Update next_double_normal_sample.
    // Note that CPUGeneratorImplStateLegacy stores two uniform values (normal_x, normal_y)
    // and a rho value (normal_rho). These three values were redundant and in the new
    // DistributionsHelper.h, we store the actual extra normal sample, rather than three
    // intermediate values.
    if (legacy_pod->normal_is_valid) {
      auto r = legacy_pod->normal_rho;
      auto theta = 2.0 * M_PI * legacy_pod->normal_x;
      // we return the sin version of the normal sample when in caching mode
      double_normal_sample = c10::optional<double>(r * ::sin(theta));
    }
  } else if (new_state_size == size_current) {
    auto rng_state = (CPUGeneratorImplState*)new_state.data();
    legacy_pod = &rng_state->legacy_pod;
    // update next_float_normal_sample
    if (rng_state->is_next_float_normal_sample_valid) {
      float_normal_sample = c10::optional<float>(rng_state->next_float_normal_sample);
    }

    // Update next_double_normal_sample.
    // Note that in getRNGState, we now return the actual normal sample in normal_y
    // and if it's valid in normal_is_valid. The redundant normal_x and normal_rho
    // are squashed to 0.0.
    if (legacy_pod->normal_is_valid) {
      double_normal_sample = c10::optional<double>(legacy_pod->normal_y);
    }
  } else {
    AT_ERROR("Expected either a CPUGeneratorImplStateLegacy of size ", size_legacy,
             " or a CPUGeneratorImplState of size ", size_current,
             " but found the input RNG state size to be ", new_state_size);
  }

  // construct engine_
  // Note that CPUGeneratorImplStateLegacy stored a state array of 64 bit uints, whereas in our
  // redefined mt19937, we have changed to a state array of 32 bit uints. Hence, we are
  // doing a std::copy.
  at::mt19937_data_pod rng_data;
  std::copy(std::begin(legacy_pod->state), std::end(legacy_pod->state), rng_data.state_.begin());
  rng_data.seed_ = legacy_pod->the_initial_seed;
  rng_data.left_ = legacy_pod->left;
  rng_data.seeded_ = legacy_pod->seeded;
  rng_data.next_ = static_cast<uint32_t>(legacy_pod->next);
  engine.set_data(rng_data);
  // Reject states whose internal invariants are broken (e.g. corrupted or
  // truncated checkpoints) before committing anything to this generator.
  TORCH_CHECK(engine.is_valid(), "Invalid mt19937 state");
  // Commit all fields together only after validation succeeded, so a failed
  // set_state leaves the generator unchanged.
  this->engine_ = engine;
  this->next_float_normal_sample_ = float_normal_sample;
  this->next_double_normal_sample_ = double_normal_sample;
}

/**
 * Gets the current internal state of CPUGeneratorImpl. The internal
 * state is returned as a CPU byte tensor.
 *
 * The tensor holds a memcpy'd CPUGeneratorImplState, so its byte size
 * is exactly sizeof(CPUGeneratorImplState) and it round-trips through
 * set_state() above.
 */
c10::intrusive_ptr<c10::TensorImpl> CPUGeneratorImpl::get_state() const {
  using detail::CPUGeneratorImplState;

  static const size_t size = sizeof(CPUGeneratorImplState);
  // The struct is memcpy'd into tensor storage below, so it must be a POD.
  static_assert(std::is_pod<CPUGeneratorImplState>::value, "CPUGeneratorImplState is not a PODType");

  auto state_tensor = at::detail::empty_cpu({(int64_t)size}, ScalarType::Byte, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt);
  auto rng_state = state_tensor.data_ptr();

  // accumulate generator data to be copied into byte tensor
  auto accum_state = std::make_unique<CPUGeneratorImplState>();
  auto rng_data = this->engine_.data();
  accum_state->legacy_pod.the_initial_seed = rng_data.seed_;
  accum_state->legacy_pod.left = rng_data.left_;
  accum_state->legacy_pod.seeded = rng_data.seeded_;
  accum_state->legacy_pod.next = rng_data.next_;
  // mt19937 stores 32-bit words; the legacy layout uses a 64-bit array, so
  // copy element-wise (mirrors the reverse copy in set_state).
  std::copy(rng_data.state_.begin(), rng_data.state_.end(), std::begin(accum_state->legacy_pod.state));
  accum_state->legacy_pod.normal_x = 0.0; // we don't use it anymore and this is just a dummy
  accum_state->legacy_pod.normal_rho = 0.0; // we don't use it anymore and this is just a dummy
  // Default to "no cached normal samples"; overwritten below if caches are set.
  accum_state->legacy_pod.normal_is_valid = false;
  accum_state->legacy_pod.normal_y = 0.0;
  accum_state->next_float_normal_sample = 0.0f;
  accum_state->is_next_float_normal_sample_valid = false;
  if (this->next_double_normal_sample_) {
    accum_state->legacy_pod.normal_is_valid = true;
    accum_state->legacy_pod.normal_y = *(this->next_double_normal_sample_);
  }
  if (this->next_float_normal_sample_) {
    accum_state->is_next_float_normal_sample_valid = true;
    accum_state->next_float_normal_sample = *(this->next_float_normal_sample_);
  }

  memcpy(rng_state, accum_state.get(), size);
  return state_tensor.getIntrusivePtr();
}

/**
* Gets the DeviceType of CPUGeneratorImpl.
* Used for type checking during run time.
Expand Down
2 changes: 2 additions & 0 deletions aten/src/ATen/CPUGeneratorImpl.h
Expand Up @@ -17,6 +17,8 @@ struct TORCH_API CPUGeneratorImpl : public c10::GeneratorImpl {
void set_current_seed(uint64_t seed) override;
uint64_t current_seed() const override;
uint64_t seed() override;
void set_state(const c10::TensorImpl& new_state) override;
c10::intrusive_ptr<c10::TensorImpl> get_state() const override;
static DeviceType device_type();
uint32_t random();
uint64_t random64();
Expand Down
4 changes: 3 additions & 1 deletion aten/src/ATen/CUDAGeneratorImpl.h
Expand Up @@ -129,8 +129,10 @@ struct TORCH_CUDA_API CUDAGeneratorImpl : public c10::GeneratorImpl {
void set_current_seed(uint64_t seed) override;
uint64_t current_seed() const override;
uint64_t seed() override;
void set_state(const c10::TensorImpl& new_state) override;
c10::intrusive_ptr<c10::TensorImpl> get_state() const override;
void set_philox_offset_per_thread(uint64_t offset);
uint64_t philox_offset_per_thread();
uint64_t philox_offset_per_thread() const;
void capture_prologue(int64_t* offset_extragraph);
uint64_t capture_epilogue();
PhiloxCudaState philox_cuda_state(uint64_t increment);
Expand Down
4 changes: 4 additions & 0 deletions aten/src/ATen/Dispatch.h
Expand Up @@ -10,6 +10,9 @@
#include <c10/util/complex.h>
#include <c10/util/string_view.h>

#ifdef XPLAT_MOBILE_BUILD
#include <ATen/selected_mobile_ops.h>
#else
namespace at {
/**
* The method should_include_kernel_dtype() returns true/false
Expand All @@ -25,6 +28,7 @@ inline constexpr bool should_include_kernel_dtype(
return true;
}
}
#endif

/**
* In the Facebook internal build (using BUCK), this macro is enabled by
Expand Down

0 comments on commit 01feb0c

Please sign in to comment.