diff --git a/.circleci/config.yml b/.circleci/config.yml index 0716e516518b..d19c08b2b0b6 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -11,6 +11,9 @@ parameters: run_binary_tests: type: boolean default: false + run_build: + type: boolean + default: true docker_config_defaults: &docker_config_defaults user: jenkins @@ -9762,6 +9765,7 @@ workflows: only: - postnightly executor: windows-with-nvidia-gpu + when: << pipeline.parameters.run_build >> ecr_gc: triggers: - schedule: diff --git a/.circleci/generate_config_yml.py b/.circleci/generate_config_yml.py index f1af924bd3e2..a836d2e510a6 100755 --- a/.circleci/generate_config_yml.py +++ b/.circleci/generate_config_yml.py @@ -112,7 +112,10 @@ def gen_build_workflows_tree(): "when": r"<< pipeline.parameters.run_binary_tests >>", "jobs": [f() for f in binary_build_functions], }, - "build": {"jobs": [f() for f in build_workflows_functions]}, + "build": { + "when": r"<< pipeline.parameters.run_build >>", + "jobs": [f() for f in build_workflows_functions] + }, } } diff --git a/.circleci/verbatim-sources/header-section.yml b/.circleci/verbatim-sources/header-section.yml index 26205a0cccba..43d4c94ee5ed 100644 --- a/.circleci/verbatim-sources/header-section.yml +++ b/.circleci/verbatim-sources/header-section.yml @@ -11,6 +11,9 @@ parameters: run_binary_tests: type: boolean default: false + run_build: + type: boolean + default: true docker_config_defaults: &docker_config_defaults user: jenkins diff --git a/.github/pytorch-circleci-labels.yml b/.github/pytorch-circleci-labels.yml index ccdf2e876af1..3a9eeca0abcc 100644 --- a/.github/pytorch-circleci-labels.yml +++ b/.github/pytorch-circleci-labels.yml @@ -9,3 +9,5 @@ labels_to_circle_params: - release/.* tags: - v[0-9]+(\.[0-9]+)*-rc[0-9]+ + set_to_false: + - run_build diff --git a/.jenkins/pytorch/README.md b/.jenkins/pytorch/README.md index ea6c6dd40f68..9fd68ecf7f15 100644 --- a/.jenkins/pytorch/README.md +++ b/.jenkins/pytorch/README.md @@ -10,9 +10,9 @@ it is very easy to run these tests yourself: ``registry.pytorch.org/pytorch/pytorch-$BUILD_ENVIRONMENT:$DOCKER_VERSION``, where ``$BUILD_ENVIRONMENT`` is one of the build environments enumerated in - [pytorch-dockerfiles](https://github.com/pietern/pytorch-dockerfiles/blob/master/build.sh) + [pytorch-dockerfiles](https://github.com/pytorch/pytorch/blob/master/.circleci/docker/build.sh). The dockerfile used by jenkins can be found under the `.circle` [directory](https://github.com/pytorch/pytorch/blob/master/.circleci/docker) -2. Run ``docker -it -u jenkins $DOCKER_IMAGE``, clone PyTorch and +2. Run ``docker run -it -u jenkins $DOCKER_IMAGE``, clone PyTorch and run one of the scripts in this directory. The Docker images are designed so that any "reasonable" build commands @@ -38,5 +38,5 @@ mechanisms we use: build scripts. - We reroute well known paths like `/usr/bin/gcc` to alternate - implementations with `update-alternatives, instead of setting + implementations with `update-alternatives`, instead of setting `CC` and `CXX` in our implementations. 
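The CircleCI changes above introduce a `run_build` pipeline parameter (default `true`) and gate the main `build` workflow on it, the same way `run_binary_tests` already gates the binary-build workflow; the new `set_to_false` entry in `pytorch-circleci-labels.yml` presumably lets a PR label switch that workflow off. Below is a minimal sketch of the workflow tree that `generate_config_yml.py` emits after this change; the job lists are passed in as stand-in parameters here, whereas the real script collects them from module-level function lists.

```python
# Hypothetical, simplified version of gen_build_workflows_tree() after this
# patch. The only point illustrated is the "when" gating on the two boolean
# pipeline parameters; the job-generating callables are stand-ins.
def gen_build_workflows_tree(binary_build_functions, build_workflows_functions):
    return {
        "workflows": {
            "binary_builds": {
                # pre-existing gate; run_binary_tests defaults to false
                "when": r"<< pipeline.parameters.run_binary_tests >>",
                "jobs": [f() for f in binary_build_functions],
            },
            "build": {
                # new gate; run_build defaults to true, so normal CI is unchanged
                "when": r"<< pipeline.parameters.run_build >>",
                "jobs": [f() for f in build_workflows_functions],
            },
        }
    }

# Example: with empty job lists, only the gating structure remains.
print(gen_build_workflows_tree([], []))
```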
diff --git a/.jenkins/pytorch/codegen-test.sh b/.jenkins/pytorch/codegen-test.sh index 17e7e9fa3445..47d13f2908d0 100755 --- a/.jenkins/pytorch/codegen-test.sh +++ b/.jenkins/pytorch/codegen-test.sh @@ -48,13 +48,6 @@ python -m tools.autograd.gen_autograd \ "$OUT"/autograd \ tools/autograd -# unboxing_wrappers codegen (called by torch codegen but can run independently) -mkdir -p "$OUT"/unboxing_wrappers -python -m tools.jit.gen_unboxing_wrappers \ - "$OUT"/torch/share/ATen/Declarations.yaml \ - "$OUT"/unboxing_wrappers \ - tools/jit/templates - # annotated_fn_args codegen (called by torch codegen but can run independently) mkdir -p "$OUT"/annotated_fn_args python -m tools.autograd.gen_annotated_fn_args \ diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh index 0c34ddcc6179..24ec02c76df5 100755 --- a/.jenkins/pytorch/macos-test.sh +++ b/.jenkins/pytorch/macos-test.sh @@ -9,11 +9,6 @@ pip install -q hypothesis "librosa>=0.6.2" "numba<=0.49.1" psutil # TODO move this to docker pip install unittest-xml-reporting pytest -# faulthandler become built-in since 3.3 -if [[ ! $(python -c "import sys; print(int(sys.version_info >= (3, 3)))") == "1" ]]; then - pip install -q faulthandler -fi - if [ -z "${IN_CI}" ]; then rm -rf ${WORKSPACE_DIR}/miniconda3/lib/python3.6/site-packages/torch* fi diff --git a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat index a052a1b67d59..ed6482890993 100644 --- a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat +++ b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat @@ -41,8 +41,6 @@ popd :: The version is fixed to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136 pip install "ninja==1.10.0.post1" future "hypothesis==4.53.2" "librosa>=0.6.2" psutil pillow unittest-xml-reporting pytest coverage if %errorlevel% neq 0 ( exit /b %errorlevel% ) -:: No need to install faulthandler since we only test Python >= 3.6 on Windows -:: faulthandler is builtin since Python 3.3 set DISTUTILS_USE_SDK=1 diff --git a/BUILD.bazel b/BUILD.bazel index b3faea487965..2b4636d850c9 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -193,9 +193,6 @@ libtorch_cpp_generated_sources = [ "torch/csrc/autograd/generated/Functions.h", "torch/csrc/autograd/generated/Functions.cpp", "torch/csrc/autograd/generated/variable_factories.h", - "torch/csrc/jit/generated/generated_unboxing_wrappers_0.cpp", - "torch/csrc/jit/generated/generated_unboxing_wrappers_1.cpp", - "torch/csrc/jit/generated/generated_unboxing_wrappers_2.cpp", ] libtorch_python_generated_sources = [ diff --git a/CMakeLists.txt b/CMakeLists.txt index ba862b5a4d5f..3df73f8a3041 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -173,6 +173,8 @@ option(USE_NATIVE_ARCH "Use -march=native" OFF) cmake_dependent_option( USE_NCCL "Use NCCL" ON "USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) +cmake_dependent_option(USE_RCCL "Use RCCL" ON + USE_NCCL OFF) cmake_dependent_option( USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF) @@ -316,7 +318,7 @@ set(OP_DEPENDENCY "" CACHE STRING # symbol lookup error: miniconda3/envs/pytorch-py3.7/lib/libmkl_intel_lp64.so: undefined symbol: mkl_blas_dsyrk # https://software.intel.com/en-us/articles/symbol-lookup-error-when-linking-intel-mkl-with-gcc-on-ubuntu if(LINUX) - set(CMAKE_SHARED_LINKER_FLAGS "-Wl,--no-as-needed") + set(CMAKE_SHARED_LINKER_FLAGS "-Wl,--no-as-needed ${CMAKE_SHARED_LINKER_FLAGS}") endif() if(MSVC) diff --git a/android/test_app/app/src/main/AndroidManifest.xml 
b/android/test_app/app/src/main/AndroidManifest.xml index a83bf223bdaf..abdd9a8d986a 100644 --- a/android/test_app/app/src/main/AndroidManifest.xml +++ b/android/test_app/app/src/main/AndroidManifest.xml @@ -18,4 +18,10 @@ + + + + diff --git a/aten/conda/meta.yaml b/aten/conda/meta.yaml index d8096fc73a0f..a502690a5447 100644 --- a/aten/conda/meta.yaml +++ b/aten/conda/meta.yaml @@ -24,7 +24,7 @@ requirements: - mkl # [not osx] about: - home: https://github.com/zdevito/ATen + home: https://github.com/pytorch/pytorch license: BSD summary: A TENsor library for C++14 diff --git a/aten/src/ATen/BatchingRegistrations.cpp b/aten/src/ATen/BatchingRegistrations.cpp index 9bdec2dce77e..2cd7cac4e71b 100644 --- a/aten/src/ATen/BatchingRegistrations.cpp +++ b/aten/src/ATen/BatchingRegistrations.cpp @@ -1015,7 +1015,7 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { m.impl("_add_batch_dim", native::_add_batch_dim); m.impl("_remove_batch_dim", native::_remove_batch_dim); - m.impl_UNBOXED("sum.dim_IntList", sum_batching_rule); + m.impl("sum.dim_IntList", sum_batching_rule); m.impl("is_complex", native::is_complex); m.impl("conj", native::conj); diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index fd3c95f2573b..6fedef185b21 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -72,7 +72,7 @@ file(GLOB metal_h "metal/*.h") file(GLOB metal_cpp "metal/*.cpp") file(GLOB_RECURSE native_metal_h "native/metal/*.h") file(GLOB metal_test_srcs "native/metal/mpscnn/tests/*.mm") -file(GLOB_RECURSE native_metal_srcs "native/metal/*.mm", "native/metal/*.cpp") +file(GLOB_RECURSE native_metal_srcs "native/metal/*.mm" "native/metal/*.cpp") EXCLUDE(native_metal_srcs "${native_metal_srcs}" ${metal_test_srcs}) file(GLOB metal_prepack_h "native/metal/MetalPrepackOpContext.h") file(GLOB metal_prepack_cpp "native/metal/MetalPrepackOpRegister.cpp") diff --git a/aten/src/ATen/CPUGeneratorImpl.cpp b/aten/src/ATen/CPUGeneratorImpl.cpp index bfa4a2a8f72f..ff4a2f1c61e2 100644 --- a/aten/src/ATen/CPUGeneratorImpl.cpp +++ b/aten/src/ATen/CPUGeneratorImpl.cpp @@ -1,4 +1,6 @@ #include +#include +#include #include #include @@ -6,6 +8,42 @@ namespace at { namespace detail { +/** + * CPUGeneratorImplStateLegacy is a POD class needed for memcpys + * in torch.get_rng_state() and torch.set_rng_state(). + * It is a legacy class and even though it is replaced with + * at::CPUGeneratorImpl, we need this class and some of its fields + * to support backward compatibility on loading checkpoints. + */ +struct CPUGeneratorImplStateLegacy { + /* The initial seed. */ + uint64_t the_initial_seed; + int left; /* = 1; */ + int seeded; /* = 0; */ + uint64_t next; + uint64_t state[at::MERSENNE_STATE_N]; /* the array for the state vector */ + + /********************************/ + + /* For normal distribution */ + double normal_x; + double normal_y; + double normal_rho; + int normal_is_valid; /* = 0; */ +}; + +/** + * CPUGeneratorImplState is a POD class containing + * new data introduced in at::CPUGeneratorImpl and the legacy state. It is used + * as a helper for torch.get_rng_state() and torch.set_rng_state() + * functions. + */ +struct CPUGeneratorImplState { + CPUGeneratorImplStateLegacy legacy_pod; + float next_float_normal_sample; + bool is_next_float_normal_sample_valid; +}; + /** * PyTorch maintains a collection of default generators that get * initialized once. 
The purpose of these default generators is to @@ -75,6 +113,128 @@ uint64_t CPUGeneratorImpl::seed() { return random; } +/** + * Sets the internal state of CPUGeneratorImpl. The new internal state + * must be a strided CPU byte tensor and of the same size as either + * CPUGeneratorImplStateLegacy (for legacy CPU generator state) or + * CPUGeneratorImplState (for new state). + * + * FIXME: Remove support of the legacy state in the future? + */ +void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { + using detail::CPUGeneratorImplState; + using detail::CPUGeneratorImplStateLegacy; + + static_assert(std::is_pod::value, "CPUGeneratorImplStateLegacy is not a PODType"); + static_assert(std::is_pod::value, "CPUGeneratorImplState is not a PODType"); + + static const size_t size_legacy = sizeof(CPUGeneratorImplStateLegacy); + static const size_t size_current = sizeof(CPUGeneratorImplState); + static_assert(size_legacy != size_current, "CPUGeneratorImplStateLegacy and CPUGeneratorImplState can't be of the same size"); + + detail::check_rng_state(new_state); + + at::mt19937 engine; + auto float_normal_sample = c10::optional(); + auto double_normal_sample = c10::optional(); + + // Construct the state of at::CPUGeneratorImpl based on input byte tensor size. + CPUGeneratorImplStateLegacy* legacy_pod; + auto new_state_size = new_state.numel(); + if (new_state_size == size_legacy) { + legacy_pod = (CPUGeneratorImplStateLegacy*)new_state.data(); + // Note that in CPUGeneratorImplStateLegacy, we didn't have float version + // of normal sample and hence we leave the c10::optional as is + + // Update next_double_normal_sample. + // Note that CPUGeneratorImplStateLegacy stores two uniform values (normal_x, normal_y) + // and a rho value (normal_rho). These three values were redundant and in the new + // DistributionsHelper.h, we store the actual extra normal sample, rather than three + // intermediate values. + if (legacy_pod->normal_is_valid) { + auto r = legacy_pod->normal_rho; + auto theta = 2.0 * M_PI * legacy_pod->normal_x; + // we return the sin version of the normal sample when in caching mode + double_normal_sample = c10::optional(r * ::sin(theta)); + } + } else if (new_state_size == size_current) { + auto rng_state = (CPUGeneratorImplState*)new_state.data(); + legacy_pod = &rng_state->legacy_pod; + // update next_float_normal_sample + if (rng_state->is_next_float_normal_sample_valid) { + float_normal_sample = c10::optional(rng_state->next_float_normal_sample); + } + + // Update next_double_normal_sample. + // Note that in getRNGState, we now return the actual normal sample in normal_y + // and if it's valid in normal_is_valid. The redundant normal_x and normal_rho + // are squashed to 0.0. + if (legacy_pod->normal_is_valid) { + double_normal_sample = c10::optional(legacy_pod->normal_y); + } + } else { + AT_ERROR("Expected either a CPUGeneratorImplStateLegacy of size ", size_legacy, + " or a CPUGeneratorImplState of size ", size_current, + " but found the input RNG state size to be ", new_state_size); + } + + // construct engine_ + // Note that CPUGeneratorImplStateLegacy stored a state array of 64 bit uints, whereas in our + // redefined mt19937, we have changed to a state array of 32 bit uints. Hence, we are + // doing a std::copy. 
+ at::mt19937_data_pod rng_data; + std::copy(std::begin(legacy_pod->state), std::end(legacy_pod->state), rng_data.state_.begin()); + rng_data.seed_ = legacy_pod->the_initial_seed; + rng_data.left_ = legacy_pod->left; + rng_data.seeded_ = legacy_pod->seeded; + rng_data.next_ = static_cast(legacy_pod->next); + engine.set_data(rng_data); + TORCH_CHECK(engine.is_valid(), "Invalid mt19937 state"); + this->engine_ = engine; + this->next_float_normal_sample_ = float_normal_sample; + this->next_double_normal_sample_ = double_normal_sample; +} + +/** + * Gets the current internal state of CPUGeneratorImpl. The internal + * state is returned as a CPU byte tensor. + */ +c10::intrusive_ptr CPUGeneratorImpl::get_state() const { + using detail::CPUGeneratorImplState; + + static const size_t size = sizeof(CPUGeneratorImplState); + static_assert(std::is_pod::value, "CPUGeneratorImplState is not a PODType"); + + auto state_tensor = at::detail::empty_cpu({(int64_t)size}, ScalarType::Byte, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt); + auto rng_state = state_tensor.data_ptr(); + + // accumulate generator data to be copied into byte tensor + auto accum_state = std::make_unique(); + auto rng_data = this->engine_.data(); + accum_state->legacy_pod.the_initial_seed = rng_data.seed_; + accum_state->legacy_pod.left = rng_data.left_; + accum_state->legacy_pod.seeded = rng_data.seeded_; + accum_state->legacy_pod.next = rng_data.next_; + std::copy(rng_data.state_.begin(), rng_data.state_.end(), std::begin(accum_state->legacy_pod.state)); + accum_state->legacy_pod.normal_x = 0.0; // we don't use it anymore and this is just a dummy + accum_state->legacy_pod.normal_rho = 0.0; // we don't use it anymore and this is just a dummy + accum_state->legacy_pod.normal_is_valid = false; + accum_state->legacy_pod.normal_y = 0.0; + accum_state->next_float_normal_sample = 0.0f; + accum_state->is_next_float_normal_sample_valid = false; + if (this->next_double_normal_sample_) { + accum_state->legacy_pod.normal_is_valid = true; + accum_state->legacy_pod.normal_y = *(this->next_double_normal_sample_); + } + if (this->next_float_normal_sample_) { + accum_state->is_next_float_normal_sample_valid = true; + accum_state->next_float_normal_sample = *(this->next_float_normal_sample_); + } + + memcpy(rng_state, accum_state.get(), size); + return state_tensor.getIntrusivePtr(); +} + /** * Gets the DeviceType of CPUGeneratorImpl. * Used for type checking during run time. 
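The `set_state()` / `get_state()` implementations added above are the C++ backing for the existing `torch.get_rng_state()` / `torch.set_rng_state()` Python API: `get_state()` packs a `CPUGeneratorImplState` into a CPU byte tensor, and `set_state()` accepts either that layout or the smaller legacy `CPUGeneratorImplStateLegacy` layout found in old checkpoints. A minimal round-trip sketch at the Python level (assumes a torch build that includes this patch):

```python
import torch

# Snapshot the CPU RNG state as a ByteTensor. After this patch the tensor
# holds a CPUGeneratorImplState: the legacy Mersenne-Twister fields plus the
# cached float normal sample.
state = torch.get_rng_state()
a = torch.randn(3)

# Restoring the snapshot replays the same sample stream. set_state() also
# accepts the smaller legacy-layout tensor for backward compatibility with
# old checkpoints.
torch.set_rng_state(state)
b = torch.randn(3)

assert torch.equal(a, b)  # identical state -> identical samples
```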
diff --git a/aten/src/ATen/CPUGeneratorImpl.h b/aten/src/ATen/CPUGeneratorImpl.h index eceb338966fd..f8b43a04c73c 100644 --- a/aten/src/ATen/CPUGeneratorImpl.h +++ b/aten/src/ATen/CPUGeneratorImpl.h @@ -17,6 +17,8 @@ struct TORCH_API CPUGeneratorImpl : public c10::GeneratorImpl { void set_current_seed(uint64_t seed) override; uint64_t current_seed() const override; uint64_t seed() override; + void set_state(const c10::TensorImpl& new_state) override; + c10::intrusive_ptr get_state() const override; static DeviceType device_type(); uint32_t random(); uint64_t random64(); diff --git a/aten/src/ATen/CUDAGeneratorImpl.h b/aten/src/ATen/CUDAGeneratorImpl.h index 9a9febd01f8e..1179a049aa08 100644 --- a/aten/src/ATen/CUDAGeneratorImpl.h +++ b/aten/src/ATen/CUDAGeneratorImpl.h @@ -129,8 +129,10 @@ struct TORCH_CUDA_API CUDAGeneratorImpl : public c10::GeneratorImpl { void set_current_seed(uint64_t seed) override; uint64_t current_seed() const override; uint64_t seed() override; + void set_state(const c10::TensorImpl& new_state) override; + c10::intrusive_ptr get_state() const override; void set_philox_offset_per_thread(uint64_t offset); - uint64_t philox_offset_per_thread(); + uint64_t philox_offset_per_thread() const; void capture_prologue(int64_t* offset_extragraph); uint64_t capture_epilogue(); PhiloxCudaState philox_cuda_state(uint64_t increment); diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index 41252609953f..341e20cab1f3 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -10,6 +10,9 @@ #include #include +#ifdef XPLAT_MOBILE_BUILD +#include +#else namespace at { /** * The method should_include_kernel_dtype() returns true/false @@ -25,6 +28,7 @@ inline constexpr bool should_include_kernel_dtype( return true; } } +#endif /** * In the Facebook internal build (using BUCK), this macro is enabled by @@ -93,26 +97,6 @@ inline constexpr bool should_include_kernel_dtype( return __VA_ARGS__(); \ } -// This macro should be used to skip bfloat16 dispatch on non-ROCm platforms and -// should be removed once the bfloat16 bringup is complete on other platforms. -// This is supposed to be used as a wrapper around the lambda function passed to -// the dispatch macro and will conditionally dispatch ops with bfloat16 type -// only on ROCm. -#if !defined(__HIP_PLATFORM_HCC__) -#define AT_SKIP_BFLOAT16_IF_NOT_ROCM(SCALARTYPE, NAME, ...) \ - if (std::is_same::value) { \ - AT_ERROR( \ - #NAME, \ - " not implemented for '", \ - toString(at::ScalarType::BFloat16), \ - "'"); \ - } else { \ - return __VA_ARGS__(); \ - } -#else -#define AT_SKIP_BFLOAT16_IF_NOT_ROCM(SCALARTYPE, NAME, ...) return __VA_ARGS__() -#endif - namespace detail { inline at::ScalarType scalar_type(at::ScalarType s) { diff --git a/aten/src/ATen/VmapTransforms.h b/aten/src/ATen/VmapTransforms.h index 5063beeb08b0..8fa085245459 100644 --- a/aten/src/ATen/VmapTransforms.h +++ b/aten/src/ATen/VmapTransforms.h @@ -96,8 +96,17 @@ struct VmapPhysicalToLogicalMap; // The levels bitset specifies which vmap levels correspond to the batch // dimensions at the front of the tensor. In particular, the number of set bits // corresponds to the number of batch dimensions on `tensor` and the rightmost -// bit of `levels` specifies the minimum number of nested vmaps we are in at +// bit of `levels` specifies the maximum number of nested vmaps we are in at // this point in time. 
+// For example, given: +// physical_view = VmapPhysicalView(tensor=ones(2, 3, 4, 5, 6), levels={1, 3}) +// +// Rightmost bit of `levels` is 3 indicating the number of nested vmaps less +// than or equal to 3. +// bitset: 010100 +// ^ +// | +// levels: 012345 struct TORCH_API VmapPhysicalView { VmapPhysicalView(Tensor&& tensor, std::bitset levels) : levels_(levels), tensor_(tensor) { diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index dfb8e3ac0f32..9a2f34257c57 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -239,13 +239,9 @@ Therefore, for the moment, this is all copy pasted in from VariableTypeEverythin m.impl(TORCH_SELECTIVE_NAME("aten::" REGISTER_NAME), \ &WrapFunction::type::call); -#define KERNEL_UNBOXED_ONLY(FUNC, REGISTER_NAME, SIGNATURE, POLICY) \ - m.impl_UNBOXED(TORCH_SELECTIVE_NAME("aten::" REGISTER_NAME), \ - &WrapFunction::type::call); - // Less-common but still useful case: redispatching to a function with a new signature (e.g. appending a dtype) -#define KERNEL_UNBOXED_ONLY_DIFFERENT_REDISPATCH_SIGNATURE(REDISPATCH_FUNC, REGISTER_NAME, REGISTER_SIGNATURE, REDISPATCH_SIGNATURE, POLICY) \ - m.impl_UNBOXED(TORCH_SELECTIVE_NAME("aten::" REGISTER_NAME), \ +#define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE(REDISPATCH_FUNC, REGISTER_NAME, REGISTER_SIGNATURE, REDISPATCH_SIGNATURE, POLICY) \ + m.impl(TORCH_SELECTIVE_NAME("aten::" REGISTER_NAME), \ &WrapFunction::type::call); /***************************************** @@ -367,20 +363,20 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) { KERNEL(ADD_NS(binary_cross_entropy_with_logits), "binary_cross_entropy_with_logits", Tensor (const Tensor &, const Tensor &, const c10::optional&, const c10::optional&, int64_t), fp32) KERNEL(ADD_NS(dist), "dist", Tensor (const Tensor &, const Tensor &, Scalar), fp32) KERNEL(ADD_NS(pdist), "pdist", Tensor (const Tensor &, double), fp32) - KERNEL_UNBOXED_ONLY(ADD_NS(cdist), "cdist", Tensor (const Tensor &, const Tensor &, double, c10::optional), fp32) + KERNEL(ADD_NS(cdist), "cdist", Tensor (const Tensor &, const Tensor &, double, c10::optional), fp32) KERNEL(ADD_NS(renorm), "renorm", Tensor (const Tensor &, Scalar, int64_t, Scalar), fp32) // fp32_set_opt_dtype KERNEL(ADD_NS(prod), "prod", Tensor (const Tensor &, c10::optional), fp32_set_opt_dtype) KERNEL(ADD_NS(prod), "prod.dim_int", Tensor (const Tensor &, int64_t, bool, c10::optional), fp32_set_opt_dtype) - KERNEL_UNBOXED_ONLY(ADD_NS(prod), "prod.dim_Dimname", Tensor (const Tensor &, Dimname, bool, c10::optional), fp32_set_opt_dtype) + KERNEL(ADD_NS(prod), "prod.dim_Dimname", Tensor (const Tensor &, Dimname, bool, c10::optional), fp32_set_opt_dtype) KERNEL(ADD_NS(softmax), "softmax.int", Tensor (const Tensor &, int64_t, c10::optional), fp32_set_opt_dtype) - KERNEL_UNBOXED_ONLY(ADD_NS(softmax), "softmax.Dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) + KERNEL(ADD_NS(softmax), "softmax.Dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) KERNEL(ADD_NS(log_softmax), "log_softmax.int", Tensor (const Tensor &, int64_t, c10::optional), fp32_set_opt_dtype) - KERNEL_UNBOXED_ONLY(ADD_NS(log_softmax), "log_softmax.Dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) + KERNEL(ADD_NS(log_softmax), "log_softmax.Dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) KERNEL(ADD_NS(cumprod), "cumprod", Tensor (const Tensor &, int64_t, c10::optional), fp32_set_opt_dtype) - 
KERNEL_UNBOXED_ONLY(ADD_NS(cumprod), "cumprod.dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) + KERNEL(ADD_NS(cumprod), "cumprod.dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) KERNEL(ADD_NS(cumsum), "cumsum", Tensor (const Tensor &, int64_t, c10::optional), fp32_set_opt_dtype) - KERNEL_UNBOXED_ONLY(ADD_NS(cumsum), "cumsum.dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) + KERNEL(ADD_NS(cumsum), "cumsum.dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) // commenting these out because they accept an explicit (not-optional) dtype, and we shouldn't try to flip that even // when autocasting. // KERNEL(ADD_NS(norm), "norm.ScalarOpt_dtype", Tensor (const Tensor &, c10::optional, ScalarType), fp32_set_opt_dtype) @@ -388,20 +384,20 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) { // KERNEL(ADD_NS(norm), "norm.names_ScalarOpt_dim_dtype", Tensor (const Tensor &, c10::optional, DimnameList, bool, ScalarType), fp32_set_opt_dtype) KERNEL(ADD_NS(sum), "sum", Tensor (const Tensor &, c10::optional), fp32_set_opt_dtype) KERNEL(ADD_NS(sum), "sum.dim_IntList", Tensor (const Tensor &, IntArrayRef, bool, c10::optional), fp32_set_opt_dtype) - KERNEL_UNBOXED_ONLY(ADD_NS(sum), "sum.dim_DimnameList", Tensor (const Tensor &, DimnameList, bool, c10::optional), fp32_set_opt_dtype) + KERNEL(ADD_NS(sum), "sum.dim_DimnameList", Tensor (const Tensor &, DimnameList, bool, c10::optional), fp32_set_opt_dtype) // fp32_append_dtype // The fp32_append_dtype wrapper overrides implicit promotion behavior. // norm does not implicitly promote, but be aware when adding new ops to this policy. - KERNEL_UNBOXED_ONLY_DIFFERENT_REDISPATCH_SIGNATURE(ADD_NS(norm), "norm.Scalar", Tensor (const Tensor &, Scalar), Tensor (const Tensor &, c10::optional, ScalarType), fp32_append_dtype) - KERNEL_UNBOXED_ONLY_DIFFERENT_REDISPATCH_SIGNATURE(ADD_NS(norm), "norm.ScalarOpt_dim", Tensor (const Tensor &, c10::optional, IntArrayRef, bool), Tensor (const Tensor &, c10::optional, IntArrayRef, bool, ScalarType), fp32_append_dtype) - KERNEL_UNBOXED_ONLY_DIFFERENT_REDISPATCH_SIGNATURE(ADD_NS(norm), "norm.names_ScalarOpt_dim", Tensor (const Tensor &, c10::optional, DimnameList, bool), Tensor (const Tensor &, c10::optional, DimnameList, bool, ScalarType), fp32_append_dtype) + KERNEL_DIFFERENT_REDISPATCH_SIGNATURE(ADD_NS(norm), "norm.Scalar", Tensor (const Tensor &, Scalar), Tensor (const Tensor &, c10::optional, ScalarType), fp32_append_dtype) + KERNEL_DIFFERENT_REDISPATCH_SIGNATURE(ADD_NS(norm), "norm.ScalarOpt_dim", Tensor (const Tensor &, c10::optional, IntArrayRef, bool), Tensor (const Tensor &, c10::optional, IntArrayRef, bool, ScalarType), fp32_append_dtype) + KERNEL_DIFFERENT_REDISPATCH_SIGNATURE(ADD_NS(norm), "norm.names_ScalarOpt_dim", Tensor (const Tensor &, c10::optional, DimnameList, bool), Tensor (const Tensor &, c10::optional, DimnameList, bool, ScalarType), fp32_append_dtype) // promote KERNEL(ADD_NS(addcdiv), "addcdiv", Tensor (const Tensor &, const Tensor &, const Tensor &, Scalar), promote) KERNEL(ADD_NS(addcmul), "addcmul", Tensor (const Tensor &, const Tensor &, const Tensor &, Scalar), promote) KERNEL(ADD_NS(atan2), "atan2", Tensor (const Tensor &, const Tensor &), promote) KERNEL(ADD_NS(bilinear), "bilinear", Tensor (const Tensor &, const Tensor &, const Tensor &, const c10::optional&), promote) KERNEL(ADD_NS(cat), "cat", Tensor (TensorList, int64_t), promote) - KERNEL_UNBOXED_ONLY(ADD_NS(cat), "cat.names", Tensor 
(TensorList, Dimname), promote) + KERNEL(ADD_NS(cat), "cat.names", Tensor (TensorList, Dimname), promote) KERNEL(ADD_NS(_cat), "_cat", Tensor (TensorList, int64_t), promote) KERNEL(ADD_NS(cross), "cross", Tensor (const Tensor &, const Tensor &, c10::optional), promote) KERNEL(ADD_NS(dot), "dot", Tensor (const Tensor &, const Tensor &), promote) diff --git a/aten/src/ATen/core/Generator.cpp b/aten/src/ATen/core/Generator.cpp new file mode 100644 index 000000000000..800f8c7c88ec --- /dev/null +++ b/aten/src/ATen/core/Generator.cpp @@ -0,0 +1,16 @@ +#include +#include +#include + +namespace at { + +void Generator::set_state(const at::Tensor& new_state) { + TORCH_CHECK(new_state.defined(), "Undefined tensor is not allowed"); + this->impl_->set_state(*new_state.unsafeGetTensorImpl()); +} + +at::Tensor Generator::get_state() const { + return at::Tensor::wrap_tensor_impl(this->impl_->get_state()); +} + +} // namespace at diff --git a/aten/src/ATen/core/Generator.h b/aten/src/ATen/core/Generator.h index de3f6e46f8f2..b5bbb2fe3c74 100644 --- a/aten/src/ATen/core/Generator.h +++ b/aten/src/ATen/core/Generator.h @@ -56,6 +56,8 @@ namespace at { +class Tensor; + struct TORCH_API Generator { Generator() {} @@ -96,6 +98,12 @@ struct TORCH_API Generator { uint64_t seed() { return impl_->seed(); } + // Implementation not inlined to prevent cycle reference between + // `ATen/core/Generator.h` and `ATen/core/Tensor.h` + void set_state(const at::Tensor& new_state); + + at::Tensor get_state() const; + std::mutex& mutex() { return impl_->mutex_; } @@ -130,4 +138,24 @@ Generator make_generator(Args&&... args) { return Generator(c10::make_intrusive(std::forward(args)...)); } +namespace detail { + +/** + * Helper function for checking the validity of new random generator + * state. Right now following conditions are checked: + * + * - The new state tensor must be a torch.ByteTensor + * - Data of the new state tensor must be contiguous + */ +static inline void check_rng_state(const c10::TensorImpl& new_state) { + TORCH_CHECK_TYPE( + new_state.layout() == kStrided && new_state.device().type() == kCPU && new_state.dtype() == kByte, + "RNG state must be a torch.ByteTensor" + ); + + TORCH_CHECK(new_state.is_contiguous(), "RNG state must be contiguous"); +} + +} // namespace detail + } // namespace at diff --git a/aten/src/ATen/core/boxing/KernelFunction.cpp b/aten/src/ATen/core/boxing/KernelFunction.cpp index f84352ebee1f..58c35557018c 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.cpp +++ b/aten/src/ATen/core/boxing/KernelFunction.cpp @@ -57,25 +57,4 @@ bool KernelFunction::_equalsBoxedAndUnboxed(const KernelFunction& other) const { unboxed_kernel_func_ == other.unboxed_kernel_func_; } -void KernelFunction::checkBoxedKernel(const OperatorHandle& opHandle) const { - if (C10_UNLIKELY(boxed_kernel_func_ == nullptr)) { - if (unboxed_kernel_func_ == nullptr) { - TORCH_INTERNAL_ASSERT( - false, - "Tried to call KernelFunction::callBoxed() on an uninitialized KernelFunction.", - " opname: ", - opHandle.operator_name(), - " If you're using mobile selective build please make sure to include all ops exported from `torch.jit.export_opnames(model)`."); - } else { - // TODO We want to introduce the invariant that all kernels must be callable in a boxed way, then this case should be impossible. 
- TORCH_INTERNAL_ASSERT( - false, - "Tried to call KernelFunction::callBoxed() on a KernelFunction that can only be called with KernelFunction::call().", - " opname: ", - opHandle.operator_name(), - " If you're using mobile selective build please make sure to include all ops exported from `torch.jit.export_opnames(model)`."); - } - } -} - } // namespace c10 diff --git a/aten/src/ATen/core/boxing/KernelFunction.h b/aten/src/ATen/core/boxing/KernelFunction.h index 6817907b12b1..ddbbd912777a 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.h +++ b/aten/src/ATen/core/boxing/KernelFunction.h @@ -123,26 +123,6 @@ class TORCH_API KernelFunction final { template static KernelFunction makeFromUnboxedFunctor(std::unique_ptr kernelFunctor); - /** - * Create a KernelFunction from an unboxed functor and prevent creation of an - * unboxing-wrapper. This means that you cannot call this KernelFunction - * using KernelFunction::callBoxed() - * - * This is necessary because our unboxing wrappers don't work for all types - * yet, so if you want to use one of these types as function arguments, - * you need to use makeFromUnboxedOnlyFunctor. - * - * Example: - * - * > class MyFunctor final { - * > public: - * > Tensor operator()(Tensor a, Tensor b) {...} - * > }; - * > KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunctor(std::make_unique()); - */ - template - static KernelFunction makeFromUnboxedOnlyFunctor(std::unique_ptr kernelFunctor); - /** * Create a KernelFunction from an unboxed function. * This is usually better than KernelFunction::makeFromUnboxedRuntimeFunction @@ -158,23 +138,6 @@ class TORCH_API KernelFunction final { template static KernelFunction makeFromUnboxedFunction(FuncPtr); - /** - * Create a KernelFunction from an unboxed function and prevent creation of an - * unboxing-wrapper. This means that you cannot call this KernelFunction - * using KernelFunction::callBoxed() - * - * This is necessary because our unboxing wrappers don't work for all types - * yet, so if you want to use one of these types as function arguments, - * you need to use makeFromUnboxedOnlyFunctor. - * - * Example: - * - * > Tensor unboxed_func(Tensor a, Tensor b) {...} - * > KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunction(); - */ - template - static KernelFunction makeFromUnboxedOnlyFunction(FuncPtr); - /** * Create a KernelFunction from an unboxed function. * KernelFunction::makeFromUnboxedFunction is usually a better choice than @@ -189,9 +152,6 @@ class TORCH_API KernelFunction final { template static KernelFunction makeFromUnboxedRuntimeFunction(FuncType* func); - template - static KernelFunction makeFromUnboxedOnlyRuntimeFunction(FuncType* func); - static KernelFunction makeFallthrough(); static KernelFunction makeAmbiguousAutogradOther(); static KernelFunction makeNamedNotSupported(); @@ -213,12 +173,6 @@ class TORCH_API KernelFunction final { // For testing internal invariants only bool _equalsBoxedAndUnboxed(const KernelFunction&) const; - // This function is a temporary hack that allows generated_unboxing_wrappers.cpp to register its codegen'ed - // unboxing wrapper for aten operators. We still need those for some operators because not all work - // with the templated unboxing logic yet. - // TODO Delete setManuallyBoxedKernel_ once all operators work with the templated boxing logic. This can be done once https://github.com/pytorch/pytorch/issues/32366 is fixed. 
- void setManuallyBoxedKernel_(InternalBoxedKernelFunction* func); - private: explicit KernelFunction(std::unique_ptr functor, InternalBoxedKernelFunction* boxed_kernel_func, void* unboxed_kernel_func); @@ -226,8 +180,6 @@ class TORCH_API KernelFunction final { template static void make_boxed_function(OperatorKernel*, const OperatorHandle& opHandle, Stack* stack); - void checkBoxedKernel(const OperatorHandle& opHandle) const; - OperatorKernel* getFunctor_() const; std::shared_ptr functor_; diff --git a/aten/src/ATen/core/boxing/KernelFunction_impl.h b/aten/src/ATen/core/boxing/KernelFunction_impl.h index 82a65fa27ffb..b248e54a6f94 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_impl.h +++ b/aten/src/ATen/core/boxing/KernelFunction_impl.h @@ -23,8 +23,7 @@ inline void KernelFunction::make_boxed_function(OperatorKernel*, const OperatorH } inline bool KernelFunction::isValid() const { - // TODO We want to introduce the invariant that all kernels must be callable in a boxed way, then this should only check boxed_kernel_func_. - return boxed_kernel_func_ != nullptr || unboxed_kernel_func_ != nullptr; + return boxed_kernel_func_ != nullptr; } inline bool KernelFunction::isFallthrough() const { @@ -32,7 +31,10 @@ inline bool KernelFunction::isFallthrough() const { } inline void KernelFunction::callBoxed(const OperatorHandle& opHandle, Stack* stack) const { - checkBoxedKernel(opHandle); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + boxed_kernel_func_ != nullptr, + "Tried to call KernelFunction::callBoxed() on an uninitialized KernelFunction." + ); (*boxed_kernel_func_)(functor_.get(), opHandle, stack); } @@ -111,21 +113,6 @@ inline KernelFunction KernelFunction::makeFromUnboxedFunctor(std::unique_ptr -inline KernelFunction KernelFunction::makeFromUnboxedOnlyFunctor(std::unique_ptr kernelFunctor) { - // TODO We want to get rid of kernels that have only an unboxed function pointer. - // All kernels should have a boxed pointer. - - static_assert(guts::is_functor::value, "Tried to call KernelFunction::makeFromUnboxedFunctor but the argument is not a functor."); - static_assert(std::is_base_of::value, "Tried to call KernelFunction::makeFromUnboxedFunctor, but the functor doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); - - return KernelFunction( - std::move(kernelFunctor), - nullptr, // Don't create a boxed kernel for this - reinterpret_cast(&impl::wrap_kernel_functor_unboxed::call) - ); -} - template inline KernelFunction KernelFunction::makeFromUnboxedFunction(FuncPtr func_ptr) { static_assert(is_compile_time_function_pointer::value, "Tried to call KernelFunction::makeFromUnboxedFunction with an invalid parameter. It must be a function pointer created with TORCH_FN."); @@ -144,26 +131,6 @@ inline KernelFunction KernelFunction::makeFromUnboxedFunction(FuncPtr func_ptr) #endif } -template -inline KernelFunction KernelFunction::makeFromUnboxedOnlyFunction(FuncPtr func_ptr) { - // TODO We want to get rid of kernels that have only an unboxed function pointer. - // All kernels should have a boxed pointer. - static_assert(is_compile_time_function_pointer::value, "Tried to call KernelFunction::makeFromUnboxedOnlyFunction with an invalid parameter. It must be a function pointer created with TORCH_FN."); - static_assert(!std::is_same::value, "Tried to call KernelFunction::makeFromUnboxedOnlyFunction with a boxed function pointer. 
Please use KernelFunction::makeFromBoxedFunction instead."); - static_assert(FuncPtr::func_ptr() != nullptr, "Kernel function cannot be nullptr"); - -#if !defined(C10_MOBILE) - return makeFromUnboxedOnlyFunctor::type> ( - guts::make_unique_base::type>() - ); -#else - // On mobile, we rather want to optimize for binary size than for performance, - // so let's not inline the kernel into the wrapper but use makeFromUnboxedOnlyRuntimeFunction - // instead. - return makeFromUnboxedOnlyRuntimeFunction(func_ptr.func_ptr()); -#endif -} - template inline KernelFunction KernelFunction::makeFromUnboxedRuntimeFunction(FuncType* func) { static_assert(guts::is_function_type::value, "Tried to call KernelFunction::makeFromUnboxedRuntimeFunction with a non-function type."); @@ -175,17 +142,6 @@ inline KernelFunction KernelFunction::makeFromUnboxedRuntimeFunction(FuncType* f ); } -template -inline KernelFunction KernelFunction::makeFromUnboxedOnlyRuntimeFunction(FuncType* func) { - static_assert(guts::is_function_type::value, "Tried to call KernelFunction::makeFromUnboxedRuntimeFunction with a non-function type."); - static_assert(!std::is_same::value, "Tried to call KernelFunction::makeFromUnboxedRuntimeFunction with a boxed function pointer. Please use KernelFunction::makeFromBoxedFunction instead."); - TORCH_INTERNAL_ASSERT(func != nullptr, "Kernel function cannot be nullptr"); - - return makeFromUnboxedOnlyFunctor>>( - guts::make_unique_base>>(func) - ); -} - template inline std::enable_if_t>::value, KernelFunction> KernelFunction::makeFromUnboxedLambda(Lambda&& lambda) { static_assert(guts::is_functor>::value, "Tried to call KernelFunction::makeFromUnboxedLambda with a non-lambda type."); @@ -212,14 +168,4 @@ inline std::enable_if_t>::value, ); } -inline void KernelFunction::setManuallyBoxedKernel_(InternalBoxedKernelFunction* func) { - if (boxed_kernel_func_ == &fallthrough_kernel) { - // special case no-op - return; - } - TORCH_INTERNAL_ASSERT(boxed_kernel_func_ == nullptr, "Tried to set a manually boxed kernel for a kernel that already has a boxed kernel set."); - TORCH_INTERNAL_ASSERT(unboxed_kernel_func_ != nullptr, "Tried to set a manually boxed kernel for an invalid KernelFunction."); - boxed_kernel_func_ = func; -} - } diff --git a/aten/src/ATen/core/boxing/KernelFunction_test.cpp b/aten/src/ATen/core/boxing/KernelFunction_test.cpp index 8ba50db14a2b..e17efab10ba5 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_test.cpp +++ b/aten/src/ATen/core/boxing/KernelFunction_test.cpp @@ -544,26 +544,6 @@ TEST(KernelFunctionTest, givenUnboxedFunctor_withoutReturn_whenCallingUnboxed_th kernels::expectUnboxedCallingWithoutReturnWorks(func); } -TEST(KernelFunctionTest, givenUnboxedOnlyFunctor_withReturn_whenCallingBoxed_thenFails) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunctor(std::unique_ptr(std::make_unique())); - kernels::expectBoxedCallingFailsWith(func, "Tried to call KernelFunction::callBoxed() on a KernelFunction that can only be called with KernelFunction::call()"); -} - -TEST(KernelFunctionTest, givenUnboxedOnlyFunctor_withoutReturn_whenCallingBoxed_thenFails) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunctor(std::unique_ptr(std::make_unique())); - kernels::expectBoxedCallingFailsWith(func, "Tried to call KernelFunction::callBoxed() on a KernelFunction that can only be called with KernelFunction::call()"); -} - -TEST(KernelFunctionTest, givenUnboxedOnlyFunctor_withReturn_whenCallingUnboxed_thenWorks) { - KernelFunction func = 
KernelFunction::makeFromUnboxedOnlyFunctor(std::unique_ptr(std::make_unique())); - kernels::expectUnboxedCallingWithReturnWorks(func); -} - -TEST(KernelFunctionTest, givenUnboxedOnlyFunctor_withoutReturn_whenCallingUnboxed_thenWorks) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunctor(std::unique_ptr(std::make_unique())); - kernels::expectUnboxedCallingWithoutReturnWorks(func); -} - TEST(KernelFunctionTest, givenUnboxedFunction_withReturn_whenCallingBoxed_thenWorks) { KernelFunction func = KernelFunction::makeFromUnboxedFunction(TORCH_FN(kernels::unboxed_function_with_return)); kernels::expectBoxedCallingWithReturnWorks(func); @@ -584,26 +564,6 @@ TEST(KernelFunctionTest, givenUnboxedFunction_withoutReturn_whenCallingUnboxed_t kernels::expectUnboxedCallingWithoutReturnWorks(func); } -TEST(KernelFunctionTest, givenUnboxedOnlyFunction_withReturn_whenCallingBoxed_thenFails) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunction(TORCH_FN(kernels::unboxed_function_with_return)); - kernels::expectBoxedCallingFailsWith(func, "Tried to call KernelFunction::callBoxed() on a KernelFunction that can only be called with KernelFunction::call()"); -} - -TEST(KernelFunctionTest, givenUnboxedOnlyFunction_withoutReturn_whenCallingBoxed_thenFails) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunction(TORCH_FN(kernels::unboxed_function_without_return)); - kernels::expectBoxedCallingFailsWith(func, "Tried to call KernelFunction::callBoxed() on a KernelFunction that can only be called with KernelFunction::call()"); -} - -TEST(KernelFunctionTest, givenUnboxedOnlyFunction_withReturn_whenCallingUnboxed_thenWorks) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunction(TORCH_FN(kernels::unboxed_function_with_return)); - kernels::expectUnboxedCallingWithReturnWorks(func); -} - -TEST(KernelFunctionTest, givenUnboxedOnlyFunction_withoutReturn_whenCallingUnboxed_thenWorks) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunction(TORCH_FN(kernels::unboxed_function_without_return)); - kernels::expectUnboxedCallingWithoutReturnWorks(func); -} - TEST(KernelFunctionTest, givenUnboxedRuntimeFunction_withReturn_whenCallingBoxed_thenWorks) { KernelFunction func = KernelFunction::makeFromUnboxedRuntimeFunction(&kernels::unboxed_function_with_return); kernels::expectBoxedCallingWithReturnWorks(func); diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp index 5e3e91afbb45..270cffaf6d1f 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.cpp +++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp @@ -295,12 +295,6 @@ void Dispatcher::checkInvariants() const { } } -void Dispatcher::setManuallyBoxedKernelFor_(const OperatorHandle& op, KernelFunction::InternalBoxedKernelFunction* func) { - std::lock_guard lock(mutex_); - op.operatorIterator_->op.setManuallyBoxedKernel_(*this, func); - // NB: Do not need to set manually boxed kernel for backend fallbacks -} - std::vector Dispatcher::findDanglingImpls() const { return operatorLookupTable_.read([&] (const ska::flat_hash_map& operatorLookupTable) -> std::vector { std::vector opsWithDanglingImpls; diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index 60f9f9bd0579..d83653f75363 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -182,12 +182,6 @@ class TORCH_API Dispatcher final { */ RegistrationHandleRAII registerLibrary(std::string ns, std::string debug); - // This function 
is a temporary hack that allows generated_unboxing_wrappers.cpp to register its codegen'ed - // unboxing wrapper for aten operators. We still need those for some operators because not all work - // with the templated unboxing logic yet. - // TODO Delete setBoxedKernelFor_ once all operators work with the templated boxing logic - void setManuallyBoxedKernelFor_(const OperatorHandle& op, KernelFunction::InternalBoxedKernelFunction* func); - // ------------------------------------------------------------------------ // // Listeners on registrations @@ -310,7 +304,9 @@ class TORCH_API OperatorHandle { // smuggle in a kernel that is typed incorrectly). For everything // in core library this won't happen, because all the static registrations // will be done by the time a typed() handle is acquired. +#if !defined C10_MOBILE operatorIterator_->op.assertSignatureIsCorrect(); +#endif return TypedOperatorHandle(operatorIterator_); } diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index f0d7bc6968ed..7c3698beeb06 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -21,7 +21,6 @@ OperatorEntry::OperatorEntry(OperatorName&& operator_name) , schema_() , dispatchTable_() , dispatchKeyExtractor_(DispatchKeyExtractor::makeUninitialized()) -, manuallyBoxedKernel_() , kernels_() , cpp_signature_() , is_observed_(ObservedOperators::isObserved(name_)) @@ -122,10 +121,6 @@ std::list::iterator OperatorEntry::registerKernel( ); } - if (manuallyBoxedKernel_.has_value()) { - kernel.setManuallyBoxedKernel_(*manuallyBoxedKernel_); - } - k.emplace_front(std::move(kernel), std::move(inferred_function_schema), std::move(debug)); std::list::iterator inserted = k.begin(); // update the dispatch table, i.e. re-establish the invariant @@ -331,19 +326,6 @@ void OperatorEntry::updateDispatchTableFull_(const c10::Dispatcher& dispatcher) } } -void OperatorEntry::setManuallyBoxedKernel_(const c10::Dispatcher& dispatcher, KernelFunction::InternalBoxedKernelFunction* func) { - TORCH_INTERNAL_ASSERT(!manuallyBoxedKernel_); - manuallyBoxedKernel_ = func; - - for (auto& kv : kernels_) { - for (auto& k : kv.second) { - k.kernel.setManuallyBoxedKernel_(func); - } - } - // Refresh entries in dispatchTable_ - updateDispatchTableFull_(dispatcher); -} - void OperatorEntry::checkInvariants() const { if (schema_) { TORCH_INTERNAL_ASSERT(schema_->schema.operator_name() == name_, dumpState()); diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.h b/aten/src/ATen/core/dispatch/OperatorEntry.h index 5098fd0d8c28..44b8fac5661e 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.h +++ b/aten/src/ATen/core/dispatch/OperatorEntry.h @@ -148,12 +148,6 @@ class TORCH_API OperatorEntry final { const DispatchKeyExtractor& dispatchKeyExtractor() const { return dispatchKeyExtractor_; } - // This function is a temporary hack that allows generated_unboxing_wrappers.cpp to register its codegen'ed - // unboxing wrapper for aten operators. We still need those for some operators because not all work - // with the templated unboxing logic yet. - // TODO Delete setManuallyBoxedKernel_ once all operators work with the templated boxing logic - void setManuallyBoxedKernel_(const c10::Dispatcher& dispatcher, KernelFunction::InternalBoxedKernelFunction* func); - // Asserts that the given FuncType is correct for calling this operator in an unboxed way. 
template void assertSignatureIsCorrect() { @@ -189,12 +183,6 @@ class TORCH_API OperatorEntry final { std::array(DispatchKey::NumDispatchKeys)> dispatchTable_; DispatchKeyExtractor dispatchKeyExtractor_; - // This manuallyBoxedKernel_ member is a temporary hack that allows generated_unboxing_wrappers.cpp to register its codegen'ed - // unboxing wrapper for aten operators. We still need those for some operators because not all work - // with the templated unboxing logic yet. - // TODO Delete manuallyBoxedKernel_ once all operators work with the templated boxing logic - c10::optional manuallyBoxedKernel_; - // kernels_ stores all registered kernels for the corresponding dispatch key // and catchAllKernels_ stores the catch-all kernels. // If an operator library gets loaded that overwrites an already existing kernel, diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 320fa6294638..1223577c59c6 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -265,7 +265,7 @@ bool IValue::ptrEqual(const IValue& lhs, const IValue& rhs) { TORCH_INTERNAL_ASSERT(lhs.is_intrusive_ptr); TORCH_INTERNAL_ASSERT(rhs.is_intrusive_ptr); return lhs.tag == rhs.tag && - lhs.payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr; + lhs.payload.u.as_intrusive_ptr == rhs.payload.u.as_intrusive_ptr; } IValue IValue::equals(const IValue& rhs) const { @@ -325,17 +325,17 @@ size_t IValue::hash(const IValue& v) { case Tag::None: return 0; case Tag::Bool: - return c10::get_hash(v.payload.as_bool); + return c10::get_hash(v.payload.u.as_bool); case Tag::Double: - return c10::get_hash(v.payload.as_double); + return c10::get_hash(v.payload.u.as_double); case Tag::Tensor: // Tensor __hash__ is equivalent to `id()`, so take the pointer value of // the tensor to emulate it - return c10::get_hash(v.payload.as_int); + return c10::get_hash(v.payload.as_tensor.unsafeGetTensorImpl()); case Tag::Storage: - return c10::get_hash(v.payload.as_int); + return c10::get_hash(v.payload.u.as_int); case Tag::Int: - return c10::get_hash(v.payload.as_int); + return c10::get_hash(v.payload.u.as_int); case Tag::String: return c10::get_hash(v.toStringRef()); case Tag::Tuple: diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 4a7e15c4008b..ca68a8df46e1 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -131,10 +131,15 @@ struct Capsule { // they are marked `@private`, which hides them on the doxygen documentation for // this page. -/// IValue (Interpreter Value) is a tagged union over the types supported by the -/// TorchScript interpreter. IValues contain their values as an -/// `IValue::Payload`, which holds primitive types (`int64_t`, `bool`, `double`, -/// `Device`), as values and all other types as a `c10::intrusive_ptr`. +/// IValue (Interpreter Value) is a tagged union over the types +/// supported by the TorchScript interpreter. IValues contain their +/// values as an `IValue::Payload`, which holds primitive types +/// (`int64_t`, `bool`, `double`, `Device`) and `Tensor` as values, +/// and all other types as a `c10::intrusive_ptr`. In order to +/// optimize performance of the destructor and related operations by +/// making the `Tensor` and `c10::intrusive_ptr` paths generate the +/// same code, we represent a null `c10::intrusive_ptr` as +/// `UndefinedTensorImpl::singleton()`, *not* `nullptr`. /// /// IValues are used as inputs to and outputs from the TorchScript interpreter. 
/// To retrieve the value contained within an IValue, use the `.toX()` methods, @@ -160,27 +165,35 @@ struct Capsule { struct TORCH_API IValue final { IValue(const IValue& rhs) : IValue(rhs.payload, rhs.tag, rhs.is_intrusive_ptr) { - if (is_intrusive_ptr) { - c10::raw::intrusive_ptr::incref(payload.as_intrusive_ptr); + if (is_intrusive_ptr && payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton()) { + c10::raw::intrusive_ptr::incref(payload.u.as_intrusive_ptr); } } - IValue(IValue&& rhs) noexcept : IValue() { - swap(rhs); + + IValue(IValue&& rhs) noexcept : tag(rhs.tag), is_intrusive_ptr(rhs.is_intrusive_ptr) { + moveFrom(std::move(rhs)); } + /// @private [doxygen private] ~IValue() { - if (is_intrusive_ptr) { - c10::raw::intrusive_ptr::decref(payload.as_intrusive_ptr); - } + destroy(); } - IValue& operator=(IValue&& rhs) & noexcept { - IValue(std::move(rhs)).swap(*this); // this also sets rhs to None + + C10_ALWAYS_INLINE IValue& operator=(IValue&& rhs) & noexcept { + if (&rhs == this) { + return *this; + } + + destroy(); + moveFrom(std::move(rhs)); return *this; } + IValue& operator=(IValue const& rhs) & { IValue(rhs).swap(*this); return *this; } + void dump() const; /** @@ -260,6 +273,13 @@ struct TORCH_API IValue final { return false; } + // Tensors should be compared based on internal storage + if (this->isTensor()) { + const auto& thisTensor = this->toTensor(); + const auto& rhsTensor = rhs.toTensor(); + return thisTensor.is_alias_of(rhsTensor); + } + if (!this->is_intrusive_ptr) { // Primitive types don't alias anything return false; @@ -267,29 +287,49 @@ struct TORCH_API IValue final { AT_ASSERT(rhs.is_intrusive_ptr); - // Tensors should be compared based on internal storage - if (this->isTensor()) { - const auto thisTensor = this->toTensor(); - const auto rhsTensor = rhs.toTensor(); - return thisTensor.is_alias_of(rhsTensor); - } - // Other types can be compared by their ptr value - return this->payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr; + return this->payload.u.as_intrusive_ptr == rhs.payload.u.as_intrusive_ptr; } /// @private [doxygen private] size_t use_count() const noexcept { + if (isTensor()) { + return payload.as_tensor.use_count(); + } + if (!is_intrusive_ptr) { return 1; } - return c10::raw::intrusive_ptr::use_count(payload.as_intrusive_ptr); + if (payload.u.as_intrusive_ptr == c10::UndefinedTensorImpl::singleton()) { + return 0; + } + return c10::raw::intrusive_ptr::use_count(payload.u.as_intrusive_ptr); } /// @private [doxygen private] void swap(IValue& rhs) noexcept { - std::swap(payload, rhs.payload); + if (isTensor() && rhs.isTensor()) { + std::swap(payload.as_tensor, rhs.payload.as_tensor); + } else if (isTensor()) { + at::Tensor t = std::move(payload.as_tensor); + // As far as I can tell, omitting the usual explicit destructor call + // is not UB in and of itself, and it's a slight perf win. The + // destructor is a no-op, because the moved-from Tensor is + // effectively an intrusive_ptr in the null state, so we don't need + // the behavior for correctness reasons either. Leaving this + // explanatory comment, including commented-out destructor call, to + // make this abundantly clear. 
+ // + // payload.as_tensor.~Tensor(); + payload.u = rhs.payload.u; + new (&rhs.payload.as_tensor) at::Tensor(std::move(t)); + } else if (rhs.isTensor()) { + rhs.swap(*this); + return; + } else { + std::swap(payload.u, rhs.payload.u); + } std::swap(is_intrusive_ptr, rhs.is_intrusive_ptr); std::swap(tag, rhs.tag); } @@ -298,21 +338,17 @@ struct TORCH_API IValue final { // While some of these accessors could be generated through templates, // we prefer to write them manually for clarity - IValue(at::Tensor t) : tag(Tag::Tensor), is_intrusive_ptr(t.defined()) { - // Note: the undefined tensor is not refcounted, so while it - // is tagged as a tensor, is_intrusive_ptr is set to false. - // This is not an optional optimization: our incref call - // *will not* do the right thing when called on an - // undefined tensor. - payload.as_intrusive_ptr = t.unsafeReleaseTensorImpl(); + IValue(at::Tensor t) : tag(Tag::Tensor), is_intrusive_ptr(false) { + new (&payload.as_tensor) at::Tensor(std::move(t)); } bool isTensor() const { return Tag::Tensor == tag; } at::Tensor toTensor() &&; - at::Tensor toTensor() const&; + at::Tensor& toTensor() &; + const at::Tensor& toTensor() const&; at::TensorImpl* unsafeToTensorImpl() const { - return static_cast(payload.as_intrusive_ptr); + return payload.as_tensor.unsafeGetTensorImpl(); } IValue(at::Storage s) : tag(Tag::Storage), is_intrusive_ptr(static_cast(s)) { @@ -321,7 +357,7 @@ struct TORCH_API IValue final { // This is not an optional optimization: our incref call // *will not* do the right thing when called on an // undefined tensor. - payload.as_intrusive_ptr = s.unsafeReleaseStorageImpl(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(s.unsafeReleaseStorageImpl()); } bool isStorage() const { return Tag::Storage == tag; @@ -341,7 +377,7 @@ struct TORCH_API IValue final { : tag(Tag::Blob), is_intrusive_ptr(true) { // TODO (after Tensor merge) If we pass in a Blob holding a Tensor, extract // and store it as a Tensor instead. 
- payload.as_intrusive_ptr = blob.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(blob.release()); } /// @private [doxygen private] @@ -397,14 +433,14 @@ struct TORCH_API IValue final { // Double IValue(double d) : tag(Tag::Double), is_intrusive_ptr(false) { - payload.as_double = d; + payload.u.as_double = d; } bool isDouble() const { return Tag::Double == tag; } double toDouble() const { AT_ASSERT(isDouble()); - return payload.as_double; + return payload.u.as_double; } // Future @@ -433,7 +469,7 @@ struct TORCH_API IValue final { // Int IValue(int64_t i) : tag(Tag::Int), is_intrusive_ptr(false) { - payload.as_int = i; + payload.u.as_int = i; } // allow you to pass literals (3, 4) without ambiguity @@ -445,7 +481,7 @@ struct TORCH_API IValue final { int64_t toInt() const { AT_ASSERT(isInt()); - return payload.as_int; + return payload.u.as_int; } // Bool @@ -454,9 +490,9 @@ struct TORCH_API IValue final { // Initializing entire payload stops valgrind's from reporting // "jump or move depends on uninitialised value" in IValue copy constructor // See https://github.com/pytorch/pytorch/issues/37117 - payload.as_int = b; + payload.u.as_int = b; #else - payload.as_bool = b; + payload.u.as_bool = b; #endif } bool isBool() const { @@ -464,7 +500,7 @@ struct TORCH_API IValue final { } bool toBool() const { AT_ASSERT(isBool()); - return payload.as_bool; + return payload.u.as_bool; } // IntList @@ -580,7 +616,7 @@ struct TORCH_API IValue final { c10::intrusive_ptr toEnumHolder() const&; // None - IValue() : payload{0}, tag(Tag::None), is_intrusive_ptr(false) {} + IValue() : tag(Tag::None), is_intrusive_ptr(false) {} bool isNone() const { return Tag::None == tag; } @@ -616,21 +652,21 @@ struct TORCH_API IValue final { // Device IValue(c10::Device d) : tag(Tag::Device), is_intrusive_ptr(false) { - payload.as_device.type = d.type(); - payload.as_device.index = d.index(); + payload.u.as_device.type = d.type(); + payload.u.as_device.index = d.index(); } bool isDevice() const { return Tag::Device == tag; } c10::Device toDevice() const { AT_ASSERT(isDevice()); - return c10::Device(payload.as_device.type, payload.as_device.index); + return c10::Device(payload.u.as_device.type, payload.u.as_device.index); } //Stream IValue(c10::Stream stream) : tag(Tag::Stream), is_intrusive_ptr(false) { - payload.as_int = stream.pack(); + payload.u.as_int = stream.pack(); } c10::Stream toStream() &&; c10::Stream toStream() const &; @@ -659,7 +695,7 @@ struct TORCH_API IValue final { // QScheme IValue(at::QScheme qscheme) : tag(Tag::Int), is_intrusive_ptr(false) { - payload.as_int = static_cast(qscheme); + payload.u.as_int = static_cast(qscheme); } at::QScheme toQScheme() const { @@ -680,7 +716,7 @@ struct TORCH_API IValue final { // This is not an optional optimization: our incref call // *will not* do the right thing when called on an // undefined generator. 
- payload.as_intrusive_ptr = g.unsafeReleaseGeneratorImpl(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(g.unsafeReleaseGeneratorImpl()); } bool isGenerator() const { return Tag::Generator == tag; @@ -749,14 +785,19 @@ struct TORCH_API IValue final { const IValue& v); bool isPtrType() const { - return is_intrusive_ptr; + return (isTensor() && payload.as_tensor.defined()) || is_intrusive_ptr; } /// @private [doxygen private] const void* internalToPointer() const { TORCH_INTERNAL_ASSERT( isPtrType(), "Can only call internalToPointer() for pointer types"); - return payload.as_intrusive_ptr; + if (isTensor()) { + return payload.as_tensor.unsafeGetTensorImpl(); + } else { + return payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton() + ? payload.u.as_intrusive_ptr : nullptr; + } } TypePtr type() const; @@ -770,7 +811,7 @@ struct TORCH_API IValue final { } // If it is not a Tensor, then two mutable IValues alias each other only // if they are the same pointer. - return val.payload.as_int; + return val.payload.u.as_int; } }; @@ -800,6 +841,10 @@ struct TORCH_API IValue final { IValue deepcopy(HashAliasedIValueMap& memo) const; private: + static c10::intrusive_ptr_target* null_to_undefined_tensor(c10::intrusive_ptr_target* p) { + return p ? p : static_cast(c10::UndefinedTensorImpl::singleton()); + } + static bool ptrEqual(const IValue& lhs, const IValue& rhs); // NOTE: IValue tags are intentionally private. In the future we may encode // this value different (e.g. using NaN boxing), and this would make it more @@ -822,24 +867,77 @@ struct TORCH_API IValue final { class NullType = c10::detail::intrusive_target_default_null_type> c10::intrusive_ptr toIntrusivePtr() const; - void clearToNone() { - payload.as_int = 0; + void destroy() { + // We carefully construct this call to both 1) avoid UB by using + // the "wrong" one of as_tensor and as_intrusive_ptr and 2) enable + // the compiler to generate the same code for each case. It is + // surprisingly difficult to get this right. + if (isTensor() || is_intrusive_ptr) { + c10::intrusive_ptr_target* p = isTensor() ? payload.as_tensor.unsafeGetTensorImpl() : payload.u.as_intrusive_ptr; + c10::intrusive_ptr::reclaim(p); + // No need to make this destructor call! + // payload.as_tensor.~Tensor(); + } + } + + C10_ALWAYS_INLINE void moveFrom(IValue&& rhs) noexcept { + if (rhs.isTensor()) { + new (&payload.as_tensor) at::Tensor(std::move(rhs.payload.as_tensor)); + // As far as I can tell, omitting the usual explicit destructor call + // is not UB in and of itself, and it's a slight perf win. The + // destructor is a no-op, because the moved-from Tensor is + // effectively an intrusive_ptr in the null state, so we don't need + // the behavior for correctness reasons either. Leaving this + // explanatory comment, including commented-out destructor call, to + // make this abundantly clear. + // + // rhs.payload.as_tensor.~Tensor(); + } else { + payload.u = rhs.payload.u; + } + tag = rhs.tag; + is_intrusive_ptr = rhs.is_intrusive_ptr; + rhs.clearToNone(); + } + + void clearToNone() noexcept { + payload.u.as_int = 0; tag = Tag::None; is_intrusive_ptr = false; } union Payload { - int64_t as_int; - double as_double; - bool as_bool; - c10::intrusive_ptr_target* as_intrusive_ptr; - struct { - DeviceType type; - DeviceIndex index; - } as_device; + // We use a nested union here so that we can make the copy easy + // and efficient in the non-tensor (i.e., trivially copyable) + // case. 
Specifically, we do not have to do a switch-on-tag to + // figure out which union member to assign; we can just use + // TriviallyCopyablePayload::operator=. + union TriviallyCopyablePayload { + TriviallyCopyablePayload() : as_int(0) {} + int64_t as_int; + double as_double; + bool as_bool; + // Invariant: never nullptr; null state is represented as + // c10::UndefinedTensorImpl::singleton() for consistency of + // representation with Tensor. + c10::intrusive_ptr_target* as_intrusive_ptr; + struct { + DeviceType type; + DeviceIndex index; + } as_device; + } u; + at::Tensor as_tensor; + Payload() : u() {} + ~Payload() {} }; - IValue(Payload p, Tag t, bool i) : payload(p), tag(t), is_intrusive_ptr(i) {} + IValue(const Payload& p, Tag t, bool i) : tag(t), is_intrusive_ptr(i) { + if (isTensor()) { + new (&payload.as_tensor) at::Tensor(p.as_tensor); + } else { + payload.u = p.u; + } + } Payload payload; Tag tag; @@ -848,29 +946,36 @@ struct TORCH_API IValue final { }; struct TORCH_API WeakIValue final { - WeakIValue() : payload{0}, tag(IValue::Tag::None), is_intrusive_ptr(false) {} + WeakIValue() : tag(IValue::Tag::None), is_intrusive_ptr(false) {} WeakIValue(const WeakIValue& rhs) : payload(rhs.payload), tag(rhs.tag), is_intrusive_ptr(rhs.is_intrusive_ptr) { - if (is_intrusive_ptr) { + if (is_intrusive_ptr && payload.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton()) { c10::raw::weak_intrusive_ptr::incref(payload.as_intrusive_ptr); } } WeakIValue(const IValue& rhs) - : payload(rhs.payload), - tag(rhs.tag), + : tag(rhs.tag), is_intrusive_ptr(rhs.is_intrusive_ptr) { + if (rhs.isTensor()) { + payload.as_intrusive_ptr = rhs.unsafeToTensorImpl(); + is_intrusive_ptr = true; + } else { + payload = rhs.payload.u; + } if (is_intrusive_ptr) { - c10::raw::weak_intrusive_ptr::incref(payload.as_intrusive_ptr); + if (payload.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton()) { + c10::raw::weak_intrusive_ptr::incref(payload.as_intrusive_ptr); + } } } WeakIValue(WeakIValue&& rhs) noexcept : WeakIValue() { swap(rhs); } ~WeakIValue() { - if (is_intrusive_ptr) { + if (is_intrusive_ptr && payload.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton()) { c10::raw::weak_intrusive_ptr::decref(payload.as_intrusive_ptr); } } @@ -895,17 +1000,33 @@ struct TORCH_API WeakIValue final { IValue lock() const { if (!is_intrusive_ptr) { - return IValue(payload, tag, false); + IValue::Payload newPayload; + newPayload.u = payload; + return IValue(newPayload, tag, false); } - auto temp = c10::weak_intrusive_ptr::reclaim( - payload.as_intrusive_ptr); - IValue::Payload pl; - pl.as_intrusive_ptr = temp.lock().release(); - temp.release(); - if (!pl.as_intrusive_ptr) { - return IValue(); + if (IValue::Tag::Tensor == tag) { + auto temp = c10::weak_intrusive_ptr::reclaim( + static_cast(payload.as_intrusive_ptr)); + c10::intrusive_ptr ip(temp.lock()); + temp.release(); + if (!ip) { + return IValue(); + } else { + return IValue(at::Tensor(std::move(ip))); + } } else { - return IValue(pl, tag, true); + auto temp = c10::weak_intrusive_ptr::reclaim( + payload.as_intrusive_ptr == c10::UndefinedTensorImpl::singleton() + ? 
nullptr + : payload.as_intrusive_ptr); + IValue::Payload pl; + pl.u.as_intrusive_ptr = temp.lock().release(); + temp.release(); + if (!pl.u.as_intrusive_ptr) { + return IValue(); + } else { + return IValue(pl, tag, true); + } } } @@ -913,7 +1034,7 @@ struct TORCH_API WeakIValue final { if (!is_intrusive_ptr) { return 1; } - auto temp = c10::weak_intrusive_ptr::reclaim( + auto temp = c10::weak_intrusive_ptr::reclaim( payload.as_intrusive_ptr); size_t result = temp.use_count(); temp.release(); @@ -924,7 +1045,7 @@ struct TORCH_API WeakIValue final { if (!is_intrusive_ptr) { return 1; } - auto temp = c10::weak_intrusive_ptr::reclaim( + auto temp = c10::weak_intrusive_ptr::reclaim( payload.as_intrusive_ptr); size_t result = temp.weak_use_count(); temp.release(); @@ -935,7 +1056,8 @@ struct TORCH_API WeakIValue final { } private: - IValue::Payload payload; + using Payload = IValue::Payload::TriviallyCopyablePayload; + Payload payload; IValue::Tag tag; bool is_intrusive_ptr; }; diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index 89c8e669c138..b96f4b834989 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -48,14 +48,18 @@ struct tagged_capsule { template c10::intrusive_ptr IValue::moveToIntrusivePtr() { auto t = c10::intrusive_ptr::reclaim( - static_cast(payload.as_intrusive_ptr)); + payload.u.as_intrusive_ptr == c10::UndefinedTensorImpl::singleton() + ? NullType::singleton() + : static_cast(payload.u.as_intrusive_ptr)); clearToNone(); return t; } template c10::intrusive_ptr IValue::toIntrusivePtr() const { auto r = c10::intrusive_ptr::reclaim( - static_cast(payload.as_intrusive_ptr)); + payload.u.as_intrusive_ptr == c10::UndefinedTensorImpl::singleton() + ? NullType::singleton() + : static_cast(payload.u.as_intrusive_ptr)); auto p = r; r.release(); return p; @@ -131,12 +135,26 @@ inline c10::intrusive_ptr IValue::toEnumHolder() const& { } inline at::Tensor IValue::toTensor() && { AT_ASSERT(isTensor(), "Expected Tensor but got ", tagKind()); - return at::Tensor( - moveToIntrusivePtr()); + auto result = std::move(payload.as_tensor); + // As far as I can tell, omitting the usual explicit destructor call + // is not UB in and of itself, and it's a slight perf win. The + // destructor is a no-op, because the moved-from Tensor is + // effectively an intrusive_ptr in the null state, so we don't need + // the behavior for correctness reasons either. Leaving this + // explanatory comment, including commented-out destructor call, to + // make this abundantly clear. 
+ // + // payload.as_tensor.~Tensor(); + clearToNone(); + return result; } -inline at::Tensor IValue::toTensor() const& { +inline at::Tensor& IValue::toTensor() & { AT_ASSERT(isTensor(), "Expected Tensor but got ", tagKind()); - return at::Tensor(toIntrusivePtr()); + return payload.as_tensor; +} +inline const at::Tensor& IValue::toTensor() const& { + AT_ASSERT(isTensor(), "Expected Tensor but got ", tagKind()); + return payload.as_tensor; } inline c10::Storage IValue::toStorage() && { AT_ASSERT(isStorage(), "Expected Storage but got ", tagKind()); @@ -148,10 +166,10 @@ inline c10::Storage IValue::toStorage() const& { return c10::Storage(toIntrusivePtr()); } inline c10::Stream IValue::toStream() && { - return c10::Stream::unpack(payload.as_int); + return c10::Stream::unpack(payload.u.as_int); } inline c10::Stream IValue::toStream() const& { - return c10::Stream::unpack(payload.as_int); + return c10::Stream::unpack(payload.u.as_int); } inline c10::intrusive_ptr IValue::toBlob() && { AT_ASSERT(isBlob(), "Expected Blob but got ", tagKind()); @@ -713,7 +731,8 @@ using _guarded_unsigned_long = std::conditional_t< inline const ivalue::Object& IValue::toObjectRef() const { AT_ASSERT(isObject(), "Expected Object but got ", tagKind()); - return *static_cast(payload.as_intrusive_ptr); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), "Attempted to create null reference"); + return *static_cast(payload.u.as_intrusive_ptr); } // note: when adding a DEFINE_TO case here you should also add a @@ -729,6 +748,7 @@ inline const ivalue::Object& IValue::toObjectRef() const { inline type IValue::to() const& { \ return this->method_name(); \ } + DEFINE_TO(at::Tensor, toTensor) DEFINE_TO(at::Storage, toStorage) DEFINE_TO(c10::Stream, toStream) @@ -980,8 +1000,11 @@ inline c10::List IValue::toIntList() const& { } inline std::vector IValue::toIntVector() const { AT_ASSERT(isIntList(), "Expected IntList but got ", tagKind()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called toIntVector on null intrusive_ptr IValue"); return createVectorFromList( - static_cast(payload.as_intrusive_ptr)); + static_cast(payload.u.as_intrusive_ptr)); } inline c10::List IValue::toDoubleList() && { AT_ASSERT(isDoubleList(), "Expected DoubleList but got ", tagKind()); @@ -993,8 +1016,11 @@ inline c10::List IValue::toDoubleList() const& { } inline std::vector IValue::toDoubleVector() const { AT_ASSERT(isDoubleList(), "Expected DoubleList but got ", tagKind()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called toDoubleVector on null intrusive_ptr IValue"); return createVectorFromList( - static_cast(payload.as_intrusive_ptr)); + static_cast(payload.u.as_intrusive_ptr)); } inline c10::List IValue::toBoolList() && { AT_ASSERT(isBoolList(), "Expected BoolList but got ", tagKind()); @@ -1014,8 +1040,11 @@ inline c10::List IValue::toTensorList() const& { } inline std::vector IValue::toTensorVector() const { AT_ASSERT(isTensorList(), "Expected TensorList but got ", tagKind()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called toTensorVector on null intrusive_ptr IValue"); return createVectorFromList( - static_cast(payload.as_intrusive_ptr)); + static_cast(payload.u.as_intrusive_ptr)); } inline c10::List IValue::toList() && { AT_ASSERT(isList(), "Expected GenericList but got ", tagKind()); @@ 
-1027,7 +1056,10 @@ inline c10::List IValue::toList() const& { } inline c10::ArrayRef IValue::toListRef() const { AT_ASSERT(isList(), "Expected GenericList but got ", tagKind()); - return static_cast(payload.as_intrusive_ptr) + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called toListRef on null intrusive_ptr IValue"); + return static_cast(payload.u.as_intrusive_ptr) ->list; } inline c10::Dict IValue::toGenericDict() && { @@ -1049,7 +1081,7 @@ inline c10::intrusive_ptr IValue::toTuple() const& { inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Tuple), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } template < typename... Args, @@ -1065,14 +1097,14 @@ inline IValue::IValue(const std::tuple& t) inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::String), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(std::string v) : IValue(ivalue::ConstantString::create(std::move(v))) {} inline IValue::IValue(c10::impl::GenericList v) : tag(Tag::GenericList), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.impl_.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.impl_.release()); } template > @@ -1104,7 +1136,7 @@ inline IValue::IValue(std::array v) : IValue(c10::List()) { inline IValue::IValue(c10::impl::GenericDict v) : tag(Tag::GenericDict), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.impl_.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.impl_.release()); } template inline IValue::IValue(c10::Dict v) @@ -1131,17 +1163,17 @@ inline IValue::IValue(c10::nullopt_t) : IValue() {} inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Object), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::PyObject), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Enum), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue IValue::make_capsule( @@ -1149,7 +1181,7 @@ inline IValue IValue::make_capsule( IValue iv; iv.tag = Tag::Capsule; iv.is_intrusive_ptr = true; - iv.payload.as_intrusive_ptr = blob.release(); + iv.payload.u.as_intrusive_ptr = null_to_undefined_tensor(blob.release()); return iv; } @@ -1170,30 +1202,33 @@ IValue::IValue(c10::intrusive_ptr custom_class) { auto ivalue_obj = c10::ivalue::Object::create( c10::StrongTypePtr(nullptr, classType), /*num_slots=*/1); ivalue_obj->setSlot(0, IValue::make_capsule(std::move(custom_class))); - payload.as_intrusive_ptr = ivalue_obj.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(ivalue_obj.release()); tag = Tag::Object; is_intrusive_ptr = true; } inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Future), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::RRef), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } 
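// --- Editorial aside (not part of the patch) ---------------------------
// The constructors above all route through null_to_undefined_tensor so
// that payload.u.as_intrusive_ptr is never nullptr: a would-be null pointer
// is encoded as a sentinel singleton on the way in and decoded back to a
// null/NullType pointer on the way out, letting refcount paths skip nullptr
// checks. A minimal sketch of the convention (illustrative names, not the
// c10 API):
struct Target {};

inline Target* sentinel() {
  static Target s;
  return &s;
}

inline Target* encode(Target* p) { return p ? p : sentinel(); }
inline Target* decode(Target* p) { return p == sentinel() ? nullptr : p; }
// -----------------------------------------------------------------------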
inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Quantizer), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline const std::string& IValue::toStringRef() const { AT_ASSERT(isString(), "Expected String but got ", tagKind()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called toStringRef on null intrusive_ptr IValue"); return static_cast( - payload.as_intrusive_ptr) + payload.u.as_intrusive_ptr) ->string(); } inline c10::optional> IValue:: @@ -1202,8 +1237,11 @@ inline c10::optional> IValue:: return c10::nullopt; } AT_ASSERT(isString(), "Expected optional but got ", tagKind()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called toOptionalStringRef on null intrusive_ptr IValue"); return std::reference_wrapper( - static_cast(payload.as_intrusive_ptr) + static_cast(payload.u.as_intrusive_ptr) ->string()); } @@ -1241,15 +1279,13 @@ inline bool IValue::isSameIdentity(const IValue& rhs) const { // for bool type, do equality check return this->toBool() == rhs.toBool(); } else if (this->isTensor() && rhs.isTensor()) { - // for tensor type, just check the as_intrusive_ptr since is_intrusive_ptr - // is false for undefined tensor - return this->payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr; + return this->payload.as_tensor.is_same(rhs.payload.as_tensor); } else if (this->isTensor() && rhs.isNone()) { // special case: undefined tensor and None are the same identity - return !this->is_intrusive_ptr; + return !this->payload.as_tensor.defined(); } else if (this->isNone() && rhs.isTensor()) { // special case: undefined tensor and None are the same identity - return !rhs.is_intrusive_ptr; + return !rhs.payload.as_tensor.defined(); } else if (this->isInt() && rhs.isInt()) { return this->toInt() == rhs.toInt(); } else if (this->isDouble() && rhs.isDouble()) { @@ -1260,7 +1296,7 @@ inline bool IValue::isSameIdentity(const IValue& rhs) const { // for objects holding in IValue, do shallow compare on pointer address to // testify the identity return this->is_intrusive_ptr && rhs.is_intrusive_ptr && - this->payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr; + this->payload.u.as_intrusive_ptr == rhs.payload.u.as_intrusive_ptr; } } diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index a3ae813616e0..7d3890f582b8 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -2370,19 +2370,19 @@ struct TORCH_API AnyClassType : public Type { inline bool IValue::isDoubleList() const { // note: avoids calling type() to avoid extra referencing counting for the returned type. 
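// --- Editorial aside (not part of the patch) ---------------------------
// A small usage sketch of the identity semantics implemented in
// isSameIdentity above, assuming an ATen build: IValues holding the same
// TensorImpl share identity, an undefined tensor and None share identity,
// and equal-valued but distinct tensors do not.
#include <ATen/ATen.h>
#include <ATen/core/ivalue.h>

inline void identity_demo() {
  at::Tensor t = at::ones({2, 2});
  c10::IValue a(t), b(t);               // same underlying TensorImpl
  c10::IValue c(at::ones({2, 2}));      // equal values, different TensorImpl
  c10::IValue none;                     // None
  c10::IValue undef{at::Tensor()};      // undefined tensor

  bool same_ab = a.isSameIdentity(b);               // true
  bool same_ac = a.isSameIdentity(c);               // false
  bool undef_is_none = undef.isSameIdentity(none);  // true
  (void)same_ab; (void)same_ac; (void)undef_is_none;
}
// -----------------------------------------------------------------------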
- return isList() && static_cast(payload.as_intrusive_ptr)->elementType->kind() == FloatType::Kind; + return isList() && static_cast(payload.u.as_intrusive_ptr)->elementType->kind() == FloatType::Kind; } inline bool IValue::isTensorList() const { - return isList() && static_cast(payload.as_intrusive_ptr)->elementType->kind() == TensorType::Kind; + return isList() && static_cast(payload.u.as_intrusive_ptr)->elementType->kind() == TensorType::Kind; } inline bool IValue::isIntList() const { - return isList() && static_cast(payload.as_intrusive_ptr)->elementType->kind() == IntType::Kind; + return isList() && static_cast(payload.u.as_intrusive_ptr)->elementType->kind() == IntType::Kind; } inline bool IValue::isBoolList() const { - return isList() && static_cast(payload.as_intrusive_ptr)->elementType->kind() == BoolType::Kind; + return isList() && static_cast(payload.u.as_intrusive_ptr)->elementType->kind() == BoolType::Kind; } template<> diff --git a/aten/src/ATen/core/jit_type_base.h b/aten/src/ATen/core/jit_type_base.h index 37da9ad7ef8d..e5a6d48340cf 100644 --- a/aten/src/ATen/core/jit_type_base.h +++ b/aten/src/ATen/core/jit_type_base.h @@ -152,6 +152,20 @@ struct TORCH_API Type : std::enable_shared_from_this { return nullptr; } template + T* castRaw() { + if (T::Kind == kind()) { + return static_cast(this); + } + return nullptr; + } + template + const T* castRaw() const { + if (T::Kind == kind()) { + return static_cast(this); + } + return nullptr; + } + template std::shared_ptr expect() { auto r = cast(); AT_ASSERT(r); @@ -163,6 +177,18 @@ struct TORCH_API Type : std::enable_shared_from_this { AT_ASSERT(r); return r; } + template + T& expectRef() { + auto* r = castRaw(); + AT_ASSERT(r); + return *r; + } + template + const T& expectRef() const { + auto* r = castRaw(); + AT_ASSERT(r); + return *r; + } virtual ~Type() = default; virtual bool hasFreeVariables() const { return false; diff --git a/aten/src/ATen/core/op_registration/op_registration_test.cpp b/aten/src/ATen/core/op_registration/op_registration_test.cpp index 6259578fdac8..56afe8ca7fb5 100644 --- a/aten/src/ATen/core/op_registration/op_registration_test.cpp +++ b/aten/src/ATen/core/op_registration/op_registration_test.cpp @@ -1909,7 +1909,7 @@ TEST(NewOperatorRegistrationTest, CppFunction) { m.def("fn3", [](const Tensor& x) { return x; }); // These require explicit schema m.def("fn4(Tensor x) -> Tensor", CppFunction::makeFallthrough()); - m.def("fn5(Tensor x) -> Tensor", CppFunction::makeUnboxedOnly(dummy_fn)); + m.def("fn5(Tensor x) -> Tensor", CppFunction::makeFromUnboxedFunction(dummy_fn)); m.def("fn6(Tensor x) -> Tensor", CppFunction::makeFromBoxedFunction<&backend_fallback_kernel>()); } diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp index 8a5e4f48e0c0..f0572bb6d809 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp @@ -130,6 +130,67 @@ uint64_t CUDAGeneratorImpl::seed() { return random; } +/** + * Gets the current internal state of CUDAGeneratorImpl. The internal + * state is returned as a CPU byte tensor. + */ +c10::intrusive_ptr CUDAGeneratorImpl::get_state() const { + // The RNG state comprises the seed, and an offset used for Philox. + // The following line is just here for BC reason. sizeof curandStateMtgp32 is 4120. + // It used to be static const size_t states_size = MAX_NUM_BLOCKS * sizeof(curandStateMtgp32); + // MAX_NUM_BLOCKS was 200 and sizeof(curandStateMtgp32) is 4120. 
Hardcoding these numbers here + // because this is just host side code and we don't want to worry about linking with cuda + static const size_t states_size = 200 * sizeof(4120); + static const size_t seed_size = sizeof(uint64_t); + static const size_t offset_size = sizeof(int64_t); + static const size_t total_size = states_size + seed_size + offset_size; + + auto state_tensor = at::detail::empty_cpu({(int64_t)total_size}, ScalarType::Byte, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt); + auto rng_state = state_tensor.data_ptr(); + // since curandStateMTGP is not used anymore, fill gen_states of THCGenerator with deterministic garbage value of -1 + // gen_states in THCGenerator struct was an array of curandStateMtgp32s. + memset(rng_state, -1, states_size); + auto current_seed = this->current_seed(); + auto offset = static_cast(this->philox_offset_per_thread()); // Note that old THCGeneratorState had offset as std::atomic + memcpy(rng_state + states_size, ¤t_seed, seed_size); + memcpy(rng_state + states_size + seed_size, &offset, offset_size); + + return state_tensor.getIntrusivePtr(); +} + +/** + * Sets the internal state of CUDAGeneratorImpl. The new internal state + * must be a strided CPU byte tensor and have appropriate size. See + * comments of CUDAGeneratorImpl::state for information about the layout + * and size of the internal state. + */ +void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) { + static const size_t states_size = 200 * sizeof(4120); // this line is just here for BC reason + static const size_t seed_size = sizeof(uint64_t); + static const size_t offset_size = sizeof(int64_t); + static const size_t total_size = states_size + seed_size + offset_size; + + detail::check_rng_state(new_state); + + bool no_philox_seed = false; + auto new_state_size = new_state.numel(); + if (new_state_size == total_size - offset_size) { + no_philox_seed = true; + } else { + TORCH_CHECK(new_state_size == total_size, "RNG state is wrong size"); + } + + uint64_t input_seed; + auto new_rng_state = new_state.data(); + memcpy(&input_seed, new_rng_state + states_size, seed_size); + this->set_current_seed(input_seed); + int64_t philox_offset = 0; + if (!no_philox_seed) { + memcpy(&philox_offset, new_rng_state + states_size + seed_size, offset_size); + } + this->set_philox_offset_per_thread(static_cast(philox_offset)); +} + /** * Sets the philox_offset_per_thread_ to be used by curandStatePhilox4_32_10 * @@ -143,7 +204,7 @@ void CUDAGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) { /** * Gets the current philox_offset_per_thread_ of CUDAGeneratorImpl. 
*/ -uint64_t CUDAGeneratorImpl::philox_offset_per_thread() { +uint64_t CUDAGeneratorImpl::philox_offset_per_thread() const { at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::philox_offset_per_thread"); return philox_offset_per_thread_; } diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index f38860e8ef13..b75ef8219b1c 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -369,6 +369,11 @@ int CUDAHooks::getNumGPUs() const { return at::cuda::device_count(); } +void CUDAHooks::deviceSynchronize(int64_t device_index) const { + at::DeviceGuard device_guard(at::Device(at::DeviceType::CUDA, device_index)); + c10::cuda::device_synchronize(); +} + // Sigh, the registry doesn't support namespaces :( using at::CUDAHooksRegistry; using at::RegistererCUDAHooksRegistry; diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index dff8913b153f..abef2e7ff835 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -38,6 +38,7 @@ struct CUDAHooks : public at::CUDAHooksInterface { int64_t cuFFTGetPlanCacheSize(int64_t device_index) const override; void cuFFTClearPlanCache(int64_t device_index) const override; int getNumGPUs() const override; + void deviceSynchronize(int64_t device_index) const override; }; }}} // at::cuda::detail diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index af4eb6fd0739..afe88761d88f 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -181,6 +181,10 @@ struct TORCH_API CUDAHooksInterface { virtual int getNumGPUs() const { return 0; } + + virtual void deviceSynchronize(int64_t device_index) const { + TORCH_CHECK(false, "Cannot synchronize CUDA device without ATen_cuda library. ", CUDA_HELP); + } }; // NB: dummy argument to suppress "ISO C++11 requires at least one argument diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp index ef0c2e2509c1..413ea32acdef 100644 --- a/aten/src/ATen/native/Distributions.cpp +++ b/aten/src/ATen/native/Distributions.cpp @@ -118,7 +118,7 @@ DEFINE_DISPATCH(bernoulli_tensor_stub); DEFINE_DISPATCH(bernoulli_scalar_stub); DEFINE_DISPATCH(cauchy_stub); DEFINE_DISPATCH(exponential_stub); -DEFINE_DISPATCH(multinomial_stub); +DEFINE_DISPATCH(multinomial_with_replacement_stub); DEFINE_DISPATCH(geometric_stub); DEFINE_DISPATCH(log_normal_stub); DEFINE_DISPATCH(uniform_stub); @@ -497,8 +497,10 @@ Tensor& multinomial_out( // Reference: // https://github.com/pytorch/pytorch/issues/11931#issuecomment-625882503 // Half is not supported on CPU. - if (!with_replacement && - !(self.device().is_cpu() && self.scalar_type() == ScalarType::Half)) { + TORCH_CHECK( + !(self.device().is_cpu() && self.scalar_type() == ScalarType::Half), + "multinomial is not implemented for half on CPU"); + if (!with_replacement) { // Sanity checks on `self`. 
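// --- Editorial aside (not part of the patch) ---------------------------
// The deviceSynchronize hook added above follows the usual CUDAHooks
// pattern: the CPU-only interface ships a failing default, and the CUDA
// build provides an override that does the real work. A minimal sketch of
// that pattern with illustrative names (the real code uses TORCH_CHECK and
// c10::cuda::device_synchronize):
#include <cstdint>
#include <stdexcept>

struct HooksInterface {
  virtual ~HooksInterface() = default;
  virtual void deviceSynchronize(int64_t /*device_index*/) const {
    throw std::runtime_error(
        "Cannot synchronize CUDA device without the CUDA library");
  }
};

struct CudaHooks : HooksInterface {
  void deviceSynchronize(int64_t device_index) const override {
    // real implementation: guard onto device_index, then synchronize
    (void)device_index;
  }
};
// -----------------------------------------------------------------------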
auto is_valid = ((self.max() < INFINITY) & (self.min() >= 0)).item(); TORCH_CHECK( @@ -537,13 +539,8 @@ Tensor& multinomial_out( return result; } - multinomial_stub( - result.device().type(), - result, - self, - n_sample, - with_replacement, - gen); + multinomial_with_replacement_stub( + result.device().type(), result, self, n_sample, gen); return result; } diff --git a/aten/src/ATen/native/Pool.h b/aten/src/ATen/native/Pool.h index 071460b090cd..8b5d65a8a60f 100644 --- a/aten/src/ATen/native/Pool.h +++ b/aten/src/ATen/native/Pool.h @@ -72,7 +72,7 @@ pool2d_shape_check( TORCH_CHECK(input.numel() > 0 && (ndim == 3 || ndim == 4), "non-empty 3D or 4D input tensor expected but got ndim: ", ndim); TORCH_CHECK(kW/2 >= padW && kH/2 >= padH, - "pad should be smaller than half of kernel size, but got ", + "pad should be smaller than or equal to half of kernel size, but got ", "padW = ", padW, ", padH = ", padH, ", kW = ", kW, ", kH = ", kH); TORCH_CHECK(outputWidth >= 1 && outputHeight >= 1, @@ -172,7 +172,7 @@ pool3d_shape_check( } TORCH_CHECK(kT/2 >= pT && kW/2 >= pW && kH/2 >= pH, - "pad should be smaller than half of kernel size, but got " + "pad should be smaller than or equal to half of kernel size, but got " "kT: ", kT, " kW: ", kW, " kH: ", kH, " padT: ", pT, " padW: ", pW, " padH: ", pH); TORCH_CHECK(otime >= 1 && owidth >= 1 && oheight >= 1, diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index c8eb3cc99a01..289d1128d2f9 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -102,9 +102,12 @@ Tensor resize_fft_input(Tensor x, IntArrayRef dims, IntArrayRef sizes) { } // Complex to real FFT -Tensor fft_c2r(Tensor input, c10::optional n_opt, +Tensor fft_c2r(c10::string_view function_name, + Tensor out, Tensor input, c10::optional n_opt, int64_t unwrapped_dim, c10::optional norm_str, bool forward) { + TORCH_CHECK(!out.defined() || out.is_floating_point(), function_name, + " expects a floating point output tensor, but got ", out.scalar_type()); input = promote_tensor_fft(input, /*require_complex=*/true); const auto input_dim = input.dim(); const auto dim = maybe_wrap_dim(unwrapped_dim, input_dim); @@ -118,14 +121,22 @@ Tensor fft_c2r(Tensor input, c10::optional n_opt, // FIXME: _fft does not support complex_output=false with inverse=false input = at::conj(input); } - return at::_fft_c2r(input, dim, static_cast(norm), n); + if (out.defined()) { + return at::_fft_c2r_out(out, input, dim, static_cast(norm), n); + } else { + return at::_fft_c2r(input, dim, static_cast(norm), n); + } } // Real to complex FFT -Tensor fft_r2c(Tensor input, c10::optional n_opt, +Tensor fft_r2c(c10::string_view function_name, + Tensor out, Tensor input, c10::optional n_opt, int64_t unwrapped_dim, c10::optional norm_str, bool forward, bool onesided) { - TORCH_CHECK(!input.is_complex(), "Expected a real input tensor to FFT"); + TORCH_CHECK(!input.is_complex(), function_name, + " expects a real input tensor, but got ", input.scalar_type()); + TORCH_CHECK(!out.defined() || out.is_complex(), function_name, + " expects a complex output tensor, but got ", out.scalar_type()); input = promote_tensor_fft(input); const auto input_dim = input.dim(); const auto dim = maybe_wrap_dim(unwrapped_dim, input_dim); @@ -136,19 +147,29 @@ Tensor fft_r2c(Tensor input, c10::optional n_opt, } const auto norm = norm_from_string(norm_str, forward); - auto out = at::_fft_r2c(input, dim, static_cast(norm), onesided); + + Tensor ret; + if (out.defined() && forward) { 
+ ret = at::_fft_r2c_out(out, input, dim, static_cast(norm), onesided); + } else { + ret = at::_fft_r2c(input, dim, static_cast(norm), onesided); + } + if (!forward) { // FIXME: _fft_r2c doesn't support native r2c IFFT - out = at::conj(out); + return out.defined() ? at::conj_out(out, ret) : at::conj(ret); + } else { + return ret; } - return out; } // Complex to complex FFT -Tensor fft_c2c(Tensor input, c10::optional n_opt, +Tensor fft_c2c(c10::string_view function_name, + Tensor out, Tensor input, c10::optional n_opt, int64_t unwrapped_dim, c10::optional norm_str, bool forward) { - TORCH_CHECK(input.is_complex(), "Expected a complex input tensor to FFT"); + TORCH_CHECK(input.is_complex(), function_name, + " expects a complex input tensor, but got ", input.scalar_type()); const auto input_dim = input.dim(); const auto dim = maybe_wrap_dim(unwrapped_dim, input_dim); const auto n = n_opt.value_or(input.sizes()[dim]); @@ -157,7 +178,13 @@ Tensor fft_c2c(Tensor input, c10::optional n_opt, input = resize_fft_input(input, dim, n); } const auto norm = norm_from_string(norm_str, forward); - return at::_fft_c2c(input, dim, static_cast(norm), forward); + if (out.defined()) { + TORCH_CHECK(out.is_complex(), function_name, + " expects a complex output tensor, but got ", out.scalar_type()); + return at::_fft_c2c_out(out, input, dim, static_cast(norm), forward); + } else { + return at::_fft_c2c(input, dim, static_cast(norm), forward); + } } // Dimensions to transform, and the signal shape in those dimensions @@ -230,12 +257,18 @@ ShapeAndDims canonicalize_fft_shape_and_dim_args( // Complex to complex n-dimensional fft Tensor fftn_c2c( - const Tensor& input, IntArrayRef shape, IntArrayRef dim, - c10::optional norm_str, bool forward) { - TORCH_CHECK(input.is_complex(), "Expected a complex input tensor to FFT"); + c10::string_view function_name, + Tensor out, const Tensor& input, IntArrayRef shape, + IntArrayRef dim, c10::optional norm_str, bool forward) { + TORCH_CHECK(input.is_complex(), function_name, " expects a complex input tensor, but got", input.scalar_type()); Tensor x = resize_fft_input(input, dim, shape); const auto norm = norm_from_string(norm_str, forward); - return at::_fft_c2c(x, dim, static_cast(norm), forward); + if (out.defined()) { + TORCH_CHECK(out.is_complex(), function_name, " expects a complex output tensor, but got ", out.scalar_type()); + return at::_fft_c2c_out(out, x, dim, static_cast(norm), forward); + } else { + return at::_fft_c2c(x, dim, static_cast(norm), forward); + } } } // namespace (anonymous) @@ -244,35 +277,79 @@ Tensor fftn_c2c( Tensor fft_fft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { return self.is_complex() ? - fft_c2c(self, n, dim, norm, /*forward=*/true) : - fft_r2c(self, n, dim, norm, /*forward=*/true, /*onesided=*/false); + fft_c2c("fft", {}, self, n, dim, norm, /*forward=*/true) : + fft_r2c("fft", {}, self, n, dim, norm, /*forward=*/true, /*onesided=*/false); +} + +Tensor& fft_fft_out(Tensor& out, const Tensor& self, c10::optional n, + int64_t dim, c10::optional norm) { + if (self.is_complex()) { + fft_c2c("fft", out, self, n, dim, norm, /*forward=*/true); + } else { + fft_r2c("fft", out, self, n, dim, norm, /*forward=*/true, /*onesided=*/false); + } + return out; } Tensor fft_ifft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { return self.is_complex() ? 
- fft_c2c(self, n, dim, norm, /*forward=*/false) : - fft_r2c(self, n, dim, norm, /*forward=*/false, /*onesided=*/false); + fft_c2c("ifft", {}, self, n, dim, norm, /*forward=*/false) : + fft_r2c("ifft", {}, self, n, dim, norm, /*forward=*/false, /*onesided=*/false); +} + +Tensor& fft_ifft_out(Tensor& out, const Tensor& self, c10::optional n, + int64_t dim, c10::optional norm) { + if (self.is_complex()) { + fft_c2c("ifft", out, self, n, dim, norm, /*forward=*/false); + } else { + fft_r2c("ifft", out, self, n, dim, norm, /*forward=*/false, /*onesided=*/false); + } + return out; } Tensor fft_rfft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { - return fft_r2c(self, n, dim, norm, /*forward=*/true, /*onesided=*/true); + return fft_r2c("rfft", {}, self, n, dim, norm, /*forward=*/true, /*onesided=*/true); +} + +Tensor& fft_rfft_out(Tensor& out, const Tensor& self, c10::optional n, + int64_t dim, c10::optional norm) { + fft_r2c("rfft", out, self, n, dim, norm, /*forward=*/true, /*onesided=*/true); + return out; } Tensor fft_irfft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { - return fft_c2r(self, n, dim, norm, /*forward=*/false); + return fft_c2r("irfft", {}, self, n, dim, norm, /*forward=*/false); +} + +Tensor& fft_irfft_out(Tensor& out, const Tensor& self, c10::optional n, + int64_t dim, c10::optional norm) { + fft_c2r("irfft", out, self, n, dim, norm, /*forward=*/false); + return out; } Tensor fft_hfft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { - return fft_c2r(self, n, dim, norm, /*forward=*/true); + return fft_c2r("hfft", {}, self, n, dim, norm, /*forward=*/true); +} + +Tensor& fft_hfft_out(Tensor& out, const Tensor& self, c10::optional n, + int64_t dim, c10::optional norm) { + fft_c2r("hfft", out, self, n, dim, norm, /*forward=*/true); + return out; } Tensor fft_ihfft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { - return fft_r2c(self, n, dim, norm, /*forward=*/false, /*onesided=*/true); + return fft_r2c("ihfft", {}, self, n, dim, norm, /*forward=*/false, /*onesided=*/true); +} + +Tensor& fft_ihfft_out(Tensor& out, const Tensor& self, c10::optional n, + int64_t dim, c10::optional norm) { + fft_r2c("ihfft", out, self, n, dim, norm, /*forward=*/false, /*onesided=*/true); + return out; } Tensor fft_fftn(const Tensor& self, c10::optional s, @@ -281,7 +358,18 @@ Tensor fft_fftn(const Tensor& self, c10::optional s, auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); // TODO: For real input, perform rfftn then mirror with conjugate symmetry Tensor input = promote_tensor_fft(self, /*require_complex=*/true); - return fftn_c2c(input, desc.shape, desc.dim, norm, /*forward=*/true); + return fftn_c2c("fftn", {}, input, desc.shape, desc.dim, norm, /*forward=*/true); +} + +Tensor& fft_fftn_out(Tensor& out, const Tensor& self, + c10::optional s, + c10::optional dim, + c10::optional norm) { + auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); + // TODO: For real input, perform rfftn then mirror with conjugate symmetry + Tensor input = promote_tensor_fft(self, /*require_complex=*/true); + fftn_c2c("fftn", out, input, desc.shape, desc.dim, norm, /*forward=*/true); + return out; } Tensor fft_ifftn(const Tensor& self, c10::optional s, @@ -289,24 +377,55 @@ Tensor fft_ifftn(const Tensor& self, c10::optional s, c10::optional norm) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); Tensor input = promote_tensor_fft(self, /*require_complex=*/true); - return fftn_c2c(input, 
desc.shape, desc.dim, norm, /*forward=*/false); + return fftn_c2c("ifftn", {}, input, desc.shape, desc.dim, norm, /*forward=*/false); } -Tensor fft_rfftn(const Tensor& self, c10::optional s, - c10::optional dim, - c10::optional norm_str) { +Tensor& fft_ifftn_out(Tensor& out, const Tensor& self, + c10::optional s, + c10::optional dim, + c10::optional norm) { + auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); + Tensor input = promote_tensor_fft(self, /*require_complex=*/true); + fftn_c2c("ifftn", out, input, desc.shape, desc.dim, norm, /*forward=*/false); + return out; +} + +static Tensor fft_rfftn_impl(Tensor out, const Tensor& self, + c10::optional s, + c10::optional dim, + const c10::optional& norm_str) { TORCH_CHECK(!self.is_complex(), "rfftn expects a real-valued input tensor, but got ", self.scalar_type()); auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); TORCH_CHECK(desc.shape.size() > 0, "rfftn must transform at least one axis"); Tensor input = promote_tensor_fft(self, /*require_complex=*/false); Tensor x = resize_fft_input(input, desc.dim, desc.shape); const auto norm = norm_from_string(norm_str, /*forward=*/true); - return at::_fft_r2c(x, desc.dim, static_cast(norm), /*onesided=*/true); + if (out.defined()) { + TORCH_CHECK(out.is_complex(), "rfftn expects a complex-valued output tensor, but got ", out.scalar_type()); + return at::_fft_r2c_out(out, x, desc.dim, static_cast(norm), /*onesided=*/true); + } else { + return at::_fft_r2c(x, desc.dim, static_cast(norm), /*onesided=*/true); + } } -Tensor fft_irfftn(const Tensor& self, c10::optional s, +Tensor fft_rfftn(const Tensor& self, c10::optional s, c10::optional dim, c10::optional norm_str) { + return fft_rfftn_impl({}, self, s, dim, norm_str); +} + +Tensor& fft_rfftn_out(Tensor& out, const Tensor& self, + c10::optional s, + c10::optional dim, + c10::optional norm_str) { + fft_rfftn_impl(out, self, s, dim, norm_str); + return out; +} + +static Tensor fft_irfftn_impl(Tensor out, const Tensor& self, + c10::optional s, + c10::optional dim, + const c10::optional& norm_str) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); TORCH_CHECK(desc.shape.size() > 0, "irfftn must transform at least one axis"); @@ -323,7 +442,27 @@ Tensor fft_irfftn(const Tensor& self, c10::optional s, Tensor input = promote_tensor_fft(self, /*require_complex=*/true); Tensor x = resize_fft_input(input, desc.dim, desc.shape); const auto norm = norm_from_string(norm_str, /*forward=*/false); - return at::_fft_c2r(x, desc.dim, static_cast(norm), last_dim_size); + if (out.defined()) { + TORCH_CHECK(out.is_floating_point(), "irfftn expects a floating point output tensor, but got ", out.scalar_type()); + return at::_fft_c2r_out(out, x, desc.dim, static_cast(norm), last_dim_size); + } else { + return at::_fft_c2r(x, desc.dim, static_cast(norm), last_dim_size); + } +} + +Tensor fft_irfftn(const Tensor& self, + c10::optional s, + c10::optional dim, + c10::optional norm_str) { + return fft_irfftn_impl({}, self, s, dim, norm_str); +} + +Tensor& fft_irfftn_out(Tensor& out, const Tensor& self, + c10::optional s, + c10::optional dim, + c10::optional norm_str) { + fft_irfftn_impl(out, self, s, dim, norm_str); + return out; } Tensor fft_fft2(const Tensor& self, c10::optional s, @@ -331,41 +470,69 @@ Tensor fft_fft2(const Tensor& self, c10::optional s, return native::fft_fftn(self, s, dim, std::move(norm)); } +Tensor& fft_fft2_out(Tensor& out, const Tensor& self, c10::optional s, + IntArrayRef dim, c10::optional norm) { + return 
native::fft_fftn_out(out, self, s, dim, std::move(norm)); +} + Tensor fft_ifft2(const Tensor& self, c10::optional s, IntArrayRef dim, c10::optional norm) { return native::fft_ifftn(self, s, dim, std::move(norm)); } +Tensor& fft_ifft2_out(Tensor& out, const Tensor& self, c10::optional s, + IntArrayRef dim, c10::optional norm) { + return native::fft_ifftn_out(out, self, s, dim, std::move(norm)); +} + Tensor fft_rfft2(const Tensor& self, c10::optional s, IntArrayRef dim, c10::optional norm) { return native::fft_rfftn(self, s, dim, std::move(norm)); } +Tensor& fft_rfft2_out(Tensor& out, const Tensor& self, c10::optional s, + IntArrayRef dim, c10::optional norm) { + return native::fft_rfftn_out(out, self, s, dim, std::move(norm)); +} + Tensor fft_irfft2(const Tensor& self, c10::optional s, IntArrayRef dim, c10::optional norm) { return native::fft_irfftn(self, s, dim, std::move(norm)); } -Tensor fft_fftfreq(int64_t n, double d, const TensorOptions& options) { - ScalarType dtype = typeMetaToScalarType(options.dtype()); +Tensor& fft_irfft2_out(Tensor& out, const Tensor& self, c10::optional s, + IntArrayRef dim, c10::optional norm) { + return native::fft_irfftn_out(out, self, s, dim, std::move(norm)); +} + +Tensor& fft_fftfreq_out(Tensor& out, int64_t n, double d) { + ScalarType dtype = out.scalar_type(); TORCH_CHECK(at::isFloatingType(dtype) || at::isComplexType(dtype), "fftfreq requires a floating point or complex dtype"); // TODO: arange doesn't have complex support - Tensor result = native::arange(n, options); - auto right_slice = result.slice(0, (n + 1) / 2, 0); + at::arange_out(out, n); + auto right_slice = out.slice(0, (n + 1) / 2, 0); at::arange_out(right_slice, -(n/2), 0, 1); - result.mul_(1.0 / (n * d)); // Slightly faster than div_(n*d) - return result; + return out.mul_(1.0 / (n * d)); // Slightly faster than div_(n*d) } -Tensor fft_rfftfreq(int64_t n, double d, const TensorOptions& options) { - ScalarType dtype = typeMetaToScalarType(options.dtype()); +Tensor fft_fftfreq(int64_t n, double d, const TensorOptions& options) { + auto out = at::empty({n}, options); + return native::fft_fftfreq_out(out, n, d); +} + +Tensor& fft_rfftfreq_out(Tensor& out, int64_t n, double d) { + ScalarType dtype = out.scalar_type(); TORCH_CHECK(at::isFloatingType(dtype) || at::isComplexType(dtype), "rfftfreq requires a floating point or complex dtype"); // TODO: arange doesn't have complex support - Tensor result = native::arange(n/2 + 1, options); - result.mul_(1.0 / (n * d)); // Slightly faster than div_(n*d) - return result; + native::arange_out(out, n/2 + 1); + return out.mul_(1.0 / (n * d)); // Slightly faster than div_(n*d) +} + +Tensor fft_rfftfreq(int64_t n, double d, const TensorOptions& options) { + auto out = at::empty({n/2 + 1}, options); + return native::fft_rfftfreq_out(out, n, d); } // If an array dim is specified, wraps them according to self.dim(). @@ -469,18 +636,20 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const optional hop const bool return_complex = return_complexOpt.value_or( self.is_complex() || (window.defined() && window.is_complex())); if (!return_complex) { - TORCH_CHECK(return_complexOpt.has_value(), - "stft requires the return_complex parameter be given for real inputs." - "You should pass return_complex=True to opt-in to complex dtype returns " - "(which will be required in a future pytorch release). 
" + if (!return_complexOpt.has_value()) { + TORCH_WARN_ONCE( + "stft will soon require the return_complex parameter be given for real inputs, " + "and will further require that return_complex=True in a future PyTorch release." ); + } - TORCH_WARN_ONCE( - "stft with return_complex=False is deprecated. In a future pytorch " - "release, stft will return complex tensors for all inputs, and " - "return_complex=False will raise an error.\n" - "Note: you can still call torch.view_as_real on the complex output to " - "recover the old return format."); + + // TORCH_WARN_ONCE( + // "stft with return_complex=False is deprecated. In a future pytorch " + // "release, stft will return complex tensors for all inputs, and " + // "return_complex=False will raise an error.\n" + // "Note: you can still call torch.view_as_real on the complex output to " + // "recover the old return format."); } if (!at::isFloatingType(self.scalar_type()) && !at::isComplexType(self.scalar_type())) { diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index b27a995962b4..5435f5042ce0 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -38,6 +38,8 @@ Tensor isclose(const Tensor& self, const Tensor& other, double rtol, double atol TORCH_CHECK(self.scalar_type() == other.scalar_type(), self.scalar_type(), " did not match ", other.scalar_type()); TORCH_CHECK(!(self.is_complex() && equal_nan), "isclose with equal_nan=True is not supported for complex inputs."); + TORCH_CHECK(!(self.is_quantized() || other.is_quantized()), + "isclose is not supported for quantized inputs."); // Checks that rtol and atol are non-negative // Note: consistent with Python's isclose but divergent from NumPy's, which diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index f8ba5527e5a9..d1fadd58d38d 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -97,23 +98,25 @@ static inline void check_cat_shape_except_dim(const Tensor & first, const Tensor if (dim == dimension) { continue; } - int64_t first_dim_size = first.size(dim); - int64_t second_dim_size = second.size(dim); + int64_t first_dim_size = first.sizes()[dim]; + int64_t second_dim_size = second.sizes()[dim]; TORCH_CHECK(first_dim_size == second_dim_size, "Sizes of tensors must match except in dimension ", dimension, ". Got ", first_dim_size, " and ", second_dim_size, " in dimension ", dim, " (The offending index is ", index, ")"); } } +static bool should_skip(const Tensor& t) { + return t.numel() == 0 && t.dim() == 1; +} + Tensor & _cat_out_cpu(Tensor& result, TensorList tensors, int64_t dim) { // previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible // to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors // to be "skipped". We maintain this behavior for backwards compatibility, but only for this specific // size (i.e. other empty sizes are not skipped). 
- // FIXME: warn if this is the case - bool allSkipped = true; + bool allContiguous = true; - Tensor notSkippedTensor; // Inputs cannot alias the output tensor for (int64_t i = 0; i < tensors.size(); i++) { @@ -125,19 +128,23 @@ Tensor & _cat_out_cpu(Tensor& result, TensorList tensors, int64_t dim) { } at::assert_no_internal_overlap(result); - auto should_skip = [](const Tensor& t) { return t.numel() == 0 && t.dim() == 1; }; - for (auto const &tensor : tensors) { - if (should_skip(tensor)) { - continue; + const Tensor* pnotSkippedTensor = [](TensorList tensors) -> const Tensor* { + for (auto const &tensor : tensors) { + if (should_skip(tensor)) { + continue; + } + // we've found a non-empty tensor + return &tensor; } - // we've found a non-empty tensor - allSkipped = false; - notSkippedTensor = tensor; - break; - } - if (allSkipped) { + return nullptr; + }(tensors); + + if (!pnotSkippedTensor) { + // FIXME: warn if this is the case -- see comment about skipped + // tensors at top of function. return result; } + const Tensor& notSkippedTensor = *pnotSkippedTensor; TORCH_CHECK(tensors.size() > 0, "expected a non-empty list of Tensors"); TORCH_CHECK(dim <= notSkippedTensor.dim(), "dimension ", dim, "out of range"); @@ -160,7 +167,7 @@ Tensor & _cat_out_cpu(Tensor& result, TensorList tensors, int64_t dim) { continue; } check_cat_shape_except_dim(notSkippedTensor, tensor, dim, i); - cat_dim_size += tensor.size(dim); + cat_dim_size += tensor.sizes()[dim]; if (!tensor.is_contiguous(first_tensor_mem_format)) { allContiguous = false; @@ -195,8 +202,8 @@ Tensor & _cat_out_cpu(Tensor& result, TensorList tensors, int64_t dim) { if (reuse_iterator && result.is_contiguous(first_tensor_mem_format) && no_type_promotion) { - auto source_slice = notSkippedTensor; - auto slice_dim_size = source_slice.size(dim); + const auto& source_slice = notSkippedTensor; + auto slice_dim_size = source_slice.sizes()[dim]; auto result_slice = result.narrow(dim, 0, slice_dim_size); auto result_slice_data = result_slice.data_ptr(); auto result_stride_bytes = result.stride(dim) * elementSize(result.scalar_type()); @@ -225,7 +232,7 @@ Tensor & _cat_out_cpu(Tensor& result, TensorList tensors, int64_t dim) { if (should_skip(tensor)) { continue; } - auto slice_dim_size = tensor.size(dim); + auto slice_dim_size = tensor.sizes()[dim]; auto result_slice = result.narrow(dim, offset, slice_dim_size); auto iter = TensorIteratorConfig() @@ -1467,15 +1474,25 @@ inferSqueezeGeometry(const Tensor& tensor, int64_t dim) { return std::make_tuple(sizes, strides); } -std::tuple, std::vector > +namespace { +// Named type instead of a pair/tuple so that we can be sure to +// construct the vectors in place and get NRVO. +struct InferUnsqueezeGeometryResult { + c10::SmallVector sizes; + c10::SmallVector strides; + InferUnsqueezeGeometryResult(IntArrayRef tensor_sizes, IntArrayRef tensor_strides) + : sizes(tensor_sizes.begin(), tensor_sizes.end()) + , strides(tensor_strides.begin(), tensor_strides.end()) {} +}; +} +InferUnsqueezeGeometryResult inferUnsqueezeGeometry(const Tensor& tensor, int64_t dim) { - auto sizes = tensor.sizes().vec(); - auto strides = tensor.strides().vec(); - int64_t new_stride = dim >= tensor.dim() ? 1 : sizes[dim] * strides[dim]; - sizes.insert(sizes.begin() + dim, 1); - strides.insert(strides.begin() + dim, new_stride); + InferUnsqueezeGeometryResult result(tensor.sizes(), tensor.strides()); + int64_t new_stride = dim >= tensor.dim() ? 
1 : result.sizes[dim] * result.strides[dim]; + result.sizes.insert(result.sizes.begin() + dim, 1); + result.strides.insert(result.strides.begin() + dim, new_stride); - return std::make_tuple(sizes, strides); + return result; } Tensor squeeze_qtensor(const Tensor& self) { @@ -1624,7 +1641,7 @@ Tensor unsqueeze_qtensor(const Tensor& self, int64_t dim) { axis, quantizer->scalar_type()); } - return make_qtensor(self, std::get<0>(g), std::get<1>(g), quantizer); + return make_qtensor(self, g.sizes, g.strides, quantizer); } Tensor unsqueeze(const Tensor& self, int64_t dim) { @@ -1636,7 +1653,7 @@ Tensor unsqueeze(const Tensor& self, int64_t dim) { return unsqueeze_qtensor(self, dim); } else { auto g = inferUnsqueezeGeometry(self, dim); - return self.as_strided(std::get<0>(g), std::get<1>(g)); + return self.as_strided(g.sizes, g.strides); } } @@ -1644,7 +1661,7 @@ Tensor & unsqueeze_(Tensor& self, int64_t dim) { dim = maybe_wrap_dim(dim, self.dim() + 1); auto g = inferUnsqueezeGeometry(self, dim); - return self.as_strided_(std::get<0>(g), std::get<1>(g)); + return self.as_strided_(g.sizes, g.strides); } Tensor flatten(const Tensor& self, int64_t start_dim, int64_t end_dim) { diff --git a/aten/src/ATen/native/TensorTransformations.cpp b/aten/src/ATen/native/TensorTransformations.cpp index fdee519c4bd0..5c6ab40b0ad4 100644 --- a/aten/src/ATen/native/TensorTransformations.cpp +++ b/aten/src/ATen/native/TensorTransformations.cpp @@ -73,7 +73,7 @@ Tensor flip_cpu(const Tensor& self, IntArrayRef dims) { ); }); } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Bool, + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, in_tensor.scalar_type(), "flip_cpu", [&] { flip_cpu_kernel( diff --git a/aten/src/ATen/native/UnaryOps.h b/aten/src/ATen/native/UnaryOps.h index f732cb9a0141..d92864e6fb2a 100644 --- a/aten/src/ATen/native/UnaryOps.h +++ b/aten/src/ATen/native/UnaryOps.h @@ -77,7 +77,9 @@ DECLARE_DISPATCH(void(*)(TensorIterator&, c10::optional), random_full DECLARE_DISPATCH(void(*)(TensorIterator&, c10::optional), random_stub); DECLARE_DISPATCH(void(*)(TensorIterator&, const int64_t), polygamma_stub); DECLARE_DISPATCH(void(*)(TensorIterator&, Scalar a, Scalar b), clamp_stub); -DECLARE_DISPATCH(void(*)(Tensor&, const Tensor&, int64_t, bool, c10::optional), multinomial_stub); +DECLARE_DISPATCH( + void (*)(Tensor&, const Tensor&, int64_t, c10::optional), + multinomial_with_replacement_stub); DECLARE_DISPATCH( void (*)( TensorIterator&, diff --git a/aten/src/ATen/native/cpu/CatKernel.cpp b/aten/src/ATen/native/cpu/CatKernel.cpp index 299850407da3..f86adb8e6318 100644 --- a/aten/src/ATen/native/cpu/CatKernel.cpp +++ b/aten/src/ATen/native/cpu/CatKernel.cpp @@ -15,18 +15,20 @@ struct InputMeta { InputMeta(const Tensor& t, int64_t dim, int64_t inner) : data_ptr(t.data_ptr()) - , inner_size(t.size(dim) * inner) {} + , inner_size(t.sizes()[dim] * inner) {} }; template void cat_serial_kernel_impl(Tensor& result, TensorList tensors, int64_t dim) { - int64_t outer = result.numel() / (result.size(dim) * result.stride(dim)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + dim >= 0 && dim < result.dim(), "dim out of range in cat_serial_kernel_impl"); + int64_t outer = result.numel() / (result.sizes()[dim] * result.strides()[dim]); scalar_t* result_data = result.data_ptr(); int64_t ninputs = tensors.size(); std::vector inputs; inputs.reserve(ninputs); for (auto const &tensor : tensors) { - inputs.emplace_back(tensor, dim, result.stride(dim)); + inputs.emplace_back(tensor, dim, result.strides()[dim]); } 
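// --- Editorial aside (not part of the patch) ---------------------------
// The InferUnsqueezeGeometryResult refactor a few hunks above swaps a
// std::tuple of vectors for a named struct built in place so the result is
// eligible for NRVO. A minimal standalone sketch of the same idea, using
// std::vector instead of c10::SmallVector (illustrative names):
#include <cstdint>
#include <vector>

struct Geometry {
  std::vector<int64_t> sizes;
  std::vector<int64_t> strides;
};

inline Geometry unsqueeze_geometry(const std::vector<int64_t>& in_sizes,
                                   const std::vector<int64_t>& in_strides,
                                   int64_t dim) {
  Geometry g{in_sizes, in_strides};  // construct the named result up front
  int64_t new_stride = dim >= static_cast<int64_t>(g.sizes.size())
      ? 1
      : g.sizes[dim] * g.strides[dim];
  g.sizes.insert(g.sizes.begin() + dim, 1);
  g.strides.insert(g.strides.begin() + dim, new_stride);
  return g;  // NRVO: the vectors are not copied on return
}
// -----------------------------------------------------------------------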
using Vec = vec256::Vec256; diff --git a/aten/src/ATen/native/cpu/MultinomialKernel.cpp b/aten/src/ATen/native/cpu/MultinomialKernel.cpp index 1f4a52084962..62f1d7b879ac 100644 --- a/aten/src/ATen/native/cpu/MultinomialKernel.cpp +++ b/aten/src/ATen/native/cpu/MultinomialKernel.cpp @@ -11,8 +11,12 @@ namespace at { namespace native { namespace { -template -void multinomial_apply(Tensor& result, const Tensor& self, const int64_t n_sample, const bool with_replacement, c10::optional generator) { +template +void multinomial_with_replacement_apply( + Tensor& result, + const Tensor& self, + const int64_t n_sample, + c10::optional generator) { auto gen = get_generator_or_default(generator, detail::getDefaultCPUGenerator()); // See Note [Acquire lock when using random generators] std::lock_guard lock(gen->mutex_); @@ -61,8 +65,6 @@ void multinomial_apply(Tensor& result, const Tensor& self, const int64_t n_sampl } TORCH_CHECK(sum > 0, "invalid multinomial distribution (sum of probabilities <= 0)"); - TORCH_CHECK(with_replacement || (n_categories - n_zeros >= n_sample), - "invalid multinomial distribution (with replacement=False, not enough non-negative category to sample)"); /* normalize cumulative probability distribution so that last val is 1 i.e. doesn't assume original self row sums to one */ @@ -100,45 +102,23 @@ void multinomial_apply(Tensor& result, const Tensor& self, const int64_t n_sampl /* store in result tensor (will be incremented for lua compat by wrapper) */ result_ptr[i * result_dist_stride_0 + j * result_dist_stride_1] = sample_idx; - - /* Once a sample is drawn, it cannot be drawn again. ie sample without replacement */ - if (!with_replacement && j < n_sample - 1) { - /* update cumulative distribution so that sample cannot be drawn again */ - scalar_t diff; - scalar_t new_val = 0; - scalar_t sum; - - if (sample_idx != 0) { - new_val = cum_dist_ptr[(sample_idx - 1) * cum_dist_stride_0]; - } - /* marginal cumulative mass (i.e. original probability) of sample */ - diff = cum_dist_ptr[sample_idx * cum_dist_stride_0] - new_val; - /* new sum of marginals is not one anymore... 
*/ - sum = 1.0 - diff; - for (int64_t k = 0; k < n_categories; k++) { - new_val = cum_dist_ptr[k * cum_dist_stride_0]; - if (k >= sample_idx) { - /* remove sampled probability mass from later cumulative probabilities */ - new_val -= diff; - } - /* make total marginals sum to one */ - new_val /= sum; - cum_dist_ptr[k * cum_dist_stride_0] = new_val; - } - } } } } -static void multinomial_kernel_impl(Tensor& result, const Tensor& self, const int64_t n_sample, const bool with_replacement, c10::optional gen) { +static void multinomial_with_replacement_kernel_impl( + Tensor& result, + const Tensor& self, + const int64_t n_sample, + c10::optional gen) { AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "multinomial", [&] { - multinomial_apply(result, self, n_sample, with_replacement, gen); + multinomial_with_replacement_apply(result, self, n_sample, gen); }); } - } -REGISTER_DISPATCH(multinomial_stub, &multinomial_kernel_impl); - +REGISTER_DISPATCH( + multinomial_with_replacement_stub, + &multinomial_with_replacement_kernel_impl); } } diff --git a/aten/src/ATen/native/cuda/Dropout.cu b/aten/src/ATen/native/cuda/Dropout.cu index 67adbaabbb84..c3e456d97056 100644 --- a/aten/src/ATen/native/cuda/Dropout.cu +++ b/aten/src/ATen/native/cuda/Dropout.cu @@ -57,6 +57,12 @@ fused_dropout_kernel_vec(at::cuda::detail::TensorInfo a, accscalar_t pinv = accscalar_t(1)/p; + // Helps align the total number of times curand_uniform4 is called by each thread for the same totalElements + // in the vec=2 and vec=4 cases. + bool gridxvec_loop_state = 0; + + float4 rand; + // Note: Vectorized loads means we'll stride each thread by an additional VEC factor, as we'll load VEC elements at a time for (IndexType linearIndex = idx * VEC; linearIndex < totalElements; @@ -69,12 +75,21 @@ fused_dropout_kernel_vec(at::cuda::detail::TensorInfo a, //curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for halfs, so generate float for everything // Note: need a new set of random values per 4 elements -- we'll handle VEC elements in this thread, so need ceil(VEC / 4) // sets of rand. - float4 rand = curand_uniform4(&state); + if ((VEC == 4) || (gridxvec_loop_state == 0)) { + rand = curand_uniform4(&state); + } else { + // sets up the last two values we generated last iteration to be used this iteration. 
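The hunks above strip the without-replacement path out of the CPU kernel, leaving only sampling with replacement under the renamed multinomial_with_replacement_* entry points; presumably the replacement=False case is served outside these kernels after this change. A minimal sketch of the two user-facing modes, using only the public torch.multinomial API:

```python
import torch

probs = torch.tensor([0.1, 0.2, 0.3, 0.4])
torch.manual_seed(0)

# With replacement: categories may repeat and num_samples can exceed the
# number of categories; this is the only case the renamed kernels cover.
print(torch.multinomial(probs, num_samples=6, replacement=True))

# Without replacement: each category appears at most once, so num_samples
# must not exceed the number of non-zero-probability categories.
print(torch.multinomial(probs, num_samples=4, replacement=False))
```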
+ rand.x = rand.z; + rand.y = rand.w; + gridxvec_loop_state ^= 1; + } rand.x = rand.x < p; rand.y = rand.y < p; - rand.z = rand.z < p; - rand.w = rand.w < p; + if (VEC == 4) { + rand.z = rand.z < p; + rand.w = rand.w < p; + } // Note: We explicitly check for is_contiguous() before launching the vectorized kernel // and replace IndexToOffset call with linearIndex to allow vectorization of NHWC (or other) diff --git a/aten/src/ATen/native/cuda/MultinomialKernel.cu b/aten/src/ATen/native/cuda/MultinomialKernel.cu index 3d59617903b4..cc74848b632a 100644 --- a/aten/src/ATen/native/cuda/MultinomialKernel.cu +++ b/aten/src/ATen/native/cuda/MultinomialKernel.cu @@ -300,7 +300,11 @@ sampleMultinomialOnce(int64_t* dest, } } -void multinomial_kernel_impl(Tensor& result, const Tensor& self, const int64_t n_sample, const bool with_replacement, c10::optional generator) { +void multinomial_with_replacement_kernel_impl( + Tensor& result, + const Tensor& self, + const int64_t n_sample, + c10::optional generator) { auto gen = get_generator_or_default(generator, cuda::detail::getDefaultCUDAGenerator()); int inputSize = self.dim(); @@ -371,7 +375,6 @@ void multinomial_kernel_impl(Tensor& result, const Tensor& self, const int64_t n PhiloxCudaState rng_engine_inputs; - if (with_replacement) { // Binary search is warp divergent (so effectively we're running // with just a single thread), but for better utilization, // we need each block to have at least 4 warps. @@ -402,7 +405,6 @@ void multinomial_kernel_impl(Tensor& result, const Tensor& self, const int64_t n prefixSum.data_ptr(), normDist.data_ptr()); C10_CUDA_KERNEL_LAUNCH_CHECK(); - } } }); @@ -412,6 +414,7 @@ void multinomial_kernel_impl(Tensor& result, const Tensor& self, const int64_t n } } -REGISTER_DISPATCH(multinomial_stub, &multinomial_kernel_impl); - +REGISTER_DISPATCH( + multinomial_with_replacement_stub, + &multinomial_with_replacement_kernel_impl); }} diff --git a/aten/src/ATen/native/cuda/SpectralOps.cu b/aten/src/ATen/native/cuda/SpectralOps.cu index db3e853a9321..e5e91cea4ccc 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cu +++ b/aten/src/ATen/native/cuda/SpectralOps.cu @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -439,10 +440,10 @@ static Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes, // Calculates the normalization constant and applies it in-place to self // sizes is the sizes of a twosided tensor and dims are all transformed dims -void _fft_apply_normalization(const Tensor& self, int64_t normalization, IntArrayRef sizes, IntArrayRef dims) { +double _fft_normalization_scale(int64_t normalization, IntArrayRef sizes, IntArrayRef dims) { auto norm = static_cast(normalization); if (norm == fft_norm_mode::none) { - return; + return 1.0; } int64_t signal_numel = 1; @@ -451,7 +452,17 @@ void _fft_apply_normalization(const Tensor& self, int64_t normalization, IntArra } const double scale_denom = (norm == fft_norm_mode::by_root_n) ? std::sqrt(signal_numel) : static_cast(signal_numel); - self.div_(scale_denom); + return 1.0 / scale_denom; +} + +const Tensor& _fft_apply_normalization(const Tensor& self, int64_t normalization, IntArrayRef sizes, IntArrayRef dims) { + auto scale = _fft_normalization_scale(normalization, sizes, dims); + return (scale == 1.0) ? 
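The Dropout.cu hunk above makes the VEC=2 kernel consume one curand_uniform4 result across two loop iterations so the generator advances the same number of times as the VEC=4 kernel does for the same totalElements. A rough Python model of that consumption pattern (the names below are made up for illustration; the real logic lives in the CUDA kernel):

```python
import random

def uniform4():
    # Stand-in for curand_uniform4: four randoms per generator advance.
    return [random.random() for _ in range(4)]

def vec2_randoms(num_iters):
    """Yield two randoms per iteration while advancing the RNG only every other one."""
    state, rand = 0, None
    for _ in range(num_iters):
        if state == 0:
            rand = uniform4()
            pair = rand[0], rand[1]   # use x, y this iteration
        else:
            pair = rand[2], rand[3]   # reuse z, w the next iteration
        state ^= 1
        yield pair

print(list(vec2_randoms(4)))  # four pairs, but only two generator advances
```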
self : self.mul_(scale); +} + +Tensor& _fft_apply_normalization_out(Tensor& out, const Tensor& self, int64_t normalization, IntArrayRef sizes, IntArrayRef dims) { + auto scale = _fft_normalization_scale(normalization, sizes, dims); + return at::mul_out(out, self, c10::scalar_to_tensor(scale)); } } // namespace (anonymous) @@ -522,6 +533,23 @@ Tensor _fft_r2c_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization return output; } +Tensor& _fft_r2c_cufft_out(Tensor& out, const Tensor& self, IntArrayRef dim, + int64_t normalization, bool onesided) { + auto result = _fft_r2c_cufft(self, dim, static_cast(fft_norm_mode::none), /*onesided=*/true); + if (onesided) { + return _fft_apply_normalization_out(out, result, normalization, self.sizes(), dim); + } + + resize_output(out, self.sizes()); + + auto last_dim = dim.back(); + auto last_dim_halfsize = result.sizes()[last_dim]; + auto out_slice = out.slice(last_dim, 0, last_dim_halfsize); + _fft_apply_normalization_out(out_slice, result, normalization, self.sizes(), dim); + at::native::_fft_fill_with_conjugate_symmetry_(out, dim); + return out; +} + // n-dimensional complex to real IFFT Tensor _fft_c2r_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization, int64_t lastdim) { TORCH_CHECK(self.is_complex()); @@ -544,8 +572,13 @@ Tensor _fft_c2r_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization // TODO: could transform up to 2 other dims in the same cuFFT operation auto output = at::empty(out_sizes, self.options().dtype(c10::toValueType(self.scalar_type()))); _exec_fft(output, temp, out_sizes, dim.back(), /*forward=*/false); - _fft_apply_normalization(output, normalization, out_sizes, dim); - return output; + return _fft_apply_normalization(output, normalization, out_sizes, dim); +} + +Tensor& _fft_c2r_cufft_out(Tensor& out, const Tensor& self, IntArrayRef dim, + int64_t normalization, int64_t lastdim) { + auto result = _fft_c2r_cufft(self, dim, static_cast(fft_norm_mode::none), lastdim); + return _fft_apply_normalization_out(out, result, normalization, result.sizes(), dim); } // n-dimensional complex to complex FFT/IFFT @@ -586,8 +619,13 @@ Tensor _fft_c2c_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization } } - _fft_apply_normalization(output, normalization, out_sizes, dim); - return output; + return _fft_apply_normalization(output, normalization, out_sizes, dim); +} + +Tensor& _fft_c2c_cufft_out(Tensor& out, const Tensor& self, IntArrayRef dim, + int64_t normalization, bool forward) { + auto result = _fft_c2c_cufft(self, dim, static_cast(fft_norm_mode::none), forward); + return _fft_apply_normalization_out(out, result, normalization, result.sizes(), dim); } diff --git a/aten/src/ATen/native/cuda/TensorTransformations.cu b/aten/src/ATen/native/cuda/TensorTransformations.cu index a435c7060f45..9dfa4e8759cf 100644 --- a/aten/src/ATen/native/cuda/TensorTransformations.cu +++ b/aten/src/ATen/native/cuda/TensorTransformations.cu @@ -87,7 +87,7 @@ Tensor flip_cuda(const Tensor& self, IntArrayRef dims) { // use kernel_pointwise_flip_apply2 only when to-flip dim is the 1st or last dim, where collapseDims can reduce the amount of work if (flip_dims_size == 1 && in_tensor.is_contiguous() && (flip_dims[0] == 0 || flip_dims[0] == total_dims - 1)) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::Half, at::ScalarType::Bool, in_tensor.scalar_type(), "flip_cuda", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, kBFloat16, in_tensor.scalar_type(), "flip_cuda", [&] { auto in_tensor_info = 
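The refactor above centralizes the choice of normalization scale (1, 1/n, or 1/sqrt(n)) in _fft_normalization_scale so both the in-place and the new out= paths can apply it. At the Python level those three scales correspond, for a forward transform, to norm="backward", "forward" and "ortho"; a quick check, assuming the usual mapping:

```python
import math
import torch

x = torch.randn(8)
n = x.numel()

# "backward" applies no scale on the forward FFT; "forward" scales by 1/n and
# "ortho" by 1/sqrt(n), i.e. the three values _fft_normalization_scale can return.
assert torch.allclose(torch.fft.fft(x, norm="forward"),
                      torch.fft.fft(x, norm="backward") / n)
assert torch.allclose(torch.fft.fft(x, norm="ortho"),
                      torch.fft.fft(x, norm="backward") / math.sqrt(n))
```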
cuda::detail::getTensorInfo(in_tensor); auto out_tensor_info = cuda::detail::getTensorInfo(out_tensor); int flip_dim = in_tensor_info.collapseDims(flip_dims[0]); @@ -123,7 +123,7 @@ Tensor flip_cuda(const Tensor& self, IntArrayRef dims) { } } - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Half, in_tensor.scalar_type(), "flip_cuda", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, kBFloat16, in_tensor.scalar_type(), "flip_cuda", [&] { flip_cuda_kernel<<>>( in_tensor.data_ptr(), out_tensor.data_ptr(), N, flip_dims_t.cuda().data_ptr(), diff --git a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu index 834c000fdb05..8ac7abca1824 100644 --- a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu +++ b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu @@ -168,43 +168,43 @@ __global__ void upsample_trilinear3d_backward_out_frame( true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1, h1, w1 + w1p), + idx_3d(nc, depth1, height1, width1, t1, h1, w1 + w1p), i_numel, static_cast(t0lambda * h0lambda * w1lambda * d2val), true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1, h1 + h1p, w1), + idx_3d(nc, depth1, height1, width1, t1, h1 + h1p, w1), i_numel, static_cast(t0lambda * h1lambda * w0lambda * d2val), true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1, h1 + h1p, w1 + w1p), + idx_3d(nc, depth1, height1, width1, t1, h1 + h1p, w1 + w1p), i_numel, static_cast(t0lambda * h1lambda * w1lambda * d2val), true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1 + t1p, h1, w1), + idx_3d(nc, depth1, height1, width1, t1 + t1p, h1, w1), i_numel, static_cast(t1lambda * h0lambda * w0lambda * d2val), true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1 + t1p, h1, w1 + w1p), + idx_3d(nc, depth1, height1, width1, t1 + t1p, h1, w1 + w1p), i_numel, static_cast(t1lambda * h0lambda * w1lambda * d2val), true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1 + t1p, h1 + h1p, w1), + idx_3d(nc, depth1, height1, width1, t1 + t1p, h1 + h1p, w1), i_numel, static_cast(t1lambda * h1lambda * w0lambda * d2val), true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1 + t1p, h1 + h1p, w1 + w1p), + idx_3d(nc, depth1, height1, width1, t1 + t1p, h1 + h1p, w1 + w1p), i_numel, static_cast(t1lambda * h1lambda * w1lambda * d2val), true); diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp index 8fca9ad9ecdf..d5a39e45941b 100644 --- a/aten/src/ATen/native/mkl/SpectralOps.cpp +++ b/aten/src/ATen/native/mkl/SpectralOps.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -21,6 +22,21 @@ Tensor _fft_c2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, AT_ERROR("fft: ATen not compiled with MKL support"); } +Tensor& _fft_r2c_mkl_out(Tensor& out, const Tensor& self, IntArrayRef dim, int64_t normalization, + bool onesided) { + AT_ERROR("fft: ATen not compiled with MKL support"); +} + +Tensor& _fft_c2r_mkl_out(Tensor& out, const Tensor& self, IntArrayRef dim, int64_t normalization, + int64_t last_dim_size) { + AT_ERROR("fft: ATen not compiled with MKL support"); +} + +Tensor& _fft_c2c_mkl_out(Tensor& out, const Tensor& self, IntArrayRef dim, int64_t normalization, + bool forward) { + AT_ERROR("fft: ATen not compiled with MKL support"); +} + }} #else // AT_MKL_ENABLED @@ -381,6 +397,13 @@ Tensor _fft_c2r_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, return _exec_fft(out, input, 
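The UpSampleTrilinear3d.cu hunk above swaps the height/width extents passed to idx_3d in the backward kernel. A small arithmetic sketch of why the argument order matters for row-major flattening (the helper below mirrors the call sites; its exact in-kernel definition is assumed):

```python
# Assumed row-major flattening, matching the idx_3d(nc, D, H, W, t, h, w) call sites.
def idx_3d(nc, D, H, W, t, h, w):
    return ((nc * D + t) * H + h) * W + w

D, H, W = 2, 3, 5
print(idx_3d(0, D, H, W, 1, 2, 4))  # 29: offset into a D*H*W block
print(idx_3d(0, D, W, H, 1, 2, 4))  # 25: swapped extents index the wrong element
```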
out_sizes, dim, normalization, /*forward=*/false); } +Tensor& _fft_c2r_mkl_out(Tensor& out, const Tensor& self, IntArrayRef dim, int64_t normalization, + int64_t last_dim_size) { + auto result = _fft_c2r_mkl(self, dim, normalization, last_dim_size); + resize_output(out, result.sizes()); + return out.copy_(result); +} + // n-dimensional real to complex FFT Tensor _fft_r2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided) { TORCH_CHECK(self.is_floating_point()); @@ -402,6 +425,24 @@ Tensor _fft_r2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, return out; } +Tensor& _fft_r2c_mkl_out(Tensor& out, const Tensor& self, IntArrayRef dim, int64_t normalization, + bool onesided) { + auto result = _fft_r2c_mkl(self, dim, normalization, /*onesided=*/true); + if (onesided) { + resize_output(out, result.sizes()); + return out.copy_(result); + } + + resize_output(out, self.sizes()); + + auto last_dim = dim.back(); + auto last_dim_halfsize = result.sizes()[last_dim]; + auto out_slice = out.slice(last_dim, 0, last_dim_halfsize); + out_slice.copy_(result); + at::native::_fft_fill_with_conjugate_symmetry_(out, dim); + return out; +} + // n-dimensional complex to complex FFT/IFFT Tensor _fft_c2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, bool forward) { TORCH_CHECK(self.is_complex()); @@ -410,6 +451,13 @@ Tensor _fft_c2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, return _exec_fft(out, self, self.sizes(), sorted_dims, normalization, forward); } +Tensor& _fft_c2c_mkl_out(Tensor& out, const Tensor& self, IntArrayRef dim, int64_t normalization, + bool forward) { + auto result = _fft_c2c_mkl(self, dim, normalization, forward); + resize_output(out, result.sizes()); + return out.copy_(result); +} + }} // namespace at::native #endif diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index c9a6b675529f..e8e3efa307f8 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -7,42 +7,34 @@ # DEPRECATED. DO NOT USE - func: _cast_Byte(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Char(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Double(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Float(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Int(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Long(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Short(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Half(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # Computes the gradient of current tensor w.r.t. graph leaves. @@ -59,18 +51,15 @@ # where Variables *are* Tensors (as opposed to them containing tensors, which # is what the previous interpretation was.) - func: set_data(Tensor(a!) 
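The new *_mkl_out / *_cufft_out kernels above back out= variants of the internal FFT ops. At the user level this is what allows the torch.fft functions to write into a preallocated tensor, assuming out= is plumbed through to these kernels in this build:

```python
import torch

x = torch.randn(16)

# rfft of a length-16 real signal has 16 // 2 + 1 = 9 complex outputs.
out = torch.empty(9, dtype=torch.complex64)
torch.fft.rfft(x, out=out)
print(out.shape)  # torch.Size([9])
```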
self, Tensor new_data) -> () - use_c10_dispatcher: full manual_kernel_registration: True variants: method - func: data(Tensor self) -> Tensor - use_c10_dispatcher: full manual_kernel_registration: True variants: method # True if this `Variable` is a leaf and thus does not have a `grad_fn`. - func: is_leaf(Tensor self) -> bool - use_c10_dispatcher: full manual_kernel_registration: True variants: method @@ -85,23 +74,19 @@ # assert y2.output_nr == 2 # - func: output_nr(Tensor self) -> int - use_c10_dispatcher: full manual_kernel_registration: True variants: method - func: _version(Tensor self) -> int - use_c10_dispatcher: full manual_kernel_registration: True variants: method - func: requires_grad_(Tensor(a!) self, bool requires_grad=True) -> Tensor(a!) - use_c10_dispatcher: full manual_kernel_registration: True variants: method # Enables .grad attribute for non-leaf Tensors. - func: retain_grad(Tensor(a!) self) -> () - use_c10_dispatcher: full manual_kernel_registration: True variants: method @@ -120,47 +105,36 @@ variants: function - func: rename_(Tensor(a!) self, Dimname[]? names) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: rename(Tensor(a) self, Dimname[]? names) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: align_to(Tensor(a) self, Dimname[] names) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: align_to.ellipsis_idx(Tensor(a) self, Dimname[] order, int ellipsis_idx) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: align_as(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method - func: align_tensors(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - func: refine_names(Tensor(a) self, Dimname[] names) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: _use_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank) -> bool - use_c10_dispatcher: full dispatch: CUDA: _use_cudnn_ctc_loss - func: _cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: _cudnn_ctc_loss - func: _use_cudnn_rnn_flatten_weight() -> bool - use_c10_dispatcher: full - func: _cudnn_rnn_flatten_weight(Tensor[] weight_arr, int weight_stride0, int input_size, int mode, int hidden_size, int proj_size, int num_layers, bool batch_first, bool bidirectional) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: _cudnn_rnn_flatten_weight @@ -180,71 +154,52 @@ CUDA: _cudnn_init_dropout_state - func: _debug_has_internal_overlap(Tensor self) -> int - use_c10_dispatcher: full variants: function - func: _fused_dropout(Tensor self, float p, Generator? generator=None) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CUDA: fused_dropout_cuda - func: _masked_scale(Tensor self, Tensor mask, float scale) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CUDA: masked_scale_cuda - func: _sobol_engine_draw(Tensor quasi, int n, Tensor sobolstate, int dimension, int num_generated, ScalarType? dtype) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: _sobol_engine_ff_(Tensor(a!) self, int n, Tensor sobolstate, int dimension, int num_generated) -> Tensor(a!) - use_c10_dispatcher: full - func: _sobol_engine_scramble_(Tensor(a!) self, Tensor ltm, int dimension) -> Tensor(a!) - use_c10_dispatcher: full - func: _sobol_engine_initialize_state_(Tensor(a!) self, int dimension) -> Tensor(a!) 
- use_c10_dispatcher: full - func: _reshape_from_tensor(Tensor self, Tensor shape) -> Tensor - use_c10_dispatcher: full - func: _shape_as_tensor(Tensor self) -> Tensor - use_c10_dispatcher: full - func: dropout(Tensor input, float p, bool train) -> Tensor - use_c10_dispatcher: full - func: dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) - use_c10_dispatcher: full - func: feature_dropout(Tensor input, float p, bool train) -> Tensor - use_c10_dispatcher: full - func: feature_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) - use_c10_dispatcher: full - func: alpha_dropout(Tensor input, float p, bool train) -> Tensor - use_c10_dispatcher: full - func: alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) - use_c10_dispatcher: full - func: feature_alpha_dropout(Tensor input, float p, bool train) -> Tensor - use_c10_dispatcher: full - func: feature_alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) - use_c10_dispatcher: full - func: abs(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: abs - func: abs_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: abs_ @@ -281,18 +236,15 @@ # Absolute, alias for abs - func: absolute(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: absolute_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: absolute.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: angle(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: angle @@ -303,19 +255,16 @@ CPU, CUDA: angle_out - func: view_as_real(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: view_as_real - func: view_as_complex(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: view_as_complex - func: sgn(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sgn @@ -332,15 +281,12 @@ CPU, CUDA: sgn_out - func: real(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function - func: imag(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function - func: conj(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: conj.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -349,19 +295,16 @@ CPU, CUDA: conj_out - func: _conj(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function dispatch: DefaultBackend: _conj - func: acos(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: acos - func: acos_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: acos_ @@ -373,28 +316,22 @@ # arccos, alias of acos - func: arccos(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: arccos_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: arccos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
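Among the schemas above, view_as_real and view_as_complex are pure reinterpretations of the same storage; a short illustration with the public API:

```python
import torch

z = torch.tensor([1 + 2j, 3 - 4j])
r = torch.view_as_real(z)        # shape (2, 2); last dim holds (real, imag)
print(r)
print(torch.view_as_complex(r))  # round-trips back to the complex view
```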
use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: avg_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True) -> Tensor - use_c10_dispatcher: full - func: adaptive_avg_pool1d(Tensor self, int[1] output_size) -> Tensor - use_c10_dispatcher: full # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool1d(Tensor self, int[1] output_size) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full structured_delegate: add.out variants: function, method dispatch: @@ -403,7 +340,6 @@ MkldnnCPU: mkldnn_add - func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method structured_delegate: add.out dispatch: @@ -422,13 +358,11 @@ MkldnnCPU: mkldnn_add_out - func: _add_relu.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: add_relu - func: _add_relu_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: function dispatch: CPU: add_relu_ @@ -441,25 +375,21 @@ # For C++ only, until we have conversion from C++ numbers to Tensor - func: add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: add - func: add_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: add_ - func: addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: addmv - func: addmv_(Tensor(a!) self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: addmv_ @@ -470,20 +400,17 @@ CPU, CUDA: addmv_out - func: _addmv_impl_(Tensor(a!) self, Tensor self2, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full dispatch: CPU: addmv_impl_cpu CUDA: addmv_impl_cuda - func: addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: addr Math: math_addr - func: addr_(Tensor(a!) self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: addr_ @@ -495,17 +422,14 @@ Math: math_addr_out - func: affine_grid_generator(Tensor theta, int[] size, bool align_corners) -> Tensor - use_c10_dispatcher: full variants: function dispatch: DefaultBackend: affine_grid_generator - func: affine_grid_generator_backward(Tensor grad, int[] size, bool align_corners) -> Tensor - use_c10_dispatcher: full variants: function - func: all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: all @@ -516,18 +440,15 @@ CPU, CUDA: all_out - func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: all.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) 
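The add.Tensor / add_.Tensor / add.Scalar schemas above all take a keyword-only alpha multiplier; removing the use_c10_dispatcher lines does not change the public behavior, which looks like:

```python
import torch

a, b = torch.ones(3), torch.arange(3.0)
print(torch.add(a, b, alpha=2))  # tensor([1., 3., 5.])  (a + alpha * b)
a.add_(b, alpha=2)               # in-place variant, add_.Tensor
print(a)
```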
use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: allclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> bool - use_c10_dispatcher: full variants: function, method - func: any.dim(Tensor self, int dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: any @@ -538,7 +459,6 @@ CPU, CUDA: any_out - func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: any.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) @@ -568,10 +488,8 @@ # preserve tracing. Get rid of this when arange can directly take tensors for bounds # (so that it can be traced directly). - func: _dim_arange(Tensor like, int dim) -> Tensor - use_c10_dispatcher: full - func: argmax(Tensor self, int? dim=None, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: argmax @@ -582,7 +500,6 @@ CPU, CUDA: argmax_out - func: argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: argmin @@ -593,13 +510,11 @@ CPU, CUDA: argmin_out - func: acosh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: acosh - func: acosh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: acosh_ @@ -611,24 +526,20 @@ # arccosh, alias for acosh - func: arccosh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: arccosh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: arccosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: asinh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: asinh - func: asinh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: asinh_ @@ -640,24 +551,20 @@ # arcsinh, alias for asinh - func: arcsinh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: arcsinh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: arcsinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: atanh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: atanh - func: atanh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: atanh_ @@ -669,18 +576,15 @@ # arctanh, alias for atanh - func: arctanh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: arctanh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: arctanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: as_strided(Tensor(a) self, int[] size, int[] stride, int? storage_offset=None) -> Tensor(a) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: as_strided_tensorimpl @@ -695,14 +599,12 @@ DefaultBackend: as_strided_ - func: asin(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: asin SparseCPU, SparseCUDA: asin_sparse - func: asin_(Tensor(a!) 
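The as_strided schema above exposes raw size/stride reinterpretation of a tensor's storage; a tiny example with deliberately overlapping rows:

```python
import torch

base = torch.arange(4.0)
# sizes (2, 2) with strides (1, 1): row 1 starts one element after row 0.
print(base.as_strided((2, 2), (1, 1)))
# tensor([[0., 1.],
#         [1., 2.]])
```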
self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: asin_ @@ -716,24 +618,20 @@ # arcsin, alias of asin - func: arcsin(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: arcsin_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: arcsin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: atan(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: atan - func: atan_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: atan_ @@ -745,55 +643,44 @@ # arctan, alias of atan - func: arctan(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: arctan_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: arctan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: atleast_1d(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function - func: atleast_1d.Sequence(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - func: atleast_2d(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function - func: atleast_2d.Sequence(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function - func: atleast_3d(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function - func: atleast_3d.Sequence(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function - func: baddbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: baddbmm_cpu CUDA: baddbmm_cuda - func: baddbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: baddbmm__cpu CUDA: baddbmm__cuda - func: _baddbmm_mkl_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: function - func: baddbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) @@ -825,7 +712,6 @@ # Sample bernoulli with values in `self` as probability. - func: bernoulli(Tensor self, *, Generator? generator=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: bernoulli @@ -837,13 +723,11 @@ CPU, CUDA: bernoulli_out - func: bernoulli_.Tensor(Tensor(a!) self, Tensor p, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: bernoulli_ - func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: bernoulli_ @@ -852,7 +736,6 @@ # There is no default valid on `p` here because it would introduce ambiguity # with `bernoulli(Tensor self, *, Generator? generator=None)` declaration. - func: bernoulli.p(Tensor self, float p, *, Generator? generator=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: bilinear(Tensor input1, Tensor input2, Tensor weight, Tensor? bias) -> Tensor @@ -908,11 +791,9 @@ CUDA: _bincount_cuda - func: bitwise_not(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: bitwise_not_(Tensor(a!) self) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method - func: bitwise_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -921,13 +802,11 @@ CPU, CUDA: bitwise_not_out - func: copysign.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: copysign - func: copysign_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: copysign_ @@ -938,23 +817,19 @@ CPU, CUDA: copysign_out - func: copysign.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: copysign - func: copysign_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: copysign_ - func: logical_not(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: logical_not_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -963,11 +838,9 @@ CPU, CUDA: logical_not_out - func: logical_xor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: logical_xor_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: logical_xor.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -976,11 +849,9 @@ CPU, CUDA: logical_xor_out - func: logical_and(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: logical_and_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: logical_and.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -989,11 +860,9 @@ CPU, CUDA: logical_and_out - func: logical_or(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: logical_or_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: logical_or.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -1008,7 +877,6 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: bmm(Tensor self, Tensor mat2) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: bmm_cpu @@ -1017,7 +885,6 @@ SparseCUDA: bmm_sparse_cuda - func: _bmm(Tensor self, Tensor mat2, *, bool deterministic=False) -> Tensor - use_c10_dispatcher: full variants: function dispatch: SparseCUDA: _bmm_sparse_cuda @@ -1038,7 +905,6 @@ SparseCUDA: _bmm_out_sparse_cuda - func: broadcast_tensors(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full device_guard: False - func: broadcast_to(Tensor(a) self, int[] size) -> Tensor(a) @@ -1048,7 +914,6 @@ Math: broadcast_to - func: cat(Tensor[] tensors, int dim=0) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: cat @@ -1058,23 +923,19 @@ DefaultBackend: cat_out - func: cat.names(Tensor[] tensors, Dimname dim) -> Tensor - use_c10_dispatcher: full - func: cat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: block_diag(Tensor[] tensors) -> Tensor - use_c10_dispatcher: full variants: function - func: ceil(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: ceil - func: ceil_(Tensor(a!) self) -> Tensor(a!) 
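copysign.Tensor / copysign.Scalar above combine magnitudes from self with signs from other; for reference:

```python
import torch

a = torch.tensor([1.0, 2.0, 3.0])
b = torch.tensor([-1.0, 1.0, -1.0])
print(torch.copysign(a, b))  # tensor([-1.,  2., -3.])
```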
- use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: ceil_ @@ -1085,25 +946,20 @@ CPU, CUDA: ceil_out - func: chain_matmul(Tensor[] matrices) -> Tensor - use_c10_dispatcher: full variants: function - func: unsafe_chunk(Tensor self, int chunks, int dim=0) -> Tensor[] - use_c10_dispatcher: full variants: function, method device_guard: False - func: chunk(Tensor(a) self, int chunks, int dim=0) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method device_guard: False - func: tensor_split.sections(Tensor(a) self, int sections, int dim=0) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method - func: tensor_split.indices(Tensor(a) self, int[] indices, int dim=0) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method - func: tensor_split.tensor_indices_or_sections(Tensor(a) self, Tensor tensor_indices_or_sections, int dim=0) -> Tensor(a)[] @@ -1111,14 +967,12 @@ variants: function, method - func: clamp(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: clamp QuantizedCPU: clamp_quantized_cpu - func: clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: clamp_ @@ -1129,13 +983,11 @@ CPU, CUDA: clamp_out - func: clamp_max(Tensor self, Scalar max) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: clamp_max - func: clamp_max_(Tensor(a!) self, Scalar max) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: clamp_max_ @@ -1146,13 +998,11 @@ CPU, CUDA: clamp_max_out - func: clamp_min(Tensor self, Scalar min) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: clamp_min - func: clamp_min_(Tensor(a!) self, Scalar min) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: clamp_min_ @@ -1164,7 +1014,6 @@ # clip is an alias for clamp - func: clip(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: clip_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!) 
@@ -1175,11 +1024,9 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: cudnn_is_acceptable(Tensor self) -> bool - use_c10_dispatcher: full device_guard: False - func: complex(Tensor real, Tensor imag) -> Tensor - use_c10_dispatcher: full variants: function dispatch: DefaultBackend: complex @@ -1190,7 +1037,6 @@ CPU, CUDA: complex_out - func: polar(Tensor abs, Tensor angle) -> Tensor - use_c10_dispatcher: full variants: function dispatch: DefaultBackend: polar @@ -1201,13 +1047,11 @@ CPU, CUDA: polar_out - func: constant_pad_nd(Tensor self, int[] pad, Scalar value=0) -> Tensor - use_c10_dispatcher: full variants: function dispatch: DefaultBackend: constant_pad_nd - func: contiguous(Tensor(a) self, *, MemoryFormat memory_format=contiguous_format) -> Tensor(a) - use_c10_dispatcher: full variants: method manual_cpp_binding: True @@ -1220,7 +1064,6 @@ DefaultBackend: convolution_overrideable - func: convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - use_c10_dispatcher: full dispatch: DefaultBackend: convolution_backward_overrideable @@ -1246,12 +1089,10 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int pad=0) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: conv_tbc - func: conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int pad) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full # NB: we inherit the goofy argument order from PyTorch torch.nn.functional - func: conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, int[1] padding=0, int[1] output_padding=0, int groups=1, int[1] dilation=1) -> Tensor @@ -1264,24 +1105,20 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: DefaultBackend: copy_ - func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full dispatch: {} - func: cos(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cos - func: cos_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cos_ @@ -1292,13 +1129,11 @@ CPU, CUDA: cos_out - func: cosh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cosh - func: cosh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cosh_ @@ -1309,28 +1144,23 @@ CPU, CUDA: cosh_out - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor - use_c10_dispatcher: full - func: count_nonzero.dim_IntList(Tensor self, int[] dim) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: count_nonzero - func: count_nonzero(Tensor self, int? dim=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: count_nonzero - func: cudnn_affine_grid_generator(Tensor theta, int N, int C, int H, int W) -> Tensor grid - use_c10_dispatcher: full dispatch: CUDA: cudnn_affine_grid_generator_forward # TODO: Why do I have to call this grad?! 
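complex() and polar(), declared above, construct complex tensors from real/imaginary parts or from magnitude/angle pairs:

```python
import torch

re, im = torch.tensor([1.0]), torch.tensor([2.0])
print(torch.complex(re, im))                                  # tensor([1.+2.j])
print(torch.polar(torch.tensor([2.0]), torch.tensor([0.0])))  # tensor([2.+0.j])
```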
- func: cudnn_affine_grid_generator_backward(Tensor grad, int N, int C, int H, int W) -> Tensor grad_theta - use_c10_dispatcher: full dispatch: CUDA: cudnn_affine_grid_generator_backward @@ -1351,27 +1181,22 @@ CUDA: cudnn_convolution_deprecated - func: cudnn_convolution.deprecated2(Tensor self, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_deprecated2 - func: cudnn_convolution(Tensor self, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution - func: cudnn_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_backward_input - func: cudnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32, bool[2] output_mask) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_backward - func: cudnn_convolution_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_backward_weight @@ -1381,45 +1206,37 @@ CUDA: cudnn_convolution_transpose_deprecated - func: cudnn_convolution_transpose.deprecated2(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose_deprecated2 - func: cudnn_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose # NB: output_padding not strictly needed here, but it's helpful for the float # backwards - func: cudnn_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32, bool[2] output_mask) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose_backward - func: cudnn_convolution_transpose_backward_input(Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose_backward_input - func: cudnn_convolution_transpose_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose_backward_weight # NB: input is special cased in a way I don't quite understand - func: cudnn_grid_sampler(Tensor self, Tensor grid) -> Tensor output - use_c10_dispatcher: full dispatch: CUDA: cudnn_grid_sampler_forward - func: cudnn_grid_sampler_backward(Tensor self, Tensor grid, Tensor 
grad_output) -> (Tensor grad_self, Tensor grad_grid) - use_c10_dispatcher: full dispatch: CUDA: cudnn_grid_sampler_backward - func: cummax(Tensor self, int dim) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cummax @@ -1430,7 +1247,6 @@ DefaultBackend: cummax_out - func: cummax.dimname(Tensor self, Dimname dim) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: cummax.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) @@ -1444,7 +1260,6 @@ CUDA: cummax_helper_cuda - func: cummin(Tensor self, int dim) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cummin @@ -1455,7 +1270,6 @@ DefaultBackend: cummin_out - func: cummin.dimname(Tensor self, Dimname dim) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: cummin.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) @@ -1469,18 +1283,15 @@ CUDA: cummin_helper_cuda - func: cummaxmin_backward(Tensor grad, Tensor input, Tensor indices, int dim) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False - func: cumprod(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cumprod - func: cumprod_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: cumprod_ @@ -1491,29 +1302,24 @@ DefaultBackend: cumprod_out - func: cumprod.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: cumprod_.dimname(Tensor(a!) self, Dimname dim, *, ScalarType? dtype=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: cumprod.dimname_out(Tensor self, Dimname dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: cumprod_backward(Tensor grad, Tensor input, int dim) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False - func: cumsum(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cumsum - func: cumsum_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: cumsum_ @@ -1524,137 +1330,111 @@ DefaultBackend: cumsum_out - func: cumsum.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: cumsum_.dimname(Tensor(a!) self, Dimname dim, *, ScalarType? dtype=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: cumsum.dimname_out(Tensor self, Dimname dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) 
use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: ctc_loss.IntList(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, int reduction=Mean, bool zero_infinity=False) -> Tensor - use_c10_dispatcher: full # convenience function that converts to intlists for you - func: ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank=0, int reduction=Mean, bool zero_infinity=False) -> Tensor - use_c10_dispatcher: full - func: _ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CPU: ctc_loss_cpu CUDA: ctc_loss_gpu - func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor - use_c10_dispatcher: full dispatch: CPU: ctc_loss_backward_cpu CUDA: ctc_loss_backward_gpu - func: diag_embed(Tensor self, int offset=0, int dim1=-2, int dim2=-1) -> Tensor - use_c10_dispatcher: full variants: function, method - func: diagflat(Tensor self, int offset=0) -> Tensor - use_c10_dispatcher: full variants: function, method - func: diagonal(Tensor(a) self, int offset=0, int dim1=0, int dim2=1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: diagonal - func: diagonal.Dimname(Tensor(a) self, *, Dimname outdim, Dimname dim1, Dimname dim2, int offset=0) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: diagonal_backward(Tensor grad, int[] input_sizes, int offset, int dim1, int dim2) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False - func: fill_diagonal_(Tensor(a!) self, Scalar fill_value, bool wrap=False) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: div.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: div SparseCPU, SparseCUDA: div_sparse - func: div_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: div_ SparseCPU, SparseCUDA: div_sparse_ - func: div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) - use_c10_dispatcher: full dispatch: CPU, CUDA: div_out SparseCPU, SparseCUDA: div_out_sparse_zerodim # For C++ only, until we have conversion from C++ numbers to Tensor - func: div.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: div - func: div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: div_ # divide, alias for div - func: divide.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: divide.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method # true_divide, an alias for div - func: true_divide.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: true_divide.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: true_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: dot(Tensor self, Tensor tensor) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: dot @@ -1666,7 +1446,6 @@ DefaultBackend: dot_out - func: vdot(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: vdot @@ -1678,30 +1457,24 @@ DefaultBackend: vdot_out - func: einsum(str equation, Tensor[] tensors) -> Tensor - use_c10_dispatcher: full - func: embedding(Tensor weight, Tensor indices, int padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: embedding - func: embedding_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor - use_c10_dispatcher: full - func: embedding_dense_backward(Tensor grad_output, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor - use_c10_dispatcher: full dispatch: CPU: embedding_dense_backward_cpu CUDA: embedding_dense_backward_cuda - func: embedding_renorm_(Tensor(a!) self, Tensor indices, float max_norm, float norm_type) -> Tensor(a!) - use_c10_dispatcher: full dispatch: CPU: embedding_renorm_cpu_ CUDA: embedding_renorm_cuda_ - func: embedding_sparse_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor - use_c10_dispatcher: full # NOTE [ embedding_bag Native Functions ] # The `_embedding_bag.*` variants assume that input tensors except for `weight`, @@ -1720,11 +1493,9 @@ CUDA: _embedding_bag_forward_only_cuda - func: rowwise_prune(Tensor weight, Tensor mask, ScalarType compressed_indices_dtype) -> (Tensor, Tensor) - use_c10_dispatcher: full # row_stack is the alias of vstack - func: row_stack(Tensor[] tensors) -> Tensor - use_c10_dispatcher: full dispatch: Math: row_stack @@ -1755,20 +1526,17 @@ CUDA: _embedding_bag_dense_backward_cuda - func: _embedding_bag_per_sample_weights_backward(Tensor grad, Tensor weight, Tensor indices, Tensor offsets, Tensor offset2bag, int mode) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _embedding_bag_per_sample_weights_backward_cpu CUDA: _embedding_bag_per_sample_weights_backward_cuda - func: empty_meta(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor - use_c10_dispatcher: full - func: empty.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor use_c10_dispatcher: hacky_wrapper_for_legacy_signatures device_guard: False - func: empty.memory_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? 
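divide and true_divide, whose schemas appear above, are aliases of div; all three produce the same floating-point quotient:

```python
import torch

x = torch.tensor([2.0, 4.0, 6.0])
print(torch.div(x, 2))          # tensor([1., 2., 3.])
print(torch.divide(x, 2))       # alias of div
print(torch.true_divide(x, 2))  # alias of div
```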
memory_format=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU: empty_cpu CUDA: empty_cuda @@ -1776,7 +1544,6 @@ SparseCPU, SparseCUDA: empty_sparse - func: new_empty(Tensor self, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - use_c10_dispatcher: full variants: method - func: new_empty_strided(Tensor self, int[] size, int[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -1808,7 +1575,6 @@ QuantizedCPU, QuantizedCUDA: empty_per_channel_affine_quantized - func: resize_(Tensor(a!) self, int[] size, *, MemoryFormat? memory_format=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: @@ -1818,7 +1584,6 @@ Meta: resize_meta_ - func: empty_quantized(int[] size, Tensor qtensor) -> Tensor - use_c10_dispatcher: full variants: function dispatch: QuantizedCPU, QuantizedCUDA: empty_quantized @@ -1832,19 +1597,16 @@ device_guard: False - func: empty_strided(int[] size, int[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU: empty_strided_cpu CUDA: empty_strided_cuda - func: erf(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: erf - func: erf_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: erf_ @@ -1855,13 +1617,11 @@ CPU, CUDA: erf_out - func: erfc(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: erfc - func: erfc_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: erfc_ @@ -1872,13 +1632,11 @@ CPU, CUDA: erfc_out - func: exp(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: exp - func: exp_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: exp_ @@ -1889,13 +1647,11 @@ CPU, CUDA: exp_out - func: exp2(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: exp2 - func: exp2_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: exp2_ @@ -1906,13 +1662,11 @@ CPU, CUDA: exp2_out - func: expm1(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: expm1 - func: expm1_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: expm1_ @@ -1923,14 +1677,12 @@ CPU, CUDA: expm1_out - func: expand(Tensor(a) self, int[] size, *, bool implicit=False) -> Tensor(a) - use_c10_dispatcher: full variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. device_guard: False dispatch: DefaultBackend: expand - func: expand_as(Tensor(a) self, Tensor other) -> Tensor(a) - use_c10_dispatcher: full variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. 
device_guard: False @@ -1953,49 +1705,39 @@ CUDA: eye_out_cuda - func: flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: flatten.named_out_dim(Tensor(a) self, int start_dim, int end_dim, Dimname out_dim) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: flatten.using_names(Tensor(a) self, Dimname start_dim, Dimname end_dim, Dimname out_dim) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: flatten.DimnameList(Tensor(a) self, Dimname[] dims, Dimname out_dim) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: unflatten.int(Tensor(a) self, int dim, int[] sizes, Dimname[]? names=None) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: unflatten.Dimname(Tensor(a) self, Dimname dim, int[] sizes, Dimname[] names) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: fill_ - func: fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: fill_ - func: floor(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: floor - func: floor_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: floor_ @@ -2006,14 +1748,12 @@ CPU, CUDA: floor_out - func: floor_divide(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: floor_divide SparseCPU, SparseCUDA: floor_divide_sparse - func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: floor_divide_ @@ -2026,21 +1766,17 @@ SparseCPU, SparseCUDA: floor_divide_out_sparse_zerodim - func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: floor_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: frac(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: frac - func: frac_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: frac_ @@ -2074,11 +1810,9 @@ CPU, CUDA: gcd_out - func: gcd(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: gcd_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: lcm.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -2087,11 +1821,9 @@ CPU, CUDA: lcm_out - func: lcm(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: lcm_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method # NOTE [ grid_sampler Native Functions ] @@ -2110,37 +1842,30 @@ # Nor does it take in `align_corners` because it only supports the mode # `align_corners = True`. 
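The NOTE above spells out the contract of the `grid_sampler*` kernels declared next. From Python they are reached through `torch.nn.functional.grid_sample`; a minimal sketch of typical usage (ordinary public API, not taken from this diff):

```python
import torch
import torch.nn.functional as F

inp = torch.randn(1, 3, 8, 8)             # N, C, H_in, W_in
# grid holds (x, y) sampling locations normalized to [-1, 1]
grid = torch.rand(1, 16, 16, 2) * 2 - 1   # N, H_out, W_out, 2

out = F.grid_sample(inp, grid, mode="bilinear",
                    padding_mode="zeros", align_corners=False)
assert out.shape == (1, 3, 16, 16)
```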
- func: grid_sampler(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor
-  use_c10_dispatcher: full

- func: grid_sampler_2d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor
-  use_c10_dispatcher: full
  dispatch:
    CPU: grid_sampler_2d_cpu
    CUDA: grid_sampler_2d_cuda

- func: grid_sampler_2d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor)
-  use_c10_dispatcher: full
  dispatch:
    CPU: grid_sampler_2d_backward_cpu
    CUDA: grid_sampler_2d_backward_cuda

# See NOTE [ grid_sample CPU fallback ]
- func: _grid_sampler_2d_cpu_fallback(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor
-  use_c10_dispatcher: full
  dispatch:
    DefaultBackend: _grid_sampler_2d_cpu_fallback

- func: _grid_sampler_2d_cpu_fallback_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor)
-  use_c10_dispatcher: full

- func: grid_sampler_3d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor
-  use_c10_dispatcher: full
  dispatch:
    CPU: grid_sampler_3d_cpu
    CUDA: grid_sampler_3d_cuda

- func: grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor)
-  use_c10_dispatcher: full
  dispatch:
    CPU: grid_sampler_3d_backward_cpu
    CUDA: grid_sampler_3d_backward_cuda

@@ -2173,7 +1898,6 @@
  use_c10_dispatcher: hacky_wrapper_for_legacy_signatures

- func: hinge_embedding_loss(Tensor self, Tensor target, float margin=1.0, int reduction=Mean) -> Tensor
-  use_c10_dispatcher: full

- func: group_norm(Tensor input, int num_groups, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enabled=True) -> Tensor
  use_c10_dispatcher: hacky_wrapper_for_legacy_signatures

@@ -2191,42 +1915,55 @@
# Real to complex forward FFT
- func: _fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor
-  use_c10_dispatcher: full
  variants: function
  dispatch:
    CPU: _fft_r2c_mkl
    CUDA: _fft_r2c_cufft

+- func: _fft_r2c.out(Tensor self, int[] dim, int normalization, bool onesided, *, Tensor(a!) out) -> Tensor(a!)
+  use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
+  variants: function
+  dispatch:
+    CPU: _fft_r2c_mkl_out
+    CUDA: _fft_r2c_cufft_out
+
# Complex to real inverse FFT
- func: _fft_c2r(Tensor self, int[] dim, int normalization, int last_dim_size) -> Tensor
-  use_c10_dispatcher: full
  variants: function
  dispatch:
    CPU: _fft_c2r_mkl
    CUDA: _fft_c2r_cufft

+- func: _fft_c2r.out(Tensor self, int[] dim, int normalization, int last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
+  use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
+  variants: function
+  dispatch:
+    CPU: _fft_c2r_mkl_out
+    CUDA: _fft_c2r_cufft_out
+
# Standard complex to complex FFT (forward or backward)
- func: _fft_c2c(Tensor self, int[] dim, int normalization, bool forward) -> Tensor
-  use_c10_dispatcher: full
  variants: function
  dispatch:
    CPU: _fft_c2c_mkl
    CUDA: _fft_c2c_cufft

+- func: _fft_c2c.out(Tensor self, int[] dim, int normalization, bool forward, *, Tensor(a!) out) -> Tensor(a!)
+  use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
+  variants: function
+  dispatch:
+    CPU: _fft_c2c_mkl_out
+    CUDA: _fft_c2c_cufft_out
+
- func: _cufft_get_plan_cache_size(int device_index) -> int
-  use_c10_dispatcher: full

- func: _cufft_get_plan_cache_max_size(int device_index) -> int
-  use_c10_dispatcher: full

- func: _cufft_set_plan_cache_max_size(int device_index, int max_size) -> ()
-  use_c10_dispatcher: full

- func: _cufft_clear_plan_cache(int device_index) -> ()
-  use_c10_dispatcher: full

- func: index.Tensor(Tensor self, Tensor?[] indices) -> Tensor
-  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    CPU, CUDA: index

@@ -2237,25 +1974,20 @@
# - Tensor Tensor::index(std::initializer_list indices)
- func: index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!)
-  use_c10_dispatcher: full
  variants: method
  dispatch:
    DefaultBackend: index_copy_

- func: index_copy(Tensor self, int dim, Tensor index, Tensor source) -> Tensor
-  use_c10_dispatcher: full
  variants: function, method

- func: index_copy_.dimname(Tensor(a!) self, Dimname dim, Tensor index, Tensor source) -> Tensor(a!)
-  use_c10_dispatcher: full
  variants: method

- func: index_copy.dimname(Tensor self, Dimname dim, Tensor index, Tensor source) -> Tensor
-  use_c10_dispatcher: full
  variants: function, method

- func: index_put_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor(a!)
-  use_c10_dispatcher: full
  variants: function, method
  dispatch:
    DefaultBackend: index_put_

@@ -2266,11 +1998,9 @@
# - Tensor & Tensor::index_put_(std::initializer_list indices, Scalar v)
- func: index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor
-  use_c10_dispatcher: full
  variants: function, method

- func: _index_put_impl_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False, bool unsafe=False) -> Tensor(a!)
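Returning to the `_fft_r2c.out` / `_fft_c2r.out` / `_fft_c2c.out` entries added in the hunk above (the `_index_put_impl_` entry continues just below): they register out-variants for the internal kernels behind `torch.fft`, covering the real-to-complex, complex-to-real, and complex-to-complex cases. A rough sketch of the corresponding public-facing calls; the `out=` keyword on the `torch.fft` wrappers is assumed here rather than taken from this diff:

```python
import torch

x = torch.randn(64)                  # real input
X = torch.fft.rfft(x)                # real -> complex (r2c); 64 // 2 + 1 = 33 bins
x_back = torch.fft.irfft(X, n=64)    # complex -> real (c2r)
Y = torch.fft.fft(X)                 # complex -> complex (c2c)

# Assumed out= usage that the new .out schemas are meant to support:
out = torch.empty(33, dtype=torch.complex64)
torch.fft.rfft(x, out=out)
```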
- use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: _index_put_impl_ @@ -2280,7 +2010,6 @@ variants: function - func: inverse(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: inverse @@ -2291,18 +2020,15 @@ DefaultBackend: inverse_out - func: _inverse_helper(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _inverse_helper_cpu CUDA: _inverse_helper_cuda - func: isclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: isnan(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: @@ -2310,52 +2036,42 @@ SparseCPU, SparseCUDA: isnan_sparse - func: is_distributed(Tensor self) -> bool - use_c10_dispatcher: full variants: function, method device_guard: False - func: is_floating_point(Tensor self) -> bool - use_c10_dispatcher: full variants: function, method device_guard: False - func: is_complex(Tensor self) -> bool - use_c10_dispatcher: full variants: function, method device_guard: False - func: isreal(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: is_nonzero(Tensor self) -> bool - use_c10_dispatcher: full variants: function, method device_guard: False - func: is_same_size(Tensor self, Tensor other) -> bool - use_c10_dispatcher: full variants: function, method device_guard: False - func: is_signed(Tensor self) -> bool - use_c10_dispatcher: full variants: function, method device_guard: False - func: kl_div(Tensor self, Tensor target, int reduction=Mean, *, bool log_target=False) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: kl_div - func: kl_div_backward(Tensor grad_output, Tensor self, Tensor target, int reduction=Mean, *, bool log_target=False) -> Tensor - use_c10_dispatcher: full dispatch: CPU: kl_div_backward_cpu CUDA: kl_div_backward_cuda - func: kron(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: Math: kron @@ -2366,7 +2082,6 @@ Math: kron_out - func: kthvalue(Tensor self, int k, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: kthvalue @@ -2378,7 +2093,6 @@ CUDA: kthvalue_out_cuda - func: kthvalue.dimname(Tensor self, int k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: kthvalue.dimname_out(Tensor self, int k, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) @@ -2401,13 +2115,11 @@ CUDA: layer_norm_backward_cuda - func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: nan_to_num - func: nan_to_num_(Tensor(a!) self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor(a!) 
- use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: nan_to_num_ @@ -2428,35 +2140,25 @@ MkldnnCPU: mkldnn_linear - func: fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor - use_c10_dispatcher: full - func: fbgemm_linear_int8_weight(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor - use_c10_dispatcher: full - func: fbgemm_linear_quantize_weight(Tensor input) -> (Tensor, Tensor, float, int) - use_c10_dispatcher: full - func: fbgemm_pack_gemm_matrix_fp16(Tensor input) -> Tensor - use_c10_dispatcher: full - func: fbgemm_linear_fp16_weight_fp32_activation(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor - use_c10_dispatcher: full - func: fbgemm_linear_fp16_weight(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor - use_c10_dispatcher: full - func: fbgemm_pack_quantized_matrix(Tensor input) -> Tensor - use_c10_dispatcher: full - func: fbgemm_pack_quantized_matrix.KN(Tensor input, int K, int N) -> Tensor - use_c10_dispatcher: full - func: ldexp.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: ldexp_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: ldexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -2472,13 +2174,11 @@ CUDA: linspace_cuda_out - func: log(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: log - func: log_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: log_ @@ -2489,13 +2189,11 @@ CPU, CUDA: log_out - func: log10(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: log10 - func: log10_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: log10_ @@ -2506,14 +2204,12 @@ CPU, CUDA: log10_out - func: log1p(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: log1p SparseCPU, SparseCUDA: log1p_sparse - func: log1p_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: log1p_ @@ -2526,13 +2222,11 @@ SparseCPU, SparseCUDA: log1p_out_sparse - func: log2(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: log2 - func: log2_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: log2_ @@ -2548,7 +2242,6 @@ CPU, CUDA: logaddexp_out - func: logaddexp(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: logaddexp @@ -2559,7 +2252,6 @@ CPU, CUDA: logaddexp2_out - func: logaddexp2(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: logaddexp2 @@ -2615,7 +2307,6 @@ CPU, CUDA: xlogy_out - func: logdet(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: logdet @@ -2631,27 +2322,22 @@ # log_softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models. - func: log_softmax.int(Tensor self, int dim, ScalarType? 
dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: _log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor - use_c10_dispatcher: full dispatch: CPU: log_softmax_cpu CUDA: log_softmax_cuda - func: _log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor - use_c10_dispatcher: full dispatch: CPU: log_softmax_backward_cpu CUDA: log_softmax_backward_cuda - func: _logcumsumexp(Tensor self, int dim) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _logcumsumexp_cpu CUDA: _logcumsumexp_cuda @@ -2663,7 +2349,6 @@ CUDA: _logcumsumexp_out_cuda - func: logcumsumexp(Tensor self, int dim) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: logcumsumexp @@ -2674,14 +2359,12 @@ DefaultBackend: logcumsumexp_out - func: logcumsumexp.dimname(Tensor self, Dimname dim) -> Tensor - use_c10_dispatcher: full variants: function, method - func: logcumsumexp.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: logsumexp(Tensor self, int[1] dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: logsumexp @@ -2692,55 +2375,44 @@ DefaultBackend: logsumexp_out - func: logsumexp.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: logsumexp.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: margin_ranking_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor - use_c10_dispatcher: full - func: matmul(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
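Alongside the `matmul` entries just above (whose remaining fields continue below), a quick reminder of the shape rules `torch.matmul` follows for vector, matrix, and batched operands; plain public API, not part of this change:

```python
import torch

v = torch.randn(3)
A = torch.randn(2, 3)
B = torch.randn(3, 4)
batch = torch.randn(10, 2, 3)

assert torch.matmul(v, v).shape == ()               # 1-D x 1-D -> 0-d dot product
assert torch.matmul(A, B).shape == (2, 4)           # matrix x matrix
assert torch.matmul(batch, B).shape == (10, 2, 4)   # batched; B is broadcast
```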
use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: matrix_rank.tol(Tensor self, float tol, bool symmetric=False) -> Tensor - use_c10_dispatcher: full - func: matrix_rank(Tensor self, bool symmetric=False) -> Tensor - use_c10_dispatcher: full - func: matrix_power(Tensor self, int n) -> Tensor - use_c10_dispatcher: full variants: function, method - func: matrix_exp(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: matrix_exp - func: matrix_exp_backward(Tensor self, Tensor grad) -> Tensor - use_c10_dispatcher: full - func: _aminmax(Tensor self) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: _aminmax_all - func: _aminmax.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: _aminmax - func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: _compute_linear_combination @@ -2750,7 +2422,6 @@ CPU, CUDA: _compute_linear_combination_out - func: max.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: max @@ -2761,19 +2432,16 @@ CPU, CUDA: max_out - func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: max.names_dim_max(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: value_selecting_reduction_backward(Tensor grad, int dim, Tensor indices, int[] sizes, bool keepdim) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False - func: amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: amax @@ -2785,48 +2453,38 @@ # Return: (Tensor output, Tensor indices) - func: max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor - use_c10_dispatcher: full - func: max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor - use_c10_dispatcher: full - func: mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor - use_c10_dispatcher: full dispatch: MkldnnCPU: mkldnn_max_pool2d - func: mkldnn_max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor - use_c10_dispatcher: full dispatch: MkldnnCPU: mkldnn_max_pool3d - func: quantized_max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor - use_c10_dispatcher: full dispatch: QuantizedCPU: quantized_max_pool1d - func: quantized_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor - use_c10_dispatcher: full dispatch: QuantizedCPU: quantized_max_pool2d - func: max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] 
padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor - use_c10_dispatcher: full # The CPU and GPU dispatch variants are named weirdly here because otherwise there # are namespacing issues in C++ - func: mean(Tensor self, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: mean_cpu_gpu QuantizedCPU: mean_quantized_cpu - func: mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: mean_cpu_gpu @@ -2839,21 +2497,18 @@ QuantizedCPU: mean_out_quantized_cpu - func: mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: mean.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: median(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: median_cpu CUDA: median_cuda - func: median.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: median @@ -2865,21 +2520,18 @@ CUDA: median_out_cuda - func: median.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: median.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: nanmedian(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: nanmedian_cpu CUDA: nanmedian_cuda - func: nanmedian.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: nanmedian @@ -2891,14 +2543,12 @@ CUDA: nanmedian_out_cuda - func: nanmedian.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: nanmedian.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: min.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: min @@ -2909,14 +2559,12 @@ CPU, CUDA: min_out - func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: min.names_dim_min(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) 
indices) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: amin @@ -2932,13 +2580,10 @@ DefaultBackend: mkldnn_convolution - func: mkldnn_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool bias_defined) -> Tensor - use_c10_dispatcher: full - func: mkldnn_convolution_backward_weights(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool bias_defined) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: mkldnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full dispatch: DefaultBackend: mkldnn_convolution_backward @@ -2958,22 +2603,18 @@ CUDA: miopen_convolution - func: miopen_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: miopen_convolution_backward_input - func: miopen_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: miopen_convolution_backward - func: miopen_convolution_backward_bias(Tensor grad_output) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: miopen_convolution_backward_bias - func: miopen_convolution_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: miopen_convolution_backward_weight @@ -2985,17 +2626,14 @@ # NB: output_padding not strictly needed here, but it's helpful for the float # backwards - func: miopen_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: miopen_convolution_transpose_backward - func: miopen_convolution_transpose_backward_input(Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: miopen_convolution_transpose_backward_input - func: miopen_convolution_transpose_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: miopen_convolution_transpose_backward_weight @@ -3005,17 +2643,14 @@ CUDA: miopen_depthwise_convolution - func: miopen_depthwise_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: miopen_depthwise_convolution_backward_input - func: miopen_depthwise_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool 
benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: miopen_depthwise_convolution_backward - func: miopen_depthwise_convolution_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: miopen_depthwise_convolution_backward_weight @@ -3030,7 +2665,6 @@ CUDA: miopen_rnn_backward - func: mm(Tensor self, Tensor mat2) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: mm_cpu @@ -3045,7 +2679,6 @@ SparseCPU, SparseCUDA: _sparse_mm_out - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor - use_c10_dispatcher: full - func: _sparse_sparse_matmul(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full @@ -3059,7 +2692,6 @@ SparseCUDA: sparse_matrix_mask_helper_cuda - func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: mode @@ -3070,14 +2702,12 @@ DefaultBackend: mode_out - func: mode.dimname(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: mode.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: mul.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: mul @@ -3085,7 +2715,6 @@ MkldnnCPU: mkldnn_mul - func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: mul_ @@ -3102,39 +2731,32 @@ # For C++ only, until we have conversion from C++ numbers to Tensor - func: mul.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: mul - func: mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: mul_ # multiply, alias for mul - func: multiply.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: multiply_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: multiply.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: multiply.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: multiply_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: mv(Tensor self, Tensor vec) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: mv @@ -3146,31 +2768,26 @@ DefaultBackend: mv_out - func: mvlgamma(Tensor self, int p) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: mvlgamma - func: mvlgamma_(Tensor(a!) self, int p) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method dispatch: DefaultBackend: mvlgamma_ - func: narrow_copy(Tensor self, int dim, int start, int length) -> Tensor - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: narrow_copy_dense SparseCPU, SparseCUDA: narrow_copy_sparse - func: narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False - func: narrow.Tensor(Tensor(a) self, int dim, Tensor start, int length) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False @@ -3187,7 +2804,6 @@ CUDA: batch_norm_cuda_out - func: batch_norm_stats(Tensor input, float eps) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: batch_norm_stats_cuda @@ -3235,10 +2851,8 @@ CUDA: batch_norm_update_stats_cuda - func: is_vulkan_available() -> bool - use_c10_dispatcher: full - func: _nnpack_available() -> bool - use_c10_dispatcher: full - func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, int[2] padding, int[2] stride=1) -> Tensor use_c10_dispatcher: hacky_wrapper_for_legacy_signatures @@ -3247,15 +2861,12 @@ DefaultBackend: _nnpack_spatial_convolution - func: _nnpack_spatial_convolution_backward(Tensor input, Tensor grad_output, Tensor weight, int[2] padding, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function - func: _nnpack_spatial_convolution_backward_input(Tensor input, Tensor grad_output, Tensor weight, int[2] padding) -> Tensor - use_c10_dispatcher: full variants: function - func: _nnpack_spatial_convolution_backward_weight(Tensor input, int[] weightsize, Tensor grad_output, int[2] padding) -> Tensor - use_c10_dispatcher: full variants: function - func: ones.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -3272,64 +2883,50 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: pairwise_distance(Tensor x1, Tensor x2, float p=2, float eps=1e-06, bool keepdim=False) -> Tensor - use_c10_dispatcher: full - func: cdist(Tensor x1, Tensor x2, float p=2, int? compute_mode=None) -> Tensor - use_c10_dispatcher: full - func: _euclidean_dist(Tensor x1, Tensor x2) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: _euclidean_dist - func: _cdist_forward(Tensor x1, Tensor x2, float p, int? compute_mode) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: _cdist_forward - func: _cdist_backward(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: _cdist_backward - func: pdist(Tensor self, float p=2) -> Tensor - use_c10_dispatcher: full - func: _pdist_forward(Tensor self, float p=2) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: _pdist_forward - func: _pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: _pdist_backward - func: cosine_similarity(Tensor x1, Tensor x2, int dim=1, float eps=1e-08) -> Tensor - use_c10_dispatcher: full variants: function - func: permute(Tensor(a) self, int[] dims) -> Tensor(a) - use_c10_dispatcher: full variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. 
dispatch: DefaultBackend: permute - func: movedim.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: movedim.int(Tensor(a) self, int source, int destination) -> Tensor(a) - use_c10_dispatcher: full variants: function, method # moveaxis, alias for movedim - func: moveaxis.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: moveaxis.int(Tensor(a) self, int source, int destination) -> Tensor(a) - use_c10_dispatcher: full variants: function, method # Only exposed from C++ -- in Python, @@ -3340,45 +2937,36 @@ # behavior on Windows, for reasons I don't understand # (maybe related to capital letter collation somehow...) - func: numpy_T(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: pixel_shuffle(Tensor self, int upscale_factor) -> Tensor - use_c10_dispatcher: full - func: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor use_c10_dispatcher: full - func: channel_shuffle(Tensor self, int groups) -> Tensor - use_c10_dispatcher: full dispatch: CPU: channel_shuffle QuantizedCPU: channel_shuffle_quantized_cpu - func: is_pinned(Tensor self) -> bool - use_c10_dispatcher: full variants: method - func: pin_memory(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: pinverse(Tensor self, float rcond=1e-15) -> Tensor - use_c10_dispatcher: full variants: function, method - func: poisson_nll_loss(Tensor input, Tensor target, bool log_input, bool full, float eps, int reduction) -> Tensor - use_c10_dispatcher: full variants: function - func: rad2deg(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: rad2deg - func: rad2deg_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: rad2deg_ @@ -3389,13 +2977,11 @@ DefaultBackend: rad2deg_out - func: deg2rad(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: deg2rad - func: deg2rad_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: deg2rad_ @@ -3512,17 +3098,14 @@ CUDA: range_cuda_out - func: ravel(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: reciprocal(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: reciprocal - func: reciprocal_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: reciprocal_ @@ -3533,13 +3116,11 @@ CPU, CUDA: reciprocal_out - func: neg(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: neg - func: neg_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: neg_ @@ -3553,61 +3134,50 @@ # Alias for neg - func: negative(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: negative_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: negative.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: repeat(Tensor self, int[] repeats) -> Tensor - use_c10_dispatcher: full variants: method # This is method-only to match the previous tensor API. In the future we could make this a function too. 
dispatch: DefaultBackend: repeat - func: repeat_interleave.Tensor(Tensor repeats) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: repeat_interleave_cpu CUDA: repeat_interleave_cuda - func: repeat_interleave.self_Tensor(Tensor self, Tensor repeats, int? dim=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: repeat_interleave.self_int(Tensor self, int repeats, int? dim=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: reshape(Tensor(a) self, int[] shape) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False - func: _mkldnn_reshape(Tensor self, int[] shape) -> Tensor - use_c10_dispatcher: full device_guard: False dispatch: MkldnnCPU: mkldnn_reshape - func: reshape_as(Tensor(a) self, Tensor other) -> Tensor(a) - use_c10_dispatcher: full variants: method device_guard: False - func: round(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: round - func: round_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: round_ @@ -3619,13 +3189,10 @@ CUDA: round_out - func: rrelu(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor - use_c10_dispatcher: full - func: rrelu_(Tensor(a!) self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full - func: relu(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: relu @@ -3633,7 +3200,6 @@ QuantizedCPU: relu_quantized_cpu - func: relu_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: relu_ @@ -3641,59 +3207,50 @@ QuantizedCPU: relu_quantized_cpu_ - func: prelu(Tensor self, Tensor weight) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: prelu_cpu CUDA: prelu_cuda - func: prelu_backward(Tensor grad_output, Tensor self, Tensor weight) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function, method dispatch: CPU: prelu_backward_cpu CUDA: prelu_backward_cuda - func: gelu(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: gelu_cpu CUDA: gelu_cuda - func: gelu_backward(Tensor grad, Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: gelu_backward_cpu CUDA: gelu_backward_cuda - func: infinitely_differentiable_gelu_backward(Tensor grad, Tensor self) -> Tensor - use_c10_dispatcher: full variants: function python_module: nn device_guard: False - func: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: hardshrink - func: hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: hardshrink_backward - func: rsqrt(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: rsqrt - func: rsqrt_(Tensor(a!) self) -> Tensor(a!) 
- use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: rsqrt_ @@ -3704,46 +3261,37 @@ CPU, CUDA: rsqrt_out - func: select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False - func: select.int(Tensor(a) self, int dim, int index) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: select - func: select_backward(Tensor grad, int[] input_sizes, int dim, int index) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False - func: selu(Tensor self) -> Tensor - use_c10_dispatcher: full - func: selu_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full - func: celu(Tensor self, Scalar alpha=1.0) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: celu - func: celu_(Tensor(a!) self, Scalar alpha=1.0) -> Tensor(a!) - use_c10_dispatcher: full dispatch: DefaultBackend: celu_ - func: silu(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: silu - func: silu_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: silu_ @@ -3755,14 +3303,12 @@ CPU, CUDA: silu_out - func: silu_backward(Tensor grad_output, Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: silu_backward Math: math_silu_backward - func: sigmoid(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: sigmoid @@ -3770,7 +3316,6 @@ MkldnnCPU: mkldnn_sigmoid - func: sigmoid_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: sigmoid_ @@ -3782,13 +3327,11 @@ CPU, CUDA: sigmoid_out - func: logit(Tensor self, float? eps=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: logit - func: logit_(Tensor(a!) self, float? eps=None) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: logit_ @@ -3799,13 +3342,11 @@ CPU, CUDA: logit_out - func: sin(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sin - func: sin_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sin_ @@ -3833,13 +3374,11 @@ CPU, CUDA: sinc_out - func: sinh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sinh - func: sinh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sinh_ @@ -3861,7 +3400,6 @@ # changing metadata of the detached tensor and expecting the original tensor to also # be updated. - func: detach(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: detach @@ -3870,134 +3408,112 @@ # only be called on non-view `Variable`s. You can use `is_view()` to check # this. If this `Variable` is a view, throws an `std::runtime_error()`. - func: detach_(Tensor(a!) self) -> Tensor(a!) 
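The comments above describe `detach`: the result shares storage with `self` but is removed from the autograd graph, and `detach_` does the same in place (the schema's remaining fields continue below). A brief illustration using only public API:

```python
import torch

x = torch.ones(3, requires_grad=True)
y = x.detach()            # shares storage with x, but requires_grad=False

assert not y.requires_grad
y[0] = 5.0                # in-place change is visible through x (shared storage)
assert x[0].item() == 5.0
```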
- use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: detach_ - func: size.int(Tensor self, int dim) -> int - use_c10_dispatcher: full variants: function device_guard: False manual_cpp_binding: True - func: size.Dimname(Tensor self, Dimname dim) -> int - use_c10_dispatcher: full variants: function, method device_guard: False - func: slice.Tensor(Tensor(a) self, int dim=0, int start=0, int end=9223372036854775807, int step=1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: slice - func: slice_backward(Tensor grad, int[] input_sizes, int dim, int start, int end, int step) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False - func: slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: slogdet - func: smm(Tensor self, Tensor mat2) -> Tensor - use_c10_dispatcher: full variants: function, method # softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models. - func: softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: _softmax(Tensor self, int dim, bool half_to_float) -> Tensor - use_c10_dispatcher: full dispatch: CPU: softmax_cpu CUDA: softmax_cuda MkldnnCPU: mkldnn_softmax - func: _softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor - use_c10_dispatcher: full dispatch: CPU: softmax_backward_cpu CUDA: softmax_backward_cuda - func: unsafe_split.Tensor(Tensor self, int split_size, int dim=0) -> Tensor[] - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: unsafe_split - func: split.Tensor(Tensor(a) self, int split_size, int dim=0) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: split - func: unsafe_split_with_sizes(Tensor self, int[] split_sizes, int dim=0) -> Tensor[] - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: unsafe_split_with_sizes - func: split_with_sizes(Tensor(a) self, int[] split_sizes, int dim=0) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: split_with_sizes - func: squeeze(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: squeeze - func: squeeze.dim(Tensor(a) self, int dim) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: squeeze - func: squeeze.dimname(Tensor(a) self, Dimname dim) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False - func: squeeze_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: DefaultBackend: squeeze_ - func: squeeze_.dim(Tensor(a!) self, int dim) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: DefaultBackend: squeeze_ - func: squeeze_.dimname(Tensor(a!) self, Dimname dim) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method device_guard: False - func: sspaddmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method - func: sspaddmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) @@ -4009,7 +3525,6 @@ SparseCUDA: _sspaddmm_out_cuda - func: stack(Tensor[] tensors, int dim=0) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: stack @@ -4019,19 +3534,16 @@ DefaultBackend: stack_out - func: hstack(Tensor[] tensors) -> Tensor - use_c10_dispatcher: full - func: hstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: vstack(Tensor[] tensors) -> Tensor - use_c10_dispatcher: full - func: vstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: dstack(Tensor[] tensors) -> Tensor - use_c10_dispatcher: full - func: dstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures @@ -4049,30 +3561,25 @@ variants: function, method - func: stride.int(Tensor self, int dim) -> int - use_c10_dispatcher: full variants: function device_guard: False manual_cpp_binding: True - func: stride.Dimname(Tensor self, Dimname dim) -> int - use_c10_dispatcher: full variants: function, method device_guard: False - func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: sum - func: sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: sum - func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: sum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) @@ -4084,13 +3591,11 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: nansum(Tensor self, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: nansum - func: nansum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: nansum @@ -4101,18 +3606,15 @@ CPU, CUDA: nansum_out - func: sum_to_size(Tensor self, int[] size) -> Tensor - use_c10_dispatcher: full variants: method device_guard: False - func: sqrt(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sqrt - func: sqrt_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sqrt_ @@ -4123,39 +3625,32 @@ CPU, CUDA: sqrt_out - func: square(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: square_(Tensor(a!) self) -> Tensor(a!) 
- use_c10_dispatcher: full variants: function, method - func: std(Tensor self, bool unbiased=True) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: std - func: std.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: std - func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: std_mean - func: std_mean.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: std_mean - func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function - func: std.out(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) @@ -4164,20 +3659,17 @@ CPU, CUDA: std_out - func: std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: std.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: prod - func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: prod @@ -4188,34 +3680,29 @@ CPU, CUDA: prod_out - func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: prod.Dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: t(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full device_guard: False variants: function, method dispatch: DefaultBackend: t - func: t_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full device_guard: False variants: method dispatch: DefaultBackend: t_ - func: tan(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: tan - func: tan_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: tan_ @@ -4226,14 +3713,12 @@ CPU, CUDA: tan_out - func: tanh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: tanh QuantizedCPU: tanh_quantized_cpu - func: tanh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: tanh_ @@ -4244,7 +3729,6 @@ CPU, CUDA: tanh_out - func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor - use_c10_dispatcher: full variants: function - func: tensordot.out(Tensor self, Tensor other, int[] dims_self, int[] dims_other, *, Tensor(a!) out) -> Tensor(a!) @@ -4255,7 +3739,6 @@ # TODO: namespace threshold in 'nn' - func: threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: threshold @@ -4263,7 +3746,6 @@ QuantizedCPU: threshold_quantized_cpu - func: threshold_(Tensor(a!) self, Scalar threshold, Scalar value) -> Tensor(a!) 
- use_c10_dispatcher: full variants: function dispatch: CPU: threshold_ @@ -4276,69 +3758,57 @@ CUDA: threshold_out_cuda - func: threshold_backward(Tensor grad_output, Tensor self, Scalar threshold) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: threshold_backward CUDA: threshold_backward_cuda - func: tile(Tensor self, int[] dims) -> Tensor - use_c10_dispatcher: full variants: function, method - func: transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: transpose - func: transpose.Dimname(Tensor(a) self, Dimname dim0, Dimname dim1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False - func: _mkldnn_transpose(Tensor self, int dim0, int dim1) -> Tensor - use_c10_dispatcher: full device_guard: False dispatch: MkldnnCPU: mkldnn_transpose - func: transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: DefaultBackend: transpose_ - func: _mkldnn_transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!) - use_c10_dispatcher: full device_guard: False dispatch: MkldnnCPU: mkldnn_transpose_ - func: one_hot(Tensor self, int num_classes=-1) -> Tensor - use_c10_dispatcher: full python_module: nn variants: function - func: flip(Tensor self, int[] dims) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, QuantizedCPU: flip_cpu CUDA: flip_cuda - func: fliplr(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: flipud(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: roll(Tensor self, int[1] shifts, int[1] dims=[]) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: roll_cpu @@ -4347,33 +3817,26 @@ # default int[] value [0,1] should not add space after comma, since codegen parser uses ', ' to split args - func: rot90(Tensor self, int k=1, int[] dims=[0,1]) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: rot90 - func: trapz.x(Tensor y, Tensor x, *, int dim=-1) -> Tensor - use_c10_dispatcher: full - func: trapz.dx(Tensor y, *, float dx=1, int dim=-1) -> Tensor - use_c10_dispatcher: full - func: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: _trilinear - func: triplet_margin_loss(Tensor anchor, Tensor positive, Tensor negative, float margin=1.0, float p=2, float eps=1e-06, bool swap=False, int reduction=Mean) -> Tensor - use_c10_dispatcher: full - func: trunc(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: trunc - func: trunc_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: trunc_ @@ -4385,47 +3848,39 @@ # Alias for trunc - func: fix(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: fix_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: fix.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: type_as(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method - func: _has_compatible_shallow_copy_type(Tensor self, Tensor from) -> bool - use_c10_dispatcher: full variants: function - func: _unique(Tensor self, bool sorted=True, bool return_inverse=False) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _unique_cpu CUDA: _unique_cuda - func: unique_dim(Tensor self, int dim, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: unique_dim_cpu CUDA: unique_dim_cuda - func: unique_consecutive(Tensor self, bool return_inverse=False, bool return_counts=False, int? dim=None) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: unique_consecutive_cpu CUDA: unique_consecutive_cuda - func: unique_dim_consecutive(Tensor self, int dim, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: unique_dim_consecutive_cpu @@ -4436,42 +3891,35 @@ # Please don't rely on these two operators, they will be removed soon - func: _unique2(Tensor self, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _unique2_cpu CUDA: _unique2_cuda - func: _unsafe_view(Tensor self, int[] size) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: _unsafe_view - func: unsqueeze(Tensor(a) self, int dim) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: unsqueeze - func: unsqueeze_(Tensor(a!) self, int dim) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: DefaultBackend: unsqueeze_ - func: vander(Tensor x, int? N=None, bool increasing=False) -> Tensor - use_c10_dispatcher: full - func: var(Tensor self, bool unbiased=True) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: var - func: var.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: var @@ -4482,30 +3930,25 @@ CPU, CUDA: var_out - func: var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: var.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: var_mean - func: var_mean.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: var_mean - func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function - func: view_as(Tensor(a) self, Tensor other) -> Tensor(a) - use_c10_dispatcher: full variants: method device_guard: False @@ -4513,55 +3956,44 @@ # this allows us to implicitly calculate the broadcast derivative, while only dealing with the # _s_where derivative. 
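The comment above explains that the public `where` overloads broadcast their arguments and then defer to `_s_where`, so only the non-broadcasting kernel needs a derivative. For reference, the user-facing behavior (ordinary public API):

```python
import torch

cond = torch.tensor([[True, False], [False, True]])
a = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
b = torch.zeros(1, 2)                # broadcast against (2, 2)

r = torch.where(cond, a, b)          # picks from a where cond is True, else b
# tensor([[1., 0.],
#         [0., 4.]])

idx = torch.where(cond)              # single-argument form: indices of True entries
```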
- func: where.self(Tensor condition, Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function - func: where.ScalarOther(Tensor condition, Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function - func: where.Scalar(Tensor condition, Scalar self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function - func: where(Tensor condition) -> Tensor[] - use_c10_dispatcher: full variants: function - func: _s_where(Tensor condition, Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: _s_where - func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor - use_c10_dispatcher: full variants: function # VariableType::_weight_norm does not want to be given a gap in the autograd graph, # so we don't define "dispatch" variants for it. - func: _weight_norm(Tensor v, Tensor g, int dim=0) -> Tensor - use_c10_dispatcher: full variants: function - func: _weight_norm_cuda_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CUDA: weight_norm_cuda - func: _weight_norm_cuda_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CUDA: weight_norm_cuda_backward - func: _weight_norm_differentiable_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function - func: zeros.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -4578,40 +4010,34 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _standard_gamma_grad_cpu CUDA: _standard_gamma_grad_cuda - func: _standard_gamma(Tensor self, Generator? generator=None) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _s_gamma_cpu CUDA: _s_gamma_cuda - func: _dirichlet_grad(Tensor x, Tensor alpha, Tensor total) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _dirichlet_grad_cpu CUDA: _dirichlet_grad_cuda - func: _sample_dirichlet(Tensor self, Generator? generator=None) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _s_dirichlet_cpu CUDA: _s_dirichlet_cuda - func: poisson(Tensor self, Generator? generator=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _s_poisson_cpu CUDA: _s_poisson_cuda - func: binomial(Tensor count, Tensor prob, Generator? generator=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _s_binomial_cpu CUDA: _s_binomial_cuda @@ -4620,96 +4046,77 @@ # complicated - func: native_norm(Tensor self, Scalar p=2) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU, SparseCUDA: norm_sparse - func: native_norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? 
dtype) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU, SparseCUDA: norm_sparse # TODO: reduce signatures down to one when optional args is available - func: _sparse_sum(Tensor self) -> Tensor - use_c10_dispatcher: full - func: _sparse_sum.dtype(Tensor self, *, ScalarType dtype) -> Tensor - use_c10_dispatcher: full - func: _sparse_sum.dim(Tensor self, int[1] dim) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: _sparse_sum - func: _sparse_sum.dim_dtype(Tensor self, int[1] dim, *, ScalarType dtype) -> Tensor - use_c10_dispatcher: full - func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU: _sparse_sum_backward_cpu SparseCUDA: _sparse_sum_backward_cuda - func: _sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function - func: _sparse_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function - func: _sparse_softmax(Tensor self, int dim, bool half_to_float) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU: softmax_sparse_cpu SparseCUDA: softmax_sparse_cuda - func: _sparse_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU: softmax_backward_sparse_cpu SparseCUDA: softmax_backward_sparse_cuda - func: _sparse_log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function - func: _sparse_log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function - func: _sparse_log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU: log_softmax_sparse_cpu SparseCUDA: log_softmax_sparse_cuda - func: _sparse_log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU: log_softmax_backward_sparse_cpu SparseCUDA: log_softmax_backward_sparse_cuda - func: norm.ScalarOpt_dtype(Tensor self, Scalar? p, *, ScalarType dtype) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: norm - func: norm.Scalar(Tensor self, Scalar p=2) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: norm - func: norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: norm - func: norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: norm @@ -4725,11 +4132,9 @@ CPU, CUDA: norm_out - func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor - use_c10_dispatcher: full variants: function, method - func: norm.names_ScalarOpt_dim(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: norm.names_dtype_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!) 
@@ -4739,11 +4144,9 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: frobenius_norm(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function - func: frobenius_norm.dim(Tensor self, int[1] dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function - func: frobenius_norm.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) @@ -4751,7 +4154,6 @@ variants: function - func: nuclear_norm(Tensor self, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function - func: nuclear_norm.out(Tensor self, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) @@ -4759,7 +4161,6 @@ variants: function - func: nuclear_norm.dim(Tensor self, int[2] dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function - func: nuclear_norm.dim_out(Tensor self, int[2] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) @@ -4767,7 +4168,6 @@ variants: function - func: clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: clone @@ -4776,13 +4176,11 @@ QuantizedCPU, QuantizedCUDA: quantized_clone - func: resize_as_(Tensor(a!) self, Tensor the_template, *, MemoryFormat? memory_format=None) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: resize_as_ - func: zero_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: zero_ @@ -4796,14 +4194,12 @@ SparseCPU, SparseCUDA: sub_out_sparse - func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: sub SparseCPU, SparseCUDA: sub_sparse - func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: sub_ @@ -4811,13 +4207,11 @@ # For C++ only, until we have conversion from C++ numbers to Tensor - func: sub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sub - func: sub_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: sub_ @@ -4827,24 +4221,19 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: subtract.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method - func: subtract_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method # For C++ only, until we have conversion from C++ numbers to Tensor - func: subtract.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method - func: subtract_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: rsub @@ -4855,7 +4244,6 @@ CPU, CUDA: heaviside_out - func: heaviside(Tensor self, Tensor values) -> Tensor - use_c10_dispatcher: full variants: function, method - func: heaviside_(Tensor(a!) self, Tensor values) -> Tensor(a!) 
@@ -4864,7 +4252,6 @@ # For C++ only, until we have conversion from C++ numbers to Tensor - func: rsub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function dispatch: DefaultBackend: rsub @@ -4872,7 +4259,6 @@ # Functionally the same as addmm, but we give it a different derivative formula # that doesn't propagate gradients to non-present entries on sparse. - func: _sparse_addmm(Tensor self, Tensor sparse, Tensor dense, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: _sparse_addmm @@ -4885,7 +4271,6 @@ SparseCUDA: addmm_out_sparse_dense_cuda - func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: addmm_cpu @@ -4894,7 +4279,6 @@ SparseCUDA: addmm_sparse_dense_cuda - func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: addmm_cpu_ @@ -5028,49 +4412,40 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size) -> () - use_c10_dispatcher: full - func: _sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU, SparseCUDA: new_with_dims_sparse - func: _sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, int[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU, SparseCUDA: new_with_dims_and_tensor_sparse - func: sparse_resize_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: sparse_resize_ - func: sparse_resize_and_clear_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: sparse_resize_and_clear_ - func: sparse_mask(Tensor self, Tensor mask) -> Tensor - use_c10_dispatcher: full variants: method dispatch: SparseCPU: sparse_mask_cpu SparseCUDA: sparse_mask_cuda - func: to_dense(Tensor self, ScalarType? 
dtype=None) -> Tensor - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: sparse_to_dense MkldnnCPU: mkldnn_to_dense - func: to_dense_backward(Tensor grad, Tensor input) -> Tensor - use_c10_dispatcher: full - func: sparse_dim(Tensor self) -> int - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: sparse_dim_sparse @@ -5078,14 +4453,12 @@ # legacy method - func: _dimI(Tensor self) -> int - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: sparse_dim_sparse device_guard: False - func: dense_dim(Tensor self) -> int - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: dense_dim_sparse @@ -5093,42 +4466,36 @@ # legacy method - func: _dimV(Tensor self) -> int - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: dense_dim_sparse device_guard: False - func: _nnz(Tensor self) -> int - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: _nnz_sparse device_guard: False - func: coalesce(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method dispatch: SparseCPU: coalesce_sparse_cpu SparseCUDA: coalesce_sparse_cuda - func: is_coalesced(Tensor self) -> bool - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: is_coalesced_sparse device_guard: False - func: _indices(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: _indices_sparse device_guard: False - func: _values(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: _values_sparse @@ -5138,21 +4505,18 @@ # a bit unsafe. Similar to _indices and _values, this is useful for implementing # custom sparse operations in Python/C++ extension. - func: _coalesced_(Tensor(a!) self, bool coalesced) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: _coalesced_sparse_ device_guard: False - func: indices(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: indices_sparse device_guard: False - func: values(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: values_sparse @@ -5165,196 +4529,161 @@ SparseCUDA: hspmm_out_sparse_cuda - func: hspmm(Tensor mat1, Tensor mat2) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU: hspmm_sparse_cpu SparseCUDA: hspmm_sparse_cuda - func: copy_sparse_to_sparse_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) - use_c10_dispatcher: full variants: function dispatch: SparseCPU, SparseCUDA: copy_sparse_ - func: unbind.int(Tensor(a) self, int dim=0) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: unbind - func: unbind.Dimname(Tensor(a) self, Dimname dim) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method - func: to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: dense_to_sparse - func: to_sparse(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: dense_to_sparse - func: to_mkldnn(Tensor self, ScalarType? 
dtype=None) -> Tensor - use_c10_dispatcher: full variants: method dispatch: CPU: dense_to_mkldnn - func: mkldnn_reorder_conv2d_weight(Tensor self, int[2] padding=0, int[2] stride=1, int[2] dilation=1, int groups=1) -> Tensor - use_c10_dispatcher: full variants: function python_module: nn dispatch: MkldnnCPU: mkldnn_reorder_conv2d_weight - func: mkldnn_reorder_conv3d_weight(Tensor self, int[3] padding=0, int[3] stride=1, int[3] dilation=1, int groups=1) -> Tensor - use_c10_dispatcher: full variants: function python_module: nn dispatch: MkldnnCPU: mkldnn_reorder_conv3d_weight - func: to_mkldnn_backward(Tensor grad, Tensor input) -> Tensor - use_c10_dispatcher: full - func: quantize_per_tensor(Tensor self, float scale, int zero_point, ScalarType dtype) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: quantize_per_tensor - func: quantize_per_tensor.tensors(Tensor[] tensors, Tensor scales, Tensor zero_points, ScalarType dtype) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: quantize_per_tensor_list_cpu - func: quantize_per_channel(Tensor self, Tensor scales, Tensor zero_points, int axis, ScalarType dtype) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: quantize_per_channel_cpu - func: dequantize.self(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU, QuantizedCUDA: dequantize_quant - func: dequantize.tensors(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: QuantizedCPU: dequantize_tensors_quantized_cpu - func: q_scale(Tensor self) -> float - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU, QuantizedCUDA: q_scale_quant - func: q_zero_point(Tensor self) -> int - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU, QuantizedCUDA: q_zero_point_quant - func: q_per_channel_scales(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU, QuantizedCUDA: q_per_channel_scales - func: q_per_channel_zero_points(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU, QuantizedCUDA: q_per_channel_zero_points - func: q_per_channel_axis(Tensor self) -> int - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU, QuantizedCUDA: q_per_channel_axis - func: int_repr(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU: int_repr_quantized_cpu QuantizedCUDA: int_repr_quantized_cuda - func: _make_per_tensor_quantized_tensor(Tensor self, float scale, int zero_point) -> Tensor - use_c10_dispatcher: full dispatch: CPU: make_per_tensor_quantized_tensor_cpu CUDA: make_per_tensor_quantized_tensor_cuda - func: _make_per_channel_quantized_tensor(Tensor self, Tensor scale, Tensor zero_point, int axis) -> Tensor - use_c10_dispatcher: full dispatch: CPU: make_per_channel_quantized_tensor_cpu - func: qscheme(Tensor self) -> QScheme - use_c10_dispatcher: full variants: method dispatch: QuantizedCPU, QuantizedCUDA: qscheme_quant - func: fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: fake_quantize_per_tensor_affine - func: fake_quantize_per_tensor_affine_backward(Tensor grad, Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor - use_c10_dispatcher: full variants: function - func: 
_fake_quantize_learnable_per_tensor_affine(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: _fake_quantize_learnable_per_tensor_affine - func: _fake_quantize_learnable_per_tensor_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function - func: fake_quantize_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: fake_quantize_per_channel_affine - func: fake_quantize_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor - use_c10_dispatcher: full variants: function - func: _fake_quantize_learnable_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: _fake_quantize_learnable_per_channel_affine - func: _fake_quantize_learnable_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function - func: _choose_qparams_per_tensor(Tensor self, bool reduce_range=False) -> (float, int) - use_c10_dispatcher: full variants: function - func: _saturate_weight_to_fp16(Tensor weight) -> Tensor - use_c10_dispatcher: full variants: function - func: choose_qparams_optimized(Tensor input, int numel, int n_bins, float ratio, int bit_width) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function # to(Device) must not exist because all constructors of Device also works for @@ -5366,61 +4695,47 @@ device_guard: False - func: to.device(Tensor self, Device device, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor - use_c10_dispatcher: full variants: method device_guard: False - func: to.dtype(Tensor self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor - use_c10_dispatcher: full variants: method device_guard: False - func: to.other(Tensor self, Tensor other, bool non_blocking=False, bool copy=False, MemoryFormat? 
memory_format=None) -> Tensor - use_c10_dispatcher: full variants: method device_guard: False - func: meshgrid(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - func: cartesian_prod(Tensor[] tensors) -> Tensor - use_c10_dispatcher: full variants: function - func: combinations(Tensor self, int r=2, bool with_replacement=False) -> Tensor - use_c10_dispatcher: full variants: function - func: item(Tensor self) -> Scalar - use_c10_dispatcher: full variants: method - func: result_type.Tensor(Tensor tensor, Tensor other) -> ScalarType - use_c10_dispatcher: full variants: function - func: result_type.Scalar(Tensor tensor, Scalar other) -> ScalarType - use_c10_dispatcher: full variants: function - func: result_type.Scalar_Tensor(Scalar scalar, Tensor tensor) -> ScalarType - use_c10_dispatcher: full variants: function - func: result_type.Scalar_Scalar(Scalar scalar1, Scalar scalar2) -> ScalarType - use_c10_dispatcher: full - func: can_cast(ScalarType from, ScalarType to) -> bool - use_c10_dispatcher: full variants: function - func: promote_types(ScalarType type1, ScalarType type2) -> ScalarType - use_c10_dispatcher: full variants: function # NB: Does NOT check precondition that numel == 1 - func: _local_scalar_dense(Tensor self) -> Scalar - use_c10_dispatcher: full dispatch: CPU: _local_scalar_dense_cpu CUDA: _local_scalar_dense_cuda @@ -5446,7 +4761,6 @@ CUDA: _thnn_fused_gru_cell_cuda - func: _thnn_fused_gru_cell_backward(Tensor grad_hy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: _thnn_fused_gru_cell_backward_cuda @@ -5455,28 +4769,20 @@ # RNN cells and layers - func: lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full - func: lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full - func: gru.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: rnn_tanh.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: rnn_tanh.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: rnn_relu.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: rnn_relu.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? 
b_hh=None) -> (Tensor, Tensor) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures @@ -5494,55 +4800,46 @@ # Quantized RNN layers # - func: quantized_lstm(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, ScalarType? dtype=None, bool use_dynamic=False) -> (Tensor, Tensor, Tensor) -# use_c10_dispatcher: full + # - func: quantized_lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, *, ScalarType? dtype=None, bool use_dynamic=False) -> (Tensor, Tensor, Tensor) -# use_c10_dispatcher: full + # Quantized GRU layers # - func: quantized_gru.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) -# use_c10_dispatcher: full +# # - func: quantized_gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) -# use_c10_dispatcher: full +# # Quantized RNN cells - func: quantized_lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: quantized_gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor - use_c10_dispatcher: full - func: quantized_rnn_relu_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor - use_c10_dispatcher: full - func: quantized_rnn_tanh_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor - use_c10_dispatcher: full # PackedSequence utilities - func: _pack_padded_sequence(Tensor input, Tensor lengths, bool batch_first) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: DefaultBackend: _pack_padded_sequence - func: _pack_padded_sequence_backward(Tensor grad, int[] input_size, Tensor batch_sizes, bool batch_first) -> Tensor - use_c10_dispatcher: full - func: _pad_packed_sequence(Tensor data, Tensor batch_sizes, bool batch_first, Scalar padding_value, int total_length) -> (Tensor, Tensor) - use_c10_dispatcher: full # wrappers for legacy TH methods - func: set_.source_Storage(Tensor(a!) self, Storage source) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: CPU, CUDA: set_ - func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, int storage_offset, int[] size, int[] stride=[]) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: @@ -5551,61 +4848,51 @@ QuantizedCPU, QuantizedCUDA: set_storage_quantized_ - func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method device_guard: False dispatch: CPU, CUDA: set_tensor_ - func: set_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: set_cpu_ CUDA: set_cuda_ - func: is_set_to(Tensor self, Tensor tensor) -> bool - use_c10_dispatcher: full variants: method device_guard: False dispatch: CPU, CUDA: is_set_to - func: masked_fill_.Scalar(Tensor(a!) self, Tensor mask, Scalar value) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: masked_fill__cpu CUDA: masked_fill__cuda - func: masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: masked_fill_.Tensor(Tensor(a!) self, Tensor mask, Tensor value) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: masked_fill__cpu CUDA: masked_fill__cuda - func: masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: masked_scatter_(Tensor(a!) self, Tensor mask, Tensor source) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: masked_scatter__cpu CUDA: masked_scatter__cuda - func: masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor - use_c10_dispatcher: full variants: function, method - func: view(Tensor(a) self, int[] size) -> Tensor(a) - use_c10_dispatcher: full variants: method device_guard: False dispatch: @@ -5613,126 +4900,101 @@ MkldnnCPU: mkldnn_view - func: put_(Tensor(a!) self, Tensor index, Tensor source, bool accumulate=False) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: legacy::cpu::_th_put_ CUDA: legacy::cuda::_th_put_ - func: index_add_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: index_add_cpu_ CUDA: index_add_cuda_ - func: index_add(Tensor self, int dim, Tensor index, Tensor source) -> Tensor - use_c10_dispatcher: full variants: function, method - func: index_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor source) -> Tensor - use_c10_dispatcher: full variants: function, method - func: index_fill_.int_Scalar(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: legacy::cpu::_th_index_fill_ CUDA: legacy::cuda::_th_index_fill_ - func: index_fill.int_Scalar(Tensor self, int dim, Tensor index, Scalar value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: index_fill_.int_Tensor(Tensor(a!) self, int dim, Tensor index, Tensor value) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: index_fill_ - func: index_fill.int_Tensor(Tensor self, int dim, Tensor index, Tensor value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: index_fill_.Dimname_Scalar(Tensor(a!) self, Dimname dim, Tensor index, Scalar value) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: index_fill_.Dimname_Tensor(Tensor(a!) self, Dimname dim, Tensor index, Tensor value) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: index_fill.Dimname_Scalar(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: index_fill.Dimname_Tensor(Tensor self, Dimname dim, Tensor index, Tensor value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: scatter_.src(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: scatter_ - func: scatter.src(Tensor self, int dim, Tensor index, Tensor src) -> Tensor - use_c10_dispatcher: full variants: function, method - func: scatter_.value(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: scatter_fill_ - func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: scatter.dimname_src(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor - use_c10_dispatcher: full variants: function, method - func: scatter.dimname_value(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: scatter_.reduce(Tensor(a!) self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: scatter_reduce_ - func: scatter_.value_reduce(Tensor(a!) self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: scatter_scalar_reduce_ - func: scatter_add_(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: scatter_add_ - func: scatter_add(Tensor self, int dim, Tensor index, Tensor src) -> Tensor - use_c10_dispatcher: full variants: function, method - func: scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor - use_c10_dispatcher: full variants: function, method - func: eq_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: eq_ - func: eq_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: eq_ @@ -5750,35 +5012,27 @@ CPU, CUDA: bitwise_and_out - func: bitwise_and.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: bitwise_and.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: bitwise_and_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: bitwise_and_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: __and__.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: __and__.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: __iand__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: __iand__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: bitwise_or.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -5794,35 +5048,27 @@ CPU, CUDA: bitwise_or_out - func: bitwise_or.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: bitwise_or.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: bitwise_or_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: bitwise_or_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method - func: __or__.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: __or__.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: __ior__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: __ior__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: bitwise_xor.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -5838,181 +5084,149 @@ CPU, CUDA: bitwise_xor_out - func: bitwise_xor.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: bitwise_xor.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: bitwise_xor_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: bitwise_xor_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: __xor__.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: __xor__.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: __ixor__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: __ixor__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: __lshift__.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: __lshift__ - func: __lshift__.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: __lshift__ - func: __ilshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: __ilshift__ - func: __ilshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: __ilshift__ - func: __rshift__.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: __rshift__ - func: __rshift__.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: __rshift__ - func: __irshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: __irshift__ - func: __irshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: __irshift__ - func: lgamma_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: _lgamma__cpu CUDA: _lgamma__cuda - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: atan2_ - func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: tril_cpu_ CUDA: tril_cuda_ - func: triu_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: triu_cpu_ CUDA: triu_cuda_ - func: digamma_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: digamma_ - func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method - func: renorm_(Tensor(a!) self, Scalar p, int dim, Scalar maxnorm) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: legacy::cpu::_th_renorm_ CUDA: legacy::cuda::_th_renorm_ - func: pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: pow_ - func: pow_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: pow_ - func: lerp_.Scalar(Tensor(a!) self, Tensor end, Scalar weight) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: lerp_cpu_scalar_ CUDA: lerp_cuda_scalar_ - func: lerp_.Tensor(Tensor(a!) self, Tensor end, Tensor weight) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: lerp_cpu_tensor_ CUDA: lerp_cuda_tensor_ - func: fmod_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: fmod_ - func: fmod_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: fmod_ - func: remainder_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: remainder_ - func: remainder_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: remainder_ - func: addbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: addbmm_ @@ -6023,61 +5237,51 @@ CPU, CUDA: addbmm_out - func: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: addbmm - func: addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: addcdiv_ - func: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: random_ - func: random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: random_ - func: random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: random_ - func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: uniform_ - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: cauchy_ - func: log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: log_normal_ - func: exponential_(Tensor(a!) self, float lambd=1, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: exponential_ - func: geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: geometric_ @@ -6091,13 +5295,11 @@ CUDA: diag_cuda_out - func: diag(Tensor self, int diagonal=0) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: diag - func: diag_backward(Tensor grad, int[] input_sizes, int diagonal) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False @@ -6107,7 +5309,6 @@ CPU, CUDA: cross_out - func: cross(Tensor self, Tensor other, int? dim=None) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: cross @@ -6119,7 +5320,6 @@ CUDA: triu_cuda_out - func: triu(Tensor self, int diagonal=0) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: triu @@ -6131,32 +5331,27 @@ CUDA: tril_cuda_out - func: tril(Tensor self, int diagonal=0) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: tril - func: tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU: tril_indices_cpu CUDA: tril_indices_cuda - func: triu_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU: triu_indices_cpu CUDA: triu_indices_cuda - func: trace(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: trace_cpu CUDA: trace_cuda - func: trace_backward(Tensor grad, int[] sizes) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False @@ -6167,7 +5362,6 @@ QuantizedCPU: ne_out_quantized_cpu - func: ne.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: ne @@ -6180,20 +5374,17 @@ QuantizedCPU: ne_out_quantized_cpu - func: ne.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: ne QuantizedCPU: ne_quantized_cpu - func: ne_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: ne_ - func: ne_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: ne_ @@ -6203,22 +5394,18 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: not_equal.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: not_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: not_equal.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: not_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: not_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: eq.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
@@ -6228,7 +5415,6 @@ QuantizedCPU: eq_out_quantized_cpu - func: eq.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: eq @@ -6241,7 +5427,6 @@ QuantizedCPU: eq_out_quantized_cpu - func: eq.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: eq @@ -6254,7 +5439,6 @@ QuantizedCPU: ge_out_quantized_cpu - func: ge.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: ge @@ -6267,20 +5451,17 @@ QuantizedCPU: ge_out_quantized_cpu - func: ge.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: ge QuantizedCPU: ge_quantized_cpu - func: ge_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: ge_ - func: ge_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: ge_ @@ -6290,22 +5471,18 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: greater_equal.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: greater_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: greater_equal.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: greater_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: greater_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: le.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) @@ -6315,7 +5492,6 @@ QuantizedCPU: le_out_quantized_cpu - func: le.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: le @@ -6328,20 +5504,17 @@ QuantizedCPU: le_out_quantized_cpu - func: le.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: le QuantizedCPU: le_quantized_cpu - func: le_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: le_ - func: le_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: le_ @@ -6351,22 +5524,18 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: less_equal.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: less_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: less_equal.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: less_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: less_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: gt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
@@ -6376,7 +5545,6 @@ QuantizedCPU: gt_out_quantized_cpu - func: gt.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: gt @@ -6389,20 +5557,17 @@ QuantizedCPU: gt_out_quantized_cpu - func: gt.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: gt QuantizedCPU: gt_quantized_cpu - func: gt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: gt_ - func: gt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: gt_ @@ -6412,22 +5577,18 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: greater.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: greater.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: greater.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: greater_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: greater_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: lt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) @@ -6437,7 +5598,6 @@ QuantizedCPU: lt_out_quantized_cpu - func: lt.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: lt @@ -6450,20 +5610,17 @@ QuantizedCPU: lt_out_quantized_cpu - func: lt.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: lt QuantizedCPU: lt_quantized_cpu - func: lt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: lt_ - func: lt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: lt_ @@ -6473,22 +5630,18 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: less.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: less.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: less.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: less_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: less_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: take.out(Tensor self, Tensor index, *, Tensor(a!) out) -> Tensor(a!) 
@@ -6498,14 +5651,12 @@ CUDA: take_out_cuda - func: take(Tensor self, Tensor index) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: take_cpu CUDA: take_cuda - func: take_backward(Tensor grad, Tensor input, Tensor index) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False @@ -6516,7 +5667,6 @@ CUDA: index_select_out_cuda - func: index_select(Tensor self, int dim, Tensor index) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: index_select_cpu_ @@ -6528,11 +5678,9 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: index_select.dimname(Tensor self, Dimname dim, Tensor index) -> Tensor - use_c10_dispatcher: full variants: method, function - func: index_select_backward(Tensor grad, int[] self_sizes, int dim, Tensor index) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False @@ -6543,14 +5691,12 @@ CUDA: masked_select_out_cuda - func: masked_select(Tensor self, Tensor mask) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: masked_select_cpu CUDA: masked_select_cuda - func: masked_select_backward(Tensor grad, Tensor input, Tensor mask) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False @@ -6561,14 +5707,12 @@ CUDA: nonzero_out_cuda - func: nonzero(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_nonzero CUDA: nonzero_cuda - func: nonzero_numpy(Tensor self) -> Tensor[] - use_c10_dispatcher: full variants: method, function - func: gather.out(Tensor self, int dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!) @@ -6578,13 +5722,11 @@ CUDA: gather_out_cpu_cuda - func: gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: gather - func: gather_backward(Tensor grad, Tensor self, int dim, Tensor index, bool sparse_grad) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False @@ -6592,11 +5734,9 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: gather.dimname(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: _gather_sparse_backward(Tensor self, int dim, Tensor index, Tensor grad) -> Tensor - use_c10_dispatcher: full - func: addcmul.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures @@ -6604,13 +5744,11 @@ CPU, CUDA: addcmul_out - func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: addcmul - func: addcmul_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method dispatch: DefaultBackend: addcmul_ @@ -6621,7 +5759,6 @@ CPU, CUDA: addcdiv_out - func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: addcdiv @@ -6633,7 +5770,6 @@ CUDA: legacy::cuda::_th_gels_out - func: lstsq(Tensor self, Tensor A) -> (Tensor solution, Tensor QR) - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_gels @@ -6645,13 +5781,11 @@ DefaultBackend: triangular_solve_out - func: triangular_solve(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False) -> (Tensor solution, Tensor cloned_coefficient) - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: triangular_solve - func: _triangular_solve_helper(Tensor self, Tensor A, bool upper, bool transpose, bool unitriangular) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _triangular_solve_helper_cpu @@ -6663,13 +5797,11 @@ DefaultBackend: symeig_out - func: symeig(Tensor self, bool eigenvectors=False, bool upper=True) -> (Tensor eigenvalues, Tensor eigenvectors) - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: symeig - func: _symeig_helper(Tensor self, bool eigenvectors, bool upper) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _symeig_helper_cpu @@ -6681,7 +5813,6 @@ DefaultBackend: eig_out - func: eig(Tensor self, bool eigenvectors=False) -> (Tensor eigenvalues, Tensor eigenvectors) - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: eig @@ -6692,13 +5823,11 @@ Math: svd_out - func: svd(Tensor self, bool some=True, bool compute_uv=True) -> (Tensor U, Tensor S, Tensor V) - use_c10_dispatcher: full variants: method, function dispatch: Math: svd - func: _svd_helper(Tensor self, bool some, bool compute_uv) -> (Tensor U, Tensor S, Tensor V) - use_c10_dispatcher: full variants: function dispatch: CPU: _svd_helper_cpu @@ -6706,23 +5835,19 @@ # swapaxes, alias for transpose - func: swapaxes(Tensor(a) self, int axis0, int axis1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False - func: swapaxes_(Tensor(a!) self, int axis0, int axis1) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False # swapdims, alias for transpose - func: swapdims(Tensor(a) self, int dim0, int dim1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False - func: swapdims_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method device_guard: False @@ -6732,13 +5857,11 @@ DefaultBackend: cholesky_out - func: cholesky(Tensor self, bool upper=False) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: cholesky - func: _cholesky_helper(Tensor self, bool upper) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _cholesky_helper_cpu @@ -6750,20 +5873,17 @@ DefaultBackend: cholesky_solve_out - func: cholesky_solve(Tensor self, Tensor input2, bool upper=False) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: cholesky_solve - func: _cholesky_solve_helper(Tensor self, Tensor A, bool upper) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _cholesky_solve_helper_cpu CUDA: _cholesky_solve_helper_cuda - func: solve(Tensor self, Tensor A) -> (Tensor solution, Tensor LU) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: solve @@ -6774,7 +5894,6 @@ DefaultBackend: solve_out - func: _solve_helper(Tensor self, Tensor A) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _solve_helper_cpu @@ -6787,7 +5906,6 @@ CUDA: legacy::cuda::_th_potri_out - func: cholesky_inverse(Tensor self, bool upper=False) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_potri @@ -6799,7 +5917,6 @@ Math: qr_out - func: qr(Tensor self, bool some=True) -> (Tensor Q, Tensor R) - use_c10_dispatcher: full variants: method, function dispatch: Math: qr @@ -6811,7 +5928,6 @@ CUDA: legacy::cuda::_th_geqrf_out - func: geqrf(Tensor self) -> (Tensor a, Tensor tau) - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_geqrf @@ -6823,7 +5939,6 @@ CPU: legacy::cpu::_th_orgqr_out - func: orgqr(Tensor self, Tensor input2) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_orgqr @@ -6834,13 +5949,11 @@ CPU: legacy::cpu::_th_ormqr_out - func: ormqr(Tensor self, Tensor input2, Tensor input3, bool left=True, bool transpose=False) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_ormqr - func: _lu_with_info(Tensor self, bool pivot=True, bool check_errors=True) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _lu_with_info_cpu @@ -6852,13 +5965,11 @@ DefaultBackend: lu_solve_out - func: lu_solve(Tensor self, Tensor LU_data, Tensor LU_pivots) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: lu_solve - func: _lu_solve_helper(Tensor self, Tensor LU_data, Tensor LU_pivots) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _lu_solve_helper_cpu @@ -6871,20 +5982,17 @@ CPU, CUDA: multinomial_out - func: multinomial(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: multinomial - func: _multinomial_alias_setup(Tensor probs) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: legacy::cpu::_th_multinomial_alias_setup CUDA: legacy::cuda::_th_multinomial_alias_setup - func: _multinomial_alias_draw(Tensor J, Tensor q, int num_samples, *, Generator? 
generator=None) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: legacy::cpu::_th_multinomial_alias_draw @@ -6897,7 +6005,6 @@ CUDA: _lgamma_out_cuda - func: lgamma(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: lgamma @@ -6908,7 +6015,6 @@ CPU, CUDA: digamma_out - func: digamma(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: digamma @@ -6919,19 +6025,16 @@ CPU, CUDA: polygamma_out - func: polygamma(int n, Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: polygamma - func: erfinv(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: erfinv - func: erfinv_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: erfinv_ @@ -6942,13 +6045,11 @@ CPU, CUDA: erfinv_out - func: i0(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: i0 - func: i0_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: i0_ @@ -6959,13 +6060,11 @@ CPU, CUDA: i0_out - func: sign(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sign - func: sign_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: sign_ @@ -6976,7 +6075,6 @@ CPU, CUDA: sign_out - func: signbit(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: signbit.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -6986,7 +6084,6 @@ CUDA: signbit_out - func: dist(Tensor self, Tensor other, Scalar p=2) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: dist @@ -6997,7 +6094,6 @@ CPU, CUDA: atan2_out - func: atan2(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: atan2 @@ -7015,14 +6111,12 @@ CUDA: lerp_cuda_tensor_out - func: lerp.Scalar(Tensor self, Tensor end, Scalar weight) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: lerp_cpu_scalar CUDA: lerp_cuda_scalar - func: lerp.Tensor(Tensor self, Tensor end, Tensor weight) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: lerp_cpu_tensor @@ -7035,7 +6129,6 @@ CUDA: _histc_out_cuda - func: histc(Tensor self, int bins=100, Scalar min=0, Scalar max=0) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_histc @@ -7047,7 +6140,6 @@ CPU, CUDA: fmod_out - func: fmod.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: fmod @@ -7058,7 +6150,6 @@ CPU, CUDA: fmod_out - func: fmod.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: fmod @@ -7069,7 +6160,6 @@ CPU, CUDA: hypot_out - func: hypot(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: hypot @@ -7086,7 +6176,6 @@ CPU, CUDA: igamma_out - func: igamma(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: igamma @@ -7103,13 +6192,11 @@ CPU, CUDA: igammac_out - func: igammac(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: igammac - func: igammac_(Tensor(a!) 
self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: igammac_ @@ -7120,7 +6207,6 @@ CPU, CUDA: nextafter_out - func: nextafter(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: nextafter @@ -7137,7 +6223,6 @@ CPU, CUDA: remainder_out - func: remainder.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: remainder @@ -7148,27 +6233,23 @@ CPU, CUDA: remainder_out - func: remainder.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: remainder - func: min(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: min QuantizedCPU: min_quantized_cpu - func: max(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: max QuantizedCPU: max_quantized_cpu - func: maximum(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: maximum @@ -7181,14 +6262,12 @@ # binary max, alias of maximum # NOTE: max is not an alias for maximum, since there is also unary max - func: max.other(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: max.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: minimum(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: minimum @@ -7204,35 +6283,30 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: min.other(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: quantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: quantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: quantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: quantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: nanquantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: nanquantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: nanquantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: nanquantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: sort.values(Tensor self, int dim=-1, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) 
indices) @@ -7242,7 +6316,6 @@ CUDA: legacy::cuda::_th_sort_out - func: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: method, function dispatch: CPU: sort_cpu @@ -7253,7 +6326,6 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: sort.dimname(Tensor self, Dimname dim, bool descending=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: method, function - func: msort.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -7262,17 +6334,14 @@ Math: msort_out - func: msort(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: Math: msort - func: argsort(Tensor self, int dim=-1, bool descending=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: argsort.dimname(Tensor self, Dimname dim, bool descending=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: topk.values(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) @@ -7282,20 +6351,17 @@ CUDA: legacy::cuda::_th_topk_out - func: topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: topk QuantizedCPU: topk_quantized_cpu - func: all(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: all - func: any(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: any @@ -7308,14 +6374,12 @@ CUDA: legacy::cuda::_th_renorm_out - func: renorm(Tensor self, Scalar p, int dim, Scalar maxnorm) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_renorm CUDA: legacy::cuda::_th_renorm - func: unfold(Tensor(a) self, int dimension, int size, int step) -> Tensor(a) - use_c10_dispatcher: full variants: method device_guard: False dispatch: @@ -7323,13 +6387,11 @@ QuantizedCPU, QuantizedCUDA: unfold - func: unfold_backward(Tensor grad_in, int[] input_sizes, int dim, int size, int step) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: unfold_backward - func: equal(Tensor self, Tensor other) -> bool - use_c10_dispatcher: full variants: method, function dispatch: CPU: cpu_equal @@ -7342,7 +6404,6 @@ CPU, CUDA: pow_out - func: pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: pow @@ -7353,7 +6414,6 @@ CPU, CUDA: pow_out - func: pow.Scalar(Scalar self, Tensor exponent) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: pow @@ -7364,7 +6424,6 @@ SparseCPU, SparseCUDA: pow_out_sparse_scalar - func: pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: pow @@ -7376,7 +6435,6 @@ Math: float_power_out - func: float_power.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: Math: float_power @@ -7387,7 +6445,6 @@ Math: float_power_out - func: float_power.Scalar(Scalar self, Tensor exponent) -> Tensor - use_c10_dispatcher: full dispatch: Math: float_power @@ -7397,25 +6454,21 @@ Math: float_power_out - func: float_power.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: Math: float_power - func: 
float_power_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: Math: float_power_ - func: float_power_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: Math: float_power_ - func: normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: normal_ @@ -7426,7 +6479,6 @@ CPU, CUDA: normal_out - func: normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: normal @@ -7436,7 +6488,6 @@ CPU, CUDA: normal_out - func: normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: normal @@ -7446,7 +6497,6 @@ CPU, CUDA: normal_out - func: normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: normal @@ -7457,19 +6507,16 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: alias(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: alias - func: _index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) - use_c10_dispatcher: full dispatch: CPU: legacy::cpu::_th_index_copy_ CUDA: legacy::cuda::_th_index_copy_ - func: _cumsum(Tensor self, int dim) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _cumsum_cpu CUDA: _cumsum_cuda @@ -7481,7 +6528,6 @@ CUDA: _cumsum_out_cuda - func: _cumprod(Tensor self, int dim) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _cumprod_cpu CUDA: _cumprod_cuda @@ -7493,29 +6539,24 @@ CUDA: _cumprod_out_cuda - func: _var(Tensor self, bool unbiased=True) -> Tensor - use_c10_dispatcher: full dispatch: CPU: legacy::cpu::_th_var - func: _std(Tensor self, bool unbiased=True) -> Tensor - use_c10_dispatcher: full dispatch: CPU: legacy::cpu::_th_std - func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> () - use_c10_dispatcher: full variants: function dispatch: CUDA: _amp_foreach_non_finite_check_and_unscale_cuda_ - func: _amp_update_scale(Tensor(a!) 
growth_tracker, Tensor current_scale, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CUDA: _amp_update_scale_cuda - func: _cat(Tensor[] tensors, int dim=0) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _cat_cpu CUDA: cat_cuda @@ -7529,644 +6570,552 @@ QuantizedCPU: cat_out_quantized_cpu - func: _foreach_add.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_add_scalar_kernel_slow CUDA: foreach_tensor_add_scalar_kernel_cuda - func: _foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_add_scalar_kernel_slow_ CUDA: foreach_tensor_add_scalar_kernel_cuda_ - func: _foreach_sub.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sub_scalar_kernel_slow CUDA: foreach_tensor_sub_scalar_kernel_cuda - func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sub_scalar_kernel_slow_ CUDA: foreach_tensor_sub_scalar_kernel_cuda_ - func: _foreach_mul.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_mul_scalar_kernel_slow CUDA: foreach_tensor_mul_scalar_kernel_cuda - func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_mul_scalar_kernel_slow_ CUDA: foreach_tensor_mul_scalar_kernel_cuda_ - func: _foreach_div.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_div_scalar_kernel_slow CUDA: foreach_tensor_div_scalar_kernel_cuda - func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_div_scalar_kernel_slow_ CUDA: foreach_tensor_div_scalar_kernel_cuda_ - func: _foreach_add.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_add_list_kernel_slow CUDA: foreach_tensor_add_list_kernel_cuda - func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_add_list_kernel_slow_ CUDA: foreach_tensor_add_list_kernel_cuda_ - func: _foreach_sub.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sub_list_kernel_slow CUDA: foreach_tensor_sub_list_kernel_cuda - func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sub_list_kernel_slow_ CUDA: foreach_tensor_sub_list_kernel_cuda_ - func: _foreach_mul.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_mul_list_kernel_slow CUDA: foreach_tensor_mul_list_kernel_cuda - func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_mul_list_kernel_slow_ CUDA: foreach_tensor_mul_list_kernel_cuda_ - func: _foreach_div.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] - 
use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_div_list_kernel_slow CUDA: foreach_tensor_div_list_kernel_cuda - func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_div_list_kernel_slow_ CUDA: foreach_tensor_div_list_kernel_cuda_ - func: _foreach_add.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_add_scalarlist_kernel_slow CUDA: foreach_tensor_add_scalarlist_kernel_cuda - func: _foreach_add_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_add_scalarlist_kernel_slow_ CUDA: foreach_tensor_add_scalarlist_kernel_cuda_ - func: _foreach_sub.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sub_scalarlist_kernel_slow CUDA: foreach_tensor_sub_scalarlist_kernel_cuda - func: _foreach_sub_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sub_scalarlist_kernel_slow_ CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_ - func: _foreach_div.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_div_scalarlist_kernel_slow CUDA: foreach_tensor_div_scalarlist_kernel_cuda - func: _foreach_div_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_div_scalarlist_kernel_slow_ CUDA: foreach_tensor_div_scalarlist_kernel_cuda_ - func: _foreach_mul.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_mul_scalarlist_kernel_slow CUDA: foreach_tensor_mul_scalarlist_kernel_cuda - func: _foreach_mul_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_mul_scalarlist_kernel_slow_ CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_ - func: _foreach_exp(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_exp_slow CUDA: foreach_tensor_exp_cuda - func: _foreach_zero_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_zero_slow_ CUDA: foreach_tensor_zero_cuda_ - func: _foreach_exp_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_exp_slow_ CUDA: foreach_tensor_exp_cuda_ - func: _foreach_sqrt(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sqrt_slow CUDA: foreach_tensor_sqrt_cuda - func: _foreach_sqrt_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sqrt_slow_ CUDA: foreach_tensor_sqrt_cuda_ - func: _foreach_abs(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_abs_slow CUDA: foreach_tensor_abs_cuda - func: _foreach_abs_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_abs_slow_ CUDA: foreach_tensor_abs_cuda_ - func: _foreach_acos(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_acos_slow CUDA: foreach_tensor_acos_cuda - func: _foreach_acos_(Tensor(a!)[] self) -> 
() - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_acos_slow_ CUDA: foreach_tensor_acos_cuda_ - func: _foreach_asin(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_asin_slow CUDA: foreach_tensor_asin_cuda - func: _foreach_asin_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_asin_slow_ CUDA: foreach_tensor_asin_cuda_ - func: _foreach_atan(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_atan_slow CUDA: foreach_tensor_atan_cuda - func: _foreach_atan_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_atan_slow_ CUDA: foreach_tensor_atan_cuda_ - func: _foreach_ceil(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_ceil_slow CUDA: foreach_tensor_ceil_cuda - func: _foreach_ceil_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_ceil_slow_ CUDA: foreach_tensor_ceil_cuda_ - func: _foreach_cos(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_cos_slow CUDA: foreach_tensor_cos_cuda - func: _foreach_cos_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_cos_slow_ CUDA: foreach_tensor_cos_cuda_ - func: _foreach_cosh(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_cosh_slow CUDA: foreach_tensor_cosh_cuda - func: _foreach_cosh_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_cosh_slow_ CUDA: foreach_tensor_cosh_cuda_ - func: _foreach_erf(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_erf_slow CUDA: foreach_tensor_erf_cuda - func: _foreach_erf_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_erf_slow_ CUDA: foreach_tensor_erf_cuda_ - func: _foreach_erfc(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_erfc_slow CUDA: foreach_tensor_erfc_cuda - func: _foreach_erfc_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_erfc_slow_ CUDA: foreach_tensor_erfc_cuda_ - func: _foreach_expm1(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_expm1_slow CUDA: foreach_tensor_expm1_cuda - func: _foreach_expm1_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_expm1_slow_ CUDA: foreach_tensor_expm1_cuda_ - func: _foreach_floor(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_floor_slow CUDA: foreach_tensor_floor_cuda - func: _foreach_floor_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_floor_slow_ CUDA: foreach_tensor_floor_cuda_ - func: _foreach_log(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log_slow CUDA: foreach_tensor_log_cuda - func: _foreach_log_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log_slow_ CUDA: foreach_tensor_log_cuda_ - func: _foreach_log10(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: 
full variants: function dispatch: CPU: foreach_tensor_log10_slow CUDA: foreach_tensor_log10_cuda - func: _foreach_log10_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log10_slow_ CUDA: foreach_tensor_log10_cuda_ - func: _foreach_log1p(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log1p_slow CUDA: foreach_tensor_log1p_cuda - func: _foreach_log1p_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log1p_slow_ CUDA: foreach_tensor_log1p_cuda_ - func: _foreach_log2(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log2_slow CUDA: foreach_tensor_log2_cuda - func: _foreach_log2_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log2_slow_ CUDA: foreach_tensor_log2_cuda_ - func: _foreach_neg(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_neg_slow CUDA: foreach_tensor_neg_cuda - func: _foreach_neg_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_neg_slow_ CUDA: foreach_tensor_neg_cuda_ - func: _foreach_tan(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_tan_slow CUDA: foreach_tensor_tan_cuda - func: _foreach_tan_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_tan_slow_ CUDA: foreach_tensor_tan_cuda_ - func: _foreach_tanh(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_tanh_slow CUDA: foreach_tensor_tanh_cuda - func: _foreach_tanh_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_tanh_slow_ CUDA: foreach_tensor_tanh_cuda_ - func: _foreach_sin(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sin_slow CUDA: foreach_tensor_sin_cuda - func: _foreach_sin_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sin_slow_ CUDA: foreach_tensor_sin_cuda_ - func: _foreach_sinh(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sinh_slow CUDA: foreach_tensor_sinh_cuda - func: _foreach_sinh_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sinh_slow_ CUDA: foreach_tensor_sinh_cuda_ - func: _foreach_round(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_round_slow CUDA: foreach_tensor_round_cuda - func: _foreach_round_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_round_slow_ CUDA: foreach_tensor_round_cuda_ - func: _foreach_lgamma(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_lgamma_slow CUDA: foreach_tensor_lgamma_cuda - func: _foreach_lgamma_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_lgamma_slow_ CUDA: foreach_tensor_lgamma_cuda_ - func: _foreach_frac(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_frac_slow CUDA: foreach_tensor_frac_cuda - func: _foreach_frac_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: 
function dispatch: CPU: foreach_tensor_frac_slow_ CUDA: foreach_tensor_frac_cuda_ - func: _foreach_reciprocal(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_reciprocal_slow CUDA: foreach_tensor_reciprocal_cuda - func: _foreach_reciprocal_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_reciprocal_slow_ CUDA: foreach_tensor_reciprocal_cuda_ - func: _foreach_sigmoid(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sigmoid_slow CUDA: foreach_tensor_sigmoid_cuda - func: _foreach_sigmoid_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sigmoid_slow_ CUDA: foreach_tensor_sigmoid_cuda_ - func: _foreach_trunc(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_trunc_slow CUDA: foreach_tensor_trunc_cuda - func: _foreach_trunc_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_trunc_slow_ CUDA: foreach_tensor_trunc_cuda_ - func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcdiv_scalar_slow_ CUDA: foreach_tensor_addcdiv_scalar_cuda_ - func: _foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcmul_scalar_slow_ CUDA: foreach_tensor_addcmul_scalar_cuda_ - func: _foreach_addcdiv_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, float[] scalars) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcdiv_scalarlist_slow_ CUDA: foreach_tensor_addcdiv_scalarlist_cuda_ - func: _foreach_addcmul_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, float[] scalars) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcmul_scalarlist_slow_ CUDA: foreach_tensor_addcmul_scalarlist_cuda_ - func: _foreach_addcdiv.Scalar(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcdiv_scalar_slow CUDA: foreach_tensor_addcdiv_scalar_cuda - func: _foreach_addcmul.Scalar(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcmul_scalar_slow CUDA: foreach_tensor_addcmul_scalar_cuda - func: _foreach_addcdiv.ScalarList(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, float[] scalars) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcdiv_scalarlist_slow CUDA: foreach_tensor_addcdiv_scalarlist_cuda - func: _foreach_addcmul.ScalarList(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, float[] scalars) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcmul_scalarlist_slow CUDA: foreach_tensor_addcmul_scalarlist_cuda - func: _foreach_maximum.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_maximum_slow CUDA: foreach_tensor_maximum_cuda - func: _foreach_minimum.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: 
foreach_tensor_minimum_slow CUDA: foreach_tensor_minimum_cuda - func: _mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CPU: legacy::cpu::_th_mode CUDA: legacy::cuda::_th_mode @@ -8178,7 +7127,6 @@ CUDA: legacy::cuda::_th_mode_out - func: bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor - use_c10_dispatcher: full dispatch: CPU: bucketize_cpu CUDA: bucketize_cuda @@ -8190,13 +7138,11 @@ CUDA: bucketize_out_cuda - func: bucketize.Scalar(Scalar self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor - use_c10_dispatcher: full dispatch: CPU: bucketize_cpu CUDA: bucketize_cuda - func: searchsorted.Tensor(Tensor sorted_sequence, Tensor self, *, bool out_int32=False, bool right=False) -> Tensor - use_c10_dispatcher: full dispatch: CPU: searchsorted_cpu CUDA: searchsorted_cuda @@ -8208,7 +7154,6 @@ CUDA: searchsorted_out_cuda - func: searchsorted.Scalar(Tensor sorted_sequence, Scalar self, *, bool out_int32=False, bool right=False) -> Tensor - use_c10_dispatcher: full dispatch: CPU: searchsorted_cpu CUDA: searchsorted_cuda @@ -8222,7 +7167,6 @@ CPU, CUDA: mse_loss_out - func: mse_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: mse_loss @@ -8234,7 +7178,6 @@ CPU, CUDA: mse_loss_backward_out - func: mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: mse_loss_backward @@ -8246,7 +7189,6 @@ DefaultBackend: l1_loss_out - func: l1_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: l1_loss @@ -8258,7 +7200,6 @@ CPU, CUDA: l1_loss_backward_out - func: l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: l1_loss_backward @@ -8296,7 +7237,6 @@ python_module: nn - func: multilabel_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor - use_c10_dispatcher: full python_module: nn - func: multilabel_margin_loss_forward.output(Tensor self, Tensor target, int reduction, *, Tensor(a!) output, Tensor(b!) 
is_target) -> (Tensor(a!), Tensor(b!)) @@ -8307,7 +7247,6 @@ CUDA: legacy::cuda::_thnn_multilabel_margin_loss_forward_out - func: multilabel_margin_loss_forward(Tensor self, Tensor target, int reduction) -> (Tensor output, Tensor is_target) - use_c10_dispatcher: full python_module: nn dispatch: CPU: multilabel_margin_loss_forward_cpu @@ -8321,7 +7260,6 @@ CUDA: legacy::cuda::_thnn_multilabel_margin_loss_backward_out - func: multilabel_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: multilabel_margin_loss_backward_cpu @@ -8407,7 +7345,6 @@ CUDA: smooth_l1_loss_out - func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: smooth_l1_loss @@ -8420,7 +7357,6 @@ CUDA: smooth_l1_loss_backward_out - func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: smooth_l1_loss_backward @@ -8432,7 +7368,6 @@ DefaultBackend: soft_margin_loss_out - func: soft_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: soft_margin_loss @@ -8444,7 +7379,6 @@ DefaultBackend: soft_margin_loss_backward_out - func: soft_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: soft_margin_loss_backward @@ -8456,7 +7390,6 @@ CPU, CUDA: elu_out - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: elu @@ -8468,13 +7401,11 @@ CPU, CUDA: elu_backward_out - func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: elu_backward - func: elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!) - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: elu_ @@ -8487,7 +7418,6 @@ CUDA: legacy::cuda::_thnn_glu_forward_out - func: glu(Tensor self, int dim=-1) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: glu @@ -8501,7 +7431,6 @@ CUDA: legacy::cuda::_thnn_glu_backward_out - func: glu_backward(Tensor grad_output, Tensor self, int dim) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: glu_backward @@ -8514,20 +7443,17 @@ CPU, CUDA: hardsigmoid_out - func: hardsigmoid(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardsigmoid QuantizedCPU: hardsigmoid_quantized_cpu - func: hardsigmoid_(Tensor(a!) self) -> Tensor(a!) 
- use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardsigmoid_ - func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardsigmoid_backward @@ -8540,7 +7466,6 @@ QuantizedCPU: hardtanh_out_quantized_cpu - func: hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardtanh @@ -8553,13 +7478,11 @@ CPU, CUDA: hardtanh_backward_out - func: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardtanh_backward - func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!) - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardtanh_ @@ -8572,19 +7495,16 @@ CPU, CUDA: hardswish_out - func: hardswish(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardswish - func: hardswish_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardswish_ - func: hardswish_backward(Tensor grad_output, Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardswish_backward @@ -8597,20 +7517,17 @@ QuantizedCPU: leaky_relu_out_quantized_cpu - func: leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: leaky_relu QuantizedCPU: leaky_relu_quantized_cpu - func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: leaky_relu_backward - func: leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!) - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: leaky_relu_ @@ -8621,7 +7538,6 @@ python_module: nn - func: log_sigmoid(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn - func: log_sigmoid_forward.output(Tensor self, *, Tensor(a!) output, Tensor(b!) buffer) -> (Tensor(a!), Tensor(b!)) @@ -8632,7 +7548,6 @@ CUDA: legacy::cuda::_thnn_log_sigmoid_forward_out - func: log_sigmoid_forward(Tensor self) -> (Tensor output, Tensor buffer) - use_c10_dispatcher: full python_module: nn dispatch: CPU: log_sigmoid_forward_cpu @@ -8646,7 +7561,6 @@ CUDA: legacy::cuda::_thnn_log_sigmoid_backward_out - func: log_sigmoid_backward(Tensor grad_output, Tensor self, Tensor buffer) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: log_sigmoid_backward_cpu @@ -8660,20 +7574,17 @@ CUDA: legacy::cuda::_thnn_rrelu_with_noise_forward_out - func: rrelu_with_noise(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: rrelu_with_noise_cpu CUDA: legacy::cuda::_thnn_rrelu_with_noise_forward - func: rrelu_with_noise_backward(Tensor grad_output, Tensor self, Tensor noise, Scalar lower, Scalar upper, bool training, bool self_is_result) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: rrelu_with_noise_backward - func: rrelu_with_noise_(Tensor(a!) self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!) 
- use_c10_dispatcher: full python_module: nn dispatch: CPU: rrelu_with_noise_cpu_ @@ -8686,7 +7597,6 @@ CPU, CUDA: softplus_out - func: softplus(Tensor self, Scalar beta=1, Scalar threshold=20) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: softplus @@ -8698,7 +7608,6 @@ CPU, CUDA: softplus_backward_out - func: softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: softplus_backward @@ -8710,7 +7619,6 @@ CPU, CUDA: softshrink_out - func: softshrink(Tensor self, Scalar lambd=0.5) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: softshrink @@ -8722,7 +7630,6 @@ CPU, CUDA: softshrink_backward_out - func: softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: softshrink_backward @@ -8735,23 +7642,19 @@ MkldnnCPU: mkldnn_adaptive_avg_pool2d_out - func: adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor - use_c10_dispatcher: full python_module: nn - func: mkldnn_adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor - use_c10_dispatcher: full dispatch: MkldnnCPU: mkldnn_adaptive_avg_pool2d - func: _adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor - use_c10_dispatcher: full dispatch: CPU: adaptive_avg_pool2d_cpu CUDA: adaptive_avg_pool2d_cuda QuantizedCPU: adaptive_avg_pool2d_quantized_cpu - func: _adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_avg_pool2d_backward_cpu @@ -8766,7 +7669,6 @@ QuantizedCPU: adaptive_avg_pool3d_out_quantized_cpu - func: adaptive_avg_pool3d(Tensor self, int[3] output_size) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_avg_pool3d_cpu @@ -8781,7 +7683,6 @@ CUDA: adaptive_avg_pool3d_backward_out_cuda - func: adaptive_avg_pool3d_backward(Tensor grad_output, Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_avg_pool3d_backward_cpu @@ -8797,7 +7698,6 @@ # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool2d(Tensor self, int[2] output_size) -> (Tensor, Tensor) - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_max_pool2d_cpu @@ -8811,7 +7711,6 @@ CUDA: adaptive_max_pool2d_backward_out_cuda - func: adaptive_max_pool2d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_max_pool2d_backward_cpu @@ -8827,7 +7726,6 @@ # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool3d(Tensor self, int[3] output_size) -> (Tensor, Tensor) - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_max_pool3d_cpu @@ -8841,7 +7739,6 @@ CUDA: adaptive_max_pool3d_backward_out_cuda - func: adaptive_max_pool3d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_max_pool3d_backward_cpu @@ -8856,7 +7753,6 @@ MkldnnCPU: mkldnn_avg_pool2d_out - func: avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? 
divisor_override=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: avg_pool2d_cpu @@ -8872,7 +7768,6 @@ CUDA: avg_pool2d_backward_out_cuda - func: avg_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: avg_pool2d_backward_cpu @@ -8887,7 +7782,6 @@ MkldnnCPU: mkldnn_avg_pool3d_out - func: avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: avg_pool3d_cpu @@ -8903,7 +7797,6 @@ CUDA: avg_pool3d_backward_out_cuda - func: avg_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: avg_pool3d_backward_cpu @@ -8919,7 +7812,6 @@ # Return: (Tensor output, Tensor indices) - func: fractional_max_pool2d(Tensor self, int[2] kernel_size, int[2] output_size, Tensor random_samples) -> (Tensor, Tensor) - use_c10_dispatcher: full python_module: nn dispatch: CPU: fractional_max_pool2d_cpu @@ -8933,7 +7825,6 @@ CUDA: fractional_max_pool2d_backward_out_cuda - func: fractional_max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] output_size, Tensor indices) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: fractional_max_pool2d_backward_cpu @@ -8949,7 +7840,6 @@ # Return: (Tensor output, Tensor indices) - func: fractional_max_pool3d(Tensor self, int[3] kernel_size, int[3] output_size, Tensor random_samples) -> (Tensor, Tensor) - use_c10_dispatcher: full python_module: nn dispatch: CPU: fractional_max_pool3d_cpu @@ -8963,7 +7853,6 @@ CUDA: fractional_max_pool3d_backward_out_cuda - func: fractional_max_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] output_size, Tensor indices) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: fractional_max_pool3d_backward_cpu @@ -8979,7 +7868,6 @@ # Return: (Tensor output, Tensor indices) - func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_pool2d_with_indices_cpu @@ -8993,7 +7881,6 @@ CUDA: max_pool2d_with_indices_backward_out_cuda - func: max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_pool2d_with_indices_backward_cpu @@ -9009,7 +7896,6 @@ # Return: (Tensor output, Tensor indices) - func: max_pool3d_with_indices(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_pool3d_with_indices_cpu @@ -9023,7 +7909,6 @@ CUDA: max_pool3d_with_indices_backward_out_cuda - func: max_pool3d_with_indices_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_pool3d_with_indices_backward_cpu @@ 
-9037,7 +7922,6 @@ CUDA: max_unpooling2d_forward_out_cuda - func: max_unpool2d(Tensor self, Tensor indices, int[2] output_size) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_unpooling2d_forward_cpu @@ -9051,7 +7935,6 @@ CUDA: max_unpooling2d_backward_out_cuda - func: max_unpool2d_backward(Tensor grad_output, Tensor self, Tensor indices, int[2] output_size) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_unpooling2d_backward_cpu @@ -9065,7 +7948,6 @@ CUDA: max_unpooling3d_forward_out_cuda - func: max_unpool3d(Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_unpooling3d_forward_cpu @@ -9079,7 +7961,6 @@ CUDA: max_unpooling3d_backward_out_cuda - func: max_unpool3d_backward(Tensor grad_output, Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_unpooling3d_backward_cpu @@ -9093,7 +7974,6 @@ CUDA: reflection_pad1d_out_cuda - func: reflection_pad1d(Tensor self, int[2] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, QuantizedCPU: reflection_pad1d_cpu @@ -9107,7 +7987,6 @@ CUDA: reflection_pad1d_backward_out_cuda - func: reflection_pad1d_backward(Tensor grad_output, Tensor self, int[2] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: reflection_pad1d_backward_cpu @@ -9121,7 +8000,6 @@ CUDA: reflection_pad2d_out_cuda - func: reflection_pad2d(Tensor self, int[4] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, QuantizedCPU: reflection_pad2d_cpu @@ -9135,7 +8013,6 @@ CUDA: reflection_pad2d_backward_out_cuda - func: reflection_pad2d_backward(Tensor grad_output, Tensor self, int[4] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: reflection_pad2d_backward_cpu @@ -9149,7 +8026,6 @@ CUDA: replication_pad1d_out_cuda - func: replication_pad1d(Tensor self, int[2] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: replication_pad1d_cpu @@ -9163,7 +8039,6 @@ CUDA: replication_pad1d_backward_out_cuda - func: replication_pad1d_backward(Tensor grad_output, Tensor self, int[2] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: replication_pad1d_backward_cpu @@ -9177,7 +8052,6 @@ CUDA: replication_pad2d_out_cuda - func: replication_pad2d(Tensor self, int[4] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: replication_pad2d_cpu @@ -9191,7 +8065,6 @@ CUDA: replication_pad2d_backward_out_cuda - func: replication_pad2d_backward(Tensor grad_output, Tensor self, int[4] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: replication_pad2d_backward_cpu @@ -9205,7 +8078,6 @@ CUDA: replication_pad3d_out_cuda - func: replication_pad3d(Tensor self, int[6] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: replication_pad3d_cpu @@ -9219,28 +8091,24 @@ CUDA: replication_pad3d_backward_out_cuda - func: replication_pad3d_backward(Tensor grad_output, Tensor self, int[6] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: replication_pad3d_backward_cpu CUDA: replication_pad3d_backward_cuda - func: upsample_linear1d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? 
scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_linear1d_cpu CUDA: upsample_linear1d_cuda - func: upsample_linear1d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_linear1d_backward_cpu CUDA: upsample_linear1d_backward_cuda - func: upsample_bilinear2d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bilinear2d_cpu @@ -9248,54 +8116,46 @@ QuantizedCPU: upsample_bilinear2d_quantized_cpu - func: upsample_bilinear2d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bilinear2d_backward_cpu CUDA: upsample_bilinear2d_backward_cuda - func: upsample_trilinear3d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_trilinear3d_cpu CUDA: upsample_trilinear3d_cuda - func: upsample_trilinear3d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_trilinear3d_backward_cpu CUDA: upsample_trilinear3d_backward_cuda - func: upsample_bicubic2d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bicubic2d_cpu CUDA: upsample_bicubic2d_cuda - func: upsample_bicubic2d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bicubic2d_backward_cpu CUDA: upsample_bicubic2d_backward_cuda - func: upsample_nearest1d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: upsample_nearest1d - func: upsample_nearest1d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: upsample_nearest1d_backward - func: upsample_nearest2d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest2d_cpu @@ -9303,14 +8163,12 @@ QuantizedCPU: upsample_nearest2d_quantized_cpu - func: upsample_nearest2d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest2d_backward_cpu CUDA: upsample_nearest2d_backward_cuda - func: upsample_nearest3d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest3d_cpu @@ -9318,7 +8176,6 @@ QuantizedCPU: upsample_nearest3d_quantized_cpu - func: upsample_nearest3d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, float[]? 
scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest3d_backward_cpu @@ -9333,7 +8190,6 @@ CUDA: upsample_linear1d_out_cuda - func: upsample_linear1d(Tensor self, int[1] output_size, bool align_corners, float? scales=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_linear1d_cpu @@ -9347,7 +8203,6 @@ CUDA: upsample_linear1d_backward_out_cuda - func: upsample_linear1d_backward(Tensor grad_output, int[1] output_size, int[3] input_size, bool align_corners, float? scales=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_linear1d_backward_cpu @@ -9361,7 +8216,6 @@ CUDA: upsample_bilinear2d_out_cuda - func: upsample_bilinear2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bilinear2d_cpu @@ -9376,7 +8230,6 @@ CUDA: upsample_bilinear2d_backward_out_cuda - func: upsample_bilinear2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bilinear2d_backward_cpu @@ -9390,7 +8243,6 @@ CUDA: upsample_bicubic2d_out_cuda - func: upsample_bicubic2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bicubic2d_cpu @@ -9404,7 +8256,6 @@ CUDA: upsample_bicubic2d_backward_out_cuda - func: upsample_bicubic2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bicubic2d_backward_cpu @@ -9418,7 +8269,6 @@ CUDA: upsample_trilinear3d_out_cuda - func: upsample_trilinear3d(Tensor self, int[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_trilinear3d_cpu @@ -9432,7 +8282,6 @@ CUDA: upsample_trilinear3d_backward_out_cuda - func: upsample_trilinear3d_backward(Tensor grad_output, int[3] output_size, int[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_trilinear3d_backward_cpu @@ -9447,7 +8296,6 @@ CUDA: upsample_nearest1d_out_cuda - func: upsample_nearest1d(Tensor self, int[1] output_size, float? scales=None) -> Tensor - use_c10_dispatcher: full python_module: nn structured_delegate: upsample_nearest1d.out @@ -9460,7 +8308,6 @@ CUDA: upsample_nearest1d_backward_out_cuda - func: upsample_nearest1d_backward(Tensor grad_output, int[1] output_size, int[3] input_size, float? scales=None) -> Tensor - use_c10_dispatcher: full python_module: nn structured_delegate: upsample_nearest1d_backward.grad_input @@ -9472,7 +8319,6 @@ CUDA: upsample_nearest2d_out_cuda - func: upsample_nearest2d(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest2d_cpu @@ -9487,7 +8333,6 @@ CUDA: upsample_nearest2d_backward_out_cuda - func: upsample_nearest2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, float? scales_h=None, float? 
scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest2d_backward_cpu @@ -9501,7 +8346,6 @@ CUDA: upsample_nearest3d_out_cuda - func: upsample_nearest3d(Tensor self, int[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest3d_cpu @@ -9516,7 +8360,6 @@ CUDA: upsample_nearest3d_backward_out_cuda - func: upsample_nearest3d_backward(Tensor grad_output, int[3] output_size, int[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest3d_backward_cpu @@ -9529,7 +8372,6 @@ CPU, CUDA: sigmoid_backward_out - func: sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: sigmoid_backward @@ -9541,7 +8383,6 @@ CPU, CUDA: logit_backward_out - func: logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: logit_backward @@ -9553,7 +8394,6 @@ CPU, CUDA: tanh_backward_out - func: tanh_backward(Tensor grad_output, Tensor output) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: tanh_backward @@ -9598,7 +8438,6 @@ CUDA: slow_conv_transpose2d_backward_out_cuda - func: slow_conv_transpose2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] output_padding, int[2] dilation, Tensor columns, Tensor ones, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv_transpose2d_backward_cpu @@ -9626,7 +8465,6 @@ CUDA: slow_conv_transpose3d_backward_out_cuda - func: slow_conv_transpose3d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[3] kernel_size, int[3] stride, int[3] padding, int[3] output_padding, int[3] dilation, Tensor finput, Tensor fgrad_input, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv_transpose3d_backward_cpu @@ -9662,7 +8500,6 @@ CUDA: slow_conv2d_backward_out_cuda - func: thnn_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, Tensor finput, Tensor fgrad_input, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv2d_backward_cpu @@ -9695,7 +8532,6 @@ CUDA: thnn_conv_depthwise2d_backward_out - func: thnn_conv_depthwise2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool[2] output_mask) -> (Tensor grad_input, Tensor grad_weight) - use_c10_dispatcher: full python_module: nn dispatch: CUDA: thnn_conv_depthwise2d_backward @@ -9727,7 +8563,6 @@ CPU: slow_conv3d_backward_out_cpu - func: slow_conv3d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[3] kernel_size, int[3] stride, int[3] padding, Tensor finput, Tensor fgrad_input, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv3d_backward_cpu @@ -9740,7 +8575,6 @@ CUDA: slow_conv_dilated2d_cuda - func: 
slow_conv_dilated2d_backward(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv_dilated2d_backward_cpu @@ -9754,7 +8588,6 @@ CUDA: slow_conv_dilated3d_cuda - func: slow_conv_dilated3d_backward(Tensor grad_output, Tensor self, Tensor weight, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv_dilated3d_backward_cpu @@ -9768,7 +8601,6 @@ CUDA: col2im_out_cuda - func: col2im(Tensor self, int[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: col2im_cpu @@ -9782,14 +8614,12 @@ CUDA: col2im_backward_out_cuda - func: col2im_backward(Tensor grad_output, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: col2im_backward_cpu CUDA: col2im_backward_cuda - func: column_stack(Tensor[] tensors) -> Tensor - use_c10_dispatcher: full dispatch: Math: column_stack @@ -9806,7 +8636,6 @@ CUDA: im2col_out_cuda - func: im2col(Tensor self, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: im2col_cpu @@ -9820,30 +8649,25 @@ CUDA: im2col_backward_out_cuda - func: im2col_backward(Tensor grad_output, int[2] input_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: im2col_backward_cpu CUDA: im2col_backward_cuda - func: isfinite(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method device_guard: False - func: isinf(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method device_guard: False - func: record_stream(Tensor(a!) self, Stream s) -> () - use_c10_dispatcher: full variants: method dispatch: CUDA: record_stream_cuda - func: isposinf(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: isposinf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -9852,7 +8676,6 @@ CPU, CUDA: isposinf_out - func: isneginf(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: isneginf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -9865,12 +8688,10 @@ # of the vmap frontend API (see torch/_vmap_internals.py). They are not # user-facing, hence the leading underscore. Please don't use them them anywhere else. - func: _add_batch_dim(Tensor self, int batch_dim, int level) -> Tensor - use_c10_dispatcher: full variants: function # See NOTE [_add_batch_dim and _remove_batch_dim] - func: _remove_batch_dim(Tensor self, int level, int batch_size, int out_dim) -> Tensor - use_c10_dispatcher: full variants: function ## Functions related to the fast Fourier transform and the torch.fft namespace @@ -9885,72 +8706,128 @@ # torch.fft.fft # NOTE: NOT an alias for torch.fft, which has different semantics - func: fft_fft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor - use_c10_dispatcher: full + python_module: fft + variants: function + +- func: fft_fft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) 
+ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures python_module: fft variants: function - func: fft_ifft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor - use_c10_dispatcher: full + python_module: fft + variants: function + +- func: fft_ifft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures python_module: fft variants: function - func: fft_rfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor - use_c10_dispatcher: full + python_module: fft + variants: function + +- func: fft_rfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures python_module: fft variants: function - func: fft_irfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor - use_c10_dispatcher: full + python_module: fft + variants: function + +- func: fft_irfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures python_module: fft variants: function - func: fft_hfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor - use_c10_dispatcher: full + python_module: fft + variants: function + +- func: fft_hfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures python_module: fft variants: function - func: fft_ihfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor - use_c10_dispatcher: full + python_module: fft + variants: function + +- func: fft_ihfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures python_module: fft variants: function - func: fft_fft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor - use_c10_dispatcher: full + python_module: fft + variants: function + +- func: fft_fft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures python_module: fft variants: function - func: fft_ifft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor - use_c10_dispatcher: full + python_module: fft + variants: function + +- func: fft_ifft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures python_module: fft variants: function - func: fft_rfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor - use_c10_dispatcher: full + python_module: fft + variants: function + +- func: fft_rfft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures python_module: fft variants: function - func: fft_irfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor - use_c10_dispatcher: full + python_module: fft + variants: function + +- func: fft_irfft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures python_module: fft variants: function - func: fft_fftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? 
norm=None) -> Tensor - use_c10_dispatcher: full + python_module: fft + variants: function + +- func: fft_fftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures python_module: fft variants: function - func: fft_ifftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor - use_c10_dispatcher: full + python_module: fft + variants: function + +- func: fft_ifftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures python_module: fft variants: function - func: fft_rfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor - use_c10_dispatcher: full + python_module: fft + variants: function + +- func: fft_rfftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures python_module: fft variants: function - func: fft_irfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor - use_c10_dispatcher: full + python_module: fft + variants: function + +- func: fft_irfftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures python_module: fft variants: function @@ -9959,18 +8836,26 @@ python_module: fft variants: function +- func: fft_fftfreq.out(int n, float d=1.0, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_rfftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor use_c10_dispatcher: hacky_wrapper_for_legacy_signatures python_module: fft variants: function +- func: fft_rfftfreq.out(int n, float d=1.0, *, Tensor(a!) out) -> Tensor(a!) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: fft + variants: function + - func: fft_fftshift(Tensor self, int[1]? dim=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function - func: fft_ifftshift(Tensor self, int[1]? dim=None) -> Tensor - use_c10_dispatcher: full python_module: fft variants: function @@ -9984,7 +8869,6 @@ # See linalg_det as an example. 
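
The entries above drop the per-operator `use_c10_dispatcher: full` lines (which appear to be redundant now that `full` is the default) and add `out=` overloads for the `torch.fft` functions, registered through `hacky_wrapper_for_legacy_signatures`. As a rough, hypothetical sketch only — not the actual fft kernels — a legacy-style out-variant kernel has this shape, writing into the caller-supplied buffer and returning it:

```cpp
#include <ATen/ATen.h>

// Hypothetical kernel, shown only to illustrate the out-variant convention
// behind the `.out` schemas above ("Tensor(a!) out" marks the mutated alias).
at::Tensor& fake_transform_out(at::Tensor& out, const at::Tensor& self) {
  at::Tensor result = self.clone();  // stand-in for the real transform
  out.resize_(result.sizes());       // out= buffers may be resized to fit
  out.copy_(result);
  return out;
}
```

The `hacky_wrapper_for_legacy_signatures` annotation marks kernels whose C++ signature still follows this older convention so that generated glue can adapt them to the dispatcher's newer calling convention.
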
- func: linalg_cholesky(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -9999,25 +8883,21 @@ # torch.linalg.det, alias for torch.det - func: linalg_det(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function - func: det(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: det - func: _syevd_helper(Tensor self, bool compute_eigenvectors, str uplo) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _syevd_helper_cpu CUDA: _syevd_helper_cuda - func: linalg_eigh(Tensor self, str UPLO="L") -> (Tensor eigenvalues, Tensor eigenvectors) - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10030,7 +8910,6 @@ DefaultBackend: linalg_eigh_out - func: linalg_eigvalsh(Tensor self, str UPLO="L") -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10043,7 +8922,6 @@ DefaultBackend: linalg_eigvalsh_out - func: inner(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: inner.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -10051,14 +8929,12 @@ # torch.outer, alias for torch.ger - func: outer(Tensor self, Tensor vec2) -> Tensor - use_c10_dispatcher: full variants: function, method - func: outer.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: ger(Tensor self, Tensor vec2) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: ger @@ -10069,12 +8945,10 @@ DefaultBackend: ger_out - func: linalg_norm(Tensor self, Scalar? ord=None, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function - func: linalg_norm.ord_str(Tensor self, str ord, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function @@ -10098,7 +8972,6 @@ variants: function - func: linalg_cond(Tensor self, Scalar? p=None) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10112,7 +8985,6 @@ Math: linalg_cond_out - func: linalg_cond.p_str(Tensor self, str p) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10160,7 +9032,6 @@ Math: linalg_tensorinv_out - func: linalg_tensorsolve(Tensor self, Tensor other, int[]? dims=None) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10195,7 +9066,6 @@ CUDA: _linalg_qr_helper_cuda - func: linalg_matrix_rank(Tensor self, float? tol=None, bool hermitian=False) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10211,32 +9081,27 @@ ## Functions that are only for testing # It is undocumented and should not be used outside of tests. - func: _test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full # Note: this function is only for testing. - func: _test_optional_intlist(Tensor values, int[]? addends) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: _test_optional_intlist # Note: this function is only for testing. - func: _test_optional_filled_intlist(Tensor values, int[2]? 
addends) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: _test_optional_intlist # Note: this function is only for testing. - func: _test_optional_floatlist(Tensor values, float[]? addends) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: _test_optional_floatlist # Note: this function is only for testing. - func: _test_string_default(Tensor dummy, str a="\"'\\", str b='"\'\\') -> Tensor - use_c10_dispatcher: full python_module: nn # Note: this function is only for testing. diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 9bb679beb3d0..6c3298b72e75 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -650,7 +650,7 @@ Tensor& add_out_dense_sparse_cpu(Tensor& r, const Tensor& dense, const SparseTen dstBuffer.add_(srcBuffer, value); } } else { - AT_DISPATCH_ALL_TYPES( + AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Bool, commonDtype, "add_dense_sparse", [&] { add_dense_sparse_worker_cpu(resultBuffer, value, sparse, indices, valuesBuffer); }); diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index c8366f71618e..fce3446816e7 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -338,8 +338,8 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, const SparseT if (sparse.dense_dim() == 0) { TORCH_CHECK(cuda::getApplyGrid(nnz, grid, curDevice), "add: Argument #0: tensor too large or too many dimensions"); - AT_DISPATCH_ALL_TYPES_AND2( - at::ScalarType::Half, at::ScalarType::BFloat16, commonDtype, "add_out_dense_sparse_cuda", [&] { + AT_DISPATCH_ALL_TYPES_AND3( + at::ScalarType::Bool, at::ScalarType::Half, at::ScalarType::BFloat16, commonDtype, "add_out_dense_sparse_cuda", [&] { apply::sparseElementwiseKernelScalar, uint64_t, scalar_t> <<>>( TensorCAddOp(value.to()), diff --git a/aten/src/ATen/native/vulkan/VulkanAten.cpp b/aten/src/ATen/native/vulkan/VulkanAten.cpp index 4dba9de7d5b0..88c519c09ea3 100644 --- a/aten/src/ATen/native/vulkan/VulkanAten.cpp +++ b/aten/src/ATen/native/vulkan/VulkanAten.cpp @@ -548,7 +548,7 @@ TORCH_LIBRARY_IMPL(aten, Vulkan, m) { m.impl("view", TORCH_FN(at::native::vulkan::aten::reshape)); m.impl("select.int", TORCH_FN(at::native::vulkan::aten::select)); m.impl("transpose.int", TORCH_FN(at::native::vulkan::aten::transpose)); - m.impl_UNBOXED("transpose_", at::native::vulkan::aten::transpose_); + m.impl("transpose_", at::native::vulkan::aten::transpose_); m.impl("view", TORCH_FN(at::native::vulkan::aten::view)); m.impl("unsqueeze", TORCH_FN(at::native::vulkan::aten::unsqueeze)); m.impl("empty.memory_format", at::native::vulkan::aten::empty); @@ -569,11 +569,11 @@ TORCH_LIBRARY_IMPL(aten, Vulkan, m) { m.impl("_cat", TORCH_FN(at::native::vulkan::aten::cat)); m.impl("mul.Scalar", TORCH_FN(at::native::vulkan::aten::mul_scalar)); m.impl("add.Scalar", TORCH_FN(at::native::vulkan::aten::add_scalar)); - m.impl_UNBOXED( + m.impl( "convolution_overrideable", at::native::vulkan::aten::convolution); m.impl("hardtanh_", at::native::vulkan::aten::hardtanh_); m.impl("relu_", at::native::vulkan::aten::relu_); - m.impl_UNBOXED("add_.Tensor", at::native::vulkan::aten::add_); + m.impl("add_.Tensor", at::native::vulkan::aten::add_); } #endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d.glsl 
b/aten/src/ATen/native/vulkan/glsl/conv2d.glsl index bb2508aefe65..547eec7fafef 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d.glsl @@ -17,7 +17,7 @@ layout(set = 0, binding = 4) uniform PRECISION restrict Block ivec2 padding; ivec2 dilate; vec2 clamp; - int stacks_per_tower; + ivec3 src_kernel; } uBlock; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -28,9 +28,6 @@ void main() { /* Dynamically Uniform */ const ivec3 size = imageSize(uOutput); const ivec3 isize = textureSize(uInput, 0); - const int tower = pos.z/(uBlock.stacks_per_tower); - const int tower_offset = pos.z % uBlock.stacks_per_tower; - const ivec4 block = tower_offset * uBlock.kernel.z + ivec4(0, 1, 2, 3); if (all(lessThan(pos, size))) { const ivec2 ipos = pos.xy * uBlock.stride - uBlock.padding; @@ -42,16 +39,15 @@ void main() { vec4 sum = uBias.data[pos.z]; for (int z = 0; z < uBlock.kernel.z; z+=4) { - const ivec4 kz = block + z; - - for (int y = start.y, ky = kstart.y; y < end.y; y += uBlock.dilate.y, ++ky) { - for (int x = start.x, kx = kstart.x; x < end.x; x += uBlock.dilate.x, ++kx) { - const vec4 In = texelFetch(uInput, ivec3(x, y, z/4), 0); - - sum = fma(In.xxxx, texelFetch(uKernel, ivec3(kx, (uBlock.kernel.y*tower) + ky, kz.x), 0), sum); - sum = fma(In.yyyy, texelFetch(uKernel, ivec3(kx, (uBlock.kernel.y*tower) + ky, kz.y), 0), sum); - sum = fma(In.zzzz, texelFetch(uKernel, ivec3(kx, (uBlock.kernel.y*tower) + ky, kz.z), 0), sum); - sum = fma(In.wwww, texelFetch(uKernel, ivec3(kx, (uBlock.kernel.y*tower) + ky, kz.w), 0), sum); + const int z4 = z/4; + for (int y = start.y, ky = kstart.y + pos.z * uBlock.src_kernel.y; y < end.y; y += uBlock.dilate.y, ++ky) { + for (int x = start.x, kx = 4*kstart.x + z4*uBlock.src_kernel.z; x < end.x; x += uBlock.dilate.x, kx+=4) { + const vec4 In = texelFetch(uInput, ivec3(x, y, z4), 0); + + sum = fma(In.xxxx, texelFetch(uKernel, ivec3(0 + kx, ky, 0), 0), sum); + sum = fma(In.yyyy, texelFetch(uKernel, ivec3(1 + kx, ky, 0), 0), sum); + sum = fma(In.zzzz, texelFetch(uKernel, ivec3(2 + kx, ky, 0), 0), sum); + sum = fma(In.wwww, texelFetch(uKernel, ivec3(3 + kx, ky, 0), 0), sum); } } } diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl index 0f49515718b2..f8f929461ce7 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl @@ -17,6 +17,7 @@ layout(set = 0, binding = 4) uniform PRECISION restrict Block ivec2 padding; ivec2 dilate; vec2 clamp; + ivec2 src_kernel; } uBlock; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -38,10 +39,10 @@ void main() { vec4 sum = uBias.data[pos.z]; for (int y = start.y, ky = kstart.y; y < end.y; y += uBlock.dilate.y, ++ky) { - for (int x = start.x, kx = kstart.x; x < end.x; x += uBlock.dilate.x, ++kx) { + for (int x = start.x, kx = kstart.x + ky*uBlock.src_kernel.x; x < end.x; x += uBlock.dilate.x, ++kx) { sum = fma( texelFetch(uInput, ivec3(x, y, pos.z), 0), - texelFetch(uKernel, ivec3(kx, ky, pos.z), 0), + texelFetch(uKernel, ivec3(kx, pos.z, 0), 0), sum); } } diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl index 1355b2c09b05..b28f0550132f 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl @@ -16,7 +16,6 @@ layout(set = 0, binding = 4) uniform PRECISION restrict Block ivec2 stride; ivec2 padding; vec2 
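
The rewritten shader loops above change how kernel texels are addressed (a flat `(kx, ky)` layout driven by `src_kernel` instead of the old tower arithmetic), but the per-step math is unchanged: each input texel carries four input channels, and four fused multiply-adds fold their contribution into the four output channels held in `sum`. A scalar C++ restatement of one such step, for illustration only:

```cpp
#include <array>

using vec4 = std::array<float, 4>;

// acc += s * k, component-wise -- the scalar equivalent of GLSL fma(In.ssss, K, acc)
vec4 fma4(const vec4& k, float s, vec4 acc) {
  for (int i = 0; i < 4; ++i) acc[i] += s * k[i];
  return acc;
}

// One accumulation step: `in` is a texel holding 4 input channels, and k[c] is
// the kernel texel pairing input channel c with the 4 output channels in `sum`.
vec4 conv_step(const vec4& in, const std::array<vec4, 4>& k, vec4 sum) {
  sum = fma4(k[0], in[0], sum);  // sum = fma(In.xxxx, K0, sum)
  sum = fma4(k[1], in[1], sum);  // sum = fma(In.yyyy, K1, sum)
  sum = fma4(k[2], in[2], sum);  // sum = fma(In.zzzz, K2, sum)
  sum = fma4(k[3], in[3], sum);  // sum = fma(In.wwww, K3, sum)
  return sum;
}
```
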
clamp; - int stacks_per_tower; } uBlock; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -27,9 +26,6 @@ void main() { /* Dynamically Uniform */ const ivec3 size = imageSize(uOutput); const ivec3 isize = textureSize(uInput, 0); - const int tower = pos.z/(uBlock.stacks_per_tower); - const int tower_offset = pos.z % uBlock.stacks_per_tower; - const ivec4 block = tower_offset * uBlock.kernel.x + ivec4(0, 1, 2, 3); if (all(lessThan(pos, size))) { const ivec2 ipos = pos.xy * uBlock.stride - uBlock.padding; @@ -38,12 +34,11 @@ void main() { for (int z = 0; z < uBlock.kernel.x; z+=4) { const vec4 In = texelFetch(uInput, ivec3(ipos.x, ipos.y, z/4), 0); - const ivec4 kz = block + z; - sum = fma(In.xxxx, texelFetch(uKernel, ivec3(0, tower, kz.x), 0), sum); - sum = fma(In.yyyy, texelFetch(uKernel, ivec3(0, tower, kz.y), 0), sum); - sum = fma(In.zzzz, texelFetch(uKernel, ivec3(0, tower, kz.z), 0), sum); - sum = fma(In.wwww, texelFetch(uKernel, ivec3(0, tower, kz.w), 0), sum); + sum = fma(In.xxxx, texelFetch(uKernel, ivec3(z+0, pos.z, 0), 0), sum); + sum = fma(In.yyyy, texelFetch(uKernel, ivec3(z+1, pos.z, 0), 0), sum); + sum = fma(In.zzzz, texelFetch(uKernel, ivec3(z+2, pos.z, 0), 0), sum); + sum = fma(In.wwww, texelFetch(uKernel, ivec3(z+3, pos.z, 0), 0), sum); } imageStore( diff --git a/aten/src/ATen/native/vulkan/ops/Clamp.cpp b/aten/src/ATen/native/vulkan/ops/Clamp.cpp index 369a47fee93a..9f25d89bca9b 100644 --- a/aten/src/ATen/native/vulkan/ops/Clamp.cpp +++ b/aten/src/ATen/native/vulkan/ops/Clamp.cpp @@ -167,10 +167,10 @@ Tensor& relu_(Tensor& self) { TORCH_LIBRARY_IMPL(aten, Vulkan, m) { m.impl("clamp", TORCH_FN(clamp)); m.impl("clamp_", TORCH_FN(clamp_)); - m.impl_UNBOXED("hardtanh", hardtanh); - m.impl_UNBOXED("hardtanh_", hardtanh_); - m.impl_UNBOXED("relu", relu); - m.impl_UNBOXED("relu_", relu_); + m.impl("hardtanh", hardtanh); + m.impl("hardtanh_", hardtanh_); + m.impl("relu", relu); + m.impl("relu_", relu_); } #endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/ops/Common.h b/aten/src/ATen/native/vulkan/ops/Common.h index 6f7080f71a80..b0bbeeaf34f1 100644 --- a/aten/src/ATen/native/vulkan/ops/Common.h +++ b/aten/src/ATen/native/vulkan/ops/Common.h @@ -36,7 +36,7 @@ struct Layout final { }; struct Experimentation { - static constexpr bool kUseConv2dOldApi = true; + static constexpr bool kUseConv2dOldApi = false; }; struct ConvPrepackLimits final { diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.cpp b/aten/src/ATen/native/vulkan/ops/Convolution.cpp index 5af2c14b80cb..d88545e3a25a 100644 --- a/aten/src/ATen/native/vulkan/ops/Convolution.cpp +++ b/aten/src/ATen/native/vulkan/ops/Convolution.cpp @@ -25,7 +25,7 @@ inline bool is_pointwise(const IntArrayRef filter) { (1 == filter[Layout::Filter::width]); } -vTensor pack_weights( +vTensor pack_weights_dw( api::Resource::Pool& pool, const Tensor& weight_arg, const int64_t groups) { @@ -39,161 +39,201 @@ vTensor pack_weights( const IntArrayRef src_filter = weight.sizes(); const float* const src_weight_ptr = weight.data_ptr(); - // - // Depthwise - // + const int64_t src_kw_sz = src_filter[Layout::Filter::width]; + const int64_t src_kh_sz = src_filter[Layout::Filter::height]; + const int64_t num_stacks = div_up(src_filter[Layout::Filter::output], INT64_C(4)); + vTensor v_weight{ + api::context(), + &pool, + { + 4, + num_stacks, + src_kw_sz * src_kh_sz, + }, + weight.options(), + }; - if (is_depthwise(src_filter, groups)) { - vTensor v_weight{ - api::context(), - &pool, - src_filter, - 
weight.options(), - }; + using Future = vTensor::Future; + Future v_weight_future = v_weight.host(); + Future::Payload v_weight_payload = v_weight_future.wait(); - using Future = vTensor::Future; - Future v_weight_future = v_weight.host(); - Future::Payload v_weight_payload = v_weight_future.wait(); + /* Source */ + const int64_t src_kernel_sz = src_kw_sz * src_kh_sz; + const int64_t src_block_sz = + src_kernel_sz * src_filter[Layout::Filter::input]; - memcpy( - v_weight_payload.get(), - src_weight_ptr, - std::min(weight.nbytes(), v_weight.nbytes())); + /* Destination */ + const int64_t dst_kw_sz = src_kw_sz * src_kh_sz; + const int64_t dst_kh_sz = num_stacks; + const int64_t dst_kernel_sz = dst_kw_sz * dst_kh_sz; - return v_weight; - } + float* const dst_weight_ptr = v_weight_payload.get(); + memset(dst_weight_ptr, 0, v_weight.nbytes()); - // - // General - // + for (int64_t src_oc = 0; src_oc < src_filter[Layout::Filter::output]; ++src_oc) { + /* Source */ + const float* const src_weight_oc_ptr = src_weight_ptr + src_oc * src_block_sz; - if (Experimentation::kUseConv2dOldApi) { - const uint32_t OC = src_filter[Layout::Filter::output]; - const uint32_t OC_4 = at::native::vulkan::api::utils::div_up(OC, 4u); - const uint32_t C = src_filter[Layout::Filter::input]; - const uint32_t C_4 = at::native::vulkan::api::utils::div_up(C, 4u); - const uint32_t KH = src_filter[Layout::Filter::height]; - const uint32_t KW = src_filter[Layout::Filter::width]; - - vTensor v_weight{ - api::context(), - &pool, - { - 1, - 4 * KH * KW, - OC_4, - 4 * C_4 - }, - weight.options(), - }; + /* Destination */ + const int64_t dst_oh = src_oc / 4; + const int64_t dst_c = src_oc % 4; - using Future = vTensor::Future; - Future v_weight_future = v_weight.host(); - Future::Payload v_weight_payload = v_weight_future.wait(); + float* const dst_weight_c_ptr = dst_weight_ptr + dst_c * dst_kernel_sz; - float* const dst_weight_ptr = v_weight_payload.get(); - memset(dst_weight_ptr, 0, v_weight.nbytes()); + for (int64_t src_ih = 0; src_ih < src_filter[Layout::Filter::height]; ++src_ih) { + memcpy( + dst_weight_c_ptr + dst_oh * dst_kw_sz + src_ih * src_kw_sz, + src_weight_oc_ptr + src_ih * src_kw_sz, + sizeof(float) * src_kw_sz); + } + } - const float* src = src_weight_ptr; - float* const dst = dst_weight_ptr; + return v_weight; +} +vTensor pack_weights_old( + api::Resource::Pool& pool, + const Tensor& weight_arg, + const int64_t groups) { + if (weight_arg.is_vulkan()) { + return convert(weight_arg); + } + + const Tensor weight = weight_arg.contiguous(); + const IntArrayRef src_filter = weight.sizes(); + const float* const src_weight_ptr = weight.data_ptr(); + + const uint32_t OC = src_filter[Layout::Filter::output]; + const uint32_t OC_4 = at::native::vulkan::api::utils::div_up(OC, 4u); + const uint32_t C = src_filter[Layout::Filter::input]; + const uint32_t C_4 = at::native::vulkan::api::utils::div_up(C, 4u); + const uint32_t KH = src_filter[Layout::Filter::height]; + const uint32_t KW = src_filter[Layout::Filter::width]; + + vTensor v_weight{ + api::context(), + &pool, { - uint32_t ridx = 0; - const uint32_t oc_4SizeNumel = KW * KH * C_4 * 16; - for (uint32_t oc = 0; oc < OC; ++oc) { - int oc_4 = oc / 4; - int oc_4_i = oc % 4; - float* dst_oc = dst + oc_4 * oc_4SizeNumel; - for (uint32_t ic = 0; ic < C; ++ic) { - int ic_4 = ic / 4; - int ic_4_i = ic % 4; - float* dst_ic = dst_oc + ic_4 * KW * KH * 16; - for (uint32_t ky = 0; ky < KH; ++ky) { - float* dst_ky = dst_ic + ky * KW * 16; - for (uint32_t kx = 0; kx < KW; ++kx) { - 
float* dst_kx = dst_ky + kx * 16; - dst_kx[4 * ic_4_i + oc_4_i] = src[ridx++]; - } + 1, + 4 * KH * KW, + OC_4, + 4 * C_4 + }, + weight.options(), + }; + + using Future = vTensor::Future; + Future v_weight_future = v_weight.host(); + Future::Payload v_weight_payload = v_weight_future.wait(); + + float* const dst_weight_ptr = v_weight_payload.get(); + memset(dst_weight_ptr, 0, v_weight.nbytes()); + + const float* src = src_weight_ptr; + float* const dst = dst_weight_ptr; + + { + uint32_t ridx = 0; + const uint32_t oc_4SizeNumel = KW * KH * C_4 * 16; + for (uint32_t oc = 0; oc < OC; ++oc) { + int oc_4 = oc / 4; + int oc_4_i = oc % 4; + float* dst_oc = dst + oc_4 * oc_4SizeNumel; + for (uint32_t ic = 0; ic < C; ++ic) { + int ic_4 = ic / 4; + int ic_4_i = ic % 4; + float* dst_ic = dst_oc + ic_4 * KW * KH * 16; + for (uint32_t ky = 0; ky < KH; ++ky) { + float* dst_ky = dst_ic + ky * KW * 16; + for (uint32_t kx = 0; kx < KW; ++kx) { + float* dst_kx = dst_ky + kx * 16; + dst_kx[4 * ic_4_i + oc_4_i] = src[ridx++]; } } } + } - // shader KO4C4HW_to_image - struct Image3D { - float* data_; - uint32_t dim0_, dim1_, dim2_; - - Image3D(uint32_t dim0, uint32_t dim1, uint32_t dim2) { - dim0_ = dim0; - dim1_ = dim1; - dim2_ = dim2; - data_ = new float[dim0 * dim1 * dim2 * 4]; - memset(data_, 0.f, dim0 * dim1 * dim2 * 4 * sizeof(float)); - } + // shader KO4C4HW_to_image + struct Image3D { + float* data_; + uint32_t dim0_, dim1_, dim2_; + + Image3D(uint32_t dim0, uint32_t dim1, uint32_t dim2) { + dim0_ = dim0; + dim1_ = dim1; + dim2_ = dim2; + data_ = new float[dim0 * dim1 * dim2 * 4]; + memset(data_, 0.f, dim0 * dim1 * dim2 * 4 * sizeof(float)); + } - inline uint32_t idx(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3) { - return i3 + i2 * 4 + i1 * 4 * dim2_ + i0 * 4 * dim2_ * dim1_; - } + inline uint32_t idx(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3) { + return i3 + i2 * 4 + i1 * 4 * dim2_ + i0 * 4 * dim2_ * dim1_; + } - void set(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, float value) { - data_[idx(i0, i1, i2, i3)] = value; - } + void set(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, float value) { + data_[idx(i0, i1, i2, i3)] = value; + } - float get(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3) { - return data_[idx(i0, i1, i2, i3)]; - } - } image{4 * C_4, OC_4, KH * KW}; - - for (uint32_t sx = 0; sx < C_4; ++sx) { - for (uint32_t sy = 0; sy < OC_4; ++sy) { - for (uint32_t sz = 0; sz < (KH * KW); ++sz) { - for (uint32_t vi = 0; vi < 4; ++vi) { - int bufferVIdx = 4 * sx * KH * KW + 4 * sy * C_4 * KH * KW + 4 * sz; - image.set(4 * sx + 0, sy, sz, vi, dst[4 * (bufferVIdx + 0) + vi]); - image.set(4 * sx + 1, sy, sz, vi, dst[4 * (bufferVIdx + 1) + vi]); - image.set(4 * sx + 2, sy, sz, vi, dst[4 * (bufferVIdx + 2) + vi]); - image.set(4 * sx + 3, sy, sz, vi, dst[4 * (bufferVIdx + 3) + vi]); - } + float get(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3) { + return data_[idx(i0, i1, i2, i3)]; + } + } image{4 * C_4, OC_4, KH * KW}; + + for (uint32_t sx = 0; sx < C_4; ++sx) { + for (uint32_t sy = 0; sy < OC_4; ++sy) { + for (uint32_t sz = 0; sz < (KH * KW); ++sz) { + for (uint32_t vi = 0; vi < 4; ++vi) { + int bufferVIdx = 4 * sx * KH * KW + 4 * sy * C_4 * KH * KW + 4 * sz; + image.set(4 * sx + 0, sy, sz, vi, dst[4 * (bufferVIdx + 0) + vi]); + image.set(4 * sx + 1, sy, sz, vi, dst[4 * (bufferVIdx + 1) + vi]); + image.set(4 * sx + 2, sy, sz, vi, dst[4 * (bufferVIdx + 2) + vi]); + image.set(4 * sx + 3, sy, sz, vi, dst[4 * (bufferVIdx + 3) + vi]); } } } + } - // inverse function of 
nchw_to_image - const uint32_t W = 4 * C_4; - const uint32_t H = OC_4; - const uint32_t D = KH * KW; - for (uint32_t sx = 0; sx < W; ++sx) { - for (uint32_t sy = 0; sy < H; ++sy) { - for (uint32_t sz = 0; sz < D; ++sz) { - for (uint32_t szvi = 0; szvi < 4; ++szvi) { - dst_weight_ptr[W * sy + sx + (4 * sz + szvi) * W * H] = image.get(sx, sy, sz, szvi); - } + // inverse function of nchw_to_image + const uint32_t W = 4 * C_4; + const uint32_t H = OC_4; + const uint32_t D = KH * KW; + for (uint32_t sx = 0; sx < W; ++sx) { + for (uint32_t sy = 0; sy < H; ++sy) { + for (uint32_t sz = 0; sz < D; ++sz) { + for (uint32_t szvi = 0; szvi < 4; ++szvi) { + dst_weight_ptr[W * sy + sx + (4 * sz + szvi) * W * H] = image.get(sx, sy, sz, szvi); } } } } + } - return v_weight; + return v_weight; +} + +vTensor pack_weights_2d( + api::Resource::Pool& pool, + const Tensor& weight_arg, + const int64_t groups) { + if (weight_arg.is_vulkan()) { + return convert(weight_arg); } + const Tensor weight = weight_arg.contiguous(); + const IntArrayRef src_filter = weight.sizes(); + const float* const src_weight_ptr = weight.data_ptr(); + + const int64_t src_kw_sz = src_filter[Layout::Filter::width]; + const int64_t src_kh_sz = src_filter[Layout::Filter::height]; const int64_t num_stacks = div_up(src_filter[Layout::Filter::output], INT64_C(4)); - const int64_t stack_depth = - 4 * api::utils::align_up(src_filter[Layout::Filter::input], INT64_C(4)); - const int64_t max_stacks_per_tower = - ConvPrepackLimits::maxStackDepth / stack_depth; - const int64_t num_towers = div_up(num_stacks, max_stacks_per_tower); - int64_t stacks_per_tower = num_stacks; - if (num_towers > 1) { - stacks_per_tower = div_up(num_stacks, num_towers); - } + const int64_t stack_depth = api::utils::align_up(src_filter[Layout::Filter::input], INT64_C(4)); vTensor v_weight{ api::context(), &pool, { - stacks_per_tower, - stack_depth, - src_filter[Layout::Filter::height] * num_towers, - src_filter[Layout::Filter::width], + 4, + src_kh_sz * num_stacks, + src_kw_sz * stack_depth, }, weight.options(), }; @@ -203,53 +243,59 @@ vTensor pack_weights( Future::Payload v_weight_payload = v_weight_future.wait(); /* Source */ - const int64_t src_kw_sz = src_filter[Layout::Filter::width]; - const int64_t src_kh_sz = src_filter[Layout::Filter::height]; const int64_t src_kernel_sz = src_kw_sz * src_kh_sz; const int64_t src_block_sz = src_kernel_sz * src_filter[Layout::Filter::input]; /* Destination */ - const IntArrayRef dst_filter = v_weight.sizes(); - const int64_t dst_kw_sz = src_filter[Layout::Filter::width]; - const int64_t dst_kh_sz = src_filter[Layout::Filter::height] * num_towers; + const int64_t dst_kw_sz = src_kw_sz * stack_depth; + const int64_t dst_kh_sz = src_kh_sz * num_stacks; const int64_t dst_kernel_sz = dst_kw_sz * dst_kh_sz; - const int64_t dst_block_sz = - dst_kernel_sz * dst_filter[Layout::Filter::input]; - - TORCH_INTERNAL_ASSERT(src_kernel_sz*num_towers == dst_kernel_sz, "Internal error!"); float* const dst_weight_ptr = v_weight_payload.get(); memset(dst_weight_ptr, 0, v_weight.nbytes()); for (int64_t src_oc = 0; src_oc < src_filter[Layout::Filter::output]; ++src_oc) { - const int64_t i_tower = src_oc / (stacks_per_tower * 4); /* Source */ - const float* const src_weight_oc_ptr = - src_weight_ptr + src_oc * src_block_sz; + const float* const src_weight_oc_ptr = src_weight_ptr + src_oc * src_block_sz; /* Destination */ - const int64_t local_oc = src_oc % (stacks_per_tower * 4); - const int64_t dst_oc = local_oc / 4; - const int64_t dst_oc_offset = 
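
A minimal sketch of the `div_up`/`align_up` arithmetic behind the texture sizing in `pack_weights_2d` above (the real helpers live in `api::utils`; the definitions below are assumed stand-ins):

```cpp
#include <cstdint>

// Assumed stand-ins for api::utils::div_up / align_up, shown only to make the
// sizing arithmetic concrete.
constexpr int64_t div_up(int64_t x, int64_t y)   { return (x + y - 1) / y; }
constexpr int64_t align_up(int64_t x, int64_t y) { return div_up(x, y) * y; }

// Example: a 3x3 filter with 10 output and 6 input channels packs into
//   num_stacks  = div_up(10, 4) = 3   // output channels grouped 4 per texel
//   stack_depth = align_up(6, 4) = 8  // input channels padded to a multiple of 4
// giving a {4, KH * num_stacks, KW * stack_depth} = {4, 9, 24} texture
// in the layout constructed above.
```

Grouping channels in fours matches the `vec4` texels the shaders fetch.
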
local_oc % 4; + const int64_t dst_oh = src_oc / 4; + const int64_t dst_c = src_oc % 4; - float* const dst_weight_oc_ptr = dst_weight_ptr + dst_oc * dst_block_sz + - dst_oc_offset * dst_kernel_sz; + float* const dst_weight_c_ptr = dst_weight_ptr + dst_c * dst_kernel_sz; for (int64_t src_ic = 0; src_ic < src_filter[Layout::Filter::input]; ++src_ic) { - const int64_t dst_ic = 4 * src_ic; - - memcpy( - dst_weight_oc_ptr + dst_ic * dst_kernel_sz + - (i_tower * src_kernel_sz), - src_weight_oc_ptr + src_ic * src_kernel_sz, - sizeof(float) * src_kernel_sz); + const int64_t dst_ic4 = src_ic/4; + for (int64_t src_ih = 0; src_ih < src_kh_sz; ++src_ih) { + for (int64_t src_iw = 0; src_iw < src_kw_sz; ++src_iw) { + memcpy( + dst_weight_c_ptr + (dst_oh * src_kh_sz + src_ih) * dst_kw_sz + + dst_ic4 * src_kw_sz * 4 + src_iw * 4 + src_ic % 4, + src_weight_oc_ptr + src_ic * src_kernel_sz + src_ih * src_kw_sz + src_iw, + sizeof(float)); + } + } } } return v_weight; } +vTensor pack_weights( + api::Resource::Pool& pool, + const Tensor& weight_arg, + const int64_t groups) { + if (is_depthwise(weight_arg.sizes(), groups)) { + return pack_weights_dw(pool, weight_arg, groups); + } + + if (Experimentation::kUseConv2dOldApi) { + return pack_weights_old(pool, weight_arg, groups); + } + return pack_weights_2d(pool, weight_arg, groups); +} + vTensor pack_biases( api::Resource::Pool& pool, const c10::optional& bias, @@ -394,6 +440,7 @@ void conv2d_depthwise( const vTensor& v_weight, const vTensor& v_bias, const IntArrayRef filter, + const IntArrayRef src_filter, const IntArrayRef stride, const IntArrayRef padding, const IntArrayRef dilation, @@ -406,6 +453,7 @@ void conv2d_depthwise( int32_t padding_x, padding_y; int32_t dilate_x, dilate_y; float clamp_x, clamp_y; + int32_t src_filter_w, src_filter_h; } block { safe_downcast(filter[Layout::Filter::width]), safe_downcast(filter[Layout::Filter::height]), @@ -417,6 +465,8 @@ void conv2d_depthwise( safe_downcast(dilation[Layout::Parameter::height]), output_min, output_max, + safe_downcast(src_filter[Layout::Filter::width]), + safe_downcast(src_filter[Layout::Filter::height]), }; context->dispatch( @@ -473,14 +523,12 @@ void conv2d_pointwise( const float output_min, const float output_max) { if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) { - const int64_t stacks_per_tower = v_weight.sizes()[0]; const struct { int32_t kernel_ic, kernel_oc; int32_t stride_x, stride_y; int32_t padding_x, padding_y; float clamp_x, clamp_y; - int32_t stacks_per_tower; } block { safe_downcast(filter[Layout::Filter::input]), safe_downcast(filter[Layout::Filter::output]), @@ -490,7 +538,6 @@ void conv2d_pointwise( safe_downcast(padding[Layout::Parameter::height]), output_min, output_max, - safe_downcast(stacks_per_tower), }; context->dispatch( @@ -542,20 +589,20 @@ void conv2d( const vTensor& v_weight, const vTensor& v_bias, const IntArrayRef filter, + const IntArrayRef src_filter, const IntArrayRef stride, const IntArrayRef padding, const IntArrayRef dilation, const float output_min, const float output_max) { if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) { - const int64_t stacks_per_tower = v_weight.sizes()[0]; const struct { int32_t kernel_x, kernel_y, kernel_ic, kernel_oc; int32_t stride_x, stride_y; int32_t padding_x, padding_y; int32_t dilate_x, dilate_y; float clamp_x, clamp_y; - int32_t stacks_per_tower; + int32_t src_filter_w, src_filter_h, src_filter_w4; } block { safe_downcast(filter[Layout::Filter::width]), 
safe_downcast(filter[Layout::Filter::height]), @@ -569,7 +616,9 @@ void conv2d( safe_downcast(dilation[Layout::Parameter::height]), output_min, output_max, - safe_downcast(stacks_per_tower), + safe_downcast(src_filter[Layout::Filter::width]), + safe_downcast(src_filter[Layout::Filter::height]), + safe_downcast(src_filter[Layout::Filter::width]*4), }; context->dispatch( @@ -639,7 +688,7 @@ Tensor convolution( #ifdef USE_VULKAN_API TORCH_LIBRARY_IMPL(aten, Vulkan, m) { - m.impl_UNBOXED("convolution_overrideable", convolution); + m.impl("convolution_overrideable", convolution); } #endif /* USE_VULKAN_API */ @@ -859,6 +908,7 @@ Tensor Conv2dOpContext::run(const Tensor& input_arg) const { packed_.v_weight, packed_.v_bias, packed_.filter, + unpacked_.filter, packed_.stride, packed_.padding, packed_.dilation, @@ -904,6 +954,7 @@ Tensor Conv2dOpContext::run(const Tensor& input_arg) const { packed_.v_weight, packed_.v_bias, packed_.filter, + unpacked_.filter, packed_.stride, packed_.padding, packed_.dilation, diff --git a/aten/src/ATen/native/vulkan/ops/Factory.cpp b/aten/src/ATen/native/vulkan/ops/Factory.cpp index 6e48ba120c31..14deb30b9888 100644 --- a/aten/src/ATen/native/vulkan/ops/Factory.cpp +++ b/aten/src/ATen/native/vulkan/ops/Factory.cpp @@ -45,7 +45,7 @@ Tensor empty_strided( #ifdef USE_VULKAN_API TORCH_LIBRARY_IMPL(aten, Vulkan, m) { - m.impl_UNBOXED("empty.memory_format", at::native::vulkan::ops::empty_memory_format); + m.impl("empty.memory_format", at::native::vulkan::ops::empty_memory_format); m.impl("empty_strided", TORCH_FN(at::native::vulkan::ops::empty_strided)); } diff --git a/aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp b/aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp index e8442a64d0ad..da13fb9574d5 100644 --- a/aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp +++ b/aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp @@ -73,21 +73,21 @@ TORCH_LIBRARY(xnnpack, m) { } TORCH_LIBRARY(prepacked, m) { - m.def("linear_clamp_prepack(Tensor W, Tensor? B=None, Scalar? output_min=None, Scalar? output_max=None) -> __torch__.torch.classes.xnnpack.LinearOpContext"); - m.def("linear_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.LinearOpContext W_prepack) -> Tensor Y"); - m.def("conv2d_clamp_prepack(Tensor W, Tensor? B, int[2] stride, int[2] padding, int[2] dilation, int groups, Scalar? output_min=None, Scalar? output_max=None) -> __torch__.torch.classes.xnnpack.Conv2dOpContext"); - m.def("conv2d_transpose_clamp_prepack(Tensor W, Tensor? B, int[2] stride, int[2] padding, int[2] output_padding, int[2] dilation, int groups, Scalar? output_min=None, Scalar? output_max=None) -> __torch__.torch.classes.xnnpack.TransposeConv2dOpContext"); - m.def("conv2d_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.Conv2dOpContext W_prepack) -> Tensor Y"); - m.def("conv2d_transpose_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.TransposeConv2dOpContext W_prepack) -> Tensor Y"); + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::linear_clamp_prepack(Tensor W, Tensor? B=None, Scalar? output_min=None, Scalar? output_max=None) -> __torch__.torch.classes.xnnpack.LinearOpContext")); + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::linear_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.LinearOpContext W_prepack) -> Tensor Y")); + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::conv2d_clamp_prepack(Tensor W, Tensor? B, int[2] stride, int[2] padding, int[2] dilation, int groups, Scalar? output_min=None, Scalar? 
output_max=None) -> __torch__.torch.classes.xnnpack.Conv2dOpContext")); + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::conv2d_transpose_clamp_prepack(Tensor W, Tensor? B, int[2] stride, int[2] padding, int[2] output_padding, int[2] dilation, int groups, Scalar? output_min=None, Scalar? output_max=None) -> __torch__.torch.classes.xnnpack.TransposeConv2dOpContext")); + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::conv2d_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.Conv2dOpContext W_prepack) -> Tensor Y")); + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::conv2d_transpose_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.TransposeConv2dOpContext W_prepack) -> Tensor Y")); } TORCH_LIBRARY_IMPL(prepacked, CPU, m) { - m.impl("linear_clamp_prepack", TORCH_FN(createLinearClampPrePackOpContext)); - m.impl("linear_clamp_run", TORCH_FN(internal::linear::linear_clamp_run)); - m.impl("conv2d_clamp_prepack", TORCH_FN(createConv2dClampPrePackOpContext)); - m.impl("conv2d_transpose_clamp_prepack", TORCH_FN(createConv2dTransposeClampPrePackOpContext)); - m.impl("conv2d_clamp_run", TORCH_FN(internal::convolution2d::conv2d_clamp_run)); - m.impl("conv2d_transpose_clamp_run", TORCH_FN(internal::convolution2d::conv2d_transpose_clamp_run)); + m.impl(TORCH_SELECTIVE_NAME("prepacked::linear_clamp_prepack"), TORCH_FN(createLinearClampPrePackOpContext)); + m.impl(TORCH_SELECTIVE_NAME("prepacked::linear_clamp_run"), TORCH_FN(internal::linear::linear_clamp_run)); + m.impl(TORCH_SELECTIVE_NAME("prepacked::conv2d_clamp_prepack"), TORCH_FN(createConv2dClampPrePackOpContext)); + m.impl(TORCH_SELECTIVE_NAME("prepacked::conv2d_transpose_clamp_prepack"), TORCH_FN(createConv2dTransposeClampPrePackOpContext)); + m.impl(TORCH_SELECTIVE_NAME("prepacked::conv2d_clamp_run"), TORCH_FN(internal::convolution2d::conv2d_clamp_run)); + m.impl(TORCH_SELECTIVE_NAME("prepacked::conv2d_transpose_clamp_run"), TORCH_FN(internal::convolution2d::conv2d_transpose_clamp_run)); } } // namespace xnnpack diff --git a/aten/src/ATen/templates/RegisterDispatchKey.cpp b/aten/src/ATen/templates/RegisterDispatchKey.cpp index e923f6d73bd0..ed4359c6883e 100644 --- a/aten/src/ATen/templates/RegisterDispatchKey.cpp +++ b/aten/src/ATen/templates/RegisterDispatchKey.cpp @@ -37,10 +37,13 @@ namespace at { -namespace { - ${dispatch_definitions} +// NB: TORCH_LIBRARY_IMPL must be in an anonymous namespace to avoid +// ambiguity with conflicting identifiers that may have been defined in +// at namespace already. +namespace { + TORCH_LIBRARY_IMPL(aten, ${DispatchKey}, m) { ${dispatch_registrations} } diff --git a/aten/src/ATen/templates/TensorBody.h b/aten/src/ATen/templates/TensorBody.h index 1c0a04a318d0..0dfef701c51b 100644 --- a/aten/src/ATen/templates/TensorBody.h +++ b/aten/src/ATen/templates/TensorBody.h @@ -208,10 +208,6 @@ class TORCH_API Tensor { Tensor& operator=(const Tensor&) &&; Tensor& operator=(Tensor&&) &&; - #ifdef _MSC_VER - #pragma warning( pop ) - #endif - bool is_same(const Tensor& other) const noexcept { return impl_ == other.impl_; } @@ -761,6 +757,12 @@ class TORCH_API Tensor { c10::intrusive_ptr impl_; }; +// For "multiple ... 
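
The `prepacked` registrations above now wrap schema strings in `TORCH_SELECTIVE_SCHEMA` and kernel names in `TORCH_SELECTIVE_NAME`, which lets selective (mobile) builds strip operators a model doesn't use; the Vulkan files earlier in the patch similarly move from `impl_UNBOXED` to plain `m.impl`. A self-contained sketch of the same registration pattern for a made-up operator (names are illustrative, not from the patch):

```cpp
#include <torch/library.h>
#include <ATen/ATen.h>

// Hypothetical op used only to illustrate the registration calls above.
at::Tensor myrelu_cpu(const at::Tensor& self) {
  return self.clamp_min(0);
}

TORCH_LIBRARY(myops, m) {
  // Wrapping the schema string lets selective builds drop the op entirely.
  m.def(TORCH_SELECTIVE_SCHEMA("myops::myrelu(Tensor self) -> Tensor"));
}

TORCH_LIBRARY_IMPL(myops, CPU, m) {
  // TORCH_FN registers a statically known function pointer; the same plain
  // m.impl call is what replaces the old impl_UNBOXED registrations elsewhere
  // in this patch.
  m.impl(TORCH_SELECTIVE_NAME("myops::myrelu"), TORCH_FN(myrelu_cpu));
}
```
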
operators specified" warnings, closing brace of class +// declaration must be included between pragma push & pop +#ifdef _MSC_VER +#pragma warning( pop ) +#endif + int64_t get_device(Tensor self); template diff --git a/aten/src/ATen/test/cpu_rng_test.cpp b/aten/src/ATen/test/cpu_rng_test.cpp index 6d596095d7a0..805ed40557b6 100644 --- a/aten/src/ATen/test/cpu_rng_test.cpp +++ b/aten/src/ATen/test/cpu_rng_test.cpp @@ -28,6 +28,8 @@ struct TestCPUGenerator : public c10::GeneratorImpl { void set_current_seed(uint64_t seed) override { throw std::runtime_error("not implemented"); } uint64_t current_seed() const override { throw std::runtime_error("not implemented"); } uint64_t seed() override { throw std::runtime_error("not implemented"); } + void set_state(const c10::TensorImpl& new_state) override { throw std::runtime_error("not implemented"); } + c10::intrusive_ptr get_state() const override { throw std::runtime_error("not implemented"); } TestCPUGenerator* clone_impl() const override { throw std::runtime_error("not implemented"); } static DeviceType device_type() { return DeviceType::CPU; } diff --git a/aten/src/ATen/test/ivalue_test.cpp b/aten/src/ATen/test/ivalue_test.cpp index 14e75205aa66..a0e2648758ff 100644 --- a/aten/src/ATen/test/ivalue_test.cpp +++ b/aten/src/ATen/test/ivalue_test.cpp @@ -51,6 +51,91 @@ TEST(IValueTest, Basic) { ASSERT_EQ(tv.use_count(), 2); } +static std::array makeSampleIValues() { + return { at::rand({3, 4}), "hello", 42, true, 1.5 }; +} + +static std::array makeMoreSampleIValues() { + return { at::rand({3, 4}), "goodbye", 23, false, 0.5 }; +} + +// IValue::operator== doesn't seem to work on Tensors. +#define EXPECT_IVALUE_EQ(a, b) \ + EXPECT_EQ((a).isTensor(), (b).isTensor()); \ + if ((a).isTensor()) { \ + EXPECT_TRUE(a.toTensor().equal(b.toTensor())); \ + } else { \ + EXPECT_EQ(a, b); \ + } + +TEST(IValueTest, Swap) { + // swap() has the following 3 cases: tensor, intrusive_ptr, or + // neither. Exercise all pairs of the three. 
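
The tests that follow exercise IValue swap, copy, and move across the three payload representations named in the comment above (Tensor, intrusive_ptr-backed, and inline scalar). A standalone sketch of constructing one of each, using only public IValue constructors (values arbitrary):

```cpp
#include <ATen/ATen.h>
#include <ATen/core/ivalue.h>

void ivalue_payload_demo() {
  c10::IValue tensor_val(at::rand({2, 2}));     // Tensor payload
  c10::IValue string_val(std::string("hello")); // intrusive_ptr payload (ConstantString)
  c10::IValue int_val(42);                      // inline payload, no refcounting

  // swap() must handle every pairing of these three representations, which is
  // exactly what the nested loops in IValueTest.Swap walk through.
  tensor_val.swap(string_val);
  string_val.swap(int_val);
}
```
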
+ + auto sampleInputs = makeSampleIValues(); + auto sampleTargets = makeMoreSampleIValues(); + for (const auto& input: sampleInputs) { + for (const auto& target: sampleTargets) { + IValue a(input); + IValue b(target); + EXPECT_IVALUE_EQ(a, input); + EXPECT_IVALUE_EQ(b, target); + a.swap(b); + EXPECT_IVALUE_EQ(a, target); + EXPECT_IVALUE_EQ(b, input); + } + } +} + +TEST(IValueTest, CopyConstruct) { + auto sampleInputs = makeSampleIValues(); + for (const IValue& v: sampleInputs) { + IValue copy(v); + EXPECT_IVALUE_EQ(copy, v); + } +} + +TEST(IValueTest, MoveConstruct) { + auto sampleInputs = makeSampleIValues(); + for (const IValue& v: sampleInputs) { + IValue source(v); + IValue target(std::move(source)); + EXPECT_IVALUE_EQ(target, v); + EXPECT_TRUE(source.isNone()); + } +} + +TEST(IValueTest, CopyAssign) { + auto sampleInputs = makeSampleIValues(); + auto sampleTargets = makeMoreSampleIValues(); + + for (const IValue& input: sampleInputs) { + for (const IValue& target: sampleTargets) { + IValue copyTo(target); + IValue copyFrom(input); + copyTo = copyFrom; + EXPECT_IVALUE_EQ(copyTo, input); + EXPECT_IVALUE_EQ(copyFrom, input); + EXPECT_IVALUE_EQ(copyTo, copyFrom); + } + } +} + +TEST(IValueTest, MoveAssign) { + auto sampleInputs = makeSampleIValues(); + auto sampleTargets = makeMoreSampleIValues(); + + for (const IValue& input: sampleInputs) { + for (const IValue& target: sampleTargets) { + IValue moveTo(target); + IValue moveFrom(input); + moveTo = std::move(moveFrom); + EXPECT_IVALUE_EQ(moveTo, input); + EXPECT_TRUE(moveFrom.isNone()); + } + } +} + TEST(IValueTest, Tuple) { std::tuple t = std::make_tuple(123, at::randn({1})); auto iv = IValue(t); @@ -318,5 +403,137 @@ TEST(IValueTest, EnumEquality) { ); } +TEST(IValueTest, isPtrType) { + IValue tensor(at::rand({3, 4})); + IValue undefinedTensor((at::Tensor())); + IValue integer(42); + IValue str("hello"); + + EXPECT_TRUE(tensor.isPtrType()); + EXPECT_FALSE(undefinedTensor.isPtrType()); + EXPECT_FALSE(integer.isPtrType()); + EXPECT_TRUE(str.isPtrType()); +} + +TEST(IValueTest, isAliasOf) { + auto sampleIValues = makeSampleIValues(); + for (auto& iv: sampleIValues) { + for (auto& iv2: sampleIValues) { + if (&iv == &iv2 && iv.isPtrType()) { + EXPECT_TRUE(iv.isAliasOf(iv2)); + } else { + EXPECT_FALSE(iv.isAliasOf(iv2)); + } + } + } +} + +TEST(IValueTest, internalToPointer) { + IValue tensor(at::rand({3, 4})); + IValue str("hello"); + + EXPECT_EQ(tensor.internalToPointer(), tensor.unsafeToTensorImpl()); + EXPECT_NE(str.internalToPointer(), nullptr); + + IValue nullStr((c10::intrusive_ptr())); + ASSERT_TRUE(nullStr.isString()); + EXPECT_EQ(nullStr.internalToPointer(), nullptr); +} + +TEST(IValueTest, IdentityComparisonAndHashing) { + at::Tensor t1 = at::rand({3, 4}); + at::Tensor t2 = at::rand({3, 4}); + IValue tv1(t1), tv2(t2); + IValue tv1b(t1); + + EXPECT_EQ(tv1.hash(), tv1b.hash()); + EXPECT_NE(tv1.hash(), tv2.hash()); + + EXPECT_TRUE(tv1.is(tv1)); + EXPECT_TRUE(tv1.is(tv1b)); + EXPECT_TRUE(tv1b.is(tv1)); + EXPECT_TRUE(tv2.is(tv2)); + + EXPECT_FALSE(tv1.is(tv2)); + EXPECT_FALSE(tv2.is(tv1)); + + IValue none; + IValue undefinedTensor((at::Tensor())); + + EXPECT_TRUE(none.is(undefinedTensor)); + EXPECT_TRUE(undefinedTensor.is(none)); + + // Is this a bug? 
We should probably have a is b => a.hash() == b.hash() + EXPECT_NE(none.hash(), undefinedTensor.hash()); + + auto sampleIValues = makeSampleIValues(); + auto sampleIValues2 = makeSampleIValues(); + auto moreSampleIValues = makeMoreSampleIValues(); + + ASSERT_EQ(sampleIValues.size(), moreSampleIValues.size()); + for (int ii = 0; ii < sampleIValues.size(); ++ii) { + // Constant strings will have the same pointer value. + if (sampleIValues[ii].isPtrType() && !sampleIValues[ii].isString()) { + EXPECT_NE(sampleIValues[ii].hash(), sampleIValues2[ii].hash()); + } else { + EXPECT_EQ(sampleIValues[ii].hash(), sampleIValues2[ii].hash()); + } + EXPECT_NE(sampleIValues[ii].hash(), moreSampleIValues[ii].hash()); + } +} + +TEST(IValueTest, getSubValues) { + // Scalars have no subvalues. + IValue integer(42), float_(1.5); + + IValue::HashAliasedIValues subvalues; + + integer.getSubValues(subvalues); + EXPECT_TRUE(subvalues.empty()); + + subvalues.clear(); + + float_.getSubValues(subvalues); + EXPECT_TRUE(subvalues.empty()); + + subvalues.clear(); + + at::Tensor t1(at::rand({3, 4})), t2(at::rand({3, 4})); + IValue tv1(t1), tv2(t2); + IValue list(std::vector{t1, t2}); + IValue tuple(ivalue::Tuple::create({tv1, tv2})); + + std::unordered_map m; + m[1] = t1; + m[2] = t2; + + IValue dict(std::move(m)); + + auto objType = ClassType::create(nullopt, {}); + objType->addAttribute("t1", tv1.type()); + objType->addAttribute("t2", tv2.type()); + + auto o = ivalue::Object::create(StrongTypePtr(nullptr, objType), 2); + o->setSlot(0, tv1); + o->setSlot(1, tv2); + + IValue object(o); + tv1.getSubValues(subvalues); + EXPECT_EQ(subvalues.size(), 1); + EXPECT_EQ(subvalues.count(tv1), 1); + + subvalues.clear(); + + for (auto& container: {list, tuple, dict, object}) { + container.getSubValues(subvalues); + EXPECT_EQ(subvalues.size(), 3); + EXPECT_EQ(subvalues.count(container), 1); + EXPECT_EQ(subvalues.count(tv1), 1); + EXPECT_EQ(subvalues.count(tv2), 1); + + subvalues.clear(); + } +} + // TODO(gmagogsfm): Add type conversion test? } // namespace c10 diff --git a/aten/src/TH/CMakeLists.txt b/aten/src/TH/CMakeLists.txt index a3ed10126b93..5661a697da38 100644 --- a/aten/src/TH/CMakeLists.txt +++ b/aten/src/TH/CMakeLists.txt @@ -79,7 +79,6 @@ install(FILES THHalf.h THTensor.hpp THStorageFunctions.hpp - THGenerator.hpp DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/TH") install(FILES diff --git a/aten/src/TH/THGenerator.hpp b/aten/src/TH/THGenerator.hpp deleted file mode 100644 index 1a40611f8b5b..000000000000 --- a/aten/src/TH/THGenerator.hpp +++ /dev/null @@ -1,39 +0,0 @@ -#pragma once - -#include - -/** - * THGeneratorState is a POD class needed for memcpys - * in torch.get_rng_state() and torch.set_rng_state(). - * It is a legacy class and even though it is replaced with - * at::CPUGeneratorImpl, we need this class and some of its fields - * to support backward compatibility on loading checkpoints. - */ -struct THGeneratorState { - /* The initial seed. */ - uint64_t the_initial_seed; - int left; /* = 1; */ - int seeded; /* = 0; */ - uint64_t next; - uint64_t state[at::MERSENNE_STATE_N]; /* the array for the state vector */ - - /********************************/ - - /* For normal distribution */ - double normal_x; - double normal_y; - double normal_rho; - int normal_is_valid; /* = 0; */ -}; - -/** - * THGeneratorStateNew is a POD class containing - * new data introduced in at::CPUGeneratorImpl and the legacy state. It is used - * as a helper for torch.get_rng_state() and torch.set_rng_state() - * functions. 
- */ -struct THGeneratorStateNew { - THGeneratorState legacy_pod; - float next_float_normal_sample; - bool is_next_float_normal_sample_valid; -}; diff --git a/aten/src/TH/generic/THTensorRandom.cpp b/aten/src/TH/generic/THTensorRandom.cpp index 399bcc38e1de..c37b0b9bb7f0 100644 --- a/aten/src/TH/generic/THTensorRandom.cpp +++ b/aten/src/TH/generic/THTensorRandom.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) @@ -149,119 +148,4 @@ void THTensor_(multinomialAliasDraw)(THLongTensor *self, THTensor *q, THLongTens } } #endif - -#if defined(TH_REAL_IS_BYTE) -void THTensor_(getRNGState)(at::Generator _generator, THTensor *self) -{ - // See Note [Acquire lock when using random generators] - std::lock_guard lock(_generator.mutex()); - static const size_t size = sizeof(THGeneratorStateNew); - THTensor_(resize1d)(self, size); - THArgCheck(THTensor_(nElement)(self) == size, 1, "RNG state is wrong size"); - THArgCheck(THTensor_(isContiguous)(self), 1, "RNG state needs to be contiguous"); - static_assert(std::is_pod::value, "THGeneratorStateNew is not a PODType"); - - // cast byte tensor to POD type - THGeneratorStateNew* rng_state = (THGeneratorStateNew*)self->data(); - - // accumulate generator data to be copied into byte tensor - auto accum_state = std::make_unique(); - auto cast_generator = at::check_generator(_generator); - auto rng_data = cast_generator->engine().data(); - accum_state->legacy_pod.the_initial_seed = rng_data.seed_; - accum_state->legacy_pod.left = rng_data.left_; - accum_state->legacy_pod.seeded = rng_data.seeded_; - accum_state->legacy_pod.next = rng_data.next_; - std::copy(rng_data.state_.begin(), rng_data.state_.end(), std::begin(accum_state->legacy_pod.state)); - accum_state->legacy_pod.normal_x = 0.0; // we don't use it anymore and this is just a dummy - accum_state->legacy_pod.normal_rho = 0.0; // we don't use it anymore and this is just a dummy - accum_state->legacy_pod.normal_is_valid = false; - accum_state->legacy_pod.normal_y = 0.0; - accum_state->next_float_normal_sample = 0.0f; - accum_state->is_next_float_normal_sample_valid = false; - if(cast_generator->next_double_normal_sample()) { - accum_state->legacy_pod.normal_is_valid = true; - accum_state->legacy_pod.normal_y = *(cast_generator->next_double_normal_sample()); - } - if(cast_generator->next_float_normal_sample()) { - accum_state->is_next_float_normal_sample_valid = true; - accum_state->next_float_normal_sample = *(cast_generator->next_float_normal_sample()); - } - - memcpy(rng_state, accum_state.get(), size); -} - -void THTensor_(setRNGState)(at::Generator _generator, THTensor *self) -{ - // See Note [Acquire lock when using random generators] - std::lock_guard lock(_generator.mutex()); - auto cast_generator = at::check_generator(_generator); - THArgCheck(THTensor_(isContiguous)(self), 1, "RNG state needs to be contiguous"); - static_assert(std::is_pod::value, "THGeneratorState is not a PODType"); - static_assert(std::is_pod::value, "THGeneratorStateNew is not a PODType"); - - static const size_t size_legacy = sizeof(THGeneratorState); - static const size_t size_current = sizeof(THGeneratorStateNew); - static_assert(size_legacy != size_current, "Legacy THGeneratorState and THGeneratorStateNew can't be of the same size"); - - at::mt19937 engine; - auto float_normal_sample = c10::optional(); - auto double_normal_sample = c10::optional(); - - // Construct the state of at::CPUGeneratorImpl based on input byte tensor size. 
- THGeneratorState* legacy_pod; - if (THTensor_(nElement)(self) == size_legacy) { - legacy_pod = (THGeneratorState*)self->data(); - // Note that in legacy THGeneratorState, we didn't have float version - // of normal sample and hence we leave the c10::optional as is - - // Update next_double_normal_sample. - // Note that legacy THGeneratorState stores two uniform values (normal_x, normal_y) - // and a rho value (normal_rho). These three values were redundant and in the new - // DistributionsHelper.h, we store the actual extra normal sample, rather than three - // intermediate values. - if (legacy_pod->normal_is_valid) { - auto r = legacy_pod->normal_rho; - auto theta = 2.0 * M_PI * legacy_pod->normal_x; - // we return the sin version of the normal sample when in caching mode - double_normal_sample = c10::optional(r * ::sin(theta)); - } - } else if (THTensor_(nElement)(self) == size_current) { - auto rng_state = (THGeneratorStateNew*)self->data(); - legacy_pod = &rng_state->legacy_pod; - // update next_float_normal_sample - if (rng_state->is_next_float_normal_sample_valid) { - float_normal_sample = c10::optional(rng_state->next_float_normal_sample); - } - - // Update next_double_normal_sample. - // Note that in getRNGState, we now return the actual normal sample in normal_y - // and if it's valid in normal_is_valid. The redundant normal_x and normal_rho - // are squashed to 0.0. - if (legacy_pod->normal_is_valid) { - double_normal_sample = c10::optional(legacy_pod->normal_y); - } - } else { - AT_ERROR("Expected either a THGeneratorState of size ", size_legacy, - " or a THGeneratorStateNew of size ", size_current, - " but found the input RNG state size to be ", THTensor_(nElement)(self)); - } - - // construct engine_ - // Note that legacy THGeneratorState stored a state array of 64 bit uints, whereas in our - // redefined mt19937, we have changed to a state array of 32 bit uints. Hence, we are - // doing a std::copy. 
- at::mt19937_data_pod rng_data; - std::copy(std::begin(legacy_pod->state), std::end(legacy_pod->state), rng_data.state_.begin()); - rng_data.seed_ = legacy_pod->the_initial_seed; - rng_data.left_ = legacy_pod->left; - rng_data.seeded_ = legacy_pod->seeded; - rng_data.next_ = static_cast(legacy_pod->next); - engine.set_data(rng_data); - THArgCheck(engine.is_valid(), 1, "Invalid mt19937 state"); - cast_generator->set_engine(engine); - cast_generator->set_next_float_normal_sample(float_normal_sample); - cast_generator->set_next_double_normal_sample(double_normal_sample); -} -#endif #endif diff --git a/aten/src/TH/generic/THTensorRandom.h b/aten/src/TH/generic/THTensorRandom.h index ffc52bc69390..ddeb905680cd 100644 --- a/aten/src/TH/generic/THTensorRandom.h +++ b/aten/src/TH/generic/THTensorRandom.h @@ -9,9 +9,4 @@ TH_API void THTensor_(multinomialAliasSetup)(THTensor *prob_dist, THLongTensor * TH_API void THTensor_(multinomialAliasDraw)(THLongTensor *self, THTensor *q, THLongTensor *J, int n_sample, c10::optional _generator); #endif -#if defined(TH_REAL_IS_BYTE) -TH_API void THTensor_(getRNGState)(at::Generator _generator, THTensor *self); -TH_API void THTensor_(setRNGState)(at::Generator _generator, THTensor *self); -#endif - #endif diff --git a/aten/src/THC/THCTensorRandom.cu b/aten/src/THC/THCTensorRandom.cu index aefb427f4e67..8655ea2fb829 100644 --- a/aten/src/THC/THCTensorRandom.cu +++ b/aten/src/THC/THCTensorRandom.cu @@ -12,60 +12,6 @@ #define MAX_NUM_BLOCKS 200 #define BLOCK_SIZE 256 -// NB: ROCm compiler seems to have a bug where __host__ functions must be -// explicitly specified extern "C" otherwise ROCm compiler doesn't respect it. -// See https://github.com/RadeonOpenCompute/hcc/issues/839 -__host__ void THCRandom_getRNGState(at::Generator gen_, THByteTensor *rng_state) -{ - auto gen = at::check_generator(gen_); - std::lock_guard lock(gen->mutex_); - // The RNG state comprises the seed, and an offset used for Philox. - // The following line is just here for BC reason. sizeof curandStateMtgp32 is 4120. - // It used to be static const size_t states_size = MAX_NUM_BLOCKS * sizeof(curandStateMtgp32); - // MAX_NUM_BLOCKS was 200 and sizeof(curandStateMtgp32) is 4120. Hardcoding these numbers here - // because this is just host side code and we don't want to worry about linking with cuda - static const size_t states_size = 200 * sizeof(4120); - static const size_t seed_size = sizeof(uint64_t); - static const size_t offset_size = sizeof(int64_t); - static const size_t total_size = states_size + seed_size + offset_size; - THByteTensor_resize1d(rng_state, total_size); - THArgCheck(THByteTensor_nElement(rng_state) == total_size, 1, "RNG state is wrong size"); - THArgCheck(THByteTensor_isContiguous(rng_state), 1, "RNG state must be contiguous"); - // since curandStateMTGP is not used anymore, fill gen_states of THCGenerator with deterministic garbage value of -1 - // gen_states in THCGenerator struct was an array of curandStateMtgp32s. 
- memset(THByteTensor_data(rng_state), -1, states_size); - auto current_seed = gen->current_seed(); - auto offset = static_cast(gen->philox_offset_per_thread()); // Note that old THCGeneratorState had offset as std::atomic - memcpy(THByteTensor_data(rng_state) + states_size, ¤t_seed, seed_size); - memcpy(THByteTensor_data(rng_state) + states_size + seed_size, &offset, offset_size); -} - -__host__ void THCRandom_setRNGState(at::Generator gen_, THByteTensor *rng_state) -{ - auto gen = at::check_generator(gen_); - std::lock_guard lock(gen->mutex_); - static const size_t states_size = 200 * sizeof(4120); // this line is just here for BC reason - static const size_t seed_size = sizeof(uint64_t); - static const size_t offset_size = sizeof(int64_t); - static const size_t total_size = states_size + seed_size + offset_size; - bool no_philox_seed = false; - if (THByteTensor_nElement(rng_state) == total_size - offset_size) { - no_philox_seed = true; - } - else { - THArgCheck(THByteTensor_nElement(rng_state) == total_size, 1, "RNG state is wrong size"); - } - THArgCheck(THByteTensor_isContiguous(rng_state), 1, "RNG state must be contiguous"); - uint64_t input_seed; - memcpy(&input_seed, THByteTensor_data(rng_state) + states_size, seed_size); - gen->set_current_seed(input_seed); - int64_t philox_offset = 0; - if (!no_philox_seed) { - memcpy(&philox_offset, THByteTensor_data(rng_state) + states_size + seed_size, offset_size); - } - gen->set_philox_offset_per_thread(static_cast(philox_offset)); -} - #include #include diff --git a/aten/src/THC/THCTensorRandom.h b/aten/src/THC/THCTensorRandom.h index b1d7f1ef1797..696e36f70bec 100644 --- a/aten/src/THC/THCTensorRandom.h +++ b/aten/src/THC/THCTensorRandom.h @@ -9,9 +9,4 @@ #include #include -#include - -TORCH_CUDA_API void THCRandom_getRNGState(at::Generator gen_, THByteTensor *rng_state); -TORCH_CUDA_API void THCRandom_setRNGState(at::Generator gen_, THByteTensor *rng_state); - #endif diff --git a/benchmarks/functional_autograd_benchmark/ppl_models.py b/benchmarks/functional_autograd_benchmark/ppl_models.py index 906ebac5d41b..94ba6698a91d 100644 --- a/benchmarks/functional_autograd_benchmark/ppl_models.py +++ b/benchmarks/functional_autograd_benchmark/ppl_models.py @@ -24,8 +24,9 @@ def forward(beta_value: Tensor) -> Tensor: mu = X.mm(beta_value) # We need to compute the first and second gradient of this score with respect - # to beta_value. - score = dist.Bernoulli(logits=mu).log_prob(Y).sum() + beta_prior.log_prob(beta_value).sum() + # to beta_value. We disable Bernoulli validation because Y is a relaxed value. 
+ score = (dist.Bernoulli(logits=mu, validate_args=False).log_prob(Y).sum() + + beta_prior.log_prob(beta_value).sum()) return score return forward, (beta_value.to(device),) @@ -40,7 +41,7 @@ def get_robust_regression(device: torch.device) -> GetterReturnType: Y = torch.rand(N, 1, device=device) # Predefined nu_alpha and nu_beta, nu_alpha.shape: (1, 1), nu_beta.shape: (1, 1) - nu_alpha = torch.randn(1, 1, device=device) + nu_alpha = torch.rand(1, 1, device=device) nu_beta = torch.rand(1, 1, device=device) nu = dist.Gamma(nu_alpha, nu_beta) diff --git a/c10/core/GeneratorImpl.h b/c10/core/GeneratorImpl.h index 3af652a1a3b2..84e620e93a72 100644 --- a/c10/core/GeneratorImpl.h +++ b/c10/core/GeneratorImpl.h @@ -13,6 +13,7 @@ #include #include #include +#include /** * Note [Generator] @@ -71,6 +72,8 @@ struct C10_API GeneratorImpl : public c10::intrusive_ptr_target { virtual void set_current_seed(uint64_t seed) = 0; virtual uint64_t current_seed() const = 0; virtual uint64_t seed() = 0; + virtual void set_state(const c10::TensorImpl& new_state) = 0; + virtual c10::intrusive_ptr get_state() const = 0; Device device() const; // See Note [Acquire lock when using random generators] diff --git a/c10/core/impl/LocalDispatchKeySet.cpp b/c10/core/impl/LocalDispatchKeySet.cpp index 358e6ef7e1f7..ff3e454eda8a 100644 --- a/c10/core/impl/LocalDispatchKeySet.cpp +++ b/c10/core/impl/LocalDispatchKeySet.cpp @@ -5,10 +5,6 @@ namespace c10 { namespace impl { -C10_DEFINE_bool(disable_variable_dispatch, false, "This flag forcibly disables the Variable code paths from executing, which currently breaks profiling in the process."); - -namespace { - /// In the CAFFE2_FB_LIMITED_MOBILE_CAPABILITY build setting, /// thread_local is not supported. #ifndef CAFFE2_FB_LIMITED_MOBILE_CAPABILITY @@ -18,25 +14,15 @@ thread_local PODLocalDispatchKeySet raw_local_dispatch_key_set; #else // defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY) -static PODLocalDispatchKeySet raw_local_dispatch_key_set; +PODLocalDispatchKeySet raw_local_dispatch_key_set; #endif -} // anonymous namespace - +#ifdef _MSC_VER LocalDispatchKeySet tls_local_dispatch_key_set() { - // Hack until variable performance is fixed - // - // ezyang: I'm pretty unhappy about this implementation, it looks wrong - // to me, as it seems to be performing a mutation on - // raw_local_dispatch_key_set. I can't conveniently test the correct - // version though... - if (FLAGS_disable_variable_dispatch) { - raw_local_dispatch_key_set.set_excluded( - raw_local_dispatch_key_set.excluded() | autograd_dispatch_keyset); - } return raw_local_dispatch_key_set; } +#endif // _MSC_VER void _force_tls_local_dispatch_key_set(LocalDispatchKeySet key_set) { raw_local_dispatch_key_set = PODLocalDispatchKeySet { diff --git a/c10/core/impl/LocalDispatchKeySet.h b/c10/core/impl/LocalDispatchKeySet.h index 5262b1d4d6c0..313dc5ca3508 100644 --- a/c10/core/impl/LocalDispatchKeySet.h +++ b/c10/core/impl/LocalDispatchKeySet.h @@ -23,8 +23,6 @@ namespace c10 { namespace impl { -C10_DECLARE_bool(disable_variable_dispatch); - // POD version of LocalDispatchKeySet. Declared here just so that // we can put it in the guards. struct C10_API PODLocalDispatchKeySet { @@ -54,7 +52,24 @@ struct C10_API LocalDispatchKeySet { DispatchKeySet excluded_; }; +// thread_local variables cannot be C10_API on Windows. +#ifdef _MSC_VER C10_API LocalDispatchKeySet tls_local_dispatch_key_set(); +#else // _MSC_VER +/// In the CAFFE2_FB_LIMITED_MOBILE_CAPABILITY build setting, +/// thread_local is not supported. 
+#ifndef CAFFE2_FB_LIMITED_MOBILE_CAPABILITY + extern C10_API thread_local PODLocalDispatchKeySet raw_local_dispatch_key_set; +#else // defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY) + extern C10_API PODLocalDispatchKeySet raw_local_dispatch_key_set; +#endif + +inline C10_API LocalDispatchKeySet tls_local_dispatch_key_set() { + // Don't let people fiddle with the thread_local directly just + // because they include this header. + return raw_local_dispatch_key_set; +} +#endif // _MSC_VER // Internal, use ThreadLocalStateGuard C10_API void _force_tls_local_dispatch_key_set(LocalDispatchKeySet key_set); diff --git a/c10/util/intrusive_ptr.h b/c10/util/intrusive_ptr.h index 637db95991f2..790d97ee3994 100644 --- a/c10/util/intrusive_ptr.h +++ b/c10/util/intrusive_ptr.h @@ -206,7 +206,7 @@ class intrusive_ptr final { "NullType must have a constexpr singleton() method"); #endif static_assert( - std::is_same::value, + std::is_base_of::type>::value, "NullType::singleton() must return a element_type* pointer"); TTarget* target_; @@ -509,7 +509,7 @@ class weak_intrusive_ptr final { "NullType must have a constexpr singleton() method"); #endif static_assert( - std::is_same::value, + std::is_base_of::type>::value, "NullType::singleton() must return a element_type* pointer"); TTarget* target_; diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 4fcf86be55e2..9b934e4831e8 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -340,9 +340,6 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) set(GENERATED_CXX_TORCH "${TORCH_SRC_DIR}/csrc/autograd/generated/Functions.cpp" - "${TORCH_SRC_DIR}/csrc/jit/generated/generated_unboxing_wrappers_0.cpp" - "${TORCH_SRC_DIR}/csrc/jit/generated/generated_unboxing_wrappers_1.cpp" - "${TORCH_SRC_DIR}/csrc/jit/generated/generated_unboxing_wrappers_2.cpp" ) if(NOT INTERN_DISABLE_AUTOGRAD) @@ -434,8 +431,6 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) "${TOOLS_PATH}/autograd/load_derivatives.py" "${TOOLS_PATH}/autograd/nested_dict.py" "${TOOLS_PATH}/autograd/utils.py" - "${TOOLS_PATH}/jit/gen_unboxing_wrappers.py" - "${TOOLS_PATH}/jit/templates/generated_unboxing_wrappers.cpp" WORKING_DIRECTORY "${TORCH_ROOT}") @@ -479,6 +474,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) # This one needs to be unconditionally added as Functions.cpp is also unconditionally added list(APPEND TORCH_SRCS ${TORCH_SRC_DIR}/csrc/autograd/FunctionsManual.cpp + ${TORCH_SRC_DIR}/csrc/utils/out_types.cpp ) if(NOT INTERN_DISABLE_AUTOGRAD) diff --git a/caffe2/contrib/aten/README.md b/caffe2/contrib/aten/README.md index 377a1f780271..593079ef1393 100644 --- a/caffe2/contrib/aten/README.md +++ b/caffe2/contrib/aten/README.md @@ -1,6 +1,6 @@ # An ATen operator for Caffe2 -[ATen](https://github.com/zdevito/aten) is a simple tensor library thats exposes the Tensor operations in Torch +ATen is a simple tensor library thats exposes the Tensor operations in Torch and PyTorch directly in C++14. This library provides a generated wrapper around the ATen API that makes these functions available in Caffe2 as an operator. It also makes it accessible using the ToffeeIR. @@ -8,8 +8,8 @@ ToffeeIR. ### Example Usage in Caffe2 -First identify a function in ATen you want to call in [Functions.h](https://github.com/zdevito/ATen/blob/master/doc/Functions.h), -[Tensor.h](https://github.com/zdevito/ATen/blob/master/doc/Tensor.h), or [Type.h](https://github.com/zdevito/ATen/blob/master/doc/Type.h). 
+First identify a function in ATen you want to call in Functions.h, +Tensor.h, or Type.h. We will call the `pow` operator: diff --git a/caffe2/contrib/aten/docs/pytorch_to_caffe2.md b/caffe2/contrib/aten/docs/pytorch_to_caffe2.md index 85c275bb5178..c3f615ee37b9 100644 --- a/caffe2/contrib/aten/docs/pytorch_to_caffe2.md +++ b/caffe2/contrib/aten/docs/pytorch_to_caffe2.md @@ -6,7 +6,7 @@ operators that haven't been standardized yet, or custom `torch.autograd.Function are specific to a network. To bridge this gap, we provide an experimental operator in ONNX that allows you to directly access PyTorch's tensor functions using the ATen library. -[ATen](https://github.com/zdevito/aten) is the underlying C++ library that PyTorch uses to do tensor operations. Caffe2 has an [ATen operator](https://github.com/caffe2/caffe2/tree/master/caffe2/contrib/aten) +[ATen](https://github.com/pytorch/pytorch/tree/master/aten) is the underlying C++ library that PyTorch uses to do tensor operations. Caffe2 has an [ATen operator](https://github.com/pytorch/pytorch/tree/master/caffe2/contrib/aten) that can run these tensor functions in a Caffe2 network after importing them through ONNX. This guide explains how to configure Caffe2 and modify your PyTorch program to use @@ -61,8 +61,8 @@ We can add a `symbolic` method to it like so: The function `graph.at` adds a new ATen op the computation graph. You can call any ATen function using this facility. To do so, -first identify a function in ATen you want to call in [Functions.h](https://github.com/zdevito/ATen/blob/master/doc/Functions.h), -[Tensor.h](https://github.com/zdevito/ATen/blob/master/doc/Tensor.h), or [Type.h](https://github.com/zdevito/ATen/blob/master/doc/Type.h). +first identify a function in ATen you want to call in Functions.h, +Tensor.h, or Type.h. As an example, we might want to call the `pow` operator: @@ -86,9 +86,9 @@ To call methods of ATen's `Type` objects, you provide an additional string attri that determines the type. For instance, `ones` creates a new constant tensor of all ones: ``` class Type { - ... - virtual Tensor ones(IntArrayRef size) const; - ... + ... + virtual Tensor ones(IntArrayRef size) const; + ... }; ``` diff --git a/caffe2/contrib/aten/gen_op.py b/caffe2/contrib/aten/gen_op.py index 769f9d59c856..64d3de547bb7 100755 --- a/caffe2/contrib/aten/gen_op.py +++ b/caffe2/contrib/aten/gen_op.py @@ -285,8 +285,7 @@ def emit_assignments(o, env): real_inputs = 0 for i, arg in enumerate(o['arguments']): env['arguments'].append(arg['name']) - # Emulate logic in gen_unboxing_wrappers.py. Pretend the flat argument - # list is a stack where the end is the top. + # Pretend the flat argument list is a stack where the end is the top. view_length = 'InputSize()' if has_tensorlist and i < tensorlist_idx else static_tensor_inputs if arg['type'] == 'TensorList': # NOTE: do not advance real_inputs here. 
After this we will diff --git a/caffe2/contrib/gloo/gloo_test.py b/caffe2/contrib/gloo/gloo_test.py index fbca9b8fe64c..5ae066f5e3ca 100644 --- a/caffe2/contrib/gloo/gloo_test.py +++ b/caffe2/contrib/gloo/gloo_test.py @@ -27,7 +27,6 @@ op_engine = 'GLOO' - class TemporaryDirectory: def __enter__(self): self.tmpdir = tempfile.mkdtemp() diff --git a/caffe2/opt/fakefp16_transform.cc b/caffe2/opt/fakefp16_transform.cc index 424056bd2c80..cbd3132dfc08 100644 --- a/caffe2/opt/fakefp16_transform.cc +++ b/caffe2/opt/fakefp16_transform.cc @@ -299,8 +299,8 @@ void fakeFp16Transform(NetDef* net) { FLAGS_fake_fp16_conversion_use_fp16_acc, FLAGS_fake_fp16_conversion_use_nnpi); - auto blacklist_pos = glow::ParseNetPositionList(FLAGS_onnxifi_blacklist); - auto blacklist_type = glow::ParseBlackListOps(FLAGS_onnxifi_blacklist_ops); + auto blocklist_pos = glow::ParseNetPositionList(FLAGS_onnxifi_blacklist); + auto blocklist_type = glow::ParseBlackListOps(FLAGS_onnxifi_blacklist_ops); // A hack to only do fakefp16 transformation for operators which will be // lowered to ONNXIFI. @@ -320,7 +320,7 @@ void fakeFp16Transform(NetDef* net) { auto* op = net->mutable_op(i); auto net_pos = ArgumentHelper::GetSingleArgument(*op, "net_pos", -1); - if (blacklist_pos.count(net_pos) || blacklist_type.count(op->type())) { + if (blocklist_pos.count(net_pos) || blocklist_type.count(op->type())) { continue; } auto it = kFakeFp16OpConversionMap.find(op->type()); diff --git a/caffe2/opt/glow_net_transform.cc b/caffe2/opt/glow_net_transform.cc index ee3ce1b27e2c..45ce9a487fbb 100644 --- a/caffe2/opt/glow_net_transform.cc +++ b/caffe2/opt/glow_net_transform.cc @@ -107,7 +107,7 @@ void onnxifi( const std::vector& input_names, const std::vector& output_names, const std::vector& weight_names, - const std::unordered_set& blacklist, + const std::unordered_set& blocklist, const ShapeInfoMap& shape_hints_max_bs, bool use_onnx, size_t max_batch_size, @@ -154,19 +154,19 @@ void onnxifi( // Before applying backlist, make sure the ops in the net all have an net_pos; caffe2::BackendTransformerBase::annotateOpIndex(net); - // Parse the blacklist - auto more_blacklist = ParseNetPositionList(FLAGS_onnxifi_blacklist); - for (const auto& b : blacklist) { - more_blacklist.emplace(b); + // Parse the blocklist + auto more_blocklist = ParseNetPositionList(FLAGS_onnxifi_blacklist); + for (const auto& b : blocklist) { + more_blocklist.emplace(b); } // ONNX mode will change the op order so it doesn't apply here if (!opts.use_onnx) { - auto blacklisted_ops = ParseBlackListOps(FLAGS_onnxifi_blacklist_ops); + auto blocklisted_ops = ParseBlackListOps(FLAGS_onnxifi_blacklist_ops); for (const auto& op : net->op()) { - if (blacklisted_ops.count(op.type())) { + if (blocklisted_ops.count(op.type())) { ArgumentHelper helper(op); - more_blacklist.emplace(helper.GetSingleArgument(op, kNetPos, -1)); + more_blocklist.emplace(helper.GetSingleArgument(op, kNetPos, -1)); } } } @@ -179,7 +179,7 @@ void onnxifi( // 1. for specified op, we find its input and outputs. // 2. for each input and output, we create a new copy op and attach it as an // input to the copy. - // 3. we blacklist these new copy operators from onnxification. This forces + // 3. we blocklist these new copy operators from onnxification. This forces // these intermediate tensors to also become outputs of the onnxifi op. // 4. we put the right arguments on the copy ops so TensorObserver can print // out the values. 
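The numbered comment that closes the hunk above describes the observer trick in caffe2/opt/glow_net_transform.cc: for every op whose tensors we want to inspect, inject a `Copy` op, tag it with a `net_pos`, and add that position to the blocklist so the copy is never lowered through ONNXIFI and the intermediate tensor stays visible. Below is a rough Python sketch of the same idea using `caffe2.python.core`; the blob list, the starting position counter, and the `_copy_output` suffix are illustrative assumptions, not values taken from this patch.

```python
# Sketch of the copy-op / blocklist trick described in the comment above,
# expressed with the caffe2 Python helpers rather than the C++ NetDef surgery
# in this hunk. `observed_blobs`, `next_net_pos` and the output suffix are
# hypothetical; only the argument names mirror the patch (net_pos,
# observe_input_tensors).
from caffe2.python import core


def add_observer_copies(net_def, observed_blobs, next_net_pos, blocklist):
    """Append one Copy op per observed blob and keep it out of onnxification."""
    for blob in observed_blobs:
        pos = next_net_pos
        next_net_pos += 1
        copy_op = core.CreateOperator(
            "Copy",
            [blob],
            [blob + "_copy_output"],
            net_pos=pos,              # same bookkeeping as AddArgument(kNetPos, ...)
            observe_input_tensors=1,  # lets TensorObserver print the values
        )
        net_def.op.extend([copy_op])
        # Blocklisting the copy forces the observed tensor to remain an output
        # of the onnxifi op instead of being fused into the backend subgraph.
        blocklist.add(pos)
    return next_net_pos
```

In the C++ path these positions are exactly what `more_blocklist` accumulates before it is handed to `ts.transform(...)` in the next hunk.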
@@ -213,11 +213,11 @@ void onnxifi( AddArgument(kNetPos, pos, ©_op); AddArgument("observe_input_tensors", 1, ©_op); net->add_op()->CopyFrom(copy_op); - more_blacklist.emplace(pos); + more_blocklist.emplace(pos); } OnnxifiTransformer ts(opts); - ts.transform(ws, net, weight_names, more_shape_hints, more_blacklist); + ts.transform(ws, net, weight_names, more_shape_hints, more_blocklist); // Cleanup the input from the workspace for (const auto& i : input_names) { diff --git a/caffe2/opt/glow_net_transform.h b/caffe2/opt/glow_net_transform.h index e8d1c9b9054f..f6cd975a6e91 100644 --- a/caffe2/opt/glow_net_transform.h +++ b/caffe2/opt/glow_net_transform.h @@ -16,7 +16,7 @@ namespace caffe2 { namespace glow { /// Onnxifi transformation on the net and workspace. We also /// needed the input data/shape to populate the shape. In addition, we take a \p -/// blacklist to control and mask what ops we want to consider in onnxifi +/// blocklist to control and mask what ops we want to consider in onnxifi /// process. We can also set whether to use ONNX proto or C2 proto through /// ONNXIFI interface. void onnxifi( @@ -25,7 +25,7 @@ void onnxifi( const std::vector& input_names, const std::vector& output_names, const std::vector& weight_names, - const std::unordered_set& blacklist, + const std::unordered_set& blocklist, const ShapeInfoMap& shape_hints_max_bs, bool use_onnx, size_t max_batch_size = 0, diff --git a/caffe2/opt/onnxifi_transformer.cc b/caffe2/opt/onnxifi_transformer.cc index 8089314c3100..2dd8c8d2d8b4 100644 --- a/caffe2/opt/onnxifi_transformer.cc +++ b/caffe2/opt/onnxifi_transformer.cc @@ -1195,11 +1195,11 @@ void OnnxifiTransformer::applyFilteringRules( blocklistCpuPartition(net, blocklisted_ops); } -void OnnxifiTransformer::getBackendId() { +std::vector OnnxifiTransformer::getBackendId() { idx_ = 0; if (opts_.use_onnx) { - return; + return backend_ids_; } // Try to find a backend that support Caffe2 proto. Note that this is quite // opportunistic as we don't officially support Caffe2 proto. 
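Both glow hunks above funnel the `--onnxifi_blacklist` flag through `ParseNetPositionList` to build the integer blocklist that `onnxifi()` consumes. The exact flag format accepted by that C++ helper is not shown in this diff, so the small Python sketch below assumes a plain comma-separated list of integer net positions; treat the function name and the format as hypothetical.

```python
# Hypothetical stand-in for the C++ ParseNetPositionList used above: turn a
# flag string of net positions into the set that gets merged into the blocklist.
# The format is assumed to be a simple comma-separated list of integers.
def parse_net_position_list(flag_value):
    """Parse e.g. "3,17,42" into {3, 17, 42}; blank or empty input gives set()."""
    positions = set()
    for token in flag_value.split(","):
        token = token.strip()
        if token:
            positions.add(int(token))
    return positions


# Usage: merge flag-driven positions with a caller-supplied blocklist, the same
# way more_blocklist is assembled from FLAGS_onnxifi_blacklist plus the explicit
# blocklist argument in the hunks above.
more_blocklist = parse_net_position_list("3,17,42") | {7}
```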
@@ -1214,6 +1214,7 @@ void OnnxifiTransformer::getBackendId() { break; } } + return backend_ids_; } NetDef OnnxifiTransformer::TransformViaC2( diff --git a/caffe2/opt/onnxifi_transformer.h b/caffe2/opt/onnxifi_transformer.h index d88eb739750c..d1af1731013d 100644 --- a/caffe2/opt/onnxifi_transformer.h +++ b/caffe2/opt/onnxifi_transformer.h @@ -61,6 +61,17 @@ class TORCH_API OnnxifiTransformer final : public BackendTransformerBase { const ShapeInfoMap& shape_hints, const std::unordered_set& blocklisted_ops) override; + // Query whether an operator is supported by passing C2 protobuf + bool supportOpC2( + const caffe2::OperatorDef& op, + const ShapeInfoMap& shape_hints, + const std::unordered_set& weights, + const std::unordered_set& blocklisted_ops, + onnxBackendID backend_id) const; + + // Determine backend id + std::vector getBackendId(); + private: // Since we create new tensors during the conversion process, we actually need // into inject them into the original workspace @@ -114,14 +125,6 @@ class TORCH_API OnnxifiTransformer final : public BackendTransformerBase { ShapeInfoMap* shape_hints_max_bs, const std::unordered_map &shape_hints_per_bs); - // Query whether an operator is supported by passing C2 protobuf - bool supportOpC2( - const caffe2::OperatorDef& op, - const ShapeInfoMap& shape_hints, - const std::unordered_set& weights, - const std::unordered_set& blocklisted_ops, - onnxBackendID backend_id) const; - // Query whether an operator is supported by passing ONNX protobuf bool supportOpOnnx( const caffe2::OperatorDef& op, @@ -152,9 +155,6 @@ class TORCH_API OnnxifiTransformer final : public BackendTransformerBase { const std::unordered_set& weights, std::unordered_set* blocklisted_ops) const; - // Determine backend id - void getBackendId(); - // Extract partition info from the original net void extractPartitionInfo(const NetDef& net); diff --git a/caffe2/python/_import_c_extension.py b/caffe2/python/_import_c_extension.py index d6754adc20fd..32b9ec34d1f8 100644 --- a/caffe2/python/_import_c_extension.py +++ b/caffe2/python/_import_c_extension.py @@ -5,16 +5,6 @@ import sys from caffe2.python import extension_loader -# NOTE: we have to import python protobuf here **before** we load cpp extension. -# Otherwise it breaks under certain build conditions if cpp implementation of -# protobuf is used. Presumably there's some registry in protobuf library and -# python side has to initialize the dictionary first, before static -# initialization in python extension does so. Otherwise, duplicated protobuf -# descriptors will be created and it can lead to obscure errors like -# "Parameter to MergeFrom() must be instance of same class: -# expected caffe2.NetDef got caffe2.NetDef." -import caffe2.proto - # We will first try to load the gpu-enabled caffe2. If it fails, we will then # attempt to load the cpu version. The cpu backend is the minimum required, so # if that still fails, we will exit loud. 
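The comment left at the end of the `_import_c_extension.py` hunk above summarizes the load order: try the GPU-enabled extension first, fall back to the CPU build, and fail loudly if even that is missing. A minimal sketch of that pattern follows; the pybind11 module names and the bare `logging` setup are assumptions for illustration (the real file presumably also guards these imports via the `extension_loader` it imports at the top), so read this as the shape of the logic rather than the module's actual body.

```python
# GPU-first, CPU-fallback extension loading as described in the comment above.
# Module names are assumptions for illustration only.
import logging

logging.basicConfig()
logger = logging.getLogger(__name__)

has_gpu_support = False
try:
    from caffe2.python.caffe2_pybind11_state_gpu import *  # noqa: F401,F403
    has_gpu_support = True
except ImportError:
    logger.info("GPU-enabled caffe2 is not available; trying the CPU build.")
    try:
        from caffe2.python.caffe2_pybind11_state import *  # noqa: F401,F403
    except ImportError as e:
        # The CPU backend is the minimum requirement, so exit loud here.
        logger.critical("Cannot load the caffe2 CPU extension: %s", e)
        raise
```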
diff --git a/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py b/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py index 1b683be0d51e..b4cb8f2da0b4 100644 --- a/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py +++ b/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py @@ -5,7 +5,7 @@ import hypothesis.strategies as st import numpy as np -from caffe2.python import core, dyndep, workspace +from caffe2.python import core, workspace def benchmark_sparse_lengths_sum( diff --git a/caffe2/python/compatibility.py b/caffe2/python/compatibility.py deleted file mode 100644 index 9d615a308333..000000000000 --- a/caffe2/python/compatibility.py +++ /dev/null @@ -1,8 +0,0 @@ -from six import PY2, PY3 - -if PY2: - import collections - container_abcs = collections -elif PY3: - import collections.abc - container_abcs = collections.abc diff --git a/caffe2/python/convert.py b/caffe2/python/convert.py index 18033661a69e..b4b37811de10 100644 --- a/caffe2/python/convert.py +++ b/caffe2/python/convert.py @@ -5,6 +5,3 @@ -from caffe2.proto import caffe2_pb2, torch_pb2 - -import caffe2.python._import_c_extension as C diff --git a/caffe2/python/convert_test.py b/caffe2/python/convert_test.py index a1dc52aad2d9..d9d82bf5e6c4 100644 --- a/caffe2/python/convert_test.py +++ b/caffe2/python/convert_test.py @@ -3,10 +3,8 @@ -from caffe2.python import convert, workspace -from caffe2.proto import caffe2_pb2, torch_pb2 +from caffe2.python import workspace import unittest -import numpy as np class TestOperator(unittest.TestCase): def setUp(self): diff --git a/caffe2/python/core_gradients_test.py b/caffe2/python/core_gradients_test.py index 3674b7aa4585..293eccca0dd4 100644 --- a/caffe2/python/core_gradients_test.py +++ b/caffe2/python/core_gradients_test.py @@ -3,7 +3,6 @@ -from future.utils import bytes_to_native_str from hypothesis import given, settings import hypothesis.strategies as st import unittest diff --git a/caffe2/python/dataio_test.py b/caffe2/python/dataio_test.py index 0c45fb50aed9..ac1c72284fbf 100644 --- a/caffe2/python/dataio_test.py +++ b/caffe2/python/dataio_test.py @@ -6,7 +6,6 @@ from caffe2.python.dataio import ( CompositeReader, CompositeReaderBuilder, - Reader, ReaderBuilder, ReaderWithDelay, ReaderWithLimit, @@ -29,7 +28,6 @@ import shutil import unittest import tempfile -import time def make_source_dataset(ws, size=100, offset=0, name=None): diff --git a/caffe2/python/ideep/conv_op_test.py b/caffe2/python/ideep/conv_op_test.py index ae4473ea4864..7c5a0026c113 100644 --- a/caffe2/python/ideep/conv_op_test.py +++ b/caffe2/python/ideep/conv_op_test.py @@ -4,7 +4,6 @@ import unittest -import sys import hypothesis.strategies as st from hypothesis import given, settings import numpy as np diff --git a/caffe2/python/ideep/convfusion_op_test.py b/caffe2/python/ideep/convfusion_op_test.py index 18ce574b623b..a0a782ab8a03 100644 --- a/caffe2/python/ideep/convfusion_op_test.py +++ b/caffe2/python/ideep/convfusion_op_test.py @@ -5,8 +5,7 @@ import unittest import hypothesis.strategies as st -from hypothesis import given, settings -import copy +from hypothesis import given import numpy as np import math from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/ideep/dropout_op_test.py b/caffe2/python/ideep/dropout_op_test.py index 33b0a52a7421..5b07333758dd 100644 --- a/caffe2/python/ideep/dropout_op_test.py +++ b/caffe2/python/ideep/dropout_op_test.py @@ -7,8 +7,6 @@ from hypothesis import given import hypothesis.strategies as st import numpy as np - -from 
caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import caffe2.python.ideep_test_util as mu diff --git a/caffe2/python/ideep/order_switch_op_test.py b/caffe2/python/ideep/order_switch_op_test.py index a259e01bab10..39ede0d214fe 100644 --- a/caffe2/python/ideep/order_switch_op_test.py +++ b/caffe2/python/ideep/order_switch_op_test.py @@ -10,7 +10,6 @@ import caffe2.python.ideep_test_util as mu from hypothesis import given, settings -from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/ideep/shape_op_test.py b/caffe2/python/ideep/shape_op_test.py index 47114832f85d..1beb24bc8803 100644 --- a/caffe2/python/ideep/shape_op_test.py +++ b/caffe2/python/ideep/shape_op_test.py @@ -7,7 +7,6 @@ import hypothesis.strategies as st from hypothesis import given, settings import numpy as np -from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import caffe2.python.ideep_test_util as mu diff --git a/caffe2/python/ideep/spatial_bn_op_test.py b/caffe2/python/ideep/spatial_bn_op_test.py index 618a0e7fbfc3..97efafa72057 100644 --- a/caffe2/python/ideep/spatial_bn_op_test.py +++ b/caffe2/python/ideep/spatial_bn_op_test.py @@ -7,9 +7,8 @@ import hypothesis.strategies as st import numpy as np import unittest -from caffe2.python import brew, core, workspace +from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu -from caffe2.python.model_helper import ModelHelper import caffe2.python.ideep_test_util as mu diff --git a/caffe2/python/ideep/test_ideep_net.py b/caffe2/python/ideep/test_ideep_net.py index aa1c5bc260fa..42feeed00122 100644 --- a/caffe2/python/ideep/test_ideep_net.py +++ b/caffe2/python/ideep/test_ideep_net.py @@ -9,7 +9,6 @@ import numpy as np import argparse import time -import os.path def GetArgumentParser(): diff --git a/caffe2/python/ideep/transform_ideep_net.py b/caffe2/python/ideep/transform_ideep_net.py index 962d4051718b..2d0f35a7406f 100644 --- a/caffe2/python/ideep/transform_ideep_net.py +++ b/caffe2/python/ideep/transform_ideep_net.py @@ -6,7 +6,6 @@ import argparse import copy import json -import os.path import numpy as np diff --git a/caffe2/python/ideep/transpose_op_test.py b/caffe2/python/ideep/transpose_op_test.py index 8b324ed964ae..f8b784822a07 100644 --- a/caffe2/python/ideep/transpose_op_test.py +++ b/caffe2/python/ideep/transpose_op_test.py @@ -7,7 +7,6 @@ import hypothesis.strategies as st from hypothesis import given, settings import numpy as np -from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import caffe2.python.ideep_test_util as mu diff --git a/caffe2/python/ideep_test_util.py b/caffe2/python/ideep_test_util.py index 7129ed14ba74..0cc643317c93 100644 --- a/caffe2/python/ideep_test_util.py +++ b/caffe2/python/ideep_test_util.py @@ -14,7 +14,6 @@ import hypothesis.strategies as st from caffe2.proto import caffe2_pb2 -from caffe2.python import workspace from caffe2.python import hypothesis_test_util as hu cpu_do = hu.cpu_do diff --git a/caffe2/python/layer_model_helper.py b/caffe2/python/layer_model_helper.py index 9d825f3827b9..6a5a3c82dd30 100644 --- a/caffe2/python/layer_model_helper.py +++ b/caffe2/python/layer_model_helper.py @@ -17,7 +17,6 @@ from caffe2.python.optimizer import get_param_device, Optimizer from caffe2.python.regularizer import Regularizer, 
RegularizationBy from caffe2.python.layers import layers -from caffe2.proto import caffe2_pb2 from future.utils import viewitems, viewvalues import logging diff --git a/caffe2/python/mkl/mkl_LRN_op_test.py b/caffe2/python/mkl/mkl_LRN_op_test.py index 2b084bea591b..fddb20e6bb14 100644 --- a/caffe2/python/mkl/mkl_LRN_op_test.py +++ b/caffe2/python/mkl/mkl_LRN_op_test.py @@ -5,7 +5,7 @@ import unittest import hypothesis.strategies as st -from hypothesis import given, settings +from hypothesis import given import numpy as np from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/mkl/mkl_LRN_speed_test.py b/caffe2/python/mkl/mkl_LRN_speed_test.py index ae42902d9102..c192137dc28c 100644 --- a/caffe2/python/mkl/mkl_LRN_speed_test.py +++ b/caffe2/python/mkl/mkl_LRN_speed_test.py @@ -6,7 +6,7 @@ import numpy as np from caffe2.proto import caffe2_pb2 -from caffe2.python import cnn, core, workspace, test_util +from caffe2.python import core, workspace, test_util @unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn.") diff --git a/caffe2/python/mkl/mkl_conv_op_test.py b/caffe2/python/mkl/mkl_conv_op_test.py index f1fe7b062318..74c4f2c6cde9 100644 --- a/caffe2/python/mkl/mkl_conv_op_test.py +++ b/caffe2/python/mkl/mkl_conv_op_test.py @@ -5,7 +5,7 @@ import unittest import hypothesis.strategies as st -from hypothesis import given, settings +from hypothesis import given import numpy as np from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/mkl/mkl_fc_op_test.py b/caffe2/python/mkl/mkl_fc_op_test.py index 01786d55c337..180d93f26570 100644 --- a/caffe2/python/mkl/mkl_fc_op_test.py +++ b/caffe2/python/mkl/mkl_fc_op_test.py @@ -5,7 +5,7 @@ import unittest import hypothesis.strategies as st -from hypothesis import given, settings +from hypothesis import given import numpy as np from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/mkl/mkl_fc_speed_test.py b/caffe2/python/mkl/mkl_fc_speed_test.py index 85f5605e9676..243e49c2f8f8 100644 --- a/caffe2/python/mkl/mkl_fc_speed_test.py +++ b/caffe2/python/mkl/mkl_fc_speed_test.py @@ -6,7 +6,7 @@ import numpy as np from caffe2.proto import caffe2_pb2 -from caffe2.python import cnn, core, workspace, test_util +from caffe2.python import core, workspace, test_util @unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn.") diff --git a/caffe2/python/mkl/mkl_fill_op_test.py b/caffe2/python/mkl/mkl_fill_op_test.py index 26a9b7131b0b..f233275786f7 100644 --- a/caffe2/python/mkl/mkl_fill_op_test.py +++ b/caffe2/python/mkl/mkl_fill_op_test.py @@ -5,8 +5,7 @@ import unittest import hypothesis.strategies as st -from hypothesis import given, settings -import numpy as np +from hypothesis import given from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import caffe2.python.mkl_test_util as mu diff --git a/caffe2/python/mkl/mkl_pool_speed_test.py b/caffe2/python/mkl/mkl_pool_speed_test.py index b25e0f915cc7..aa43aed97a09 100644 --- a/caffe2/python/mkl/mkl_pool_speed_test.py +++ b/caffe2/python/mkl/mkl_pool_speed_test.py @@ -6,7 +6,7 @@ import numpy as np from caffe2.proto import caffe2_pb2 -from caffe2.python import cnn, core, workspace, test_util +from caffe2.python import core, workspace, test_util @unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn.") diff --git 
a/caffe2/python/mkl/mkl_sbn_op_test.py b/caffe2/python/mkl/mkl_sbn_op_test.py index 2ac9080ce670..86856b130d63 100644 --- a/caffe2/python/mkl/mkl_sbn_op_test.py +++ b/caffe2/python/mkl/mkl_sbn_op_test.py @@ -5,7 +5,7 @@ import unittest import hypothesis.strategies as st -from hypothesis import given, settings +from hypothesis import given import numpy as np from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/mkl/mkl_sbn_speed_test.py b/caffe2/python/mkl/mkl_sbn_speed_test.py index 3b3b71d1c997..05885ceca575 100644 --- a/caffe2/python/mkl/mkl_sbn_speed_test.py +++ b/caffe2/python/mkl/mkl_sbn_speed_test.py @@ -6,7 +6,7 @@ import numpy as np from caffe2.proto import caffe2_pb2 -from caffe2.python import cnn, core, workspace, test_util +from caffe2.python import core, workspace, test_util @unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn.") diff --git a/caffe2/python/mkl/mkl_speed_test.py b/caffe2/python/mkl/mkl_speed_test.py index 9a7310a484d1..ab2e4428519a 100644 --- a/caffe2/python/mkl/mkl_speed_test.py +++ b/caffe2/python/mkl/mkl_speed_test.py @@ -6,7 +6,7 @@ import numpy as np from caffe2.proto import caffe2_pb2 -from caffe2.python import cnn, core, workspace, test_util +from caffe2.python import core, workspace, test_util @unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn.") diff --git a/caffe2/python/mkl/rewrite_graph.py b/caffe2/python/mkl/rewrite_graph.py index 3a88a3deeccc..b52501584064 100644 --- a/caffe2/python/mkl/rewrite_graph.py +++ b/caffe2/python/mkl/rewrite_graph.py @@ -6,7 +6,6 @@ import copy from caffe2.proto import caffe2_pb2 from caffe2.python import core -import caffe2.python._import_c_extension as C def rewrite_init_net_simple(net): diff --git a/caffe2/python/nomnigraph_test.py b/caffe2/python/nomnigraph_test.py index 3d9adc696486..bd9d10fcbae1 100644 --- a/caffe2/python/nomnigraph_test.py +++ b/caffe2/python/nomnigraph_test.py @@ -3,7 +3,7 @@ -from caffe2.python import core, workspace, test_util +from caffe2.python import core, test_util from caffe2.proto import caffe2_pb2 import caffe2.python.nomnigraph as ng diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index 5d445576b32c..193a6f217f93 100644 --- a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -5,14 +5,7 @@ To run this, you will need to have Caffe2 installed as well. """ - - - - - -import os import collections -from subprocess import Popen, PIPE import sys import zipfile import itertools @@ -23,16 +16,13 @@ # importing onnx first, which will cause it to go out and pick up the # system protobuf. 
import onnx.backend - -import caffe2 from caffe2.python import core, workspace, rnn_cell, gru_cell -from caffe2.python.compatibility import container_abcs from caffe2.python.model_helper import ModelHelper from caffe2.proto import caffe2_pb2 import caffe2.python.utils import numpy as np import onnx -from onnx import checker, GraphProto, TensorProto, AttributeProto, ModelProto +from onnx import TensorProto import onnx.numpy_helper import onnx.defs import onnx.optimizer @@ -42,7 +32,6 @@ from caffe2.python.onnx.workspace import Workspace from caffe2.python.onnx.backend_rep import Caffe2Rep -from caffe2.python.onnx.backend_cpp_rep import Caffe2CppRep import caffe2.python._import_c_extension as C @@ -781,7 +770,7 @@ def _onnx_node_to_caffe2_op(cls, init_model, pred_model, node_def, opset_version ops = translator(init_model, pred_model, OnnxNode(node_def), opset_version) if isinstance(ops, Caffe2Ops): return ops - if not isinstance(ops, container_abcs.Iterable): + if not isinstance(ops, collections.abc.Iterable): ops = [ops] return Caffe2Ops(ops, [], []) diff --git a/caffe2/python/onnx/bin/conversion.py b/caffe2/python/onnx/bin/conversion.py index 126eef8a8470..7e469e514a73 100644 --- a/caffe2/python/onnx/bin/conversion.py +++ b/caffe2/python/onnx/bin/conversion.py @@ -9,8 +9,7 @@ from caffe2.proto import caffe2_pb2 import click -import numpy as np -from onnx import checker, ModelProto +from onnx import ModelProto from caffe2.python.onnx.backend import Caffe2Backend as c2 import caffe2.python.onnx.frontend as c2_onnx diff --git a/caffe2/python/onnx/frontend.py b/caffe2/python/onnx/frontend.py index ee3c30949ff7..b5121602aff5 100644 --- a/caffe2/python/onnx/frontend.py +++ b/caffe2/python/onnx/frontend.py @@ -10,22 +10,18 @@ - +import collections import itertools import logging import re from caffe2.python import core as caffe2_core -from caffe2.python.compatibility import container_abcs -from caffe2.proto import caffe2_legacy_pb2 -from enum import Enum -from onnx import (defs, checker, helper, numpy_helper, mapping, - ModelProto, GraphProto, NodeProto, AttributeProto, TensorProto, OperatorSetIdProto) -from onnx.helper import make_tensor, make_tensor_value_info, make_attribute, make_model +from onnx import (checker, helper, numpy_helper, mapping, + GraphProto, NodeProto, TensorProto, OperatorSetIdProto) +from onnx.helper import make_tensor_value_info, make_model import numpy as np from caffe2.python.onnx.helper import c2_native_run_net -from caffe2.python.onnx.error import Unsupported import caffe2.python._import_c_extension as C @@ -156,7 +152,7 @@ def caffe2_op_to_onnx_node(cls, op_def, shapes): const_tensors = [] if isinstance(nodes, tuple): nodes, const_tensors = nodes - if not isinstance(nodes, container_abcs.Iterable): + if not isinstance(nodes, collections.abc.Iterable): nodes = [nodes] return nodes, const_tensors diff --git a/caffe2/python/onnx/helper.py b/caffe2/python/onnx/helper.py index 7f8f1a6d346a..6e73a5d5c95d 100644 --- a/caffe2/python/onnx/helper.py +++ b/caffe2/python/onnx/helper.py @@ -9,9 +9,6 @@ from onnx.backend.base import namedtupledict from caffe2.python.onnx.workspace import Workspace -import caffe2.python._import_c_extension as C - -import io import logging import time diff --git a/caffe2/python/onnx/onnxifi.py b/caffe2/python/onnx/onnxifi.py index a04e7e4554b9..3e67c4948b1f 100644 --- a/caffe2/python/onnx/onnxifi.py +++ b/caffe2/python/onnx/onnxifi.py @@ -11,9 +11,7 @@ from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace import 
caffe2.python._import_c_extension as C -import numpy as np def onnxifi_caffe2_net( diff --git a/caffe2/python/onnx/test_onnxifi.py b/caffe2/python/onnx/test_onnxifi.py index 7eafccaec9e4..4316149d5bf6 100644 --- a/caffe2/python/onnx/test_onnxifi.py +++ b/caffe2/python/onnx/test_onnxifi.py @@ -3,16 +3,14 @@ -import json import numpy as np -import os import time import unittest import onnx import onnx.defs from onnx.backend.base import namedtupledict -from onnx.helper import make_node, make_graph, make_tensor, make_tensor_value_info, make_model +from onnx.helper import make_node, make_graph, make_tensor_value_info, make_model from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace from caffe2.python.models.download import ModelDownloader diff --git a/caffe2/python/onnx/tests/c2_ref_test.py b/caffe2/python/onnx/tests/c2_ref_test.py index d2efcc79823e..aab5a04a169c 100644 --- a/caffe2/python/onnx/tests/c2_ref_test.py +++ b/caffe2/python/onnx/tests/c2_ref_test.py @@ -6,7 +6,6 @@ -import json import os import unittest @@ -17,7 +16,7 @@ from onnx.helper import make_node, make_graph, make_tensor, make_tensor_value_info, make_model from caffe2.python.onnx.helper import c2_native_run_net, c2_native_run_op -from onnx import defs, mapping +from onnx import mapping import caffe2.python.onnx.frontend as c2_onnx import caffe2.python.onnx.backend as c2 diff --git a/caffe2/python/onnx/tests/onnx_backend_test.py b/caffe2/python/onnx/tests/onnx_backend_test.py index 5166ec3c5083..e8b718a5a2be 100644 --- a/caffe2/python/onnx/tests/onnx_backend_test.py +++ b/caffe2/python/onnx/tests/onnx_backend_test.py @@ -13,7 +13,7 @@ import caffe2.python.onnx.backend as c2 -from caffe2.python import core, workspace +from caffe2.python import core core.SetEnginePref({}, {}) # This is a pytest magic variable to load extra plugins diff --git a/caffe2/python/onnx/tests/ssa_test.py b/caffe2/python/onnx/tests/ssa_test.py index d34d4a0e5287..96f954037178 100644 --- a/caffe2/python/onnx/tests/ssa_test.py +++ b/caffe2/python/onnx/tests/ssa_test.py @@ -7,11 +7,10 @@ import copy -import onnx import numpy as np from caffe2.proto import caffe2_pb2 from caffe2.python import core -from onnx import helper, TensorProto +from onnx import TensorProto import caffe2.python.onnx.frontend as c2_onnx from caffe2.python.onnx.helper import c2_native_run_net diff --git a/caffe2/python/onnx/tests/test_utils.py b/caffe2/python/onnx/tests/test_utils.py index d224daf05ba3..bebfc1012957 100644 --- a/caffe2/python/onnx/tests/test_utils.py +++ b/caffe2/python/onnx/tests/test_utils.py @@ -6,7 +6,6 @@ -import os import unittest import numpy as np diff --git a/caffe2/python/operator_fp_exceptions_test.py b/caffe2/python/operator_fp_exceptions_test.py index 3a1ebcd4ec67..f039ef09f637 100644 --- a/caffe2/python/operator_fp_exceptions_test.py +++ b/caffe2/python/operator_fp_exceptions_test.py @@ -3,7 +3,6 @@ from caffe2.python import core, workspace -from caffe2.proto import caffe2_pb2 from caffe2.python.test_util import TestCase import numpy as np diff --git a/caffe2/python/operator_test/blobs_queue_db_test.py b/caffe2/python/operator_test/blobs_queue_db_test.py index 6cf8170b34f8..88197d16d70b 100644 --- a/caffe2/python/operator_test/blobs_queue_db_test.py +++ b/caffe2/python/operator_test/blobs_queue_db_test.py @@ -3,7 +3,6 @@ -import unittest import numpy as np import caffe2.proto.caffe2_pb2 as caffe2_pb2 diff --git a/caffe2/python/operator_test/boolean_mask_test.py b/caffe2/python/operator_test/boolean_mask_test.py index 
05b8212242e4..38fe43899990 100644 --- a/caffe2/python/operator_test/boolean_mask_test.py +++ b/caffe2/python/operator_test/boolean_mask_test.py @@ -2,7 +2,6 @@ -from caffe2.proto import caffe2_pb2 from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/bucketize_op_test.py b/caffe2/python/operator_test/bucketize_op_test.py index bf9af112a5b0..2eb2acf87902 100644 --- a/caffe2/python/operator_test/bucketize_op_test.py +++ b/caffe2/python/operator_test/bucketize_op_test.py @@ -2,10 +2,9 @@ -from caffe2.python import core, dyndep +from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st import numpy as np diff --git a/caffe2/python/operator_test/concat_split_op_test.py b/caffe2/python/operator_test/concat_split_op_test.py index 1927b4eac78f..ac83681f08bf 100644 --- a/caffe2/python/operator_test/concat_split_op_test.py +++ b/caffe2/python/operator_test/concat_split_op_test.py @@ -3,8 +3,7 @@ -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace +from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given, settings diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py index ae54cd37a91d..e600aa2c9ee9 100644 --- a/caffe2/python/operator_test/conv_test.py +++ b/caffe2/python/operator_test/conv_test.py @@ -2,7 +2,6 @@ import collections import functools -import os import unittest import caffe2.python._import_c_extension as C diff --git a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py b/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py index 04bfbbe6f4f6..d979407321a4 100644 --- a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py +++ b/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py @@ -3,7 +3,6 @@ -from hypothesis import given import hypothesis.strategies as st import numpy as np diff --git a/caffe2/python/operator_test/crf_test.py b/caffe2/python/operator_test/crf_test.py index b75e7b7b1a10..4d7b90c431a6 100644 --- a/caffe2/python/operator_test/crf_test.py +++ b/caffe2/python/operator_test/crf_test.py @@ -9,7 +9,6 @@ import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st from hypothesis import given, settings -import unittest class TestCRFOp(hu.HypothesisTestCase): diff --git a/caffe2/python/operator_test/cross_entropy_ops_test.py b/caffe2/python/operator_test/cross_entropy_ops_test.py index d1852e7dd9e8..c88f93503a15 100644 --- a/caffe2/python/operator_test/cross_entropy_ops_test.py +++ b/caffe2/python/operator_test/cross_entropy_ops_test.py @@ -9,7 +9,6 @@ import numpy as np import unittest -import os def sigmoid(x): return 1.0 / (1.0 + np.exp(-x)) diff --git a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py b/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py index 1dda7166e65a..29440c00a4b3 100644 --- a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py +++ b/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from caffe2.python.test_util import caffe2_flaky from collections import defaultdict, Counter from hypothesis import given, settings import caffe2.python.hypothesis_test_util as hu diff --git 
a/caffe2/python/operator_test/cudnn_recurrent_test.py b/caffe2/python/operator_test/cudnn_recurrent_test.py index db1b826cfe41..ef4433a41a18 100644 --- a/caffe2/python/operator_test/cudnn_recurrent_test.py +++ b/caffe2/python/operator_test/cudnn_recurrent_test.py @@ -4,7 +4,6 @@ from caffe2.python import model_helper, workspace, core, rnn_cell -from caffe2.proto import caffe2_pb2 from future.utils import viewitems import numpy as np diff --git a/caffe2/python/operator_test/deform_conv_test.py b/caffe2/python/operator_test/deform_conv_test.py index f6ad0e38e73c..67289de5e924 100644 --- a/caffe2/python/operator_test/deform_conv_test.py +++ b/caffe2/python/operator_test/deform_conv_test.py @@ -1,6 +1,5 @@ -import os import unittest import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/depthwise_3x3_conv_test.py b/caffe2/python/operator_test/depthwise_3x3_conv_test.py index 2d6d6429f833..cdfffce288dd 100644 --- a/caffe2/python/operator_test/depthwise_3x3_conv_test.py +++ b/caffe2/python/operator_test/depthwise_3x3_conv_test.py @@ -5,7 +5,7 @@ import numpy as np import caffe2.python.hypothesis_test_util as hu -from caffe2.python import core, dyndep, utils, workspace +from caffe2.python import core, utils from hypothesis import given, settings import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/distance_op_test.py b/caffe2/python/operator_test/distance_op_test.py index e948fdae9673..5b46548e072b 100644 --- a/caffe2/python/operator_test/distance_op_test.py +++ b/caffe2/python/operator_test/distance_op_test.py @@ -6,7 +6,6 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial -from hypothesis import given import hypothesis.strategies as st import numpy as np diff --git a/caffe2/python/operator_test/elementwise_linear_op_test.py b/caffe2/python/operator_test/elementwise_linear_op_test.py index ac0dc3dd0975..2bd85625a3d9 100644 --- a/caffe2/python/operator_test/elementwise_linear_op_test.py +++ b/caffe2/python/operator_test/elementwise_linear_op_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/elementwise_ops_test.py b/caffe2/python/operator_test/elementwise_ops_test.py index 8dbfdc1871e8..31f70086de7b 100644 --- a/caffe2/python/operator_test/elementwise_ops_test.py +++ b/caffe2/python/operator_test/elementwise_ops_test.py @@ -10,7 +10,6 @@ import numpy as np import unittest -import os class TestElementwiseOps(hu.HypothesisTestCase): diff --git a/caffe2/python/operator_test/enforce_finite_op_test.py b/caffe2/python/operator_test/enforce_finite_op_test.py index b843bfdc95b9..8150977945a2 100644 --- a/caffe2/python/operator_test/enforce_finite_op_test.py +++ b/caffe2/python/operator_test/enforce_finite_op_test.py @@ -8,7 +8,6 @@ from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st class TestEnforceFinite(hu.HypothesisTestCase): diff --git a/caffe2/python/operator_test/expand_op_test.py b/caffe2/python/operator_test/expand_op_test.py index 0d198b1aff14..aba2c1106da3 100644 --- a/caffe2/python/operator_test/expand_op_test.py +++ b/caffe2/python/operator_test/expand_op_test.py @@ -3,7 +3,7 @@ -from caffe2.python import core, workspace +from caffe2.python 
import core from hypothesis import given, settings import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/feature_maps_ops_test.py b/caffe2/python/operator_test/feature_maps_ops_test.py index 19fa329c9389..5a20b63166be 100644 --- a/caffe2/python/operator_test/feature_maps_ops_test.py +++ b/caffe2/python/operator_test/feature_maps_ops_test.py @@ -2,7 +2,7 @@ -from caffe2.python import core, workspace, dyndep +from caffe2.python import core, workspace from caffe2.python.test_util import TestCase import numpy as np diff --git a/caffe2/python/operator_test/glu_op_test.py b/caffe2/python/operator_test/glu_op_test.py index f38df09ec9fb..7b7a33dcd90a 100644 --- a/caffe2/python/operator_test/glu_op_test.py +++ b/caffe2/python/operator_test/glu_op_test.py @@ -6,7 +6,7 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial -from hypothesis import assume, given, settings, HealthCheck +from hypothesis import given, settings import hypothesis.strategies as st import numpy as np diff --git a/caffe2/python/operator_test/group_conv_test.py b/caffe2/python/operator_test/group_conv_test.py index 62aba236d5ba..8e864bb42152 100644 --- a/caffe2/python/operator_test/group_conv_test.py +++ b/caffe2/python/operator_test/group_conv_test.py @@ -12,7 +12,6 @@ import caffe2.python.hypothesis_test_util as hu import unittest -import os class TestGroupConvolution(hu.HypothesisTestCase): diff --git a/caffe2/python/operator_test/gru_test.py b/caffe2/python/operator_test/gru_test.py index 99444f39ac26..1a7db2634989 100644 --- a/caffe2/python/operator_test/gru_test.py +++ b/caffe2/python/operator_test/gru_test.py @@ -16,7 +16,6 @@ import hypothesis.strategies as st import numpy as np import unittest -import os def gru_unit(*args, **kwargs): diff --git a/caffe2/python/operator_test/hyperbolic_ops_test.py b/caffe2/python/operator_test/hyperbolic_ops_test.py index 90a8197e7ccf..c0a1e8f49f5a 100644 --- a/caffe2/python/operator_test/hyperbolic_ops_test.py +++ b/caffe2/python/operator_test/hyperbolic_ops_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/im2col_col2im_test.py b/caffe2/python/operator_test/im2col_col2im_test.py index 760228382bc6..42cb1deaf8ae 100644 --- a/caffe2/python/operator_test/im2col_col2im_test.py +++ b/caffe2/python/operator_test/im2col_col2im_test.py @@ -10,9 +10,6 @@ import hypothesis.strategies as st import numpy as np -import unittest -import os - class TestReduceFrontSum(hu.HypothesisTestCase): @given(batch_size=st.integers(1, 3), diff --git a/caffe2/python/operator_test/instance_norm_test.py b/caffe2/python/operator_test/instance_norm_test.py index fb4f3c935ba8..efce9d7001fe 100644 --- a/caffe2/python/operator_test/instance_norm_test.py +++ b/caffe2/python/operator_test/instance_norm_test.py @@ -11,7 +11,6 @@ import caffe2.python.serialized_test.serialized_test_util as serial import unittest -import os class TestInstanceNorm(serial.SerializedTestCase): diff --git a/caffe2/python/operator_test/jsd_ops_test.py b/caffe2/python/operator_test/jsd_ops_test.py index 6ed2db2e88c2..f205d8e650b2 100644 --- a/caffe2/python/operator_test/jsd_ops_test.py +++ b/caffe2/python/operator_test/jsd_ops_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from 
hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/layer_norm_op_test.py b/caffe2/python/operator_test/layer_norm_op_test.py index 62e94afe9e7d..d402cce4c4f9 100644 --- a/caffe2/python/operator_test/layer_norm_op_test.py +++ b/caffe2/python/operator_test/layer_norm_op_test.py @@ -13,7 +13,6 @@ import hypothesis.strategies as st import numpy as np -import os import torch import unittest diff --git a/caffe2/python/operator_test/lengths_pad_op_test.py b/caffe2/python/operator_test/lengths_pad_op_test.py index 626ec0542b7d..cda2f7da323e 100644 --- a/caffe2/python/operator_test/lengths_pad_op_test.py +++ b/caffe2/python/operator_test/lengths_pad_op_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py b/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py index fc4e89e2545b..49b0ba7ec22c 100644 --- a/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py +++ b/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py @@ -3,7 +3,7 @@ import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st import numpy as np -from caffe2.python import core, dyndep, workspace +from caffe2.python import core, workspace from hypothesis import given diff --git a/caffe2/python/operator_test/lengths_tile_op_test.py b/caffe2/python/operator_test/lengths_tile_op_test.py index e0a5f9609588..441fcc747835 100644 --- a/caffe2/python/operator_test/lengths_tile_op_test.py +++ b/caffe2/python/operator_test/lengths_tile_op_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/loss_ops_test.py b/caffe2/python/operator_test/loss_ops_test.py index 24cb65ac96f8..f6a07ead3cf9 100644 --- a/caffe2/python/operator_test/loss_ops_test.py +++ b/caffe2/python/operator_test/loss_ops_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/matmul_op_test.py b/caffe2/python/operator_test/matmul_op_test.py index b8cef19b24df..8b4001a574ac 100644 --- a/caffe2/python/operator_test/matmul_op_test.py +++ b/caffe2/python/operator_test/matmul_op_test.py @@ -9,8 +9,6 @@ from hypothesis import assume, given, settings import hypothesis.strategies as st - -from caffe2.proto import caffe2_pb2 from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/mean_op_test.py b/caffe2/python/operator_test/mean_op_test.py index 5830089f8e9b..ee2c6fc8fbf7 100644 --- a/caffe2/python/operator_test/mean_op_test.py +++ b/caffe2/python/operator_test/mean_op_test.py @@ -6,8 +6,6 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import 
caffe2.python.serialized_test.serialized_test_util as serial - -from hypothesis import given import hypothesis.strategies as st import numpy as np import unittest diff --git a/caffe2/python/operator_test/moments_op_test.py b/caffe2/python/operator_test/moments_op_test.py index 3b270df254ce..bee44e360e3f 100644 --- a/caffe2/python/operator_test/moments_op_test.py +++ b/caffe2/python/operator_test/moments_op_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/numpy_tile_op_test.py b/caffe2/python/operator_test/numpy_tile_op_test.py index a202581f808c..c32aa99470db 100644 --- a/caffe2/python/operator_test/numpy_tile_op_test.py +++ b/caffe2/python/operator_test/numpy_tile_op_test.py @@ -9,7 +9,7 @@ import hypothesis.strategies as st import unittest -from caffe2.python import core, workspace +from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/onnx_while_test.py b/caffe2/python/operator_test/onnx_while_test.py index 4cff53b87d6e..5ad9c277239d 100644 --- a/caffe2/python/operator_test/onnx_while_test.py +++ b/caffe2/python/operator_test/onnx_while_test.py @@ -3,7 +3,7 @@ from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace +from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given, settings diff --git a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py b/caffe2/python/operator_test/pack_rnn_sequence_op_test.py index 9a76e6b847a5..eceb1e5ba6a9 100644 --- a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py +++ b/caffe2/python/operator_test/pack_rnn_sequence_op_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/pad_test.py b/caffe2/python/operator_test/pad_test.py index 6d4e6bbdcd08..788c4035dd5f 100644 --- a/caffe2/python/operator_test/pad_test.py +++ b/caffe2/python/operator_test/pad_test.py @@ -5,8 +5,6 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial - -from hypothesis import given import hypothesis.strategies as st import numpy as np import unittest diff --git a/caffe2/python/operator_test/percentile_op_test.py b/caffe2/python/operator_test/percentile_op_test.py index d81b0a963185..40c4192e21e9 100644 --- a/caffe2/python/operator_test/percentile_op_test.py +++ b/caffe2/python/operator_test/percentile_op_test.py @@ -3,7 +3,7 @@ -from caffe2.python import core, workspace, dyndep +from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import numpy as np diff --git a/caffe2/python/operator_test/rand_quantization_op_test.py b/caffe2/python/operator_test/rand_quantization_op_test.py index e244f77149e1..a702ab41577f 100644 --- a/caffe2/python/operator_test/rand_quantization_op_test.py +++ b/caffe2/python/operator_test/rand_quantization_op_test.py @@ -6,7 +6,6 @@ import numpy as np import struct import unittest -import os from hypothesis import given, example import 
hypothesis.strategies as st diff --git a/caffe2/python/operator_test/recurrent_network_test.py b/caffe2/python/operator_test/recurrent_network_test.py index 13650e6cad4e..33ada4d6881c 100644 --- a/caffe2/python/operator_test/recurrent_network_test.py +++ b/caffe2/python/operator_test/recurrent_network_test.py @@ -11,9 +11,6 @@ import hypothesis.strategies as st import numpy as np -import os -import unittest - class RecurrentNetworkTest(serial.SerializedTestCase): @given(T=st.integers(1, 4), n=st.integers(1, 5), diff --git a/caffe2/python/operator_test/reduce_ops_test.py b/caffe2/python/operator_test/reduce_ops_test.py index 727631befe89..7b79b3b81aed 100644 --- a/caffe2/python/operator_test/reduce_ops_test.py +++ b/caffe2/python/operator_test/reduce_ops_test.py @@ -11,7 +11,6 @@ import hypothesis.strategies as st import numpy as np import itertools as it -import unittest class TestReduceOps(serial.SerializedTestCase): diff --git a/caffe2/python/operator_test/reduction_ops_test.py b/caffe2/python/operator_test/reduction_ops_test.py index 7d4287df6609..6a99f2b27d42 100644 --- a/caffe2/python/operator_test/reduction_ops_test.py +++ b/caffe2/python/operator_test/reduction_ops_test.py @@ -3,7 +3,6 @@ -from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace from hypothesis import assume, given, settings import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/roi_align_rotated_op_test.py b/caffe2/python/operator_test/roi_align_rotated_op_test.py index c74157a039b0..ea835acead61 100644 --- a/caffe2/python/operator_test/roi_align_rotated_op_test.py +++ b/caffe2/python/operator_test/roi_align_rotated_op_test.py @@ -3,7 +3,6 @@ -from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace from hypothesis import given import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/sequence_ops_test.py b/caffe2/python/operator_test/sequence_ops_test.py index 4609473f91f0..65c0669abfb0 100644 --- a/caffe2/python/operator_test/sequence_ops_test.py +++ b/caffe2/python/operator_test/sequence_ops_test.py @@ -11,7 +11,6 @@ import hypothesis.strategies as st import numpy as np import unittest -import os def _gen_test_add_padding(with_pad_data=True, diff --git a/caffe2/python/operator_test/spatial_bn_op_test.py b/caffe2/python/operator_test/spatial_bn_op_test.py index 35f7bd2a5e29..21a530346329 100644 --- a/caffe2/python/operator_test/spatial_bn_op_test.py +++ b/caffe2/python/operator_test/spatial_bn_op_test.py @@ -3,7 +3,6 @@ -from caffe2.proto import caffe2_pb2 from caffe2.python import brew, core, utils, workspace import caffe2.python.hip_test_util as hiputl import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/square_root_divide_op_test.py b/caffe2/python/operator_test/square_root_divide_op_test.py index 5bd6cb1d08f8..51f328c95f5f 100644 --- a/caffe2/python/operator_test/square_root_divide_op_test.py +++ b/caffe2/python/operator_test/square_root_divide_op_test.py @@ -5,7 +5,6 @@ from caffe2.python import core from functools import partial -from hypothesis import given from hypothesis import strategies as st import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/optimizer_test_util.py b/caffe2/python/optimizer_test_util.py index 02276b08c176..beb8a3781832 100644 --- a/caffe2/python/optimizer_test_util.py +++ b/caffe2/python/optimizer_test_util.py @@ -8,7 +8,6 @@ import unittest import numpy as np from caffe2.python import brew, core, workspace, cnn, 
optimizer -from caffe2.proto import caffe2_pb2 from caffe2.python.modeling.initializers import ( Initializer, PseudoFP16Initializer) diff --git a/caffe2/python/pybind_state.h b/caffe2/python/pybind_state.h index b3926e941194..6513f216a9be 100644 --- a/caffe2/python/pybind_state.h +++ b/caffe2/python/pybind_state.h @@ -232,7 +232,6 @@ class TensorFeeder : public BlobFeederBase { for (int i = 0; i < tensor.numel(); ++i) { char* str; Py_ssize_t strSize; -#if PY_MAJOR_VERSION > 2 if (PyBytes_Check(input[i])) { CAFFE_ENFORCE( PyBytes_AsStringAndSize(input[i], &str, &strSize) != -1, @@ -246,11 +245,6 @@ class TensorFeeder : public BlobFeederBase { } else { CAFFE_THROW("Unsupported python object type passed into ndarray."); } -#else - CAFFE_ENFORCE( - PyBytes_AsStringAndSize(input[i], &str, &strSize) != -1, - "Unsupported python object type passed into ndarray."); -#endif // PY_MAJOR_VERSION > 2 outPtr[i] = std::string(str, strSize); } break; @@ -342,18 +336,12 @@ class PythonOpBase : public Operator { try { builder_call = loads(py::bytes(pickled)).cast(); } catch (const py::error_already_set& e) { -#if PY_MAJOR_VERSION >= 3 LOG(INFO) << "Cannot unpickle python operator: " << e.what(); LOG(INFO) << "Try latin1 encoding for python3 run"; // to use the `_a` literal for arguments using namespace pybind11::literals; builder_call = loads(py::bytes(pickled), "encoding"_a = "latin1") .template cast(); -#else - // for py2, simply re-throw the exception, as there is no encoding - // argument for pickle.loads - throw; -#endif } CAFFE_ENFORCE(builder_call); CAFFE_ENFORCE_EQ(py::len(builder_call), 3); diff --git a/caffe2/python/rnn/lstm_comparison.py b/caffe2/python/rnn/lstm_comparison.py index dee96413dbe5..34fddbc1a66e 100644 --- a/caffe2/python/rnn/lstm_comparison.py +++ b/caffe2/python/rnn/lstm_comparison.py @@ -2,7 +2,6 @@ -from caffe2.proto import caffe2_pb2 from caffe2.python import workspace, core, lstm_benchmark, utils from copy import copy diff --git a/caffe2/python/rnn_cell.py b/caffe2/python/rnn_cell.py index 9c85d0efd2a5..f6da5e126119 100644 --- a/caffe2/python/rnn_cell.py +++ b/caffe2/python/rnn_cell.py @@ -7,7 +7,6 @@ import functools import inspect -import itertools import logging import numpy as np import random diff --git a/caffe2/python/scope_test.py b/caffe2/python/scope_test.py index 9bd69eb32902..bf3c8e9a0d06 100644 --- a/caffe2/python/scope_test.py +++ b/caffe2/python/scope_test.py @@ -4,7 +4,6 @@ from caffe2.python import scope, core, workspace -from caffe2.proto import caffe2_pb2 import unittest import threading diff --git a/caffe2/python/test/executor_test_util.py b/caffe2/python/test/executor_test_util.py index ba10247eaa2e..abf63626a7fa 100644 --- a/caffe2/python/test/executor_test_util.py +++ b/caffe2/python/test/executor_test_util.py @@ -14,7 +14,6 @@ import time import numpy as np -from hypothesis import settings CI_MAX_EXAMPLES = 2 diff --git a/caffe2/python/test/inference_lstm_op_test.py b/caffe2/python/test/inference_lstm_op_test.py index 20caab9ba78b..768827bd8876 100644 --- a/caffe2/python/test/inference_lstm_op_test.py +++ b/caffe2/python/test/inference_lstm_op_test.py @@ -1,10 +1,9 @@ #!/usr/bin/env python3 -import inspect import hypothesis.strategies as st import numpy as np import torch -from caffe2.python import core, workspace +from caffe2.python import core from caffe2.python.test_util import TestCase from hypothesis import given, settings from torch import nn diff --git a/caffe2/python/test/python_protobuf_test.py b/caffe2/python/test/python_protobuf_test.py index 
7790e0f6d8f5..a407f33fe253 100644 --- a/caffe2/python/test/python_protobuf_test.py +++ b/caffe2/python/test/python_protobuf_test.py @@ -5,9 +5,6 @@ # make sure we use cpp implementation of protobuf import os os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "cpp" - -# import cpp extension first -from caffe2.python import core # then import protobuf from caffe2.proto import caffe2_pb2, metanet_pb2 diff --git a/caffe2/python/trt/test_pt_onnx_trt.py b/caffe2/python/trt/test_pt_onnx_trt.py index 96f1ad76f6b7..5e6abb5c4d0b 100644 --- a/caffe2/python/trt/test_pt_onnx_trt.py +++ b/caffe2/python/trt/test_pt_onnx_trt.py @@ -15,17 +15,13 @@ import os import unittest -from typing import List, Any from PIL import Image import numpy as np import torch -from torch.onnx import OperatorExportTypes import torchvision.models as models import pycuda.driver as cuda -# This import causes pycuda to automatically manage CUDA context creation and cleanup. -import pycuda.autoinit import tensorrt as trt TRT_LOGGER = trt.Logger(trt.Logger.WARNING) diff --git a/caffe2/python/trt/test_trt.py b/caffe2/python/trt/test_trt.py index 39d37ca9fa0a..2782cca7c13f 100644 --- a/caffe2/python/trt/test_trt.py +++ b/caffe2/python/trt/test_trt.py @@ -7,7 +7,7 @@ from caffe2.python import core, workspace import onnx import onnx.defs -from onnx.helper import make_node, make_graph, make_tensor, make_tensor_value_info, make_model +from onnx.helper import make_node, make_graph, make_tensor_value_info, make_model from onnx.backend.base import namedtupledict from caffe2.python.models.download import ModelDownloader import caffe2.python.onnx.backend as c2 @@ -16,7 +16,6 @@ from caffe2.python.onnx.tests.test_utils import TestCase import numpy as np import os.path -import json import time import unittest import tarfile diff --git a/caffe2/python/trt/transform.py b/caffe2/python/trt/transform.py index 0936941aac03..1b201007daab 100644 --- a/caffe2/python/trt/transform.py +++ b/caffe2/python/trt/transform.py @@ -12,9 +12,7 @@ from caffe2.proto import caffe2_pb2 -from caffe2.python.onnx.helper import c2_native_run_net, c2_native_run_op -from caffe2.python import core, workspace -import caffe2.python.onnx.frontend as c2_front +from caffe2.python import workspace import caffe2.python._import_c_extension as C import numpy as np diff --git a/caffe2/python/utils.py b/caffe2/python/utils.py index 947dd9bf296d..289d107303fa 100644 --- a/caffe2/python/utils.py +++ b/caffe2/python/utils.py @@ -6,12 +6,12 @@ from caffe2.proto import caffe2_pb2 -from caffe2.python.compatibility import container_abcs from future.utils import viewitems from google.protobuf.message import DecodeError, Message from google.protobuf import text_format import sys +import collections import copy import functools import numpy as np @@ -126,7 +126,7 @@ def MakeArgument(key, value): """Makes an argument based on the value type.""" argument = caffe2_pb2.Argument() argument.name = key - iterable = isinstance(value, container_abcs.Iterable) + iterable = isinstance(value, collections.abc.Iterable) # Fast tracking common use case where a float32 array of tensor parameters # needs to be serialized. 
The entire array is guaranteed to have the same diff --git a/docker/caffe2/jenkins/common/install_python.sh b/docker/caffe2/jenkins/common/install_python.sh index 48a47b271107..19633d451ab3 100755 --- a/docker/caffe2/jenkins/common/install_python.sh +++ b/docker/caffe2/jenkins/common/install_python.sh @@ -135,11 +135,6 @@ if [ -z "${INSTALL_SETUPTOOLS}" ]; then pip install -U pip setuptools!=38.5.2 fi -# tornado 5.0 requires Python 2.7.9+ or 3.4+ -if [[ $($PYTHON -c 'import sys; print(int(sys.version_info <= (2, 7, 9) or sys.version_info <= (3, 4)))' == 1) ]]; then - pip install 'tornado<5' -fi - # Need networkx 2.0 because bellmand_ford was moved in 2.1 . Scikit-image by # defaults installs the most recent networkx version, so we install this lower # version explicitly before scikit-image pulls it in as a dependency diff --git a/docs/caffe2/process.py b/docs/caffe2/process.py index 9fa37e5fbb5a..3b94b9d38502 100644 --- a/docs/caffe2/process.py +++ b/docs/caffe2/process.py @@ -1,20 +1,21 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 ## @package process # Module doxygen.process # Script to insert preamble for doxygen and regen API docs -import glob, os, shutil +import os +import shutil # Module caffe2...caffe2.python.control_test -def insert(originalfile,first_line,description): - with open(originalfile,'r') as f: +def insert(originalfile, first_line, description): + with open(originalfile, 'r') as f: f1 = f.readline() - if(f1.find(first_line)<0): + if(f1.find(first_line) < 0): docs = first_line + description + f1 - with open('newfile.txt','w') as f2: + with open('newfile.txt', 'w') as f2: f2.write(docs) f2.write(f.read()) - os.rename('newfile.txt',originalfile) + os.rename('newfile.txt', originalfile) else: print('already inserted') @@ -29,15 +30,15 @@ def insert(originalfile,first_line,description): for file in files: if (file.endswith(".py") and not file.endswith("_test.py") and not file.endswith("__.py")): filepath = os.path.join(root, file) - print("filepath: " + filepath) + print(("filepath: " + filepath)) directory = os.path.dirname(filepath)[2:] - directory = directory.replace("/",".") - print "directory: " + directory + directory = directory.replace("/", ".") + print("directory: " + directory) name = os.path.splitext(file)[0] first_line = "## @package " + name description = "\n# Module " + directory + "." 
+ name + "\n" - print first_line,description - insert(filepath,first_line,description) + print(first_line, description) + insert(filepath, first_line, description) if os.path.exists("doxygen/doxygen-python"): print("Looks like you ran this before, so we need to cleanup those old files...") diff --git a/mypy-strict.ini b/mypy-strict.ini index 42fc73abf1cc..7cc6fff83577 100644 --- a/mypy-strict.ini +++ b/mypy-strict.ini @@ -31,9 +31,11 @@ strict_equality = True files = tools/codegen/gen.py, tools/autograd/gen_annotated_fn_args.py, + tools/autograd/gen_autograd.py, tools/autograd/gen_python_functions.py, tools/autograd/gen_trace_type.py, tools/autograd/gen_variable_factories.py, + tools/autograd/gen_variable_type.py, tools/autograd/load_derivatives.py, torch/utils/benchmark/utils/common.py, torch/utils/benchmark/utils/timer.py, diff --git a/mypy.ini b/mypy.ini index 7d6161bddd17..bab4ce5dfd42 100644 --- a/mypy.ini +++ b/mypy.ini @@ -17,8 +17,13 @@ check_untyped_defs = True files = torch, caffe2, + test/test_bundled_images.py, + test/test_bundled_inputs.py, test/test_complex.py, + test/test_dataset.py, + test/test_expecttest.py, test/test_futures.py, + test/test_numpy_interop.py, test/test_torch.py, test/test_type_hints.py, test/test_type_info.py @@ -119,6 +124,12 @@ ignore_errors = True [mypy-torch.overrides] ignore_errors = True +# +# Adding type annotations to caffe2 is probably not worth the effort +# only work on this if you have a specific reason for it, otherwise +# leave these ignores as they are. +# + [mypy-caffe2.python.*] ignore_errors = True diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index 5f591ec0a52f..4332916fef6b 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -73,7 +73,7 @@ def allow_listed(schema, allow_list): dont_parse_list = [ ("_TorchScriptTesting.*", datetime.date(2099, 9, 17)), ("test_backend", datetime.date(2099, 9, 17)), - ("c10d.frontend", datetime.date(2020, 12, 30)), + ("dist_c10d", datetime.date(2021, 1, 30)), ] diff --git a/test/cpp_extensions/msnpu_extension.cpp b/test/cpp_extensions/msnpu_extension.cpp index 88c1d509b34c..ea67910f96da 100644 --- a/test/cpp_extensions/msnpu_extension.cpp +++ b/test/cpp_extensions/msnpu_extension.cpp @@ -53,10 +53,10 @@ std::tuple fake_convolution_backward( } TORCH_LIBRARY_IMPL(aten, MSNPU, m) { - m.impl_UNBOXED("empty.memory_format", empty_override); - m.impl_UNBOXED("add.Tensor", add_override); - m.impl_UNBOXED("convolution_overrideable", fake_convolution); - m.impl_UNBOXED("convolution_backward_overrideable", fake_convolution_backward); + m.impl("empty.memory_format", empty_override); + m.impl("add.Tensor", add_override); + m.impl("convolution_overrideable", fake_convolution); + m.impl("convolution_backward_overrideable", fake_convolution_backward); } // TODO: Extend this to exercise multi-device setting. 
In that case, diff --git a/test/cpp_extensions/rng_extension.cpp b/test/cpp_extensions/rng_extension.cpp index bf16a840dfc9..f3ab91fb3cab 100644 --- a/test/cpp_extensions/rng_extension.cpp +++ b/test/cpp_extensions/rng_extension.cpp @@ -22,6 +22,8 @@ struct TestCPUGenerator : public c10::GeneratorImpl { void set_current_seed(uint64_t seed) override { throw std::runtime_error("not implemented"); } uint64_t current_seed() const override { throw std::runtime_error("not implemented"); } uint64_t seed() override { throw std::runtime_error("not implemented"); } + void set_state(const c10::TensorImpl& new_state) override { throw std::runtime_error("not implemented"); } + c10::intrusive_ptr<c10::TensorImpl> get_state() const override { throw std::runtime_error("not implemented"); } TestCPUGenerator* clone_impl() const override { throw std::runtime_error("not implemented"); } static DeviceType device_type() { return DeviceType::CPU; } @@ -54,9 +56,9 @@ size_t getInstanceCount() { } TORCH_LIBRARY_IMPL(aten, CustomRNGKeyId, m) { - m.impl_UNBOXED("aten::random_.from", random_from_to); - m.impl_UNBOXED("aten::random_.to", random_to); - m.impl_UNBOXED("aten::random_", random_); + m.impl("aten::random_.from", random_from_to); + m.impl("aten::random_.to", random_to); + m.impl("aten::random_", random_); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index 5ffd4b4fb088..93e26be7ee98 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -4641,6 +4641,43 @@ def test_nccl_barrier_timeout_new_group_non_member(self): with self.assertRaisesRegex(RuntimeError, "Timed out initializing process group"): c10d.new_group([0], timeout=timedelta(seconds=1)) + @requires_nccl() + @skip_if_not_multigpu + def test_nccl_barrier_device_ids(self): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="nccl", + rank=self.rank, + world_size=self.world_size, + store=store) + + c10d.barrier(device_ids=[self.rank]) + + @requires_nccl() + @skip_if_not_multigpu + def test_nccl_barrier_device_ids_function_argument(self): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="nccl", + rank=self.rank, + world_size=self.world_size, + store=store) + + with self.assertRaisesRegex(RuntimeError, "Invalid function argument"): + c10d.barrier(device_ids=self.rank) + + @requires_gloo() + def test_gloo_barrier_device_ids(self): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="gloo", + rank=self.rank, + world_size=self.world_size, + store=store) + + with self.assertRaisesRegex(RuntimeError, "device_ids not supported"): + c10d.barrier(device_ids=[self.rank]) + if __name__ == "__main__": assert ( not torch.cuda._initialized diff --git a/test/distributed/test_jit_c10d.py b/test/distributed/test_jit_c10d.py index 85788b914059..182a405d0e78 100644 --- a/test/distributed/test_jit_c10d.py +++ b/test/distributed/test_jit_c10d.py @@ -4,6 +4,7 @@ import torch import torch.distributed as c10d import time +from datetime import timedelta from typing import List import torch.testing._internal.common_utils as common @@ -31,6 +32,14 @@ def unique_process_group_name(prefix): now = int(time.time() * 1000) return "%s_%d" % (prefix, now) +def _create_tcp_store(): + addr = "localhost" + port = common.find_free_port() + timeout = timedelta(minutes=5) + timeout_millisecond = int(timeout / timedelta(milliseconds=1)) + return torch.classes.dist_c10d.TCPStore(addr, 
port, 1, True, timeout_millisecond) + + @unittest.skipIf( TEST_WITH_TSAN, "TSAN is not fork-safe since we're forking in a multi-threaded environment", @@ -48,19 +57,15 @@ def setUp(self): raise unittest.SkipTest("NCCL test requires 2+ GPUs") def _create_nccl_pg(self, name_prefix): - addr = "localhost" - port = common.find_free_port() - tcp_store = torch.classes.dist_c10d.TCPStore(addr, port, 1, True) + tcp_store = _create_tcp_store() opts = torch.classes.dist_c10d.ProcessGroupNCCLOptions(0, True) name = unique_process_group_name(name_prefix) - return torch.classes.dist_c10d.ProcessGroupNCCL(tcp_store, self.rank, self.world_size, opts, name) + return torch.classes.dist_c10d.ProcessGroupNCCL(tcp_store, self.rank, self.world_size, opts, name) def _create_nccl_pg_as_base_process_group(self, name): - addr = "localhost" - port = common.find_free_port() - tcp_store = torch.classes.dist_c10d.TCPStore(addr, port, 1, True) + tcp_store = _create_tcp_store() return torch.classes.dist_c10d.frontend().new_process_group_helper( self.world_size, self.rank, [], "nccl", tcp_store, name, 0) @@ -155,9 +160,7 @@ def test_frontend_singleton(self): frontend1 = torch.classes.dist_c10d.frontend() frontend2 = torch.classes.dist_c10d.frontend() - addr = "localhost" - port = common.find_free_port() - tcp_store = torch.classes.dist_c10d.TCPStore(addr, port, 1, True) + tcp_store = _create_tcp_store() pg_name = unique_process_group_name("singleton_test_process_group") @@ -180,9 +183,7 @@ def test_process_group_as_module_member(self): class TestModule(torch.nn.Module): def __init__(self): super(TestModule, self).__init__() - addr = "localhost" - port = common.find_free_port() - tcp_store = torch.classes.dist_c10d.TCPStore(addr, port, 1, True) + tcp_store = _create_tcp_store() name = unique_process_group_name("module_member_process_group") self.pg = torch.classes.dist_c10d.frontend().new_process_group_helper( diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py index b057d12a285d..8c927f35fd2e 100644 --- a/test/distributions/test_distributions.py +++ b/test/distributions/test_distributions.py @@ -727,7 +727,7 @@ def _gradcheck_log_prob(self, dist_ctor, ctor_params): # performs gradient checks on log_prob distribution = dist_ctor(*ctor_params) s = distribution.sample() - if s.is_floating_point(): + if not distribution.support.is_discrete: s = s.detach().requires_grad_() expected_shape = distribution.batch_shape + distribution.event_shape @@ -1422,7 +1422,7 @@ def test_uniform(self): self.assertEqual(Uniform(0.0, 1.0).sample((1,)).size(), (1,)) # Check log_prob computation when value outside range - uniform = Uniform(low_1d, high_1d) + uniform = Uniform(low_1d, high_1d, validate_args=False) above_high = torch.tensor([4.0]) below_low = torch.tensor([-1.0]) self.assertEqual(uniform.log_prob(above_high).item(), -inf) @@ -1517,7 +1517,7 @@ def test_halfcauchy(self): def test_halfnormal(self): std = torch.randn(5, 5).abs().requires_grad_() - std_1d = torch.randn(1, requires_grad=True) + std_1d = torch.randn(1).abs().requires_grad_() std_delta = torch.tensor([1e-5, 1e-5]) self.assertEqual(HalfNormal(std).sample().size(), (5, 5)) self.assertEqual(HalfNormal(std).sample((7,)).size(), (7, 5, 5)) @@ -1978,6 +1978,8 @@ def gradcheck_func(samples, mu, sigma, prec, scale_tril): sigma = 0.5 * (sigma + sigma.transpose(-1, -2)) # Ensure symmetry of covariance if prec is not None: prec = 0.5 * (prec + prec.transpose(-1, -2)) # Ensure symmetry of precision + if scale_tril is not None: + scale_tril 
= scale_tril.tril() return MultivariateNormal(mu, sigma, prec, scale_tril).log_prob(samples) gradcheck(gradcheck_func, (mvn_samples, mean, covariance, precision, scale_tril), raise_exception=True) @@ -2643,7 +2645,7 @@ def test_cdf_log_prob(self): for i, param in enumerate(params): dist = Dist(**param) samples = dist.sample() - if samples.dtype.is_floating_point: + if not dist.support.is_discrete: samples.requires_grad_() try: cdfs = dist.cdf(samples) @@ -3050,11 +3052,9 @@ def setUp(self): self.scalar_sample = 1 self.tensor_sample_1 = torch.ones(3, 2) self.tensor_sample_2 = torch.ones(3, 2, 3) - Distribution.set_default_validate_args(True) def tearDown(self): super(TestDistributionShapes, self).tearDown() - Distribution.set_default_validate_args(False) def test_entropy_shape(self): for Dist, params in EXAMPLES: @@ -3186,23 +3186,23 @@ def test_one_hot_categorical_shape(self): self.assertEqual(dist.sample().size(), torch.Size((3,))) self.assertEqual(dist.sample((3, 2)).size(), torch.Size((3, 2, 3))) self.assertRaises(ValueError, dist.log_prob, self.tensor_sample_1) - simplex_sample = self.tensor_sample_2 / self.tensor_sample_2.sum(-1, keepdim=True) - self.assertEqual(dist.log_prob(simplex_sample).size(), torch.Size((3, 2,))) + sample = torch.tensor([0., 1., 0.]).expand(3, 2, 3) + self.assertEqual(dist.log_prob(sample).size(), torch.Size((3, 2,))) self.assertEqual(dist.log_prob(dist.enumerate_support()).size(), torch.Size((3,))) - simplex_sample = torch.ones(3, 3) / 3 - self.assertEqual(dist.log_prob(simplex_sample).size(), torch.Size((3,))) + sample = torch.eye(3) + self.assertEqual(dist.log_prob(sample).size(), torch.Size((3,))) # batched dist = OneHotCategorical(torch.tensor([[0.6, 0.3], [0.6, 0.3], [0.6, 0.3]])) self.assertEqual(dist._batch_shape, torch.Size((3,))) self.assertEqual(dist._event_shape, torch.Size((2,))) self.assertEqual(dist.sample().size(), torch.Size((3, 2))) self.assertEqual(dist.sample((3, 2)).size(), torch.Size((3, 2, 3, 2))) - simplex_sample = self.tensor_sample_1 / self.tensor_sample_1.sum(-1, keepdim=True) - self.assertEqual(dist.log_prob(simplex_sample).size(), torch.Size((3,))) + sample = torch.tensor([0., 1.]) + self.assertEqual(dist.log_prob(sample).size(), torch.Size((3,))) self.assertRaises(ValueError, dist.log_prob, self.tensor_sample_2) self.assertEqual(dist.log_prob(dist.enumerate_support()).size(), torch.Size((2, 3))) - simplex_sample = torch.ones(3, 1, 2) / 2 - self.assertEqual(dist.log_prob(simplex_sample).size(), torch.Size((3, 3))) + sample = torch.tensor([0., 1.]).expand(3, 1, 2) + self.assertEqual(dist.log_prob(sample).size(), torch.Size((3, 3))) def test_cauchy_shape_scalar_params(self): cauchy = Cauchy(0, 1) @@ -3531,12 +3531,15 @@ def __init__(self, probs): [0.2, 0.7, 0.1], [0.33, 0.33, 0.34], [0.2, 0.2, 0.6]]) - pareto = pairwise(Pareto, [2.5, 4.0, 2.5, 4.0], [2.25, 3.75, 2.25, 3.75]) + pareto = (Pareto(torch.tensor([2.5, 4.0, 2.5, 4.0]).expand(4, 4), + torch.tensor([2.25, 3.75, 2.25, 3.75]).expand(4, 4)), + Pareto(torch.tensor([2.25, 3.75, 2.25, 3.8]).expand(4, 4), + torch.tensor([2.25, 3.75, 2.25, 3.75]).expand(4, 4))) poisson = pairwise(Poisson, [0.3, 1.0, 5.0, 10.0]) - uniform_within_unit = pairwise(Uniform, [0.15, 0.95, 0.2, 0.8], [0.1, 0.9, 0.25, 0.75]) + uniform_within_unit = pairwise(Uniform, [0.1, 0.9, 0.2, 0.75], [0.15, 0.95, 0.25, 0.8]) uniform_positive = pairwise(Uniform, [1, 1.5, 2, 4], [1.2, 2.0, 3, 7]) uniform_real = pairwise(Uniform, [-2., -1, 0, 2], [-1., 1, 1, 4]) - uniform_pareto = pairwise(Uniform, [6.5, 8.5, 6.5, 8.5], 
[7.5, 7.5, 9.5, 9.5]) + uniform_pareto = pairwise(Uniform, [6.5, 7.5, 6.5, 8.5], [7.5, 8.5, 9.5, 9.5]) continuous_bernoulli = pairwise(ContinuousBernoulli, [0.1, 0.2, 0.5, 0.9]) # These tests should pass with precision = 0.01, but that makes tests very expensive. @@ -4148,8 +4151,8 @@ def test_lazy_logits_initialization(self): probs = param.pop('probs') param['logits'] = probs_to_logits(probs) dist = Dist(**param) - shape = (1,) if not dist.event_shape else dist.event_shape - dist.log_prob(torch.ones(shape)) + # Create new instance to generate a valid sample + dist.log_prob(Dist(**param).sample()) message = 'Failed for {} example 0/{}'.format(Dist.__name__, len(params)) self.assertFalse('probs' in vars(dist), msg=message) try: @@ -4455,7 +4458,6 @@ def test_stack_transform(self): class TestValidation(TestCase): def setUp(self): super(TestCase, self).setUp() - Distribution.set_default_validate_args(True) def test_valid(self): for Dist, params in EXAMPLES: @@ -4475,7 +4477,6 @@ def test_invalid(self): def tearDown(self): super(TestValidation, self).tearDown() - Distribution.set_default_validate_args(False) class TestJit(TestCase): diff --git a/test/jit/test_recursive_script.py b/test/jit/test_recursive_script.py index bd9a2bb32b89..a0dc99a4e463 100644 --- a/test/jit/test_recursive_script.py +++ b/test/jit/test_recursive_script.py @@ -495,6 +495,59 @@ def forward(self, x): self.checkModule(M(), (torch.randn(5, 5),)) + def test_prepare_scriptable_basic(self): + class SeluButReluWhenScripted(torch.nn.SELU): + def __prepare_scriptable__(self): + return nn.ReLU() + + t = torch.randn(5, 5) + m = SeluButReluWhenScripted() + sm = torch.jit.script(m) + eager_out = m(t) + script_out = sm(t) + self.assertNotEqual(eager_out, script_out) + + def test_prepare_scriptable_iterable_modules(self): + class SeluButReluWhenScripted(torch.nn.SELU): + def __prepare_scriptable__(self): + return nn.ReLU() + + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + shared = SeluButReluWhenScripted() + self.sequential = nn.Sequential( + SeluButReluWhenScripted(), + SeluButReluWhenScripted(), + nn.Sequential(SeluButReluWhenScripted(), shared, SeluButReluWhenScripted()), + shared, + ) + self.module_list = nn.ModuleList([SeluButReluWhenScripted(), + shared, + SeluButReluWhenScripted()]) + + def forward(self, x): + for mod in self.module_list: + x += mod(x) + x += self.sequential(x) + return x + + t = torch.randn(5, 5) + m = M() + eager_out = m(t.clone()) + sm = torch.jit.script(m) + script_out = sm(t.clone()) + self.assertNotEqual(eager_out, script_out) + + def test_prepare_scriptable_cycle(self): + t = torch.randn(5, 5) + c = torch.nn.Module() + p = torch.nn.Module() + c.__dict__["_p"] = p + p.__dict__["_c"] = c + + sm = torch.jit.script(p) + def test_attributes(self): @torch.jit.script class Inner2(object): diff --git a/test/jit/test_torchbind.py b/test/jit/test_torchbind.py index 31eec81d480a..7f43b31fe6ec 100644 --- a/test/jit/test_torchbind.py +++ b/test/jit/test_torchbind.py @@ -62,6 +62,32 @@ def f(): return ss1.pop() + ss2.pop() test_equality(f, lambda x: x) + # test nn module with prepare_scriptable function + class NonJitableClass(object): + def __init__(self, int1, int2): + self.int1 = int1 + self.int2 = int2 + + def return_vals(self): + return self.int1, self.int2 + + class CustomWrapper(torch.nn.Module): + def __init__(self, foo): + super(CustomWrapper, self).__init__() + self.foo = foo + + def forward(self) -> None: + self.foo.increment(1) + return + + def __prepare_scriptable__(self): 
+ int1, int2 = self.foo.return_vals() + foo = torch.classes._TorchScriptTesting._Foo(int1, int2) + return CustomWrapper(foo) + + foo = CustomWrapper(NonJitableClass(1, 2)) + jit_foo = torch.jit.script(foo) + def test_torchbind_take_as_arg(self): global StackString # see [local resolution in python] StackString = torch.classes._TorchScriptTesting._StackString diff --git a/test/mobile/op_deps/simple_ops.cpp b/test/mobile/op_deps/simple_ops.cpp index 3651d1b05353..a76c58838a72 100644 --- a/test/mobile/op_deps/simple_ops.cpp +++ b/test/mobile/op_deps/simple_ops.cpp @@ -80,7 +80,7 @@ namespace { // cares about the name TORCH_LIBRARY(_test, m) { m.def("AA(Tensor self) -> Tensor"); - m.impl("AA", torch::CppFunction::makeUnboxedOnly(AA_op)); + m.impl("AA", torch::CppFunction::makeFromUnboxedFunction(AA_op)); m.def("BB(Tensor self) -> Tensor"); m.impl("BB", TORCH_FN(BB_op)); @@ -97,10 +97,10 @@ TORCH_LIBRARY_FRAGMENT(_test, m) { } TORCH_LIBRARY_IMPL(_test, CPU, m) { - m.impl_UNBOXED("EE", EE_op); + m.impl("EE", EE_op); m.impl("FF", torch::dispatch(DispatchKey::CPU, - torch::CppFunction::makeUnboxedOnly(FF_op)) + torch::CppFunction::makeFromUnboxedFunction(FF_op)) ); m.impl("GG", torch::dispatch(DispatchKey::CPU, diff --git a/test/quantization/test_quantize.py b/test/quantization/test_quantize.py index 067c35bd3c64..c47982f0c0cc 100644 --- a/test/quantization/test_quantize.py +++ b/test/quantization/test_quantize.py @@ -726,6 +726,20 @@ def forward(self, x): ref_res = ref_m(data) self.assertEqual(res, ref_res) + @skipIfNoFBGEMM + def test_convtranspose_per_channel_fails_early(self): + r""" + Verifies that attempting to quantize a ConvTranspose module with per-Channel + weight observers fails in the prepare step, as opposed to the convert step. + """ + m = torch.nn.Sequential(torch.nn.ConvTranspose2d(1, 1, 1)) + m.qconfig = torch.quantization.get_default_qconfig('fbgemm') + with self.assertRaises(AssertionError) as context: + mp = torch.quantization.prepare(m) + self.assertTrue( + str(context.exception) == + 'Per channel weight observer is not supported yet for ConvTranspose{n}d.') + @skipIfNoFBGEMM class TestPostTrainingDynamic(QuantizationTestCase): diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 545e70a2c5e6..7965b3cc88a4 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -573,7 +573,16 @@ def forward(self, x): m = convert_fx(m) m(tensor_input) - def test_standalone_module(self): + def _test_standalone_module( + self, + interface_config, + prepare_count_check, + standalone_prepare_count_check, + convert_count_check, + standalone_convert_count_check): + """ Test standalone module with different quantized input/quantized output + configurations + """ class StandaloneModule(torch.nn.Module): def __init__(self): super().__init__() @@ -613,45 +622,32 @@ def forward(self, x): original_ref_m.conv2.weight = torch.nn.Parameter(original_m.standalone.conv.weight.detach()) original_ref_m.conv2.bias = torch.nn.Parameter(original_m.standalone.conv.bias.detach()) - qconfig_dict = {"": default_qconfig} - config_name = {"standalone_module_name": [("standalone", None, None)]} - config_class = {"standalone_module_class": [(StandaloneModule, None, None)]} - for prepare_config in [config_name, config_class]: + for is_name in [True, False]: + if is_name: + prepare_config = { + "standalone_module_name": [("standalone", None, interface_config)] + } + else: + prepare_config = { + "standalone_module_class": [(StandaloneModule, 
None, interface_config)] + } + original_m_copy = copy.deepcopy(original_m) original_ref_m_copy = copy.deepcopy(original_ref_m) + + qconfig_dict = {"": default_qconfig} # check prepared model m = prepare_fx( original_m_copy, qconfig_dict, prepare_custom_config_dict=prepare_config) # calibration m(data) - # input and output of first conv, observer for standalone module - # will be inserted in the standalone module itself - count_check = { - ns.call_module(torch.quantization.MinMaxObserver): 2 - } - self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) - # for input and output of conv in the standalone module - count_check = { - ns.call_module(torch.quantization.MinMaxObserver): 2 - } - self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) + self.checkGraphModuleNodes(m, expected_node_occurrence=prepare_count_check) + self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=standalone_prepare_count_check) # check converted/quantized model m = convert_fx(m) - count_check = { - ns.call_function(torch.quantize_per_tensor) : 1, - ns.call_module(nnq.Conv2d) : 1, - ns.call_method('dequantize') : 1, - } - self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) - count_check = { - # standalone module will take float as input and output - # so we'll see quantize and dequantize in the modoule - ns.call_function(torch.quantize_per_tensor) : 1, - ns.call_module(nnq.Conv2d): 1, - ns.call_method('dequantize') : 1, - } - self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) + self.checkGraphModuleNodes(m, expected_node_occurrence=convert_count_check) + self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=standalone_convert_count_check) res = m(data) # quantize the reference model @@ -661,6 +657,76 @@ def forward(self, x): ref_res = ref_m(data) self.assertEqual(res, ref_res) + def test_standalone_module_float_interface(self): + float_interface_config = { + "input_quantized_idxs": [], # float input + "output_quantized_idxs": [], # float output + } + interface_config = float_interface_config + # input and output of first conv, observer for standalone module + # will be inserted in the standalone module itself + prepare_count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 2 + } + # for input and output of conv in the standalone module + standalone_prepare_count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 2 + } + convert_count_check = { + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d) : 1, + ns.call_method("dequantize") : 1, + } + standalone_convert_count_check = { + # standalone module will take float as input and output + # so we'll see quantize and dequantize in the module + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d): 1, + ns.call_method("dequantize") : 1, + } + self._test_standalone_module( + interface_config, + prepare_count_check, + standalone_prepare_count_check, + convert_count_check, + standalone_convert_count_check) + + def test_standalone_module_quantized_interface(self): + quantized_interface_config = { + "input_quantized_idxs": [0], # quantized input + "output_quantized_idxs": [0], # quantized output + } + interface_config = quantized_interface_config + # observer for input and output of first conv + prepare_count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 2 + } + # for output of conv in the standalone module + standalone_prepare_count_check = { + 
ns.call_module(torch.quantization.MinMaxObserver): 1 + } + convert_count_check = { + # quantizing input for conv + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d) : 1, + # dequantizing output of standalone module + ns.call_method("dequantize") : 1, + } + standalone_convert_count_check = { + # quantization of input happens in parent module + # quantization of output happens in the quantized conv module + ns.call_function(torch.quantize_per_tensor) : 0, + ns.call_module(nnq.Conv2d): 1, + # dequantization for output happens in parent module + ns.call_method("dequantize") : 0, + } + self._test_standalone_module( + interface_config, + prepare_count_check, + standalone_prepare_count_check, + convert_count_check, + standalone_convert_count_check) + @skipIfNoFBGEMM def test_qconfig_none(self): class M(torch.nn.Module): @@ -1278,6 +1344,21 @@ def test_fp32_input_fp32_output(self): self._test_quantized_inputs_outputs( prepare_custom_config_dict, prepare_count_check, convert_count_check) + @skipIfNoFBGEMM + def test_convtranspose_per_channel_fails_early(self): + r""" + Verifies that attempting to quantize a ConvTranspose module with per-Channel + weight observers fails in the prepare step, as opposed to the convert step. + """ + m = torch.nn.Sequential(torch.nn.ConvTranspose2d(1, 1, 1)) + m.eval() + qconfig_dict = {'': torch.quantization.get_default_qconfig('fbgemm')} + with self.assertRaises(AssertionError) as context: + mp = prepare_fx(m, qconfig_dict) + self.assertTrue( + str(context.exception) == + 'Per channel weight observer is not supported yet for ConvTranspose{n}d.') + @skipIfNoFBGEMM class TestQuantizeFxOps(QuantizationTestCase): """Unit tests for individual ops diff --git a/test/quantization/test_workflow_module.py b/test/quantization/test_workflow_module.py index 22751697cd1d..8a70ae149c29 100644 --- a/test/quantization/test_workflow_module.py +++ b/test/quantization/test_workflow_module.py @@ -10,6 +10,7 @@ PlaceholderObserver, NoopObserver, FakeQuantize, + FixedQParamsFakeQuantize, default_debug_qconfig, default_observer, default_per_channel_weight_observer, @@ -504,6 +505,20 @@ def test_observer_qparams_respects_device_affinity(self): self.assertEqual(x.device, scale.device) self.assertEqual(x.device, zero_point.device) + def test_zero_numel(self): + obs_list = [MinMaxObserver, MovingAverageMinMaxObserver, + PerChannelMinMaxObserver, + MovingAveragePerChannelMinMaxObserver, HistogramObserver, + FakeQuantize, FixedQParamsFakeQuantize] + for obs_cls in obs_list: + if obs_cls is FixedQParamsFakeQuantize: + obs = obs_cls(0.1, 0) + else: + obs = obs_cls() + x = torch.Tensor() + # verify no crash + x = obs(x) + # HistogramObserver that works like it does on master class _ReferenceHistogramObserver(HistogramObserver): diff --git a/test/test_bundled_inputs.py b/test/test_bundled_inputs.py index f57407c9b1d1..e12339f3acea 100644 --- a/test/test_bundled_inputs.py +++ b/test/test_bundled_inputs.py @@ -1,5 +1,7 @@ #!/usr/bin/env python3 import io +from typing import List + import torch import torch.utils.bundled_inputs from torch.testing._internal.common_utils import TestCase, run_tests @@ -27,7 +29,7 @@ def forward(self, arg): sm = torch.jit.script(SingleTensorModel()) original_size = model_size(sm) - get_expr = [] + get_expr : List[str] = [] samples = [ # Tensor with small numel and small storage. 
(torch.tensor([1]),), diff --git a/test/test_dataloader.py b/test/test_dataloader.py index 047297c438b7..c257dd8a2fd7 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -3,6 +3,7 @@ import errno import os import ctypes +import faulthandler import torch import gc import time @@ -34,18 +35,6 @@ else: warnings.warn(err_msg) -try: - import faulthandler - HAS_FAULTHANDLER = True -except ImportError: - HAS_FAULTHANDLER = False - err_msg = ("faulthandler not found. Some data loader tests use it for error " - "reporting (e.g., TestDataLoader.test_proper_exit).") - if IS_PYTORCH_CI: - raise ImportError(err_msg) from None - else: - warnings.warn(err_msg) - # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for # sharding on sandcastle. This line silences flake warnings @@ -86,9 +75,7 @@ JOIN_TIMEOUT = 60.0 # seconds -supported_multiprocessing_contexts = [None] -if torch.multiprocessing._supports_context: - supported_multiprocessing_contexts += list(torch.multiprocessing.get_all_start_methods()) +supported_multiprocessing_contexts = [None] + list(torch.multiprocessing.get_all_start_methods()) @unittest.skipIf( @@ -312,29 +299,25 @@ def test_iterable_dataset_err(self): # takes in dummy var so this can also be used as a `worker_init_fn` def set_faulthander_if_available(_=None): - if HAS_FAULTHANDLER: - faulthandler.enable(sys.__stderr__) - if not IS_WINDOWS: - # windows does not have faulthandler.register - # chain=False prevents the default behavior of killing the process - faulthandler.register(signal.SIGUSR1, file=sys.__stderr__, chain=False) + faulthandler.enable(sys.__stderr__) + if not IS_WINDOWS: + # windows does not have faulthandler.register + # chain=False prevents the default behavior of killing the process + faulthandler.register(signal.SIGUSR1, file=sys.__stderr__, chain=False) set_faulthander_if_available() # Process `pid` must have called `set_faulthander_if_available` def print_traces_of_all_threads(pid): - if HAS_FAULTHANDLER: - if not IS_WINDOWS: - # use the custom signal if available - os.kill(pid, signal.SIGUSR1) - else: - # otherwise we can still use the handler given by faulthandler.enable() - # at the cost of killing the process. - os.kill(pid, signal.SIGSEGV) + if not IS_WINDOWS: + # use the custom signal if available + os.kill(pid, signal.SIGUSR1) else: - # if there is no faulthandler, use SIGINT otherwise and hope for the best - os.kill(pid, signal.SIGINT) + # otherwise we can still use the handler given by faulthandler.enable() + # at the cost of killing the process. 
+ os.kill(pid, signal.SIGSEGV) + # wait in parent process to give subprocess some time to print time.sleep(5) @@ -1037,17 +1020,13 @@ def test_invalid_ctor_args_combinations(self): "batch_size=None option disables auto-batching and is mutually exclusive"): self._get_data_loader(self.dataset, batch_size=None, drop_last=True) - if torch.multiprocessing._supports_context: - valid_ctx = list(torch.multiprocessing.get_all_start_methods())[-1] - with self.assertRaisesRegex(ValueError, r"multi-process loading \(num_workers > 0\), but got"): - self._get_data_loader(self.dataset, num_workers=0, multiprocessing_context=valid_ctx) - with self.assertRaisesRegex(ValueError, "should specify a valid start method in"): - self._get_data_loader(self.dataset, num_workers=1, multiprocessing_context='bad') - with self.assertRaisesRegex(TypeError, "multiprocessing_context option should be a valid context "): - self._get_data_loader(self.dataset, num_workers=1, multiprocessing_context=object()) - else: - with self.assertRaisesRegex(ValueError, "multiprocessing_context relies on Python >= 3.4"): - self._get_data_loader(self.dataset, num_workers=1, multiprocessing_context='fork') + valid_ctx = list(torch.multiprocessing.get_all_start_methods())[-1] + with self.assertRaisesRegex(ValueError, r"multi-process loading \(num_workers > 0\), but got"): + self._get_data_loader(self.dataset, num_workers=0, multiprocessing_context=valid_ctx) + with self.assertRaisesRegex(ValueError, "should specify a valid start method in"): + self._get_data_loader(self.dataset, num_workers=1, multiprocessing_context='bad') + with self.assertRaisesRegex(TypeError, "multiprocessing_context option should be a valid context "): + self._get_data_loader(self.dataset, num_workers=1, multiprocessing_context=object()) # map-style sampler = torch.utils.data.SequentialSampler(self.dataset) @@ -1504,7 +1483,7 @@ def _test_sampler(self, **kwargs): def test_sampler(self): self._test_sampler() self._test_sampler(num_workers=4) - if not NO_MULTIPROCESSING_SPAWN and torch.multiprocessing._supports_context: + if not NO_MULTIPROCESSING_SPAWN: self._test_batch_sampler(num_workers=4, multiprocessing_context='spawn') def _test_batch_sampler(self, **kwargs): @@ -1529,7 +1508,7 @@ def _test_batch_sampler(self, **kwargs): def test_batch_sampler(self): self._test_batch_sampler() self._test_batch_sampler(num_workers=4) - if not NO_MULTIPROCESSING_SPAWN and torch.multiprocessing._supports_context: + if not NO_MULTIPROCESSING_SPAWN: self._test_batch_sampler(num_workers=4, multiprocessing_context='spawn') @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") diff --git a/test/test_dataset.py b/test/test_dataset.py index 2caa1a248435..a72b87cca555 100644 --- a/test/test_dataset.py +++ b/test/test_dataset.py @@ -90,7 +90,7 @@ def _collate_fn(batch): y = next(ds_iter) self.assertEqual(x, torch.tensor(sum(y), dtype=torch.float)) - collate_ds_nolen = CollateIterableDataset(ds_nolen) + collate_ds_nolen = CollateIterableDataset(ds_nolen) # type: ignore with self.assertRaises(NotImplementedError): len(collate_ds_nolen) ds_nolen_iter = iter(ds_nolen) @@ -144,7 +144,7 @@ def test_sampler_dataset(self): arrs = range(10) ds = IterDatasetWithLen(arrs) # Default SequentialSampler - sampled_ds = SamplerIterableDataset(ds) + sampled_ds = SamplerIterableDataset(ds) # type: ignore self.assertEqual(len(sampled_ds), 10) i = 0 for x in sampled_ds: @@ -152,7 +152,7 @@ def test_sampler_dataset(self): i += 1 # RandomSampler - random_sampled_ds = SamplerIterableDataset(ds, sampler=RandomSampler, 
replacement=True) + random_sampled_ds = SamplerIterableDataset(ds, sampler=RandomSampler, replacement=True) # type: ignore # Requires `__len__` to build SamplerDataset ds_nolen = IterDatasetWithoutLen(arrs) diff --git a/test/test_expecttest.py b/test/test_expecttest.py index 652a33c41869..5e2461797705 100644 --- a/test/test_expecttest.py +++ b/test/test_expecttest.py @@ -4,6 +4,7 @@ import string import textwrap import doctest +from typing import Dict, Any import hypothesis from hypothesis.strategies import text, integers, composite, sampled_from, booleans @@ -38,7 +39,7 @@ def test_replace_string_literal_roundtrip(self, t, raw, quote): r3 = {r}{quote}placeholder3{quote} """.format(r='r' if raw else '', quote=quote * 3) new_prog = expecttest.replace_string_literal(textwrap.dedent(prog), 2, t)[0] - ns = {} + ns : Dict[str, Any] = {} exec(new_prog, ns) msg = "program was:\n{}".format(new_prog) self.assertEqual(ns['r'], 'placeholder', msg=msg) # noqa: F821 diff --git a/test/test_fx.py b/test/test_fx.py index 65d5aa3f0101..2511adc52c62 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -861,6 +861,11 @@ def forward(self, x, w): x, w = torch.rand(3, 4), torch.rand(4, 4) self.assertTrue(any(n.target == torch.relu for n in traced.graph.nodes)) + def test_empty_graph_codegen(self): + graph = torch.fx.Graph() + gm = torch.fx.GraphModule(torch.nn.Module(), graph) + self.assertEqual(gm(), None) + def test_sequential(self): m = torch.nn.Sequential(torch.nn.Conv2d(1, 1, 1)) gm = torch.fx.symbolic_trace(m) diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py index 6e9c877b8de6..ac71d6037591 100644 --- a/test/test_fx_experimental.py +++ b/test/test_fx_experimental.py @@ -21,6 +21,7 @@ PartitionMode ) from torch.fx.experimental.fuser import fuse +from torch.fx.experimental import merge_matmul try: from torchvision.models import resnet18 @@ -844,6 +845,128 @@ def forward(self, a): for p_name in para_list: assert p_name in node.attrs_for_lowering + def test_merge_matmuls(self): + """ + A collection of test cases for torch.fx.experimental.merge_matmul, + a graph transformation that merges matrix multiplication operations. + """ + # Utility function for counting matmuls for test assertions. + def _count_matmuls(mod): + gm = torch.fx.symbolic_trace(mod) + + num_matmuls = 0 + for node in gm.graph.nodes: + if node.target == torch.matmul: + num_matmuls += 1 + + return num_matmuls + + # Simple test case in which there are two matmuls of the same size to merge. + class SimpleMergeMatmulModule(torch.nn.Module): + def __init__(self, rhs): + super().__init__() + self.rhs = rhs + + def forward(self, x, y): + a = torch.matmul(x, self.rhs) + b = torch.matmul(y, self.rhs) + return a + b + + # Initialize inputs. + a = torch.randn(3, 3) + b = torch.randn(3, 3) + + # Initialize RHS for matmuls. + rhs = torch.randn(3, 4) + + # Construct SimpleMergeMatmulModule and call merge_matmul on it. + module = SimpleMergeMatmulModule(rhs) + opt_module = merge_matmul.merge_matmul(module) + + # Numerical correctness check. + before = module(a, b) + after = opt_module(a, b) + before.allclose(after) + + # Basic graph structure check; original module should have 2 matmuls + # and optimized module should have 1. + self.assertEqual(_count_matmuls(module), 2) + self.assertEqual(_count_matmuls(opt_module), 1) + + # Test case in which there are multiple matmuls of different sizes to merge. 
+        class FiveMergeMatmulModule(torch.nn.Module):
+            def __init__(self, rhs):
+                super().__init__()
+                self.rhs = rhs
+
+            def forward(self, a, b, c, d, e):
+                s = torch.Tensor((0))
+                matmuls = []
+
+                # For some reason using a list comprehension or for-loop for this
+                # doesn't work.
+                matmuls.append(torch.matmul(a, self.rhs))
+                matmuls.append(torch.matmul(b, self.rhs))
+                matmuls.append(torch.matmul(c, self.rhs))
+                matmuls.append(torch.matmul(d, self.rhs))
+                matmuls.append(torch.matmul(e, self.rhs))
+
+                for m in matmuls:
+                    s += torch.sum(m)
+
+                return s
+
+        # Initialize inputs.
+        inputs = [torch.randn(2 * i + 1, 5) for i in range(5)]
+
+        # Initialize RHS.
+        rhs = torch.randn(5, 4)
+
+        # Construct FiveMergeMatmulModule and call merge_matmul on it.
+        module = FiveMergeMatmulModule(rhs)
+        opt_module = merge_matmul.merge_matmul(module)
+
+        # Numerical correctness check.
+        before = module(*inputs)
+        after = opt_module(*inputs)
+        before.allclose(after)
+
+        # Basic graph structure check; original module should have len(inputs) matmuls
+        # and optimized module should have 1.
+        self.assertEqual(_count_matmuls(module), len(inputs))
+        self.assertEqual(_count_matmuls(opt_module), 1)
+
+        # Simple test case in which two matmuls cannot be merged due to a data dependency between
+        # the LHS operands.
+        class UnmergeableMatmulModule(torch.nn.Module):
+            def __init__(self, rhs):
+                super().__init__()
+                self.rhs = rhs
+
+            def forward(self, x):
+                a = torch.matmul(x, self.rhs)
+                a_abs = torch.abs(a)
+                b = torch.matmul(a_abs.transpose(1, 0), self.rhs)
+                return b
+
+        # Initialize inputs.
+        a = torch.randn(3, 3)
+
+        # Initialize RHS for matmuls.
+        rhs = torch.randn(3, 4)
+
+        # Construct UnmergeableMatmulModule and call merge_matmul on it.
+        module = UnmergeableMatmulModule(rhs)
+        opt_module = merge_matmul.merge_matmul(module)
+
+        # Numerical correctness check.
+        before = module(a)
+        after = opt_module(a)
+        before.allclose(after)
+
+        # Basic graph structure check; the number of matrix multiplications should not have changed.
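        # (Editorial aside, not part of this patch: merge_matmul is assumed to fuse
        # matmuls that share a single RHS roughly as
        #     merged = torch.matmul(torch.cat([x, y], dim=0), rhs)
        #     out_x, out_y = torch.split(merged, [x.shape[0], y.shape[0]], dim=0)
        # Here the second matmul consumes the first one's output, so no such
        # concatenation is possible and both matmuls must remain.)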
+ self.assertEqual(_count_matmuls(module), 2) + self.assertEqual(_count_matmuls(opt_module), 2) if __name__ == "__main__": run_tests() diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index 2143b4e19020..4886abc58758 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -1281,6 +1281,7 @@ def forward(self, x): self.assertEqual(ref, mod.forward(x)) self.assertLastGraphAllFused() + @unittest.skip("Temporarily disabled") def test_masked_fill(self): dtypes = [ torch.int8, diff --git a/test/test_jit_profiling.py b/test/test_jit_profiling.py index dc6bb2fbf878..1cf67f87ded9 100644 --- a/test/test_jit_profiling.py +++ b/test/test_jit_profiling.py @@ -4,7 +4,6 @@ if __name__ == '__main__': run_tests() - if not PY2: - import test_jit_py3 - suite = unittest.findTestCases(test_jit_py3) - unittest.TextTestRunner().run(suite) + import test_jit_py3 + suite = unittest.findTestCases(test_jit_py3) + unittest.TextTestRunner().run(suite) diff --git a/test/test_jit_simple.py b/test/test_jit_simple.py index 23da6602c572..23c7f3b4b6f6 100644 --- a/test/test_jit_simple.py +++ b/test/test_jit_simple.py @@ -4,7 +4,6 @@ if __name__ == '__main__': run_tests() - if not PY2: - import test_jit_py3 - suite = unittest.findTestCases(test_jit_py3) - unittest.TextTestRunner().run(suite) + import test_jit_py3 + suite = unittest.findTestCases(test_jit_py3) + unittest.TextTestRunner().run(suite) diff --git a/test/test_numpy_interop.py b/test/test_numpy_interop.py index 35ac4eb94889..81c385ae90a2 100644 --- a/test/test_numpy_interop.py +++ b/test/test_numpy_interop.py @@ -47,10 +47,8 @@ def get_castable_tensor(shape, dtype): else: # can't directly use min and max, because for int64_t, max - min # is greater than int64_t range and triggers UB. - dtype_info = torch.iinfo(dtype) - low = max(dtype_info.min, int(-1e10)) - high = min(dtype_info.max, int(1e10)) - dtype_info = torch.iinfo(dtype) + low = max(torch.iinfo(dtype).min, int(-1e10)) + high = min(torch.iinfo(dtype).max, int(1e10)) t = torch.empty(shape, dtype=torch.int64).random_(low, high) return t.to(dtype) @@ -272,10 +270,12 @@ def test_numpy_array_interface(self, device): ] for tp, dtype in zip(types, dtypes): if np.dtype(dtype).kind == 'u': - x = torch.Tensor([1, 2, 3, 4]).type(tp) + # .type expects a XxxTensor, which have no type hints on + # purpose, so ignore during mypy type checking + x = torch.Tensor([1, 2, 3, 4]).type(tp) # type: ignore array = np.array([1, 2, 3, 4], dtype=dtype) else: - x = torch.Tensor([1, -2, 3, -4]).type(tp) + x = torch.Tensor([1, -2, 3, -4]).type(tp) # type: ignore array = np.array([1, -2, 3, -4], dtype=dtype) # Test __array__ w/o dtype argument @@ -309,7 +309,7 @@ def test_numpy_array_interface(self, device): float_types = [torch.DoubleTensor, torch.FloatTensor] float_dtypes = [np.float64, np.float32] for tp, dtype in zip(float_types, float_dtypes): - x = torch.Tensor([1, 2, 3, 4]).type(tp) + x = torch.Tensor([1, 2, 3, 4]).type(tp) # type: ignore array = np.array([1, 2, 3, 4], dtype=dtype) for func in ['sin', 'sqrt', 'ceil']: ufunc = getattr(np, func) @@ -321,7 +321,7 @@ def test_numpy_array_interface(self, device): # Test functions with boolean return value for tp, dtype in zip(types, dtypes): - x = torch.Tensor([1, 2, 3, 4]).type(tp) + x = torch.Tensor([1, 2, 3, 4]).type(tp) # type: ignore array = np.array([1, 2, 3, 4], dtype=dtype) geq2_x = np.greater_equal(x, 2) geq2_array = np.greater_equal(array, 2).astype('uint8') @@ -360,7 +360,7 @@ def test_parse_numpy_int(self, device): 
self.assertEqual(torch.ones([2, 2, 2, 2]).mean(scalar), torch.ones([2, 2, 2, 2]).mean(np_val)) # numpy integral type parses like a python int in custom python bindings: - self.assertEqual(torch.Storage(np_val).size(), scalar) + self.assertEqual(torch.Storage(np_val).size(), scalar) # type: ignore tensor = torch.tensor([2], dtype=torch.int) tensor[0] = np_val diff --git a/test/test_shape_ops.py b/test/test_shape_ops.py index 43321508e0e2..f7da08eb24d7 100644 --- a/test/test_shape_ops.py +++ b/test/test_shape_ops.py @@ -378,21 +378,31 @@ def test_flip(self, device): self.assertEqual(size, list(data.flip(ds).size())) # test rectangular case - data = torch.tensor([1, 2, 3, 4, 5, 6]).view(2, 3).to(device) - flip0_result = torch.tensor([[4, 5, 6], [1, 2, 3]]).to(device) - flip1_result = torch.tensor([[3, 2, 1], [6, 5, 4]]).to(device) + data = torch.tensor([1, 2, 3, 4, 5, 6], device=device).view(2, 3) + flip0_result = torch.tensor([[4, 5, 6], [1, 2, 3]], device=device) + flip1_result = torch.tensor([[3, 2, 1], [6, 5, 4]], device=device) self.assertEqual(flip0_result, data.flip(0)) self.assertEqual(flip1_result, data.flip(1)) # test empty tensor, should just return an empty tensor of the same shape - data = torch.tensor([]) + data = torch.tensor((), device=device) self.assertEqual(data, data.flip(0)) # test bool tensor - a = torch.tensor([False, True]) + a = torch.tensor([False, True], device=device) self.assertEqual(a.flip(0), torch.tensor([True, False])) + # case: dims=() + a = torch.randn(3, 2, 1, device=device) + if device == 'cpu': + self.assertEqual(a.flip(dims=()), a) + else: + # Reference: https://github.com/pytorch/pytorch/issues/49982 + with self.assertRaisesRegex(IndexError, + "flip dims size out of range, got flip dims size=0"): + a.flip(dims=()) + def _rand_shape(self, dim, min_size, max_size): shape = [] for i in range(dim): diff --git a/test/test_sparse.py b/test/test_sparse.py index 4e982b8333d9..228c66aa403e 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -356,6 +356,11 @@ def test_to_sparse(self): sp, _, _ = self._gen_sparse(2, 10, [3, 3, 3]) self.assertRaises(RuntimeError, lambda: sp.to_sparse()) + def test_sparse_bool(self): + a = self.value_tensor([True, False]).to(torch.bool) + b = a.to_sparse().to_dense() + self.assertEqual(a, b) + def test_scalar(self): # tensor with value a = self.sparse_tensor(self.index_tensor([]).unsqueeze(1), 12.3, []) diff --git a/test/test_spectral_ops.py b/test/test_spectral_ops.py index 6192d6c4d6b6..085af5294a04 100644 --- a/test/test_spectral_ops.py +++ b/test/test_spectral_ops.py @@ -225,13 +225,13 @@ def test_empty_fft(self, device, dtype): def test_fft_invalid_dtypes(self, device): t = torch.randn(64, device=device, dtype=torch.complex128) - with self.assertRaisesRegex(RuntimeError, "Expected a real input tensor"): + with self.assertRaisesRegex(RuntimeError, "rfft expects a real input tensor"): torch.fft.rfft(t) with self.assertRaisesRegex(RuntimeError, "rfftn expects a real-valued input tensor"): torch.fft.rfftn(t) - with self.assertRaisesRegex(RuntimeError, "Expected a real input tensor"): + with self.assertRaisesRegex(RuntimeError, "ihfft expects a real input tensor"): torch.fft.ihfft(t) @skipCUDAIfRocm @@ -332,6 +332,27 @@ def test_fft_backward(self, device, dtype): args = args[1:] self._fft_grad_check_helper(fname, input, args) + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + def test_fft_invalid_out_types(self, device): + + complex_fft_funcs = [torch.fft.fft, torch.fft.ifft, torch.fft.fftn, torch.fft.ifftn, + 
torch.fft.rfft, torch.fft.rfftn, torch.fft.ihfft] + real_fft_funcs = [torch.fft.irfft, torch.fft.irfftn, torch.fft.hfft] + fft_funcs = complex_fft_funcs + real_fft_funcs + + # Test errors on invalid out dtypes + x = torch.rand(10, device=device, dtype=torch.float32) + for out_dtype, funcs in [(torch.int16, fft_funcs), + (torch.float32, complex_fft_funcs), + (torch.complex64, real_fft_funcs)]: + out = torch.empty((), device=device, dtype=out_dtype) + + for func in funcs: + with self.assertRaisesRegex(RuntimeError, "expects a .* output tensor"): + func(x, out=out) + # nd-fft tests @skipCPUIfNoMkl @@ -463,10 +484,10 @@ def test_fftn_invalid(self, device): torch.fft.rfftn, torch.fft.irfftn) for func in fft_funcs: - with self.assertRaisesRegex(RuntimeError, "FFT dims must be unique"): + with self.assertRaisesRegex(RuntimeError, "dims must be unique"): func(a, dim=(0, 1, 0)) - with self.assertRaisesRegex(RuntimeError, "FFT dims must be unique"): + with self.assertRaisesRegex(RuntimeError, "dims must be unique"): func(a, dim=(2, -1)) with self.assertRaisesRegex(RuntimeError, "dim and shape .* same length"): @@ -578,10 +599,10 @@ def test_fft2_invalid(self, device): torch.fft.rfft2, torch.fft.irfft2) for func in fft_funcs: - with self.assertRaisesRegex(RuntimeError, "FFT dims must be unique"): + with self.assertRaisesRegex(RuntimeError, "dims must be unique"): func(a, dim=(0, 0)) - with self.assertRaisesRegex(RuntimeError, "FFT dims must be unique"): + with self.assertRaisesRegex(RuntimeError, "dims must be unique"): func(a, dim=(2, -1)) with self.assertRaisesRegex(RuntimeError, "dim and shape .* same length"): @@ -623,6 +644,19 @@ def test_fftfreq_numpy(self, device, dtype): actual = torch_fn(*args, device=device, dtype=dtype) self.assertEqual(actual, expected, exact_dtype=False) + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + @dtypes(torch.float, torch.double) + def test_fftfreq_out(self, device, dtype): + for func in (torch.fft.fftfreq, torch.fft.rfftfreq): + expect = func(n=100, d=.5, device=device, dtype=dtype) + actual = torch.empty((), device=device, dtype=dtype) + with self.assertWarnsRegex(UserWarning, "out tensor will be resized"): + func(n=100, d=.5, out=actual) + self.assertEqual(actual, expect) + + @skipCPUIfNoMkl @skipCUDAIfRocm @onlyOnCPUAndCUDA @@ -1066,10 +1100,12 @@ def test_complex_stft_onesided(self, device): with self.assertRaisesRegex(RuntimeError, 'complex'): x.stft(10, pad_mode='constant', onesided=True) + # stft is currently warning that it requires return-complex while an upgrader is written def test_stft_requires_complex(self, device): x = torch.rand(100) - with self.assertRaisesRegex(RuntimeError, 'stft requires the return_complex parameter'): - y = x.stft(10, pad_mode='constant') + y = x.stft(10, pad_mode='constant') + # with self.assertRaisesRegex(RuntimeError, 'stft requires the return_complex parameter'): + # y = x.stft(10, pad_mode='constant') @skipCUDAIfRocm @skipCPUIfNoMkl diff --git a/test/test_torch.py b/test/test_torch.py index 1f85ed2fff54..72fa853e2e7c 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -5689,7 +5689,8 @@ def test_storage_multigpu(self, devices): x = torch.tensor([], device=device) self.assertEqual(x.dtype, x.storage().dtype) - @dtypes(torch.float, torch.double, torch.half) + @dtypesIfCUDA(torch.float, torch.double, torch.half) + @dtypes(torch.float, torch.double) def test_multinomial(self, device, dtype): def make_prob_dist(shape, is_contiguous): if is_contiguous: diff --git a/test/type_hint_tests/opt_size.py 
b/test/type_hint_tests/opt_size.py new file mode 100644 index 000000000000..f24e57e6e56f --- /dev/null +++ b/test/type_hint_tests/opt_size.py @@ -0,0 +1,6 @@ +import torch.nn as nn + +avg_pool1 = nn.AdaptiveAvgPool2d((1, None)) +avg_pool2 = nn.AdaptiveAvgPool2d((None, 1)) +max_pool1 = nn.AdaptiveMaxPool2d((1, None)) +max_pool2 = nn.AdaptiveMaxPool2d((None, 1)) diff --git a/tools/amd_build/build_amd.py b/tools/amd_build/build_amd.py index 026293a9281a..9d4fa54c93b3 100755 --- a/tools/amd_build/build_amd.py +++ b/tools/amd_build/build_amd.py @@ -131,6 +131,20 @@ def is_hip_clang(): sources.write(line) print("%s updated" % gloo_cmake_file) +gloo_cmake_file = "third_party/gloo/cmake/Modules/Findrccl.cmake" +if os.path.exists(gloo_cmake_file): + do_write = False + with open(gloo_cmake_file, "r") as sources: + lines = sources.readlines() + newlines = [line.replace('RCCL_LIBRARY', 'RCCL_LIBRARY_PATH') for line in lines] + if lines == newlines: + print("%s skipped" % gloo_cmake_file) + else: + with open(gloo_cmake_file, "w") as sources: + for line in newlines: + sources.write(line) + print("%s updated" % gloo_cmake_file) + hipify_python.hipify( project_directory=proj_dir, output_directory=out_dir, diff --git a/tools/autograd/gen_autograd.py b/tools/autograd/gen_autograd.py index 88c00e0ba71a..b930aca504df 100644 --- a/tools/autograd/gen_autograd.py +++ b/tools/autograd/gen_autograd.py @@ -23,9 +23,6 @@ import argparse import os -import yaml -import re -from .utils import YamlLoader, op_name_with_overload from tools.codegen.selective_build.selector import SelectiveBuilder # See NOTE [ Autograd View Variables ] in variable.h for details. @@ -89,84 +86,14 @@ 'tensor_split', 'swapdims', 'swapaxes' }) -def format_return_type(returns): - if len(returns) == 0: - return 'void' - elif len(returns) == 1: - return returns[0]['type'] - else: - return_types = [r['type'] for r in returns] - return 'std::tuple<{}>'.format(','.join(return_types)) - - -def get_simple_type(arg): - simple_type = arg['type'] - simple_type = simple_type.replace(' &', '').replace('const ', '') - simple_type = simple_type.replace('Generator *', 'Generator') - - opt_match = re.match(r'c10::optional<(.+)>', simple_type) - if opt_match: - simple_type = '{}?'.format(opt_match.group(1)) - return simple_type - -def has_tensoroptions_argument(declaration): - for argument in declaration['arguments']: - if 'TensorOptions' == argument['dynamic_type']: - return True - return False - - -def load_aten_declarations(path): - with open(path, 'r') as f: - declarations = yaml.load(f, Loader=YamlLoader) - - # enrich declarations with additional information - selected_declarations = [] - for declaration in declarations: - if declaration.get('deprecated'): - continue - - for arg in declaration['arguments']: - arg['simple_type'] = get_simple_type(arg) - for arg in declaration['schema_order_arguments']: - arg['simple_type'] = get_simple_type(arg) - for ret in declaration['returns']: - ret['simple_type'] = get_simple_type(ret) - - declaration['formals'] = [arg['type'] + ' ' + arg['name'] - for arg in declaration['arguments']] - declaration['schema_order_formals'] = [arg['type'] + ' ' + arg['name'] - for arg in declaration['schema_order_arguments']] - declaration['args'] = [arg['name'] for arg in declaration['arguments']] - declaration['schema_order_args'] = [arg['name'] for arg in declaration['schema_order_arguments']] - declaration['api_name'] = declaration['name'] - if declaration.get('overload_name'): - declaration['type_wrapper_name'] = "{}_{}".format( - 
declaration['name'], declaration['overload_name']) - else: - declaration['type_wrapper_name'] = declaration['name'] - declaration['operator_name_with_overload'] = declaration['schema_string'].split('(')[0] - declaration['unqual_operator_name_with_overload'] = declaration['operator_name_with_overload'].split('::')[1] - declaration['return_type'] = format_return_type(declaration['returns']) - - declaration['base_name'] = declaration['name'] - selected_declarations.append(declaration) - - return selected_declarations - - -def gen_autograd(aten_path, native_functions_path, out, autograd_dir, operator_selector: SelectiveBuilder, disable_autograd=False): - full_aten_decls = load_aten_declarations(aten_path) - - def filter_decls(aten_decls, operator_selector): - def is_operator_selected_for_training(decl): - op_name = op_name_with_overload(decl) - return operator_selector.is_operator_selected_for_training(op_name) - - return [decl for decl in aten_decls if is_operator_selected_for_training(decl)] - - aten_decls = filter_decls(full_aten_decls, operator_selector) - +def gen_autograd( + aten_path: str, + native_functions_path: str, + out: str, + autograd_dir: str, + operator_selector: SelectiveBuilder, + disable_autograd: bool = False, +) -> None: # Parse and load derivatives.yaml from .load_derivatives import load_derivatives differentiability_infos = load_derivatives( @@ -175,13 +102,13 @@ def is_operator_selected_for_training(decl): template_path = os.path.join(autograd_dir, 'templates') # Generate VariableType.h/cpp + from .gen_trace_type import gen_trace_type + from .gen_variable_type import gen_variable_type if not disable_autograd: - from .gen_variable_type import gen_variable_type - gen_variable_type(out, aten_decls, differentiability_infos, template_path) + gen_variable_type(out, native_functions_path, differentiability_infos, template_path, operator_selector) - from . 
import gen_trace_type # operator filter not applied as tracing sources are excluded in selective build - gen_trace_type.gen_trace_type(out, native_functions_path, template_path) + gen_trace_type(out, native_functions_path, template_path) # Generate Functions.h/cpp from .gen_autograd_functions import gen_autograd_functions_lib @@ -193,7 +120,12 @@ def is_operator_selected_for_training(decl): gen_variable_factories(out, native_functions_path, template_path) -def gen_autograd_python(aten_path, native_functions_path, out, autograd_dir): +def gen_autograd_python( + aten_path: str, + native_functions_path: str, + out: str, + autograd_dir: str, +) -> None: from .load_derivatives import load_derivatives differentiability_infos = load_derivatives( os.path.join(autograd_dir, 'derivatives.yaml'), native_functions_path) @@ -212,7 +144,7 @@ def gen_autograd_python(aten_path, native_functions_path, out, autograd_dir): out, native_functions_path, deprecated_path, template_path) -def main(): +def main() -> None: parser = argparse.ArgumentParser( description='Generate autograd C++ files script') parser.add_argument('declarations', metavar='DECL', diff --git a/tools/autograd/gen_trace_type.py b/tools/autograd/gen_trace_type.py index 31eb8aacf296..d8e68606e6ba 100644 --- a/tools/autograd/gen_trace_type.py +++ b/tools/autograd/gen_trace_type.py @@ -117,13 +117,7 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen else: return [ADD_TRACE_INPUT.substitute(name=name, input=name)] - args: List[Union[Argument, TensorOptionsArguments]] = [] - if f.use_c10_dispatcher.dispatcher_uses_new_style(): - args = list(f.func.schema_order_arguments()) - else: - sig_group = CppSignatureGroup.from_native_function(f, method=False) - args = [cpp_args.argument for cpp_args in sig_group.signature.arguments() - if not isinstance(cpp_args.argument, SelfArgument)] + args: List[Union[Argument, TensorOptionsArguments]] = list(f.func.schema_order_arguments()) if f.func.is_out_fn(): # *_out functions take the result as a separate argument, but we don't want to @@ -131,12 +125,7 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen # So first, we need to remove the out argument from the list of arguments to trace. # TODO: byte-for-byte compatible with old codegen behavior - it's incorrect to assume # there is only one output argument. - if f.use_c10_dispatcher.dispatcher_uses_new_style(): - # for c10-full ops, the out argument is in the end - args = args[:-1] - else: - # for legacy ops, the out argument is in the beginning. 
- args = args[1:] + args = args[:-1] trace_inputs = itertools.chain.from_iterable(dispatch_trace_input(arg) for arg in args) @@ -374,14 +363,10 @@ def method_definition(f: NativeFunction) -> Optional[str]: if cpp.name(f.func) in MANUAL_TRACER: return None - if f.use_c10_dispatcher.dispatcher_uses_new_style(): - formals = ', '.join( - f'{cpp.argument_type(a, binds="__placeholder__").cpp_type()} {a.name}' - for a in f.func.schema_order_arguments() - ) - else: - sig_group = CppSignatureGroup.from_native_function(f, method=False) - formals = ', '.join(f'{a.type} {a.name}' for a in sig_group.signature.arguments()) + formals = ', '.join( + f'{cpp.argument_type(a, binds="__placeholder__").cpp_type()} {a.name}' + for a in f.func.schema_order_arguments() + ) return METHOD_DEFINITION.substitute( return_type=cpp.returns_type(f.func.returns), @@ -396,33 +381,22 @@ def method_definition(f: NativeFunction) -> Optional[str]: ); """) -UNBOXEDONLY_WRAPPER_REGISTRATION = CodeTemplate("""\ -m.impl_UNBOXED("${name}", &${class_type}::${type_wrapper_name}); -""") - @with_native_function def method_registration(f: NativeFunction) -> Optional[str]: if cpp.name(f.func) in MANUAL_TRACER: return None - if f.use_c10_dispatcher.dispatcher_uses_new_style(): - return WRAPPER_REGISTRATION.substitute( - name=f.func.name, - type_wrapper_name=type_wrapper_name(f), - class_type='TraceType', - ) - else: - return UNBOXEDONLY_WRAPPER_REGISTRATION.substitute( - name=f.func.name, - type_wrapper_name=type_wrapper_name(f), - class_type='TraceType', - ) + return WRAPPER_REGISTRATION.substitute( + name=f.func.name, + type_wrapper_name=type_wrapper_name(f), + class_type='TraceType', + ) def gen_trace_type_shard( fm: FileManager, native_functions: Sequence[NativeFunction], suffix: str ) -> None: fm.write_with_template('TraceType%s.cpp' % suffix, 'TraceType.cpp', lambda: { - 'generated_comment': f'@generated from {fm.template_dir}/TraceType.cpp', + 'generated_comment': '@' + f'generated from {fm.template_dir}/TraceType.cpp', 'trace_method_definitions': list(mapMaybe(method_definition, native_functions)), 'trace_wrapper_registrations': list(mapMaybe(method_registration, native_functions)), }) diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index f0eb5d6b7ab1..e4337e9de855 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -22,20 +22,24 @@ # which will in turn dispatch back to VariableType for its # differentiable subcomponents. 
# +from dataclasses import dataclass -from .utils import CodeTemplate, nested_dict, write, make_out_api_name_faithful from .gen_autograd import VIEW_FUNCTIONS, VIEW_FUNCTIONS_WITH_METADATA_CHANGE, \ MULTI_OUTPUT_SAFE_FUNCTIONS, RETURNS_VIEWS_OF_INPUT from .gen_autograd_functions import uses_single_grad -from .gen_trace_type import MANUAL_BACKEND, MANUAL_AUTOGRAD_AND_TRACER, MANUAL_AUTOGRAD +from .gen_trace_type import ( + MANUAL_BACKEND, MANUAL_AUTOGRAD_AND_TRACER, MANUAL_AUTOGRAD, + declare_returned_variables, tie_return_values, get_return_value, type_wrapper_name, +) from tools.codegen.api.types import * from tools.codegen.api.autograd import * import tools.codegen.api.cpp as cpp -import tools.codegen.api.python as python -from tools.codegen.gen import with_native_function +from tools.codegen.code_template import CodeTemplate +from tools.codegen.gen import with_native_function, parse_native_yaml, FileManager, mapMaybe from tools.codegen.model import * -from typing import Dict, Optional, List, Sequence, Any, Callable +from tools.codegen.selective_build.selector import SelectiveBuilder +from typing import Callable, List, Optional, Sequence, Tuple, Union # We don't set or modify grad_fn on these methods. Generally, they return # tensors that have requires_grad=False. In-place functions listed here will @@ -187,19 +191,6 @@ } """) -# NOTE[UnboxedOnly] Many of our codegen templates currently exist twice, once -# in an _UNBOXEDONLY_ variant and once without _UNBOXEDONLY_. This is because -# ops that are `use_c10_dispatcher: full` need different c++ code than ops -# that aren't `use_c10_dispatcher: full` yet. The _UNBOXEDONLY_ variants -# are for ops that aren't `use_c10_dispatcher: full` yet and those code templates -# can be deleted once all ops are `use_c10_dispatcher: full`. -# If you update one of the templates, you likely also have to update the other. - -# See NOTE[UnboxedOnly] -UNBOXEDONLY_WRAPPER_REGISTRATION = CodeTemplate("""\ -m.impl_UNBOXED("${unqual_operator_name_with_overload}", &${class_type}::${type_wrapper_name}); -""") - WRAPPER_REGISTRATION = CodeTemplate("""\ m.impl("${unqual_operator_name_with_overload}", TORCH_FN(${class_type}::${type_wrapper_name}) @@ -209,9 +200,6 @@ UNPACK_TENSOR = CodeTemplate("""\ auto${ref} ${arg_name}_ = unpack${suffix}(${arg_name}, "${arg_name}", ${arg_pos});""") -LEGACY_WRAP_OPTIONS = CodeTemplate("""\ -auto ${arg_name}_ = TensorOptions(${arg_name});""") - DECLARE_GRAD_FN = CodeTemplate("""\ std::shared_ptr<${op}> grad_fn; """) @@ -304,49 +292,18 @@ #endif """) -# Methods shared by TraceType and VariableType to handle return variable declaration, tie and tuple. 
-def format_return_variables(declaration): - name = declaration['name'] - arguments = declaration['arguments'] - inplace = declaration['inplace'] - is_out_fn = name.endswith('_out') - modifies_arguments = inplace or is_out_fn - - def declare_returned_variables(): - if modifies_arguments: - return '' - if len(declaration['returns']) == 1: - return '' - # TODO: this will be ugly - names = [ret['type'] + ' ' + ret['name'] + ';' for ret in declaration['returns']] - return '\n'.join(names) - - def tie_return_values(): - if len(declaration['returns']) == 1: - return 'auto {}'.format(declaration['returns'][0]['name']) - names = [ret['name'] for ret in declaration['returns']] - return 'std::tie({})'.format(', '.join(names)) - - def get_return_value(): - if inplace: - return 'self' - if is_out_fn: - return_names = [arg['name'] for arg in arguments - if arg.get('output', False)] - if len(return_names) == 1: - return return_names[0] - return 'std::forward_as_tuple({})'.format(', '.join(return_names)) - - returns = declaration['returns'] - if len(returns) == 1: - return returns[0]['name'] - moved = ['std::move({})'.format(r['name']) for r in returns] - return 'std::make_tuple({})'.format(', '.join(moved)) - - return (declare_returned_variables(), tie_return_values(), get_return_value()) +@dataclass(frozen=True) +class NativeFunctionWithDifferentiabilityInfo: + func: NativeFunction + info: Optional[DifferentiabilityInfo] - -def gen_variable_type(out, aten_declarations, differentiability_infos, template_path): +def gen_variable_type( + out: str, + native_yaml_path: str, + differentiability_infos: Sequence[DifferentiabilityInfo], + template_path: str, + operator_selector: SelectiveBuilder, +) -> None: """VariableType.h and VariableType.cpp body @@ -354,154 +311,190 @@ def gen_variable_type(out, aten_declarations, differentiability_infos, template_ implementation of each function dispatches to the base tensor type to compute the output. The grad_fn is attached to differentiable functions. """ + fns = list(sorted(filter( + operator_selector.is_native_function_selected_for_training, + parse_native_yaml(native_yaml_path)), key=lambda f: cpp.name(f.func))) + fns_with_infos = match_differentiability_info(fns, differentiability_infos) - aten_declarations = list(sorted(aten_declarations, key=lambda decl: decl['name'])) - match_declarations_with_differentiability_info(aten_declarations, differentiability_infos) - - gen_variable_type_shard(out, aten_declarations, template_path, None, True) + fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False) + gen_variable_type_shard(fm, fns_with_infos, 'VariableType.h', 'VariableType.h') # NOTE: see Note [Sharded File] at the top of the VariableType.cpp # template regarding sharding of the generated files. 
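    # Editorial illustration (not part of this patch): the shard assignment below is a
    # plain character-sum hash over the op name, e.g. sum(ord(c) for c in "add") == 297
    # and 297 % 5 == 2, so "add" is always emitted into VariableType_2.cpp.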
num_shards = 5 - shards = [[] for _ in range(num_shards)] + shards: List[List[NativeFunctionWithDifferentiabilityInfo]] = [[] for _ in range(num_shards)] # functions are assigned arbitrarily but stably to a file based on hash - for decl in aten_declarations: - x = sum(ord(c) for c in decl['name']) % num_shards - shards[x].append(decl) + for fn in fns_with_infos: + x = sum(ord(c) for c in cpp.name(fn.func.func)) % num_shards + shards[x].append(fn) for i, shard in enumerate(shards): - gen_variable_type_shard(out, shard, template_path, '_%d' % i, False) - gen_variable_type_shard(out, aten_declarations, template_path, 'Everything', False) - + gen_variable_type_shard(fm, shard, 'VariableType.cpp', f'VariableType_{i}.cpp') -def gen_variable_type_shard(out, aten_declarations, template_path, suffix, header): - VARIABLE_TYPE_H = CodeTemplate.from_file(template_path + '/VariableType.h') - VARIABLE_TYPE_CPP = CodeTemplate.from_file(template_path + '/VariableType.cpp') + gen_variable_type_shard(fm, fns_with_infos, 'VariableType.cpp', 'VariableTypeEverything.cpp') - type_declarations = [] - type_definitions = [] - wrapper_registrations = [] - - for declaration in aten_declarations: - if declaration['use_c10_dispatcher'] in ['full', 'hacky_wrapper_for_legacy_signatures']: - formals = declaration['schema_order_formals'] - else: - assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - formals = declaration['formals'] - type_declarations.append(METHOD_DECLARATION.substitute(declaration, formals=formals)) - strategy = dispatch_strategy(declaration) - if declaration['name'] not in MANUAL_AUTOGRAD and strategy == 'use_derived': - body = emit_body(declaration) +@with_native_function +def gen_formals(f: NativeFunction) -> str: + return ', '.join( + f'{cpp.argument_type(a, binds="__placeholder__").cpp_type()} {a.name}' + for a in f.func.schema_order_arguments() + ) +@with_native_function +def gen_wrapper_registration(f: NativeFunction) -> str: + return WRAPPER_REGISTRATION.substitute( + unqual_operator_name_with_overload=f.func.name, + type_wrapper_name=type_wrapper_name(f), + class_type='VariableType', + ) + +def gen_variable_type_shard( + fm: FileManager, + fns_with_infos: List[NativeFunctionWithDifferentiabilityInfo], + template_name: str, + output_name: str, +) -> None: + type_declarations: List[str] = [] + type_definitions: List[str] = [] + wrapper_registrations: List[str] = [] + + for fn in fns_with_infos: + f = fn.func + name = cpp.name(f.func) + formals = gen_formals(f) + + type_declarations.append(METHOD_DECLARATION.substitute( + return_type=cpp.returns_type(f.func.returns), + type_wrapper_name=type_wrapper_name(f), + formals=formals, + )) + + if name not in MANUAL_AUTOGRAD and dispatch_strategy(fn) == 'use_derived': type_definitions.append(METHOD_DEFINITION.substitute( - declaration, type_definition_body=body, formals=formals)) - if declaration['use_c10_dispatcher'] in ['full', 'hacky_wrapper_for_legacy_signatures']: - wrapper_registrations.append(WRAPPER_REGISTRATION.substitute( - declaration, class_type='VariableType')) - else: - assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - wrapper_registrations.append(UNBOXEDONLY_WRAPPER_REGISTRATION.substitute( - declaration, class_type='VariableType')) + return_type=cpp.returns_type(f.func.returns), + type_wrapper_name=type_wrapper_name(f), + type_definition_body=emit_body(fn), + formals=formals, + )) + wrapper_registrations.append(gen_wrapper_registration(f)) # See Note [Manual Backend kernels] - 
assert (declaration['name'] in MANUAL_BACKEND) == declaration['manual_kernel_registration'] + assert (name in MANUAL_BACKEND) == f.manual_kernel_registration # If you want to register a kernel to Autograd, you must make the op abstract. # In other words, this op must have dispatch section in native_functions.yaml. - if declaration['name'] in MANUAL_AUTOGRAD_AND_TRACER or declaration['derivative']: - msg = (f'There\'s a formula for {declaration["name"]}(or its functional variant) in derivatives.yaml. ' + if name in MANUAL_AUTOGRAD_AND_TRACER or (fn.info and fn.info.has_derivatives): + msg = (f'There\'s a formula for {name}(or its functional variant) in derivatives.yaml. ' f'It\'s required to add a dispatch section for it with explicit supported backends e.g CPU/CUDA ' f'or DefaultBackend in native_functions.yaml. Please see ' f'https://github.com/pytorch/pytorch/tree/master/aten/src/ATen/native#choosing-the-right-dispatch-keyword ' f'for instructions to choose the right dispatch keyword.') - assert declaration['abstract'], msg + assert f.is_abstract, msg - env = { + fm.write_with_template(output_name, template_name, lambda: { + 'generated_comment': '@' + f'generated from {fm.template_dir}/{template_name}', 'type_derived_method_declarations': type_declarations, 'type_derived_method_definitions': type_definitions, 'wrapper_registrations': wrapper_registrations, - } - if header: - write(out, 'VariableType.h', VARIABLE_TYPE_H, env) - else: - write(out, 'VariableType%s.cpp' % suffix, VARIABLE_TYPE_CPP, env) - - -def emit_body(declaration): - assert dispatch_strategy(declaration) == 'use_derived' - - arguments = declaration['arguments'] - returns = declaration['returns'] - func = declaration['derivative'] - name = declaration['name'] - inplace = declaration['inplace'] - is_out_fn = name.endswith('_out') - modifies_arguments = inplace or is_out_fn - returns_void = len(returns) == 0 - - base_name = name[:-1] if inplace else name[:-4] if is_out_fn else name + }) + +def emit_body(fn: NativeFunctionWithDifferentiabilityInfo) -> List[str]: + assert dispatch_strategy(fn) == 'use_derived' + f = fn.func + info = fn.info + + name = cpp.name(f.func) + inplace = f.func.kind() == SchemaKind.inplace + is_out_fn = f.func.kind() == SchemaKind.out + returns_void = len(f.func.returns) == 0 + base_name = f.func.name.name.base # TODO: should be str(f.func.name.name)? view_info = VIEW_FUNCTIONS.get(base_name, None) if view_info is None and base_name in RETURNS_VIEWS_OF_INPUT: view_info = "self" - def is_differentiable(arg): - if 'TensorOptions' in arg['type']: - return False - if 'Tensor' not in arg['type']: - return False - if arg['name'] in declaration.get('non_differentiable_arg_names', []): - return False - return True - - def find_args_with_derivatives(differentiable_inputs): + def is_differentiable(name: str, type: Type) -> bool: + return type.is_tensor_like() and (info is None or name not in info.non_differentiable_arg_names) + + def gen_differentiable_input( + arg: Union[Argument, SelfArgument, TensorOptionsArguments] + ) -> Optional[DifferentiableInput]: + if isinstance(arg, TensorOptionsArguments): + return None + a: Argument = arg.argument if isinstance(arg, SelfArgument) else arg + + # TODO: `cpp_type` is only to keep it byte-for-byte compatible with the old codegen, should remove. + # NB: This is not a clone of cpp.argument() - TensorOptionsArguments / faithful / binds are + # not handled properly as they are irrelevant for this codegen. 
+ cpp_type = cpp.argument_type(a, binds=a.name).cpp_type() + + if not is_differentiable(a.name, a.type): + return None + return DifferentiableInput( + name=a.name, + type=a.type, + cpp_type=cpp_type, + ) + + @with_native_function + def gen_differentiable_inputs(f: NativeFunction) -> List[DifferentiableInput]: + return list(mapMaybe(gen_differentiable_input, f.func.arguments.non_out)) + + def find_args_with_derivatives(differentiable_inputs: List[DifferentiableInput]) -> List[DifferentiableInput]: """Find arguments that have derivative definitions""" - if func is None: + if info is None or not info.has_derivatives: return differentiable_inputs - names = set(name for d in func.derivatives for name in d.var_names) - differentiable = [arg for arg in differentiable_inputs if arg['name'] in names] + names = set(name for d in info.derivatives for name in d.var_names) + differentiable = [arg for arg in differentiable_inputs if arg.name in names] if len(differentiable) != len(names): - missing = names - set(arg['name'] for arg in differentiable) - raise RuntimeError(f'Missing arguments for derivatives: {missing} in {func.name}') + missing = names - set(arg.name for arg in differentiable) + raise RuntimeError(f'Missing arguments for derivatives: {missing} in {info.name}') return differentiable - inputs = [arg for arg in arguments if not arg.get('output', False)] - differentiable_inputs = list(filter(is_differentiable, inputs)) + def gen_differentiable_outputs(f: NativeFunction) -> List[DifferentiableOutput]: + outputs: List[DifferentiableOutput] = [ + DifferentiableOutput(name=name, type=ret.type, cpp_type=cpp.return_type(ret)) + for name, ret in zip(cpp.return_names(f), f.func.returns)] + + output_differentiability = info.output_differentiability if info else None + if output_differentiability is not None: + differentiable_outputs: List[DifferentiableOutput] = [] + if False in output_differentiability and f.func.kind() == SchemaKind.inplace: + raise RuntimeError("output_differentiability=False for inplace operation (version_counter won't get updated)") + for differentiable, output in zip(output_differentiability, outputs): + if differentiable: + differentiable_outputs.append(output) + return differentiable_outputs + + candidate_differentiable_outputs = list(filter(lambda r: is_differentiable(r.name, r.type), outputs)) + + if uses_single_grad(info): + return candidate_differentiable_outputs[:1] + else: + return candidate_differentiable_outputs + + differentiable_inputs = gen_differentiable_inputs(f) args_with_derivatives = find_args_with_derivatives(differentiable_inputs) - non_differentiable_arg_names = declaration.get('non_differentiable_arg_names', []) - candidate_differentiable_outputs = list(filter(is_differentiable, returns)) - - if declaration['output_differentiability'] is not None: - differentiable_outputs = [] - output_differentiability = declaration['output_differentiability'] - if False in output_differentiability and inplace: - raise RuntimeError("output_differentiability=False for inplace operation (version_counter won't get updated)") - for differentiable, output in zip(output_differentiability, returns): - if differentiable: - differentiable_outputs.append(output) - elif uses_single_grad(func): - differentiable_outputs = candidate_differentiable_outputs[:1] - else: - differentiable_outputs = candidate_differentiable_outputs + differentiable_outputs = gen_differentiable_outputs(f) requires_derivative = ( base_name not in DONT_REQUIRE_DERIVATIVE and name not in DONT_REQUIRE_DERIVATIVE 
and len(differentiable_inputs) > 0 and len(differentiable_outputs) > 0) - if func is not None and not requires_derivative: - raise RuntimeError('ERROR: derivative ignored for {} -- specified an autograd function without derivative' - .format(name)) + if info is not None and info.has_derivatives and not requires_derivative: + raise RuntimeError(f'ERROR: derivative ignored for {name} -- specified an autograd function without derivative') - def emit_save_inputs(): - setup = [] - if func is None: + def emit_save_inputs() -> List[str]: + setup: List[str] = [] + if info is None or not info.has_derivatives: return setup - has_tensorlist_arg = \ - any(arg.type in ['TensorList', 'const c10::List> &'] for arg in func.args_with_derivatives) + has_tensorlist_arg = any(is_tensor_list_type(arg.type) for arg in args_with_derivatives) # We don't want to save tensors if we know that they will never be used # when computing the derivative, so we add guards to those statements def guard_for(arg: SavedAttribute) -> Optional[str]: + assert info is not None + # It's hard to determine the edge offset if we have TensorLists if has_tensorlist_arg: return None @@ -512,12 +505,12 @@ def guard_for(arg: SavedAttribute) -> Optional[str]: # require_grad if the backward function even gets executed. I don't # have any good ideas for detecting those cases, so I simply disabled the # checks. - if 'backward' in func.name: + if 'backward' in info.name: return None # If there's a single derivative we could compute, we already have # a requires_grad check that is sufficient - if len(func.args_with_derivatives) <= 1: + if len(args_with_derivatives) <= 1: return None # We really only care about trimming down the amount of tensors we save @@ -526,7 +519,7 @@ def guard_for(arg: SavedAttribute) -> Optional[str]: # We want to emit simple guards, so we only allow that if checking one # input is enough to determine whether we need that value - used_in = [d for d in func.derivatives if arg in d.saved_inputs] + used_in = [d for d in info.derivatives if arg in d.saved_inputs] assert len(used_in) > 0 if len(used_in) != 1: return None @@ -536,75 +529,76 @@ def guard_for(arg: SavedAttribute) -> Optional[str]: derivative_var_name = derivative.var_names[0] # Figure out the offset of the edge that uses this variable - for edge_off, arg in enumerate(func.args_with_derivatives): - if arg.name == derivative_var_name: + for edge_off, a in enumerate(args_with_derivatives): + if a.name == derivative_var_name: break else: raise AssertionError() return f'grad_fn->should_compute_output({edge_off})' - setup.extend(save_variables(func.all_saved_inputs, False, guard_for)) - for arg in func.args_with_derivatives: - if arg.type in ['TensorList', 'const c10::List> &']: + setup.extend(save_variables(info.all_saved_inputs, False, guard_for)) + for arg in args_with_derivatives: + if is_tensor_list_type(arg.type): setup.append(f'grad_fn->{arg.name}_size_ = {arg.name}.size();') return setup - def setup_derivative(differentiable_inputs): - env = {} - env['args_with_derivatives'] = [arg['name'] for arg in args_with_derivatives] - env['op'] = func.op if func is not None else 'NotImplemented' - env['op_ctor'] = '' if func is not None else '"{}"'.format(declaration['api_name']) - + def setup_derivative(differentiable_inputs: List[DifferentiableInput]) -> List[str]: + body: List[str] = [] if is_out_fn: # For out functions, ensure that no input or output requires grad - body = [] body.append(DECLARE_GRAD_FN.substitute(op='Node')) 
body.append(SETUP_NONE_REQUIRES_GRAD.substitute( base_name=base_name, - args_to_check=[arg['name'] for arg in differentiable_inputs])) + args_to_check=[arg.name for arg in differentiable_inputs])) body.append(SETUP_NONE_REQUIRES_GRAD.substitute( base_name=base_name, - args_to_check=[arg['name'] for arg in differentiable_outputs])) + args_to_check=[arg.name for arg in differentiable_outputs])) return body + op = info.op if info is not None and info.has_derivatives else 'NotImplemented' setup = [] - setup.extend(ASSIGN_GRAD_FN.substitute(env).split('\n')) + setup.extend(ASSIGN_GRAD_FN.substitute( + op=op, + op_ctor='' if info is not None and info.has_derivatives else f'"{cpp.name(f.func)}"', + args_with_derivatives=[arg.name for arg in args_with_derivatives], + ).split('\n')) setup.extend(emit_save_inputs()) - body = [] body.extend(emit_check_no_requires_grad(differentiable_inputs, args_with_derivatives)) - body.append(DECLARE_GRAD_FN.substitute(env)) + body.append(DECLARE_GRAD_FN.substitute(op=op)) body.append(SETUP_DERIVATIVE.substitute(setup=setup)) return body - def emit_check_if_in_complex_autograd_allowlist(): - body = [] + def emit_check_if_in_complex_autograd_allowlist() -> List[str]: + body: List[str] = [] if base_name in GRADIENT_IMPLEMENTED_FOR_COMPLEX: return body for arg in differentiable_outputs: - name = arg['name'] - if arg['type'] in ['Tensor', 'TensorList', 'const c10::List> &']: - body.append('throw_error_for_complex_autograd({}, "{}");'.format(name, base_name)) + name = arg.name + # TODO: should be `arg.type.is_tensor_like()`? + if arg.cpp_type in ['Tensor', 'TensorList', 'const c10::List> &']: + body.append(f'throw_error_for_complex_autograd({name}, "{base_name}");') return body - def emit_check_no_requires_grad(tensor_args, args_with_derivatives): + def emit_check_no_requires_grad( + tensor_args: List[DifferentiableInput], + args_with_derivatives: List[DifferentiableInput], + ) -> List[str]: """Checks that arguments without derivatives don't require grad""" - body = [] + body: List[str] = [] for arg in tensor_args: if arg in args_with_derivatives: continue - name = arg['name'] - if name in non_differentiable_arg_names: + name = arg.name + if info and name in info.non_differentiable_arg_names: continue if name == 'output': # Double-backwards definitions sometimes take in 'input' and # 'output', but only define the derivative for input. 
continue - if arg['dynamic_type'] in {'IndexTensor', 'ByteTensor', 'BoolTensor'}: - continue - body.append('check_no_requires_grad({}, "{}");'.format(name, name)) + body.append(f'check_no_requires_grad({name}, "{name}");') return body def save_variables( @@ -644,42 +638,40 @@ def save_variables( stmts.append('}') return stmts - def emit_dispatch_call(api_name, input_base, unpacked_args): + def emit_dispatch_call(f: NativeFunction, input_base: str, unpacked_args: Sequence[str]) -> str: """ Dispatch call via function in a namespace or method on Tensor.""" - if 'namespace' in declaration['method_of']: - if declaration['use_c10_dispatcher'] in ['hacky_wrapper_for_legacy_signatures', 'full']: - dispatcher_api_name = make_out_api_name_faithful(api_name) - else: - assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - dispatcher_api_name = api_name + if Variant.function in f.variants: call = CALL_DISPATCH_VIA_NAMESPACE.substitute( - api_name=dispatcher_api_name, + api_name=cpp.name( + f.func, + faithful_name_for_out_overloads=True, + ), unpacked_args=unpacked_args) else: call = CALL_DISPATCH_VIA_METHOD.substitute( - api_name=api_name, + api_name=cpp.name(f.func), var=input_base, unpacked_method_args=unpacked_args[1:]) return call - def emit_view_lambda(): + def emit_view_lambda(unpacked_bindings: List[Binding]) -> str: """ Generate an additional lambda function to recover views in backward when as_strided is not supported. See Note [View + Inplace update for base tensor] and [View + Inplace update for view tensor] for more details.""" input_base = 'input_base' replay_view_func = '' - updated_unpacked_args = [] - combined = nested_dict(env, declaration) - known_view_arg_simple_types = ['int64_t', 'int64_t?', 'bool', 'IntArrayRef'] - for arg in combined['unpacked_args']: + updated_unpacked_args: List[str] = [] + known_view_arg_simple_types: List[str] = ['int64_t', 'c10::optional', 'bool', 'IntArrayRef'] + for unpacked_binding in unpacked_bindings: + arg, arg_type = unpacked_binding.name, unpacked_binding.type if arg == 'self_': updated_unpacked_args.append(input_base) continue - arg_type = combined['unpacked_args_simple_type'][arg] if arg_type not in known_view_arg_simple_types: - raise TypeError('You are adding an {} {} argument to op {} in addition to known types: {}. ' - 'Please update the list or materialize it so that it can be closed over by value, ' - 'also add a test in pytorch/xla/test/test_operations.py where this code is exercised.' - .format(arg_type, arg, declaration['name'], ', '.join(known_view_arg_simple_types))) + known_types_str = ', '.join(known_view_arg_simple_types) + raise TypeError(f'You are adding an {arg_type} {arg} argument to op {cpp.name(f.func)} in addition to known types: ' + f'{known_types_str}. Please update the list or materialize it so that it can be closed ' + 'over by value, also add a test in pytorch/xla/test/test_operations.py where this code ' + 'is exercised.') if arg_type == 'IntArrayRef': # It's not safe to close over IntArrayRef by value, since this is a @@ -687,7 +679,7 @@ def emit_view_lambda(): arg_vec = arg + '_vec' replay_view_func += ARRAYREF_TO_VEC.substitute(arg=arg, vec=arg_vec) updated_unpacked_args.append(arg_vec) - elif arg_type == 'int64_t?': + elif arg_type == 'c10::optional': # Materialize int64_t? 
to int64_t arg_value = arg + '_val' replay_view_func += OPTIONAL_TO_VAL.substitute(arg=arg, val=arg_value, default='0') @@ -695,7 +687,7 @@ def emit_view_lambda(): else: updated_unpacked_args.append(arg) - replay_view_call = emit_dispatch_call(combined['api_name'], input_base, updated_unpacked_args) + replay_view_call = emit_dispatch_call(f, input_base, updated_unpacked_args) replay_view_func += REPLAY_VIEW_LAMBDA_FUNC.substitute( input_base=input_base, replay_view_call=replay_view_call) @@ -706,17 +698,17 @@ def emit_view_lambda(): is_view_with_metadata_change=is_view_with_metadata_change, replay_view_func=replay_view_func) - def wrap_output(return_values, var): + def wrap_output(f: NativeFunction, unpacked_bindings: List[Binding], var: str) -> str: call = '' - rhs_value = None - if 'Tensor' not in declaration['return_type']: + rhs_value: Optional[str] = None + if not any(r.type.is_tensor_like() for r in f.func.returns): rhs_value = var elif view_info is not None: # See NOTE [ Autograd View Variables ] in variable.h for details. - differentiable_output_vars = {r['name'] for r in differentiable_outputs} + differentiable_output_vars = {r.name for r in differentiable_outputs} if not isinstance(view_info, str): - raise TypeError("The view info should be a string for {}, but it is: {}".format(base_name, view_info)) + raise TypeError(f'The view info should be a string for {base_name}, but it is: {view_info}') if len(differentiable_output_vars) == 0: # no output is differentiable (.indices() for SparseTensors for example) @@ -725,54 +717,55 @@ def wrap_output(return_values, var): # Single differentiable output (Tensor or Tensor[]) return_info = differentiable_outputs[0] # We only support simple Tensor or a TensorList for functions that return views - if not return_info['dynamic_type'] in ['Tensor', 'TensorList']: - raise RuntimeError("{} that return differentiable views can only return Tensor or Tensor[]".format(base_name)) + if not is_tensor_type(return_info.type) and not is_tensor_list_type(return_info.type): + raise RuntimeError(f'{base_name} that return differentiable views can only return Tensor or Tensor[]') # Only allow rebasing of the history if we return a single Tensor # If we are in a no grad block, raise a warning # See NOTE [ View + Inplace detection ] for more details about this logic - if return_info['dynamic_type'] in ['TensorList', 'const c10::List> &']: + if is_tensor_list_type(return_info.type): if base_name in MULTI_OUTPUT_SAFE_FUNCTIONS: - creation_meta = "CreationMeta::MULTI_OUTPUT_SAFE" + creation_meta = 'CreationMeta::MULTI_OUTPUT_SAFE' else: - creation_meta = "CreationMeta::MULTI_OUTPUT_NODE" - call += ("as_view(/* base */ {}, /* output */ {}, /* is_bw_differentiable */ true, " - "/* is_fw_differentiable */ true, " - "/* creation_meta */ {});").format(view_info, var, creation_meta) - rhs_value = 'std::move({})'.format(var) + creation_meta = 'CreationMeta::MULTI_OUTPUT_NODE' + call += (f'as_view(/* base */ {view_info}, /* output */ {var}, /* is_bw_differentiable */ true, ' + '/* is_fw_differentiable */ true, ' + f'/* creation_meta */ {creation_meta});') + rhs_value = f'std::move({var})' else: - call += emit_view_lambda() - creation_meta = "GradMode::is_enabled() ? 
CreationMeta::DEFAULT: CreationMeta::NO_GRAD_MODE" - rhs_value = ("as_view(/* base */ {}, /* output */ {}, /* is_bw_differentiable */ true, " - "/* is_fw_differentiable */ true, " - "/* view_func */ func, /* creation_meta */ {})").format(view_info, var, creation_meta) + call += emit_view_lambda(unpacked_bindings) + creation_meta = 'GradMode::is_enabled() ? CreationMeta::DEFAULT: CreationMeta::NO_GRAD_MODE' + rhs_value = (f'as_view(/* base */ {view_info}, /* output */ {var}, /* is_bw_differentiable */ true, ' + '/* is_fw_differentiable */ true, ' + f'/* view_func */ func, /* creation_meta */ {creation_meta})') else: # This could be supported but we don't need it at the moment, so keeping things simple. - raise RuntimeError("Function that return multiple differentiable output " - "when at least one of them is view is not supported.") + raise RuntimeError('Function that return multiple differentiable output ' + 'when at least one of them is view is not supported.') else: - rhs_value = 'std::move({})'.format(var) + rhs_value = f'std::move({var})' assert rhs_value is not None - call += ASSIGN_RETURN_VALUE.substitute(return_values=return_values, + call += ASSIGN_RETURN_VALUE.substitute(return_values=tie_return_values(f), rhs_value=rhs_value) return call - def enforce_same_tensorimpl_and_storage(env, call): - save_ptrs_stmts = [] - enforce_same_ptrs_stmts = [] - if declaration['name'] not in DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE: - for arg in env.get('unpacked_args', []): - simple_type = env['unpacked_args_simple_type'][arg] - if simple_type == 'TensorList': + def enforce_same_tensorimpl_and_storage(call: str, unpacked_bindings: List[Binding]) -> str: + save_ptrs_stmts: List[str] = [] + enforce_same_ptrs_stmts: List[str] = [] + if cpp.name(f.func) not in DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE: + for unpacked_binding in unpacked_bindings: + arg = unpacked_binding.name + noref_cpp_type = unpacked_binding.ctype.cpp_type(strip_ref=True) + if noref_cpp_type == 'TensorList': save_ptrs_stmts += [SAVE_TENSORLIST_STORAGE.substitute(tensorlist_name=arg), SAVE_TENSORLIST_IMPL.substitute(tensorlist_name=arg)] enforce_same_ptrs_stmts += [ENFORCE_SAME_TENSORLIST_STORAGE.substitute(tensorlist_name=arg), ENFORCE_SAME_TENSORLIST_IMPL.substitute(tensorlist_name=arg)] - elif simple_type == 'c10::List>': + elif noref_cpp_type == 'c10::List>': save_ptrs_stmts += [SAVE_OPTIONALTENSORLIST_STORAGE.substitute(tensorlist_name=arg), SAVE_OPTIONALTENSORLIST_IMPL.substitute(tensorlist_name=arg)] enforce_same_ptrs_stmts += [ENFORCE_SAME_OPTIONALTENSORLIST_STORAGE.substitute(tensorlist_name=arg), ENFORCE_SAME_OPTIONALTENSORLIST_IMPL.substitute(tensorlist_name=arg)] - elif simple_type == 'Tensor': + elif noref_cpp_type == 'Tensor': save_ptrs_stmts += [SAVE_TENSOR_STORAGE.substitute(tensor_name=arg), SAVE_TENSOR_IMPL.substitute(tensor_name=arg)] enforce_same_ptrs_stmts += [ENFORCE_SAME_TENSOR_STORAGE.substitute(tensor_name=arg), @@ -784,74 +777,69 @@ def enforce_same_tensorimpl_and_storage(env, call): RUN_ONLY_IN_DEBUG_MODE.substitute(statements=enforce_same_ptrs_stmts) return call - def emit_call(env, tie_return_values): - combined = nested_dict(env, declaration) + def emit_call(f: NativeFunction, unpacked_bindings: List[Binding]) -> str: # We only care about adding `at::AutoNonVariableTypeMode` guard for non-variable dispatch # (which corresponds to 'use_derived' strategy). 
The purpose of this guard is to make sure # the baseType operations still dispatch to non-Variable type, even if the arguments passed # in are now Variables. # See NOTE [ Treating Variables as non-Variables in type dispatch ] for details. - base_type_call = emit_dispatch_call(combined['api_name'], 'self_', combined['unpacked_args']) - if not modifies_arguments and not returns_void: + unpacked_args = [b.name for b in unpacked_bindings] + base_type_call = emit_dispatch_call(f, 'self_', unpacked_args) + if not modifies_arguments(f) and not returns_void: call = DISPATCH_TO_NON_VAR_TYPE_WITH_TMP_RETURN_VALUES.substitute( base_type_call=base_type_call) - call += wrap_output(tie_return_values, 'tmp') + call += wrap_output(f, unpacked_bindings, 'tmp') else: call = DISPATCH_TO_NON_VAR_TYPE_WITHOUT_RETURN_VALUES.substitute( base_type_call=base_type_call) - call = enforce_same_tensorimpl_and_storage(env, call) + call = enforce_same_tensorimpl_and_storage(call, unpacked_bindings) return call - def emit_history(): - fn = 'rebase' if modifies_arguments and view_info is None else 'set' - output_names = [r['name'] for r in differentiable_outputs] + def emit_history() -> str: + fn = 'rebase' if modifies_arguments(f) and view_info is None else 'set' + output_names = [r.name for r in differentiable_outputs] # TODO: flatten allocates a std::vector, which could be expensive outs = CodeTemplate("flatten_tensor_args( ${outs} )").substitute(outs=output_names) return SET_HISTORY.substitute(fn=fn, differentiable_outputs=outs) - def emit_save_outputs(): + def emit_save_outputs() -> str: if is_out_fn: # out functions don't currently support differentiation return '' - func = declaration['derivative'] - if func is not None: - stmts = save_variables(func.all_saved_outputs, True) + if info is not None and info.has_derivatives: + stmts = save_variables(info.all_saved_outputs, True) if len(stmts) == 0: return '' return CONDITIONAL.substitute(cond='grad_fn', statements=stmts) return '' - def emit_any_requires_grad(): + def emit_any_requires_grad() -> List[str]: return [SETUP_ANY_REQUIRES_GRAD.substitute( - args_with_derivatives=[arg['name'] for arg in args_with_derivatives]), ] + args_with_derivatives=[arg.name for arg in args_with_derivatives]), ] - def emit_check_inplace(): + def emit_check_inplace() -> List[str]: if not inplace: return [] - return ['check_inplace({}, _any_requires_grad);'.format(arg['name']) for arg in differentiable_outputs] + return [f'check_inplace({arg.name}, _any_requires_grad);' for arg in differentiable_outputs] - def emit_increment_version(): - if not modifies_arguments: + def emit_increment_version(f: NativeFunction) -> List[str]: + if not modifies_arguments(f): return [] - return ['increment_version({});'.format(arg['name']) for arg in returns] - - env = {} - combined = nested_dict(env, declaration) + return [f'increment_version({r});' for r in cpp.return_names(f)] - body = [] + body: List[str] = [] + unpack_args_stats, unpacked_bindings = unpack_args(f) - declare_returned_variables, tie_return_values, get_return_value = format_return_variables(declaration) - - body.extend(unpack_args(env, declaration)) + body.extend(unpack_args_stats) if requires_derivative: body.extend(emit_any_requires_grad()) body.extend(emit_check_inplace()) body.extend(setup_derivative(differentiable_inputs)) - body.append(declare_returned_variables) + body.append(declare_returned_variables(f)) - body.append(emit_call(env, tie_return_values)) - body.extend(emit_increment_version()) + body.append(emit_call(f, 
unpacked_bindings)) + body.extend(emit_increment_version(f)) if requires_derivative: # set_flags has to appear after version_counter, because rebase_history # requires that the counter is incremented before it is called @@ -866,56 +854,50 @@ def emit_increment_version(): assert inplace body.append('reset_grad_accumulator(self);') if not returns_void: - body.append('return {};'.format(get_return_value)) + body.append(f'return {get_return_value(f)};') return body - -def unpack_args(env, declaration): - def requires_unpack(arg): - return 'Tensor' in arg['dynamic_type'] and 'c10::optional' not in arg['type'] - - body = [] - unpacked_args = [] - unpacked_args_simple_type = {} - if declaration['use_c10_dispatcher'] in ['full', 'hacky_wrapper_for_legacy_signatures']: - arguments = declaration['schema_order_arguments'] - else: - assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - arguments = declaration['arguments'] - for i, arg in enumerate(arguments): - if not requires_unpack(arg): - unpacked_args.append(arg['name']) - unpacked_args_simple_type[arg['name']] = arg['simple_type'] +@with_native_function +def unpack_args(f: NativeFunction) -> Tuple[List[str], List[Binding]]: + body: List[str] = [] + unpacked_bindings: List[Binding] = [] + + bindings = [r for a in f.func.schema_order_arguments() + for r in cpp.argument(a, + method=False, + cpp_no_default_args=set(), + faithful=False, + has_tensor_options=False)] + + for i, binding in enumerate(bindings): + assert not isinstance(binding.argument, SelfArgument) + if isinstance(binding.argument, TensorOptionsArguments): + raise RuntimeError("VariableKernel shouldn't take TensorOptions") + + is_nullable = binding.argument.type.is_nullable() + if not binding.argument.type.is_tensor_like() or is_nullable: + unpacked_bindings.append(binding) continue - dynamic_type = arg['dynamic_type'] - if 'TensorOptions' not in dynamic_type: - is_nullable = arg.get('is_nullable', False) - ref = (not is_nullable) and dynamic_type != 'TensorList' - suffix = '_opt' if is_nullable and dynamic_type != 'TensorList' else '' - body.append(UNPACK_TENSOR.substitute( - arg_name=arg['name'], - arg_pos=i, - suffix=suffix, - ref='&' if ref else '', - )) - else: - # Okay, we are abusing the definition of 'unpack' here a bit, - # although it's still getting the non-variable from the variable - # (in this case via TensorOptions rather than Variable/Tensor). 
- assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper', \ - "VariableKernel shouldn't take TensorOptions if the op is c10-full" - body.append(LEGACY_WRAP_OPTIONS.substitute(arg_name=arg['name'])) - - unpacked_args.append(arg['name'] + '_') - unpacked_args_simple_type[arg['name'] + '_'] = arg['simple_type'] - - env['unpacked_args'] = unpacked_args - env['unpacked_args_simple_type'] = unpacked_args_simple_type - return body - - -def dispatch_strategy(declaration): + is_tensor_list = is_tensor_list_type(binding.argument.type) + ref = (not is_nullable) and not is_tensor_list + suffix = '_opt' if is_nullable and not is_tensor_list else '' + body.append(UNPACK_TENSOR.substitute( + arg_name=binding.name, + arg_pos=i, + suffix=suffix, + ref='&' if ref else '', + )) + unpacked_bindings.append(Binding( + name=binding.name + '_', + ctype=binding.ctype, + argument=binding.argument, + default=binding.default, + )) + + return body, unpacked_bindings + +def dispatch_strategy(fn: NativeFunctionWithDifferentiabilityInfo) -> str: """How are we going to call the underlying implementation of a declaration? There are two strategies: @@ -935,7 +917,7 @@ def dispatch_strategy(declaration): get dispatched back to VariableType (which will ensure that they are differentiable.) """ - if declaration['abstract'] or declaration['derivative'] is not None: + if fn.func.is_abstract or (fn.info is not None and fn.info.has_derivatives): # If the function is abstract (not implemented on at::Type), we must # call the implementation on the derived type with unpacked tensors. @@ -959,62 +941,47 @@ def dispatch_strategy(declaration): # assumption might not hold, but then you'll see gradcheck fail.) return 'use_type' -def get_decl_signature(declaration: Dict[Any, Any], use_base_variant: bool = False) -> str: - name = declaration['name'] - arguments = declaration['arguments'] - if use_base_variant: - if declaration['inplace']: - assert name.endswith('_') - name = name[:-1] - elif name.endswith('_out'): - name = name[:-4] - arguments = [arg for arg in arguments if not arg.get('output', False)] - simple_types = ', '.join(arg['simple_type'] for arg in arguments) - return f'{name}({simple_types})' +def is_tensor_type(t: Type) -> bool: + # TODO: Should handle optional here? + return t.is_tensor_like() and t.is_list_like() is None -@with_native_function -def get_func_signature(f: NativeFunction) -> str: - args = CppSignatureGroup.from_native_function(f, method=False).signature.arguments() - types = ', '.join(python.argument_type_str(a.argument.type, simple_type=True) - if isinstance(a.argument, Argument) else 'TensorOptions' - for a in args) - return f'{cpp.name(f.func)}({types})' - -def match_declarations_with_differentiability_info( - declarations: Dict[Any, Any], +def is_tensor_list_type(t: Type) -> bool: + # TODO: Should handle optional here? + return t.is_tensor_like() and t.is_list_like() is not None + +def modifies_arguments(f: NativeFunction) -> bool: + return f.func.kind() in [SchemaKind.inplace, SchemaKind.out] + +def match_differentiability_info( + native_functions: List[NativeFunction], differentiability_infos: Sequence[DifferentiabilityInfo], -) -> None: +) -> List[NativeFunctionWithDifferentiabilityInfo]: """Sets the "derivative" key on declarations to matching autograd function In-place functions will use the out-of-place derivative definition if there is no in-place specific derivative. 
""" - info_by_signature = {get_func_signature(info.func): info for info in differentiability_infos} + info_by_schema = {info.func.func: info for info in differentiability_infos} + functional_info_by_signature = { + info.func.func.signature(strip_default=True): info + for info in differentiability_infos + if info.func.func.kind() == SchemaKind.functional} - def find_info(declaration: Dict[Any, Any]) -> Optional[DifferentiabilityInfo]: - signature = get_decl_signature(declaration) - if signature in info_by_signature: - return info_by_signature[signature] + def find_info(f: NativeFunction) -> Tuple[Optional[DifferentiabilityInfo], bool]: + if f.func in info_by_schema: + return info_by_schema[f.func], True # if there is no exact match look for the out-of-place signature. # i.e mul() for mul_() or mul_out() - signature = get_decl_signature(declaration, use_base_variant=True) - return info_by_signature.get(signature) - - for declaration in declarations: - info = find_info(declaration) - declaration['derivative'] = info if info and info.args_with_derivatives else None - - # Currently, the '.strides()' to 'strides_or_error' replacement does not support - # 'self' derivatives of an inplace function, so we must check for this case. - if declaration['inplace'] and (info is not None): - for derivative in info.derivatives: - if 'self' in derivative.var_names: - for saved_input in derivative.saved_inputs: - assert 'strides_or_error' not in saved_input.expr, ( - "Calling '.strides()' in the 'self' derivative formula of an " - f"in-place function is not supported: {declaration['name']}") - - declaration['non_differentiable_arg_names'] = info.non_differentiable_arg_names if info else [] - declaration['output_differentiability'] = info.output_differentiability if info else None + return functional_info_by_signature.get(f.func.signature(strip_default=True)), False + + result: List[NativeFunctionWithDifferentiabilityInfo] = [] + for f in native_functions: + info, is_exact_match = find_info(f) + result.append(NativeFunctionWithDifferentiabilityInfo( + func=f, + info=info, + )) + + return result diff --git a/tools/autograd/templates/python_fft_functions.cpp b/tools/autograd/templates/python_fft_functions.cpp index 49be92d30d35..a77547a6cc07 100644 --- a/tools/autograd/templates/python_fft_functions.cpp +++ b/tools/autograd/templates/python_fft_functions.cpp @@ -8,6 +8,7 @@ #include "torch/csrc/autograd/utils/wrap_outputs.h" #include "torch/csrc/autograd/utils/python_arg_parsing.h" #include "torch/csrc/autograd/generated/variable_factories.h" +#include "torch/csrc/utils/out_types.h" #include "torch/csrc/utils/pycfunction_helpers.h" #include "torch/csrc/utils/python_arg_parser.h" #include "torch/csrc/utils/structseq.h" @@ -30,6 +31,7 @@ using at::TensorList; using at::Dimname; using at::DimnameList; +using torch::utils::check_out_type_matches; using namespace torch::autograd::utils; namespace torch { namespace autograd { diff --git a/tools/autograd/templates/python_torch_functions.cpp b/tools/autograd/templates/python_torch_functions.cpp index e05e6fbe1975..c42a869b3a98 100644 --- a/tools/autograd/templates/python_torch_functions.cpp +++ b/tools/autograd/templates/python_torch_functions.cpp @@ -19,6 +19,7 @@ #include "torch/csrc/Dtype.h" #include "torch/csrc/DynamicTypes.h" #include "torch/csrc/Exceptions.h" +#include "torch/csrc/utils/out_types.h" #include "torch/csrc/utils/pybind.h" #include "torch/csrc/utils/pycfunction_helpers.h" #include "torch/csrc/utils/python_arg_parser.h" @@ -53,43 +54,13 @@ using 
at::Dimname; using at::DimnameList; using at::ArrayRef; +using torch::utils::check_out_type_matches; using namespace torch::autograd::utils; namespace torch { namespace autograd { static PyObject* THPVariableFunctionsModule = NULL; -static void check_out_type_matches(Tensor result, - ScalarType scalarType, bool scalarType_is_none, - c10::optional layout, - const Device& device, bool device_is_none) { - if (scalarType_is_none && !layout && device_is_none) { // common case - return; - } - if (!scalarType_is_none && result.scalar_type() != scalarType) { - AT_ERROR( - "dtype ", scalarType, - " does not match dtype of out parameter (", result.scalar_type(), ")"); - } - auto scalarType_arg = scalarType_is_none ? result.scalar_type() : scalarType; - auto device_type_arg = device_is_none ? result.device().type() : device.type(); - if (result.scalar_type() != scalarType_arg) { - AT_ERROR( - "scalar type ", scalarType_arg, - " does not match scalar type of out parameter (", result.scalar_type(), ")"); - } - if (layout && result.layout() != *layout) { - AT_ERROR( - "layout ", *layout, - " does not match layout of out parameter (", result.layout(), ")"); - } - if (result.device().type() != device_type_arg) { - AT_ERROR( - "device type ", device_type_arg, - " does not match device type of out parameter (", result.device().type(), ")"); - } -} - inline Tensor dispatch_arange(Scalar end, Tensor result) { pybind11::gil_scoped_release no_gil; return at::arange_out(result, end); diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 8eeffe724c8e..5ed0b1340811 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -7,9 +7,6 @@ GENERATED_CPP = [ "autograd/generated/VariableType_2.cpp", "autograd/generated/VariableType_3.cpp", "autograd/generated/VariableType_4.cpp", - "jit/generated/generated_unboxing_wrappers_0.cpp", - "jit/generated/generated_unboxing_wrappers_1.cpp", - "jit/generated/generated_unboxing_wrappers_2.cpp", "autograd/generated/TraceType_0.cpp", "autograd/generated/TraceType_1.cpp", "autograd/generated/TraceType_2.cpp", @@ -39,9 +36,6 @@ libtorch_nvfuser_generated_headers = ["{}.h".format(name[36:-3]) for name in lib def libtorch_generated_sources(gencode_pattern): return [gencode_pattern.format(name) for name in [ "autograd/generated/Functions.cpp", - "jit/generated/generated_unboxing_wrappers_0.cpp", - "jit/generated/generated_unboxing_wrappers_1.cpp", - "jit/generated/generated_unboxing_wrappers_2.cpp", "autograd/generated/VariableType_0.cpp", "autograd/generated/VariableType_1.cpp", "autograd/generated/VariableType_2.cpp", @@ -351,6 +345,7 @@ libtorch_extra_sources = libtorch_core_jit_sources + [ "torch/csrc/jit/serialization/export_module.cpp", "torch/csrc/jit/serialization/import_legacy.cpp", "torch/csrc/utils/byte_order.cpp", + "torch/csrc/utils/out_types.cpp", ] def libtorch_sources(gencode_pattern = ":generate-code[{}]"): diff --git a/tools/code_analyzer/run_analyzer.sh b/tools/code_analyzer/run_analyzer.sh index 79b366fb1a0d..dc8705cc39f7 100755 --- a/tools/code_analyzer/run_analyzer.sh +++ b/tools/code_analyzer/run_analyzer.sh @@ -15,7 +15,7 @@ echo "Analyze: ${INPUT}" # to operate, so for safety we match a more expansive set. 
"${ANALYZER_BIN}" \ -op_schema_pattern="^(_aten|_prim|aten|quantized|_quantized|prepacked|profiler|_test)::[a-zA-Z0-9_.]+(\(.*)?$" \ - -op_register_pattern="c10::RegisterOperators::(op|checkSchemaAndRegisterOp_)|c10::Module::(_?def|_?impl|impl_UNBOXED)|torch::Library::(_?def|_?impl|_?impl_UNBOXED)" \ + -op_register_pattern="c10::RegisterOperators::(op|checkSchemaAndRegisterOp_)|c10::Module::(_?def|_?impl)|torch::Library::(_?def|_?impl)" \ -op_invoke_pattern="c10::Dispatcher::findSchema" \ -root_symbol_pattern="torch::jit::[^(]" \ -torch_library_init_pattern="^.*TORCH_LIBRARY_init_([^(]+)(\(.*)?$" \ diff --git a/tools/codegen/api/autograd.py b/tools/codegen/api/autograd.py index 58fb75bb7c07..6f58eea6d1ea 100644 --- a/tools/codegen/api/autograd.py +++ b/tools/codegen/api/autograd.py @@ -87,3 +87,36 @@ class DifferentiabilityInfo: # Raw data read from derivatives.yaml. output_differentiability: Optional[List[bool]] + + @property + def has_derivatives(self) -> bool: + return len(self.args_with_derivatives) > 0 + +# Represents a differentiable `Argument`. +# How is it different from the `Argument` type? +# - It's processed Arguments which are differentiable and only used in the +# context of the autograd codegen; +# - It can represent SelfArgument or regular Argument but not TensorOptionsArgument; +@dataclass(frozen=True) +class DifferentiableInput: + name: str + type: Type + + # TODO: only to keep it byte-for-byte compatible with the old codegen, should remove. + cpp_type: str + +# Represents a differentiable `Return`. +# How it it different from the `Return` type? +# - The name in `Return` is optional. Here it is always populated using the same +# `cpp.return_names()` method. +# TODO: some cpp naming logic (e.g. resolving name conflict) might be irrelevant? +# - It's processed Returns which are differentiable, in compliance with the +# `output_differentiability` field defined in derivatives.yaml (if specified), +# and are only used in the context of the autograd codegen; +@dataclass(frozen=True) +class DifferentiableOutput: + name: str + type: Type + + # TODO: only to keep it byte-for-byte compatible with the old codegen, should remove. 
+ cpp_type: str diff --git a/tools/codegen/api/cpp.py b/tools/codegen/api/cpp.py index 29a29e215f4f..0debd52ca896 100644 --- a/tools/codegen/api/cpp.py +++ b/tools/codegen/api/cpp.py @@ -1,6 +1,5 @@ from tools.codegen.model import * from tools.codegen.api.types import * -import tools.codegen.local as local from typing import Optional, Sequence, Union, List, Set # This file describes the translation of JIT schema to the public C++ @@ -88,10 +87,7 @@ def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName) -> CType: if mutable: return MutRefCType(BaseCType('Tensor', binds)) # TODO: fix this discrepancy else: - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - return ConstRefCType(OptionalCType(BaseCType('Tensor', binds))) - else: - return ConstRefCType(BaseCType('Tensor', binds)) + return ConstRefCType(OptionalCType(BaseCType('Tensor', binds))) elem = argumenttype_type(t.elem, mutable=mutable, binds=binds) return OptionalCType(elem) elif isinstance(t, ListType): @@ -105,10 +101,7 @@ def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName) -> CType: elif str(t.elem) == 'Dimname': return BaseCType("DimnameList", binds) elif str(t.elem) == 'Tensor?': - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - return BaseCType("const c10::List> &", binds) - else: - return BaseCType("TensorList", binds) + return ConstRefCType(BaseCType("c10::List>", binds)) elem = argumenttype_type(t.elem, mutable=mutable, binds=binds) # TODO: explicitly qualify namespace here return BaseCType(f"ArrayRef<{elem.cpp_type()}>", binds) diff --git a/tools/codegen/api/dispatcher.py b/tools/codegen/api/dispatcher.py index 3adc2465b607..bb65bc386e64 100644 --- a/tools/codegen/api/dispatcher.py +++ b/tools/codegen/api/dispatcher.py @@ -2,8 +2,6 @@ from tools.codegen.api.types import * import tools.codegen.api.cpp as cpp -import tools.codegen.api.native as native -import tools.codegen.local as local import itertools from typing import Sequence, List, Union @@ -31,17 +29,11 @@ def name(func: FunctionSchema) -> str: return cpp.name(func) def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName) -> CType: - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - # This is a faux amis. If it makes sense in the future to add - # more special cases here, or invert things so cpp.argument_type - # calls this, or just completely inline the function, please do - # it. - return cpp.argumenttype_type(t, mutable=mutable, binds=binds) - else: - # This is real sharing. If you're modifying this path, ask - # yourself why you are changing the native functions protocol - # here and not in native. - return native.argumenttype_type(t, mutable=mutable, binds=binds) + # This is a faux amis. If it makes sense in the future to add + # more special cases here, or invert things so cpp.argument_type + # calls this, or just completely inline the function, please do + # it. 
+ return cpp.argumenttype_type(t, mutable=mutable, binds=binds) def argument_type(a: Argument, *, binds: ArgName) -> CType: return argumenttype_type(a.type, mutable=a.is_write, binds=binds) @@ -53,10 +45,6 @@ def returns_type(rs: Sequence[Return]) -> str: def argument( a: Union[Argument, TensorOptionsArguments, SelfArgument] ) -> List[Binding]: - # We could forward to native.argument but it is a bit suspect because - # the grouping may not be set correctly - assert local.use_c10_dispatcher().dispatcher_uses_new_style() - if isinstance(a, Argument): return [Binding( ctype=argument_type(a, binds=a.name), @@ -71,13 +59,10 @@ def argument( assert_never(a) def arguments(func: FunctionSchema) -> List[Binding]: - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - return [ - r for a in itertools.chain( - func.arguments.positional, - func.arguments.kwarg_only, - func.arguments.out - ) for r in argument(a) - ] - else: - return native.arguments(func) + return [ + r for a in itertools.chain( + func.arguments.positional, + func.arguments.kwarg_only, + func.arguments.out + ) for r in argument(a) + ] diff --git a/tools/codegen/api/native.py b/tools/codegen/api/native.py index 936500b560db..af82210b20f4 100644 --- a/tools/codegen/api/native.py +++ b/tools/codegen/api/native.py @@ -64,8 +64,7 @@ def argument(a: Union[Argument, SelfArgument, TensorOptionsArguments], *, is_out # Erase SelfArgument from the distinction return argument(a.argument, is_out=is_out) elif isinstance(a, TensorOptionsArguments): - if local.use_c10_dispatcher() in [UseC10Dispatcher.hacky_wrapper_for_legacy_signatures, - UseC10Dispatcher.with_codegenerated_unboxing_wrapper]: + if local.use_c10_dispatcher() == UseC10Dispatcher.hacky_wrapper_for_legacy_signatures: # TODO: expunge this logic entirely default = None if should_default: diff --git a/tools/codegen/api/python.py b/tools/codegen/api/python.py index bc5cbb440b98..749513cb5c0d 100644 --- a/tools/codegen/api/python.py +++ b/tools/codegen/api/python.py @@ -3,7 +3,6 @@ from tools.codegen.api.types import * import tools.codegen.api.cpp as cpp -import tools.codegen.local as local from tools.codegen.gen import pythonify_default from tools.codegen.model import * @@ -599,11 +598,8 @@ def argument_type_str(t: Type, *, simple_type: bool = False) -> str: elif isinstance(t, OptionalType): if str(t.elem) == 'Tensor': - if not simple_type or local.use_c10_dispatcher().dispatcher_uses_new_style(): - # Is it desired to keep '?' for simple_type with new style dispatcher? - return 'Tensor?' - else: - return 'Tensor' + # Is it desired to keep '?' for simple_type with new style dispatcher? + return 'Tensor?' elem = argument_type_str(t.elem, simple_type=simple_type) if elem == 'Layout': # TODO: fix this special case in PythonArgParser? 
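The optional-tensor handling in tools/codegen/api/cpp.py above (where `Tensor?` now maps to a const reference to `c10::optional<Tensor>`) and the `cpp_type(strip_ref=...)` hook added to tools/codegen/api/types.py below are easiest to see with a small standalone sketch. This uses simplified stand-ins for the CType dataclasses (plain strings instead of `ArgName`, no shared `CType` union) and a hypothetical argument name `weight`; it is illustrative only and not part of the patch.

    from dataclasses import dataclass

    # Simplified stand-ins for the CType classes in tools/codegen/api/types.py.
    @dataclass(frozen=True)
    class BaseCType:
        type: str
        name: str
        def cpp_type(self, *, strip_ref: bool = False) -> str:
            return self.type

    @dataclass(frozen=True)
    class OptionalCType:
        elem: object
        def cpp_type(self, *, strip_ref: bool = False) -> str:
            # strip_ref is deliberately not propagated past the optional wrapper
            return f'c10::optional<{self.elem.cpp_type()}>'

    @dataclass(frozen=True)
    class ConstRefCType:
        elem: object
        def cpp_type(self, *, strip_ref: bool = False) -> str:
            if strip_ref:
                return self.elem.cpp_type(strip_ref=strip_ref)
            return f'const {self.elem.cpp_type()} &'

    # A 'Tensor?' argument under the new-style dispatcher:
    ctype = ConstRefCType(OptionalCType(BaseCType('Tensor', 'weight')))
    print(ctype.cpp_type())                # const c10::optional<Tensor> &
    print(ctype.cpp_type(strip_ref=True))  # c10::optional<Tensor>

This is also why `enforce_same_tensorimpl_and_storage` in gen_variable_type.py can compare plain type strings via `cpp_type(strip_ref=True)`: the outer reference is peeled off while the optional wrapper is kept.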
@@ -1022,10 +1018,7 @@ def arg_parser_unpack_method(t: Type, has_default: bool) -> str: elif isinstance(t, OptionalType): if str(t.elem) == 'Tensor': - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - return 'optionalTensor' - else: - return 'tensor' + return 'optionalTensor' elif isinstance(t.elem, BaseType): if t.elem.name in [BaseTy.ScalarType, BaseTy.Scalar, diff --git a/tools/codegen/api/types.py b/tools/codegen/api/types.py index ea03a1799cfb..39fb8bef3846 100644 --- a/tools/codegen/api/types.py +++ b/tools/codegen/api/types.py @@ -31,14 +31,16 @@ class BaseCType: type: str name: ArgName - def cpp_type(self) -> str: + def cpp_type(self, *, strip_ref: bool = False) -> str: return self.type @dataclass(frozen=True) class ConstRefCType: elem: 'CType' - def cpp_type(self) -> str: + def cpp_type(self, *, strip_ref: bool = False) -> str: + if strip_ref: + return self.elem.cpp_type(strip_ref=strip_ref) return f'const {self.elem.cpp_type()} &' @property @@ -49,7 +51,9 @@ def name(self) -> ArgName: class MutRefCType: elem: 'CType' - def cpp_type(self) -> str: + def cpp_type(self, *, strip_ref: bool = False) -> str: + if strip_ref: + return self.elem.cpp_type(strip_ref=strip_ref) return f'{self.elem.cpp_type()} &' @property @@ -60,7 +64,8 @@ def name(self) -> ArgName: class OptionalCType: elem: 'CType' - def cpp_type(self) -> str: + def cpp_type(self, *, strip_ref: bool = False) -> str: + # Do not pass `strip_ref` recursively. return f'c10::optional<{self.elem.cpp_type()}>' @property diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 782d8b919e7e..08e9572131e3 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -203,8 +203,7 @@ class RegisterSchema: @method_with_native_function def __call__(self, f: NativeFunction) -> Optional[str]: - op_name = f"aten::{f.func.name}" - if not self.selector.is_operator_selected(op_name): + if not self.selector.is_native_function_selected(f): return None return f'm.def({cpp_string(str(f.func))});\n' @@ -399,8 +398,7 @@ def gen_one(f: NativeFunction) -> Optional[str]: e.expr for e in translate(functional_sig.arguments(), dispatcher.arguments(functional_func), method=False) ) - op_name = f"aten::{f.func.name}" - if self.target is Target.REGISTRATION and not self.selector.is_operator_selected(op_name): + if self.target is Target.REGISTRATION and not self.selector.is_native_function_selected(f): return None k = f.func.kind() @@ -437,6 +435,8 @@ def gen_one(f: NativeFunction) -> Optional[str]: # For an overview of what this template code looks like, see # https://github.com/pytorch/rfcs/pull/9 return f"""\ +namespace {{ + {self.gen_structured_class( f, k, class_name=class_name, @@ -450,6 +450,8 @@ def gen_one(f: NativeFunction) -> Optional[str]: {impl_call} return {ret_expr}; }} + +}} // anonymous namespace """ elif self.target is Target.REGISTRATION: @@ -469,19 +471,12 @@ def gen_unstructured(self, f: NativeFunction) -> Optional[str]: # for mypy type refinement; would be fixed by TODO on target assert self.target is not Target.DECLARATION - if f.func.is_out_fn(): - assert local.use_c10_dispatcher().dispatcher_uses_new_style(), \ - ("{} takes out arguments and has to be written in the new style. 
" + - "Please add `use_c10_dispatcher: full` to your operator in native_functions.yaml " + - "and write the C++ implementation to take out arguments in the end.").format(f.func.name) - if self.dispatch_key not in f.dispatch: return None if f.manual_kernel_registration: return None - op_name = f"aten::{f.func.name}" - if self.target is Target.REGISTRATION and not self.selector.is_operator_selected(op_name): + if self.target is Target.REGISTRATION and not self.selector.is_native_function_selected(f): return None name = native.name(f.func) @@ -518,8 +513,7 @@ def gen_unstructured(self, f: NativeFunction) -> Optional[str]: const DeviceGuard device_guard(device_or_default(device)); """ else: - assert local.use_c10_dispatcher() in [UseC10Dispatcher.with_codegenerated_unboxing_wrapper, - UseC10Dispatcher.hacky_wrapper_for_legacy_signatures] + assert local.use_c10_dispatcher() is UseC10Dispatcher.hacky_wrapper_for_legacy_signatures cuda_guard_from_tensor_options = """\ const DeviceGuard device_guard(options.device()); """ @@ -543,9 +537,13 @@ def gen_unstructured(self, f: NativeFunction) -> Optional[str]: """ return f"""\ +namespace {{ + {returns_type} {name}({args_str}) {{ {cuda_guard}{return_kw}{impl_name}({args_exprs_str}); }} + +}} // anonymous namespace """ elif self.target is Target.REGISTRATION: @@ -557,16 +555,14 @@ def gen_unstructured(self, f: NativeFunction) -> Optional[str]: # Figure out which signature the function is if local.use_c10_dispatcher() is UseC10Dispatcher.full: payload = f"TORCH_FN({name})" - elif local.use_c10_dispatcher() is UseC10Dispatcher.hacky_wrapper_for_legacy_signatures: + else: + assert local.use_c10_dispatcher() is UseC10Dispatcher.hacky_wrapper_for_legacy_signatures payload = f""" c10::impl::hacky_wrapper_for_legacy_signatures< {dispatcher_sig.type()}, {len(f.func.arguments.out)} >(TORCH_FN({name})) """ - else: - assert local.use_c10_dispatcher() is UseC10Dispatcher.with_codegenerated_unboxing_wrapper - payload = f"torch::CppFunction::makeUnboxedOnly(&{name})" return f'm.impl("{f.func.name}",\n{payload});\n' else: @@ -785,14 +781,9 @@ def __call__(self, f: NativeFunction) -> Optional[str]: dispatcher_sig = DispatcherSignature.from_schema(f.func) sig: Union[NativeSignature, DispatcherSignature] - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - sig = dispatcher_sig - dispatcher_exprs = dispatcher_sig.exprs() - dispatch_key = "c10::computeDispatchKey(dtype, layout, device)" - else: - sig = native_sig - dispatcher_exprs = native_sig.dispatcher_exprs() - dispatch_key = "options.computeDispatchKey()" + sig = dispatcher_sig + dispatcher_exprs = dispatcher_sig.exprs() + dispatch_key = "c10::computeDispatchKey(dtype, layout, device)" if self.target is Target.DEFINITION: # I don't think there's actually a good reason to generate @@ -818,11 +809,7 @@ def __call__(self, f: NativeFunction) -> Optional[str]: }} """ elif self.target is Target.REGISTRATION: - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - return f"""m.impl("aten::{f.func.name}", TORCH_FN({name}));""" - else: - assert local.use_c10_dispatcher() is UseC10Dispatcher.with_codegenerated_unboxing_wrapper - return f"""m.impl_UNBOXED("aten::{f.func.name}", {name});""" + return f"""m.impl("aten::{f.func.name}", TORCH_FN({name}));""" elif self.target is Target.DECLARATION: raise AssertionError() else: @@ -1047,7 +1034,6 @@ def compute_declaration_yaml(f: NativeFunction) -> object: ('name', cpp.name(f.func)), ('operator_name', str(f.func.name.name)), ('overload_name', 
str(f.func.name.overload_name)), - ('use_c10_dispatcher', f.use_c10_dispatcher.name), ('manual_kernel_registration', f.manual_kernel_registration), ('category_override', f.category_override if f.category_override is not None else ''), ('matches_jit_signature', True), diff --git a/tools/codegen/model.py b/tools/codegen/model.py index ea667a0922cf..1128878fe45c 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -49,12 +49,8 @@ def __str__(self) -> str: class UseC10Dispatcher(Enum): full = 0 - with_codegenerated_unboxing_wrapper = 1 hacky_wrapper_for_legacy_signatures = 2 - def dispatcher_uses_new_style(self) -> bool: - return self in [UseC10Dispatcher.full, UseC10Dispatcher.hacky_wrapper_for_legacy_signatures] - # The basic input to the code generation is native_functions.yaml. # The name "native", BTW, comes from the distinction between native # functions and legacy TH functions. The legacy TH functions are gone, @@ -77,7 +73,7 @@ class NativeFunction: func: 'FunctionSchema' # Corresponds to the 'use_c10_dispatcher' field. The default - # is 'with_codegenerated_unboxing_wrapper' + # is 'full' use_c10_dispatcher: UseC10Dispatcher # Whether or not to omit automatic generation of a DeviceGuard @@ -177,16 +173,14 @@ def from_yaml(ei: Dict[str, object], loc: 'Location') -> 'NativeFunction': assert isinstance(cpp_no_default_args_list, list) cpp_no_default_args = set(cpp_no_default_args_list) - use_c10_dispatcher_s = e.pop('use_c10_dispatcher', None) - if use_c10_dispatcher_s is None: - use_c10_dispatcher = UseC10Dispatcher.full - elif use_c10_dispatcher_s == 'full': + use_c10_dispatcher_s = e.pop('use_c10_dispatcher', 'full') + if use_c10_dispatcher_s == 'full': use_c10_dispatcher = UseC10Dispatcher.full elif use_c10_dispatcher_s == 'hacky_wrapper_for_legacy_signatures': use_c10_dispatcher = UseC10Dispatcher.hacky_wrapper_for_legacy_signatures else: raise AssertionError( - f'use_c10_dispatcher must be unset or set to full, got {use_c10_dispatcher}') + f'use_c10_dispatcher must be full or hacky_wrapper_for_legacy_signatures, got {use_c10_dispatcher}') variants_s = e.pop('variants', 'function') assert isinstance(variants_s, str) @@ -567,7 +561,7 @@ def kind(self) -> SchemaKind: else: return SchemaKind.functional - def signature(self) -> 'FunctionSchema': + def signature(self, *, strip_default: bool = False) -> 'FunctionSchema': """ Certain schemas are 'related', in that they are simply inplace/out/functional versions of the same function. 
This method @@ -582,11 +576,13 @@ def signature(self) -> 'FunctionSchema': - Out arguments are stripped - Mutability annotations are stripped (this is sound because you cannot overload on mutability annotation) + - Return names are stripped since they are not overloadable and + some variants have return names but some not """ def strip_ret_annotation(r: Return) -> Return: return Return( - name=r.name, + name=None, type=r.type, annotation=None, ) @@ -600,7 +596,7 @@ def strip_ret_annotation(r: Return) -> Return: ), overload_name="", # stripped ), - arguments=self.arguments.signature(), + arguments=self.arguments.signature(strip_default=strip_default), returns=tuple(map(strip_ret_annotation, self.returns)), ) @@ -983,14 +979,14 @@ def kwarg_only(self) -> Sequence[Union[Argument, TensorOptionsArguments]]: ret.extend(self.post_tensor_options_kwarg_only) return ret - def signature(self) -> 'Arguments': + def signature(self, *, strip_default: bool = False) -> 'Arguments': # dataclasses.replace could be used here, but it is less # type safe so for now I've opted to type everything out def strip_arg_annotation(a: Argument) -> Argument: return Argument( name=a.name, type=a.type, - default=a.default, # hmmm + default=a.default if not strip_default else None, annotation=None, ) diff --git a/tools/codegen/selective_build/selector.py b/tools/codegen/selective_build/selector.py index 24e387128b6c..eeb15049075e 100644 --- a/tools/codegen/selective_build/selector.py +++ b/tools/codegen/selective_build/selector.py @@ -1,8 +1,9 @@ -from typing import Dict, Set, Optional, Tuple +from typing import Dict, Set, Optional, Tuple, List import yaml from dataclasses import dataclass +from tools.codegen.model import NativeFunction from tools.codegen.selective_build.operator import * # A SelectiveBuilder holds information extracted from the selective build @@ -25,6 +26,20 @@ class SelectiveBuilder: # A dictionary of operator -> operator metadata. operators: Dict[str, SelectiveBuildOperator] + # A dictionary of selected kernel tags and dtypes. Typically a + # PyTorch Operator Kernel (function) may have many code paths + # that are specialized for many many Tensor dtypes, so it's not + # one per kernel function, but there could be many per kernel + # function. The tag isn't a kernel function name, but some fragment + # of the kernel function implementation itself. + kernel_metadata: Dict[str, List[str]] + + # If true, then fragments for all dtypes for all kernel functions + # are included. This is typically set when any one of the + # operator lists is generated from a mechanism other than + # tracing based selective build. 
+ include_all_kernel_dtypes: bool + @staticmethod def get_nop_selector() -> 'SelectiveBuilder': return SelectiveBuilder.from_yaml_dict({'include_all_operators': True}) @@ -32,9 +47,11 @@ def get_nop_selector() -> 'SelectiveBuilder': @staticmethod def from_yaml_dict(data: Dict[str, object]) -> 'SelectiveBuilder': valid_top_level_keys = { + 'include_all_kernel_dtypes', 'include_all_operators', 'debug_info', 'operators', + 'kernel_metadata', } top_level_keys = set(data.keys()) if len(top_level_keys - valid_top_level_keys) > 0: @@ -57,7 +74,24 @@ def from_yaml_dict(data: Dict[str, object]) -> 'SelectiveBuilder': for (k, v) in operators_dict.items(): operators[k] = SelectiveBuildOperator.from_yaml_dict(k, v) - return SelectiveBuilder(include_all_operators, debug_info, operators) + + kernel_metadata = {} + kernel_metadata_dict = data.get('kernel_metadata', {}) + assert isinstance(kernel_metadata_dict, dict) + + for (k, v) in kernel_metadata_dict.items(): + kernel_metadata[str(k)] = list(map(lambda dtype: str(dtype), v)) + + include_all_kernel_dtypes = data.get('include_all_kernel_dtypes', False) + assert isinstance(include_all_kernel_dtypes, bool) + + return SelectiveBuilder( + include_all_operators, + debug_info, + operators, + kernel_metadata, + include_all_kernel_dtypes, + ) @staticmethod def from_yaml_str(config_contents: str) -> 'SelectiveBuilder': @@ -85,6 +119,7 @@ def from_legacy_op_registration_allow_list( } return SelectiveBuilder.from_yaml_dict({ 'operators': operators, + 'include_all_kernel_dtypes': True, }) def is_operator_selected(self, name: str) -> bool: @@ -96,6 +131,10 @@ def is_operator_selected(self, name: str) -> bool: name = strip_operator_overload_name(name) return name in self.operators and self.operators[name].include_all_overloads + def is_native_function_selected(self, func: NativeFunction) -> bool: + op_name = op_name_from_native_function(func) + return self.is_operator_selected(op_name) + def is_operator_selected_for_training(self, name: str) -> bool: if not self.is_operator_selected(name): return False @@ -123,6 +162,10 @@ def is_operator_selected_for_training(self, name: str) -> bool: (base_op.include_all_overloads and base_op.is_used_for_training) ) + def is_native_function_selected_for_training(self, func: NativeFunction) -> bool: + op_name = op_name_from_native_function(func) + return self.is_operator_selected_for_training(op_name) + def is_root_operator(self, name: str) -> bool: if not self.is_operator_selected(name): return False @@ -138,8 +181,15 @@ def is_root_operator(self, name: str) -> bool: base_op: SelectiveBuildOperator = self.operators[name] return base_op.include_all_overloads and base_op.is_root_operator + def is_kernel_dtype_selected(self, kernel_tag: str, dtype: str) -> bool: + if self.include_all_operators or self.include_all_kernel_dtypes: + return True + + return kernel_tag in self.kernel_metadata and dtype in self.kernel_metadata[kernel_tag] + def to_dict(self) -> Dict[str, object]: ret: Dict[str, object] = { + 'include_all_kernel_dtypes': self.include_all_kernel_dtypes, 'include_all_operators': self.include_all_operators, } operators = {} @@ -150,11 +200,41 @@ def to_dict(self) -> Dict[str, object]: if self._debug_info is not None: ret['debug_info'] = self._debug_info + ret['kernel_metadata'] = {k: list(v) for (k, v) in self.kernel_metadata.items()} + return ret +def merge_kernel_metadata( + lhs: Dict[str, List[str]], + rhs: Dict[str, List[str]], +) -> Dict[str, List[str]]: + kernel_metadata: Dict[str, List[str]] = {} + for (tag_name, dtypes) 
in list(lhs.items()) + list(rhs.items()): + dtypes_copy = set(dtypes) + if tag_name in kernel_metadata: + dtypes_copy |= set(kernel_metadata[tag_name]) + + kernel_metadata[tag_name] = list(dtypes_copy) + + return kernel_metadata + def combine_selective_builders(lhs: SelectiveBuilder, rhs: SelectiveBuilder) -> SelectiveBuilder: include_all_operators = lhs.include_all_operators or rhs.include_all_operators debug_info = merge_debug_info(lhs._debug_info, rhs._debug_info) operators = merge_operator_dicts(lhs.operators, rhs.operators) - return SelectiveBuilder(include_all_operators, debug_info, operators) + kernel_metadata = merge_kernel_metadata(lhs.kernel_metadata, rhs.kernel_metadata) + include_all_kernel_dtypes = lhs.include_all_kernel_dtypes or rhs.include_all_kernel_dtypes + return SelectiveBuilder( + include_all_operators, + debug_info, + operators, + kernel_metadata, + include_all_kernel_dtypes, + ) + + +def op_name_from_native_function(f: NativeFunction) -> str: + # This was originally read from the 'operator_name_with_overload' field in the + # declaration dict, which was the part before the first '(' in 'schema_string'. + return f'aten::{f.func.name}' diff --git a/tools/jit/gen_unboxing_wrappers.py b/tools/jit/gen_unboxing_wrappers.py deleted file mode 100644 index 267b5a3b221a..000000000000 --- a/tools/jit/gen_unboxing_wrappers.py +++ /dev/null @@ -1,545 +0,0 @@ -""" -To run this file by hand from the root of the PyTorch -repository, run: - -python -m tools.jit.gen_unboxing_wrappers \ - build/aten/src/ATen/Declarations.yaml \ - $OUTPUT_DIR \ - tools/jit/templates - -Where $OUTPUT_DIR is where you would like the files to be -generated. In the full build system, OUTPUT_DIR is -torch/csrc/jit/generated/ -""" - -# This file generates generated_unboxing_wrappers, which contains -# manual unboxing wrappers for ops that aren't use_c10_dispatcher: full -# because the templated unboxing logic in c10 doesn't support them yet. -# The ultimate goal is to make all ops use the templated unboxing and -# delete this codegen file. - -import argparse -import re -from itertools import groupby -from functools import reduce -from ..autograd.gen_autograd import load_aten_declarations -from ..autograd.gen_autograd import RETURNS_VIEWS_OF_INPUT -from ..autograd.utils import CodeTemplate, write, is_out_variant, op_name_with_overload -from tools.codegen.selective_build.selector import SelectiveBuilder - -# JIT has a type system of -# Scalar = int | float | bool # int is the largest int (int64_t), -# float is the largest float (double) we don't have the others because they are never held in tensors -# Type = Scalar # primitive numbers -# | Tensor # any tensor, as defined by at::Tensor -# | Type[] # a dynamically sized list[ of a type -# | Scalar[N] # a homogenous fixed size scalar list, single scalars can expand to this list -# | (Type1, Type2, ...) 
# a heterogeneous tuple -# | Layout | ScalarType | Device | Generator # special singleton types for built-in concepts in tensor lib - -# clean up the variety of C++ types in the ATen declarations -# to be in the restricted set of types that the IR represents -# note: no default values for this map, to make it clear what types -# can be passedthrough - -TYPE_MAP = { - 'std::array': 'bool[2]', - 'std::array': 'bool[3]', - 'std::array': 'bool[4]', - 'std::string': 'str', - 'std::string?': 'str?', - 'Scalar': 'Scalar', - 'ScalarList': 'Scalar[]', - 'MemoryFormat': 'MemoryFormat', - 'MemoryFormat?': 'MemoryFormat?', - 'QScheme': 'QScheme', - 'Scalar?': 'Scalar?', - 'Tensor': 'Tensor', - 'Tensor?': 'Tensor?', - 'TensorList': 'Tensor[]', - # this appears in return values instead of TensorList - # since TensorList is a ArrayRef in arguments but a vector - # in returns - 'std::vector': 'Tensor[]', - 'IntArrayRef': 'int[]', - 'IntArrayRef?': 'int[]?', - 'ArrayRef?': 'float[]?', - 'Layout': 'Layout', - 'Layout?': 'Layout?', - 'Device': 'Device', - 'Device?': 'Device?', - 'ScalarType': 'ScalarType', - 'ScalarType?': 'ScalarType?', - 'int64_t': 'int', - 'int64_t?': 'int?', - 'double': 'float', - 'double?': 'float?', - 'bool': 'bool', - 'bool?': 'bool?', - 'Generator': 'Generator?', - 'Generator?': 'Generator?', -} - - -def optional_type_of(arg, typ): - # optional type special handling for Tensor?[] and Tensor - # types that is missing a optional annotation - if arg.get('is_nullable') and '?' not in typ: - if typ == 'TensorList' or typ == 'Tensor[]': - typ = 'Tensor?[]' - else: - typ = '{}?'.format(typ) - return typ - - -def annotated_type_of(arg, typ): - anno = arg.get('annotation') - if anno: - typ = '{}({})'.format(typ, anno) - return typ - - -def jit_type_of(arg): - jit_type = arg.get('jit_type') - if not jit_type: - jit_type = TYPE_MAP[arg['simple_type']] - if is_sized_intlist_arg(arg): - jit_type = 'int[{}]'.format(arg['size']) - jit_type = optional_type_of(arg, jit_type) - jit_type = annotated_type_of(arg, jit_type) - arg['jit_type'] = jit_type - return jit_type - - -# map from aten 'simple_type' to the function that will turn a tensor into -# that type -FROM_IVALUE = { - 'Device': '{}.toDevice()', - 'Device?': '{}.toOptional()', - 'IntArrayRef': '{}.toIntVector()', - 'IntArrayRef?': '{}.toOptionalIntArray()', - 'ArrayRef?': '{}.toOptionalDoubleArray()', - 'Layout': '{}.toLayout()', - 'Layout?': '{}.toOptional()', - 'MemoryFormat': '{}.toMemoryFormat()', - 'MemoryFormat?': '{}.toOptional()', - 'QScheme': '{}.toQScheme()', - 'Scalar': '{}.toScalar()', - 'Scalar?': '{}.toOptional()', - 'ScalarType': '{}.toScalarType()', - 'ScalarType?': '{}.toOptional()', - 'Tensor': '{}.toTensor()', - 'Tensor?': 'toOptionalTensor({})', - 'Tensor?[]': 'toListOfOptionalTensor({})', - 'TensorList': '{}.toTensorVector()', - 'ScalarList': '{}.toScalarVector()', - 'bool': '{}.toBool()', - 'bool?': '{}.toOptional()', - 'double': '{}.toDouble()', - 'double?': '{}.toOptional()', - 'int64_t': '{}.toInt()', - 'int64_t?': '{}.toOptional()', - 'std::string': '{}.toStringRef()', - 'std::string?': '{}.toOptional()', - 'Generator?': '{}.toOptional()', - 'std::array': 'as_bool_array<2>({}.toBoolList())', - 'std::array': 'as_bool_array<3>({}.toBoolList())', - 'std::array': 'as_bool_array<4>({}.toBoolList())', -} - - -def from_ivalue(arg, value): - typ = optional_type_of(arg, arg['simple_type']) - return FROM_IVALUE[typ].format(value) - - -CALL_UNBOXED_KERNEL = CodeTemplate("""\ -auto result_ = 
callUnboxedKernel<${return_type}${formals_types_with_leading_comma}>(unboxedKernel${args_with_leading_comma}); -""") -CALL_NAMESPACE = CodeTemplate("""\ -auto result_ = at::${name}( - ${args} -); -""") -CALL_METHOD = CodeTemplate("""\ -auto result_ = (${first}).${name}( - ${args} -); -""") -CALL_NAMESPACE_WITH_TENSOR_OPTIONS = CodeTemplate("""\ -const auto options = TensorOptions() - .dtype(${dtype}) - .layout(${layout}) - .device(${device}) - .pinned_memory(${pin_memory}); - auto result_ = torch::${name}(${args_with_tensor_options}); -""") -CALL_METHOD_WITH_TENSOR_OPTIONS = CodeTemplate("""\ -const auto options = TensorOptions() - .dtype(${dtype}) - .layout(${layout}) - .device(${device}) - .pinned_memory(${pin_memory}); -auto result_ = (${first}).${name}(${args_with_tensor_options}); -""") - -CONSTRUCTOR = CodeTemplate("""\ -[](OperatorKernel* unboxedKernel, const OperatorHandle&, Stack* stack) { - using namespace at; - ${lvalues} - ${call} - drop(*stack, ${num_inputs}); - pack(*stack, std::move(result_)); -} -""") - -OPERATOR = CodeTemplate("""\ - .op("${signature}", - ${op}) -""") - - -disallowed_types = { - 'Storage', - 'DimnameList?', - 'ConstQuantizerPtr', - 'Dimname', - 'DimnameList', -} - -default_only_types = {'Generator'} - - -def is_jit_arg(i, arg): - simple_type = arg['simple_type'] - if simple_type in disallowed_types: - return False - if simple_type in default_only_types and 'default' not in arg: - return False - if simple_type == 'Type': - return False - return True - - -def is_jit_op(decl): - # We currently don't support functions that return nothing - assert all(r['type'] != 'void' for r in decl['returns']) - if len(decl['returns']) == 0: - return False - - arguments = decl['arguments'] - - # there must be a single out variant - if is_out_variant(decl) and sum([not not arg.get('output') for arg in arguments]) > 1: - return False - - return (('namespace' in decl['method_of'] or 'Tensor' in decl['method_of']) and - all(is_jit_arg(i, arg) for i, arg in enumerate(decl['arguments'])) and - all(is_jit_arg(i, arg) for i, arg in enumerate(decl['returns']))) - - -def is_tensor_arg(arg): - return arg['simple_type'] in {'Tensor', 'TensorList'} - - -def is_sized_intlist_arg(arg): - """Returns True for arguments declared as IntArrayRef[k], but False for IntArrayRef.""" - return (arg['simple_type'] == 'IntArrayRef') and ('size' in arg) - - -def base_name(decl): - name = decl['name'] - return name[:-1] if decl.get('inplace', False) else name[:-4] if name.endswith('_out') else name - - -def is_view(decl): - return base_name(decl) in RETURNS_VIEWS_OF_INPUT - - -# Copied from ..autograd.gen_python_functions.SKIP_PYTHON_BINDINGS -BACKWARD_OP_PATTERNS = [ - '.*_backward', - '.*_backward_(out|input|weight|bias)', -] - -def is_backward_op(decl): - for pattern in BACKWARD_OP_PATTERNS: - if re.match('^' + pattern + '$', decl['name']): - return True - return False - - -# for each argument in decl, the location it should appear in the -# jit schema declaration. e.g. 
-# arguments = [x, y, z] # the order in aten -# jit_argument_order = [2, 0, 1] -# aten::my_arg(Tensor y, Tensor z, Tensor x) # the order in schema -# used to move 'out' arguments to the end of the list -def argument_order(decl): - return decl.get('jit_argument_order') or list(range(len(decl['arguments']))) - - -def gen_unboxing_wrappers( - declarations, - out, - template_path, - operator_selector: SelectiveBuilder, - disable_autograd=False, - force_schema_registration=False, -): - GENERATED_UNBOXING_WRAPPERS_CPP = CodeTemplate.from_file(template_path + '/generated_unboxing_wrappers.cpp') - - ops = [] - - def get_invocation(decl, args, num_inputs): - - # because the arg list can get lengthy we put them on a separate line - def pack_arguments(args): - return ',\n'.join(args) - is_namespace_function = 'namespace' in decl['method_of'] - tensor_options_arg_index = decl.get('tensor_options_arg_index', None) - if tensor_options_arg_index is not None: - dtype = args[tensor_options_arg_index] - layout = args[tensor_options_arg_index + 1] - device = args[tensor_options_arg_index + 2] - pin_memory = args[tensor_options_arg_index + 3] - args_with_tensor_options = args[:tensor_options_arg_index] + \ - ['options'] + args[(tensor_options_arg_index + 4):] - if is_namespace_function: - return CALL_NAMESPACE_WITH_TENSOR_OPTIONS.substitute( - name=decl['name'], dtype=dtype, layout=layout, - device=device, pin_memory=pin_memory, - args_with_tensor_options=pack_arguments(args_with_tensor_options)) - else: - return CALL_METHOD_WITH_TENSOR_OPTIONS.substitute( - name=decl['name'], dtype=dtype, layout=layout, - device=device, pin_memory=pin_memory, - args_with_tensor_options=pack_arguments(args_with_tensor_options[1:]), - first=args_with_tensor_options[0], num_inputs=num_inputs) - elif decl['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper': - if len(decl['returns']) == 0: - return_type = "void" - elif len(decl['returns']) == 1: - return_type = decl['returns'][0]['type'] - else: - return_type = "std::tuple<{}>".format(", ".join([r['type'] for r in decl['returns']])) - for a in decl['arguments']: - if 'type' not in a: - raise Exception(decl) - argument_types_with_leading_comma = ", ".join([a['type'] for a in decl['arguments']]) - if argument_types_with_leading_comma != "": - argument_types_with_leading_comma = ", " + argument_types_with_leading_comma - args_with_leading_comma = pack_arguments(args) - if args_with_leading_comma != "": - args_with_leading_comma = ", " + args_with_leading_comma - return CALL_UNBOXED_KERNEL.substitute(name=decl['name'], - args_with_leading_comma=args_with_leading_comma, - num_inputs=num_inputs, - return_type=return_type, - formals_types_with_leading_comma=argument_types_with_leading_comma) - else: - assert decl['use_c10_dispatcher'] in ['full', 'hacky_wrapper_for_legacy_signatures'] - if is_namespace_function: - return CALL_NAMESPACE.substitute(name=decl['name'], - args=pack_arguments(args), - num_inputs=num_inputs) - else: - return CALL_METHOD.substitute( - name=decl['name'], first=args[0], - args=pack_arguments(args[1:]), num_inputs=num_inputs) - - def requires_lvalue(arg): - jit_type = jit_type_of(arg) - return jit_type.startswith('Tensor') and '!' 
in jit_type - - def emit_decl_variant(decl): - if ('emit_dummy_placeholder' in decl): - return "DUMMY_OPERATION" - kw_assignments = [] - - # mutable arguments in aten are passed as non const references - # these must be lvalues, so we have to put them in variables - # before calling the function - lvalues = [] - - arguments = [] - num_inputs = len(decl['arguments']) - op_capture = '' - order = argument_order(decl) - for i, arg in enumerate(decl['arguments']): - value = from_ivalue(arg, '(std::move(peek(*stack, {}, {})))'.format(order[i], num_inputs)) - if requires_lvalue(arg): - lvalues.append('auto {} = {};\n'.format(arg['name'], value)) - value = arg['name'] - arguments.append(value) - - call = get_invocation(decl, arguments, num_inputs) - - returns = decl['returns'] - - if decl['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper': - constructor = CONSTRUCTOR.substitute(name=decl['name'], - call=call, - kw_assignments=kw_assignments, - num_inputs=num_inputs, - op_capture=op_capture, - lvalues=lvalues) - else: - assert decl['use_c10_dispatcher'] in ['full', 'hacky_wrapper_for_legacy_signatures'] - - return constructor - - def filter_decls(jit_decls, disable_autograd, operator_selector: SelectiveBuilder, force_schema_registration): - result = [] - for decl in jit_decls: - if disable_autograd and is_backward_op(decl): - continue - op_name = op_name_with_overload(decl) - if operator_selector.is_root_operator(op_name): - result.append(decl) - else: - if force_schema_registration: - decl['emit_dummy_placeholder'] = True - result.append(decl) - - return result - - # This function declares an order on declarations. This is necessary because - # there is some ambiguity in the choice of overload: if an argument is overloaded - # to accept both Scalar and Tensor, the schema with the Tensor should come first - # TODO: this can (probably) be removed when we remove the implicit conversion - # from Tensor -> Number. - def sort_decls(jit_decls): - def declkey(decl): - # key = sum_{i < len(args)} {1 if arg is tensor else 2} * (3 ** i) - # This is a ternary encoding where - # 0: No argument at this position - # 1: Tensor argument at this position - # 2: Some other argument at this position. - args = decl['arguments'] - result = 0 - for i in range(len(args)): - result += (3 ** i) * (1 if args[i]['simple_type'] == 'Tensor' else 2) - return result - - # NB: itertools.groupby requires the list be sorted. - sorted_decls = sorted(jit_decls, key=lambda decl: decl['name']) - grouped_decls = [list(g) for _, g in - groupby(sorted_decls, key=lambda decl: decl['name'])] - return [sorted(g, key=declkey) for g in grouped_decls] - - aten_decls = load_aten_declarations(declarations) - jit_decls = [d for d in aten_decls if is_jit_op(d)] - - # add arguments dtype and device for functions like zeros - def expand_options(decl, i, arg): - if arg['simple_type'] != 'TensorOptions': - return [arg] - assert decl.get('tensor_options_arg_index') != i - decl['tensor_options_arg_index'] = i - tensor_options_expansion = [ - # XXX - until we actually have first-class interpreter types for these - # concepts, the default values to be encoded in Tensors - # If you change this, you also need to update [TensorOptions in script] - # in the tracer code. 
- # dtype is specified as an int64_t of at::ScalarType - {'name': 'dtype', 'simple_type': 'ScalarType'}, - # layout is specified as an int64_t of at::Layout - {'name': 'layout', 'simple_type': 'Layout'}, - # device is specified as an IntArrayRef of { at::Device::Type, device_id } - {'name': 'device', 'simple_type': 'Device'}, - # pin_memory is specified as a boolean - {'name': 'pin_memory', 'simple_type': 'bool', 'default': False}, - ] - # TODO: Don't repack this into TensorOptions. Needs various changes in downstream code. - if 'default' in arg: - for el in tensor_options_expansion: - el['simple_type'] += '?' - el['default'] = 'None' - if 'default' in arg and arg['default'] == 'at::kLong': - tensor_options_expansion[0]['default'] = 'long' - if 'kwarg_only' in arg and arg['kwarg_only']: - for el in tensor_options_expansion: - el['kwarg_only'] = True - return tensor_options_expansion - - additional_jit_decls = [] - - for decl in jit_decls: - decl['arguments'] = [a for i, arg in enumerate(decl['arguments']) for a in expand_options(decl, i, arg)] - if is_out_variant(decl): - reorder_out_args(decl) - - jit_decls.extend(additional_jit_decls) - jit_decls = filter_decls(jit_decls, disable_autograd, operator_selector, force_schema_registration) - - # generation is deterministic - jit_decl_groups = sort_decls(jit_decls) - - # NOTE: see Note [Sharded File] at the top of the generated_unboxing_wrappers.cpp - # template regarding sharding of the generated files. - # - # If you edit the number of shards here, you will also have to - # modify generate_code.py, torch/CMakeLists.txt, and the TARGETS - # files. - num_shards = 3 - shards = [[] for _ in range(num_shards)] - - # ops are assigned arbitrarily but stably to a file based on hash - for group in jit_decl_groups: - x = sum(ord(c) for c in group[0]['name']) % num_shards - for decl in group: - if decl['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper': - shards[x].append(OPERATOR.substitute(signature=decl['schema_string'], - op=emit_decl_variant(decl))) - else: - assert decl['use_c10_dispatcher'] in ['full', 'hacky_wrapper_for_legacy_signatures'] - - for i, shard in enumerate(shards): - env = { - 'constructors': shard, - } - write(out, 'generated_unboxing_wrappers_%d.cpp' % i, GENERATED_UNBOXING_WRAPPERS_CPP, env) - - all_shards = reduce( - lambda lhs, rhs: lhs + rhs, - shards, - ) - env = { - 'constructors': all_shards, - } - write(out, 'generated_unboxing_wrappers_everything.cpp', GENERATED_UNBOXING_WRAPPERS_CPP, env) - - -default_map = {'{}': 'None', 'nullptr': 'None', 'c10::nullopt': 'None'} - - -def reorder_out_args(decl): - first_arg = decl['arguments'][0] - assert(first_arg['output']) - # the output variant must go at the end - # note: this is an annoying side effect of using a single '*' - # to denote kwarg_only - nargs = len(decl['arguments']) - decl['jit_argument_order'] = [nargs - 1] + list(range(nargs - 1)) - - -def is_kwarg_only(a): - return a.get('kwarg_only') or a.get('output') - -def main(): - parser = argparse.ArgumentParser( - description='Generate JIT op dispatch') - parser.add_argument('declarations', metavar='DECL', - help='path to Declarations.yaml') - parser.add_argument('out', metavar='OUT', - help='path to output directory') - parser.add_argument('template_path', metavar='TEMPLATE_PATH', - help='path to templates directory') - args = parser.parse_args() - gen_unboxing_wrappers(args.declarations, args.out, args.template_path, - SelectiveBuilder.get_nop_selector()) - - -if __name__ == '__main__': - main() diff --git 
a/tools/jit/templates/generated_unboxing_wrappers.cpp b/tools/jit/templates/generated_unboxing_wrappers.cpp deleted file mode 100644 index cd8d12f6b15e..000000000000 --- a/tools/jit/templates/generated_unboxing_wrappers.cpp +++ /dev/null @@ -1,132 +0,0 @@ -#include "torch/csrc/jit/runtime/operator.h" -#include "torch/csrc/jit/runtime/custom_operator.h" -#include "torch/csrc/jit/frontend/function_schema_parser.h" - -#include "torch/csrc/autograd/profiler.h" -#include "torch/csrc/autograd/generated/variable_factories.h" - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// ${generated_comment} - -// This file contains manual unboxing wrappers for ops that aren't -// use_c10_dispatcher: full because the templated unboxing logic in c10 doesn't -// support them yet. The ultimate goal is to make all ops use the templated -// unboxing and delete this codegen file. - -// NOTE [Sharded File]: This file is generated in a sharded fashion to speed up -// incremental rebuilds. See the comment at the top of -// templates/VariableType.cpp for an analogous, in-depth discussion. - -namespace torch { namespace jit { - -using autograd::Variable; -using autograd::variable_list; -using at::Scalar; -using at::ScalarType; -using at::Tensor; -using at::TensorOptions; -using at::DeviceGuard; -using at::MemoryFormat; - -using ::c10::fmap; -using ::c10::filter; -using c10::OperatorKernel; -using c10::OperatorHandle; -using c10::KernelFunction; -using c10::RegistrationHandleRAII; -using c10::Stack; - -namespace { - -template -Return callUnboxedKernel(OperatorKernel* unboxedKernel, Args... args) { - using FuncType = Return (Args...); - auto* typedUnboxedKernel = static_cast*>(unboxedKernel); - return (*typedUnboxedKernel)(std::forward(args)...); -} - -// TODO: remove the toOptionalTensor and toListOfOptionalTensor -// when we remove the undefined tensor semantic from TH - -// XXX: This function is to specialize IValue for tensor type in -// interpreter, it should only be used in this file -at::Tensor toOptionalTensor(const IValue& v) { - if (v.isNone()) { - return at::Tensor(); - } - return v.toTensor(); -} - -// XXX: This function is to specialize IValue for list of optional -// tensor type in interpreter, it should only be used in this file -std::vector toListOfOptionalTensor(const IValue& v) { - // v is a list of optional tensor, loop over as generic list - auto vlist = v.toListRef(); - std::vector res; - - for (const IValue &v: vlist) { - res.emplace_back(toOptionalTensor(v)); - } - return res; -} - -template -std::array as_bool_array(const c10::List& list) { - std::array res; - AT_ASSERT(list.size() == N); - std::copy(list.begin(), list.end(), res.begin()); - return res; -} - -KernelFunction::InternalBoxedKernelFunction *DUMMY_OPERATION = - [](c10::OperatorKernel *, const c10::OperatorHandle &, std::vector *) -> void { - TORCH_CHECK(false, "Operator has been stripped in the custom build.") - }; - -class Registerer final { -public: - Registerer&& op(const std::string& schemaStr, KernelFunction::InternalBoxedKernelFunction* boxed_kernel_wrapper) && { - static auto& dispatcher = c10::Dispatcher::singleton(); - auto schema = parseSchema(schemaStr); - schema.setAliasAnalysis(AliasAnalysisKind::FROM_SCHEMA); - c10::OperatorName name = schema.operator_name(); - RegistrationHandleRAII registration = dispatcher.registerName(name); - auto op = dispatcher.findOp(name).value(); - 
registrationHandles_.push_back(std::move(registration)); - dispatcher.setManuallyBoxedKernelFor_(op, boxed_kernel_wrapper); - return std::move(*this); - } - - Registerer() = default; - Registerer(const Registerer&) = delete; - Registerer& operator=(const Registerer&) = delete; - Registerer(Registerer&&) noexcept = default; - Registerer& operator=(Registerer&&) noexcept = default; -private: - std::vector registrationHandles_; -}; - -static auto registry = Registerer() - // Generated operators - ${constructors} - ; - -} // anon namespace - - -}} // namespace torch::jit diff --git a/tools/nightly.py b/tools/nightly.py index 1fecc67e72f3..55a90e3fd9fb 100755 --- a/tools/nightly.py +++ b/tools/nightly.py @@ -322,10 +322,10 @@ def pytorch_install(url): def _site_packages(dirname, platform): if platform.startswith("win"): - os.path.join(pytdir.name, "Lib", "site-packages") + template = os.path.join(dirname, "Lib", "site-packages") else: template = os.path.join(dirname, "lib", "python*.*", "site-packages") - spdir = glob.glob(template)[0] + spdir = glob.glob(template)[0] return spdir diff --git a/tools/setup_helpers/generate_code.py b/tools/setup_helpers/generate_code.py index 9ca843abc69f..10bbc33c352f 100644 --- a/tools/setup_helpers/generate_code.py +++ b/tools/setup_helpers/generate_code.py @@ -30,7 +30,6 @@ def generate_code(ninja_global=None, operator_selector=None): from tools.autograd.gen_autograd import gen_autograd, gen_autograd_python from tools.autograd.gen_annotated_fn_args import gen_annotated - from tools.jit.gen_unboxing_wrappers import gen_unboxing_wrappers from tools.codegen.selective_build.selector import SelectiveBuilder @@ -70,13 +69,6 @@ def generate_code(ninja_global=None, disable_autograd=disable_autograd, operator_selector=operator_selector, ) - gen_unboxing_wrappers( - declarations_path or DECLARATIONS_PATH, - jit_gen_dir, - tools_jit_templates, - disable_autograd=disable_autograd, - operator_selector=operator_selector, - force_schema_registration=force_schema_registration) if subset == "python" or not subset: gen_annotated( diff --git a/tools/shared/module_loader.py b/tools/shared/module_loader.py index c24a19678c39..51c57aa161c9 100644 --- a/tools/shared/module_loader.py +++ b/tools/shared/module_loader.py @@ -1,5 +1,3 @@ - - def import_module(name, path): import importlib.util spec = importlib.util.spec_from_file_location(name, path) diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi index cd9a0f7d46a9..5ac2c0a8315d 100644 --- a/torch/_C/_distributed_c10d.pyi +++ b/torch/_C/_distributed_c10d.pyi @@ -77,6 +77,7 @@ class ReduceScatterOptions: timeout: timedelta class BarrierOptions: + device_ids: List[int] timeout: timedelta class AllToAllOptions: diff --git a/torch/_six.py b/torch/_six.py index c53feed94cce..00f9fa6b7f95 100644 --- a/torch/_six.py +++ b/torch/_six.py @@ -33,7 +33,6 @@ FileNotFoundError = builtins.FileNotFoundError StringIO = io.StringIO container_abcs = collections.abc -PY3 = sys.version_info[0] == 3 PY37 = sys.version_info[0] == 3 and sys.version_info[1] >= 7 def with_metaclass(meta: type, *bases) -> type: diff --git a/torch/_vmap_internals.py b/torch/_vmap_internals.py index 67e2ec1a2cd9..26f32cfd9ffd 100644 --- a/torch/_vmap_internals.py +++ b/torch/_vmap_internals.py @@ -137,7 +137,7 @@ def _get_name(func: Callable): # Not all callables have __name__, in fact, only static functions/methods do. # A callable created via functools.partial or an nn.Module, to name some # examples, don't have a __name__. 
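The comment above motivates the fix just below: callables such as functools.partial objects and nn.Module instances carry no __name__, so the helper falls back to repr(func). A standalone sketch of that fallback idea (the local function name here is illustrative, not part of the codebase):

    import functools

    def callable_name(func) -> str:
        # Prefer __name__ when it exists, otherwise fall back to repr().
        return getattr(func, "__name__", repr(func))

    print(callable_name(len))                     # len
    print(callable_name(functools.partial(len)))  # functools.partial(<built-in function len>)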
- fn_name = repr(func) + return repr(func) # vmap(func)(inputs) wraps all Tensor inputs to be batched in BatchedTensors, # sends those into func, and then unwraps the output BatchedTensors. Operations diff --git a/torch/csrc/Generator.cpp b/torch/csrc/Generator.cpp index 55e5abc29ef9..2bc478f36007 100644 --- a/torch/csrc/Generator.cpp +++ b/torch/csrc/Generator.cpp @@ -15,7 +15,6 @@ #include #ifdef USE_CUDA -#include #include #endif @@ -78,45 +77,32 @@ static PyObject * THPGenerator_getState(PyObject *_self, PyObject *noargs) { using namespace torch::autograd; HANDLE_TH_ERRORS - auto self = (THPGenerator*)_self; - Variable var = torch::empty({0}, at::device(at::kCPU).dtype(at::kByte)); - if (self->cdata.device().type() == at::kCPU) { - THByteTensor_getRNGState(self->cdata, (THByteTensor*)(var.unsafeGetTensorImpl())); - } else { -#ifdef USE_CUDA - TORCH_INTERNAL_ASSERT(self->cdata.device().type() == at::kCUDA); - THCRandom_getRNGState(self->cdata, (THByteTensor*)(var.unsafeGetTensorImpl())); -#else - TORCH_INTERNAL_ASSERT(false, "PyTorch not compiled with CUDA"); -#endif - } - return THPVariable_Wrap(std::move(var)); + auto& gen = ((THPGenerator*)_self)->cdata; + + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen.mutex()); + auto state_tensor = gen.get_state(); + + return THPVariable_Wrap(std::move(state_tensor)); END_HANDLE_TH_ERRORS } static PyObject * THPGenerator_setState(PyObject *_self, PyObject *_new_state) { using namespace torch::autograd; - auto self = (THPGenerator*)_self; + HANDLE_TH_ERRORS if (!THPVariable_Check(_new_state)) { throw torch::TypeError("expected a torch.ByteTensor, but got %s", Py_TYPE(_new_state)->tp_name); } - auto& tensor = ((THPVariable*)_new_state)->cdata; - if (tensor.layout() != kStrided || tensor.device().type() != kCPU || tensor.scalar_type() != kByte) { - auto type_name = torch::utils::options_to_string(tensor.options()); - throw torch::TypeError("expected a torch.ByteTensor, but got %s", type_name.c_str()); - } - if (self->cdata.device().type() == at::kCPU) { - THByteTensor_setRNGState(self->cdata, (THByteTensor*)tensor.unsafeGetTensorImpl()); - } else { -#ifdef USE_CUDA - TORCH_INTERNAL_ASSERT(self->cdata.device().type() == at::kCUDA); - THCRandom_setRNGState(self->cdata, (THByteTensor*)tensor.unsafeGetTensorImpl()); -#else - TORCH_INTERNAL_ASSERT(false, "PyTorch not compiled with CUDA"); -#endif - } + auto self = (THPGenerator*)_self; + auto& gen = self->cdata; + auto& new_state_tensor = ((THPVariable*)_new_state)->cdata; + + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen.mutex()); + gen.set_state(new_state_tensor); + Py_INCREF(self); return (PyObject*)self; END_HANDLE_TH_ERRORS diff --git a/torch/csrc/api/include/torch/cuda.h b/torch/csrc/api/include/torch/cuda.h index 5f6f2a9eb8a9..a7e063b90af9 100644 --- a/torch/csrc/api/include/torch/cuda.h +++ b/torch/csrc/api/include/torch/cuda.h @@ -23,5 +23,8 @@ void TORCH_API manual_seed(uint64_t seed); /// Sets the seed for all available GPUs. void TORCH_API manual_seed_all(uint64_t seed); +/// Waits for all kernels in all streams on a CUDA device to complete. 
+void TORCH_API synchronize(int64_t device_index = -1); + } // namespace cuda } // namespace torch diff --git a/torch/csrc/api/src/cuda.cpp b/torch/csrc/api/src/cuda.cpp index d40cd8611c42..b8f3ffa0ee0a 100644 --- a/torch/csrc/api/src/cuda.cpp +++ b/torch/csrc/api/src/cuda.cpp @@ -1,6 +1,7 @@ #include #include +#include #include @@ -49,5 +50,13 @@ void manual_seed_all(uint64_t seed) { } } +void synchronize(int64_t device_index) { + TORCH_CHECK(is_available(), "No CUDA GPUs are available"); + int64_t num_gpus = cuda::device_count(); + TORCH_CHECK(device_index == -1 || device_index < num_gpus, + "Device index out of range: ", device_index); + at::detail::getCUDAHooks().deviceSynchronize(device_index); +} + } // namespace cuda } // namespace torch diff --git a/torch/csrc/autograd/VariableTypeManual.cpp b/torch/csrc/autograd/VariableTypeManual.cpp index d1f15fff3669..f6c3f23cd0f7 100644 --- a/torch/csrc/autograd/VariableTypeManual.cpp +++ b/torch/csrc/autograd/VariableTypeManual.cpp @@ -387,14 +387,6 @@ TORCH_LIBRARY_IMPL(aten, Autograd, m) { m.impl("detach", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::detach))); m.impl("detach_", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::detach_))); m.impl("copy_", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::copy_))); - // For backward() and requires_grad_(), we need the DefaultBackend kernel, but we also need the Autograd backend - // kernel, because when called with a VariableTensorId tensor, it goes through the variable fallback kernel, - // which calls callBoxed(), which doesn't support optional tensor arguments yet and backward() has an optional - // tensor argument. - // TODO Once callBoxed() supports optional tensor arguments, we can enable `use_c10_dispatcher: full` for backward() - // and requires_grad_(), then remove the backend Autograd kernel here, only leaving the Math kernel. 
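The new torch::cuda::synchronize in the C++ frontend mirrors the existing Python binding; a minimal sketch of the equivalent call from Python, assuming at least one CUDA device is available:

    import torch

    if torch.cuda.is_available():
        a = torch.randn(1024, 1024, device="cuda")
        b = a @ a                   # the kernel is launched asynchronously on the current stream
        torch.cuda.synchronize()    # block until every kernel on the current device has finished
        torch.cuda.synchronize(0)   # or wait on an explicit device index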
- m.impl("_backward", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::_backward))); - m.impl("requires_grad_", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::requires_grad_))); m.impl("_fw_primal", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::_fw_primal))); } diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index b31d44a1d295..0d4250eddd13 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -345,6 +345,7 @@ They are used in specifying strategies for reduction collectives, e.g., py::class_<::c10d::BarrierOptions>(module, "BarrierOptions") .def(py::init<>()) + .def_readwrite("device_ids", &::c10d::BarrierOptions::device_ids) .def_readwrite("timeout", &::c10d::BarrierOptions::timeout); py::class_<::c10d::AllToAllOptions>(module, "AllToAllOptions") @@ -1259,11 +1260,25 @@ static const auto TCPStoreTorchBind = .def(torch::init([](const std::string& host_name, int64_t port, int64_t world_size, - bool is_master) { + bool is_master, + int64_t timeout) { + auto timeout_miliseconds = std::chrono::milliseconds(timeout); return c10::make_intrusive<::c10d::TCPStore>( - host_name, port, world_size, is_master); + host_name, port, world_size, is_master, timeout_miliseconds); })); +// TODO: This should really take Store as constructor argument instead of +// TCPStore, but the fact that TorchScript does not support polymorphism +// forced us to cast in C++ instead of automatic casting +static const auto PrefixStoreTorchBind = + torch::class_<::c10d::PrefixStore>("dist_c10d", "PrefixStore") + .def(torch::init([](const std::string& prefix, + const c10::intrusive_ptr<::c10d::TCPStore>& store) { + return c10::make_intrusive<::c10d::PrefixStore>( + prefix, store); + })); + + // Torchbind the ProcessGroup to make it available in TorchScript static const auto ProcessGroupWorkTorchBind = torch::class_<::c10d::ProcessGroup::Work>("dist_c10d", "Work") @@ -1623,7 +1638,14 @@ static const auto ProcessGroupNCCLTorchBind = outputSplitSizes, inputSplitSizes, ::c10d::AllToAllOptions()); - }); + + }) + .def("size", [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self) { + return (int64_t) self->getSize(); + }) + .def("rank", [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self) { + return (int64_t) self->getRank(); + }); #endif static const auto DistributedC10dFrontendTorchBind = diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp index f1a0a634727a..5bddc510fe56 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp @@ -209,7 +209,7 @@ InputsIdLookup::IdLookupReturn InputsIdLookup::lookupId( std::stringstream encoded_inputs; for (const auto& input : inputs) { if (input.isTensor()) { - auto input_tensor = input.toTensor(); + auto& input_tensor = input.toTensor(); encoded_inputs << ";"; auto sep = ""; diff --git a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp index 4e76dc23e55d..4f4aa0d1536b 100644 --- a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp +++ b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp @@ -45,11 +45,17 @@ constexpr int so_suffix_len = 3; constexpr int cpp_suffix_len = 4; #endif +intptr_t run(const std::string& cmd); + static bool programExists(const std::string& program) { TemplateEnv env; env.s("program", program); std::string cmd = format(check_exists_string, env); +#ifdef _MSC_VER + return 
(run(cmd.c_str()) == 0); +#else return (system(cmd.c_str()) == 0); +#endif } #ifdef _MSC_VER diff --git a/torch/csrc/jit/frontend/tracer.cpp b/torch/csrc/jit/frontend/tracer.cpp index 1bab391bd393..0c88371399de 100644 --- a/torch/csrc/jit/frontend/tracer.cpp +++ b/torch/csrc/jit/frontend/tracer.cpp @@ -137,7 +137,7 @@ Value* TracingState::getValue(const IValue& var) { return graph->insertNode(dict_node)->output(); } if (var.isTensor()) { - auto ten = var.toTensor(); + auto& ten = var.toTensor(); if (!ten.defined()) { Node* n = graph->createNone(); return graph->insertNode(n)->output(); @@ -237,7 +237,7 @@ bool TracingState::hasValue(const IValue& var) const { Value* TracingState::getOutput(const IValue& iv, size_t i) { bool tracing_mode_strict = getTracingState()->strict; if (iv.isTensor()) { - at::Tensor var = iv.toTensor(); + const at::Tensor& var = iv.toTensor(); if (!var.defined()) { Node* n = graph->createNone(); return graph->insertNode(n)->output(); @@ -506,7 +506,7 @@ void setValueTrace(const IValue& v, Value* value) { } void TracingState::setValue(const IValue& v, Value* value) { if (v.isTensor()) { - auto var = v.toTensor(); + auto& var = v.toTensor(); AT_ASSERT(var.defined()); env_stack.back()[v] = value; } else if (v.isTensorList()) { diff --git a/torch/csrc/jit/mobile/interpreter.cpp b/torch/csrc/jit/mobile/interpreter.cpp index 031c21474618..681eddfaa832 100644 --- a/torch/csrc/jit/mobile/interpreter.cpp +++ b/torch/csrc/jit/mobile/interpreter.cpp @@ -148,7 +148,7 @@ bool InterpreterState::run(Stack& stack) { case RET: return false; case LIST_CONSTRUCT: { - auto type = code_->types_[inst.X]->expect(); + const auto& type = code_->types_[inst.X]->expectRef(); listConstruct(stack, type, inst.N); ++pc; } break; diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp index da9d551a6c88..75be7e86acab 100644 --- a/torch/csrc/jit/passes/constant_propagation.cpp +++ b/torch/csrc/jit/passes/constant_propagation.cpp @@ -45,7 +45,9 @@ c10::optional> runNodeIfInputsAreConstant( } break; case prim::ListConstruct: { listConstruct( - stack, n->output()->type()->expect(), n->inputs().size()); + stack, + n->output()->type()->expectRef(), + n->inputs().size()); } break; case prim::DictConstruct: { dictConstruct( diff --git a/torch/csrc/jit/passes/freeze_module.cpp b/torch/csrc/jit/passes/freeze_module.cpp index 2778c7712f23..f66f54eeb567 100644 --- a/torch/csrc/jit/passes/freeze_module.cpp +++ b/torch/csrc/jit/passes/freeze_module.cpp @@ -289,11 +289,11 @@ class AttributePropagator { IValue overrideGradient(IValue attr) { if (attr.isTensor()) { - auto t = attr.toTensor(); + auto& t = attr.toTensor(); if (t.requires_grad()) { - t = t.detach(); - t.set_requires_grad(false); - attr = IValue(t); + auto detached = t.detach(); + detached.set_requires_grad(false); + attr = IValue(std::move(detached)); } } else if (attr.isTuple()) { auto tuple = std::move(attr).toTuple(); diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index 73361f8f3415..166238cebe17 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -15,6 +15,7 @@ #include #include +// NOLINTNEXTLINE C10_DEFINE_bool( torch_jit_disable_cat, false, @@ -126,7 +127,7 @@ bool isSupported(Node* node) { "aten::round(Tensor self) -> Tensor", "aten::trunc(Tensor self) -> Tensor", "aten::threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor", - "aten::masked_fill.Scalar(Tensor self, Tensor 
mask, Scalar value) -> Tensor", + // "aten::masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor", // "aten::masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor", TODO: requires 0-dim Tensor "aten::remainder.Scalar(Tensor self, Scalar other) -> Tensor", "aten::remainder.Tensor(Tensor self, Tensor other) -> Tensor", diff --git a/torch/csrc/jit/runtime/argument_spec.h b/torch/csrc/jit/runtime/argument_spec.h index 401933c6d67e..a0e60e879146 100644 --- a/torch/csrc/jit/runtime/argument_spec.h +++ b/torch/csrc/jit/runtime/argument_spec.h @@ -237,7 +237,7 @@ struct CompleteArgumentSpec { for (int32_t i = 0; i < num_inputs; i++) { if (!inputs[i].isTensor()) continue; - auto tensor = inputs[i].toTensor(); + auto& tensor = inputs[i].toTensor(); all_dims += tensor.defined() ? tensor.ndimension() : 0; } // allocate enough room for all TensorPODs and dimensions diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index 24ca9dbf9793..7d588b6d96e7 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -1418,7 +1418,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { // Check every input's shape against profiled (expected) shape. for (i = 0; i < num_inputs; i++) { auto& input = peek(stack, i, num_inputs); - auto t = input.toTensor(); + auto& t = input.toTensor(); const TypePtr& expected = frame.function->type_table_[inst.X + i]; auto expected_type = expected->cast(); if (t.defined() && !expected_type->matchTensor(t)) { @@ -1439,7 +1439,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { // so it's safe to pass this guard check push(stack, true); } else { - auto t = stack.back().toTensor(); + auto& t = stack.back().toTensor(); const TypePtr& expected = frame.function->type_table_[inst.X]; auto expected_type = expected->cast(); if (t.defined() && @@ -1495,7 +1495,8 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { ++frame.pc; } break; case LIST_CONSTRUCT: { - auto type = frame.function->type_table_[inst.X]->expect(); + const auto& type = + frame.function->type_table_[inst.X]->expectRef(); listConstruct(stack, type, inst.N); ++frame.pc; } break; diff --git a/torch/csrc/jit/runtime/profiling_record.cpp b/torch/csrc/jit/runtime/profiling_record.cpp index 8d276dd58b50..d233f089f187 100644 --- a/torch/csrc/jit/runtime/profiling_record.cpp +++ b/torch/csrc/jit/runtime/profiling_record.cpp @@ -165,7 +165,7 @@ void ProfilingRecord::insertShapeProfile(Node* n, size_t offset) { if (v.isTensor()) { std::lock_guard lock(this->mutex_); auto& profiled_types = profiled_types_per_frame_[frame_id]; - auto t = v.toTensor(); + auto& t = v.toTensor(); if (t.defined()) { auto pttp = tensorTypeInCurrentExecutionContext(t); GRAPH_DEBUG( diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 5c118f513565..4d66c6382c2d 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -79,13 +79,13 @@ struct static_add final : public at::native::structured_add_out { REGISTER_OPERATOR_FUNCTOR(aten::add, aten_add, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); - auto in1_t = p_node->Input(1, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); + auto& in1_t = p_node->Input(1, reg).toTensor(); auto in2_s = p_node->Input(2, reg).toScalar(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = 
create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); static_add op{out_t}; op.meta(in0_t, in1_t, in2_s); op.impl(in0_t, in1_t, in2_s, out_t); @@ -94,12 +94,12 @@ REGISTER_OPERATOR_FUNCTOR(aten::add, aten_add, [](Node* n) -> SROperator { REGISTER_OPERATOR_FUNCTOR(aten::mul, aten_mul, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); - auto in1_t = p_node->Input(1, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); + auto& in1_t = p_node->Input(1, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::mul_out(out_t, in0_t, in1_t); }; @@ -107,15 +107,15 @@ REGISTER_OPERATOR_FUNCTOR(aten::mul, aten_mul, [](Node* n) -> SROperator { REGISTER_OPERATOR_FUNCTOR(aten::addmm, aten_addmm, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); - auto in1_t = p_node->Input(1, reg).toTensor(); - auto in2_t = p_node->Input(2, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); + auto& in1_t = p_node->Input(1, reg).toTensor(); + auto& in2_t = p_node->Input(2, reg).toTensor(); auto in3_s = p_node->Input(3, reg).toScalar(); auto in4_s = p_node->Input(4, reg).toScalar(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::addmm_cpu_out(out_t, in0_t, in1_t, in2_t, in3_s, in4_s); }; @@ -123,13 +123,13 @@ REGISTER_OPERATOR_FUNCTOR(aten::addmm, aten_addmm, [](Node* n) -> SROperator { REGISTER_OPERATOR_FUNCTOR(aten::clamp, aten_clamp, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_s = p_node->Input(1, reg).toScalar(); auto in2_s = p_node->Input(2, reg).toScalar(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::clamp_out(out_t, in0_t, in1_s, in2_s); }; @@ -137,12 +137,12 @@ REGISTER_OPERATOR_FUNCTOR(aten::clamp, aten_clamp, [](Node* n) -> SROperator { REGISTER_OPERATOR_FUNCTOR(aten::bmm, aten_bmm, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); - auto in1_t = p_node->Input(1, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); + auto& in1_t = p_node->Input(1, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::bmm_out_cpu(out_t, in0_t, in1_t); }; @@ -154,7 +154,7 @@ REGISTER_OPERATOR_FUNCTOR( [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { auto input_size = p_node->input_regs().size(); - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); double in1_d = input_size > 1 ? 
p_node->Input(1, reg).toDouble() : 0; double in2_d = input_size > 2 ? p_node->Input(2, reg).toDouble() : std::numeric_limits::infinity(); @@ -164,7 +164,7 @@ REGISTER_OPERATOR_FUNCTOR( if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::nan_to_num_out(out_t, in0_t, in1_d, in2_d, in3_d); }; @@ -176,18 +176,18 @@ REGISTER_OPERATOR_FUNCTOR(aten::cat, aten_cat, [](Node* n) -> SROperator { if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_tl[0]); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::_cat_out_cpu(out_t, in0_tl, in1_i); }; }); REGISTER_OPERATOR_FUNCTOR(aten::tanh, aten_tanh, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::tanh_out(out_t, in0_t); }; @@ -217,7 +217,7 @@ SROperator aten_stack(Node* n) { for (auto i = 0; i < inputs.size(); i++) { inputs[i] = inputs[i].unsqueeze(dim); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::_cat_out_cpu(out_t, inputs, dim); }; @@ -230,11 +230,11 @@ REGISTER_OPERATOR_FUNCTOR( aten_sigmoid, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::sigmoid_out(out_t, in0_t); }; @@ -247,57 +247,57 @@ REGISTER_OPERATOR_FUNCTOR( if (in1) { auto in1_s = in1->toScalar(); return [=](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); at::native::leaky_relu_out(out_t, in0_t, in1_s); }; } else { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_s = p_node->Input(1, reg).toScalar(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); at::native::leaky_relu_out(out_t, in0_t, in1_s); }; } }); REGISTER_OPERATOR_FUNCTOR(aten::relu, aten_relu, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); 
at::native::threshold_out(out_t, in0_t, 0, 0); }; }); REGISTER_OPERATOR_FUNCTOR(aten::logit, aten_logit, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); double in1_d = p_node->input_regs().size() > 1 ? p_node->Input(1, reg).toDouble() : -1.0; if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::logit_out(out_t, in0_t, in1_d); }; }); REGISTER_OPERATOR_FUNCTOR(aten::clone, aten_clone, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); at::native::resize_as_(out_t, in0_t, c10::nullopt); at::native::copy_(out_t, in0_t, false); }; @@ -317,14 +317,14 @@ std::function&)> getNativeOperation(Node* n) { if (n->kind() == c10::Symbol::fromQualString("aten::transpose")) { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_i = p_node->Input(1, reg).toInt(); auto in2_i = p_node->Input(2, reg).toInt(); p_node->Output(0, reg) = at::native::transpose(in0_t, in1_i, in2_i); }; } else if (n->kind() == c10::Symbol::fromQualString("aten::flatten")) { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_i = p_node->Input(1, reg).toInt(); auto in2_i = p_node->Input(2, reg).toInt(); p_node->Output(0, reg) = at::native::flatten(in0_t, in1_i, in2_i); @@ -361,7 +361,7 @@ getNativeOperation(Node* n) { // run op listConstruct( stack, - p_node->get_node()->output()->type()->expect(), + p_node->get_node()->output()->type()->expectRef(), p_node->input_regs().size()); // put output back p_node->Output(0, reg) = std::move(stack[0]); @@ -386,19 +386,19 @@ getNativeOperation(Node* n) { }; } else if (n->kind() == c10::Symbol::fromQualString("aten::permute")) { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_iv = p_node->Input(1, reg).toIntVector(); p_node->Output(0, reg) = at::native::permute(in0_t, in1_iv); }; } else if (n->kind() == c10::Symbol::fromQualString("aten::reshape")) { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_iv = p_node->Input(1, reg).toIntVector(); p_node->Output(0, reg) = at::native::reshape(in0_t, in1_iv); }; } else if (n->kind() == c10::Symbol::fromQualString("aten::slice")) { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_i = p_node->Input(1, reg).toInt(); auto in2_i = p_node->Input(2, reg).toInt(); auto in3_i = p_node->Input(3, reg).toInt(); @@ -408,13 +408,13 @@ getNativeOperation(Node* n) { }; } else if (n->kind() == c10::Symbol::fromQualString("aten::narrow")) { return [](const ProcessedNode* 
p_node, std::vector& reg) { - auto self = p_node->Input(0, reg).toTensor(); // self + auto& self = p_node->Input(0, reg).toTensor(); // self auto dim = p_node->Input(1, reg).toInt(); // dim int64_t start = 0; if (p_node->Input(2, reg).isScalar()) { start = p_node->Input(2, reg).toInt(); } else { - auto t = p_node->Input(2, reg).toTensor(); + auto& t = p_node->Input(2, reg).toTensor(); start = t.item(); } auto length = p_node->Input(3, reg).toInt(); // length @@ -440,7 +440,7 @@ getNativeOperation(Node* n) { } else if (n->kind() == c10::Symbol::fromQualString("aten::to")) { return [](const ProcessedNode* p_node, std::vector& reg) { DCHECK(p_node->input_regs().size() == 5); - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_i = p_node->Input(1, reg).toScalarType(); auto in2_i = p_node->Input(2, reg).toBool(); auto in3_i = p_node->Input(3, reg).toBool(); diff --git a/torch/csrc/jit/runtime/vararg_functions.cpp b/torch/csrc/jit/runtime/vararg_functions.cpp index 44bc56206eaf..220a5e67f723 100644 --- a/torch/csrc/jit/runtime/vararg_functions.cpp +++ b/torch/csrc/jit/runtime/vararg_functions.cpp @@ -204,16 +204,13 @@ void namedTupleConstruct( c10::ivalue::Tuple::createNamed(std::move(elems), std::move(type))); } -void listConstruct( - Stack& stack, - const at::ListTypePtr& type, - size_t num_inputs) { +void listConstruct(Stack& stack, const at::ListType& type, size_t num_inputs) { // Structuring the implementation this way allows NRVO to avoid // move-constructing vals on its way onto the stack. Moving a List // isn't free. auto makeList = - [](Stack& stack, const at::ListTypePtr& type, size_t num_inputs) { - c10::List vals(type->getElementType()); + [](Stack& stack, const at::ListType& type, size_t num_inputs) { + c10::List vals(type.getElementType()); vals.reserve(num_inputs); for (size_t i = stack.size() - num_inputs; i < stack.size(); ++i) { vals.emplace_back(std::move(stack[i])); diff --git a/torch/csrc/jit/runtime/vararg_functions.h b/torch/csrc/jit/runtime/vararg_functions.h index d6eba7f5d191..e9580411212a 100644 --- a/torch/csrc/jit/runtime/vararg_functions.h +++ b/torch/csrc/jit/runtime/vararg_functions.h @@ -25,7 +25,7 @@ void namedTupleConstruct( void listConstruct( Stack& stack, - const at::ListTypePtr& list_type, + const at::ListType& list_type, size_t num_inputs); void dictConstruct( diff --git a/torch/csrc/jit/serialization/pickler.cpp b/torch/csrc/jit/serialization/pickler.cpp index 6e5c3b927c38..811569485888 100644 --- a/torch/csrc/jit/serialization/pickler.cpp +++ b/torch/csrc/jit/serialization/pickler.cpp @@ -354,7 +354,7 @@ void Pickler::pushLiteralTensor(const IValue& ivalue) { // // The format here is the same one used by `torch.save()`. The code for the // format can be found in `torch/serialization.py`. - auto tensor = ivalue.toTensor(); + auto& tensor = ivalue.toTensor(); bool quantized = tensor.is_quantized(); // The arguments to this function are: // storage, storage_offset, size, stride, requires_grad, backward_hooks diff --git a/torch/csrc/jit/serialization/python_print.cpp b/torch/csrc/jit/serialization/python_print.cpp index e203a03a2e24..18d656c98f32 100644 --- a/torch/csrc/jit/serialization/python_print.cpp +++ b/torch/csrc/jit/serialization/python_print.cpp @@ -309,12 +309,12 @@ struct PythonPrintImpl { // because it doesn't hash any information about the tensors. // We will probably need to optimize this at some point using hashing. 
if (val.isTensor()) { - auto t = val.toTensor(); + auto& t = val.toTensor(); for (size_t i = 0; i < constant_table_.size(); ++i) { if (!constant_table_[i].isTensor()) { continue; } - auto t2 = constant_table_[i].toTensor(); + auto& t2 = constant_table_[i].toTensor(); if (t.options().type_equal(t2.options()) && t.equal(t2)) { return i; } @@ -1339,15 +1339,13 @@ struct PythonPrintImpl { body_ << "\"" << param << "\", "; } body_ << "]\n"; -#ifndef FBCODE_CAFFE2 - // Note: Forward compat gated. TODO: @voznesenskym to remove when ready. + indent(); body_ << "__buffers__ = ["; for (const auto& buffer : buffers) { body_ << "\"" << buffer << "\", "; } body_ << "]\n"; -#endif } for (size_t i = 0; i < numAttrs; i++) { diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp index 3ff5da29fe1f..841e87592be9 100644 --- a/torch/csrc/jit/serialization/unpickler.cpp +++ b/torch/csrc/jit/serialization/unpickler.cpp @@ -632,7 +632,7 @@ void Unpickler::rebuildTensor(bool quantized) { auto tup = pop(stack_).toTuple(); const auto& elements = tup->elements(); size_t idx = 0; - auto storage_tensor = elements.at(idx++).toTensor(); + auto& storage_tensor = elements.at(idx++).toTensor(); int64_t storage_offset = elements.at(idx++).toInt(); std::vector size = tupleToIntList(elements.at(idx++)); std::vector stride = tupleToIntList(elements.at(idx++)); diff --git a/torch/csrc/utils/out_types.cpp b/torch/csrc/utils/out_types.cpp new file mode 100644 index 000000000000..0ceeb43bd1f8 --- /dev/null +++ b/torch/csrc/utils/out_types.cpp @@ -0,0 +1,39 @@ +#include + +namespace torch { +namespace utils { + +// Used by python binding codegen to ensure any TensorOptions arguments are consistent +// with the out tensor's options +void check_out_type_matches(const at::Tensor& result, + at::ScalarType scalarType, bool scalarType_is_none, + c10::optional layout, + const at::Device& device, bool device_is_none) { + if (scalarType_is_none && !layout && device_is_none) { // common case + return; + } + if (!scalarType_is_none && result.scalar_type() != scalarType) { + AT_ERROR( + "dtype ", scalarType, + " does not match dtype of out parameter (", result.scalar_type(), ")"); + } + auto scalarType_arg = scalarType_is_none ? result.scalar_type() : scalarType; + auto device_type_arg = device_is_none ? 
result.device().type() : device.type(); + if (result.scalar_type() != scalarType_arg) { + AT_ERROR( + "scalar type ", scalarType_arg, + " does not match scalar type of out parameter (", result.scalar_type(), ")"); + } + if (layout && result.layout() != *layout) { + AT_ERROR( + "layout ", *layout, + " does not match layout of out parameter (", result.layout(), ")"); + } + if (result.device().type() != device_type_arg) { + AT_ERROR( + "device type ", device_type_arg, + " does not match device type of out parameter (", result.device().type(), ")"); + } +} + +}} diff --git a/torch/csrc/utils/out_types.h b/torch/csrc/utils/out_types.h new file mode 100644 index 000000000000..adc3686a6b97 --- /dev/null +++ b/torch/csrc/utils/out_types.h @@ -0,0 +1,14 @@ +#pragma once + +#include + +namespace torch { +namespace utils { + +TORCH_API void check_out_type_matches( + const at::Tensor& result, + at::ScalarType scalarType, bool scalarType_is_none, + c10::optional layout, + const at::Device& device, bool device_is_none); + +}} diff --git a/torch/csrc/utils/python_compat.h b/torch/csrc/utils/python_compat.h index 28d990c64c42..7e1cb0c4f92d 100644 --- a/torch/csrc/utils/python_compat.h +++ b/torch/csrc/utils/python_compat.h @@ -63,20 +63,5 @@ __PySlice_Unpack(PyObject *_r, (PySlice_Unpack(SLICE, START, STOP, STEP) == 0) #endif -// https://bugsfiles.kde.org/attachment.cgi?id=61186 -#if PY_VERSION_HEX >= 0x03020000 #define THPUtils_parseSlice(SLICE, LEN, START, STOP, LENGTH, STEP) \ (PySlice_GetIndicesEx(SLICE, LEN, START, STOP, LENGTH, STEP) == 0) -#else -#define THPUtils_parseSlice(SLICE, LEN, START, STOP, LENGTH, STEP) \ - (PySlice_GetIndicesEx((PySliceObject*)SLICE, LEN, START, STOP, LENGTH, STEP) == 0) -#endif - -// This function was introduced in Python 3.4 -#if PY_VERSION_HEX < 0x03040000 -inline int -PyGILState_Check() { - PyThreadState * tstate = _PyThreadState_Current; - return tstate && (tstate == PyGILState_GetThisThreadState()); -} -#endif diff --git a/torch/csrc/utils/six.h b/torch/csrc/utils/six.h index 932f0bf61a29..b83e60c77cf3 100644 --- a/torch/csrc/utils/six.h +++ b/torch/csrc/utils/six.h @@ -23,11 +23,7 @@ inline bool isTuple(pybind11::handle input) { if (PyTuple_Check(input.ptr())) { return true; } -#if PY_MAJOR_VERSION == 2 - return isStructSeq(input); -#else return false; -#endif } inline bool isTuple(PyObject* obj) { @@ -40,12 +36,8 @@ inline bool isTuple(PyObject* obj) { // But on Python 2, structseq is not a subtype of tuple, so we need to manually create a // new tuple object from structseq. inline THPObjectPtr maybeAsTuple(PyStructSequence *obj) { -#if PY_MAJOR_VERSION == 2 - return THPObjectPtr(torch::utils::structseq_slice(obj, 0, Py_SIZE(obj))); -#else Py_INCREF(obj); return THPObjectPtr((PyObject *)obj); -#endif } inline THPObjectPtr maybeAsTuple(PyObject *obj) { diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index 8ee83fa81fe7..e59c798a59be 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -153,15 +153,9 @@ def _lazy_init(): # immediately, while we are still guaranteed to have the GIL, because some # of the C calls we make below will release the GIL if _is_in_bad_fork(): - from sys import version_info - if version_info < (3, 4): - msg = ("To use CUDA with multiprocessing, you must use Python " - "3.4+ and the 'spawn' start method") - else: - msg = ("To use CUDA with multiprocessing, you must use the " - "'spawn' start method") raise RuntimeError( - "Cannot re-initialize CUDA in forked subprocess. 
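check_out_type_matches is called from the generated Python bindings of factory-style operators that take both explicit dtype/layout/device arguments and an out= tensor; a hedged sketch of the behaviour it enforces, using torch.arange only as an example of such an operator (the exact error text may differ):

    import torch

    out = torch.empty(4, dtype=torch.float64)
    torch.arange(4, dtype=torch.float64, out=out)      # consistent: fills `out`

    try:
        torch.arange(4, dtype=torch.float32, out=out)  # dtype disagrees with `out`
    except RuntimeError as err:
        print(err)  # reports the dtype mismatch against the out parameter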
" + msg) + "Cannot re-initialize CUDA in forked subprocess. To use CUDA with " + "multiprocessing, you must use the 'spawn' start method") if not hasattr(torch._C, '_cuda_getDeviceCount'): raise AssertionError("Torch not compiled with CUDA enabled") if _cudart is None: @@ -271,6 +265,9 @@ def get_device_name(device: Optional[_device_t] = None) -> str: name. This function is a no-op if this argument is a negative integer. It uses the current device, given by :func:`~torch.cuda.current_device`, if :attr:`device` is ``None`` (default). + + Returns: + str: the name of the device """ return get_device_properties(device).name @@ -293,6 +290,15 @@ def get_device_capability(device: Optional[_device_t] = None) -> Tuple[int, int] def get_device_properties(device: _device_t) -> _CudaDeviceProperties: + r"""Gets the properties of a device. + + Args: + device (torch.device or int or str): device for which to return the + properties of the device. + + Returns: + _CudaDeviceProperties: the properties of the device + """ _lazy_init() # will define _get_device_properties device = _get_device_index(device, optional=True) if device < 0 or device >= device_count(): diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index a8517a4bb394..5b300452f6d3 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1,8 +1,8 @@ +import contextlib +import logging import pickle import torch import warnings -import contextlib -import sys import time from torch._six import string_classes from datetime import timedelta @@ -17,8 +17,8 @@ AllreduceOptions, AllreduceCoalescedOptions, AllToAllOptions, + BarrierOptions, BroadcastOptions, - FileStore, GatherOptions, PrefixStore, ProcessGroup, @@ -27,15 +27,8 @@ ReduceScatterOptions, ScatterOptions, Store, - TCPStore, ) -if sys.platform != 'win32': - from torch._C._distributed_c10d import ( - HashStore, - ) - - _MPI_AVAILABLE = True _NCCL_AVAILABLE = True _GLOO_AVAILABLE = True @@ -191,16 +184,35 @@ def _store_based_barrier(rank, store, timeout): """ store_key = "{}:{}".format(STORE_BASED_BARRIER_PREFIX, _group_count) store.add(store_key, 1) + logging.info('Added key: {} to store for rank: {}'.format(store_key, rank)) # Now wait for all workers to check in with the store. world_size = get_world_size() - worker_count = int(store.get(store_key)) + # Use 'add' instead of 'get' since for some store implementations 'add' + # doesn't work well with 'get'. Ideally the store implementations should + # be fixed, but for backward compatiblity reasons it is risky to change + # the store implementations. Once, we completely migrate away from these + # legacy stores, we can use 'get' here instead. + worker_count = store.add(store_key, 0) start = time.time() + log_time = time.time() while worker_count != world_size: time.sleep(0.01) - worker_count = int(store.get(store_key)) + worker_count = store.add(store_key, 0) + + # Print status periodically to keep track. 
+ if timedelta(seconds=(time.time() - log_time)) > timedelta(seconds=10): + logging.info( + "Waiting in store based barrier to initialize process group for " + "rank: {}, key: {} (world_size={}, worker_count={}, timeout={})".format( + rank, store_key, world_size, worker_count, timeout)) + log_time = time.time() + if timedelta(seconds=(time.time() - start)) > timeout: - raise RuntimeError("Timed out initializing process group") + raise RuntimeError( + "Timed out initializing process group in store based barrier on " + "rank: {}, for key: {} (world_size={}, worker_count={}, timeout={})".format( + rank, store_key, world_size, worker_count, timeout)) def _rank_not_in_group(group: ProcessGroup): """ @@ -504,12 +516,8 @@ def init_process_group(backend, # barrier at the end to ensure that once we return from this method, all # process groups including global variables are updated correctly on all # ranks. - if backend == Backend.MPI or not ( - isinstance(store, TCPStore) or - isinstance(store, FileStore) or - (sys.platform != 'win32' and isinstance(store, HashStore)) - ): - # MPI doesn't have store. + if backend == Backend.MPI: + # MPI backend doesn't use store. barrier() else: # Use store based barrier here since barrier() used a bunch of @@ -2370,8 +2378,11 @@ def all_to_all(output_tensor_list, work.wait() + def barrier(group=GroupMember.WORLD, - async_op=False): + async_op=False, + device_ids=None): + """ Synchronizes all processes. @@ -2382,6 +2393,8 @@ def barrier(group=GroupMember.WORLD, group (ProcessGroup, optional): The process group to work on. If None, the default process group will be used. async_op (bool, optional): Whether this op should be an async op + device_ids ([int], optional): List of device/GPU ids. + Valid only for NCCL backend. Returns: Async work handle, if async_op is set to True. @@ -2390,11 +2403,22 @@ def barrier(group=GroupMember.WORLD, if _rank_not_in_group(group): return + opts = BarrierOptions() + if device_ids is not None: + if get_backend(group) != Backend.NCCL: + raise RuntimeError("Function argument device_ids not supported " + "for the selected backend {}".format(get_backend(group))) + if isinstance(device_ids, list): + opts.device_ids = device_ids + else: + raise RuntimeError("Invalid function argument: " + "device_ids type should be List[int]") + if group is None: default_pg = _get_default_group() - work = default_pg.barrier() + work = default_pg.barrier(opts=opts) else: - work = group.barrier() + work = group.barrier(opts=opts) if async_op: return work @@ -2491,16 +2515,12 @@ def new_group(ranks=None, timeout=default_pg_timeout, backend=None): # barrier at the end to ensure that once we return from this method, all # process groups including global variables are updated correctly on all # ranks. - if backend == Backend.MPI or not ( - isinstance(default_store, TCPStore) or - isinstance(default_store, FileStore) or - (sys.platform != 'win32' and isinstance(default_store, HashStore)) - ): + if backend == Backend.MPI: # MPI doesn't have store. barrier() else: # Use store based barrier here since barrier() used a bunch of # default devices and messes up NCCL internal state. 
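The new device_ids argument is honoured only by the NCCL backend; a minimal sketch of how a rank might use it once the process group is initialized (the usual MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE environment is assumed):

    import torch
    import torch.distributed as dist

    dist.init_process_group(backend="nccl")
    local_rank = dist.get_rank() % torch.cuda.device_count()
    # Tell NCCL which GPU this rank's barrier should run on instead of letting it guess a default.
    dist.barrier(device_ids=[local_rank])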
- _store_based_barrier(group_rank, default_store, timeout) + _store_based_barrier(global_rank, default_store, timeout) return pg diff --git a/torch/distributions/cauchy.py b/torch/distributions/cauchy.py index 50be941e073a..63181a2a6733 100644 --- a/torch/distributions/cauchy.py +++ b/torch/distributions/cauchy.py @@ -69,8 +69,6 @@ def cdf(self, value): return torch.atan((value - self.loc) / self.scale) / math.pi + 0.5 def icdf(self, value): - if self._validate_args: - self._validate_sample(value) return torch.tan(math.pi * (value - 0.5)) * self.scale + self.loc def entropy(self): diff --git a/torch/distributions/constraints.py b/torch/distributions/constraints.py index 630c192ffed0..87d72d52d26b 100644 --- a/torch/distributions/constraints.py +++ b/torch/distributions/constraints.py @@ -3,13 +3,17 @@ - ``constraints.boolean`` - ``constraints.cat`` +- ``constraints.corr_cholesky`` - ``constraints.dependent`` - ``constraints.greater_than(lower_bound)`` +- ``constraints.greater_than_eq(lower_bound)`` - ``constraints.integer_interval(lower_bound, upper_bound)`` - ``constraints.interval(lower_bound, upper_bound)`` +- ``constraints.less_than(upper_bound)`` - ``constraints.lower_cholesky`` - ``constraints.lower_triangular`` - ``constraints.nonnegative_integer`` +- ``constraints.one_hot`` - ``constraints.positive`` - ``constraints.positive_definite`` - ``constraints.positive_integer`` @@ -57,6 +61,8 @@ class Constraint(object): A constraint object represents a region over which a variable is valid, e.g. within which a variable can be optimized. """ + is_discrete = False + def check(self, value): """ Returns a byte tensor of `sample_shape + batch_shape` indicating @@ -103,14 +109,30 @@ class _Boolean(Constraint): """ Constrain to the two values `{0, 1}`. """ + is_discrete = True + def check(self, value): return (value == 0) | (value == 1) +class _OneHot(Constraint): + """ + Constrain to one-hot vectors. + """ + is_discrete = True + + def check(self, value): + is_boolean = (value == 0) | (value == 1) + is_normalized = value.sum(-1).eq(1) + return is_boolean.all(-1) & is_normalized + + class _IntegerInterval(Constraint): """ Constrain to an integer interval `[lower_bound, upper_bound]`. """ + is_discrete = True + def __init__(self, lower_bound, upper_bound): self.lower_bound = lower_bound self.upper_bound = upper_bound @@ -128,6 +150,8 @@ class _IntegerLessThan(Constraint): """ Constrain to an integer interval `(-inf, upper_bound]`. """ + is_discrete = True + def __init__(self, upper_bound): self.upper_bound = upper_bound @@ -144,6 +168,8 @@ class _IntegerGreaterThan(Constraint): """ Constrain to an integer interval `[lower_bound, inf)`. 
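The new one_hot constraint above is what OneHotCategorical samples are now validated against, and argument validation itself (introduced just below) defaults to __debug__ while remaining switchable at runtime; a small sketch:

    import torch
    from torch.distributions import Distribution, OneHotCategorical, constraints

    # Validation follows __debug__ by default (skipped under `python -O`); it can be set explicitly,
    # e.g. once a model is known to be correct.
    Distribution.set_default_validate_args(False)

    d = OneHotCategorical(probs=torch.tensor([0.2, 0.3, 0.5]))
    sample = d.sample()
    print(constraints.one_hot.check(sample))  # tensor(True)
    print(constraints.one_hot.is_discrete)    # True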
""" + is_discrete = True + def __init__(self, lower_bound): self.lower_bound = lower_bound @@ -358,6 +384,7 @@ def check(self, value): dependent = _Dependent() dependent_property = _DependentProperty boolean = _Boolean() +one_hot = _OneHot() nonnegative_integer = _IntegerGreaterThan(0) positive_integer = _IntegerGreaterThan(1) integer_interval = _IntegerInterval diff --git a/torch/distributions/continuous_bernoulli.py b/torch/distributions/continuous_bernoulli.py index 180fbd8187ee..5d3d48840203 100644 --- a/torch/distributions/continuous_bernoulli.py +++ b/torch/distributions/continuous_bernoulli.py @@ -168,8 +168,6 @@ def cdf(self, value): torch.where(torch.ge(value, 1.0), torch.ones_like(value), unbounded_cdfs)) def icdf(self, value): - if self._validate_args: - self._validate_sample(value) cut_probs = self._cut_probs() return torch.where( self._outside_unstable_region(), diff --git a/torch/distributions/distribution.py b/torch/distributions/distribution.py index f16eb154e2dd..bc61e0b0584e 100644 --- a/torch/distributions/distribution.py +++ b/torch/distributions/distribution.py @@ -12,10 +12,21 @@ class Distribution(object): has_rsample = False has_enumerate_support = False - _validate_args = False + _validate_args = __debug__ @staticmethod def set_default_validate_args(value): + """ + Sets whether validation is enabled or disabled. + + The default behavior mimics Python's ``assert`` statement: validation + is on by default, but is disabled if Python is run in optimized mode + (via ``python -O``). Validation may be expensive, so you may want to + disable it once a model is working. + + Args: + value (bool): Whether to enable validation. + """ if value not in [True, False]: raise ValueError Distribution._validate_args = value diff --git a/torch/distributions/exponential.py b/torch/distributions/exponential.py index 41d7cd9f9787..ac18980c778b 100644 --- a/torch/distributions/exponential.py +++ b/torch/distributions/exponential.py @@ -68,8 +68,6 @@ def cdf(self, value): return 1 - torch.exp(-self.rate * value) def icdf(self, value): - if self._validate_args: - self._validate_sample(value) return -torch.log(1 - value) / self.rate def entropy(self): diff --git a/torch/distributions/laplace.py b/torch/distributions/laplace.py index d7ec01c65b35..a505d60c8f38 100644 --- a/torch/distributions/laplace.py +++ b/torch/distributions/laplace.py @@ -75,8 +75,6 @@ def cdf(self, value): return 0.5 - 0.5 * (value - self.loc).sign() * torch.expm1(-(value - self.loc).abs() / self.scale) def icdf(self, value): - if self._validate_args: - self._validate_sample(value) term = value - 0.5 return self.loc - self.scale * (term).sign() * torch.log1p(-2 * term.abs()) diff --git a/torch/distributions/negative_binomial.py b/torch/distributions/negative_binomial.py index 051725db19ca..4a8babb34a7c 100644 --- a/torch/distributions/negative_binomial.py +++ b/torch/distributions/negative_binomial.py @@ -77,8 +77,10 @@ def param_shape(self): @lazy_property def _gamma(self): + # Note we avoid validating because self.total_count can be zero. 
return torch.distributions.Gamma(concentration=self.total_count, - rate=torch.exp(-self.logits)) + rate=torch.exp(-self.logits), + validate_args=False) def sample(self, sample_shape=torch.Size()): with torch.no_grad(): diff --git a/torch/distributions/normal.py b/torch/distributions/normal.py index 2468e2f225dc..1f14f0ae015f 100644 --- a/torch/distributions/normal.py +++ b/torch/distributions/normal.py @@ -82,8 +82,6 @@ def cdf(self, value): return 0.5 * (1 + torch.erf((value - self.loc) * self.scale.reciprocal() / math.sqrt(2))) def icdf(self, value): - if self._validate_args: - self._validate_sample(value) return self.loc + self.scale * torch.erfinv(2 * value - 1) * math.sqrt(2) def entropy(self): diff --git a/torch/distributions/one_hot_categorical.py b/torch/distributions/one_hot_categorical.py index c661a245f716..64f696802d76 100644 --- a/torch/distributions/one_hot_categorical.py +++ b/torch/distributions/one_hot_categorical.py @@ -29,7 +29,7 @@ class OneHotCategorical(Distribution): """ arg_constraints = {'probs': constraints.simplex, 'logits': constraints.real} - support = constraints.simplex + support = constraints.one_hot has_enumerate_support = True def __init__(self, probs=None, logits=None, validate_args=None): diff --git a/torch/distributions/uniform.py b/torch/distributions/uniform.py index b212c52695c2..edaf5abf77a5 100644 --- a/torch/distributions/uniform.py +++ b/torch/distributions/uniform.py @@ -81,8 +81,6 @@ def cdf(self, value): return result.clamp(min=0, max=1) def icdf(self, value): - if self._validate_args: - self._validate_sample(value) result = value * (self.high - self.low) + self.low return result diff --git a/torch/fx/experimental/merge_matmul.py b/torch/fx/experimental/merge_matmul.py new file mode 100644 index 000000000000..b72bbe633dd9 --- /dev/null +++ b/torch/fx/experimental/merge_matmul.py @@ -0,0 +1,220 @@ +import torch + +from torch.fx.graph import Graph +from torch.fx.graph_module import GraphModule +from torch.fx.node import Node +from torch.fx.symbolic_trace import symbolic_trace + +import itertools +import operator + +from typing import Dict, List + + +def get_first_dim(t: torch.Tensor) -> int: + """ + A free function primarily for use in the merge_matmul graph transformation below + that returns the first dimension of a Tensor. This is necessary because torch.Tensor.shape + is an attribute (and cannot be the target of a call_function node) and also helps save + a getitem op in the graph. + + Arguments: + t: The tensor to get the first dimension of. + + Returns: + The first dimension of t. + """ + return t.shape[0] + + +def legalize_graph(gm: GraphModule): + """ + Replace the graph of the given GraphModule with one that contains the same nodes as the + original, but in topologically sorted order. + + This is used by the merge_matmul transformation below, which disturbs the topologically sorted + order of its input GraphModule, so that this order is restored before further transformation. + + Arguments: + gm: The graph module to topologically sort. It is modified in-place. + + """ + # Build an adjacency list representation of node dependencies in the graph. This also + # serves as a list of nodes that still need to be inserted into the new, topologically + # sorted graph. + dependencies = {node: node.all_input_nodes.copy() for node in gm.graph.nodes} + + # Construct a new graph that will contain all nodes in topologically sorted order. + new_graph = Graph() + value_remap: Dict[Node, Node] = {} + + # Copy over all nodes with no dependencies. 
+    for node, deps in dependencies.items():
+        if not deps:
+            value_remap[node] = new_graph.node_copy(node, lambda n: value_remap[n])
+
+    # Remove the copied over nodes from the adjacency list.
+    for copied_node in value_remap.keys():
+        del dependencies[copied_node]
+
+    # While there are still nodes to insert into the new graph:
+    while dependencies:
+        copied_this_round = []
+
+        # Copy over all nodes whose dependencies already exist in the new graph.
+        for node, deps in dependencies.items():
+            all_deps_copied = True
+            for dep in deps:
+                if dep not in value_remap:
+                    all_deps_copied = False
+
+            if all_deps_copied:
+                value_remap[node] = new_graph.node_copy(node, lambda n: value_remap[n])
+                copied_this_round.append(node)
+
+        # Delete all nodes copied over in this iteration from dependencies.
+        for copied_node in copied_this_round:
+            del dependencies[copied_node]
+
+    # Replace the old graph with the new, topologically sorted one.
+    gm.graph = new_graph
+
+
+def may_depend_on(a: Node, b: Node, search_depth: int = 6):
+    """
+    Determine if one node depends on another in a torch.fx.Graph.
+
+    Arguments:
+        a: The node that may have a dependency on b.
+        b: The node that a may have a dependency on.
+        search_depth: In the case of an indirect dependency, this function
+                        searches up to this many nodes away in search of a
+                        data dependency. If none is found, the function
+                        makes the conservative assumption that there is a
+                        dependency.
+
+    Returns:
+        True if a may depend on b, False if it definitely does not.
+    """
+    # Equivalence is defined as dependence.
+    if a == b:
+        return True
+
+    # If a has no inputs, it cannot depend on b.
+    if len(a.all_input_nodes) == 0:
+        return False
+
+    # If the search depth has been exhausted and no conclusion has been
+    # reached, assume that there is a data dependency.
+    if search_depth == 0:
+        return True
+
+    # Recursively check all inputs of a.
+    for inp in a.all_input_nodes:
+        if may_depend_on(inp, b, search_depth - 1):
+            return True
+
+    return False
+
+
+def are_nodes_independent(nodes: List[Node]):
+    """
+    Check if all of the given nodes are pairwise data-independent.
+
+    Arguments:
+        nodes: The nodes to check for data dependencies.
+
+    Returns:
+        True if no pair in nodes has a data dependency, False otherwise.
+    """
+    # For each pair in nodes:
+    for i, j in itertools.combinations(nodes, 2):
+        if may_depend_on(i, j) or may_depend_on(j, i):
+            return False
+
+    return True
+
+
+def merge_matmul(in_mod: torch.nn.Module):
+    """
+    A graph transformation that merges matrix multiplication operations that share the same right-hand
+    side operand into one large matrix multiplication.
+               ____      _________        _________
+      ----    |    |    |         |     M|  A * C  |
+     M| A  |  T| B | *  K|    C    |  =   |---------|
+      ---- ,  |    |    |         |     T|  B * C  |
+       K       ----      ---------        ---------
+                K            R                 R
+    """
+    gm = symbolic_trace(in_mod)
+
+    rhs_users: Dict[Node, List[Node]] = {}
+    lhs_users: Dict[Node, List[Node]] = {}
+
+    # Populate rhs_users and lhs_users - maps from LHS/RHS matrix multiply operands to
+    # the matmul of which they are the LHS/RHS.
+    for node in gm.graph.nodes:
+        if node.op != "call_function" or node.target is not torch.matmul:
+            continue
+
+        lhs, rhs = node.args
+
+        # TODO: Properly handle aliasing caused by get_attr. For now,
+        # use the attribute name as the operand if the node is a
+        # get_attr.
+        lhs = lhs.target if lhs.op == "get_attr" else lhs
+        rhs = rhs.target if rhs.op == "get_attr" else rhs
+
+        lhs_users.setdefault(lhs, []).append(node)
+        rhs_users.setdefault(rhs, []).append(node)
+
+    for rhs, mms in rhs_users.items():
+        # There must be at least two matmuls for a merge to make sense.
+        if len(mms) < 2:
+            continue
+
+        # All matmuls must not depend on each other directly or indirectly
+        # in order for the merge to be possible.
+        if not are_nodes_independent(mms):
+            continue
+
+        lhs_vals = [mm.args[0] for mm in mms]
+
+        # Merge the matmul.
+        # Collect a list of LHS operands and the single RHS operand.
+        lhs = [gm.graph.get_attr(l) if isinstance(l, str) else l for l in lhs_vals]
+        rhs = gm.graph.get_attr(rhs) if isinstance(rhs, str) else rhs
+
+        # Concatenate all the LHS operands.
+        merge_mm_cat = gm.graph.call_function(torch.cat, (lhs,), {})
+
+        # Multiply the concatenated LHS operands with the one RHS. This will produce
+        # the same results as all the individual matmuls involving rhs in the original graph,
+        # but they will all be concatenated together.
+        merge_mm = gm.graph.call_function(torch.matmul, (merge_mm_cat, rhs,), {})
+
+        # Split the result of the merged matmul using the shapes of the LHS operands
+        # to ascertain how large each chunk should be.
+        merge_mm_sizes = [
+            gm.graph.call_function(get_first_dim, (l,), {}) for l in lhs
+        ]
+        merge_mm_split = gm.graph.call_function(
+            torch.split, (merge_mm, merge_mm_sizes), {}
+        )
+        merge_mm_res = [
+            gm.graph.call_function(operator.getitem, (merge_mm_split, out), {})
+            for out in range(len(lhs))
+        ]
+
+        # Replace all uses of the original, unmerged matmuls with the equivalent split chunk from the merged matmul.
+        for old, new in zip(mms, merge_mm_res):
+            old.replace_all_uses_with(new)
+            gm.graph.erase_node(old)
+
+        # All of the new nodes created above were inserted at the end, so we need to sort
+        # the nodes topologically to make sure all definitions precede uses.
+        legalize_graph(gm)
+
+    gm.recompile()
+    gm.graph.lint(in_mod)
+    return gm
+ lhs = lhs.target if lhs.op == "get_attr" else lhs + rhs = rhs.target if rhs.op == "get_attr" else rhs + + lhs_users.setdefault(lhs, []).append(node) + rhs_users.setdefault(rhs, []).append(node) + + for rhs, mms in rhs_users.items(): + # There must be at least matmuls for a merge to make sense. + if len(mms) < 2: + continue + + # All matmuls must not depend on each other directly or indirectly + # in order for the merge to be possible. + if not are_nodes_independent(mms): + continue + + lhs_vals = [mm.args[0] for mm in mms] + + # Merge the matmul. + # Collect a list of LHS operands and the single RHS operand. + lhs = [gm.graph.get_attr(l) if isinstance(l, str) else l for l in lhs_vals] + rhs = gm.graph.get_attr(rhs) if isinstance(rhs, str) else rhs + + # Concatenate all the LHS operands. + merge_mm_cat = gm.graph.call_function(torch.cat, (lhs,), {}) + + # Multiply the concatenated LHS operands with the one RHS. This will produce + # the same results as all the individual matmuls involving rhs in the original graph, + # but they will all be concatenated together. + merge_mm = gm.graph.call_function(torch.matmul, (merge_mm_cat, rhs,), {}) + + # Split the result of the merged matmul using the shapes of the LHS operands + # to ascertain how large each chunk should be. + merge_mm_sizes = [ + gm.graph.call_function(get_first_dim, (l,), {}) for l in lhs + ] + merge_mm_split = gm.graph.call_function( + torch.split, (merge_mm, merge_mm_sizes), {} + ) + merge_mm_res = [ + gm.graph.call_function(operator.getitem, (merge_mm_split, out), {}) + for out in range(len(lhs)) + ] + + # Replace all uses of the original, unmerged matmuls with the equivalent split chunk from the merged matmul. + for old, new in zip(mms, merge_mm_res): + old.replace_all_uses_with(new) + gm.graph.erase_node(old) + + # All of the new nodes created above were inserted at the end, so we need to sort + # the nodes topologically to make sure all definitions precede uses. + legalize_graph(gm) + + gm.recompile() + gm.graph.lint(in_mod) + return gm diff --git a/torch/fx/graph.py b/torch/fx/graph.py index fd0087dca398..6e493676f8c2 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -693,13 +693,18 @@ def emit_node(node : Node): import_strs = [f'import {name}' for name in sorted(modules_used)] import_block = '\n'.join(import_strs) + if len(body) == 0: + # If the Graph has no non-placeholder nodes, no lines for the body + # have been emitted. To continue to have valid Python code, emit a + # single pass statement + body.append('pass\n') + code = ''.join(body) - code = '\n'.join(' ' + line for line in code.split('\n')) + '\n' + code = '\n'.join(' ' + line for line in code.split('\n')) fn_code = f"""\ {import_block} def forward(self, {', '.join(free_vars)}){maybe_return_annotation[0]}: -{code} -""" +{code}""" return fn_code diff --git a/torch/jit/_async.py b/torch/jit/_async.py index 26bc6eeada67..ae9684a0e229 100644 --- a/torch/jit/_async.py +++ b/torch/jit/_async.py @@ -17,7 +17,7 @@ def fork(func, *args, **kwargs): - """ + r""" Creates an asynchronous task executing `func` and a reference to the value of the result of this execution. `fork` will return immediately, so the return value of `func` may not have been computed yet. To force completion @@ -42,7 +42,8 @@ def fork(func, *args, **kwargs): Example (fork a free function): - .. testcode:: + .. code-block:: python + import torch from torch import Tensor def foo(a : Tensor, b : int) -> Tensor: @@ -60,16 +61,17 @@ def bar(a): Example (fork a module method): - .. testcode:: + .. 
code-block:: python + import torch from torch import Tensor - class SubMod(torch.nn.Module): + class AddMod(torch.nn.Module): def forward(self, a: Tensor, b : int): return a + b class Mod(torch.nn.Module): def __init__(self): super(self).__init__() - self.mod = SubMod() + self.mod = AddMod() def forward(self, input): fut = torch.jit.fork(self.mod, a, b=2) return torch.jit.wait(fut) @@ -81,7 +83,7 @@ def forward(self, input): def wait(future): - """ + r""" Forces completion of a `torch.jit.Future[T]` asynchronous task, returning the result of the task. See :func:`~fork` for docs and examples. Args: diff --git a/torch/jit/_script.py b/torch/jit/_script.py index cc84877e5267..8bc8c6117c1b 100644 --- a/torch/jit/_script.py +++ b/torch/jit/_script.py @@ -741,6 +741,43 @@ class RecursiveScriptModule(ScriptModule): # type: ignore def __init__(self, arg=None): super().__init__() +def call_prepare_scriptable_func_impl(obj, memo): + if not isinstance(obj, torch.nn.Module): + return obj + + obj_id = id(obj) + + # If obj_id is in memo, obj has already been prepared or is being + # prepared in another call up the stack. + if obj_id in memo: + return memo[id(obj)] + + obj = obj.__prepare_scriptable__() if hasattr(obj, '__prepare_scriptable__') else obj # type: ignore + # Record obj in memo to avoid infinite recursion in the case of cycles in the module + # hierarchy when recursing below. + memo[obj_id] = obj + + new_obj_dict = {} + + for name in obj.__dict__: + sub_module = obj.__dict__.get(name) + if name == '_modules': + for k, v in sub_module.items(): + sub_module[k] = call_prepare_scriptable_func_impl(v, memo) + new_obj_dict[name] = sub_module + elif isinstance(sub_module, torch.nn.Module) and not isinstance(sub_module, ScriptModule): + new_obj_dict[name] = call_prepare_scriptable_func_impl(sub_module, memo) + else: + new_obj_dict[name] = sub_module + + for k, v in new_obj_dict.items(): + obj.__dict__[name] = v + + return obj + +def call_prepare_scriptable_func(obj): + memo: Dict[int, torch.nn.Module] = {} + return call_prepare_scriptable_func_impl(obj, memo) def script(obj, optimize=None, _frames_up=0, _rcb=None): r""" @@ -894,6 +931,7 @@ def forward(self, input): return obj if isinstance(obj, torch.nn.Module): + obj = call_prepare_scriptable_func(obj) return torch.jit._recursive.create_script_module( obj, torch.jit._recursive.infer_methods_to_compile ) diff --git a/torch/jit/quantized.py b/torch/jit/quantized.py index 615741f38da7..d853a55b3933 100644 --- a/torch/jit/quantized.py +++ b/torch/jit/quantized.py @@ -130,8 +130,7 @@ def check_forward_input(self, input): input.size(1), self.input_size)) @torch.jit.script_method - def check_forward_hidden(self, input, hx, hidden_label=''): - # type: (Tensor, Tensor, str) -> None + def check_forward_hidden(self, input: Tensor, hx: Tensor, hidden_label: str = '') -> None: if input.size(0) != hx.size(0): raise RuntimeError( "Input batch size {} doesn't match hidden{} batch size {}".format( @@ -169,8 +168,7 @@ def __init__(self, other): self.nonlinearity = other.nonlinearity @torch.jit.script_method - def forward(self, input, hx=None): - # type: (Tensor, Optional[Tensor]) -> Tensor + def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor: self.check_forward_input(input) if hx is None: hx = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device) @@ -201,8 +199,7 @@ def __init__(self, other): super(QuantizedLSTMCell, self).__init__(other) @torch.jit.script_method - def forward(self, input, hx=None): - # type: 
(Tensor, Optional[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tensor] + def forward(self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]] = None) -> Tuple[Tensor, Tensor]: self.check_forward_input(input) if hx is None: zeros = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device) @@ -222,8 +219,7 @@ def __init__(self, other): super(QuantizedGRUCell, self).__init__(other) @torch.jit.script_method - def forward(self, input, hx=None): - # type: (Tensor, Optional[Tensor]) -> Tensor + def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor: self.check_forward_input(input) if hx is None: hx = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device) @@ -236,8 +232,7 @@ def forward(self, input, hx=None): ) -def apply_permutation(tensor, permutation, dim=1): - # type: (Tensor, Tensor, int) -> Tensor +def apply_permutation(tensor: Tensor, permutation: Tensor, dim: int = 1) -> Tensor: return tensor.index_select(dim, permutation) @@ -303,8 +298,7 @@ def get_weight_bias(ihhh): self.all_weights.append(cell_params) @torch.jit.script_method - def check_input(self, input, batch_sizes): - # type: (Tensor, Optional[Tensor]) -> None + def check_input(self, input: Tensor, batch_sizes: Optional[Tensor]) -> None: expected_input_dim = 2 if batch_sizes is not None else 3 if input.dim() != expected_input_dim: raise RuntimeError( @@ -316,8 +310,7 @@ def check_input(self, input, batch_sizes): self.input_size, input.size(-1))) @torch.jit.script_method - def get_expected_hidden_size(self, input, batch_sizes): - # type: (Tensor, Optional[Tensor]) -> Tuple[int, int, int] + def get_expected_hidden_size(self, input: Tensor, batch_sizes: Optional[Tensor]) -> Tuple[int, int, int]: if batch_sizes is not None: mini_batch = int(batch_sizes[0]) else: @@ -328,21 +321,19 @@ def get_expected_hidden_size(self, input, batch_sizes): return expected_hidden_size @torch.jit.script_method - def check_hidden_size(self, hx, expected_hidden_size, msg='Expected hidden size {}, got {}'): - # type: (Tensor, Tuple[int, int, int], str) -> None + def check_hidden_size(self, hx: Tensor, expected_hidden_size: Tuple[int, int, int], + msg: str = 'Expected hidden size {}, got {}') -> None: if hx.size() != expected_hidden_size: raise RuntimeError(msg.format(expected_hidden_size, list(hx.size()))) @torch.jit.script_method - def check_forward_args(self, input, hidden, batch_sizes): - # type: (Tensor, Tensor, Optional[Tensor]) -> None + def check_forward_args(self, input: Tensor, hidden: Tensor, batch_sizes: Optional[Tensor]) -> None: self.check_input(input, batch_sizes) expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes) self.check_hidden_size(hidden, expected_hidden_size, msg='Expected hidden size {}, got {}') @torch.jit.script_method - def permute_hidden(self, hx, permutation): - # type: (Tensor, Optional[Tensor]) -> Tensor + def permute_hidden(self, hx: Tensor, permutation: Optional[Tensor]) -> Tensor: if permutation is None: return hx return apply_permutation(hx, permutation) @@ -355,8 +346,9 @@ def __init__(self, other, dtype): super(QuantizedLSTM, self).__init__(other, dtype) @torch.jit.script_method - def forward_impl(self, input, hx, batch_sizes, max_batch_size, sorted_indices): - # type: (Tensor, Optional[Tuple[Tensor, Tensor]], Optional[Tensor], int, Optional[Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]] # noqa + def forward_impl(self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]], batch_sizes: Optional[Tensor], + max_batch_size: int, 
sorted_indices: Optional[Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]]: + # noqa if hx is None: num_directions = 2 if self.bidirectional else 1 zeros = torch.zeros(self.num_layers * num_directions, @@ -379,8 +371,7 @@ def forward_impl(self, input, hx, batch_sizes, max_batch_size, sorted_indices): return output, hidden @torch.jit.script_method - def forward_tensor(self, input, hx=None): - # type: (Tensor, Optional[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tuple[Tensor, Tensor]] + def forward_tensor(self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]] = None) -> Tuple[Tensor, Tuple[Tensor, Tensor]]: batch_sizes = None max_batch_size = input.size(0) if self.batch_first else input.size(1) sorted_indices = None @@ -391,8 +382,8 @@ def forward_tensor(self, input, hx=None): return output, self.permute_hidden(hidden, unsorted_indices) @torch.jit.script_method - def forward_packed(self, input, hx=None): - # type: (PackedSequence, Optional[Tuple[Tensor, Tensor]]) -> Tuple[PackedSequence, Tuple[Tensor, Tensor]] # noqa + def forward_packed(self, input: PackedSequence, hx: Optional[Tuple[Tensor, Tensor]] = None + ) -> Tuple[PackedSequence, Tuple[Tensor, Tensor]]: input, batch_sizes, sorted_indices, unsorted_indices = input max_batch_size = batch_sizes[0] max_batch_size = int(max_batch_size) @@ -404,15 +395,13 @@ def forward_packed(self, input, hx=None): @torch.jit.script_method - def permute_hidden(self, hx, permutation): - # type: (Tuple[Tensor, Tensor], Optional[Tensor]) -> Tuple[Tensor, Tensor] + def permute_hidden(self, hx: Tuple[Tensor, Tensor], permutation: Optional[Tensor]) -> Tuple[Tensor, Tensor]: if permutation is None: return hx return apply_permutation(hx[0], permutation), apply_permutation(hx[1], permutation) @torch.jit.script_method - def check_forward_args(self, input, hidden, batch_sizes): - # type: (Tensor, Tuple[Tensor, Tensor], Optional[Tensor]) -> None + def check_forward_args(self, input: Tensor, hidden: Tuple[Tensor, Tensor], batch_sizes: Optional[Tensor]) -> None: self.check_input(input, batch_sizes) expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes) @@ -432,8 +421,9 @@ class QuantizedGRU(QuantizedRNNBase): __overloads__ = {'forward': ['forward_packed', 'forward_tensor']} @torch.jit.script_method - def forward_impl(self, input, hx, batch_sizes, max_batch_size, sorted_indices): - # type: (Tensor, Optional[Tensor], Optional[Tensor], int, Optional[Tensor]) -> Tuple[Tensor, Tensor] # noqa + def forward_impl(self, input: Tensor, hx: Optional[Tensor], batch_sizes: Optional[Tensor], max_batch_size: int, + sorted_indices: Optional[Tensor]) -> Tuple[Tensor, Tensor]: + # noqa if hx is None: num_directions = 2 if self.bidirectional else 1 hx = torch.zeros(self.num_layers * num_directions, @@ -459,8 +449,7 @@ def forward_impl(self, input, hx, batch_sizes, max_batch_size, sorted_indices): return output, hidden @torch.jit.script_method - def forward_tensor(self, input, hx=None): - # type: (Tensor, Optional[Tensor]) -> Tuple[Tensor, Tensor] + def forward_tensor(self, input: Tensor, hx: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]: batch_sizes = None max_batch_size = input.size(0) if self.batch_first else input.size(1) sorted_indices = None @@ -470,8 +459,7 @@ def forward_tensor(self, input, hx=None): return output, self.permute_hidden(hidden, unsorted_indices) @torch.jit.script_method - def forward_packed(self, input, hx=None): - # type: (PackedSequence, Optional[Tensor]) -> Tuple[PackedSequence, Tensor] + def forward_packed(self, input: PackedSequence, hx: 
Optional[Tensor] = None) -> Tuple[PackedSequence, Tensor]: input, batch_sizes, sorted_indices, unsorted_indices = input max_batch_size = batch_sizes[0] max_batch_size = int(max_batch_size) diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp index 01ce71afd388..b9ac5aa77150 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.cpp +++ b/torch/lib/c10d/ProcessGroupNCCL.cpp @@ -1409,7 +1409,13 @@ c10::intrusive_ptr ProcessGroupNCCL::reduce_scatter( c10::intrusive_ptr ProcessGroupNCCL::barrier( const BarrierOptions& opts) { std::vector devices; - if (usedDeviceIdxs_.empty()) { + + // Use user defined GPU device ids if provided + if (!opts.device_ids.empty()) { + for (auto device : opts.device_ids) { + devices.push_back(at::Device(at::DeviceType::CUDA, device)); + } + } else if (usedDeviceIdxs_.empty()) { // This means there is not yet a NCCL collective being called // Here we have to use the best guesses and will use a single GPU to call // allreduce to achieve barrier. diff --git a/torch/lib/c10d/Types.hpp b/torch/lib/c10d/Types.hpp index 03b2e59e4295..a5a0d5fa20df 100644 --- a/torch/lib/c10d/Types.hpp +++ b/torch/lib/c10d/Types.hpp @@ -62,6 +62,7 @@ struct AllToAllOptions { }; struct BarrierOptions { + std::vector device_ids; std::chrono::milliseconds timeout = kUnsetTimeout; }; diff --git a/torch/library.h b/torch/library.h index d86c1afbd50e..fee98abb2b81 100644 --- a/torch/library.h +++ b/torch/library.h @@ -116,19 +116,6 @@ class TORCH_API CppFunction final { , debug_() {} - /// This static factory lets you create CppFunctions that (1) don't have boxing - /// wrappers (because we don't support it yet) and (2) don't have schema - /// inference (because some ops don't support it). - template - static CppFunction makeUnboxedOnly(Func* f) { - // TODO: Eliminate the necessity for this function entirely. - return CppFunction( - c10::KernelFunction::makeFromUnboxedOnlyRuntimeFunction(f), - /* cpp_signature */ c10::impl::CppSignature::make(), - /* schema */ nullptr - ); - } - /// This creates a fallthrough function. Fallthrough functions /// immediately redispatch to the next available dispatch key, /// but are implemented more efficiently than a hand written @@ -170,6 +157,22 @@ class TORCH_API CppFunction final { ); } + /// Create a function from an unboxed kernel function. + /// This is typically used to register common operators. + template::value, std::nullptr_t> = nullptr> + static CppFunction makeFromUnboxedFunction(FuncPtr* f) { + return CppFunction(f); + } + + /// Create a function from a compile time unboxed kernel function pointer. + /// This is typically used to register common operators. + /// Compile time function pointers can be used to allow the compiler + /// to optimize (e.g. inline) calls to it. + template::value, std::nullptr_t> = nullptr> + static CppFunction makeFromUnboxedFunction(FuncPtr f) { + return CppFunction(f); + } + CppFunction&& debug(std::string d) && { debug_ = std::move(d); return std::move(*this); @@ -496,20 +499,10 @@ class TORCH_API Library final { return impl(name, dispatch(std::forward(key), std::forward(raw_f))); } - /// \private - /// - /// Convenience overload for unboxed only kernels; kernels whose type - /// signatures are not supported by our template based metaprogramming - /// system. These are currently quite common but will be eventually - /// eliminated. - /// - /// This is equivalent to calling CppFunction::makeUnboxedOnly() on - /// the function, but this name for the function makes it easy to grep for. 
template Library& impl_UNBOXED(Name name, Func* raw_f) & { - // TODO: Remove this overload once the makeUnboxedOnly incidence rate - // goes way down - return impl(name, CppFunction::makeUnboxedOnly(raw_f)); + static_assert(c10::guts::false_t(), ".impl_UNBOXED(...) was removed. Please use .impl(...) instead."); + return *this; } // These overloads cover cases when a SelectiveStr (see Note [Selective build]) @@ -531,7 +524,10 @@ class TORCH_API Library final { template Library& impl(detail::SelectiveStr, Dispatch&& key, Func&& raw_f) & { return *this; } template - Library& impl_UNBOXED(detail::SelectiveStr name, Func* raw_f) & { return *this; } + Library& impl_UNBOXED(detail::SelectiveStr name, Func* raw_f) & { + static_assert(c10::guts::false_t(), ".impl_UNBOXED(...) was removed. Please use .impl(...) instead."); + return *this; + } template Library& impl(detail::SelectiveStr name, Func&& raw_f) & { @@ -543,7 +539,8 @@ class TORCH_API Library final { } template Library& impl_UNBOXED(detail::SelectiveStr name, Func* raw_f) & { - return impl(name.operator const char*(), CppFunction::makeUnboxedOnly(raw_f)); + static_assert(c10::guts::false_t(), ".impl_UNBOXED(...) was removed. Please use .impl(...) instead."); + return *this; } /// Register a fallback implementation for all operators which will be used diff --git a/torch/multiprocessing/__init__.py b/torch/multiprocessing/__init__.py index 561eddfb02a2..039ddf2a1b09 100644 --- a/torch/multiprocessing/__init__.py +++ b/torch/multiprocessing/__init__.py @@ -35,7 +35,7 @@ """Add helper function to spawn N processes and wait for completion of any of them. This depends `mp.get_context` which was added in Python 3.4.""" -from .spawn import spawn, SpawnContext, _supports_context, start_processes, ProcessContext, \ +from .spawn import spawn, SpawnContext, start_processes, ProcessContext, \ ProcessRaisedException, ProcessExitedException diff --git a/torch/multiprocessing/spawn.py b/torch/multiprocessing/spawn.py index b2008912dbb5..9ad17c94ccf8 100644 --- a/torch/multiprocessing/spawn.py +++ b/torch/multiprocessing/spawn.py @@ -66,24 +66,8 @@ def _wrap(fn, i, args, error_queue): sys.exit(1) -# Multiprocessing contexts are introduced at Python 3.4 -_supports_context = sys.version_info >= (3, 4) - - -def _python_version_check(): - if not _supports_context: - raise RuntimeError("Requires python 3.4 or higher to use " - "torch.multiprocessing.spawn and " - "torch.multiprocessing.ProcessContext helper " - "to launch multiple processes. If you are using " - "this for distributed training and have a lower " - "version of python, please use " - "torch.distributed.launch instead.") - - class ProcessContext: def __init__(self, processes, error_queues): - _python_version_check() self.error_queues = error_queues self.processes = processes self.sentinels = { @@ -182,7 +166,6 @@ def __init__(self, processes, error_queues): # Currently we only add this API first, we can consider adding it to documentation as # needed in the future. def start_processes(fn, args=(), nprocs=1, join=True, daemon=False, start_method='spawn'): - _python_version_check() mp = multiprocessing.get_context(start_method) error_queues = [] processes = [] diff --git a/torch/nn/common_types.py b/torch/nn/common_types.py index fa9d5bb1eb00..884f739e2781 100644 --- a/torch/nn/common_types.py +++ b/torch/nn/common_types.py @@ -1,4 +1,4 @@ -from typing import TypeVar, Union, Tuple +from typing import TypeVar, Union, Tuple, Optional from .. 
import Tensor # Create some useful type aliases @@ -24,6 +24,11 @@ _size_5_t = _scalar_or_tuple_5_t[int] _size_6_t = _scalar_or_tuple_6_t[int] +# For arguments which represent optional size parameters (eg, adaptive pool parameters) +_size_any_opt_t = _scalar_or_tuple_any_t[Optional[int]] +_size_2_opt_t = _scalar_or_tuple_2_t[Optional[int]] +_size_3_opt_t = _scalar_or_tuple_3_t[Optional[int]] + # For arguments that represent a ratio to adjust each dimension of an input with (eg, upsampling parameters) _ratio_2_t = _scalar_or_tuple_2_t[float] _ratio_3_t = _scalar_or_tuple_3_t[float] diff --git a/torch/nn/functional.pyi.in b/torch/nn/functional.pyi.in index 94071556e144..208dc7c2df40 100644 --- a/torch/nn/functional.pyi.in +++ b/torch/nn/functional.pyi.in @@ -1,7 +1,7 @@ from torch import Tensor from torch.types import _size from typing import Any, Optional, Tuple, Dict, List, Callable, Sequence, Union -from .common_types import _ratio_any_t, _size_1_t, _size_2_t, _size_3_t +from .common_types import _ratio_any_t, _size_1_t, _size_2_t, _size_3_t, _size_2_opt_t, _size_3_opt_t # 'TypedDict' is a new accepted type that represents a dictionary with a fixed set of allowed keys. # It is standards-track but not in `typing` yet. We leave this hear to be uncommented once the feature @@ -75,21 +75,21 @@ def adaptive_max_pool1d_with_indices(input: Tensor, output_size: _size, return_i Tensor, Tensor]: ... -def adaptive_max_pool2d_with_indices(input: Tensor, output_size: _size, return_indices: bool = ...) -> Tuple[ +def adaptive_max_pool2d_with_indices(input: Tensor, output_size: _size_2_opt_t, return_indices: bool = ...) -> Tuple[ Tensor, Tensor]: ... -def adaptive_max_pool3d_with_indices(input: Tensor, output_size: _size, return_indices: bool = ...) -> Tuple[ +def adaptive_max_pool3d_with_indices(input: Tensor, output_size: _size_3_opt_t, return_indices: bool = ...) -> Tuple[ Tensor, Tensor]: ... def adaptive_avg_pool1d(input: Tensor, output_size: _size_1_t) -> Tensor: ... -def adaptive_avg_pool2d(input: Tensor, output_size: _size_2_t) -> Tensor: ... +def adaptive_avg_pool2d(input: Tensor, output_size: _size_2_opt_t) -> Tensor: ... -def adaptive_avg_pool3d(input: Tensor, output_size: _size_3_t) -> Tensor: ... +def adaptive_avg_pool3d(input: Tensor, output_size: _size_3_opt_t) -> Tensor: ... def dropout(input: Tensor, p: float = ..., training: bool = ..., inplace: bool = ...) -> Tensor: ... diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py index 0c5258615bfd..837ecca6fe9d 100644 --- a/torch/nn/modules/activation.py +++ b/torch/nn/modules/activation.py @@ -848,8 +848,9 @@ class MultiheadAttention(Module): kdim: total number of features in key. Default: None. vdim: total number of features in value. Default: None. - Note: if kdim and vdim are None, they will be set to embed_dim such that - query, key, and value have the same number of features. + Note that if :attr:`kdim` and :attr:`vdim` are None, they will be set + to :attr:`embed_dim` such that query, key, and value have the same + number of features. 
Examples:: @@ -921,9 +922,8 @@ def __setstate__(self, state): super(MultiheadAttention, self).__setstate__(state) - def forward(self, query, key, value, key_padding_mask=None, - need_weights=True, attn_mask=None): - # type: (Tensor, Tensor, Tensor, Optional[Tensor], bool, Optional[Tensor]) -> Tuple[Tensor, Optional[Tensor]] + def forward(self, query: Tensor, key: Tensor, value: Tensor, key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, attn_mask: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]: r""" Args: query, key, value: map a query and a set of key-value pairs to an output. diff --git a/torch/nn/modules/conv.py b/torch/nn/modules/conv.py index f22c35fa39ff..6a9c4dcd2ef6 100644 --- a/torch/nn/modules/conv.py +++ b/torch/nn/modules/conv.py @@ -530,8 +530,9 @@ def __init__(self, in_channels, out_channels, kernel_size, stride, # dilation being an optional parameter is for backwards # compatibility - def _output_padding(self, input, output_size, stride, padding, kernel_size, dilation=None): - # type: (Tensor, Optional[List[int]], List[int], List[int], List[int], Optional[List[int]]) -> List[int] + def _output_padding(self, input: Tensor, output_size: Optional[List[int]], + stride: List[int], padding: List[int], kernel_size: List[int], + dilation: Optional[List[int]] = None) -> List[int]: if output_size is None: ret = _single(self.output_padding) # converting to list if was not already else: diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py index 297a4edf15bf..f054590da66a 100644 --- a/torch/nn/modules/module.py +++ b/torch/nn/modules/module.py @@ -843,7 +843,6 @@ def _slow_forward(self, *input, **kwargs): if recording_scopes: name = torch.jit._trace._trace_module_map[self] if self in torch.jit._trace._trace_module_map else None if name: - cur_scope_name = tracing_state.current_scope() tracing_state.push_scope(name) else: recording_scopes = False diff --git a/torch/nn/modules/pooling.py b/torch/nn/modules/pooling.py index e8f68307f230..78aae504083b 100644 --- a/torch/nn/modules/pooling.py +++ b/torch/nn/modules/pooling.py @@ -5,7 +5,8 @@ from .utils import _single, _pair, _triple from .. 
import functional as F -from ..common_types import _size_any_t, _size_1_t, _size_2_t, _size_3_t, _ratio_3_t, _ratio_2_t +from ..common_types import (_size_any_t, _size_1_t, _size_2_t, _size_3_t, + _ratio_3_t, _ratio_2_t, _size_any_opt_t, _size_2_opt_t, _size_3_opt_t) class _MaxPoolNd(Module): @@ -953,7 +954,7 @@ class _AdaptiveMaxPoolNd(Module): __constants__ = ['output_size', 'return_indices'] return_indices: bool - def __init__(self, output_size: _size_any_t, return_indices: bool = False) -> None: + def __init__(self, output_size: _size_any_opt_t, return_indices: bool = False) -> None: super(_AdaptiveMaxPoolNd, self).__init__() self.output_size = output_size self.return_indices = return_indices @@ -1020,7 +1021,7 @@ class AdaptiveMaxPool2d(_AdaptiveMaxPoolNd): """ - output_size: _size_2_t + output_size: _size_2_opt_t def forward(self, input: Tensor) -> Tensor: return F.adaptive_max_pool2d(input, self.output_size, self.return_indices) @@ -1057,7 +1058,7 @@ class AdaptiveMaxPool3d(_AdaptiveMaxPoolNd): """ - output_size: _size_3_t + output_size: _size_3_opt_t def forward(self, input: Tensor) -> Tensor: return F.adaptive_max_pool3d(input, self.output_size, self.return_indices) @@ -1066,7 +1067,7 @@ def forward(self, input: Tensor) -> Tensor: class _AdaptiveAvgPoolNd(Module): __constants__ = ['output_size'] - def __init__(self, output_size: _size_any_t) -> None: + def __init__(self, output_size: _size_any_opt_t) -> None: super(_AdaptiveAvgPoolNd, self).__init__() self.output_size = output_size @@ -1125,7 +1126,7 @@ class AdaptiveAvgPool2d(_AdaptiveAvgPoolNd): """ - output_size: _size_2_t + output_size: _size_2_opt_t def forward(self, input: Tensor) -> Tensor: return F.adaptive_avg_pool2d(input, self.output_size) @@ -1159,7 +1160,7 @@ class AdaptiveAvgPool3d(_AdaptiveAvgPoolNd): """ - output_size: _size_3_t + output_size: _size_3_opt_t def forward(self, input: Tensor) -> Tensor: return F.adaptive_avg_pool3d(input, self.output_size) diff --git a/torch/nn/modules/utils.py b/torch/nn/modules/utils.py index 3e0b93c7afc0..97e4195619cb 100644 --- a/torch/nn/modules/utils.py +++ b/torch/nn/modules/utils.py @@ -26,8 +26,7 @@ def _reverse_repeat_tuple(t, n): return tuple(x for x in reversed(t) for _ in range(n)) -def _list_with_default(out_size, defaults): - # type: (List[int], List[int]) -> List[int] +def _list_with_default(out_size: List[int], defaults: List[int]) -> List[int]: if isinstance(out_size, int): return out_size if len(defaults) <= len(out_size): diff --git a/torch/nn/parallel/replicate.py b/torch/nn/parallel/replicate.py index a069c6c6f939..8effeece5908 100644 --- a/torch/nn/parallel/replicate.py +++ b/torch/nn/parallel/replicate.py @@ -108,7 +108,6 @@ def replicate(network, devices, detach=False): modules = list(network.modules()) module_copies = [[] for device in devices] module_indices = {} - scriptmodule_skip_attr = {"_parameters", "_buffers", "_modules", "forward", "_c"} for i, module in enumerate(modules): module_indices[module] = i diff --git a/torch/nn/quantized/dynamic/modules/rnn.py b/torch/nn/quantized/dynamic/modules/rnn.py index df88169471ca..59c0195d7858 100644 --- a/torch/nn/quantized/dynamic/modules/rnn.py +++ b/torch/nn/quantized/dynamic/modules/rnn.py @@ -239,8 +239,6 @@ def from_float(cls, mod): _all_weight_values = [] for layer in range(qRNNBase.num_layers): for direction in range(num_directions): - layer_input_size = qRNNBase.input_size if layer == 0 else qRNNBase.hidden_size * num_directions - suffix = '_reverse' if direction == 1 else '' def 
retrieve_weight_bias(ihhh): diff --git a/torch/nn/quantized/modules/conv.py b/torch/nn/quantized/modules/conv.py index a9ba3293630d..00ceba7ab367 100644 --- a/torch/nn/quantized/modules/conv.py +++ b/torch/nn/quantized/modules/conv.py @@ -240,8 +240,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def _get_name(self): return 'QuantizedConv1d' - def set_weight_bias(self, w, b): - # type: (torch.Tensor, Optional[torch.Tensor]) -> None + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: self._packed_params = torch.ops.quantized.conv1d_prepack( w, b, self.stride, self.padding, self.dilation, self.groups) @@ -327,8 +326,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def _get_name(self): return 'QuantizedConv2d' - def set_weight_bias(self, w, b): - # type: (torch.Tensor, Optional[torch.Tensor]) -> None + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: self._packed_params = torch.ops.quantized.conv2d_prepack( w, b, self.stride, self.padding, self.dilation, self.groups) @@ -412,8 +410,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def _get_name(self): return 'QuantizedConv3d' - def set_weight_bias(self, w, b): - # type: (torch.Tensor, Optional[torch.Tensor]) -> None + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: self._packed_params = torch.ops.quantized.conv3d_prepack( w, b, self.stride, self.padding, self.dilation, self.groups) @@ -466,8 +463,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, transposed, output_padding, groups, bias, padding_mode) - def _input_padding(self, kernel_size, dilation, padding): - # type: (List[int], List[int], List[int]) -> List[int] + def _input_padding(self, kernel_size: List[int], dilation: List[int], padding: List[int]) -> List[int]: res = torch.jit.annotate(List[int], []) for kdx in range(len(kernel_size)): pad = (dilation[kdx] * (kernel_size[kdx] - 1) - padding[kdx]) @@ -561,8 +557,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def _get_name(self): return 'QuantizedConvTranpose1d' - def set_weight_bias(self, w, b): - # type: (torch.Tensor, Optional[torch.Tensor]) -> None + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: self._packed_params = torch.ops.quantized.conv_transpose1d_prepack( w, b, self.stride, self.padding, self.output_padding, self.dilation, self.groups) @@ -645,8 +640,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def _get_name(self): return 'QuantizedConvTranpose2d' - def set_weight_bias(self, w, b): - # type: (torch.Tensor, Optional[torch.Tensor]) -> None + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: self._packed_params = torch.ops.quantized.conv_transpose2d_prepack( w, b, self.stride, self.padding, self.output_padding, self.dilation, self.groups) @@ -730,8 +724,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def _get_name(self): return 'QuantizedConvTranpose3d' - def set_weight_bias(self, w, b): - # type: (torch.Tensor, Optional[torch.Tensor]) -> None + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: self._packed_params = torch.ops.quantized.conv_transpose3d_prepack( w, b, self.stride, self.padding, self.output_padding, self.dilation, self.groups) diff --git a/torch/nn/quantized/modules/embedding_ops.py b/torch/nn/quantized/modules/embedding_ops.py index 
d16748b3baf7..523994b364c8 100644 --- a/torch/nn/quantized/modules/embedding_ops.py +++ b/torch/nn/quantized/modules/embedding_ops.py @@ -22,8 +22,7 @@ def __init__(self, num_embeddings, embedding_dim, dtype=torch.quint8): raise NotImplementedError('Unsupported dtype on quantized embedding! Supports quint8 and quint4x2.') @torch.jit.export - def set_weight(self, weight): - # type: (torch.Tensor) -> None + def set_weight(self, weight: torch.Tensor) -> None: if self.dtype in [torch.quint8, torch.quint4x2]: self._packed_weight = torch.ops.quantized.embedding_bag_prepack(weight) else: @@ -52,7 +51,6 @@ def _save_to_state_dict(self, destination, prefix, keep_vars): def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs): - version = local_metadata.get('version', None) self.dtype = state_dict[prefix + 'dtype'] state_dict.pop(prefix + 'dtype') @@ -126,8 +124,7 @@ def extra_repr(self): return extra_repr_str - def set_weight(self, w): - # type: (torch.Tensor) -> None + def set_weight(self, w: torch.Tensor) -> None: self._packed_params.set_weight(w) def weight(self): diff --git a/torch/nn/quantized/modules/functional_modules.py b/torch/nn/quantized/modules/functional_modules.py index b9fab962d563..08b5447bb925 100644 --- a/torch/nn/quantized/modules/functional_modules.py +++ b/torch/nn/quantized/modules/functional_modules.py @@ -40,45 +40,39 @@ def forward(self, x): "'forward'. Please use the underlying operation") r"""Operation equivalent to ``torch.add(Tensor, Tensor)``""" - def add(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def add(self, x: Tensor, y: Tensor) -> Tensor: r = torch.add(x, y) r = self.activation_post_process(r) return r r"""Operation equivalent to ``torch.add(Tensor, float)``""" - def add_scalar(self, x, y): - # type: (Tensor, float) -> Tensor + def add_scalar(self, x: Tensor, y: float) -> Tensor: r = torch.add(x, y) # Note: this operation is not observed because the observation is not # needed for the quantized op. return r r"""Operation equivalent to ``torch.mul(Tensor, Tensor)``""" - def mul(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def mul(self, x: Tensor, y: Tensor) -> Tensor: r = torch.mul(x, y) r = self.activation_post_process(r) return r r"""Operation equivalent to ``torch.mul(Tensor, float)``""" - def mul_scalar(self, x, y): - # type: (Tensor, float) -> Tensor + def mul_scalar(self, x: Tensor, y: float) -> Tensor: r = torch.mul(x, y) # Note: this operation is not observed because the observation is not # needed for the quantized op. return r r"""Operation equivalent to ``torch.cat``""" - def cat(self, x, dim=0): - # type: (List[Tensor], int) -> Tensor + def cat(self, x: List[Tensor], dim: int = 0) -> Tensor: r = torch.cat(x, dim=dim) r = self.activation_post_process(r) return r r"""Operation equivalent to ``relu(torch.add(x,y))``""" - def add_relu(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def add_relu(self, x: Tensor, y: Tensor) -> Tensor: r = torch.add(x, y) r = torch.nn.functional.relu(r) r = self.activation_post_process(r) @@ -101,38 +95,32 @@ def forward(self, x): "'forward'. 
Please use the underlying operation") r"""Operation equivalent to ``torch.add(Tensor, Tensor)``""" - def add(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def add(self, x: Tensor, y: Tensor) -> Tensor: r = torch.add(x, y) return r r"""Operation equivalent to ``torch.add(Tensor, float)``""" - def add_scalar(self, x, y): - # type: (Tensor, float) -> Tensor + def add_scalar(self, x: Tensor, y: float) -> Tensor: r = torch.add(x, y) return r r"""Operation equivalent to ``torch.mul(Tensor, Tensor)``""" - def mul(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def mul(self, x: Tensor, y: Tensor) -> Tensor: r = torch.mul(x, y) return r r"""Operation equivalent to ``torch.mul(Tensor, float)``""" - def mul_scalar(self, x, y): - # type: (Tensor, float) -> Tensor + def mul_scalar(self, x: Tensor, y: float) -> Tensor: r = torch.mul(x, y) return r r"""Operation equivalent to ``torch.cat``""" - def cat(self, x, dim=0): - # type: (List[Tensor], int) -> Tensor + def cat(self, x: List[Tensor], dim: int = 0) -> Tensor: r = torch.cat(x, dim=dim) return r r"""Operation equivalent to ``relu(torch.add(x,y))``""" - def add_relu(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def add_relu(self, x: Tensor, y: Tensor) -> Tensor: r = torch.add(x, y) r = torch.nn.functional.relu(r) return r @@ -195,45 +183,39 @@ def forward(self, x): "'forward'. Please use the underlying operation") r"""Operation equivalent to ``torch.ops.quantized.add``""" - def add(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def add(self, x: Tensor, y: Tensor) -> Tensor: r = ops.quantized.add(x, y, scale=self.scale, zero_point=self.zero_point) r = self.activation_post_process(r) return r r"""Operation equivalent to ``torch.ops.quantized.add(Tensor, float)``""" - def add_scalar(self, x, y): - # type: (Tensor, float) -> Tensor + def add_scalar(self, x: Tensor, y: float) -> Tensor: r = ops.quantized.add_scalar(x, y) # Note: this operation is not observed because the observation is not # needed for the quantized op. return r r"""Operation equivalent to ``torch.ops.quantized.mul(Tensor, Tensor)``""" - def mul(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def mul(self, x: Tensor, y: Tensor) -> Tensor: r = ops.quantized.mul(x, y, scale=self.scale, zero_point=self.zero_point) r = self.activation_post_process(r) return r r"""Operation equivalent to ``torch.ops.quantized.mul(Tensor, float)``""" - def mul_scalar(self, x, y): - # type: (Tensor, float) -> Tensor + def mul_scalar(self, x: Tensor, y: float) -> Tensor: r = ops.quantized.mul_scalar(x, y) # Note: this operation is not observed because the observation is not # needed for the quantized op. 
return r r"""Operation equivalent to ``torch.ops.quantized.cat``""" - def cat(self, x, dim=0): - # type: (List[Tensor], int) -> Tensor + def cat(self, x: List[Tensor], dim: int = 0) -> Tensor: r = ops.quantized.cat(x, scale=self.scale, zero_point=self.zero_point, dim=dim) r = self.activation_post_process(r) return r r"""Operation equivalent to ``torch.ops.quantized.add_relu``""" - def add_relu(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def add_relu(self, x: Tensor, y: Tensor) -> Tensor: r = ops.quantized.add_relu(x, y, scale=self.scale, zero_point=self.zero_point) r = self.activation_post_process(r) return r diff --git a/torch/nn/quantized/modules/normalization.py b/torch/nn/quantized/modules/normalization.py index 4664120ec8b5..c12f74374863 100644 --- a/torch/nn/quantized/modules/normalization.py +++ b/torch/nn/quantized/modules/normalization.py @@ -29,7 +29,6 @@ def _get_name(self): @classmethod def from_float(cls, mod): - activation_post_process = mod.activation_post_process scale, zero_point = mod.activation_post_process.calculate_qparams() new_mod = cls( mod.normalized_shape, mod.weight, mod.bias, float(scale), @@ -63,7 +62,6 @@ def _get_name(self): @classmethod def from_float(cls, mod): - activation_post_process = mod.activation_post_process scale, zero_point = mod.activation_post_process.calculate_qparams() new_mod = cls( mod.num_groups, mod.num_channels, mod.weight, mod.bias, float(scale), int(zero_point), @@ -98,7 +96,6 @@ def _get_name(self): @classmethod def from_float(cls, mod): - activation_post_process = mod.activation_post_process scale, zero_point = mod.activation_post_process.calculate_qparams() new_mod = cls( mod.num_features, mod.weight, mod.bias, float(scale), int(zero_point), @@ -133,7 +130,6 @@ def _get_name(self): @classmethod def from_float(cls, mod): - activation_post_process = mod.activation_post_process scale, zero_point = mod.activation_post_process.calculate_qparams() new_mod = cls( mod.num_features, mod.weight, mod.bias, float(scale), int(zero_point), @@ -168,7 +164,6 @@ def _get_name(self): @classmethod def from_float(cls, mod): - activation_post_process = mod.activation_post_process scale, zero_point = mod.activation_post_process.calculate_qparams() new_mod = cls( mod.num_features, mod.weight, mod.bias, float(scale), int(zero_point), diff --git a/torch/nn/utils/prune.py b/torch/nn/utils/prune.py index 84fa30021ed1..851a551da0d8 100644 --- a/torch/nn/utils/prune.py +++ b/torch/nn/utils/prune.py @@ -587,7 +587,6 @@ def compute_mask(self, t, default_mask): # Compute number of units to prune: amount if int, # else amount * tensor_size nparams_toprune = _compute_nparams_toprune(self.amount, tensor_size) - nparams_tokeep = tensor_size - nparams_toprune # This should raise an error if the number of units to prune is larger # than the number of units in the tensor _validate_pruning_amount(nparams_toprune, tensor_size) diff --git a/torch/quantization/fx/fuse.py b/torch/quantization/fx/fuse.py index 5aabbd66c4b1..59e3851dcd57 100644 --- a/torch/quantization/fx/fuse.py +++ b/torch/quantization/fx/fuse.py @@ -21,7 +21,7 @@ from .quantization_types import Pattern -from typing import Callable, Tuple, Optional +from typing import Callable, Tuple class Fuser: @@ -59,11 +59,12 @@ def load_arg(a): model = GraphModule(input_root, self.fused_graph) return model - def _find_matches(self, root: GraphModule, graph: Graph, - patterns: Dict[Pattern, Callable] - ) -> Dict[str, Tuple[Node, Optional[Any]]]: + def _find_matches( + self, root: GraphModule, graph: Graph, + 
patterns: Dict[Pattern, Callable] + ) -> Dict[str, Tuple[Node, FuseHandler]]: modules = dict(root.named_modules()) - match_map = {} # node name -> (root_node, match_value?) + match_map : Dict[str, Tuple[Node, FuseHandler]] = {} # node name -> (root_node, match_value) def apply_match(pattern, node, match): if isinstance(pattern, tuple): diff --git a/torch/quantization/fx/fusion_patterns.py b/torch/quantization/fx/fusion_patterns.py index b7af6008b3f3..1749484fccec 100644 --- a/torch/quantization/fx/fusion_patterns.py +++ b/torch/quantization/fx/fusion_patterns.py @@ -6,12 +6,25 @@ from .utils import _parent_name from .quantization_types import QuantizerCls from ..fuser_method_mappings import get_fuser_method +from abc import ABC, abstractmethod from typing import Any, Callable, Dict # --------------------- -# Fusion Patterns +# Fusion Pattern Registrations # --------------------- +# Base Pattern Handler +class FuseHandler(ABC): + """ Base handler class for the fusion patterns + """ + def __init__(self, quantizer: QuantizerCls, node: Node): + pass + + @abstractmethod + def fuse(self, quantizer: QuantizerCls, load_arg: Callable, + fuse_custom_config_dict: Dict[str, Any] = None) -> Node: + pass + @register_fusion_pattern((torch.nn.ReLU, torch.nn.Conv1d)) @register_fusion_pattern((torch.nn.ReLU, torch.nn.Conv2d)) @register_fusion_pattern((torch.nn.ReLU, torch.nn.Conv3d)) @@ -27,9 +40,9 @@ @register_fusion_pattern((torch.nn.functional.relu, (torch.nn.BatchNorm1d, torch.nn.Conv1d))) @register_fusion_pattern((torch.nn.functional.relu, (torch.nn.BatchNorm2d, torch.nn.Conv2d))) @register_fusion_pattern((torch.nn.functional.relu, (torch.nn.BatchNorm3d, torch.nn.Conv3d))) -class ConvBNReLUFusion(): +class ConvBNReLUFusion(FuseHandler): def __init__(self, quantizer: QuantizerCls, node: Node): - super().__init__() + super().__init__(quantizer, node) self.relu_node = None self.bn_node = None if (node.op == 'call_function' and node.target is torch.nn.functional.relu) or \ @@ -94,9 +107,9 @@ def fuse(self, quantizer: QuantizerCls, load_arg: Callable, @register_fusion_pattern((torch.nn.ReLU, torch.nn.BatchNorm2d)) @register_fusion_pattern((torch.nn.functional.relu, torch.nn.BatchNorm3d)) @register_fusion_pattern((torch.nn.ReLU, torch.nn.BatchNorm3d)) -class ModuleReLUFusion(): +class ModuleReLUFusion(FuseHandler): def __init__(self, quantizer: QuantizerCls, node: Node): - super().__init__() + super().__init__(quantizer, node) self.relu_node = node assert isinstance(node.args[0], Node) node = node.args[0] diff --git a/torch/quantization/fx/observed_module.py b/torch/quantization/fx/observed_module.py index a95bc184fa10..808a3b36fb4a 100644 --- a/torch/quantization/fx/observed_module.py +++ b/torch/quantization/fx/observed_module.py @@ -2,11 +2,11 @@ import copy from torch.fx import GraphModule # type: ignore from torch.fx.graph import Graph -from typing import Union, Dict, Any +from typing import Union, Dict, Any, List class ObservedGraphModule(GraphModule): - def get_preserved_attr_names(self): + def get_preserved_attr_names(self) -> List[str]: return ['_activation_post_process_map', '_patterns', '_qconfig_map', @@ -35,6 +35,12 @@ def is_observed_module(module: Any) -> bool: return isinstance(module, ObservedGraphModule) class ObservedStandaloneGraphModule(ObservedGraphModule): + def get_preserved_attr_names(self) -> List[str] : + return super().get_preserved_attr_names() + [ + "_standalone_module_input_quantized_idxs", + "_standalone_module_output_quantized_idxs" + ] + def __deepcopy__(self, memo): 
fake_mod = torch.nn.Module() fake_mod.__dict__ = copy.deepcopy(self.__dict__) diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index 46fbed74bdc8..fb5bef0bd0ad 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -755,10 +755,10 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, qconfig = quantizer.qconfig_map[node.name] convert = torch.quantization.quantize_fx._convert_standalone_module_fx # type: ignore observed_standalone_module = quantizer.modules[node.target] + input_quantized_idxs = observed_standalone_module._standalone_module_input_quantized_idxs.tolist() quantized_standalone_module = convert(observed_standalone_module, debug=debug) parent_name, name = _parent_name(node.target) # update the modules dict setattr(quantizer.modules[parent_name], name, quantized_standalone_module) quantizer.modules[node.target] = quantized_standalone_module - # standalone module takes float input - return quantizer.quantized_graph.node_copy(node, load_arg(quantized=False)) + return quantizer.quantized_graph.node_copy(node, load_arg(quantized=input_quantized_idxs)) diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index af9496a66a63..318295270b61 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -102,14 +102,15 @@ def insert_observer( 'call_module', observer_name, (load_arg(node),), {}) observed_node_names_set.add(node.name) -def insert_observer_for_special_module( +def maybe_insert_observer_for_special_module( quantize_handler: QuantizeHandler, modules: Dict[str, torch.nn.Module], - prepare_custom_config_dict: Any, qconfig: Any, node: Node): + prepare_custom_config_dict: Any, qconfig: Any, node: Node) -> Optional[List[int]]: """ Insert observer for custom module and standalone module Returns: standalone_module_input_idxs: the indexs for inputs that needs to be observed by parent module """ assert modules is not None + standalone_module_input_idxs = None if isinstance(quantize_handler, CustomModuleQuantizeHandler): custom_module = modules[node.target] # type: ignore custom_module_class_mapping = prepare_custom_config_dict.get( @@ -129,19 +130,22 @@ def insert_observer_for_special_module( class_config_map = {x[0]: (x[1], x[2]) for x in standalone_module_class_configs} name_config_map = {x[0]: (x[1], x[2]) for x in standalone_module_name_configs} config = class_config_map.get(type(standalone_module), (None, None)) - config = name_config_map.get(node.target, (None, None)) - standalone_module_qconfig_dict = {"": qconfig} if config[0] is None else config[0] - standalone_prepare_config_dict = {} if config[1] is None else config[1] + config = name_config_map.get(node.target, config) + sm_qconfig_dict = {"": qconfig} if config[0] is None else config[0] + sm_prepare_config_dict = {} if config[1] is None else config[1] prepare = \ torch.quantization.quantize_fx._prepare_standalone_module_fx # type: ignore observed_standalone_module = \ - prepare(standalone_module, standalone_module_qconfig_dict, standalone_prepare_config_dict) + prepare(standalone_module, sm_qconfig_dict, sm_prepare_config_dict) + standalone_module_input_idxs = observed_standalone_module.\ + _standalone_module_input_quantized_idxs.int().tolist() observed_standalone_module = mark_observed_standalone_module( observed_standalone_module) parent_name, name = _parent_name(node.target) setattr(modules[parent_name], name, 
observed_standalone_module) modules[node.target] = observed_standalone_module # type: ignore + return standalone_module_input_idxs def insert_observer_for_output_of_the_node( node: Node, @@ -155,7 +159,8 @@ def insert_observer_for_output_of_the_node( observed_graph: Graph, load_arg: Callable, observed_node_names_set: Set[str], - matched_nodes: Optional[List[Node]]): + matched_nodes: Optional[List[Node]], + standalone_module_input_idxs: Optional[List[int]]): """ Insert observer/fake_quantize module for output of the observed module if needed """ @@ -215,8 +220,13 @@ def input_is_observed(arg): observed_node_names_set.add(node.name) elif isinstance(quantize_handler, StandaloneModuleQuantizeHandler): - # output is observed in the standalone module - return + assert node.op == "call_module" + assert isinstance(node.target, str) + sm_out_qidxs = modules[node.target]._standalone_module_output_quantized_idxs.tolist() # type: ignore + output_is_quantized = 0 in sm_out_qidxs + + if output_is_quantized: + observed_node_names_set.add(node.name) elif (quantize_handler.all_node_args and input_output_observed(quantize_handler)): # observer for outputs @@ -226,6 +236,16 @@ def input_is_observed(arg): activation_post_process_map, env, observed_graph, load_arg, observed_node_names_set) + # insert observer for input of standalone module + if standalone_module_input_idxs is not None: + for idx in standalone_module_input_idxs: + if node.args[idx].name not in observed_node_names_set: # type: ignore + new_observer = qconfig.activation() + insert_observer( + node, new_observer, model, + activation_post_process_map, env, observed_graph, + load_arg, observed_node_names_set) + def insert_observer_for_input_arg_of_observed_node( node: Node, observed_node_names_set: Set[str], quants: Dict[str, Tuple[DefaultQuantizeHandler, Callable]], @@ -373,10 +393,19 @@ def _prepare(self, model: GraphModule, qconfig_dict: Any, """ standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. 
- When we are preparing a standalone module: - both input and output are observed in prepared standalone module + How the standalone module is observed is specified by `input_quantized_idxs` and + `output_quantized_idxs` in the prepare_custom_config for the standalone module Returns: model(GraphModule): prepared standalone module + attributes: + _standalone_module_input_quantized_idxs(List[Int]): a list of + indexes for the graph input that is expected to be quantized, + same as input_quantized_idxs configuration provided + for the standalone module + _standalone_module_output_quantized_idxs(List[Int]): a list of + indexs for the graph output that is quantized + same as input_quantized_idxs configuration provided + for the standalone module """ if prepare_custom_config_dict is None: prepare_custom_config_dict = {} @@ -430,8 +459,6 @@ def _prepare(self, model: GraphModule, qconfig_dict: Any, def load_arg(a): return map_arg(a, lambda node: env[node.name]) - # indexes for the inputs that needs to be observed - standalone_module_observed_input_idxs: List[int] = [] graph_inputs = [] for node in model.graph.nodes: if node.op == 'placeholder': @@ -487,14 +514,15 @@ def load_arg(a): # parent if qconfig is not None: assert obj is not None - insert_observer_for_special_module( - obj, self.modules, prepare_custom_config_dict, qconfig, - node) + standalone_module_input_idxs = \ + maybe_insert_observer_for_special_module( + obj, self.modules, prepare_custom_config_dict, qconfig, + node) insert_observer_for_output_of_the_node( node, obj, qconfig, self.modules, model, pattern, self.activation_post_process_map, env, observed_graph, load_arg, observed_node_names_set, - matched_nodes) + matched_nodes, standalone_module_input_idxs) else: env[node.name] = observed_graph.node_copy(node, load_arg) @@ -516,6 +544,21 @@ def load_arg(a): model = GraphModule(model, observed_graph) self.save_state(model) model = mark_observed_module(model) + if is_standalone_module: + assert result_node is not None + assert isinstance(result_node.args[0], Node), \ + "standalone module only supports returning simple value currently"\ + "(not tuple, dict etc.)" + # indicator for whether output is observed or not. + # This used for correctly quantize standalone modules + output_is_observed = \ + result_node.args[0].name in observed_node_names_set + # these inputs are observed in parent + # converting List[int] to Tensor since module attribute is + # Union[Tensor, Module] + model._standalone_module_input_quantized_idxs = \ + torch.Tensor(input_quantized_idxs) + model._standalone_module_output_quantized_idxs = torch.Tensor(output_quantized_idxs) return model def save_state(self, observed: GraphModule) -> None: @@ -569,8 +612,10 @@ def _convert(self, model: GraphModule, debug: bool = False, """ standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. - Returns a quantized standalone module which accepts float input - and produces float output. 
+ Returns a quantized standalone module, whether input/output is quantized is + specified by prepare_custom_config_dict, with + input_quantized_idxs, output_quantized_idxs, please + see docs for prepare_fx for details """ if convert_custom_config_dict is None: convert_custom_config_dict = {} @@ -627,36 +672,50 @@ def load_x(n: Node) -> Node: else: return env[n.name] - def load_arg(quantized: Optional[Union[List[Any], bool, Tuple[Any, ...]]] + def load_arg(quantized: Optional[Union[List[int], bool, Tuple[int, ...]]] ) -> Callable[[Node], Argument]: """ Input: quantized, which can be None, list, boolean or tuple - - if quantized is a list or tuple, then arg should be a list and - the args with corresponding indexes will be quantized - - if quantized is a boolean, then all args will be - quantized/not quantized - if quantized is None, then we'll load the node as long as it exists + - if quantized is a boolean, then all args will be + quantized/not quantized + - if quantized is an empty list or tuple, then it is the same as load_arg(quantized=False) + - if quantized is a list or tuple, then arg should be a list and + the args with corresponding indexes will be quantized Output: fn which takes arg_or_args, and loads them from the corresponding environment depending on the value of quantized. """ assert quantized is None or \ isinstance(quantized, (tuple, list, bool)), type(quantized) + if isinstance(quantized, (tuple, list)) and len(quantized) == 0: + # empty tuple or list means nothing is quantized + quantized = False def load_arg_impl(arg_or_args): - if quantized is None: + # we'll update the format of `quantized` + # to better match arg_or_args + updated_quantized: Optional[Union[List[int], bool, Tuple[int, ...]]] = quantized + + if isinstance(quantized, (tuple, list)) and \ + len(quantized) == 1 and isinstance(arg_or_args, Node): + # when argument is one Node instead of tuple, we just need to check + # 0 is in the quantized list + updated_quantized = 0 in quantized + + if updated_quantized is None: return map_arg(arg_or_args, load_x) - if isinstance(quantized, bool): + if isinstance(updated_quantized, bool): return map_arg( arg_or_args, - load_quantized if quantized else load_non_quantized) - elif isinstance(quantized, (tuple, list)): + load_quantized if updated_quantized else load_non_quantized) + elif isinstance(updated_quantized, (tuple, list)): assert isinstance(arg_or_args, (tuple, list)), arg_or_args loaded_args = [] # for now, we only support quantizing positional arguments for i, a in enumerate(arg_or_args): - if i in quantized: + if i in updated_quantized: loaded_args.append(map_arg(a, load_quantized)) else: loaded_args.append(map_arg(a, load_non_quantized)) @@ -690,10 +749,10 @@ def node_arg_is_quantized(node_arg: Any) -> bool: def is_output_quantized(node: Node, obj: QuantizeHandler) -> bool: """ Check if output node is quantized or not """ assert self.modules is not None - # by default the output is expected to be quantized + # by default the output for a quantizable node is expected to be quantized quantized = True - # Need to get correct quantized/non-quantized state for the output + # Need to get correct quantized/non-quantized state forn the output # of CopyNode if type(obj) in [ CopyNode, @@ -750,7 +809,7 @@ def insert_quantize_node(node: Node) -> None: "output_quantized_idxs", []) for node in model.graph.nodes: - if node.op == 'output': + if node.op == "output": cur_output_node_idx = output_node_seen_cnt output_node_seen_cnt += 1 if cur_output_node_idx in 
@@ -690,10 +749,10 @@ def node_arg_is_quantized(node_arg: Any) -> bool:
         def is_output_quantized(node: Node, obj: QuantizeHandler) -> bool:
             """ Check if output node is quantized or not """
             assert self.modules is not None
-            # by default the output is expected to be quantized
+            # by default the output for a quantizable node is expected to be quantized
             quantized = True

-            # Need to get correct quantized/non-quantized state for the output
+            # Need to get correct quantized/non-quantized state for the output
             # of CopyNode
             if type(obj) in [
                     CopyNode,
@@ -750,7 +809,7 @@ def insert_quantize_node(node: Node) -> None:
             "output_quantized_idxs", [])

         for node in model.graph.nodes:
-            if node.op == 'output':
+            if node.op == "output":
                 cur_output_node_idx = output_node_seen_cnt
                 output_node_seen_cnt += 1
                 if cur_output_node_idx in output_quantized_idxs:
@@ -775,12 +834,19 @@ def insert_quantize_node(node: Node) -> None:
                     quantized = False
                 else:
                     assert obj is not None
+                    # For a standalone module we decide whether the output is
+                    # quantized before convert, and for a non-standalone module
+                    # after convert, since _standalone_module_output_quantized_idxs
+                    # is only available on an observed standalone module
+                    if is_observed_standalone_module_node:
+                        out_quant_idxs = self.modules[node.target]._standalone_module_output_quantized_idxs.tolist()  # type: ignore
+                        assert len(out_quant_idxs) <= 1, "Currently a standalone module only supports one output"
+                        quantized = 0 in out_quant_idxs
+
                     result = obj.convert(
                         self, node, load_arg, debug=debug,
                         convert_custom_config_dict=convert_custom_config_dict)
-                    if is_observed_standalone_module_node:
-                        quantized = False
-                    else:
+                    if not is_observed_standalone_module_node:
                         quantized = is_output_quantized(node, obj)

                 if quantized:
@@ -929,7 +995,7 @@ def _find_matches(
             standalone_module_names = []
         match_map: Dict[str, MatchResult] = {}
-        all_matched = set()
+        all_matched: Set[str] = set()

         def record_match(pattern, node, matched):
             if isinstance(pattern, tuple):
diff --git a/torch/quantization/fx/utils.py b/torch/quantization/fx/utils.py
index c1f849803342..8285e204b1ed 100644
--- a/torch/quantization/fx/utils.py
+++ b/torch/quantization/fx/utils.py
@@ -9,7 +9,7 @@
     Node,
 )

-from typing import Callable, Optional, List, Dict, Any
+from typing import Callable, Optional, List, Dict, Any, Set

 # turn foo.bar -> ['foo', 'bar']
 def _parent_name(target):
@@ -140,7 +140,7 @@ def get_next_qparams_idx(module, qparams):
     inputs.append(graph.create_node('get_attr', qparam_full_path))
     return graph.create_node('call_function', quantize_op, tuple(inputs), {})

-def get_custom_module_class_keys(custom_config_dict, custom_config_dict_key):
+def get_custom_module_class_keys(custom_config_dict, custom_config_dict_key) -> List[Any]:
     r""" Get all the unique custom module keys in the custom config dict
     e.g.
     Input:
@@ -163,7 +163,7 @@ def get_custom_module_class_keys(custom_config_dict, custom_config_dict_key):
     [CustomModule1, CustomModule2, CustomModule3]
     """
     # using set to dedup
-    float_custom_module_classes = set()
+    float_custom_module_classes: Set[Any] = set()
     custom_module_mapping = custom_config_dict.get(custom_config_dict_key, {})
     for quant_mode in ["static", "dynamic", "weight_only"]:
         quant_mode_custom_module_config = custom_module_mapping.get(quant_mode, {})
diff --git a/torch/quantization/observer.py b/torch/quantization/observer.py
index 7addaa622962..2cc579f66087 100644
--- a/torch/quantization/observer.py
+++ b/torch/quantization/observer.py
@@ -390,6 +390,8 @@ def __init__(self, dtype=torch.quint8, qscheme=torch.per_tensor_affine,

     def forward(self, x_orig):
         r"""Records the running minimum and maximum of ``x``."""
+        if x_orig.numel() == 0:
+            return x_orig
         x = x_orig.detach()  # avoid keeping autograd tape
         x = x.to(self.min_val.dtype)
         min_val_cur, max_val_cur = torch._aminmax(x)
@@ -463,6 +465,8 @@ def __init__(self, averaging_constant=0.01, dtype=torch.quint8,
                          quant_max=quant_max)

     def forward(self, x_orig):
+        if x_orig.numel() == 0:
+            return x_orig
         x = x_orig.detach()  # avoid keeping autograd tape
         x = x.to(self.min_val.dtype)
         min_val = self.min_val
@@ -532,6 +536,8 @@ def forward(self, x_orig):
         return self._forward(x_orig)

     def _forward(self, x_orig):
+        if x_orig.numel() == 0:
+            return x_orig
         x = x_orig.detach()  # avoid keeping autograd tape
         min_vals = self.min_vals
         max_vals = self.max_vals
@@ -638,6 +644,8 @@ def __init__(self, averaging_constant=0.01, ch_axis=0, dtype=torch.quint8,
         self.averaging_constant = averaging_constant

     def forward(self, x_orig):
+        if x_orig.numel() == 0:
+            return x_orig
         x = x_orig.detach()  # avoid keeping autograd tape
         x = x.to(self.min_vals.dtype)
         min_vals = self.min_vals
@@ -878,6 +886,8 @@ def _combine_histograms(self,
         return orig_hist

     def forward(self, x_orig: torch.Tensor) -> torch.Tensor:
+        if x_orig.numel() == 0:
+            return x_orig
         x = x_orig.detach()
         min_val = self.min_val
         max_val = self.max_val
diff --git a/torch/quantization/qconfig.py b/torch/quantization/qconfig.py
index 8da4ad6bb182..2d91d8ab6b3e 100644
--- a/torch/quantization/qconfig.py
+++ b/torch/quantization/qconfig.py
@@ -3,6 +3,8 @@
 from .fake_quantize import *
 import torch.nn as nn

+from typing import Union
+
 class QConfig(namedtuple('QConfig', ['activation', 'weight'])):
     """
     Describes how to quantize a layer or a part of the network by providing
@@ -109,3 +111,18 @@ def get_default_qat_qconfig(backend='fbgemm'):
     else:
         qconfig = default_qat_qconfig
     return qconfig
+
+def assert_valid_qconfig(qconfig: Union[QConfig, QConfigDynamic],
+                         mod: torch.nn.Module) -> None:
+    is_conv_transpose_mod = (
+        isinstance(mod, torch.nn.ConvTranspose1d) or
+        isinstance(mod, torch.nn.ConvTranspose2d) or
+        isinstance(mod, torch.nn.ConvTranspose3d))
+    if is_conv_transpose_mod:
+        example_observer = qconfig.weight()
+        is_per_channel = (
+            isinstance(example_observer, torch.quantization.PerChannelMinMaxObserver) or
+            isinstance(example_observer, torch.quantization.MovingAveragePerChannelMinMaxObserver)
+        )
+        assert not is_per_channel, \
+            'Per channel weight observer is not supported yet for ConvTranspose{n}d.'
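One observable effect of the numel() == 0 early returns above: running an empty batch through an observer is now a no-op rather than an error. A small sketch with default MinMaxObserver settings:

import torch
from torch.quantization.observer import MinMaxObserver

obs = MinMaxObserver()
obs(torch.empty(0))              # no elements: statistics stay untouched
obs(torch.tensor([-1.0, 2.0]))   # normal update
print(obs.min_val, obs.max_val)  # tensor(-1.) tensor(2.)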
diff --git a/torch/quantization/quantize.py b/torch/quantization/quantize.py
index 1be867e0a299..77752a8af9c9 100644
--- a/torch/quantization/quantize.py
+++ b/torch/quantization/quantize.py
@@ -50,6 +50,8 @@ def _propagate_qconfig_helper(module, qconfig_dict, allow_list=None,
     module_qconfig = qconfig_dict.get(prefix, module_qconfig)
     module_qconfig = getattr(module, 'qconfig', module_qconfig)

+    torch.quantization.qconfig.assert_valid_qconfig(module_qconfig, module)
+
     module.qconfig = module_qconfig
     for name, child in module.named_children():
         module_prefix = prefix + '.' + name if prefix else name
@@ -256,9 +258,12 @@ def _remove_activation_post_process(module):
         delattr(module, 'activation_post_process')

     # remove activation_post_proceess hook
+    handle_ids_to_remove = set()
     for handle_id, hook_fn in module._forward_hooks.items():
         if hook_fn is _observer_forward_hook:
-            module._forward_hooks.pop(handle_id)
+            handle_ids_to_remove.add(handle_id)
+    for handle_id in handle_ids_to_remove:
+        module._forward_hooks.pop(handle_id)

 # TODO: rename to something more general
 def _remove_qconfig(module):
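The two-pass removal above avoids resizing _forward_hooks while iterating over it. The same pattern in isolation (illustrative code, not taken from the patch):

from collections import OrderedDict

def _observer_hook(*args):  # stand-in for _observer_forward_hook
    pass

hooks = OrderedDict([(1, _observer_hook), (2, print), (3, _observer_hook)])
# collect the ids first, then remove, so the dict is never mutated mid-iteration
to_remove = {hid for hid, fn in hooks.items() if fn is _observer_hook}
for hid in to_remove:
    hooks.pop(hid)
print(list(hooks))  # [2]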
diff --git a/torch/quantization/quantize_fx.py b/torch/quantization/quantize_fx.py
index cba104b8f783..89ba877ffe78 100644
--- a/torch/quantization/quantize_fx.py
+++ b/torch/quantization/quantize_fx.py
@@ -107,8 +107,20 @@ def _prepare_standalone_module_fx(
     standalone_module means it a submodule that is not inlined in parent module,
     and will be quantized separately as one unit.

-    Both input and output of the module are observed in the
-    standalone module.
+    How the standalone module is observed is specified by `input_quantized_idxs` and
+    `output_quantized_idxs` in the prepare_custom_config_dict for the standalone module
+
+    Returns:
+        model(GraphModule): prepared standalone module
+        attributes:
+            _standalone_module_input_quantized_idxs(List[Int]): a list of
+                indexes for the graph inputs that are expected to be quantized,
+                same as the input_quantized_idxs configuration provided
+                for the standalone module
+            _standalone_module_output_quantized_idxs(List[Int]): a list of
+                indexes for the graph outputs that are quantized,
+                same as the output_quantized_idxs configuration provided
+                for the standalone module
     """
     return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict,
                        is_standalone_module=True)
@@ -378,8 +390,9 @@ def _convert_standalone_module_fx(
     r""" [Internal use only] Convert a model produced by
     :func:`~torch.quantization.prepare_standalone_module_fx` and convert it to a quantized model

-    Return:
-      A quantized standalone module which accepts float input
-      and produces float output.
+    Returns a quantized standalone module; whether the input and output
+    are quantized is specified by the input_quantized_idxs and
+    output_quantized_idxs entries of prepare_custom_config_dict.
+    See the docs for prepare_fx for details.
     """
     return _convert_fx(graph_module, debug, convert_custom_config_dict,
                        is_standalone_module=True)
diff --git a/torch/serialization.py b/torch/serialization.py
index ebc5d0a08541..3b6f5828d858 100644
--- a/torch/serialization.py
+++ b/torch/serialization.py
@@ -192,7 +192,7 @@ def storage_to_tensor_type(storage):

 def _is_path(name_or_buffer):
     return isinstance(name_or_buffer, str) or \
-        (sys.version_info[0] == 3 and isinstance(name_or_buffer, pathlib.Path))
+        isinstance(name_or_buffer, pathlib.Path)


 class _opener(object):
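With the Python 2 guard gone from _is_path, pathlib.Path objects are accepted unconditionally by the serialization entry points. A quick sketch (the file name is arbitrary):

import pathlib
import torch

path = pathlib.Path("checkpoint.pt")  # arbitrary example path
torch.save(torch.ones(3), path)
print(torch.load(path))               # tensor([1., 1., 1.])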
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index 60e9fdc389ce..c658dbef10e6 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -639,6 +639,30 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False):
     return out

+def sample_inputs_flip(op_info, device, dtype, requires_grad):
+    tensors = (
+        make_tensor((S, M, S), device, dtype, low=None, high=None, requires_grad=requires_grad),
+        make_tensor((S, 0, M), device, dtype, low=None, high=None, requires_grad=requires_grad)
+    )
+
+    dims = ((0, 1, 2), (0,), (0, 2), (-1,))
+
+    # On CUDA, `dims=()` errors out with IndexError
+    # Reference: https://github.com/pytorch/pytorch/issues/49982
+    if device == 'cpu':
+        dims = dims + ((),)  # type: ignore
+
+    samples = [SampleInput(tensor, kwargs={'dims': dim}) for tensor, dim in product(tensors, dims)]
+
+    return samples
+
+def sample_inputs_fliplr_flipud(op_info, device, dtype, requires_grad):
+    tensors = (
+        make_tensor((S, M, S), device, dtype, low=None, high=None, requires_grad=requires_grad),
+        make_tensor((S, 0, M), device, dtype, low=None, high=None, requires_grad=requires_grad)
+    )
+    return [SampleInput(tensor) for tensor in tensors]
+
 # Operator database (sorted alphabetically)
 op_db: List[OpInfo] = [
     # NOTE: CPU complex acos produces incorrect outputs (https://github.com/pytorch/pytorch/issues/42952)
@@ -810,7 +834,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False):
                      ndimensional=False,
                      dtypes=all_types_and_complex_and(torch.bool),
                      default_test_dtypes=floating_and_complex_types(),
-                     supports_tensor_out=False,
+                     supports_tensor_out=True,
                      test_inplace_grad=False,),
     SpectralFuncInfo('fft.fftn',
                      aten_name='fft_fftn',
@@ -818,7 +842,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False):
                      ndimensional=True,
                      dtypes=all_types_and_complex_and(torch.bool),
                      default_test_dtypes=floating_and_complex_types(),
-                     supports_tensor_out=False,
+                     supports_tensor_out=True,
                      test_inplace_grad=False,
                      decorators=[precisionOverride(
                          {torch.float: 1e-4, torch.cfloat: 1e-4})],),
@@ -828,7 +852,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False):
                      ndimensional=False,
                      dtypes=all_types_and_complex_and(torch.bool),
                      default_test_dtypes=floating_and_complex_types(),
-                     supports_tensor_out=False,
+                     supports_tensor_out=True,
                      test_inplace_grad=False,),
     SpectralFuncInfo('fft.rfft',
                      aten_name='fft_rfft',
@@ -836,7 +860,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False):
                      ndimensional=False,
                      dtypes=all_types_and(torch.bool),
                      default_test_dtypes=floating_and_complex_types(),
-                     supports_tensor_out=False,
+                     supports_tensor_out=True,
                      test_inplace_grad=False,),
     SpectralFuncInfo('fft.rfftn',
                      aten_name='fft_rfftn',
@@ -844,7 +868,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False):
                      ndimensional=True,
                      dtypes=all_types_and(torch.bool),
                      default_test_dtypes=floating_and_complex_types(),
-                     supports_tensor_out=False,
+                     supports_tensor_out=True,
                      test_inplace_grad=False,
                      decorators=[precisionOverride({torch.float: 1e-4})],),
     SpectralFuncInfo('fft.ifft',
@@ -853,7 +877,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False):
                      ndimensional=False,
                      dtypes=all_types_and_complex_and(torch.bool),
                      default_test_dtypes=floating_and_complex_types(),
-                     supports_tensor_out=False,
+                     supports_tensor_out=True,
                      test_inplace_grad=False,),
     SpectralFuncInfo('fft.ifftn',
                      aten_name='fft_ifftn',
@@ -861,7 +885,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False):
                      ndimensional=True,
                      dtypes=all_types_and_complex_and(torch.bool),
                      default_test_dtypes=floating_and_complex_types(),
-                     supports_tensor_out=False,
+                     supports_tensor_out=True,
                      test_inplace_grad=False,),
     SpectralFuncInfo('fft.ihfft',
                      aten_name='fft_ihfft',
@@ -869,7 +893,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False):
                      ndimensional=False,
                      dtypes=all_types_and(torch.bool),
                      default_test_dtypes=floating_types(),
-                     supports_tensor_out=False,
+                     supports_tensor_out=True,
                      test_inplace_grad=False,),
     SpectralFuncInfo('fft.irfft',
                      aten_name='fft_irfft',
@@ -877,7 +901,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False):
                      ndimensional=False,
                      dtypes=all_types_and_complex_and(torch.bool),
                      default_test_dtypes=floating_and_complex_types(),
-                     supports_tensor_out=False,
+                     supports_tensor_out=True,
                      test_inplace_grad=False,),
     SpectralFuncInfo('fft.irfftn',
                      aten_name='fft_irfftn',
@@ -885,8 +909,26 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False):
                      ndimensional=True,
                      dtypes=all_types_and_complex_and(torch.bool),
                      default_test_dtypes=floating_and_complex_types(),
-                     supports_tensor_out=False,
+                     supports_tensor_out=True,
                      test_inplace_grad=False,),
+    OpInfo('flip',
+           op=torch.flip,
+           dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16),
+           sample_inputs_func=sample_inputs_flip,
+           test_inplace_grad=False,
+           supports_tensor_out=False),
+    OpInfo('fliplr',
+           op=torch.fliplr,
+           dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16),
+           sample_inputs_func=sample_inputs_fliplr_flipud,
+           test_inplace_grad=False,
+           supports_tensor_out=False),
+    OpInfo('flipud',
+           op=torch.flipud,
+           dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16),
+           sample_inputs_func=sample_inputs_fliplr_flipud,
+           test_inplace_grad=False,
+           supports_tensor_out=False),
     UnaryUfuncInfo('log',
                    ref=np.log,
                    domain=(0, float('inf')),
@@ -1573,13 +1615,6 @@ def method_tests():
     ('reshape_as', (S, S, S), (non_differentiable(torch.rand(S * S, S)),)),
     ('reshape_as', (), (non_differentiable(torch.tensor(42.)),), 'scalar'),
     ('reshape_as', (), (non_differentiable(torch.rand(1, 1)),), 'scalar_to_dims'),
-    ('flip', (S, S, S), ([0],), 'd0'),
-    ('flip', (S, S, S), ([0, 1, 2],), 'd012'),
-    ('flip', (S, S, S), ([0, 2],), 'd02'),
-    ('flip', (S, S, S), ([2, 0],), 'd20'),
-    ('flip', (S, S, S), ([-1],), 'neg_d'),
-    ('fliplr', (S, S, S), ()),
-    ('flipud', (S, S, S), ()),
     ('roll', (S, S, S), (0, 0), 'd0'),
     ('roll', (S, S, S), (1, 2), 'd12'),
     ('roll', (S, S, S), (0, 2,), 'd02'),
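For reference, the operators that the new OpInfo entries cover (and that the removed method_tests entries used to exercise):

import torch

t = torch.arange(6).reshape(2, 3)
print(torch.flip(t, dims=(0,)))  # reverse along dim 0
print(torch.fliplr(t))           # reverse left/right (dim 1)
print(torch.flipud(t))           # reverse up/down (dim 0)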
diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py
index c588f69c2875..022255a5298b 100644
--- a/torch/testing/_internal/common_nn.py
+++ b/torch/testing/_internal/common_nn.py
@@ -2988,7 +2988,7 @@ def fractional_max_pool3d_test(test_case):
                             .scale_factor(std::vector<double>({3., 3., 3.}))
                             .mode(torch::kTrilinear)
                             .align_corners(false)''',
-        input_size=(1, 2, 3, 4, 4),
+        input_size=(1, 2, 3, 4, 5),
         fullname='interpolate_trilinear_scale_3d',
         # See https://github.com/pytorch/pytorch/issues/5006
         precision=3e-4,
@@ -4866,7 +4866,7 @@ def __call__(self, test_case):

         if self.should_test_pickle:
             # TODO: do this with in-memory files as soon as torch.save will support it
-            with TemporaryFile() as f:
+            with tempfile.TemporaryFile() as f:
                 test_case._forward(module, input)
                 torch.save(module, f)
                 f.seek(0)
diff --git a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py
index c7fdbe536061..15d5cfeca214 100644
--- a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py
+++ b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py
@@ -11,6 +11,7 @@
 import torch.testing._internal.dist_utils
 from torch.autograd import Function
 from torch.autograd.function import once_differentiable
+from torch.distributed.rpc import RRef
 from torch.testing._internal.common_utils import IS_MACOS
 from torch.testing._internal.dist_utils import (
     dist_init,
@@ -70,8 +71,7 @@ def create_tensor():


 @torch.jit.script
-def create_torchscript_tensor():
-    # type: () -> Tensor
+def create_torchscript_tensor() -> torch.Tensor:
     return torch.ones((3, 3)).requires_grad_()


@@ -94,8 +94,7 @@ def my_script_add(t1, t2):


 @torch.jit.script
-def my_script_ref_add(ref_t1, t2):
-    # type: (RRef[Tensor], Tensor) -> Tensor
+def my_script_ref_add(ref_t1: RRef[torch.Tensor], t2: torch.Tensor) -> torch.Tensor:
     t1 = ref_t1.to_here()
     return torch.add(t1, t2)

diff --git a/torch/testing/_internal/distributed/rpc/jit/dist_autograd_test.py b/torch/testing/_internal/distributed/rpc/jit/dist_autograd_test.py
index ee3ebdb33eff..5ae40cdea065 100644
--- a/torch/testing/_internal/distributed/rpc/jit/dist_autograd_test.py
+++ b/torch/testing/_internal/distributed/rpc/jit/dist_autograd_test.py
@@ -34,8 +34,7 @@ def test_get_gradients(self):
         dst_rank = self.rank

         @torch.jit.script
-        def dist_get_gradients(context_id):
-            # type: (int) -> (Dict[Tensor, Tensor])
+        def dist_get_gradients(context_id: int) -> (Dict[Tensor, Tensor]):
             return dist_autograd.get_gradients(context_id)

         FileCheck().check("get_gradients").run(str(dist_get_gradients.graph))
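The test edits above follow one mechanical pattern: mypy-style "# type:" comments become ordinary Python 3 annotations, which torch.jit.script parses directly. A self-contained sketch of the pattern; the function itself is made up for illustration.

import torch

# before: def scaled_add(t1, t2, alpha):  # type: (Tensor, Tensor, float) -> Tensor
@torch.jit.script
def scaled_add(t1: torch.Tensor, t2: torch.Tensor, alpha: float) -> torch.Tensor:
    return t1 + alpha * t2

print(scaled_add(torch.ones(2), torch.ones(2), 0.5))  # tensor([1.5000, 1.5000])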
diff --git a/torch/testing/_internal/distributed/rpc/jit/rpc_test_faulty.py b/torch/testing/_internal/distributed/rpc/jit/rpc_test_faulty.py
index 656f25322274..96ede7231a97 100644
--- a/torch/testing/_internal/distributed/rpc/jit/rpc_test_faulty.py
+++ b/torch/testing/_internal/distributed/rpc/jit/rpc_test_faulty.py
@@ -3,6 +3,7 @@
 import torch
 import torch.distributed.rpc as rpc
 from torch import Tensor
+from torch.distributed.rpc import RRef
 from torch.testing._internal.dist_utils import (
     dist_init,
     worker_name,
@@ -63,18 +64,15 @@ def rpc_async_call_future_ret(
     return fut

 @torch.jit.script
-def rref_to_here(rref_var):
-    # type: (RRef[Tensor]) -> Tensor
+def rref_to_here(rref_var: RRef[Tensor]) -> Tensor:
     return rref_var.to_here()

 @torch.jit.script
-def rref_to_here_with_timeout(rref_var, timeout):
-    # type: (RRef[Tensor], float) -> Tensor
+def rref_to_here_with_timeout(rref_var: RRef[Tensor], timeout: float) -> Tensor:
     return rref_var.to_here(timeout)

 @torch.jit.script
-def rpc_async_with_rref_arg(dst_worker_name, args):
-    # type: (str, Tuple[RRef[Tensor]]) -> Tensor
+def rpc_async_with_rref_arg(dst_worker_name: str, args: Tuple[RRef[Tensor]]) -> Tensor:
     fut = rpc.rpc_async(dst_worker_name, rref_to_here, args)
     ret = fut.wait()
     return ret
diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py
index 8eec8100270b..ede2471aa3a2 100644
--- a/torch/testing/_internal/distributed/rpc/rpc_test.py
+++ b/torch/testing/_internal/distributed/rpc/rpc_test.py
@@ -1335,7 +1335,11 @@ def convert_remote_to_local(event_name):
             for event in events
             if convert_remote_to_local(event.name) in EXPECTED_REMOTE_EVENTS
         ]
-        self.assertEqual(remote_events_list, EXPECTED_REMOTE_EVENTS)
+        self.assertEqual(
+            set(remote_events_list),
+            set(EXPECTED_REMOTE_EVENTS),
+            f"Mismatch between profiled events: {set(remote_events_list)} and expected events: {set(EXPECTED_REMOTE_EVENTS)}",
+        )

     @dist_init
     def test_profiler_remote_events_profiled(self):
@@ -1579,8 +1583,8 @@ def _profiler_test_with_rpc(self, rpc_exec_mode, func, args, use_record_function
             scope_event = get_function_event(events, "foo")
             # Since RPC call is within the scope, its CPU interval should be
             # contained within foo's interval.
-            self.assertTrue(scope_event.time_range.start < rpc_event.time_range.start)
-            self.assertTrue(scope_event.time_range.end > rpc_event.time_range.end)
+            self.assertLessEqual(scope_event.time_range.start, rpc_event.time_range.start)
+            self.assertGreaterEqual(scope_event.time_range.end, rpc_event.time_range.end)
         # the sender, dest worker, function run, and type of RPC should all
         # be recorded.
         self_worker_name = worker_name(self.rank)
@@ -1776,7 +1780,13 @@ def _assert_top_level_events(self, process_global_events, expected_top_level_eve
                 if time_range.start > last_end_time:
                     top_level_event_names.append(event_name)
                     last_end_time = time_range.end
-        self.assertEqual(sorted(top_level_event_names), sorted(expected_top_level_event_names))
+        top_level_event_names = sorted(top_level_event_names)
+        expected_top_level_event_names = sorted(expected_top_level_event_names)
+        self.assertEqual(
+            top_level_event_names,
+            expected_top_level_event_names,
+            f"Expected events {expected_top_level_event_names}, but got {top_level_event_names}",
+        )

     @dist_init
     def test_server_process_global_profiler(self):
@@ -1799,9 +1809,12 @@ def test_server_process_global_profiler(self):
         outer_profile_rref.rpc_sync().__exit__(None, None, None)

         inner_events = rpc.rpc_sync(dst_worker_name, get_events_from_profile, (inner_profile_rref,))
-        self._assert_top_level_events(inner_events, ['aten::sub'])
+        expected_inner_events = ['aten::sub']
+        expected_outer_events = expected_inner_events + ['aten::add']
+
+        self._assert_top_level_events(inner_events, expected_inner_events)
         outer_events = rpc.rpc_sync(dst_worker_name, get_events_from_profile, (outer_profile_rref,))
-        self._assert_top_level_events(outer_events, ['aten::add', 'aten::sub'])
+        self._assert_top_level_events(outer_events, expected_outer_events)

         inner_profile_rref.rpc_sync().key_averages()
         outer_profile_rref.rpc_sync().key_averages()
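The profiler-test changes above share one idea: compare event names as sets so ordering cannot cause flakes, and pass a message so a failure prints both sides. An isolated illustration using plain unittest:

import unittest

class EventCheck(unittest.TestCase):
    def test_events(self):
        expected = {"aten::add", "aten::sub"}
        profiled = ["aten::sub", "aten::add"]  # order differs; equal as sets
        self.assertEqual(set(profiled), expected,
                         f"Mismatch between profiled events: {set(profiled)} "
                         f"and expected events: {expected}")

if __name__ == "__main__":
    unittest.main()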
diff --git a/torch/testing/_internal/expecttest.py b/torch/testing/_internal/expecttest.py
index 9e46a9a84a37..4dae7ebf03dc 100644
--- a/torch/testing/_internal/expecttest.py
+++ b/torch/testing/_internal/expecttest.py
@@ -3,6 +3,7 @@
 import traceback
 import os
 import string
+from typing import Tuple


 # This file implements expect tests (also known as "golden" tests).
@@ -139,7 +140,8 @@ def ok_for_raw_triple_quoted_string(s, quote):
                        r"(?P<raw>r?)",
                        re.DOTALL)

-def replace_string_literal(src, lineno, new_string):
+def replace_string_literal(src: str, lineno: int,
+                           new_string: str) -> Tuple[str, int]:
     r"""
     Replace a triple quoted string literal with new contents.
     Only handles printable ASCII correctly at the moment.  This
diff --git a/torch/utils/bundled_inputs.py b/torch/utils/bundled_inputs.py
index c5d603885e4a..741c0841778a 100644
--- a/torch/utils/bundled_inputs.py
+++ b/torch/utils/bundled_inputs.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-from typing import Any, TypeVar, Optional, Tuple, List, NamedTuple, Union
+from typing import Any, TypeVar, Optional, Tuple, List, NamedTuple, Union, Sequence
 import textwrap
 import torch
 from torch._C import TupleType, OptionalType, ListType
@@ -17,7 +17,7 @@ class InflatableArg(NamedTuple):

 def augment_model_with_bundled_inputs(
         model: torch.jit.ScriptModule,
-        inputs: Optional[List[Tuple[Any, ...]]] = None,
+        inputs: Optional[Sequence[Tuple[Any, ...]]] = None,
         _receive_inflate_expr: Optional[List[str]] = None,  # For debugging.
 ) -> None:
     """Add bundled sample inputs to a model.
diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py
index f75e4cca195e..d4ef1a99a2df 100644
--- a/torch/utils/data/dataloader.py
+++ b/torch/utils/data/dataloader.py
@@ -308,10 +308,6 @@ def multiprocessing_context(self):
     def multiprocessing_context(self, multiprocessing_context):
         if multiprocessing_context is not None:
             if self.num_workers > 0:
-                if not multiprocessing._supports_context:
-                    raise ValueError('multiprocessing_context relies on Python >= 3.4, with '
-                                     'support for different start methods')
-
                 if isinstance(multiprocessing_context, string_classes):
                     valid_start_methods = multiprocessing.get_all_start_methods()
                     if multiprocessing_context not in valid_start_methods: