diff --git a/.circleci/cimodel/data/pytorch_build_data.py b/.circleci/cimodel/data/pytorch_build_data.py index 8b129533765a..4339d26a6d3c 100644 --- a/.circleci/cimodel/data/pytorch_build_data.py +++ b/.circleci/cimodel/data/pytorch_build_data.py @@ -93,7 +93,7 @@ ]), ]), ("rocm", [ - ("3.9", [ + ("3.7", [ ("3.6", [ ('build_only', [XImportant(True)]), ]), diff --git a/.circleci/cimodel/data/simple/docker_definitions.py b/.circleci/cimodel/data/simple/docker_definitions.py index 9ba9fb0a8c0c..91f757207915 100644 --- a/.circleci/cimodel/data/simple/docker_definitions.py +++ b/.circleci/cimodel/data/simple/docker_definitions.py @@ -29,6 +29,7 @@ "pytorch-linux-xenial-py3.6-gcc5.4", # this one is used in doc builds "pytorch-linux-xenial-py3.6-gcc7.2", "pytorch-linux-xenial-py3.6-gcc7", + "pytorch-linux-bionic-rocm3.7-py3.6", "pytorch-linux-bionic-rocm3.8-py3.6", "pytorch-linux-bionic-rocm3.9-py3.6", ] diff --git a/.circleci/config.yml b/.circleci/config.yml index 59a26f30d1ba..f4b694afe964 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -453,8 +453,12 @@ jobs: no_output_timeout: "1h" command: | set -e - if [[ "${DOCKER_IMAGE}" == *rocm3.9* ]]; then - export DOCKER_TAG="f3d89a32912f62815e4feaeed47e564e887dffd6" + # TODO: Remove this after we figure out why rocm tests are failing + if [[ "${DOCKER_IMAGE}" == *rocm3.5* ]]; then + export DOCKER_TAG="ab1632df-fa59-40e6-8c23-98e004f61148" + fi + if [[ "${DOCKER_IMAGE}" == *rocm3.7* ]]; then + export DOCKER_TAG="1045c7b891104cb4fd23399eab413b6213e48aeb" fi if [[ ${BUILD_ENVIRONMENT} == *"pure_torch"* ]]; then echo 'BUILD_CAFFE2=OFF' >> "${BASH_ENV}" @@ -534,8 +538,12 @@ jobs: command: | set -e export PYTHONUNBUFFERED=1 - if [[ "${DOCKER_IMAGE}" == *rocm3.9* ]]; then - export DOCKER_TAG="f3d89a32912f62815e4feaeed47e564e887dffd6" + # TODO: Remove this after we figure out why rocm tests are failing + if [[ "${DOCKER_IMAGE}" == *rocm3.5* ]]; then + export DOCKER_TAG="ab1632df-fa59-40e6-8c23-98e004f61148" + fi + if [[ "${DOCKER_IMAGE}" == *rocm3.7* ]]; then + export DOCKER_TAG="1045c7b891104cb4fd23399eab413b6213e48aeb" fi # See Note [Special build images] output_image=${DOCKER_IMAGE}:${DOCKER_TAG}-${CIRCLE_SHA1} @@ -7272,6 +7280,9 @@ workflows: - docker_build_job: name: "docker-pytorch-linux-xenial-py3.6-gcc7" image_name: "pytorch-linux-xenial-py3.6-gcc7" + - docker_build_job: + name: "docker-pytorch-linux-bionic-rocm3.7-py3.6" + image_name: "pytorch-linux-bionic-rocm3.7-py3.6" - docker_build_job: name: "docker-pytorch-linux-bionic-rocm3.8-py3.6" image_name: "pytorch-linux-bionic-rocm3.8-py3.6" @@ -7702,11 +7713,11 @@ workflows: docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.8-gcc9" resource_class: large - pytorch_linux_build: - name: pytorch_linux_bionic_rocm3_9_py3_6_build + name: pytorch_linux_bionic_rocm3_7_py3_6_build requires: - - "docker-pytorch-linux-bionic-rocm3.9-py3.6" - build_environment: "pytorch-linux-bionic-rocm3.9-py3.6-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm3.9-py3.6" + - "docker-pytorch-linux-bionic-rocm3.7-py3.6" + build_environment: "pytorch-linux-bionic-rocm3.7-py3.6-build" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm3.7-py3.6" resource_class: xlarge - pytorch_macos_10_13_py3_build: name: pytorch_macos_10_13_py3_build diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh index 150e2bb9f380..019c7f6e9d1c 100755 --- a/.circleci/docker/build.sh +++ 
b/.circleci/docker/build.sh @@ -274,6 +274,13 @@ case "$image" in VISION=yes KATEX=yes ;; + pytorch-linux-bionic-rocm3.7-py3.6) + ANACONDA_PYTHON_VERSION=3.6 + PROTOBUF=yes + DB=yes + VISION=yes + ROCM_VERSION=3.7 + ;; pytorch-linux-bionic-rocm3.8-py3.6) ANACONDA_PYTHON_VERSION=3.6 PROTOBUF=yes diff --git a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml index 6c978987f779..a5876c3af738 100644 --- a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml +++ b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml @@ -15,8 +15,12 @@ jobs: no_output_timeout: "1h" command: | set -e - if [[ "${DOCKER_IMAGE}" == *rocm3.9* ]]; then - export DOCKER_TAG="f3d89a32912f62815e4feaeed47e564e887dffd6" + # TODO: Remove this after we figure out why rocm tests are failing + if [[ "${DOCKER_IMAGE}" == *rocm3.5* ]]; then + export DOCKER_TAG="ab1632df-fa59-40e6-8c23-98e004f61148" + fi + if [[ "${DOCKER_IMAGE}" == *rocm3.7* ]]; then + export DOCKER_TAG="1045c7b891104cb4fd23399eab413b6213e48aeb" fi if [[ ${BUILD_ENVIRONMENT} == *"pure_torch"* ]]; then echo 'BUILD_CAFFE2=OFF' >> "${BASH_ENV}" @@ -96,8 +100,12 @@ jobs: command: | set -e export PYTHONUNBUFFERED=1 - if [[ "${DOCKER_IMAGE}" == *rocm3.9* ]]; then - export DOCKER_TAG="f3d89a32912f62815e4feaeed47e564e887dffd6" + # TODO: Remove this after we figure out why rocm tests are failing + if [[ "${DOCKER_IMAGE}" == *rocm3.5* ]]; then + export DOCKER_TAG="ab1632df-fa59-40e6-8c23-98e004f61148" + fi + if [[ "${DOCKER_IMAGE}" == *rocm3.7* ]]; then + export DOCKER_TAG="1045c7b891104cb4fd23399eab413b6213e48aeb" fi # See Note [Special build images] output_image=${DOCKER_IMAGE}:${DOCKER_TAG}-${CIRCLE_SHA1} diff --git a/.gitignore b/.gitignore index 3d2e85be977f..d1f06437acee 100644 --- a/.gitignore +++ b/.gitignore @@ -93,6 +93,8 @@ torch/lib64 torch/include/ torch/share/ torch/test/ +torch/utils/benchmark/utils/valgrind_wrapper/callgrind.h +torch/utils/benchmark/utils/valgrind_wrapper/valgrind.h torch/version.py # Root level file used in CI to specify certain env configs. 
# E.g., see .circleci/config.yaml diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh index 0549e9aa78a4..56ce8d525f89 100755 --- a/.jenkins/caffe2/build.sh +++ b/.jenkins/caffe2/build.sh @@ -265,7 +265,7 @@ fi ############################################################################### # Install ONNX into a local directory -pip install --user -b /tmp/pip_install_onnx "file://${ROOT_DIR}/third_party/onnx#egg=onnx" +pip install --user "file://${ROOT_DIR}/third_party/onnx#egg=onnx" report_compile_cache_stats diff --git a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat index c7d60bedafd7..d4821c1b1a8d 100644 --- a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat +++ b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat @@ -9,10 +9,10 @@ if "%CUDA_SUFFIX%" == "" ( if "%REBUILD%"=="" ( if "%BUILD_ENVIRONMENT%"=="" ( - curl --retry 3 -k https://s3.amazonaws.com/ossci-windows/magma_2.5.3_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --output %TMP_DIR_WIN%\magma_2.5.3_%CUDA_SUFFIX%_%BUILD_TYPE%.7z + curl --retry 3 -k https://s3.amazonaws.com/ossci-windows/magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --output %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z ) else ( - aws s3 cp s3://ossci-windows/magma_2.5.3_%CUDA_SUFFIX%_%BUILD_TYPE%.7z %TMP_DIR_WIN%\magma_2.5.3_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --quiet + aws s3 cp s3://ossci-windows/magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --quiet ) - 7z x -aoa %TMP_DIR_WIN%\magma_2.5.3_%CUDA_SUFFIX%_%BUILD_TYPE%.7z -o%TMP_DIR_WIN%\magma + 7z x -aoa %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z -o%TMP_DIR_WIN%\magma ) set MAGMA_HOME=%TMP_DIR_WIN%\magma diff --git a/BUILD.bazel b/BUILD.bazel index 218d3b2ebcb7..76afe6aec1ea 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -131,6 +131,7 @@ genrule( "aten/src/ATen/RegisterQuantizedCPU.cpp", "aten/src/ATen/RegisterSparseCPU.cpp", "aten/src/ATen/RegisterMath.cpp", + "aten/src/ATen/RegisterMeta.cpp", "aten/src/ATen/RegisterDefaultBackend.cpp", "aten/src/ATen/RegisterSchema.cpp", "aten/src/ATen/Functions.h", diff --git a/aten/src/ATen/LegacyTHFunctionsCUDA.h b/aten/src/ATen/LegacyTHFunctionsCUDA.h index dd6dae47d14f..df51c071c418 100644 --- a/aten/src/ATen/LegacyTHFunctionsCUDA.h +++ b/aten/src/ATen/LegacyTHFunctionsCUDA.h @@ -36,12 +36,6 @@ std::tuple _th_topk(const Tensor & self, int64_t k, int64_t dim, Tensor & _th_renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm); Tensor _th_renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm); Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm); -Tensor & _th_fmod_out(Tensor & result, const Tensor & self, Scalar other); -Tensor _th_fmod(const Tensor & self, Scalar other); -Tensor & _th_fmod_out(Tensor & result, const Tensor & self, const Tensor & other); -Tensor _th_fmod(const Tensor & self, const Tensor & other); -Tensor & _th_fmod_(Tensor & self, Scalar other); -Tensor & _th_fmod_(Tensor & self, const Tensor & other); Tensor & _th_cross_kernel_out(Tensor & result, const Tensor & self, const Tensor & other, int64_t dim); Tensor _th_cross_kernel(const Tensor & self, const Tensor & other, int64_t dim); std::tuple _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A); diff --git a/aten/src/ATen/TensorIndexing.h b/aten/src/ATen/TensorIndexing.h index 8e3d2d579e91..a2bdc24ff51c 
100644 --- a/aten/src/ATen/TensorIndexing.h +++ b/aten/src/ATen/TensorIndexing.h @@ -5,6 +5,10 @@ #include #include +// TODO: try to remove this +// There is some back story, see https://github.com/pytorch/pytorch/issues/48684 +#include + namespace at { namespace indexing { diff --git a/aten/src/ATen/TensorMeta.h b/aten/src/ATen/TensorMeta.h index abca65feda17..59a7dc740175 100644 --- a/aten/src/ATen/TensorMeta.h +++ b/aten/src/ATen/TensorMeta.h @@ -3,8 +3,24 @@ #include // TODO: improve // #include +#include +#include +#include + namespace at { +namespace impl { + +struct MetaBase { + virtual void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) = 0; + void set_output(IntArrayRef sizes, TensorOptions options) { + set_output(0, sizes, {}, options, {}); + } + virtual ~MetaBase() {} +}; + +} // namespace impl + struct TensorMeta { DimVector sizes; // TODO: DimVector strides; @@ -14,6 +30,11 @@ struct TensorMeta { : sizes(_sizes), options(_options) {} }; +inline Tensor meta_tensor_from_meta(const TensorMeta& meta) { + // TODO: eliminate indirection + return at::empty_meta(meta.sizes, meta.options); +} + inline Tensor tensor_from_meta(const TensorMeta& meta) { // TODO: eliminate indirection return at::empty(meta.sizes, meta.options); diff --git a/aten/src/ATen/Version.cpp b/aten/src/ATen/Version.cpp index 5fd07d892630..192e131897c8 100644 --- a/aten/src/ATen/Version.cpp +++ b/aten/src/ATen/Version.cpp @@ -185,4 +185,15 @@ std::string show_config() { return ss.str(); } +std::string get_cxx_flags() { + #if defined(FBCODE_CAFFE2) + TORCH_CHECK( + false, + "Buck does not populate the `CXX_FLAGS` field of Caffe2 build options. " + "As a result, `get_cxx_flags` is OSS only." + ); + #endif + return caffe2::GetBuildOptions().at("CXX_FLAGS"); +} + } diff --git a/aten/src/ATen/Version.h b/aten/src/ATen/Version.h index 18fd31d3ed87..3ac7a5858ace 100644 --- a/aten/src/ATen/Version.h +++ b/aten/src/ATen/Version.h @@ -11,4 +11,6 @@ CAFFE2_API std::string get_mkldnn_version(); CAFFE2_API std::string get_openmp_version(); +CAFFE2_API std::string get_cxx_flags(); + } // namespace at diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp index 0165c53ac60d..82421f49de1e 100644 --- a/aten/src/ATen/cuda/CublasHandlePool.cpp +++ b/aten/src/ATen/cuda/CublasHandlePool.cpp @@ -50,6 +50,15 @@ cublasHandle_t getCurrentCUDABlasHandle() { } else { TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); } +#endif +#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION >= 308 + rocblas_atomics_mode rocblas_mode; + if (at::globalContext().deterministic()) { + rocblas_mode = rocblas_atomics_not_allowed; + } else { + rocblas_mode = rocblas_atomics_allowed; + } + TORCH_CUDABLAS_CHECK(rocblas_set_atomics_mode(handle, rocblas_mode)); #endif return handle; } diff --git a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp index 6b6974cda1e9..878c8fb3d8a1 100644 --- a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp +++ b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp @@ -1045,372 +1045,6 @@ Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { } return self; } -Tensor & _th_fmod_out(Tensor & result, const Tensor & self, Scalar other) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Byte: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, 
DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toByte(); - THCudaByteTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Char: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toChar(); - THCudaCharTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Double: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toDouble(); - THCudaDoubleTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Float: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toFloat(); - THCudaTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Int: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toInt(); - THCudaIntTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Long: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toLong(); - THCudaLongTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Short: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toShort(); - THCudaShortTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Half: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toHalf(); - THCudaHalfTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - default: - AT_ERROR("_th_fmod_out not supported on CUDAType for ", dispatch_scalar_type); - } - return result; -} -Tensor _th_fmod(const Tensor & self, Scalar other) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, 
scalarTypeToTypeMeta(dispatch_scalar_type)).release(); - auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); - switch (dispatch_scalar_type) { - case ScalarType::Byte: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toByte(); - THCudaByteTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Char: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toChar(); - THCudaCharTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toDouble(); - THCudaDoubleTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toFloat(); - THCudaTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Int: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toInt(); - THCudaIntTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Long: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toLong(); - THCudaLongTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Short: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toShort(); - THCudaShortTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Half: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toHalf(); - THCudaHalfTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - default: - AT_ERROR("_th_fmod not supported on CUDAType for ", dispatch_scalar_type); - } - return result; -} -Tensor & _th_fmod_out(Tensor & result, const Tensor & self, const Tensor & other) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Byte: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaByteTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Char: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); 
- THCudaCharTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Double: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaDoubleTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Float: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Int: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaIntTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Long: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaLongTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Short: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaShortTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Half: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaHalfTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - default: - AT_ERROR("_th_fmod_out not supported on CUDAType for ", dispatch_scalar_type); - } - return result; -} -Tensor _th_fmod(const Tensor & self, const Tensor & other) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); - auto 
result = Tensor(c10::intrusive_ptr::reclaim(result_)); - switch (dispatch_scalar_type) { - case ScalarType::Byte: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaByteTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Char: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaCharTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaDoubleTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Int: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaIntTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Long: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaLongTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Short: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaShortTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Half: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaHalfTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - default: - AT_ERROR("_th_fmod not supported on CUDAType for ", dispatch_scalar_type); - } - return result; -} -Tensor & _th_fmod_(Tensor & self, Scalar other) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Byte: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toByte(); - THCudaByteTensor_fmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - 
case ScalarType::Char: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toChar(); - THCudaCharTensor_fmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toDouble(); - THCudaDoubleTensor_fmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toFloat(); - THCudaTensor_fmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Int: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toInt(); - THCudaIntTensor_fmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Long: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toLong(); - THCudaLongTensor_fmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Short: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toShort(); - THCudaShortTensor_fmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Half: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toHalf(); - THCudaHalfTensor_fmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - default: - AT_ERROR("_th_fmod_ not supported on CUDAType for ", dispatch_scalar_type); - } - return self; -} -Tensor & _th_fmod_(Tensor & self, const Tensor & other) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Byte: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaByteTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Char: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaCharTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaDoubleTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); 
- THCudaTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Int: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaIntTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Long: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaLongTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Short: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaShortTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Half: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaHalfTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - default: - AT_ERROR("_th_fmod_ not supported on CUDAType for ", dispatch_scalar_type); - } - return self; -} Tensor & _th_cross_kernel_out(Tensor & result, const Tensor & self, const Tensor & other, int64_t dim) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index 37b7c5bbb223..9cc040b4dc8f 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -1315,7 +1315,7 @@ Tensor _lu_solve_helper_cpu(const Tensor& self, const Tensor& LU_data, const Ten if (self.numel() == 0 || LU_data.numel() == 0) { return at::zeros_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); } - AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "lu_solve_cpu", [&]{ + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "lu_solve_cpu", [&]{ apply_lu_solve(self_working_copy, LU_data_working_copy, LU_pivots_working_copy, infos); }); if (self.dim() > 2) { diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index d43e1394bd56..d6cb17418365 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -40,7 +40,6 @@ DEFINE_DISPATCH(tanh_backward_stub); DEFINE_DISPATCH(maximum_stub); DEFINE_DISPATCH(minimum_stub); DEFINE_DISPATCH(fmod_stub); -DEFINE_DISPATCH(fmod_scalar_stub); DEFINE_DISPATCH(logaddexp_stub); DEFINE_DISPATCH(logaddexp2_stub); DEFINE_DISPATCH(gcd_stub); @@ -897,34 +896,42 @@ Tensor& floor_divide_(Tensor& self, Scalar other) { Tensor& fmod_out(Tensor & result, const Tensor& self, const Tensor& other) { auto iter = TensorIterator::binary_op(result, self, other); - TORCH_CHECK(iter.device_type() == at::kCPU, "Native fmod only supports CPU"); fmod_stub(iter.device_type(), iter); return result; } Tensor& fmod_out(Tensor & result, const Tensor& self, Scalar other) { - auto iter = TensorIterator::unary_op(result, self); - TORCH_CHECK(iter.device_type() == 
at::kCPU, "Native fmod only supports CPU"); - fmod_scalar_stub(iter.device_type(), iter, other); + Tensor other_tensor = wrapped_scalar_tensor(other); + // FIXME: 'other' is converted to match the dtype of 'self' to retain + // BC with TH, but in the future, we should use normal type promotion, + // like in numpy + // Issue #47779: https://github.com/pytorch/pytorch/issues/47779 + at::fmod_out(result, self, other_tensor.to(self.dtype())); return result; } Tensor fmod(const Tensor& self, const Tensor & other) { - Tensor result = at::empty({0}, self.options()); - return at::fmod_out(result, self, other); + Tensor result; + auto iter = TensorIterator::binary_op(result, self, other); + fmod_stub(iter.device_type(), iter); + return iter.output(); } Tensor fmod(const Tensor& self, Scalar other) { - Tensor result = at::empty({0}, self.options()); - return at::fmod_out(result, self, other); + Tensor other_tensor = wrapped_scalar_tensor(other); + // FIXME: 'other' is converted to match the dtype of 'self' to retain + // BC with TH, but in the future, we should use normal type promotion, + // like in numpy + // Issue #47779: https://github.com/pytorch/pytorch/issues/47779 + return native::fmod(self, other_tensor.to(self.dtype())); } Tensor& fmod_(Tensor& self, const Tensor& other) { - return at::fmod_out(self, self, other); + return native::fmod_out(self, self, other); } Tensor& fmod_(Tensor& self, Scalar other) { - return at::fmod_out(self, self, other); + return native::fmod_out(self, self, other); } Tensor& logaddexp_out(Tensor& result, const Tensor& self, const Tensor& other) { diff --git a/aten/src/ATen/native/BinaryOps.h b/aten/src/ATen/native/BinaryOps.h index 8b01ce9b06f3..d76dd9d205e9 100644 --- a/aten/src/ATen/native/BinaryOps.h +++ b/aten/src/ATen/native/BinaryOps.h @@ -62,7 +62,6 @@ DECLARE_DISPATCH(binary_fn_alpha, logit_backward_stub); DECLARE_DISPATCH(binary_fn, tanh_backward_stub); DECLARE_DISPATCH(binary_fn, mse_stub); DECLARE_DISPATCH(binary_fn, fmod_stub); -DECLARE_DISPATCH(binary_fn_alpha, fmod_scalar_stub); DECLARE_DISPATCH(binary_fn, logaddexp_stub); DECLARE_DISPATCH(binary_fn, logaddexp2_stub); DECLARE_DISPATCH(binary_fn, gcd_stub); diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index ef85d5f602c3..afd4ec15d25f 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -111,36 +111,61 @@ Tensor pinverse(const Tensor& self, double rcond) { return at::matmul(V.conj() * S_pseudoinv.unsqueeze(-2), U.transpose(-2, -1).conj()); } -static inline Tensor _matrix_rank_helper(const Tensor& self, bool symmetric) { +Tensor& linalg_matrix_rank_out(Tensor& result, const Tensor& self, optional tol, bool hermitian) { + TORCH_CHECK(result.scalar_type() == ScalarType::Long, + "result dtype ", result.scalar_type(), " does not match the expected dtype ", ScalarType::Long); + + // Matrices or batch of matrices are allowed + TORCH_CHECK(self.dim() >= 2, "linalg_matrix_rank: Expected as input a matrix or a batch of matrices, but got a tensor of size: ", self.sizes()); + + // matrix_rank assigns a scalar value for each matrix in the batch so + // result's shape is equal to self.shape[0:self.ndim-2] + // for single matrix result_shape = {} + auto result_shape = IntArrayRef(self.sizes().cbegin(), self.sizes().cend()-2); + at::native::resize_output(result, result_shape); + + // NumPy doesn't take into account possible input with no elements and it errors on max not defined for this case + // Let's output 0 for this 
case, since that kind of matrices have zero number of non-zero rows, hence rank is 0. + if (self.numel() == 0) { + result.fill_(0); + return result; + } + + // We compute matrix rank as the number of singular or absolute eigen values above 'tol' threshold Tensor S; - if (!symmetric) { + if (!hermitian) { Tensor U, V; + // TODO: replace self.svd with linalg_svd std::tie(U, S, V) = self.svd(/*some=*/true, /*compute_uv=*/false); } else { - Tensor eigvecs; - std::tie(S, eigvecs) = self.symeig(/*eigenvectors=*/false); + S = at::linalg_eigvalsh(self); S = S.abs(); } - return S; + + if (tol.has_value()) { + double tol_value = tol.value(); + at::sum_out(result, S > tol_value, /*dim=*/-1); + } else { + ScalarType real_dtype = toValueType(typeMetaToScalarType(self.dtype())); + double tol_value = _get_epsilon(real_dtype) * std::max(self.size(-1), self.size(-2)); + Tensor max_S = S.amax(/*dim=*/-1); + at::sum_out(result, S > max_S.mul_(tol_value).unsqueeze_(-1), /*dim=*/-1); + } + return result; } -Tensor matrix_rank(const Tensor& self, double tol, bool symmetric) { - TORCH_CHECK((at::isFloatingType(self.scalar_type()) || at::isComplexType(self.scalar_type())) && self.dim() == 2, - "matrix_rank(", self.scalar_type(), "{", self.sizes(), "}): expected a 2D tensor " - "of floating types"); +Tensor linalg_matrix_rank(const Tensor& self, optional tol, bool hermitian) { + Tensor result = at::empty({0}, self.options().dtype(ScalarType::Long)); + result = at::linalg_matrix_rank_out(result, self, tol, hermitian); + return result; +} - Tensor S = _matrix_rank_helper(self, symmetric); - return (S > tol).sum(); +Tensor matrix_rank(const Tensor& self, double tol, bool symmetric) { + return at::linalg_matrix_rank(self, optional(tol), symmetric); } Tensor matrix_rank(const Tensor& self, bool symmetric) { - TORCH_CHECK((at::isFloatingType(self.scalar_type()) || at::isComplexType(self.scalar_type())) && self.dim() == 2, - "matrix_rank(", self.scalar_type(), "{", self.sizes(), "}): expected a 2D tensor " - "of floating types"); - - Tensor S = _matrix_rank_helper(self, symmetric); - double tol = _get_epsilon(self.scalar_type()) * std::max(self.size(0), self.size(1)); - return (S > S.max().mul_(tol)).sum(); + return at::linalg_matrix_rank(self, c10::nullopt, symmetric); } static void check_1d(const Tensor& t, const char* arg, const char* fn) { @@ -924,8 +949,8 @@ inline Tensor _blob_to_Tensor( // Blob is assumed to be a 1D array, that is why // we also insert a fake dimension so that the result could directly // be used in _compute_linear_combination - auto tensor = at::from_blob((void*)blob.begin(), blob.size(), in.dtype()) - .unsqueeze(0); + auto tensor = at::from_blob((void*)blob.begin(), blob.size(), + c10::toValueType(in.scalar_type())).unsqueeze(0); return _move_memory_if_cuda_input(tensor, in); } @@ -1058,7 +1083,7 @@ Tensor compute_T12(const Tensor& A) { reinterpret_cast(&b), {num_prods, num_prods}, {num_prods, 1}, - A.dtype() + c10::toValueType(A.scalar_type()) ); bs = _move_memory_if_cuda_input(bs, A); @@ -1130,7 +1155,7 @@ Tensor compute_T18(const Tensor& A) { reinterpret_cast(&b), {num_prods, num_prods}, {num_prods, 1}, - A.dtype() + c10::toValueType(A.scalar_type()) ); bs = _move_memory_if_cuda_input(bs, A); @@ -1303,7 +1328,7 @@ Tensor backward_analytic_function_of_a_matrix( const Tensor& self, const Tensor& grad, const func_t& function_of_a_matrix ) { - auto self_transposed = self.transpose(-2, -1); + auto self_transposed = self.transpose(-2, -1).conj(); auto self_transposed_sizes = 
self_transposed.sizes().vec(); self_transposed_sizes[self.dim() - 2] <<= 1; self_transposed_sizes[self.dim() - 1] <<= 1; diff --git a/aten/src/ATen/native/Resize.cpp b/aten/src/ATen/native/Resize.cpp index d6da309d4cf7..e5a0423e493c 100644 --- a/aten/src/ATen/native/Resize.cpp +++ b/aten/src/ATen/native/Resize.cpp @@ -77,12 +77,13 @@ Tensor& resize_as_( Tensor& resize_( Tensor& self, IntArrayRef size, - c10::optional optional_memory_format) { + c10::optional optional_memory_format, + bool resize_storage) { if (self.has_names()) { return resize_named_tensor_(self, size, optional_memory_format); } auto* self_ = self.unsafeGetTensorImpl(); - resize_impl_cpu_(self_, size, /*strides=*/c10::nullopt); + resize_impl_cpu_(self_, size, /*strides=*/c10::nullopt, resize_storage); if (optional_memory_format.has_value()) { auto memory_format = optional_memory_format.value(); @@ -95,5 +96,20 @@ Tensor& resize_( return self; } +Tensor& resize_( + Tensor& self, + IntArrayRef size, + c10::optional optional_memory_format) { + return resize_(self, size, optional_memory_format, /*resize_storage=*/true); +} + +Tensor& resize_meta_( + Tensor& self, + IntArrayRef size, + c10::optional optional_memory_format) { + // meta tensors don't have storage, so don't resize them + return resize_(self, size, optional_memory_format, /*resize_storage=*/false); +} + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/Resize.h b/aten/src/ATen/native/Resize.h index 501cacfbd073..d3d8faf3aa23 100644 --- a/aten/src/ATen/native/Resize.h +++ b/aten/src/ATen/native/Resize.h @@ -43,7 +43,8 @@ static inline void maybe_resize_storage_cpu(TensorImpl* self, int64_t new_size) inline TensorImpl* resize_impl_cpu_( TensorImpl* self, IntArrayRef size, - c10::optional stride) { + c10::optional stride, + bool resize_storage = true) { if (self->sizes() == size && (!stride || self->strides() == stride)) { return self; } @@ -57,7 +58,9 @@ inline TensorImpl* resize_impl_cpu_( self->set_sizes_contiguous(size); storage_size = self->numel(); } - maybe_resize_storage_cpu(self, storage_size); + if (resize_storage) { + maybe_resize_storage_cpu(self, storage_size); + } return self; } diff --git a/aten/src/ATen/native/TensorIterator.cpp b/aten/src/ATen/native/TensorIterator.cpp index 4fec66afd2c8..770774aaaeda 100644 --- a/aten/src/ATen/native/TensorIterator.cpp +++ b/aten/src/ATen/native/TensorIterator.cpp @@ -9,11 +9,11 @@ namespace at { -using DimMask = TensorIterator::DimMask; -using PtrVector = TensorIterator::PtrVector; -using loop_t = TensorIterator::loop_t; -using loop2d_t = TensorIterator::loop2d_t; -using StrideVector = TensorIterator::StrideVector; +using DimMask = TensorIteratorBase::DimMask; +using PtrVector = TensorIteratorBase::PtrVector; +using loop_t = TensorIteratorBase::loop_t; +using loop2d_t = TensorIteratorBase::loop2d_t; +using StrideVector = TensorIteratorBase::StrideVector; /// Construction TensorIteratorConfig& TensorIteratorConfig::add_output(const Tensor& output) { @@ -150,7 +150,7 @@ TensorIteratorConfig& TensorIteratorConfig::declare_static_shape(IntArrayRef sha // in the strides of trivial dimensions, so physical layout is unaffected but permutation information is lost) // We might change this behavior in future once performance considerations are resolved -void TensorIterator::reorder_dimensions() { +void TensorIteratorBase::reorder_dimensions() { // Sort the dimensions based on strides in ascending order with reduced dims // at the front. NOTE: that this inverts the order of C-contiguous tensors. 
// strides[0] is the fastest moving dimension instead of strides[ndim - 1]. @@ -224,7 +224,7 @@ void TensorIterator::reorder_dimensions() { // Computes a common dtype using type promotion // See the [Common Dtype Computation] note -ScalarType TensorIterator::compute_common_dtype() { +ScalarType TensorIteratorBase::compute_common_dtype() { at::native::ResultTypeState state = {}; for (const auto& op : operands_) { if (op.is_output) { @@ -251,7 +251,7 @@ ScalarType TensorIterator::compute_common_dtype() { // NOTE: Checks for more specific behaviors (e.g. the first and second // inputs must share a dtype, but the third must have the long dtype) // should be implemented directly and outside of TensorIterator. -void TensorIterator::compute_types(const TensorIteratorConfig& config) { +void TensorIteratorBase::compute_types(const TensorIteratorConfig& config) { // Reviews operands (1/2) // - validates that all input tensors are defined // - computes common device @@ -402,10 +402,25 @@ void TensorIterator::compute_types(const TensorIteratorConfig& config) { if (common_device == kCPU) { // Casts to outputs by creating temporaries of the correct dtype (if needed) if (config.cast_common_dtype_to_outputs_ && op.is_output && op.current_dtype != common_dtype_) { + TORCH_INTERNAL_ASSERT(op.tensor.defined()); op.original_tensor = op.tensor; + // NB: do NOT use set_output here, as the temporary is NOT a true output; + // op.tensor is the true output and it was pre-provided for us. + // TODO: When we extend this to work with meta tensors, we'll need to + // skip this temporary allocation in that case (because it's + // unnecessary) + // TODO: The logic for cast_outputs will need to be handled by the + // structured kernels implementation. What probably should happen + // is that we pass in the inferred dtype into the out kernel, and + // then after calling the out kernel, do the conversion (which + // is cast_outputs here), but integrating this with existing + // TensorIterator will take a little doing op.tensor = at::empty_like(op.tensor, op.tensor.options().dtype(common_dtype_), LEGACY_CONTIGUOUS_MEMORY_FORMAT); + if (!names_.empty()) { + namedinference::propagate_names(op.tensor, names_); + } op.current_dtype = common_dtype_; op.target_dtype = common_dtype_; } @@ -421,7 +436,7 @@ void TensorIterator::compute_types(const TensorIteratorConfig& config) { } } -StrideVector TensorIterator::compatible_stride(int element_size) const { +StrideVector TensorIteratorBase::compatible_stride(int element_size) const { auto stride = StrideVector(); int64_t next_stride = element_size; for (int dim = 0; dim < ndim(); dim++) { @@ -431,7 +446,7 @@ StrideVector TensorIterator::compatible_stride(int element_size) const { return stride; } -DimVector TensorIterator::invert_perm(IntArrayRef input) const { +DimVector TensorIteratorBase::invert_perm(IntArrayRef input) const { // Invert the permutation caused by reorder_dimensions. This is not valid // after coalesce_dimensions is called. 
TORCH_INTERNAL_ASSERT(!has_coalesced_dimensions_); @@ -443,7 +458,7 @@ DimVector TensorIterator::invert_perm(IntArrayRef input) const { return res; } -void TensorIterator::allocate_or_resize_outputs() { +void TensorIteratorBase::allocate_or_resize_outputs() { for (int i = 0; i < num_outputs_; i++) { auto& op = operands_[i]; if (!op.tensor.defined() || op.will_resize) { @@ -460,33 +475,27 @@ void TensorIterator::allocate_or_resize_outputs() { } auto tensor_shape = invert_perm(shape_); if (inverted) { - if (!op.tensor.defined()) { - // can just return contiguous output - // it is faster because it avoids allocating 0 size tensor and - // resizing and restriding it - op.tensor = at::empty(tensor_shape, op.options()); - } else { - at::native::resize_output(op.tensor, tensor_shape); - } + // can just return contiguous output + // it is faster because it avoids allocating 0 size tensor and + // resizing and restriding it + set_output(i, tensor_shape, {}, op.options(), names_); } else { auto tensor_stride = invert_perm(op.stride_bytes); for (int dim = 0; dim < ndim(); dim++) { tensor_stride[dim] /= element_size; } - if (!op.tensor.defined()) { - op.tensor = - at::empty_strided(tensor_shape, tensor_stride, op.options()); - } else { - at::native::resize_output(op.tensor, tensor_shape); - op.tensor.as_strided_(tensor_shape, tensor_stride); - } + set_output(i, tensor_shape, tensor_stride, op.options(), names_); } op.current_dtype = op.target_dtype; + } else if (op.tensor.defined() && !names_.empty()) { + // Even if we don't resize, we may still propagate names, esp + // if we were doing an inplace operation + namedinference::propagate_names(op.tensor, names_); } } } -void TensorIterator::compute_names(const TensorIteratorConfig& config) { +void TensorIteratorBase::compute_names(const TensorIteratorConfig& config) { bool should_infer_names = std::any_of( operands_.begin(), operands_.end(), @@ -513,27 +522,7 @@ void TensorIterator::compute_names(const TensorIteratorConfig& config) { } } -void TensorIterator::propagate_names_to_outputs() { - // names_ can be empty for two reasons: - // 1. We were performing ops on scalar tensors. Then there should be no names. - // 2. All of the defined inputs/outputs had no names. Then we shouldn't - // run name inference. - if (names_.empty()) { - return; - } - - // propagate names - for (int i = 0; i < num_outputs_; i++) { - auto& op = operands_[i]; - // must call propagate_names_to_outputs after outputs have been allocated. 
- TORCH_INTERNAL_ASSERT(op.tensor.defined()); - if (!names_.empty()) { - namedinference::propagate_names(op.tensor, names_); - } - } -} - -void TensorIterator::coalesce_dimensions() { +void TensorIteratorBase::coalesce_dimensions() { if (ndim() <= 1) { return; } @@ -586,7 +575,7 @@ void TensorIterator::coalesce_dimensions() { has_coalesced_dimensions_ = true; } -int64_t TensorIterator::numel() const { +int64_t TensorIteratorBase::numel() const { int64_t numel = 1; for (int64_t size : shape_) { numel *= size; @@ -594,7 +583,7 @@ int64_t TensorIterator::numel() const { return numel; } -StrideVector TensorIterator::get_dim_strides(int dim) const { +StrideVector TensorIteratorBase::get_dim_strides(int dim) const { auto dims = ndim(); auto inner_strides = StrideVector(); for (auto& op : operands_) { @@ -603,7 +592,7 @@ StrideVector TensorIterator::get_dim_strides(int dim) const { return inner_strides; } -SmallVector TensorIterator::get_data_ptrs(ArrayRef base, IntArrayRef counter) const { +SmallVector TensorIteratorBase::get_data_ptrs(ArrayRef base, IntArrayRef counter) const { auto ptrs = SmallVector(base); for (int dim = 0; dim < ndim(); dim++) { int64_t value = counter[dim]; @@ -614,7 +603,7 @@ SmallVector TensorIterator::get_data_ptrs(ArrayRef base, IntArr return ptrs; } -SmallVector TensorIterator::get_base_ptrs() const { +SmallVector TensorIteratorBase::get_base_ptrs() const { auto ptrs = SmallVector(); for (int i = 0; i < ntensors(); i++) { ptrs.push_back((char*)data_ptr(i)); @@ -622,7 +611,7 @@ SmallVector TensorIterator::get_base_ptrs() const { return ptrs; } -bool TensorIterator::is_dim_reduced(int dim) const { +bool TensorIteratorBase::is_dim_reduced(int dim) const { for (auto& op : operands_) { if (op.is_output && op.stride_bytes[dim] == 0 && shape_[dim] > 1) { return true; @@ -631,7 +620,7 @@ bool TensorIterator::is_dim_reduced(int dim) const { return false; } -void TensorIterator::permute_dimensions(IntArrayRef perm) { +void TensorIteratorBase::permute_dimensions(IntArrayRef perm) { TORCH_INTERNAL_ASSERT(perm.size() == ndim()); auto reorder = [perm](IntArrayRef data) { @@ -651,7 +640,7 @@ void TensorIterator::permute_dimensions(IntArrayRef perm) { } } -int64_t TensorIterator::num_output_elements() const { +int64_t TensorIteratorBase::num_output_elements() const { int64_t elem = 1; for (int dim = 0; dim < ndim(); dim++) { if (operands_[0].stride_bytes[dim] != 0 || shape_[dim] == 0) { @@ -661,7 +650,7 @@ int64_t TensorIterator::num_output_elements() const { return elem; } -int TensorIterator::num_reduce_dims() const { +int TensorIteratorBase::num_reduce_dims() const { int count = 0; for (int dim = 0; dim < ndim(); dim++) { if (operands_[0].stride_bytes[dim] == 0) { @@ -686,11 +675,11 @@ int TensorIterator::num_reduce_dims() const { } \ } -void TensorIterator::for_each(loop_t loop, int64_t grain_size) { +void TensorIteratorBase::for_each(loop_t loop, int64_t grain_size) { for_each(LOOP_WRAPPER(ntensors(), loop), grain_size); } -void TensorIterator::for_each(loop2d_t loop, int64_t grain_size) { +void TensorIteratorBase::for_each(loop2d_t loop, int64_t grain_size) { int64_t numel = this->numel(); if (numel == 0) { return; @@ -703,7 +692,7 @@ void TensorIterator::for_each(loop2d_t loop, int64_t grain_size) { } } -StrideVector TensorIterator::get_strides() const { +StrideVector TensorIteratorBase::get_strides() const { StrideVector strides; for (int dim = 0; dim < ndim(); dim++) { for (int arg = 0; arg < ntensors(); arg++) { @@ -713,11 +702,11 @@ StrideVector 
TensorIterator::get_strides() const { return strides; } -void TensorIterator::serial_for_each(loop_t loop, Range range) const { +void TensorIteratorBase::serial_for_each(loop_t loop, Range range) const { serial_for_each(LOOP_WRAPPER(ntensors(), loop), range); } -void TensorIterator::serial_for_each(loop2d_t loop, Range range) const { +void TensorIteratorBase::serial_for_each(loop2d_t loop, Range range) const { if (range.size() == 0) { return; } @@ -741,12 +730,12 @@ void TensorIterator::serial_for_each(loop2d_t loop, Range range) const { } } -bool TensorIterator::is_trivial_1d() const { +bool TensorIteratorBase::is_trivial_1d() const { // TODO: check for casting once it's supported return ndim() == 1; } -bool TensorIterator::is_contiguous() const { +bool TensorIteratorBase::is_contiguous() const { if (numel() == 1) { return true; } @@ -757,7 +746,7 @@ bool TensorIterator::is_contiguous() const { } -bool TensorIterator::is_scalar(int arg) const { +bool TensorIteratorBase::is_scalar(int arg) const { const auto& stride = operands_[arg].stride_bytes; for (int i = 0; i < ndim(); i++) { if (stride[i] != 0 && shape_[i] != 1) { @@ -767,11 +756,11 @@ bool TensorIterator::is_scalar(int arg) const { return true; } -bool TensorIterator::is_cpu_scalar(int arg) const { +bool TensorIteratorBase::is_cpu_scalar(int arg) const { return is_scalar(arg) && device(arg).is_cpu(); } -void TensorIterator::cast_outputs() { +void TensorIteratorBase::cast_outputs() { for (auto& op : operands_) { if (op.is_output && op.original_tensor.defined() && op.original_tensor.scalar_type() != op.current_dtype) { @@ -784,19 +773,19 @@ void TensorIterator::cast_outputs() { } } -void* TensorIterator::data_ptr(int arg) const { +void* TensorIteratorBase::data_ptr(int arg) const { return operands_[arg].data; } -void TensorIterator::remove_operand(int arg) { +void TensorIteratorBase::remove_operand(int arg) { operands_.erase(operands_.begin() + arg); } -void TensorIterator::unsafe_replace_operand(int arg, void* data) { +void TensorIteratorBase::unsafe_replace_operand(int arg, void* data) { operands_[arg].data = data; } -void TensorIterator::narrow(int dim, int64_t start, int64_t size) { +void TensorIteratorBase::narrow(int dim, int64_t start, int64_t size) { TORCH_INTERNAL_ASSERT(dim < ndim() && size >= 1); shape_[dim] = size; view_offsets_[dim] += start; @@ -808,7 +797,7 @@ void TensorIterator::narrow(int dim, int64_t start, int64_t size) { } } -void TensorIterator::select_all_keeping_dim(int start_dim, IntArrayRef indices) { +void TensorIteratorBase::select_all_keeping_dim(int start_dim, IntArrayRef indices) { TORCH_INTERNAL_ASSERT(start_dim <= ndim()); for (int i = start_dim; i < ndim(); ++i) { for (auto& op : operands_) { @@ -948,14 +937,14 @@ TensorIterator TensorIterator::reduce_op(Tensor& out1, Tensor& out2, const Tenso .build(); } -void TensorIterator::populate_operands(TensorIteratorConfig& config) { +void TensorIteratorBase::populate_operands(TensorIteratorConfig& config) { for (int i = 0; i < config.tensors_.size(); i++) { operands_.emplace_back(std::move(config.tensors_[i])); } num_outputs_ = config.num_outputs_; } -void TensorIterator::mark_outputs() { +void TensorIteratorBase::mark_outputs() { // TODO: merge this into populate_operands for (int i = 0; i < num_outputs_; i++) { operands_[i].is_output = true; @@ -972,7 +961,7 @@ void TensorIterator::mark_outputs() { } } -void TensorIterator::mark_resize_outputs(const TensorIteratorConfig& config) { +void TensorIteratorBase::mark_resize_outputs(const TensorIteratorConfig& 
config) { // Outputs cannot be broadcasted. Check that the shape of the outputs matches // the inferred shape. There's an exception for write-only tensors to support // our legacy behavior that functions with `out=` arguments resize their @@ -994,7 +983,7 @@ void TensorIterator::mark_resize_outputs(const TensorIteratorConfig& config) { } } -void TensorIterator::compute_mem_overlaps(const TensorIteratorConfig& config) { +void TensorIteratorBase::compute_mem_overlaps(const TensorIteratorConfig& config) { if (!config.check_mem_overlap_) { return; } @@ -1009,7 +998,7 @@ void TensorIterator::compute_mem_overlaps(const TensorIteratorConfig& config) { } } -void TensorIterator::compute_shape(const TensorIteratorConfig& config) { +void TensorIteratorBase::compute_shape(const TensorIteratorConfig& config) { if (config.static_shape_.has_value()) { shape_ = *config.static_shape_; return; @@ -1045,7 +1034,7 @@ void TensorIterator::compute_shape(const TensorIteratorConfig& config) { } } -void TensorIterator::compute_strides(const TensorIteratorConfig& config) { +void TensorIteratorBase::compute_strides(const TensorIteratorConfig& config) { for (auto& op : operands_) { if (op.tensor.defined()) { IntArrayRef original_shape = config.static_shape_ ? shape_ : op.tensor.sizes(); @@ -1068,7 +1057,7 @@ void TensorIterator::compute_strides(const TensorIteratorConfig& config) { } } -bool TensorIterator::can_use_32bit_indexing() const { +bool TensorIteratorBase::can_use_32bit_indexing() const { int64_t max_value = std::numeric_limits::max(); if (numel() > max_value) { return false; @@ -1085,7 +1074,7 @@ bool TensorIterator::can_use_32bit_indexing() const { return true; } -std::unique_ptr TensorIterator::split(int dim) { +std::unique_ptr TensorIteratorBase::split(int dim) { TORCH_INTERNAL_ASSERT(dim >= 0 && dim < ndim() && shape()[dim] >= 2); std::unique_ptr copy(new TensorIterator(*this)); @@ -1101,7 +1090,7 @@ std::unique_ptr TensorIterator::split(int dim) { } -int TensorIterator::get_dim_to_split() const { +int TensorIteratorBase::get_dim_to_split() const { TORCH_INTERNAL_ASSERT(ndim() >= 1); int64_t max_extent = -1; int dim_to_split = -1; @@ -1122,7 +1111,7 @@ int TensorIterator::get_dim_to_split() const { return dim_to_split; } -bool TensorIterator::fast_set_up(const TensorIteratorConfig& config) { +bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) { // This function tries to do a fast setup to avoid needless reordering of dimensions and tracking output strides // Return true if it can do fast setup or false otherwise // TODO enable fast handling for reductions @@ -1137,13 +1126,7 @@ bool TensorIterator::fast_set_up(const TensorIteratorConfig& config) { { for (int i = 0; i < num_outputs_; i++){ auto& op = operands_[i]; - if (!op.tensor.defined()) { - TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); - op.tensor = at::empty(shape_, op.options(), MemoryFormat::Contiguous); - op.current_dtype = op.target_dtype; - } else if (op.will_resize) { - at::native::resize_output(op.tensor, shape_); - } + set_output(i, shape_, {}, op.options().memory_format(MemoryFormat::Contiguous), names_); } break; } @@ -1151,15 +1134,7 @@ bool TensorIterator::fast_set_up(const TensorIteratorConfig& config) { { for (int i = 0; i < num_outputs_; i++){ auto& op = operands_[i]; - if (!op.tensor.defined()) { - TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); - op.tensor = at::empty(shape_, op.options(), MemoryFormat::ChannelsLast); - op.current_dtype = op.target_dtype; - } 
else if (op.will_resize) { - at::native::resize_output(op.tensor, shape_); - op.tensor.unsafeGetTensorImpl()->empty_tensor_restride( - MemoryFormat::ChannelsLast); - } + set_output(i, shape_, {}, op.options().memory_format(MemoryFormat::ChannelsLast), names_); } break; } @@ -1173,14 +1148,7 @@ bool TensorIterator::fast_set_up(const TensorIteratorConfig& config) { TORCH_CHECK(i_defined >= 0, "Can not find a defined tensor when fast allocating memory to outputs"); for (int i = 0; i < num_outputs_; i++){ auto& op = operands_[i]; - if (!op.tensor.defined()) { - TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); - op.tensor = at::empty_strided(shape_, operands_[i_defined].tensor.strides(), op.options()); - op.current_dtype = op.target_dtype; - } else if (op.will_resize) { - at::native::resize_output(op.tensor, shape_); - op.tensor.as_strided_(shape_, operands_[i_defined].tensor.strides()); - } + set_output(i, shape_, operands_[i_defined].tensor.strides(), op.options(), names_); } break; } @@ -1205,7 +1173,7 @@ bool TensorIterator::fast_set_up(const TensorIteratorConfig& config) { return true; } -FastSetupType TensorIterator::compute_fast_setup_type(const TensorIteratorConfig& config) { +FastSetupType TensorIteratorBase::compute_fast_setup_type(const TensorIteratorConfig& config) { if (is_reduction_ || !all_ops_same_shape_) { return FastSetupType::NONE; } @@ -1257,11 +1225,9 @@ FastSetupType TensorIterator::compute_fast_setup_type(const TensorIteratorConfig return FastSetupType::NONE; } -TensorIterator::TensorIterator(TensorIteratorConfig& config) { - build(config); -} +TensorIteratorBase::TensorIteratorBase() {} -void TensorIterator::build(TensorIteratorConfig& config) { +void TensorIteratorBase::build(TensorIteratorConfig& config) { // populate some persistent configuration fields is_reduction_ = config.is_reduction_; @@ -1291,8 +1257,6 @@ void TensorIterator::build(TensorIteratorConfig& config) { // coalesce adjacent dimensions when possible coalesce_dimensions(); } - // perform name inference - propagate_names_to_outputs(); for (auto& op : operands_) { TORCH_INTERNAL_ASSERT(op.tensor.defined()); @@ -1307,14 +1271,40 @@ void TensorIterator::build(TensorIteratorConfig& config) { view_offsets_ = DimVector(ndim_offsets, 0); } -SplitUntil32Bit TensorIterator::with_32bit_indexing() const { +void TensorIterator::set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) { + auto& op = operands_[output_idx]; + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(output_idx < num_outputs_); + if (!op.tensor.defined()) { + TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", output_idx); + if (strides.empty()) { + op.tensor = at::empty(sizes, options); + } else { + op.tensor = at::empty_strided(sizes, strides, options); + } + op.current_dtype = op.target_dtype; + } else if (op.will_resize) { + at::native::resize_output(op.tensor, sizes); + if (!strides.empty()) { + TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value()); + op.tensor.as_strided_(sizes, strides); + } else if (options.memory_format_opt().has_value()) { + op.tensor.unsafeGetTensorImpl()->empty_tensor_restride(*options.memory_format_opt()); + } + } + if (!names.empty()) { + TORCH_INTERNAL_ASSERT(op.tensor.defined()); + namedinference::propagate_names(op.tensor, names); + } +} + +SplitUntil32Bit TensorIteratorBase::with_32bit_indexing() const { return SplitUntil32Bit(*this); } /// SplitUntil32Bit. 
Recursively splits an iterator into sub-iterators that /// can use 32-bit indexing. -SplitUntil32Bit::iterator::iterator(const TensorIterator& iter) { +SplitUntil32Bit::iterator::iterator(const TensorIteratorBase& iter) { vec.emplace_back(new TensorIterator(iter)); vec.emplace_back(nullptr); // ++ first pops the last element ++(*this); diff --git a/aten/src/ATen/native/TensorIterator.h b/aten/src/ATen/native/TensorIterator.h index 86c3992e5241..25b255d72fef 100644 --- a/aten/src/ATen/native/TensorIterator.h +++ b/aten/src/ATen/native/TensorIterator.h @@ -8,6 +8,7 @@ #include #include #include +#include // TensorIterator is a helper class for element-wise operations, such as // arithmetic, comparisons, and trigonometric functions. It handles @@ -131,13 +132,15 @@ enum class FastSetupType : uint8_t { }; class TensorIteratorConfig; +struct TensorIterator; -struct CAFFE2_API TensorIterator { +struct CAFFE2_API TensorIteratorBase : public impl::MetaBase { using DimMask = std::bitset<64>; using PtrVector = SmallVector; using StrideVector = SmallVector; - TensorIterator(TensorIteratorConfig&); + TensorIteratorBase(); + void build(TensorIteratorConfig&); // The inner-loop function operates on the fastest moving dimension. It // implements element-wise operations in terms of 1-d strided tensors. @@ -152,19 +155,10 @@ struct CAFFE2_API TensorIterator { using loop_t = c10::function_ref; using loop2d_t = c10::function_ref; - using loop_subiter_t = c10::function_ref; + using loop_subiter_t = c10::function_ref; void foreach_reduced_elt(loop_subiter_t loop, bool parallelize=true); - static TensorIterator binary_float_op(Tensor& out, const Tensor& a, const Tensor& b); - static TensorIterator binary_op(Tensor& out, const Tensor& a, const Tensor& b); - static TensorIterator comparison_op(Tensor& out, const Tensor& a, const Tensor& b); - static TensorIterator unary_op(Tensor& out, const Tensor& a); - static TensorIterator unary_float_op(Tensor& out, const Tensor& a); - static TensorIterator nullary_op(Tensor& out); - static TensorIterator reduce_op(Tensor& out, const Tensor& a); - static TensorIterator reduce_op(Tensor& out1, Tensor& out2, const Tensor& a); - int ndim() const { return shape_.size(); } IntArrayRef shape() const { return shape_; } int64_t numel() const; @@ -297,8 +291,6 @@ struct CAFFE2_API TensorIterator { } protected: - void build(TensorIteratorConfig&); - // Mutable reference as it moves tensors out of TensorIteratorConfig void populate_operands(TensorIteratorConfig&); void mark_outputs(); @@ -402,8 +394,26 @@ struct CAFFE2_API TensorIterator { bool is_reduction_ = false; }; +struct CAFFE2_API TensorIterator final : public TensorIteratorBase { + TensorIterator() : TensorIteratorBase() {} + // Slicing is OK, TensorIterator guaranteed NOT to have any fields + TensorIterator(const TensorIteratorBase& iter) : TensorIteratorBase(iter) {} + + static TensorIterator binary_float_op(Tensor& out, const Tensor& a, const Tensor& b); + static TensorIterator binary_op(Tensor& out, const Tensor& a, const Tensor& b); + static TensorIterator comparison_op(Tensor& out, const Tensor& a, const Tensor& b); + static TensorIterator unary_op(Tensor& out, const Tensor& a); + static TensorIterator unary_float_op(Tensor& out, const Tensor& a); + static TensorIterator nullary_op(Tensor& out); + static TensorIterator reduce_op(Tensor& out, const Tensor& a); + static TensorIterator reduce_op(Tensor& out1, Tensor& out2, const Tensor& a); + + void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, 
TensorOptions options, DimnameList names) override; +}; + class CAFFE2_API TensorIteratorConfig final { public: + friend struct TensorIteratorBase; friend struct TensorIterator; TensorIteratorConfig() {} @@ -478,7 +488,9 @@ class CAFFE2_API TensorIteratorConfig final { // It would be better if this was && qualified, but this would be at the cost // of a lot of boilerplate above TensorIterator build() { - return TensorIterator(*this); + TensorIterator iter; + iter.build(*this); + return iter; } private: @@ -508,9 +520,10 @@ class CAFFE2_API TensorIteratorConfig final { struct CAFFE2_API SplitUntil32Bit { struct CAFFE2_API iterator { iterator() {}; - iterator(const TensorIterator& iter); + iterator(const TensorIteratorBase& iter); iterator(iterator&&) = default; + // Guaranteed to be a TensorIterator proper! TensorIterator& operator*() const; iterator& operator++(); bool operator==(const iterator& other) const { @@ -524,13 +537,13 @@ struct CAFFE2_API SplitUntil32Bit { std::vector> vec; }; - SplitUntil32Bit(const TensorIterator& iter) : iter(iter) {} + SplitUntil32Bit(const TensorIteratorBase& iter) : iter(iter) {} iterator begin() const; iterator end() const; private: - const TensorIterator& iter; + const TensorIteratorBase& iter; }; } // namespace at diff --git a/aten/src/ATen/native/TensorIteratorReduce.cpp b/aten/src/ATen/native/TensorIteratorReduce.cpp index 6d3ba3acb4fc..3837c7567e31 100644 --- a/aten/src/ATen/native/TensorIteratorReduce.cpp +++ b/aten/src/ATen/native/TensorIteratorReduce.cpp @@ -7,13 +7,13 @@ namespace at { -using loop2d_t = TensorIterator::loop2d_t; +using loop2d_t = TensorIteratorBase::loop2d_t; -static bool use_two_pass_reduction(TensorIterator& iter); -static void two_pass_reduction(TensorIterator& iter, loop2d_t loop); -static void parallel_dim_reduction(TensorIterator& iter, loop2d_t loop); +static bool use_two_pass_reduction(TensorIteratorBase& iter); +static void two_pass_reduction(TensorIteratorBase& iter, loop2d_t loop); +static void parallel_dim_reduction(TensorIteratorBase& iter, loop2d_t loop); -void TensorIterator::parallel_reduce(loop2d_t loop) { +void TensorIteratorBase::parallel_reduce(loop2d_t loop) { TORCH_CHECK(ntensors() == 2, "parallel_reduce only supports one input and one output"); int64_t numel = this->numel(); if (numel < at::internal::GRAIN_SIZE || at::get_num_threads() == 1 || @@ -26,11 +26,11 @@ void TensorIterator::parallel_reduce(loop2d_t loop) { } } -static bool use_two_pass_reduction(TensorIterator& iter) { +static bool use_two_pass_reduction(TensorIteratorBase& iter) { return iter.output(0).numel() == 1; } -static void two_pass_reduction(TensorIterator& iter, loop2d_t loop) { +static void two_pass_reduction(TensorIteratorBase& iter, loop2d_t loop) { int max_threads = at::get_num_threads(); auto dst = iter.output(0); @@ -65,7 +65,7 @@ static void two_pass_reduction(TensorIterator& iter, loop2d_t loop) { /// Chooses a dimension over which to parallelize. Prefers the outer-most /// dimension thats larger than the number of available threads. 
-static int find_split_dim(TensorIterator& iter) { +static int find_split_dim(TensorIteratorBase& iter) { int num_threads = at::get_num_threads(); auto shape = iter.shape(); @@ -84,7 +84,7 @@ static int find_split_dim(TensorIterator& iter) { } static std::tuple -round_columns(TensorIterator& iter, int dim, int multiple, int64_t begin, int64_t end) { +round_columns(TensorIteratorBase& iter, int dim, int multiple, int64_t begin, int64_t end) { begin = begin - (begin % multiple); if (end != iter.shape()[dim]) { // only round the 'end' column down if it's not the final column @@ -93,7 +93,7 @@ round_columns(TensorIterator& iter, int dim, int multiple, int64_t begin, int64_ return std::make_tuple(begin, end); } -static void parallel_dim_reduction(TensorIterator& iter, loop2d_t loop) { +static void parallel_dim_reduction(TensorIteratorBase& iter, loop2d_t loop) { AT_ASSERT(iter.ndim() >= 1); int dim = find_split_dim(iter); int64_t cols = iter.shape()[dim]; @@ -116,7 +116,7 @@ static void parallel_dim_reduction(TensorIterator& iter, loop2d_t loop) { }); } -void TensorIterator::foreach_reduced_elt(loop_subiter_t loop, bool parallelize) { +void TensorIteratorBase::foreach_reduced_elt(loop_subiter_t loop, bool parallelize) { AT_ASSERT(ninputs() == 1); AT_ASSERT(noutputs() >= 1); @@ -153,7 +153,7 @@ void TensorIterator::foreach_reduced_elt(loop_subiter_t loop, bool parallelize) return; } - auto sub_iter = *this; + TensorIterator sub_iter(*this); sub_iter.narrow(dim, begin, end - begin); // On some broken setups, `#ifdef _OPENMP` is true, diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 66e0e141a633..daea7e7f68bb 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -385,8 +385,8 @@ Tensor& sqrt_(Tensor& self) { return unary_op_impl_(self, at::sqrt_out); } Tensor square(const Tensor& self) { return at::pow(self, 2); } Tensor& square_(Tensor& self) { return at::pow_out(self, self, 2); } -Tensor& sigmoid_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, sigmoid_stub); } -Tensor sigmoid(const Tensor& self) { return unary_op_impl(self, at::sigmoid_out); } +Tensor& sigmoid_out(Tensor& result, const Tensor& self) { return unary_op_impl_float_out(result, self, sigmoid_stub); } +Tensor sigmoid(const Tensor& self) { return unary_op_impl_float(self, sigmoid_stub); } Tensor& sigmoid_(Tensor& self) { return unary_op_impl_(self, at::sigmoid_out); } Tensor& logit_out( diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp index f35d9054a729..36c01b2af49e 100644 --- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp @@ -639,6 +639,9 @@ void mse_kernel(TensorIterator& iter) { } void fmod_kernel(TensorIterator& iter) { + // Use the dtype of the first argument to retain BC, + // change to common_dtype for type promotion in the future + // Issue #47779: https://github.com/pytorch/pytorch/issues/47779 if (isIntegralType(iter.dtype(), /*includeBool=*/ false)) { AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "fmod_cpu", [&]() { cpu_kernel(iter, [=](scalar_t x, scalar_t d) -> scalar_t { @@ -660,32 +663,6 @@ void fmod_kernel(TensorIterator& iter) { } } -void fmod_scalar_kernel(TensorIterator& iter, Scalar divisor) { - if (isIntegralType(iter.dtype(), /*includeBool=*/ false)) { - AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "fmod_scalar_cpu", [&]() { - const auto div = divisor.to(); - TORCH_CHECK(div != 0, "ZeroDivisionError"); - 
cpu_kernel(iter, [=](scalar_t x) -> scalar_t { - return x % div; - }); - }); - } else { - AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.dtype(), "fmod_scalar_cpu", [&]() { - const auto div = divisor.to(); - const auto div_vec = Vec256(div); - cpu_kernel_vec( - iter, - [=](scalar_t x) -> scalar_t { - return std::fmod(x, div); - }, - [=](Vec256 x) { - return x.fmod(div_vec); - }); - }); - } - -} - void logaddexp_kernel(TensorIterator& iter) { AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "logaddexp_cpu", [&]() { cpu_kernel_vec( @@ -872,7 +849,6 @@ REGISTER_DISPATCH(logit_backward_stub, &logit_backward_kernel); REGISTER_DISPATCH(tanh_backward_stub, &tanh_backward_kernel); REGISTER_DISPATCH(mse_stub, &mse_kernel); REGISTER_DISPATCH(fmod_stub, &fmod_kernel); -REGISTER_DISPATCH(fmod_scalar_stub, &fmod_scalar_kernel); REGISTER_DISPATCH(logaddexp_stub, &logaddexp_kernel); REGISTER_DISPATCH(logaddexp2_stub, &logaddexp2_kernel); REGISTER_DISPATCH(gcd_stub, &gcd_kernel); diff --git a/aten/src/ATen/native/cpu/Reduce.h b/aten/src/ATen/native/cpu/Reduce.h index 129cdc0845c4..b94e4b44aae0 100644 --- a/aten/src/ATen/native/cpu/Reduce.h +++ b/aten/src/ATen/native/cpu/Reduce.h @@ -109,7 +109,7 @@ static inline void vectorized_outer_reduction(char** data, int64_t inner_stride, } template -static void set_result(const int index, const res_t result, const TensorIterator &iter, const int num_outputs) { +static void set_result(const int index, const res_t result, const TensorIteratorBase &iter, const int num_outputs) { // static_assert(std::is_same::value, "data types must match"); if (index < num_outputs) { char *out = (char *) iter.data_ptr(index); @@ -118,20 +118,20 @@ static void set_result(const int index, const res_t result, const TensorIterator } template -static void set_results(const res_t result, const TensorIterator &iter, const int num_outputs) { +static void set_results(const res_t result, const TensorIteratorBase &iter, const int num_outputs) { AT_ASSERT(num_outputs == 1); set_result(0, result, iter, num_outputs); } template static inline typename std::enable_if::type -for_each_in_tuple(const std::tuple& t, const TensorIterator &iter, const int num_outputs) { +for_each_in_tuple(const std::tuple& t, const TensorIteratorBase &iter, const int num_outputs) { return i; } template static inline typename std::enable_if::type -for_each_in_tuple(const std::tuple& t, const TensorIterator &iter, const int num_outputs) { +for_each_in_tuple(const std::tuple& t, const TensorIteratorBase &iter, const int num_outputs) { if (i < (size_t)num_outputs) { set_result(i, std::get(t), iter, num_outputs); return for_each_in_tuple(t, iter, num_outputs); @@ -140,7 +140,7 @@ for_each_in_tuple(const std::tuple& t, const TensorIterator &iter, c } template -static void set_results(const std::tuple& result, const TensorIterator &iter, const int num_outputs) { +static void set_results(const std::tuple& result, const TensorIteratorBase &iter, const int num_outputs) { AT_ASSERT(num_outputs >= 1); std::size_t result_size = for_each_in_tuple(result, iter, num_outputs); AT_ASSERT((size_t)num_outputs == result_size); @@ -178,7 +178,7 @@ struct all_same : guts::conjunction< // into several pieces, reduce each separately, and then combine them. 
template -void binary_kernel_reduce(TensorIterator& iter, ops_t ops, init_t init) { +void binary_kernel_reduce(TensorIteratorBase& iter, ops_t ops, init_t init) { using rf_t = decltype(&ops_t::reduce); using cf_t = decltype(&ops_t::combine); using pf_t = decltype(&ops_t::project); @@ -202,7 +202,7 @@ void binary_kernel_reduce(TensorIterator& iter, ops_t ops, init_t init) { "the accumulate type must be default-constructible" ); const int num_outputs = iter.noutputs(); - iter.foreach_reduced_elt([&ops, &init, num_outputs](TensorIterator &sub_iter) { + iter.foreach_reduced_elt([&ops, &init, num_outputs](TensorIteratorBase &sub_iter) { auto reduction_body = [&ops, &sub_iter, num_outputs](acc_t acc, int64_t begin, int64_t end) -> acc_t { int ntensors = sub_iter.ntensors(); sub_iter.serial_for_each([&acc, &ops, num_outputs, ntensors, begin](char** data, const int64_t* strides, int64_t size) { @@ -244,7 +244,7 @@ void binary_kernel_reduce(TensorIterator& iter, ops_t ops, init_t init) { } template -void binary_kernel_reduce_vec(TensorIterator& iter, func_t op, vec_func_t vop, double ident = 0) { +void binary_kernel_reduce_vec(TensorIteratorBase& iter, func_t op, vec_func_t vop, double ident = 0) { using traits = binary_function_traits; static_assert( all_same< diff --git a/aten/src/ATen/native/cuda/Activation.cu b/aten/src/ATen/native/cuda/Activation.cu index 87b7304ad097..0f1df59618be 100644 --- a/aten/src/ATen/native/cuda/Activation.cu +++ b/aten/src/ATen/native/cuda/Activation.cu @@ -112,7 +112,7 @@ Tensor prelu_cuda(const Tensor& self, const Tensor& weight_) { input_stride0, input_stride1, input_numel); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } return result; @@ -230,7 +230,7 @@ std::tuple prelu_backward_cuda(const Tensor& grad_out_, const Te input_stride0, input_stride1, input_numel); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); // update weight_grad std::vector reduce_dims; diff --git a/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu b/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu index b24f26a2396e..5066480535b9 100644 --- a/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu +++ b/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu @@ -520,7 +520,7 @@ namespace { sizeB, sizeC, isizeH, isizeW, osizeH, osizeW, kernel_stride_C, kernel_size_C, istrideB, istrideC, istrideH, istrideW); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); break; @@ -563,7 +563,7 @@ namespace { input_data, output_data, isizeH, isizeW, osizeH, osizeW, istrideD, istrideH, istrideW); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); break; @@ -666,7 +666,7 @@ namespace { sizeB, sizeC, isizeH, isizeW, osizeH, osizeW, kernel_stride_C, kernel_size_C, ostrideB, ostrideC, ostrideH, ostrideW); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); break; @@ -703,7 +703,7 @@ namespace { atomic_adaptive_average_gradinput <<>> ( gradInput_data, gradOutput_data, isizeH, isizeW, osizeH, osizeW); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { @@ -711,7 +711,7 @@ namespace { adaptive_average_gradinput <<>> ( gradInput_data, gradOutput_data, isizeH, isizeW, osizeH, osizeW); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } ); diff --git a/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu b/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu index 26f95a6e010b..3e87105298e0 100644 --- 
a/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu +++ b/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu @@ -123,7 +123,7 @@ void adaptiveaveragepool_loop( istrideD, istrideT, istrideH, istrideW, offsetZ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); totalZ -= 65535; offsetZ += 65535; } @@ -216,7 +216,7 @@ void adaptiveaveragegradinput_loop( isizeT, isizeH, isizeW, osizeT, osizeH, osizeW, offsetZ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); totalZ -= 65535; offsetZ += 65535; } @@ -310,7 +310,7 @@ void atomicadaptiveaveragegradinput_loop( isizeT, isizeH, isizeW, osizeT, osizeH, osizeW, offsetZ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); totalZ -= 65535; offsetZ += 65535; } diff --git a/aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu b/aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu index ab97030d5264..dfe4c49b80aa 100644 --- a/aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu +++ b/aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu @@ -251,7 +251,7 @@ void adaptive_max_pool2d_out_cuda_template( indices_data, isizeH, isizeW, osizeH, osizeW, istrideD, istrideH, istrideW); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); } else { @@ -287,7 +287,7 @@ void adaptive_max_pool2d_out_cuda_template( indices_data, isizeH, isizeW, osizeH, osizeW, istrideD, istrideH, istrideW); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); } @@ -344,7 +344,7 @@ void adaptive_max_pool2d_backward_out_cuda_template( gradInput_data, gradOutput_data, indices_data, isizeH, isizeW, osizeH, osizeW); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { @@ -353,7 +353,7 @@ void adaptive_max_pool2d_backward_out_cuda_template( gradInput_data, gradOutput_data, indices_data, isizeH, isizeW, osizeH, osizeW); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } ); @@ -391,7 +391,7 @@ void adaptive_max_pool2d_backward_out_cuda_template( gradInput_data, gradOutput_data, indices_data, isizeH, isizeW, osizeH, osizeW); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { @@ -400,7 +400,7 @@ void adaptive_max_pool2d_backward_out_cuda_template( gradInput_data, gradOutput_data, indices_data, isizeH, isizeW, osizeH, osizeW); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } ); diff --git a/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu b/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu index dc9ebefc6d35..d515cf78bbca 100644 --- a/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu +++ b/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu @@ -131,7 +131,7 @@ void adaptivemaxpool_loop( adaptivemaxpool<<>>( input_data, output_data, indices_data, isizeT, isizeH, isizeW, osizeT, osizeH, osizeW, istrideD, istrideT, istrideH, istrideW, offsetZ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); totalZ -= 65535; offsetZ += 65535; @@ -209,7 +209,7 @@ void adaptivemaxgradinput_loop( adaptivemaxgradinput<<>>( gradInput_data, gradOutput_data, indices_data, isizeT, isizeH, isizeW, osizeT, osizeH, osizeW, offsetZ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); totalZ -= 65535; offsetZ += 65535; } @@ -285,7 +285,7 @@ void atomicadaptivemaxgradinput_loop( atomicadaptivemaxgradinput<<>>( gradInput_data, gradOutput_data, indices_data, isizeT, isizeH, isizeW, osizeT, osizeH, osizeW, offsetZ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); totalZ -= 65535; 
offsetZ += 65535; } diff --git a/aten/src/ATen/native/cuda/AmpKernels.cu b/aten/src/ATen/native/cuda/AmpKernels.cu index 69ed46ad61f6..0ac6537a8de9 100644 --- a/aten/src/ATen/native/cuda/AmpKernels.cu +++ b/aten/src/ATen/native/cuda/AmpKernels.cu @@ -243,7 +243,7 @@ Tensor _amp_update_scale_cuda(Tensor& growth_tracker, growth_factor, backoff_factor, growth_interval); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return new_scale; } diff --git a/aten/src/ATen/native/cuda/AveragePool2d.cu b/aten/src/ATen/native/cuda/AveragePool2d.cu index 6d74e5b4196f..274ced8edcdf 100644 --- a/aten/src/ATen/native/cuda/AveragePool2d.cu +++ b/aten/src/ATen/native/cuda/AveragePool2d.cu @@ -325,7 +325,7 @@ void avg_pool2d_out_cuda_template( output_data, divisor_override_value, count_include_pad, use_divisor); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } case MemoryFormat::Contiguous: { @@ -343,7 +343,7 @@ void avg_pool2d_out_cuda_template( output_data, divisor_override_value, count_include_pad, use_divisor); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } default: TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); @@ -457,7 +457,7 @@ Tensor& avg_pool2d_backward_out_cuda_template( gradInput_data, divisor_override_value, count_include_pad, use_divisor); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } case MemoryFormat::Contiguous: { @@ -475,7 +475,7 @@ Tensor& avg_pool2d_backward_out_cuda_template( gradInput_data, divisor_override_value, count_include_pad, use_divisor); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } default: TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu index a9cdfbb65705..eaee3c87b1f8 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu @@ -1025,6 +1025,23 @@ void magmaLuSolve( AT_CUDA_CHECK(cudaGetLastError()); } +template<> +void magmaLuSolve>( + magma_int_t n, magma_int_t nrhs, c10::complex* dA, magma_int_t ldda, magma_int_t* ipiv, + c10::complex* dB, magma_int_t lddb, magma_int_t* info) { + MagmaStreamSyncGuard guard; + magma_zgetrs_gpu(MagmaNoTrans, n, nrhs, reinterpret_cast(dA), ldda, ipiv, reinterpret_cast(dB), lddb, info); + AT_CUDA_CHECK(cudaGetLastError()); +} + +template<> +void magmaLuSolve>( + magma_int_t n, magma_int_t nrhs, c10::complex* dA, magma_int_t ldda, magma_int_t* ipiv, + c10::complex* dB, magma_int_t lddb, magma_int_t* info) { + MagmaStreamSyncGuard guard; + magma_cgetrs_gpu(MagmaNoTrans, n, nrhs, reinterpret_cast(dA), ldda, ipiv, reinterpret_cast(dB), lddb, info); + AT_CUDA_CHECK(cudaGetLastError()); +} template<> void magmaLuSolveBatched( @@ -1043,6 +1060,24 @@ void magmaLuSolveBatched( info = magma_sgetrs_batched(MagmaNoTrans, n, nrhs, dA_array, ldda, dipiv_array, dB_array, lddb, batchsize, magma_queue.get_queue()); AT_CUDA_CHECK(cudaGetLastError()); } + +template<> +void magmaLuSolveBatched>( + magma_int_t n, magma_int_t nrhs, c10::complex** dA_array, magma_int_t ldda, magma_int_t** dipiv_array, + c10::complex** dB_array, magma_int_t lddb, magma_int_t& info, + magma_int_t batchsize, const MAGMAQueue& magma_queue) { + info = magma_zgetrs_batched(MagmaNoTrans, n, nrhs, reinterpret_cast(dA_array), ldda, dipiv_array, reinterpret_cast(dB_array), lddb, batchsize, 
magma_queue.get_queue()); + AT_CUDA_CHECK(cudaGetLastError()); +} + +template<> +void magmaLuSolveBatched>( + magma_int_t n, magma_int_t nrhs, c10::complex** dA_array, magma_int_t ldda, magma_int_t** dipiv_array, + c10::complex** dB_array, magma_int_t lddb, magma_int_t& info, + magma_int_t batchsize, const MAGMAQueue& magma_queue) { + info = magma_cgetrs_batched(MagmaNoTrans, n, nrhs, reinterpret_cast(dA_array), ldda, dipiv_array, reinterpret_cast(dB_array), lddb, batchsize, magma_queue.get_queue()); + AT_CUDA_CHECK(cudaGetLastError()); +} #endif #define ALLOCATE_ARRAY(name, type, size) \ @@ -2149,7 +2184,7 @@ Tensor _lu_solve_helper_cuda(const Tensor& self, const Tensor& LU_data, const Te if (self.numel() == 0 || LU_data.numel() == 0) { return at::zeros_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); } - AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "lu_solve_cuda", [&]{ + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "lu_solve_cuda", [&]{ apply_lu_solve(self_working_copy, LU_data_working_copy, LU_pivots_working_copy, info); }); TORCH_CHECK(info == 0, "MAGMA lu_solve : invalid argument: ", -info); diff --git a/aten/src/ATen/native/cuda/BinaryRemainderKernel.cu b/aten/src/ATen/native/cuda/BinaryRemainderKernel.cu index 04cf2cc0c7e8..db43b0552b70 100644 --- a/aten/src/ATen/native/cuda/BinaryRemainderKernel.cu +++ b/aten/src/ATen/native/cuda/BinaryRemainderKernel.cu @@ -36,6 +36,27 @@ void remainder_kernel_cuda(TensorIterator& iter) { } } +void fmod_kernel_cuda(TensorIterator& iter) { + // Use the dtype of the first argument to retain BC, + // change to common_dtype for type promotion in the future + // Issue #47779: https://github.com/pytorch/pytorch/issues/47779 + if (isIntegralType(iter.dtype(), /*includeBool*/ false)) { + AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "fmod_cuda", [&]() { + gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + return a % b; + }); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.dtype(), "fmod_cuda", [&]() { + gpu_kernel_with_scalars(iter, + []GPU_LAMBDA(scalar_t a, scalar_t b) __ubsan_ignore_float_divide_by_zero__ -> scalar_t { + return ::fmod(a, b); + }); + }); + } +} + REGISTER_DISPATCH(remainder_stub, &remainder_kernel_cuda); +REGISTER_DISPATCH(fmod_stub, &fmod_kernel_cuda); }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Bucketization.cu b/aten/src/ATen/native/cuda/Bucketization.cu index dad691981ddb..e28e7414aac6 100644 --- a/aten/src/ATen/native/cuda/Bucketization.cu +++ b/aten/src/ATen/native/cuda/Bucketization.cu @@ -86,7 +86,7 @@ void searchsorted_cuda_contiguous(Tensor& result, const Tensor& input, const Ten searchsorted_cuda_kernel<<>>( data_out, data_in, data_bd, idim_in, idim_bd, numel_in, right, boundaries.dim() == 1); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } void dispatch(Tensor& result, const Tensor& input, const Tensor& boundaries, bool out_int32, bool right) { diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu index d0b8c40ee4dc..d92a0d6fd1ff 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu @@ -413,7 +413,7 @@ void max_pool2d_with_indices_out_cuda_template( in_stride_h, in_stride_w, kernel_stride_C, kernel_size_C, output_data, indices_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } case MemoryFormat::Contiguous: { @@ -425,7 +425,7 @@ void max_pool2d_with_indices_out_cuda_template( nbatch, 
nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, kH, kW, dH, dW, padH, padW, dilationH, dilationW, output_data, indices_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } default: TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); @@ -583,7 +583,7 @@ void max_pool2d_with_indices_backward_out_cuda_template( in_stride_h, in_stride_w, kernel_stride_C, kernel_size_C, gradInput_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } case MemoryFormat::Contiguous: { @@ -607,7 +607,7 @@ void max_pool2d_with_indices_backward_out_cuda_template( nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, kH, kW, dH, dW, padH, padW, dilationH, dilationW, gradInput_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } default: TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu index bbafebacbf13..e6eacbb8424e 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu @@ -112,7 +112,7 @@ void max_pool3d_with_indices_out_frame( pT, pH, pW, dilationT, dilationH, dilationW, offsetZ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); totalZ -= 65535; offsetZ += 65535; @@ -177,7 +177,7 @@ void max_pool3d_with_indices_backward_out_frame( pT, pH, pW, dilationT, dilationH, dilationW, offsetZ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); totalZ -= 65535; offsetZ += 65535; diff --git a/aten/src/ATen/native/cuda/DistanceKernel.cu b/aten/src/ATen/native/cuda/DistanceKernel.cu index 60a2c943e742..515388a0fe3e 100644 --- a/aten/src/ATen/native/cuda/DistanceKernel.cu +++ b/aten/src/ATen/native/cuda/DistanceKernel.cu @@ -231,19 +231,19 @@ void cdist_kernel_impl(Tensor& result, const Tensor& x1, const Tensor& x2, doubl AT_DISPATCH_FLOATING_TYPES(x1.scalar_type(), "cdist_cuda", [&] { if (p == 0.0) { cdist_kernel_cuda_impl::zero><<>>(result.data_ptr(), x1.data_ptr(), x2.data_ptr(), p, r1, r2, m, r_size, l1_size, l2_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (p == 1.0) { cdist_kernel_cuda_impl::one><<>>(result.data_ptr(), x1.data_ptr(), x2.data_ptr(), p, r1, r2, m, r_size, l1_size, l2_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (p == 2.0) { cdist_kernel_cuda_impl::two><<>>(result.data_ptr(), x1.data_ptr(), x2.data_ptr(), p, r1, r2, m, r_size, l1_size, l2_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (std::isinf(p)) { cdist_kernel_cuda_impl::inf><<>>(result.data_ptr(), x1.data_ptr(), x2.data_ptr(), p, r1, r2, m, r_size, l1_size, l2_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { cdist_kernel_cuda_impl::p><<>>(result.data_ptr(), x1.data_ptr(), x2.data_ptr(), p, r1, r2, m, r_size, l1_size, l2_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } }); } @@ -261,19 +261,19 @@ void pdist_forward_kernel_impl(Tensor& result, const Tensor& self, double p) { AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "pdist_cuda", [&] { if (p == 0.0) { pdist_kernel_cuda_impl::zero><<>>(result.data_ptr(), self.data_ptr(), n, m, p, n2, n2_squared_minus_1); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (p == 1.0) { 
pdist_kernel_cuda_impl::one><<>>(result.data_ptr(), self.data_ptr(), n, m, p, n2, n2_squared_minus_1); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (p == 2.0) { pdist_kernel_cuda_impl::two><<>>(result.data_ptr(), self.data_ptr(), n, m, p, n2, n2_squared_minus_1); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (std::isinf(p)) { pdist_kernel_cuda_impl::inf><<>>(result.data_ptr(), self.data_ptr(), n, m, p, n2, n2_squared_minus_1); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { pdist_kernel_cuda_impl::p><<>>(result.data_ptr(), self.data_ptr(), n, m, p, n2, n2_squared_minus_1); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } }); } @@ -303,19 +303,19 @@ void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "pdist_cuda_backward", [&] { if (p == 1.0) { pdist_backward_kernel_cuda_impl::one><<>>(buffer.data_ptr(), grad.data_ptr(), self.data_ptr(), dist.data_ptr(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (p < 2.0) { pdist_backward_kernel_cuda_impl::lt_two><<>>(buffer.data_ptr(), grad.data_ptr(), self.data_ptr(), dist.data_ptr(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (p == 2.0) { pdist_backward_kernel_cuda_impl::two><<>>(buffer.data_ptr(), grad.data_ptr(), self.data_ptr(), dist.data_ptr(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (std::isinf(p)) { pdist_backward_kernel_cuda_impl::inf><<>>(buffer.data_ptr(), grad.data_ptr(), self.data_ptr(), dist.data_ptr(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { pdist_backward_kernel_cuda_impl::p><<>>(buffer.data_ptr(), grad.data_ptr(), self.data_ptr(), dist.data_ptr(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } }); @@ -354,27 +354,27 @@ void cdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor cdist_backward_kernel_cuda_impl::one><<>>(buffer.data_ptr(), grad.data_ptr(), x1.data_ptr(), x2.data_ptr(), dist.data_ptr(), gs, p, r1, r2, m, count, r_size, l1_size, l2_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (p < 2.0) { cdist_backward_kernel_cuda_impl::lt_two><<>>(buffer.data_ptr(), grad.data_ptr(), x1.data_ptr(), x2.data_ptr(), dist.data_ptr(), gs, p, r1, r2, m, count, r_size, l1_size, l2_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (p == 2.0) { cdist_backward_kernel_cuda_impl::two><<>>(buffer.data_ptr(), grad.data_ptr(), x1.data_ptr(), x2.data_ptr(), dist.data_ptr(), gs, p, r1, r2, m, count, r_size, l1_size, l2_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (std::isinf(p)) { cdist_backward_kernel_cuda_impl::inf><<>>(buffer.data_ptr(), grad.data_ptr(), x1.data_ptr(), x2.data_ptr(), dist.data_ptr(), gs, p, r1, r2, m, count, r_size, l1_size, l2_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { cdist_backward_kernel_cuda_impl::p><<>>(buffer.data_ptr(), grad.data_ptr(), x1.data_ptr(), x2.data_ptr(), 
dist.data_ptr(), gs, p, r1, r2, m, count, r_size, l1_size, l2_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } }); diff --git a/aten/src/ATen/native/cuda/Dropout.cu b/aten/src/ATen/native/cuda/Dropout.cu index 8301f0564572..67adbaabbb84 100644 --- a/aten/src/ATen/native/cuda/Dropout.cu +++ b/aten/src/ATen/native/cuda/Dropout.cu @@ -243,7 +243,7 @@ inline void launcher( nelem, pa, rng_engine_inputs); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; case 2: fused_dropout_kernel_vec< @@ -259,7 +259,7 @@ inline void launcher( nelem, pa, rng_engine_inputs); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } } else { @@ -273,7 +273,7 @@ inline void launcher( nelem, pa, rng_engine_inputs); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; default: if (!self.is_contiguous() && ret.is_contiguous() && @@ -289,7 +289,7 @@ inline void launcher( nelem, pa, rng_engine_inputs); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { fused_dropout_kernel <<(num_indices), static_cast(stride), static_cast(padding_idx)); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); }); @@ -370,7 +370,7 @@ Tensor & embedding_renorm_cuda_(Tensor & self, const Tensor & indices, static_cast(max_norm), static_cast(norm_type), dim, self.stride(0), self.stride(1)); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); }); diff --git a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu index 3931edeb6f12..689db4347067 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu @@ -235,7 +235,7 @@ Tensor embedding_backward_cuda_kernel( segment_offsets.data_ptr(), num_of_segments, numel); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } // In order to compute `partial_segment_offset`, which is the start index @@ -263,7 +263,7 @@ Tensor embedding_backward_cuda_kernel( partials_per_segment_offset.data_ptr(), segment_offsets.data_ptr(), num_of_segments); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } const int stride_warped = ceil_div(stride, C10_WARP_SIZE)*C10_WARP_SIZE; @@ -296,7 +296,7 @@ Tensor embedding_backward_cuda_kernel( partial_segment_offset.data_ptr(), num_of_partial_segments, grad_weight_per_segment.data_ptr(), stride_warped); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { compute_grad_weight<<>>( orig_indices.data_ptr(), @@ -307,7 +307,7 @@ Tensor embedding_backward_cuda_kernel( num_of_partial_segments, grad_weight_per_segment.data_ptr(), stride_warped); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } // Finally, we sum all the partial-sums and scatter them @@ -323,7 +323,7 @@ Tensor embedding_backward_cuda_kernel( num_of_partial_segments, padding_idx, stride_warped); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); }); diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index 1ace60f40600..5bed5532baee 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -244,7 +244,7 @@ Tensor embedding_bag_backward_cuda_max(const Tensor &grad, scalar_t, index_t><<>>( max_indices.data_ptr(), grad.data_ptr(), grad_weight.data_ptr(), stride, numBags); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + 
C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); @@ -335,7 +335,7 @@ _embedding_bag_cuda(const Tensor &weight, const Tensor &indices, mode == MODE_MAX ? max_indices.data_ptr() : NULL, per_sample_weights.defined() ? per_sample_weights.data_ptr() : NULL, per_sample_weights.defined() ? per_sample_weights.stride(0) : 0); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); }); @@ -475,7 +475,7 @@ Tensor _embedding_bag_per_sample_weights_backward_cuda( num_samples, embedding_features, output.data_ptr()); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } ); diff --git a/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu b/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu index 9a9586bdc160..bee3cfa4d436 100644 --- a/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu @@ -205,7 +205,7 @@ void fractional_max_pool2d_out_cuda_template( <<>>( devOutput, devIndices, devInput, devSamples, poolSizeH, poolSizeW); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); } @@ -272,7 +272,7 @@ void fractional_max_pool2d_backward_out_cuda_template( fractional_max_pool2d_backward_out_cuda_frame <<>>( devGradInput, devGradOutput, devIndices); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); } diff --git a/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu b/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu index b4aebc4b1e4c..0d492de48570 100644 --- a/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu +++ b/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu @@ -241,7 +241,7 @@ void fractional_max_pool3d_out_cuda_template( randomSamples.packed_accessor64(), poolSizeT, poolSizeH, poolSizeW ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); } @@ -327,7 +327,7 @@ void fractional_max_pool3d_backward_out_cuda_template( gradOutput_.packed_accessor64(), indices_.packed_accessor64() ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); } diff --git a/aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu b/aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu index 3168698848ad..180385aaf052 100644 --- a/aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu +++ b/aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu @@ -37,7 +37,7 @@ void _lauch_kernel(int total_n_elems, const func_t& f) { auto stream = at::cuda::getCurrentCUDAStream(); _elemwise_kernel <<>>(total_n_elems, f); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template diff --git a/aten/src/ATen/native/cuda/GridSampler.cu b/aten/src/ATen/native/cuda/GridSampler.cu index 273f0af25c8b..a08c13037e34 100644 --- a/aten/src/ATen/native/cuda/GridSampler.cu +++ b/aten/src/ATen/native/cuda/GridSampler.cu @@ -708,7 +708,7 @@ Tensor grid_sampler_2d_cuda(const Tensor& input, const Tensor& grid, static_cast(interpolation_mode), static_cast(padding_mode), align_corners); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { grid_sampler_2d_kernel <<>>( @@ -719,7 +719,7 @@ Tensor grid_sampler_2d_cuda(const Tensor& input, const Tensor& grid, static_cast(interpolation_mode), static_cast(padding_mode), align_corners); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } }); } @@ -749,7 +749,7 @@ Tensor grid_sampler_3d_cuda(const Tensor& input, const Tensor& grid, static_cast(interpolation_mode), static_cast(padding_mode), align_corners); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { 
grid_sampler_3d_kernel <<>>( @@ -760,7 +760,7 @@ Tensor grid_sampler_3d_cuda(const Tensor& input, const Tensor& grid, static_cast(interpolation_mode), static_cast(padding_mode), align_corners); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } }); } @@ -796,7 +796,7 @@ grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input, static_cast(interpolation_mode), static_cast(padding_mode), align_corners); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { grid_sampler_2d_backward_kernel <<>>( @@ -809,7 +809,7 @@ grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input, static_cast(interpolation_mode), static_cast(padding_mode), align_corners); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } }); } @@ -846,7 +846,7 @@ grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input, static_cast(interpolation_mode), static_cast(padding_mode), align_corners); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { grid_sampler_3d_backward_kernel <<>>( @@ -859,7 +859,7 @@ grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input, static_cast(interpolation_mode), static_cast(padding_mode), align_corners); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } }); } diff --git a/aten/src/ATen/native/cuda/IndexKernel.cu b/aten/src/ATen/native/cuda/IndexKernel.cu index be2c477d4cdc..7d7a59b32406 100644 --- a/aten/src/ATen/native/cuda/IndexKernel.cu +++ b/aten/src/ATen/native/cuda/IndexKernel.cu @@ -90,7 +90,7 @@ static void launch_kernel(int64_t N, const func_t& f) { dim3 grid((N + block.x * vt - 1) / (block.x * vt)); auto stream = at::cuda::getCurrentCUDAStream(); index_elementwise_kernel<<>>(N, f); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template diff --git a/aten/src/ATen/native/cuda/LegacyDefinitions.cpp b/aten/src/ATen/native/cuda/LegacyDefinitions.cpp index 92895d947758..1bbe47dbfb2e 100644 --- a/aten/src/ATen/native/cuda/LegacyDefinitions.cpp +++ b/aten/src/ATen/native/cuda/LegacyDefinitions.cpp @@ -76,50 +76,4 @@ Tensor & masked_scatter__cuda(Tensor& self, const Tensor & mask, const Tensor & } } -Tensor & fmod_cuda_out(Tensor & result, const Tensor & self, Scalar other) { - at::assert_no_internal_overlap(result); - return legacy::cuda::_th_fmod_out(result, self, other); -} - -Tensor fmod_cuda(const Tensor & self, Scalar other) { - return legacy::cuda::_th_fmod(self, other); -} - -Tensor & fmod_cuda_out(Tensor & result, const Tensor & self, const Tensor & other) { - at::assert_no_internal_overlap(result); - Tensor b_self, b_other; - // optimization that codegen used to do; avoids broadcast. - if (other.dim() == 0) { - return fmod_cuda_out(result, self, other.item()); - } - std::tie(b_self, b_other) = expand_outplace(self, other, "fmod_out"); - return legacy::cuda::_th_fmod_out(result, b_self, b_other); -} - -Tensor fmod_cuda(const Tensor & self, const Tensor & other) { - // optimization that codegen used to do; avoids broadcast. 
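Editor's note: the deletions in `LegacyDefinitions.cpp` here remove the TH-backed `fmod_cuda*` wrappers; CUDA `fmod` now goes through the TensorIterator-based `fmod_kernel_cuda` registered on `fmod_stub` in `BinaryRemainderKernel.cu` above, mirroring the CPU kernel, and scalar divisors are handled by the same kernel (via `gpu_kernel_with_scalars`) instead of a separate `fmod_scalar_kernel`. Both kernels deliberately dispatch on `iter.dtype()`, i.e. the first argument's dtype, to preserve backward compatibility until the type-promotion change tracked in issue #47779. As a simplified illustration (not the literal operator definition; `fmod_out_sketch` is a hypothetical name, and the real operator's dtype configuration is not shown in this hunk), the wiring that replaces the removed wrappers amounts to building an iterator and invoking the shared stub:

```cpp
#include <ATen/ATen.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/BinaryOps.h>

// Hedged sketch of the stub-based path that supersedes the legacy TH wrappers.
at::Tensor& fmod_out_sketch(at::Tensor& result, const at::Tensor& self, const at::Tensor& other) {
  // Build an iterator over (result, self, other); no promotion flags are set
  // here because this hunk does not show how the real operator configures them.
  auto iter = at::TensorIteratorConfig()
                  .add_output(result)
                  .add_input(self)
                  .add_input(other)
                  .build();
  at::native::fmod_stub(iter.device_type(), iter);  // resolves to the CPU or CUDA kernel at runtime
  return result;
}
```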
- if (other.dim() == 0) { - return fmod_cuda(self, other.item()); - } - Tensor b_self, b_other; - std::tie(b_self, b_other) = expand_outplace(self, other, "fmod"); - return legacy::cuda::_th_fmod(b_self, b_other); -} - -Tensor & fmod_cuda_(Tensor & self, Scalar other) { - at::assert_no_internal_overlap(self); - return legacy::cuda::_th_fmod_(self, other); -} - -Tensor & fmod_cuda_(Tensor & self, const Tensor & other) { - // optimization that codegen used to do; avoids broadcast. - if (other.dim() == 0) { - return fmod_cuda_(self, other.item()); - } - at::assert_no_internal_overlap(self); - Tensor b_other; - std::tie(b_other) = expand_inplace(self, other, "fmod_"); - return legacy::cuda::_th_fmod_(self, b_other); -} - }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Loops.cuh b/aten/src/ATen/native/cuda/Loops.cuh index ef4e1f618a59..fb1f8e6720b4 100644 --- a/aten/src/ATen/native/cuda/Loops.cuh +++ b/aten/src/ATen/native/cuda/Loops.cuh @@ -183,7 +183,7 @@ static inline void launch_unrolled_kernel_for_multi_outputs(int64_t N, const fun int64_t grid = (N + block_work_size - 1) / block_work_size; auto stream = at::cuda::getCurrentCUDAStream(); unrolled_elementwise_kernel_for_multi_outputs<<>>(N, f, data, ic, oc); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template diff --git a/aten/src/ATen/native/cuda/LossCTC.cu b/aten/src/ATen/native/cuda/LossCTC.cu index 29ff6e6a47ad..69718b206d6b 100644 --- a/aten/src/ATen/native/cuda/LossCTC.cu +++ b/aten/src/ATen/native/cuda/LossCTC.cu @@ -281,7 +281,7 @@ std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2), tg_batch_offsets.data_ptr(), tg_target_stride, batch_size, BLANK); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return std::make_tuple(neg_log_likelihood, log_alpha); } @@ -633,7 +633,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), tg_batch_offsets.data_ptr(), tg_target_stride, batch_size, BLANK); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } // Very crude heuristic for what is a small problem., based on linearly regressing problem dimensions on @@ -690,7 +690,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), tg_batch_offsets.data_ptr(), tg_target_stride, batch_size, num_labels, BLANK, zero_infinity); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { // small problem, use naive algorithm // Still no block/grid configuration guru... 
int threads_input = max_threads; @@ -713,7 +713,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), tg_batch_offsets.data_ptr(), tg_target_stride, batch_size, num_labels, BLANK, zero_infinity); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); // catch launch errors + C10_CUDA_KERNEL_LAUNCH_CHECK(); // catch launch errors } // zero those invalid graident elements due to padding @@ -737,7 +737,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ grad.size(1), grad.size(2) ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } return grad; diff --git a/aten/src/ATen/native/cuda/MaxUnpooling.cu b/aten/src/ATen/native/cuda/MaxUnpooling.cu index 81a634c2fd47..c3517ab49d1c 100644 --- a/aten/src/ATen/native/cuda/MaxUnpooling.cu +++ b/aten/src/ATen/native/cuda/MaxUnpooling.cu @@ -169,7 +169,7 @@ Tensor& max_unpooling2d_forward_out_cuda( oheight, owidth, output.data_ptr()); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); })); if (self.ndimension() == 3) { output.resize_({numChannels, oheight, owidth}); @@ -343,7 +343,7 @@ Tensor& max_unpooling3d_forward_out_cuda( oH, oW, offsetZ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); totalZ -= 65535; offsetZ += 65535; } @@ -446,7 +446,7 @@ at::Tensor& max_unpooling2d_backward_out_cuda( oheight, owidth, grad_input.data_ptr()); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); })); return grad_input; } @@ -550,7 +550,7 @@ at::Tensor& max_unpooling3d_backward_out_cuda( indices.packed_accessor64(), grad_input_reshaped.packed_accessor64(), offsetZ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); totalZ -= 65535; offsetZ += 65535; } diff --git a/aten/src/ATen/native/cuda/MultiTensorApply.cuh b/aten/src/ATen/native/cuda/MultiTensorApply.cuh index ade7c09e4951..8b300483c945 100644 --- a/aten/src/ATen/native/cuda/MultiTensorApply.cuh +++ b/aten/src/ATen/native/cuda/MultiTensorApply.cuh @@ -96,7 +96,7 @@ void multi_tensor_apply( tensorListMeta, callable, args...); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); // Reset. loc_block_info = 0; @@ -152,7 +152,7 @@ void multi_tensor_apply( tensorListMeta, callable, args...); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); // Reset. 
loc_block_info = 0; diff --git a/aten/src/ATen/native/cuda/TensorTransformations.cu b/aten/src/ATen/native/cuda/TensorTransformations.cu index 147f7f3fad6f..a435c7060f45 100644 --- a/aten/src/ATen/native/cuda/TensorTransformations.cu +++ b/aten/src/ATen/native/cuda/TensorTransformations.cu @@ -95,7 +95,7 @@ Tensor flip_cuda(const Tensor& self, IntArrayRef dims) { kernel_pointwise_flip_apply2 <<>>( in_tensor_info, out_tensor_info, N, flip_dim, total_dims); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); return out_tensor; } @@ -132,7 +132,7 @@ Tensor flip_cuda(const Tensor& self, IntArrayRef dims) { stride_contiguous.cuda().data_ptr(), shape_t.cuda().data_ptr(), total_dims); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); return out_tensor; @@ -197,7 +197,7 @@ Tensor roll_cuda(const Tensor& self, IntArrayRef shifts, IntArrayRef dims) { size, in_tensor.stride(dim), total_dims); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); return out_tensor; diff --git a/aten/src/ATen/native/cuda/TriangularOps.cu b/aten/src/ATen/native/cuda/TriangularOps.cu index 6bb2ae79ffa9..6ba73e1c143e 100644 --- a/aten/src/ATen/native/cuda/TriangularOps.cu +++ b/aten/src/ATen/native/cuda/TriangularOps.cu @@ -67,14 +67,14 @@ Tensor& triu_tril_cuda_template(Tensor& result, const Tensor& self, int64_t k, c triu_tril_kernel <<>>( result_info, self_info, k, N); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { auto result_info = cuda::detail::getTensorInfo(result); auto self_info = cuda::detail::getTensorInfo(self); triu_tril_kernel <<>>( result_info, self_info, k, N); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } }); return result; @@ -192,7 +192,7 @@ Tensor& apply_diag(Tensor& result, const Tensor& self, int64_t dimension) { sz, self_stride_0 + self_stride_1, result_stride); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } else { auto n_elems = self.numel(); @@ -221,7 +221,7 @@ Tensor& apply_diag(Tensor& result, const Tensor& self, int64_t dimension) { n_elems, result_stride_0 + result_stride_1, self_stride); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } diff --git a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu index e0a27adbd985..512154fd02df 100644 --- a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu @@ -95,7 +95,7 @@ void sqrt_kernel_cuda(TensorIterator& iter) { } void sigmoid_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "sigmoid_cuda", [&]() { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.common_dtype(), "sigmoid_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { scalar_t one = scalar_t(1); return one / (one + std::exp(- a)); diff --git a/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu b/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu index 1884b09a4fab..64bda79809bb 100644 --- a/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu +++ b/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu @@ -45,7 +45,7 @@ static void _launch_unfold_backward_kernel(int total_n_elems, func_t f) { auto stream = at::cuda::getCurrentCUDAStream(); _unfold_backward_elementwise_kernel <<>>(total_n_elems, f); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template diff --git 
a/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu b/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu index 53af1d463606..13f0741bb5da 100644 --- a/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu @@ -228,7 +228,7 @@ static void upsample_bicubic2d_out_cuda_template( align_corners, idata, odata); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } @@ -302,7 +302,7 @@ static void upsample_bicubic2d_backward_out_cuda_template( 0, stream>>>( num_kernels, rheight, rwidth, align_corners, idata, odata); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } diff --git a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu index 248d972bb320..4b142d5024d8 100644 --- a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu @@ -213,7 +213,7 @@ static void upsample_bilinear2d_out_cuda_template( 0, stream>>>( num_kernels, rheight, rwidth, align_corners, idata, odata); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } @@ -305,7 +305,7 @@ static void upsample_bilinear2d_backward_out_cuda_template( align_corners, idata, odata); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } diff --git a/aten/src/ATen/native/cuda/UpSampleLinear1d.cu b/aten/src/ATen/native/cuda/UpSampleLinear1d.cu index 08824565b150..eda43fbfa398 100644 --- a/aten/src/ATen/native/cuda/UpSampleLinear1d.cu +++ b/aten/src/ATen/native/cuda/UpSampleLinear1d.cu @@ -160,7 +160,7 @@ static void upsample_linear1d_out_cuda_template( num_threads, 0, stream>>>(num_kernels, rwidth, align_corners, idata, odata); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } @@ -220,7 +220,7 @@ static void upsample_linear1d_backward_out_cuda_template( num_threads, 0, stream>>>(num_kernels, rwidth, align_corners, idata, odata); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } diff --git a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu index 425d450b375f..ef287ca592da 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu @@ -128,7 +128,7 @@ static void upsample_nearest1d_out_cuda_template( upsample_nearest1d_out_frame<<>>( idata, nbatch, channels, input_width, output_width, odata, scale_factor); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } @@ -190,7 +190,7 @@ static void upsample_nearest1d_backward_out_cuda_template( upsample_nearest1d_backward_out_frame <<>>( odata, nbatch, channels, output_width, input_width, idata, scale_factor); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } diff --git a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu index a7f935e5f681..0ac02e292b28 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu @@ -204,7 +204,7 @@ static void upsample_nearest2d_out_cuda_template( output_width, height_scale, width_scale); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } @@ -286,7 +286,7 @@ static void upsample_nearest2d_backward_out_cuda_template( idata, height_scale, width_scale); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } diff --git a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu index 
820358152351..000e116e7bdf 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu @@ -199,7 +199,7 @@ static void upsample_nearest3d_out_cuda_template( depth_scale, height_scale, width_scale); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } @@ -291,7 +291,7 @@ static void upsample_nearest3d_backward_out_cuda_template( depth_scale, height_scale, width_scale); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } diff --git a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu index cf623723eaaa..bfd191977c4d 100644 --- a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu +++ b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu @@ -271,7 +271,7 @@ static void upsample_trilinear3d_out_cuda_template( align_corners, idata, odata); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } @@ -360,7 +360,7 @@ static void upsample_trilinear3d_backward_out_cuda_template( align_corners, idata, odata); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } diff --git a/aten/src/ATen/native/cuda/group_norm_kernel.cu b/aten/src/ATen/native/cuda/group_norm_kernel.cu index 6cb9351548fa..1fd710a65e9f 100644 --- a/aten/src/ATen/native/cuda/group_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/group_norm_kernel.cu @@ -570,7 +570,7 @@ void GroupNormKernelImplInternal( : cuda_utils::kCUDABlockReduceNumThreads; RowwiseMomentsCUDAKernel<<>>( D * HxW, eps, X_data, mean_data, rstd_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); if (HxW == 1) { GroupNorm1dForward(X, mean, rstd, gamma, beta, N, C, G, Y); @@ -604,7 +604,7 @@ void GroupNormKernelImplInternal( const int64_t B = (N * C + kCUDANumThreads - 1) / kCUDANumThreads; ComputeFusedParamsCUDAKernel<<>>( N, C, G, mean_data, rstd_data, gamma_data, beta_data, a_data, b_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); auto iter = TensorIteratorConfig() .check_all_same_dtype(std::is_same::value) .resize_outputs(false) @@ -698,7 +698,7 @@ void GroupNorm1dBackward( gamma_data, c2_data, c3_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); if (gamma.defined()) { auto iter = TensorIteratorConfig() @@ -754,7 +754,7 @@ void GroupNorm1dBackward( rstd_data, dgamma_data, dbeta_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { const int64_t B = (C + kReduceTileSize - 1) / kReduceTileSize; // The algorithm for colwise reduction here is to accumulate each 32 cols @@ -773,7 +773,7 @@ void GroupNorm1dBackward( rstd_data, dgamma_data, dbeta_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } } @@ -837,7 +837,7 @@ void GroupNormBackwardKernelImplInternal( : cuda_utils::kCUDABlockReduceNumThreads; ComputeInternalGradientsCUDAKernel<<>>( HxW, dY_data, X_data, ds_data, db_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); if (dX.defined()) { Tensor c1 = at::empty({0}, X.options().dtype(kAccType)); @@ -873,7 +873,7 @@ void GroupNormBackwardKernelImplInternal( db_data, c2_data, c3_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); if (gamma.defined()) { auto iter = TensorIteratorConfig() @@ -925,7 +925,7 @@ void GroupNormBackwardKernelImplInternal( db_data, dgamma_data, dbeta_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { const int64_t B = (C + kReduceTileSize - 1) / 
kReduceTileSize; // The algorithm for colwise reduction here is to accumulate each 32 cols @@ -944,7 +944,7 @@ void GroupNormBackwardKernelImplInternal( db_data, dgamma_data, dbeta_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } } diff --git a/aten/src/ATen/native/cuda/im2col.cuh b/aten/src/ATen/native/cuda/im2col.cuh index e0eada961f24..aee072fcea82 100644 --- a/aten/src/ATen/native/cuda/im2col.cuh +++ b/aten/src/ATen/native/cuda/im2col.cuh @@ -108,7 +108,7 @@ void im2col( height_col, width_col, data_col); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template @@ -208,7 +208,7 @@ void col2im( output_height, output_width, data_im); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } // namespace native diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu index 7c5a08e9ede7..817001e126ae 100644 --- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu @@ -278,10 +278,10 @@ void LayerNormKernelImplInternal( RowwiseMomentsCUDAKernel <<>>( N, eps, X_data, mean_data, rstd_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); LayerNormForwardCUDAKernel<<>>( N, X_data, mean_data, rstd_data, gamma_data, beta_data, Y_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } void LayerNormKernelImpl( @@ -340,7 +340,7 @@ void LayerNormBackwardKernelImplInternal( ComputeInternalGradientsCUDAKernel <<>>( N, dY_data, X_data, gamma_data, ds_data, db_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; ComputeGradientFusedParamsCUDAKernel <<>>( @@ -352,7 +352,7 @@ void LayerNormBackwardKernelImplInternal( db_data, scale_data, bias_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); LayerNormBackwardCUDAKenrel<<>>( N, dY_data, @@ -362,7 +362,7 @@ void LayerNormBackwardKernelImplInternal( scale_data, bias_data, dX_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } if (dgamma->defined() || dbeta->defined()) { T* dgamma_data = @@ -381,7 +381,7 @@ void LayerNormBackwardKernelImplInternal( rstd_data, dgamma_data, dbeta_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { const int64_t B = (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; @@ -397,7 +397,7 @@ void LayerNormBackwardKernelImplInternal( rstd_data, dgamma_data, dbeta_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } } diff --git a/aten/src/ATen/native/cuda/vol2col.cuh b/aten/src/ATen/native/cuda/vol2col.cuh index ec97888a5c47..960b44e6d106 100644 --- a/aten/src/ATen/native/cuda/vol2col.cuh +++ b/aten/src/ATen/native/cuda/vol2col.cuh @@ -129,7 +129,7 @@ void vol2col( height_col, width_col, data_col); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template @@ -264,7 +264,7 @@ void col2vol( output_height, output_width, data_vol); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } // namespace native diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 2d6e570d25c8..3cc6f3a93f5a 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1693,6 +1693,7 @@ CPU: resize_ CUDA: resize_cuda_ QuantizedCPU: quantized_resize_cpu_ + Meta: resize_meta_ - func: empty_quantized(int[] 
size, Tensor qtensor) -> Tensor use_c10_dispatcher: full @@ -5587,15 +5588,13 @@ use_c10_dispatcher: full variants: method dispatch: - CPU: fmod_ - CUDA: fmod_cuda_ + CPU, CUDA: fmod_ - func: fmod_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) use_c10_dispatcher: full variants: method dispatch: - CPU: fmod_ - CUDA: fmod_cuda_ + CPU, CUDA: fmod_ - func: remainder_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) use_c10_dispatcher: full @@ -6581,27 +6580,23 @@ - func: fmod.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: fmod_out - CUDA: fmod_cuda_out + CPU, CUDA: fmod_out - func: fmod.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: fmod - CUDA: fmod_cuda + CPU, CUDA: fmod - func: fmod.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: fmod_out - CUDA: fmod_cuda_out + CPU, CUDA: fmod_out - func: fmod.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: fmod - CUDA: fmod_cuda + CPU, CUDA: fmod - func: hypot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -9508,6 +9503,18 @@ dispatch: Math: linalg_tensorsolve_out +- func: linalg_matrix_rank(Tensor self, float? tol=None, bool hermitian=False) -> Tensor + python_module: linalg + variants: function + dispatch: + Math: linalg_matrix_rank + +- func: linalg_matrix_rank.out(Tensor self, float? tol=None, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + dispatch: + Math: linalg_matrix_rank_out + ## Functions that are only for testing # It is undocumented and should not be used outside of tests. - func: _test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp index 1217a94d753b..1c52242641e7 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp @@ -31,10 +31,18 @@ at::Tensor embedding_bag_4bit_impl( // Get compressed indices for pruned_weights op. int32_t* compressed_indices_mapping_data = nullptr; int compressed_index_size = 0; + bool fallback_to_no_sparse = false; if (pruned_weights) { compressed_index_size = compressed_indices_mapping.value().numel(); compressed_indices_mapping_data = compressed_indices_mapping.value().data_ptr(); + + // if compressed_indices_mapping is [0], it is a indicator that + // we should fallback to non sparse embedding look up kernel. + if ((compressed_index_size == 1 && + compressed_indices_mapping_data[0] == 0)) { + fallback_to_no_sparse = true; + } } const int64_t N = weight.size(0); @@ -69,7 +77,7 @@ at::Tensor embedding_bag_4bit_impl( constexpr int prefetch_distance = 16; #ifdef USE_FBGEMM - if (!pruned_weights) { + if (!pruned_weights || fallback_to_no_sparse) { // Generate the fbgemm kernel auto kernel = fbgemm::GenerateEmbeddingSpMDMNBit( /*bit rate=*/4, @@ -209,10 +217,18 @@ at::Tensor embedding_bag_byte_impl( // Get compressed indices for pruned_weights. 
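The linalg_matrix_rank functions registered just above (Math dispatch plus an out= variant) back the new torch.linalg.matrix_rank documented later in this patch. A small usage sketch with a deliberately rank-deficient matrix:

    import torch

    A = torch.tensor([[1., 2.],
                      [2., 4.]])                        # second row = 2 * first row
    print(torch.linalg.matrix_rank(A))                  # tensor(1)

    S = A @ A.t()                                       # symmetric, so the hermitian path applies
    print(torch.linalg.matrix_rank(S, hermitian=True))  # also rank 1, via the eigenvalue-based path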
int32_t* compressed_indices_mapping_data = nullptr; int compressed_index_size = 0; + bool fallback_to_no_sparse = false; if (pruned_weights) { compressed_index_size = compressed_indices_mapping.value().numel(); compressed_indices_mapping_data = compressed_indices_mapping.value().data_ptr(); + + // if compressed_indices_mapping is [0], it is a indicator that + // we should fallback to non sparse embedding look up kernel. + if ((compressed_index_size == 1 && + compressed_indices_mapping_data[0] == 0)) { + fallback_to_no_sparse = true; + } } const int64_t N = weight.size(0); @@ -247,7 +263,7 @@ at::Tensor embedding_bag_byte_impl( const int index_size = indices.numel(); #ifdef USE_FBGEMM - if (!pruned_weights) { + if (!pruned_weights || fallback_to_no_sparse) { auto kernel_i8 = fbgemm::GenerateEmbeddingSpMDM( /*block_size=*/D, @@ -613,12 +629,12 @@ class QEmbeddingBag final { false /* is_embedding_op */); } else if (bit_rate == 4) { return packed_weight->embeddingbag_4bit( - indices, - offsets, - pruned_weights, - per_sample_weights_, - compressed_indices_mapping, - include_last_offset); + indices, + offsets, + pruned_weights, + per_sample_weights_, + compressed_indices_mapping, + include_last_offset); } else { TORCH_INTERNAL_ASSERT( "Currently only support 8-bit embedding_bag quantization"); diff --git a/aten/src/ATen/templates/Functions.cpp b/aten/src/ATen/templates/Functions.cpp index 81e2a9f6d406..37c2919bb458 100644 --- a/aten/src/ATen/templates/Functions.cpp +++ b/aten/src/ATen/templates/Functions.cpp @@ -7,6 +7,22 @@ namespace at { +Tensor var(const Tensor& self, int dim) { + return at::var(self, IntArrayRef{dim}); +} + +std::tuple var_mean(const Tensor& self, int dim) { + return at::var_mean(self, IntArrayRef{dim}); +} + +Tensor std(const Tensor& self, int dim) { + return at::std(self, IntArrayRef{dim}); +} + +std::tuple std_mean(const Tensor& self, int dim) { + return at::std_mean(self, IntArrayRef{dim}); +} + ${function_definitions} } diff --git a/aten/src/ATen/templates/Functions.h b/aten/src/ATen/templates/Functions.h index 5ec90e7e3d61..50623dc2dfed 100644 --- a/aten/src/ATen/templates/Functions.h +++ b/aten/src/ATen/templates/Functions.h @@ -7,7 +7,6 @@ #include #include #include -#include // TODO: try to delete this #include #include #include @@ -19,26 +18,38 @@ namespace at { +// These functions are defined in ATen/Utils.cpp. 
+#define TENSOR(T, S) \ + CAFFE2_API Tensor tensor(ArrayRef values, const TensorOptions& options); \ + inline Tensor tensor( \ + std::initializer_list values, const TensorOptions& options) { \ + return at::tensor(ArrayRef(values), options); \ + } \ + inline Tensor tensor(T value, const TensorOptions& options) { \ + return at::tensor(ArrayRef(value), options); \ + } \ + inline Tensor tensor(ArrayRef values) { \ + return at::tensor(std::move(values), at::dtype(k##S)); \ + } \ + inline Tensor tensor(std::initializer_list values) { \ + return at::tensor(ArrayRef(values)); \ + } \ + inline Tensor tensor(T value) { \ + return at::tensor(ArrayRef(value)); \ + } +AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TENSOR) +AT_FORALL_COMPLEX_TYPES(TENSOR) +#undef TENSOR + ${function_declarations} // Special C++ only overloads for std()-like functions (See gh-40287) // These are needed because int -> bool conversion takes precedence over int -> IntArrayRef // So, for example std(0) would select the std(unbiased=False) overload -inline Tensor var(const Tensor& self, int dim) { - return at::native::var(self, IntArrayRef{dim}); -} - -inline std::tuple var_mean(const Tensor& self, int dim) { - return at::native::var_mean(self, IntArrayRef{dim}); -} - -inline Tensor std(const Tensor& self, int dim) { - return at::native::std(self, IntArrayRef{dim}); -} - -inline std::tuple std_mean(const Tensor& self, int dim) { - return at::native::std_mean(self, IntArrayRef{dim}); -} +CAFFE2_API Tensor var(const Tensor& self, int dim); +CAFFE2_API std::tuple var_mean(const Tensor& self, int dim); +CAFFE2_API Tensor std(const Tensor& self, int dim); +CAFFE2_API std::tuple std_mean(const Tensor& self, int dim); namespace { inline std::vector zero_sizes(const TensorOptions& options) { diff --git a/aten/src/ATen/templates/NativeFunctions.h b/aten/src/ATen/templates/NativeFunctions.h index 0244efcb3a6b..b4cb31f60ee8 100644 --- a/aten/src/ATen/templates/NativeFunctions.h +++ b/aten/src/ATen/templates/NativeFunctions.h @@ -23,29 +23,6 @@ struct Type; } // namespace at namespace at { -// These functions are defined in ATen/Utils.cpp. 
-#define TENSOR(T, S) \ - CAFFE2_API Tensor tensor(ArrayRef values, const TensorOptions& options); \ - inline Tensor tensor( \ - std::initializer_list values, const TensorOptions& options) { \ - return at::tensor(ArrayRef(values), options); \ - } \ - inline Tensor tensor(T value, const TensorOptions& options) { \ - return at::tensor(ArrayRef(value), options); \ - } \ - inline Tensor tensor(ArrayRef values) { \ - return at::tensor(std::move(values), at::dtype(k##S)); \ - } \ - inline Tensor tensor(std::initializer_list values) { \ - return at::tensor(ArrayRef(values)); \ - } \ - inline Tensor tensor(T value) { \ - return at::tensor(ArrayRef(value)); \ - } -AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TENSOR) -AT_FORALL_COMPLEX_TYPES(TENSOR) -#undef TENSOR - namespace native { ${native_function_declarations} diff --git a/aten/src/THC/THCTensorMathPairwise.cu b/aten/src/THC/THCTensorMathPairwise.cu index da57a1ad36f8..04fb34df4f70 100644 --- a/aten/src/THC/THCTensorMathPairwise.cu +++ b/aten/src/THC/THCTensorMathPairwise.cu @@ -21,36 +21,5 @@ struct TensorMulConstantOp { const T val; }; -template -struct TensorFmodOp { - TensorFmodOp(T v) : val((float)v) {} - __device__ __forceinline__ void operator()(T* out, T* in) { - *out = (T) fmodf((float) *in, val); - } - - __device__ __forceinline__ void operator()(T* v) { - *v = (T) fmodf((float) *v, val); - } - - const float val; -}; - -template <> -struct TensorFmodOp { - TensorFmodOp(double v) : val(v) {} - __device__ __forceinline__ void operator()(double* out, double* in) { - *out = fmod(*in, val); - } - - __device__ __forceinline__ void operator()(double* v) { - *v = fmod(*v, val); - } - - const double val; -}; - -#include -#include - #include #include diff --git a/aten/src/THC/THCTensorMathPointwise.cuh b/aten/src/THC/THCTensorMathPointwise.cuh index 2b511983934f..bb2e31de2669 100644 --- a/aten/src/THC/THCTensorMathPointwise.cuh +++ b/aten/src/THC/THCTensorMathPointwise.cuh @@ -36,50 +36,6 @@ struct TensorMulOp { } }; -template -struct TensorCFmodOp { - __device__ __forceinline__ void operator()(T* out, T* in) { - *out = *out % *in; - } - - __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { - *out = *in1 % *in2; - } -}; - -template <> -struct TensorCFmodOp { - __device__ __forceinline__ void operator()(float* out, float* in) { - *out = fmodf(*out, *in); - } - - __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) { - *out = fmodf(*in1, *in2); - } -}; - -template <> -struct TensorCFmodOp { - __device__ __forceinline__ void operator()(double* out, double* in) { - *out = fmod(*out, *in); - } - - __device__ __forceinline__ void operator()(double* out, double* in1, double* in2) { - *out = fmod(*in1, *in2); - } -}; - -template <> -struct TensorCFmodOp { - __device__ __forceinline__ void operator()(at::Half* out, at::Half* in) { - *out = fmodf(*out, *in); - } - - __device__ __forceinline__ void operator()(at::Half* out, at::Half* in1, at::Half* in2) { - *out = fmodf(*in1, *in2); - } -}; - template struct TensorCrossOp { TensorCrossOp(int64_t sx, int64_t sy, int64_t so) : sx(sx), sy(sy), so(so) {} diff --git a/aten/src/THC/generic/THCTensorMathPairwise.h b/aten/src/THC/generic/THCTensorMathPairwise.h index 7bae0a54fa41..2e530aa4be42 100644 --- a/aten/src/THC/generic/THCTensorMathPairwise.h +++ b/aten/src/THC/generic/THCTensorMathPairwise.h @@ -7,7 +7,6 @@ THC_API int THCTensor_(equal)(THCState *state, THCTensor *self, THCTensor *src); #if !defined(THC_REAL_IS_BOOL) THC_API void THCTensor_(mul)(THCState 
*state, THCTensor *self, THCTensor *src, scalar_t value); -THC_API void THCTensor_(fmod)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); #endif diff --git a/aten/src/THC/generic/THCTensorMathPointwise.cu b/aten/src/THC/generic/THCTensorMathPointwise.cu index 54fe16bc85c0..c33716c5f565 100644 --- a/aten/src/THC/generic/THCTensorMathPointwise.cu +++ b/aten/src/THC/generic/THCTensorMathPointwise.cu @@ -83,23 +83,5 @@ void THCTensor_(cmul)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen at::mul_out(out, at::Tensor(retainTensorImpl(src1)), at::Tensor(retainTensorImpl(src2))); } -void THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) -{ - THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); - THArgCheck(THCTensor_(nElement)(state, src1) == - THCTensor_(nElement)(state, src2), 2, "sizes do not match"); - - if (self == src1) { - if (!THC_pointwiseApply2(state, self, src2, TensorCFmodOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCTensor_(resizeAs)(state, self, src1); - if (!THC_pointwiseApply3(state, self, src1, src2, TensorCFmodOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } -} - #endif #endif diff --git a/aten/src/THC/generic/THCTensorMathPointwise.h b/aten/src/THC/generic/THCTensorMathPointwise.h index 658f344f8b14..56ead75ba992 100644 --- a/aten/src/THC/generic/THCTensorMathPointwise.h +++ b/aten/src/THC/generic/THCTensorMathPointwise.h @@ -20,7 +20,6 @@ THC_API void THCTensor_(cmul)(THCState *state, THCTensor *self, THCTensor *src1, THC_API void THCTensor_(cdiv)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); THC_API void THCTensor_(clshift)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); THC_API void THCTensor_(crshift)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); -THC_API void THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); #endif #endif diff --git a/c10/cuda/CUDAException.h b/c10/cuda/CUDAException.h index e3b2348ef6ab..5d1a473b5597 100644 --- a/c10/cuda/CUDAException.h +++ b/c10/cuda/CUDAException.h @@ -33,4 +33,4 @@ // This should be used directly after every kernel launch to ensure // the launch happened correctly and provide an early, close-to-source // diagnostic if it didn't. 
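For context on the macro renamed just below: CUDA kernel launches fail asynchronously, so without a per-launch check an error only surfaces at a later synchronization point, far from the offending call. A Python-level illustration of that timing (requires a CUDA device; the tensors here are placeholders):

    import torch

    if torch.cuda.is_available():
        x = torch.randn(1 << 20, device="cuda")
        y = x.sin()                  # enqueues a kernel and returns immediately
        torch.cuda.synchronize()     # a bad launch would only be reported here, far from its
                                     # source, which is what the per-launch check macro avoids in C++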
-#define TORCH_CUDA_KERNEL_LAUNCH_CHECK() C10_CUDA_CHECK(cudaGetLastError()) +#define C10_CUDA_KERNEL_LAUNCH_CHECK() C10_CUDA_CHECK(cudaGetLastError()) diff --git a/caffe2/sgd/adadelta_op_gpu.cu b/caffe2/sgd/adadelta_op_gpu.cu index 8ead39705a70..92416b503bcd 100644 --- a/caffe2/sgd/adadelta_op_gpu.cu +++ b/caffe2/sgd/adadelta_op_gpu.cu @@ -47,7 +47,7 @@ void AdadeltaUpdate( CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(N, w, g, h, d, epsilon, decay, lr, nw, nh, nd); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } // namespace @@ -155,7 +155,7 @@ class CUDASparseAdadeltaOp final : public Operator { paramOut, momentOut, momentDeltaOut); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } diff --git a/caffe2/sgd/adagrad_fused_op_gpu.cu b/caffe2/sgd/adagrad_fused_op_gpu.cu index 88d51e1026a8..539b9919e8e7 100644 --- a/caffe2/sgd/adagrad_fused_op_gpu.cu +++ b/caffe2/sgd/adagrad_fused_op_gpu.cu @@ -638,7 +638,7 @@ class CUDASparseAdagradFusedWithSparseLengthsSumGradientOp final 0, context_.cuda_stream()>>>( grad, lengths, grad_buffer_data, block_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } if (block_size <= maxThreads) { @@ -661,7 +661,7 @@ class CUDASparseAdagradFusedWithSparseLengthsSumGradientOp final is_mean ? grad_buffer_data : grad, lr, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { // calling cuda kernel with ExactBlock = false sparse_adagrad_fused_length_sum_gradient_kernel< @@ -680,7 +680,7 @@ class CUDASparseAdagradFusedWithSparseLengthsSumGradientOp final is_mean ? grad_buffer_data : grad, lr, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } return true; } @@ -822,7 +822,7 @@ class CUDASparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final out_weight_grads, lr, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (block_size > 64) { sparse_adagrad_fused_length_weighted_sum_gradient_kernel< IndexType, @@ -842,7 +842,7 @@ class CUDASparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final out_weight_grads, lr, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (block_size > 32) { sparse_adagrad_fused_length_weighted_sum_gradient_kernel< IndexType, @@ -862,7 +862,7 @@ class CUDASparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final out_weight_grads, lr, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { sparse_adagrad_fused_length_weighted_sum_gradient_kernel< IndexType, @@ -882,7 +882,7 @@ class CUDASparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final out_weight_grads, lr, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } return true; } @@ -1012,7 +1012,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp final 0, context_.cuda_stream()>>>( grad, lengths, grad_buffer_data, block_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } // 0: nearest rounding @@ -1045,7 +1045,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp final lr, seed, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { rowwise_sparse_adagrad_fused_length_sum_gradient_kernel< IndexType, @@ -1065,7 +1065,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp final lr, seed, weight_decay_); - 
TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } else { if (round_option_) { @@ -1091,7 +1091,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp final lr, seed, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { rowwise_sparse_adagrad_fused_length_sum_gradient_kernel< IndexType, @@ -1115,7 +1115,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp final lr, seed, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } return true; @@ -1253,7 +1253,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientExactOp final 0, context_.cuda_stream()>>>( grad, lengths, grad_buffer_data, block_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } sorted_linear_ind_buffer_.ResizeLike(indicesInput); @@ -1265,7 +1265,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientExactOp final indices, prefix_sum_length_data, seg_id_buffer_.template mutable_data()); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); sort_pairs_wrapper( num_indices, @@ -1330,7 +1330,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientExactOp final lr, seed, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { rowwise_sparse_adagrad_fused_length_sum_gradient_dedup_kernel< IndexType, @@ -1357,7 +1357,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientExactOp final lr, seed, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } else { const int sm_size = block_size * sizeof(float); @@ -1392,7 +1392,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientExactOp final lr, seed, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { rowwise_sparse_adagrad_fused_length_sum_gradient_dedup_kernel< IndexType, @@ -1419,7 +1419,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientExactOp final lr, seed, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } @@ -1569,7 +1569,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final out_weight_grads, lr, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (block_size > 64) { rowwise_sparse_adagrad_fused_length_weighted_sum_gradient_kernel< IndexType, @@ -1589,7 +1589,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final out_weight_grads, lr, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (block_size > 32) { rowwise_sparse_adagrad_fused_length_weighted_sum_gradient_kernel< IndexType, @@ -1609,7 +1609,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final out_weight_grads, lr, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { rowwise_sparse_adagrad_fused_length_weighted_sum_gradient_kernel< IndexType, @@ -1629,7 +1629,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final out_weight_grads, lr, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } return true; diff --git a/caffe2/sgd/adagrad_op_gpu.cu b/caffe2/sgd/adagrad_op_gpu.cu index 0937bbaef05e..8abb3376ca87 100644 --- a/caffe2/sgd/adagrad_op_gpu.cu +++ b/caffe2/sgd/adagrad_op_gpu.cu @@ -44,7 +44,7 @@ void adagrad_update( 0, 
context->cuda_stream()>>>( N, w, g, h, nw, nh, epsilon, decay, lr, weight_decay); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template @@ -189,7 +189,7 @@ class CUDASparseAdagradOp final : public Operator { Input(GRAD).template data(), Input(LR).template data(), weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } @@ -233,7 +233,7 @@ bool RowWiseSparseAdagradOp::DoRunWithType() { Input(GRAD).template data(), Input(LR).template data(), weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } diff --git a/caffe2/sgd/adam_op_gpu.cu b/caffe2/sgd/adam_op_gpu.cu index b5edb3fd3ab2..42ab975faacb 100644 --- a/caffe2/sgd/adam_op_gpu.cu +++ b/caffe2/sgd/adam_op_gpu.cu @@ -47,7 +47,7 @@ void adam_update( 0, context->cuda_stream()>>>( N, g, m, v, ng, nm, nv, beta1, beta2, eps_hat, correction, lr); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } __global__ void AdamCompute( @@ -95,7 +95,7 @@ void adam_compute( 0, context->cuda_stream()>>>( N, w, g, m, v, nw, nm, nv, beta1, beta2, eps_hat, correction, lr); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } __global__ void AdamComputeOutputGrad( @@ -145,7 +145,7 @@ void adam_compute_output_grad( 0, context->cuda_stream()>>>( N, w, g, m, v, nw, nm, nv, ng, beta1, beta2, eps_hat, correction, lr); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template @@ -336,7 +336,7 @@ bool SparseAdamOp::DoRunWithType() { correction, Input(LR).template data(), iter); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); SparseAdamOutputGradKernel @@ -358,7 +358,7 @@ bool SparseAdamOp::DoRunWithType() { correction, Input(LR).template data(), iter); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } return true; @@ -403,7 +403,7 @@ bool RowWiseSparseAdamOp::DoRunWithType() { Input(GRAD).template data(), correction, Input(LR).template data()); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); RowWiseSparseAdamOutputGradKernel @@ -424,7 +424,7 @@ bool RowWiseSparseAdamOp::DoRunWithType() { Input(GRAD).template data(), correction, Input(LR).template data()); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } return true; diff --git a/caffe2/sgd/fp16_momentum_sgd_op.cu b/caffe2/sgd/fp16_momentum_sgd_op.cu index be0be6d78643..985f4f2864d1 100644 --- a/caffe2/sgd/fp16_momentum_sgd_op.cu +++ b/caffe2/sgd/fp16_momentum_sgd_op.cu @@ -215,7 +215,7 @@ void fp16_momentum_sgd_update( nesterov, weight_decay, reinterpret_cast(param)); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); // not setting N to N/2 } else { FP16MomentumSGDFP32Kernel<<< @@ -233,7 +233,7 @@ void fp16_momentum_sgd_update( nesterov, weight_decay, reinterpret_cast(param)); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); // not setting N to N/2 } diff --git a/caffe2/sgd/fp32_momentum_sgd_op.cu b/caffe2/sgd/fp32_momentum_sgd_op.cu index f31b24ba7541..1ae9015bc6d9 100644 --- a/caffe2/sgd/fp32_momentum_sgd_op.cu +++ b/caffe2/sgd/fp32_momentum_sgd_op.cu @@ -108,7 +108,7 @@ void fp32_momentum_sgd_update( nesterov, weight_decay, reinterpret_cast(param)); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); // not setting N to N/2 // TODO_ check float performance vs float2 } diff --git 
a/caffe2/sgd/lars_op_gpu.cu b/caffe2/sgd/lars_op_gpu.cu index e3129235255f..2a1d6c79e833 100644 --- a/caffe2/sgd/lars_op_gpu.cu +++ b/caffe2/sgd/lars_op_gpu.cu @@ -31,7 +31,7 @@ void LarsOp::ComputeLearningRate( float* lr_rescaled) { ComputeLearningRateKernel<<<1, 1, 0, context_.cuda_stream()>>>( wd, trust, lr_max, offset, lr_min, X_norm, dX_norm, lr_rescaled); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } REGISTER_CUDA_OPERATOR(Lars, LarsOp); diff --git a/caffe2/sgd/momentum_sgd_op_gpu.cu b/caffe2/sgd/momentum_sgd_op_gpu.cu index e2439a75d711..e8eb00654e65 100644 --- a/caffe2/sgd/momentum_sgd_op_gpu.cu +++ b/caffe2/sgd/momentum_sgd_op_gpu.cu @@ -82,14 +82,14 @@ void momentum_sgd_update( CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(N, g, m, ng, nm, lr, momentum, param); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { MomentumSGDKernel <<cuda_stream()>>>(N, g, m, ng, nm, lr, momentum, param); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } @@ -152,7 +152,7 @@ bool SparseMomentumSGDUpdateOp::DoRunWithType() { Input(GRAD).template data(), Output(OUTPUT_GRAD)->template mutable_data(), Input(LR).template data()); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } diff --git a/caffe2/sgd/rmsprop_op_gpu.cu b/caffe2/sgd/rmsprop_op_gpu.cu index f2b5b4f19463..d6f236739084 100644 --- a/caffe2/sgd/rmsprop_op_gpu.cu +++ b/caffe2/sgd/rmsprop_op_gpu.cu @@ -43,7 +43,7 @@ void rmsprop_update( CUDAContext* context) { RmsPropUpdate<<cuda_stream()>>>( N, g, ms, mom, ng, nms, nmom, decay, momentum, epsilon, lr); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/caffe2/sgd/yellowfin_op_gpu.cu b/caffe2/sgd/yellowfin_op_gpu.cu index 7644821cf3f1..cb62ae433557 100644 --- a/caffe2/sgd/yellowfin_op_gpu.cu +++ b/caffe2/sgd/yellowfin_op_gpu.cu @@ -32,7 +32,7 @@ void YellowFinOp::GetLrMu() { // Finding root of cubic formula for YF's Single Step GetLrMuKernel<<<1, 1, 0, context_.cuda_stream()>>>( g_norm2_max_deb_, g_norm2_min_deb_, distance_deb_, variance_, mu_, lr_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); MovingAverage(1, mu_, mu_avg_, mu_avg_out_, mu_deb_); MovingAverage(1, lr_, lr_avg_, lr_avg_out_, lr_deb_); } @@ -79,7 +79,7 @@ void YellowFinOp::MomentumSgdUpdate() { param_out_, moment_out_, nesterov_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } REGISTER_CUDA_OPERATOR(YellowFin, YellowFinOp); diff --git a/caffe2/utils/eigen_utils.h b/caffe2/utils/eigen_utils.h index 83e7cb2317bb..d5dbe121f6f8 100644 --- a/caffe2/utils/eigen_utils.h +++ b/caffe2/utils/eigen_utils.h @@ -73,17 +73,28 @@ using EArrXf = Eigen::ArrayXf; using EArrXd = Eigen::ArrayXd; using EArrXi = Eigen::ArrayXi; using EArrXb = EArrXt; +using EArrXI32 = EArrXt; +using EArrXU16 = EArrXt; +using EArrXU8 = EArrXt; // 2-d array, column major template using EArrXXt = Eigen::Array; using EArrXXf = Eigen::ArrayXXf; +using EArrXXI32 = EArrXXt; +using EArrXXU16 = EArrXXt; +using EArrXXU8 = EArrXXt; +using EArrXXi = EArrXXt; // 2-d array, row major template using ERArrXXt = Eigen::Array; using ERArrXXf = ERArrXXt; +using ERArrXXI32t = ERArrXXt; +using ERArrXXU16t = ERArrXXt; +using ERArrXXU8t = ERArrXXt; +using ERArrXXi = ERArrXXt; // 1-d vector template diff --git a/docs/source/distributions.rst b/docs/source/distributions.rst index aebc39038368..fe09626e60d8 100644 --- a/docs/source/distributions.rst +++ b/docs/source/distributions.rst @@ -167,6 
+167,15 @@ Probability distributions - torch.distributions :undoc-members: :show-inheritance: +:hidden:`Kumaraswamy` +~~~~~~~~~~~~~~~~~~~~~ + +.. currentmodule:: torch.distributions.kumaraswamy +.. autoclass:: Kumaraswamy + :members: + :undoc-members: + :show-inheritance: + :hidden:`Laplace` ~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/linalg.rst b/docs/source/linalg.rst index eb7b3c120c61..b5d78572c06b 100644 --- a/docs/source/linalg.rst +++ b/docs/source/linalg.rst @@ -16,6 +16,7 @@ Functions .. autofunction:: det .. autofunction:: eigh .. autofunction:: eigvalsh +.. autofunction:: matrix_rank .. autofunction:: norm .. autofunction:: tensorinv .. autofunction:: tensorsolve diff --git a/docs/source/notes/windows.rst b/docs/source/notes/windows.rst index 443d3849582f..cc195e7a93a9 100644 --- a/docs/source/notes/windows.rst +++ b/docs/source/notes/windows.rst @@ -20,14 +20,15 @@ MKL and MAGMA. Here are the steps to build with them. REM Download MAGMA files REM version available: + REM 2.5.4 (CUDA 10.1 10.2 11.0 11.1) x (Debug Release) REM 2.5.3 (CUDA 10.1 10.2 11.0) x (Debug Release) REM 2.5.2 (CUDA 9.2 10.0 10.1 10.2) x (Debug Release) REM 2.5.1 (CUDA 9.2 10.0 10.1 10.2) x (Debug Release) REM 2.5.0 (CUDA 9.0 9.2 10.0 10.1) x (Debug Release) REM 2.4.0 (CUDA 8.0 9.2) x (Release) - set CUDA_PREFIX=cuda92 + set CUDA_PREFIX=cuda101 set CONFIG=release - curl -k https://s3.amazonaws.com/ossci-windows/magma_2.5.1_%CUDA_PREFIX%_%CONFIG%.7z -o magma.7z + curl -k https://s3.amazonaws.com/ossci-windows/magma_2.5.4_%CUDA_PREFIX%_%CONFIG%.7z -o magma.7z 7z x -aoa magma.7z -omagma REM Setting essential environment variables diff --git a/setup.py b/setup.py index fd777b141688..01f173d6825b 100644 --- a/setup.py +++ b/setup.py @@ -327,8 +327,16 @@ def check_file(f): # Use copies instead of symbolic files. # Windows has very poor support for them. 
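This patch also adds a Kumaraswamy distribution: the docs entry appears above, and the shape and moment tests appear further down in test_distributions.py. A short sketch of the API those tests exercise, with arbitrary concentration values:

    import torch
    from torch.distributions import Kumaraswamy

    c1 = torch.tensor([1.5, 2.0])             # concentration1
    c0 = torch.tensor([1.0, 3.0])             # concentration0
    d = Kumaraswamy(c1, c0)

    samples = d.sample((1000,))               # support is the interval (0, 1)
    print(samples.shape)                      # torch.Size([1000, 2])
    print(d.mean, d.variance)                 # closed-form moments checked by the new tests
    print(d.log_prob(torch.tensor([0.3, 0.7])))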
- sym_files = ['tools/shared/_utils_internal.py'] - orig_files = ['torch/_utils_internal.py'] + sym_files = [ + 'tools/shared/_utils_internal.py', + 'torch/utils/benchmark/utils/valgrind_wrapper/callgrind.h', + 'torch/utils/benchmark/utils/valgrind_wrapper/valgrind.h', + ] + orig_files = [ + 'torch/_utils_internal.py', + 'third_party/valgrind-headers/callgrind.h', + 'third_party/valgrind-headers/valgrind.h', + ] for sym_file, orig_file in zip(sym_files, orig_files): same = False if os.path.exists(sym_file): @@ -907,6 +915,9 @@ def print_box(msg): 'share/cmake/Gloo/*.cmake', 'share/cmake/Tensorpipe/*.cmake', 'share/cmake/Torch/*.cmake', + 'utils/benchmark/utils/*.cpp', + 'utils/benchmark/utils/valgrind_wrapper/*.cpp', + 'utils/benchmark/utils/valgrind_wrapper/*.h', ], 'caffe2': [ 'python/serialized_test/data/operator_test/*.zip', diff --git a/test/benchmark_utils/test_benchmark_utils.py b/test/benchmark_utils/test_benchmark_utils.py index 8b1bdf1419ee..779a704bfd92 100644 --- a/test/benchmark_utils/test_benchmark_utils.py +++ b/test/benchmark_utils/test_benchmark_utils.py @@ -7,7 +7,7 @@ import torch import torch.utils.benchmark as benchmark_utils -from torch.testing._internal.common_utils import TestCase, run_tests, IS_WINDOWS, slowTest +from torch.testing._internal.common_utils import TestCase, run_tests, IS_SANDCASTLE, IS_WINDOWS, slowTest from torch.testing._internal import expecttest import numpy as np @@ -162,6 +162,17 @@ def test_timer(self): ).timeit(5).median self.assertIsInstance(sample, float) + @slowTest + @unittest.skipIf(IS_SANDCASTLE, "C++ timing is OSS only.") + def test_cpp_timer(self): + timer = benchmark_utils.Timer( + "torch::Tensor y = x + 1;", + setup="torch::Tensor x = torch::empty({1});", + language=benchmark_utils.Language.CPP, + ) + t = timer.timeit(10) + self.assertIsInstance(t.median, float) + class _MockTimer: _seed = 0 diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py index 68f5922753a3..d75f21740435 100644 --- a/test/distributions/test_distributions.py +++ b/test/distributions/test_distributions.py @@ -44,7 +44,7 @@ Distribution, Exponential, ExponentialFamily, FisherSnedecor, Gamma, Geometric, Gumbel, HalfCauchy, HalfNormal, - Independent, Laplace, LogisticNormal, + Independent, Kumaraswamy, Laplace, LogisticNormal, LogNormal, LowRankMultivariateNormal, MixtureSameFamily, Multinomial, MultivariateNormal, NegativeBinomial, Normal, OneHotCategorical, Pareto, @@ -240,6 +240,16 @@ def is_all_nan(tensor): 'reinterpreted_batch_ndims': 3, }, ]), + Example(Kumaraswamy, [ + { + 'concentration1': torch.empty(2, 3).uniform_(1, 2).requires_grad_(), + 'concentration0': torch.empty(2, 3).uniform_(1, 2).requires_grad_(), + }, + { + 'concentration1': torch.rand(4).uniform_(1, 2).requires_grad_(), + 'concentration0': torch.rand(4).uniform_(1, 2).requires_grad_(), + }, + ]), Example(Laplace, [ { 'loc': torch.randn(5, 5, requires_grad=True), @@ -2249,6 +2259,42 @@ def test_gumbel_sample(self): scipy.stats.gumbel_r(loc=loc, scale=scale), 'Gumbel(loc={}, scale={})'.format(loc, scale)) + def test_kumaraswamy_shape(self): + concentration1 = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) + concentration0 = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) + concentration1_1d = torch.tensor(torch.randn(1).abs(), requires_grad=True) + concentration0_1d = torch.tensor(torch.randn(1).abs(), requires_grad=True) + self.assertEqual(Kumaraswamy(concentration1, concentration0).sample().size(), (2, 3)) + 
self.assertEqual(Kumaraswamy(concentration1, concentration0).sample((5,)).size(), (5, 2, 3)) + self.assertEqual(Kumaraswamy(concentration1_1d, concentration0_1d).sample().size(), (1,)) + self.assertEqual(Kumaraswamy(concentration1_1d, concentration0_1d).sample((1,)).size(), (1, 1)) + self.assertEqual(Kumaraswamy(1.0, 1.0).sample().size(), ()) + self.assertEqual(Kumaraswamy(1.0, 1.0).sample((1,)).size(), (1,)) + + # Kumaraswamy distribution is not implemented in SciPy + # Hence these tests are explicit + def test_kumaraswamy_mean_variance(self): + c1_1 = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) + c0_1 = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) + c1_2 = torch.tensor(torch.randn(4).abs(), requires_grad=True) + c0_2 = torch.tensor(torch.randn(4).abs(), requires_grad=True) + cases = [(c1_1, c0_1), (c1_2, c0_2)] + for i, (a, b) in enumerate(cases): + m = Kumaraswamy(a, b) + samples = m.sample((60000, )) + expected = samples.mean(0) + actual = m.mean + error = (expected - actual).abs() + max_error = max(error[error == error]) + self.assertLess(max_error, 0.01, + "Kumaraswamy example {}/{}, incorrect .mean".format(i + 1, len(cases))) + expected = samples.var(0) + actual = m.variance + error = (expected - actual).abs() + max_error = max(error[error == error]) + self.assertLess(max_error, 0.01, + "Kumaraswamy example {}/{}, incorrect .variance".format(i + 1, len(cases))) + @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_fishersnedecor(self): df1 = torch.randn(2, 3).abs().requires_grad_() @@ -2622,6 +2668,18 @@ def test_valid_parameter_broadcasting(self): (1, 2)), (Gumbel(loc=torch.tensor([0.]), scale=torch.tensor([[1.]])), (1, 1)), + (Kumaraswamy(concentration1=torch.tensor([1., 1.]), concentration0=1.), + (2,)), + (Kumaraswamy(concentration1=1, concentration0=torch.tensor([1., 1.])), + (2, )), + (Kumaraswamy(concentration1=torch.tensor([1., 1.]), concentration0=torch.tensor([1.])), + (2,)), + (Kumaraswamy(concentration1=torch.tensor([1., 1.]), concentration0=torch.tensor([[1.], [1.]])), + (2, 2)), + (Kumaraswamy(concentration1=torch.tensor([1., 1.]), concentration0=torch.tensor([[1.]])), + (1, 2)), + (Kumaraswamy(concentration1=torch.tensor([1.]), concentration0=torch.tensor([[1.]])), + (1, 1)), (Laplace(loc=torch.tensor([0., 0.]), scale=1), (2,)), (Laplace(loc=0, scale=torch.tensor([1., 1.])), @@ -2701,6 +2759,14 @@ def test_invalid_parameter_broadcasting(self): 'concentration': torch.tensor([0, 0]), 'rate': torch.tensor([1, 1, 1]) }), + (Kumaraswamy, { + 'concentration1': torch.tensor([[1, 1]]), + 'concentration0': torch.tensor([1, 1, 1, 1]) + }), + (Kumaraswamy, { + 'concentration1': torch.tensor([[[1, 1, 1], [1, 1, 1]]]), + 'concentration0': torch.tensor([1, 1]) + }), (Laplace, { 'loc': torch.tensor([0, 0]), 'scale': torch.tensor([1, 1, 1]) @@ -3242,6 +3308,15 @@ def test_gumbel_shape_scalar_params(self): self.assertEqual(gumbel.log_prob(self.tensor_sample_1).size(), torch.Size((3, 2))) self.assertEqual(gumbel.log_prob(self.tensor_sample_2).size(), torch.Size((3, 2, 3))) + def test_kumaraswamy_shape_scalar_params(self): + kumaraswamy = Kumaraswamy(1, 1) + self.assertEqual(kumaraswamy._batch_shape, torch.Size()) + self.assertEqual(kumaraswamy._event_shape, torch.Size()) + self.assertEqual(kumaraswamy.sample().size(), torch.Size()) + self.assertEqual(kumaraswamy.sample((3, 2)).size(), torch.Size((3, 2))) + self.assertEqual(kumaraswamy.log_prob(self.tensor_sample_1).size(), torch.Size((3, 2))) + 
self.assertEqual(kumaraswamy.log_prob(self.tensor_sample_2).size(), torch.Size((3, 2, 3))) + def test_vonmises_shape_tensor_params(self): von_mises = VonMises(torch.tensor([0., 0.]), torch.tensor([1., 1.])) self.assertEqual(von_mises._batch_shape, torch.Size((2,))) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 0f481689a10a..abaa078bb353 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -2349,22 +2349,6 @@ def forward(self, input, indices): indices = torch.tensor([[1, 0], [0, 1], [0, 1]], dtype=torch.int64) self.run_test(GatherModel(), input=(input, indices)) - @skipIfUnsupportedMinOpsetVersion(9) - def test_gather_float_index(self): - class GatherFloatIndexModel(torch.jit.ScriptModule): - @torch.jit.script_method - def forward(self, input, mask): - seq_length, batch_size = mask.shape - result = input[0][0][0] - for i in torch.arange(2, seq_length): - result = input[0][i][0] - return result - - model = GatherFloatIndexModel() - x = torch.randint(0, 5, (8, 8, 17), dtype=torch.long) - y = torch.ones(8, 1, dtype=torch.uint8) - self.run_test(model, (x, y)) - @skipIfUnsupportedMinOpsetVersion(9) def test_expand(self): class ExpandModel(torch.nn.Module): diff --git a/test/quantization/test_numeric_suite.py b/test/quantization/test_numeric_suite.py index 44963735d6c2..74ecc4a90469 100644 --- a/test/quantization/test_numeric_suite.py +++ b/test/quantization/test_numeric_suite.py @@ -104,7 +104,7 @@ def compare_and_validate_results(float_model, q_model): model.eval() if hasattr(model, "fuse_model"): model.fuse_model() - q_model = quantize(model, test_only_eval_fn, self.img_data_2d) + q_model = quantize(model, test_only_eval_fn, [self.img_data_2d]) compare_and_validate_results(model, q_model) @override_qengines @@ -126,7 +126,7 @@ def compare_and_validate_results(float_model, q_model): model.eval() if hasattr(model, "fuse_model"): model.fuse_model() - q_model = quantize(model, test_only_eval_fn, self.calib_data) + q_model = quantize(model, test_only_eval_fn, [self.calib_data]) compare_and_validate_results(model, q_model) @override_qengines @@ -197,7 +197,7 @@ def compare_and_validate_results(float_model, q_model, module_swap_list, data): model.eval() if hasattr(model, "fuse_model"): model.fuse_model() - q_model = quantize(model, test_only_eval_fn, self.img_data_2d) + q_model = quantize(model, test_only_eval_fn, [self.img_data_2d]) compare_and_validate_results( model, q_model, module_swap_list, self.img_data_2d[0][0] ) @@ -223,7 +223,7 @@ def compare_and_validate_results(float_model, q_model, module_swap_list, data): model.eval() if hasattr(model, "fuse_model"): model.fuse_model() - q_model = quantize(model, test_only_eval_fn, self.calib_data) + q_model = quantize(model, test_only_eval_fn, [self.calib_data]) compare_and_validate_results(model, q_model, module_swap_list, linear_data) @override_qengines @@ -233,7 +233,7 @@ def test_compare_model_stub_submodule_static(self): qengine = torch.backends.quantized.engine model = ModelWithSubModules().eval() - q_model = quantize(model, test_only_eval_fn, self.img_data_2d) + q_model = quantize(model, test_only_eval_fn, [self.img_data_2d]) module_swap_list = [SubModule] ob_dict = compare_model_stub( model, q_model, module_swap_list, self.img_data_2d[0][0] @@ -350,7 +350,7 @@ def compare_and_validate_results(float_model, q_model, data): model.eval() if hasattr(model, "fuse_model"): model.fuse_model() - q_model = quantize(model, test_only_eval_fn, 
self.img_data_2d) + q_model = quantize(model, test_only_eval_fn, [self.img_data_2d]) compare_and_validate_results(model, q_model, self.img_data_2d[0][0]) @override_qengines @@ -376,7 +376,7 @@ def compare_and_validate_results(float_model, q_model, data): model.eval() if hasattr(model, "fuse_model"): model.fuse_model() - q_model = quantize(model, test_only_eval_fn, self.calib_data) + q_model = quantize(model, test_only_eval_fn, [self.calib_data]) compare_and_validate_results(model, q_model, linear_data) @override_qengines diff --git a/test/quantization/test_qat_module.py b/test/quantization/test_qat_module.py index 4144c0744104..32de0ff50f0e 100644 --- a/test/quantization/test_qat_module.py +++ b/test/quantization/test_qat_module.py @@ -110,7 +110,11 @@ def _forward(self, input): running_std = torch.sqrt(self.running_var + self.eps) scale_factor = self.gamma / running_std scaled_weight = self.weight * scale_factor.reshape([-1, 1, 1, 1]) - conv = self._conv_forward(input, self.weight_fake_quant(scaled_weight)) + if self.bias is not None: + zero_bias = torch.zeros_like(self.bias) + else: + zero_bias = torch.zeros(self.out_channels, device=scaled_weight.device) + conv = self._conv_forward(input, self.weight_fake_quant(scaled_weight), zero_bias) if self.training and not self.freeze_bn: # recovering original conv to get original batch_mean and batch_var diff --git a/test/quantization/test_quantize.py b/test/quantization/test_quantize.py index ee4a114dcee0..745437a86ca3 100644 --- a/test/quantization/test_quantize.py +++ b/test/quantization/test_quantize.py @@ -120,7 +120,7 @@ def checkQuantized(model): base = AnnotatedSingleLayerLinearModel(qengine) base.qconfig = qconfig keys_before = set(list(base.state_dict().keys())) - model = quantize(base, test_only_eval_fn, self.calib_data) + model = quantize(base, test_only_eval_fn, [self.calib_data]) checkQuantized(model) keys_after = set(list(base.state_dict().keys())) self.assertEqual(keys_before, keys_after) # simple check that nothing changed @@ -128,7 +128,7 @@ def checkQuantized(model): # in-place version model = AnnotatedSingleLayerLinearModel(qengine) model.qconfig = qconfig - quantize(model, test_only_eval_fn, self.calib_data, inplace=True) + quantize(model, test_only_eval_fn, [self.calib_data], inplace=True) checkQuantized(model) @skipIfNoFBGEMM @@ -162,7 +162,7 @@ def checkQuantized(model): # test one line API model = quantize(AnnotatedTwoLayerLinearModel(), test_only_eval_fn, - self.calib_data) + [self.calib_data]) checkQuantized(model) def test_nested1(self): @@ -204,7 +204,7 @@ def checkQuantized(model): # test one line API model = quantize(AnnotatedNestedModel(qengine), test_only_eval_fn, - self.calib_data) + [self.calib_data]) checkQuantized(model) @@ -245,7 +245,7 @@ def checkQuantized(model): # test one line API model = quantize(AnnotatedSubNestedModel(), test_only_eval_fn, - self.calib_data) + [self.calib_data]) checkQuantized(model) def test_nested3(self): @@ -287,7 +287,7 @@ def checkQuantized(model): # test one line API model = quantize(AnnotatedCustomConfigNestedModel(), test_only_eval_fn, - self.calib_data) + [self.calib_data]) checkQuantized(model) def test_skip_quant(self): @@ -315,7 +315,7 @@ def checkQuantized(model): checkQuantized(model) # test one line API - model = quantize(AnnotatedSkipQuantModel(qengine), test_only_eval_fn, self.calib_data) + model = quantize(AnnotatedSkipQuantModel(qengine), test_only_eval_fn, [self.calib_data]) checkQuantized(model) @skipIfNoFBGEMM @@ -341,7 +341,7 @@ def checkQuantized(model): 
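About the _conv_forward change above in the QAT Conv+BN module: the convolution now runs with the batch-norm-scaled weight and an explicit zero bias (the real bias is applied after the batch-norm statistics), matching a _conv_forward signature that takes the bias as a separate argument. The folding itself is plain tensor math; a sketch with made-up shapes:

    import torch

    out_channels, eps = 8, 1e-5
    weight = torch.randn(out_channels, 3, 3, 3)        # conv weight
    gamma = torch.rand(out_channels) + 0.5             # batch-norm weight
    running_var = torch.rand(out_channels) + 0.1

    scale_factor = gamma / torch.sqrt(running_var + eps)
    scaled_weight = weight * scale_factor.reshape([-1, 1, 1, 1])
    zero_bias = torch.zeros(out_channels)              # bias contribution is deferred until after BN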
checkQuantized(model) # test one line API - model = quantize(QuantStubModel(), test_only_eval_fn, self.calib_data) + model = quantize(QuantStubModel(), test_only_eval_fn, [self.calib_data]) checkQuantized(model) def test_resnet_base(self): @@ -400,7 +400,7 @@ def checkQuantized(model): checkQuantized(model) model_oneline = quantize( - NormalizationTestModel(), test_only_eval_fn, self.calib_data) + NormalizationTestModel(), test_only_eval_fn, [self.calib_data]) checkQuantized(model) def test_save_load_state_dict(self): @@ -463,7 +463,7 @@ def checkQuantized(model): # test one line API model_oneline = quantize(ActivationsTestModel(), test_only_eval_fn, - self.calib_data) + [self.calib_data]) checkQuantized(model_oneline) @override_qengines @@ -1083,7 +1083,7 @@ def checkQuantized(model): checkQuantized(model) model = quantize_qat(ManualLinearQATModel(qengine), test_only_train_fn, - self.train_data) + [self.train_data]) checkQuantized(model) def test_eval_only_fake_quant(self): @@ -1123,7 +1123,7 @@ def checkQuantized(model): checkQuantized(model) model = ManualConvLinearQATModel() - model = quantize_qat(model, test_only_train_fn, self.img_data_2d_train) + model = quantize_qat(model, test_only_train_fn, [self.img_data_2d_train]) checkQuantized(model) def test_train_save_load_eval(self): @@ -1434,7 +1434,7 @@ def checkQuantized(model): model = ModelForFusion(default_qat_qconfig).train() model = fuse_modules(model, [['conv1', 'bn1', 'relu1'], ['sub1.conv', 'sub1.bn']]) - model = quantize_qat(model, test_only_train_fn, self.img_data_1d_train) + model = quantize_qat(model, test_only_train_fn, [self.img_data_1d_train]) with self.assertRaisesRegex(RuntimeError, "Could not run 'aten::native_batch_norm' with arguments from the 'QuantizedCPU'"): checkQuantized(model) @@ -1514,7 +1514,7 @@ def checkQuantized(model): ['bn2', 'relu3'], ['sub1.conv', 'sub1.bn'], ['conv3', 'bn3', 'relu4']]) - model = quantize(model, test_only_eval_fn, self.img_data_1d) + model = quantize(model, test_only_eval_fn, [self.img_data_1d]) checkQuantized(model) def test_fusion_sequential_model_train(self): diff --git a/test/quantization/test_quantize_jit.py b/test/quantization/test_quantize_jit.py index 0bf8f4f65fe1..f67a585d99b6 100644 --- a/test/quantization/test_quantize_jit.py +++ b/test/quantization/test_quantize_jit.py @@ -3095,7 +3095,7 @@ def test_single_linear(self): # compare the result of the two quantized models later linear_model.fc1.weight = torch.nn.Parameter(annotated_linear_model.fc1.module.weight.detach()) linear_model.fc1.bias = torch.nn.Parameter(annotated_linear_model.fc1.module.bias.detach()) - model_eager = quantize(annotated_linear_model, test_only_eval_fn, self.calib_data) + model_eager = quantize(annotated_linear_model, test_only_eval_fn, [self.calib_data]) qconfig_dict = {'': get_default_qconfig(torch.backends.quantized.engine)} model_traced = torch.jit.trace(linear_model, self.calib_data[0][0]) @@ -3135,7 +3135,7 @@ def test_observer_with_ignored_function(self): linear_model.fc1.weight = torch.nn.Parameter(annotated_linear_model.fc1.module.weight.detach()) linear_model.fc1.bias = torch.nn.Parameter(annotated_linear_model.fc1.module.bias.detach()) model_eager = quantize(annotated_linear_model, test_only_eval_fn, - self.calib_data) + [self.calib_data]) qconfig_dict = {'': qconfig} model_traced = torch.jit.trace(linear_model, self.calib_data[0][0]) @@ -3161,7 +3161,7 @@ def test_conv(self): # copy the weight from eager mode so that we can # compare the result of the two quantized models later 
conv_model.conv.weight = torch.nn.Parameter(annotated_conv_model.conv.weight.detach()) - model_eager = quantize(annotated_conv_model, test_only_eval_fn, self.img_data_2d) + model_eager = quantize(annotated_conv_model, test_only_eval_fn, [self.img_data_2d]) qconfig_dict = {'': get_default_qconfig(torch.backends.quantized.engine)} model_traced = torch.jit.trace(conv_model, self.img_data_2d[0][0]) model_script = torch.jit.script(conv_model) @@ -3189,7 +3189,7 @@ def test_conv_transpose(self): # copy the weight from eager mode so that we can # compare the result of the two quantized models later conv_model.conv.weight = torch.nn.Parameter(annotated_conv_model.conv.weight.detach()) - model_eager = quantize(annotated_conv_model, test_only_eval_fn, self.img_data_2d) + model_eager = quantize(annotated_conv_model, test_only_eval_fn, [self.img_data_2d]) qconfig_dict = {'': get_default_qconfig(torch.backends.quantized.engine)} model_traced = torch.jit.trace(conv_model, self.img_data_2d[0][0]) model_script = torch.jit.script(conv_model) @@ -3217,7 +3217,7 @@ def test_conv_bn(self): conv_model_to_script.conv.weight = torch.nn.Parameter(conv_model.conv.weight.detach()) fuse_modules(conv_model, ['conv', 'bn'], inplace=True) model_eager = quantize(conv_model, test_only_eval_fn, - self.img_data_2d) + [self.img_data_2d]) qconfig_dict = { '': default_qconfig } @@ -3248,7 +3248,7 @@ def test_nested(self): script_model.fc3.weight = torch.nn.Parameter(eager_model.fc3.module.weight.detach()) script_model.fc3.bias = torch.nn.Parameter(eager_model.fc3.module.bias.detach()) - model_eager = quantize(eager_model, test_only_eval_fn, self.calib_data) + model_eager = quantize(eager_model, test_only_eval_fn, [self.calib_data]) qconfig_dict = { 'sub2.fc1': default_per_channel_qconfig if qengine_is_fbgemm() else default_qconfig, 'fc3': default_qconfig @@ -3284,7 +3284,7 @@ def test_skip_quant(self): eager_model.fuse_modules() - model_eager = quantize(eager_model, test_only_eval_fn, self.calib_data) + model_eager = quantize(eager_model, test_only_eval_fn, [self.calib_data]) qconfig_dict = { '': get_default_qconfig(torch.backends.quantized.engine), 'fc': None diff --git a/test/quantization/test_quantized_op.py b/test/quantization/test_quantized_op.py index 7bd16e5a452a..1c66c8fb986f 100644 --- a/test/quantization/test_quantized_op.py +++ b/test/quantization/test_quantized_op.py @@ -2970,7 +2970,7 @@ def embedding_bag_rowwise_offsets_run( embedding_dim, num_offsets, use_32bit_indices, use_32bit_offsets, enable_per_sample_weights, - include_last_offset, sparsity, atol, rtol): + include_last_offset, fallback_to_no_sparse, sparsity, atol, rtol): pt_op = torch.ops.quantized.embedding_bag_byte_rowwise_offsets pt_prepack_op = torch.ops.quantized.embedding_bag_byte_prepack if bit_rate == 4: @@ -3029,20 +3029,25 @@ def get_reference_result( pruned_weights = weights prune_weights = sparsity > 0 if prune_weights: - # Prune and generate mapping table - num_compressed_rows = 0 - unpruned_ids = [] - for i in range(num_embeddings): - if np.random.uniform() < sparsity: - mapping_table[i] = -1 - q_weights[i, :] = 0 - weights[i, :] = 0 - else: - mapping_table[i] = num_compressed_rows - num_compressed_rows += 1 - unpruned_ids.append(i) - q_weights = q_weights[unpruned_ids] - pruned_weights = weights[unpruned_ids] + if fallback_to_no_sparse: + # Testing that prune_weight with mapping_table {0} will + # fallback to non sparse embedding look up kernel. 
+ mapping_table = np.zeros(1, dtype=np.int32) + else: + # Prune and generate mapping table + num_compressed_rows = 0 + unpruned_ids = [] + for i in range(num_embeddings): + if np.random.uniform() < sparsity: + mapping_table[i] = -1 + q_weights[i, :] = 0 + weights[i, :] = 0 + else: + mapping_table[i] = num_compressed_rows + num_compressed_rows += 1 + unpruned_ids.append(i) + q_weights = q_weights[unpruned_ids] + pruned_weights = weights[unpruned_ids] result = pt_op(q_weights, indices.int() if use_32bit_indices else indices, @@ -3094,6 +3099,7 @@ def get_reference_result( use_32bit_offsets=st.booleans(), enable_per_sample_weights=st.booleans(), include_last_offset=st.booleans(), + fallback_to_no_sparse=st.booleans(), sparsity=st.sampled_from([0.0, 0.5, 0.7])) def test_embedding_bag_byte(self, num_embeddings, embedding_dim, num_offsets, @@ -3101,11 +3107,13 @@ def test_embedding_bag_byte(self, num_embeddings, use_32bit_offsets, enable_per_sample_weights, include_last_offset, + fallback_to_no_sparse, sparsity): self.embedding_bag_rowwise_offsets_run( 8, num_embeddings, embedding_dim, num_offsets, use_32bit_indices, use_32bit_offsets, enable_per_sample_weights, include_last_offset, + fallback_to_no_sparse, sparsity=sparsity, atol=0.005, rtol=1e-3) """ Tests the correctness of the embedding_bag_4bit quantized operator """ @@ -3116,18 +3124,23 @@ def test_embedding_bag_byte(self, num_embeddings, use_32bit_offsets=st.booleans(), enable_per_sample_weights=st.booleans(), include_last_offset=st.booleans(), + fallback_to_no_sparse=st.booleans(), sparsity=st.sampled_from([0.0, 0.5, 0.7])) def test_embedding_bag_4bit(self, num_embeddings, embedding_dim, num_offsets, use_32bit_indices, use_32bit_offsets, enable_per_sample_weights, - include_last_offset, sparsity): + include_last_offset, + fallback_to_no_sparse, + sparsity): self.embedding_bag_rowwise_offsets_run(4, num_embeddings, embedding_dim, num_offsets, use_32bit_indices, use_32bit_offsets, enable_per_sample_weights, - include_last_offset, sparsity=sparsity, + include_last_offset, + fallback_to_no_sparse, + sparsity=sparsity, atol=0.1, rtol=1e-2) """ Tests the correctness of the quantized embedding lookup operator """ diff --git a/test/test_autograd.py b/test/test_autograd.py index 7ec70e8fbe17..5c8acd70a07a 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -5039,7 +5039,7 @@ def run_functional_checks(test_case, test_name, name, apply_fn, run_grad_checks, 'cosh', '__rmul__', 'sgn', 'abs', 'dot', 'vdot', 'tensor_split', 'matmul', 'bmm', 'mv', 'ger', 'diagonal', 'atan', 'angle', 'tanh', 'fill_', 'sub', 'exp', 'mean', 'inverse', 'triangular_solve', 'solve', 'addcmul', - 'addcdiv', 'linalg.tensorinv', ] + separate_complex_tests + 'addcdiv', 'linalg.tensorinv', 'matrix_exp'] + separate_complex_tests def add_test( name, diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py index dad0e2feb5b2..8f6d0206de34 100644 --- a/test/test_binary_ufuncs.py +++ b/test/test_binary_ufuncs.py @@ -15,7 +15,8 @@ torch_to_numpy_dtype_dict, make_tensor) from torch.testing._internal.common_device_type import ( instantiate_device_type_tests, onlyCUDA, onlyCPU, dtypes, dtypesIfCUDA, - dtypesIfCPU, deviceCountAtLeast, precisionOverride, onlyOnCPUAndCUDA) + dtypesIfCPU, deviceCountAtLeast, precisionOverride, onlyOnCPUAndCUDA, + skipCUDAIfRocm) # TODO: remove this def _generate_input(shape, dtype, device, with_extremal): @@ -1357,27 +1358,136 @@ def test_rdiv(self, device, dtype): z = torch.tensor([30 / v.item() for v in x], device=device) 
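The new fmod tests that follow pin down division-by-zero behavior: floating-point fmod by zero yields NaN on both CPU and CUDA, while integral fmod by zero raises a RuntimeError mentioning ZeroDivisionError on CPU (CUDA results for integral types are dtype-dependent). A small CPU-only illustration, separate from the tests themselves:

import torch

x = torch.tensor([1.5, -2.5, 3.0])
print(torch.fmod(x, 0.0))      # tensor([nan, nan, nan])

xi = torch.tensor([1, -2, 3])
try:
    torch.fmod(xi, 0)          # integral fmod by zero raises on CPU
except RuntimeError as err:
    print(err)                 # message contains "ZeroDivisionError"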
self.assertEqual(y, z, exact_dtype=False) - @onlyCPU + @dtypes(*torch.testing.get_all_fp_dtypes(include_bfloat16=False)) + def test_fmod_by_zero_float(self, device, dtype): + # check floating-point tensor fmod to zero is nan on both CPU and GPU + x = make_tensor((10, 10), device=device, dtype=dtype, low=-9, high=9) + zero = torch.zeros_like(x) + + self.assertTrue(torch.all(x.fmod(0.0).isnan())) + self.assertTrue(torch.all(x.fmod(zero).isnan())) + # out + out = torch.empty(0, device=device, dtype=dtype) + torch.fmod(x, zero, out=out) + self.assertEqual(out.size(), torch.Size([10, 10])) + self.assertTrue(torch.all(out.isnan())) + # in-place + x.fmod_(zero) + self.assertTrue(torch.all(x.isnan())) + + @onlyOnCPUAndCUDA # Check Issue https://github.com/pytorch/pytorch/issues/48130 + @skipCUDAIfRocm # Error happens on both ROCM and XLA + @dtypes(*torch.testing.get_all_int_dtypes()) + def test_fmod_by_zero_integral(self, device, dtype): + # check integral tensor fmod to zero + x = make_tensor((10, 10), device=device, dtype=dtype, low=-9, high=9) + zero = torch.zeros_like(x) + # out + out = torch.empty(0, device=device, dtype=dtype) + # In-place + x_ = x.clone() + # RuntimeError on CPU + if device == 'cpu': + with self.assertRaisesRegex(RuntimeError, "ZeroDivisionError"): + x.fmod(zero) + with self.assertRaisesRegex(RuntimeError, "ZeroDivisionError"): + torch.fmod(x, zero, out=out) + with self.assertRaisesRegex(RuntimeError, "ZeroDivisionError"): + x.fmod_(zero) + # Different value for different dtype on GPU + else: + if dtype == torch.int64: + self.assertEqual(x.fmod(zero) == 4294967295, x >= 0) + self.assertEqual(x.fmod(zero) == -1, x < 0) + # out + torch.fmod(x, zero, out=out) + self.assertEqual(out == 4294967295, x >= 0) + self.assertEqual(out == -1, x < 0) + self.assertEqual(out.size(), torch.Size([10, 10])) + # in-place + x_.fmod_(zero) + self.assertEqual(x_ == 4294967295, x >= 0) + self.assertEqual(x_ == -1, x < 0) + else: + value = 255 if dtype == torch.uint8 else -1 + self.assertTrue(torch.all(x.fmod(zero) == value)) + # out + torch.fmod(x, zero, out=out) + self.assertTrue(torch.all(out == value)) + self.assertEqual(out.size(), torch.Size([10, 10])) + # in-place + x_.fmod_(zero) + self.assertTrue(torch.all(x_ == value)) + @dtypes(*torch.testing.get_all_dtypes(include_bfloat16=False, include_bool=False, include_complex=False)) def test_fmod(self, device, dtype): - m1 = torch.Tensor(10, 10).uniform_(-10., 10.).to(dtype=dtype, device=device) - res1 = m1.clone() - q = 3 - res1[:, 3].fmod_(q) - res2 = m1.clone() - for i in range(m1.size(1)): - res2[i, 3] = math.fmod(res2[i, 3], q) - self.assertEqual(res1, res2) + # Use numpy as reference + def _reference_implementation(x, mod): + np_x = x.cpu().numpy() + np_mod = 0 + # No type promotion + # Issue #47779: https://github.com/pytorch/pytorch/issues/47779 + if torch.is_tensor(mod): + np_mod = mod.cpu().numpy() + else: + np_mod = mod + # Non XLA platform needs to cast to int + if dtype in torch.testing.get_all_int_dtypes() and self.device_type in ['cpu', 'cuda']: + np_mod = int(np_mod) + exp = np.fmod(np_x, np_mod) + exp = torch.from_numpy(exp) + + res = torch.fmod(x, mod) + res = res.to(exp.dtype) + self.assertEqual(res, exp) + # out + out = torch.empty(0, device=device, dtype=dtype) + torch.fmod(x, mod, out=out) + out.to(exp.dtype) + self.assertEqual(out, exp) + self.assertEqual(out.size(), torch.Size([10, 10])) + # in-place + x.fmod_(mod) + x.to(exp.dtype) + self.assertEqual(out, exp) + + x = make_tensor((10, 10), device=device, dtype=dtype, 
low=-9, high=9) + # Exclude 0 + # mod with same dtype as x + mod = make_tensor((10, 10), device=device, dtype=dtype, low=1, high=9) + # mod with floating-point dtype + mod_float = make_tensor((10, 10), device=device, + dtype=torch.float if dtype in torch.testing.get_all_int_dtypes() else dtype, + low=1, high=9) + # non-contiguous + x_nc = x.t() + mod_nc = mod.t() - zero = torch.zeros_like(m1) + # Mods: Integer, Float, Tensor, Non-contiguous Tensor + mods = [3, 2.3, mod, mod_nc] + for m in mods: + _reference_implementation(x, m) + _reference_implementation(x_nc, m) + + # Integral Tensor fmod to floating-point Tensor + # Can not cast floating-point result to original integral Tensor without type promotion if dtype in torch.testing.get_all_int_dtypes(): - with self.assertRaisesRegex(RuntimeError, "ZeroDivisionError"): - m1.fmod(0) - with self.assertRaisesRegex(RuntimeError, "ZeroDivisionError"): - m1.fmod(zero) + res = torch.fmod(x, mod_float) + exp = np.fmod(x.cpu().numpy(), mod_float.cpu().numpy()) + exp = torch.from_numpy(exp) + res = res.to(exp.dtype) + self.assertEqual(res, exp) + with self.assertRaisesRegex(RuntimeError, "result type (Half|Float|Double) " + "can't be cast to the desired " + "output type (Byte|Char|Short|Int|Long)"): + out = torch.empty(0, device=device, dtype=dtype) + torch.fmod(x, mod_float, out=out) + with self.assertRaisesRegex(RuntimeError, "result type (Half|Float|Double) " + "can't be cast to the desired " + "output type (Byte|Char|Short|Int|Long)"): + x.fmod_(mod_float) else: - self.assertTrue(torch.all(m1.fmod(0).isnan())) - self.assertTrue(torch.all(m1.fmod(zero).isnan())) + _reference_implementation(x, mod_float) @onlyCPU @dtypes(torch.float, torch.long) @@ -1596,11 +1706,6 @@ def reference_implementation(res2): def test_cdiv(self, device, dtype): self._test_cop(torch.div, lambda x, y: x / y, dtype, device) - @onlyCPU - @dtypes(torch.float) - def test_cfmod(self, device, dtype): - self._test_cop(torch.fmod, math.fmod, dtype, device) - @onlyCPU @dtypes(torch.float) def test_cremainder(self, device, dtype): diff --git a/test/test_dataloader.py b/test/test_dataloader.py index 0d6ee2e03bd6..a1afc216d42a 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -484,6 +484,17 @@ def __len__(self): return self.size +class EmptyTensorDataset(torch.utils.data.Dataset): + def __init__(self, len): + self.len = len + + def __len__(self): + return self.len + + def __getitem__(self, any): + return torch.empty(0) + + class SynchronizedSeedDataset(SynchronizedDataset): def __getitem__(self, idx): self.sync_once() @@ -504,6 +515,24 @@ def _test_timeout_pin_memory(persistent_workers): _ = next(iter(dataloader)) +def _test_large_sampler_indices(persistent_workers): + # See + # test_large_sampler_indices + # https://github.com/pytorch/pytorch/issues/48666 + + dataloader = torch.utils.data.DataLoader( + EmptyTensorDataset(10000000), + batch_size=40960, + persistent_workers=persistent_workers, + num_workers=1) + + it = iter(dataloader) + + for x in it: + assert x.numel() == 0 + raise RuntimeError('My Error') + + def disable_stderr(worker_id): r""" Avoids printing "ERROR: Unexpected segmentation fault encountered in worker." @@ -978,6 +1007,24 @@ def test_timeout(self): finally: p.terminate() + def test_large_sampler_indices(self): + # Test that the data loader cleanly exit when the process errors + # 1. having an reference to the iterator + # 2. using a sampler that yields big elements s.t. 
_index_queues putters block + # + # More context: https://github.com/pytorch/pytorch/issues/48666 + + p = ErrorTrackingProcess(target=_test_large_sampler_indices, args=(self.persistent_workers,)) + p.start() + p.join(JOIN_TIMEOUT) + try: + self.assertFalse(p.is_alive()) + self.assertNotEqual(p.exitcode, 0) + self.assertIsInstance(p.exception, RuntimeError) + self.assertRegex(str(p.exception), r'My Error') + finally: + p.terminate() + def test_invalid_ctor_args_combinations(self): # general with self.assertRaisesRegex(ValueError, "num_workers option should be non-negative"): diff --git a/test/test_determination.py b/test/test_determination.py index 0f860cab5101..7e9420285e5a 100644 --- a/test/test_determination.py +++ b/test/test_determination.py @@ -112,6 +112,7 @@ def test_torch_file(self): "distributed/test_distributed_fork", "test_cpp_extensions_aot_ninja", "test_cpp_extensions_aot_no_ninja", + "test_utils", "test_determination", ], ) diff --git a/test/test_kernel_launch_checks.py b/test/test_kernel_launch_checks.py index 8796b9913f73..079a7182a1fc 100644 --- a/test/test_kernel_launch_checks.py +++ b/test/test_kernel_launch_checks.py @@ -9,26 +9,26 @@ def test_check_code(self): # Try some different spacings self.assertEqual(2, check_code_for_cuda_kernel_launches(""" some_function_call<<<1,2,0,stream>>>(arg1,arg2,arg3); -TORCH_CUDA_KERNEL_LAUNCH_CHECK(); +C10_CUDA_KERNEL_LAUNCH_CHECK(); some_function_call<<<1,2,0,stream>>>(arg1,arg2,arg3); some_function_call<<<1,2,0,stream>>>(arg1,arg2,arg3); -TORCH_CUDA_KERNEL_LAUNCH_CHECK(); +C10_CUDA_KERNEL_LAUNCH_CHECK(); some_function_call<<<1,2,0,stream>>>(arg1,arg2,arg3); some_other_stuff; some_function_call<<<1,2,0,stream>>>(arg1,arg2,arg3); -TORCH_CUDA_KERNEL_LAUNCH_CHECK(); +C10_CUDA_KERNEL_LAUNCH_CHECK(); some_function_call<<<1,2,0,stream>>> (arg1,arg2,arg3); -TORCH_CUDA_KERNEL_LAUNCH_CHECK(); +C10_CUDA_KERNEL_LAUNCH_CHECK(); some_function_call<<<1,2,0,stream>>> ( arg1 , arg2 , arg3 ) ; - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); """)) # Does it work for macros? 
self.assertEqual(0, check_code_for_cuda_kernel_launches(""" #define SOME_MACRO(x) some_function_call<<<1,2>>> ( x ) ; \\ - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); """)) def test_check_cuda_launches(self): diff --git a/test/test_linalg.py b/test/test_linalg.py index 5b182b2fd49a..71c3cf654c1b 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -2180,6 +2180,131 @@ def test_dot_invalid_args(self, device): self._test_dot_vdot_invalid_args(device, torch.dot) self._test_dot_vdot_invalid_args(device, torch.dot, complex_dtypes=True) + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) + def test_matrix_rank(self, device, dtype): + matrix_rank = torch.linalg.matrix_rank + + def run_test(shape0, shape1, batch): + a = torch.randn(*batch, shape0, shape1, dtype=dtype, device=device) + rank_a = matrix_rank(a) + + self.assertEqual(rank_a, matrix_rank(a.conj().transpose(-2, -1))) + aaH = torch.matmul(a, a.conj().transpose(-2, -1)) + rank_aaH = matrix_rank(aaH) + rank_aaH_hermitian = matrix_rank(aaH, hermitian=True) + self.assertEqual(rank_aaH, rank_aaH_hermitian) + aHa = torch.matmul(a.conj().transpose(-2, -1), a) + self.assertEqual(matrix_rank(aHa), matrix_rank(aHa, hermitian=True)) + + # check against NumPy + self.assertEqual(rank_a, np.linalg.matrix_rank(a.cpu().numpy())) + self.assertEqual(matrix_rank(a, 0.01), np.linalg.matrix_rank(a.cpu().numpy(), 0.01)) + + self.assertEqual(rank_aaH, np.linalg.matrix_rank(aaH.cpu().numpy())) + self.assertEqual(matrix_rank(aaH, 0.01), np.linalg.matrix_rank(aaH.cpu().numpy(), 0.01)) + + # hermitian flag for NumPy was added in 1.14.0 + if np.lib.NumpyVersion(np.__version__) >= '1.14.0': + self.assertEqual(rank_aaH_hermitian, + np.linalg.matrix_rank(aaH.cpu().numpy(), hermitian=True)) + self.assertEqual(matrix_rank(aaH, 0.01, True), + np.linalg.matrix_rank(aaH.cpu().numpy(), 0.01, True)) + + # check out= variant + out = torch.empty(a.shape[:-2], dtype=torch.int64, device=device) + ans = matrix_rank(a, out=out) + self.assertEqual(ans, out) + self.assertEqual(ans, rank_a) + + shapes = (3, 13) + batches = ((), (0, ), (4, ), (3, 5, )) + for (shape0, shape1), batch in zip(itertools.product(shapes, reversed(shapes)), batches): + run_test(shape0, shape1, batch) + + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) + def test_matrix_rank_empty(self, device, dtype): + matrix_rank = torch.linalg.matrix_rank + + # NumPy doesn't work for input with no elements + def run_test(shape0, shape1, batch): + a = torch.randn(*batch, shape0, shape1, dtype=dtype, device=device) + rank_a = matrix_rank(a) + expected = torch.zeros(batch, dtype=torch.int64, device=device) + + self.assertEqual(rank_a, matrix_rank(a.conj().transpose(-2, -1))) + + aaH = torch.matmul(a, a.conj().transpose(-2, -1)) + rank_aaH = matrix_rank(aaH) + rank_aaH_hermitian = matrix_rank(aaH, hermitian=True) + self.assertEqual(rank_aaH, rank_aaH_hermitian) + + aHa = torch.matmul(a.conj().transpose(-2, -1), a) + self.assertEqual(matrix_rank(aHa), matrix_rank(aHa, hermitian=True)) + + self.assertEqual(rank_a, expected) + self.assertEqual(matrix_rank(a, 0.01), expected) + + self.assertEqual(rank_aaH, expected) + self.assertEqual(matrix_rank(aaH, 0.01), expected) + + self.assertEqual(rank_aaH_hermitian, expected) + self.assertEqual(matrix_rank(aaH, 0.01, True), expected) + + batches = ((), (4, ), (3, 5, )) + for batch in batches: + run_test(0, 0, batch) + 
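These test_linalg.py hunks add coverage for the new torch.linalg.matrix_rank, which returns an integer tensor (batched when the input is batched) and accepts a hermitian flag plus an optional tolerance. A short usage sketch, assuming a build with LAPACK available:

import torch

a = torch.eye(10, dtype=torch.complex128)
print(torch.linalg.matrix_rank(a))                  # tensor(10)
a[5, 5] = 0
print(torch.linalg.matrix_rank(a))                  # tensor(9)
print(torch.linalg.matrix_rank(a, hermitian=True))  # tensor(9); `a` is Hermitian here

b = torch.randn(3, 5, 4, 13)                        # batch of 3 x 5 matrices, each 4 x 13
print(torch.linalg.matrix_rank(b).shape)            # torch.Size([3, 5])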
run_test(0, 3, batch) + run_test(3, 0, batch) + + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) + def test_matrix_rank_basic(self, device, dtype): + matrix_rank = torch.linalg.matrix_rank + + a = torch.eye(10, dtype=dtype, device=device) + self.assertEqual(matrix_rank(a).item(), 10) + self.assertEqual(matrix_rank(a, hermitian=True).item(), 10) + + a[5, 5] = 0 + self.assertEqual(matrix_rank(a).item(), 9) + self.assertEqual(matrix_rank(a, hermitian=True).item(), 9) + + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) + def test_old_matrix_rank(self, device, dtype): + a = torch.eye(10, dtype=dtype, device=device) + self.assertEqual(torch.matrix_rank(a).item(), 10) + self.assertEqual(torch.matrix_rank(a, True).item(), 10) + + a[5, 5] = 0 + self.assertEqual(torch.matrix_rank(a).item(), 9) + self.assertEqual(torch.matrix_rank(a, True).item(), 9) + + a = torch.randn(24, 42, dtype=dtype, device=device) + self.assertEqual(torch.matrix_rank(a), torch.matrix_rank(a.t())) + aaT = torch.mm(a, a.conj().t()) + self.assertEqual(torch.matrix_rank(aaT), torch.matrix_rank(aaT, True)) + aTa = torch.mm(a.conj().t(), a) + self.assertEqual(torch.matrix_rank(aTa), torch.matrix_rank(aTa, True)) + + a = torch.randn(35, 75, dtype=dtype, device=device) + self.assertEqual(torch.matrix_rank(a), np.linalg.matrix_rank(a.cpu().numpy())) + self.assertEqual(torch.matrix_rank(a, 0.01), np.linalg.matrix_rank(a.cpu().numpy(), 0.01)) + + aaT = torch.mm(a, a.conj().t()) + self.assertEqual(torch.matrix_rank(aaT), np.linalg.matrix_rank(aaT.cpu().numpy())) + self.assertEqual(torch.matrix_rank(aaT, 0.01), np.linalg.matrix_rank(aaT.cpu().numpy(), 0.01)) + + if np.lib.NumpyVersion(np.__version__) >= '1.14.0': + self.assertEqual(torch.matrix_rank(aaT, True), np.linalg.matrix_rank(aaT.cpu().numpy(), True)) + self.assertEqual(torch.matrix_rank(aaT, 0.01, True), np.linalg.matrix_rank(aaT.cpu().numpy(), 0.01, True)) + def triangular_solve_test_helper(self, A_dims, b_dims, upper, unitriangular, device, dtype): triangle_function = torch.triu if upper else torch.tril @@ -3873,38 +3998,6 @@ def run_test(M): self.assertEqual(torch.eye(matsize, dtype=dtype, device=device).expand(sizes), M.pinverse().matmul(M), atol=1e-7, rtol=0, msg='pseudo-inverse for invertible matrix') - @skipCUDAIfNoMagma - @skipCPUIfNoLapack - def test_matrix_rank(self, device): - a = torch.eye(10, device=device) - self.assertEqual(torch.matrix_rank(a).item(), 10) - self.assertEqual(torch.matrix_rank(a, True).item(), 10) - - a[5, 5] = 0 - self.assertEqual(torch.matrix_rank(a).item(), 9) - self.assertEqual(torch.matrix_rank(a, True).item(), 9) - - a = torch.randn(24, 42, device=device) - self.assertEqual(torch.matrix_rank(a), torch.matrix_rank(a.t())) - aaT = torch.mm(a, a.t()) - self.assertEqual(torch.matrix_rank(aaT), torch.matrix_rank(aaT, True)) - aTa = torch.mm(a.t(), a) - self.assertEqual(torch.matrix_rank(aTa), torch.matrix_rank(aTa, True)) - - from numpy.linalg import matrix_rank - a = torch.randn(35, 75, device=device) - self.assertEqual(torch.matrix_rank(a).item(), matrix_rank(a.cpu().numpy())) - self.assertEqual(torch.matrix_rank(a, 0.01).item(), matrix_rank(a.cpu().numpy(), 0.01)) - - aaT = torch.mm(a, a.t()) - self.assertEqual(torch.matrix_rank(aaT).item(), matrix_rank(aaT.cpu().numpy())) - self.assertEqual(torch.matrix_rank(aaT, 0.01).item(), matrix_rank(aaT.cpu().numpy(), 0.01)) - - if np.lib.NumpyVersion(np.__version__) >= 
'1.14.0': - self.assertEqual(torch.matrix_rank(aaT, True).item(), matrix_rank(aaT.cpu().numpy(), True)) - self.assertEqual(torch.matrix_rank(aaT, 0.01, True).item(), - matrix_rank(aaT.cpu().numpy(), 0.01, True)) - @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(torch.double) @@ -4008,7 +4101,7 @@ def test_matrix_exp_boundary_cases(self, device, dtype): @slowTest @skipCUDAIfNoMagma @skipCPUIfNoLapack - @dtypes(torch.float, torch.double) + @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble) def test_matrix_exp_analytic(self, device, dtype): # check zero matrix x = torch.zeros(20, 20, dtype=dtype, device=device) @@ -4152,7 +4245,7 @@ def run_test(*n): @skipCUDAIfNoMagma @skipCPUIfNoLapack - @dtypes(torch.float, torch.double) + @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble) def test_matrix_exp_compare_with_taylor(self, device, dtype): def normalize_to_1_operator_norm(sample, desired_norm): @@ -4184,10 +4277,10 @@ def get_taylor_approximation(a, deg): return res def scale_square(a, deg): - if a.norm() < 1.0: + if a.abs().pow(2).sum().sqrt() < 1.0: return get_taylor_approximation(a, 12) else: - s = int(torch.log2(a.norm()).ceil().item()) + s = int(torch.log2(a.abs().pow(2).sum().sqrt()).ceil().item()) b = a / (2 ** s) b = get_taylor_approximation(b, 18) for _ in range(s): @@ -4746,7 +4839,7 @@ def maybe_squeeze_result(l, r, result): @skipCUDAIfNoMagma @skipCPUIfNoLapack - @dtypes(torch.double) + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) def test_lu_solve_batched_non_contiguous(self, device, dtype): from numpy.linalg import solve from torch.testing._internal.common_utils import random_fullrank_matrix_distinct_singular_value @@ -4765,20 +4858,22 @@ def lu_solve_test_helper(self, A_dims, b_dims, pivot, device, dtype): from torch.testing._internal.common_utils import random_fullrank_matrix_distinct_singular_value b = torch.randn(*b_dims, dtype=dtype, device=device) - A = random_fullrank_matrix_distinct_singular_value(*A_dims, dtype=dtype, device=device) + A = random_fullrank_matrix_distinct_singular_value(*A_dims, dtype=dtype).to(device) LU_data, LU_pivots, info = torch.lu(A, get_infos=True, pivot=pivot) self.assertEqual(info, torch.zeros_like(info)) return b, A, LU_data, LU_pivots @skipCPUIfNoLapack @skipCUDAIfNoMagma - @dtypes(torch.double) + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) + @precisionOverride({torch.float32: 1e-3, torch.complex64: 1e-3, + torch.float64: 1e-8, torch.complex128: 1e-8}) def test_lu_solve(self, device, dtype): def sub_test(pivot): for k, n in zip([2, 3, 5], [3, 5, 7]): b, A, LU_data, LU_pivots = self.lu_solve_test_helper((n,), (n, k), pivot, device, dtype) x = torch.lu_solve(b, LU_data, LU_pivots) - self.assertLessEqual(b.dist(A.mm(x)), 1e-12) + self.assertEqual(b, A.mm(x)) sub_test(True) if self.device_type == 'cuda': @@ -4786,7 +4881,9 @@ def sub_test(pivot): @skipCUDAIfNoMagma @skipCPUIfNoLapack - @dtypes(torch.double) + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) + @precisionOverride({torch.float32: 1e-3, torch.complex64: 1e-3, + torch.float64: 1e-8, torch.complex128: 1e-8}) def test_lu_solve_batched(self, device, dtype): def sub_test(pivot): def lu_solve_batch_test_helper(A_dims, b_dims, pivot): @@ -4797,7 +4894,8 @@ def lu_solve_batch_test_helper(A_dims, b_dims, pivot): x_exp = torch.stack(x_exp_list) # Stacked output x_act = torch.lu_solve(b, LU_data, LU_pivots) # Actual output self.assertEqual(x_exp, x_act) # Equality check - 
self.assertLessEqual(b.dist(torch.matmul(A, x_act)), 1e-12) # Correctness check + Ax = torch.matmul(A, x_act) + self.assertEqual(b, Ax) for batchsize in [1, 3, 4]: lu_solve_batch_test_helper((5, batchsize), (batchsize, 5, 10), pivot) @@ -4815,20 +4913,20 @@ def lu_solve_batch_test_helper(A_dims, b_dims, pivot): @slowTest @skipCUDAIfNoMagma @skipCPUIfNoLapack - @dtypes(torch.double) + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) def test_lu_solve_batched_many_batches(self, device, dtype): def run_test(A_dims, b_dims): b, A, LU_data, LU_pivots = self.lu_solve_test_helper(A_dims, b_dims, True, device, dtype) x = torch.lu_solve(b, LU_data, LU_pivots) - b_ = torch.matmul(A, x) - self.assertEqual(b_, b.expand_as(b_)) + Ax = torch.matmul(A, x) + self.assertEqual(Ax, b.expand_as(Ax)) run_test((5, 65536), (65536, 5, 10)) run_test((5, 262144), (262144, 5, 10)) @skipCUDAIfNoMagma @skipCPUIfNoLapack - @dtypes(torch.double) + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) def test_lu_solve_batched_broadcasting(self, device, dtype): from numpy.linalg import solve from torch.testing._internal.common_utils import random_fullrank_matrix_distinct_singular_value diff --git a/test/test_ops.py b/test/test_ops.py index 9043e7ac5f20..64afc6d36b25 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -88,20 +88,27 @@ def _grad_test_helper(self, device, dtype, op, variant): def _gradgrad_test_helper(self, device, dtype, op, variant): return self._check_helper(device, dtype, op, variant, 'gradgradcheck') + def _skip_helper(self, op, dtype): + if not op.test_complex_grad and dtype.is_complex: + self.skipTest("Skipped! complex grad tests marked to skip.") + # Tests that gradients are computed correctly @dtypes(torch.double, torch.cdouble) @ops(op_db) def test_fn_grad(self, device, dtype, op): + self._skip_helper(op, dtype) self._grad_test_helper(device, dtype, op, op.get_op()) @dtypes(torch.double, torch.cdouble) @ops(op_db) def test_method_grad(self, device, dtype, op): + self._skip_helper(op, dtype) self._grad_test_helper(device, dtype, op, op.get_method()) @dtypes(torch.double, torch.cdouble) @ops(op_db) def test_inplace_grad(self, device, dtype, op): + self._skip_helper(op, dtype) if not op.test_inplace_grad: self.skipTest("Skipped! Inplace gradcheck marked to skip.") self._grad_test_helper(device, dtype, op, self._get_safe_inplace(op.get_inplace())) @@ -110,16 +117,19 @@ def test_inplace_grad(self, device, dtype, op): @dtypes(torch.double, torch.cdouble) @ops(op_db) def test_fn_gradgrad(self, device, dtype, op): + self._skip_helper(op, dtype) self._gradgrad_test_helper(device, dtype, op, op.get_op()) @dtypes(torch.double, torch.cdouble) @ops(op_db) def test_method_gradgrad(self, device, dtype, op): + self._skip_helper(op, dtype) self._gradgrad_test_helper(device, dtype, op, op.get_method()) @dtypes(torch.double, torch.cdouble) @ops(op_db) def test_inplace_gradgrad(self, device, dtype, op): + self._skip_helper(op, dtype) if not op.test_inplace_grad: self.skipTest("Skipped! 
Inplace gradgradcheck marked to skip.") self._gradgrad_test_helper(device, dtype, op, self._get_safe_inplace(op.get_inplace())) diff --git a/test/test_torch.py b/test/test_torch.py index 378c081cef1a..6c04dd00dc76 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -2312,6 +2312,10 @@ def test_show_config(self): # We can't usefully test the output; just make sure this doesn't crash torch.__config__.show() + @unittest.skipIf(IS_FBCODE, "CXX_FLAGS is only for OSS build.") + def test_cxx_flags(self): + torch.__config__._cxx_flags() + def test_parallel_info(self): torch.__config__.parallel_info() @@ -2536,6 +2540,22 @@ def test_empty_meta(self): z = x + y self.assertEqual(z.size(), (2 ** 20, 2 ** 20)) + def test_upsample_nearest1d_meta(self): + # TODO: this is not a sustainable way of testing meta functions, + # but I want some quick scaffolding first before a more + # integrated testing strategy + # NB: Can't make the exponent too big, or it will overflow + # signed 64-bit integer + x = torch.empty_meta(2 * 10 ** 8, 3, 2 * 10 ** 8) + z = torch.nn.functional.interpolate(x, scale_factor=2) + self.assertEqual(z.size(), (2 * 10 ** 8, 3, 4 * 10 ** 8)) + + # interpolate doesn't seem to support out= + # (not sure why passing None here doesn't work? How strange...) + z = torch.empty_meta(0) + torch._C._nn.upsample_nearest1d(x, (4 * 10 ** 8,), 2, out=z) + self.assertEqual(z.size(), (2 * 10 ** 8, 3, 4 * 10 ** 8)) + def test_normal_shape(self): warned = False for device in torch.testing.get_all_device_types(): @@ -5301,7 +5321,7 @@ def _test_helper(x, y, bias, memory_format): lambda x, y: x.expm1_(), lambda x, y: x.floor(), lambda x, y: x.floor_(), - # lambda x, y: x.fmod(2), # https://github.com/pytorch/pytorch/issues/24565 + lambda x, y: x.fmod(2), lambda x, y: x.frac(), lambda x, y: x.hypot(y), lambda x, y: x.hypot_(y), @@ -6695,7 +6715,6 @@ def inner(self, device, dtype): ('log10', '', _small_3d, lambda t, d: [], 1e-2, 5e-2, 1e-5, torch.testing.get_all_fp_dtypes(), [torch.bfloat16]), ('log1p', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, _float_types_no_half, [torch.bfloat16]), ('log2', '', _small_3d, lambda t, d: [], 1e-2, 1e-1, 1e-5, torch.testing.get_all_fp_dtypes(), [torch.bfloat16]), - ('sigmoid', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, torch.testing.get_all_fp_dtypes()), ('logit', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, torch.testing.get_all_fp_dtypes()), ('sqrt', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, torch.testing.get_all_fp_dtypes(), [torch.bfloat16]), ('tanh', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py index 2447a88decc5..656845598a49 100644 --- a/test/test_unary_ufuncs.py +++ b/test/test_unary_ufuncs.py @@ -857,18 +857,6 @@ def test_hardswish(self, device, dtype): torch.nn.functional.hardswish(inputTensorCpy, inplace=True) self.assertEqual(inputTensorCpy, expectedOutputTensor) - @onlyCPU - @dtypes(torch.float, torch.double) - def test_sigmoid(self, device, dtype): - # TODO: why not simulate math.sigmoid like with rsqrt? 
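The meta-tensor tests added to test_torch.py above rely on the fact that a tensor on the meta backend carries only metadata (sizes, dtype), so shape propagation can be checked without allocating storage, even for shapes far larger than available memory. A tiny sketch of that idea, mirroring what test_empty_meta already does:

import torch

x = torch.empty_meta(2 ** 20, 2 ** 20)  # no multi-terabyte allocation happens
y = torch.empty_meta(2 ** 20, 2 ** 20)
z = x + y                               # only the output's metadata is computed
print(z.size())                         # torch.Size([1048576, 1048576])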
- inputValues = [-1000, -1, 0, 0.5, 1, 2, 1000] - expectedOutput = [0.0000, 0.2689, 0.5, 0.6225, 0.7311, 0.8808, 1.000] - precision_4dps = 0.0002 - - self.assertEqual(torch.tensor(inputValues, dtype=dtype, device=device).sigmoid(), - torch.tensor(expectedOutput, dtype=dtype, device=device), - atol=precision_4dps, rtol=0) - @precisionOverride({torch.bfloat16: 1e-2, torch.float: 0.0002, torch.double: 0.0002}) @dtypesIfCUDA(torch.float, torch.double, torch.bfloat16) @dtypes(torch.float, torch.double) diff --git a/test/test_utils.py b/test/test_utils.py index 1e6449d3764c..5f1e693ab12f 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -3,17 +3,20 @@ import re import shutil import random +import subprocess import tempfile +import textwrap import unittest import torch import torch.nn as nn import torch.utils.data import torch.cuda from torch.utils.checkpoint import checkpoint, checkpoint_sequential +import torch.utils.cpp_extension import torch.hub as hub from torch.autograd._functions.utils import check_onnx_broadcast from torch.onnx.symbolic_opset9 import _prepare_onnx_paddings -from torch.testing._internal.common_utils import load_tests, retry, IS_SANDCASTLE +from torch.testing._internal.common_utils import load_tests, retry, IS_SANDCASTLE, IS_WINDOWS from urllib.error import URLError # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for @@ -662,5 +665,58 @@ def forward(self, x): ms(torch.tensor([False], dtype=torch.bool)) +@unittest.skipIf(IS_SANDCASTLE, "cpp_extension is OSS only.") +class TestStandaloneCPPJIT(TestCase): + def test_load_standalone(self): + build_dir = tempfile.mkdtemp() + try: + src_path = os.path.join(build_dir, "main.cpp") + src = textwrap.dedent("""\ + #include + #include + int main() { + auto x = torch::eye(3); + std::cout << x << std::endl; + } + """) + with open(src_path, "wt") as f: + f.write(src) + + exec_path = torch.utils.cpp_extension.load( + "standalone_load_test", + src_path, + build_directory=build_dir, + is_python_module=False, + is_standalone=True, + ) + + ext = ".exe" if IS_WINDOWS else "" + self.assertEqual( + exec_path, + os.path.join(build_dir, f"standalone_load_test{ext}") + ) + + for shell in [True, False]: + r = subprocess.run( + [exec_path], + shell=shell, + stdout=subprocess.PIPE, + ) + self.assertEqual(r.returncode, 0) + self.assertEqual( + # Windows prints "\r\n" for newlines. 
+ textwrap.dedent(r.stdout.decode("utf-8")).replace("\r\n", "\n"), + textwrap.dedent("""\ + 1 0 0 + 0 1 0 + 0 0 1 + [ CPUFloatType{3,3} ] + """) + ) + + finally: + shutil.rmtree(build_dir) + + if __name__ == '__main__': run_tests() diff --git a/tools/autograd/gen_annotated_fn_args.py b/tools/autograd/gen_annotated_fn_args.py index 6003b58c1784..c393c905c73f 100644 --- a/tools/autograd/gen_annotated_fn_args.py +++ b/tools/autograd/gen_annotated_fn_args.py @@ -52,7 +52,7 @@ def gen_annotated(native_yaml_path: str, out: str, autograd_dir: str) -> None: @with_native_function def gen_annotated_args(f: NativeFunction) -> str: out_args: List[Dict[str, Any]] = [] - for arg in f.func.arguments: + for arg in f.func.arguments.positional: if arg.default is not None: continue out_arg: Dict[str, Any] = {} diff --git a/tools/autograd/gen_trace_type.py b/tools/autograd/gen_trace_type.py index 6662b2a6ef7f..e55402f9e68d 100644 --- a/tools/autograd/gen_trace_type.py +++ b/tools/autograd/gen_trace_type.py @@ -137,7 +137,7 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen if f.func.is_out_fn(): # for *_out functions, handle the result argument differently for inplace/outplace. # For inplace: just add the input to the end to confirm with the JIT schema - name = f.func.out_arguments[0].name # TODO: old codegen behavior - should fix + name = f.func.arguments.out[0].name # TODO: old codegen behavior - should fix inplace = ADD_TRACE_INPUT.substitute(name=name, input=name) # for outplace: do nothing, except if the function is a factory. @@ -145,7 +145,7 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen # take an extra TensorOptions argument, which is missing in the _out function has_tensor_return = any(r.type.is_tensor_like() for r in f.func.returns) has_tensor_input_arg = any(a.type.is_tensor_like() - for a in itertools.chain(f.func.arguments, f.func.kwarg_only_arguments)) + for a in itertools.chain(f.func.arguments.positional, f.func.arguments.kwarg_only)) is_factory_method = f.category_override == 'factory' or (has_tensor_return and not has_tensor_input_arg) # HACK: preserve old codegen behavior - the old codegen set the `is_factory_method` @@ -251,7 +251,7 @@ def format_prerecord_trace(f: NativeFunction) -> str: add_trace_inputs=format_trace_inputs(f) + additional_inputs, inplace_guard=INPLACE_GUARD.substitute( name=cpp.name(f.func), - mutable_input=f.func.out_arguments[0].name if f.func.out_arguments else 'self', + mutable_input=f.func.arguments.out[0].name if f.func.arguments.out else 'self', ) if is_inplace else '', ) @@ -269,7 +269,7 @@ def format_postrecord_trace(f: NativeFunction) -> str: # For outplacing ops, *_out overloads require special handling to move the # output *argument* to a return value if f.func.is_out_fn(): - output_names_outplace = [arg.name for arg in f.func.out_arguments] + output_names_outplace = [arg.name for arg in f.func.arguments.out] output_names_inplace = cpp.return_names(f) # Code size optimization: the common case is that the return value is diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index b149194dc7d6..e60d90e4ecf8 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -78,7 +78,7 @@ 'bmm', 'diagonal', 'alias', 'atan', 'log', 'log10', 'log1p', 'log2', 'reciprocal', 'tan', 'pow', 'rsqrt', 'tanh', 'tanh_backward', 'asinh', 'acosh', 'take', 'fill_', 'exp', 'nonzero', 'mean', 'inverse', 'solve', 'linalg_cholesky', 'addcmul', 
'addcdiv', - 'linalg_eigh', + 'matrix_exp', 'linalg_eigh', } # Some operators invalidate the grad_accumulator. Let's reset it. diff --git a/tools/codegen/api/cpp.py b/tools/codegen/api/cpp.py index a2f576781761..b20497b5a82c 100644 --- a/tools/codegen/api/cpp.py +++ b/tools/codegen/api/cpp.py @@ -1,7 +1,7 @@ from tools.codegen.model import * from tools.codegen.api.types import * import tools.codegen.local as local -from typing import Optional, Sequence, Union, Callable, List +from typing import Optional, Sequence, Union, List # This file describes the translation of JIT schema to the public C++ # API, which is what people use when they call functions like at::add. @@ -157,7 +157,7 @@ def return_names(f: NativeFunction) -> Sequence[str]: # corresponding output function (r.name will get recorded # in field_name later.) elif f.func.is_out_fn(): - name = f.func.out_arguments[i].name + name = f.func.arguments.out[i].name # If the return argument is explicitly named... elif r.name: name_conflict = any(r.name == a.name for a in f.func.schema_order_arguments()) @@ -251,15 +251,15 @@ def argument_not_this( assert_never(a) def argument( - a: Union[Argument, TensorOptionsArguments, ThisArgument], + a: Union[Argument, TensorOptionsArguments, SelfArgument], ) -> Union[CppSingleArgumentPack, CppThisArgumentPack]: - if isinstance(a, ThisArgument): + if isinstance(a, SelfArgument): return CppThisArgumentPack(argument=a, type=argument_type(a.argument)) else: return CppSingleArgumentPack(argument_not_this(a)) def argument_faithful( - a: Union[Argument, TensorOptionsArguments, ThisArgument], + a: Union[Argument, TensorOptionsArguments, SelfArgument], ) -> CppArgumentPack: if isinstance(a, TensorOptionsArguments): return CppTensorOptionsArgumentPack( @@ -272,46 +272,20 @@ def argument_faithful( else: return argument(a) -# NB: this unconditionally groups arguments def group_arguments( func: FunctionSchema, *, method: bool -) -> Sequence[Union[Argument, TensorOptionsArguments, ThisArgument]]: - args: List[Union[Argument, ThisArgument, TensorOptionsArguments]] = [] - - args.extend(func.out_arguments) - - if method: - args.extend(ThisArgument(a) if a.name == "self" else a for a in func.arguments) - else: - args.extend(func.arguments) - - # group up arguments for tensor options - - def pred(name: str, ty: Type) -> Callable[[Argument], bool]: - return lambda a: a.name == name and a.type in [ty, OptionalType(ty)] - predicates = [ # order matters - pred('dtype', Type.parse('ScalarType')), - pred('layout', Type.parse('Layout')), - pred('device', Type.parse('Device')), - pred('pin_memory', Type.parse('bool')), - ] - - i = 0 - while i < len(func.kwarg_only_arguments): - # If there is enough space... 
- if i <= len(func.kwarg_only_arguments) - len(predicates): - # And the next len(predicates) arguments look like TensorOptions arguments - if all(p(a) for p, a in zip(predicates, func.kwarg_only_arguments[i : i + len(predicates)])): - # Group them together as one argument - args.append(TensorOptionsArguments( - dtype=func.kwarg_only_arguments[i], - layout=func.kwarg_only_arguments[i + 1], - device=func.kwarg_only_arguments[i + 2], - pin_memory=func.kwarg_only_arguments[i + 3], - )) - i += len(predicates) - continue - args.append(func.kwarg_only_arguments[i]) - i += 1 - +) -> Sequence[Union[Argument, TensorOptionsArguments, SelfArgument]]: + args: List[Union[Argument, SelfArgument, TensorOptionsArguments]] = [] + args.extend(func.arguments.out) + args.extend(func.arguments.pre_self_positional) + if func.arguments.self_arg is not None: + if method: + args.append(func.arguments.self_arg) + else: + args.append(func.arguments.self_arg.argument) + args.extend(func.arguments.post_self_positional) + args.extend(func.arguments.pre_tensor_options_kwarg_only) + if func.arguments.tensor_options is not None: + args.append(func.arguments.tensor_options) + args.extend(func.arguments.post_tensor_options_kwarg_only) return args diff --git a/tools/codegen/api/dispatcher.py b/tools/codegen/api/dispatcher.py index 813f06feb741..8f3925de0041 100644 --- a/tools/codegen/api/dispatcher.py +++ b/tools/codegen/api/dispatcher.py @@ -68,7 +68,7 @@ def name(func: FunctionSchema) -> str: def arguments(func: FunctionSchema) -> Tuple[DispatcherArgument, ...]: if local.use_c10_dispatcher().dispatcher_uses_new_style(): - return tuple(map(argument, itertools.chain(func.out_arguments, func.arguments, func.kwarg_only_arguments))) + return tuple(map(argument, itertools.chain(func.arguments.out, func.arguments.positional, func.arguments.kwarg_only))) else: return tuple( DispatcherArgument(type=la.type, name=la.name, argument=la.argument) diff --git a/tools/codegen/api/meta.py b/tools/codegen/api/meta.py index 4e3bf00e01dc..4bfc8e837ec1 100644 --- a/tools/codegen/api/meta.py +++ b/tools/codegen/api/meta.py @@ -55,5 +55,5 @@ def argument(a: Argument) -> MetaArgument: ) def arguments(func: FunctionSchema) -> Sequence[MetaArgument]: - assert not func.out_arguments - return list(map(argument, itertools.chain(func.arguments, func.kwarg_only_arguments))) + assert not func.arguments.out + return list(map(argument, itertools.chain(func.arguments.positional, func.arguments.kwarg_only))) diff --git a/tools/codegen/api/native.py b/tools/codegen/api/native.py index b459ee2c23e2..b9e5257aef85 100644 --- a/tools/codegen/api/native.py +++ b/tools/codegen/api/native.py @@ -1,6 +1,6 @@ from tools.codegen.model import * -from tools.codegen.api.types import TensorOptionsArguments, NativeArgument, ThisArgument +from tools.codegen.api.types import NativeArgument import tools.codegen.api.cpp as cpp from tools.codegen import local @@ -43,7 +43,7 @@ def returns_type(rs: Sequence[Return]) -> str: def argument_type(a: Argument) -> str: return argumenttype_type(a.type, mutable=a.is_write) -def argument(a: Union[Argument, ThisArgument, TensorOptionsArguments]) -> Sequence[NativeArgument]: +def argument(a: Union[Argument, SelfArgument, TensorOptionsArguments]) -> Sequence[NativeArgument]: if isinstance(a, Argument): return [NativeArgument( type=argument_type(a), @@ -51,8 +51,8 @@ def argument(a: Union[Argument, ThisArgument, TensorOptionsArguments]) -> Sequen default=cpp.default_expr(a.default, a.type) if a.default is not None else None, argument=a, )] - 
elif isinstance(a, ThisArgument): - # Erase ThisArgument from the distinction + elif isinstance(a, SelfArgument): + # Erase SelfArgument from the distinction return [NativeArgument( type=argument_type(a.argument), name=a.argument.name, diff --git a/tools/codegen/api/python.py b/tools/codegen/api/python.py index 5ad881b372c5..4b407d45553a 100644 --- a/tools/codegen/api/python.py +++ b/tools/codegen/api/python.py @@ -527,9 +527,9 @@ def signature(f: NativeFunction, *, method: bool = False) -> PythonSignature: # arguments are created based on different rules - see below. args = tuple(a for a in cpp.group_arguments(f.func, method=method) if isinstance(a, Argument)) - input_arg_set = set(a.name for a in f.func.arguments) - kwarg_only_set = set(a.name for a in f.func.kwarg_only_arguments) - out_arg_set = set(a.name for a in f.func.out_arguments) + input_arg_set = set(a.name for a in f.func.arguments.positional) + kwarg_only_set = set(a.name for a in f.func.arguments.kwarg_only) + out_arg_set = set(a.name for a in f.func.arguments.out) input_args = tuple(map(argument, filter(lambda a: a.name in input_arg_set, args))) input_kwargs = tuple(map(argument, filter(lambda a: a.name in kwarg_only_set, args))) @@ -544,7 +544,7 @@ def signature(f: NativeFunction, *, method: bool = False) -> PythonSignature: # source of drift between eager and JIT. Pull this logic out to a shared place. has_tensor_input_arg = any(a.type.is_tensor_like() - for a in itertools.chain(f.func.arguments, f.func.kwarg_only_arguments)) + for a in itertools.chain(f.func.arguments.positional, f.func.arguments.kwarg_only)) if any(a.name == 'requires_grad' for a in f.func.schema_order_arguments()): raise ValueError('argument named requires_grad is reserved, should not explicitly add it in the schema') @@ -657,7 +657,7 @@ def dispatch_lambda_args(ps: PythonSignature, f: NativeFunction) -> Tuple[Dispat ps.deprecated_args_names) cpp_args = list(map(lambda n: m[n], ordered_args)) - out_args: Set[str] = set(a.name for a in f.func.out_arguments) + out_args: Set[str] = set(a.name for a in f.func.arguments.out) # Convert from cpp argument to lambda argument def dispatch_lambda_arg(cpp_arg: CppArgument) -> DispatchLambdaArgument: diff --git a/tools/codegen/api/types.py b/tools/codegen/api/types.py index b5ee4f4ef459..32caf26f223f 100644 --- a/tools/codegen/api/types.py +++ b/tools/codegen/api/types.py @@ -6,28 +6,6 @@ # ------------------------------------------------------------------- # -# Grouping arguments - -# ------------------------------------------------------------------- # - -# Represents the implicit *this argument for method calls in C++ API -@dataclass(frozen=True) -class ThisArgument: - argument: Argument - -# Bundle of arguments that represent a TensorOptions in the C++ API. 
-@dataclass(frozen=True) -class TensorOptionsArguments: - dtype: Argument - layout: Argument - device: Argument - pin_memory: Argument - - def all(self) -> Sequence[Argument]: - return [self.dtype, self.layout, self.device, self.pin_memory] - -# ------------------------------------------------------------------- # - # cpp types # ------------------------------------------------------------------- # @@ -105,7 +83,7 @@ def explicit_arguments(self) -> Sequence[CppArgument]: @dataclass(frozen=True) class CppThisArgumentPack(CppArgumentPackIface): # The grouped JIT argument this formal was derived from - argument: ThisArgument + argument: SelfArgument # C++ type, e.g., Tensor& type: str @@ -210,7 +188,7 @@ def defn(self, name: Optional[str] = None, *, prefix: str = "") -> str: @staticmethod def _from_grouped_arguments( func: FunctionSchema, - arguments: Sequence[Union[Argument, TensorOptionsArguments, ThisArgument]], + arguments: Sequence[Union[Argument, TensorOptionsArguments, SelfArgument]], *, faithful: bool ) -> 'CppSignature': diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index a0a1f21d13dc..4db060acd401 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -244,7 +244,11 @@ def __call__(self, f: Union[StructuredNativeFunctions, NativeFunction]) -> List[ assert_never(f) def gen_structured(self, g: StructuredNativeFunctions) -> List[str]: - if self.dispatch_key not in g.out.dispatch: + if self.dispatch_key == 'Meta': + assert self.dispatch_key not in g.out.dispatch, \ + "Do not explicitly specify Meta dispatch key on structured " \ + "functions, they will be automatically generated for you" + elif self.dispatch_key not in g.out.dispatch: return [] # Inner helper function to close over g @@ -272,35 +276,42 @@ def gen_one(f: NativeFunction) -> Optional[str]: sig = NativeSignature.from_schema(f.func) if self.target is Target.DEFINITION: - out_impl_name = f"at::native::{g.out.dispatch[self.dispatch_key]}" - # TODO: work a little harder to generate fresh names for 'result' # TODO: less praying that I picked the right argument name for 'self' if k is SchemaKind.functional: out_expr = "result" - prologue = "auto result = tensor_from_meta(meta_result);" + if self.dispatch_key == "Meta": + prologue = "auto result = meta_tensor_from_meta(meta_result);" + else: + prologue = "auto result = tensor_from_meta(meta_result);" elif k is SchemaKind.inplace: out_expr = "self" prologue = "// TODO: consistency check assert" elif k is SchemaKind.out: # TODO: generalize this for multi-out - assert len(f.func.out_arguments) == 1, "multi-out structured not supported yet" + assert len(f.func.arguments.out) == 1, "multi-out structured not supported yet" # TODO: properly get the expression as it was brought into # scope by sig - out_expr = f.func.out_arguments[0].name + out_expr = f.func.arguments.out[0].name prologue = f""" // TODO: add a consistency check for meta_result {out_expr}.resize_(meta_result.sizes); """ + if self.dispatch_key == "Meta": + out_impl_call = "// meta function does nothing" + else: + out_impl_name = f"at::native::{g.out.dispatch[self.dispatch_key]}" + out_impl_call = f"{out_impl_name}({out_expr}, {functional_exprs});" + device_guard = "" if is_generic_dispatch_key(self.dispatch_key) or is_cuda_dispatch_key(self.dispatch_key): # TODO: avoid copypasting the computation of self_args, # candidate_args and device_of - self_args = (a for a in f.func.arguments if a.name == "self") - candidate_args = itertools.chain(self_args, f.func.out_arguments, f.func.arguments) + self_args = (a 
for a in f.func.arguments.positional if a.name == "self") + candidate_args = itertools.chain(self_args, f.func.arguments.out, f.func.arguments.positional) device_of = next((f'{a.name}' for a in candidate_args if a.type.is_tensor_like()), None) device_guard = '' @@ -317,7 +328,7 @@ def gen_one(f: NativeFunction) -> Optional[str]: {device_guard} auto meta_result = meta::{meta_name}({functional_exprs}); {prologue} - {out_impl_name}({out_expr}, {functional_exprs}); + {out_impl_call} return {out_expr}; }} """ @@ -358,11 +369,11 @@ def gen_unstructured(self, f: NativeFunction) -> Optional[str]: cuda_guard = "" if is_generic_dispatch_key(self.dispatch_key) or is_cuda_dispatch_key(self.dispatch_key): - self_args = (a for a in f.func.arguments if a.name == "self") + self_args = (a for a in f.func.arguments.positional if a.name == "self") # There is precedence for which argument we use to do # device guard. This describes the precedence order. - candidate_args = itertools.chain(self_args, f.func.out_arguments, f.func.arguments) + candidate_args = itertools.chain(self_args, f.func.arguments.out, f.func.arguments.positional) # Only tensor like arguments are eligible device_of = next((f'{a.name}' for a in candidate_args if a.type.is_tensor_like()), None) @@ -487,8 +498,8 @@ def __call__(self, f: NativeFunction) -> Optional[str]: return None assert not f.func.is_out_fn() - assert len(f.func.arguments) > 0 - assert sum(a.name == 'self' for a in f.func.arguments) == 1 + assert len(f.func.arguments.positional) > 0 + assert sum(a.name == 'self' for a in f.func.arguments.positional) == 1 name = cpp.name(f.func) @@ -757,7 +768,7 @@ def compute_returns_yaml(f: NativeFunction) -> Tuple[List[Dict[str, str]], Dict[ # See Note [name and field_name] ret['field_name'] = r.name if f.func.is_out_fn(): - name_to_field_name[f.func.out_arguments[i].name] = r.name + name_to_field_name[f.func.arguments.out[i].name] = r.name returns.append(ret) @@ -778,7 +789,7 @@ def compute_cpp_argument_yaml(cpp_a: CppArgument, *, schema_order: bool, kwarg_o if cpp_a.default is not None: arg['default'] = cpp_a.default return arg - elif isinstance(cpp_a.argument, ThisArgument): + elif isinstance(cpp_a.argument, SelfArgument): raise AssertionError() elif isinstance(cpp_a.argument, Argument): return compute_argument_yaml( @@ -817,8 +828,8 @@ def compute_declaration_yaml(f: NativeFunction) -> object: # These sets are used to conveniently test if an argument is a # kwarg-only or out argument - kwarg_only_set = set(a.name for a in f.func.kwarg_only_arguments) - out_arg_set = set(a.name for a in f.func.out_arguments) + kwarg_only_set = set(a.name for a in f.func.arguments.kwarg_only) + out_arg_set = set(a.name for a in f.func.arguments.out) sig_group = CppSignatureGroup.from_schema(f.func, method=False) cpp_args = sig_group.signature.arguments() @@ -1048,6 +1059,7 @@ def flatten_pre_group(d: Dict[SchemaKind, NativeFunction]) -> Sequence[Union[Nat # TODO: how come ValuesView isn't a Sequence lol grouped_native_functions = list(concatMap(flatten_pre_group, list(pre_grouped_native_functions.values()))) + structured_native_functions = [g for g in grouped_native_functions if isinstance(g, StructuredNativeFunctions)] template_dir = os.path.join(options.source_path, "templates") @@ -1093,6 +1105,9 @@ def make_file_manager(install_dir: str) -> FileManager: "QuantizedCUDA", "Math", "DefaultBackend", + # Meta is a magic key: it is automatically generated for structured + # kernels + "Meta", ] if options.backend_whitelist: dispatch_keys = [k for k in 
dispatch_keys if is_generic_dispatch_key(k) or k in options.backend_whitelist] @@ -1129,9 +1144,7 @@ def make_file_manager(install_dir: str) -> FileManager: }) cpu_fm.write('MetaFunctions.h', lambda: { - 'declarations': - list(mapMaybe(compute_meta_function_declaration, - (g for g in grouped_native_functions if isinstance(g, StructuredNativeFunctions)))), + 'declarations': list(map(compute_meta_function_declaration, structured_native_functions)), }) schema_selector = selector diff --git a/tools/codegen/model.py b/tools/codegen/model.py index a7d5ea220b46..f270d0737ade 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -1,7 +1,7 @@ import re from dataclasses import dataclass -from typing import List, Dict, Optional, Iterator, Tuple, Set, NoReturn +from typing import List, Dict, Optional, Iterator, Tuple, Set, NoReturn, Sequence, Callable from enum import Enum import itertools @@ -253,7 +253,7 @@ def validate_unstructured(self) -> None: # Validation is for nontrivial invariants that cannot be (conveniently) # encoded in the type system. def __post_init__(self) -> None: - if self.func.out_arguments: + if self.func.arguments.out: assert self.variants == {Variant.function}, "Native functions with out arguments MUST " \ "be declared with only function variant; e.g., variants: function; " \ "otherwise you will tickle a Python argument binding bug " \ @@ -380,20 +380,13 @@ class FunctionSchema: # The name of the operator this function schema describes. name: 'OperatorName' - arguments: Tuple['Argument', ...] - kwarg_only_arguments: Tuple['Argument', ...] # but not including out args - # Unlike in the previous codegen, we have factored out 'out' arguments - # in the canonical representation, removing them from kwarg - # arguments. This choice is justified by numerous downstream - # transformations which treat out arguments specially; additionally, - # you can see that canonicity is not violated! - out_arguments: Tuple['Argument', ...] # these are also kwarg-only + arguments: 'Arguments' # TODO: Need to handle collisions with argument names at some point returns: Tuple['Return', ...] 
def schema_order_arguments(self) -> Iterator['Argument']: - return itertools.chain(self.arguments, self.kwarg_only_arguments, self.out_arguments) + return itertools.chain(self.arguments.positional, self.arguments.kwarg_only, self.arguments.out) @staticmethod def parse(func: str) -> 'FunctionSchema': @@ -404,20 +397,18 @@ def parse(func: str) -> 'FunctionSchema': assert args[-1] == ")", "Expecting closing )" args = args[:-1] name = OperatorName.parse(ops) - arguments, kwarg_only_arguments, out_arguments = parse_arguments(args) + arguments = Arguments.parse(args) returns = parse_returns(return_decl) r = FunctionSchema( name=name, arguments=arguments, - kwarg_only_arguments=kwarg_only_arguments, - out_arguments=out_arguments, returns=returns ) assert str(r) == func, f'{str(r)} != {func}' return r def __post_init__(self) -> None: - for arg, ret in zip(self.out_arguments, self.returns): + for arg, ret in zip(self.arguments.out, self.returns): assert arg.annotation == ret.annotation, \ "Out arguments must have matching return Tensor; furthermore, " \ "the ith-argument needs to correspond to the ith return" @@ -425,14 +416,14 @@ def __post_init__(self) -> None: # This means that all mutable returns should be aliased to a keyword argument # (except for "self", which we explicitly don't treat as an out argument because of its use in methods) # See Note [is_out_fn] - out_and_self = list(self.out_arguments) + [arg for arg in self.arguments if arg.name == "self"] + out_and_self = list(self.arguments.out) + [arg for arg in self.arguments.positional if arg.name == "self"] mutable_returns = [ret for ret in self.returns if ret.annotation is not None and ret.annotation.is_write] for ret in mutable_returns: assert any([ret.annotation == arg.annotation for arg in out_and_self]), \ "All mutable returns must be aliased either to a keyword argument, or to \"self\". " \ "Did you forget to mark an out argument as keyword-only?" - if self.out_arguments: - assert len(self.out_arguments) == len(self.returns), \ + if self.arguments.out: + assert len(self.arguments.out) == len(self.returns), \ "Must return as many arguments as there are out arguments" if self.name.name.inplace: # TODO: fixme @@ -508,7 +499,7 @@ def is_out_fn(self) -> bool: # but just with extra kwargs for the output elements. This # is difficult to actually check for and historically # we only do this check in tools/ - return bool(self.out_arguments) + return bool(self.arguments.out) def kind(self) -> SchemaKind: """ @@ -518,7 +509,7 @@ def kind(self) -> SchemaKind: the result into an explicitly provided out argument. 
""" is_inplace = self.name.name.inplace - is_out = bool(self.out_arguments) + is_out = bool(self.arguments.out) assert not (is_inplace and is_out) if is_inplace: return SchemaKind.inplace @@ -544,16 +535,6 @@ def signature(self) -> 'FunctionSchema': because you cannot overload on mutability annotation) """ - # dataclasses.replace could be used here, but it is less - # type safe so for now I've opted to type everything out - def strip_arg_annotation(a: Argument) -> Argument: - return Argument( - name=a.name, - type=a.type, - default=a.default, # hmmm - annotation=None, - ) - def strip_ret_annotation(r: Return) -> Return: return Return( name=r.name, @@ -570,20 +551,12 @@ def strip_ret_annotation(r: Return) -> Return: ), overload_name="", # stripped ), - arguments=tuple(map(strip_arg_annotation, self.arguments)), - kwarg_only_arguments=tuple(map(strip_arg_annotation, self.kwarg_only_arguments)), - out_arguments=(), # stripped + arguments=self.arguments.signature(), returns=tuple(map(strip_ret_annotation, self.returns)), ) def __str__(self) -> str: - all_arguments: List[str] = [] - all_arguments.extend(map(str, self.arguments)) - if self.kwarg_only_arguments or self.out_arguments: - all_arguments.append('*') - all_arguments.extend(map(str, self.kwarg_only_arguments)) - all_arguments.extend(map(str, self.out_arguments)) - all_arguments_str = ', '.join(all_arguments) + all_arguments_str = str(self.arguments) if len(self.returns) == 1: returns = str(self.returns[0]) # omit parentheses else: @@ -869,6 +842,221 @@ def __str__(self) -> str: return f"{type} {self.name}" +# Represents the self argument for functions that may be methods +@dataclass(frozen=True) +class SelfArgument: + argument: Argument + +# Bundle of arguments that represent a TensorOptions. This is mostly +# relevant for the public C++ API but we bake it into the core data +# model because other APIs often have to interact with it +@dataclass(frozen=True) +class TensorOptionsArguments: + dtype: Argument + layout: Argument + device: Argument + pin_memory: Argument + + def all(self) -> Sequence[Argument]: + return [self.dtype, self.layout, self.device, self.pin_memory] + +@dataclass(frozen=True) +class Arguments: + # pre_self_positional is usually empty, but is notably non-empty + # for where.self, where the condition argument comes before the + # self argument + pre_self_positional: Tuple[Argument, ...] + self_arg: Optional[SelfArgument] + post_self_positional: Tuple[Argument, ...] + + pre_tensor_options_kwarg_only: Tuple[Argument, ...] + tensor_options: Optional[TensorOptionsArguments] + # post_tensor_options is typically memory format, which should be + # part of tensor options but isn't right now, and is usually + # placed after the tensor options arguments + post_tensor_options_kwarg_only: Tuple[Argument, ...] + + # Unlike in the previous codegen, we have factored out 'out' arguments + # in the canonical representation, removing them from kwarg + # arguments. This choice is justified by numerous downstream + # transformations which treat out arguments specially; additionally, + # you can see that canonicity is not violated! + out: Tuple[Argument, ...] 
# these are also kwarg-only + + @property + def positional(self) -> Sequence[Argument]: + ret: List[Argument] = [] + ret.extend(self.pre_self_positional) + if self.self_arg is not None: + ret.append(self.self_arg.argument) + ret.extend(self.post_self_positional) + return ret + + # NB: doesn't contain out arguments + @property + def kwarg_only(self) -> Sequence[Argument]: + ret: List[Argument] = [] + ret.extend(self.pre_tensor_options_kwarg_only) + if self.tensor_options is not None: + ret.extend(self.tensor_options.all()) + ret.extend(self.post_tensor_options_kwarg_only) + return ret + + def signature(self) -> 'Arguments': + # dataclasses.replace could be used here, but it is less + # type safe so for now I've opted to type everything out + def strip_arg_annotation(a: Argument) -> Argument: + return Argument( + name=a.name, + type=a.type, + default=a.default, # hmmm + annotation=None, + ) + + return Arguments( + pre_self_positional=tuple(map(strip_arg_annotation, self.pre_self_positional)), + self_arg=SelfArgument( + strip_arg_annotation(self.self_arg.argument) + ) if self.self_arg is not None else None, + post_self_positional=tuple(map(strip_arg_annotation, self.post_self_positional)), + pre_tensor_options_kwarg_only=tuple(map(strip_arg_annotation, self.pre_tensor_options_kwarg_only)), + # NB: tensor_options guaranteed to not have any alias annotations + tensor_options=self.tensor_options, + post_tensor_options_kwarg_only=tuple(map(strip_arg_annotation, self.post_tensor_options_kwarg_only)), + # out arguments are dropped in signature + out=(), + ) + + + @staticmethod + def _preparse(args: str) -> Tuple[List[Argument], List[Argument], List[Argument]]: + positional: List[Argument] = [] + kwarg_only: List[Argument] = [] + out: List[Argument] = [] + arguments_acc = positional + + # TODO: Use a real parser here; this will get bamboozled + # by signatures that contain things like std::array (note the space) + for arg in args.split(', '): + if not arg: + continue + if arg == '*': + assert arguments_acc is positional, "invalid syntax: kwarg-only specifier * can only occur once" + arguments_acc = kwarg_only + continue + parg = Argument.parse(arg) + # Currently, we rely directly on the invariant that there are NO + # kwarg-only mutating arguments. If you want to relax this, + # we will need a more semantic way of matching that takes + # into account return arguments. In that case, you will have + # to manage out computation a level up, in FunctionSchema. See Note + # [is_out_fn] + if parg.annotation is not None and parg.annotation.is_write: + if arguments_acc is positional: + pass # do nothing + elif arguments_acc is kwarg_only: + arguments_acc = out + else: + assert arguments_acc is not out + arguments_acc.append(parg) + + return positional, kwarg_only, out + + @staticmethod + def parse(args: str) -> 'Arguments': + """ + Input: 'int x, int y, int z' + """ + + # We do this in two phases. First we parse into three + # main categories: positional, kwarg_only, out. + # Then, we reparse positional and kwarg_only to separate + # out the self argument and tensor options arguments. 
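        # An illustrative sketch (hypothetical schema string, not taken from this diff):
        # for an argument list such as
        #     "Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out"
        # the first phase buckets the arguments roughly as
        #     positional -> [self, other]
        #     kwarg_only -> [alpha]
        #     out        -> [out]    (mutable and kwarg-only; see Note [is_out_fn])
        # and the second phase then splits `positional` around `self`, giving
        #     pre_self_positional=(), self_arg=SelfArgument(self),
        #     post_self_positional=(other,), with no TensorOptions group.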
+ + positional, kwarg_only, out = Arguments._preparse(args) + + # Split self argument + self_ix = None + for i, a in enumerate(positional): + if a.name == "self": + self_ix = i + break + pre_self_positional: List[Argument] + self_arg: Optional[SelfArgument] + post_self_positional: List[Argument] + if self_ix is not None: + pre_self_positional = positional[:self_ix] + self_arg = SelfArgument(positional[self_ix]) + post_self_positional = positional[self_ix + 1:] + else: + pre_self_positional = [] + self_arg = None + post_self_positional = positional + + # Group tensor options arguments + pre_tensor_options_kwarg_only: List[Argument] = [] + tensor_options: Optional[TensorOptionsArguments] = None + post_tensor_options_kwarg_only: List[Argument] = [] + kwarg_only_acc = pre_tensor_options_kwarg_only + + def pred(name: str, ty: Type) -> Callable[[Argument], bool]: + return lambda a: a.name == name and a.type in [ty, OptionalType(ty)] + predicates = [ # order matters + pred('dtype', Type.parse('ScalarType')), + pred('layout', Type.parse('Layout')), + pred('device', Type.parse('Device')), + pred('pin_memory', Type.parse('bool')), + ] + + i = 0 + while i < len(kwarg_only): + # If there is enough space... + if i <= len(kwarg_only) - len(predicates): + # And the next len(predicates) arguments look like TensorOptions arguments + if all(p(a) for p, a in zip(predicates, kwarg_only[i : i + len(predicates)])): + assert kwarg_only_acc is pre_tensor_options_kwarg_only + # Group them together as one argument + tensor_options = TensorOptionsArguments( + dtype=kwarg_only[i], + layout=kwarg_only[i + 1], + device=kwarg_only[i + 2], + pin_memory=kwarg_only[i + 3], + ) + i += len(predicates) + kwarg_only_acc = post_tensor_options_kwarg_only + continue + kwarg_only_acc.append(kwarg_only[i]) + i += 1 + + return Arguments( + pre_self_positional=tuple(pre_self_positional), + self_arg=self_arg, + post_self_positional=tuple(post_self_positional), + pre_tensor_options_kwarg_only=tuple(pre_tensor_options_kwarg_only), + tensor_options=tensor_options, + post_tensor_options_kwarg_only=tuple(post_tensor_options_kwarg_only), + out=tuple(out), + ) + + + def __str__(self) -> str: + all_arguments: List[str] = [] + all_arguments.extend(map(str, self.positional)) + if self.kwarg_only or self.out: + all_arguments.append('*') + all_arguments.extend(map(str, self.kwarg_only)) + all_arguments.extend(map(str, self.out)) + return ', '.join(all_arguments) + + def __post_init__(self) -> None: + # TODO: These invariants are weirdly asymmetric? + # TODO: Fancier types? + if self.self_arg is None: + assert not self.pre_self_positional + if self.tensor_options is None: + assert not self.post_tensor_options_kwarg_only + + # Names that validly are __iXXX__ indicating inplace operations. 
# Taken from https://www.python.org/dev/peps/pep-0203/#new-methods # NB: PyTorch hasn't actually implemented all of these @@ -965,40 +1153,3 @@ def parse_returns(return_decl: str) -> Tuple[Return, ...]: if return_decl[0] == '(' and return_decl[-1] == ')': return_decl = return_decl[1:-1] return tuple(Return.parse(arg) for arg in return_decl.split(', ')) - -def parse_arguments(args: str) -> Tuple[Tuple[Argument, ...], Tuple[Argument, ...], Tuple[Argument, ...]]: - """ - Input: 'int x, int y, int z' - Output: positional args, kwarg only args - """ - arguments: List[Argument] = [] - kwarg_only_arguments: List[Argument] = [] - out_arguments: List[Argument] = [] - arguments_acc = arguments - - # TODO: Use a real parser here; this will get bamboozled - # by signatures that contain things like std::array (note the space) - for arg in args.split(', '): - if not arg: - continue - if arg == '*': - assert arguments_acc is arguments, "invalid syntax: kwarg-only specifier * can only occur once" - arguments_acc = kwarg_only_arguments - continue - parg = Argument.parse(arg) - # Currently, we rely directly on the invariant that there are NO - # kwarg-only mutating arguments. If you want to relax this, - # we will need a more semantic way of matching that takes - # into account return arguments. In that case, you will have - # to manage out_arguments computation a level up, in - # FunctionSchema. See Note [is_out_fn] - if parg.annotation is not None and parg.annotation.is_write: - if arguments_acc is arguments: - pass # do nothing - elif arguments_acc is kwarg_only_arguments: - arguments_acc = out_arguments - else: - assert arguments_acc is not out_arguments - arguments_acc.append(parg) - - return tuple(arguments), tuple(kwarg_only_arguments), tuple(out_arguments) diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 441a958adf33..5be6611c6904 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -389,6 +389,7 @@ def _crash_if_csrc_asan() -> _int: ... # THPModule_crashIfCsrcASAN def _crash_if_csrc_ubsan() -> _int: ... # THPModule_crashIfCsrcUBSAN def _crash_if_aten_asan() -> _int: ... # THPModule_crashIfATenASAN def _show_config() -> str: ... # THPModule_showConfig +def _cxx_flags() -> str: ... # THPModule_cxxFlags def _parallel_info() -> str: ... # THPModule_parallelInfo def _set_backcompat_broadcast_warn(arg: _bool) -> None: ... # THPModule_setBackcompatBroadcastWarn def _get_backcompat_broadcast_warn() -> _bool: ... # THPModule_getBackcompatBroadcastWarn diff --git a/torch/__config__.py b/torch/__config__.py index e4c3fde9ec3c..edddcbce4645 100644 --- a/torch/__config__.py +++ b/torch/__config__.py @@ -9,8 +9,11 @@ def show(): return torch._C._show_config() # TODO: In principle, we could provide more structured version/config -# information here. We're not for now; considering doing so if someone -# asks for it. +# information here. For now only CXX_FLAGS is exposed, as Timer +# uses them. 
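# A minimal usage sketch (this mirrors how the benchmark utilities further below
# consume the function; the exact flags depend on the local build):
#
#   cxx_flags = torch.__config__._cxx_flags().strip().split()
#   # e.g. forwarded as `extra_cflags` when JIT-compiling a C++ timing extension.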
+def _cxx_flags(): + """Returns the CXX_FLAGS used when building PyTorch.""" + return torch._C._cxx_flags() def parallel_info(): r"""Returns detailed string with parallelization settings""" diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index 2d8533b512df..99f504a05c9c 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -1809,26 +1809,26 @@ def add_docstr_all(method, docstr): add_docstr_all('index_put_', r""" -index_put_(indices, value, accumulate=False) -> Tensor +index_put_(indices, values, accumulate=False) -> Tensor -Puts values from the tensor :attr:`value` into the tensor :attr:`self` using +Puts values from the tensor :attr:`values` into the tensor :attr:`self` using the indices specified in :attr:`indices` (which is a tuple of Tensors). The -expression ``tensor.index_put_(indices, value)`` is equivalent to -``tensor[indices] = value``. Returns :attr:`self`. +expression ``tensor.index_put_(indices, values)`` is equivalent to +``tensor[indices] = values``. Returns :attr:`self`. -If :attr:`accumulate` is ``True``, the elements in :attr:`value` are added to +If :attr:`accumulate` is ``True``, the elements in :attr:`values` are added to :attr:`self`. If accumulate is ``False``, the behavior is undefined if indices contain duplicate elements. Args: indices (tuple of LongTensor): tensors used to index into `self`. - value (Tensor): tensor of same dtype as `self`. + values (Tensor): tensor of same dtype as `self`. accumulate (bool): whether to accumulate into self """) add_docstr_all('index_put', r""" -index_put(tensor1, indices, value, accumulate=False) -> Tensor +index_put(tensor1, indices, values, accumulate=False) -> Tensor Out-place version of :meth:`~Tensor.index_put_`. `tensor1` corresponds to `self` in :meth:`torch.Tensor.index_put_`. diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 3b6ee12e7a68..dd4be74dde80 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -4577,6 +4577,8 @@ def merge_dicts(*dicts): Returns the LU solve of the linear system :math:`Ax = b` using the partially pivoted LU factorization of A from :meth:`torch.lu`. +This function supports ``float``, ``double``, ``cfloat`` and ``cdouble`` dtypes for :attr:`input`. + Arguments: b (Tensor): the RHS tensor of size :math:`(*, m, k)`, where :math:`*` is zero or more batch dimensions. 
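# A short, self-contained sketch of the renamed `values` argument documented
# above (tensor contents are illustrative):
import torch
x = torch.zeros(3, 3)
indices = (torch.tensor([0, 2]), torch.tensor([1, 1]))
values = torch.tensor([1.0, 2.0])
x.index_put_(indices, values)                   # equivalent to x[indices] = values
x.index_put_(indices, values, accumulate=True)  # adds values onto x[indices]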
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index a5df6329030d..b23ab81ada93 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -335,6 +335,13 @@ static PyObject *THPModule_showConfig(PyObject *module, PyObject *noargs) END_HANDLE_TH_ERRORS } +static PyObject *THPModule_cxxFlags(PyObject *module, PyObject *noargs) +{ + HANDLE_TH_ERRORS + return THPUtils_packString(at::get_cxx_flags()); + END_HANDLE_TH_ERRORS +} + static PyObject *THPModule_parallelInfo(PyObject *module, PyObject *noargs) { HANDLE_TH_ERRORS @@ -584,6 +591,7 @@ static PyMethodDef TorchMethods[] = { {"_crash_if_csrc_ubsan", THPModule_crashIfCsrcUBSAN, METH_O, nullptr}, {"_crash_if_aten_asan", THPModule_crashIfATenASAN, METH_O, nullptr}, {"_show_config", THPModule_showConfig, METH_NOARGS, nullptr}, + {"_cxx_flags", THPModule_cxxFlags, METH_NOARGS, nullptr}, {"_parallel_info", THPModule_parallelInfo, METH_NOARGS, nullptr}, {"_set_backcompat_broadcast_warn", THPModule_setBackcompatBroadcastWarn, METH_O, nullptr}, {"_get_backcompat_broadcast_warn", THPModule_getBackcompatBroadcastWarn, METH_NOARGS, nullptr}, diff --git a/torch/csrc/api/include/torch/linalg.h b/torch/csrc/api/include/torch/linalg.h index dd44e16ff3f1..f5d788d75e07 100644 --- a/torch/csrc/api/include/torch/linalg.h +++ b/torch/csrc/api/include/torch/linalg.h @@ -52,6 +52,14 @@ inline Tensor& norm_out(Tensor& result, const Tensor& self, std::string ord, opt return torch::linalg_norm_out(result, self, ord, opt_dim, keepdim, opt_dtype); } +inline Tensor matrix_rank(const Tensor input, optional<double> tol, bool hermitian) { + return torch::linalg_matrix_rank(input, tol, hermitian); +} + +inline Tensor& matrix_rank_out(Tensor& result, const Tensor input, optional<double> tol, bool hermitian) { + return torch::linalg_matrix_rank_out(result, input, tol, hermitian); +} + inline Tensor tensorinv(const Tensor& self, int64_t ind) { return torch::linalg_tensorinv(self, ind); } @@ -133,6 +141,15 @@ inline Tensor& linalg_norm_out(Tensor& result, const Tensor& self, std::string o return detail::norm_out(result, self, ord, opt_dim, keepdim, opt_dtype); } +/// See https://pytorch.org/docs/master/linalg.html#torch.linalg.matrix_rank +inline Tensor matrix_rank(const Tensor input, optional<double> tol, bool hermitian) { + return detail::matrix_rank(input, tol, hermitian); +} + +inline Tensor& matrix_rank_out(Tensor& result, const Tensor input, optional<double> tol, bool hermitian) { + return detail::matrix_rank_out(result, input, tol, hermitian); +} + /// Computes the inverse of a tensor /// /// See https://pytorch.org/docs/master/linalg.html#torch.linalg.tensorinv diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index 7b973bedf5a8..d36f4f428c53 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -935,8 +935,9 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { } break; case aten::sigmoid: { - return computeOneOperand( - "aten_sigmoid", v, [](const ExprHandle& a) { return sigmoid(a); }); + return computeOneOperand("aten_sigmoid", v, [](const ExprHandle& a) { + return sigmoid(promoteIntegerToFloat(a)); + }); } break; case aten::reciprocal: { diff --git a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py index f25f3a8caad8..ffe4203b4181 100644 --- a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py +++ b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py @@ -16,7 +16,12 @@ def _ddp_comm_hook_wrapper(comm_hook,
model, state): def _powerSGD_comm_hook_wrapper( - comm_hook, model, state, matrix_approximation_rank, random_seed=0 + comm_hook, + model, + state, + matrix_approximation_rank, + use_error_feedback=True, + random_seed=0, ): """ To be consistent with the wrappers of other DDP comm hooks, the input state only needs to be a process group, @@ -25,6 +30,7 @@ def _powerSGD_comm_hook_wrapper( powerSGD_state = powerSGD.PowerSGDState( process_group=state, matrix_approximation_rank=matrix_approximation_rank, + use_error_feedback=use_error_feedback, random_seed=random_seed, ) model.register_comm_hook(powerSGD_state, comm_hook) diff --git a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py index 9a6fbb4a31dd..17414df3024d 100644 --- a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py +++ b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py @@ -30,17 +30,44 @@ def _orthogonalize(matrix, epsilon=1e-8): class PowerSGDState(object): - __slots__ = ["process_group", "matrix_approximation_rank", "rng"] - - def __init__(self, process_group, matrix_approximation_rank=1, random_seed=0): + __slots__ = [ + "process_group", + "matrix_approximation_rank", + "use_error_feedback", + "rng", + "error_dict", + ] + + def __init__( + self, + process_group, + matrix_approximation_rank=1, + use_error_feedback=True, + random_seed=0, + ): self.process_group = process_group self.matrix_approximation_rank = matrix_approximation_rank + # Error feedback is usually crucial for both convergence and generalization, + # because PowerSGD is a biased compressor, + # i.e., compressing and decompressing a random gradient does not yield the original in expectation. + # This mechanism requires a temporary copy of the input gradients, + # so it increases the peak memory consumption by the size of the gradient tensor. + # However, if the target matrices are known to be exactly low-rank (instead of just low stable rank), + # sometimes it is possible to converge to the optima without error feedback. + # See: http://proceedings.mlr.press/v54/yurtsever17a/yurtsever17a.pdf + self.use_error_feedback = use_error_feedback # The purpose of this RNG is to generate different random seeds for initializing Q across iterations, # but in the same order for all the DDP replicas. # Different random seeds across iterations indicate different 'projections' of the gradients at different SGD steps. # If the same random projection is used, # there will be differences between the gradients that are never synchronized. self.rng = np.random.RandomState(random_seed) + # Since there is only a single state instance for all the input buckets, + # we need to maintain a dictionary that maps each bucket to the local error. + # TODO(wayi): Currently the key is the (hashcode of) input tensor, which may change across steps, + # since the bucket can be rebuilt in the forward pass (to save peak memory usage). + # Need to add an index field to the input bucket of comm hook. + self.error_dict = {} def powerSGD_hook( @@ -98,6 +125,17 @@ def powerSGD_hook( padded_total_length = square_side_length ** 2 input_tensor.resize_(padded_total_length) input_tensor[total_length:padded_total_length].fill_(0) + + # Incorporate the error from the previous state into the gradients.
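    # Conceptually (an illustrative sketch; `compress`/`decompress` are hypothetical
    # helpers, the real computation is inlined below and in `decompress(fut)`):
    #     corrected = grad + error          # re-inject what compression lost last step
    #     p, q      = compress(corrected)   # biased low-rank factorization
    #     approx    = decompress(p, q)      # what is actually applied after allreduce
    #     error     = corrected - approx    # residual remembered for the next step
    # Below, `input_tensor` plays the role of `grad` (and later `approx`),
    # `input_tensor_cp` holds `corrected`, and the residual is stored in `state.error_dict`.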
+ if state.use_error_feedback: + if input_tensor in state.error_dict: + input_tensor.add_(state.error_dict[input_tensor]) + else: + state.error_dict[input_tensor] = torch.zeros(padded_total_length, device=device) + # Keep a copy of the input tensor, + # so that we can compute the local error caused by compression later, + # by comparing this copy and the input tensor updated after decompression. + input_tensor_cp = torch.clone(input_tensor).detach() matrix = input_tensor.view(square_side_length, square_side_length) def create_low_rank_tensor(fill_random_values, rng): @@ -141,6 +179,9 @@ def decompress(fut): q = fut.value()[0].div_(world_size) torch.matmul(p, q.t(), out=matrix) + if state.use_error_feedback: + # Memorize the local errors. + state.error_dict[input_tensor] = input_tensor_cp - input_tensor ret = input_tensor.resize_(total_length) return [ret] diff --git a/torch/distributions/__init__.py b/torch/distributions/__init__.py index ffcf75695d2f..57408f0c03f0 100644 --- a/torch/distributions/__init__.py +++ b/torch/distributions/__init__.py @@ -91,6 +91,7 @@ from .half_normal import HalfNormal from .independent import Independent from .kl import kl_divergence, register_kl +from .kumaraswamy import Kumaraswamy from .laplace import Laplace from .log_normal import LogNormal from .logistic_normal import LogisticNormal @@ -132,6 +133,7 @@ 'HalfCauchy', 'HalfNormal', 'Independent', + 'Kumaraswamy', 'Laplace', 'LogNormal', 'LogisticNormal', diff --git a/torch/distributions/gumbel.py b/torch/distributions/gumbel.py index 5bd3a2d3bd1e..a569af34ebdc 100644 --- a/torch/distributions/gumbel.py +++ b/torch/distributions/gumbel.py @@ -5,9 +5,7 @@ from torch.distributions.uniform import Uniform from torch.distributions.transformed_distribution import TransformedDistribution from torch.distributions.transforms import AffineTransform, ExpTransform -from torch.distributions.utils import broadcast_all - -euler_constant = 0.57721566490153286060 # Euler Mascheroni Constant +from torch.distributions.utils import broadcast_all, euler_constant class Gumbel(TransformedDistribution): diff --git a/torch/distributions/kl.py b/torch/distributions/kl.py index fe64ccc56009..ba7ba73d6063 100644 --- a/torch/distributions/kl.py +++ b/torch/distributions/kl.py @@ -31,7 +31,7 @@ from .poisson import Poisson from .transformed_distribution import TransformedDistribution from .uniform import Uniform -from .utils import _sum_rightmost +from .utils import _sum_rightmost, euler_constant as _euler_gamma _KL_REGISTRY = {} # Source of truth mapping a few general (type, type) pairs to functions. _KL_MEMOIZE: Dict[Tuple[Type, Type], Callable] = {} # Memoized version mapping many specific (type, type) pairs to functions. 
@@ -174,8 +174,6 @@ def kl_divergence(p, q): # KL Divergence Implementations ################################################################################ -_euler_gamma = 0.57721566490153286060 - # Same distributions diff --git a/torch/distributions/kumaraswamy.py b/torch/distributions/kumaraswamy.py new file mode 100644 index 000000000000..4fb2e177e7be --- /dev/null +++ b/torch/distributions/kumaraswamy.py @@ -0,0 +1,66 @@ +import torch +from torch.distributions import constraints +from torch.distributions.uniform import Uniform +from torch.distributions.transformed_distribution import TransformedDistribution +from torch.distributions.transforms import AffineTransform, PowerTransform +from torch.distributions.utils import broadcast_all, euler_constant + + +def _moments(a, b, n): + """ + Computes the nth moment of the Kumaraswamy distribution using torch.lgamma + """ + arg1 = 1 + n / a + log_value = torch.lgamma(arg1) + torch.lgamma(b) - torch.lgamma(arg1 + b) + return b * torch.exp(log_value) + + +class Kumaraswamy(TransformedDistribution): + r""" + Samples from a Kumaraswamy distribution. + + Example:: + + >>> m = Kumaraswamy(torch.Tensor([1.0]), torch.Tensor([1.0])) + >>> m.sample() # sample from a Kumaraswamy distribution with concentration alpha=1 and beta=1 + tensor([ 0.1729]) + + Args: + concentration1 (float or Tensor): 1st concentration parameter of the distribution + (often referred to as alpha) + concentration0 (float or Tensor): 2nd concentration parameter of the distribution + (often referred to as beta) + """ + arg_constraints = {'concentration1': constraints.positive, 'concentration0': constraints.positive} + support = constraints.unit_interval + has_rsample = True + + def __init__(self, concentration1, concentration0, validate_args=None): + self.concentration1, self.concentration0 = broadcast_all(concentration1, concentration0) + finfo = torch.finfo(self.concentration0.dtype) + base_dist = Uniform(torch.full_like(self.concentration0, 0), + torch.full_like(self.concentration0, 1)) + transforms = [PowerTransform(exponent=self.concentration0.reciprocal()), + AffineTransform(loc=1., scale=-1.), + PowerTransform(exponent=self.concentration1.reciprocal())] + super(Kumaraswamy, self).__init__(base_dist, transforms, validate_args=validate_args) + + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Kumaraswamy, _instance) + new.concentration1 = self.concentration1.expand(batch_shape) + new.concentration0 = self.concentration0.expand(batch_shape) + return super(Kumaraswamy, self).expand(batch_shape, _instance=new) + + @property + def mean(self): + return _moments(self.concentration1, self.concentration0, 1) + + @property + def variance(self): + return _moments(self.concentration1, self.concentration0, 2) - torch.pow(self.mean, 2) + + def entropy(self): + t1 = (1 - self.concentration1.reciprocal()) + t0 = (1 - self.concentration0.reciprocal()) + H0 = torch.digamma(self.concentration0 + 1) + euler_constant + return t0 + t1 * H0 - torch.log(self.concentration1) - torch.log(self.concentration0) diff --git a/torch/distributions/utils.py b/torch/distributions/utils.py index 0fd623086562..36ff1f71c35b 100644 --- a/torch/distributions/utils.py +++ b/torch/distributions/utils.py @@ -5,6 +5,9 @@ from typing import Dict, Any +euler_constant = 0.57721566490153286060 # Euler Mascheroni Constant + + def broadcast_all(*values): r""" Given a list of values (possibly containing numbers), returns a list where each diff --git a/torch/jit/frontend.py b/torch/jit/frontend.py index
69b06369923b..78c226ab1739 100644 --- a/torch/jit/frontend.py +++ b/torch/jit/frontend.py @@ -723,9 +723,7 @@ def build_ExtSlice(ctx, base, extslice): if isinstance(expr.slice.value, ast.Tuple): # N-dimensional indexing using Tuple: x[(i, j, k)] is equivalent to x[i, j, k] # XXX: Indexing using a list is **different**! It triggers advanced indexing. - indices = [] - for index_expr in expr.slice.value.elts: - indices.append(build_expr(ctx, index_expr)) + indices = [build_expr(ctx, index_expr) for index_expr in expr.slice.value.elts] return Subscript(base, indices) else: return Subscript(base, [build_expr(ctx, expr.slice.value)]) @@ -733,6 +731,17 @@ def build_ExtSlice(ctx, base, extslice): return Subscript(base, [build_SliceExpr(ctx, base, expr.slice)]) elif sub_type is ast.ExtSlice: return Subscript(base, build_ExtSlice(ctx, base, expr.slice)) + elif sys.version_info >= (3, 9): # In Python 3.9 array indices are not wrapped in ast.Index + if sub_type is ast.Tuple: + # N-dimensional indexing using Tuple: x[(i, j, k)] is equivalent to x[i, j, k] + indices = [] + for index_expr in expr.slice.elts: + if isinstance(index_expr, ast.Slice): + indices.append(build_SliceExpr(ctx, base, index_expr)) + else: + indices.append(build_expr(ctx, index_expr)) + return Subscript(base, indices) + return Subscript(base, [build_expr(ctx, expr.slice)]) else: # Ellipsis (can only happen in Python 2) raise NotSupportedError(base.range(), "ellipsis is not supported") diff --git a/torch/linalg/__init__.py b/torch/linalg/__init__.py index edd4d8a8afa6..d2cc7e1df9d0 100644 --- a/torch/linalg/__init__.py +++ b/torch/linalg/__init__.py @@ -213,6 +213,67 @@ [-3.1113, 2.7381]], dtype=torch.float64) """) +matrix_rank = _add_docstr(_linalg.linalg_matrix_rank, r""" +matrix_rank(input, tol=None, hermitian=False) -> Tensor + +Computes the numerical rank of a matrix :attr:`input`, or of each matrix in a batched :attr:`input`. +The matrix rank is computed as the number of singular values (or the absolute eigenvalues when :attr:`hermitian` is ``True``) +above the specified :attr:`tol` threshold. + +If :attr:`tol` is not specified, :attr:`tol` is set to +``S.max(dim=-1) * max(input.shape[-2:]) * eps`` where ``S`` is the singular values +(or the absolute eigenvalues when :attr:`hermitian` is ``True``), +and ``eps`` is the epsilon value for the datatype of :attr:`input`. +The epsilon value can be obtained using the ``eps`` attribute of :class:`torch.finfo`. + +By default, the matrix rank is computed using singular value decomposition (see :func:`torch.linalg.svd`). +If :attr:`hermitian` is ``True``, then :attr:`input` is assumed to be Hermitian (symmetric if real-valued), +and the rank is computed from its eigenvalues (see :func:`torch.linalg.eigvalsh`). + +Supports input of ``float``, ``double``, ``cfloat`` and ``cdouble`` datatypes. + +.. note:: When given inputs on a CUDA device, this function synchronizes that device with the CPU. + +Args: + input (Tensor): the input matrix of size :math:`(m, n)` or the batch of matrices of size :math:`(*, m, n)` + where `*` is one or more batch dimensions. + tol (float, optional): the tolerance value. Default: ``None`` + hermitian (bool, optional): indicates whether :attr:`input` is Hermitian.
Default: ``False`` + +Examples:: + + >>> a = torch.eye(10) + >>> torch.linalg.matrix_rank(a) + tensor(10) + >>> b = torch.eye(10) + >>> b[0, 0] = 0 + >>> torch.linalg.matrix_rank(b) + tensor(9) + + >>> a = torch.randn(4, 3, 2) + >>> torch.linalg.matrix_rank(a) + tensor([2, 2, 2, 2]) + + >>> a = torch.randn(2, 4, 2, 3) + >>> torch.linalg.matrix_rank(a) + tensor([[2, 2, 2, 2], + [2, 2, 2, 2]]) + + >>> a = torch.randn(2, 4, 3, 3, dtype=torch.complex64) + >>> torch.linalg.matrix_rank(a) + tensor([[3, 3, 3, 3], + [3, 3, 3, 3]]) + >>> torch.linalg.matrix_rank(a, hermitian=True) + tensor([[3, 3, 3, 3], + [3, 3, 3, 3]]) + >>> torch.linalg.matrix_rank(a, tol=1.0) + tensor([[3, 2, 2, 2], + [1, 2, 1, 2]]) + >>> torch.linalg.matrix_rank(a, tol=1.0, hermitian=True) + tensor([[2, 2, 2, 1], + [1, 2, 2, 2]]) +""") + norm = _add_docstr(_linalg.linalg_norm, r""" linalg.norm(input, ord=None, dim=None, keepdim=False, *, out=None, dtype=None) -> Tensor diff --git a/torch/nn/intrinsic/qat/modules/conv_fused.py b/torch/nn/intrinsic/qat/modules/conv_fused.py index 659d284b2afd..12018a34e23f 100644 --- a/torch/nn/intrinsic/qat/modules/conv_fused.py +++ b/torch/nn/intrinsic/qat/modules/conv_fused.py @@ -93,8 +93,13 @@ def _forward(self, input): bias_shape = [1] * len(self.weight.shape) bias_shape[1] = -1 scaled_weight = self.weight_fake_quant(self.weight * scale_factor.reshape(weight_shape)) - # this does not include the conv bias - conv = self._conv_forward(input, scaled_weight) + # using zero bias here since the bias for original conv + # will be added later + if self.bias is not None: + zero_bias = torch.zeros_like(self.bias) + else: + zero_bias = torch.zeros(self.out_channels, device=scaled_weight.device) + conv = self._conv_forward(input, scaled_weight, zero_bias) conv_orig = conv / scale_factor.reshape(bias_shape) if self.bias is not None: conv_orig = conv_orig + self.bias.reshape(bias_shape) @@ -402,7 +407,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def forward(self, input): return F.relu( - self._conv_forward(input, self.weight_fake_quant(self.weight))) + self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias)) @classmethod def from_float(cls, mod): diff --git a/torch/nn/modules/conv.py b/torch/nn/modules/conv.py index b801d990c4a6..33f2a84aed74 100644 --- a/torch/nn/modules/conv.py +++ b/torch/nn/modules/conv.py @@ -246,16 +246,16 @@ def __init__( in_channels, out_channels, kernel_size, stride, padding, dilation, False, _single(0), groups, bias, padding_mode) - def _conv_forward(self, input, weight): + def _conv_forward(self, input: Tensor, weight: Tensor, bias: Optional[Tensor]): if self.padding_mode != 'zeros': return F.conv1d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode), - weight, self.bias, self.stride, + weight, bias, self.stride, _single(0), self.dilation, self.groups) - return F.conv1d(input, weight, self.bias, self.stride, + return F.conv1d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups) def forward(self, input: Tensor) -> Tensor: - return self._conv_forward(input, self.weight) + return self._conv_forward(input, self.weight, self.bias) class Conv2d(_ConvNd): @@ -382,16 +382,16 @@ def __init__( in_channels, out_channels, kernel_size, stride, padding, dilation, False, _pair(0), groups, bias, padding_mode) - def _conv_forward(self, input, weight): + def _conv_forward(self, input: Tensor, weight: Tensor, bias: Optional[Tensor]): if self.padding_mode != 'zeros': return F.conv2d(F.pad(input, 
self._reversed_padding_repeated_twice, mode=self.padding_mode), - weight, self.bias, self.stride, + weight, bias, self.stride, _pair(0), self.dilation, self.groups) - return F.conv2d(input, weight, self.bias, self.stride, + return F.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups) def forward(self, input: Tensor) -> Tensor: - return self._conv_forward(input, self.weight) + return self._conv_forward(input, self.weight, self.bias) class Conv3d(_ConvNd): __doc__ = r"""Applies a 3D convolution over an input signal composed of several input diff --git a/torch/nn/qat/modules/conv.py b/torch/nn/qat/modules/conv.py index a9c5f8547329..4b3814983347 100644 --- a/torch/nn/qat/modules/conv.py +++ b/torch/nn/qat/modules/conv.py @@ -29,7 +29,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, self.weight_fake_quant = qconfig.weight() def forward(self, input): - return self._conv_forward(input, self.weight_fake_quant(self.weight)) + return self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias) @classmethod def from_float(cls, mod): diff --git a/torch/onnx/symbolic_opset11.py b/torch/onnx/symbolic_opset11.py index 2661a7e8338e..dd9060036b04 100644 --- a/torch/onnx/symbolic_opset11.py +++ b/torch/onnx/symbolic_opset11.py @@ -57,9 +57,6 @@ def clamp_max(g, self, max): # Opset 11 gather accepts negative indices @parse_args('v', 'i', 'v') def select(g, self, dim, index): - index_scalar_type = index.type().scalarType() - if index_scalar_type is None or index_scalar_type not in ['Long', 'Int']: - index = g.op("Cast", index, to_i=sym_help.cast_pytorch_to_onnx["Long"]) return g.op("Gather", self, index, axis_i=dim) diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index d1f90111fc5d..d36a2a04eae8 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -1342,9 +1342,6 @@ def index_select(g, self, dim, index): if index_dim == 0: # Index is a scalar. Reshape it to a size 1 tensor. 
index = g.op("Reshape", index, g.op("Constant", value_t=torch.LongTensor([1]))) - index_scalar_type = index.type().scalarType() - if index_scalar_type is None or index_scalar_type not in ['Long', 'Int']: - index = g.op("Cast", index, to_i=sym_help.cast_pytorch_to_onnx["Long"]) return g.op("Gather", self, index, axis_i=dim) diff --git a/torch/overrides.py b/torch/overrides.py index 0819769e1d79..36ae037ed557 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -494,6 +494,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.matmul: lambda input, other, out=None: -1, torch.matrix_power: lambda input, n: -1, torch.matrix_rank: lambda input, tol=None, symmetric=False: -1, + torch.linalg.matrix_rank: lambda input, tol=None, hermitian=False: -1, torch.matrix_exp: lambda input: -1, torch.max: lambda input, out=None: -1, torch.maximum: lambda input, other, out=None: -1, diff --git a/torch/quantization/quantize.py b/torch/quantization/quantize.py index f34bdf1cd9b2..a8b87f0f156b 100644 --- a/torch/quantization/quantize.py +++ b/torch/quantization/quantize.py @@ -295,7 +295,7 @@ def quantize(model, run_fn, run_args, mapping=None, inplace=False): model = copy.deepcopy(model) model.eval() prepare(model, inplace=True) - run_fn(model, run_args) + run_fn(model, *run_args) convert(model, mapping, inplace=True) return model @@ -422,7 +422,7 @@ def quantize_qat(model, run_fn, run_args, inplace=False): model = copy.deepcopy(model) model.train() prepare_qat(model, inplace=True) - run_fn(model, run_args) + run_fn(model, *run_args) convert(model, inplace=True) return model diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index f2f91cbc9e13..7f4ec5af4f2c 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -67,6 +67,7 @@ def __init__(self, default_test_dtypes=None, # dtypes to test with by default. 
Gets intersected # with the dtypes support on the tested device test_inplace_grad=True, # whether to gradcheck and gradgradcheck the inplace variant + test_complex_grad=True, # whether to gradcheck and gradgradcheck for complex dtypes supports_tensor_out=True, # whether the op supports the out kwarg, returning a Tensor skips=tuple(), # information about which tests to skip decorators=None): # decorators to apply to generated tests @@ -92,6 +93,7 @@ def __init__(self, self.inplace_variant = getattr(torch.Tensor, inplace_name) if hasattr(torch.Tensor, name) else None self.test_inplace_grad = test_inplace_grad + self.test_complex_grad = test_complex_grad self.supports_tensor_out = supports_tensor_out self.skips = skips @@ -493,7 +495,24 @@ def sample_inputs(self, device, dtype, requires_grad=False): ] if TEST_SCIPY: + def reference_sigmoid(x): + # 'scipy.special.expit' not supported for the input types + if x.dtype in [np.complex64, np.complex128]: + return (1 / (1 + np.exp(-x))) + return scipy.special.expit(x) + op_db_scipy_reference = [ + UnaryUfuncInfo('sigmoid', + ref=reference_sigmoid, + decorators=(precisionOverride({torch.float16: 1e-2, + torch.bfloat16: 1e-2}),), + skips=(SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', + device_type='cpu', dtypes=[torch.cfloat, torch.cdouble]),), + dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16), + dtypesIfCPU=all_types_and_complex_and(torch.bool, torch.bfloat16), + dtypesIfCUDA=all_types_and(torch.bool, torch.half, torch.bfloat16), + promotes_integers_to_float=True, + test_complex_grad=False), # Reference: https://github.com/pytorch/pytorch/issues/48552 UnaryUfuncInfo('erf', ref=scipy.special.erf, decorators=(precisionOverride({torch.float16: 1e-2, diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py index 844750285592..ba7f0d650b22 100644 --- a/torch/testing/_internal/distributed/rpc/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/rpc_test.py @@ -68,7 +68,6 @@ def udf_with_torch_ops(device=-1, use_record_function=False): "aten::relu", "aten::threshold", "aten::sigmoid", - "aten::sigmoid", ] # Remote operations are prefixed with the following string for RPC profiling. diff --git a/torch/testing/check_kernel_launches.py b/torch/testing/check_kernel_launches.py index 3385fcdf9618..091f1be98561 100644 --- a/torch/testing/check_kernel_launches.py +++ b/torch/testing/check_kernel_launches.py @@ -5,7 +5,7 @@ # Regular expression identifies a kernel launch indicator by # finding something approximating the pattern ">>>(arguments);" -# It then requires that `TORCH_CUDA_KERNEL_LAUNCH_CHECK` be +# It then requires that `C10_CUDA_KERNEL_LAUNCH_CHECK` be # the next command. # It allows a single backslash `\` between the end of the launch # command and the beginning of the kernel check. This handles @@ -26,7 +26,7 @@ \\? # 0 or 1 backslashes (for launches in preprocessor macros) (?:[0-9]+: )? # Detects and ignores a line numbering, if present \s* # Maybe some whitespace (includes newlines) - TORCH_CUDA_KERNEL_LAUNCH_CHECK\(\); # Kernel launch guard! + C10_CUDA_KERNEL_LAUNCH_CHECK\(\); # Kernel launch guard! ) # End negative lookahead """, flags=re.MULTILINE | re.VERBOSE) @@ -53,7 +53,7 @@ def check_code_for_cuda_kernel_launches(code, filename=None): results = kernel_launch_regex.findall(code) # Search for bad launches for r in results: - print(f"Missing TORCH_CUDA_KERNEL_LAUNCH_CHECK in '{filename}'. 
Context:\n{r}", file=sys.stderr) + print(f"Missing C10_CUDA_KERNEL_LAUNCH_CHECK in '{filename}'. Context:\n{r}", file=sys.stderr) return len(results) diff --git a/torch/utils/_cpp_extension_versioner.py b/torch/utils/_cpp_extension_versioner.py index cb778ab8923d..958d34ecc71a 100644 --- a/torch/utils/_cpp_extension_versioner.py +++ b/torch/utils/_cpp_extension_versioner.py @@ -38,12 +38,16 @@ def bump_version_if_changed(self, source_files, build_arguments, build_directory, - with_cuda): + with_cuda, + is_python_module, + is_standalone): hash_value = 0 hash_value = hash_source_files(hash_value, source_files) hash_value = hash_build_arguments(hash_value, build_arguments) hash_value = update_hash(hash_value, build_directory) hash_value = update_hash(hash_value, with_cuda) + hash_value = update_hash(hash_value, is_python_module) + hash_value = update_hash(hash_value, is_standalone) entry = self.entries.get(name) if entry is None: diff --git a/torch/utils/benchmark/utils/_stubs.py b/torch/utils/benchmark/utils/_stubs.py new file mode 100644 index 000000000000..e2ab6ec086a4 --- /dev/null +++ b/torch/utils/benchmark/utils/_stubs.py @@ -0,0 +1,46 @@ +import sys +from typing import Any, Callable, Dict, TYPE_CHECKING + + +if TYPE_CHECKING or sys.version_info >= (3, 8): + from typing import runtime_checkable, Protocol +else: + from typing_extensions import runtime_checkable, Protocol + + +class TimerClass(Protocol): + """This is the portion of the `timeit.Timer` API used by benchmark utils.""" + def __init__( + self, + stmt: str, + setup: str, + timer: Callable[[], float], + globals: Dict[str, Any] + ) -> None: + ... + + def timeit(self, number: int) -> float: + ... + + +@runtime_checkable +class TimeitModuleType(Protocol): + """Modules generated from `timeit_template.cpp`.""" + def timeit(self, number: int) -> float: + ... + + +class CallgrindModuleType(Protocol): + """Replicates the valgrind endpoints in `torch._C`. + + These bindings are used to collect Callgrind profiles on earlier versions + of PyTorch and will eventually be removed. + """ + __file__: str + __name__: str + + def _valgrind_supported_platform(self) -> bool: + ... + + def _valgrind_toggle(self) -> None: + ... diff --git a/torch/utils/benchmark/utils/common.py b/torch/utils/benchmark/utils/common.py index ddfb8f08389d..1cbed2df51c2 100644 --- a/torch/utils/benchmark/utils/common.py +++ b/torch/utils/benchmark/utils/common.py @@ -48,7 +48,7 @@ def title(self) -> str: def setup_str(self) -> str: return ( - "" if self.setup == "pass" + "" if (self.setup == "pass" or not self.setup) else f"setup:\n{textwrap.indent(self.setup, ' ')}" if "\n" in self.setup else f"setup: {self.setup}" ) diff --git a/torch/utils/benchmark/utils/cpp_jit.py b/torch/utils/benchmark/utils/cpp_jit.py new file mode 100644 index 000000000000..ebaa4213e027 --- /dev/null +++ b/torch/utils/benchmark/utils/cpp_jit.py @@ -0,0 +1,143 @@ +"""JIT C++ strings into executables.""" +import atexit +import os +import re +import shutil +import textwrap +import threading +import uuid +from typing import Any, List, Optional + +import torch +from torch.utils.benchmark.utils._stubs import CallgrindModuleType, TimeitModuleType +from torch.utils import cpp_extension + + +LOCK = threading.Lock() +SOURCE_ROOT = os.path.split(os.path.abspath(__file__))[0] + +# We calculate uuid once at import time so that separate processes will have +# separate build roots, but threads will share the same build root. 
+# `cpp_extension` uses build root as part of the cache key, so per-invocation +# uuid's (e.g. different build root per _compile_template call) would lead to +# a 0% cache hit rate and spurious recompilation. Consider the following: +# ``` +# setup = "auto x = torch::ones({1024, 1024});" +# stmt = "torch::mm(x, x);" +# for num_threads in [1, 2, 4, 8]: +# print(Timer(stmt, setup, num_threads=num_threads, language="c++").blocked_autorange()) +# ```` +# `setup` and `stmt` do not change, so we can reuse the executable from the +# first pass through the loop. +BUILD_ROOT = os.path.join( + torch._appdirs.user_cache_dir(appname="benchmark_utils_jit"), + f"build_{uuid.uuid4()}".replace("-", "") +) + +# BACK_TESTING_NOTE: +# There are two workflows where this code could be used. One is the obvious +# case where someone simply builds or installs PyTorch and uses Timer. +# The other is that the entire `torch/utils/benchmark` folder from a CURRENT +# PyTorch checkout is copy-pasted into a much OLDER version of the PyTorch +# source code. This is what we refer to here as "back testing". The rationale +# is that we might want to use current tooling to study some aspect of an +# earlier version of PyTorch. (e.g. a regression.) +# +# The problem is that Timer relies on several aspects of core PyTorch, namely +# some binding functions for Valgrind symbols in `torch._C` and the +# `torch.__config__._cxx_flags()` method. If we were to naively copy code +# around this wouldn't work as the symbols of interest aren't present in +# earlier versions of PyTorch. In order to work around this, we must add back +# testing shims. These shims will never activate during normal use, but will +# allow Timer to function outside of the "correct" version of PyTorch by +# emulating functionality that was added later. +# +# These shims are temporary, and as Timer becomes more integrated with +# PyTorch the cost and complexity of such shims will increase. Once back +# testing is no longer required (which is to say we have done enough historic +# analysis and the shims no longer justify their maintenance and code +# complexity costs) back testing paths will be removed. + +if hasattr(torch.__config__, "_cxx_flags"): + CXX_FLAGS = torch.__config__._cxx_flags().strip().split() + if "-g" not in CXX_FLAGS: + CXX_FLAGS.append("-g") +else: + # FIXME: Remove when back testing is no longer required. + CXX_FLAGS = ["-O2", "-fPIC", "-g"] + +EXTRA_INCLUDE_PATHS: List[str] = [os.path.join(SOURCE_ROOT, "valgrind_wrapper")] +CONDA_PREFIX = os.getenv("CONDA_PREFIX") +if CONDA_PREFIX is not None: + # Load will automatically search /usr/include, but not conda include. 
+ EXTRA_INCLUDE_PATHS.append(os.path.join(CONDA_PREFIX, "include")) + + +COMPAT_CALLGRIND_BINDINGS: Optional[CallgrindModuleType] = None +def get_compat_bindings() -> CallgrindModuleType: + with LOCK: + global COMPAT_CALLGRIND_BINDINGS + if COMPAT_CALLGRIND_BINDINGS is None: + COMPAT_CALLGRIND_BINDINGS = cpp_extension.load( + name="callgrind_bindings", + sources=[os.path.join( + SOURCE_ROOT, + "valgrind_wrapper", + "compat_bindings.cpp" + )], + extra_cflags=CXX_FLAGS, + extra_include_paths=EXTRA_INCLUDE_PATHS, + ) + return COMPAT_CALLGRIND_BINDINGS + + +def _compile_template(stmt: str, setup: str, src: str, is_standalone: bool) -> Any: + for before, after, indentation in ( + ("// SETUP_TEMPLATE_LOCATION", setup, 4), + ("// STMT_TEMPLATE_LOCATION", stmt, 8) + ): + # C++ doesn't care about indentation so this code isn't load + # bearing the way it is with Python, but this makes the source + # look nicer if a human has to look at it. + src = re.sub( + before, + textwrap.indent(after, " " * indentation)[indentation:], + src + ) + + # We want to isolate different Timers. However `cpp_extension` will + # cache builds which will significantly reduce the cost of repeated + # invocations. + with LOCK: + if not os.path.exists(BUILD_ROOT): + os.makedirs(BUILD_ROOT) + atexit.register(shutil.rmtree, BUILD_ROOT) + + name = f"timer_cpp_{abs(hash(src))}" + build_dir = os.path.join(BUILD_ROOT, name) + os.makedirs(build_dir, exist_ok=True) + + src_path = os.path.join(build_dir, "timer_src.cpp") + with open(src_path, "wt") as f: + f.write(src) + + # `cpp_extension` has its own locking scheme, so we don't need our lock. + return cpp_extension.load( + name=name, + sources=[src_path], + build_directory=build_dir, + extra_cflags=CXX_FLAGS, + extra_include_paths=EXTRA_INCLUDE_PATHS, + is_python_module=not is_standalone, + is_standalone=is_standalone, + ) + + +def compile_timeit_template(stmt: str, setup: str) -> TimeitModuleType: + template_path: str = os.path.join(SOURCE_ROOT, "timeit_template.cpp") + with open(template_path, "rt") as f: + src: str = f.read() + + module = _compile_template(stmt, setup, src, is_standalone=False) + assert isinstance(module, TimeitModuleType) + return module diff --git a/torch/utils/benchmark/utils/timeit_template.cpp b/torch/utils/benchmark/utils/timeit_template.cpp new file mode 100644 index 000000000000..01d62efdb161 --- /dev/null +++ b/torch/utils/benchmark/utils/timeit_template.cpp @@ -0,0 +1,36 @@ +/* C++ template for Timer.timeit + +This template will be consumed by `cpp_jit.py`, and will replace: + `SETUP_TEMPLATE_LOCATION` + and + `STMT_TEMPLATE_LOCATION` +sections with user provided statements. 
+*/ +#include <chrono> + +#include <pybind11/pybind11.h> +#include <torch/extension.h> + + +double timeit(int n) { + // Setup + // SETUP_TEMPLATE_LOCATION + + { + // Warmup + // STMT_TEMPLATE_LOCATION + } + + // Main loop + auto start_time = std::chrono::high_resolution_clock::now(); + for (int loop_idx = 0; loop_idx < n; loop_idx++) { + // STMT_TEMPLATE_LOCATION + } + auto end_time = std::chrono::high_resolution_clock::now(); + return std::chrono::duration<double>(end_time - start_time).count(); +} + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("timeit", &timeit); +} diff --git a/torch/utils/benchmark/utils/timer.py b/torch/utils/benchmark/utils/timer.py index e017dda2d4dd..374b8bd6c6e0 100644 --- a/torch/utils/benchmark/utils/timer.py +++ b/torch/utils/benchmark/utils/timer.py @@ -1,16 +1,17 @@ """Timer class based on the timeit.Timer class, but torch aware.""" - +import enum import timeit import textwrap -from typing import Any, Callable, Dict, List, NoReturn, Optional +from typing import Any, Callable, Dict, List, NoReturn, Optional, Type, Union import numpy as np import torch -from torch.utils.benchmark.utils import common +from torch.utils.benchmark.utils import common, cpp_jit +from torch.utils.benchmark.utils._stubs import TimerClass, TimeitModuleType from torch.utils.benchmark.utils.valgrind_wrapper import timer_interface as valgrind_timer_interface -__all__ = ["Timer", "timer"] +__all__ = ["Timer", "timer", "Language"] if torch.has_cuda and torch.cuda.is_available(): @@ -21,6 +22,46 @@ def timer() -> float: timer = timeit.default_timer +class Language(enum.Enum): + PYTHON = 0 + CPP = 1 + + +class CPPTimer: + def __init__( + self, + stmt: str, + setup: str, + timer: Callable[[], float], + globals: Dict[str, Any], + ) -> None: + if timer is not timeit.default_timer: + raise NotImplementedError( + "PyTorch was built with CUDA and a GPU is present; however " + "Timer does not yet support GPU measurements. If your " + "code is CPU only, pass `timer=timeit.default_timer` to the " + "Timer's constructor to indicate this. (Note that this will " + "produce incorrect results if the GPU is in fact used, as " + "Timer will not synchronize CUDA.)" + ) + + if globals: + raise ValueError("C++ timing does not support globals.") + + self._stmt: str = textwrap.dedent(stmt) + self._setup: str = textwrap.dedent(setup) + self._timeit_module: Optional[TimeitModuleType] = None + + def timeit(self, number: int) -> float: + if self._timeit_module is None: + self._timeit_module = cpp_jit.compile_timeit_template( + self._stmt, + self._setup, + ) + + return self._timeit_module.timeit(number) + + class Timer(object): """Helper class for measuring execution time of PyTorch statements. @@ -122,7 +163,7 @@ class Timer(object): threadpool size which tries to utilize all cores. """ - _timer_cls = timeit.Timer + _timer_cls: Type[TimerClass] = timeit.Timer def __init__( self, @@ -135,21 +176,32 @@ def __init__( description: Optional[str] = None, env: Optional[str] = None, num_threads: int = 1, + language: Union[Language, str] = Language.PYTHON, ): if not isinstance(stmt, str): raise ValueError("Currently only a `str` stmt is supported.") - # We copy `globals` to prevent mutations from leaking, (for instance, - # `eval` adds the `__builtins__` key) and include `torch` if not - # specified as a convenience feature. - globals = dict(globals or {}) - globals.setdefault("torch", torch) - self._globals = globals + # We copy `globals` to prevent mutations from leaking.
diff --git a/torch/utils/benchmark/utils/timer.py b/torch/utils/benchmark/utils/timer.py
index e017dda2d4dd..374b8bd6c6e0 100644
--- a/torch/utils/benchmark/utils/timer.py
+++ b/torch/utils/benchmark/utils/timer.py
@@ -1,16 +1,17 @@
 """Timer class based on the timeit.Timer class, but torch aware."""
-
+import enum
 import timeit
 import textwrap
-from typing import Any, Callable, Dict, List, NoReturn, Optional
+from typing import Any, Callable, Dict, List, NoReturn, Optional, Type, Union
 
 import numpy as np
 import torch
 
-from torch.utils.benchmark.utils import common
+from torch.utils.benchmark.utils import common, cpp_jit
+from torch.utils.benchmark.utils._stubs import TimerClass, TimeitModuleType
 from torch.utils.benchmark.utils.valgrind_wrapper import timer_interface as valgrind_timer_interface
 
-__all__ = ["Timer", "timer"]
+__all__ = ["Timer", "timer", "Language"]
 
 
 if torch.has_cuda and torch.cuda.is_available():
@@ -21,6 +22,46 @@ def timer() -> float:
     timer = timeit.default_timer
 
 
+class Language(enum.Enum):
+    PYTHON = 0
+    CPP = 1
+
+
+class CPPTimer:
+    def __init__(
+        self,
+        stmt: str,
+        setup: str,
+        timer: Callable[[], float],
+        globals: Dict[str, Any],
+    ) -> None:
+        if timer is not timeit.default_timer:
+            raise NotImplementedError(
+                "PyTorch was built with CUDA and a GPU is present; however "
+                "Timer does not yet support GPU measurements. If your "
+                "code is CPU only, pass `timer=timeit.default_timer` to the "
+                "Timer's constructor to indicate this. (Note that this will "
+                "produce incorrect results if the GPU is in fact used, as "
+                "Timer will not synchronize CUDA.)"
+            )
+
+        if globals:
+            raise ValueError("C++ timing does not support globals.")
+
+        self._stmt: str = textwrap.dedent(stmt)
+        self._setup: str = textwrap.dedent(setup)
+        self._timeit_module: Optional[TimeitModuleType] = None
+
+    def timeit(self, number: int) -> float:
+        if self._timeit_module is None:
+            self._timeit_module = cpp_jit.compile_timeit_template(
+                self._stmt,
+                self._setup,
+            )
+
+        return self._timeit_module.timeit(number)
+
+
 class Timer(object):
     """Helper class for measuring execution time of PyTorch statements.
@@ -122,7 +163,7 @@ class Timer(object):
         threadpool size which tries to utilize all cores.
     """
 
-    _timer_cls = timeit.Timer
+    _timer_cls: Type[TimerClass] = timeit.Timer
 
     def __init__(
         self,
@@ -135,21 +176,32 @@ def __init__(
         description: Optional[str] = None,
         env: Optional[str] = None,
         num_threads: int = 1,
+        language: Union[Language, str] = Language.PYTHON,
     ):
         if not isinstance(stmt, str):
             raise ValueError("Currently only a `str` stmt is supported.")
 
-        # We copy `globals` to prevent mutations from leaking, (for instance,
-        # `eval` adds the `__builtins__` key) and include `torch` if not
-        # specified as a convenience feature.
-        globals = dict(globals or {})
-        globals.setdefault("torch", torch)
-        self._globals = globals
+        # We copy `globals` to prevent mutations from leaking.
+        # (For instance, `eval` adds the `__builtins__` key)
+        self._globals = dict(globals or {})
+        if language in (Language.PYTHON, "py", "python"):
+            # Include `torch` if not specified as a convenience feature.
+            self._globals.setdefault("torch", torch)
+            self._language: Language = Language.PYTHON
+
+        elif language in (Language.CPP, "cpp", "c++"):
+            assert self._timer_cls is timeit.Timer, "_timer_cls has already been swapped."
+            self._timer_cls = CPPTimer
+            setup = ("" if setup == "pass" else setup)
+            self._language = Language.CPP
+
+        else:
+            raise ValueError(f"Invalid language `{language}`.")
 
         # Convenience adjustment so that multi-line code snippets defined in
-        # functions do not IndentationError inside timeit.Timer. The leading
-        # newline removal is for the initial newline that appears when defining
-        # block strings. For instance:
+        # functions do not IndentationError (Python) or look odd (C++). The
+        # leading newline removal is for the initial newline that appears when
+        # defining block strings. For instance:
         #     textwrap.dedent("""
         #         print("This is a stmt")
         #     """)
@@ -158,15 +210,15 @@ def __init__(
         #     Stripping this down to 'print("This is a stmt")' doesn't change
         #     what gets executed, but it makes __repr__'s nicer.
         stmt = textwrap.dedent(stmt)
-        stmt = (stmt[1:] if stmt[0] == "\n" else stmt).rstrip()
+        stmt = (stmt[1:] if stmt and stmt[0] == "\n" else stmt).rstrip()
         setup = textwrap.dedent(setup)
-        setup = (setup[1:] if setup[0] == "\n" else setup).rstrip()
+        setup = (setup[1:] if setup and setup[0] == "\n" else setup).rstrip()
 
         self._timer = self._timer_cls(
             stmt=stmt,
             setup=setup,
             timer=timer,
-            globals=valgrind_timer_interface.CopyIfCallgrind.unwrap_all(globals),
+            globals=valgrind_timer_interface.CopyIfCallgrind.unwrap_all(self._globals),
         )
         self._task_spec = common.TaskSpec(
             stmt=stmt,
@@ -369,6 +421,9 @@ def collect_callgrind(
         if not isinstance(self._task_spec.stmt, str):
             raise ValueError("`collect_callgrind` currently only supports string `stmt`")
 
+        if self._language != Language.PYTHON:
+            raise NotImplementedError("C++ Callgrind is later in the stack.")
+
         # Check that the statement is valid. It doesn't guarantee success, but it's much
         # simpler and quicker to raise an exception for a faulty `stmt` or `setup` in
         # the parent process rather than the valgrind subprocess.
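Taken together, these changes let the existing `Timer` front end drive a JIT-built C++ payload. A hedged usage sketch (the C++ statement and tensor shape are illustrative; `globals` must be omitted for C++, and `setup` runs once before the timing loop):

from torch.utils.benchmark import Timer

cpp_timer = Timer(
    stmt="auto y = x * x;",                    # illustrative C++ statement
    setup="auto x = torch::ones({128, 128});",
    language="cpp",                            # or Language.CPP / "c++"
)
# Same return type as the Python path: a Measurement from 100 iterations.
print(cpp_timer.timeit(100))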
diff --git a/torch/utils/benchmark/utils/valgrind_wrapper/compat_bindings.cpp b/torch/utils/benchmark/utils/valgrind_wrapper/compat_bindings.cpp
new file mode 100644
index 000000000000..b52626fe76fd
--- /dev/null
+++ b/torch/utils/benchmark/utils/valgrind_wrapper/compat_bindings.cpp
@@ -0,0 +1,25 @@
+/* Used to collect profiles of old versions of PyTorch. */
+#include <callgrind.h>
+#include <pybind11/pybind11.h>
+
+
+bool _valgrind_supported_platform() {
+  #if defined(NVALGRIND)
+  return false;
+  #else
+  return true;
+  #endif
+}
+
+void _valgrind_toggle() {
+  #if defined(NVALGRIND)
+  TORCH_CHECK(false, "Valgrind is not supported.");
+  #else
+  CALLGRIND_TOGGLE_COLLECT;
+  #endif
+}
+
+PYBIND11_MODULE(callgrind_bindings, m) {
+  m.def("_valgrind_supported_platform", &_valgrind_supported_platform);
+  m.def("_valgrind_toggle", &_valgrind_toggle);
+}
diff --git a/torch/utils/benchmark/utils/valgrind_wrapper/compat_bindings.py b/torch/utils/benchmark/utils/valgrind_wrapper/compat_bindings.py
deleted file mode 100644
index b7404a653a20..000000000000
--- a/torch/utils/benchmark/utils/valgrind_wrapper/compat_bindings.py
+++ /dev/null
@@ -1,41 +0,0 @@
-"""Allow Timer.collect_callgrind to be used on earlier versions of PyTorch
-
-FIXME: Remove this module once we no longer need to back test.
-"""
-import os
-import textwrap
-from typing import List
-
-from torch.utils.cpp_extension import load_inline
-
-
-# load_inline will automatically search /usr/include, but not conda include.
-extra_include_paths: List[str] = []
-conda_prefix = os.getenv("CONDA_PREFIX")
-if conda_prefix is not None:
-    extra_include_paths = [os.path.join(conda_prefix, "include")]
-
-bindings = load_inline(
-    name="callgrind_bindings",
-    cpp_sources=textwrap.dedent("""
-    #include <callgrind.h>
-
-    bool _valgrind_supported_platform() {
-      #if defined(NVALGRIND)
-      return false;
-      #else
-      return true;
-      #endif
-    }
-
-    void _valgrind_toggle() {
-      #if defined(NVALGRIND)
-      TORCH_CHECK(false, "Valgrind is not supported.");
-      #else
-      CALLGRIND_TOGGLE_COLLECT;
-      #endif
-    }
-    """),
-    extra_include_paths=extra_include_paths,
-    functions=["_valgrind_supported_platform", "_valgrind_toggle"],
-)
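The JIT-built compat module is only reached when `torch._C` lacks the Callgrind hooks, and its call pattern from Python is tiny. A hedged sketch (guarded, since building it assumes the Valgrind/Callgrind headers are discoverable on the include path):

from torch.utils.benchmark.utils import cpp_jit

bindings = cpp_jit.get_compat_bindings()   # compiles compat_bindings.cpp on first use
if bindings._valgrind_supported_platform():
    bindings._valgrind_toggle()            # start collection
    ...                                    # run the statement being profiled under callgrind
    bindings._valgrind_toggle()            # stop collection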
diff --git a/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py b/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py
index bad9df90bcd2..b8513671beb9 100644
--- a/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py
+++ b/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py
@@ -11,13 +11,13 @@
 import sys
 import tempfile
 import textwrap
-from types import ModuleType
 from typing import (
     cast, Any, Callable, DefaultDict, Dict, Generator, List, NamedTuple,
     Optional, Tuple, Union, TYPE_CHECKING)
 
 import torch
-from torch.utils.benchmark.utils import common
+from torch.utils.benchmark.utils import common, cpp_jit
+from torch.utils.benchmark.utils._stubs import CallgrindModuleType
 
 
 __all__ = ["FunctionCount", "FunctionCounts", "CallgrindStats", "CopyIfCallgrind"]
@@ -444,17 +444,14 @@ def construct(self) -> str:
 
 class _ValgrindWrapper(object):
     def __init__(self) -> None:
-        self._bindings_module: Optional[ModuleType] = None
+        self._bindings_module: Optional[CallgrindModuleType] = None
         if hasattr(torch._C, "_valgrind_supported_platform"):
             self._supported_platform: bool = torch._C._valgrind_supported_platform()
 
         else:
             print("Callgrind bindings are not present in `torch._C`. JIT-ing bindings.")
-            # This import will JIT the Callgrind control bindings, so don't
-            # invoke unless we know we'll need it.
-            from torch.utils.benchmark.utils.valgrind_wrapper.compat_bindings import bindings
-            self._bindings_module = bindings
-            self._supported_platform = bindings._valgrind_supported_platform()
+            self._bindings_module = cpp_jit.get_compat_bindings()
+            self._supported_platform = self._bindings_module._valgrind_supported_platform()
 
         self._commands_available: Dict[str, bool] = {}
         if self._supported_platform:
@@ -643,7 +640,7 @@ def _construct_script(
         number: int,
         error_log: str,
         stat_log: str,
-        bindings: Optional[ModuleType],
+        bindings: Optional[CallgrindModuleType],
     ) -> str:
         # The naive template looks something like:
         #   "for _ in range({number}): {stmt}"
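`CallgrindModuleType`, `TimeitModuleType`, and `TimerClass` come from the new `_stubs` module, which is not shown in this diff; they exist so the JIT-built extension modules and the swappable timer class can be type-checked. A hypothetical sketch of what such stubs could look like, using `typing.Protocol` (3.8+) purely for brevity; everything below beyond the three imported names is an assumption, not the actual file:

from typing import Any, Callable, Dict, Protocol, runtime_checkable


class TimerClass(Protocol):
    """Instance surface shared by `timeit.Timer` and `CPPTimer` (hypothetical stub)."""
    def __init__(
        self,
        stmt: str,
        setup: str,
        timer: Callable[[], float],
        globals: Dict[str, Any],
    ) -> None:
        ...

    def timeit(self, number: int) -> float:
        ...


class CallgrindModuleType(Protocol):
    """Surface of the JIT-built `callgrind_bindings` extension (hypothetical stub)."""
    def _valgrind_supported_platform(self) -> bool:
        ...

    def _valgrind_toggle(self) -> None:
        ...


@runtime_checkable
class TimeitModuleType(Protocol):
    """Surface of a module built from `timeit_template.cpp` (hypothetical stub)."""
    def timeit(self, number: int) -> float:
        ...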
diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
index a2f47744c5f3..993b04ca23d8 100644
--- a/torch/utils/cpp_extension.py
+++ b/torch/utils/cpp_extension.py
@@ -23,6 +23,14 @@
 
 IS_WINDOWS = sys.platform == 'win32'
 
+LIB_EXT = '.pyd' if IS_WINDOWS else '.so'
+EXEC_EXT = '.exe' if IS_WINDOWS else ''
+SHARED_FLAG = '/DLL' if IS_WINDOWS else '-shared'
+
+_HERE = os.path.abspath(__file__)
+_TORCH_PATH = os.path.dirname(os.path.dirname(_HERE))
+TORCH_LIB_PATH = os.path.join(_TORCH_PATH, 'lib')
+
 
 def _find_cuda_home() -> Optional[str]:
     r'''Finds the CUDA install path.'''
@@ -400,7 +408,7 @@ def unix_cuda_flags(cflags):
     # overriding the option if the user explicitly passed it.
     _ccbin = os.getenv("CC")
     if (
-        _ccbin is not None
+        _ccbin is not None
         and not any([flag.startswith('-ccbin') or flag.startswith('--compiler-bindir') for flag in cflags])
     ):
         cflags.extend(['-ccbin', _ccbin])
@@ -848,9 +856,7 @@ def include_paths(cuda: bool = False) -> List[str]:
     Returns:
         A list of include path strings.
     '''
-    here = os.path.abspath(__file__)
-    torch_path = os.path.dirname(os.path.dirname(here))
-    lib_include = os.path.join(torch_path, 'include')
+    lib_include = os.path.join(_TORCH_PATH, 'include')
     paths = [
         lib_include,
         # Remove this once torch/torch.h is officially no longer supported for C++ extensions.
@@ -886,13 +892,8 @@ def library_paths(cuda: bool = False) -> List[str]:
     Returns:
         A list of library path strings.
     '''
-    paths = []
-
     # We need to link against libtorch.so
-    here = os.path.abspath(__file__)
-    torch_path = os.path.dirname(os.path.dirname(here))
-    lib_path = os.path.join(torch_path, 'lib')
-    paths.append(lib_path)
+    paths = [TORCH_LIB_PATH]
 
     if cuda and IS_HIP_EXTENSION:
         lib_dir = 'lib'
@@ -925,6 +926,7 @@ def load(name,
          verbose=False,
          with_cuda: Optional[bool] = None,
          is_python_module=True,
+         is_standalone=False,
          keep_intermediates=True):
     r'''
     Loads a PyTorch C++ extension just-in-time (JIT).
@@ -979,14 +981,23 @@ def load(name,
             ``.cuh`` in ``sources``. Set it to `True`` to force CUDA headers
             and libraries to be included.
         is_python_module: If ``True`` (default), imports the produced shared
-            library as a Python module. If ``False``, loads it into the process
-            as a plain dynamic library.
+            library as a Python module. If ``False``, behavior depends on
+            ``is_standalone``.
+        is_standalone: If ``False`` (default) loads the constructed extension
+            into the process as a plain dynamic library. If ``True``, build a
+            standalone executable.
 
     Returns:
-        If ``is_python_module`` is ``True``, returns the loaded PyTorch
-        extension as a Python module. If ``is_python_module`` is ``False``
-        returns nothing (the shared library is loaded into the process as a side
-        effect).
+        If ``is_python_module`` is ``True``:
+            Returns the loaded PyTorch extension as a Python module.
+
+        If ``is_python_module`` is ``False`` and ``is_standalone`` is ``False``:
+            Returns nothing. (The shared library is loaded into the process as
+            a side effect.)
+
+        If ``is_standalone`` is ``True``:
+            Returns the path to the executable. (On Windows, TORCH_LIB_PATH is
+            added to the PATH environment variable as a side effect.)
 
     Example:
         >>> from torch.utils.cpp_extension import load
@@ -1007,6 +1018,7 @@ def load(name,
         verbose,
         with_cuda,
         is_python_module,
+        is_standalone,
        keep_intermediates=keep_intermediates)
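The new flag surfaces directly in the public `load` API. A hedged usage sketch of the standalone path (file names are illustrative; the source must provide its own `main`, since no Python bindings are generated in this mode):

import subprocess
from torch.utils.cpp_extension import load

# Build a self-contained binary instead of a Python extension module.
exe_path = load(
    name="my_benchmark_main",           # illustrative
    sources=["my_benchmark_main.cpp"],  # must define main()
    is_python_module=False,
    is_standalone=True,
)
subprocess.run([exe_path], check=True)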
@@ -1155,6 +1167,7 @@ def load_inline(name,
         verbose,
         with_cuda,
         is_python_module,
+        is_standalone=False,
         keep_intermediates=keep_intermediates)
 
 
@@ -1168,7 +1181,11 @@ def _jit_compile(name,
                  verbose: bool,
                  with_cuda: Optional[bool],
                  is_python_module,
+                 is_standalone,
                  keep_intermediates=True) -> None:
+    if is_python_module and is_standalone:
+        raise ValueError("`is_python_module` and `is_standalone` are mutually exclusive.")
+
     if with_cuda is None:
         with_cuda = any(map(_is_cuda_file, sources))
     with_cudnn = any(['cudnn' in f for f in extra_ldflags or []])
@@ -1178,7 +1195,9 @@ def _jit_compile(name,
             sources,
             build_arguments=[extra_cflags, extra_cuda_cflags, extra_ldflags, extra_include_paths],
             build_directory=build_directory,
-            with_cuda=with_cuda
+            with_cuda=with_cuda,
+            is_python_module=is_python_module,
+            is_standalone=is_standalone,
         )
         if version > 0:
             if version != old_version and verbose:
@@ -1210,7 +1229,8 @@ def _jit_compile(name,
                 extra_include_paths=extra_include_paths or [],
                 build_directory=build_directory,
                 verbose=verbose,
-                with_cuda=with_cuda)
+                with_cuda=with_cuda,
+                is_standalone=is_standalone)
         finally:
             baton.release()
     else:
@@ -1221,6 +1241,10 @@ def _jit_compile(name,
 
     if verbose:
         print(f'Loading extension module {name}...')
+
+    if is_standalone:
+        return _get_exec_path(name, build_directory)
+
     return _import_module_from_library(name, build_directory, is_python_module)
@@ -1275,7 +1299,8 @@ def _write_ninja_file_and_build_library(
         extra_include_paths,
         build_directory: str,
         verbose: bool,
-        with_cuda: Optional[bool]) -> None:
+        with_cuda: Optional[bool],
+        is_standalone: bool = False) -> None:
     verify_ninja_availability()
     if IS_WINDOWS:
         compiler = os.environ.get('CXX', 'cl')
@@ -1287,7 +1312,8 @@ def _write_ninja_file_and_build_library(
     extra_ldflags = _prepare_ldflags(
         extra_ldflags or [],
         with_cuda,
-        verbose)
+        verbose,
+        is_standalone)
     build_file_path = os.path.join(build_directory, 'build.ninja')
     if verbose:
         print(f'Emitting ninja build file {build_file_path}...')
@@ -1301,7 +1327,8 @@ def _write_ninja_file_and_build_library(
         extra_cuda_cflags=extra_cuda_cflags or [],
         extra_ldflags=extra_ldflags or [],
         extra_include_paths=extra_include_paths or [],
-        with_cuda=with_cuda)
+        with_cuda=with_cuda,
+        is_standalone=is_standalone)
 
     if verbose:
         print(f'Building extension module {name}...')
@@ -1334,11 +1361,7 @@ def verify_ninja_availability():
         raise RuntimeError("Ninja is required to load C++ extensions")
 
 
-def _prepare_ldflags(extra_ldflags, with_cuda, verbose):
-    here = os.path.abspath(__file__)
-    torch_path = os.path.dirname(os.path.dirname(here))
-    lib_path = os.path.join(torch_path, 'lib')
-
+def _prepare_ldflags(extra_ldflags, with_cuda, verbose, is_standalone):
     if IS_WINDOWS:
         python_path = os.path.dirname(sys.executable)
         python_lib_path = os.path.join(python_path, 'libs')
@@ -1353,11 +1376,13 @@ def _prepare_ldflags(extra_ldflags, with_cuda, verbose):
             # Related issue: https://github.com/pytorch/pytorch/issues/31611
             extra_ldflags.append('-INCLUDE:?warp_size@cuda@at@@YAHXZ')
         extra_ldflags.append('torch.lib')
-        extra_ldflags.append('torch_python.lib')
-        extra_ldflags.append(f'/LIBPATH:{python_lib_path}')
-        extra_ldflags.append(f'/LIBPATH:{lib_path}')
+        extra_ldflags.append(f'/LIBPATH:{TORCH_LIB_PATH}')
+        if not is_standalone:
+            extra_ldflags.append('torch_python.lib')
+            extra_ldflags.append(f'/LIBPATH:{python_lib_path}')
+
     else:
-        extra_ldflags.append(f'-L{lib_path}')
+        extra_ldflags.append(f'-L{TORCH_LIB_PATH}')
         extra_ldflags.append('-lc10')
         if with_cuda:
             extra_ldflags.append('-lc10_hip' if IS_HIP_EXTENSION else '-lc10_cuda')
@@ -1365,7 +1390,11 @@ def _prepare_ldflags(extra_ldflags, with_cuda, verbose):
         if with_cuda:
             extra_ldflags.append('-ltorch_hip' if IS_HIP_EXTENSION else '-ltorch_cuda')
         extra_ldflags.append('-ltorch')
-        extra_ldflags.append('-ltorch_python')
+        if not is_standalone:
+            extra_ldflags.append('-ltorch_python')
+
+        if is_standalone:
+            extra_ldflags.append(f"-Wl,-rpath,{TORCH_LIB_PATH}")
 
     if with_cuda:
         if verbose:
@@ -1565,6 +1594,17 @@ def _run_ninja_build(build_directory: str, verbose: bool, error_prefix: str) ->
         raise RuntimeError(message) from e
 
 
+def _get_exec_path(module_name, path):
+    if IS_WINDOWS and TORCH_LIB_PATH not in os.getenv('PATH', '').split(';'):
+        torch_lib_in_path = any(
+            os.path.exists(p) and os.path.samefile(p, TORCH_LIB_PATH)
+            for p in os.getenv('PATH', '').split(';')
+        )
+        if not torch_lib_in_path:
+            os.environ['PATH'] = f"{TORCH_LIB_PATH};{os.getenv('PATH', '')}"
+    return os.path.join(path, f'{module_name}{EXEC_EXT}')
+
+
 def _import_module_from_library(module_name, path, is_python_module):
     # https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path
     file, path, description = imp.find_module(module_name, [path])
@@ -1583,7 +1623,8 @@ def _write_ninja_file_to_build_library(path,
                                        extra_cuda_cflags,
                                        extra_ldflags,
                                        extra_include_paths,
-                                       with_cuda) -> None:
+                                       with_cuda,
+                                       is_standalone) -> None:
     extra_cflags = [flag.strip() for flag in extra_cflags]
     extra_cuda_cflags = [flag.strip() for flag in extra_cuda_cflags]
     extra_ldflags = [flag.strip() for flag in extra_ldflags]
@@ -1603,8 +1644,10 @@ def _write_ninja_file_to_build_library(path,
         user_includes += system_includes
         system_includes.clear()
 
-    common_cflags = [f'-DTORCH_EXTENSION_NAME={name}']
-    common_cflags.append('-DTORCH_API_INCLUDE_EXTENSION_H')
+    common_cflags = []
+    if not is_standalone:
+        common_cflags.append(f'-DTORCH_EXTENSION_NAME={name}')
+        common_cflags.append('-DTORCH_API_INCLUDE_EXTENSION_H')
 
     # Note [Pybind11 ABI constants]
     #
@@ -1674,19 +1717,16 @@ def object_file_path(source_file: str) -> str:
         return target
 
     objects = [object_file_path(src) for src in sources]
+    ldflags = ([] if is_standalone else [SHARED_FLAG]) + extra_ldflags
 
-    if IS_WINDOWS:
-        ldflags = ['/DLL'] + extra_ldflags
-    else:
-        ldflags = ['-shared'] + extra_ldflags
     # The darwin linker needs explicit consent to ignore unresolved symbols.
     if sys.platform.startswith('darwin'):
         ldflags.append('-undefined dynamic_lookup')
     elif IS_WINDOWS:
         ldflags = _nt_quote_args(ldflags)
 
-    ext = 'pyd' if IS_WINDOWS else 'so'
-    library_target = f'{name}.{ext}'
+    ext = EXEC_EXT if is_standalone else LIB_EXT
+    library_target = f'{name}{ext}'
 
     _write_ninja_file(
         path=path,
diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py
index 8d7726ebd129..d1025c02cc9b 100644
--- a/torch/utils/data/dataloader.py
+++ b/torch/utils/data/dataloader.py
@@ -618,46 +618,72 @@ class _MultiProcessingDataLoaderIter(_BaseDataLoaderIter):
     #      simple things like acquiring an internal lock of a queue may hang.
     #      Therefore, in this case, we actually need to prevent `__del__` from
     #      being executed, and rely on the automatic termination of daemonic
-    #      children. Thus, we register an `atexit` hook that sets a global flag
+    #      children.
+    #
+    #      Thus, we register an `atexit` hook that sets a global flag
     #      `_utils.python_exit_status`. Since `atexit` hooks are executed in the
     #      reverse order of registration, we are guaranteed that this flag is
-    #      set before library resources we use are freed. (Hooks freeing those
-    #      resources are registered at importing the Python core libraries at
-    #      the top of this file.) So in `__del__`, we check if
-    #      `_utils.python_exit_status` is set or `None` (freed), and perform
-    #      no-op if so.
+    #      set before library resources we use are freed (which, at least in
+    #      CPython, is done via an `atexit` handler defined in
+    #      `multiprocessing/util.py`
+    #      https://github.com/python/cpython/blob/c606624af8d4cb3b4a052fb263bb983b3f87585b/Lib/multiprocessing/util.py#L320-L362
+    #      registered when an object requiring this mechanism is first
+    #      created, e.g., `mp.Queue`
+    #      https://github.com/python/cpython/blob/c606624af8d4cb3b4a052fb263bb983b3f87585b/Lib/multiprocessing/context.py#L100-L103
+    #      https://github.com/python/cpython/blob/c606624af8d4cb3b4a052fb263bb983b3f87585b/Lib/multiprocessing/queues.py#L29
+    #      )
+    #
+    #      So in `__del__`, we check if `_utils.python_exit_status` is set or
+    #      `None` (freed), and perform no-op if so.
+    #
+    #      However, simply letting library clean-up code run can also be bad,
+    #      because such code (i.e., `multiprocessing.util._exit_function()`)
+    #      includes joining the putting threads of `mp.Queue`, which can be blocking.
+    #      Hence, for the queues that the main process puts into, we call
+    #      `cancel_join_thread` at creation.  See the later section
+    #      [ 3b. A process won't hang when putting into a queue; ]
+    #      for more details.
+    #
+    #      Here are two example cases where library clean-up code can run
+    #      before `__del__` is called:
     #
-    #      Another problem with `__del__` is also related to the library cleanup
-    #      calls. When a process ends, it shuts the all its daemonic children
-    #      down with a SIGTERM (instead of joining them without a timeout).
-    #      Simiarly for threads, but by a different mechanism. This fact,
-    #      together with a few implementation details of multiprocessing, forces
-    #      us to make workers daemonic. All of our problems arise when a
-    #      DataLoader is used in a subprocess, and are caused by multiprocessing
-    #      code which looks more or less like this:
+    #      1. If we hold onto a reference to the iterator, it more often
+    #         than not tries to do `multiprocessing` library cleaning before
+    #         clearing the alive referenced objects (https://github.com/pytorch/pytorch/issues/48666)
+    #         and thus prevents our cleaning-up code from running first.
     #
-    #          try:
-    #              your_function_using_a_dataloader()
-    #          finally:
-    #              multiprocessing.util._exit_function()
+    #      2. A similar issue arises when a `DataLoader` is used in a subprocess.
+    #         When a process ends, it shuts all its daemonic children
+    #         down with a SIGTERM (instead of joining them without a timeout).
+    #         Similarly for threads, but by a different mechanism. This fact,
+    #         together with a few implementation details of multiprocessing, forces
+    #         us to make workers daemonic. All of our problems arise when a
+    #         DataLoader is used in a subprocess, and are caused by multiprocessing
+    #         code which looks more or less like this:
     #
-    #      The joining/termination mentioned above happens inside
-    #      `_exit_function()`. Now, if `your_function_using_a_dataloader()`
-    #      throws, the stack trace stored in the exception will prevent the
-    #      frame which uses `DataLoaderIter` to be freed. If the frame has any
-    #      reference to the `DataLoaderIter` (e.g., in a method of the iter),
-    #      its `__del__`, which starts the shutdown procedure, will not be
-    #      called. That, in turn, means that workers aren't notified. Attempting
-    #      to join in `_exit_function` will then result in a hang.
+    #            try:
+    #                your_function_using_a_dataloader()
+    #            finally:
+    #                multiprocessing.util._exit_function()
     #
-    #      For context, `_exit_function` is also registered as an `atexit` call.
-    #      So it is unclear to me (@ssnl) why this is needed in a finally block.
-    #      The code dates back to 2008 and there is no comment on the original
-    #      PEP 371 or patch https://bugs.python.org/issue3050 (containing both
-    #      the finally block and the `atexit` registration) that explains this.
+    #         The joining/termination mentioned above happens inside
+    #         `_exit_function()`. Now, if `your_function_using_a_dataloader()`
+    #         throws, the stack trace stored in the exception will prevent the
+    #         frame which uses `DataLoaderIter` from being freed. If the frame has any
+    #         reference to the `DataLoaderIter` (e.g., in a method of the iter),
+    #         its `__del__`, which starts the shutdown procedure, will not be
+    #         called. That, in turn, means that workers aren't notified. Attempting
+    #         to join in `_exit_function` will then result in a hang.
    #
-    #      Another choice is to just shutdown workers with logic in 1 above
-    #      whenever we see an error in `next`. This isn't ideal because
+    #         For context, `_exit_function` is also registered as an `atexit` call.
+    #         So it is unclear to me (@ssnl) why this is needed in a finally block.
+    #         The code dates back to 2008 and there is no comment on the original
+    #         PEP 371 or patch https://bugs.python.org/issue3050 (containing both
+    #         the finally block and the `atexit` registration) that explains this.
+    #
+    #
+    #      Finally, another choice is to just shutdown workers with logic in 1
+    #      above whenever we see an error in `next`. This isn't ideal because
     #        a. It prevents users from using try-catch to resume data loading.
     #        b. It doesn't prevent hanging if users have references to the
    #           iterator.
@@ -705,30 +731,33 @@ class _MultiProcessingDataLoaderIter(_BaseDataLoaderIter):
     #           We use `mp.Queue` which has a separate background thread to put
     #           objects from an unbounded buffer array. The background thread is
     #           daemonic and usually automatically joined when the process
-    #           exits.
+    #           *exits*.
     #
-    #           However, in case that the receiver has ended abruptly while
-    #           reading from the pipe, the join will hang forever. Therefore,
-    #           for both `worker_result_queue` (worker -> main process/pin_memory_thread)
-    #           and each `index_queue` (main process -> worker), we use
-    #           `q.cancel_join_thread()` in sender process before any `q.put` to
-    #           prevent this automatic join.
-    #
-    #           Moreover, having all queues called `cancel_join_thread` makes
-    #           implementing graceful shutdown logic in `__del__` much easier.
-    #           It won't need to get from any queue, which would also need to be
-    #           guarded by periodic status checks.
+    #           If the receiver has ended abruptly while
+    #           reading from the pipe, the join will hang forever. The usual
+    #           solution for this in Python is calling `q.cancel_join_thread`,
+    #           which prevents automatically joining it when finalizing
+    #           (exiting).
     #
     #           Nonetheless, `cancel_join_thread` must only be called when the
     #           queue is **not** going to be read from or write into by another
     #           process, because it may hold onto a lock or leave corrupted data
     #           in the queue, leading other readers/writers to hang.
     #
-    #           `pin_memory_thread`'s `data_queue` is a `queue.Queue` that does
-    #           a blocking `put` if the queue is full. So there is no above
-    #           problem, but we do need to wrap the `put` in a loop that breaks
-    #           not only upon success, but also when the main process stops
-    #           reading, i.e., is shutting down.
+    #           Hence,
+    #             + For worker processes, we only do so (for their output
+    #               queues, i.e., `worker_result_queue`) before exiting.
+    #             + For `pin_memory_thread`, its output queue `data_queue` is a
+    #               `queue.Queue` that does blocking `put` if the queue is full.
+    #               So there is no above problem, but as a result, in
+    #               `_pin_memory_loop`, we do need to wrap the `put` in a loop
+    #               that breaks not only upon success, but also when the main
+    #               process stops reading, i.e., is shutting down.
+    #             + For the loader process, we `cancel_join_thread()` for all
+    #               `_index_queues` because the whole purpose of workers and
+    #               `pin_memory_thread` is to serve the loader process.  If
+    #               the loader process is already exiting, we don't really care if
+    #               the queues are corrupted.
     #
     #
     #      Now let's get back to 1:
@@ -867,7 +896,9 @@ def __init__(self, loader):
         for i in range(self._num_workers):
             # No certainty which module multiprocessing_context is
             index_queue = multiprocessing_context.Queue()  # type: ignore
-            # index_queue.cancel_join_thread()
+            # Need to `cancel_join_thread` here!
+            # See sections (2) and (3b) above.
+            index_queue.cancel_join_thread()
             w = multiprocessing_context.Process(
                 target=_utils.worker._worker_loop,
                 args=(self._dataset_kind, self._dataset, index_queue,
@@ -1234,6 +1265,9 @@ def _shutdown_workers(self):
         if not self._shutdown:
             self._shutdown = True
             try:
+                # Normal exit when last reference is gone / iterator is depleted.
+                # See (1) and the second half of the note.
+
                 # Exit `pin_memory_thread` first because exiting workers may leave
                 # corrupted data in `worker_result_queue` which `pin_memory_thread`
                 # reads from.
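The behavioral change in this file is the un-commented `index_queue.cancel_join_thread()`. A minimal, self-contained toy sketch (not DataLoader code) of the failure mode it avoids: a process that has put items into an `mp.Queue` whose reader never drains it can block at interpreter shutdown while joining the queue's feeder thread.

import multiprocessing as mp

def main() -> None:
    q = mp.Queue()
    # The sender owns the queue's feeder thread. Cancelling the join means
    # interpreter shutdown will not wait for the feeder thread to flush its
    # buffer, even if the receiving end died or never reads.
    q.cancel_join_thread()
    for i in range(10_000):
        q.put(i)
    # Without cancel_join_thread(), exiting here could hang inside
    # multiprocessing.util._exit_function() waiting on the feeder thread.

if __name__ == "__main__":
    main()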
diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py
index 9ad9c4b82e9d..8c1eb599a714 100644
--- a/torch/utils/hipify/cuda_to_hip_mappings.py
+++ b/torch/utils/hipify/cuda_to_hip_mappings.py
@@ -8103,7 +8103,7 @@
         ("setCurrentCUDAStream", ("setCurrentHIPStream", API_C10)),
         ("cuda::CUDACachingAllocator", ("hip::HIPCachingAllocator", API_C10)),
         ("CUDACachingAllocator", ("HIPCachingAllocator", API_C10)),
-        ("TORCH_CUDA_KERNEL_LAUNCH_CHECK", ("TORCH_HIP_KERNEL_LAUNCH_CHECK", API_C10))
+        ("C10_CUDA_KERNEL_LAUNCH_CHECK", ("C10_HIP_KERNEL_LAUNCH_CHECK", API_C10))
     ]
 )
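For context, entries in this table are plain source-to-source substitutions applied by the hipify tooling; the driver code is not part of this diff, so the loop below is only an illustrative sketch, under the assumption that longer patterns must be applied before shorter ones:

from typing import Dict, Tuple

# Illustrative subset of the mapping touched above.
C10_MAPPINGS: Dict[str, Tuple[str, str]] = {
    "C10_CUDA_KERNEL_LAUNCH_CHECK": ("C10_HIP_KERNEL_LAUNCH_CHECK", "API_C10"),
    "cuda::CUDACachingAllocator": ("hip::HIPCachingAllocator", "API_C10"),
}

def hipify_source(src: str) -> str:
    # Longest keys first so a shorter pattern never clobbers a longer one.
    for cuda_name, (hip_name, _api) in sorted(C10_MAPPINGS.items(), key=lambda kv: -len(kv[0])):
        src = src.replace(cuda_name, hip_name)
    return src

print(hipify_source("kernel<<<1, 1>>>(args...); C10_CUDA_KERNEL_LAUNCH_CHECK();"))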