diff --git a/.circleci/cimodel/data/pytorch_build_data.py b/.circleci/cimodel/data/pytorch_build_data.py index 8b129533765a..4339d26a6d3c 100644 --- a/.circleci/cimodel/data/pytorch_build_data.py +++ b/.circleci/cimodel/data/pytorch_build_data.py @@ -93,7 +93,7 @@ ]), ]), ("rocm", [ - ("3.9", [ + ("3.7", [ ("3.6", [ ('build_only', [XImportant(True)]), ]), diff --git a/.circleci/cimodel/data/simple/docker_definitions.py b/.circleci/cimodel/data/simple/docker_definitions.py index 9ba9fb0a8c0c..91f757207915 100644 --- a/.circleci/cimodel/data/simple/docker_definitions.py +++ b/.circleci/cimodel/data/simple/docker_definitions.py @@ -29,6 +29,7 @@ "pytorch-linux-xenial-py3.6-gcc5.4", # this one is used in doc builds "pytorch-linux-xenial-py3.6-gcc7.2", "pytorch-linux-xenial-py3.6-gcc7", + "pytorch-linux-bionic-rocm3.7-py3.6", "pytorch-linux-bionic-rocm3.8-py3.6", "pytorch-linux-bionic-rocm3.9-py3.6", ] diff --git a/.circleci/config.yml b/.circleci/config.yml index 59a26f30d1ba..f4b694afe964 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -453,8 +453,12 @@ jobs: no_output_timeout: "1h" command: | set -e - if [[ "${DOCKER_IMAGE}" == *rocm3.9* ]]; then - export DOCKER_TAG="f3d89a32912f62815e4feaeed47e564e887dffd6" + # TODO: Remove this after we figure out why rocm tests are failing + if [[ "${DOCKER_IMAGE}" == *rocm3.5* ]]; then + export DOCKER_TAG="ab1632df-fa59-40e6-8c23-98e004f61148" + fi + if [[ "${DOCKER_IMAGE}" == *rocm3.7* ]]; then + export DOCKER_TAG="1045c7b891104cb4fd23399eab413b6213e48aeb" fi if [[ ${BUILD_ENVIRONMENT} == *"pure_torch"* ]]; then echo 'BUILD_CAFFE2=OFF' >> "${BASH_ENV}" @@ -534,8 +538,12 @@ jobs: command: | set -e export PYTHONUNBUFFERED=1 - if [[ "${DOCKER_IMAGE}" == *rocm3.9* ]]; then - export DOCKER_TAG="f3d89a32912f62815e4feaeed47e564e887dffd6" + # TODO: Remove this after we figure out why rocm tests are failing + if [[ "${DOCKER_IMAGE}" == *rocm3.5* ]]; then + export DOCKER_TAG="ab1632df-fa59-40e6-8c23-98e004f61148" + fi + if [[ "${DOCKER_IMAGE}" == *rocm3.7* ]]; then + export DOCKER_TAG="1045c7b891104cb4fd23399eab413b6213e48aeb" fi # See Note [Special build images] output_image=${DOCKER_IMAGE}:${DOCKER_TAG}-${CIRCLE_SHA1} @@ -7272,6 +7280,9 @@ workflows: - docker_build_job: name: "docker-pytorch-linux-xenial-py3.6-gcc7" image_name: "pytorch-linux-xenial-py3.6-gcc7" + - docker_build_job: + name: "docker-pytorch-linux-bionic-rocm3.7-py3.6" + image_name: "pytorch-linux-bionic-rocm3.7-py3.6" - docker_build_job: name: "docker-pytorch-linux-bionic-rocm3.8-py3.6" image_name: "pytorch-linux-bionic-rocm3.8-py3.6" @@ -7702,11 +7713,11 @@ workflows: docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-py3.8-gcc9" resource_class: large - pytorch_linux_build: - name: pytorch_linux_bionic_rocm3_9_py3_6_build + name: pytorch_linux_bionic_rocm3_7_py3_6_build requires: - - "docker-pytorch-linux-bionic-rocm3.9-py3.6" - build_environment: "pytorch-linux-bionic-rocm3.9-py3.6-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm3.9-py3.6" + - "docker-pytorch-linux-bionic-rocm3.7-py3.6" + build_environment: "pytorch-linux-bionic-rocm3.7-py3.6-build" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-rocm3.7-py3.6" resource_class: xlarge - pytorch_macos_10_13_py3_build: name: pytorch_macos_10_13_py3_build diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh index 150e2bb9f380..019c7f6e9d1c 100755 --- a/.circleci/docker/build.sh +++ 
b/.circleci/docker/build.sh @@ -274,6 +274,13 @@ case "$image" in VISION=yes KATEX=yes ;; + pytorch-linux-bionic-rocm3.7-py3.6) + ANACONDA_PYTHON_VERSION=3.6 + PROTOBUF=yes + DB=yes + VISION=yes + ROCM_VERSION=3.7 + ;; pytorch-linux-bionic-rocm3.8-py3.6) ANACONDA_PYTHON_VERSION=3.6 PROTOBUF=yes diff --git a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml index 6c978987f779..a5876c3af738 100644 --- a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml +++ b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml @@ -15,8 +15,12 @@ jobs: no_output_timeout: "1h" command: | set -e - if [[ "${DOCKER_IMAGE}" == *rocm3.9* ]]; then - export DOCKER_TAG="f3d89a32912f62815e4feaeed47e564e887dffd6" + # TODO: Remove this after we figure out why rocm tests are failing + if [[ "${DOCKER_IMAGE}" == *rocm3.5* ]]; then + export DOCKER_TAG="ab1632df-fa59-40e6-8c23-98e004f61148" + fi + if [[ "${DOCKER_IMAGE}" == *rocm3.7* ]]; then + export DOCKER_TAG="1045c7b891104cb4fd23399eab413b6213e48aeb" fi if [[ ${BUILD_ENVIRONMENT} == *"pure_torch"* ]]; then echo 'BUILD_CAFFE2=OFF' >> "${BASH_ENV}" @@ -96,8 +100,12 @@ jobs: command: | set -e export PYTHONUNBUFFERED=1 - if [[ "${DOCKER_IMAGE}" == *rocm3.9* ]]; then - export DOCKER_TAG="f3d89a32912f62815e4feaeed47e564e887dffd6" + # TODO: Remove this after we figure out why rocm tests are failing + if [[ "${DOCKER_IMAGE}" == *rocm3.5* ]]; then + export DOCKER_TAG="ab1632df-fa59-40e6-8c23-98e004f61148" + fi + if [[ "${DOCKER_IMAGE}" == *rocm3.7* ]]; then + export DOCKER_TAG="1045c7b891104cb4fd23399eab413b6213e48aeb" fi # See Note [Special build images] output_image=${DOCKER_IMAGE}:${DOCKER_TAG}-${CIRCLE_SHA1} diff --git a/.gitignore b/.gitignore index 3d2e85be977f..d1f06437acee 100644 --- a/.gitignore +++ b/.gitignore @@ -93,6 +93,8 @@ torch/lib64 torch/include/ torch/share/ torch/test/ +torch/utils/benchmark/utils/valgrind_wrapper/callgrind.h +torch/utils/benchmark/utils/valgrind_wrapper/valgrind.h torch/version.py # Root level file used in CI to specify certain env configs. 
# E.g., see .circleci/config.yaml diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh index 0549e9aa78a4..56ce8d525f89 100755 --- a/.jenkins/caffe2/build.sh +++ b/.jenkins/caffe2/build.sh @@ -265,7 +265,7 @@ fi ############################################################################### # Install ONNX into a local directory -pip install --user -b /tmp/pip_install_onnx "file://${ROOT_DIR}/third_party/onnx#egg=onnx" +pip install --user "file://${ROOT_DIR}/third_party/onnx#egg=onnx" report_compile_cache_stats diff --git a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat index c7d60bedafd7..d4821c1b1a8d 100644 --- a/.jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat +++ b/.jenkins/pytorch/win-test-helpers/installation-helpers/install_magma.bat @@ -9,10 +9,10 @@ if "%CUDA_SUFFIX%" == "" ( if "%REBUILD%"=="" ( if "%BUILD_ENVIRONMENT%"=="" ( - curl --retry 3 -k https://s3.amazonaws.com/ossci-windows/magma_2.5.3_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --output %TMP_DIR_WIN%\magma_2.5.3_%CUDA_SUFFIX%_%BUILD_TYPE%.7z + curl --retry 3 -k https://s3.amazonaws.com/ossci-windows/magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --output %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z ) else ( - aws s3 cp s3://ossci-windows/magma_2.5.3_%CUDA_SUFFIX%_%BUILD_TYPE%.7z %TMP_DIR_WIN%\magma_2.5.3_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --quiet + aws s3 cp s3://ossci-windows/magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z --quiet ) - 7z x -aoa %TMP_DIR_WIN%\magma_2.5.3_%CUDA_SUFFIX%_%BUILD_TYPE%.7z -o%TMP_DIR_WIN%\magma + 7z x -aoa %TMP_DIR_WIN%\magma_2.5.4_%CUDA_SUFFIX%_%BUILD_TYPE%.7z -o%TMP_DIR_WIN%\magma ) set MAGMA_HOME=%TMP_DIR_WIN%\magma diff --git a/BUILD.bazel b/BUILD.bazel index 218d3b2ebcb7..76afe6aec1ea 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -131,6 +131,7 @@ genrule( "aten/src/ATen/RegisterQuantizedCPU.cpp", "aten/src/ATen/RegisterSparseCPU.cpp", "aten/src/ATen/RegisterMath.cpp", + "aten/src/ATen/RegisterMeta.cpp", "aten/src/ATen/RegisterDefaultBackend.cpp", "aten/src/ATen/RegisterSchema.cpp", "aten/src/ATen/Functions.h", diff --git a/aten/src/ATen/LegacyTHFunctionsCUDA.h b/aten/src/ATen/LegacyTHFunctionsCUDA.h index dd6dae47d14f..df51c071c418 100644 --- a/aten/src/ATen/LegacyTHFunctionsCUDA.h +++ b/aten/src/ATen/LegacyTHFunctionsCUDA.h @@ -36,12 +36,6 @@ std::tuple _th_topk(const Tensor & self, int64_t k, int64_t dim, Tensor & _th_renorm_out(Tensor & result, const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm); Tensor _th_renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm); Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm); -Tensor & _th_fmod_out(Tensor & result, const Tensor & self, Scalar other); -Tensor _th_fmod(const Tensor & self, Scalar other); -Tensor & _th_fmod_out(Tensor & result, const Tensor & self, const Tensor & other); -Tensor _th_fmod(const Tensor & self, const Tensor & other); -Tensor & _th_fmod_(Tensor & self, Scalar other); -Tensor & _th_fmod_(Tensor & self, const Tensor & other); Tensor & _th_cross_kernel_out(Tensor & result, const Tensor & self, const Tensor & other, int64_t dim); Tensor _th_cross_kernel(const Tensor & self, const Tensor & other, int64_t dim); std::tuple _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A); diff --git a/aten/src/ATen/TensorIndexing.h b/aten/src/ATen/TensorIndexing.h index 8e3d2d579e91..a2bdc24ff51c 
100644 --- a/aten/src/ATen/TensorIndexing.h +++ b/aten/src/ATen/TensorIndexing.h @@ -5,6 +5,10 @@ #include #include +// TODO: try to remove this +// There is some back story, see https://github.com/pytorch/pytorch/issues/48684 +#include + namespace at { namespace indexing { diff --git a/aten/src/ATen/TensorMeta.h b/aten/src/ATen/TensorMeta.h index abca65feda17..59a7dc740175 100644 --- a/aten/src/ATen/TensorMeta.h +++ b/aten/src/ATen/TensorMeta.h @@ -3,8 +3,24 @@ #include // TODO: improve // #include +#include +#include +#include + namespace at { +namespace impl { + +struct MetaBase { + virtual void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) = 0; + void set_output(IntArrayRef sizes, TensorOptions options) { + set_output(0, sizes, {}, options, {}); + } + virtual ~MetaBase() {} +}; + +} // namespace impl + struct TensorMeta { DimVector sizes; // TODO: DimVector strides; @@ -14,6 +30,11 @@ struct TensorMeta { : sizes(_sizes), options(_options) {} }; +inline Tensor meta_tensor_from_meta(const TensorMeta& meta) { + // TODO: eliminate indirection + return at::empty_meta(meta.sizes, meta.options); +} + inline Tensor tensor_from_meta(const TensorMeta& meta) { // TODO: eliminate indirection return at::empty(meta.sizes, meta.options); diff --git a/aten/src/ATen/Version.cpp b/aten/src/ATen/Version.cpp index 5fd07d892630..192e131897c8 100644 --- a/aten/src/ATen/Version.cpp +++ b/aten/src/ATen/Version.cpp @@ -185,4 +185,15 @@ std::string show_config() { return ss.str(); } +std::string get_cxx_flags() { + #if defined(FBCODE_CAFFE2) + TORCH_CHECK( + false, + "Buck does not populate the `CXX_FLAGS` field of Caffe2 build options. " + "As a result, `get_cxx_flags` is OSS only." + ); + #endif + return caffe2::GetBuildOptions().at("CXX_FLAGS"); +} + } diff --git a/aten/src/ATen/Version.h b/aten/src/ATen/Version.h index 18fd31d3ed87..3ac7a5858ace 100644 --- a/aten/src/ATen/Version.h +++ b/aten/src/ATen/Version.h @@ -11,4 +11,6 @@ CAFFE2_API std::string get_mkldnn_version(); CAFFE2_API std::string get_openmp_version(); +CAFFE2_API std::string get_cxx_flags(); + } // namespace at diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp index 0165c53ac60d..82421f49de1e 100644 --- a/aten/src/ATen/cuda/CublasHandlePool.cpp +++ b/aten/src/ATen/cuda/CublasHandlePool.cpp @@ -50,6 +50,15 @@ cublasHandle_t getCurrentCUDABlasHandle() { } else { TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); } +#endif +#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION >= 308 + rocblas_atomics_mode rocblas_mode; + if (at::globalContext().deterministic()) { + rocblas_mode = rocblas_atomics_not_allowed; + } else { + rocblas_mode = rocblas_atomics_allowed; + } + TORCH_CUDABLAS_CHECK(rocblas_set_atomics_mode(handle, rocblas_mode)); #endif return handle; } diff --git a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp index 6b6974cda1e9..878c8fb3d8a1 100644 --- a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp +++ b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp @@ -1045,372 +1045,6 @@ Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm) { } return self; } -Tensor & _th_fmod_out(Tensor & result, const Tensor & self, Scalar other) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Byte: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, 
DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toByte(); - THCudaByteTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Char: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toChar(); - THCudaCharTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Double: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toDouble(); - THCudaDoubleTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Float: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toFloat(); - THCudaTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Int: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toInt(); - THCudaIntTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Long: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toLong(); - THCudaLongTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Short: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toShort(); - THCudaShortTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Half: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toHalf(); - THCudaHalfTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - default: - AT_ERROR("_th_fmod_out not supported on CUDAType for ", dispatch_scalar_type); - } - return result; -} -Tensor _th_fmod(const Tensor & self, Scalar other) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, 
scalarTypeToTypeMeta(dispatch_scalar_type)).release(); - auto result = Tensor(c10::intrusive_ptr::reclaim(result_)); - switch (dispatch_scalar_type) { - case ScalarType::Byte: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toByte(); - THCudaByteTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Char: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toChar(); - THCudaCharTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toDouble(); - THCudaDoubleTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toFloat(); - THCudaTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Int: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toInt(); - THCudaIntTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Long: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toLong(); - THCudaLongTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Short: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toShort(); - THCudaShortTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Half: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toHalf(); - THCudaHalfTensor_fmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - default: - AT_ERROR("_th_fmod not supported on CUDAType for ", dispatch_scalar_type); - } - return result; -} -Tensor & _th_fmod_out(Tensor & result, const Tensor & self, const Tensor & other) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Byte: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaByteTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Char: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); 
- THCudaCharTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Double: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaDoubleTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Float: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Int: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaIntTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Long: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaLongTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Short: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaShortTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Half: { - auto result_ = checked_dense_tensor_unwrap(result, "result", 0, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod_out", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaHalfTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - default: - AT_ERROR("_th_fmod_out not supported on CUDAType for ", dispatch_scalar_type); - } - return result; -} -Tensor _th_fmod(const Tensor & self, const Tensor & other) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - auto result_ = c10::make_intrusive(c10::Storage(c10::Storage::use_byte_size_t(), 0, allocator(), true),DispatchKey::CUDA, scalarTypeToTypeMeta(dispatch_scalar_type)).release(); - auto 
result = Tensor(c10::intrusive_ptr::reclaim(result_)); - switch (dispatch_scalar_type) { - case ScalarType::Byte: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaByteTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Char: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaCharTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaDoubleTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Int: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaIntTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Long: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaLongTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Short: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaShortTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - case ScalarType::Half: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 2, "_th_fmod", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaHalfTensor_cfmod(globalContext().getTHCState(), result_, self_, other_); - break; - } - default: - AT_ERROR("_th_fmod not supported on CUDAType for ", dispatch_scalar_type); - } - return result; -} -Tensor & _th_fmod_(Tensor & self, Scalar other) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Byte: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toByte(); - THCudaByteTensor_fmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - 
case ScalarType::Char: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toChar(); - THCudaCharTensor_fmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toDouble(); - THCudaDoubleTensor_fmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toFloat(); - THCudaTensor_fmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Int: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toInt(); - THCudaIntTensor_fmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Long: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toLong(); - THCudaLongTensor_fmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Short: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toShort(); - THCudaShortTensor_fmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Half: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = other.toHalf(); - THCudaHalfTensor_fmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - default: - AT_ERROR("_th_fmod_ not supported on CUDAType for ", dispatch_scalar_type); - } - return self; -} -Tensor & _th_fmod_(Tensor & self, const Tensor & other) { - // DeviceGuard omitted - auto dispatch_scalar_type = infer_scalar_type(self); - - switch (dispatch_scalar_type) { - case ScalarType::Byte: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaByteTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Char: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaCharTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Double: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaDoubleTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Float: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); 
- THCudaTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Int: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaIntTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Long: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaLongTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Short: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaShortTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - case ScalarType::Half: { - auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - auto other_ = checked_dense_tensor_unwrap(other, "other", 3, "_th_fmod_", false, DeviceType::CUDA, dispatch_scalar_type); - THCudaHalfTensor_cfmod(globalContext().getTHCState(), self_, self_, other_); - break; - } - default: - AT_ERROR("_th_fmod_ not supported on CUDAType for ", dispatch_scalar_type); - } - return self; -} Tensor & _th_cross_kernel_out(Tensor & result, const Tensor & self, const Tensor & other, int64_t dim) { // DeviceGuard omitted auto dispatch_scalar_type = infer_scalar_type(self); diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index 37b7c5bbb223..9cc040b4dc8f 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -1315,7 +1315,7 @@ Tensor _lu_solve_helper_cpu(const Tensor& self, const Tensor& LU_data, const Ten if (self.numel() == 0 || LU_data.numel() == 0) { return at::zeros_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); } - AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "lu_solve_cpu", [&]{ + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "lu_solve_cpu", [&]{ apply_lu_solve(self_working_copy, LU_data_working_copy, LU_pivots_working_copy, infos); }); if (self.dim() > 2) { diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index d43e1394bd56..d6cb17418365 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -40,7 +40,6 @@ DEFINE_DISPATCH(tanh_backward_stub); DEFINE_DISPATCH(maximum_stub); DEFINE_DISPATCH(minimum_stub); DEFINE_DISPATCH(fmod_stub); -DEFINE_DISPATCH(fmod_scalar_stub); DEFINE_DISPATCH(logaddexp_stub); DEFINE_DISPATCH(logaddexp2_stub); DEFINE_DISPATCH(gcd_stub); @@ -897,34 +896,42 @@ Tensor& floor_divide_(Tensor& self, Scalar other) { Tensor& fmod_out(Tensor & result, const Tensor& self, const Tensor& other) { auto iter = TensorIterator::binary_op(result, self, other); - TORCH_CHECK(iter.device_type() == at::kCPU, "Native fmod only supports CPU"); fmod_stub(iter.device_type(), iter); return result; } Tensor& fmod_out(Tensor & result, const Tensor& self, Scalar other) { - auto iter = TensorIterator::unary_op(result, self); - TORCH_CHECK(iter.device_type() == 
at::kCPU, "Native fmod only supports CPU"); - fmod_scalar_stub(iter.device_type(), iter, other); + Tensor other_tensor = wrapped_scalar_tensor(other); + // FIXME: 'other' is converted to match the dtype of 'self' to retain + // BC with TH, but in the future, we should use normal type promotion, + // like in numpy + // Issue #47779: https://github.com/pytorch/pytorch/issues/47779 + at::fmod_out(result, self, other_tensor.to(self.dtype())); return result; } Tensor fmod(const Tensor& self, const Tensor & other) { - Tensor result = at::empty({0}, self.options()); - return at::fmod_out(result, self, other); + Tensor result; + auto iter = TensorIterator::binary_op(result, self, other); + fmod_stub(iter.device_type(), iter); + return iter.output(); } Tensor fmod(const Tensor& self, Scalar other) { - Tensor result = at::empty({0}, self.options()); - return at::fmod_out(result, self, other); + Tensor other_tensor = wrapped_scalar_tensor(other); + // FIXME: 'other' is converted to match the dtype of 'self' to retain + // BC with TH, but in the future, we should use normal type promotion, + // like in numpy + // Issue #47779: https://github.com/pytorch/pytorch/issues/47779 + return native::fmod(self, other_tensor.to(self.dtype())); } Tensor& fmod_(Tensor& self, const Tensor& other) { - return at::fmod_out(self, self, other); + return native::fmod_out(self, self, other); } Tensor& fmod_(Tensor& self, Scalar other) { - return at::fmod_out(self, self, other); + return native::fmod_out(self, self, other); } Tensor& logaddexp_out(Tensor& result, const Tensor& self, const Tensor& other) { diff --git a/aten/src/ATen/native/BinaryOps.h b/aten/src/ATen/native/BinaryOps.h index 8b01ce9b06f3..d76dd9d205e9 100644 --- a/aten/src/ATen/native/BinaryOps.h +++ b/aten/src/ATen/native/BinaryOps.h @@ -62,7 +62,6 @@ DECLARE_DISPATCH(binary_fn_alpha, logit_backward_stub); DECLARE_DISPATCH(binary_fn, tanh_backward_stub); DECLARE_DISPATCH(binary_fn, mse_stub); DECLARE_DISPATCH(binary_fn, fmod_stub); -DECLARE_DISPATCH(binary_fn_alpha, fmod_scalar_stub); DECLARE_DISPATCH(binary_fn, logaddexp_stub); DECLARE_DISPATCH(binary_fn, logaddexp2_stub); DECLARE_DISPATCH(binary_fn, gcd_stub); diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index ef85d5f602c3..afd4ec15d25f 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -111,36 +111,61 @@ Tensor pinverse(const Tensor& self, double rcond) { return at::matmul(V.conj() * S_pseudoinv.unsqueeze(-2), U.transpose(-2, -1).conj()); } -static inline Tensor _matrix_rank_helper(const Tensor& self, bool symmetric) { +Tensor& linalg_matrix_rank_out(Tensor& result, const Tensor& self, optional tol, bool hermitian) { + TORCH_CHECK(result.scalar_type() == ScalarType::Long, + "result dtype ", result.scalar_type(), " does not match the expected dtype ", ScalarType::Long); + + // Matrices or batch of matrices are allowed + TORCH_CHECK(self.dim() >= 2, "linalg_matrix_rank: Expected as input a matrix or a batch of matrices, but got a tensor of size: ", self.sizes()); + + // matrix_rank assigns a scalar value for each matrix in the batch so + // result's shape is equal to self.shape[0:self.ndim-2] + // for single matrix result_shape = {} + auto result_shape = IntArrayRef(self.sizes().cbegin(), self.sizes().cend()-2); + at::native::resize_output(result, result_shape); + + // NumPy doesn't take into account possible input with no elements and it errors on max not defined for this case + // Let's output 0 for this 
case, since that kind of matrices have zero number of non-zero rows, hence rank is 0. + if (self.numel() == 0) { + result.fill_(0); + return result; + } + + // We compute matrix rank as the number of singular or absolute eigen values above 'tol' threshold Tensor S; - if (!symmetric) { + if (!hermitian) { Tensor U, V; + // TODO: replace self.svd with linalg_svd std::tie(U, S, V) = self.svd(/*some=*/true, /*compute_uv=*/false); } else { - Tensor eigvecs; - std::tie(S, eigvecs) = self.symeig(/*eigenvectors=*/false); + S = at::linalg_eigvalsh(self); S = S.abs(); } - return S; + + if (tol.has_value()) { + double tol_value = tol.value(); + at::sum_out(result, S > tol_value, /*dim=*/-1); + } else { + ScalarType real_dtype = toValueType(typeMetaToScalarType(self.dtype())); + double tol_value = _get_epsilon(real_dtype) * std::max(self.size(-1), self.size(-2)); + Tensor max_S = S.amax(/*dim=*/-1); + at::sum_out(result, S > max_S.mul_(tol_value).unsqueeze_(-1), /*dim=*/-1); + } + return result; } -Tensor matrix_rank(const Tensor& self, double tol, bool symmetric) { - TORCH_CHECK((at::isFloatingType(self.scalar_type()) || at::isComplexType(self.scalar_type())) && self.dim() == 2, - "matrix_rank(", self.scalar_type(), "{", self.sizes(), "}): expected a 2D tensor " - "of floating types"); +Tensor linalg_matrix_rank(const Tensor& self, optional tol, bool hermitian) { + Tensor result = at::empty({0}, self.options().dtype(ScalarType::Long)); + result = at::linalg_matrix_rank_out(result, self, tol, hermitian); + return result; +} - Tensor S = _matrix_rank_helper(self, symmetric); - return (S > tol).sum(); +Tensor matrix_rank(const Tensor& self, double tol, bool symmetric) { + return at::linalg_matrix_rank(self, optional(tol), symmetric); } Tensor matrix_rank(const Tensor& self, bool symmetric) { - TORCH_CHECK((at::isFloatingType(self.scalar_type()) || at::isComplexType(self.scalar_type())) && self.dim() == 2, - "matrix_rank(", self.scalar_type(), "{", self.sizes(), "}): expected a 2D tensor " - "of floating types"); - - Tensor S = _matrix_rank_helper(self, symmetric); - double tol = _get_epsilon(self.scalar_type()) * std::max(self.size(0), self.size(1)); - return (S > S.max().mul_(tol)).sum(); + return at::linalg_matrix_rank(self, c10::nullopt, symmetric); } static void check_1d(const Tensor& t, const char* arg, const char* fn) { @@ -924,8 +949,8 @@ inline Tensor _blob_to_Tensor( // Blob is assumed to be a 1D array, that is why // we also insert a fake dimension so that the result could directly // be used in _compute_linear_combination - auto tensor = at::from_blob((void*)blob.begin(), blob.size(), in.dtype()) - .unsqueeze(0); + auto tensor = at::from_blob((void*)blob.begin(), blob.size(), + c10::toValueType(in.scalar_type())).unsqueeze(0); return _move_memory_if_cuda_input(tensor, in); } @@ -1058,7 +1083,7 @@ Tensor compute_T12(const Tensor& A) { reinterpret_cast(&b), {num_prods, num_prods}, {num_prods, 1}, - A.dtype() + c10::toValueType(A.scalar_type()) ); bs = _move_memory_if_cuda_input(bs, A); @@ -1130,7 +1155,7 @@ Tensor compute_T18(const Tensor& A) { reinterpret_cast(&b), {num_prods, num_prods}, {num_prods, 1}, - A.dtype() + c10::toValueType(A.scalar_type()) ); bs = _move_memory_if_cuda_input(bs, A); @@ -1303,7 +1328,7 @@ Tensor backward_analytic_function_of_a_matrix( const Tensor& self, const Tensor& grad, const func_t& function_of_a_matrix ) { - auto self_transposed = self.transpose(-2, -1); + auto self_transposed = self.transpose(-2, -1).conj(); auto self_transposed_sizes = 
self_transposed.sizes().vec(); self_transposed_sizes[self.dim() - 2] <<= 1; self_transposed_sizes[self.dim() - 1] <<= 1; diff --git a/aten/src/ATen/native/Resize.cpp b/aten/src/ATen/native/Resize.cpp index d6da309d4cf7..e5a0423e493c 100644 --- a/aten/src/ATen/native/Resize.cpp +++ b/aten/src/ATen/native/Resize.cpp @@ -77,12 +77,13 @@ Tensor& resize_as_( Tensor& resize_( Tensor& self, IntArrayRef size, - c10::optional optional_memory_format) { + c10::optional optional_memory_format, + bool resize_storage) { if (self.has_names()) { return resize_named_tensor_(self, size, optional_memory_format); } auto* self_ = self.unsafeGetTensorImpl(); - resize_impl_cpu_(self_, size, /*strides=*/c10::nullopt); + resize_impl_cpu_(self_, size, /*strides=*/c10::nullopt, resize_storage); if (optional_memory_format.has_value()) { auto memory_format = optional_memory_format.value(); @@ -95,5 +96,20 @@ Tensor& resize_( return self; } +Tensor& resize_( + Tensor& self, + IntArrayRef size, + c10::optional optional_memory_format) { + return resize_(self, size, optional_memory_format, /*resize_storage=*/true); +} + +Tensor& resize_meta_( + Tensor& self, + IntArrayRef size, + c10::optional optional_memory_format) { + // meta tensors don't have storage, so don't resize them + return resize_(self, size, optional_memory_format, /*resize_storage=*/false); +} + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/Resize.h b/aten/src/ATen/native/Resize.h index 501cacfbd073..d3d8faf3aa23 100644 --- a/aten/src/ATen/native/Resize.h +++ b/aten/src/ATen/native/Resize.h @@ -43,7 +43,8 @@ static inline void maybe_resize_storage_cpu(TensorImpl* self, int64_t new_size) inline TensorImpl* resize_impl_cpu_( TensorImpl* self, IntArrayRef size, - c10::optional stride) { + c10::optional stride, + bool resize_storage = true) { if (self->sizes() == size && (!stride || self->strides() == stride)) { return self; } @@ -57,7 +58,9 @@ inline TensorImpl* resize_impl_cpu_( self->set_sizes_contiguous(size); storage_size = self->numel(); } - maybe_resize_storage_cpu(self, storage_size); + if (resize_storage) { + maybe_resize_storage_cpu(self, storage_size); + } return self; } diff --git a/aten/src/ATen/native/TensorIterator.cpp b/aten/src/ATen/native/TensorIterator.cpp index 4fec66afd2c8..770774aaaeda 100644 --- a/aten/src/ATen/native/TensorIterator.cpp +++ b/aten/src/ATen/native/TensorIterator.cpp @@ -9,11 +9,11 @@ namespace at { -using DimMask = TensorIterator::DimMask; -using PtrVector = TensorIterator::PtrVector; -using loop_t = TensorIterator::loop_t; -using loop2d_t = TensorIterator::loop2d_t; -using StrideVector = TensorIterator::StrideVector; +using DimMask = TensorIteratorBase::DimMask; +using PtrVector = TensorIteratorBase::PtrVector; +using loop_t = TensorIteratorBase::loop_t; +using loop2d_t = TensorIteratorBase::loop2d_t; +using StrideVector = TensorIteratorBase::StrideVector; /// Construction TensorIteratorConfig& TensorIteratorConfig::add_output(const Tensor& output) { @@ -150,7 +150,7 @@ TensorIteratorConfig& TensorIteratorConfig::declare_static_shape(IntArrayRef sha // in the strides of trivial dimensions, so physical layout is unaffected but permutation information is lost) // We might change this behavior in future once performance considerations are resolved -void TensorIterator::reorder_dimensions() { +void TensorIteratorBase::reorder_dimensions() { // Sort the dimensions based on strides in ascending order with reduced dims // at the front. NOTE: that this inverts the order of C-contiguous tensors. 
// strides[0] is the fastest moving dimension instead of strides[ndim - 1]. @@ -224,7 +224,7 @@ void TensorIterator::reorder_dimensions() { // Computes a common dtype using type promotion // See the [Common Dtype Computation] note -ScalarType TensorIterator::compute_common_dtype() { +ScalarType TensorIteratorBase::compute_common_dtype() { at::native::ResultTypeState state = {}; for (const auto& op : operands_) { if (op.is_output) { @@ -251,7 +251,7 @@ ScalarType TensorIterator::compute_common_dtype() { // NOTE: Checks for more specific behaviors (e.g. the first and second // inputs must share a dtype, but the third must have the long dtype) // should be implemented directly and outside of TensorIterator. -void TensorIterator::compute_types(const TensorIteratorConfig& config) { +void TensorIteratorBase::compute_types(const TensorIteratorConfig& config) { // Reviews operands (1/2) // - validates that all input tensors are defined // - computes common device @@ -402,10 +402,25 @@ void TensorIterator::compute_types(const TensorIteratorConfig& config) { if (common_device == kCPU) { // Casts to outputs by creating temporaries of the correct dtype (if needed) if (config.cast_common_dtype_to_outputs_ && op.is_output && op.current_dtype != common_dtype_) { + TORCH_INTERNAL_ASSERT(op.tensor.defined()); op.original_tensor = op.tensor; + // NB: do NOT use set_output here, as the temporary is NOT a true output; + // op.tensor is the true output and it was pre-provided for us. + // TODO: When we extend this to work with meta tensors, we'll need to + // skip this temporary allocation in that case (because it's + // unnecessary) + // TODO: The logic for cast_outputs will need to be handled by the + // structured kernels implementation. What probably should happen + // is that we pass in the inferred dtype into the out kernel, and + // then after calling the out kernel, do the conversion (which + // is cast_outputs here), but integrating this with existing + // TensorIterator will take a little doing op.tensor = at::empty_like(op.tensor, op.tensor.options().dtype(common_dtype_), LEGACY_CONTIGUOUS_MEMORY_FORMAT); + if (!names_.empty()) { + namedinference::propagate_names(op.tensor, names_); + } op.current_dtype = common_dtype_; op.target_dtype = common_dtype_; } @@ -421,7 +436,7 @@ void TensorIterator::compute_types(const TensorIteratorConfig& config) { } } -StrideVector TensorIterator::compatible_stride(int element_size) const { +StrideVector TensorIteratorBase::compatible_stride(int element_size) const { auto stride = StrideVector(); int64_t next_stride = element_size; for (int dim = 0; dim < ndim(); dim++) { @@ -431,7 +446,7 @@ StrideVector TensorIterator::compatible_stride(int element_size) const { return stride; } -DimVector TensorIterator::invert_perm(IntArrayRef input) const { +DimVector TensorIteratorBase::invert_perm(IntArrayRef input) const { // Invert the permutation caused by reorder_dimensions. This is not valid // after coalesce_dimensions is called. 
TORCH_INTERNAL_ASSERT(!has_coalesced_dimensions_); @@ -443,7 +458,7 @@ DimVector TensorIterator::invert_perm(IntArrayRef input) const { return res; } -void TensorIterator::allocate_or_resize_outputs() { +void TensorIteratorBase::allocate_or_resize_outputs() { for (int i = 0; i < num_outputs_; i++) { auto& op = operands_[i]; if (!op.tensor.defined() || op.will_resize) { @@ -460,33 +475,27 @@ void TensorIterator::allocate_or_resize_outputs() { } auto tensor_shape = invert_perm(shape_); if (inverted) { - if (!op.tensor.defined()) { - // can just return contiguous output - // it is faster because it avoids allocating 0 size tensor and - // resizing and restriding it - op.tensor = at::empty(tensor_shape, op.options()); - } else { - at::native::resize_output(op.tensor, tensor_shape); - } + // can just return contiguous output + // it is faster because it avoids allocating 0 size tensor and + // resizing and restriding it + set_output(i, tensor_shape, {}, op.options(), names_); } else { auto tensor_stride = invert_perm(op.stride_bytes); for (int dim = 0; dim < ndim(); dim++) { tensor_stride[dim] /= element_size; } - if (!op.tensor.defined()) { - op.tensor = - at::empty_strided(tensor_shape, tensor_stride, op.options()); - } else { - at::native::resize_output(op.tensor, tensor_shape); - op.tensor.as_strided_(tensor_shape, tensor_stride); - } + set_output(i, tensor_shape, tensor_stride, op.options(), names_); } op.current_dtype = op.target_dtype; + } else if (op.tensor.defined() && !names_.empty()) { + // Even if we don't resize, we may still propagate names, esp + // if we were doing an inplace operation + namedinference::propagate_names(op.tensor, names_); } } } -void TensorIterator::compute_names(const TensorIteratorConfig& config) { +void TensorIteratorBase::compute_names(const TensorIteratorConfig& config) { bool should_infer_names = std::any_of( operands_.begin(), operands_.end(), @@ -513,27 +522,7 @@ void TensorIterator::compute_names(const TensorIteratorConfig& config) { } } -void TensorIterator::propagate_names_to_outputs() { - // names_ can be empty for two reasons: - // 1. We were performing ops on scalar tensors. Then there should be no names. - // 2. All of the defined inputs/outputs had no names. Then we shouldn't - // run name inference. - if (names_.empty()) { - return; - } - - // propagate names - for (int i = 0; i < num_outputs_; i++) { - auto& op = operands_[i]; - // must call propagate_names_to_outputs after outputs have been allocated. 
- TORCH_INTERNAL_ASSERT(op.tensor.defined()); - if (!names_.empty()) { - namedinference::propagate_names(op.tensor, names_); - } - } -} - -void TensorIterator::coalesce_dimensions() { +void TensorIteratorBase::coalesce_dimensions() { if (ndim() <= 1) { return; } @@ -586,7 +575,7 @@ void TensorIterator::coalesce_dimensions() { has_coalesced_dimensions_ = true; } -int64_t TensorIterator::numel() const { +int64_t TensorIteratorBase::numel() const { int64_t numel = 1; for (int64_t size : shape_) { numel *= size; @@ -594,7 +583,7 @@ int64_t TensorIterator::numel() const { return numel; } -StrideVector TensorIterator::get_dim_strides(int dim) const { +StrideVector TensorIteratorBase::get_dim_strides(int dim) const { auto dims = ndim(); auto inner_strides = StrideVector(); for (auto& op : operands_) { @@ -603,7 +592,7 @@ StrideVector TensorIterator::get_dim_strides(int dim) const { return inner_strides; } -SmallVector TensorIterator::get_data_ptrs(ArrayRef base, IntArrayRef counter) const { +SmallVector TensorIteratorBase::get_data_ptrs(ArrayRef base, IntArrayRef counter) const { auto ptrs = SmallVector(base); for (int dim = 0; dim < ndim(); dim++) { int64_t value = counter[dim]; @@ -614,7 +603,7 @@ SmallVector TensorIterator::get_data_ptrs(ArrayRef base, IntArr return ptrs; } -SmallVector TensorIterator::get_base_ptrs() const { +SmallVector TensorIteratorBase::get_base_ptrs() const { auto ptrs = SmallVector(); for (int i = 0; i < ntensors(); i++) { ptrs.push_back((char*)data_ptr(i)); @@ -622,7 +611,7 @@ SmallVector TensorIterator::get_base_ptrs() const { return ptrs; } -bool TensorIterator::is_dim_reduced(int dim) const { +bool TensorIteratorBase::is_dim_reduced(int dim) const { for (auto& op : operands_) { if (op.is_output && op.stride_bytes[dim] == 0 && shape_[dim] > 1) { return true; @@ -631,7 +620,7 @@ bool TensorIterator::is_dim_reduced(int dim) const { return false; } -void TensorIterator::permute_dimensions(IntArrayRef perm) { +void TensorIteratorBase::permute_dimensions(IntArrayRef perm) { TORCH_INTERNAL_ASSERT(perm.size() == ndim()); auto reorder = [perm](IntArrayRef data) { @@ -651,7 +640,7 @@ void TensorIterator::permute_dimensions(IntArrayRef perm) { } } -int64_t TensorIterator::num_output_elements() const { +int64_t TensorIteratorBase::num_output_elements() const { int64_t elem = 1; for (int dim = 0; dim < ndim(); dim++) { if (operands_[0].stride_bytes[dim] != 0 || shape_[dim] == 0) { @@ -661,7 +650,7 @@ int64_t TensorIterator::num_output_elements() const { return elem; } -int TensorIterator::num_reduce_dims() const { +int TensorIteratorBase::num_reduce_dims() const { int count = 0; for (int dim = 0; dim < ndim(); dim++) { if (operands_[0].stride_bytes[dim] == 0) { @@ -686,11 +675,11 @@ int TensorIterator::num_reduce_dims() const { } \ } -void TensorIterator::for_each(loop_t loop, int64_t grain_size) { +void TensorIteratorBase::for_each(loop_t loop, int64_t grain_size) { for_each(LOOP_WRAPPER(ntensors(), loop), grain_size); } -void TensorIterator::for_each(loop2d_t loop, int64_t grain_size) { +void TensorIteratorBase::for_each(loop2d_t loop, int64_t grain_size) { int64_t numel = this->numel(); if (numel == 0) { return; @@ -703,7 +692,7 @@ void TensorIterator::for_each(loop2d_t loop, int64_t grain_size) { } } -StrideVector TensorIterator::get_strides() const { +StrideVector TensorIteratorBase::get_strides() const { StrideVector strides; for (int dim = 0; dim < ndim(); dim++) { for (int arg = 0; arg < ntensors(); arg++) { @@ -713,11 +702,11 @@ StrideVector 
TensorIterator::get_strides() const { return strides; } -void TensorIterator::serial_for_each(loop_t loop, Range range) const { +void TensorIteratorBase::serial_for_each(loop_t loop, Range range) const { serial_for_each(LOOP_WRAPPER(ntensors(), loop), range); } -void TensorIterator::serial_for_each(loop2d_t loop, Range range) const { +void TensorIteratorBase::serial_for_each(loop2d_t loop, Range range) const { if (range.size() == 0) { return; } @@ -741,12 +730,12 @@ void TensorIterator::serial_for_each(loop2d_t loop, Range range) const { } } -bool TensorIterator::is_trivial_1d() const { +bool TensorIteratorBase::is_trivial_1d() const { // TODO: check for casting once it's supported return ndim() == 1; } -bool TensorIterator::is_contiguous() const { +bool TensorIteratorBase::is_contiguous() const { if (numel() == 1) { return true; } @@ -757,7 +746,7 @@ bool TensorIterator::is_contiguous() const { } -bool TensorIterator::is_scalar(int arg) const { +bool TensorIteratorBase::is_scalar(int arg) const { const auto& stride = operands_[arg].stride_bytes; for (int i = 0; i < ndim(); i++) { if (stride[i] != 0 && shape_[i] != 1) { @@ -767,11 +756,11 @@ bool TensorIterator::is_scalar(int arg) const { return true; } -bool TensorIterator::is_cpu_scalar(int arg) const { +bool TensorIteratorBase::is_cpu_scalar(int arg) const { return is_scalar(arg) && device(arg).is_cpu(); } -void TensorIterator::cast_outputs() { +void TensorIteratorBase::cast_outputs() { for (auto& op : operands_) { if (op.is_output && op.original_tensor.defined() && op.original_tensor.scalar_type() != op.current_dtype) { @@ -784,19 +773,19 @@ void TensorIterator::cast_outputs() { } } -void* TensorIterator::data_ptr(int arg) const { +void* TensorIteratorBase::data_ptr(int arg) const { return operands_[arg].data; } -void TensorIterator::remove_operand(int arg) { +void TensorIteratorBase::remove_operand(int arg) { operands_.erase(operands_.begin() + arg); } -void TensorIterator::unsafe_replace_operand(int arg, void* data) { +void TensorIteratorBase::unsafe_replace_operand(int arg, void* data) { operands_[arg].data = data; } -void TensorIterator::narrow(int dim, int64_t start, int64_t size) { +void TensorIteratorBase::narrow(int dim, int64_t start, int64_t size) { TORCH_INTERNAL_ASSERT(dim < ndim() && size >= 1); shape_[dim] = size; view_offsets_[dim] += start; @@ -808,7 +797,7 @@ void TensorIterator::narrow(int dim, int64_t start, int64_t size) { } } -void TensorIterator::select_all_keeping_dim(int start_dim, IntArrayRef indices) { +void TensorIteratorBase::select_all_keeping_dim(int start_dim, IntArrayRef indices) { TORCH_INTERNAL_ASSERT(start_dim <= ndim()); for (int i = start_dim; i < ndim(); ++i) { for (auto& op : operands_) { @@ -948,14 +937,14 @@ TensorIterator TensorIterator::reduce_op(Tensor& out1, Tensor& out2, const Tenso .build(); } -void TensorIterator::populate_operands(TensorIteratorConfig& config) { +void TensorIteratorBase::populate_operands(TensorIteratorConfig& config) { for (int i = 0; i < config.tensors_.size(); i++) { operands_.emplace_back(std::move(config.tensors_[i])); } num_outputs_ = config.num_outputs_; } -void TensorIterator::mark_outputs() { +void TensorIteratorBase::mark_outputs() { // TODO: merge this into populate_operands for (int i = 0; i < num_outputs_; i++) { operands_[i].is_output = true; @@ -972,7 +961,7 @@ void TensorIterator::mark_outputs() { } } -void TensorIterator::mark_resize_outputs(const TensorIteratorConfig& config) { +void TensorIteratorBase::mark_resize_outputs(const TensorIteratorConfig& 
config) { // Outputs cannot be broadcasted. Check that the shape of the outputs matches // the inferred shape. There's an exception for write-only tensors to support // our legacy behavior that functions with `out=` arguments resize their @@ -994,7 +983,7 @@ void TensorIterator::mark_resize_outputs(const TensorIteratorConfig& config) { } } -void TensorIterator::compute_mem_overlaps(const TensorIteratorConfig& config) { +void TensorIteratorBase::compute_mem_overlaps(const TensorIteratorConfig& config) { if (!config.check_mem_overlap_) { return; } @@ -1009,7 +998,7 @@ void TensorIterator::compute_mem_overlaps(const TensorIteratorConfig& config) { } } -void TensorIterator::compute_shape(const TensorIteratorConfig& config) { +void TensorIteratorBase::compute_shape(const TensorIteratorConfig& config) { if (config.static_shape_.has_value()) { shape_ = *config.static_shape_; return; @@ -1045,7 +1034,7 @@ void TensorIterator::compute_shape(const TensorIteratorConfig& config) { } } -void TensorIterator::compute_strides(const TensorIteratorConfig& config) { +void TensorIteratorBase::compute_strides(const TensorIteratorConfig& config) { for (auto& op : operands_) { if (op.tensor.defined()) { IntArrayRef original_shape = config.static_shape_ ? shape_ : op.tensor.sizes(); @@ -1068,7 +1057,7 @@ void TensorIterator::compute_strides(const TensorIteratorConfig& config) { } } -bool TensorIterator::can_use_32bit_indexing() const { +bool TensorIteratorBase::can_use_32bit_indexing() const { int64_t max_value = std::numeric_limits::max(); if (numel() > max_value) { return false; @@ -1085,7 +1074,7 @@ bool TensorIterator::can_use_32bit_indexing() const { return true; } -std::unique_ptr TensorIterator::split(int dim) { +std::unique_ptr TensorIteratorBase::split(int dim) { TORCH_INTERNAL_ASSERT(dim >= 0 && dim < ndim() && shape()[dim] >= 2); std::unique_ptr copy(new TensorIterator(*this)); @@ -1101,7 +1090,7 @@ std::unique_ptr TensorIterator::split(int dim) { } -int TensorIterator::get_dim_to_split() const { +int TensorIteratorBase::get_dim_to_split() const { TORCH_INTERNAL_ASSERT(ndim() >= 1); int64_t max_extent = -1; int dim_to_split = -1; @@ -1122,7 +1111,7 @@ int TensorIterator::get_dim_to_split() const { return dim_to_split; } -bool TensorIterator::fast_set_up(const TensorIteratorConfig& config) { +bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) { // This function tries to do a fast setup to avoid needless reordering of dimensions and tracking output strides // Return true if it can do fast setup or false otherwise // TODO enable fast handling for reductions @@ -1137,13 +1126,7 @@ bool TensorIterator::fast_set_up(const TensorIteratorConfig& config) { { for (int i = 0; i < num_outputs_; i++){ auto& op = operands_[i]; - if (!op.tensor.defined()) { - TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); - op.tensor = at::empty(shape_, op.options(), MemoryFormat::Contiguous); - op.current_dtype = op.target_dtype; - } else if (op.will_resize) { - at::native::resize_output(op.tensor, shape_); - } + set_output(i, shape_, {}, op.options().memory_format(MemoryFormat::Contiguous), names_); } break; } @@ -1151,15 +1134,7 @@ bool TensorIterator::fast_set_up(const TensorIteratorConfig& config) { { for (int i = 0; i < num_outputs_; i++){ auto& op = operands_[i]; - if (!op.tensor.defined()) { - TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); - op.tensor = at::empty(shape_, op.options(), MemoryFormat::ChannelsLast); - op.current_dtype = op.target_dtype; - } 
else if (op.will_resize) { - at::native::resize_output(op.tensor, shape_); - op.tensor.unsafeGetTensorImpl()->empty_tensor_restride( - MemoryFormat::ChannelsLast); - } + set_output(i, shape_, {}, op.options().memory_format(MemoryFormat::ChannelsLast), names_); } break; } @@ -1173,14 +1148,7 @@ bool TensorIterator::fast_set_up(const TensorIteratorConfig& config) { TORCH_CHECK(i_defined >= 0, "Can not find a defined tensor when fast allocating memory to outputs"); for (int i = 0; i < num_outputs_; i++){ auto& op = operands_[i]; - if (!op.tensor.defined()) { - TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); - op.tensor = at::empty_strided(shape_, operands_[i_defined].tensor.strides(), op.options()); - op.current_dtype = op.target_dtype; - } else if (op.will_resize) { - at::native::resize_output(op.tensor, shape_); - op.tensor.as_strided_(shape_, operands_[i_defined].tensor.strides()); - } + set_output(i, shape_, operands_[i_defined].tensor.strides(), op.options(), names_); } break; } @@ -1205,7 +1173,7 @@ bool TensorIterator::fast_set_up(const TensorIteratorConfig& config) { return true; } -FastSetupType TensorIterator::compute_fast_setup_type(const TensorIteratorConfig& config) { +FastSetupType TensorIteratorBase::compute_fast_setup_type(const TensorIteratorConfig& config) { if (is_reduction_ || !all_ops_same_shape_) { return FastSetupType::NONE; } @@ -1257,11 +1225,9 @@ FastSetupType TensorIterator::compute_fast_setup_type(const TensorIteratorConfig return FastSetupType::NONE; } -TensorIterator::TensorIterator(TensorIteratorConfig& config) { - build(config); -} +TensorIteratorBase::TensorIteratorBase() {} -void TensorIterator::build(TensorIteratorConfig& config) { +void TensorIteratorBase::build(TensorIteratorConfig& config) { // populate some persistent configuration fields is_reduction_ = config.is_reduction_; @@ -1291,8 +1257,6 @@ void TensorIterator::build(TensorIteratorConfig& config) { // coalesce adjacent dimensions when possible coalesce_dimensions(); } - // perform name inference - propagate_names_to_outputs(); for (auto& op : operands_) { TORCH_INTERNAL_ASSERT(op.tensor.defined()); @@ -1307,14 +1271,40 @@ void TensorIterator::build(TensorIteratorConfig& config) { view_offsets_ = DimVector(ndim_offsets, 0); } -SplitUntil32Bit TensorIterator::with_32bit_indexing() const { +void TensorIterator::set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, TensorOptions options, DimnameList names) { + auto& op = operands_[output_idx]; + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(output_idx < num_outputs_); + if (!op.tensor.defined()) { + TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", output_idx); + if (strides.empty()) { + op.tensor = at::empty(sizes, options); + } else { + op.tensor = at::empty_strided(sizes, strides, options); + } + op.current_dtype = op.target_dtype; + } else if (op.will_resize) { + at::native::resize_output(op.tensor, sizes); + if (!strides.empty()) { + TORCH_INTERNAL_ASSERT(!options.memory_format_opt().has_value()); + op.tensor.as_strided_(sizes, strides); + } else if (options.memory_format_opt().has_value()) { + op.tensor.unsafeGetTensorImpl()->empty_tensor_restride(*options.memory_format_opt()); + } + } + if (!names.empty()) { + TORCH_INTERNAL_ASSERT(op.tensor.defined()); + namedinference::propagate_names(op.tensor, names); + } +} + +SplitUntil32Bit TensorIteratorBase::with_32bit_indexing() const { return SplitUntil32Bit(*this); } /// SplitUntil32Bit. 
Recursively splits an iterator into sub-iterators that /// can use 32-bit indexing. -SplitUntil32Bit::iterator::iterator(const TensorIterator& iter) { +SplitUntil32Bit::iterator::iterator(const TensorIteratorBase& iter) { vec.emplace_back(new TensorIterator(iter)); vec.emplace_back(nullptr); // ++ first pops the last element ++(*this); diff --git a/aten/src/ATen/native/TensorIterator.h b/aten/src/ATen/native/TensorIterator.h index 86c3992e5241..25b255d72fef 100644 --- a/aten/src/ATen/native/TensorIterator.h +++ b/aten/src/ATen/native/TensorIterator.h @@ -8,6 +8,7 @@ #include #include #include +#include // TensorIterator is a helper class for element-wise operations, such as // arithmetic, comparisons, and trigonometric functions. It handles @@ -131,13 +132,15 @@ enum class FastSetupType : uint8_t { }; class TensorIteratorConfig; +struct TensorIterator; -struct CAFFE2_API TensorIterator { +struct CAFFE2_API TensorIteratorBase : public impl::MetaBase { using DimMask = std::bitset<64>; using PtrVector = SmallVector; using StrideVector = SmallVector; - TensorIterator(TensorIteratorConfig&); + TensorIteratorBase(); + void build(TensorIteratorConfig&); // The inner-loop function operates on the fastest moving dimension. It // implements element-wise operations in terms of 1-d strided tensors. @@ -152,19 +155,10 @@ struct CAFFE2_API TensorIterator { using loop_t = c10::function_ref; using loop2d_t = c10::function_ref; - using loop_subiter_t = c10::function_ref; + using loop_subiter_t = c10::function_ref; void foreach_reduced_elt(loop_subiter_t loop, bool parallelize=true); - static TensorIterator binary_float_op(Tensor& out, const Tensor& a, const Tensor& b); - static TensorIterator binary_op(Tensor& out, const Tensor& a, const Tensor& b); - static TensorIterator comparison_op(Tensor& out, const Tensor& a, const Tensor& b); - static TensorIterator unary_op(Tensor& out, const Tensor& a); - static TensorIterator unary_float_op(Tensor& out, const Tensor& a); - static TensorIterator nullary_op(Tensor& out); - static TensorIterator reduce_op(Tensor& out, const Tensor& a); - static TensorIterator reduce_op(Tensor& out1, Tensor& out2, const Tensor& a); - int ndim() const { return shape_.size(); } IntArrayRef shape() const { return shape_; } int64_t numel() const; @@ -297,8 +291,6 @@ struct CAFFE2_API TensorIterator { } protected: - void build(TensorIteratorConfig&); - // Mutable reference as it moves tensors out of TensorIteratorConfig void populate_operands(TensorIteratorConfig&); void mark_outputs(); @@ -402,8 +394,26 @@ struct CAFFE2_API TensorIterator { bool is_reduction_ = false; }; +struct CAFFE2_API TensorIterator final : public TensorIteratorBase { + TensorIterator() : TensorIteratorBase() {} + // Slicing is OK, TensorIterator guaranteed NOT to have any fields + TensorIterator(const TensorIteratorBase& iter) : TensorIteratorBase(iter) {} + + static TensorIterator binary_float_op(Tensor& out, const Tensor& a, const Tensor& b); + static TensorIterator binary_op(Tensor& out, const Tensor& a, const Tensor& b); + static TensorIterator comparison_op(Tensor& out, const Tensor& a, const Tensor& b); + static TensorIterator unary_op(Tensor& out, const Tensor& a); + static TensorIterator unary_float_op(Tensor& out, const Tensor& a); + static TensorIterator nullary_op(Tensor& out); + static TensorIterator reduce_op(Tensor& out, const Tensor& a); + static TensorIterator reduce_op(Tensor& out1, Tensor& out2, const Tensor& a); + + void set_output(int64_t output_idx, IntArrayRef sizes, IntArrayRef strides, 
TensorOptions options, DimnameList names) override; +}; + class CAFFE2_API TensorIteratorConfig final { public: + friend struct TensorIteratorBase; friend struct TensorIterator; TensorIteratorConfig() {} @@ -478,7 +488,9 @@ class CAFFE2_API TensorIteratorConfig final { // It would be better if this was && qualified, but this would be at the cost // of a lot of boilerplate above TensorIterator build() { - return TensorIterator(*this); + TensorIterator iter; + iter.build(*this); + return iter; } private: @@ -508,9 +520,10 @@ class CAFFE2_API TensorIteratorConfig final { struct CAFFE2_API SplitUntil32Bit { struct CAFFE2_API iterator { iterator() {}; - iterator(const TensorIterator& iter); + iterator(const TensorIteratorBase& iter); iterator(iterator&&) = default; + // Guaranteed to be a TensorIterator proper! TensorIterator& operator*() const; iterator& operator++(); bool operator==(const iterator& other) const { @@ -524,13 +537,13 @@ struct CAFFE2_API SplitUntil32Bit { std::vector> vec; }; - SplitUntil32Bit(const TensorIterator& iter) : iter(iter) {} + SplitUntil32Bit(const TensorIteratorBase& iter) : iter(iter) {} iterator begin() const; iterator end() const; private: - const TensorIterator& iter; + const TensorIteratorBase& iter; }; } // namespace at diff --git a/aten/src/ATen/native/TensorIteratorReduce.cpp b/aten/src/ATen/native/TensorIteratorReduce.cpp index 6d3ba3acb4fc..3837c7567e31 100644 --- a/aten/src/ATen/native/TensorIteratorReduce.cpp +++ b/aten/src/ATen/native/TensorIteratorReduce.cpp @@ -7,13 +7,13 @@ namespace at { -using loop2d_t = TensorIterator::loop2d_t; +using loop2d_t = TensorIteratorBase::loop2d_t; -static bool use_two_pass_reduction(TensorIterator& iter); -static void two_pass_reduction(TensorIterator& iter, loop2d_t loop); -static void parallel_dim_reduction(TensorIterator& iter, loop2d_t loop); +static bool use_two_pass_reduction(TensorIteratorBase& iter); +static void two_pass_reduction(TensorIteratorBase& iter, loop2d_t loop); +static void parallel_dim_reduction(TensorIteratorBase& iter, loop2d_t loop); -void TensorIterator::parallel_reduce(loop2d_t loop) { +void TensorIteratorBase::parallel_reduce(loop2d_t loop) { TORCH_CHECK(ntensors() == 2, "parallel_reduce only supports one input and one output"); int64_t numel = this->numel(); if (numel < at::internal::GRAIN_SIZE || at::get_num_threads() == 1 || @@ -26,11 +26,11 @@ void TensorIterator::parallel_reduce(loop2d_t loop) { } } -static bool use_two_pass_reduction(TensorIterator& iter) { +static bool use_two_pass_reduction(TensorIteratorBase& iter) { return iter.output(0).numel() == 1; } -static void two_pass_reduction(TensorIterator& iter, loop2d_t loop) { +static void two_pass_reduction(TensorIteratorBase& iter, loop2d_t loop) { int max_threads = at::get_num_threads(); auto dst = iter.output(0); @@ -65,7 +65,7 @@ static void two_pass_reduction(TensorIterator& iter, loop2d_t loop) { /// Chooses a dimension over which to parallelize. Prefers the outer-most /// dimension thats larger than the number of available threads. 
-static int find_split_dim(TensorIterator& iter) { +static int find_split_dim(TensorIteratorBase& iter) { int num_threads = at::get_num_threads(); auto shape = iter.shape(); @@ -84,7 +84,7 @@ static int find_split_dim(TensorIterator& iter) { } static std::tuple -round_columns(TensorIterator& iter, int dim, int multiple, int64_t begin, int64_t end) { +round_columns(TensorIteratorBase& iter, int dim, int multiple, int64_t begin, int64_t end) { begin = begin - (begin % multiple); if (end != iter.shape()[dim]) { // only round the 'end' column down if it's not the final column @@ -93,7 +93,7 @@ round_columns(TensorIterator& iter, int dim, int multiple, int64_t begin, int64_ return std::make_tuple(begin, end); } -static void parallel_dim_reduction(TensorIterator& iter, loop2d_t loop) { +static void parallel_dim_reduction(TensorIteratorBase& iter, loop2d_t loop) { AT_ASSERT(iter.ndim() >= 1); int dim = find_split_dim(iter); int64_t cols = iter.shape()[dim]; @@ -116,7 +116,7 @@ static void parallel_dim_reduction(TensorIterator& iter, loop2d_t loop) { }); } -void TensorIterator::foreach_reduced_elt(loop_subiter_t loop, bool parallelize) { +void TensorIteratorBase::foreach_reduced_elt(loop_subiter_t loop, bool parallelize) { AT_ASSERT(ninputs() == 1); AT_ASSERT(noutputs() >= 1); @@ -153,7 +153,7 @@ void TensorIterator::foreach_reduced_elt(loop_subiter_t loop, bool parallelize) return; } - auto sub_iter = *this; + TensorIterator sub_iter(*this); sub_iter.narrow(dim, begin, end - begin); // On some broken setups, `#ifdef _OPENMP` is true, diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 66e0e141a633..daea7e7f68bb 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -385,8 +385,8 @@ Tensor& sqrt_(Tensor& self) { return unary_op_impl_(self, at::sqrt_out); } Tensor square(const Tensor& self) { return at::pow(self, 2); } Tensor& square_(Tensor& self) { return at::pow_out(self, self, 2); } -Tensor& sigmoid_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, sigmoid_stub); } -Tensor sigmoid(const Tensor& self) { return unary_op_impl(self, at::sigmoid_out); } +Tensor& sigmoid_out(Tensor& result, const Tensor& self) { return unary_op_impl_float_out(result, self, sigmoid_stub); } +Tensor sigmoid(const Tensor& self) { return unary_op_impl_float(self, sigmoid_stub); } Tensor& sigmoid_(Tensor& self) { return unary_op_impl_(self, at::sigmoid_out); } Tensor& logit_out( diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp index f35d9054a729..36c01b2af49e 100644 --- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp @@ -639,6 +639,9 @@ void mse_kernel(TensorIterator& iter) { } void fmod_kernel(TensorIterator& iter) { + // Use the dtype of the first argument to retain BC, + // change to common_dtype for type promotion in the future + // Issue #47779: https://github.com/pytorch/pytorch/issues/47779 if (isIntegralType(iter.dtype(), /*includeBool=*/ false)) { AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "fmod_cpu", [&]() { cpu_kernel(iter, [=](scalar_t x, scalar_t d) -> scalar_t { @@ -660,32 +663,6 @@ void fmod_kernel(TensorIterator& iter) { } } -void fmod_scalar_kernel(TensorIterator& iter, Scalar divisor) { - if (isIntegralType(iter.dtype(), /*includeBool=*/ false)) { - AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "fmod_scalar_cpu", [&]() { - const auto div = divisor.to(); - TORCH_CHECK(div != 0, "ZeroDivisionError"); - 
cpu_kernel(iter, [=](scalar_t x) -> scalar_t { - return x % div; - }); - }); - } else { - AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.dtype(), "fmod_scalar_cpu", [&]() { - const auto div = divisor.to(); - const auto div_vec = Vec256(div); - cpu_kernel_vec( - iter, - [=](scalar_t x) -> scalar_t { - return std::fmod(x, div); - }, - [=](Vec256 x) { - return x.fmod(div_vec); - }); - }); - } - -} - void logaddexp_kernel(TensorIterator& iter) { AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "logaddexp_cpu", [&]() { cpu_kernel_vec( @@ -872,7 +849,6 @@ REGISTER_DISPATCH(logit_backward_stub, &logit_backward_kernel); REGISTER_DISPATCH(tanh_backward_stub, &tanh_backward_kernel); REGISTER_DISPATCH(mse_stub, &mse_kernel); REGISTER_DISPATCH(fmod_stub, &fmod_kernel); -REGISTER_DISPATCH(fmod_scalar_stub, &fmod_scalar_kernel); REGISTER_DISPATCH(logaddexp_stub, &logaddexp_kernel); REGISTER_DISPATCH(logaddexp2_stub, &logaddexp2_kernel); REGISTER_DISPATCH(gcd_stub, &gcd_kernel); diff --git a/aten/src/ATen/native/cpu/Reduce.h b/aten/src/ATen/native/cpu/Reduce.h index 129cdc0845c4..b94e4b44aae0 100644 --- a/aten/src/ATen/native/cpu/Reduce.h +++ b/aten/src/ATen/native/cpu/Reduce.h @@ -109,7 +109,7 @@ static inline void vectorized_outer_reduction(char** data, int64_t inner_stride, } template -static void set_result(const int index, const res_t result, const TensorIterator &iter, const int num_outputs) { +static void set_result(const int index, const res_t result, const TensorIteratorBase &iter, const int num_outputs) { // static_assert(std::is_same::value, "data types must match"); if (index < num_outputs) { char *out = (char *) iter.data_ptr(index); @@ -118,20 +118,20 @@ static void set_result(const int index, const res_t result, const TensorIterator } template -static void set_results(const res_t result, const TensorIterator &iter, const int num_outputs) { +static void set_results(const res_t result, const TensorIteratorBase &iter, const int num_outputs) { AT_ASSERT(num_outputs == 1); set_result(0, result, iter, num_outputs); } template static inline typename std::enable_if::type -for_each_in_tuple(const std::tuple& t, const TensorIterator &iter, const int num_outputs) { +for_each_in_tuple(const std::tuple& t, const TensorIteratorBase &iter, const int num_outputs) { return i; } template static inline typename std::enable_if::type -for_each_in_tuple(const std::tuple& t, const TensorIterator &iter, const int num_outputs) { +for_each_in_tuple(const std::tuple& t, const TensorIteratorBase &iter, const int num_outputs) { if (i < (size_t)num_outputs) { set_result(i, std::get(t), iter, num_outputs); return for_each_in_tuple(t, iter, num_outputs); @@ -140,7 +140,7 @@ for_each_in_tuple(const std::tuple& t, const TensorIterator &iter, c } template -static void set_results(const std::tuple& result, const TensorIterator &iter, const int num_outputs) { +static void set_results(const std::tuple& result, const TensorIteratorBase &iter, const int num_outputs) { AT_ASSERT(num_outputs >= 1); std::size_t result_size = for_each_in_tuple(result, iter, num_outputs); AT_ASSERT((size_t)num_outputs == result_size); @@ -178,7 +178,7 @@ struct all_same : guts::conjunction< // into several pieces, reduce each separately, and then combine them. 
template -void binary_kernel_reduce(TensorIterator& iter, ops_t ops, init_t init) { +void binary_kernel_reduce(TensorIteratorBase& iter, ops_t ops, init_t init) { using rf_t = decltype(&ops_t::reduce); using cf_t = decltype(&ops_t::combine); using pf_t = decltype(&ops_t::project); @@ -202,7 +202,7 @@ void binary_kernel_reduce(TensorIterator& iter, ops_t ops, init_t init) { "the accumulate type must be default-constructible" ); const int num_outputs = iter.noutputs(); - iter.foreach_reduced_elt([&ops, &init, num_outputs](TensorIterator &sub_iter) { + iter.foreach_reduced_elt([&ops, &init, num_outputs](TensorIteratorBase &sub_iter) { auto reduction_body = [&ops, &sub_iter, num_outputs](acc_t acc, int64_t begin, int64_t end) -> acc_t { int ntensors = sub_iter.ntensors(); sub_iter.serial_for_each([&acc, &ops, num_outputs, ntensors, begin](char** data, const int64_t* strides, int64_t size) { @@ -244,7 +244,7 @@ void binary_kernel_reduce(TensorIterator& iter, ops_t ops, init_t init) { } template -void binary_kernel_reduce_vec(TensorIterator& iter, func_t op, vec_func_t vop, double ident = 0) { +void binary_kernel_reduce_vec(TensorIteratorBase& iter, func_t op, vec_func_t vop, double ident = 0) { using traits = binary_function_traits; static_assert( all_same< diff --git a/aten/src/ATen/native/cuda/Activation.cu b/aten/src/ATen/native/cuda/Activation.cu index 87b7304ad097..0f1df59618be 100644 --- a/aten/src/ATen/native/cuda/Activation.cu +++ b/aten/src/ATen/native/cuda/Activation.cu @@ -112,7 +112,7 @@ Tensor prelu_cuda(const Tensor& self, const Tensor& weight_) { input_stride0, input_stride1, input_numel); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } return result; @@ -230,7 +230,7 @@ std::tuple prelu_backward_cuda(const Tensor& grad_out_, const Te input_stride0, input_stride1, input_numel); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); // update weight_grad std::vector reduce_dims; diff --git a/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu b/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu index b24f26a2396e..5066480535b9 100644 --- a/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu +++ b/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu @@ -520,7 +520,7 @@ namespace { sizeB, sizeC, isizeH, isizeW, osizeH, osizeW, kernel_stride_C, kernel_size_C, istrideB, istrideC, istrideH, istrideW); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); break; @@ -563,7 +563,7 @@ namespace { input_data, output_data, isizeH, isizeW, osizeH, osizeW, istrideD, istrideH, istrideW); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); break; @@ -666,7 +666,7 @@ namespace { sizeB, sizeC, isizeH, isizeW, osizeH, osizeW, kernel_stride_C, kernel_size_C, ostrideB, ostrideC, ostrideH, ostrideW); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); break; @@ -703,7 +703,7 @@ namespace { atomic_adaptive_average_gradinput <<>> ( gradInput_data, gradOutput_data, isizeH, isizeW, osizeH, osizeW); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { @@ -711,7 +711,7 @@ namespace { adaptive_average_gradinput <<>> ( gradInput_data, gradOutput_data, isizeH, isizeW, osizeH, osizeW); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } ); diff --git a/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu b/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu index 26f95a6e010b..3e87105298e0 100644 --- 
a/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu +++ b/aten/src/ATen/native/cuda/AdaptiveAveragePooling3d.cu @@ -123,7 +123,7 @@ void adaptiveaveragepool_loop( istrideD, istrideT, istrideH, istrideW, offsetZ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); totalZ -= 65535; offsetZ += 65535; } @@ -216,7 +216,7 @@ void adaptiveaveragegradinput_loop( isizeT, isizeH, isizeW, osizeT, osizeH, osizeW, offsetZ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); totalZ -= 65535; offsetZ += 65535; } @@ -310,7 +310,7 @@ void atomicadaptiveaveragegradinput_loop( isizeT, isizeH, isizeW, osizeT, osizeH, osizeW, offsetZ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); totalZ -= 65535; offsetZ += 65535; } diff --git a/aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu b/aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu index ab97030d5264..dfe4c49b80aa 100644 --- a/aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu +++ b/aten/src/ATen/native/cuda/AdaptiveMaxPooling2d.cu @@ -251,7 +251,7 @@ void adaptive_max_pool2d_out_cuda_template( indices_data, isizeH, isizeW, osizeH, osizeW, istrideD, istrideH, istrideW); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); } else { @@ -287,7 +287,7 @@ void adaptive_max_pool2d_out_cuda_template( indices_data, isizeH, isizeW, osizeH, osizeW, istrideD, istrideH, istrideW); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); } @@ -344,7 +344,7 @@ void adaptive_max_pool2d_backward_out_cuda_template( gradInput_data, gradOutput_data, indices_data, isizeH, isizeW, osizeH, osizeW); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { @@ -353,7 +353,7 @@ void adaptive_max_pool2d_backward_out_cuda_template( gradInput_data, gradOutput_data, indices_data, isizeH, isizeW, osizeH, osizeW); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } ); @@ -391,7 +391,7 @@ void adaptive_max_pool2d_backward_out_cuda_template( gradInput_data, gradOutput_data, indices_data, isizeH, isizeW, osizeH, osizeW); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { @@ -400,7 +400,7 @@ void adaptive_max_pool2d_backward_out_cuda_template( gradInput_data, gradOutput_data, indices_data, isizeH, isizeW, osizeH, osizeW); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } ); diff --git a/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu b/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu index dc9ebefc6d35..d515cf78bbca 100644 --- a/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu +++ b/aten/src/ATen/native/cuda/AdaptiveMaxPooling3d.cu @@ -131,7 +131,7 @@ void adaptivemaxpool_loop( adaptivemaxpool<<>>( input_data, output_data, indices_data, isizeT, isizeH, isizeW, osizeT, osizeH, osizeW, istrideD, istrideT, istrideH, istrideW, offsetZ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); totalZ -= 65535; offsetZ += 65535; @@ -209,7 +209,7 @@ void adaptivemaxgradinput_loop( adaptivemaxgradinput<<>>( gradInput_data, gradOutput_data, indices_data, isizeT, isizeH, isizeW, osizeT, osizeH, osizeW, offsetZ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); totalZ -= 65535; offsetZ += 65535; } @@ -285,7 +285,7 @@ void atomicadaptivemaxgradinput_loop( atomicadaptivemaxgradinput<<>>( gradInput_data, gradOutput_data, indices_data, isizeT, isizeH, isizeW, osizeT, osizeH, osizeW, offsetZ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); totalZ -= 65535; 
offsetZ += 65535; } diff --git a/aten/src/ATen/native/cuda/AmpKernels.cu b/aten/src/ATen/native/cuda/AmpKernels.cu index 69ed46ad61f6..0ac6537a8de9 100644 --- a/aten/src/ATen/native/cuda/AmpKernels.cu +++ b/aten/src/ATen/native/cuda/AmpKernels.cu @@ -243,7 +243,7 @@ Tensor _amp_update_scale_cuda(Tensor& growth_tracker, growth_factor, backoff_factor, growth_interval); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return new_scale; } diff --git a/aten/src/ATen/native/cuda/AveragePool2d.cu b/aten/src/ATen/native/cuda/AveragePool2d.cu index 6d74e5b4196f..274ced8edcdf 100644 --- a/aten/src/ATen/native/cuda/AveragePool2d.cu +++ b/aten/src/ATen/native/cuda/AveragePool2d.cu @@ -325,7 +325,7 @@ void avg_pool2d_out_cuda_template( output_data, divisor_override_value, count_include_pad, use_divisor); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } case MemoryFormat::Contiguous: { @@ -343,7 +343,7 @@ void avg_pool2d_out_cuda_template( output_data, divisor_override_value, count_include_pad, use_divisor); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } default: TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); @@ -457,7 +457,7 @@ Tensor& avg_pool2d_backward_out_cuda_template( gradInput_data, divisor_override_value, count_include_pad, use_divisor); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } case MemoryFormat::Contiguous: { @@ -475,7 +475,7 @@ Tensor& avg_pool2d_backward_out_cuda_template( gradInput_data, divisor_override_value, count_include_pad, use_divisor); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } default: TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu index a9cdfbb65705..eaee3c87b1f8 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu @@ -1025,6 +1025,23 @@ void magmaLuSolve( AT_CUDA_CHECK(cudaGetLastError()); } +template<> +void magmaLuSolve>( + magma_int_t n, magma_int_t nrhs, c10::complex* dA, magma_int_t ldda, magma_int_t* ipiv, + c10::complex* dB, magma_int_t lddb, magma_int_t* info) { + MagmaStreamSyncGuard guard; + magma_zgetrs_gpu(MagmaNoTrans, n, nrhs, reinterpret_cast(dA), ldda, ipiv, reinterpret_cast(dB), lddb, info); + AT_CUDA_CHECK(cudaGetLastError()); +} + +template<> +void magmaLuSolve>( + magma_int_t n, magma_int_t nrhs, c10::complex* dA, magma_int_t ldda, magma_int_t* ipiv, + c10::complex* dB, magma_int_t lddb, magma_int_t* info) { + MagmaStreamSyncGuard guard; + magma_cgetrs_gpu(MagmaNoTrans, n, nrhs, reinterpret_cast(dA), ldda, ipiv, reinterpret_cast(dB), lddb, info); + AT_CUDA_CHECK(cudaGetLastError()); +} template<> void magmaLuSolveBatched( @@ -1043,6 +1060,24 @@ void magmaLuSolveBatched( info = magma_sgetrs_batched(MagmaNoTrans, n, nrhs, dA_array, ldda, dipiv_array, dB_array, lddb, batchsize, magma_queue.get_queue()); AT_CUDA_CHECK(cudaGetLastError()); } + +template<> +void magmaLuSolveBatched>( + magma_int_t n, magma_int_t nrhs, c10::complex** dA_array, magma_int_t ldda, magma_int_t** dipiv_array, + c10::complex** dB_array, magma_int_t lddb, magma_int_t& info, + magma_int_t batchsize, const MAGMAQueue& magma_queue) { + info = magma_zgetrs_batched(MagmaNoTrans, n, nrhs, reinterpret_cast(dA_array), ldda, dipiv_array, reinterpret_cast(dB_array), lddb, batchsize, 
magma_queue.get_queue()); + AT_CUDA_CHECK(cudaGetLastError()); +} + +template<> +void magmaLuSolveBatched>( + magma_int_t n, magma_int_t nrhs, c10::complex** dA_array, magma_int_t ldda, magma_int_t** dipiv_array, + c10::complex** dB_array, magma_int_t lddb, magma_int_t& info, + magma_int_t batchsize, const MAGMAQueue& magma_queue) { + info = magma_cgetrs_batched(MagmaNoTrans, n, nrhs, reinterpret_cast(dA_array), ldda, dipiv_array, reinterpret_cast(dB_array), lddb, batchsize, magma_queue.get_queue()); + AT_CUDA_CHECK(cudaGetLastError()); +} #endif #define ALLOCATE_ARRAY(name, type, size) \ @@ -2149,7 +2184,7 @@ Tensor _lu_solve_helper_cuda(const Tensor& self, const Tensor& LU_data, const Te if (self.numel() == 0 || LU_data.numel() == 0) { return at::zeros_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); } - AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "lu_solve_cuda", [&]{ + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "lu_solve_cuda", [&]{ apply_lu_solve(self_working_copy, LU_data_working_copy, LU_pivots_working_copy, info); }); TORCH_CHECK(info == 0, "MAGMA lu_solve : invalid argument: ", -info); diff --git a/aten/src/ATen/native/cuda/BinaryRemainderKernel.cu b/aten/src/ATen/native/cuda/BinaryRemainderKernel.cu index 04cf2cc0c7e8..db43b0552b70 100644 --- a/aten/src/ATen/native/cuda/BinaryRemainderKernel.cu +++ b/aten/src/ATen/native/cuda/BinaryRemainderKernel.cu @@ -36,6 +36,27 @@ void remainder_kernel_cuda(TensorIterator& iter) { } } +void fmod_kernel_cuda(TensorIterator& iter) { + // Use the dtype of the first argument to retain BC, + // change to common_dtype for type promotion in the future + // Issue #47779: https://github.com/pytorch/pytorch/issues/47779 + if (isIntegralType(iter.dtype(), /*includeBool*/ false)) { + AT_DISPATCH_INTEGRAL_TYPES(iter.dtype(), "fmod_cuda", [&]() { + gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + return a % b; + }); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.dtype(), "fmod_cuda", [&]() { + gpu_kernel_with_scalars(iter, + []GPU_LAMBDA(scalar_t a, scalar_t b) __ubsan_ignore_float_divide_by_zero__ -> scalar_t { + return ::fmod(a, b); + }); + }); + } +} + REGISTER_DISPATCH(remainder_stub, &remainder_kernel_cuda); +REGISTER_DISPATCH(fmod_stub, &fmod_kernel_cuda); }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Bucketization.cu b/aten/src/ATen/native/cuda/Bucketization.cu index dad691981ddb..e28e7414aac6 100644 --- a/aten/src/ATen/native/cuda/Bucketization.cu +++ b/aten/src/ATen/native/cuda/Bucketization.cu @@ -86,7 +86,7 @@ void searchsorted_cuda_contiguous(Tensor& result, const Tensor& input, const Ten searchsorted_cuda_kernel<<>>( data_out, data_in, data_bd, idim_in, idim_bd, numel_in, right, boundaries.dim() == 1); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } void dispatch(Tensor& result, const Tensor& input, const Tensor& boundaries, bool out_int32, bool right) { diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu index d0b8c40ee4dc..d92a0d6fd1ff 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool2d.cu @@ -413,7 +413,7 @@ void max_pool2d_with_indices_out_cuda_template( in_stride_h, in_stride_w, kernel_stride_C, kernel_size_C, output_data, indices_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } case MemoryFormat::Contiguous: { @@ -425,7 +425,7 @@ void max_pool2d_with_indices_out_cuda_template( nbatch, 
nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, kH, kW, dH, dW, padH, padW, dilationH, dilationW, output_data, indices_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } default: TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); @@ -583,7 +583,7 @@ void max_pool2d_with_indices_backward_out_cuda_template( in_stride_h, in_stride_w, kernel_stride_C, kernel_size_C, gradInput_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } case MemoryFormat::Contiguous: { @@ -607,7 +607,7 @@ void max_pool2d_with_indices_backward_out_cuda_template( nInputPlane, inputHeight, inputWidth, outputHeight, outputWidth, kH, kW, dH, dW, padH, padW, dilationH, dilationW, gradInput_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } default: TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); diff --git a/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu b/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu index bbafebacbf13..e6eacbb8424e 100644 --- a/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu +++ b/aten/src/ATen/native/cuda/DilatedMaxPool3d.cu @@ -112,7 +112,7 @@ void max_pool3d_with_indices_out_frame( pT, pH, pW, dilationT, dilationH, dilationW, offsetZ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); totalZ -= 65535; offsetZ += 65535; @@ -177,7 +177,7 @@ void max_pool3d_with_indices_backward_out_frame( pT, pH, pW, dilationT, dilationH, dilationW, offsetZ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); totalZ -= 65535; offsetZ += 65535; diff --git a/aten/src/ATen/native/cuda/DistanceKernel.cu b/aten/src/ATen/native/cuda/DistanceKernel.cu index 60a2c943e742..515388a0fe3e 100644 --- a/aten/src/ATen/native/cuda/DistanceKernel.cu +++ b/aten/src/ATen/native/cuda/DistanceKernel.cu @@ -231,19 +231,19 @@ void cdist_kernel_impl(Tensor& result, const Tensor& x1, const Tensor& x2, doubl AT_DISPATCH_FLOATING_TYPES(x1.scalar_type(), "cdist_cuda", [&] { if (p == 0.0) { cdist_kernel_cuda_impl::zero><<>>(result.data_ptr(), x1.data_ptr(), x2.data_ptr(), p, r1, r2, m, r_size, l1_size, l2_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (p == 1.0) { cdist_kernel_cuda_impl::one><<>>(result.data_ptr(), x1.data_ptr(), x2.data_ptr(), p, r1, r2, m, r_size, l1_size, l2_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (p == 2.0) { cdist_kernel_cuda_impl::two><<>>(result.data_ptr(), x1.data_ptr(), x2.data_ptr(), p, r1, r2, m, r_size, l1_size, l2_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (std::isinf(p)) { cdist_kernel_cuda_impl::inf><<>>(result.data_ptr(), x1.data_ptr(), x2.data_ptr(), p, r1, r2, m, r_size, l1_size, l2_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { cdist_kernel_cuda_impl::p><<>>(result.data_ptr(), x1.data_ptr(), x2.data_ptr(), p, r1, r2, m, r_size, l1_size, l2_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } }); } @@ -261,19 +261,19 @@ void pdist_forward_kernel_impl(Tensor& result, const Tensor& self, double p) { AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "pdist_cuda", [&] { if (p == 0.0) { pdist_kernel_cuda_impl::zero><<>>(result.data_ptr(), self.data_ptr(), n, m, p, n2, n2_squared_minus_1); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (p == 1.0) { 
pdist_kernel_cuda_impl::one><<>>(result.data_ptr(), self.data_ptr(), n, m, p, n2, n2_squared_minus_1); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (p == 2.0) { pdist_kernel_cuda_impl::two><<>>(result.data_ptr(), self.data_ptr(), n, m, p, n2, n2_squared_minus_1); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (std::isinf(p)) { pdist_kernel_cuda_impl::inf><<>>(result.data_ptr(), self.data_ptr(), n, m, p, n2, n2_squared_minus_1); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { pdist_kernel_cuda_impl::p><<>>(result.data_ptr(), self.data_ptr(), n, m, p, n2, n2_squared_minus_1); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } }); } @@ -303,19 +303,19 @@ void pdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "pdist_cuda_backward", [&] { if (p == 1.0) { pdist_backward_kernel_cuda_impl::one><<>>(buffer.data_ptr(), grad.data_ptr(), self.data_ptr(), dist.data_ptr(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (p < 2.0) { pdist_backward_kernel_cuda_impl::lt_two><<>>(buffer.data_ptr(), grad.data_ptr(), self.data_ptr(), dist.data_ptr(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (p == 2.0) { pdist_backward_kernel_cuda_impl::two><<>>(buffer.data_ptr(), grad.data_ptr(), self.data_ptr(), dist.data_ptr(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (std::isinf(p)) { pdist_backward_kernel_cuda_impl::inf><<>>(buffer.data_ptr(), grad.data_ptr(), self.data_ptr(), dist.data_ptr(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { pdist_backward_kernel_cuda_impl::p><<>>(buffer.data_ptr(), grad.data_ptr(), self.data_ptr(), dist.data_ptr(), grad.stride(0), n, m, dist.numel(), p, n2, n2_squared_minus_1); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } }); @@ -354,27 +354,27 @@ void cdist_backward_kernel_impl(Tensor& result, const Tensor& grad, const Tensor cdist_backward_kernel_cuda_impl::one><<>>(buffer.data_ptr(), grad.data_ptr(), x1.data_ptr(), x2.data_ptr(), dist.data_ptr(), gs, p, r1, r2, m, count, r_size, l1_size, l2_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (p < 2.0) { cdist_backward_kernel_cuda_impl::lt_two><<>>(buffer.data_ptr(), grad.data_ptr(), x1.data_ptr(), x2.data_ptr(), dist.data_ptr(), gs, p, r1, r2, m, count, r_size, l1_size, l2_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (p == 2.0) { cdist_backward_kernel_cuda_impl::two><<>>(buffer.data_ptr(), grad.data_ptr(), x1.data_ptr(), x2.data_ptr(), dist.data_ptr(), gs, p, r1, r2, m, count, r_size, l1_size, l2_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (std::isinf(p)) { cdist_backward_kernel_cuda_impl::inf><<>>(buffer.data_ptr(), grad.data_ptr(), x1.data_ptr(), x2.data_ptr(), dist.data_ptr(), gs, p, r1, r2, m, count, r_size, l1_size, l2_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { cdist_backward_kernel_cuda_impl::p><<>>(buffer.data_ptr(), grad.data_ptr(), x1.data_ptr(), x2.data_ptr(), 
dist.data_ptr(), gs, p, r1, r2, m, count, r_size, l1_size, l2_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } }); diff --git a/aten/src/ATen/native/cuda/Dropout.cu b/aten/src/ATen/native/cuda/Dropout.cu index 8301f0564572..67adbaabbb84 100644 --- a/aten/src/ATen/native/cuda/Dropout.cu +++ b/aten/src/ATen/native/cuda/Dropout.cu @@ -243,7 +243,7 @@ inline void launcher( nelem, pa, rng_engine_inputs); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; case 2: fused_dropout_kernel_vec< @@ -259,7 +259,7 @@ inline void launcher( nelem, pa, rng_engine_inputs); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; } } else { @@ -273,7 +273,7 @@ inline void launcher( nelem, pa, rng_engine_inputs); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); break; default: if (!self.is_contiguous() && ret.is_contiguous() && @@ -289,7 +289,7 @@ inline void launcher( nelem, pa, rng_engine_inputs); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { fused_dropout_kernel <<(num_indices), static_cast(stride), static_cast(padding_idx)); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); }); @@ -370,7 +370,7 @@ Tensor & embedding_renorm_cuda_(Tensor & self, const Tensor & indices, static_cast(max_norm), static_cast(norm_type), dim, self.stride(0), self.stride(1)); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); }); diff --git a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu index 3931edeb6f12..689db4347067 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBackwardKernel.cu @@ -235,7 +235,7 @@ Tensor embedding_backward_cuda_kernel( segment_offsets.data_ptr(), num_of_segments, numel); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } // In order to compute `partial_segment_offset`, which is the start index @@ -263,7 +263,7 @@ Tensor embedding_backward_cuda_kernel( partials_per_segment_offset.data_ptr(), segment_offsets.data_ptr(), num_of_segments); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } const int stride_warped = ceil_div(stride, C10_WARP_SIZE)*C10_WARP_SIZE; @@ -296,7 +296,7 @@ Tensor embedding_backward_cuda_kernel( partial_segment_offset.data_ptr(), num_of_partial_segments, grad_weight_per_segment.data_ptr(), stride_warped); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { compute_grad_weight<<>>( orig_indices.data_ptr(), @@ -307,7 +307,7 @@ Tensor embedding_backward_cuda_kernel( num_of_partial_segments, grad_weight_per_segment.data_ptr(), stride_warped); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } // Finally, we sum all the partial-sums and scatter them @@ -323,7 +323,7 @@ Tensor embedding_backward_cuda_kernel( num_of_partial_segments, padding_idx, stride_warped); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); }); diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index 1ace60f40600..5bed5532baee 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -244,7 +244,7 @@ Tensor embedding_bag_backward_cuda_max(const Tensor &grad, scalar_t, index_t><<>>( max_indices.data_ptr(), grad.data_ptr(), grad_weight.data_ptr(), stride, numBags); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + 
C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); @@ -335,7 +335,7 @@ _embedding_bag_cuda(const Tensor &weight, const Tensor &indices, mode == MODE_MAX ? max_indices.data_ptr() : NULL, per_sample_weights.defined() ? per_sample_weights.data_ptr() : NULL, per_sample_weights.defined() ? per_sample_weights.stride(0) : 0); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); }); }); @@ -475,7 +475,7 @@ Tensor _embedding_bag_per_sample_weights_backward_cuda( num_samples, embedding_features, output.data_ptr()); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } ); diff --git a/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu b/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu index 9a9586bdc160..bee3cfa4d436 100644 --- a/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu @@ -205,7 +205,7 @@ void fractional_max_pool2d_out_cuda_template( <<>>( devOutput, devIndices, devInput, devSamples, poolSizeH, poolSizeW); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); } @@ -272,7 +272,7 @@ void fractional_max_pool2d_backward_out_cuda_template( fractional_max_pool2d_backward_out_cuda_frame <<>>( devGradInput, devGradOutput, devIndices); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); } diff --git a/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu b/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu index b4aebc4b1e4c..0d492de48570 100644 --- a/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu +++ b/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu @@ -241,7 +241,7 @@ void fractional_max_pool3d_out_cuda_template( randomSamples.packed_accessor64(), poolSizeT, poolSizeH, poolSizeW ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); } @@ -327,7 +327,7 @@ void fractional_max_pool3d_backward_out_cuda_template( gradOutput_.packed_accessor64(), indices_.packed_accessor64() ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } ); } diff --git a/aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu b/aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu index 3168698848ad..180385aaf052 100644 --- a/aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu +++ b/aten/src/ATen/native/cuda/FunctionOfAMatrixUtilsKernel.cu @@ -37,7 +37,7 @@ void _lauch_kernel(int total_n_elems, const func_t& f) { auto stream = at::cuda::getCurrentCUDAStream(); _elemwise_kernel <<>>(total_n_elems, f); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template diff --git a/aten/src/ATen/native/cuda/GridSampler.cu b/aten/src/ATen/native/cuda/GridSampler.cu index 273f0af25c8b..a08c13037e34 100644 --- a/aten/src/ATen/native/cuda/GridSampler.cu +++ b/aten/src/ATen/native/cuda/GridSampler.cu @@ -708,7 +708,7 @@ Tensor grid_sampler_2d_cuda(const Tensor& input, const Tensor& grid, static_cast(interpolation_mode), static_cast(padding_mode), align_corners); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { grid_sampler_2d_kernel <<>>( @@ -719,7 +719,7 @@ Tensor grid_sampler_2d_cuda(const Tensor& input, const Tensor& grid, static_cast(interpolation_mode), static_cast(padding_mode), align_corners); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } }); } @@ -749,7 +749,7 @@ Tensor grid_sampler_3d_cuda(const Tensor& input, const Tensor& grid, static_cast(interpolation_mode), static_cast(padding_mode), align_corners); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { 
grid_sampler_3d_kernel <<>>( @@ -760,7 +760,7 @@ Tensor grid_sampler_3d_cuda(const Tensor& input, const Tensor& grid, static_cast(interpolation_mode), static_cast(padding_mode), align_corners); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } }); } @@ -796,7 +796,7 @@ grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input, static_cast(interpolation_mode), static_cast(padding_mode), align_corners); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { grid_sampler_2d_backward_kernel <<>>( @@ -809,7 +809,7 @@ grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input, static_cast(interpolation_mode), static_cast(padding_mode), align_corners); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } }); } @@ -846,7 +846,7 @@ grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input, static_cast(interpolation_mode), static_cast(padding_mode), align_corners); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { grid_sampler_3d_backward_kernel <<>>( @@ -859,7 +859,7 @@ grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input, static_cast(interpolation_mode), static_cast(padding_mode), align_corners); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } }); } diff --git a/aten/src/ATen/native/cuda/IndexKernel.cu b/aten/src/ATen/native/cuda/IndexKernel.cu index be2c477d4cdc..7d7a59b32406 100644 --- a/aten/src/ATen/native/cuda/IndexKernel.cu +++ b/aten/src/ATen/native/cuda/IndexKernel.cu @@ -90,7 +90,7 @@ static void launch_kernel(int64_t N, const func_t& f) { dim3 grid((N + block.x * vt - 1) / (block.x * vt)); auto stream = at::cuda::getCurrentCUDAStream(); index_elementwise_kernel<<>>(N, f); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template diff --git a/aten/src/ATen/native/cuda/LegacyDefinitions.cpp b/aten/src/ATen/native/cuda/LegacyDefinitions.cpp index 92895d947758..1bbe47dbfb2e 100644 --- a/aten/src/ATen/native/cuda/LegacyDefinitions.cpp +++ b/aten/src/ATen/native/cuda/LegacyDefinitions.cpp @@ -76,50 +76,4 @@ Tensor & masked_scatter__cuda(Tensor& self, const Tensor & mask, const Tensor & } } -Tensor & fmod_cuda_out(Tensor & result, const Tensor & self, Scalar other) { - at::assert_no_internal_overlap(result); - return legacy::cuda::_th_fmod_out(result, self, other); -} - -Tensor fmod_cuda(const Tensor & self, Scalar other) { - return legacy::cuda::_th_fmod(self, other); -} - -Tensor & fmod_cuda_out(Tensor & result, const Tensor & self, const Tensor & other) { - at::assert_no_internal_overlap(result); - Tensor b_self, b_other; - // optimization that codegen used to do; avoids broadcast. - if (other.dim() == 0) { - return fmod_cuda_out(result, self, other.item()); - } - std::tie(b_self, b_other) = expand_outplace(self, other, "fmod_out"); - return legacy::cuda::_th_fmod_out(result, b_self, b_other); -} - -Tensor fmod_cuda(const Tensor & self, const Tensor & other) { - // optimization that codegen used to do; avoids broadcast. 
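Editor's note: the deletions in `LegacyDefinitions.cpp` here remove the TH-backed `fmod_cuda*` wrappers; CUDA `fmod` now goes through the TensorIterator-based `fmod_kernel_cuda` registered on `fmod_stub` in `BinaryRemainderKernel.cu` above, mirroring the CPU kernel, and scalar divisors are handled by the same kernel (via `gpu_kernel_with_scalars`) instead of a separate `fmod_scalar_kernel`. Both kernels deliberately dispatch on `iter.dtype()`, i.e. the first argument's dtype, to preserve backward compatibility until the type-promotion change tracked in issue #47779. As a simplified illustration (not the literal operator definition; `fmod_out_sketch` is a hypothetical name, and the real operator's dtype configuration is not shown in this hunk), the wiring that replaces the removed wrappers amounts to building an iterator and invoking the shared stub:

```cpp
#include <ATen/ATen.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/BinaryOps.h>

// Hedged sketch of the stub-based path that supersedes the legacy TH wrappers.
at::Tensor& fmod_out_sketch(at::Tensor& result, const at::Tensor& self, const at::Tensor& other) {
  // Build an iterator over (result, self, other); no promotion flags are set
  // here because this hunk does not show how the real operator configures them.
  auto iter = at::TensorIteratorConfig()
                  .add_output(result)
                  .add_input(self)
                  .add_input(other)
                  .build();
  at::native::fmod_stub(iter.device_type(), iter);  // resolves to the CPU or CUDA kernel at runtime
  return result;
}
```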
- if (other.dim() == 0) { - return fmod_cuda(self, other.item()); - } - Tensor b_self, b_other; - std::tie(b_self, b_other) = expand_outplace(self, other, "fmod"); - return legacy::cuda::_th_fmod(b_self, b_other); -} - -Tensor & fmod_cuda_(Tensor & self, Scalar other) { - at::assert_no_internal_overlap(self); - return legacy::cuda::_th_fmod_(self, other); -} - -Tensor & fmod_cuda_(Tensor & self, const Tensor & other) { - // optimization that codegen used to do; avoids broadcast. - if (other.dim() == 0) { - return fmod_cuda_(self, other.item()); - } - at::assert_no_internal_overlap(self); - Tensor b_other; - std::tie(b_other) = expand_inplace(self, other, "fmod_"); - return legacy::cuda::_th_fmod_(self, b_other); -} - }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Loops.cuh b/aten/src/ATen/native/cuda/Loops.cuh index ef4e1f618a59..fb1f8e6720b4 100644 --- a/aten/src/ATen/native/cuda/Loops.cuh +++ b/aten/src/ATen/native/cuda/Loops.cuh @@ -183,7 +183,7 @@ static inline void launch_unrolled_kernel_for_multi_outputs(int64_t N, const fun int64_t grid = (N + block_work_size - 1) / block_work_size; auto stream = at::cuda::getCurrentCUDAStream(); unrolled_elementwise_kernel_for_multi_outputs<<>>(N, f, data, ic, oc); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template diff --git a/aten/src/ATen/native/cuda/LossCTC.cu b/aten/src/ATen/native/cuda/LossCTC.cu index 29ff6e6a47ad..69718b206d6b 100644 --- a/aten/src/ATen/native/cuda/LossCTC.cu +++ b/aten/src/ATen/native/cuda/LossCTC.cu @@ -281,7 +281,7 @@ std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const log_alpha.stride(0), log_alpha.stride(1), log_alpha.stride(2), tg_batch_offsets.data_ptr(), tg_target_stride, batch_size, BLANK); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return std::make_tuple(neg_log_likelihood, log_alpha); } @@ -633,7 +633,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), tg_batch_offsets.data_ptr(), tg_target_stride, batch_size, BLANK); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } // Very crude heuristic for what is a small problem., based on linearly regressing problem dimensions on @@ -690,7 +690,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), tg_batch_offsets.data_ptr(), tg_target_stride, batch_size, num_labels, BLANK, zero_infinity); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { // small problem, use naive algorithm // Still no block/grid configuration guru... 
int threads_input = max_threads; @@ -713,7 +713,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ log_beta.stride(0), log_beta.stride(1), log_beta.stride(2), tg_batch_offsets.data_ptr(), tg_target_stride, batch_size, num_labels, BLANK, zero_infinity); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); // catch launch errors + C10_CUDA_KERNEL_LAUNCH_CHECK(); // catch launch errors } // zero those invalid graident elements due to padding @@ -737,7 +737,7 @@ Tensor ctc_loss_backward_gpu_template(const Tensor& grad_out, const Tensor& log_ grad.size(1), grad.size(2) ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } return grad; diff --git a/aten/src/ATen/native/cuda/MaxUnpooling.cu b/aten/src/ATen/native/cuda/MaxUnpooling.cu index 81a634c2fd47..c3517ab49d1c 100644 --- a/aten/src/ATen/native/cuda/MaxUnpooling.cu +++ b/aten/src/ATen/native/cuda/MaxUnpooling.cu @@ -169,7 +169,7 @@ Tensor& max_unpooling2d_forward_out_cuda( oheight, owidth, output.data_ptr()); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); })); if (self.ndimension() == 3) { output.resize_({numChannels, oheight, owidth}); @@ -343,7 +343,7 @@ Tensor& max_unpooling3d_forward_out_cuda( oH, oW, offsetZ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); totalZ -= 65535; offsetZ += 65535; } @@ -446,7 +446,7 @@ at::Tensor& max_unpooling2d_backward_out_cuda( oheight, owidth, grad_input.data_ptr()); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); })); return grad_input; } @@ -550,7 +550,7 @@ at::Tensor& max_unpooling3d_backward_out_cuda( indices.packed_accessor64(), grad_input_reshaped.packed_accessor64(), offsetZ); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); totalZ -= 65535; offsetZ += 65535; } diff --git a/aten/src/ATen/native/cuda/MultiTensorApply.cuh b/aten/src/ATen/native/cuda/MultiTensorApply.cuh index ade7c09e4951..8b300483c945 100644 --- a/aten/src/ATen/native/cuda/MultiTensorApply.cuh +++ b/aten/src/ATen/native/cuda/MultiTensorApply.cuh @@ -96,7 +96,7 @@ void multi_tensor_apply( tensorListMeta, callable, args...); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); // Reset. loc_block_info = 0; @@ -152,7 +152,7 @@ void multi_tensor_apply( tensorListMeta, callable, args...); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); // Reset. 
loc_block_info = 0; diff --git a/aten/src/ATen/native/cuda/TensorTransformations.cu b/aten/src/ATen/native/cuda/TensorTransformations.cu index 147f7f3fad6f..a435c7060f45 100644 --- a/aten/src/ATen/native/cuda/TensorTransformations.cu +++ b/aten/src/ATen/native/cuda/TensorTransformations.cu @@ -95,7 +95,7 @@ Tensor flip_cuda(const Tensor& self, IntArrayRef dims) { kernel_pointwise_flip_apply2 <<>>( in_tensor_info, out_tensor_info, N, flip_dim, total_dims); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); return out_tensor; } @@ -132,7 +132,7 @@ Tensor flip_cuda(const Tensor& self, IntArrayRef dims) { stride_contiguous.cuda().data_ptr(), shape_t.cuda().data_ptr(), total_dims); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); return out_tensor; @@ -197,7 +197,7 @@ Tensor roll_cuda(const Tensor& self, IntArrayRef shifts, IntArrayRef dims) { size, in_tensor.stride(dim), total_dims); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); return out_tensor; diff --git a/aten/src/ATen/native/cuda/TriangularOps.cu b/aten/src/ATen/native/cuda/TriangularOps.cu index 6bb2ae79ffa9..6ba73e1c143e 100644 --- a/aten/src/ATen/native/cuda/TriangularOps.cu +++ b/aten/src/ATen/native/cuda/TriangularOps.cu @@ -67,14 +67,14 @@ Tensor& triu_tril_cuda_template(Tensor& result, const Tensor& self, int64_t k, c triu_tril_kernel <<>>( result_info, self_info, k, N); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { auto result_info = cuda::detail::getTensorInfo(result); auto self_info = cuda::detail::getTensorInfo(self); triu_tril_kernel <<>>( result_info, self_info, k, N); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } }); return result; @@ -192,7 +192,7 @@ Tensor& apply_diag(Tensor& result, const Tensor& self, int64_t dimension) { sz, self_stride_0 + self_stride_1, result_stride); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } else { auto n_elems = self.numel(); @@ -221,7 +221,7 @@ Tensor& apply_diag(Tensor& result, const Tensor& self, int64_t dimension) { n_elems, result_stride_0 + result_stride_1, self_stride); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } diff --git a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu index e0a27adbd985..512154fd02df 100644 --- a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu @@ -95,7 +95,7 @@ void sqrt_kernel_cuda(TensorIterator& iter) { } void sigmoid_kernel_cuda(TensorIterator& iter) { - AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "sigmoid_cuda", [&]() { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.common_dtype(), "sigmoid_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { scalar_t one = scalar_t(1); return one / (one + std::exp(- a)); diff --git a/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu b/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu index 1884b09a4fab..64bda79809bb 100644 --- a/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu +++ b/aten/src/ATen/native/cuda/UnfoldBackwardKernel.cu @@ -45,7 +45,7 @@ static void _launch_unfold_backward_kernel(int total_n_elems, func_t f) { auto stream = at::cuda::getCurrentCUDAStream(); _unfold_backward_elementwise_kernel <<>>(total_n_elems, f); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template diff --git 
a/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu b/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu index 53af1d463606..13f0741bb5da 100644 --- a/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu @@ -228,7 +228,7 @@ static void upsample_bicubic2d_out_cuda_template( align_corners, idata, odata); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } @@ -302,7 +302,7 @@ static void upsample_bicubic2d_backward_out_cuda_template( 0, stream>>>( num_kernels, rheight, rwidth, align_corners, idata, odata); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } diff --git a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu index 248d972bb320..4b142d5024d8 100644 --- a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu @@ -213,7 +213,7 @@ static void upsample_bilinear2d_out_cuda_template( 0, stream>>>( num_kernels, rheight, rwidth, align_corners, idata, odata); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } @@ -305,7 +305,7 @@ static void upsample_bilinear2d_backward_out_cuda_template( align_corners, idata, odata); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } diff --git a/aten/src/ATen/native/cuda/UpSampleLinear1d.cu b/aten/src/ATen/native/cuda/UpSampleLinear1d.cu index 08824565b150..eda43fbfa398 100644 --- a/aten/src/ATen/native/cuda/UpSampleLinear1d.cu +++ b/aten/src/ATen/native/cuda/UpSampleLinear1d.cu @@ -160,7 +160,7 @@ static void upsample_linear1d_out_cuda_template( num_threads, 0, stream>>>(num_kernels, rwidth, align_corners, idata, odata); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } @@ -220,7 +220,7 @@ static void upsample_linear1d_backward_out_cuda_template( num_threads, 0, stream>>>(num_kernels, rwidth, align_corners, idata, odata); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } diff --git a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu index 425d450b375f..ef287ca592da 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu @@ -128,7 +128,7 @@ static void upsample_nearest1d_out_cuda_template( upsample_nearest1d_out_frame<<>>( idata, nbatch, channels, input_width, output_width, odata, scale_factor); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } @@ -190,7 +190,7 @@ static void upsample_nearest1d_backward_out_cuda_template( upsample_nearest1d_backward_out_frame <<>>( odata, nbatch, channels, output_width, input_width, idata, scale_factor); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } diff --git a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu index a7f935e5f681..0ac02e292b28 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu @@ -204,7 +204,7 @@ static void upsample_nearest2d_out_cuda_template( output_width, height_scale, width_scale); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } @@ -286,7 +286,7 @@ static void upsample_nearest2d_backward_out_cuda_template( idata, height_scale, width_scale); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } diff --git a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu index 
820358152351..000e116e7bdf 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu @@ -199,7 +199,7 @@ static void upsample_nearest3d_out_cuda_template( depth_scale, height_scale, width_scale); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } @@ -291,7 +291,7 @@ static void upsample_nearest3d_backward_out_cuda_template( depth_scale, height_scale, width_scale); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } diff --git a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu index cf623723eaaa..bfd191977c4d 100644 --- a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu +++ b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu @@ -271,7 +271,7 @@ static void upsample_trilinear3d_out_cuda_template( align_corners, idata, odata); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } @@ -360,7 +360,7 @@ static void upsample_trilinear3d_backward_out_cuda_template( align_corners, idata, odata); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); }); } diff --git a/aten/src/ATen/native/cuda/group_norm_kernel.cu b/aten/src/ATen/native/cuda/group_norm_kernel.cu index 6cb9351548fa..1fd710a65e9f 100644 --- a/aten/src/ATen/native/cuda/group_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/group_norm_kernel.cu @@ -570,7 +570,7 @@ void GroupNormKernelImplInternal( : cuda_utils::kCUDABlockReduceNumThreads; RowwiseMomentsCUDAKernel<<>>( D * HxW, eps, X_data, mean_data, rstd_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); if (HxW == 1) { GroupNorm1dForward(X, mean, rstd, gamma, beta, N, C, G, Y); @@ -604,7 +604,7 @@ void GroupNormKernelImplInternal( const int64_t B = (N * C + kCUDANumThreads - 1) / kCUDANumThreads; ComputeFusedParamsCUDAKernel<<>>( N, C, G, mean_data, rstd_data, gamma_data, beta_data, a_data, b_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); auto iter = TensorIteratorConfig() .check_all_same_dtype(std::is_same::value) .resize_outputs(false) @@ -698,7 +698,7 @@ void GroupNorm1dBackward( gamma_data, c2_data, c3_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); if (gamma.defined()) { auto iter = TensorIteratorConfig() @@ -754,7 +754,7 @@ void GroupNorm1dBackward( rstd_data, dgamma_data, dbeta_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { const int64_t B = (C + kReduceTileSize - 1) / kReduceTileSize; // The algorithm for colwise reduction here is to accumulate each 32 cols @@ -773,7 +773,7 @@ void GroupNorm1dBackward( rstd_data, dgamma_data, dbeta_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } } @@ -837,7 +837,7 @@ void GroupNormBackwardKernelImplInternal( : cuda_utils::kCUDABlockReduceNumThreads; ComputeInternalGradientsCUDAKernel<<>>( HxW, dY_data, X_data, ds_data, db_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); if (dX.defined()) { Tensor c1 = at::empty({0}, X.options().dtype(kAccType)); @@ -873,7 +873,7 @@ void GroupNormBackwardKernelImplInternal( db_data, c2_data, c3_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); if (gamma.defined()) { auto iter = TensorIteratorConfig() @@ -925,7 +925,7 @@ void GroupNormBackwardKernelImplInternal( db_data, dgamma_data, dbeta_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { const int64_t B = (C + kReduceTileSize - 1) / 
kReduceTileSize; // The algorithm for colwise reduction here is to accumulate each 32 cols @@ -944,7 +944,7 @@ void GroupNormBackwardKernelImplInternal( db_data, dgamma_data, dbeta_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } } diff --git a/aten/src/ATen/native/cuda/im2col.cuh b/aten/src/ATen/native/cuda/im2col.cuh index e0eada961f24..aee072fcea82 100644 --- a/aten/src/ATen/native/cuda/im2col.cuh +++ b/aten/src/ATen/native/cuda/im2col.cuh @@ -108,7 +108,7 @@ void im2col( height_col, width_col, data_col); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template @@ -208,7 +208,7 @@ void col2im( output_height, output_width, data_im); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } // namespace native diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu index 7c5a08e9ede7..817001e126ae 100644 --- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu @@ -278,10 +278,10 @@ void LayerNormKernelImplInternal( RowwiseMomentsCUDAKernel <<>>( N, eps, X_data, mean_data, rstd_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); LayerNormForwardCUDAKernel<<>>( N, X_data, mean_data, rstd_data, gamma_data, beta_data, Y_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } void LayerNormKernelImpl( @@ -340,7 +340,7 @@ void LayerNormBackwardKernelImplInternal( ComputeInternalGradientsCUDAKernel <<>>( N, dY_data, X_data, gamma_data, ds_data, db_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; ComputeGradientFusedParamsCUDAKernel <<>>( @@ -352,7 +352,7 @@ void LayerNormBackwardKernelImplInternal( db_data, scale_data, bias_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); LayerNormBackwardCUDAKenrel<<>>( N, dY_data, @@ -362,7 +362,7 @@ void LayerNormBackwardKernelImplInternal( scale_data, bias_data, dX_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } if (dgamma->defined() || dbeta->defined()) { T* dgamma_data = @@ -381,7 +381,7 @@ void LayerNormBackwardKernelImplInternal( rstd_data, dgamma_data, dbeta_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { const int64_t B = (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; @@ -397,7 +397,7 @@ void LayerNormBackwardKernelImplInternal( rstd_data, dgamma_data, dbeta_data); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } } diff --git a/aten/src/ATen/native/cuda/vol2col.cuh b/aten/src/ATen/native/cuda/vol2col.cuh index ec97888a5c47..960b44e6d106 100644 --- a/aten/src/ATen/native/cuda/vol2col.cuh +++ b/aten/src/ATen/native/cuda/vol2col.cuh @@ -129,7 +129,7 @@ void vol2col( height_col, width_col, data_col); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template @@ -264,7 +264,7 @@ void col2vol( output_height, output_width, data_vol); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } // namespace native diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 2d6e570d25c8..3cc6f3a93f5a 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1693,6 +1693,7 @@ CPU: resize_ CUDA: resize_cuda_ QuantizedCPU: quantized_resize_cpu_ + Meta: resize_meta_ - func: empty_quantized(int[] 
size, Tensor qtensor) -> Tensor use_c10_dispatcher: full @@ -5587,15 +5588,13 @@ use_c10_dispatcher: full variants: method dispatch: - CPU: fmod_ - CUDA: fmod_cuda_ + CPU, CUDA: fmod_ - func: fmod_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) use_c10_dispatcher: full variants: method dispatch: - CPU: fmod_ - CUDA: fmod_cuda_ + CPU, CUDA: fmod_ - func: remainder_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) use_c10_dispatcher: full @@ -6581,27 +6580,23 @@ - func: fmod.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: fmod_out - CUDA: fmod_cuda_out + CPU, CUDA: fmod_out - func: fmod.Scalar(Tensor self, Scalar other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: fmod - CUDA: fmod_cuda + CPU, CUDA: fmod - func: fmod.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) dispatch: - CPU: fmod_out - CUDA: fmod_cuda_out + CPU, CUDA: fmod_out - func: fmod.Tensor(Tensor self, Tensor other) -> Tensor use_c10_dispatcher: full variants: method, function dispatch: - CPU: fmod - CUDA: fmod_cuda + CPU, CUDA: fmod - func: hypot.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) dispatch: @@ -9508,6 +9503,18 @@ dispatch: Math: linalg_tensorsolve_out +- func: linalg_matrix_rank(Tensor self, float? tol=None, bool hermitian=False) -> Tensor + python_module: linalg + variants: function + dispatch: + Math: linalg_matrix_rank + +- func: linalg_matrix_rank.out(Tensor self, float? tol=None, bool hermitian=False, *, Tensor(a!) out) -> Tensor(a!) + python_module: linalg + variants: function + dispatch: + Math: linalg_matrix_rank_out + ## Functions that are only for testing # It is undocumented and should not be used outside of tests. - func: _test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp index 1217a94d753b..1c52242641e7 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp @@ -31,10 +31,18 @@ at::Tensor embedding_bag_4bit_impl( // Get compressed indices for pruned_weights op. int32_t* compressed_indices_mapping_data = nullptr; int compressed_index_size = 0; + bool fallback_to_no_sparse = false; if (pruned_weights) { compressed_index_size = compressed_indices_mapping.value().numel(); compressed_indices_mapping_data = compressed_indices_mapping.value().data_ptr(); + + // if compressed_indices_mapping is [0], it is a indicator that + // we should fallback to non sparse embedding look up kernel. + if ((compressed_index_size == 1 && + compressed_indices_mapping_data[0] == 0)) { + fallback_to_no_sparse = true; + } } const int64_t N = weight.size(0); @@ -69,7 +77,7 @@ at::Tensor embedding_bag_4bit_impl( constexpr int prefetch_distance = 16; #ifdef USE_FBGEMM - if (!pruned_weights) { + if (!pruned_weights || fallback_to_no_sparse) { // Generate the fbgemm kernel auto kernel = fbgemm::GenerateEmbeddingSpMDMNBit( /*bit rate=*/4, @@ -209,10 +217,18 @@ at::Tensor embedding_bag_byte_impl( // Get compressed indices for pruned_weights. 
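The linalg_matrix_rank functions registered just above (Math dispatch plus an out= variant) back the new torch.linalg.matrix_rank documented later in this patch. A small usage sketch with a deliberately rank-deficient matrix:

    import torch

    A = torch.tensor([[1., 2.],
                      [2., 4.]])                        # second row = 2 * first row
    print(torch.linalg.matrix_rank(A))                  # tensor(1)

    S = A @ A.t()                                       # symmetric, so the hermitian path applies
    print(torch.linalg.matrix_rank(S, hermitian=True))  # also rank 1, via the eigenvalue-based path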
int32_t* compressed_indices_mapping_data = nullptr; int compressed_index_size = 0; + bool fallback_to_no_sparse = false; if (pruned_weights) { compressed_index_size = compressed_indices_mapping.value().numel(); compressed_indices_mapping_data = compressed_indices_mapping.value().data_ptr(); + + // if compressed_indices_mapping is [0], it is a indicator that + // we should fallback to non sparse embedding look up kernel. + if ((compressed_index_size == 1 && + compressed_indices_mapping_data[0] == 0)) { + fallback_to_no_sparse = true; + } } const int64_t N = weight.size(0); @@ -247,7 +263,7 @@ at::Tensor embedding_bag_byte_impl( const int index_size = indices.numel(); #ifdef USE_FBGEMM - if (!pruned_weights) { + if (!pruned_weights || fallback_to_no_sparse) { auto kernel_i8 = fbgemm::GenerateEmbeddingSpMDM( /*block_size=*/D, @@ -613,12 +629,12 @@ class QEmbeddingBag final { false /* is_embedding_op */); } else if (bit_rate == 4) { return packed_weight->embeddingbag_4bit( - indices, - offsets, - pruned_weights, - per_sample_weights_, - compressed_indices_mapping, - include_last_offset); + indices, + offsets, + pruned_weights, + per_sample_weights_, + compressed_indices_mapping, + include_last_offset); } else { TORCH_INTERNAL_ASSERT( "Currently only support 8-bit embedding_bag quantization"); diff --git a/aten/src/ATen/templates/Functions.cpp b/aten/src/ATen/templates/Functions.cpp index 81e2a9f6d406..37c2919bb458 100644 --- a/aten/src/ATen/templates/Functions.cpp +++ b/aten/src/ATen/templates/Functions.cpp @@ -7,6 +7,22 @@ namespace at { +Tensor var(const Tensor& self, int dim) { + return at::var(self, IntArrayRef{dim}); +} + +std::tuple var_mean(const Tensor& self, int dim) { + return at::var_mean(self, IntArrayRef{dim}); +} + +Tensor std(const Tensor& self, int dim) { + return at::std(self, IntArrayRef{dim}); +} + +std::tuple std_mean(const Tensor& self, int dim) { + return at::std_mean(self, IntArrayRef{dim}); +} + ${function_definitions} } diff --git a/aten/src/ATen/templates/Functions.h b/aten/src/ATen/templates/Functions.h index 5ec90e7e3d61..50623dc2dfed 100644 --- a/aten/src/ATen/templates/Functions.h +++ b/aten/src/ATen/templates/Functions.h @@ -7,7 +7,6 @@ #include #include #include -#include // TODO: try to delete this #include #include #include @@ -19,26 +18,38 @@ namespace at { +// These functions are defined in ATen/Utils.cpp. 
+#define TENSOR(T, S) \ + CAFFE2_API Tensor tensor(ArrayRef values, const TensorOptions& options); \ + inline Tensor tensor( \ + std::initializer_list values, const TensorOptions& options) { \ + return at::tensor(ArrayRef(values), options); \ + } \ + inline Tensor tensor(T value, const TensorOptions& options) { \ + return at::tensor(ArrayRef(value), options); \ + } \ + inline Tensor tensor(ArrayRef values) { \ + return at::tensor(std::move(values), at::dtype(k##S)); \ + } \ + inline Tensor tensor(std::initializer_list values) { \ + return at::tensor(ArrayRef(values)); \ + } \ + inline Tensor tensor(T value) { \ + return at::tensor(ArrayRef(value)); \ + } +AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TENSOR) +AT_FORALL_COMPLEX_TYPES(TENSOR) +#undef TENSOR + ${function_declarations} // Special C++ only overloads for std()-like functions (See gh-40287) // These are needed because int -> bool conversion takes precedence over int -> IntArrayRef // So, for example std(0) would select the std(unbiased=False) overload -inline Tensor var(const Tensor& self, int dim) { - return at::native::var(self, IntArrayRef{dim}); -} - -inline std::tuple var_mean(const Tensor& self, int dim) { - return at::native::var_mean(self, IntArrayRef{dim}); -} - -inline Tensor std(const Tensor& self, int dim) { - return at::native::std(self, IntArrayRef{dim}); -} - -inline std::tuple std_mean(const Tensor& self, int dim) { - return at::native::std_mean(self, IntArrayRef{dim}); -} +CAFFE2_API Tensor var(const Tensor& self, int dim); +CAFFE2_API std::tuple var_mean(const Tensor& self, int dim); +CAFFE2_API Tensor std(const Tensor& self, int dim); +CAFFE2_API std::tuple std_mean(const Tensor& self, int dim); namespace { inline std::vector zero_sizes(const TensorOptions& options) { diff --git a/aten/src/ATen/templates/NativeFunctions.h b/aten/src/ATen/templates/NativeFunctions.h index 0244efcb3a6b..b4cb31f60ee8 100644 --- a/aten/src/ATen/templates/NativeFunctions.h +++ b/aten/src/ATen/templates/NativeFunctions.h @@ -23,29 +23,6 @@ struct Type; } // namespace at namespace at { -// These functions are defined in ATen/Utils.cpp. 
-#define TENSOR(T, S) \ - CAFFE2_API Tensor tensor(ArrayRef values, const TensorOptions& options); \ - inline Tensor tensor( \ - std::initializer_list values, const TensorOptions& options) { \ - return at::tensor(ArrayRef(values), options); \ - } \ - inline Tensor tensor(T value, const TensorOptions& options) { \ - return at::tensor(ArrayRef(value), options); \ - } \ - inline Tensor tensor(ArrayRef values) { \ - return at::tensor(std::move(values), at::dtype(k##S)); \ - } \ - inline Tensor tensor(std::initializer_list values) { \ - return at::tensor(ArrayRef(values)); \ - } \ - inline Tensor tensor(T value) { \ - return at::tensor(ArrayRef(value)); \ - } -AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, TENSOR) -AT_FORALL_COMPLEX_TYPES(TENSOR) -#undef TENSOR - namespace native { ${native_function_declarations} diff --git a/aten/src/THC/THCTensorMathPairwise.cu b/aten/src/THC/THCTensorMathPairwise.cu index da57a1ad36f8..04fb34df4f70 100644 --- a/aten/src/THC/THCTensorMathPairwise.cu +++ b/aten/src/THC/THCTensorMathPairwise.cu @@ -21,36 +21,5 @@ struct TensorMulConstantOp { const T val; }; -template -struct TensorFmodOp { - TensorFmodOp(T v) : val((float)v) {} - __device__ __forceinline__ void operator()(T* out, T* in) { - *out = (T) fmodf((float) *in, val); - } - - __device__ __forceinline__ void operator()(T* v) { - *v = (T) fmodf((float) *v, val); - } - - const float val; -}; - -template <> -struct TensorFmodOp { - TensorFmodOp(double v) : val(v) {} - __device__ __forceinline__ void operator()(double* out, double* in) { - *out = fmod(*in, val); - } - - __device__ __forceinline__ void operator()(double* v) { - *v = fmod(*v, val); - } - - const double val; -}; - -#include -#include - #include #include diff --git a/aten/src/THC/THCTensorMathPointwise.cuh b/aten/src/THC/THCTensorMathPointwise.cuh index 2b511983934f..bb2e31de2669 100644 --- a/aten/src/THC/THCTensorMathPointwise.cuh +++ b/aten/src/THC/THCTensorMathPointwise.cuh @@ -36,50 +36,6 @@ struct TensorMulOp { } }; -template -struct TensorCFmodOp { - __device__ __forceinline__ void operator()(T* out, T* in) { - *out = *out % *in; - } - - __device__ __forceinline__ void operator()(T* out, T* in1, T* in2) { - *out = *in1 % *in2; - } -}; - -template <> -struct TensorCFmodOp { - __device__ __forceinline__ void operator()(float* out, float* in) { - *out = fmodf(*out, *in); - } - - __device__ __forceinline__ void operator()(float* out, float* in1, float* in2) { - *out = fmodf(*in1, *in2); - } -}; - -template <> -struct TensorCFmodOp { - __device__ __forceinline__ void operator()(double* out, double* in) { - *out = fmod(*out, *in); - } - - __device__ __forceinline__ void operator()(double* out, double* in1, double* in2) { - *out = fmod(*in1, *in2); - } -}; - -template <> -struct TensorCFmodOp { - __device__ __forceinline__ void operator()(at::Half* out, at::Half* in) { - *out = fmodf(*out, *in); - } - - __device__ __forceinline__ void operator()(at::Half* out, at::Half* in1, at::Half* in2) { - *out = fmodf(*in1, *in2); - } -}; - template struct TensorCrossOp { TensorCrossOp(int64_t sx, int64_t sy, int64_t so) : sx(sx), sy(sy), so(so) {} diff --git a/aten/src/THC/generic/THCTensorMathPairwise.h b/aten/src/THC/generic/THCTensorMathPairwise.h index 7bae0a54fa41..2e530aa4be42 100644 --- a/aten/src/THC/generic/THCTensorMathPairwise.h +++ b/aten/src/THC/generic/THCTensorMathPairwise.h @@ -7,7 +7,6 @@ THC_API int THCTensor_(equal)(THCState *state, THCTensor *self, THCTensor *src); #if !defined(THC_REAL_IS_BOOL) THC_API void THCTensor_(mul)(THCState 
*state, THCTensor *self, THCTensor *src, scalar_t value); -THC_API void THCTensor_(fmod)(THCState *state, THCTensor *self, THCTensor *src, scalar_t value); #endif diff --git a/aten/src/THC/generic/THCTensorMathPointwise.cu b/aten/src/THC/generic/THCTensorMathPointwise.cu index 54fe16bc85c0..c33716c5f565 100644 --- a/aten/src/THC/generic/THCTensorMathPointwise.cu +++ b/aten/src/THC/generic/THCTensorMathPointwise.cu @@ -83,23 +83,5 @@ void THCTensor_(cmul)(THCState *state, THCTensor *self_, THCTensor *src1, THCTen at::mul_out(out, at::Tensor(retainTensorImpl(src1)), at::Tensor(retainTensorImpl(src2))); } -void THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2) -{ - THCAssertSameGPU(THCTensor_(checkGPU)(state, 3, self, src1, src2)); - THArgCheck(THCTensor_(nElement)(state, src1) == - THCTensor_(nElement)(state, src2), 2, "sizes do not match"); - - if (self == src1) { - if (!THC_pointwiseApply2(state, self, src2, TensorCFmodOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } else { - THCTensor_(resizeAs)(state, self, src1); - if (!THC_pointwiseApply3(state, self, src1, src2, TensorCFmodOp())) { - THArgCheck(false, 2, CUTORCH_DIM_WARNING); - } - } -} - #endif #endif diff --git a/aten/src/THC/generic/THCTensorMathPointwise.h b/aten/src/THC/generic/THCTensorMathPointwise.h index 658f344f8b14..56ead75ba992 100644 --- a/aten/src/THC/generic/THCTensorMathPointwise.h +++ b/aten/src/THC/generic/THCTensorMathPointwise.h @@ -20,7 +20,6 @@ THC_API void THCTensor_(cmul)(THCState *state, THCTensor *self, THCTensor *src1, THC_API void THCTensor_(cdiv)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); THC_API void THCTensor_(clshift)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); THC_API void THCTensor_(crshift)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); -THC_API void THCTensor_(cfmod)(THCState *state, THCTensor *self, THCTensor *src1, THCTensor *src2); #endif #endif diff --git a/c10/cuda/CUDAException.h b/c10/cuda/CUDAException.h index e3b2348ef6ab..5d1a473b5597 100644 --- a/c10/cuda/CUDAException.h +++ b/c10/cuda/CUDAException.h @@ -33,4 +33,4 @@ // This should be used directly after every kernel launch to ensure // the launch happened correctly and provide an early, close-to-source // diagnostic if it didn't. 
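For context on the macro renamed just below: CUDA kernel launches fail asynchronously, so without a per-launch check an error only surfaces at a later synchronization point, far from the offending call. A Python-level illustration of that timing (requires a CUDA device; the tensors here are placeholders):

    import torch

    if torch.cuda.is_available():
        x = torch.randn(1 << 20, device="cuda")
        y = x.sin()                  # enqueues a kernel and returns immediately
        torch.cuda.synchronize()     # a bad launch would only be reported here, far from its
                                     # source, which is what the per-launch check macro avoids in C++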
-#define TORCH_CUDA_KERNEL_LAUNCH_CHECK() C10_CUDA_CHECK(cudaGetLastError()) +#define C10_CUDA_KERNEL_LAUNCH_CHECK() C10_CUDA_CHECK(cudaGetLastError()) diff --git a/caffe2/sgd/adadelta_op_gpu.cu b/caffe2/sgd/adadelta_op_gpu.cu index 8ead39705a70..92416b503bcd 100644 --- a/caffe2/sgd/adadelta_op_gpu.cu +++ b/caffe2/sgd/adadelta_op_gpu.cu @@ -47,7 +47,7 @@ void AdadeltaUpdate( CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(N, w, g, h, d, epsilon, decay, lr, nw, nh, nd); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } // namespace @@ -155,7 +155,7 @@ class CUDASparseAdadeltaOp final : public Operator { paramOut, momentOut, momentDeltaOut); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } diff --git a/caffe2/sgd/adagrad_fused_op_gpu.cu b/caffe2/sgd/adagrad_fused_op_gpu.cu index 88d51e1026a8..539b9919e8e7 100644 --- a/caffe2/sgd/adagrad_fused_op_gpu.cu +++ b/caffe2/sgd/adagrad_fused_op_gpu.cu @@ -638,7 +638,7 @@ class CUDASparseAdagradFusedWithSparseLengthsSumGradientOp final 0, context_.cuda_stream()>>>( grad, lengths, grad_buffer_data, block_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } if (block_size <= maxThreads) { @@ -661,7 +661,7 @@ class CUDASparseAdagradFusedWithSparseLengthsSumGradientOp final is_mean ? grad_buffer_data : grad, lr, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { // calling cuda kernel with ExactBlock = false sparse_adagrad_fused_length_sum_gradient_kernel< @@ -680,7 +680,7 @@ class CUDASparseAdagradFusedWithSparseLengthsSumGradientOp final is_mean ? grad_buffer_data : grad, lr, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } return true; } @@ -822,7 +822,7 @@ class CUDASparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final out_weight_grads, lr, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (block_size > 64) { sparse_adagrad_fused_length_weighted_sum_gradient_kernel< IndexType, @@ -842,7 +842,7 @@ class CUDASparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final out_weight_grads, lr, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (block_size > 32) { sparse_adagrad_fused_length_weighted_sum_gradient_kernel< IndexType, @@ -862,7 +862,7 @@ class CUDASparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final out_weight_grads, lr, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { sparse_adagrad_fused_length_weighted_sum_gradient_kernel< IndexType, @@ -882,7 +882,7 @@ class CUDASparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final out_weight_grads, lr, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } return true; } @@ -1012,7 +1012,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp final 0, context_.cuda_stream()>>>( grad, lengths, grad_buffer_data, block_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } // 0: nearest rounding @@ -1045,7 +1045,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp final lr, seed, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { rowwise_sparse_adagrad_fused_length_sum_gradient_kernel< IndexType, @@ -1065,7 +1065,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp final lr, seed, weight_decay_); - 
TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } else { if (round_option_) { @@ -1091,7 +1091,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp final lr, seed, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { rowwise_sparse_adagrad_fused_length_sum_gradient_kernel< IndexType, @@ -1115,7 +1115,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp final lr, seed, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } return true; @@ -1253,7 +1253,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientExactOp final 0, context_.cuda_stream()>>>( grad, lengths, grad_buffer_data, block_size); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } sorted_linear_ind_buffer_.ResizeLike(indicesInput); @@ -1265,7 +1265,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientExactOp final indices, prefix_sum_length_data, seg_id_buffer_.template mutable_data()); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); sort_pairs_wrapper( num_indices, @@ -1330,7 +1330,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientExactOp final lr, seed, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { rowwise_sparse_adagrad_fused_length_sum_gradient_dedup_kernel< IndexType, @@ -1357,7 +1357,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientExactOp final lr, seed, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } else { const int sm_size = block_size * sizeof(float); @@ -1392,7 +1392,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientExactOp final lr, seed, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { rowwise_sparse_adagrad_fused_length_sum_gradient_dedup_kernel< IndexType, @@ -1419,7 +1419,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsSumGradientExactOp final lr, seed, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } @@ -1569,7 +1569,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final out_weight_grads, lr, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (block_size > 64) { rowwise_sparse_adagrad_fused_length_weighted_sum_gradient_kernel< IndexType, @@ -1589,7 +1589,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final out_weight_grads, lr, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else if (block_size > 32) { rowwise_sparse_adagrad_fused_length_weighted_sum_gradient_kernel< IndexType, @@ -1609,7 +1609,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final out_weight_grads, lr, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { rowwise_sparse_adagrad_fused_length_weighted_sum_gradient_kernel< IndexType, @@ -1629,7 +1629,7 @@ class CUDARowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final out_weight_grads, lr, weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } return true; diff --git a/caffe2/sgd/adagrad_op_gpu.cu b/caffe2/sgd/adagrad_op_gpu.cu index 0937bbaef05e..8abb3376ca87 100644 --- a/caffe2/sgd/adagrad_op_gpu.cu +++ b/caffe2/sgd/adagrad_op_gpu.cu @@ -44,7 +44,7 @@ void adagrad_update( 0, 
context->cuda_stream()>>>( N, w, g, h, nw, nh, epsilon, decay, lr, weight_decay); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template @@ -189,7 +189,7 @@ class CUDASparseAdagradOp final : public Operator { Input(GRAD).template data(), Input(LR).template data(), weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } @@ -233,7 +233,7 @@ bool RowWiseSparseAdagradOp::DoRunWithType() { Input(GRAD).template data(), Input(LR).template data(), weight_decay_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } diff --git a/caffe2/sgd/adam_op_gpu.cu b/caffe2/sgd/adam_op_gpu.cu index b5edb3fd3ab2..42ab975faacb 100644 --- a/caffe2/sgd/adam_op_gpu.cu +++ b/caffe2/sgd/adam_op_gpu.cu @@ -47,7 +47,7 @@ void adam_update( 0, context->cuda_stream()>>>( N, g, m, v, ng, nm, nv, beta1, beta2, eps_hat, correction, lr); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } __global__ void AdamCompute( @@ -95,7 +95,7 @@ void adam_compute( 0, context->cuda_stream()>>>( N, w, g, m, v, nw, nm, nv, beta1, beta2, eps_hat, correction, lr); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } __global__ void AdamComputeOutputGrad( @@ -145,7 +145,7 @@ void adam_compute_output_grad( 0, context->cuda_stream()>>>( N, w, g, m, v, nw, nm, nv, ng, beta1, beta2, eps_hat, correction, lr); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } template @@ -336,7 +336,7 @@ bool SparseAdamOp::DoRunWithType() { correction, Input(LR).template data(), iter); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); SparseAdamOutputGradKernel @@ -358,7 +358,7 @@ bool SparseAdamOp::DoRunWithType() { correction, Input(LR).template data(), iter); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } return true; @@ -403,7 +403,7 @@ bool RowWiseSparseAdamOp::DoRunWithType() { Input(GRAD).template data(), correction, Input(LR).template data()); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); RowWiseSparseAdamOutputGradKernel @@ -424,7 +424,7 @@ bool RowWiseSparseAdamOp::DoRunWithType() { Input(GRAD).template data(), correction, Input(LR).template data()); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } return true; diff --git a/caffe2/sgd/fp16_momentum_sgd_op.cu b/caffe2/sgd/fp16_momentum_sgd_op.cu index be0be6d78643..985f4f2864d1 100644 --- a/caffe2/sgd/fp16_momentum_sgd_op.cu +++ b/caffe2/sgd/fp16_momentum_sgd_op.cu @@ -215,7 +215,7 @@ void fp16_momentum_sgd_update( nesterov, weight_decay, reinterpret_cast(param)); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); // not setting N to N/2 } else { FP16MomentumSGDFP32Kernel<<< @@ -233,7 +233,7 @@ void fp16_momentum_sgd_update( nesterov, weight_decay, reinterpret_cast(param)); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); // not setting N to N/2 } diff --git a/caffe2/sgd/fp32_momentum_sgd_op.cu b/caffe2/sgd/fp32_momentum_sgd_op.cu index f31b24ba7541..1ae9015bc6d9 100644 --- a/caffe2/sgd/fp32_momentum_sgd_op.cu +++ b/caffe2/sgd/fp32_momentum_sgd_op.cu @@ -108,7 +108,7 @@ void fp32_momentum_sgd_update( nesterov, weight_decay, reinterpret_cast(param)); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); // not setting N to N/2 // TODO_ check float performance vs float2 } diff --git 
a/caffe2/sgd/lars_op_gpu.cu b/caffe2/sgd/lars_op_gpu.cu index e3129235255f..2a1d6c79e833 100644 --- a/caffe2/sgd/lars_op_gpu.cu +++ b/caffe2/sgd/lars_op_gpu.cu @@ -31,7 +31,7 @@ void LarsOp::ComputeLearningRate( float* lr_rescaled) { ComputeLearningRateKernel<<<1, 1, 0, context_.cuda_stream()>>>( wd, trust, lr_max, offset, lr_min, X_norm, dX_norm, lr_rescaled); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } REGISTER_CUDA_OPERATOR(Lars, LarsOp); diff --git a/caffe2/sgd/momentum_sgd_op_gpu.cu b/caffe2/sgd/momentum_sgd_op_gpu.cu index e2439a75d711..e8eb00654e65 100644 --- a/caffe2/sgd/momentum_sgd_op_gpu.cu +++ b/caffe2/sgd/momentum_sgd_op_gpu.cu @@ -82,14 +82,14 @@ void momentum_sgd_update( CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>(N, g, m, ng, nm, lr, momentum, param); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { MomentumSGDKernel <<cuda_stream()>>>(N, g, m, ng, nm, lr, momentum, param); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } @@ -152,7 +152,7 @@ bool SparseMomentumSGDUpdateOp::DoRunWithType() { Input(GRAD).template data(), Output(OUTPUT_GRAD)->template mutable_data(), Input(LR).template data()); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } diff --git a/caffe2/sgd/rmsprop_op_gpu.cu b/caffe2/sgd/rmsprop_op_gpu.cu index f2b5b4f19463..d6f236739084 100644 --- a/caffe2/sgd/rmsprop_op_gpu.cu +++ b/caffe2/sgd/rmsprop_op_gpu.cu @@ -43,7 +43,7 @@ void rmsprop_update( CUDAContext* context) { RmsPropUpdate<<cuda_stream()>>>( N, g, ms, mom, ng, nms, nmom, decay, momentum, epsilon, lr); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/caffe2/sgd/yellowfin_op_gpu.cu b/caffe2/sgd/yellowfin_op_gpu.cu index 7644821cf3f1..cb62ae433557 100644 --- a/caffe2/sgd/yellowfin_op_gpu.cu +++ b/caffe2/sgd/yellowfin_op_gpu.cu @@ -32,7 +32,7 @@ void YellowFinOp::GetLrMu() { // Finding root of cubic formula for YF's Single Step GetLrMuKernel<<<1, 1, 0, context_.cuda_stream()>>>( g_norm2_max_deb_, g_norm2_min_deb_, distance_deb_, variance_, mu_, lr_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); MovingAverage(1, mu_, mu_avg_, mu_avg_out_, mu_deb_); MovingAverage(1, lr_, lr_avg_, lr_avg_out_, lr_deb_); } @@ -79,7 +79,7 @@ void YellowFinOp::MomentumSgdUpdate() { param_out_, moment_out_, nesterov_); - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); } REGISTER_CUDA_OPERATOR(YellowFin, YellowFinOp); diff --git a/caffe2/utils/eigen_utils.h b/caffe2/utils/eigen_utils.h index 83e7cb2317bb..d5dbe121f6f8 100644 --- a/caffe2/utils/eigen_utils.h +++ b/caffe2/utils/eigen_utils.h @@ -73,17 +73,28 @@ using EArrXf = Eigen::ArrayXf; using EArrXd = Eigen::ArrayXd; using EArrXi = Eigen::ArrayXi; using EArrXb = EArrXt; +using EArrXI32 = EArrXt; +using EArrXU16 = EArrXt; +using EArrXU8 = EArrXt; // 2-d array, column major template using EArrXXt = Eigen::Array; using EArrXXf = Eigen::ArrayXXf; +using EArrXXI32 = EArrXXt; +using EArrXXU16 = EArrXXt; +using EArrXXU8 = EArrXXt; +using EArrXXi = EArrXXt; // 2-d array, row major template using ERArrXXt = Eigen::Array; using ERArrXXf = ERArrXXt; +using ERArrXXI32t = ERArrXXt; +using ERArrXXU16t = ERArrXXt; +using ERArrXXU8t = ERArrXXt; +using ERArrXXi = ERArrXXt; // 1-d vector template diff --git a/docs/source/distributions.rst b/docs/source/distributions.rst index aebc39038368..fe09626e60d8 100644 --- a/docs/source/distributions.rst +++ b/docs/source/distributions.rst @@ -167,6 
+167,15 @@ Probability distributions - torch.distributions :undoc-members: :show-inheritance: +:hidden:`Kumaraswamy` +~~~~~~~~~~~~~~~~~~~~~ + +.. currentmodule:: torch.distributions.kumaraswamy +.. autoclass:: Kumaraswamy + :members: + :undoc-members: + :show-inheritance: + :hidden:`Laplace` ~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/linalg.rst b/docs/source/linalg.rst index eb7b3c120c61..b5d78572c06b 100644 --- a/docs/source/linalg.rst +++ b/docs/source/linalg.rst @@ -16,6 +16,7 @@ Functions .. autofunction:: det .. autofunction:: eigh .. autofunction:: eigvalsh +.. autofunction:: matrix_rank .. autofunction:: norm .. autofunction:: tensorinv .. autofunction:: tensorsolve diff --git a/docs/source/notes/windows.rst b/docs/source/notes/windows.rst index 443d3849582f..cc195e7a93a9 100644 --- a/docs/source/notes/windows.rst +++ b/docs/source/notes/windows.rst @@ -20,14 +20,15 @@ MKL and MAGMA. Here are the steps to build with them. REM Download MAGMA files REM version available: + REM 2.5.4 (CUDA 10.1 10.2 11.0 11.1) x (Debug Release) REM 2.5.3 (CUDA 10.1 10.2 11.0) x (Debug Release) REM 2.5.2 (CUDA 9.2 10.0 10.1 10.2) x (Debug Release) REM 2.5.1 (CUDA 9.2 10.0 10.1 10.2) x (Debug Release) REM 2.5.0 (CUDA 9.0 9.2 10.0 10.1) x (Debug Release) REM 2.4.0 (CUDA 8.0 9.2) x (Release) - set CUDA_PREFIX=cuda92 + set CUDA_PREFIX=cuda101 set CONFIG=release - curl -k https://s3.amazonaws.com/ossci-windows/magma_2.5.1_%CUDA_PREFIX%_%CONFIG%.7z -o magma.7z + curl -k https://s3.amazonaws.com/ossci-windows/magma_2.5.4_%CUDA_PREFIX%_%CONFIG%.7z -o magma.7z 7z x -aoa magma.7z -omagma REM Setting essential environment variables diff --git a/setup.py b/setup.py index fd777b141688..01f173d6825b 100644 --- a/setup.py +++ b/setup.py @@ -327,8 +327,16 @@ def check_file(f): # Use copies instead of symbolic files. # Windows has very poor support for them. 
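This patch also adds a Kumaraswamy distribution: the docs entry appears above, and the shape and moment tests appear further down in test_distributions.py. A short sketch of the API those tests exercise, with arbitrary concentration values:

    import torch
    from torch.distributions import Kumaraswamy

    c1 = torch.tensor([1.5, 2.0])             # concentration1
    c0 = torch.tensor([1.0, 3.0])             # concentration0
    d = Kumaraswamy(c1, c0)

    samples = d.sample((1000,))               # support is the interval (0, 1)
    print(samples.shape)                      # torch.Size([1000, 2])
    print(d.mean, d.variance)                 # closed-form moments checked by the new tests
    print(d.log_prob(torch.tensor([0.3, 0.7])))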
- sym_files = ['tools/shared/_utils_internal.py'] - orig_files = ['torch/_utils_internal.py'] + sym_files = [ + 'tools/shared/_utils_internal.py', + 'torch/utils/benchmark/utils/valgrind_wrapper/callgrind.h', + 'torch/utils/benchmark/utils/valgrind_wrapper/valgrind.h', + ] + orig_files = [ + 'torch/_utils_internal.py', + 'third_party/valgrind-headers/callgrind.h', + 'third_party/valgrind-headers/valgrind.h', + ] for sym_file, orig_file in zip(sym_files, orig_files): same = False if os.path.exists(sym_file): @@ -907,6 +915,9 @@ def print_box(msg): 'share/cmake/Gloo/*.cmake', 'share/cmake/Tensorpipe/*.cmake', 'share/cmake/Torch/*.cmake', + 'utils/benchmark/utils/*.cpp', + 'utils/benchmark/utils/valgrind_wrapper/*.cpp', + 'utils/benchmark/utils/valgrind_wrapper/*.h', ], 'caffe2': [ 'python/serialized_test/data/operator_test/*.zip', diff --git a/test/benchmark_utils/test_benchmark_utils.py b/test/benchmark_utils/test_benchmark_utils.py index 8b1bdf1419ee..779a704bfd92 100644 --- a/test/benchmark_utils/test_benchmark_utils.py +++ b/test/benchmark_utils/test_benchmark_utils.py @@ -7,7 +7,7 @@ import torch import torch.utils.benchmark as benchmark_utils -from torch.testing._internal.common_utils import TestCase, run_tests, IS_WINDOWS, slowTest +from torch.testing._internal.common_utils import TestCase, run_tests, IS_SANDCASTLE, IS_WINDOWS, slowTest from torch.testing._internal import expecttest import numpy as np @@ -162,6 +162,17 @@ def test_timer(self): ).timeit(5).median self.assertIsInstance(sample, float) + @slowTest + @unittest.skipIf(IS_SANDCASTLE, "C++ timing is OSS only.") + def test_cpp_timer(self): + timer = benchmark_utils.Timer( + "torch::Tensor y = x + 1;", + setup="torch::Tensor x = torch::empty({1});", + language=benchmark_utils.Language.CPP, + ) + t = timer.timeit(10) + self.assertIsInstance(t.median, float) + class _MockTimer: _seed = 0 diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py index 68f5922753a3..d75f21740435 100644 --- a/test/distributions/test_distributions.py +++ b/test/distributions/test_distributions.py @@ -44,7 +44,7 @@ Distribution, Exponential, ExponentialFamily, FisherSnedecor, Gamma, Geometric, Gumbel, HalfCauchy, HalfNormal, - Independent, Laplace, LogisticNormal, + Independent, Kumaraswamy, Laplace, LogisticNormal, LogNormal, LowRankMultivariateNormal, MixtureSameFamily, Multinomial, MultivariateNormal, NegativeBinomial, Normal, OneHotCategorical, Pareto, @@ -240,6 +240,16 @@ def is_all_nan(tensor): 'reinterpreted_batch_ndims': 3, }, ]), + Example(Kumaraswamy, [ + { + 'concentration1': torch.empty(2, 3).uniform_(1, 2).requires_grad_(), + 'concentration0': torch.empty(2, 3).uniform_(1, 2).requires_grad_(), + }, + { + 'concentration1': torch.rand(4).uniform_(1, 2).requires_grad_(), + 'concentration0': torch.rand(4).uniform_(1, 2).requires_grad_(), + }, + ]), Example(Laplace, [ { 'loc': torch.randn(5, 5, requires_grad=True), @@ -2249,6 +2259,42 @@ def test_gumbel_sample(self): scipy.stats.gumbel_r(loc=loc, scale=scale), 'Gumbel(loc={}, scale={})'.format(loc, scale)) + def test_kumaraswamy_shape(self): + concentration1 = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) + concentration0 = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) + concentration1_1d = torch.tensor(torch.randn(1).abs(), requires_grad=True) + concentration0_1d = torch.tensor(torch.randn(1).abs(), requires_grad=True) + self.assertEqual(Kumaraswamy(concentration1, concentration0).sample().size(), (2, 3)) + 
self.assertEqual(Kumaraswamy(concentration1, concentration0).sample((5,)).size(), (5, 2, 3)) + self.assertEqual(Kumaraswamy(concentration1_1d, concentration0_1d).sample().size(), (1,)) + self.assertEqual(Kumaraswamy(concentration1_1d, concentration0_1d).sample((1,)).size(), (1, 1)) + self.assertEqual(Kumaraswamy(1.0, 1.0).sample().size(), ()) + self.assertEqual(Kumaraswamy(1.0, 1.0).sample((1,)).size(), (1,)) + + # Kumaraswamy distribution is not implemented in SciPy + # Hence these tests are explicit + def test_kumaraswamy_mean_variance(self): + c1_1 = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) + c0_1 = torch.tensor(torch.randn(2, 3).abs(), requires_grad=True) + c1_2 = torch.tensor(torch.randn(4).abs(), requires_grad=True) + c0_2 = torch.tensor(torch.randn(4).abs(), requires_grad=True) + cases = [(c1_1, c0_1), (c1_2, c0_2)] + for i, (a, b) in enumerate(cases): + m = Kumaraswamy(a, b) + samples = m.sample((60000, )) + expected = samples.mean(0) + actual = m.mean + error = (expected - actual).abs() + max_error = max(error[error == error]) + self.assertLess(max_error, 0.01, + "Kumaraswamy example {}/{}, incorrect .mean".format(i + 1, len(cases))) + expected = samples.var(0) + actual = m.variance + error = (expected - actual).abs() + max_error = max(error[error == error]) + self.assertLess(max_error, 0.01, + "Kumaraswamy example {}/{}, incorrect .variance".format(i + 1, len(cases))) + @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_fishersnedecor(self): df1 = torch.randn(2, 3).abs().requires_grad_() @@ -2622,6 +2668,18 @@ def test_valid_parameter_broadcasting(self): (1, 2)), (Gumbel(loc=torch.tensor([0.]), scale=torch.tensor([[1.]])), (1, 1)), + (Kumaraswamy(concentration1=torch.tensor([1., 1.]), concentration0=1.), + (2,)), + (Kumaraswamy(concentration1=1, concentration0=torch.tensor([1., 1.])), + (2, )), + (Kumaraswamy(concentration1=torch.tensor([1., 1.]), concentration0=torch.tensor([1.])), + (2,)), + (Kumaraswamy(concentration1=torch.tensor([1., 1.]), concentration0=torch.tensor([[1.], [1.]])), + (2, 2)), + (Kumaraswamy(concentration1=torch.tensor([1., 1.]), concentration0=torch.tensor([[1.]])), + (1, 2)), + (Kumaraswamy(concentration1=torch.tensor([1.]), concentration0=torch.tensor([[1.]])), + (1, 1)), (Laplace(loc=torch.tensor([0., 0.]), scale=1), (2,)), (Laplace(loc=0, scale=torch.tensor([1., 1.])), @@ -2701,6 +2759,14 @@ def test_invalid_parameter_broadcasting(self): 'concentration': torch.tensor([0, 0]), 'rate': torch.tensor([1, 1, 1]) }), + (Kumaraswamy, { + 'concentration1': torch.tensor([[1, 1]]), + 'concentration0': torch.tensor([1, 1, 1, 1]) + }), + (Kumaraswamy, { + 'concentration1': torch.tensor([[[1, 1, 1], [1, 1, 1]]]), + 'concentration0': torch.tensor([1, 1]) + }), (Laplace, { 'loc': torch.tensor([0, 0]), 'scale': torch.tensor([1, 1, 1]) @@ -3242,6 +3308,15 @@ def test_gumbel_shape_scalar_params(self): self.assertEqual(gumbel.log_prob(self.tensor_sample_1).size(), torch.Size((3, 2))) self.assertEqual(gumbel.log_prob(self.tensor_sample_2).size(), torch.Size((3, 2, 3))) + def test_kumaraswamy_shape_scalar_params(self): + kumaraswamy = Kumaraswamy(1, 1) + self.assertEqual(kumaraswamy._batch_shape, torch.Size()) + self.assertEqual(kumaraswamy._event_shape, torch.Size()) + self.assertEqual(kumaraswamy.sample().size(), torch.Size()) + self.assertEqual(kumaraswamy.sample((3, 2)).size(), torch.Size((3, 2))) + self.assertEqual(kumaraswamy.log_prob(self.tensor_sample_1).size(), torch.Size((3, 2))) + 
self.assertEqual(kumaraswamy.log_prob(self.tensor_sample_2).size(), torch.Size((3, 2, 3))) + def test_vonmises_shape_tensor_params(self): von_mises = VonMises(torch.tensor([0., 0.]), torch.tensor([1., 1.])) self.assertEqual(von_mises._batch_shape, torch.Size((2,))) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 0f481689a10a..abaa078bb353 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -2349,22 +2349,6 @@ def forward(self, input, indices): indices = torch.tensor([[1, 0], [0, 1], [0, 1]], dtype=torch.int64) self.run_test(GatherModel(), input=(input, indices)) - @skipIfUnsupportedMinOpsetVersion(9) - def test_gather_float_index(self): - class GatherFloatIndexModel(torch.jit.ScriptModule): - @torch.jit.script_method - def forward(self, input, mask): - seq_length, batch_size = mask.shape - result = input[0][0][0] - for i in torch.arange(2, seq_length): - result = input[0][i][0] - return result - - model = GatherFloatIndexModel() - x = torch.randint(0, 5, (8, 8, 17), dtype=torch.long) - y = torch.ones(8, 1, dtype=torch.uint8) - self.run_test(model, (x, y)) - @skipIfUnsupportedMinOpsetVersion(9) def test_expand(self): class ExpandModel(torch.nn.Module): diff --git a/test/quantization/test_numeric_suite.py b/test/quantization/test_numeric_suite.py index 44963735d6c2..74ecc4a90469 100644 --- a/test/quantization/test_numeric_suite.py +++ b/test/quantization/test_numeric_suite.py @@ -104,7 +104,7 @@ def compare_and_validate_results(float_model, q_model): model.eval() if hasattr(model, "fuse_model"): model.fuse_model() - q_model = quantize(model, test_only_eval_fn, self.img_data_2d) + q_model = quantize(model, test_only_eval_fn, [self.img_data_2d]) compare_and_validate_results(model, q_model) @override_qengines @@ -126,7 +126,7 @@ def compare_and_validate_results(float_model, q_model): model.eval() if hasattr(model, "fuse_model"): model.fuse_model() - q_model = quantize(model, test_only_eval_fn, self.calib_data) + q_model = quantize(model, test_only_eval_fn, [self.calib_data]) compare_and_validate_results(model, q_model) @override_qengines @@ -197,7 +197,7 @@ def compare_and_validate_results(float_model, q_model, module_swap_list, data): model.eval() if hasattr(model, "fuse_model"): model.fuse_model() - q_model = quantize(model, test_only_eval_fn, self.img_data_2d) + q_model = quantize(model, test_only_eval_fn, [self.img_data_2d]) compare_and_validate_results( model, q_model, module_swap_list, self.img_data_2d[0][0] ) @@ -223,7 +223,7 @@ def compare_and_validate_results(float_model, q_model, module_swap_list, data): model.eval() if hasattr(model, "fuse_model"): model.fuse_model() - q_model = quantize(model, test_only_eval_fn, self.calib_data) + q_model = quantize(model, test_only_eval_fn, [self.calib_data]) compare_and_validate_results(model, q_model, module_swap_list, linear_data) @override_qengines @@ -233,7 +233,7 @@ def test_compare_model_stub_submodule_static(self): qengine = torch.backends.quantized.engine model = ModelWithSubModules().eval() - q_model = quantize(model, test_only_eval_fn, self.img_data_2d) + q_model = quantize(model, test_only_eval_fn, [self.img_data_2d]) module_swap_list = [SubModule] ob_dict = compare_model_stub( model, q_model, module_swap_list, self.img_data_2d[0][0] @@ -350,7 +350,7 @@ def compare_and_validate_results(float_model, q_model, data): model.eval() if hasattr(model, "fuse_model"): model.fuse_model() - q_model = quantize(model, test_only_eval_fn, 
self.img_data_2d) + q_model = quantize(model, test_only_eval_fn, [self.img_data_2d]) compare_and_validate_results(model, q_model, self.img_data_2d[0][0]) @override_qengines @@ -376,7 +376,7 @@ def compare_and_validate_results(float_model, q_model, data): model.eval() if hasattr(model, "fuse_model"): model.fuse_model() - q_model = quantize(model, test_only_eval_fn, self.calib_data) + q_model = quantize(model, test_only_eval_fn, [self.calib_data]) compare_and_validate_results(model, q_model, linear_data) @override_qengines diff --git a/test/quantization/test_qat_module.py b/test/quantization/test_qat_module.py index 4144c0744104..32de0ff50f0e 100644 --- a/test/quantization/test_qat_module.py +++ b/test/quantization/test_qat_module.py @@ -110,7 +110,11 @@ def _forward(self, input): running_std = torch.sqrt(self.running_var + self.eps) scale_factor = self.gamma / running_std scaled_weight = self.weight * scale_factor.reshape([-1, 1, 1, 1]) - conv = self._conv_forward(input, self.weight_fake_quant(scaled_weight)) + if self.bias is not None: + zero_bias = torch.zeros_like(self.bias) + else: + zero_bias = torch.zeros(self.out_channels, device=scaled_weight.device) + conv = self._conv_forward(input, self.weight_fake_quant(scaled_weight), zero_bias) if self.training and not self.freeze_bn: # recovering original conv to get original batch_mean and batch_var diff --git a/test/quantization/test_quantize.py b/test/quantization/test_quantize.py index ee4a114dcee0..745437a86ca3 100644 --- a/test/quantization/test_quantize.py +++ b/test/quantization/test_quantize.py @@ -120,7 +120,7 @@ def checkQuantized(model): base = AnnotatedSingleLayerLinearModel(qengine) base.qconfig = qconfig keys_before = set(list(base.state_dict().keys())) - model = quantize(base, test_only_eval_fn, self.calib_data) + model = quantize(base, test_only_eval_fn, [self.calib_data]) checkQuantized(model) keys_after = set(list(base.state_dict().keys())) self.assertEqual(keys_before, keys_after) # simple check that nothing changed @@ -128,7 +128,7 @@ def checkQuantized(model): # in-place version model = AnnotatedSingleLayerLinearModel(qengine) model.qconfig = qconfig - quantize(model, test_only_eval_fn, self.calib_data, inplace=True) + quantize(model, test_only_eval_fn, [self.calib_data], inplace=True) checkQuantized(model) @skipIfNoFBGEMM @@ -162,7 +162,7 @@ def checkQuantized(model): # test one line API model = quantize(AnnotatedTwoLayerLinearModel(), test_only_eval_fn, - self.calib_data) + [self.calib_data]) checkQuantized(model) def test_nested1(self): @@ -204,7 +204,7 @@ def checkQuantized(model): # test one line API model = quantize(AnnotatedNestedModel(qengine), test_only_eval_fn, - self.calib_data) + [self.calib_data]) checkQuantized(model) @@ -245,7 +245,7 @@ def checkQuantized(model): # test one line API model = quantize(AnnotatedSubNestedModel(), test_only_eval_fn, - self.calib_data) + [self.calib_data]) checkQuantized(model) def test_nested3(self): @@ -287,7 +287,7 @@ def checkQuantized(model): # test one line API model = quantize(AnnotatedCustomConfigNestedModel(), test_only_eval_fn, - self.calib_data) + [self.calib_data]) checkQuantized(model) def test_skip_quant(self): @@ -315,7 +315,7 @@ def checkQuantized(model): checkQuantized(model) # test one line API - model = quantize(AnnotatedSkipQuantModel(qengine), test_only_eval_fn, self.calib_data) + model = quantize(AnnotatedSkipQuantModel(qengine), test_only_eval_fn, [self.calib_data]) checkQuantized(model) @skipIfNoFBGEMM @@ -341,7 +341,7 @@ def checkQuantized(model): 
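About the _conv_forward change above in the QAT Conv+BN module: the convolution now runs with the batch-norm-scaled weight and an explicit zero bias (the real bias is applied after the batch-norm statistics), matching a _conv_forward signature that takes the bias as a separate argument. The folding itself is plain tensor math; a sketch with made-up shapes:

    import torch

    out_channels, eps = 8, 1e-5
    weight = torch.randn(out_channels, 3, 3, 3)        # conv weight
    gamma = torch.rand(out_channels) + 0.5             # batch-norm weight
    running_var = torch.rand(out_channels) + 0.1

    scale_factor = gamma / torch.sqrt(running_var + eps)
    scaled_weight = weight * scale_factor.reshape([-1, 1, 1, 1])
    zero_bias = torch.zeros(out_channels)              # bias contribution is deferred until after BN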
checkQuantized(model) # test one line API - model = quantize(QuantStubModel(), test_only_eval_fn, self.calib_data) + model = quantize(QuantStubModel(), test_only_eval_fn, [self.calib_data]) checkQuantized(model) def test_resnet_base(self): @@ -400,7 +400,7 @@ def checkQuantized(model): checkQuantized(model) model_oneline = quantize( - NormalizationTestModel(), test_only_eval_fn, self.calib_data) + NormalizationTestModel(), test_only_eval_fn, [self.calib_data]) checkQuantized(model) def test_save_load_state_dict(self): @@ -463,7 +463,7 @@ def checkQuantized(model): # test one line API model_oneline = quantize(ActivationsTestModel(), test_only_eval_fn, - self.calib_data) + [self.calib_data]) checkQuantized(model_oneline) @override_qengines @@ -1083,7 +1083,7 @@ def checkQuantized(model): checkQuantized(model) model = quantize_qat(ManualLinearQATModel(qengine), test_only_train_fn, - self.train_data) + [self.train_data]) checkQuantized(model) def test_eval_only_fake_quant(self): @@ -1123,7 +1123,7 @@ def checkQuantized(model): checkQuantized(model) model = ManualConvLinearQATModel() - model = quantize_qat(model, test_only_train_fn, self.img_data_2d_train) + model = quantize_qat(model, test_only_train_fn, [self.img_data_2d_train]) checkQuantized(model) def test_train_save_load_eval(self): @@ -1434,7 +1434,7 @@ def checkQuantized(model): model = ModelForFusion(default_qat_qconfig).train() model = fuse_modules(model, [['conv1', 'bn1', 'relu1'], ['sub1.conv', 'sub1.bn']]) - model = quantize_qat(model, test_only_train_fn, self.img_data_1d_train) + model = quantize_qat(model, test_only_train_fn, [self.img_data_1d_train]) with self.assertRaisesRegex(RuntimeError, "Could not run 'aten::native_batch_norm' with arguments from the 'QuantizedCPU'"): checkQuantized(model) @@ -1514,7 +1514,7 @@ def checkQuantized(model): ['bn2', 'relu3'], ['sub1.conv', 'sub1.bn'], ['conv3', 'bn3', 'relu4']]) - model = quantize(model, test_only_eval_fn, self.img_data_1d) + model = quantize(model, test_only_eval_fn, [self.img_data_1d]) checkQuantized(model) def test_fusion_sequential_model_train(self): diff --git a/test/quantization/test_quantize_jit.py b/test/quantization/test_quantize_jit.py index 0bf8f4f65fe1..f67a585d99b6 100644 --- a/test/quantization/test_quantize_jit.py +++ b/test/quantization/test_quantize_jit.py @@ -3095,7 +3095,7 @@ def test_single_linear(self): # compare the result of the two quantized models later linear_model.fc1.weight = torch.nn.Parameter(annotated_linear_model.fc1.module.weight.detach()) linear_model.fc1.bias = torch.nn.Parameter(annotated_linear_model.fc1.module.bias.detach()) - model_eager = quantize(annotated_linear_model, test_only_eval_fn, self.calib_data) + model_eager = quantize(annotated_linear_model, test_only_eval_fn, [self.calib_data]) qconfig_dict = {'': get_default_qconfig(torch.backends.quantized.engine)} model_traced = torch.jit.trace(linear_model, self.calib_data[0][0]) @@ -3135,7 +3135,7 @@ def test_observer_with_ignored_function(self): linear_model.fc1.weight = torch.nn.Parameter(annotated_linear_model.fc1.module.weight.detach()) linear_model.fc1.bias = torch.nn.Parameter(annotated_linear_model.fc1.module.bias.detach()) model_eager = quantize(annotated_linear_model, test_only_eval_fn, - self.calib_data) + [self.calib_data]) qconfig_dict = {'': qconfig} model_traced = torch.jit.trace(linear_model, self.calib_data[0][0]) @@ -3161,7 +3161,7 @@ def test_conv(self): # copy the weight from eager mode so that we can # compare the result of the two quantized models later 
conv_model.conv.weight = torch.nn.Parameter(annotated_conv_model.conv.weight.detach()) - model_eager = quantize(annotated_conv_model, test_only_eval_fn, self.img_data_2d) + model_eager = quantize(annotated_conv_model, test_only_eval_fn, [self.img_data_2d]) qconfig_dict = {'': get_default_qconfig(torch.backends.quantized.engine)} model_traced = torch.jit.trace(conv_model, self.img_data_2d[0][0]) model_script = torch.jit.script(conv_model) @@ -3189,7 +3189,7 @@ def test_conv_transpose(self): # copy the weight from eager mode so that we can # compare the result of the two quantized models later conv_model.conv.weight = torch.nn.Parameter(annotated_conv_model.conv.weight.detach()) - model_eager = quantize(annotated_conv_model, test_only_eval_fn, self.img_data_2d) + model_eager = quantize(annotated_conv_model, test_only_eval_fn, [self.img_data_2d]) qconfig_dict = {'': get_default_qconfig(torch.backends.quantized.engine)} model_traced = torch.jit.trace(conv_model, self.img_data_2d[0][0]) model_script = torch.jit.script(conv_model) @@ -3217,7 +3217,7 @@ def test_conv_bn(self): conv_model_to_script.conv.weight = torch.nn.Parameter(conv_model.conv.weight.detach()) fuse_modules(conv_model, ['conv', 'bn'], inplace=True) model_eager = quantize(conv_model, test_only_eval_fn, - self.img_data_2d) + [self.img_data_2d]) qconfig_dict = { '': default_qconfig } @@ -3248,7 +3248,7 @@ def test_nested(self): script_model.fc3.weight = torch.nn.Parameter(eager_model.fc3.module.weight.detach()) script_model.fc3.bias = torch.nn.Parameter(eager_model.fc3.module.bias.detach()) - model_eager = quantize(eager_model, test_only_eval_fn, self.calib_data) + model_eager = quantize(eager_model, test_only_eval_fn, [self.calib_data]) qconfig_dict = { 'sub2.fc1': default_per_channel_qconfig if qengine_is_fbgemm() else default_qconfig, 'fc3': default_qconfig @@ -3284,7 +3284,7 @@ def test_skip_quant(self): eager_model.fuse_modules() - model_eager = quantize(eager_model, test_only_eval_fn, self.calib_data) + model_eager = quantize(eager_model, test_only_eval_fn, [self.calib_data]) qconfig_dict = { '': get_default_qconfig(torch.backends.quantized.engine), 'fc': None diff --git a/test/quantization/test_quantized_op.py b/test/quantization/test_quantized_op.py index 7bd16e5a452a..1c66c8fb986f 100644 --- a/test/quantization/test_quantized_op.py +++ b/test/quantization/test_quantized_op.py @@ -2970,7 +2970,7 @@ def embedding_bag_rowwise_offsets_run( embedding_dim, num_offsets, use_32bit_indices, use_32bit_offsets, enable_per_sample_weights, - include_last_offset, sparsity, atol, rtol): + include_last_offset, fallback_to_no_sparse, sparsity, atol, rtol): pt_op = torch.ops.quantized.embedding_bag_byte_rowwise_offsets pt_prepack_op = torch.ops.quantized.embedding_bag_byte_prepack if bit_rate == 4: @@ -3029,20 +3029,25 @@ def get_reference_result( pruned_weights = weights prune_weights = sparsity > 0 if prune_weights: - # Prune and generate mapping table - num_compressed_rows = 0 - unpruned_ids = [] - for i in range(num_embeddings): - if np.random.uniform() < sparsity: - mapping_table[i] = -1 - q_weights[i, :] = 0 - weights[i, :] = 0 - else: - mapping_table[i] = num_compressed_rows - num_compressed_rows += 1 - unpruned_ids.append(i) - q_weights = q_weights[unpruned_ids] - pruned_weights = weights[unpruned_ids] + if fallback_to_no_sparse: + # Testing that prune_weight with mapping_table {0} will + # fallback to non sparse embedding look up kernel. 
+ mapping_table = np.zeros(1, dtype=np.int32) + else: + # Prune and generate mapping table + num_compressed_rows = 0 + unpruned_ids = [] + for i in range(num_embeddings): + if np.random.uniform() < sparsity: + mapping_table[i] = -1 + q_weights[i, :] = 0 + weights[i, :] = 0 + else: + mapping_table[i] = num_compressed_rows + num_compressed_rows += 1 + unpruned_ids.append(i) + q_weights = q_weights[unpruned_ids] + pruned_weights = weights[unpruned_ids] result = pt_op(q_weights, indices.int() if use_32bit_indices else indices, @@ -3094,6 +3099,7 @@ def get_reference_result( use_32bit_offsets=st.booleans(), enable_per_sample_weights=st.booleans(), include_last_offset=st.booleans(), + fallback_to_no_sparse=st.booleans(), sparsity=st.sampled_from([0.0, 0.5, 0.7])) def test_embedding_bag_byte(self, num_embeddings, embedding_dim, num_offsets, @@ -3101,11 +3107,13 @@ def test_embedding_bag_byte(self, num_embeddings, use_32bit_offsets, enable_per_sample_weights, include_last_offset, + fallback_to_no_sparse, sparsity): self.embedding_bag_rowwise_offsets_run( 8, num_embeddings, embedding_dim, num_offsets, use_32bit_indices, use_32bit_offsets, enable_per_sample_weights, include_last_offset, + fallback_to_no_sparse, sparsity=sparsity, atol=0.005, rtol=1e-3) """ Tests the correctness of the embedding_bag_4bit quantized operator """ @@ -3116,18 +3124,23 @@ def test_embedding_bag_byte(self, num_embeddings, use_32bit_offsets=st.booleans(), enable_per_sample_weights=st.booleans(), include_last_offset=st.booleans(), + fallback_to_no_sparse=st.booleans(), sparsity=st.sampled_from([0.0, 0.5, 0.7])) def test_embedding_bag_4bit(self, num_embeddings, embedding_dim, num_offsets, use_32bit_indices, use_32bit_offsets, enable_per_sample_weights, - include_last_offset, sparsity): + include_last_offset, + fallback_to_no_sparse, + sparsity): self.embedding_bag_rowwise_offsets_run(4, num_embeddings, embedding_dim, num_offsets, use_32bit_indices, use_32bit_offsets, enable_per_sample_weights, - include_last_offset, sparsity=sparsity, + include_last_offset, + fallback_to_no_sparse, + sparsity=sparsity, atol=0.1, rtol=1e-2) """ Tests the correctness of the quantized embedding lookup operator """ diff --git a/test/test_autograd.py b/test/test_autograd.py index 7ec70e8fbe17..5c8acd70a07a 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -5039,7 +5039,7 @@ def run_functional_checks(test_case, test_name, name, apply_fn, run_grad_checks, 'cosh', '__rmul__', 'sgn', 'abs', 'dot', 'vdot', 'tensor_split', 'matmul', 'bmm', 'mv', 'ger', 'diagonal', 'atan', 'angle', 'tanh', 'fill_', 'sub', 'exp', 'mean', 'inverse', 'triangular_solve', 'solve', 'addcmul', - 'addcdiv', 'linalg.tensorinv', ] + separate_complex_tests + 'addcdiv', 'linalg.tensorinv', 'matrix_exp'] + separate_complex_tests def add_test( name, diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py index dad0e2feb5b2..8f6d0206de34 100644 --- a/test/test_binary_ufuncs.py +++ b/test/test_binary_ufuncs.py @@ -15,7 +15,8 @@ torch_to_numpy_dtype_dict, make_tensor) from torch.testing._internal.common_device_type import ( instantiate_device_type_tests, onlyCUDA, onlyCPU, dtypes, dtypesIfCUDA, - dtypesIfCPU, deviceCountAtLeast, precisionOverride, onlyOnCPUAndCUDA) + dtypesIfCPU, deviceCountAtLeast, precisionOverride, onlyOnCPUAndCUDA, + skipCUDAIfRocm) # TODO: remove this def _generate_input(shape, dtype, device, with_extremal): @@ -1357,27 +1358,136 @@ def test_rdiv(self, device, dtype): z = torch.tensor([30 / v.item() for v in x], device=device) 
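The new fmod tests that follow pin down division-by-zero behavior: floating-point fmod by zero yields NaN on both CPU and CUDA, while integral fmod by zero raises a RuntimeError mentioning ZeroDivisionError on CPU (CUDA results for integral types are dtype-dependent). A small CPU-only illustration, separate from the tests themselves:

import torch

x = torch.tensor([1.5, -2.5, 3.0])
print(torch.fmod(x, 0.0))      # tensor([nan, nan, nan])

xi = torch.tensor([1, -2, 3])
try:
    torch.fmod(xi, 0)          # integral fmod by zero raises on CPU
except RuntimeError as err:
    print(err)                 # message contains "ZeroDivisionError"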
self.assertEqual(y, z, exact_dtype=False) - @onlyCPU + @dtypes(*torch.testing.get_all_fp_dtypes(include_bfloat16=False)) + def test_fmod_by_zero_float(self, device, dtype): + # check floating-point tensor fmod to zero is nan on both CPU and GPU + x = make_tensor((10, 10), device=device, dtype=dtype, low=-9, high=9) + zero = torch.zeros_like(x) + + self.assertTrue(torch.all(x.fmod(0.0).isnan())) + self.assertTrue(torch.all(x.fmod(zero).isnan())) + # out + out = torch.empty(0, device=device, dtype=dtype) + torch.fmod(x, zero, out=out) + self.assertEqual(out.size(), torch.Size([10, 10])) + self.assertTrue(torch.all(out.isnan())) + # in-place + x.fmod_(zero) + self.assertTrue(torch.all(x.isnan())) + + @onlyOnCPUAndCUDA # Check Issue https://github.com/pytorch/pytorch/issues/48130 + @skipCUDAIfRocm # Error happens on both ROCM and XLA + @dtypes(*torch.testing.get_all_int_dtypes()) + def test_fmod_by_zero_integral(self, device, dtype): + # check integral tensor fmod to zero + x = make_tensor((10, 10), device=device, dtype=dtype, low=-9, high=9) + zero = torch.zeros_like(x) + # out + out = torch.empty(0, device=device, dtype=dtype) + # In-place + x_ = x.clone() + # RuntimeError on CPU + if device == 'cpu': + with self.assertRaisesRegex(RuntimeError, "ZeroDivisionError"): + x.fmod(zero) + with self.assertRaisesRegex(RuntimeError, "ZeroDivisionError"): + torch.fmod(x, zero, out=out) + with self.assertRaisesRegex(RuntimeError, "ZeroDivisionError"): + x.fmod_(zero) + # Different value for different dtype on GPU + else: + if dtype == torch.int64: + self.assertEqual(x.fmod(zero) == 4294967295, x >= 0) + self.assertEqual(x.fmod(zero) == -1, x < 0) + # out + torch.fmod(x, zero, out=out) + self.assertEqual(out == 4294967295, x >= 0) + self.assertEqual(out == -1, x < 0) + self.assertEqual(out.size(), torch.Size([10, 10])) + # in-place + x_.fmod_(zero) + self.assertEqual(x_ == 4294967295, x >= 0) + self.assertEqual(x_ == -1, x < 0) + else: + value = 255 if dtype == torch.uint8 else -1 + self.assertTrue(torch.all(x.fmod(zero) == value)) + # out + torch.fmod(x, zero, out=out) + self.assertTrue(torch.all(out == value)) + self.assertEqual(out.size(), torch.Size([10, 10])) + # in-place + x_.fmod_(zero) + self.assertTrue(torch.all(x_ == value)) + @dtypes(*torch.testing.get_all_dtypes(include_bfloat16=False, include_bool=False, include_complex=False)) def test_fmod(self, device, dtype): - m1 = torch.Tensor(10, 10).uniform_(-10., 10.).to(dtype=dtype, device=device) - res1 = m1.clone() - q = 3 - res1[:, 3].fmod_(q) - res2 = m1.clone() - for i in range(m1.size(1)): - res2[i, 3] = math.fmod(res2[i, 3], q) - self.assertEqual(res1, res2) + # Use numpy as reference + def _reference_implementation(x, mod): + np_x = x.cpu().numpy() + np_mod = 0 + # No type promotion + # Issue #47779: https://github.com/pytorch/pytorch/issues/47779 + if torch.is_tensor(mod): + np_mod = mod.cpu().numpy() + else: + np_mod = mod + # Non XLA platform needs to cast to int + if dtype in torch.testing.get_all_int_dtypes() and self.device_type in ['cpu', 'cuda']: + np_mod = int(np_mod) + exp = np.fmod(np_x, np_mod) + exp = torch.from_numpy(exp) + + res = torch.fmod(x, mod) + res = res.to(exp.dtype) + self.assertEqual(res, exp) + # out + out = torch.empty(0, device=device, dtype=dtype) + torch.fmod(x, mod, out=out) + out.to(exp.dtype) + self.assertEqual(out, exp) + self.assertEqual(out.size(), torch.Size([10, 10])) + # in-place + x.fmod_(mod) + x.to(exp.dtype) + self.assertEqual(out, exp) + + x = make_tensor((10, 10), device=device, dtype=dtype, 
low=-9, high=9) + # Exclude 0 + # mod with same dtype as x + mod = make_tensor((10, 10), device=device, dtype=dtype, low=1, high=9) + # mod with floating-point dtype + mod_float = make_tensor((10, 10), device=device, + dtype=torch.float if dtype in torch.testing.get_all_int_dtypes() else dtype, + low=1, high=9) + # non-contiguous + x_nc = x.t() + mod_nc = mod.t() - zero = torch.zeros_like(m1) + # Mods: Integer, Float, Tensor, Non-contiguous Tensor + mods = [3, 2.3, mod, mod_nc] + for m in mods: + _reference_implementation(x, m) + _reference_implementation(x_nc, m) + + # Integral Tensor fmod to floating-point Tensor + # Can not cast floating-point result to original integral Tensor without type promotion if dtype in torch.testing.get_all_int_dtypes(): - with self.assertRaisesRegex(RuntimeError, "ZeroDivisionError"): - m1.fmod(0) - with self.assertRaisesRegex(RuntimeError, "ZeroDivisionError"): - m1.fmod(zero) + res = torch.fmod(x, mod_float) + exp = np.fmod(x.cpu().numpy(), mod_float.cpu().numpy()) + exp = torch.from_numpy(exp) + res = res.to(exp.dtype) + self.assertEqual(res, exp) + with self.assertRaisesRegex(RuntimeError, "result type (Half|Float|Double) " + "can't be cast to the desired " + "output type (Byte|Char|Short|Int|Long)"): + out = torch.empty(0, device=device, dtype=dtype) + torch.fmod(x, mod_float, out=out) + with self.assertRaisesRegex(RuntimeError, "result type (Half|Float|Double) " + "can't be cast to the desired " + "output type (Byte|Char|Short|Int|Long)"): + x.fmod_(mod_float) else: - self.assertTrue(torch.all(m1.fmod(0).isnan())) - self.assertTrue(torch.all(m1.fmod(zero).isnan())) + _reference_implementation(x, mod_float) @onlyCPU @dtypes(torch.float, torch.long) @@ -1596,11 +1706,6 @@ def reference_implementation(res2): def test_cdiv(self, device, dtype): self._test_cop(torch.div, lambda x, y: x / y, dtype, device) - @onlyCPU - @dtypes(torch.float) - def test_cfmod(self, device, dtype): - self._test_cop(torch.fmod, math.fmod, dtype, device) - @onlyCPU @dtypes(torch.float) def test_cremainder(self, device, dtype): diff --git a/test/test_dataloader.py b/test/test_dataloader.py index 0d6ee2e03bd6..a1afc216d42a 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -484,6 +484,17 @@ def __len__(self): return self.size +class EmptyTensorDataset(torch.utils.data.Dataset): + def __init__(self, len): + self.len = len + + def __len__(self): + return self.len + + def __getitem__(self, any): + return torch.empty(0) + + class SynchronizedSeedDataset(SynchronizedDataset): def __getitem__(self, idx): self.sync_once() @@ -504,6 +515,24 @@ def _test_timeout_pin_memory(persistent_workers): _ = next(iter(dataloader)) +def _test_large_sampler_indices(persistent_workers): + # See + # test_large_sampler_indices + # https://github.com/pytorch/pytorch/issues/48666 + + dataloader = torch.utils.data.DataLoader( + EmptyTensorDataset(10000000), + batch_size=40960, + persistent_workers=persistent_workers, + num_workers=1) + + it = iter(dataloader) + + for x in it: + assert x.numel() == 0 + raise RuntimeError('My Error') + + def disable_stderr(worker_id): r""" Avoids printing "ERROR: Unexpected segmentation fault encountered in worker." @@ -978,6 +1007,24 @@ def test_timeout(self): finally: p.terminate() + def test_large_sampler_indices(self): + # Test that the data loader cleanly exit when the process errors + # 1. having an reference to the iterator + # 2. using a sampler that yields big elements s.t. 
_index_queues putters block + # + # More context: https://github.com/pytorch/pytorch/issues/48666 + + p = ErrorTrackingProcess(target=_test_large_sampler_indices, args=(self.persistent_workers,)) + p.start() + p.join(JOIN_TIMEOUT) + try: + self.assertFalse(p.is_alive()) + self.assertNotEqual(p.exitcode, 0) + self.assertIsInstance(p.exception, RuntimeError) + self.assertRegex(str(p.exception), r'My Error') + finally: + p.terminate() + def test_invalid_ctor_args_combinations(self): # general with self.assertRaisesRegex(ValueError, "num_workers option should be non-negative"): diff --git a/test/test_determination.py b/test/test_determination.py index 0f860cab5101..7e9420285e5a 100644 --- a/test/test_determination.py +++ b/test/test_determination.py @@ -112,6 +112,7 @@ def test_torch_file(self): "distributed/test_distributed_fork", "test_cpp_extensions_aot_ninja", "test_cpp_extensions_aot_no_ninja", + "test_utils", "test_determination", ], ) diff --git a/test/test_kernel_launch_checks.py b/test/test_kernel_launch_checks.py index 8796b9913f73..079a7182a1fc 100644 --- a/test/test_kernel_launch_checks.py +++ b/test/test_kernel_launch_checks.py @@ -9,26 +9,26 @@ def test_check_code(self): # Try some different spacings self.assertEqual(2, check_code_for_cuda_kernel_launches(""" some_function_call<<<1,2,0,stream>>>(arg1,arg2,arg3); -TORCH_CUDA_KERNEL_LAUNCH_CHECK(); +C10_CUDA_KERNEL_LAUNCH_CHECK(); some_function_call<<<1,2,0,stream>>>(arg1,arg2,arg3); some_function_call<<<1,2,0,stream>>>(arg1,arg2,arg3); -TORCH_CUDA_KERNEL_LAUNCH_CHECK(); +C10_CUDA_KERNEL_LAUNCH_CHECK(); some_function_call<<<1,2,0,stream>>>(arg1,arg2,arg3); some_other_stuff; some_function_call<<<1,2,0,stream>>>(arg1,arg2,arg3); -TORCH_CUDA_KERNEL_LAUNCH_CHECK(); +C10_CUDA_KERNEL_LAUNCH_CHECK(); some_function_call<<<1,2,0,stream>>> (arg1,arg2,arg3); -TORCH_CUDA_KERNEL_LAUNCH_CHECK(); +C10_CUDA_KERNEL_LAUNCH_CHECK(); some_function_call<<<1,2,0,stream>>> ( arg1 , arg2 , arg3 ) ; - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); """)) # Does it work for macros? 
self.assertEqual(0, check_code_for_cuda_kernel_launches(""" #define SOME_MACRO(x) some_function_call<<<1,2>>> ( x ) ; \\ - TORCH_CUDA_KERNEL_LAUNCH_CHECK(); + C10_CUDA_KERNEL_LAUNCH_CHECK(); """)) def test_check_cuda_launches(self): diff --git a/test/test_linalg.py b/test/test_linalg.py index 5b182b2fd49a..71c3cf654c1b 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -2180,6 +2180,131 @@ def test_dot_invalid_args(self, device): self._test_dot_vdot_invalid_args(device, torch.dot) self._test_dot_vdot_invalid_args(device, torch.dot, complex_dtypes=True) + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) + def test_matrix_rank(self, device, dtype): + matrix_rank = torch.linalg.matrix_rank + + def run_test(shape0, shape1, batch): + a = torch.randn(*batch, shape0, shape1, dtype=dtype, device=device) + rank_a = matrix_rank(a) + + self.assertEqual(rank_a, matrix_rank(a.conj().transpose(-2, -1))) + aaH = torch.matmul(a, a.conj().transpose(-2, -1)) + rank_aaH = matrix_rank(aaH) + rank_aaH_hermitian = matrix_rank(aaH, hermitian=True) + self.assertEqual(rank_aaH, rank_aaH_hermitian) + aHa = torch.matmul(a.conj().transpose(-2, -1), a) + self.assertEqual(matrix_rank(aHa), matrix_rank(aHa, hermitian=True)) + + # check against NumPy + self.assertEqual(rank_a, np.linalg.matrix_rank(a.cpu().numpy())) + self.assertEqual(matrix_rank(a, 0.01), np.linalg.matrix_rank(a.cpu().numpy(), 0.01)) + + self.assertEqual(rank_aaH, np.linalg.matrix_rank(aaH.cpu().numpy())) + self.assertEqual(matrix_rank(aaH, 0.01), np.linalg.matrix_rank(aaH.cpu().numpy(), 0.01)) + + # hermitian flag for NumPy was added in 1.14.0 + if np.lib.NumpyVersion(np.__version__) >= '1.14.0': + self.assertEqual(rank_aaH_hermitian, + np.linalg.matrix_rank(aaH.cpu().numpy(), hermitian=True)) + self.assertEqual(matrix_rank(aaH, 0.01, True), + np.linalg.matrix_rank(aaH.cpu().numpy(), 0.01, True)) + + # check out= variant + out = torch.empty(a.shape[:-2], dtype=torch.int64, device=device) + ans = matrix_rank(a, out=out) + self.assertEqual(ans, out) + self.assertEqual(ans, rank_a) + + shapes = (3, 13) + batches = ((), (0, ), (4, ), (3, 5, )) + for (shape0, shape1), batch in zip(itertools.product(shapes, reversed(shapes)), batches): + run_test(shape0, shape1, batch) + + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) + def test_matrix_rank_empty(self, device, dtype): + matrix_rank = torch.linalg.matrix_rank + + # NumPy doesn't work for input with no elements + def run_test(shape0, shape1, batch): + a = torch.randn(*batch, shape0, shape1, dtype=dtype, device=device) + rank_a = matrix_rank(a) + expected = torch.zeros(batch, dtype=torch.int64, device=device) + + self.assertEqual(rank_a, matrix_rank(a.conj().transpose(-2, -1))) + + aaH = torch.matmul(a, a.conj().transpose(-2, -1)) + rank_aaH = matrix_rank(aaH) + rank_aaH_hermitian = matrix_rank(aaH, hermitian=True) + self.assertEqual(rank_aaH, rank_aaH_hermitian) + + aHa = torch.matmul(a.conj().transpose(-2, -1), a) + self.assertEqual(matrix_rank(aHa), matrix_rank(aHa, hermitian=True)) + + self.assertEqual(rank_a, expected) + self.assertEqual(matrix_rank(a, 0.01), expected) + + self.assertEqual(rank_aaH, expected) + self.assertEqual(matrix_rank(aaH, 0.01), expected) + + self.assertEqual(rank_aaH_hermitian, expected) + self.assertEqual(matrix_rank(aaH, 0.01, True), expected) + + batches = ((), (4, ), (3, 5, )) + for batch in batches: + run_test(0, 0, batch) + 
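These test_linalg.py hunks add coverage for the new torch.linalg.matrix_rank, which returns an integer tensor (batched when the input is batched) and accepts a hermitian flag plus an optional tolerance. A short usage sketch, assuming a build with LAPACK available:

import torch

a = torch.eye(10, dtype=torch.complex128)
print(torch.linalg.matrix_rank(a))                  # tensor(10)
a[5, 5] = 0
print(torch.linalg.matrix_rank(a))                  # tensor(9)
print(torch.linalg.matrix_rank(a, hermitian=True))  # tensor(9); `a` is Hermitian here

b = torch.randn(3, 5, 4, 13)                        # batch of 3 x 5 matrices, each 4 x 13
print(torch.linalg.matrix_rank(b).shape)            # torch.Size([3, 5])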
run_test(0, 3, batch) + run_test(3, 0, batch) + + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) + def test_matrix_rank_basic(self, device, dtype): + matrix_rank = torch.linalg.matrix_rank + + a = torch.eye(10, dtype=dtype, device=device) + self.assertEqual(matrix_rank(a).item(), 10) + self.assertEqual(matrix_rank(a, hermitian=True).item(), 10) + + a[5, 5] = 0 + self.assertEqual(matrix_rank(a).item(), 9) + self.assertEqual(matrix_rank(a, hermitian=True).item(), 9) + + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) + def test_old_matrix_rank(self, device, dtype): + a = torch.eye(10, dtype=dtype, device=device) + self.assertEqual(torch.matrix_rank(a).item(), 10) + self.assertEqual(torch.matrix_rank(a, True).item(), 10) + + a[5, 5] = 0 + self.assertEqual(torch.matrix_rank(a).item(), 9) + self.assertEqual(torch.matrix_rank(a, True).item(), 9) + + a = torch.randn(24, 42, dtype=dtype, device=device) + self.assertEqual(torch.matrix_rank(a), torch.matrix_rank(a.t())) + aaT = torch.mm(a, a.conj().t()) + self.assertEqual(torch.matrix_rank(aaT), torch.matrix_rank(aaT, True)) + aTa = torch.mm(a.conj().t(), a) + self.assertEqual(torch.matrix_rank(aTa), torch.matrix_rank(aTa, True)) + + a = torch.randn(35, 75, dtype=dtype, device=device) + self.assertEqual(torch.matrix_rank(a), np.linalg.matrix_rank(a.cpu().numpy())) + self.assertEqual(torch.matrix_rank(a, 0.01), np.linalg.matrix_rank(a.cpu().numpy(), 0.01)) + + aaT = torch.mm(a, a.conj().t()) + self.assertEqual(torch.matrix_rank(aaT), np.linalg.matrix_rank(aaT.cpu().numpy())) + self.assertEqual(torch.matrix_rank(aaT, 0.01), np.linalg.matrix_rank(aaT.cpu().numpy(), 0.01)) + + if np.lib.NumpyVersion(np.__version__) >= '1.14.0': + self.assertEqual(torch.matrix_rank(aaT, True), np.linalg.matrix_rank(aaT.cpu().numpy(), True)) + self.assertEqual(torch.matrix_rank(aaT, 0.01, True), np.linalg.matrix_rank(aaT.cpu().numpy(), 0.01, True)) + def triangular_solve_test_helper(self, A_dims, b_dims, upper, unitriangular, device, dtype): triangle_function = torch.triu if upper else torch.tril @@ -3873,38 +3998,6 @@ def run_test(M): self.assertEqual(torch.eye(matsize, dtype=dtype, device=device).expand(sizes), M.pinverse().matmul(M), atol=1e-7, rtol=0, msg='pseudo-inverse for invertible matrix') - @skipCUDAIfNoMagma - @skipCPUIfNoLapack - def test_matrix_rank(self, device): - a = torch.eye(10, device=device) - self.assertEqual(torch.matrix_rank(a).item(), 10) - self.assertEqual(torch.matrix_rank(a, True).item(), 10) - - a[5, 5] = 0 - self.assertEqual(torch.matrix_rank(a).item(), 9) - self.assertEqual(torch.matrix_rank(a, True).item(), 9) - - a = torch.randn(24, 42, device=device) - self.assertEqual(torch.matrix_rank(a), torch.matrix_rank(a.t())) - aaT = torch.mm(a, a.t()) - self.assertEqual(torch.matrix_rank(aaT), torch.matrix_rank(aaT, True)) - aTa = torch.mm(a.t(), a) - self.assertEqual(torch.matrix_rank(aTa), torch.matrix_rank(aTa, True)) - - from numpy.linalg import matrix_rank - a = torch.randn(35, 75, device=device) - self.assertEqual(torch.matrix_rank(a).item(), matrix_rank(a.cpu().numpy())) - self.assertEqual(torch.matrix_rank(a, 0.01).item(), matrix_rank(a.cpu().numpy(), 0.01)) - - aaT = torch.mm(a, a.t()) - self.assertEqual(torch.matrix_rank(aaT).item(), matrix_rank(aaT.cpu().numpy())) - self.assertEqual(torch.matrix_rank(aaT, 0.01).item(), matrix_rank(aaT.cpu().numpy(), 0.01)) - - if np.lib.NumpyVersion(np.__version__) >= 
'1.14.0': - self.assertEqual(torch.matrix_rank(aaT, True).item(), matrix_rank(aaT.cpu().numpy(), True)) - self.assertEqual(torch.matrix_rank(aaT, 0.01, True).item(), - matrix_rank(aaT.cpu().numpy(), 0.01, True)) - @skipCUDAIfNoMagma @skipCPUIfNoLapack @dtypes(torch.double) @@ -4008,7 +4101,7 @@ def test_matrix_exp_boundary_cases(self, device, dtype): @slowTest @skipCUDAIfNoMagma @skipCPUIfNoLapack - @dtypes(torch.float, torch.double) + @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble) def test_matrix_exp_analytic(self, device, dtype): # check zero matrix x = torch.zeros(20, 20, dtype=dtype, device=device) @@ -4152,7 +4245,7 @@ def run_test(*n): @skipCUDAIfNoMagma @skipCPUIfNoLapack - @dtypes(torch.float, torch.double) + @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble) def test_matrix_exp_compare_with_taylor(self, device, dtype): def normalize_to_1_operator_norm(sample, desired_norm): @@ -4184,10 +4277,10 @@ def get_taylor_approximation(a, deg): return res def scale_square(a, deg): - if a.norm() < 1.0: + if a.abs().pow(2).sum().sqrt() < 1.0: return get_taylor_approximation(a, 12) else: - s = int(torch.log2(a.norm()).ceil().item()) + s = int(torch.log2(a.abs().pow(2).sum().sqrt()).ceil().item()) b = a / (2 ** s) b = get_taylor_approximation(b, 18) for _ in range(s): @@ -4746,7 +4839,7 @@ def maybe_squeeze_result(l, r, result): @skipCUDAIfNoMagma @skipCPUIfNoLapack - @dtypes(torch.double) + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) def test_lu_solve_batched_non_contiguous(self, device, dtype): from numpy.linalg import solve from torch.testing._internal.common_utils import random_fullrank_matrix_distinct_singular_value @@ -4765,20 +4858,22 @@ def lu_solve_test_helper(self, A_dims, b_dims, pivot, device, dtype): from torch.testing._internal.common_utils import random_fullrank_matrix_distinct_singular_value b = torch.randn(*b_dims, dtype=dtype, device=device) - A = random_fullrank_matrix_distinct_singular_value(*A_dims, dtype=dtype, device=device) + A = random_fullrank_matrix_distinct_singular_value(*A_dims, dtype=dtype).to(device) LU_data, LU_pivots, info = torch.lu(A, get_infos=True, pivot=pivot) self.assertEqual(info, torch.zeros_like(info)) return b, A, LU_data, LU_pivots @skipCPUIfNoLapack @skipCUDAIfNoMagma - @dtypes(torch.double) + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) + @precisionOverride({torch.float32: 1e-3, torch.complex64: 1e-3, + torch.float64: 1e-8, torch.complex128: 1e-8}) def test_lu_solve(self, device, dtype): def sub_test(pivot): for k, n in zip([2, 3, 5], [3, 5, 7]): b, A, LU_data, LU_pivots = self.lu_solve_test_helper((n,), (n, k), pivot, device, dtype) x = torch.lu_solve(b, LU_data, LU_pivots) - self.assertLessEqual(b.dist(A.mm(x)), 1e-12) + self.assertEqual(b, A.mm(x)) sub_test(True) if self.device_type == 'cuda': @@ -4786,7 +4881,9 @@ def sub_test(pivot): @skipCUDAIfNoMagma @skipCPUIfNoLapack - @dtypes(torch.double) + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) + @precisionOverride({torch.float32: 1e-3, torch.complex64: 1e-3, + torch.float64: 1e-8, torch.complex128: 1e-8}) def test_lu_solve_batched(self, device, dtype): def sub_test(pivot): def lu_solve_batch_test_helper(A_dims, b_dims, pivot): @@ -4797,7 +4894,8 @@ def lu_solve_batch_test_helper(A_dims, b_dims, pivot): x_exp = torch.stack(x_exp_list) # Stacked output x_act = torch.lu_solve(b, LU_data, LU_pivots) # Actual output self.assertEqual(x_exp, x_act) # Equality check - 
self.assertLessEqual(b.dist(torch.matmul(A, x_act)), 1e-12) # Correctness check + Ax = torch.matmul(A, x_act) + self.assertEqual(b, Ax) for batchsize in [1, 3, 4]: lu_solve_batch_test_helper((5, batchsize), (batchsize, 5, 10), pivot) @@ -4815,20 +4913,20 @@ def lu_solve_batch_test_helper(A_dims, b_dims, pivot): @slowTest @skipCUDAIfNoMagma @skipCPUIfNoLapack - @dtypes(torch.double) + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) def test_lu_solve_batched_many_batches(self, device, dtype): def run_test(A_dims, b_dims): b, A, LU_data, LU_pivots = self.lu_solve_test_helper(A_dims, b_dims, True, device, dtype) x = torch.lu_solve(b, LU_data, LU_pivots) - b_ = torch.matmul(A, x) - self.assertEqual(b_, b.expand_as(b_)) + Ax = torch.matmul(A, x) + self.assertEqual(Ax, b.expand_as(Ax)) run_test((5, 65536), (65536, 5, 10)) run_test((5, 262144), (262144, 5, 10)) @skipCUDAIfNoMagma @skipCPUIfNoLapack - @dtypes(torch.double) + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) def test_lu_solve_batched_broadcasting(self, device, dtype): from numpy.linalg import solve from torch.testing._internal.common_utils import random_fullrank_matrix_distinct_singular_value diff --git a/test/test_ops.py b/test/test_ops.py index 9043e7ac5f20..64afc6d36b25 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -88,20 +88,27 @@ def _grad_test_helper(self, device, dtype, op, variant): def _gradgrad_test_helper(self, device, dtype, op, variant): return self._check_helper(device, dtype, op, variant, 'gradgradcheck') + def _skip_helper(self, op, dtype): + if not op.test_complex_grad and dtype.is_complex: + self.skipTest("Skipped! complex grad tests marked to skip.") + # Tests that gradients are computed correctly @dtypes(torch.double, torch.cdouble) @ops(op_db) def test_fn_grad(self, device, dtype, op): + self._skip_helper(op, dtype) self._grad_test_helper(device, dtype, op, op.get_op()) @dtypes(torch.double, torch.cdouble) @ops(op_db) def test_method_grad(self, device, dtype, op): + self._skip_helper(op, dtype) self._grad_test_helper(device, dtype, op, op.get_method()) @dtypes(torch.double, torch.cdouble) @ops(op_db) def test_inplace_grad(self, device, dtype, op): + self._skip_helper(op, dtype) if not op.test_inplace_grad: self.skipTest("Skipped! Inplace gradcheck marked to skip.") self._grad_test_helper(device, dtype, op, self._get_safe_inplace(op.get_inplace())) @@ -110,16 +117,19 @@ def test_inplace_grad(self, device, dtype, op): @dtypes(torch.double, torch.cdouble) @ops(op_db) def test_fn_gradgrad(self, device, dtype, op): + self._skip_helper(op, dtype) self._gradgrad_test_helper(device, dtype, op, op.get_op()) @dtypes(torch.double, torch.cdouble) @ops(op_db) def test_method_gradgrad(self, device, dtype, op): + self._skip_helper(op, dtype) self._gradgrad_test_helper(device, dtype, op, op.get_method()) @dtypes(torch.double, torch.cdouble) @ops(op_db) def test_inplace_gradgrad(self, device, dtype, op): + self._skip_helper(op, dtype) if not op.test_inplace_grad: self.skipTest("Skipped! 
Inplace gradgradcheck marked to skip.") self._gradgrad_test_helper(device, dtype, op, self._get_safe_inplace(op.get_inplace())) diff --git a/test/test_torch.py b/test/test_torch.py index 378c081cef1a..6c04dd00dc76 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -2312,6 +2312,10 @@ def test_show_config(self): # We can't usefully test the output; just make sure this doesn't crash torch.__config__.show() + @unittest.skipIf(IS_FBCODE, "CXX_FLAGS is only for OSS build.") + def test_cxx_flags(self): + torch.__config__._cxx_flags() + def test_parallel_info(self): torch.__config__.parallel_info() @@ -2536,6 +2540,22 @@ def test_empty_meta(self): z = x + y self.assertEqual(z.size(), (2 ** 20, 2 ** 20)) + def test_upsample_nearest1d_meta(self): + # TODO: this is not a sustainable way of testing meta functions, + # but I want some quick scaffolding first before a more + # integrated testing strategy + # NB: Can't make the exponent too big, or it will overflow + # signed 64-bit integer + x = torch.empty_meta(2 * 10 ** 8, 3, 2 * 10 ** 8) + z = torch.nn.functional.interpolate(x, scale_factor=2) + self.assertEqual(z.size(), (2 * 10 ** 8, 3, 4 * 10 ** 8)) + + # interpolate doesn't seem to support out= + # (not sure why passing None here doesn't work? How strange...) + z = torch.empty_meta(0) + torch._C._nn.upsample_nearest1d(x, (4 * 10 ** 8,), 2, out=z) + self.assertEqual(z.size(), (2 * 10 ** 8, 3, 4 * 10 ** 8)) + def test_normal_shape(self): warned = False for device in torch.testing.get_all_device_types(): @@ -5301,7 +5321,7 @@ def _test_helper(x, y, bias, memory_format): lambda x, y: x.expm1_(), lambda x, y: x.floor(), lambda x, y: x.floor_(), - # lambda x, y: x.fmod(2), # https://github.com/pytorch/pytorch/issues/24565 + lambda x, y: x.fmod(2), lambda x, y: x.frac(), lambda x, y: x.hypot(y), lambda x, y: x.hypot_(y), @@ -6695,7 +6715,6 @@ def inner(self, device, dtype): ('log10', '', _small_3d, lambda t, d: [], 1e-2, 5e-2, 1e-5, torch.testing.get_all_fp_dtypes(), [torch.bfloat16]), ('log1p', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, _float_types_no_half, [torch.bfloat16]), ('log2', '', _small_3d, lambda t, d: [], 1e-2, 1e-1, 1e-5, torch.testing.get_all_fp_dtypes(), [torch.bfloat16]), - ('sigmoid', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, torch.testing.get_all_fp_dtypes()), ('logit', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, torch.testing.get_all_fp_dtypes()), ('sqrt', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, torch.testing.get_all_fp_dtypes(), [torch.bfloat16]), ('tanh', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py index 2447a88decc5..656845598a49 100644 --- a/test/test_unary_ufuncs.py +++ b/test/test_unary_ufuncs.py @@ -857,18 +857,6 @@ def test_hardswish(self, device, dtype): torch.nn.functional.hardswish(inputTensorCpy, inplace=True) self.assertEqual(inputTensorCpy, expectedOutputTensor) - @onlyCPU - @dtypes(torch.float, torch.double) - def test_sigmoid(self, device, dtype): - # TODO: why not simulate math.sigmoid like with rsqrt? 
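The meta-tensor tests added to test_torch.py above rely on the fact that a tensor on the meta backend carries only metadata (sizes, dtype), so shape propagation can be checked without allocating storage, even for shapes far larger than available memory. A tiny sketch of that idea, mirroring what test_empty_meta already does:

import torch

x = torch.empty_meta(2 ** 20, 2 ** 20)  # no multi-terabyte allocation happens
y = torch.empty_meta(2 ** 20, 2 ** 20)
z = x + y                               # only the output's metadata is computed
print(z.size())                         # torch.Size([1048576, 1048576])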
- inputValues = [-1000, -1, 0, 0.5, 1, 2, 1000] - expectedOutput = [0.0000, 0.2689, 0.5, 0.6225, 0.7311, 0.8808, 1.000] - precision_4dps = 0.0002 - - self.assertEqual(torch.tensor(inputValues, dtype=dtype, device=device).sigmoid(), - torch.tensor(expectedOutput, dtype=dtype, device=device), - atol=precision_4dps, rtol=0) - @precisionOverride({torch.bfloat16: 1e-2, torch.float: 0.0002, torch.double: 0.0002}) @dtypesIfCUDA(torch.float, torch.double, torch.bfloat16) @dtypes(torch.float, torch.double) diff --git a/test/test_utils.py b/test/test_utils.py index 1e6449d3764c..5f1e693ab12f 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -3,17 +3,20 @@ import re import shutil import random +import subprocess import tempfile +import textwrap import unittest import torch import torch.nn as nn import torch.utils.data import torch.cuda from torch.utils.checkpoint import checkpoint, checkpoint_sequential +import torch.utils.cpp_extension import torch.hub as hub from torch.autograd._functions.utils import check_onnx_broadcast from torch.onnx.symbolic_opset9 import _prepare_onnx_paddings -from torch.testing._internal.common_utils import load_tests, retry, IS_SANDCASTLE +from torch.testing._internal.common_utils import load_tests, retry, IS_SANDCASTLE, IS_WINDOWS from urllib.error import URLError # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for @@ -662,5 +665,58 @@ def forward(self, x): ms(torch.tensor([False], dtype=torch.bool)) +@unittest.skipIf(IS_SANDCASTLE, "cpp_extension is OSS only.") +class TestStandaloneCPPJIT(TestCase): + def test_load_standalone(self): + build_dir = tempfile.mkdtemp() + try: + src_path = os.path.join(build_dir, "main.cpp") + src = textwrap.dedent("""\ + #include + #include + int main() { + auto x = torch::eye(3); + std::cout << x << std::endl; + } + """) + with open(src_path, "wt") as f: + f.write(src) + + exec_path = torch.utils.cpp_extension.load( + "standalone_load_test", + src_path, + build_directory=build_dir, + is_python_module=False, + is_standalone=True, + ) + + ext = ".exe" if IS_WINDOWS else "" + self.assertEqual( + exec_path, + os.path.join(build_dir, f"standalone_load_test{ext}") + ) + + for shell in [True, False]: + r = subprocess.run( + [exec_path], + shell=shell, + stdout=subprocess.PIPE, + ) + self.assertEqual(r.returncode, 0) + self.assertEqual( + # Windows prints "\r\n" for newlines. 
+ textwrap.dedent(r.stdout.decode("utf-8")).replace("\r\n", "\n"), + textwrap.dedent("""\ + 1 0 0 + 0 1 0 + 0 0 1 + [ CPUFloatType{3,3} ] + """) + ) + + finally: + shutil.rmtree(build_dir) + + if __name__ == '__main__': run_tests() diff --git a/tools/autograd/gen_annotated_fn_args.py b/tools/autograd/gen_annotated_fn_args.py index 6003b58c1784..c393c905c73f 100644 --- a/tools/autograd/gen_annotated_fn_args.py +++ b/tools/autograd/gen_annotated_fn_args.py @@ -52,7 +52,7 @@ def gen_annotated(native_yaml_path: str, out: str, autograd_dir: str) -> None: @with_native_function def gen_annotated_args(f: NativeFunction) -> str: out_args: List[Dict[str, Any]] = [] - for arg in f.func.arguments: + for arg in f.func.arguments.positional: if arg.default is not None: continue out_arg: Dict[str, Any] = {} diff --git a/tools/autograd/gen_trace_type.py b/tools/autograd/gen_trace_type.py index 6662b2a6ef7f..e55402f9e68d 100644 --- a/tools/autograd/gen_trace_type.py +++ b/tools/autograd/gen_trace_type.py @@ -137,7 +137,7 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen if f.func.is_out_fn(): # for *_out functions, handle the result argument differently for inplace/outplace. # For inplace: just add the input to the end to confirm with the JIT schema - name = f.func.out_arguments[0].name # TODO: old codegen behavior - should fix + name = f.func.arguments.out[0].name # TODO: old codegen behavior - should fix inplace = ADD_TRACE_INPUT.substitute(name=name, input=name) # for outplace: do nothing, except if the function is a factory. @@ -145,7 +145,7 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen # take an extra TensorOptions argument, which is missing in the _out function has_tensor_return = any(r.type.is_tensor_like() for r in f.func.returns) has_tensor_input_arg = any(a.type.is_tensor_like() - for a in itertools.chain(f.func.arguments, f.func.kwarg_only_arguments)) + for a in itertools.chain(f.func.arguments.positional, f.func.arguments.kwarg_only)) is_factory_method = f.category_override == 'factory' or (has_tensor_return and not has_tensor_input_arg) # HACK: preserve old codegen behavior - the old codegen set the `is_factory_method` @@ -251,7 +251,7 @@ def format_prerecord_trace(f: NativeFunction) -> str: add_trace_inputs=format_trace_inputs(f) + additional_inputs, inplace_guard=INPLACE_GUARD.substitute( name=cpp.name(f.func), - mutable_input=f.func.out_arguments[0].name if f.func.out_arguments else 'self', + mutable_input=f.func.arguments.out[0].name if f.func.arguments.out else 'self', ) if is_inplace else '', ) @@ -269,7 +269,7 @@ def format_postrecord_trace(f: NativeFunction) -> str: # For outplacing ops, *_out overloads require special handling to move the # output *argument* to a return value if f.func.is_out_fn(): - output_names_outplace = [arg.name for arg in f.func.out_arguments] + output_names_outplace = [arg.name for arg in f.func.arguments.out] output_names_inplace = cpp.return_names(f) # Code size optimization: the common case is that the return value is diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index b149194dc7d6..e60d90e4ecf8 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -78,7 +78,7 @@ 'bmm', 'diagonal', 'alias', 'atan', 'log', 'log10', 'log1p', 'log2', 'reciprocal', 'tan', 'pow', 'rsqrt', 'tanh', 'tanh_backward', 'asinh', 'acosh', 'take', 'fill_', 'exp', 'nonzero', 'mean', 'inverse', 'solve', 'linalg_cholesky', 'addcmul', 
'addcdiv', - 'linalg_eigh', + 'matrix_exp', 'linalg_eigh', } # Some operators invalidate the grad_accumulator. Let's reset it. diff --git a/tools/codegen/api/cpp.py b/tools/codegen/api/cpp.py index a2f576781761..b20497b5a82c 100644 --- a/tools/codegen/api/cpp.py +++ b/tools/codegen/api/cpp.py @@ -1,7 +1,7 @@ from tools.codegen.model import * from tools.codegen.api.types import * import tools.codegen.local as local -from typing import Optional, Sequence, Union, Callable, List +from typing import Optional, Sequence, Union, List # This file describes the translation of JIT schema to the public C++ # API, which is what people use when they call functions like at::add. @@ -157,7 +157,7 @@ def return_names(f: NativeFunction) -> Sequence[str]: # corresponding output function (r.name will get recorded # in field_name later.) elif f.func.is_out_fn(): - name = f.func.out_arguments[i].name + name = f.func.arguments.out[i].name # If the return argument is explicitly named... elif r.name: name_conflict = any(r.name == a.name for a in f.func.schema_order_arguments()) @@ -251,15 +251,15 @@ def argument_not_this( assert_never(a) def argument( - a: Union[Argument, TensorOptionsArguments, ThisArgument], + a: Union[Argument, TensorOptionsArguments, SelfArgument], ) -> Union[CppSingleArgumentPack, CppThisArgumentPack]: - if isinstance(a, ThisArgument): + if isinstance(a, SelfArgument): return CppThisArgumentPack(argument=a, type=argument_type(a.argument)) else: return CppSingleArgumentPack(argument_not_this(a)) def argument_faithful( - a: Union[Argument, TensorOptionsArguments, ThisArgument], + a: Union[Argument, TensorOptionsArguments, SelfArgument], ) -> CppArgumentPack: if isinstance(a, TensorOptionsArguments): return CppTensorOptionsArgumentPack( @@ -272,46 +272,20 @@ def argument_faithful( else: return argument(a) -# NB: this unconditionally groups arguments def group_arguments( func: FunctionSchema, *, method: bool -) -> Sequence[Union[Argument, TensorOptionsArguments, ThisArgument]]: - args: List[Union[Argument, ThisArgument, TensorOptionsArguments]] = [] - - args.extend(func.out_arguments) - - if method: - args.extend(ThisArgument(a) if a.name == "self" else a for a in func.arguments) - else: - args.extend(func.arguments) - - # group up arguments for tensor options - - def pred(name: str, ty: Type) -> Callable[[Argument], bool]: - return lambda a: a.name == name and a.type in [ty, OptionalType(ty)] - predicates = [ # order matters - pred('dtype', Type.parse('ScalarType')), - pred('layout', Type.parse('Layout')), - pred('device', Type.parse('Device')), - pred('pin_memory', Type.parse('bool')), - ] - - i = 0 - while i < len(func.kwarg_only_arguments): - # If there is enough space... 
- if i <= len(func.kwarg_only_arguments) - len(predicates): - # And the next len(predicates) arguments look like TensorOptions arguments - if all(p(a) for p, a in zip(predicates, func.kwarg_only_arguments[i : i + len(predicates)])): - # Group them together as one argument - args.append(TensorOptionsArguments( - dtype=func.kwarg_only_arguments[i], - layout=func.kwarg_only_arguments[i + 1], - device=func.kwarg_only_arguments[i + 2], - pin_memory=func.kwarg_only_arguments[i + 3], - )) - i += len(predicates) - continue - args.append(func.kwarg_only_arguments[i]) - i += 1 - +) -> Sequence[Union[Argument, TensorOptionsArguments, SelfArgument]]: + args: List[Union[Argument, SelfArgument, TensorOptionsArguments]] = [] + args.extend(func.arguments.out) + args.extend(func.arguments.pre_self_positional) + if func.arguments.self_arg is not None: + if method: + args.append(func.arguments.self_arg) + else: + args.append(func.arguments.self_arg.argument) + args.extend(func.arguments.post_self_positional) + args.extend(func.arguments.pre_tensor_options_kwarg_only) + if func.arguments.tensor_options is not None: + args.append(func.arguments.tensor_options) + args.extend(func.arguments.post_tensor_options_kwarg_only) return args diff --git a/tools/codegen/api/dispatcher.py b/tools/codegen/api/dispatcher.py index 813f06feb741..8f3925de0041 100644 --- a/tools/codegen/api/dispatcher.py +++ b/tools/codegen/api/dispatcher.py @@ -68,7 +68,7 @@ def name(func: FunctionSchema) -> str: def arguments(func: FunctionSchema) -> Tuple[DispatcherArgument, ...]: if local.use_c10_dispatcher().dispatcher_uses_new_style(): - return tuple(map(argument, itertools.chain(func.out_arguments, func.arguments, func.kwarg_only_arguments))) + return tuple(map(argument, itertools.chain(func.arguments.out, func.arguments.positional, func.arguments.kwarg_only))) else: return tuple( DispatcherArgument(type=la.type, name=la.name, argument=la.argument) diff --git a/tools/codegen/api/meta.py b/tools/codegen/api/meta.py index 4e3bf00e01dc..4bfc8e837ec1 100644 --- a/tools/codegen/api/meta.py +++ b/tools/codegen/api/meta.py @@ -55,5 +55,5 @@ def argument(a: Argument) -> MetaArgument: ) def arguments(func: FunctionSchema) -> Sequence[MetaArgument]: - assert not func.out_arguments - return list(map(argument, itertools.chain(func.arguments, func.kwarg_only_arguments))) + assert not func.arguments.out + return list(map(argument, itertools.chain(func.arguments.positional, func.arguments.kwarg_only))) diff --git a/tools/codegen/api/native.py b/tools/codegen/api/native.py index b459ee2c23e2..b9e5257aef85 100644 --- a/tools/codegen/api/native.py +++ b/tools/codegen/api/native.py @@ -1,6 +1,6 @@ from tools.codegen.model import * -from tools.codegen.api.types import TensorOptionsArguments, NativeArgument, ThisArgument +from tools.codegen.api.types import NativeArgument import tools.codegen.api.cpp as cpp from tools.codegen import local @@ -43,7 +43,7 @@ def returns_type(rs: Sequence[Return]) -> str: def argument_type(a: Argument) -> str: return argumenttype_type(a.type, mutable=a.is_write) -def argument(a: Union[Argument, ThisArgument, TensorOptionsArguments]) -> Sequence[NativeArgument]: +def argument(a: Union[Argument, SelfArgument, TensorOptionsArguments]) -> Sequence[NativeArgument]: if isinstance(a, Argument): return [NativeArgument( type=argument_type(a), @@ -51,8 +51,8 @@ def argument(a: Union[Argument, ThisArgument, TensorOptionsArguments]) -> Sequen default=cpp.default_expr(a.default, a.type) if a.default is not None else None, argument=a, )] - 
elif isinstance(a, ThisArgument): - # Erase ThisArgument from the distinction + elif isinstance(a, SelfArgument): + # Erase SelfArgument from the distinction return [NativeArgument( type=argument_type(a.argument), name=a.argument.name, diff --git a/tools/codegen/api/python.py b/tools/codegen/api/python.py index 5ad881b372c5..4b407d45553a 100644 --- a/tools/codegen/api/python.py +++ b/tools/codegen/api/python.py @@ -527,9 +527,9 @@ def signature(f: NativeFunction, *, method: bool = False) -> PythonSignature: # arguments are created based on different rules - see below. args = tuple(a for a in cpp.group_arguments(f.func, method=method) if isinstance(a, Argument)) - input_arg_set = set(a.name for a in f.func.arguments) - kwarg_only_set = set(a.name for a in f.func.kwarg_only_arguments) - out_arg_set = set(a.name for a in f.func.out_arguments) + input_arg_set = set(a.name for a in f.func.arguments.positional) + kwarg_only_set = set(a.name for a in f.func.arguments.kwarg_only) + out_arg_set = set(a.name for a in f.func.arguments.out) input_args = tuple(map(argument, filter(lambda a: a.name in input_arg_set, args))) input_kwargs = tuple(map(argument, filter(lambda a: a.name in kwarg_only_set, args))) @@ -544,7 +544,7 @@ def signature(f: NativeFunction, *, method: bool = False) -> PythonSignature: # source of drift between eager and JIT. Pull this logic out to a shared place. has_tensor_input_arg = any(a.type.is_tensor_like() - for a in itertools.chain(f.func.arguments, f.func.kwarg_only_arguments)) + for a in itertools.chain(f.func.arguments.positional, f.func.arguments.kwarg_only)) if any(a.name == 'requires_grad' for a in f.func.schema_order_arguments()): raise ValueError('argument named requires_grad is reserved, should not explicitly add it in the schema') @@ -657,7 +657,7 @@ def dispatch_lambda_args(ps: PythonSignature, f: NativeFunction) -> Tuple[Dispat ps.deprecated_args_names) cpp_args = list(map(lambda n: m[n], ordered_args)) - out_args: Set[str] = set(a.name for a in f.func.out_arguments) + out_args: Set[str] = set(a.name for a in f.func.arguments.out) # Convert from cpp argument to lambda argument def dispatch_lambda_arg(cpp_arg: CppArgument) -> DispatchLambdaArgument: diff --git a/tools/codegen/api/types.py b/tools/codegen/api/types.py index b5ee4f4ef459..32caf26f223f 100644 --- a/tools/codegen/api/types.py +++ b/tools/codegen/api/types.py @@ -6,28 +6,6 @@ # ------------------------------------------------------------------- # -# Grouping arguments - -# ------------------------------------------------------------------- # - -# Represents the implicit *this argument for method calls in C++ API -@dataclass(frozen=True) -class ThisArgument: - argument: Argument - -# Bundle of arguments that represent a TensorOptions in the C++ API. 
-@dataclass(frozen=True) -class TensorOptionsArguments: - dtype: Argument - layout: Argument - device: Argument - pin_memory: Argument - - def all(self) -> Sequence[Argument]: - return [self.dtype, self.layout, self.device, self.pin_memory] - -# ------------------------------------------------------------------- # - # cpp types # ------------------------------------------------------------------- # @@ -105,7 +83,7 @@ def explicit_arguments(self) -> Sequence[CppArgument]: @dataclass(frozen=True) class CppThisArgumentPack(CppArgumentPackIface): # The grouped JIT argument this formal was derived from - argument: ThisArgument + argument: SelfArgument # C++ type, e.g., Tensor& type: str @@ -210,7 +188,7 @@ def defn(self, name: Optional[str] = None, *, prefix: str = "") -> str: @staticmethod def _from_grouped_arguments( func: FunctionSchema, - arguments: Sequence[Union[Argument, TensorOptionsArguments, ThisArgument]], + arguments: Sequence[Union[Argument, TensorOptionsArguments, SelfArgument]], *, faithful: bool ) -> 'CppSignature': diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index a0a1f21d13dc..4db060acd401 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -244,7 +244,11 @@ def __call__(self, f: Union[StructuredNativeFunctions, NativeFunction]) -> List[ assert_never(f) def gen_structured(self, g: StructuredNativeFunctions) -> List[str]: - if self.dispatch_key not in g.out.dispatch: + if self.dispatch_key == 'Meta': + assert self.dispatch_key not in g.out.dispatch, \ + "Do not explicitly specify Meta dispatch key on structured " \ + "functions, they will be automatically generated for you" + elif self.dispatch_key not in g.out.dispatch: return [] # Inner helper function to close over g @@ -272,35 +276,42 @@ def gen_one(f: NativeFunction) -> Optional[str]: sig = NativeSignature.from_schema(f.func) if self.target is Target.DEFINITION: - out_impl_name = f"at::native::{g.out.dispatch[self.dispatch_key]}" - # TODO: work a little harder to generate fresh names for 'result' # TODO: less praying that I picked the right argument name for 'self' if k is SchemaKind.functional: out_expr = "result" - prologue = "auto result = tensor_from_meta(meta_result);" + if self.dispatch_key == "Meta": + prologue = "auto result = meta_tensor_from_meta(meta_result);" + else: + prologue = "auto result = tensor_from_meta(meta_result);" elif k is SchemaKind.inplace: out_expr = "self" prologue = "// TODO: consistency check assert" elif k is SchemaKind.out: # TODO: generalize this for multi-out - assert len(f.func.out_arguments) == 1, "multi-out structured not supported yet" + assert len(f.func.arguments.out) == 1, "multi-out structured not supported yet" # TODO: properly get the expression as it was brought into # scope by sig - out_expr = f.func.out_arguments[0].name + out_expr = f.func.arguments.out[0].name prologue = f""" // TODO: add a consistency check for meta_result {out_expr}.resize_(meta_result.sizes); """ + if self.dispatch_key == "Meta": + out_impl_call = "// meta function does nothing" + else: + out_impl_name = f"at::native::{g.out.dispatch[self.dispatch_key]}" + out_impl_call = f"{out_impl_name}({out_expr}, {functional_exprs});" + device_guard = "" if is_generic_dispatch_key(self.dispatch_key) or is_cuda_dispatch_key(self.dispatch_key): # TODO: avoid copypasting the computation of self_args, # candidate_args and device_of - self_args = (a for a in f.func.arguments if a.name == "self") - candidate_args = itertools.chain(self_args, f.func.out_arguments, f.func.arguments) + self_args = (a 
for a in f.func.arguments.positional if a.name == "self") + candidate_args = itertools.chain(self_args, f.func.arguments.out, f.func.arguments.positional) device_of = next((f'{a.name}' for a in candidate_args if a.type.is_tensor_like()), None) device_guard = '' @@ -317,7 +328,7 @@ def gen_one(f: NativeFunction) -> Optional[str]: {device_guard} auto meta_result = meta::{meta_name}({functional_exprs}); {prologue} - {out_impl_name}({out_expr}, {functional_exprs}); + {out_impl_call} return {out_expr}; }} """ @@ -358,11 +369,11 @@ def gen_unstructured(self, f: NativeFunction) -> Optional[str]: cuda_guard = "" if is_generic_dispatch_key(self.dispatch_key) or is_cuda_dispatch_key(self.dispatch_key): - self_args = (a for a in f.func.arguments if a.name == "self") + self_args = (a for a in f.func.arguments.positional if a.name == "self") # There is precedence for which argument we use to do # device guard. This describes the precedence order. - candidate_args = itertools.chain(self_args, f.func.out_arguments, f.func.arguments) + candidate_args = itertools.chain(self_args, f.func.arguments.out, f.func.arguments.positional) # Only tensor like arguments are eligible device_of = next((f'{a.name}' for a in candidate_args if a.type.is_tensor_like()), None) @@ -487,8 +498,8 @@ def __call__(self, f: NativeFunction) -> Optional[str]: return None assert not f.func.is_out_fn() - assert len(f.func.arguments) > 0 - assert sum(a.name == 'self' for a in f.func.arguments) == 1 + assert len(f.func.arguments.positional) > 0 + assert sum(a.name == 'self' for a in f.func.arguments.positional) == 1 name = cpp.name(f.func) @@ -757,7 +768,7 @@ def compute_returns_yaml(f: NativeFunction) -> Tuple[List[Dict[str, str]], Dict[ # See Note [name and field_name] ret['field_name'] = r.name if f.func.is_out_fn(): - name_to_field_name[f.func.out_arguments[i].name] = r.name + name_to_field_name[f.func.arguments.out[i].name] = r.name returns.append(ret) @@ -778,7 +789,7 @@ def compute_cpp_argument_yaml(cpp_a: CppArgument, *, schema_order: bool, kwarg_o if cpp_a.default is not None: arg['default'] = cpp_a.default return arg - elif isinstance(cpp_a.argument, ThisArgument): + elif isinstance(cpp_a.argument, SelfArgument): raise AssertionError() elif isinstance(cpp_a.argument, Argument): return compute_argument_yaml( @@ -817,8 +828,8 @@ def compute_declaration_yaml(f: NativeFunction) -> object: # These sets are used to conveniently test if an argument is a # kwarg-only or out argument - kwarg_only_set = set(a.name for a in f.func.kwarg_only_arguments) - out_arg_set = set(a.name for a in f.func.out_arguments) + kwarg_only_set = set(a.name for a in f.func.arguments.kwarg_only) + out_arg_set = set(a.name for a in f.func.arguments.out) sig_group = CppSignatureGroup.from_schema(f.func, method=False) cpp_args = sig_group.signature.arguments() @@ -1048,6 +1059,7 @@ def flatten_pre_group(d: Dict[SchemaKind, NativeFunction]) -> Sequence[Union[Nat # TODO: how come ValuesView isn't a Sequence lol grouped_native_functions = list(concatMap(flatten_pre_group, list(pre_grouped_native_functions.values()))) + structured_native_functions = [g for g in grouped_native_functions if isinstance(g, StructuredNativeFunctions)] template_dir = os.path.join(options.source_path, "templates") @@ -1093,6 +1105,9 @@ def make_file_manager(install_dir: str) -> FileManager: "QuantizedCUDA", "Math", "DefaultBackend", + # Meta is a magic key: it is automatically generated for structured + # kernels + "Meta", ] if options.backend_whitelist: dispatch_keys = [k for k in 
dispatch_keys if is_generic_dispatch_key(k) or k in options.backend_whitelist] @@ -1129,9 +1144,7 @@ def make_file_manager(install_dir: str) -> FileManager: }) cpu_fm.write('MetaFunctions.h', lambda: { - 'declarations': - list(mapMaybe(compute_meta_function_declaration, - (g for g in grouped_native_functions if isinstance(g, StructuredNativeFunctions)))), + 'declarations': list(map(compute_meta_function_declaration, structured_native_functions)), }) schema_selector = selector diff --git a/tools/codegen/model.py b/tools/codegen/model.py index a7d5ea220b46..f270d0737ade 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -1,7 +1,7 @@ import re from dataclasses import dataclass -from typing import List, Dict, Optional, Iterator, Tuple, Set, NoReturn +from typing import List, Dict, Optional, Iterator, Tuple, Set, NoReturn, Sequence, Callable from enum import Enum import itertools @@ -253,7 +253,7 @@ def validate_unstructured(self) -> None: # Validation is for nontrivial invariants that cannot be (conveniently) # encoded in the type system. def __post_init__(self) -> None: - if self.func.out_arguments: + if self.func.arguments.out: assert self.variants == {Variant.function}, "Native functions with out arguments MUST " \ "be declared with only function variant; e.g., variants: function; " \ "otherwise you will tickle a Python argument binding bug " \ @@ -380,20 +380,13 @@ class FunctionSchema: # The name of the operator this function schema describes. name: 'OperatorName' - arguments: Tuple['Argument', ...] - kwarg_only_arguments: Tuple['Argument', ...] # but not including out args - # Unlike in the previous codegen, we have factored out 'out' arguments - # in the canonical representation, removing them from kwarg - # arguments. This choice is justified by numerous downstream - # transformations which treat out arguments specially; additionally, - # you can see that canonicity is not violated! - out_arguments: Tuple['Argument', ...] # these are also kwarg-only + arguments: 'Arguments' # TODO: Need to handle collisions with argument names at some point returns: Tuple['Return', ...] 
def schema_order_arguments(self) -> Iterator['Argument']: - return itertools.chain(self.arguments, self.kwarg_only_arguments, self.out_arguments) + return itertools.chain(self.arguments.positional, self.arguments.kwarg_only, self.arguments.out) @staticmethod def parse(func: str) -> 'FunctionSchema': @@ -404,20 +397,18 @@ def parse(func: str) -> 'FunctionSchema': assert args[-1] == ")", "Expecting closing )" args = args[:-1] name = OperatorName.parse(ops) - arguments, kwarg_only_arguments, out_arguments = parse_arguments(args) + arguments = Arguments.parse(args) returns = parse_returns(return_decl) r = FunctionSchema( name=name, arguments=arguments, - kwarg_only_arguments=kwarg_only_arguments, - out_arguments=out_arguments, returns=returns ) assert str(r) == func, f'{str(r)} != {func}' return r def __post_init__(self) -> None: - for arg, ret in zip(self.out_arguments, self.returns): + for arg, ret in zip(self.arguments.out, self.returns): assert arg.annotation == ret.annotation, \ "Out arguments must have matching return Tensor; furthermore, " \ "the ith-argument needs to correspond to the ith return" @@ -425,14 +416,14 @@ def __post_init__(self) -> None: # This means that all mutable returns should be aliased to a keyword argument # (except for "self", which we explicitly don't treat as an out argument because of its use in methods) # See Note [is_out_fn] - out_and_self = list(self.out_arguments) + [arg for arg in self.arguments if arg.name == "self"] + out_and_self = list(self.arguments.out) + [arg for arg in self.arguments.positional if arg.name == "self"] mutable_returns = [ret for ret in self.returns if ret.annotation is not None and ret.annotation.is_write] for ret in mutable_returns: assert any([ret.annotation == arg.annotation for arg in out_and_self]), \ "All mutable returns must be aliased either to a keyword argument, or to \"self\". " \ "Did you forget to mark an out argument as keyword-only?" - if self.out_arguments: - assert len(self.out_arguments) == len(self.returns), \ + if self.arguments.out: + assert len(self.arguments.out) == len(self.returns), \ "Must return as many arguments as there are out arguments" if self.name.name.inplace: # TODO: fixme @@ -508,7 +499,7 @@ def is_out_fn(self) -> bool: # but just with extra kwargs for the output elements. This # is difficult to actually check for and historically # we only do this check in tools/ - return bool(self.out_arguments) + return bool(self.arguments.out) def kind(self) -> SchemaKind: """ @@ -518,7 +509,7 @@ def kind(self) -> SchemaKind: the result into an explicitly provided out argument. 
""" is_inplace = self.name.name.inplace - is_out = bool(self.out_arguments) + is_out = bool(self.arguments.out) assert not (is_inplace and is_out) if is_inplace: return SchemaKind.inplace @@ -544,16 +535,6 @@ def signature(self) -> 'FunctionSchema': because you cannot overload on mutability annotation) """ - # dataclasses.replace could be used here, but it is less - # type safe so for now I've opted to type everything out - def strip_arg_annotation(a: Argument) -> Argument: - return Argument( - name=a.name, - type=a.type, - default=a.default, # hmmm - annotation=None, - ) - def strip_ret_annotation(r: Return) -> Return: return Return( name=r.name, @@ -570,20 +551,12 @@ def strip_ret_annotation(r: Return) -> Return: ), overload_name="", # stripped ), - arguments=tuple(map(strip_arg_annotation, self.arguments)), - kwarg_only_arguments=tuple(map(strip_arg_annotation, self.kwarg_only_arguments)), - out_arguments=(), # stripped + arguments=self.arguments.signature(), returns=tuple(map(strip_ret_annotation, self.returns)), ) def __str__(self) -> str: - all_arguments: List[str] = [] - all_arguments.extend(map(str, self.arguments)) - if self.kwarg_only_arguments or self.out_arguments: - all_arguments.append('*') - all_arguments.extend(map(str, self.kwarg_only_arguments)) - all_arguments.extend(map(str, self.out_arguments)) - all_arguments_str = ', '.join(all_arguments) + all_arguments_str = str(self.arguments) if len(self.returns) == 1: returns = str(self.returns[0]) # omit parentheses else: @@ -869,6 +842,221 @@ def __str__(self) -> str: return f"{type} {self.name}" +# Represents the self argument for functions that may be methods +@dataclass(frozen=True) +class SelfArgument: + argument: Argument + +# Bundle of arguments that represent a TensorOptions. This is mostly +# relevant for the public C++ API but we bake it into the core data +# model because other APIs often have to interact with it +@dataclass(frozen=True) +class TensorOptionsArguments: + dtype: Argument + layout: Argument + device: Argument + pin_memory: Argument + + def all(self) -> Sequence[Argument]: + return [self.dtype, self.layout, self.device, self.pin_memory] + +@dataclass(frozen=True) +class Arguments: + # pre_self_positional is usually empty, but is notably non-empty + # for where.self, where the condition argument comes before the + # self argument + pre_self_positional: Tuple[Argument, ...] + self_arg: Optional[SelfArgument] + post_self_positional: Tuple[Argument, ...] + + pre_tensor_options_kwarg_only: Tuple[Argument, ...] + tensor_options: Optional[TensorOptionsArguments] + # post_tensor_options is typically memory format, which should be + # part of tensor options but isn't right now, and is usually + # placed after the tensor options arguments + post_tensor_options_kwarg_only: Tuple[Argument, ...] + + # Unlike in the previous codegen, we have factored out 'out' arguments + # in the canonical representation, removing them from kwarg + # arguments. This choice is justified by numerous downstream + # transformations which treat out arguments specially; additionally, + # you can see that canonicity is not violated! + out: Tuple[Argument, ...] 
# these are also kwarg-only + + @property + def positional(self) -> Sequence[Argument]: + ret: List[Argument] = [] + ret.extend(self.pre_self_positional) + if self.self_arg is not None: + ret.append(self.self_arg.argument) + ret.extend(self.post_self_positional) + return ret + + # NB: doesn't contain out arguments + @property + def kwarg_only(self) -> Sequence[Argument]: + ret: List[Argument] = [] + ret.extend(self.pre_tensor_options_kwarg_only) + if self.tensor_options is not None: + ret.extend(self.tensor_options.all()) + ret.extend(self.post_tensor_options_kwarg_only) + return ret + + def signature(self) -> 'Arguments': + # dataclasses.replace could be used here, but it is less + # type safe so for now I've opted to type everything out + def strip_arg_annotation(a: Argument) -> Argument: + return Argument( + name=a.name, + type=a.type, + default=a.default, # hmmm + annotation=None, + ) + + return Arguments( + pre_self_positional=tuple(map(strip_arg_annotation, self.pre_self_positional)), + self_arg=SelfArgument( + strip_arg_annotation(self.self_arg.argument) + ) if self.self_arg is not None else None, + post_self_positional=tuple(map(strip_arg_annotation, self.post_self_positional)), + pre_tensor_options_kwarg_only=tuple(map(strip_arg_annotation, self.pre_tensor_options_kwarg_only)), + # NB: tensor_options guaranteed to not have any alias annotations + tensor_options=self.tensor_options, + post_tensor_options_kwarg_only=tuple(map(strip_arg_annotation, self.post_tensor_options_kwarg_only)), + # out arguments are dropped in signature + out=(), + ) + + + @staticmethod + def _preparse(args: str) -> Tuple[List[Argument], List[Argument], List[Argument]]: + positional: List[Argument] = [] + kwarg_only: List[Argument] = [] + out: List[Argument] = [] + arguments_acc = positional + + # TODO: Use a real parser here; this will get bamboozled + # by signatures that contain things like std::array (note the space) + for arg in args.split(', '): + if not arg: + continue + if arg == '*': + assert arguments_acc is positional, "invalid syntax: kwarg-only specifier * can only occur once" + arguments_acc = kwarg_only + continue + parg = Argument.parse(arg) + # Currently, we rely directly on the invariant that there are NO + # kwarg-only mutating arguments. If you want to relax this, + # we will need a more semantic way of matching that takes + # into account return arguments. In that case, you will have + # to manage out computation a level up, in FunctionSchema. See Note + # [is_out_fn] + if parg.annotation is not None and parg.annotation.is_write: + if arguments_acc is positional: + pass # do nothing + elif arguments_acc is kwarg_only: + arguments_acc = out + else: + assert arguments_acc is not out + arguments_acc.append(parg) + + return positional, kwarg_only, out + + @staticmethod + def parse(args: str) -> 'Arguments': + """ + Input: 'int x, int y, int z' + """ + + # We do this in two phases. First we parse into three + # main categories: positional, kwarg_only, out. + # Then, we reparse positional and kwarg_only to separate + # out the self argument and tensor options arguments. 
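        # An illustrative sketch (hypothetical schema string, not taken from this diff):
        # for an argument list such as
        #     "Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out"
        # the first phase buckets the arguments roughly as
        #     positional -> [self, other]
        #     kwarg_only -> [alpha]
        #     out        -> [out]    (mutable and kwarg-only; see Note [is_out_fn])
        # and the second phase then splits `positional` around `self`, giving
        #     pre_self_positional=(), self_arg=SelfArgument(self),
        #     post_self_positional=(other,), with no TensorOptions group.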
+ + positional, kwarg_only, out = Arguments._preparse(args) + + # Split self argument + self_ix = None + for i, a in enumerate(positional): + if a.name == "self": + self_ix = i + break + pre_self_positional: List[Argument] + self_arg: Optional[SelfArgument] + post_self_positional: List[Argument] + if self_ix is not None: + pre_self_positional = positional[:self_ix] + self_arg = SelfArgument(positional[self_ix]) + post_self_positional = positional[self_ix + 1:] + else: + pre_self_positional = [] + self_arg = None + post_self_positional = positional + + # Group tensor options arguments + pre_tensor_options_kwarg_only: List[Argument] = [] + tensor_options: Optional[TensorOptionsArguments] = None + post_tensor_options_kwarg_only: List[Argument] = [] + kwarg_only_acc = pre_tensor_options_kwarg_only + + def pred(name: str, ty: Type) -> Callable[[Argument], bool]: + return lambda a: a.name == name and a.type in [ty, OptionalType(ty)] + predicates = [ # order matters + pred('dtype', Type.parse('ScalarType')), + pred('layout', Type.parse('Layout')), + pred('device', Type.parse('Device')), + pred('pin_memory', Type.parse('bool')), + ] + + i = 0 + while i < len(kwarg_only): + # If there is enough space... + if i <= len(kwarg_only) - len(predicates): + # And the next len(predicates) arguments look like TensorOptions arguments + if all(p(a) for p, a in zip(predicates, kwarg_only[i : i + len(predicates)])): + assert kwarg_only_acc is pre_tensor_options_kwarg_only + # Group them together as one argument + tensor_options = TensorOptionsArguments( + dtype=kwarg_only[i], + layout=kwarg_only[i + 1], + device=kwarg_only[i + 2], + pin_memory=kwarg_only[i + 3], + ) + i += len(predicates) + kwarg_only_acc = post_tensor_options_kwarg_only + continue + kwarg_only_acc.append(kwarg_only[i]) + i += 1 + + return Arguments( + pre_self_positional=tuple(pre_self_positional), + self_arg=self_arg, + post_self_positional=tuple(post_self_positional), + pre_tensor_options_kwarg_only=tuple(pre_tensor_options_kwarg_only), + tensor_options=tensor_options, + post_tensor_options_kwarg_only=tuple(post_tensor_options_kwarg_only), + out=tuple(out), + ) + + + def __str__(self) -> str: + all_arguments: List[str] = [] + all_arguments.extend(map(str, self.positional)) + if self.kwarg_only or self.out: + all_arguments.append('*') + all_arguments.extend(map(str, self.kwarg_only)) + all_arguments.extend(map(str, self.out)) + return ', '.join(all_arguments) + + def __post_init__(self) -> None: + # TODO: These invariants are weirdly asymmetric? + # TODO: Fancier types? + if self.self_arg is None: + assert not self.pre_self_positional + if self.tensor_options is None: + assert not self.post_tensor_options_kwarg_only + + # Names that validly are __iXXX__ indicating inplace operations. 
# Taken from https://www.python.org/dev/peps/pep-0203/#new-methods # NB: PyTorch hasn't actually implemented all of these @@ -965,40 +1153,3 @@ def parse_returns(return_decl: str) -> Tuple[Return, ...]: if return_decl[0] == '(' and return_decl[-1] == ')': return_decl = return_decl[1:-1] return tuple(Return.parse(arg) for arg in return_decl.split(', ')) - -def parse_arguments(args: str) -> Tuple[Tuple[Argument, ...], Tuple[Argument, ...], Tuple[Argument, ...]]: - """ - Input: 'int x, int y, int z' - Output: positional args, kwarg only args - """ - arguments: List[Argument] = [] - kwarg_only_arguments: List[Argument] = [] - out_arguments: List[Argument] = [] - arguments_acc = arguments - - # TODO: Use a real parser here; this will get bamboozled - # by signatures that contain things like std::array (note the space) - for arg in args.split(', '): - if not arg: - continue - if arg == '*': - assert arguments_acc is arguments, "invalid syntax: kwarg-only specifier * can only occur once" - arguments_acc = kwarg_only_arguments - continue - parg = Argument.parse(arg) - # Currently, we rely directly on the invariant that there are NO - # kwarg-only mutating arguments. If you want to relax this, - # we will need a more semantic way of matching that takes - # into account return arguments. In that case, you will have - # to manage out_arguments computation a level up, in - # FunctionSchema. See Note [is_out_fn] - if parg.annotation is not None and parg.annotation.is_write: - if arguments_acc is arguments: - pass # do nothing - elif arguments_acc is kwarg_only_arguments: - arguments_acc = out_arguments - else: - assert arguments_acc is not out_arguments - arguments_acc.append(parg) - - return tuple(arguments), tuple(kwarg_only_arguments), tuple(out_arguments) diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 441a958adf33..5be6611c6904 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -389,6 +389,7 @@ def _crash_if_csrc_asan() -> _int: ... # THPModule_crashIfCsrcASAN def _crash_if_csrc_ubsan() -> _int: ... # THPModule_crashIfCsrcUBSAN def _crash_if_aten_asan() -> _int: ... # THPModule_crashIfATenASAN def _show_config() -> str: ... # THPModule_showConfig +def _cxx_flags() -> str: ... # THPModule_cxxFlags def _parallel_info() -> str: ... # THPModule_parallelInfo def _set_backcompat_broadcast_warn(arg: _bool) -> None: ... # THPModule_setBackcompatBroadcastWarn def _get_backcompat_broadcast_warn() -> _bool: ... # THPModule_getBackcompatBroadcastWarn diff --git a/torch/__config__.py b/torch/__config__.py index e4c3fde9ec3c..edddcbce4645 100644 --- a/torch/__config__.py +++ b/torch/__config__.py @@ -9,8 +9,11 @@ def show(): return torch._C._show_config() # TODO: In principle, we could provide more structured version/config -# information here. We're not for now; considering doing so if someone -# asks for it. +# information here. For now only CXX_FLAGS is exposed, as Timer +# uses them. 
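# A minimal usage sketch (this mirrors how the benchmark utilities further below
# consume the function; the exact flags depend on the local build):
#
#   cxx_flags = torch.__config__._cxx_flags().strip().split()
#   # e.g. forwarded as `extra_cflags` when JIT-compiling a C++ timing extension.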
+def _cxx_flags(): + """Returns the CXX_FLAGS used when building PyTorch.""" + return torch._C._cxx_flags() def parallel_info(): r"""Returns detailed string with parallelization settings""" diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index 2d8533b512df..99f504a05c9c 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -1809,26 +1809,26 @@ def add_docstr_all(method, docstr): add_docstr_all('index_put_', r""" -index_put_(indices, value, accumulate=False) -> Tensor +index_put_(indices, values, accumulate=False) -> Tensor -Puts values from the tensor :attr:`value` into the tensor :attr:`self` using +Puts values from the tensor :attr:`values` into the tensor :attr:`self` using the indices specified in :attr:`indices` (which is a tuple of Tensors). The -expression ``tensor.index_put_(indices, value)`` is equivalent to -``tensor[indices] = value``. Returns :attr:`self`. +expression ``tensor.index_put_(indices, values)`` is equivalent to +``tensor[indices] = values``. Returns :attr:`self`. -If :attr:`accumulate` is ``True``, the elements in :attr:`value` are added to +If :attr:`accumulate` is ``True``, the elements in :attr:`values` are added to :attr:`self`. If accumulate is ``False``, the behavior is undefined if indices contain duplicate elements. Args: indices (tuple of LongTensor): tensors used to index into `self`. - value (Tensor): tensor of same dtype as `self`. + values (Tensor): tensor of same dtype as `self`. accumulate (bool): whether to accumulate into self """) add_docstr_all('index_put', r""" -index_put(tensor1, indices, value, accumulate=False) -> Tensor +index_put(tensor1, indices, values, accumulate=False) -> Tensor Out-place version of :meth:`~Tensor.index_put_`. `tensor1` corresponds to `self` in :meth:`torch.Tensor.index_put_`. diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 3b6ee12e7a68..dd4be74dde80 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -4577,6 +4577,8 @@ def merge_dicts(*dicts): Returns the LU solve of the linear system :math:`Ax = b` using the partially pivoted LU factorization of A from :meth:`torch.lu`. +This function supports ``float``, ``double``, ``cfloat`` and ``cdouble`` dtypes for :attr:`input`. + Arguments: b (Tensor): the RHS tensor of size :math:`(*, m, k)`, where :math:`*` is zero or more batch dimensions. 
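# A short, self-contained sketch of the renamed `values` argument documented
# above (tensor contents are illustrative):
import torch
x = torch.zeros(3, 3)
indices = (torch.tensor([0, 2]), torch.tensor([1, 1]))
values = torch.tensor([1.0, 2.0])
x.index_put_(indices, values)                   # equivalent to x[indices] = values
x.index_put_(indices, values, accumulate=True)  # adds values onto x[indices]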
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index a5df6329030d..b23ab81ada93 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -335,6 +335,13 @@ static PyObject *THPModule_showConfig(PyObject *module, PyObject *noargs) END_HANDLE_TH_ERRORS } +static PyObject *THPModule_cxxFlags(PyObject *module, PyObject *noargs) +{ + HANDLE_TH_ERRORS + return THPUtils_packString(at::get_cxx_flags()); + END_HANDLE_TH_ERRORS +} + static PyObject *THPModule_parallelInfo(PyObject *module, PyObject *noargs) { HANDLE_TH_ERRORS @@ -584,6 +591,7 @@ static PyMethodDef TorchMethods[] = { {"_crash_if_csrc_ubsan", THPModule_crashIfCsrcUBSAN, METH_O, nullptr}, {"_crash_if_aten_asan", THPModule_crashIfATenASAN, METH_O, nullptr}, {"_show_config", THPModule_showConfig, METH_NOARGS, nullptr}, + {"_cxx_flags", THPModule_cxxFlags, METH_NOARGS, nullptr}, {"_parallel_info", THPModule_parallelInfo, METH_NOARGS, nullptr}, {"_set_backcompat_broadcast_warn", THPModule_setBackcompatBroadcastWarn, METH_O, nullptr}, {"_get_backcompat_broadcast_warn", THPModule_getBackcompatBroadcastWarn, METH_NOARGS, nullptr}, diff --git a/torch/csrc/api/include/torch/linalg.h b/torch/csrc/api/include/torch/linalg.h index dd44e16ff3f1..f5d788d75e07 100644 --- a/torch/csrc/api/include/torch/linalg.h +++ b/torch/csrc/api/include/torch/linalg.h @@ -52,6 +52,14 @@ inline Tensor& norm_out(Tensor& result, const Tensor& self, std::string ord, opt return torch::linalg_norm_out(result, self, ord, opt_dim, keepdim, opt_dtype); } +inline Tensor matrix_rank(const Tensor input, optional<double> tol, bool hermitian) { + return torch::linalg_matrix_rank(input, tol, hermitian); +} + +inline Tensor& matrix_rank_out(Tensor& result, const Tensor input, optional<double> tol, bool hermitian) { + return torch::linalg_matrix_rank_out(result, input, tol, hermitian); +} + inline Tensor tensorinv(const Tensor& self, int64_t ind) { return torch::linalg_tensorinv(self, ind); } @@ -133,6 +141,15 @@ inline Tensor& linalg_norm_out(Tensor& result, const Tensor& self, std::string o return detail::norm_out(result, self, ord, opt_dim, keepdim, opt_dtype); } +/// See https://pytorch.org/docs/master/linalg.html#torch.linalg.matrix_rank +inline Tensor matrix_rank(const Tensor input, optional<double> tol, bool hermitian) { + return detail::matrix_rank(input, tol, hermitian); +} + +inline Tensor& matrix_rank_out(Tensor& result, const Tensor input, optional<double> tol, bool hermitian) { + return detail::matrix_rank_out(result, input, tol, hermitian); +} + /// Computes the inverse of a tensor /// /// See https://pytorch.org/docs/master/linalg.html#torch.linalg.tensorinv diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index 7b973bedf5a8..d36f4f428c53 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -935,8 +935,9 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { } break; case aten::sigmoid: { - return computeOneOperand( - "aten_sigmoid", v, [](const ExprHandle& a) { return sigmoid(a); }); + return computeOneOperand("aten_sigmoid", v, [](const ExprHandle& a) { + return sigmoid(promoteIntegerToFloat(a)); + }); } break; case aten::reciprocal: { diff --git a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py index f25f3a8caad8..ffe4203b4181 100644 --- a/torch/distributed/algorithms/ddp_comm_hooks/__init__.py +++ b/torch/distributed/algorithms/ddp_comm_hooks/__init__.py @@ -16,7 +16,12 @@ def _ddp_comm_hook_wrapper(comm_hook,
model, state): def _powerSGD_comm_hook_wrapper( - comm_hook, model, state, matrix_approximation_rank, random_seed=0 + comm_hook, + model, + state, + matrix_approximation_rank, + use_error_feedback=True, + random_seed=0, ): """ To be consistent with the wrappers of other DDP comm hooks, the input state only needs to be a process group, @@ -25,6 +30,7 @@ def _powerSGD_comm_hook_wrapper( powerSGD_state = powerSGD.PowerSGDState( process_group=state, matrix_approximation_rank=matrix_approximation_rank, + use_error_feedback=use_error_feedback, random_seed=random_seed, ) model.register_comm_hook(powerSGD_state, comm_hook) diff --git a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py index 9a6fbb4a31dd..17414df3024d 100644 --- a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py +++ b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py @@ -30,17 +30,44 @@ def _orthogonalize(matrix, epsilon=1e-8): class PowerSGDState(object): - __slots__ = ["process_group", "matrix_approximation_rank", "rng"] - - def __init__(self, process_group, matrix_approximation_rank=1, random_seed=0): + __slots__ = [ + "process_group", + "matrix_approximation_rank", + "use_error_feedback", + "rng", + "error_dict", + ] + + def __init__( + self, + process_group, + matrix_approximation_rank=1, + use_error_feedback=True, + random_seed=0, + ): self.process_group = process_group self.matrix_approximation_rank = matrix_approximation_rank + # Error feedback is usually crucial for both convergence and generalization, + # because PowerSGD is a biased compressor, + # i.e., compressing and decompressing a random gradient does not yield the original in expectation. + # This mechanism requires a temporary copy of the input gradients, + # so it increases the peak memory consumption by the size of the gradient tensor. + # However, if the target matrices are known to be exactly low-rank (instead of just low stable rank), + # sometimes it is possible to converge to the optima without error feedback. + # See: http://proceedings.mlr.press/v54/yurtsever17a/yurtsever17a.pdf + self.use_error_feedback = use_error_feedback # The purpose of this RNG is to generate different random seeds for initializing Q across iterations, # but in the same order for all the DDP replicas. # Different random seeds across iterations indicate different 'projections' of the gradients at different SGD steps. # If the same random projection is used, # there will be differences between the gradients that are never synchronized. self.rng = np.random.RandomState(random_seed) + # Since there is only a single state instance for all the input buckets, + # we need to maintain a dictionary that maps each bucket to the local error. + # TODO(wayi): Currently the key is the (hashcode of) input tensor, which may change across steps, + # since the bucket can be rebuilt in the forward pass (to save peak memory usage). + # Need to add an index field to the input bucket of comm hook. + self.error_dict = {} def powerSGD_hook( @@ -98,6 +125,17 @@ def powerSGD_hook( padded_total_length = square_side_length ** 2 input_tensor.resize_(padded_total_length) input_tensor[total_length:padded_total_length].fill_(0) + + # Incorporate the error from the previous state into the gradients.
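    # Conceptually (an illustrative sketch; `compress`/`decompress` are hypothetical
    # helpers, the real computation is inlined below and in `decompress(fut)`):
    #     corrected = grad + error          # re-inject what compression lost last step
    #     p, q      = compress(corrected)   # biased low-rank factorization
    #     approx    = decompress(p, q)      # what is actually applied after allreduce
    #     error     = corrected - approx    # residual remembered for the next step
    # Below, `input_tensor` plays the role of `grad` (and later `approx`),
    # `input_tensor_cp` holds `corrected`, and the residual is stored in `state.error_dict`.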
+ if state.use_error_feedback: + if input_tensor in state.error_dict: + input_tensor.add_(state.error_dict[input_tensor]) + else: + state.error_dict[input_tensor] = torch.zeros(padded_total_length, device=device) + # Keep a copy of the input tensor, + # so that we can compute the local error caused by compression later, + # by comparing this copy and the input tensor updated after decompression. + input_tensor_cp = torch.clone(input_tensor).detach() matrix = input_tensor.view(square_side_length, square_side_length) def create_low_rank_tensor(fill_random_values, rng): @@ -141,6 +179,9 @@ def decompress(fut): q = fut.value()[0].div_(world_size) torch.matmul(p, q.t(), out=matrix) + if state.use_error_feedback: + # Memorize the local errors. + state.error_dict[input_tensor] = input_tensor_cp - input_tensor ret = input_tensor.resize_(total_length) return [ret] diff --git a/torch/distributions/__init__.py b/torch/distributions/__init__.py index ffcf75695d2f..57408f0c03f0 100644 --- a/torch/distributions/__init__.py +++ b/torch/distributions/__init__.py @@ -91,6 +91,7 @@ from .half_normal import HalfNormal from .independent import Independent from .kl import kl_divergence, register_kl +from .kumaraswamy import Kumaraswamy from .laplace import Laplace from .log_normal import LogNormal from .logistic_normal import LogisticNormal @@ -132,6 +133,7 @@ 'HalfCauchy', 'HalfNormal', 'Independent', + 'Kumaraswamy', 'Laplace', 'LogNormal', 'LogisticNormal', diff --git a/torch/distributions/gumbel.py b/torch/distributions/gumbel.py index 5bd3a2d3bd1e..a569af34ebdc 100644 --- a/torch/distributions/gumbel.py +++ b/torch/distributions/gumbel.py @@ -5,9 +5,7 @@ from torch.distributions.uniform import Uniform from torch.distributions.transformed_distribution import TransformedDistribution from torch.distributions.transforms import AffineTransform, ExpTransform -from torch.distributions.utils import broadcast_all - -euler_constant = 0.57721566490153286060 # Euler Mascheroni Constant +from torch.distributions.utils import broadcast_all, euler_constant class Gumbel(TransformedDistribution): diff --git a/torch/distributions/kl.py b/torch/distributions/kl.py index fe64ccc56009..ba7ba73d6063 100644 --- a/torch/distributions/kl.py +++ b/torch/distributions/kl.py @@ -31,7 +31,7 @@ from .poisson import Poisson from .transformed_distribution import TransformedDistribution from .uniform import Uniform -from .utils import _sum_rightmost +from .utils import _sum_rightmost, euler_constant as _euler_gamma _KL_REGISTRY = {} # Source of truth mapping a few general (type, type) pairs to functions. _KL_MEMOIZE: Dict[Tuple[Type, Type], Callable] = {} # Memoized version mapping many specific (type, type) pairs to functions. 
@@ -174,8 +174,6 @@ def kl_divergence(p, q): # KL Divergence Implementations ################################################################################ -_euler_gamma = 0.57721566490153286060 - # Same distributions diff --git a/torch/distributions/kumaraswamy.py b/torch/distributions/kumaraswamy.py new file mode 100644 index 000000000000..4fb2e177e7be --- /dev/null +++ b/torch/distributions/kumaraswamy.py @@ -0,0 +1,66 @@ +import torch +from torch.distributions import constraints +from torch.distributions.uniform import Uniform +from torch.distributions.transformed_distribution import TransformedDistribution +from torch.distributions.transforms import AffineTransform, PowerTransform +from torch.distributions.utils import broadcast_all, euler_constant + + +def _moments(a, b, n): + """ + Computes the nth moment of the Kumaraswamy distribution using torch.lgamma + """ + arg1 = 1 + n / a + log_value = torch.lgamma(arg1) + torch.lgamma(b) - torch.lgamma(arg1 + b) + return b * torch.exp(log_value) + + +class Kumaraswamy(TransformedDistribution): + r""" + Samples from a Kumaraswamy distribution. + + Example:: + + >>> m = Kumaraswamy(torch.Tensor([1.0]), torch.Tensor([1.0])) + >>> m.sample() # sample from a Kumaraswamy distribution with concentration alpha=1 and beta=1 + tensor([ 0.1729]) + + Args: + concentration1 (float or Tensor): 1st concentration parameter of the distribution + (often referred to as alpha) + concentration0 (float or Tensor): 2nd concentration parameter of the distribution + (often referred to as beta) + """ + arg_constraints = {'concentration1': constraints.positive, 'concentration0': constraints.positive} + support = constraints.unit_interval + has_rsample = True + + def __init__(self, concentration1, concentration0, validate_args=None): + self.concentration1, self.concentration0 = broadcast_all(concentration1, concentration0) + finfo = torch.finfo(self.concentration0.dtype) + base_dist = Uniform(torch.full_like(self.concentration0, 0), + torch.full_like(self.concentration0, 1)) + transforms = [PowerTransform(exponent=self.concentration0.reciprocal()), + AffineTransform(loc=1., scale=-1.), + PowerTransform(exponent=self.concentration1.reciprocal())] + super(Kumaraswamy, self).__init__(base_dist, transforms, validate_args=validate_args) + + def expand(self, batch_shape, _instance=None): + new = self._get_checked_instance(Kumaraswamy, _instance) + new.concentration1 = self.concentration1.expand(batch_shape) + new.concentration0 = self.concentration0.expand(batch_shape) + return super(Kumaraswamy, self).expand(batch_shape, _instance=new) + + @property + def mean(self): + return _moments(self.concentration1, self.concentration0, 1) + + @property + def variance(self): + return _moments(self.concentration1, self.concentration0, 2) - torch.pow(self.mean, 2) + + def entropy(self): + t1 = (1 - self.concentration1.reciprocal()) + t0 = (1 - self.concentration0.reciprocal()) + H0 = torch.digamma(self.concentration0 + 1) + euler_constant + return t0 + t1 * H0 - torch.log(self.concentration1) - torch.log(self.concentration0) diff --git a/torch/distributions/utils.py b/torch/distributions/utils.py index 0fd623086562..36ff1f71c35b 100644 --- a/torch/distributions/utils.py +++ b/torch/distributions/utils.py @@ -5,6 +5,9 @@ from typing import Dict, Any +euler_constant = 0.57721566490153286060 # Euler Mascheroni Constant + + def broadcast_all(*values): r""" Given a list of values (possibly containing numbers), returns a list where each diff --git a/torch/jit/frontend.py b/torch/jit/frontend.py index
69b06369923b..78c226ab1739 100644 --- a/torch/jit/frontend.py +++ b/torch/jit/frontend.py @@ -723,9 +723,7 @@ def build_ExtSlice(ctx, base, extslice): if isinstance(expr.slice.value, ast.Tuple): # N-dimensional indexing using Tuple: x[(i, j, k)] is equivalent to x[i, j, k] # XXX: Indexing using a list is **different**! It triggers advanced indexing. - indices = [] - for index_expr in expr.slice.value.elts: - indices.append(build_expr(ctx, index_expr)) + indices = [build_expr(ctx, index_expr) for index_expr in expr.slice.value.elts] return Subscript(base, indices) else: return Subscript(base, [build_expr(ctx, expr.slice.value)]) @@ -733,6 +731,17 @@ def build_ExtSlice(ctx, base, extslice): return Subscript(base, [build_SliceExpr(ctx, base, expr.slice)]) elif sub_type is ast.ExtSlice: return Subscript(base, build_ExtSlice(ctx, base, expr.slice)) + elif sys.version_info >= (3, 9): # In Python 3.9 array indices are not wrapped in ast.Index + if sub_type is ast.Tuple: + # N-dimensional indexing using Tuple: x[(i, j, k)] is equivalent to x[i, j, k] + indices = [] + for index_expr in expr.slice.elts: + if isinstance(index_expr, ast.Slice): + indices.append(build_SliceExpr(ctx, base, index_expr)) + else: + indices.append(build_expr(ctx, index_expr)) + return Subscript(base, indices) + return Subscript(base, [build_expr(ctx, expr.slice)]) else: # Ellipsis (can only happen in Python 2) raise NotSupportedError(base.range(), "ellipsis is not supported") diff --git a/torch/linalg/__init__.py b/torch/linalg/__init__.py index edd4d8a8afa6..d2cc7e1df9d0 100644 --- a/torch/linalg/__init__.py +++ b/torch/linalg/__init__.py @@ -213,6 +213,67 @@ [-3.1113, 2.7381]], dtype=torch.float64) """) +matrix_rank = _add_docstr(_linalg.linalg_matrix_rank, r""" +matrix_rank(input, tol=None, hermitian=False) -> Tensor + +Computes the numerical rank of a matrix :attr:`input`, or of each matrix in a batched :attr:`input`. +The matrix rank is computed as the number of singular values (or the absolute eigenvalues when :attr:`hermitian` is ``True``) +above the specified :attr:`tol` threshold. + +If :attr:`tol` is not specified, :attr:`tol` is set to +``S.max(dim=-1) * max(input.shape[-2:]) * eps`` where ``S`` is the singular values +(or the absolute eigenvalues when :attr:`hermitian` is ``True``), +and ``eps`` is the epsilon value for the datatype of :attr:`input`. +The epsilon value can be obtained using the ``eps`` attribute of :class:`torch.finfo`. + +By default, the matrix rank is computed using singular value decomposition (see :func:`torch.linalg.svd`). +If :attr:`hermitian` is ``True``, then :attr:`input` is assumed to be Hermitian (symmetric if real-valued), +and the rank is computed from its eigenvalues (see :func:`torch.linalg.eigvalsh`). + +Supports input of ``float``, ``double``, ``cfloat`` and ``cdouble`` datatypes. + +.. note:: When given inputs on a CUDA device, this function synchronizes that device with the CPU. + +Args: + input (Tensor): the input matrix of size :math:`(m, n)` or the batch of matrices of size :math:`(*, m, n)` + where `*` is one or more batch dimensions. + tol (float, optional): the tolerance value. Default: ``None`` + hermitian (bool, optional): indicates whether :attr:`input` is Hermitian.
Default: ``False`` + +Examples:: + + >>> a = torch.eye(10) + >>> torch.linalg.matrix_rank(a) + tensor(10) + >>> b = torch.eye(10) + >>> b[0, 0] = 0 + >>> torch.linalg.matrix_rank(b) + tensor(9) + + >>> a = torch.randn(4, 3, 2) + >>> torch.linalg.matrix_rank(a) + tensor([2, 2, 2, 2]) + + >>> a = torch.randn(2, 4, 2, 3) + >>> torch.linalg.matrix_rank(a) + tensor([[2, 2, 2, 2], + [2, 2, 2, 2]]) + + >>> a = torch.randn(2, 4, 3, 3, dtype=torch.complex64) + >>> torch.linalg.matrix_rank(a) + tensor([[3, 3, 3, 3], + [3, 3, 3, 3]]) + >>> torch.linalg.matrix_rank(a, hermitian=True) + tensor([[3, 3, 3, 3], + [3, 3, 3, 3]]) + >>> torch.linalg.matrix_rank(a, tol=1.0) + tensor([[3, 2, 2, 2], + [1, 2, 1, 2]]) + >>> torch.linalg.matrix_rank(a, tol=1.0, hermitian=True) + tensor([[2, 2, 2, 1], + [1, 2, 2, 2]]) +""") + norm = _add_docstr(_linalg.linalg_norm, r""" linalg.norm(input, ord=None, dim=None, keepdim=False, *, out=None, dtype=None) -> Tensor diff --git a/torch/nn/intrinsic/qat/modules/conv_fused.py b/torch/nn/intrinsic/qat/modules/conv_fused.py index 659d284b2afd..12018a34e23f 100644 --- a/torch/nn/intrinsic/qat/modules/conv_fused.py +++ b/torch/nn/intrinsic/qat/modules/conv_fused.py @@ -93,8 +93,13 @@ def _forward(self, input): bias_shape = [1] * len(self.weight.shape) bias_shape[1] = -1 scaled_weight = self.weight_fake_quant(self.weight * scale_factor.reshape(weight_shape)) - # this does not include the conv bias - conv = self._conv_forward(input, scaled_weight) + # using zero bias here since the bias for original conv + # will be added later + if self.bias is not None: + zero_bias = torch.zeros_like(self.bias) + else: + zero_bias = torch.zeros(self.out_channels, device=scaled_weight.device) + conv = self._conv_forward(input, scaled_weight, zero_bias) conv_orig = conv / scale_factor.reshape(bias_shape) if self.bias is not None: conv_orig = conv_orig + self.bias.reshape(bias_shape) @@ -402,7 +407,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def forward(self, input): return F.relu( - self._conv_forward(input, self.weight_fake_quant(self.weight))) + self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias)) @classmethod def from_float(cls, mod): diff --git a/torch/nn/modules/conv.py b/torch/nn/modules/conv.py index b801d990c4a6..33f2a84aed74 100644 --- a/torch/nn/modules/conv.py +++ b/torch/nn/modules/conv.py @@ -246,16 +246,16 @@ def __init__( in_channels, out_channels, kernel_size, stride, padding, dilation, False, _single(0), groups, bias, padding_mode) - def _conv_forward(self, input, weight): + def _conv_forward(self, input: Tensor, weight: Tensor, bias: Optional[Tensor]): if self.padding_mode != 'zeros': return F.conv1d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode), - weight, self.bias, self.stride, + weight, bias, self.stride, _single(0), self.dilation, self.groups) - return F.conv1d(input, weight, self.bias, self.stride, + return F.conv1d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups) def forward(self, input: Tensor) -> Tensor: - return self._conv_forward(input, self.weight) + return self._conv_forward(input, self.weight, self.bias) class Conv2d(_ConvNd): @@ -382,16 +382,16 @@ def __init__( in_channels, out_channels, kernel_size, stride, padding, dilation, False, _pair(0), groups, bias, padding_mode) - def _conv_forward(self, input, weight): + def _conv_forward(self, input: Tensor, weight: Tensor, bias: Optional[Tensor]): if self.padding_mode != 'zeros': return F.conv2d(F.pad(input, 
self._reversed_padding_repeated_twice, mode=self.padding_mode), - weight, self.bias, self.stride, + weight, bias, self.stride, _pair(0), self.dilation, self.groups) - return F.conv2d(input, weight, self.bias, self.stride, + return F.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups) def forward(self, input: Tensor) -> Tensor: - return self._conv_forward(input, self.weight) + return self._conv_forward(input, self.weight, self.bias) class Conv3d(_ConvNd): __doc__ = r"""Applies a 3D convolution over an input signal composed of several input diff --git a/torch/nn/qat/modules/conv.py b/torch/nn/qat/modules/conv.py index a9c5f8547329..4b3814983347 100644 --- a/torch/nn/qat/modules/conv.py +++ b/torch/nn/qat/modules/conv.py @@ -29,7 +29,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, self.weight_fake_quant = qconfig.weight() def forward(self, input): - return self._conv_forward(input, self.weight_fake_quant(self.weight)) + return self._conv_forward(input, self.weight_fake_quant(self.weight), self.bias) @classmethod def from_float(cls, mod): diff --git a/torch/onnx/symbolic_opset11.py b/torch/onnx/symbolic_opset11.py index 2661a7e8338e..dd9060036b04 100644 --- a/torch/onnx/symbolic_opset11.py +++ b/torch/onnx/symbolic_opset11.py @@ -57,9 +57,6 @@ def clamp_max(g, self, max): # Opset 11 gather accepts negative indices @parse_args('v', 'i', 'v') def select(g, self, dim, index): - index_scalar_type = index.type().scalarType() - if index_scalar_type is None or index_scalar_type not in ['Long', 'Int']: - index = g.op("Cast", index, to_i=sym_help.cast_pytorch_to_onnx["Long"]) return g.op("Gather", self, index, axis_i=dim) diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index d1f90111fc5d..d36a2a04eae8 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -1342,9 +1342,6 @@ def index_select(g, self, dim, index): if index_dim == 0: # Index is a scalar. Reshape it to a size 1 tensor. 
index = g.op("Reshape", index, g.op("Constant", value_t=torch.LongTensor([1]))) - index_scalar_type = index.type().scalarType() - if index_scalar_type is None or index_scalar_type not in ['Long', 'Int']: - index = g.op("Cast", index, to_i=sym_help.cast_pytorch_to_onnx["Long"]) return g.op("Gather", self, index, axis_i=dim) diff --git a/torch/overrides.py b/torch/overrides.py index 0819769e1d79..36ae037ed557 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -494,6 +494,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.matmul: lambda input, other, out=None: -1, torch.matrix_power: lambda input, n: -1, torch.matrix_rank: lambda input, tol=None, symmetric=False: -1, + torch.linalg.matrix_rank: lambda input, tol=None, hermitian=False: -1, torch.matrix_exp: lambda input: -1, torch.max: lambda input, out=None: -1, torch.maximum: lambda input, other, out=None: -1, diff --git a/torch/quantization/quantize.py b/torch/quantization/quantize.py index f34bdf1cd9b2..a8b87f0f156b 100644 --- a/torch/quantization/quantize.py +++ b/torch/quantization/quantize.py @@ -295,7 +295,7 @@ def quantize(model, run_fn, run_args, mapping=None, inplace=False): model = copy.deepcopy(model) model.eval() prepare(model, inplace=True) - run_fn(model, run_args) + run_fn(model, *run_args) convert(model, mapping, inplace=True) return model @@ -422,7 +422,7 @@ def quantize_qat(model, run_fn, run_args, inplace=False): model = copy.deepcopy(model) model.train() prepare_qat(model, inplace=True) - run_fn(model, run_args) + run_fn(model, *run_args) convert(model, inplace=True) return model diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index f2f91cbc9e13..7f4ec5af4f2c 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -67,6 +67,7 @@ def __init__(self, default_test_dtypes=None, # dtypes to test with by default. 
Gets intersected # with the dtypes support on the tested device test_inplace_grad=True, # whether to gradcheck and gradgradcheck the inplace variant + test_complex_grad=True, # whether to gradcheck and gradgradcheck for complex dtypes supports_tensor_out=True, # whether the op supports the out kwarg, returning a Tensor skips=tuple(), # information about which tests to skip decorators=None): # decorators to apply to generated tests @@ -92,6 +93,7 @@ def __init__(self, self.inplace_variant = getattr(torch.Tensor, inplace_name) if hasattr(torch.Tensor, name) else None self.test_inplace_grad = test_inplace_grad + self.test_complex_grad = test_complex_grad self.supports_tensor_out = supports_tensor_out self.skips = skips @@ -493,7 +495,24 @@ def sample_inputs(self, device, dtype, requires_grad=False): ] if TEST_SCIPY: + def reference_sigmoid(x): + # 'scipy.special.expit' not supported for the input types + if x.dtype in [np.complex64, np.complex128]: + return (1 / (1 + np.exp(-x))) + return scipy.special.expit(x) + op_db_scipy_reference = [ + UnaryUfuncInfo('sigmoid', + ref=reference_sigmoid, + decorators=(precisionOverride({torch.float16: 1e-2, + torch.bfloat16: 1e-2}),), + skips=(SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', + device_type='cpu', dtypes=[torch.cfloat, torch.cdouble]),), + dtypes=all_types_and_complex_and(torch.bool, torch.bfloat16), + dtypesIfCPU=all_types_and_complex_and(torch.bool, torch.bfloat16), + dtypesIfCUDA=all_types_and(torch.bool, torch.half, torch.bfloat16), + promotes_integers_to_float=True, + test_complex_grad=False), # Reference: https://github.com/pytorch/pytorch/issues/48552 UnaryUfuncInfo('erf', ref=scipy.special.erf, decorators=(precisionOverride({torch.float16: 1e-2, diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py index 844750285592..ba7f0d650b22 100644 --- a/torch/testing/_internal/distributed/rpc/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/rpc_test.py @@ -68,7 +68,6 @@ def udf_with_torch_ops(device=-1, use_record_function=False): "aten::relu", "aten::threshold", "aten::sigmoid", - "aten::sigmoid", ] # Remote operations are prefixed with the following string for RPC profiling. diff --git a/torch/testing/check_kernel_launches.py b/torch/testing/check_kernel_launches.py index 3385fcdf9618..091f1be98561 100644 --- a/torch/testing/check_kernel_launches.py +++ b/torch/testing/check_kernel_launches.py @@ -5,7 +5,7 @@ # Regular expression identifies a kernel launch indicator by # finding something approximating the pattern ">>>(arguments);" -# It then requires that `TORCH_CUDA_KERNEL_LAUNCH_CHECK` be +# It then requires that `C10_CUDA_KERNEL_LAUNCH_CHECK` be # the next command. # It allows a single backslash `\` between the end of the launch # command and the beginning of the kernel check. This handles @@ -26,7 +26,7 @@ \\? # 0 or 1 backslashes (for launches in preprocessor macros) (?:[0-9]+: )? # Detects and ignores a line numbering, if present \s* # Maybe some whitespace (includes newlines) - TORCH_CUDA_KERNEL_LAUNCH_CHECK\(\); # Kernel launch guard! + C10_CUDA_KERNEL_LAUNCH_CHECK\(\); # Kernel launch guard! ) # End negative lookahead """, flags=re.MULTILINE | re.VERBOSE) @@ -53,7 +53,7 @@ def check_code_for_cuda_kernel_launches(code, filename=None): results = kernel_launch_regex.findall(code) # Search for bad launches for r in results: - print(f"Missing TORCH_CUDA_KERNEL_LAUNCH_CHECK in '{filename}'. 
Context:\n{r}", file=sys.stderr) + print(f"Missing C10_CUDA_KERNEL_LAUNCH_CHECK in '{filename}'. Context:\n{r}", file=sys.stderr) return len(results) diff --git a/torch/utils/_cpp_extension_versioner.py b/torch/utils/_cpp_extension_versioner.py index cb778ab8923d..958d34ecc71a 100644 --- a/torch/utils/_cpp_extension_versioner.py +++ b/torch/utils/_cpp_extension_versioner.py @@ -38,12 +38,16 @@ def bump_version_if_changed(self, source_files, build_arguments, build_directory, - with_cuda): + with_cuda, + is_python_module, + is_standalone): hash_value = 0 hash_value = hash_source_files(hash_value, source_files) hash_value = hash_build_arguments(hash_value, build_arguments) hash_value = update_hash(hash_value, build_directory) hash_value = update_hash(hash_value, with_cuda) + hash_value = update_hash(hash_value, is_python_module) + hash_value = update_hash(hash_value, is_standalone) entry = self.entries.get(name) if entry is None: diff --git a/torch/utils/benchmark/utils/_stubs.py b/torch/utils/benchmark/utils/_stubs.py new file mode 100644 index 000000000000..e2ab6ec086a4 --- /dev/null +++ b/torch/utils/benchmark/utils/_stubs.py @@ -0,0 +1,46 @@ +import sys +from typing import Any, Callable, Dict, TYPE_CHECKING + + +if TYPE_CHECKING or sys.version_info >= (3, 8): + from typing import runtime_checkable, Protocol +else: + from typing_extensions import runtime_checkable, Protocol + + +class TimerClass(Protocol): + """This is the portion of the `timeit.Timer` API used by benchmark utils.""" + def __init__( + self, + stmt: str, + setup: str, + timer: Callable[[], float], + globals: Dict[str, Any] + ) -> None: + ... + + def timeit(self, number: int) -> float: + ... + + +@runtime_checkable +class TimeitModuleType(Protocol): + """Modules generated from `timeit_template.cpp`.""" + def timeit(self, number: int) -> float: + ... + + +class CallgrindModuleType(Protocol): + """Replicates the valgrind endpoints in `torch._C`. + + These bindings are used to collect Callgrind profiles on earlier versions + of PyTorch and will eventually be removed. + """ + __file__: str + __name__: str + + def _valgrind_supported_platform(self) -> bool: + ... + + def _valgrind_toggle(self) -> None: + ... diff --git a/torch/utils/benchmark/utils/common.py b/torch/utils/benchmark/utils/common.py index ddfb8f08389d..1cbed2df51c2 100644 --- a/torch/utils/benchmark/utils/common.py +++ b/torch/utils/benchmark/utils/common.py @@ -48,7 +48,7 @@ def title(self) -> str: def setup_str(self) -> str: return ( - "" if self.setup == "pass" + "" if (self.setup == "pass" or not self.setup) else f"setup:\n{textwrap.indent(self.setup, ' ')}" if "\n" in self.setup else f"setup: {self.setup}" ) diff --git a/torch/utils/benchmark/utils/cpp_jit.py b/torch/utils/benchmark/utils/cpp_jit.py new file mode 100644 index 000000000000..ebaa4213e027 --- /dev/null +++ b/torch/utils/benchmark/utils/cpp_jit.py @@ -0,0 +1,143 @@ +"""JIT C++ strings into executables.""" +import atexit +import os +import re +import shutil +import textwrap +import threading +import uuid +from typing import Any, List, Optional + +import torch +from torch.utils.benchmark.utils._stubs import CallgrindModuleType, TimeitModuleType +from torch.utils import cpp_extension + + +LOCK = threading.Lock() +SOURCE_ROOT = os.path.split(os.path.abspath(__file__))[0] + +# We calculate uuid once at import time so that separate processes will have +# separate build roots, but threads will share the same build root. 
+# `cpp_extension` uses build root as part of the cache key, so per-invocation +# uuid's (e.g. different build root per _compile_template call) would lead to +# a 0% cache hit rate and spurious recompilation. Consider the following: +# ``` +# setup = "auto x = torch::ones({1024, 1024});" +# stmt = "torch::mm(x, x);" +# for num_threads in [1, 2, 4, 8]: +# print(Timer(stmt, setup, num_threads=num_threads, language="c++").blocked_autorange()) +# ```` +# `setup` and `stmt` do not change, so we can reuse the executable from the +# first pass through the loop. +BUILD_ROOT = os.path.join( + torch._appdirs.user_cache_dir(appname="benchmark_utils_jit"), + f"build_{uuid.uuid4()}".replace("-", "") +) + +# BACK_TESTING_NOTE: +# There are two workflows where this code could be used. One is the obvious +# case where someone simply builds or installs PyTorch and uses Timer. +# The other is that the entire `torch/utils/benchmark` folder from a CURRENT +# PyTorch checkout is copy-pasted into a much OLDER version of the PyTorch +# source code. This is what we refer to here as "back testing". The rationale +# is that we might want to use current tooling to study some aspect of an +# earlier version of PyTorch. (e.g. a regression.) +# +# The problem is that Timer relies on several aspects of core PyTorch, namely +# some binding functions for Valgrind symbols in `torch._C` and the +# `torch.__config__._cxx_flags()` method. If we were to naively copy code +# around this wouldn't work as the symbols of interest aren't present in +# earlier versions of PyTorch. In order to work around this, we must add back +# testing shims. These shims will never activate during normal use, but will +# allow Timer to function outside of the "correct" version of PyTorch by +# emulating functionality that was added later. +# +# These shims are temporary, and as Timer becomes more integrated with +# PyTorch the cost and complexity of such shims will increase. Once back +# testing is no longer required (which is to say we have done enough historic +# analysis and the shims no longer justify their maintenance and code +# complexity costs) back testing paths will be removed. + +if hasattr(torch.__config__, "_cxx_flags"): + CXX_FLAGS = torch.__config__._cxx_flags().strip().split() + if "-g" not in CXX_FLAGS: + CXX_FLAGS.append("-g") +else: + # FIXME: Remove when back testing is no longer required. + CXX_FLAGS = ["-O2", "-fPIC", "-g"] + +EXTRA_INCLUDE_PATHS: List[str] = [os.path.join(SOURCE_ROOT, "valgrind_wrapper")] +CONDA_PREFIX = os.getenv("CONDA_PREFIX") +if CONDA_PREFIX is not None: + # Load will automatically search /usr/include, but not conda include. 
+ EXTRA_INCLUDE_PATHS.append(os.path.join(CONDA_PREFIX, "include")) + + +COMPAT_CALLGRIND_BINDINGS: Optional[CallgrindModuleType] = None +def get_compat_bindings() -> CallgrindModuleType: + with LOCK: + global COMPAT_CALLGRIND_BINDINGS + if COMPAT_CALLGRIND_BINDINGS is None: + COMPAT_CALLGRIND_BINDINGS = cpp_extension.load( + name="callgrind_bindings", + sources=[os.path.join( + SOURCE_ROOT, + "valgrind_wrapper", + "compat_bindings.cpp" + )], + extra_cflags=CXX_FLAGS, + extra_include_paths=EXTRA_INCLUDE_PATHS, + ) + return COMPAT_CALLGRIND_BINDINGS + + +def _compile_template(stmt: str, setup: str, src: str, is_standalone: bool) -> Any: + for before, after, indentation in ( + ("// SETUP_TEMPLATE_LOCATION", setup, 4), + ("// STMT_TEMPLATE_LOCATION", stmt, 8) + ): + # C++ doesn't care about indentation so this code isn't load + # bearing the way it is with Python, but this makes the source + # look nicer if a human has to look at it. + src = re.sub( + before, + textwrap.indent(after, " " * indentation)[indentation:], + src + ) + + # We want to isolate different Timers. However `cpp_extension` will + # cache builds which will significantly reduce the cost of repeated + # invocations. + with LOCK: + if not os.path.exists(BUILD_ROOT): + os.makedirs(BUILD_ROOT) + atexit.register(shutil.rmtree, BUILD_ROOT) + + name = f"timer_cpp_{abs(hash(src))}" + build_dir = os.path.join(BUILD_ROOT, name) + os.makedirs(build_dir, exist_ok=True) + + src_path = os.path.join(build_dir, "timer_src.cpp") + with open(src_path, "wt") as f: + f.write(src) + + # `cpp_extension` has its own locking scheme, so we don't need our lock. + return cpp_extension.load( + name=name, + sources=[src_path], + build_directory=build_dir, + extra_cflags=CXX_FLAGS, + extra_include_paths=EXTRA_INCLUDE_PATHS, + is_python_module=not is_standalone, + is_standalone=is_standalone, + ) + + +def compile_timeit_template(stmt: str, setup: str) -> TimeitModuleType: + template_path: str = os.path.join(SOURCE_ROOT, "timeit_template.cpp") + with open(template_path, "rt") as f: + src: str = f.read() + + module = _compile_template(stmt, setup, src, is_standalone=False) + assert isinstance(module, TimeitModuleType) + return module diff --git a/torch/utils/benchmark/utils/timeit_template.cpp b/torch/utils/benchmark/utils/timeit_template.cpp new file mode 100644 index 000000000000..01d62efdb161 --- /dev/null +++ b/torch/utils/benchmark/utils/timeit_template.cpp @@ -0,0 +1,36 @@ +/* C++ template for Timer.timeit + +This template will be consumed by `cpp_jit.py`, and will replace: + `SETUP_TEMPLATE_LOCATION` + and + `STMT_TEMPLATE_LOCATION` +sections with user provided statements. 
+*/ +#include <chrono> + +#include <pybind11/pybind11.h> +#include <torch/extension.h> + + +double timeit(int n) { + // Setup + // SETUP_TEMPLATE_LOCATION + + { + // Warmup + // STMT_TEMPLATE_LOCATION + } + + // Main loop + auto start_time = std::chrono::high_resolution_clock::now(); + for (int loop_idx = 0; loop_idx < n; loop_idx++) { + // STMT_TEMPLATE_LOCATION + } + auto end_time = std::chrono::high_resolution_clock::now(); + return std::chrono::duration<double>(end_time - start_time).count(); +} + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("timeit", &timeit); +} diff --git a/torch/utils/benchmark/utils/timer.py b/torch/utils/benchmark/utils/timer.py index e017dda2d4dd..374b8bd6c6e0 100644 --- a/torch/utils/benchmark/utils/timer.py +++ b/torch/utils/benchmark/utils/timer.py @@ -1,16 +1,17 @@ """Timer class based on the timeit.Timer class, but torch aware.""" - +import enum import timeit import textwrap -from typing import Any, Callable, Dict, List, NoReturn, Optional +from typing import Any, Callable, Dict, List, NoReturn, Optional, Type, Union import numpy as np import torch -from torch.utils.benchmark.utils import common +from torch.utils.benchmark.utils import common, cpp_jit +from torch.utils.benchmark.utils._stubs import TimerClass, TimeitModuleType from torch.utils.benchmark.utils.valgrind_wrapper import timer_interface as valgrind_timer_interface -__all__ = ["Timer", "timer"] +__all__ = ["Timer", "timer", "Language"] if torch.has_cuda and torch.cuda.is_available(): @@ -21,6 +22,46 @@ def timer() -> float: timer = timeit.default_timer +class Language(enum.Enum): + PYTHON = 0 + CPP = 1 + + +class CPPTimer: + def __init__( + self, + stmt: str, + setup: str, + timer: Callable[[], float], + globals: Dict[str, Any], + ) -> None: + if timer is not timeit.default_timer: + raise NotImplementedError( + "PyTorch was built with CUDA and a GPU is present; however " + "Timer does not yet support GPU measurements. If your " + "code is CPU only, pass `timer=timeit.default_timer` to the " + "Timer's constructor to indicate this. (Note that this will " + "produce incorrect results if the GPU is in fact used, as " + "Timer will not synchronize CUDA.)" + ) + + if globals: + raise ValueError("C++ timing does not support globals.") + + self._stmt: str = textwrap.dedent(stmt) + self._setup: str = textwrap.dedent(setup) + self._timeit_module: Optional[TimeitModuleType] = None + + def timeit(self, number: int) -> float: + if self._timeit_module is None: + self._timeit_module = cpp_jit.compile_timeit_template( + self._stmt, + self._setup, + ) + + return self._timeit_module.timeit(number) + + class Timer(object): """Helper class for measuring execution time of PyTorch statements. @@ -122,7 +163,7 @@ class Timer(object): threadpool size which tries to utilize all cores. """ - _timer_cls = timeit.Timer + _timer_cls: Type[TimerClass] = timeit.Timer def __init__( self, @@ -135,21 +176,32 @@ def __init__( description: Optional[str] = None, env: Optional[str] = None, num_threads: int = 1, + language: Union[Language, str] = Language.PYTHON, ): if not isinstance(stmt, str): raise ValueError("Currently only a `str` stmt is supported.") - # We copy `globals` to prevent mutations from leaking, (for instance, - # `eval` adds the `__builtins__` key) and include `torch` if not - # specified as a convenience feature. - globals = dict(globals or {}) - globals.setdefault("torch", torch) - self._globals = globals + # We copy `globals` to prevent mutations from leaking.
diff --git a/torch/utils/benchmark/utils/timer.py b/torch/utils/benchmark/utils/timer.py
index e017dda2d4dd..374b8bd6c6e0 100644
--- a/torch/utils/benchmark/utils/timer.py
+++ b/torch/utils/benchmark/utils/timer.py
@@ -1,16 +1,17 @@
 """Timer class based on the timeit.Timer class, but torch aware."""
-
+import enum
 import timeit
 import textwrap
-from typing import Any, Callable, Dict, List, NoReturn, Optional
+from typing import Any, Callable, Dict, List, NoReturn, Optional, Type, Union
 
 import numpy as np
 import torch
 
-from torch.utils.benchmark.utils import common
+from torch.utils.benchmark.utils import common, cpp_jit
+from torch.utils.benchmark.utils._stubs import TimerClass, TimeitModuleType
 from torch.utils.benchmark.utils.valgrind_wrapper import timer_interface as valgrind_timer_interface
 
-__all__ = ["Timer", "timer"]
+__all__ = ["Timer", "timer", "Language"]
 
 
 if torch.has_cuda and torch.cuda.is_available():
@@ -21,6 +22,46 @@ def timer() -> float:
     timer = timeit.default_timer
 
 
+class Language(enum.Enum):
+    PYTHON = 0
+    CPP = 1
+
+
+class CPPTimer:
+    def __init__(
+        self,
+        stmt: str,
+        setup: str,
+        timer: Callable[[], float],
+        globals: Dict[str, Any],
+    ) -> None:
+        if timer is not timeit.default_timer:
+            raise NotImplementedError(
+                "PyTorch was built with CUDA and a GPU is present; however "
+                "Timer does not yet support GPU measurements. If your "
+                "code is CPU only, pass `timer=timeit.default_timer` to the "
+                "Timer's constructor to indicate this. (Note that this will "
+                "produce incorrect results if the GPU is in fact used, as "
+                "Timer will not synchronize CUDA.)"
+            )
+
+        if globals:
+            raise ValueError("C++ timing does not support globals.")
+
+        self._stmt: str = textwrap.dedent(stmt)
+        self._setup: str = textwrap.dedent(setup)
+        self._timeit_module: Optional[TimeitModuleType] = None
+
+    def timeit(self, number: int) -> float:
+        if self._timeit_module is None:
+            self._timeit_module = cpp_jit.compile_timeit_template(
+                self._stmt,
+                self._setup,
+            )
+
+        return self._timeit_module.timeit(number)
+
+
 class Timer(object):
     """Helper class for measuring execution time of PyTorch statements.
@@ -122,7 +163,7 @@ class Timer(object):
         threadpool size which tries to utilize all cores.
     """
 
-    _timer_cls = timeit.Timer
+    _timer_cls: Type[TimerClass] = timeit.Timer
 
     def __init__(
         self,
@@ -135,21 +176,32 @@ def __init__(
         description: Optional[str] = None,
         env: Optional[str] = None,
         num_threads: int = 1,
+        language: Union[Language, str] = Language.PYTHON,
     ):
         if not isinstance(stmt, str):
             raise ValueError("Currently only a `str` stmt is supported.")
 
-        # We copy `globals` to prevent mutations from leaking, (for instance,
-        # `eval` adds the `__builtins__` key) and include `torch` if not
-        # specified as a convenience feature.
-        globals = dict(globals or {})
-        globals.setdefault("torch", torch)
-        self._globals = globals
+        # We copy `globals` to prevent mutations from leaking.
+        # (For instance, `eval` adds the `__builtins__` key)
+        self._globals = dict(globals or {})
+        if language in (Language.PYTHON, "py", "python"):
+            # Include `torch` if not specified as a convenience feature.
+            self._globals.setdefault("torch", torch)
+            self._language: Language = Language.PYTHON
+
+        elif language in (Language.CPP, "cpp", "c++"):
+            assert self._timer_cls is timeit.Timer, "_timer_cls has already been swapped."
+            self._timer_cls = CPPTimer
+            setup = ("" if setup == "pass" else setup)
+            self._language = Language.CPP
+
+        else:
+            raise ValueError(f"Invalid language `{language}`.")
 
         # Convenience adjustment so that multi-line code snippets defined in
-        # functions do not IndentationError inside timeit.Timer. The leading
-        # newline removal is for the initial newline that appears when defining
-        # block strings. For instance:
+        # functions do not IndentationError (Python) or look odd (C++). The
+        # leading newline removal is for the initial newline that appears when
+        # defining block strings. For instance:
         #     textwrap.dedent("""
         #         print("This is a stmt")
         #     """)
@@ -158,15 +210,15 @@ def __init__(
         #     Stripping this down to 'print("This is a stmt")' doesn't change
         #     what gets executed, but it makes __repr__'s nicer.
         stmt = textwrap.dedent(stmt)
-        stmt = (stmt[1:] if stmt[0] == "\n" else stmt).rstrip()
+        stmt = (stmt[1:] if stmt and stmt[0] == "\n" else stmt).rstrip()
         setup = textwrap.dedent(setup)
-        setup = (setup[1:] if setup[0] == "\n" else setup).rstrip()
+        setup = (setup[1:] if setup and setup[0] == "\n" else setup).rstrip()
 
         self._timer = self._timer_cls(
             stmt=stmt,
             setup=setup,
             timer=timer,
-            globals=valgrind_timer_interface.CopyIfCallgrind.unwrap_all(globals),
+            globals=valgrind_timer_interface.CopyIfCallgrind.unwrap_all(self._globals),
         )
         self._task_spec = common.TaskSpec(
             stmt=stmt,
@@ -369,6 +421,9 @@ def collect_callgrind(
         if not isinstance(self._task_spec.stmt, str):
             raise ValueError("`collect_callgrind` currently only supports string `stmt`")
 
+        if self._language != Language.PYTHON:
+            raise NotImplementedError("C++ Callgrind is later in the stack.")
+
         # Check that the statement is valid. It doesn't guarantee success, but it's much
         # simpler and quicker to raise an exception for a faulty `stmt` or `setup` in
         # the parent process rather than the valgrind subprocess.
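Taken together, these changes let the existing `Timer` front end drive a JIT-built C++ payload. A hedged usage sketch (the C++ statement and tensor shape are illustrative; `globals` must be omitted for C++, and `setup` runs once before the timing loop):

from torch.utils.benchmark import Timer

cpp_timer = Timer(
    stmt="auto y = x * x;",                    # illustrative C++ statement
    setup="auto x = torch::ones({128, 128});",
    language="cpp",                            # or Language.CPP / "c++"
)
# Same return type as the Python path: a Measurement from 100 iterations.
print(cpp_timer.timeit(100))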
diff --git a/torch/utils/benchmark/utils/valgrind_wrapper/compat_bindings.cpp b/torch/utils/benchmark/utils/valgrind_wrapper/compat_bindings.cpp
new file mode 100644
index 000000000000..b52626fe76fd
--- /dev/null
+++ b/torch/utils/benchmark/utils/valgrind_wrapper/compat_bindings.cpp
@@ -0,0 +1,25 @@
+/* Used to collect profiles of old versions of PyTorch. */
+#include <callgrind.h>
+#include <pybind11/pybind11.h>
+
+
+bool _valgrind_supported_platform() {
+  #if defined(NVALGRIND)
+  return false;
+  #else
+  return true;
+  #endif
+}
+
+void _valgrind_toggle() {
+  #if defined(NVALGRIND)
+  TORCH_CHECK(false, "Valgrind is not supported.");
+  #else
+  CALLGRIND_TOGGLE_COLLECT;
+  #endif
+}
+
+PYBIND11_MODULE(callgrind_bindings, m) {
+  m.def("_valgrind_supported_platform", &_valgrind_supported_platform);
+  m.def("_valgrind_toggle", &_valgrind_toggle);
+}
diff --git a/torch/utils/benchmark/utils/valgrind_wrapper/compat_bindings.py b/torch/utils/benchmark/utils/valgrind_wrapper/compat_bindings.py
deleted file mode 100644
index b7404a653a20..000000000000
--- a/torch/utils/benchmark/utils/valgrind_wrapper/compat_bindings.py
+++ /dev/null
@@ -1,41 +0,0 @@
-"""Allow Timer.collect_callgrind to be used on earlier versions of PyTorch
-
-FIXME: Remove this module once we no longer need to back test.
-"""
-import os
-import textwrap
-from typing import List
-
-from torch.utils.cpp_extension import load_inline
-
-
-# load_inline will automatically search /usr/include, but not conda include.
-extra_include_paths: List[str] = []
-conda_prefix = os.getenv("CONDA_PREFIX")
-if conda_prefix is not None:
-    extra_include_paths = [os.path.join(conda_prefix, "include")]
-
-bindings = load_inline(
-    name="callgrind_bindings",
-    cpp_sources=textwrap.dedent("""
-    #include <callgrind.h>
-
-    bool _valgrind_supported_platform() {
-      #if defined(NVALGRIND)
-      return false;
-      #else
-      return true;
-      #endif
-    }
-
-    void _valgrind_toggle() {
-      #if defined(NVALGRIND)
-      TORCH_CHECK(false, "Valgrind is not supported.");
-      #else
-      CALLGRIND_TOGGLE_COLLECT;
-      #endif
-    }
-    """),
-    extra_include_paths=extra_include_paths,
-    functions=["_valgrind_supported_platform", "_valgrind_toggle"],
-)
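The JIT-built compat module is only reached when `torch._C` lacks the Callgrind hooks, and its call pattern from Python is tiny. A hedged sketch (guarded, since building it assumes the Valgrind/Callgrind headers are discoverable on the include path):

from torch.utils.benchmark.utils import cpp_jit

bindings = cpp_jit.get_compat_bindings()   # compiles compat_bindings.cpp on first use
if bindings._valgrind_supported_platform():
    bindings._valgrind_toggle()            # start collection
    ...                                    # run the statement being profiled under callgrind
    bindings._valgrind_toggle()            # stop collection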
diff --git a/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py b/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py
index bad9df90bcd2..b8513671beb9 100644
--- a/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py
+++ b/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py
@@ -11,13 +11,13 @@
 import sys
 import tempfile
 import textwrap
-from types import ModuleType
 from typing import (
     cast, Any, Callable, DefaultDict, Dict, Generator, List, NamedTuple,
     Optional, Tuple, Union, TYPE_CHECKING)
 
 import torch
-from torch.utils.benchmark.utils import common
+from torch.utils.benchmark.utils import common, cpp_jit
+from torch.utils.benchmark.utils._stubs import CallgrindModuleType
 
 
 __all__ = ["FunctionCount", "FunctionCounts", "CallgrindStats", "CopyIfCallgrind"]
@@ -444,17 +444,14 @@ def construct(self) -> str:
 
 class _ValgrindWrapper(object):
     def __init__(self) -> None:
-        self._bindings_module: Optional[ModuleType] = None
+        self._bindings_module: Optional[CallgrindModuleType] = None
         if hasattr(torch._C, "_valgrind_supported_platform"):
             self._supported_platform: bool = torch._C._valgrind_supported_platform()
 
         else:
             print("Callgrind bindings are not present in `torch._C`. JIT-ing bindings.")
-            # This import will JIT the Callgrind control bindings, so don't
-            # invoke unless we know we'll need it.
-            from torch.utils.benchmark.utils.valgrind_wrapper.compat_bindings import bindings
-            self._bindings_module = bindings
-            self._supported_platform = bindings._valgrind_supported_platform()
+            self._bindings_module = cpp_jit.get_compat_bindings()
+            self._supported_platform = self._bindings_module._valgrind_supported_platform()
 
         self._commands_available: Dict[str, bool] = {}
         if self._supported_platform:
@@ -643,7 +640,7 @@ def _construct_script(
         number: int,
         error_log: str,
         stat_log: str,
-        bindings: Optional[ModuleType],
+        bindings: Optional[CallgrindModuleType],
     ) -> str:
         # The naive template looks something like:
         #   "for _ in range({number}): {stmt}"
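`CallgrindModuleType`, `TimeitModuleType`, and `TimerClass` come from the new `_stubs` module, which is not shown in this diff; they exist so the JIT-built extension modules and the swappable timer class can be type-checked. A hypothetical sketch of what such stubs could look like, using `typing.Protocol` (3.8+) purely for brevity; everything below beyond the three imported names is an assumption, not the actual file:

from typing import Any, Callable, Dict, Protocol, runtime_checkable


class TimerClass(Protocol):
    """Instance surface shared by `timeit.Timer` and `CPPTimer` (hypothetical stub)."""
    def __init__(
        self,
        stmt: str,
        setup: str,
        timer: Callable[[], float],
        globals: Dict[str, Any],
    ) -> None:
        ...

    def timeit(self, number: int) -> float:
        ...


class CallgrindModuleType(Protocol):
    """Surface of the JIT-built `callgrind_bindings` extension (hypothetical stub)."""
    def _valgrind_supported_platform(self) -> bool:
        ...

    def _valgrind_toggle(self) -> None:
        ...


@runtime_checkable
class TimeitModuleType(Protocol):
    """Surface of a module built from `timeit_template.cpp` (hypothetical stub)."""
    def timeit(self, number: int) -> float:
        ...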
diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
index a2f47744c5f3..993b04ca23d8 100644
--- a/torch/utils/cpp_extension.py
+++ b/torch/utils/cpp_extension.py
@@ -23,6 +23,14 @@
 
 IS_WINDOWS = sys.platform == 'win32'
 
+LIB_EXT = '.pyd' if IS_WINDOWS else '.so'
+EXEC_EXT = '.exe' if IS_WINDOWS else ''
+SHARED_FLAG = '/DLL' if IS_WINDOWS else '-shared'
+
+_HERE = os.path.abspath(__file__)
+_TORCH_PATH = os.path.dirname(os.path.dirname(_HERE))
+TORCH_LIB_PATH = os.path.join(_TORCH_PATH, 'lib')
+
 
 def _find_cuda_home() -> Optional[str]:
     r'''Finds the CUDA install path.'''
@@ -400,7 +408,7 @@ def unix_cuda_flags(cflags):
     # overriding the option if the user explicitly passed it.
     _ccbin = os.getenv("CC")
     if (
-        _ccbin is not None
+        _ccbin is not None
         and not any([flag.startswith('-ccbin') or flag.startswith('--compiler-bindir') for flag in cflags])
     ):
         cflags.extend(['-ccbin', _ccbin])
@@ -848,9 +856,7 @@ def include_paths(cuda: bool = False) -> List[str]:
     Returns:
         A list of include path strings.
     '''
-    here = os.path.abspath(__file__)
-    torch_path = os.path.dirname(os.path.dirname(here))
-    lib_include = os.path.join(torch_path, 'include')
+    lib_include = os.path.join(_TORCH_PATH, 'include')
     paths = [
         lib_include,
         # Remove this once torch/torch.h is officially no longer supported for C++ extensions.
@@ -886,13 +892,8 @@ def library_paths(cuda: bool = False) -> List[str]:
     Returns:
         A list of library path strings.
     '''
-    paths = []
-
     # We need to link against libtorch.so
-    here = os.path.abspath(__file__)
-    torch_path = os.path.dirname(os.path.dirname(here))
-    lib_path = os.path.join(torch_path, 'lib')
-    paths.append(lib_path)
+    paths = [TORCH_LIB_PATH]
 
     if cuda and IS_HIP_EXTENSION:
         lib_dir = 'lib'
@@ -925,6 +926,7 @@ def load(name,
          verbose=False,
          with_cuda: Optional[bool] = None,
          is_python_module=True,
+         is_standalone=False,
          keep_intermediates=True):
     r'''
     Loads a PyTorch C++ extension just-in-time (JIT).
@@ -979,14 +981,23 @@ def load(name,
             ``.cuh`` in ``sources``. Set it to `True`` to force CUDA headers
             and libraries to be included.
         is_python_module: If ``True`` (default), imports the produced shared
-            library as a Python module. If ``False``, loads it into the process
-            as a plain dynamic library.
+            library as a Python module. If ``False``, behavior depends on
+            ``is_standalone``.
+        is_standalone: If ``False`` (default) loads the constructed extension
+            into the process as a plain dynamic library. If ``True``, build a
+            standalone executable.
 
     Returns:
-        If ``is_python_module`` is ``True``, returns the loaded PyTorch
-        extension as a Python module. If ``is_python_module`` is ``False``
-        returns nothing (the shared library is loaded into the process as a side
-        effect).
+        If ``is_python_module`` is ``True``:
+            Returns the loaded PyTorch extension as a Python module.
+
+        If ``is_python_module`` is ``False`` and ``is_standalone`` is ``False``:
+            Returns nothing. (The shared library is loaded into the process as
+            a side effect.)
+
+        If ``is_standalone`` is ``True``:
+            Returns the path to the executable. (On Windows, TORCH_LIB_PATH is
+            added to the PATH environment variable as a side effect.)
 
     Example:
         >>> from torch.utils.cpp_extension import load
@@ -1007,6 +1018,7 @@ def load(name,
         verbose,
         with_cuda,
         is_python_module,
+        is_standalone,
        keep_intermediates=keep_intermediates)
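The new flag surfaces directly in the public `load` API. A hedged usage sketch of the standalone path (file names are illustrative; the source must provide its own `main`, since no Python bindings are generated in this mode):

import subprocess
from torch.utils.cpp_extension import load

# Build a self-contained binary instead of a Python extension module.
exe_path = load(
    name="my_benchmark_main",           # illustrative
    sources=["my_benchmark_main.cpp"],  # must define main()
    is_python_module=False,
    is_standalone=True,
)
subprocess.run([exe_path], check=True)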
@@ -1155,6 +1167,7 @@ def load_inline(name,
         verbose,
         with_cuda,
         is_python_module,
+        is_standalone=False,
         keep_intermediates=keep_intermediates)
 
 
@@ -1168,7 +1181,11 @@ def _jit_compile(name,
                  verbose: bool,
                  with_cuda: Optional[bool],
                  is_python_module,
+                 is_standalone,
                  keep_intermediates=True) -> None:
+    if is_python_module and is_standalone:
+        raise ValueError("`is_python_module` and `is_standalone` are mutually exclusive.")
+
     if with_cuda is None:
         with_cuda = any(map(_is_cuda_file, sources))
     with_cudnn = any(['cudnn' in f for f in extra_ldflags or []])
@@ -1178,7 +1195,9 @@ def _jit_compile(name,
             sources,
             build_arguments=[extra_cflags, extra_cuda_cflags, extra_ldflags, extra_include_paths],
             build_directory=build_directory,
-            with_cuda=with_cuda
+            with_cuda=with_cuda,
+            is_python_module=is_python_module,
+            is_standalone=is_standalone,
         )
         if version > 0:
             if version != old_version and verbose:
@@ -1210,7 +1229,8 @@ def _jit_compile(name,
                 extra_include_paths=extra_include_paths or [],
                 build_directory=build_directory,
                 verbose=verbose,
-                with_cuda=with_cuda)
+                with_cuda=with_cuda,
+                is_standalone=is_standalone)
         finally:
             baton.release()
     else:
@@ -1221,6 +1241,10 @@ def _jit_compile(name,
 
     if verbose:
         print(f'Loading extension module {name}...')
+
+    if is_standalone:
+        return _get_exec_path(name, build_directory)
+
     return _import_module_from_library(name, build_directory, is_python_module)
@@ -1275,7 +1299,8 @@ def _write_ninja_file_and_build_library(
         extra_include_paths,
         build_directory: str,
         verbose: bool,
-        with_cuda: Optional[bool]) -> None:
+        with_cuda: Optional[bool],
+        is_standalone: bool = False) -> None:
     verify_ninja_availability()
     if IS_WINDOWS:
         compiler = os.environ.get('CXX', 'cl')
@@ -1287,7 +1312,8 @@ def _write_ninja_file_and_build_library(
     extra_ldflags = _prepare_ldflags(
         extra_ldflags or [],
         with_cuda,
-        verbose)
+        verbose,
+        is_standalone)
     build_file_path = os.path.join(build_directory, 'build.ninja')
     if verbose:
         print(f'Emitting ninja build file {build_file_path}...')
@@ -1301,7 +1327,8 @@ def _write_ninja_file_and_build_library(
         extra_cuda_cflags=extra_cuda_cflags or [],
         extra_ldflags=extra_ldflags or [],
         extra_include_paths=extra_include_paths or [],
-        with_cuda=with_cuda)
+        with_cuda=with_cuda,
+        is_standalone=is_standalone)
 
     if verbose:
         print(f'Building extension module {name}...')
@@ -1334,11 +1361,7 @@ def verify_ninja_availability():
         raise RuntimeError("Ninja is required to load C++ extensions")
 
 
-def _prepare_ldflags(extra_ldflags, with_cuda, verbose):
-    here = os.path.abspath(__file__)
-    torch_path = os.path.dirname(os.path.dirname(here))
-    lib_path = os.path.join(torch_path, 'lib')
-
+def _prepare_ldflags(extra_ldflags, with_cuda, verbose, is_standalone):
     if IS_WINDOWS:
         python_path = os.path.dirname(sys.executable)
         python_lib_path = os.path.join(python_path, 'libs')
@@ -1353,11 +1376,13 @@ def _prepare_ldflags(extra_ldflags, with_cuda, verbose):
             # Related issue: https://github.com/pytorch/pytorch/issues/31611
             extra_ldflags.append('-INCLUDE:?warp_size@cuda@at@@YAHXZ')
         extra_ldflags.append('torch.lib')
-        extra_ldflags.append('torch_python.lib')
-        extra_ldflags.append(f'/LIBPATH:{python_lib_path}')
-        extra_ldflags.append(f'/LIBPATH:{lib_path}')
+        extra_ldflags.append(f'/LIBPATH:{TORCH_LIB_PATH}')
+        if not is_standalone:
+            extra_ldflags.append('torch_python.lib')
+            extra_ldflags.append(f'/LIBPATH:{python_lib_path}')
+
     else:
-        extra_ldflags.append(f'-L{lib_path}')
+        extra_ldflags.append(f'-L{TORCH_LIB_PATH}')
         extra_ldflags.append('-lc10')
         if with_cuda:
             extra_ldflags.append('-lc10_hip' if IS_HIP_EXTENSION else '-lc10_cuda')
@@ -1365,7 +1390,11 @@ def _prepare_ldflags(extra_ldflags, with_cuda, verbose):
         if with_cuda:
             extra_ldflags.append('-ltorch_hip' if IS_HIP_EXTENSION else '-ltorch_cuda')
         extra_ldflags.append('-ltorch')
-        extra_ldflags.append('-ltorch_python')
+        if not is_standalone:
+            extra_ldflags.append('-ltorch_python')
+
+        if is_standalone:
+            extra_ldflags.append(f"-Wl,-rpath,{TORCH_LIB_PATH}")
 
     if with_cuda:
         if verbose:
@@ -1565,6 +1594,17 @@ def _run_ninja_build(build_directory: str, verbose: bool, error_prefix: str) ->
         raise RuntimeError(message) from e
 
 
+def _get_exec_path(module_name, path):
+    if IS_WINDOWS and TORCH_LIB_PATH not in os.getenv('PATH', '').split(';'):
+        torch_lib_in_path = any(
+            os.path.exists(p) and os.path.samefile(p, TORCH_LIB_PATH)
+            for p in os.getenv('PATH', '').split(';')
+        )
+        if not torch_lib_in_path:
+            os.environ['PATH'] = f"{TORCH_LIB_PATH};{os.getenv('PATH', '')}"
+    return os.path.join(path, f'{module_name}{EXEC_EXT}')
+
+
 def _import_module_from_library(module_name, path, is_python_module):
     # https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path
     file, path, description = imp.find_module(module_name, [path])
@@ -1583,7 +1623,8 @@ def _write_ninja_file_to_build_library(path,
                                        extra_cuda_cflags,
                                        extra_ldflags,
                                        extra_include_paths,
-                                       with_cuda) -> None:
+                                       with_cuda,
+                                       is_standalone) -> None:
     extra_cflags = [flag.strip() for flag in extra_cflags]
     extra_cuda_cflags = [flag.strip() for flag in extra_cuda_cflags]
     extra_ldflags = [flag.strip() for flag in extra_ldflags]
@@ -1603,8 +1644,10 @@ def _write_ninja_file_to_build_library(path,
         user_includes += system_includes
         system_includes.clear()
 
-    common_cflags = [f'-DTORCH_EXTENSION_NAME={name}']
-    common_cflags.append('-DTORCH_API_INCLUDE_EXTENSION_H')
+    common_cflags = []
+    if not is_standalone:
+        common_cflags.append(f'-DTORCH_EXTENSION_NAME={name}')
+        common_cflags.append('-DTORCH_API_INCLUDE_EXTENSION_H')
 
     # Note [Pybind11 ABI constants]
     #
@@ -1674,19 +1717,16 @@ def object_file_path(source_file: str) -> str:
         return target
 
     objects = [object_file_path(src) for src in sources]
+    ldflags = ([] if is_standalone else [SHARED_FLAG]) + extra_ldflags
 
-    if IS_WINDOWS:
-        ldflags = ['/DLL'] + extra_ldflags
-    else:
-        ldflags = ['-shared'] + extra_ldflags
     # The darwin linker needs explicit consent to ignore unresolved symbols.
     if sys.platform.startswith('darwin'):
         ldflags.append('-undefined dynamic_lookup')
     elif IS_WINDOWS:
         ldflags = _nt_quote_args(ldflags)
 
-    ext = 'pyd' if IS_WINDOWS else 'so'
-    library_target = f'{name}.{ext}'
+    ext = EXEC_EXT if is_standalone else LIB_EXT
+    library_target = f'{name}{ext}'
 
     _write_ninja_file(
         path=path,
diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py
index 8d7726ebd129..d1025c02cc9b 100644
--- a/torch/utils/data/dataloader.py
+++ b/torch/utils/data/dataloader.py
@@ -618,46 +618,72 @@ class _MultiProcessingDataLoaderIter(_BaseDataLoaderIter):
     #      simple things like acquiring an internal lock of a queue may hang.
     #      Therefore, in this case, we actually need to prevent `__del__` from
     #      being executed, and rely on the automatic termination of daemonic
-    #      children. Thus, we register an `atexit` hook that sets a global flag
+    #      children.
+    #
+    #      Thus, we register an `atexit` hook that sets a global flag
     #      `_utils.python_exit_status`. Since `atexit` hooks are executed in the
     #      reverse order of registration, we are guaranteed that this flag is
-    #      set before library resources we use are freed. (Hooks freeing those
-    #      resources are registered at importing the Python core libraries at
-    #      the top of this file.) So in `__del__`, we check if
-    #      `_utils.python_exit_status` is set or `None` (freed), and perform
-    #      no-op if so.
+    #      set before library resources we use are freed (which, at least in
+    #      CPython, is done via an `atexit` handler defined in
+    #      `multiprocessing/util.py`
+    #      https://github.com/python/cpython/blob/c606624af8d4cb3b4a052fb263bb983b3f87585b/Lib/multiprocessing/util.py#L320-L362
+    #      registered when an object requiring this mechanism is first
+    #      created, e.g., `mp.Queue`
+    #      https://github.com/python/cpython/blob/c606624af8d4cb3b4a052fb263bb983b3f87585b/Lib/multiprocessing/context.py#L100-L103
+    #      https://github.com/python/cpython/blob/c606624af8d4cb3b4a052fb263bb983b3f87585b/Lib/multiprocessing/queues.py#L29
+    #      )
+    #
+    #      So in `__del__`, we check if `_utils.python_exit_status` is set or
+    #      `None` (freed), and perform no-op if so.
+    #
+    #      However, simply letting library clean-up code run can also be bad,
+    #      because such code (i.e., `multiprocessing.util._exit_function()`)
+    #      includes joining the putting threads of `mp.Queue`, which can be blocking.
+    #      Hence, for the queues that the main process puts into, we call
+    #      `cancel_join_thread` at creation.  See the later section
+    #      [ 3b. A process won't hang when putting into a queue; ]
+    #      for more details.
+    #
+    #      Here are two example cases where library clean-up code can run
+    #      before `__del__` is called:
     #
-    #      Another problem with `__del__` is also related to the library cleanup
-    #      calls. When a process ends, it shuts the all its daemonic children
-    #      down with a SIGTERM (instead of joining them without a timeout).
-    #      Simiarly for threads, but by a different mechanism. This fact,
-    #      together with a few implementation details of multiprocessing, forces
-    #      us to make workers daemonic. All of our problems arise when a
-    #      DataLoader is used in a subprocess, and are caused by multiprocessing
-    #      code which looks more or less like this:
+    #      1. If we hold onto a reference to the iterator, it more often
+    #         than not tries to do `multiprocessing` library cleaning before
+    #         clearing the alive referenced objects (https://github.com/pytorch/pytorch/issues/48666)
+    #         and thus prevents our cleaning-up code from running first.
     #
-    #          try:
-    #              your_function_using_a_dataloader()
-    #          finally:
-    #              multiprocessing.util._exit_function()
+    #      2. A similar issue arises when a `DataLoader` is used in a subprocess.
+    #         When a process ends, it shuts all its daemonic children
+    #         down with a SIGTERM (instead of joining them without a timeout).
+    #         Similarly for threads, but by a different mechanism. This fact,
+    #         together with a few implementation details of multiprocessing, forces
+    #         us to make workers daemonic. All of our problems arise when a
+    #         DataLoader is used in a subprocess, and are caused by multiprocessing
+    #         code which looks more or less like this:
     #
-    #      The joining/termination mentioned above happens inside
-    #      `_exit_function()`. Now, if `your_function_using_a_dataloader()`
-    #      throws, the stack trace stored in the exception will prevent the
-    #      frame which uses `DataLoaderIter` to be freed. If the frame has any
-    #      reference to the `DataLoaderIter` (e.g., in a method of the iter),
-    #      its `__del__`, which starts the shutdown procedure, will not be
-    #      called. That, in turn, means that workers aren't notified. Attempting
-    #      to join in `_exit_function` will then result in a hang.
+    #            try:
+    #                your_function_using_a_dataloader()
+    #            finally:
+    #                multiprocessing.util._exit_function()
     #
-    #      For context, `_exit_function` is also registered as an `atexit` call.
-    #      So it is unclear to me (@ssnl) why this is needed in a finally block.
-    #      The code dates back to 2008 and there is no comment on the original
-    #      PEP 371 or patch https://bugs.python.org/issue3050 (containing both
-    #      the finally block and the `atexit` registration) that explains this.
+    #         The joining/termination mentioned above happens inside
+    #         `_exit_function()`. Now, if `your_function_using_a_dataloader()`
+    #         throws, the stack trace stored in the exception will prevent the
+    #         frame which uses `DataLoaderIter` from being freed. If the frame has any
+    #         reference to the `DataLoaderIter` (e.g., in a method of the iter),
+    #         its `__del__`, which starts the shutdown procedure, will not be
+    #         called. That, in turn, means that workers aren't notified. Attempting
+    #         to join in `_exit_function` will then result in a hang.
    #
-    #      Another choice is to just shutdown workers with logic in 1 above
-    #      whenever we see an error in `next`. This isn't ideal because
+    #         For context, `_exit_function` is also registered as an `atexit` call.
+    #         So it is unclear to me (@ssnl) why this is needed in a finally block.
+    #         The code dates back to 2008 and there is no comment on the original
+    #         PEP 371 or patch https://bugs.python.org/issue3050 (containing both
+    #         the finally block and the `atexit` registration) that explains this.
+    #
+    #
+    #      Finally, another choice is to just shutdown workers with logic in 1
+    #      above whenever we see an error in `next`. This isn't ideal because
     #        a. It prevents users from using try-catch to resume data loading.
     #        b. It doesn't prevent hanging if users have references to the
    #           iterator.
@@ -705,30 +731,33 @@ class _MultiProcessingDataLoaderIter(_BaseDataLoaderIter):
     #           We use `mp.Queue` which has a separate background thread to put
     #           objects from an unbounded buffer array. The background thread is
     #           daemonic and usually automatically joined when the process
-    #           exits.
+    #           *exits*.
     #
-    #           However, in case that the receiver has ended abruptly while
-    #           reading from the pipe, the join will hang forever. Therefore,
-    #           for both `worker_result_queue` (worker -> main process/pin_memory_thread)
-    #           and each `index_queue` (main process -> worker), we use
-    #           `q.cancel_join_thread()` in sender process before any `q.put` to
-    #           prevent this automatic join.
-    #
-    #           Moreover, having all queues called `cancel_join_thread` makes
-    #           implementing graceful shutdown logic in `__del__` much easier.
-    #           It won't need to get from any queue, which would also need to be
-    #           guarded by periodic status checks.
+    #           If the receiver has ended abruptly while
+    #           reading from the pipe, the join will hang forever. The usual
+    #           solution for this in Python is calling `q.cancel_join_thread`,
+    #           which prevents automatically joining it when finalizing
+    #           (exiting).
     #
     #           Nonetheless, `cancel_join_thread` must only be called when the
     #           queue is **not** going to be read from or write into by another
     #           process, because it may hold onto a lock or leave corrupted data
     #           in the queue, leading other readers/writers to hang.
     #
-    #           `pin_memory_thread`'s `data_queue` is a `queue.Queue` that does
-    #           a blocking `put` if the queue is full. So there is no above
-    #           problem, but we do need to wrap the `put` in a loop that breaks
-    #           not only upon success, but also when the main process stops
-    #           reading, i.e., is shutting down.
+    #           Hence,
+    #             + For worker processes, we only do so (for their output
+    #               queues, i.e., `worker_result_queue`) before exiting.
+    #             + For `pin_memory_thread`, its output queue `data_queue` is a
+    #               `queue.Queue` that does blocking `put` if the queue is full.
+    #               So there is no above problem, but as a result, in
+    #               `_pin_memory_loop`, we do need to wrap the `put` in a loop
+    #               that breaks not only upon success, but also when the main
+    #               process stops reading, i.e., is shutting down.
+    #             + For the loader process, we `cancel_join_thread()` for all
+    #               `_index_queues` because the whole purpose of workers and
+    #               `pin_memory_thread` is to serve the loader process.  If
+    #               the loader process is already exiting, we don't really care if
+    #               the queues are corrupted.
     #
     #
     #      Now let's get back to 1:
@@ -867,7 +896,9 @@ def __init__(self, loader):
         for i in range(self._num_workers):
             # No certainty which module multiprocessing_context is
             index_queue = multiprocessing_context.Queue()  # type: ignore
-            # index_queue.cancel_join_thread()
+            # Need to `cancel_join_thread` here!
+            # See sections (2) and (3b) above.
+            index_queue.cancel_join_thread()
             w = multiprocessing_context.Process(
                 target=_utils.worker._worker_loop,
                 args=(self._dataset_kind, self._dataset, index_queue,
@@ -1234,6 +1265,9 @@ def _shutdown_workers(self):
         if not self._shutdown:
             self._shutdown = True
             try:
+                # Normal exit when last reference is gone / iterator is depleted.
+                # See (1) and the second half of the note.
+
                 # Exit `pin_memory_thread` first because exiting workers may leave
                 # corrupted data in `worker_result_queue` which `pin_memory_thread`
                 # reads from.
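The behavioral change in this file is the un-commented `index_queue.cancel_join_thread()`. A minimal, self-contained toy sketch (not DataLoader code) of the failure mode it avoids: a process that has put items into an `mp.Queue` whose reader never drains it can block at interpreter shutdown while joining the queue's feeder thread.

import multiprocessing as mp

def main() -> None:
    q = mp.Queue()
    # The sender owns the queue's feeder thread. Cancelling the join means
    # interpreter shutdown will not wait for the feeder thread to flush its
    # buffer, even if the receiving end died or never reads.
    q.cancel_join_thread()
    for i in range(10_000):
        q.put(i)
    # Without cancel_join_thread(), exiting here could hang inside
    # multiprocessing.util._exit_function() waiting on the feeder thread.

if __name__ == "__main__":
    main()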
diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py
index 9ad9c4b82e9d..8c1eb599a714 100644
--- a/torch/utils/hipify/cuda_to_hip_mappings.py
+++ b/torch/utils/hipify/cuda_to_hip_mappings.py
@@ -8103,7 +8103,7 @@
         ("setCurrentCUDAStream", ("setCurrentHIPStream", API_C10)),
         ("cuda::CUDACachingAllocator", ("hip::HIPCachingAllocator", API_C10)),
         ("CUDACachingAllocator", ("HIPCachingAllocator", API_C10)),
-        ("TORCH_CUDA_KERNEL_LAUNCH_CHECK", ("TORCH_HIP_KERNEL_LAUNCH_CHECK", API_C10))
+        ("C10_CUDA_KERNEL_LAUNCH_CHECK", ("C10_HIP_KERNEL_LAUNCH_CHECK", API_C10))
     ]
 )
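For context, entries in this table are plain source-to-source substitutions applied by the hipify tooling; the driver code is not part of this diff, so the loop below is only an illustrative sketch, under the assumption that longer patterns must be applied before shorter ones:

from typing import Dict, Tuple

# Illustrative subset of the mapping touched above.
C10_MAPPINGS: Dict[str, Tuple[str, str]] = {
    "C10_CUDA_KERNEL_LAUNCH_CHECK": ("C10_HIP_KERNEL_LAUNCH_CHECK", "API_C10"),
    "cuda::CUDACachingAllocator": ("hip::HIPCachingAllocator", "API_C10"),
}

def hipify_source(src: str) -> str:
    # Longest keys first so a shorter pattern never clobbers a longer one.
    for cuda_name, (hip_name, _api) in sorted(C10_MAPPINGS.items(), key=lambda kv: -len(kv[0])):
        src = src.replace(cuda_name, hip_name)
    return src

print(hipify_source("kernel<<<1, 1>>>(args...); C10_CUDA_KERNEL_LAUNCH_CHECK();"))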