diff --git a/.circleci/config.yml b/.circleci/config.yml index 0716e516518b..d19c08b2b0b6 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -11,6 +11,9 @@ parameters: run_binary_tests: type: boolean default: false + run_build: + type: boolean + default: true docker_config_defaults: &docker_config_defaults user: jenkins @@ -9762,6 +9765,7 @@ workflows: only: - postnightly executor: windows-with-nvidia-gpu + when: << pipeline.parameters.run_build >> ecr_gc: triggers: - schedule: diff --git a/.circleci/generate_config_yml.py b/.circleci/generate_config_yml.py index f1af924bd3e2..a836d2e510a6 100755 --- a/.circleci/generate_config_yml.py +++ b/.circleci/generate_config_yml.py @@ -112,7 +112,10 @@ def gen_build_workflows_tree(): "when": r"<< pipeline.parameters.run_binary_tests >>", "jobs": [f() for f in binary_build_functions], }, - "build": {"jobs": [f() for f in build_workflows_functions]}, + "build": { + "when": r"<< pipeline.parameters.run_build >>", + "jobs": [f() for f in build_workflows_functions] + }, } } diff --git a/.circleci/scripts/binary_linux_test.sh b/.circleci/scripts/binary_linux_test.sh index 0b2e60b48f8e..26cc77c8ff9c 100755 --- a/.circleci/scripts/binary_linux_test.sh +++ b/.circleci/scripts/binary_linux_test.sh @@ -51,7 +51,14 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then else cu_ver="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4}" fi - retry conda install \${EXTRA_CONDA_FLAGS} -yq -c nvidia -c pytorch "cudatoolkit=\${cu_ver}" + ( + # For some reason conda likes to re-activate the conda environment when attempting this install + # which means that a deactivate is run and some variables might not exist when that happens, + # namely CONDA_MKL_INTERFACE_LAYER_BACKUP from libblas so let's just ignore unbound variables when + # it comes to the conda installation commands + set +u + retry conda install \${EXTRA_CONDA_FLAGS} -yq -c nvidia -c pytorch "cudatoolkit=\${cu_ver}" + ) fi elif [[ "$PACKAGE_TYPE" != libtorch ]]; then pip install "\$pkg" diff --git a/.circleci/verbatim-sources/header-section.yml b/.circleci/verbatim-sources/header-section.yml index 26205a0cccba..43d4c94ee5ed 100644 --- a/.circleci/verbatim-sources/header-section.yml +++ b/.circleci/verbatim-sources/header-section.yml @@ -11,6 +11,9 @@ parameters: run_binary_tests: type: boolean default: false + run_build: + type: boolean + default: true docker_config_defaults: &docker_config_defaults user: jenkins diff --git a/.github/pytorch-circleci-labels.yml b/.github/pytorch-circleci-labels.yml index ccdf2e876af1..3a9eeca0abcc 100644 --- a/.github/pytorch-circleci-labels.yml +++ b/.github/pytorch-circleci-labels.yml @@ -9,3 +9,5 @@ labels_to_circle_params: - release/.* tags: - v[0-9]+(\.[0-9]+)*-rc[0-9]+ + set_to_false: + - run_build diff --git a/.github/workflows/update_s3_htmls.yml b/.github/workflows/update_s3_htmls.yml index 92f9a66a0fd8..f2320ce2fcbf 100644 --- a/.github/workflows/update_s3_htmls.yml +++ b/.github/workflows/update_s3_htmls.yml @@ -9,6 +9,7 @@ on: jobs: update-html: runs-on: ubuntu-latest + if: ${{ github.repository_owner == 'pytorch' }} strategy: matrix: prefix: ["whl", "whl/test", "whl/nightly"] diff --git a/.jenkins/pytorch/README.md b/.jenkins/pytorch/README.md index ea6c6dd40f68..9fd68ecf7f15 100644 --- a/.jenkins/pytorch/README.md +++ b/.jenkins/pytorch/README.md @@ -10,9 +10,9 @@ it is very easy to run these tests yourself: ``registry.pytorch.org/pytorch/pytorch-$BUILD_ENVIRONMENT:$DOCKER_VERSION``, where ``$BUILD_ENVIRONMENT`` is one of the build environments enumerated in - 
[pytorch-dockerfiles](https://github.com/pietern/pytorch-dockerfiles/blob/master/build.sh) + [pytorch-dockerfiles](https://github.com/pytorch/pytorch/blob/master/.circleci/docker/build.sh). The dockerfile used by jenkins can be found under the `.circle` [directory](https://github.com/pytorch/pytorch/blob/master/.circleci/docker) -2. Run ``docker -it -u jenkins $DOCKER_IMAGE``, clone PyTorch and +2. Run ``docker run -it -u jenkins $DOCKER_IMAGE``, clone PyTorch and run one of the scripts in this directory. The Docker images are designed so that any "reasonable" build commands @@ -38,5 +38,5 @@ mechanisms we use: build scripts. - We reroute well known paths like `/usr/bin/gcc` to alternate - implementations with `update-alternatives, instead of setting + implementations with `update-alternatives`, instead of setting `CC` and `CXX` in our implementations. diff --git a/.jenkins/pytorch/codegen-test.sh b/.jenkins/pytorch/codegen-test.sh index 17e7e9fa3445..47d13f2908d0 100755 --- a/.jenkins/pytorch/codegen-test.sh +++ b/.jenkins/pytorch/codegen-test.sh @@ -48,13 +48,6 @@ python -m tools.autograd.gen_autograd \ "$OUT"/autograd \ tools/autograd -# unboxing_wrappers codegen (called by torch codegen but can run independently) -mkdir -p "$OUT"/unboxing_wrappers -python -m tools.jit.gen_unboxing_wrappers \ - "$OUT"/torch/share/ATen/Declarations.yaml \ - "$OUT"/unboxing_wrappers \ - tools/jit/templates - # annotated_fn_args codegen (called by torch codegen but can run independently) mkdir -p "$OUT"/annotated_fn_args python -m tools.autograd.gen_annotated_fn_args \ diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh index 0c34ddcc6179..24ec02c76df5 100755 --- a/.jenkins/pytorch/macos-test.sh +++ b/.jenkins/pytorch/macos-test.sh @@ -9,11 +9,6 @@ pip install -q hypothesis "librosa>=0.6.2" "numba<=0.49.1" psutil # TODO move this to docker pip install unittest-xml-reporting pytest -# faulthandler become built-in since 3.3 -if [[ ! 
$(python -c "import sys; print(int(sys.version_info >= (3, 3)))") == "1" ]]; then - pip install -q faulthandler -fi - if [ -z "${IN_CI}" ]; then rm -rf ${WORKSPACE_DIR}/miniconda3/lib/python3.6/site-packages/torch* fi diff --git a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat index a052a1b67d59..ed6482890993 100644 --- a/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat +++ b/.jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat @@ -41,8 +41,6 @@ popd :: The version is fixed to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136 pip install "ninja==1.10.0.post1" future "hypothesis==4.53.2" "librosa>=0.6.2" psutil pillow unittest-xml-reporting pytest coverage if %errorlevel% neq 0 ( exit /b %errorlevel% ) -:: No need to install faulthandler since we only test Python >= 3.6 on Windows -:: faulthandler is builtin since Python 3.3 set DISTUTILS_USE_SDK=1 diff --git a/BUILD.bazel b/BUILD.bazel index b3faea487965..2b4636d850c9 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -193,9 +193,6 @@ libtorch_cpp_generated_sources = [ "torch/csrc/autograd/generated/Functions.h", "torch/csrc/autograd/generated/Functions.cpp", "torch/csrc/autograd/generated/variable_factories.h", - "torch/csrc/jit/generated/generated_unboxing_wrappers_0.cpp", - "torch/csrc/jit/generated/generated_unboxing_wrappers_1.cpp", - "torch/csrc/jit/generated/generated_unboxing_wrappers_2.cpp", ] libtorch_python_generated_sources = [ diff --git a/CMakeLists.txt b/CMakeLists.txt index ba862b5a4d5f..3df73f8a3041 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -173,6 +173,8 @@ option(USE_NATIVE_ARCH "Use -march=native" OFF) cmake_dependent_option( USE_NCCL "Use NCCL" ON "USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF) +cmake_dependent_option(USE_RCCL "Use RCCL" ON + USE_NCCL OFF) cmake_dependent_option( USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF) @@ -316,7 +318,7 @@ set(OP_DEPENDENCY "" CACHE STRING # symbol lookup error: miniconda3/envs/pytorch-py3.7/lib/libmkl_intel_lp64.so: undefined symbol: mkl_blas_dsyrk # https://software.intel.com/en-us/articles/symbol-lookup-error-when-linking-intel-mkl-with-gcc-on-ubuntu if(LINUX) - set(CMAKE_SHARED_LINKER_FLAGS "-Wl,--no-as-needed") + set(CMAKE_SHARED_LINKER_FLAGS "-Wl,--no-as-needed ${CMAKE_SHARED_LINKER_FLAGS}") endif() if(MSVC) diff --git a/android/test_app/app/src/main/AndroidManifest.xml b/android/test_app/app/src/main/AndroidManifest.xml index a83bf223bdaf..abdd9a8d986a 100644 --- a/android/test_app/app/src/main/AndroidManifest.xml +++ b/android/test_app/app/src/main/AndroidManifest.xml @@ -18,4 +18,10 @@ + + + + diff --git a/aten/conda/meta.yaml b/aten/conda/meta.yaml index d8096fc73a0f..a502690a5447 100644 --- a/aten/conda/meta.yaml +++ b/aten/conda/meta.yaml @@ -24,7 +24,7 @@ requirements: - mkl # [not osx] about: - home: https://github.com/zdevito/ATen + home: https://github.com/pytorch/pytorch license: BSD summary: A TENsor library for C++14 diff --git a/aten/src/ATen/ATen.h b/aten/src/ATen/ATen.h index ae95ef43f21c..8d29a9204420 100644 --- a/aten/src/ATen/ATen.h +++ b/aten/src/ATen/ATen.h @@ -31,3 +31,4 @@ #include #include #include +#include diff --git a/aten/src/ATen/BatchingRegistrations.cpp b/aten/src/ATen/BatchingRegistrations.cpp index 419c454257d8..2cd7cac4e71b 100644 --- a/aten/src/ATen/BatchingRegistrations.cpp +++ b/aten/src/ATen/BatchingRegistrations.cpp @@ -287,6 +287,25 @@ Tensor squeeze_dim_batching_rule(const Tensor& self, int64_t dim) { return 
self_physical.getPhysicalToLogicalMap().apply(result); } +Tensor trace_batching_rule(const Tensor& self) { + auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self); + // Batched Diagonal View + auto self_diag = at::diagonal(self_physical.tensor(), /*offset*/0, /*dim1*/-2, /*dim2*/-1); + auto result = at::sum(self_diag, -1); + return self_physical.getPhysicalToLogicalMap().apply(result); +} + +Tensor trace_backward_batching_rule(const Tensor& grad, IntArrayRef input_sizes) { + auto grad_physical = MultiBatchVmapTransform::logicalToPhysical(grad); + auto grad_input = at::zeros(grad_physical.getPhysicalShape(input_sizes), grad.options()); + // Batched Diagonal View + auto grad_input_diag = at::diagonal(grad_input, /*offset*/0, /*dim1*/-2, /*dim2*/-1); + // Append a dimension of size one to the grad output + auto grad_physical_tensor = grad_physical.tensor().unsqueeze(-1); + grad_input_diag.copy_(grad_physical_tensor); + return grad_physical.getPhysicalToLogicalMap().apply(grad_input); +} + Tensor transpose_int_batching_rule(const Tensor& self, int64_t dim0, int64_t dim1) { // PyTorch has a special case where scalar_tensor.transpose(dim0, dim1) works // for dim0, dim1 in {0, -1} and returns the scalar tensor. If the following happens: @@ -996,7 +1015,7 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { m.impl("_add_batch_dim", native::_add_batch_dim); m.impl("_remove_batch_dim", native::_remove_batch_dim); - m.impl_UNBOXED("sum.dim_IntList", sum_batching_rule); + m.impl("sum.dim_IntList", sum_batching_rule); m.impl("is_complex", native::is_complex); m.impl("conj", native::conj); @@ -1029,6 +1048,7 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { m.impl("squeeze", squeeze_batching_rule); m.impl("squeeze.dim", squeeze_dim_batching_rule); m.impl("t", native::t); // composite wrt autograd + m.impl("trace", trace_batching_rule); m.impl("transpose.int", transpose_int_batching_rule); m.impl("unbind.int", unbind_batching_rule); m.impl("unfold", unfold_batching_rule); @@ -1089,6 +1109,7 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { #undef TO_BATCHING_RULE m.impl("clone", clone_batching_rule); + using TensorTensorScalarType = Tensor (*)(const Tensor&, const Tensor&, Scalar); using TensorTensorType = Tensor (*)(const Tensor&, const Tensor&); using TensorScalarType = Tensor (*)(const Tensor&, Scalar); @@ -1115,6 +1136,12 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { m.impl("pow.Scalar", pow_scalar_Tensor_batching_rule); m.impl("sigmoid_backward", binary_pointwise_batching_rule); + m.impl( + "threshold_backward", + binary_pointwise_batching_rule< + TensorTensorScalarType, + at::threshold_backward, + Scalar>); // for at::result_type, call the native::result_type implementation. 
// We don't have to do anything special because native::result_type operates @@ -1150,6 +1177,7 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) { // backward operators m.impl("select_backward", select_backward_batching_rule); m.impl("slice_backward", slice_backward_batching_rule); + m.impl("trace_backward", trace_backward_batching_rule); m.impl("diagonal_backward", diagonal_backward_batching_rule); // Tensor.new_* operators diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index fd3c95f2573b..6fedef185b21 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -72,7 +72,7 @@ file(GLOB metal_h "metal/*.h") file(GLOB metal_cpp "metal/*.cpp") file(GLOB_RECURSE native_metal_h "native/metal/*.h") file(GLOB metal_test_srcs "native/metal/mpscnn/tests/*.mm") -file(GLOB_RECURSE native_metal_srcs "native/metal/*.mm", "native/metal/*.cpp") +file(GLOB_RECURSE native_metal_srcs "native/metal/*.mm" "native/metal/*.cpp") EXCLUDE(native_metal_srcs "${native_metal_srcs}" ${metal_test_srcs}) file(GLOB metal_prepack_h "native/metal/MetalPrepackOpContext.h") file(GLOB metal_prepack_cpp "native/metal/MetalPrepackOpRegister.cpp") diff --git a/aten/src/ATen/CPUGeneratorImpl.cpp b/aten/src/ATen/CPUGeneratorImpl.cpp index bfa4a2a8f72f..ff4a2f1c61e2 100644 --- a/aten/src/ATen/CPUGeneratorImpl.cpp +++ b/aten/src/ATen/CPUGeneratorImpl.cpp @@ -1,4 +1,6 @@ #include +#include +#include #include #include @@ -6,6 +8,42 @@ namespace at { namespace detail { +/** + * CPUGeneratorImplStateLegacy is a POD class needed for memcpys + * in torch.get_rng_state() and torch.set_rng_state(). + * It is a legacy class and even though it is replaced with + * at::CPUGeneratorImpl, we need this class and some of its fields + * to support backward compatibility on loading checkpoints. + */ +struct CPUGeneratorImplStateLegacy { + /* The initial seed. */ + uint64_t the_initial_seed; + int left; /* = 1; */ + int seeded; /* = 0; */ + uint64_t next; + uint64_t state[at::MERSENNE_STATE_N]; /* the array for the state vector */ + + /********************************/ + + /* For normal distribution */ + double normal_x; + double normal_y; + double normal_rho; + int normal_is_valid; /* = 0; */ +}; + +/** + * CPUGeneratorImplState is a POD class containing + * new data introduced in at::CPUGeneratorImpl and the legacy state. It is used + * as a helper for torch.get_rng_state() and torch.set_rng_state() + * functions. + */ +struct CPUGeneratorImplState { + CPUGeneratorImplStateLegacy legacy_pod; + float next_float_normal_sample; + bool is_next_float_normal_sample_valid; +}; + /** * PyTorch maintains a collection of default generators that get * initialized once. The purpose of these default generators is to @@ -75,6 +113,128 @@ uint64_t CPUGeneratorImpl::seed() { return random; } +/** + * Sets the internal state of CPUGeneratorImpl. The new internal state + * must be a strided CPU byte tensor and of the same size as either + * CPUGeneratorImplStateLegacy (for legacy CPU generator state) or + * CPUGeneratorImplState (for new state). + * + * FIXME: Remove support of the legacy state in the future? 
+ */ +void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { + using detail::CPUGeneratorImplState; + using detail::CPUGeneratorImplStateLegacy; + + static_assert(std::is_pod::value, "CPUGeneratorImplStateLegacy is not a PODType"); + static_assert(std::is_pod::value, "CPUGeneratorImplState is not a PODType"); + + static const size_t size_legacy = sizeof(CPUGeneratorImplStateLegacy); + static const size_t size_current = sizeof(CPUGeneratorImplState); + static_assert(size_legacy != size_current, "CPUGeneratorImplStateLegacy and CPUGeneratorImplState can't be of the same size"); + + detail::check_rng_state(new_state); + + at::mt19937 engine; + auto float_normal_sample = c10::optional(); + auto double_normal_sample = c10::optional(); + + // Construct the state of at::CPUGeneratorImpl based on input byte tensor size. + CPUGeneratorImplStateLegacy* legacy_pod; + auto new_state_size = new_state.numel(); + if (new_state_size == size_legacy) { + legacy_pod = (CPUGeneratorImplStateLegacy*)new_state.data(); + // Note that in CPUGeneratorImplStateLegacy, we didn't have float version + // of normal sample and hence we leave the c10::optional as is + + // Update next_double_normal_sample. + // Note that CPUGeneratorImplStateLegacy stores two uniform values (normal_x, normal_y) + // and a rho value (normal_rho). These three values were redundant and in the new + // DistributionsHelper.h, we store the actual extra normal sample, rather than three + // intermediate values. + if (legacy_pod->normal_is_valid) { + auto r = legacy_pod->normal_rho; + auto theta = 2.0 * M_PI * legacy_pod->normal_x; + // we return the sin version of the normal sample when in caching mode + double_normal_sample = c10::optional(r * ::sin(theta)); + } + } else if (new_state_size == size_current) { + auto rng_state = (CPUGeneratorImplState*)new_state.data(); + legacy_pod = &rng_state->legacy_pod; + // update next_float_normal_sample + if (rng_state->is_next_float_normal_sample_valid) { + float_normal_sample = c10::optional(rng_state->next_float_normal_sample); + } + + // Update next_double_normal_sample. + // Note that in getRNGState, we now return the actual normal sample in normal_y + // and if it's valid in normal_is_valid. The redundant normal_x and normal_rho + // are squashed to 0.0. + if (legacy_pod->normal_is_valid) { + double_normal_sample = c10::optional(legacy_pod->normal_y); + } + } else { + AT_ERROR("Expected either a CPUGeneratorImplStateLegacy of size ", size_legacy, + " or a CPUGeneratorImplState of size ", size_current, + " but found the input RNG state size to be ", new_state_size); + } + + // construct engine_ + // Note that CPUGeneratorImplStateLegacy stored a state array of 64 bit uints, whereas in our + // redefined mt19937, we have changed to a state array of 32 bit uints. Hence, we are + // doing a std::copy. + at::mt19937_data_pod rng_data; + std::copy(std::begin(legacy_pod->state), std::end(legacy_pod->state), rng_data.state_.begin()); + rng_data.seed_ = legacy_pod->the_initial_seed; + rng_data.left_ = legacy_pod->left; + rng_data.seeded_ = legacy_pod->seeded; + rng_data.next_ = static_cast(legacy_pod->next); + engine.set_data(rng_data); + TORCH_CHECK(engine.is_valid(), "Invalid mt19937 state"); + this->engine_ = engine; + this->next_float_normal_sample_ = float_normal_sample; + this->next_double_normal_sample_ = double_normal_sample; +} + +/** + * Gets the current internal state of CPUGeneratorImpl. The internal + * state is returned as a CPU byte tensor. 
+ */ +c10::intrusive_ptr CPUGeneratorImpl::get_state() const { + using detail::CPUGeneratorImplState; + + static const size_t size = sizeof(CPUGeneratorImplState); + static_assert(std::is_pod::value, "CPUGeneratorImplState is not a PODType"); + + auto state_tensor = at::detail::empty_cpu({(int64_t)size}, ScalarType::Byte, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt); + auto rng_state = state_tensor.data_ptr(); + + // accumulate generator data to be copied into byte tensor + auto accum_state = std::make_unique(); + auto rng_data = this->engine_.data(); + accum_state->legacy_pod.the_initial_seed = rng_data.seed_; + accum_state->legacy_pod.left = rng_data.left_; + accum_state->legacy_pod.seeded = rng_data.seeded_; + accum_state->legacy_pod.next = rng_data.next_; + std::copy(rng_data.state_.begin(), rng_data.state_.end(), std::begin(accum_state->legacy_pod.state)); + accum_state->legacy_pod.normal_x = 0.0; // we don't use it anymore and this is just a dummy + accum_state->legacy_pod.normal_rho = 0.0; // we don't use it anymore and this is just a dummy + accum_state->legacy_pod.normal_is_valid = false; + accum_state->legacy_pod.normal_y = 0.0; + accum_state->next_float_normal_sample = 0.0f; + accum_state->is_next_float_normal_sample_valid = false; + if (this->next_double_normal_sample_) { + accum_state->legacy_pod.normal_is_valid = true; + accum_state->legacy_pod.normal_y = *(this->next_double_normal_sample_); + } + if (this->next_float_normal_sample_) { + accum_state->is_next_float_normal_sample_valid = true; + accum_state->next_float_normal_sample = *(this->next_float_normal_sample_); + } + + memcpy(rng_state, accum_state.get(), size); + return state_tensor.getIntrusivePtr(); +} + /** * Gets the DeviceType of CPUGeneratorImpl. * Used for type checking during run time. 
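Note (illustrative, not part of the patch): the new CPUGeneratorImpl::set_state()/get_state() overrides above are what back the torch.get_rng_state()/torch.set_rng_state() helpers referenced in the comments; a minimal sketch of the round trip through a CPU ByteTensor, using only the public torch API:

import torch

saved = torch.get_rng_state()   # snapshot of the default CPU generator as a ByteTensor
a = torch.rand(3)               # advances the generator
torch.set_rng_state(saved)      # restore the snapshot
b = torch.rand(3)               # replays the same draw
assert torch.equal(a, b)
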
diff --git a/aten/src/ATen/CPUGeneratorImpl.h b/aten/src/ATen/CPUGeneratorImpl.h index eceb338966fd..f8b43a04c73c 100644 --- a/aten/src/ATen/CPUGeneratorImpl.h +++ b/aten/src/ATen/CPUGeneratorImpl.h @@ -17,6 +17,8 @@ struct TORCH_API CPUGeneratorImpl : public c10::GeneratorImpl { void set_current_seed(uint64_t seed) override; uint64_t current_seed() const override; uint64_t seed() override; + void set_state(const c10::TensorImpl& new_state) override; + c10::intrusive_ptr get_state() const override; static DeviceType device_type(); uint32_t random(); uint64_t random64(); diff --git a/aten/src/ATen/CUDAGeneratorImpl.h b/aten/src/ATen/CUDAGeneratorImpl.h index 9a9febd01f8e..1179a049aa08 100644 --- a/aten/src/ATen/CUDAGeneratorImpl.h +++ b/aten/src/ATen/CUDAGeneratorImpl.h @@ -129,8 +129,10 @@ struct TORCH_CUDA_API CUDAGeneratorImpl : public c10::GeneratorImpl { void set_current_seed(uint64_t seed) override; uint64_t current_seed() const override; uint64_t seed() override; + void set_state(const c10::TensorImpl& new_state) override; + c10::intrusive_ptr get_state() const override; void set_philox_offset_per_thread(uint64_t offset); - uint64_t philox_offset_per_thread(); + uint64_t philox_offset_per_thread() const; void capture_prologue(int64_t* offset_extragraph); uint64_t capture_epilogue(); PhiloxCudaState philox_cuda_state(uint64_t increment); diff --git a/aten/src/ATen/Dispatch.h b/aten/src/ATen/Dispatch.h index 41252609953f..341e20cab1f3 100644 --- a/aten/src/ATen/Dispatch.h +++ b/aten/src/ATen/Dispatch.h @@ -10,6 +10,9 @@ #include #include +#ifdef XPLAT_MOBILE_BUILD +#include +#else namespace at { /** * The method should_include_kernel_dtype() returns true/false @@ -25,6 +28,7 @@ inline constexpr bool should_include_kernel_dtype( return true; } } +#endif /** * In the Facebook internal build (using BUCK), this macro is enabled by @@ -93,26 +97,6 @@ inline constexpr bool should_include_kernel_dtype( return __VA_ARGS__(); \ } -// This macro should be used to skip bfloat16 dispatch on non-ROCm platforms and -// should be removed once the bfloat16 bringup is complete on other platforms. -// This is supposed to be used as a wrapper around the lambda function passed to -// the dispatch macro and will conditionally dispatch ops with bfloat16 type -// only on ROCm. -#if !defined(__HIP_PLATFORM_HCC__) -#define AT_SKIP_BFLOAT16_IF_NOT_ROCM(SCALARTYPE, NAME, ...) \ - if (std::is_same::value) { \ - AT_ERROR( \ - #NAME, \ - " not implemented for '", \ - toString(at::ScalarType::BFloat16), \ - "'"); \ - } else { \ - return __VA_ARGS__(); \ - } -#else -#define AT_SKIP_BFLOAT16_IF_NOT_ROCM(SCALARTYPE, NAME, ...) 
return __VA_ARGS__() -#endif - namespace detail { inline at::ScalarType scalar_type(at::ScalarType s) { diff --git a/aten/src/ATen/ParallelOpenMP.cpp b/aten/src/ATen/ParallelOpenMP.cpp index 07fc4e279557..261f6cdd46b5 100644 --- a/aten/src/ATen/ParallelOpenMP.cpp +++ b/aten/src/ATen/ParallelOpenMP.cpp @@ -1,4 +1,5 @@ #include +#include #if AT_PARALLEL_OPENMP #include diff --git a/aten/src/ATen/TensorIndexing.h b/aten/src/ATen/TensorIndexing.h index 3890662123a2..f6c3bbbe09cc 100644 --- a/aten/src/ATen/TensorIndexing.h +++ b/aten/src/ATen/TensorIndexing.h @@ -10,6 +10,8 @@ // There is some back story, see https://github.com/pytorch/pytorch/issues/48684 #include +#include + namespace at { namespace indexing { @@ -261,14 +263,15 @@ static inline void recordTensorIndex(const Tensor& tensor, std::vector& (*dim_ptr)++; }; -static inline std::vector typeConvertIndices(const Tensor& self, std::vector&& indices) { - std::vector converted_inds(indices.size()); +static inline c10::List> typeConvertIndices(const Tensor& self, std::vector&& indices) { + c10::List> converted_inds; + converted_inds.reserve(indices.size()); for (size_t i = 0; i < indices.size(); ++i) { const auto &ind = indices[i]; if (ind.defined()) { - converted_inds[i] = ind.to(ind.options().device(self.device())); + converted_inds.push_back(ind.to(ind.options().device(self.device()))); } else { - converted_inds[i] = std::move(indices[i]); + converted_inds.push_back(std::move(indices[i])); } } return converted_inds; diff --git a/aten/src/ATen/VmapTransforms.h b/aten/src/ATen/VmapTransforms.h index 5063beeb08b0..8fa085245459 100644 --- a/aten/src/ATen/VmapTransforms.h +++ b/aten/src/ATen/VmapTransforms.h @@ -96,8 +96,17 @@ struct VmapPhysicalToLogicalMap; // The levels bitset specifies which vmap levels correspond to the batch // dimensions at the front of the tensor. In particular, the number of set bits // corresponds to the number of batch dimensions on `tensor` and the rightmost -// bit of `levels` specifies the minimum number of nested vmaps we are in at +// bit of `levels` specifies the maximum number of nested vmaps we are in at // this point in time. +// For example, given: +// physical_view = VmapPhysicalView(tensor=ones(2, 3, 4, 5, 6), levels={1, 3}) +// +// Rightmost bit of `levels` is 3 indicating the number of nested vmaps less +// than or equal to 3. +// bitset: 010100 +// ^ +// | +// levels: 012345 struct TORCH_API VmapPhysicalView { VmapPhysicalView(Tensor&& tensor, std::bitset levels) : levels_(levels), tensor_(tensor) { diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index 8c82f965ef0f..9a2f34257c57 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -239,13 +239,9 @@ Therefore, for the moment, this is all copy pasted in from VariableTypeEverythin m.impl(TORCH_SELECTIVE_NAME("aten::" REGISTER_NAME), \ &WrapFunction::type::call); -#define KERNEL_UNBOXED_ONLY(FUNC, REGISTER_NAME, SIGNATURE, POLICY) \ - m.impl_UNBOXED(TORCH_SELECTIVE_NAME("aten::" REGISTER_NAME), \ - &WrapFunction::type::call); - // Less-common but still useful case: redispatching to a function with a new signature (e.g. 
appending a dtype) -#define KERNEL_UNBOXED_ONLY_DIFFERENT_REDISPATCH_SIGNATURE(REDISPATCH_FUNC, REGISTER_NAME, REGISTER_SIGNATURE, REDISPATCH_SIGNATURE, POLICY) \ - m.impl_UNBOXED(TORCH_SELECTIVE_NAME("aten::" REGISTER_NAME), \ +#define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE(REDISPATCH_FUNC, REGISTER_NAME, REGISTER_SIGNATURE, REDISPATCH_SIGNATURE, POLICY) \ + m.impl(TORCH_SELECTIVE_NAME("aten::" REGISTER_NAME), \ &WrapFunction::type::call); /***************************************** @@ -367,20 +363,20 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) { KERNEL(ADD_NS(binary_cross_entropy_with_logits), "binary_cross_entropy_with_logits", Tensor (const Tensor &, const Tensor &, const c10::optional&, const c10::optional&, int64_t), fp32) KERNEL(ADD_NS(dist), "dist", Tensor (const Tensor &, const Tensor &, Scalar), fp32) KERNEL(ADD_NS(pdist), "pdist", Tensor (const Tensor &, double), fp32) - KERNEL_UNBOXED_ONLY(ADD_NS(cdist), "cdist", Tensor (const Tensor &, const Tensor &, double, c10::optional), fp32) + KERNEL(ADD_NS(cdist), "cdist", Tensor (const Tensor &, const Tensor &, double, c10::optional), fp32) KERNEL(ADD_NS(renorm), "renorm", Tensor (const Tensor &, Scalar, int64_t, Scalar), fp32) // fp32_set_opt_dtype KERNEL(ADD_NS(prod), "prod", Tensor (const Tensor &, c10::optional), fp32_set_opt_dtype) KERNEL(ADD_NS(prod), "prod.dim_int", Tensor (const Tensor &, int64_t, bool, c10::optional), fp32_set_opt_dtype) - KERNEL_UNBOXED_ONLY(ADD_NS(prod), "prod.dim_Dimname", Tensor (const Tensor &, Dimname, bool, c10::optional), fp32_set_opt_dtype) + KERNEL(ADD_NS(prod), "prod.dim_Dimname", Tensor (const Tensor &, Dimname, bool, c10::optional), fp32_set_opt_dtype) KERNEL(ADD_NS(softmax), "softmax.int", Tensor (const Tensor &, int64_t, c10::optional), fp32_set_opt_dtype) - KERNEL_UNBOXED_ONLY(ADD_NS(softmax), "softmax.Dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) + KERNEL(ADD_NS(softmax), "softmax.Dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) KERNEL(ADD_NS(log_softmax), "log_softmax.int", Tensor (const Tensor &, int64_t, c10::optional), fp32_set_opt_dtype) - KERNEL_UNBOXED_ONLY(ADD_NS(log_softmax), "log_softmax.Dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) + KERNEL(ADD_NS(log_softmax), "log_softmax.Dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) KERNEL(ADD_NS(cumprod), "cumprod", Tensor (const Tensor &, int64_t, c10::optional), fp32_set_opt_dtype) - KERNEL_UNBOXED_ONLY(ADD_NS(cumprod), "cumprod.dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) + KERNEL(ADD_NS(cumprod), "cumprod.dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) KERNEL(ADD_NS(cumsum), "cumsum", Tensor (const Tensor &, int64_t, c10::optional), fp32_set_opt_dtype) - KERNEL_UNBOXED_ONLY(ADD_NS(cumsum), "cumsum.dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) + KERNEL(ADD_NS(cumsum), "cumsum.dimname", Tensor (const Tensor &, Dimname, c10::optional), fp32_set_opt_dtype) // commenting these out because they accept an explicit (not-optional) dtype, and we shouldn't try to flip that even // when autocasting. 
// KERNEL(ADD_NS(norm), "norm.ScalarOpt_dtype", Tensor (const Tensor &, c10::optional, ScalarType), fp32_set_opt_dtype) @@ -388,25 +384,25 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) { // KERNEL(ADD_NS(norm), "norm.names_ScalarOpt_dim_dtype", Tensor (const Tensor &, c10::optional, DimnameList, bool, ScalarType), fp32_set_opt_dtype) KERNEL(ADD_NS(sum), "sum", Tensor (const Tensor &, c10::optional), fp32_set_opt_dtype) KERNEL(ADD_NS(sum), "sum.dim_IntList", Tensor (const Tensor &, IntArrayRef, bool, c10::optional), fp32_set_opt_dtype) - KERNEL_UNBOXED_ONLY(ADD_NS(sum), "sum.dim_DimnameList", Tensor (const Tensor &, DimnameList, bool, c10::optional), fp32_set_opt_dtype) + KERNEL(ADD_NS(sum), "sum.dim_DimnameList", Tensor (const Tensor &, DimnameList, bool, c10::optional), fp32_set_opt_dtype) // fp32_append_dtype // The fp32_append_dtype wrapper overrides implicit promotion behavior. // norm does not implicitly promote, but be aware when adding new ops to this policy. - KERNEL_UNBOXED_ONLY_DIFFERENT_REDISPATCH_SIGNATURE(ADD_NS(norm), "norm.Scalar", Tensor (const Tensor &, Scalar), Tensor (const Tensor &, c10::optional, ScalarType), fp32_append_dtype) - KERNEL_UNBOXED_ONLY_DIFFERENT_REDISPATCH_SIGNATURE(ADD_NS(norm), "norm.ScalarOpt_dim", Tensor (const Tensor &, c10::optional, IntArrayRef, bool), Tensor (const Tensor &, c10::optional, IntArrayRef, bool, ScalarType), fp32_append_dtype) - KERNEL_UNBOXED_ONLY_DIFFERENT_REDISPATCH_SIGNATURE(ADD_NS(norm), "norm.names_ScalarOpt_dim", Tensor (const Tensor &, c10::optional, DimnameList, bool), Tensor (const Tensor &, c10::optional, DimnameList, bool, ScalarType), fp32_append_dtype) + KERNEL_DIFFERENT_REDISPATCH_SIGNATURE(ADD_NS(norm), "norm.Scalar", Tensor (const Tensor &, Scalar), Tensor (const Tensor &, c10::optional, ScalarType), fp32_append_dtype) + KERNEL_DIFFERENT_REDISPATCH_SIGNATURE(ADD_NS(norm), "norm.ScalarOpt_dim", Tensor (const Tensor &, c10::optional, IntArrayRef, bool), Tensor (const Tensor &, c10::optional, IntArrayRef, bool, ScalarType), fp32_append_dtype) + KERNEL_DIFFERENT_REDISPATCH_SIGNATURE(ADD_NS(norm), "norm.names_ScalarOpt_dim", Tensor (const Tensor &, c10::optional, DimnameList, bool), Tensor (const Tensor &, c10::optional, DimnameList, bool, ScalarType), fp32_append_dtype) // promote KERNEL(ADD_NS(addcdiv), "addcdiv", Tensor (const Tensor &, const Tensor &, const Tensor &, Scalar), promote) KERNEL(ADD_NS(addcmul), "addcmul", Tensor (const Tensor &, const Tensor &, const Tensor &, Scalar), promote) KERNEL(ADD_NS(atan2), "atan2", Tensor (const Tensor &, const Tensor &), promote) KERNEL(ADD_NS(bilinear), "bilinear", Tensor (const Tensor &, const Tensor &, const Tensor &, const c10::optional&), promote) KERNEL(ADD_NS(cat), "cat", Tensor (TensorList, int64_t), promote) - KERNEL_UNBOXED_ONLY(ADD_NS(cat), "cat.names", Tensor (TensorList, Dimname), promote) + KERNEL(ADD_NS(cat), "cat.names", Tensor (TensorList, Dimname), promote) KERNEL(ADD_NS(_cat), "_cat", Tensor (TensorList, int64_t), promote) KERNEL(ADD_NS(cross), "cross", Tensor (const Tensor &, const Tensor &, c10::optional), promote) KERNEL(ADD_NS(dot), "dot", Tensor (const Tensor &, const Tensor &), promote) KERNEL(ADD_NS(equal), "equal", bool (const Tensor &, const Tensor &), promote) - KERNEL_UNBOXED_ONLY(ADD_NS(index_put), "index_put", Tensor (const Tensor &, TensorList, const Tensor &, bool), promote) + KERNEL(ADD_NS(index_put), "index_put", Tensor (const Tensor &, const torch::List>&, const Tensor &, bool), promote) KERNEL(ADD_NS(stack), "stack", Tensor (TensorList, 
int64_t), promote) KERNEL(ADD_NS(tensordot), "tensordot", Tensor (const Tensor &, const Tensor &, IntArrayRef, IntArrayRef), promote) diff --git a/aten/src/ATen/core/Generator.cpp b/aten/src/ATen/core/Generator.cpp new file mode 100644 index 000000000000..800f8c7c88ec --- /dev/null +++ b/aten/src/ATen/core/Generator.cpp @@ -0,0 +1,16 @@ +#include +#include +#include + +namespace at { + +void Generator::set_state(const at::Tensor& new_state) { + TORCH_CHECK(new_state.defined(), "Undefined tensor is not allowed"); + this->impl_->set_state(*new_state.unsafeGetTensorImpl()); +} + +at::Tensor Generator::get_state() const { + return at::Tensor::wrap_tensor_impl(this->impl_->get_state()); +} + +} // namespace at diff --git a/aten/src/ATen/core/Generator.h b/aten/src/ATen/core/Generator.h index de3f6e46f8f2..b5bbb2fe3c74 100644 --- a/aten/src/ATen/core/Generator.h +++ b/aten/src/ATen/core/Generator.h @@ -56,6 +56,8 @@ namespace at { +class Tensor; + struct TORCH_API Generator { Generator() {} @@ -96,6 +98,12 @@ struct TORCH_API Generator { uint64_t seed() { return impl_->seed(); } + // Implementation not inlined to prevent cycle reference between + // `ATen/core/Generator.h` and `ATen/core/Tensor.h` + void set_state(const at::Tensor& new_state); + + at::Tensor get_state() const; + std::mutex& mutex() { return impl_->mutex_; } @@ -130,4 +138,24 @@ Generator make_generator(Args&&... args) { return Generator(c10::make_intrusive(std::forward(args)...)); } +namespace detail { + +/** + * Helper function for checking the validity of new random generator + * state. Right now following conditions are checked: + * + * - The new state tensor must be a torch.ByteTensor + * - Data of the new state tensor must be contiguous + */ +static inline void check_rng_state(const c10::TensorImpl& new_state) { + TORCH_CHECK_TYPE( + new_state.layout() == kStrided && new_state.device().type() == kCPU && new_state.dtype() == kByte, + "RNG state must be a torch.ByteTensor" + ); + + TORCH_CHECK(new_state.is_contiguous(), "RNG state must be contiguous"); +} + +} // namespace detail + } // namespace at diff --git a/aten/src/ATen/core/List.h b/aten/src/ATen/core/List.h index 40f733784fe5..f911722c51e1 100644 --- a/aten/src/ATen/core/List.h +++ b/aten/src/ATen/core/List.h @@ -243,7 +243,7 @@ class List final { * Example: * List a({2, 3, 4}); */ - explicit List(std::initializer_list initial_values); + List(std::initializer_list initial_values); explicit List(ArrayRef initial_values); /** diff --git a/aten/src/ATen/core/List_inl.h b/aten/src/ATen/core/List_inl.h index 3cbd7a310275..ab3ddae55770 100644 --- a/aten/src/ATen/core/List_inl.h +++ b/aten/src/ATen/core/List_inl.h @@ -1,7 +1,7 @@ #pragma once +#include #include -#include namespace c10 { @@ -50,7 +50,17 @@ List::List(TypePtr elementType) namespace impl { template List toTypedList(impl::GenericList list) { - TORCH_INTERNAL_ASSERT(*getTypePtr() == *list.impl_->elementType, "Tried to cast a List<", toString(list.impl_->elementType), "> to a List<", toString(getTypePtr()), ">. Types mismatch."); + // If there's other instances of the list (i.e. list.use_count() > 1), then we have to be invariant + // because upcasting would allow people to add types into the new list that would break the old list. + // However, if there aren't any other instances of this list (i.e. list.use_count() == 1), then we can + // allow upcasting. This can be a perf improvement since we can cast List to List> + // without having to copy it. 
This is also used to provide backwards compatibility with some old models + // that serialized the index arguments to aten::index, aten::index_put, aten::index_put_ and aten::index_put_impl_ + // as List before we changed that argument to be List>. When deserializing, we + // have list.use_count() == 1 and can deserialize the List directly as List>. + TORCH_CHECK(*list.impl_->elementType == *getTypePtr() + || (list.use_count() == 1 && list.impl_->elementType->isSubtypeOf(getTypePtr())) + , "Tried to cast a List<", toString(list.impl_->elementType), "> to a List<", toString(getTypePtr()), ">. Types mismatch."); return List(std::move(list.impl_)); } @@ -312,3 +322,5 @@ void List::unsafeSetElementType(TypePtr t) { impl_->elementType = std::move(t); } } + +#include diff --git a/aten/src/ATen/core/Variadic.h b/aten/src/ATen/core/Variadic.h index b49d94bba1c8..d33f3d575177 100644 --- a/aten/src/ATen/core/Variadic.h +++ b/aten/src/ATen/core/Variadic.h @@ -6,6 +6,7 @@ #include #include +#include namespace at { @@ -56,6 +57,15 @@ struct IterArgs { } } + template + void operator()(const torch::List& args) { + for (const auto& arg : args) { + self()(arg); + if (self().short_circuit()) + return; + } + } + // NB: we need to specify std::vector manually as C++ won't // do an implicit conversion to make a template deduction go through. template diff --git a/aten/src/ATen/core/boxing/KernelFunction.cpp b/aten/src/ATen/core/boxing/KernelFunction.cpp index f84352ebee1f..58c35557018c 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.cpp +++ b/aten/src/ATen/core/boxing/KernelFunction.cpp @@ -57,25 +57,4 @@ bool KernelFunction::_equalsBoxedAndUnboxed(const KernelFunction& other) const { unboxed_kernel_func_ == other.unboxed_kernel_func_; } -void KernelFunction::checkBoxedKernel(const OperatorHandle& opHandle) const { - if (C10_UNLIKELY(boxed_kernel_func_ == nullptr)) { - if (unboxed_kernel_func_ == nullptr) { - TORCH_INTERNAL_ASSERT( - false, - "Tried to call KernelFunction::callBoxed() on an uninitialized KernelFunction.", - " opname: ", - opHandle.operator_name(), - " If you're using mobile selective build please make sure to include all ops exported from `torch.jit.export_opnames(model)`."); - } else { - // TODO We want to introduce the invariant that all kernels must be callable in a boxed way, then this case should be impossible. - TORCH_INTERNAL_ASSERT( - false, - "Tried to call KernelFunction::callBoxed() on a KernelFunction that can only be called with KernelFunction::call().", - " opname: ", - opHandle.operator_name(), - " If you're using mobile selective build please make sure to include all ops exported from `torch.jit.export_opnames(model)`."); - } - } -} - } // namespace c10 diff --git a/aten/src/ATen/core/boxing/KernelFunction.h b/aten/src/ATen/core/boxing/KernelFunction.h index 6817907b12b1..ddbbd912777a 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.h +++ b/aten/src/ATen/core/boxing/KernelFunction.h @@ -123,26 +123,6 @@ class TORCH_API KernelFunction final { template static KernelFunction makeFromUnboxedFunctor(std::unique_ptr kernelFunctor); - /** - * Create a KernelFunction from an unboxed functor and prevent creation of an - * unboxing-wrapper. This means that you cannot call this KernelFunction - * using KernelFunction::callBoxed() - * - * This is necessary because our unboxing wrappers don't work for all types - * yet, so if you want to use one of these types as function arguments, - * you need to use makeFromUnboxedOnlyFunctor. 
- * - * Example: - * - * > class MyFunctor final { - * > public: - * > Tensor operator()(Tensor a, Tensor b) {...} - * > }; - * > KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunctor(std::make_unique()); - */ - template - static KernelFunction makeFromUnboxedOnlyFunctor(std::unique_ptr kernelFunctor); - /** * Create a KernelFunction from an unboxed function. * This is usually better than KernelFunction::makeFromUnboxedRuntimeFunction @@ -158,23 +138,6 @@ class TORCH_API KernelFunction final { template static KernelFunction makeFromUnboxedFunction(FuncPtr); - /** - * Create a KernelFunction from an unboxed function and prevent creation of an - * unboxing-wrapper. This means that you cannot call this KernelFunction - * using KernelFunction::callBoxed() - * - * This is necessary because our unboxing wrappers don't work for all types - * yet, so if you want to use one of these types as function arguments, - * you need to use makeFromUnboxedOnlyFunctor. - * - * Example: - * - * > Tensor unboxed_func(Tensor a, Tensor b) {...} - * > KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunction(); - */ - template - static KernelFunction makeFromUnboxedOnlyFunction(FuncPtr); - /** * Create a KernelFunction from an unboxed function. * KernelFunction::makeFromUnboxedFunction is usually a better choice than @@ -189,9 +152,6 @@ class TORCH_API KernelFunction final { template static KernelFunction makeFromUnboxedRuntimeFunction(FuncType* func); - template - static KernelFunction makeFromUnboxedOnlyRuntimeFunction(FuncType* func); - static KernelFunction makeFallthrough(); static KernelFunction makeAmbiguousAutogradOther(); static KernelFunction makeNamedNotSupported(); @@ -213,12 +173,6 @@ class TORCH_API KernelFunction final { // For testing internal invariants only bool _equalsBoxedAndUnboxed(const KernelFunction&) const; - // This function is a temporary hack that allows generated_unboxing_wrappers.cpp to register its codegen'ed - // unboxing wrapper for aten operators. We still need those for some operators because not all work - // with the templated unboxing logic yet. - // TODO Delete setManuallyBoxedKernel_ once all operators work with the templated boxing logic. This can be done once https://github.com/pytorch/pytorch/issues/32366 is fixed. - void setManuallyBoxedKernel_(InternalBoxedKernelFunction* func); - private: explicit KernelFunction(std::unique_ptr functor, InternalBoxedKernelFunction* boxed_kernel_func, void* unboxed_kernel_func); @@ -226,8 +180,6 @@ class TORCH_API KernelFunction final { template static void make_boxed_function(OperatorKernel*, const OperatorHandle& opHandle, Stack* stack); - void checkBoxedKernel(const OperatorHandle& opHandle) const; - OperatorKernel* getFunctor_() const; std::shared_ptr functor_; diff --git a/aten/src/ATen/core/boxing/KernelFunction_impl.h b/aten/src/ATen/core/boxing/KernelFunction_impl.h index 82a65fa27ffb..b248e54a6f94 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_impl.h +++ b/aten/src/ATen/core/boxing/KernelFunction_impl.h @@ -23,8 +23,7 @@ inline void KernelFunction::make_boxed_function(OperatorKernel*, const OperatorH } inline bool KernelFunction::isValid() const { - // TODO We want to introduce the invariant that all kernels must be callable in a boxed way, then this should only check boxed_kernel_func_. 
- return boxed_kernel_func_ != nullptr || unboxed_kernel_func_ != nullptr; + return boxed_kernel_func_ != nullptr; } inline bool KernelFunction::isFallthrough() const { @@ -32,7 +31,10 @@ inline bool KernelFunction::isFallthrough() const { } inline void KernelFunction::callBoxed(const OperatorHandle& opHandle, Stack* stack) const { - checkBoxedKernel(opHandle); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + boxed_kernel_func_ != nullptr, + "Tried to call KernelFunction::callBoxed() on an uninitialized KernelFunction." + ); (*boxed_kernel_func_)(functor_.get(), opHandle, stack); } @@ -111,21 +113,6 @@ inline KernelFunction KernelFunction::makeFromUnboxedFunctor(std::unique_ptr -inline KernelFunction KernelFunction::makeFromUnboxedOnlyFunctor(std::unique_ptr kernelFunctor) { - // TODO We want to get rid of kernels that have only an unboxed function pointer. - // All kernels should have a boxed pointer. - - static_assert(guts::is_functor::value, "Tried to call KernelFunction::makeFromUnboxedFunctor but the argument is not a functor."); - static_assert(std::is_base_of::value, "Tried to call KernelFunction::makeFromUnboxedFunctor, but the functor doesn't inherit from c10::OperatorKernel. Please have the functor inherit from it."); - - return KernelFunction( - std::move(kernelFunctor), - nullptr, // Don't create a boxed kernel for this - reinterpret_cast(&impl::wrap_kernel_functor_unboxed::call) - ); -} - template inline KernelFunction KernelFunction::makeFromUnboxedFunction(FuncPtr func_ptr) { static_assert(is_compile_time_function_pointer::value, "Tried to call KernelFunction::makeFromUnboxedFunction with an invalid parameter. It must be a function pointer created with TORCH_FN."); @@ -144,26 +131,6 @@ inline KernelFunction KernelFunction::makeFromUnboxedFunction(FuncPtr func_ptr) #endif } -template -inline KernelFunction KernelFunction::makeFromUnboxedOnlyFunction(FuncPtr func_ptr) { - // TODO We want to get rid of kernels that have only an unboxed function pointer. - // All kernels should have a boxed pointer. - static_assert(is_compile_time_function_pointer::value, "Tried to call KernelFunction::makeFromUnboxedOnlyFunction with an invalid parameter. It must be a function pointer created with TORCH_FN."); - static_assert(!std::is_same::value, "Tried to call KernelFunction::makeFromUnboxedOnlyFunction with a boxed function pointer. Please use KernelFunction::makeFromBoxedFunction instead."); - static_assert(FuncPtr::func_ptr() != nullptr, "Kernel function cannot be nullptr"); - -#if !defined(C10_MOBILE) - return makeFromUnboxedOnlyFunctor::type> ( - guts::make_unique_base::type>() - ); -#else - // On mobile, we rather want to optimize for binary size than for performance, - // so let's not inline the kernel into the wrapper but use makeFromUnboxedOnlyRuntimeFunction - // instead. 
- return makeFromUnboxedOnlyRuntimeFunction(func_ptr.func_ptr()); -#endif -} - template inline KernelFunction KernelFunction::makeFromUnboxedRuntimeFunction(FuncType* func) { static_assert(guts::is_function_type::value, "Tried to call KernelFunction::makeFromUnboxedRuntimeFunction with a non-function type."); @@ -175,17 +142,6 @@ inline KernelFunction KernelFunction::makeFromUnboxedRuntimeFunction(FuncType* f ); } -template -inline KernelFunction KernelFunction::makeFromUnboxedOnlyRuntimeFunction(FuncType* func) { - static_assert(guts::is_function_type::value, "Tried to call KernelFunction::makeFromUnboxedRuntimeFunction with a non-function type."); - static_assert(!std::is_same::value, "Tried to call KernelFunction::makeFromUnboxedRuntimeFunction with a boxed function pointer. Please use KernelFunction::makeFromBoxedFunction instead."); - TORCH_INTERNAL_ASSERT(func != nullptr, "Kernel function cannot be nullptr"); - - return makeFromUnboxedOnlyFunctor>>( - guts::make_unique_base>>(func) - ); -} - template inline std::enable_if_t>::value, KernelFunction> KernelFunction::makeFromUnboxedLambda(Lambda&& lambda) { static_assert(guts::is_functor>::value, "Tried to call KernelFunction::makeFromUnboxedLambda with a non-lambda type."); @@ -212,14 +168,4 @@ inline std::enable_if_t>::value, ); } -inline void KernelFunction::setManuallyBoxedKernel_(InternalBoxedKernelFunction* func) { - if (boxed_kernel_func_ == &fallthrough_kernel) { - // special case no-op - return; - } - TORCH_INTERNAL_ASSERT(boxed_kernel_func_ == nullptr, "Tried to set a manually boxed kernel for a kernel that already has a boxed kernel set."); - TORCH_INTERNAL_ASSERT(unboxed_kernel_func_ != nullptr, "Tried to set a manually boxed kernel for an invalid KernelFunction."); - boxed_kernel_func_ = func; -} - } diff --git a/aten/src/ATen/core/boxing/KernelFunction_test.cpp b/aten/src/ATen/core/boxing/KernelFunction_test.cpp index 8ba50db14a2b..e17efab10ba5 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_test.cpp +++ b/aten/src/ATen/core/boxing/KernelFunction_test.cpp @@ -544,26 +544,6 @@ TEST(KernelFunctionTest, givenUnboxedFunctor_withoutReturn_whenCallingUnboxed_th kernels::expectUnboxedCallingWithoutReturnWorks(func); } -TEST(KernelFunctionTest, givenUnboxedOnlyFunctor_withReturn_whenCallingBoxed_thenFails) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunctor(std::unique_ptr(std::make_unique())); - kernels::expectBoxedCallingFailsWith(func, "Tried to call KernelFunction::callBoxed() on a KernelFunction that can only be called with KernelFunction::call()"); -} - -TEST(KernelFunctionTest, givenUnboxedOnlyFunctor_withoutReturn_whenCallingBoxed_thenFails) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunctor(std::unique_ptr(std::make_unique())); - kernels::expectBoxedCallingFailsWith(func, "Tried to call KernelFunction::callBoxed() on a KernelFunction that can only be called with KernelFunction::call()"); -} - -TEST(KernelFunctionTest, givenUnboxedOnlyFunctor_withReturn_whenCallingUnboxed_thenWorks) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunctor(std::unique_ptr(std::make_unique())); - kernels::expectUnboxedCallingWithReturnWorks(func); -} - -TEST(KernelFunctionTest, givenUnboxedOnlyFunctor_withoutReturn_whenCallingUnboxed_thenWorks) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunctor(std::unique_ptr(std::make_unique())); - kernels::expectUnboxedCallingWithoutReturnWorks(func); -} - TEST(KernelFunctionTest, 
givenUnboxedFunction_withReturn_whenCallingBoxed_thenWorks) { KernelFunction func = KernelFunction::makeFromUnboxedFunction(TORCH_FN(kernels::unboxed_function_with_return)); kernels::expectBoxedCallingWithReturnWorks(func); @@ -584,26 +564,6 @@ TEST(KernelFunctionTest, givenUnboxedFunction_withoutReturn_whenCallingUnboxed_t kernels::expectUnboxedCallingWithoutReturnWorks(func); } -TEST(KernelFunctionTest, givenUnboxedOnlyFunction_withReturn_whenCallingBoxed_thenFails) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunction(TORCH_FN(kernels::unboxed_function_with_return)); - kernels::expectBoxedCallingFailsWith(func, "Tried to call KernelFunction::callBoxed() on a KernelFunction that can only be called with KernelFunction::call()"); -} - -TEST(KernelFunctionTest, givenUnboxedOnlyFunction_withoutReturn_whenCallingBoxed_thenFails) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunction(TORCH_FN(kernels::unboxed_function_without_return)); - kernels::expectBoxedCallingFailsWith(func, "Tried to call KernelFunction::callBoxed() on a KernelFunction that can only be called with KernelFunction::call()"); -} - -TEST(KernelFunctionTest, givenUnboxedOnlyFunction_withReturn_whenCallingUnboxed_thenWorks) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunction(TORCH_FN(kernels::unboxed_function_with_return)); - kernels::expectUnboxedCallingWithReturnWorks(func); -} - -TEST(KernelFunctionTest, givenUnboxedOnlyFunction_withoutReturn_whenCallingUnboxed_thenWorks) { - KernelFunction func = KernelFunction::makeFromUnboxedOnlyFunction(TORCH_FN(kernels::unboxed_function_without_return)); - kernels::expectUnboxedCallingWithoutReturnWorks(func); -} - TEST(KernelFunctionTest, givenUnboxedRuntimeFunction_withReturn_whenCallingBoxed_thenWorks) { KernelFunction func = KernelFunction::makeFromUnboxedRuntimeFunction(&kernels::unboxed_function_with_return); kernels::expectBoxedCallingWithReturnWorks(func); diff --git a/aten/src/ATen/core/builtin_function.h b/aten/src/ATen/core/builtin_function.h index 8bfb4f7e9d16..adeaa1039638 100644 --- a/aten/src/ATen/core/builtin_function.h +++ b/aten/src/ATen/core/builtin_function.h @@ -101,8 +101,17 @@ struct BuiltinOpFunction : public Function { } std::string pretty_print_schema() const override { + #ifdef __NVCC__ + // Disable the "statement is unreachable" warning + #pragma diag_suppress code_is_unreachable + #endif + TORCH_INTERNAL_ASSERT(false); return ""; + + #ifdef __NVCC__ + #pragma diag_default code_is_unreachable + #endif } Function& setSchema(c10::FunctionSchema schema) override { diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp index 5e3e91afbb45..270cffaf6d1f 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.cpp +++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp @@ -295,12 +295,6 @@ void Dispatcher::checkInvariants() const { } } -void Dispatcher::setManuallyBoxedKernelFor_(const OperatorHandle& op, KernelFunction::InternalBoxedKernelFunction* func) { - std::lock_guard lock(mutex_); - op.operatorIterator_->op.setManuallyBoxedKernel_(*this, func); - // NB: Do not need to set manually boxed kernel for backend fallbacks -} - std::vector Dispatcher::findDanglingImpls() const { return operatorLookupTable_.read([&] (const ska::flat_hash_map& operatorLookupTable) -> std::vector { std::vector opsWithDanglingImpls; diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index 60f9f9bd0579..d83653f75363 100644 --- 
a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -182,12 +182,6 @@ class TORCH_API Dispatcher final { */ RegistrationHandleRAII registerLibrary(std::string ns, std::string debug); - // This function is a temporary hack that allows generated_unboxing_wrappers.cpp to register its codegen'ed - // unboxing wrapper for aten operators. We still need those for some operators because not all work - // with the templated unboxing logic yet. - // TODO Delete setBoxedKernelFor_ once all operators work with the templated boxing logic - void setManuallyBoxedKernelFor_(const OperatorHandle& op, KernelFunction::InternalBoxedKernelFunction* func); - // ------------------------------------------------------------------------ // // Listeners on registrations @@ -310,7 +304,9 @@ class TORCH_API OperatorHandle { // smuggle in a kernel that is typed incorrectly). For everything // in core library this won't happen, because all the static registrations // will be done by the time a typed() handle is acquired. +#if !defined C10_MOBILE operatorIterator_->op.assertSignatureIsCorrect(); +#endif return TypedOperatorHandle(operatorIterator_); } diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index f0d7bc6968ed..7c3698beeb06 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -21,7 +21,6 @@ OperatorEntry::OperatorEntry(OperatorName&& operator_name) , schema_() , dispatchTable_() , dispatchKeyExtractor_(DispatchKeyExtractor::makeUninitialized()) -, manuallyBoxedKernel_() , kernels_() , cpp_signature_() , is_observed_(ObservedOperators::isObserved(name_)) @@ -122,10 +121,6 @@ std::list::iterator OperatorEntry::registerKernel( ); } - if (manuallyBoxedKernel_.has_value()) { - kernel.setManuallyBoxedKernel_(*manuallyBoxedKernel_); - } - k.emplace_front(std::move(kernel), std::move(inferred_function_schema), std::move(debug)); std::list::iterator inserted = k.begin(); // update the dispatch table, i.e. re-establish the invariant @@ -331,19 +326,6 @@ void OperatorEntry::updateDispatchTableFull_(const c10::Dispatcher& dispatcher) } } -void OperatorEntry::setManuallyBoxedKernel_(const c10::Dispatcher& dispatcher, KernelFunction::InternalBoxedKernelFunction* func) { - TORCH_INTERNAL_ASSERT(!manuallyBoxedKernel_); - manuallyBoxedKernel_ = func; - - for (auto& kv : kernels_) { - for (auto& k : kv.second) { - k.kernel.setManuallyBoxedKernel_(func); - } - } - // Refresh entries in dispatchTable_ - updateDispatchTableFull_(dispatcher); -} - void OperatorEntry::checkInvariants() const { if (schema_) { TORCH_INTERNAL_ASSERT(schema_->schema.operator_name() == name_, dumpState()); diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.h b/aten/src/ATen/core/dispatch/OperatorEntry.h index 5098fd0d8c28..44b8fac5661e 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.h +++ b/aten/src/ATen/core/dispatch/OperatorEntry.h @@ -148,12 +148,6 @@ class TORCH_API OperatorEntry final { const DispatchKeyExtractor& dispatchKeyExtractor() const { return dispatchKeyExtractor_; } - // This function is a temporary hack that allows generated_unboxing_wrappers.cpp to register its codegen'ed - // unboxing wrapper for aten operators. We still need those for some operators because not all work - // with the templated unboxing logic yet. 
- // TODO Delete setManuallyBoxedKernel_ once all operators work with the templated boxing logic - void setManuallyBoxedKernel_(const c10::Dispatcher& dispatcher, KernelFunction::InternalBoxedKernelFunction* func); - // Asserts that the given FuncType is correct for calling this operator in an unboxed way. template void assertSignatureIsCorrect() { @@ -189,12 +183,6 @@ class TORCH_API OperatorEntry final { std::array(DispatchKey::NumDispatchKeys)> dispatchTable_; DispatchKeyExtractor dispatchKeyExtractor_; - // This manuallyBoxedKernel_ member is a temporary hack that allows generated_unboxing_wrappers.cpp to register its codegen'ed - // unboxing wrapper for aten operators. We still need those for some operators because not all work - // with the templated unboxing logic yet. - // TODO Delete manuallyBoxedKernel_ once all operators work with the templated boxing logic - c10::optional manuallyBoxedKernel_; - // kernels_ stores all registered kernels for the corresponding dispatch key // and catchAllKernels_ stores the catch-all kernels. // If an operator library gets loaded that overwrites an already existing kernel, diff --git a/aten/src/ATen/core/function_schema.h b/aten/src/ATen/core/function_schema.h index 720d274ec5b2..624ded76ffda 100644 --- a/aten/src/ATen/core/function_schema.h +++ b/aten/src/ATen/core/function_schema.h @@ -107,7 +107,7 @@ struct Argument { c10::optional N_; c10::optional default_value_; - // is this only specifyable as a keyword argument? + // is this only specifiable as a keyword argument? bool kwarg_only_; c10::optional alias_info_; }; diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index 8065300f0b32..f99dc3c07058 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -17,6 +17,7 @@ namespace c10 { #define FORALL_NS_SYMBOLS(_) \ _(namespaces, prim) \ _(namespaces, aten) \ + _(namespaces, cuda) \ _(namespaces, onnx) \ _(namespaces, attr) \ _(namespaces, scope) \ @@ -284,6 +285,9 @@ namespace c10 { _(aten, zero_) \ _(aten, fill_) \ _(aten, masked_fill_) \ + _(cuda, _set_device) \ + _(cuda, set_stream) \ + _(cuda, _current_device) \ _(aten, swapaxes) \ _(aten, swapaxes_) \ _(aten, swapdims) \ @@ -383,6 +387,7 @@ namespace c10 { #define FORALL_NS_SYMBOLS(_) \ _(namespaces, prim) \ _(namespaces, aten) \ + _(namespaces, cuda) \ _(namespaces, onnx) \ _(namespaces, attr) \ _(namespaces, scope) \ @@ -453,6 +458,7 @@ struct TORCH_API Symbol { // (and if it's not, you should add it to the built-ins list above.) 
static Symbol attr(const std::string & s); static Symbol aten(const std::string & s); + static Symbol cuda(const std::string & s); static Symbol onnx(const std::string & s); static Symbol prim(const std::string & s); static Symbol user(const std::string & s); @@ -463,6 +469,7 @@ struct TORCH_API Symbol { bool is_attr() const; bool is_aten() const; + bool is_cuda() const; bool is_prim() const; bool is_onnx() const; bool is_user() const; @@ -523,6 +530,7 @@ FORALL_NS_SYMBOLS(DEFINE_SYMBOL) inline Symbol Symbol::attr(const std::string & s) { return Symbol::fromQualString("attr::" + s); } inline Symbol Symbol::aten(const std::string & s) { return Symbol::fromQualString("aten::" + s); } +inline Symbol Symbol::cuda(const std::string & s) { return Symbol::fromQualString("cuda::" + s); } inline Symbol Symbol::onnx(const std::string & s) { return Symbol::fromQualString("onnx::" + s); } inline Symbol Symbol::prim(const std::string & s) { return Symbol::fromQualString("prim::" + s); } inline Symbol Symbol::scope(const std::string & s) { return Symbol::fromQualString("scope::" + s); } @@ -531,6 +539,7 @@ inline Symbol Symbol::caffe2(const std::string & s) { return Symbol::fromQualStr inline Symbol Symbol::dimname(const std::string & s) { return Symbol::fromQualString("dimname::" + s); } inline bool Symbol::is_attr() const { return ns() == namespaces::attr; } inline bool Symbol::is_aten() const { return ns() == namespaces::aten; } +inline bool Symbol::is_cuda() const { return ns() == namespaces::cuda; } inline bool Symbol::is_prim() const { return ns() == namespaces::prim; } inline bool Symbol::is_onnx() const { return ns() == namespaces::onnx; } inline bool Symbol::is_user() const { return ns() == namespaces::user; } diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index c05e7313fa63..1223577c59c6 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -125,7 +125,7 @@ TypePtr IValue::type() const { void IValue::visit(const std::function& visitor) const { if (visitor(*this)) { - // Short cut. 
+ // Shortcut return; } switch (this->tag) { @@ -265,7 +265,7 @@ bool IValue::ptrEqual(const IValue& lhs, const IValue& rhs) { TORCH_INTERNAL_ASSERT(lhs.is_intrusive_ptr); TORCH_INTERNAL_ASSERT(rhs.is_intrusive_ptr); return lhs.tag == rhs.tag && - lhs.payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr; + lhs.payload.u.as_intrusive_ptr == rhs.payload.u.as_intrusive_ptr; } IValue IValue::equals(const IValue& rhs) const { @@ -325,17 +325,17 @@ size_t IValue::hash(const IValue& v) { case Tag::None: return 0; case Tag::Bool: - return c10::get_hash(v.payload.as_bool); + return c10::get_hash(v.payload.u.as_bool); case Tag::Double: - return c10::get_hash(v.payload.as_double); + return c10::get_hash(v.payload.u.as_double); case Tag::Tensor: // Tensor __hash__ is equivalent to `id()`, so take the pointer value of // the tensor to emulate it - return c10::get_hash(v.payload.as_int); + return c10::get_hash(v.payload.as_tensor.unsafeGetTensorImpl()); case Tag::Storage: - return c10::get_hash(v.payload.as_int); + return c10::get_hash(v.payload.u.as_int); case Tag::Int: - return c10::get_hash(v.payload.as_int); + return c10::get_hash(v.payload.u.as_int); case Tag::String: return c10::get_hash(v.toStringRef()); case Tag::Tuple: diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 4a7e15c4008b..ca68a8df46e1 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -131,10 +131,15 @@ struct Capsule { // they are marked `@private`, which hides them on the doxygen documentation for // this page. -/// IValue (Interpreter Value) is a tagged union over the types supported by the -/// TorchScript interpreter. IValues contain their values as an -/// `IValue::Payload`, which holds primitive types (`int64_t`, `bool`, `double`, -/// `Device`), as values and all other types as a `c10::intrusive_ptr`. +/// IValue (Interpreter Value) is a tagged union over the types +/// supported by the TorchScript interpreter. IValues contain their +/// values as an `IValue::Payload`, which holds primitive types +/// (`int64_t`, `bool`, `double`, `Device`) and `Tensor` as values, +/// and all other types as a `c10::intrusive_ptr`. In order to +/// optimize performance of the destructor and related operations by +/// making the `Tensor` and `c10::intrusive_ptr` paths generate the +/// same code, we represent a null `c10::intrusive_ptr` as +/// `UndefinedTensorImpl::singleton()`, *not* `nullptr`. /// /// IValues are used as inputs to and outputs from the TorchScript interpreter. 
/// To retrieve the value contained within an IValue, use the `.toX()` methods, @@ -160,27 +165,35 @@ struct Capsule { struct TORCH_API IValue final { IValue(const IValue& rhs) : IValue(rhs.payload, rhs.tag, rhs.is_intrusive_ptr) { - if (is_intrusive_ptr) { - c10::raw::intrusive_ptr::incref(payload.as_intrusive_ptr); + if (is_intrusive_ptr && payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton()) { + c10::raw::intrusive_ptr::incref(payload.u.as_intrusive_ptr); } } - IValue(IValue&& rhs) noexcept : IValue() { - swap(rhs); + + IValue(IValue&& rhs) noexcept : tag(rhs.tag), is_intrusive_ptr(rhs.is_intrusive_ptr) { + moveFrom(std::move(rhs)); } + /// @private [doxygen private] ~IValue() { - if (is_intrusive_ptr) { - c10::raw::intrusive_ptr::decref(payload.as_intrusive_ptr); - } + destroy(); } - IValue& operator=(IValue&& rhs) & noexcept { - IValue(std::move(rhs)).swap(*this); // this also sets rhs to None + + C10_ALWAYS_INLINE IValue& operator=(IValue&& rhs) & noexcept { + if (&rhs == this) { + return *this; + } + + destroy(); + moveFrom(std::move(rhs)); return *this; } + IValue& operator=(IValue const& rhs) & { IValue(rhs).swap(*this); return *this; } + void dump() const; /** @@ -260,6 +273,13 @@ struct TORCH_API IValue final { return false; } + // Tensors should be compared based on internal storage + if (this->isTensor()) { + const auto& thisTensor = this->toTensor(); + const auto& rhsTensor = rhs.toTensor(); + return thisTensor.is_alias_of(rhsTensor); + } + if (!this->is_intrusive_ptr) { // Primitive types don't alias anything return false; @@ -267,29 +287,49 @@ struct TORCH_API IValue final { AT_ASSERT(rhs.is_intrusive_ptr); - // Tensors should be compared based on internal storage - if (this->isTensor()) { - const auto thisTensor = this->toTensor(); - const auto rhsTensor = rhs.toTensor(); - return thisTensor.is_alias_of(rhsTensor); - } - // Other types can be compared by their ptr value - return this->payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr; + return this->payload.u.as_intrusive_ptr == rhs.payload.u.as_intrusive_ptr; } /// @private [doxygen private] size_t use_count() const noexcept { + if (isTensor()) { + return payload.as_tensor.use_count(); + } + if (!is_intrusive_ptr) { return 1; } - return c10::raw::intrusive_ptr::use_count(payload.as_intrusive_ptr); + if (payload.u.as_intrusive_ptr == c10::UndefinedTensorImpl::singleton()) { + return 0; + } + return c10::raw::intrusive_ptr::use_count(payload.u.as_intrusive_ptr); } /// @private [doxygen private] void swap(IValue& rhs) noexcept { - std::swap(payload, rhs.payload); + if (isTensor() && rhs.isTensor()) { + std::swap(payload.as_tensor, rhs.payload.as_tensor); + } else if (isTensor()) { + at::Tensor t = std::move(payload.as_tensor); + // As far as I can tell, omitting the usual explicit destructor call + // is not UB in and of itself, and it's a slight perf win. The + // destructor is a no-op, because the moved-from Tensor is + // effectively an intrusive_ptr in the null state, so we don't need + // the behavior for correctness reasons either. Leaving this + // explanatory comment, including commented-out destructor call, to + // make this abundantly clear. 
+ // + // payload.as_tensor.~Tensor(); + payload.u = rhs.payload.u; + new (&rhs.payload.as_tensor) at::Tensor(std::move(t)); + } else if (rhs.isTensor()) { + rhs.swap(*this); + return; + } else { + std::swap(payload.u, rhs.payload.u); + } std::swap(is_intrusive_ptr, rhs.is_intrusive_ptr); std::swap(tag, rhs.tag); } @@ -298,21 +338,17 @@ struct TORCH_API IValue final { // While some of these accessors could be generated through templates, // we prefer to write them manually for clarity - IValue(at::Tensor t) : tag(Tag::Tensor), is_intrusive_ptr(t.defined()) { - // Note: the undefined tensor is not refcounted, so while it - // is tagged as a tensor, is_intrusive_ptr is set to false. - // This is not an optional optimization: our incref call - // *will not* do the right thing when called on an - // undefined tensor. - payload.as_intrusive_ptr = t.unsafeReleaseTensorImpl(); + IValue(at::Tensor t) : tag(Tag::Tensor), is_intrusive_ptr(false) { + new (&payload.as_tensor) at::Tensor(std::move(t)); } bool isTensor() const { return Tag::Tensor == tag; } at::Tensor toTensor() &&; - at::Tensor toTensor() const&; + at::Tensor& toTensor() &; + const at::Tensor& toTensor() const&; at::TensorImpl* unsafeToTensorImpl() const { - return static_cast(payload.as_intrusive_ptr); + return payload.as_tensor.unsafeGetTensorImpl(); } IValue(at::Storage s) : tag(Tag::Storage), is_intrusive_ptr(static_cast(s)) { @@ -321,7 +357,7 @@ struct TORCH_API IValue final { // This is not an optional optimization: our incref call // *will not* do the right thing when called on an // undefined tensor. - payload.as_intrusive_ptr = s.unsafeReleaseStorageImpl(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(s.unsafeReleaseStorageImpl()); } bool isStorage() const { return Tag::Storage == tag; @@ -341,7 +377,7 @@ struct TORCH_API IValue final { : tag(Tag::Blob), is_intrusive_ptr(true) { // TODO (after Tensor merge) If we pass in a Blob holding a Tensor, extract // and store it as a Tensor instead. 
- payload.as_intrusive_ptr = blob.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(blob.release()); } /// @private [doxygen private] @@ -397,14 +433,14 @@ struct TORCH_API IValue final { // Double IValue(double d) : tag(Tag::Double), is_intrusive_ptr(false) { - payload.as_double = d; + payload.u.as_double = d; } bool isDouble() const { return Tag::Double == tag; } double toDouble() const { AT_ASSERT(isDouble()); - return payload.as_double; + return payload.u.as_double; } // Future @@ -433,7 +469,7 @@ struct TORCH_API IValue final { // Int IValue(int64_t i) : tag(Tag::Int), is_intrusive_ptr(false) { - payload.as_int = i; + payload.u.as_int = i; } // allow you to pass literals (3, 4) without ambiguity @@ -445,7 +481,7 @@ struct TORCH_API IValue final { int64_t toInt() const { AT_ASSERT(isInt()); - return payload.as_int; + return payload.u.as_int; } // Bool @@ -454,9 +490,9 @@ struct TORCH_API IValue final { // Initializing entire payload stops valgrind's from reporting // "jump or move depends on uninitialised value" in IValue copy constructor // See https://github.com/pytorch/pytorch/issues/37117 - payload.as_int = b; + payload.u.as_int = b; #else - payload.as_bool = b; + payload.u.as_bool = b; #endif } bool isBool() const { @@ -464,7 +500,7 @@ struct TORCH_API IValue final { } bool toBool() const { AT_ASSERT(isBool()); - return payload.as_bool; + return payload.u.as_bool; } // IntList @@ -580,7 +616,7 @@ struct TORCH_API IValue final { c10::intrusive_ptr toEnumHolder() const&; // None - IValue() : payload{0}, tag(Tag::None), is_intrusive_ptr(false) {} + IValue() : tag(Tag::None), is_intrusive_ptr(false) {} bool isNone() const { return Tag::None == tag; } @@ -616,21 +652,21 @@ struct TORCH_API IValue final { // Device IValue(c10::Device d) : tag(Tag::Device), is_intrusive_ptr(false) { - payload.as_device.type = d.type(); - payload.as_device.index = d.index(); + payload.u.as_device.type = d.type(); + payload.u.as_device.index = d.index(); } bool isDevice() const { return Tag::Device == tag; } c10::Device toDevice() const { AT_ASSERT(isDevice()); - return c10::Device(payload.as_device.type, payload.as_device.index); + return c10::Device(payload.u.as_device.type, payload.u.as_device.index); } //Stream IValue(c10::Stream stream) : tag(Tag::Stream), is_intrusive_ptr(false) { - payload.as_int = stream.pack(); + payload.u.as_int = stream.pack(); } c10::Stream toStream() &&; c10::Stream toStream() const &; @@ -659,7 +695,7 @@ struct TORCH_API IValue final { // QScheme IValue(at::QScheme qscheme) : tag(Tag::Int), is_intrusive_ptr(false) { - payload.as_int = static_cast(qscheme); + payload.u.as_int = static_cast(qscheme); } at::QScheme toQScheme() const { @@ -680,7 +716,7 @@ struct TORCH_API IValue final { // This is not an optional optimization: our incref call // *will not* do the right thing when called on an // undefined generator. 
- payload.as_intrusive_ptr = g.unsafeReleaseGeneratorImpl(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(g.unsafeReleaseGeneratorImpl()); } bool isGenerator() const { return Tag::Generator == tag; @@ -749,14 +785,19 @@ struct TORCH_API IValue final { const IValue& v); bool isPtrType() const { - return is_intrusive_ptr; + return (isTensor() && payload.as_tensor.defined()) || is_intrusive_ptr; } /// @private [doxygen private] const void* internalToPointer() const { TORCH_INTERNAL_ASSERT( isPtrType(), "Can only call internalToPointer() for pointer types"); - return payload.as_intrusive_ptr; + if (isTensor()) { + return payload.as_tensor.unsafeGetTensorImpl(); + } else { + return payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton() + ? payload.u.as_intrusive_ptr : nullptr; + } } TypePtr type() const; @@ -770,7 +811,7 @@ struct TORCH_API IValue final { } // If it is not a Tensor, then two mutable IValues alias each other only // if they are the same pointer. - return val.payload.as_int; + return val.payload.u.as_int; } }; @@ -800,6 +841,10 @@ struct TORCH_API IValue final { IValue deepcopy(HashAliasedIValueMap& memo) const; private: + static c10::intrusive_ptr_target* null_to_undefined_tensor(c10::intrusive_ptr_target* p) { + return p ? p : static_cast(c10::UndefinedTensorImpl::singleton()); + } + static bool ptrEqual(const IValue& lhs, const IValue& rhs); // NOTE: IValue tags are intentionally private. In the future we may encode // this value different (e.g. using NaN boxing), and this would make it more @@ -822,24 +867,77 @@ struct TORCH_API IValue final { class NullType = c10::detail::intrusive_target_default_null_type> c10::intrusive_ptr toIntrusivePtr() const; - void clearToNone() { - payload.as_int = 0; + void destroy() { + // We carefully construct this call to both 1) avoid UB by using + // the "wrong" one of as_tensor and as_intrusive_ptr and 2) enable + // the compiler to generate the same code for each case. It is + // surprisingly difficult to get this right. + if (isTensor() || is_intrusive_ptr) { + c10::intrusive_ptr_target* p = isTensor() ? payload.as_tensor.unsafeGetTensorImpl() : payload.u.as_intrusive_ptr; + c10::intrusive_ptr::reclaim(p); + // No need to make this destructor call! + // payload.as_tensor.~Tensor(); + } + } + + C10_ALWAYS_INLINE void moveFrom(IValue&& rhs) noexcept { + if (rhs.isTensor()) { + new (&payload.as_tensor) at::Tensor(std::move(rhs.payload.as_tensor)); + // As far as I can tell, omitting the usual explicit destructor call + // is not UB in and of itself, and it's a slight perf win. The + // destructor is a no-op, because the moved-from Tensor is + // effectively an intrusive_ptr in the null state, so we don't need + // the behavior for correctness reasons either. Leaving this + // explanatory comment, including commented-out destructor call, to + // make this abundantly clear. + // + // rhs.payload.as_tensor.~Tensor(); + } else { + payload.u = rhs.payload.u; + } + tag = rhs.tag; + is_intrusive_ptr = rhs.is_intrusive_ptr; + rhs.clearToNone(); + } + + void clearToNone() noexcept { + payload.u.as_int = 0; tag = Tag::None; is_intrusive_ptr = false; } union Payload { - int64_t as_int; - double as_double; - bool as_bool; - c10::intrusive_ptr_target* as_intrusive_ptr; - struct { - DeviceType type; - DeviceIndex index; - } as_device; + // We use a nested union here so that we can make the copy easy + // and efficient in the non-tensor (i.e., trivially copyable) + // case. 
Specifically, we do not have to do a switch-on-tag to + // figure out which union member to assign; we can just use + // TriviallyCopyablePayload::operator=. + union TriviallyCopyablePayload { + TriviallyCopyablePayload() : as_int(0) {} + int64_t as_int; + double as_double; + bool as_bool; + // Invariant: never nullptr; null state is represented as + // c10::UndefinedTensorImpl::singleton() for consistency of + // representation with Tensor. + c10::intrusive_ptr_target* as_intrusive_ptr; + struct { + DeviceType type; + DeviceIndex index; + } as_device; + } u; + at::Tensor as_tensor; + Payload() : u() {} + ~Payload() {} }; - IValue(Payload p, Tag t, bool i) : payload(p), tag(t), is_intrusive_ptr(i) {} + IValue(const Payload& p, Tag t, bool i) : tag(t), is_intrusive_ptr(i) { + if (isTensor()) { + new (&payload.as_tensor) at::Tensor(p.as_tensor); + } else { + payload.u = p.u; + } + } Payload payload; Tag tag; @@ -848,29 +946,36 @@ struct TORCH_API IValue final { }; struct TORCH_API WeakIValue final { - WeakIValue() : payload{0}, tag(IValue::Tag::None), is_intrusive_ptr(false) {} + WeakIValue() : tag(IValue::Tag::None), is_intrusive_ptr(false) {} WeakIValue(const WeakIValue& rhs) : payload(rhs.payload), tag(rhs.tag), is_intrusive_ptr(rhs.is_intrusive_ptr) { - if (is_intrusive_ptr) { + if (is_intrusive_ptr && payload.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton()) { c10::raw::weak_intrusive_ptr::incref(payload.as_intrusive_ptr); } } WeakIValue(const IValue& rhs) - : payload(rhs.payload), - tag(rhs.tag), + : tag(rhs.tag), is_intrusive_ptr(rhs.is_intrusive_ptr) { + if (rhs.isTensor()) { + payload.as_intrusive_ptr = rhs.unsafeToTensorImpl(); + is_intrusive_ptr = true; + } else { + payload = rhs.payload.u; + } if (is_intrusive_ptr) { - c10::raw::weak_intrusive_ptr::incref(payload.as_intrusive_ptr); + if (payload.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton()) { + c10::raw::weak_intrusive_ptr::incref(payload.as_intrusive_ptr); + } } } WeakIValue(WeakIValue&& rhs) noexcept : WeakIValue() { swap(rhs); } ~WeakIValue() { - if (is_intrusive_ptr) { + if (is_intrusive_ptr && payload.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton()) { c10::raw::weak_intrusive_ptr::decref(payload.as_intrusive_ptr); } } @@ -895,17 +1000,33 @@ struct TORCH_API WeakIValue final { IValue lock() const { if (!is_intrusive_ptr) { - return IValue(payload, tag, false); + IValue::Payload newPayload; + newPayload.u = payload; + return IValue(newPayload, tag, false); } - auto temp = c10::weak_intrusive_ptr::reclaim( - payload.as_intrusive_ptr); - IValue::Payload pl; - pl.as_intrusive_ptr = temp.lock().release(); - temp.release(); - if (!pl.as_intrusive_ptr) { - return IValue(); + if (IValue::Tag::Tensor == tag) { + auto temp = c10::weak_intrusive_ptr::reclaim( + static_cast(payload.as_intrusive_ptr)); + c10::intrusive_ptr ip(temp.lock()); + temp.release(); + if (!ip) { + return IValue(); + } else { + return IValue(at::Tensor(std::move(ip))); + } } else { - return IValue(pl, tag, true); + auto temp = c10::weak_intrusive_ptr::reclaim( + payload.as_intrusive_ptr == c10::UndefinedTensorImpl::singleton() + ? 
nullptr + : payload.as_intrusive_ptr); + IValue::Payload pl; + pl.u.as_intrusive_ptr = temp.lock().release(); + temp.release(); + if (!pl.u.as_intrusive_ptr) { + return IValue(); + } else { + return IValue(pl, tag, true); + } } } @@ -913,7 +1034,7 @@ struct TORCH_API WeakIValue final { if (!is_intrusive_ptr) { return 1; } - auto temp = c10::weak_intrusive_ptr::reclaim( + auto temp = c10::weak_intrusive_ptr::reclaim( payload.as_intrusive_ptr); size_t result = temp.use_count(); temp.release(); @@ -924,7 +1045,7 @@ struct TORCH_API WeakIValue final { if (!is_intrusive_ptr) { return 1; } - auto temp = c10::weak_intrusive_ptr::reclaim( + auto temp = c10::weak_intrusive_ptr::reclaim( payload.as_intrusive_ptr); size_t result = temp.weak_use_count(); temp.release(); @@ -935,7 +1056,8 @@ struct TORCH_API WeakIValue final { } private: - IValue::Payload payload; + using Payload = IValue::Payload::TriviallyCopyablePayload; + Payload payload; IValue::Tag tag; bool is_intrusive_ptr; }; diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index 89c8e669c138..b96f4b834989 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -48,14 +48,18 @@ struct tagged_capsule { template c10::intrusive_ptr IValue::moveToIntrusivePtr() { auto t = c10::intrusive_ptr::reclaim( - static_cast(payload.as_intrusive_ptr)); + payload.u.as_intrusive_ptr == c10::UndefinedTensorImpl::singleton() + ? NullType::singleton() + : static_cast(payload.u.as_intrusive_ptr)); clearToNone(); return t; } template c10::intrusive_ptr IValue::toIntrusivePtr() const { auto r = c10::intrusive_ptr::reclaim( - static_cast(payload.as_intrusive_ptr)); + payload.u.as_intrusive_ptr == c10::UndefinedTensorImpl::singleton() + ? NullType::singleton() + : static_cast(payload.u.as_intrusive_ptr)); auto p = r; r.release(); return p; @@ -131,12 +135,26 @@ inline c10::intrusive_ptr IValue::toEnumHolder() const& { } inline at::Tensor IValue::toTensor() && { AT_ASSERT(isTensor(), "Expected Tensor but got ", tagKind()); - return at::Tensor( - moveToIntrusivePtr()); + auto result = std::move(payload.as_tensor); + // As far as I can tell, omitting the usual explicit destructor call + // is not UB in and of itself, and it's a slight perf win. The + // destructor is a no-op, because the moved-from Tensor is + // effectively an intrusive_ptr in the null state, so we don't need + // the behavior for correctness reasons either. Leaving this + // explanatory comment, including commented-out destructor call, to + // make this abundantly clear. 
+ // + // payload.as_tensor.~Tensor(); + clearToNone(); + return result; } -inline at::Tensor IValue::toTensor() const& { +inline at::Tensor& IValue::toTensor() & { AT_ASSERT(isTensor(), "Expected Tensor but got ", tagKind()); - return at::Tensor(toIntrusivePtr()); + return payload.as_tensor; +} +inline const at::Tensor& IValue::toTensor() const& { + AT_ASSERT(isTensor(), "Expected Tensor but got ", tagKind()); + return payload.as_tensor; } inline c10::Storage IValue::toStorage() && { AT_ASSERT(isStorage(), "Expected Storage but got ", tagKind()); @@ -148,10 +166,10 @@ inline c10::Storage IValue::toStorage() const& { return c10::Storage(toIntrusivePtr()); } inline c10::Stream IValue::toStream() && { - return c10::Stream::unpack(payload.as_int); + return c10::Stream::unpack(payload.u.as_int); } inline c10::Stream IValue::toStream() const& { - return c10::Stream::unpack(payload.as_int); + return c10::Stream::unpack(payload.u.as_int); } inline c10::intrusive_ptr IValue::toBlob() && { AT_ASSERT(isBlob(), "Expected Blob but got ", tagKind()); @@ -713,7 +731,8 @@ using _guarded_unsigned_long = std::conditional_t< inline const ivalue::Object& IValue::toObjectRef() const { AT_ASSERT(isObject(), "Expected Object but got ", tagKind()); - return *static_cast(payload.as_intrusive_ptr); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), "Attempted to create null reference"); + return *static_cast(payload.u.as_intrusive_ptr); } // note: when adding a DEFINE_TO case here you should also add a @@ -729,6 +748,7 @@ inline const ivalue::Object& IValue::toObjectRef() const { inline type IValue::to() const& { \ return this->method_name(); \ } + DEFINE_TO(at::Tensor, toTensor) DEFINE_TO(at::Storage, toStorage) DEFINE_TO(c10::Stream, toStream) @@ -980,8 +1000,11 @@ inline c10::List IValue::toIntList() const& { } inline std::vector IValue::toIntVector() const { AT_ASSERT(isIntList(), "Expected IntList but got ", tagKind()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called toIntVector on null intrusive_ptr IValue"); return createVectorFromList( - static_cast(payload.as_intrusive_ptr)); + static_cast(payload.u.as_intrusive_ptr)); } inline c10::List IValue::toDoubleList() && { AT_ASSERT(isDoubleList(), "Expected DoubleList but got ", tagKind()); @@ -993,8 +1016,11 @@ inline c10::List IValue::toDoubleList() const& { } inline std::vector IValue::toDoubleVector() const { AT_ASSERT(isDoubleList(), "Expected DoubleList but got ", tagKind()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called toDoubleVector on null intrusive_ptr IValue"); return createVectorFromList( - static_cast(payload.as_intrusive_ptr)); + static_cast(payload.u.as_intrusive_ptr)); } inline c10::List IValue::toBoolList() && { AT_ASSERT(isBoolList(), "Expected BoolList but got ", tagKind()); @@ -1014,8 +1040,11 @@ inline c10::List IValue::toTensorList() const& { } inline std::vector IValue::toTensorVector() const { AT_ASSERT(isTensorList(), "Expected TensorList but got ", tagKind()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called toTensorVector on null intrusive_ptr IValue"); return createVectorFromList( - static_cast(payload.as_intrusive_ptr)); + static_cast(payload.u.as_intrusive_ptr)); } inline c10::List IValue::toList() && { AT_ASSERT(isList(), "Expected GenericList but got ", tagKind()); @@ 
-1027,7 +1056,10 @@ inline c10::List IValue::toList() const& { } inline c10::ArrayRef IValue::toListRef() const { AT_ASSERT(isList(), "Expected GenericList but got ", tagKind()); - return static_cast(payload.as_intrusive_ptr) + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called toListRef on null intrusive_ptr IValue"); + return static_cast(payload.u.as_intrusive_ptr) ->list; } inline c10::Dict IValue::toGenericDict() && { @@ -1049,7 +1081,7 @@ inline c10::intrusive_ptr IValue::toTuple() const& { inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Tuple), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } template < typename... Args, @@ -1065,14 +1097,14 @@ inline IValue::IValue(const std::tuple& t) inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::String), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(std::string v) : IValue(ivalue::ConstantString::create(std::move(v))) {} inline IValue::IValue(c10::impl::GenericList v) : tag(Tag::GenericList), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.impl_.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.impl_.release()); } template > @@ -1104,7 +1136,7 @@ inline IValue::IValue(std::array v) : IValue(c10::List()) { inline IValue::IValue(c10::impl::GenericDict v) : tag(Tag::GenericDict), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.impl_.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.impl_.release()); } template inline IValue::IValue(c10::Dict v) @@ -1131,17 +1163,17 @@ inline IValue::IValue(c10::nullopt_t) : IValue() {} inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Object), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::PyObject), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Enum), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue IValue::make_capsule( @@ -1149,7 +1181,7 @@ inline IValue IValue::make_capsule( IValue iv; iv.tag = Tag::Capsule; iv.is_intrusive_ptr = true; - iv.payload.as_intrusive_ptr = blob.release(); + iv.payload.u.as_intrusive_ptr = null_to_undefined_tensor(blob.release()); return iv; } @@ -1170,30 +1202,33 @@ IValue::IValue(c10::intrusive_ptr custom_class) { auto ivalue_obj = c10::ivalue::Object::create( c10::StrongTypePtr(nullptr, classType), /*num_slots=*/1); ivalue_obj->setSlot(0, IValue::make_capsule(std::move(custom_class))); - payload.as_intrusive_ptr = ivalue_obj.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(ivalue_obj.release()); tag = Tag::Object; is_intrusive_ptr = true; } inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Future), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::RRef), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } 
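The pattern running through these constructors, routing every released pointer through `null_to_undefined_tensor()` so the stored `intrusive_ptr_target*` is never null, is what lets `destroy()` and `moveFrom()` above share one code path for the Tensor and non-Tensor cases. A deliberately simplified, standalone model of the idea (toy types, not the real c10 classes):

```cpp
#include <atomic>

// Toy stand-ins illustrating the "empty state is a sentinel object, never
// nullptr" convention used by the IValue payload.
struct Target {
  std::atomic<int> refcount;
  explicit Target(int rc) : refcount(rc) {}
};

// Plays the role of UndefinedTensorImpl::singleton(): one immortal object
// that represents the empty state for every handle kind.
inline Target* sentinel() {
  static Target s(1);
  return &s;
}

struct Handle {
  Target* p;  // Invariant (as in the IValue payload comment): never nullptr.
  Handle() : p(sentinel()) {}
  explicit Handle(Target* t) : p(t ? t : sentinel()) {}  // ~ null_to_undefined_tensor()
  Handle(const Handle& rhs) : p(rhs.p) {
    if (p != sentinel()) ++p->refcount;
  }
  Handle& operator=(const Handle&) = delete;  // kept minimal for the sketch
  ~Handle() {
    // One shared destruction path regardless of what the handle holds; the
    // only test is against a single well-known constant, not per-kind tag
    // dispatch.
    if (p != sentinel() && --p->refcount == 0) {
      delete p;
    }
  }
};
```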
inline IValue::IValue(c10::intrusive_ptr v) : tag(Tag::Quantizer), is_intrusive_ptr(true) { - payload.as_intrusive_ptr = v.release(); + payload.u.as_intrusive_ptr = null_to_undefined_tensor(v.release()); } inline const std::string& IValue::toStringRef() const { AT_ASSERT(isString(), "Expected String but got ", tagKind()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called toStringRef on null intrusive_ptr IValue"); return static_cast( - payload.as_intrusive_ptr) + payload.u.as_intrusive_ptr) ->string(); } inline c10::optional> IValue:: @@ -1202,8 +1237,11 @@ inline c10::optional> IValue:: return c10::nullopt; } AT_ASSERT(isString(), "Expected optional but got ", tagKind()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(), + "called toOptionalStringRef on null intrusive_ptr IValue"); return std::reference_wrapper( - static_cast(payload.as_intrusive_ptr) + static_cast(payload.u.as_intrusive_ptr) ->string()); } @@ -1241,15 +1279,13 @@ inline bool IValue::isSameIdentity(const IValue& rhs) const { // for bool type, do equality check return this->toBool() == rhs.toBool(); } else if (this->isTensor() && rhs.isTensor()) { - // for tensor type, just check the as_intrusive_ptr since is_intrusive_ptr - // is false for undefined tensor - return this->payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr; + return this->payload.as_tensor.is_same(rhs.payload.as_tensor); } else if (this->isTensor() && rhs.isNone()) { // special case: undefined tensor and None are the same identity - return !this->is_intrusive_ptr; + return !this->payload.as_tensor.defined(); } else if (this->isNone() && rhs.isTensor()) { // special case: undefined tensor and None are the same identity - return !rhs.is_intrusive_ptr; + return !rhs.payload.as_tensor.defined(); } else if (this->isInt() && rhs.isInt()) { return this->toInt() == rhs.toInt(); } else if (this->isDouble() && rhs.isDouble()) { @@ -1260,7 +1296,7 @@ inline bool IValue::isSameIdentity(const IValue& rhs) const { // for objects holding in IValue, do shallow compare on pointer address to // testify the identity return this->is_intrusive_ptr && rhs.is_intrusive_ptr && - this->payload.as_intrusive_ptr == rhs.payload.as_intrusive_ptr; + this->payload.u.as_intrusive_ptr == rhs.payload.u.as_intrusive_ptr; } } diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index f6902cd4beb6..7d3890f582b8 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -1,10 +1,11 @@ #pragma once +#include #include #include #include -#include #include +#include #include #include @@ -17,197 +18,17 @@ struct ClassType; namespace torch { namespace jit { struct CompilationUnit; +struct Function; } // namespace jit } // namespace torch namespace c10 { +struct IValue; struct FunctionSchema; struct NamedType; using OptNameList = c10::optional>; -#define C10_FORALL_TYPES(_) \ - _(AnyType) \ - _(EnumType) \ - _(AnyEnumType) \ - _(TensorType) \ - _(StorageType) \ - _(TupleType) \ - _(ListType) \ - _(DictType) \ - _(NumberType) \ - _(FloatType) \ - _(FutureType) \ - _(RRefType) \ - _(IntType) \ - _(NoneType) \ - _(StringType) \ - _(GeneratorType) \ - _(QuantizerType) \ - _(BoolType) \ - _(OptionalType) \ - _(VarType) \ - _(DeviceObjType) \ - _(StreamObjType) \ - _(FunctionType) \ - _(ClassType) \ - _(PyObjectType) \ - _(CapsuleType) \ - _(InterfaceType) \ - _(QSchemeType) \ - _(LayoutType) \ - _(ScalarTypeType) \ - _(AnyListType) \ 
- _(AnyTupleType) \ - _(AnyClassType) - -enum class TypeKind { -#define DEFINE_TYPE(T) T, - C10_FORALL_TYPES(DEFINE_TYPE) -#undef DEFINE_TYPE -}; - -TORCH_API const char* typeKindToString(TypeKind kind); - -struct Type; -using TypePtr = std::shared_ptr; -using ConstTypePtr = std::shared_ptr; - -// Use this to customize how a Type is printed using `annotation_str()`. If -// c10::nullopt is returned, `annotation_str()` falls through to its default -// implementation. -using TypePrinter = - std::function(const ConstTypePtr&)>; - -struct TORCH_API Type : std::enable_shared_from_this { - private: - TypeKind kind_; - - protected: - Type(TypeKind kind) : kind_(kind) {} - - virtual std::string annotation_str_impl(TypePrinter printer) const { - return str(); - } - - public: - virtual bool operator==(const Type& rhs) const = 0; - - // subtyping relation. By default, we return true for the case - // when the type is exactly equal or if this <: T where rhs = Optional[T] - - // if this returns false and the why_not stream is non-null, it contains - // additional details that describe why this is not a subtype of 'rhs'. - // This additional information should only contain details that are not obvious - // from the annotation_str() that describes the type. For instance it is clear that `int <: str` is false - // but not clear why `Foo <: InterfaceBar` might be false. - virtual bool isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const; - virtual bool is_module() const; - bool isSubtypeOf(const TypePtr& rhs) const { - return isSubtypeOfExt(rhs, nullptr); - } - - // How this type will appear in FunctionSchema declarations - virtual std::string str() const = 0; - - // How this type will appear as if it were a type annotation in Python - // which is sometimes different than how it appears in declarations (e.g. - // int[] vs List[int]) - // - // Takes a custom printer that users can pass in to customize the output of - // this method. - std::string annotation_str(TypePrinter printer) const { - if (printer) { - // the printer can return nullopt to fall through to the default impl - if (auto renamed = printer(shared_from_this())) { - return *renamed; - } - } - return annotation_str_impl(printer); - } - std::string annotation_str() const { - // Overload instead of define a default value for `printer` to help - // debuggers out. - return annotation_str(nullptr); - } - - // Returns a human readable string that includes additional information like - // "type is inferred rather than explictly defined" to help construct more - // user-friendly messages. - virtual std::string repr_str() const { - return annotation_str(); - } - - TypeKind kind() const { - return kind_; - } - - virtual bool requires_grad() const { - for (const auto& ct : containedTypes()) { - if (ct->requires_grad()) { - return true; - } - } - return false; - } - - // Dynamically cast this object to the subclass indicated by the - // template variable, returning nullptr if the cast is invalid. 
- template - std::shared_ptr cast() { - if (T::Kind == kind()) { - return std::static_pointer_cast(shared_from_this()); - } - return nullptr; - } - template - std::shared_ptr cast() const { - if (T::Kind == kind()) { - return std::static_pointer_cast(shared_from_this()); - } - return nullptr; - } - template - std::shared_ptr expect() { - auto r = cast(); - AT_ASSERT(r); - return r; - } - template - std::shared_ptr expect() const { - auto r = cast(); - AT_ASSERT(r); - return r; - } - virtual ~Type() = default; - virtual bool hasFreeVariables() const { - return false; - } - // list of types this type contains, e.g. for a List then element type of a - // list for a tuple, the types of the tuple elements - virtual at::ArrayRef containedTypes() const { - return {}; - } - // create a new version of this type, replacing its contained types with - // contained_types - TypePtr withContained(std::vector contained_types) { - auto current_contained = containedTypes(); - AT_ASSERT(current_contained.size() == contained_types.size()); - if (current_contained.equals(contained_types)) { - return shared_from_this(); - } - return createWithContained(std::move(contained_types)); - } - // per-type constructor, you only need to override this if the - // containedTypes() is not empty - virtual TypePtr createWithContained( - std::vector contained_types) const { - AT_ERROR( - "type with contained types did not overload createWithContained: ", - str()); - } -}; - struct AnyType; using AnyTypePtr = std::shared_ptr; // Any is the top of the type hierarchy, all other types are subtypes @@ -2549,19 +2370,19 @@ struct TORCH_API AnyClassType : public Type { inline bool IValue::isDoubleList() const { // note: avoids calling type() to avoid extra referencing counting for the returned type. 
- return isList() && static_cast(payload.as_intrusive_ptr)->elementType->kind() == FloatType::Kind; + return isList() && static_cast(payload.u.as_intrusive_ptr)->elementType->kind() == FloatType::Kind; } inline bool IValue::isTensorList() const { - return isList() && static_cast(payload.as_intrusive_ptr)->elementType->kind() == TensorType::Kind; + return isList() && static_cast(payload.u.as_intrusive_ptr)->elementType->kind() == TensorType::Kind; } inline bool IValue::isIntList() const { - return isList() && static_cast(payload.as_intrusive_ptr)->elementType->kind() == IntType::Kind; + return isList() && static_cast(payload.u.as_intrusive_ptr)->elementType->kind() == IntType::Kind; } inline bool IValue::isBoolList() const { - return isList() && static_cast(payload.as_intrusive_ptr)->elementType->kind() == BoolType::Kind; + return isList() && static_cast(payload.u.as_intrusive_ptr)->elementType->kind() == BoolType::Kind; } template<> diff --git a/aten/src/ATen/core/jit_type_base.h b/aten/src/ATen/core/jit_type_base.h new file mode 100644 index 000000000000..e5a6d48340cf --- /dev/null +++ b/aten/src/ATen/core/jit_type_base.h @@ -0,0 +1,221 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +#define C10_FORALL_TYPES(_) \ + _(AnyType) \ + _(EnumType) \ + _(AnyEnumType) \ + _(TensorType) \ + _(StorageType) \ + _(TupleType) \ + _(ListType) \ + _(DictType) \ + _(NumberType) \ + _(FloatType) \ + _(FutureType) \ + _(RRefType) \ + _(IntType) \ + _(NoneType) \ + _(StringType) \ + _(GeneratorType) \ + _(QuantizerType) \ + _(BoolType) \ + _(OptionalType) \ + _(VarType) \ + _(DeviceObjType) \ + _(StreamObjType) \ + _(FunctionType) \ + _(ClassType) \ + _(PyObjectType) \ + _(CapsuleType) \ + _(InterfaceType) \ + _(QSchemeType) \ + _(LayoutType) \ + _(ScalarTypeType) \ + _(AnyListType) \ + _(AnyTupleType) \ + _(AnyClassType) + +enum class TypeKind { +#define DEFINE_TYPE(T) T, + C10_FORALL_TYPES(DEFINE_TYPE) +#undef DEFINE_TYPE +}; + +TORCH_API const char* typeKindToString(TypeKind kind); + +struct Type; +using TypePtr = std::shared_ptr; +using ConstTypePtr = std::shared_ptr; + +// Use this to customize how a Type is printed using `annotation_str()`. If +// c10::nullopt is returned, `annotation_str()` falls through to its default +// implementation. +using TypePrinter = + std::function(const ConstTypePtr&)>; + +struct TORCH_API Type : std::enable_shared_from_this { + private: + TypeKind kind_; + + protected: + Type(TypeKind kind) : kind_(kind) {} + + virtual std::string annotation_str_impl(TypePrinter printer) const { + return str(); + } + + public: + virtual bool operator==(const Type& rhs) const = 0; + + // subtyping relation. By default, we return true for the case + // when the type is exactly equal or if this <: T where rhs = Optional[T] + + // if this returns false and the why_not stream is non-null, it contains + // additional details that describe why this is not a subtype of 'rhs'. + // This additional information should only contain details that are not obvious + // from the annotation_str() that describes the type. For instance it is clear that `int <: str` is false + // but not clear why `Foo <: InterfaceBar` might be false. 
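As a side note (illustrative only, not part of this header): the subtype query described in the comment above, together with the `castRaw<>()` raw-pointer accessor added further down in this file, would be used roughly as follows.

```cpp
#include <ATen/core/jit_type.h>

bool int_matches_optional_int() {
  auto opt_int = c10::OptionalType::create(c10::IntType::get());
  // `int <: Optional[int]` holds by the default rule spelled out above.
  return c10::IntType::get()->isSubtypeOf(opt_int);
}

c10::TypePtr list_element_or_null(const c10::TypePtr& t) {
  // castRaw<T>() returns a plain pointer (nullptr on kind mismatch) and skips
  // the shared_ptr refcount bump that cast<T>() pays.
  if (auto* list = t->castRaw<c10::ListType>()) {
    return list->getElementType();
  }
  return nullptr;
}
```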
+ virtual bool isSubtypeOfExt(const TypePtr& rhs, std::ostream* why_not) const; + virtual bool is_module() const; + bool isSubtypeOf(const TypePtr& rhs) const { + return isSubtypeOfExt(rhs, nullptr); + } + + // How this type will appear in FunctionSchema declarations + virtual std::string str() const = 0; + + // How this type will appear as if it were a type annotation in Python + // which is sometimes different than how it appears in declarations (e.g. + // int[] vs List[int]) + // + // Takes a custom printer that users can pass in to customize the output of + // this method. + std::string annotation_str(TypePrinter printer) const { + if (printer) { + // the printer can return nullopt to fall through to the default impl + if (auto renamed = printer(shared_from_this())) { + return *renamed; + } + } + return annotation_str_impl(printer); + } + std::string annotation_str() const { + // Overload instead of define a default value for `printer` to help + // debuggers out. + return annotation_str(nullptr); + } + + // Returns a human readable string that includes additional information like + // "type is inferred rather than explictly defined" to help construct more + // user-friendly messages. + virtual std::string repr_str() const { + return annotation_str(); + } + + TypeKind kind() const { + return kind_; + } + + virtual bool requires_grad() const { + for (const auto& ct : containedTypes()) { + if (ct->requires_grad()) { + return true; + } + } + return false; + } + + // Dynamically cast this object to the subclass indicated by the + // template variable, returning nullptr if the cast is invalid. + template + std::shared_ptr cast() { + if (T::Kind == kind()) { + return std::static_pointer_cast(shared_from_this()); + } + return nullptr; + } + template + std::shared_ptr cast() const { + if (T::Kind == kind()) { + return std::static_pointer_cast(shared_from_this()); + } + return nullptr; + } + template + T* castRaw() { + if (T::Kind == kind()) { + return static_cast(this); + } + return nullptr; + } + template + const T* castRaw() const { + if (T::Kind == kind()) { + return static_cast(this); + } + return nullptr; + } + template + std::shared_ptr expect() { + auto r = cast(); + AT_ASSERT(r); + return r; + } + template + std::shared_ptr expect() const { + auto r = cast(); + AT_ASSERT(r); + return r; + } + template + T& expectRef() { + auto* r = castRaw(); + AT_ASSERT(r); + return *r; + } + template + const T& expectRef() const { + auto* r = castRaw(); + AT_ASSERT(r); + return *r; + } + virtual ~Type() = default; + virtual bool hasFreeVariables() const { + return false; + } + // list of types this type contains, e.g. 
for a List then element type of a + // list for a tuple, the types of the tuple elements + virtual at::ArrayRef containedTypes() const { + return {}; + } + // create a new version of this type, replacing its contained types with + // contained_types + TypePtr withContained(std::vector contained_types) { + auto current_contained = containedTypes(); + AT_ASSERT(current_contained.size() == contained_types.size()); + if (current_contained.equals(contained_types)) { + return shared_from_this(); + } + return createWithContained(std::move(contained_types)); + } + // per-type constructor, you only need to override this if the + // containedTypes() is not empty + virtual TypePtr createWithContained( + std::vector contained_types) const { + AT_ERROR( + "type with contained types did not overload createWithContained: ", + str()); + } +}; + +} diff --git a/aten/src/ATen/core/op_registration/op_registration_test.cpp b/aten/src/ATen/core/op_registration/op_registration_test.cpp index 6259578fdac8..56afe8ca7fb5 100644 --- a/aten/src/ATen/core/op_registration/op_registration_test.cpp +++ b/aten/src/ATen/core/op_registration/op_registration_test.cpp @@ -1909,7 +1909,7 @@ TEST(NewOperatorRegistrationTest, CppFunction) { m.def("fn3", [](const Tensor& x) { return x; }); // These require explicit schema m.def("fn4(Tensor x) -> Tensor", CppFunction::makeFallthrough()); - m.def("fn5(Tensor x) -> Tensor", CppFunction::makeUnboxedOnly(dummy_fn)); + m.def("fn5(Tensor x) -> Tensor", CppFunction::makeFromUnboxedFunction(dummy_fn)); m.def("fn6(Tensor x) -> Tensor", CppFunction::makeFromBoxedFunction<&backend_fallback_kernel>()); } diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index fa2e85b29a07..be47a749d828 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -440,7 +440,7 @@ MatchTypeReturn matchTypeVariables( // unknown type). return matchTypeVariables(opt_formal->getElementType(), actual, type_env); } - // note: if actual was non here we potentially did not fill in the type + // note: if actual was None here we potentially did not fill in the type // variables contained in the formal. It is still a valid match because None // matches Optional[T] later error checking on tryEvalTypeVariables will // report the problem if we never match variables in type T diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp index 8a5e4f48e0c0..f0572bb6d809 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp @@ -130,6 +130,67 @@ uint64_t CUDAGeneratorImpl::seed() { return random; } +/** + * Gets the current internal state of CUDAGeneratorImpl. The internal + * state is returned as a CPU byte tensor. + */ +c10::intrusive_ptr CUDAGeneratorImpl::get_state() const { + // The RNG state comprises the seed, and an offset used for Philox. + // The following line is just here for BC reason. sizeof curandStateMtgp32 is 4120. + // It used to be static const size_t states_size = MAX_NUM_BLOCKS * sizeof(curandStateMtgp32); + // MAX_NUM_BLOCKS was 200 and sizeof(curandStateMtgp32) is 4120. 
Hardcoding these numbers here + // because this is just host side code and we don't want to worry about linking with cuda + static const size_t states_size = 200 * sizeof(4120); + static const size_t seed_size = sizeof(uint64_t); + static const size_t offset_size = sizeof(int64_t); + static const size_t total_size = states_size + seed_size + offset_size; + + auto state_tensor = at::detail::empty_cpu({(int64_t)total_size}, ScalarType::Byte, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt); + auto rng_state = state_tensor.data_ptr(); + // since curandStateMTGP is not used anymore, fill gen_states of THCGenerator with deterministic garbage value of -1 + // gen_states in THCGenerator struct was an array of curandStateMtgp32s. + memset(rng_state, -1, states_size); + auto current_seed = this->current_seed(); + auto offset = static_cast(this->philox_offset_per_thread()); // Note that old THCGeneratorState had offset as std::atomic + memcpy(rng_state + states_size, ¤t_seed, seed_size); + memcpy(rng_state + states_size + seed_size, &offset, offset_size); + + return state_tensor.getIntrusivePtr(); +} + +/** + * Sets the internal state of CUDAGeneratorImpl. The new internal state + * must be a strided CPU byte tensor and have appropriate size. See + * comments of CUDAGeneratorImpl::state for information about the layout + * and size of the internal state. + */ +void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) { + static const size_t states_size = 200 * sizeof(4120); // this line is just here for BC reason + static const size_t seed_size = sizeof(uint64_t); + static const size_t offset_size = sizeof(int64_t); + static const size_t total_size = states_size + seed_size + offset_size; + + detail::check_rng_state(new_state); + + bool no_philox_seed = false; + auto new_state_size = new_state.numel(); + if (new_state_size == total_size - offset_size) { + no_philox_seed = true; + } else { + TORCH_CHECK(new_state_size == total_size, "RNG state is wrong size"); + } + + uint64_t input_seed; + auto new_rng_state = new_state.data(); + memcpy(&input_seed, new_rng_state + states_size, seed_size); + this->set_current_seed(input_seed); + int64_t philox_offset = 0; + if (!no_philox_seed) { + memcpy(&philox_offset, new_rng_state + states_size + seed_size, offset_size); + } + this->set_philox_offset_per_thread(static_cast(philox_offset)); +} + /** * Sets the philox_offset_per_thread_ to be used by curandStatePhilox4_32_10 * @@ -143,7 +204,7 @@ void CUDAGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) { /** * Gets the current philox_offset_per_thread_ of CUDAGeneratorImpl. 
*/ -uint64_t CUDAGeneratorImpl::philox_offset_per_thread() { +uint64_t CUDAGeneratorImpl::philox_offset_per_thread() const { at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::philox_offset_per_thread"); return philox_offset_per_thread_; } diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 00424ab83ba0..b75ef8219b1c 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -21,7 +21,7 @@ #endif #ifdef USE_MAGMA -#include +#include #endif #ifdef __HIP_PLATFORM_HCC__ @@ -369,6 +369,11 @@ int CUDAHooks::getNumGPUs() const { return at::cuda::device_count(); } +void CUDAHooks::deviceSynchronize(int64_t device_index) const { + at::DeviceGuard device_guard(at::Device(at::DeviceType::CUDA, device_index)); + c10::cuda::device_synchronize(); +} + // Sigh, the registry doesn't support namespaces :( using at::CUDAHooksRegistry; using at::RegistererCUDAHooksRegistry; diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index dff8913b153f..abef2e7ff835 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -38,6 +38,7 @@ struct CUDAHooks : public at::CUDAHooksInterface { int64_t cuFFTGetPlanCacheSize(int64_t device_index) const override; void cuFFTClearPlanCache(int64_t device_index) const override; int getNumGPUs() const override; + void deviceSynchronize(int64_t device_index) const override; }; }}} // at::cuda::detail diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index af4eb6fd0739..afe88761d88f 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -181,6 +181,10 @@ struct TORCH_API CUDAHooksInterface { virtual int getNumGPUs() const { return 0; } + + virtual void deviceSynchronize(int64_t device_index) const { + TORCH_CHECK(false, "Cannot synchronize CUDA device without ATen_cuda library. 
", CUDA_HELP); + } }; // NB: dummy argument to suppress "ISO C++11 requires at least one argument diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index cc1403febf90..bd59fe7d28b9 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -944,7 +944,9 @@ static void apply_orgqr(Tensor& self, const Tensor& tau, int64_t m, int64_t n_co #endif } -std::tuple _qr_helper_cpu(const Tensor& self, bool some) { +std::tuple _linalg_qr_helper_cpu(const Tensor& self, std::string mode) { + bool compute_q, reduced; + std::tie(compute_q, reduced) = _parse_qr_mode(mode); std::vector infos(batchCount(self), 0); int64_t m = self.size(-2), n = self.size(-1); @@ -954,25 +956,22 @@ std::tuple _qr_helper_cpu(const Tensor& self, bool some) { self_sizes[self.dim() - 2] = std::min(m, n); auto tau_working_copy = at::empty(self_sizes, self.options()); Tensor q_working_copy; + Tensor R; // Setup input geometry for apply_orgqr std::vector q_sizes, q_strides; int64_t n_columns_q; - Tensor R; - std::tie(q_sizes, q_strides, n_columns_q) = _compute_geometry_for_Q(self, some); + std::tie(q_sizes, q_strides, n_columns_q) = _compute_geometry_for_Q(self, reduced); // If there are no elements, then we simply return a pair of tensors of required dimensions if (self.numel() == 0) { - // Fix the number of columns of q appropriately - q_sizes[self.dim() - 1] = n_columns_q; - q_working_copy = at::eye(q_sizes[self.dim() - 2], q_sizes[self.dim() - 1], self.options()); - q_working_copy = q_working_copy.expand_as(q_working_copy); - - // We repurpose the same q_sizes for R - // Fix the number of rows and columns of q_working_copy appropriately - q_sizes[self.dim() - 1] = n; - q_sizes[self.dim() - 2] = n_columns_q; - R = at::empty(q_sizes, self.options()); + R = at::empty({n_columns_q, n}, self.options()); + if (compute_q) { + int64_t n_rows_q = q_sizes[self.dim() - 2]; + q_working_copy = at::eye(n_rows_q, n_columns_q, self.options()); + } else { + q_working_copy = at::empty({0}, self.options()); + } return std::make_tuple(q_working_copy, R); } @@ -992,6 +991,11 @@ std::tuple _qr_helper_cpu(const Tensor& self, bool some) { } R = q_working_copy.slice(-2, 0, n_columns_q).slice(-1, 0, n).triu(); + if (!compute_q) { + // this is for mode='r' + Tensor empty_Q = at::empty({0}, self.options()); + return std::make_tuple(empty_Q, R); + } // Next perform ORGQR for Q using the results (both raw R and TAU) from GEQRF AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "qr_cpu", [&]{ @@ -1005,22 +1009,34 @@ std::tuple _qr_helper_cpu(const Tensor& self, bool some) { return std::make_tuple(q_working_copy.narrow(-1, 0, n_columns_q), R); } -std::tuple qr(const Tensor& self, bool some) { +std::tuple linalg_qr(const Tensor& self, std::string mode) { TORCH_CHECK(self.dim() >= 2, "self should have at least 2 dimensions, but has ", self.dim(), " dimensions instead"); - return at::_qr_helper(self, some); + return at::_linalg_qr_helper(self, mode); } -std::tuple qr_out(Tensor& Q, Tensor& R, const Tensor& self, bool some) { +std::tuple linalg_qr_out(Tensor& Q, Tensor& R, const Tensor& self, std::string mode) { TORCH_CHECK(self.dim() >= 2, "self should have at least 2 dimensions, but has ", self.dim(), " dimensions instead"); Tensor Q_tmp, R_tmp; - std::tie(Q_tmp, R_tmp) = at::_qr_helper(self, some); - Q.resize_as_(Q_tmp).copy_(Q_tmp); - R.resize_as_(R_tmp).copy_(R_tmp); + std::tie(Q_tmp, R_tmp) = at::_linalg_qr_helper(self, mode); + 
at::native::resize_output(Q, Q_tmp.sizes()); + Q.copy_(Q_tmp); + at::native::resize_output(R, R_tmp.sizes()); + R.copy_(R_tmp); return std::tuple(Q, R); } +std::tuple qr(const Tensor& self, bool some) { + std::string mode = some ? "reduced" : "complete"; + return at::linalg_qr(self, mode); +} + +std::tuple qr_out(Tensor& Q, Tensor& R, const Tensor& self, bool some) { + std::string mode = some ? "reduced" : "complete"; + return at::linalg_qr_out(Q, R, self, mode); +} + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ syevd ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // This function computes eigenvalues 'w' and eigenvectors 'v' of the input that is stored initially in 'v' diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index 9103eafb1f12..d4b514f6797b 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -71,7 +71,7 @@ static Tensor wrapped_scalar_tensor(Scalar scalar) { } TORCH_IMPL_FUNC(add_out) ( - Tensor& result, const Tensor& self, const Tensor& other, Scalar alpha + const Tensor& self, const Tensor& other, Scalar alpha, Tensor& result ) { add_stub(device_type(), *this, alpha); TORCH_INTERNAL_ASSERT(result.scalar_type() == output().dtype()); @@ -1109,11 +1109,11 @@ Tensor& xlogy_out(Tensor& result, const Tensor& self, const Tensor& other) { } Tensor& xlogy_out(Tensor& result, Scalar self, const Tensor& other) { - return at::xlogy_out(result, c10::scalar_to_tensor(self, other.device()), other); + return at::xlogy_out(result, wrapped_scalar_tensor(self), other); } Tensor& xlogy_out(Tensor& result, const Tensor& self, Scalar other) { - return at::xlogy_out(result, self, c10::scalar_to_tensor(other, self.device())); + return at::xlogy_out(result, self, wrapped_scalar_tensor(other)); } Tensor xlogy(const Tensor& x, const Tensor& y) { @@ -1124,11 +1124,11 @@ Tensor xlogy(const Tensor& x, const Tensor& y) { } Tensor xlogy(Scalar x, const Tensor& y) { - return at::xlogy(c10::scalar_to_tensor(x, y.device()), y); + return at::xlogy(wrapped_scalar_tensor(x), y); } Tensor xlogy(const Tensor& x, Scalar y) { - return at::xlogy(x, c10::scalar_to_tensor(y, x.device())); + return at::xlogy(x, wrapped_scalar_tensor(y)); } Tensor& xlogy_(Tensor& x, const Tensor& y) { @@ -1136,7 +1136,7 @@ Tensor& xlogy_(Tensor& x, const Tensor& y) { } Tensor& xlogy_(Tensor& x, Scalar y) { - return at::xlogy_out(x, x, c10::scalar_to_tensor(y, x.device())); + return at::xlogy_out(x, x, wrapped_scalar_tensor(y)); } } // namespace native diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp index ef0c2e2509c1..413ea32acdef 100644 --- a/aten/src/ATen/native/Distributions.cpp +++ b/aten/src/ATen/native/Distributions.cpp @@ -118,7 +118,7 @@ DEFINE_DISPATCH(bernoulli_tensor_stub); DEFINE_DISPATCH(bernoulli_scalar_stub); DEFINE_DISPATCH(cauchy_stub); DEFINE_DISPATCH(exponential_stub); -DEFINE_DISPATCH(multinomial_stub); +DEFINE_DISPATCH(multinomial_with_replacement_stub); DEFINE_DISPATCH(geometric_stub); DEFINE_DISPATCH(log_normal_stub); DEFINE_DISPATCH(uniform_stub); @@ -497,8 +497,10 @@ Tensor& multinomial_out( // Reference: // https://github.com/pytorch/pytorch/issues/11931#issuecomment-625882503 // Half is not supported on CPU. - if (!with_replacement && - !(self.device().is_cpu() && self.scalar_type() == ScalarType::Half)) { + TORCH_CHECK( + !(self.device().is_cpu() && self.scalar_type() == ScalarType::Half), + "multinomial is not implemented for half on CPU"); + if (!with_replacement) { // Sanity checks on `self`. 
auto is_valid = ((self.max() < INFINITY) & (self.min() >= 0)).item(); TORCH_CHECK( @@ -537,13 +539,8 @@ Tensor& multinomial_out( return result; } - multinomial_stub( - result.device().type(), - result, - self, - n_sample, - with_replacement, - gen); + multinomial_with_replacement_stub( + result.device().type(), result, self, n_sample, gen); return result; } diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp index bf74e8b356c7..a4854e1ced4d 100644 --- a/aten/src/ATen/native/Embedding.cpp +++ b/aten/src/ATen/native/Embedding.cpp @@ -68,7 +68,7 @@ Tensor embedding_sparse_backward( Tensor indices = indices_; Tensor grad = grad_; if (padding_idx != -1) { - auto c = indices != padding_idx; + torch::List> c({indices != padding_idx}); indices = indices.index(c); grad = grad.index(c); } diff --git a/aten/src/ATen/native/IndexingUtils.h b/aten/src/ATen/native/IndexingUtils.h index 94d61b02dd0b..92f6957f25ad 100644 --- a/aten/src/ATen/native/IndexingUtils.h +++ b/aten/src/ATen/native/IndexingUtils.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include #include @@ -15,40 +16,45 @@ static void invalid_mask(const Tensor & self, int64_t idx, const Tensor & mask, } -static std::vector expandTensors(const Tensor & self, TensorList indices) { +static std::vector expandTensors(const Tensor & self, const torch::List>& indices) { // If indices come in as ByteTensor or BoolTensor (masks), expand them into the equivalent indexing by LongTensors std::vector result; - for (const auto & index : indices) { - if (index.scalar_type() == kByte || index.scalar_type() == kBool) { - if (index.scalar_type() == kByte) { - TORCH_WARN("indexing with dtype torch.uint8 is now deprecated," \ - " please use a dtype torch.bool instead."); - } - // The sizes of the ByteTensor mask or bool tensor must match the sizes of the - // corresponding dimensions in self - for (int64_t j = 0; j < index.dim(); j++) { - int64_t srcIdx = result.size() + j; - if (index.size(j) != self.size(srcIdx)) { - invalid_mask(self, srcIdx, index, j); + for (c10::optional index_opt : indices) { + if (!index_opt.has_value()) { + result.emplace_back(); + } else { + Tensor index = std::move(*index_opt); + if (index.scalar_type() == kByte || index.scalar_type() == kBool) { + if (index.scalar_type() == kByte) { + TORCH_WARN("indexing with dtype torch.uint8 is now deprecated," \ + " please use a dtype torch.bool instead."); } + // The sizes of the ByteTensor mask or bool tensor must match the sizes of the + // corresponding dimensions in self + for (int64_t j = 0; j < index.dim(); j++) { + int64_t srcIdx = result.size() + j; + if (index.size(j) != self.size(srcIdx)) { + invalid_mask(self, srcIdx, index, j); + } + } + // Replace with nonzeros + auto nonzero = index.nonzero(); + for (int64_t j = 0; j < index.dim(); j++) { + result.emplace_back(nonzero.select(1, j)); + } + } else { + result.emplace_back(std::move(index)); } - // Replace with nonzeros - auto nonzero = index.nonzero(); - for (int64_t j = 0; j < index.dim(); j++) { - result.emplace_back(nonzero.select(1, j)); - } - } else { - result.emplace_back(index); } } return result; } -static void checkIndexTensorTypes(TensorList indices) { - for (auto& tensor : indices) { - if (tensor.defined()) { - auto scalarType = tensor.scalar_type(); +static void checkIndexTensorTypes(const torch::List>& indices) { + for (c10::optional tensor : indices) { + if (tensor.has_value() && tensor->defined()) { + auto scalarType = tensor->scalar_type(); if (scalarType != kLong && scalarType != 
kByte && scalarType != kBool) { TORCH_CHECK_INDEX(false, "tensors used as indices must be long, byte or bool tensors"); } @@ -56,6 +62,15 @@ static void checkIndexTensorTypes(TensorList indices) { } } +inline torch::List> toListOfOptionalTensors(ArrayRef list) { + torch::List> result; + result.reserve(list.size()); + for (const Tensor& a : list) { + result.push_back(a); + } + return result; +} + static bool hasContiguousSubspace(TensorList tl) { // true if all the non-null tensors are adjacent auto isDefined = [](const Tensor & tensor){ return tensor.defined(); }; diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index da8d2bd6db47..a37d1046bac2 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -73,7 +74,8 @@ Tensor logdet(const Tensor& self) { // U is singular when U(i, i) = 0 for some i in [1, self.size(-1)]. Tensor logdet_vals = diag_U.abs_().log_().sum(-1); if (self.dim() > 2) { - logdet_vals.index_put_((det_sign < 0).nonzero_numpy(), at::full({}, NAN, self.options())); + auto indices = toListOfOptionalTensors((det_sign < 0).nonzero_numpy()); + logdet_vals.index_put_(std::move(indices), at::full({}, NAN, self.options())); } else if (det_sign.item() < 0) { logdet_vals.fill_(NAN); } diff --git a/aten/src/ATen/native/LinearAlgebraUtils.h b/aten/src/ATen/native/LinearAlgebraUtils.h index 848060df944e..607e201ebe8d 100644 --- a/aten/src/ATen/native/LinearAlgebraUtils.h +++ b/aten/src/ATen/native/LinearAlgebraUtils.h @@ -192,16 +192,35 @@ static inline Tensor _move_to_end(const Tensor& self, IntArrayRef axes) { return self.permute(perm); } +// parse the "mode" param in linalg_qr: return a tuple of bools (compute_q, reduced) +static inline std::tuple _parse_qr_mode(std::string mode) { + bool compute_q; + bool reduced; + if (mode == "reduced") { + compute_q = true; + reduced = true; + } else if (mode == "complete") { + compute_q = true; + reduced = false; + } else if (mode == "r") { + compute_q = false; + reduced = true; // this is actually irrelevant in this mode + } else { + TORCH_CHECK(false, "Unrecognized mode '", mode, "'"); + } + return std::make_tuple(compute_q, reduced); +} + // Function to compute sizes, strides and the extra columns for the Q matrix in the QR Decomposition static inline std::tuple, std::vector, - int64_t> _compute_geometry_for_Q(const Tensor& input, bool some) { + int64_t> _compute_geometry_for_Q(const Tensor& input, bool reduced) { int64_t m = input.size(-2), n = input.size(-1); int64_t n_columns_q; - // We need to compute the required size of Q based on the `some` option + // We need to compute the required size of Q based on the `reduced` option auto q_sizes = input.sizes().vec(); - if (!some && m > n) { + if (!reduced && m > n) { q_sizes[input.dim() - 1] = m; n_columns_q = m; } else { diff --git a/aten/src/ATen/native/Pool.h b/aten/src/ATen/native/Pool.h index 071460b090cd..8b5d65a8a60f 100644 --- a/aten/src/ATen/native/Pool.h +++ b/aten/src/ATen/native/Pool.h @@ -72,7 +72,7 @@ pool2d_shape_check( TORCH_CHECK(input.numel() > 0 && (ndim == 3 || ndim == 4), "non-empty 3D or 4D input tensor expected but got ndim: ", ndim); TORCH_CHECK(kW/2 >= padW && kH/2 >= padH, - "pad should be smaller than half of kernel size, but got ", + "pad should be smaller than or equal to half of kernel size, but got ", "padW = ", padW, ", padH = ", padH, ", kW = ", kW, ", kH = ", kH); TORCH_CHECK(outputWidth >= 1 && 
outputHeight >= 1, @@ -172,7 +172,7 @@ pool3d_shape_check( } TORCH_CHECK(kT/2 >= pT && kW/2 >= pW && kH/2 >= pH, - "pad should be smaller than half of kernel size, but got " + "pad should be smaller than or equal to half of kernel size, but got " "kT: ", kT, " kW: ", kW, " kH: ", kH, " padT: ", pT, " padW: ", pW, " padH: ", pH); TORCH_CHECK(otime >= 1 && owidth >= 1 && oheight >= 1, diff --git a/aten/src/ATen/native/Pow.cpp b/aten/src/ATen/native/Pow.cpp index bfc5f910e093..4d1601d3e6a0 100644 --- a/aten/src/ATen/native/Pow.cpp +++ b/aten/src/ATen/native/Pow.cpp @@ -31,11 +31,9 @@ Tensor& pow_out(Tensor& result, const Tensor& base, Scalar exp) { "result type ", common_dtype, "can't be cast to the desired output type ", result.scalar_type()); - auto exponent = (exp.isComplex()) ? exp.toComplexDouble() : exp.toDouble(); - - if (exponent == 0.0) { + if (exp.equal(0.0)) { result.resize_as_(base).fill_(1); - } else if (exponent == 1.0) { + } else if (exp.equal(1.0)) { result.resize_as_(base).copy_(base); } else { auto iter = TensorIterator::unary_op(result, base.to(common_dtype)); diff --git a/aten/src/ATen/native/README.md b/aten/src/ATen/native/README.md index 6f17ac860cf8..6e7664c1e1a5 100644 --- a/aten/src/ATen/native/README.md +++ b/aten/src/ATen/native/README.md @@ -335,22 +335,31 @@ set of reviewers. ### `use_c10_dispatcher` ``` -use_c10_dispatcher: 'with_codegenerated_unboxing_wrapper' -use_c10_dispatcher: 'hacky_wrapper_for_legacy_signatures' use_c10_dispatcher: 'full' +use_c10_dispatcher: 'hacky_wrapper_for_legacy_signatures' ``` This will indicate the level of integration with the c10 dispatcher. -If setting this to 'full' works for your operator, please do. -This will enabled the full templated boxing and unboxing for your operator. -Some ops use features that aren't supported by those templates yet, -and enabling `use_c10_dispatcher: full` for those will result in a compiler error. -For those, use `use_c10_dispatcher: 'with_codegenerated_unboxing_wrapper'` instead, -or just omit the argument because 'with_codegenerated_unboxing_wrapper' is the default. -`use_c10_dispatcher: hacky_wrapper_for_legacy_signatures` is similar to `full` -but adds a wrapper around the kernel before registering it with the dispatcher -to support some legacy function signatures for kernels that we didn't migrate to -the new signatures yet. +For any new ops, please set this to 'full'. This is also the default, +so you can just omit it. +This requires the operator function signature to be aligned with the +function schema in native_functions.yaml, i.e. +- out arguments have to be at the end of the argument list instead of at the beginning +- TensorOptions are taken as separate arguments ``` + const c10::optional<ScalarType>& dtype, + const c10::optional<Layout>& layout, + const c10::optional<Device>& device, + const c10::optional<bool>& pin_memory ``` + instead of one `TensorOptions` argument +- optional tensors are taken as `const c10::optional<Tensor>&` instead of `Tensor` +Some of our kernels are still written in a legacy way, not doing those things, +and need an adapter to work with the dispatcher calling convention. For those, we use +`use_c10_dispatcher: hacky_wrapper_for_legacy_signatures` to codegenerate a corresponding +adapter around them in the operator registration call. Over time, we will migrate all +those kernels to the new calling convention and hacky_wrapper will die. +Please don't use it for new operators.
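To make these signature rules concrete, here is a minimal compilable sketch of a kernel written in the new calling convention. The operator `my_op_out` and its schema are hypothetical, invented purely for illustration (they do not exist in `native_functions.yaml`); only the shape of the signature reflects the rules listed above.

```cpp
// Hypothetical schema, for illustration only:
//   my_op(Tensor self, Tensor? weight, *, ScalarType? dtype=None, Layout? layout=None,
//         Device? device=None, bool? pin_memory=None, Tensor(a!) out) -> Tensor(a!)
#include <ATen/ATen.h>

namespace at { namespace native {

// Under `use_c10_dispatcher: 'full'` the kernel mirrors the schema directly:
// the optional tensor is a const c10::optional<Tensor>&, TensorOptions is split
// into four separate optionals, and the out argument comes last.
Tensor& my_op_out(
    const Tensor& self,
    const c10::optional<Tensor>& weight,
    const c10::optional<ScalarType>& dtype,
    const c10::optional<Layout>& layout,
    const c10::optional<Device>& device,
    const c10::optional<bool>& pin_memory,
    Tensor& out) {
  // The options are ignored here; the body is a stub that only exists to keep
  // the sketch self-contained and compilable.
  (void)dtype; (void)layout; (void)device; (void)pin_memory;
  out.resize_(self.sizes()).copy_(self);
  if (weight.has_value()) {
    out.mul_(*weight);
  }
  return out;
}

}} // namespace at::native
```

The legacy arrangement that `hacky_wrapper_for_legacy_signatures` adapts is the reverse: the out tensor leads the argument list and the four options arrive bundled in a single `TensorOptions`.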
### `manual_kernel_registration` diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index c8eb3cc99a01..289d1128d2f9 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -102,9 +102,12 @@ Tensor resize_fft_input(Tensor x, IntArrayRef dims, IntArrayRef sizes) { } // Complex to real FFT -Tensor fft_c2r(Tensor input, c10::optional n_opt, +Tensor fft_c2r(c10::string_view function_name, + Tensor out, Tensor input, c10::optional n_opt, int64_t unwrapped_dim, c10::optional norm_str, bool forward) { + TORCH_CHECK(!out.defined() || out.is_floating_point(), function_name, + " expects a floating point output tensor, but got ", out.scalar_type()); input = promote_tensor_fft(input, /*require_complex=*/true); const auto input_dim = input.dim(); const auto dim = maybe_wrap_dim(unwrapped_dim, input_dim); @@ -118,14 +121,22 @@ Tensor fft_c2r(Tensor input, c10::optional n_opt, // FIXME: _fft does not support complex_output=false with inverse=false input = at::conj(input); } - return at::_fft_c2r(input, dim, static_cast(norm), n); + if (out.defined()) { + return at::_fft_c2r_out(out, input, dim, static_cast(norm), n); + } else { + return at::_fft_c2r(input, dim, static_cast(norm), n); + } } // Real to complex FFT -Tensor fft_r2c(Tensor input, c10::optional n_opt, +Tensor fft_r2c(c10::string_view function_name, + Tensor out, Tensor input, c10::optional n_opt, int64_t unwrapped_dim, c10::optional norm_str, bool forward, bool onesided) { - TORCH_CHECK(!input.is_complex(), "Expected a real input tensor to FFT"); + TORCH_CHECK(!input.is_complex(), function_name, + " expects a real input tensor, but got ", input.scalar_type()); + TORCH_CHECK(!out.defined() || out.is_complex(), function_name, + " expects a complex output tensor, but got ", out.scalar_type()); input = promote_tensor_fft(input); const auto input_dim = input.dim(); const auto dim = maybe_wrap_dim(unwrapped_dim, input_dim); @@ -136,19 +147,29 @@ Tensor fft_r2c(Tensor input, c10::optional n_opt, } const auto norm = norm_from_string(norm_str, forward); - auto out = at::_fft_r2c(input, dim, static_cast(norm), onesided); + + Tensor ret; + if (out.defined() && forward) { + ret = at::_fft_r2c_out(out, input, dim, static_cast(norm), onesided); + } else { + ret = at::_fft_r2c(input, dim, static_cast(norm), onesided); + } + if (!forward) { // FIXME: _fft_r2c doesn't support native r2c IFFT - out = at::conj(out); + return out.defined() ? 
at::conj_out(out, ret) : at::conj(ret); + } else { + return ret; } - return out; } // Complex to complex FFT -Tensor fft_c2c(Tensor input, c10::optional n_opt, +Tensor fft_c2c(c10::string_view function_name, + Tensor out, Tensor input, c10::optional n_opt, int64_t unwrapped_dim, c10::optional norm_str, bool forward) { - TORCH_CHECK(input.is_complex(), "Expected a complex input tensor to FFT"); + TORCH_CHECK(input.is_complex(), function_name, + " expects a complex input tensor, but got ", input.scalar_type()); const auto input_dim = input.dim(); const auto dim = maybe_wrap_dim(unwrapped_dim, input_dim); const auto n = n_opt.value_or(input.sizes()[dim]); @@ -157,7 +178,13 @@ Tensor fft_c2c(Tensor input, c10::optional n_opt, input = resize_fft_input(input, dim, n); } const auto norm = norm_from_string(norm_str, forward); - return at::_fft_c2c(input, dim, static_cast(norm), forward); + if (out.defined()) { + TORCH_CHECK(out.is_complex(), function_name, + " expects a complex output tensor, but got ", out.scalar_type()); + return at::_fft_c2c_out(out, input, dim, static_cast(norm), forward); + } else { + return at::_fft_c2c(input, dim, static_cast(norm), forward); + } } // Dimensions to transform, and the signal shape in those dimensions @@ -230,12 +257,18 @@ ShapeAndDims canonicalize_fft_shape_and_dim_args( // Complex to complex n-dimensional fft Tensor fftn_c2c( - const Tensor& input, IntArrayRef shape, IntArrayRef dim, - c10::optional norm_str, bool forward) { - TORCH_CHECK(input.is_complex(), "Expected a complex input tensor to FFT"); + c10::string_view function_name, + Tensor out, const Tensor& input, IntArrayRef shape, + IntArrayRef dim, c10::optional norm_str, bool forward) { + TORCH_CHECK(input.is_complex(), function_name, " expects a complex input tensor, but got", input.scalar_type()); Tensor x = resize_fft_input(input, dim, shape); const auto norm = norm_from_string(norm_str, forward); - return at::_fft_c2c(x, dim, static_cast(norm), forward); + if (out.defined()) { + TORCH_CHECK(out.is_complex(), function_name, " expects a complex output tensor, but got ", out.scalar_type()); + return at::_fft_c2c_out(out, x, dim, static_cast(norm), forward); + } else { + return at::_fft_c2c(x, dim, static_cast(norm), forward); + } } } // namespace (anonymous) @@ -244,35 +277,79 @@ Tensor fftn_c2c( Tensor fft_fft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { return self.is_complex() ? - fft_c2c(self, n, dim, norm, /*forward=*/true) : - fft_r2c(self, n, dim, norm, /*forward=*/true, /*onesided=*/false); + fft_c2c("fft", {}, self, n, dim, norm, /*forward=*/true) : + fft_r2c("fft", {}, self, n, dim, norm, /*forward=*/true, /*onesided=*/false); +} + +Tensor& fft_fft_out(Tensor& out, const Tensor& self, c10::optional n, + int64_t dim, c10::optional norm) { + if (self.is_complex()) { + fft_c2c("fft", out, self, n, dim, norm, /*forward=*/true); + } else { + fft_r2c("fft", out, self, n, dim, norm, /*forward=*/true, /*onesided=*/false); + } + return out; } Tensor fft_ifft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { return self.is_complex() ? 
- fft_c2c(self, n, dim, norm, /*forward=*/false) : - fft_r2c(self, n, dim, norm, /*forward=*/false, /*onesided=*/false); + fft_c2c("ifft", {}, self, n, dim, norm, /*forward=*/false) : + fft_r2c("ifft", {}, self, n, dim, norm, /*forward=*/false, /*onesided=*/false); +} + +Tensor& fft_ifft_out(Tensor& out, const Tensor& self, c10::optional n, + int64_t dim, c10::optional norm) { + if (self.is_complex()) { + fft_c2c("ifft", out, self, n, dim, norm, /*forward=*/false); + } else { + fft_r2c("ifft", out, self, n, dim, norm, /*forward=*/false, /*onesided=*/false); + } + return out; } Tensor fft_rfft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { - return fft_r2c(self, n, dim, norm, /*forward=*/true, /*onesided=*/true); + return fft_r2c("rfft", {}, self, n, dim, norm, /*forward=*/true, /*onesided=*/true); +} + +Tensor& fft_rfft_out(Tensor& out, const Tensor& self, c10::optional n, + int64_t dim, c10::optional norm) { + fft_r2c("rfft", out, self, n, dim, norm, /*forward=*/true, /*onesided=*/true); + return out; } Tensor fft_irfft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { - return fft_c2r(self, n, dim, norm, /*forward=*/false); + return fft_c2r("irfft", {}, self, n, dim, norm, /*forward=*/false); +} + +Tensor& fft_irfft_out(Tensor& out, const Tensor& self, c10::optional n, + int64_t dim, c10::optional norm) { + fft_c2r("irfft", out, self, n, dim, norm, /*forward=*/false); + return out; } Tensor fft_hfft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { - return fft_c2r(self, n, dim, norm, /*forward=*/true); + return fft_c2r("hfft", {}, self, n, dim, norm, /*forward=*/true); +} + +Tensor& fft_hfft_out(Tensor& out, const Tensor& self, c10::optional n, + int64_t dim, c10::optional norm) { + fft_c2r("hfft", out, self, n, dim, norm, /*forward=*/true); + return out; } Tensor fft_ihfft(const Tensor& self, c10::optional n, int64_t dim, c10::optional norm) { - return fft_r2c(self, n, dim, norm, /*forward=*/false, /*onesided=*/true); + return fft_r2c("ihfft", {}, self, n, dim, norm, /*forward=*/false, /*onesided=*/true); +} + +Tensor& fft_ihfft_out(Tensor& out, const Tensor& self, c10::optional n, + int64_t dim, c10::optional norm) { + fft_r2c("ihfft", out, self, n, dim, norm, /*forward=*/false, /*onesided=*/true); + return out; } Tensor fft_fftn(const Tensor& self, c10::optional s, @@ -281,7 +358,18 @@ Tensor fft_fftn(const Tensor& self, c10::optional s, auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); // TODO: For real input, perform rfftn then mirror with conjugate symmetry Tensor input = promote_tensor_fft(self, /*require_complex=*/true); - return fftn_c2c(input, desc.shape, desc.dim, norm, /*forward=*/true); + return fftn_c2c("fftn", {}, input, desc.shape, desc.dim, norm, /*forward=*/true); +} + +Tensor& fft_fftn_out(Tensor& out, const Tensor& self, + c10::optional s, + c10::optional dim, + c10::optional norm) { + auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); + // TODO: For real input, perform rfftn then mirror with conjugate symmetry + Tensor input = promote_tensor_fft(self, /*require_complex=*/true); + fftn_c2c("fftn", out, input, desc.shape, desc.dim, norm, /*forward=*/true); + return out; } Tensor fft_ifftn(const Tensor& self, c10::optional s, @@ -289,24 +377,55 @@ Tensor fft_ifftn(const Tensor& self, c10::optional s, c10::optional norm) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); Tensor input = promote_tensor_fft(self, /*require_complex=*/true); - return fftn_c2c(input, 
desc.shape, desc.dim, norm, /*forward=*/false); + return fftn_c2c("ifftn", {}, input, desc.shape, desc.dim, norm, /*forward=*/false); } -Tensor fft_rfftn(const Tensor& self, c10::optional s, - c10::optional dim, - c10::optional norm_str) { +Tensor& fft_ifftn_out(Tensor& out, const Tensor& self, + c10::optional s, + c10::optional dim, + c10::optional norm) { + auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); + Tensor input = promote_tensor_fft(self, /*require_complex=*/true); + fftn_c2c("ifftn", out, input, desc.shape, desc.dim, norm, /*forward=*/false); + return out; +} + +static Tensor fft_rfftn_impl(Tensor out, const Tensor& self, + c10::optional s, + c10::optional dim, + const c10::optional& norm_str) { TORCH_CHECK(!self.is_complex(), "rfftn expects a real-valued input tensor, but got ", self.scalar_type()); auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); TORCH_CHECK(desc.shape.size() > 0, "rfftn must transform at least one axis"); Tensor input = promote_tensor_fft(self, /*require_complex=*/false); Tensor x = resize_fft_input(input, desc.dim, desc.shape); const auto norm = norm_from_string(norm_str, /*forward=*/true); - return at::_fft_r2c(x, desc.dim, static_cast(norm), /*onesided=*/true); + if (out.defined()) { + TORCH_CHECK(out.is_complex(), "rfftn expects a complex-valued output tensor, but got ", out.scalar_type()); + return at::_fft_r2c_out(out, x, desc.dim, static_cast(norm), /*onesided=*/true); + } else { + return at::_fft_r2c(x, desc.dim, static_cast(norm), /*onesided=*/true); + } } -Tensor fft_irfftn(const Tensor& self, c10::optional s, +Tensor fft_rfftn(const Tensor& self, c10::optional s, c10::optional dim, c10::optional norm_str) { + return fft_rfftn_impl({}, self, s, dim, norm_str); +} + +Tensor& fft_rfftn_out(Tensor& out, const Tensor& self, + c10::optional s, + c10::optional dim, + c10::optional norm_str) { + fft_rfftn_impl(out, self, s, dim, norm_str); + return out; +} + +static Tensor fft_irfftn_impl(Tensor out, const Tensor& self, + c10::optional s, + c10::optional dim, + const c10::optional& norm_str) { auto desc = canonicalize_fft_shape_and_dim_args(self, s, dim); TORCH_CHECK(desc.shape.size() > 0, "irfftn must transform at least one axis"); @@ -323,7 +442,27 @@ Tensor fft_irfftn(const Tensor& self, c10::optional s, Tensor input = promote_tensor_fft(self, /*require_complex=*/true); Tensor x = resize_fft_input(input, desc.dim, desc.shape); const auto norm = norm_from_string(norm_str, /*forward=*/false); - return at::_fft_c2r(x, desc.dim, static_cast(norm), last_dim_size); + if (out.defined()) { + TORCH_CHECK(out.is_floating_point(), "irfftn expects a floating point output tensor, but got ", out.scalar_type()); + return at::_fft_c2r_out(out, x, desc.dim, static_cast(norm), last_dim_size); + } else { + return at::_fft_c2r(x, desc.dim, static_cast(norm), last_dim_size); + } +} + +Tensor fft_irfftn(const Tensor& self, + c10::optional s, + c10::optional dim, + c10::optional norm_str) { + return fft_irfftn_impl({}, self, s, dim, norm_str); +} + +Tensor& fft_irfftn_out(Tensor& out, const Tensor& self, + c10::optional s, + c10::optional dim, + c10::optional norm_str) { + fft_irfftn_impl(out, self, s, dim, norm_str); + return out; } Tensor fft_fft2(const Tensor& self, c10::optional s, @@ -331,41 +470,69 @@ Tensor fft_fft2(const Tensor& self, c10::optional s, return native::fft_fftn(self, s, dim, std::move(norm)); } +Tensor& fft_fft2_out(Tensor& out, const Tensor& self, c10::optional s, + IntArrayRef dim, c10::optional norm) { + return 
native::fft_fftn_out(out, self, s, dim, std::move(norm)); +} + Tensor fft_ifft2(const Tensor& self, c10::optional s, IntArrayRef dim, c10::optional norm) { return native::fft_ifftn(self, s, dim, std::move(norm)); } +Tensor& fft_ifft2_out(Tensor& out, const Tensor& self, c10::optional s, + IntArrayRef dim, c10::optional norm) { + return native::fft_ifftn_out(out, self, s, dim, std::move(norm)); +} + Tensor fft_rfft2(const Tensor& self, c10::optional s, IntArrayRef dim, c10::optional norm) { return native::fft_rfftn(self, s, dim, std::move(norm)); } +Tensor& fft_rfft2_out(Tensor& out, const Tensor& self, c10::optional s, + IntArrayRef dim, c10::optional norm) { + return native::fft_rfftn_out(out, self, s, dim, std::move(norm)); +} + Tensor fft_irfft2(const Tensor& self, c10::optional s, IntArrayRef dim, c10::optional norm) { return native::fft_irfftn(self, s, dim, std::move(norm)); } -Tensor fft_fftfreq(int64_t n, double d, const TensorOptions& options) { - ScalarType dtype = typeMetaToScalarType(options.dtype()); +Tensor& fft_irfft2_out(Tensor& out, const Tensor& self, c10::optional s, + IntArrayRef dim, c10::optional norm) { + return native::fft_irfftn_out(out, self, s, dim, std::move(norm)); +} + +Tensor& fft_fftfreq_out(Tensor& out, int64_t n, double d) { + ScalarType dtype = out.scalar_type(); TORCH_CHECK(at::isFloatingType(dtype) || at::isComplexType(dtype), "fftfreq requires a floating point or complex dtype"); // TODO: arange doesn't have complex support - Tensor result = native::arange(n, options); - auto right_slice = result.slice(0, (n + 1) / 2, 0); + at::arange_out(out, n); + auto right_slice = out.slice(0, (n + 1) / 2, 0); at::arange_out(right_slice, -(n/2), 0, 1); - result.mul_(1.0 / (n * d)); // Slightly faster than div_(n*d) - return result; + return out.mul_(1.0 / (n * d)); // Slightly faster than div_(n*d) } -Tensor fft_rfftfreq(int64_t n, double d, const TensorOptions& options) { - ScalarType dtype = typeMetaToScalarType(options.dtype()); +Tensor fft_fftfreq(int64_t n, double d, const TensorOptions& options) { + auto out = at::empty({n}, options); + return native::fft_fftfreq_out(out, n, d); +} + +Tensor& fft_rfftfreq_out(Tensor& out, int64_t n, double d) { + ScalarType dtype = out.scalar_type(); TORCH_CHECK(at::isFloatingType(dtype) || at::isComplexType(dtype), "rfftfreq requires a floating point or complex dtype"); // TODO: arange doesn't have complex support - Tensor result = native::arange(n/2 + 1, options); - result.mul_(1.0 / (n * d)); // Slightly faster than div_(n*d) - return result; + native::arange_out(out, n/2 + 1); + return out.mul_(1.0 / (n * d)); // Slightly faster than div_(n*d) +} + +Tensor fft_rfftfreq(int64_t n, double d, const TensorOptions& options) { + auto out = at::empty({n/2 + 1}, options); + return native::fft_rfftfreq_out(out, n, d); } // If an array dim is specified, wraps them according to self.dim(). @@ -469,18 +636,20 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const optional hop const bool return_complex = return_complexOpt.value_or( self.is_complex() || (window.defined() && window.is_complex())); if (!return_complex) { - TORCH_CHECK(return_complexOpt.has_value(), - "stft requires the return_complex parameter be given for real inputs." - "You should pass return_complex=True to opt-in to complex dtype returns " - "(which will be required in a future pytorch release). 
" + if (!return_complexOpt.has_value()) { + TORCH_WARN_ONCE( + "stft will soon require the return_complex parameter be given for real inputs, " + "and will further require that return_complex=True in a future PyTorch release." ); + } - TORCH_WARN_ONCE( - "stft with return_complex=False is deprecated. In a future pytorch " - "release, stft will return complex tensors for all inputs, and " - "return_complex=False will raise an error.\n" - "Note: you can still call torch.view_as_real on the complex output to " - "recover the old return format."); + + // TORCH_WARN_ONCE( + // "stft with return_complex=False is deprecated. In a future pytorch " + // "release, stft will return complex tensors for all inputs, and " + // "return_complex=False will raise an error.\n" + // "Note: you can still call torch.view_as_real on the complex output to " + // "recover the old return format."); } if (!at::isFloatingType(self.scalar_type()) && !at::isComplexType(self.scalar_type())) { diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index 1d9f9d9d2a12..3ced0cf5eb52 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -206,7 +206,7 @@ AdvancedIndex::AdvancedIndex(const Tensor& src, TensorList indices_list) } } -static AdvancedIndex make_info(Tensor self, TensorList orig) { +static AdvancedIndex make_info(Tensor self, const torch::List>& orig) { checkIndexTensorTypes(orig); // first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more LongTensors auto indices = expandTensors(self, orig); @@ -281,7 +281,7 @@ static TensorIterator make_index_out_iterator(const AdvancedIndex& info, Tensor& return config.build(); } -Tensor index(const Tensor & self, TensorList indices) { +Tensor index(const Tensor & self, const torch::List>& indices) { TORCH_CHECK_INDEX(indices.size() <= (size_t)self.dim(), "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); auto info = make_info(self, indices); @@ -290,7 +290,7 @@ Tensor index(const Tensor & self, TensorList indices) { return iter.output(); } -Tensor quantized_index(const Tensor & self, TensorList indices) { +Tensor quantized_index(const Tensor & self, const torch::List>& indices) { TORCH_INTERNAL_ASSERT( self.qscheme() == c10::kPerTensorAffine || self.qscheme() == c10::kPerTensorSymmetric, @@ -311,12 +311,14 @@ Tensor quantized_index(const Tensor & self, TensorList indices) { res, self.q_scale(), self.q_zero_point(), self.scalar_type()); } -Tensor& index_out(Tensor& result, const Tensor & self, TensorList indices) { +Tensor& index_out(Tensor& result, const Tensor & self, const torch::List>& indices) { TORCH_CHECK_INDEX(indices.size() <= (size_t)self.dim(), "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); at::assert_no_internal_overlap(result); at::assert_no_overlap(result, self); - for (auto& index: indices) { - at::assert_no_overlap(result, index); + for (const c10::optional& index: indices) { + if (index.has_value()) { + at::assert_no_overlap(result, *index); + } } auto info = make_info(self, indices); @@ -325,11 +327,11 @@ Tensor& index_out(Tensor& result, const Tensor & self, TensorList indices) { return result; } -Tensor index_put(const Tensor & self, TensorList indices, const Tensor & value, bool accumulate) { +Tensor index_put(const Tensor & self, const torch::List>& indices, const Tensor & value, bool accumulate) { return 
self.clone(at::MemoryFormat::Preserve).index_put_(indices, value, accumulate); } -Tensor & _index_put_impl_(Tensor & self, TensorList indices, const Tensor & value, const bool accumulate, const bool unsafe) { +Tensor & _index_put_impl_(Tensor & self, const torch::List>& indices, const Tensor & value, const bool accumulate, const bool unsafe) { TORCH_CHECK_INDEX(indices.size() <= (size_t)self.dim(), "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); if (at::has_internal_overlap(self) == MemOverlap::YES) { TORCH_WARN( @@ -338,8 +340,10 @@ Tensor & _index_put_impl_(Tensor & self, TensorList indices, const Tensor & valu "This also applies to advanced indexing e.g. tensor[indices] = tensor"); } at::assert_no_overlap(self, value); - for (auto& index: indices) { - at::assert_no_overlap(self, index); + for (const c10::optional& index: indices) { + if (index.has_value()) { + at::assert_no_overlap(self, *index); + } } if (accumulate && self.device().type() == kCUDA) { @@ -356,7 +360,7 @@ Tensor & _index_put_impl_(Tensor & self, TensorList indices, const Tensor & valu } -Tensor & index_put_(Tensor & self, TensorList indices, const Tensor & value, const bool accumulate) { +Tensor & index_put_(Tensor & self, const torch::List>& indices, const Tensor & value, const bool accumulate) { return at::_index_put_impl_(self, indices, value, accumulate, /*unsafe=*/false); } @@ -467,7 +471,7 @@ Tensor& index_add_cpu_(Tensor & self, int64_t dim, const Tensor & index, const T // explicitly capture all required variables to work around windows build // TODO: fix this when windows can correctly capture variables in nested lambda - AT_DISPATCH_ALL_TYPES(self.scalar_type(), "index_add_", [&self, &source, &dim, &index_contig, &numel] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX(self.scalar_type(), "index_add_", [&self, &source, &dim, &index_contig, &numel] { auto self_stride = self.dim() == 0 ? 1 : self.stride(dim); auto source_stride = source.dim() == 0 ? 1 : source.stride(dim); // TODO: Maybe TensorAccessor can beused here? @@ -678,7 +682,7 @@ Tensor & index_select_out_cpu_(Tensor & result, const Tensor & self, int64_t dim TORCH_CHECK(result.dim() <= 1, "result.dim() (", result.dim(), ") must one or zero for given self.dim() (", self.dim(), ")"); // explicitly capture all required variables to work around windows build // TODO: fix this when windows can correctly capture variables in nested lambda - AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Bool, self.scalar_type(), "index_select", + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Bool, self.scalar_type(), "index_select", [&index_contig, &self, &result, &dim, &numel] { auto self_stride = self.dim() == 0 ? 1 : self.stride(dim); auto result_stride = result.dim() == 0 ? 
1 : result.stride(dim); diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.h b/aten/src/ATen/native/TensorAdvancedIndexing.h index 560b46162546..0e0958606de1 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.h +++ b/aten/src/ATen/native/TensorAdvancedIndexing.h @@ -15,7 +15,7 @@ enum class SCATTER_GATHER_OP: uint8_t {REDUCE_ADD, REDUCE_MULTIPLY}; using index_fn = void(*)(TensorIterator &, IntArrayRef indexed_sizes, IntArrayRef indexed_strides); using index_put_fn = void(*)(TensorIterator &, IntArrayRef indexed_sizes, IntArrayRef indexed_strides, bool accumulate); -using index_put_accum_fn = void(*)(Tensor &, TensorList , const Tensor &, bool unsafe); +using index_put_accum_fn = void(*)(Tensor &, const c10::List> &, const Tensor &, bool unsafe); using masked_fill_fn = void(*)(TensorIterator &, Scalar scalar); using masked_select_fn = void(*)(TensorIterator &, int64_t orig_stride); @@ -42,6 +42,6 @@ DECLARE_DISPATCH(scatter_add_fn, scatter_add_stub); DECLARE_DISPATCH(scatter_reduce_fn, scatter_reduce_stub); DECLARE_DISPATCH(scatter_scalar_reduce_fn, scatter_scalar_reduce_stub); -TORCH_API Tensor& index_out(Tensor& result, const Tensor & self, TensorList indices); +TORCH_API Tensor& index_out(Tensor& result, const Tensor & self, const c10::List>& indices); }} // namespace at::native diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index b27a995962b4..5435f5042ce0 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -38,6 +38,8 @@ Tensor isclose(const Tensor& self, const Tensor& other, double rtol, double atol TORCH_CHECK(self.scalar_type() == other.scalar_type(), self.scalar_type(), " did not match ", other.scalar_type()); TORCH_CHECK(!(self.is_complex() && equal_nan), "isclose with equal_nan=True is not supported for complex inputs."); + TORCH_CHECK(!(self.is_quantized() || other.is_quantized()), + "isclose is not supported for quantized inputs."); // Checks that rtol and atol are non-negative // Note: consistent with Python's isclose but divergent from NumPy's, which diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index f8ba5527e5a9..d1fadd58d38d 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -97,23 +98,25 @@ static inline void check_cat_shape_except_dim(const Tensor & first, const Tensor if (dim == dimension) { continue; } - int64_t first_dim_size = first.size(dim); - int64_t second_dim_size = second.size(dim); + int64_t first_dim_size = first.sizes()[dim]; + int64_t second_dim_size = second.sizes()[dim]; TORCH_CHECK(first_dim_size == second_dim_size, "Sizes of tensors must match except in dimension ", dimension, ". Got ", first_dim_size, " and ", second_dim_size, " in dimension ", dim, " (The offending index is ", index, ")"); } } +static bool should_skip(const Tensor& t) { + return t.numel() == 0 && t.dim() == 1; +} + Tensor & _cat_out_cpu(Tensor& result, TensorList tensors, int64_t dim) { // previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible // to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors // to be "skipped". We maintain this behavior for backwards compatibility, but only for this specific // size (i.e. other empty sizes are not skipped). 
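The backward-compatibility rule spelled out in the comment above (a 1-D tensor with zero elements is silently skipped by `cat`) is easy to miss when reading the refactor that follows, so here is a minimal libtorch sketch of the observable behaviour; it is an illustration only, and assumes a build in which this legacy rule is in place, which the refactor below does not change.

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  auto a = torch::randn({2, 3});
  auto empty_1d = torch::empty({0});  // numel() == 0 && dim() == 1, so should_skip() is true
  // A size-[0] tensor could not normally be concatenated along dim 1, but the legacy
  // rule drops it entirely, making this equivalent to torch::cat({a}, 1).
  auto out = torch::cat({a, empty_1d}, /*dim=*/1);
  std::cout << out.sizes() << std::endl;  // prints [2, 3]
  return 0;
}
```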
- // FIXME: warn if this is the case - bool allSkipped = true; + bool allContiguous = true; - Tensor notSkippedTensor; // Inputs cannot alias the output tensor for (int64_t i = 0; i < tensors.size(); i++) { @@ -125,19 +128,23 @@ Tensor & _cat_out_cpu(Tensor& result, TensorList tensors, int64_t dim) { } at::assert_no_internal_overlap(result); - auto should_skip = [](const Tensor& t) { return t.numel() == 0 && t.dim() == 1; }; - for (auto const &tensor : tensors) { - if (should_skip(tensor)) { - continue; + const Tensor* pnotSkippedTensor = [](TensorList tensors) -> const Tensor* { + for (auto const &tensor : tensors) { + if (should_skip(tensor)) { + continue; + } + // we've found a non-empty tensor + return &tensor; } - // we've found a non-empty tensor - allSkipped = false; - notSkippedTensor = tensor; - break; - } - if (allSkipped) { + return nullptr; + }(tensors); + + if (!pnotSkippedTensor) { + // FIXME: warn if this is the case -- see comment about skipped + // tensors at top of function. return result; } + const Tensor& notSkippedTensor = *pnotSkippedTensor; TORCH_CHECK(tensors.size() > 0, "expected a non-empty list of Tensors"); TORCH_CHECK(dim <= notSkippedTensor.dim(), "dimension ", dim, "out of range"); @@ -160,7 +167,7 @@ Tensor & _cat_out_cpu(Tensor& result, TensorList tensors, int64_t dim) { continue; } check_cat_shape_except_dim(notSkippedTensor, tensor, dim, i); - cat_dim_size += tensor.size(dim); + cat_dim_size += tensor.sizes()[dim]; if (!tensor.is_contiguous(first_tensor_mem_format)) { allContiguous = false; @@ -195,8 +202,8 @@ Tensor & _cat_out_cpu(Tensor& result, TensorList tensors, int64_t dim) { if (reuse_iterator && result.is_contiguous(first_tensor_mem_format) && no_type_promotion) { - auto source_slice = notSkippedTensor; - auto slice_dim_size = source_slice.size(dim); + const auto& source_slice = notSkippedTensor; + auto slice_dim_size = source_slice.sizes()[dim]; auto result_slice = result.narrow(dim, 0, slice_dim_size); auto result_slice_data = result_slice.data_ptr(); auto result_stride_bytes = result.stride(dim) * elementSize(result.scalar_type()); @@ -225,7 +232,7 @@ Tensor & _cat_out_cpu(Tensor& result, TensorList tensors, int64_t dim) { if (should_skip(tensor)) { continue; } - auto slice_dim_size = tensor.size(dim); + auto slice_dim_size = tensor.sizes()[dim]; auto result_slice = result.narrow(dim, offset, slice_dim_size); auto iter = TensorIteratorConfig() @@ -1467,15 +1474,25 @@ inferSqueezeGeometry(const Tensor& tensor, int64_t dim) { return std::make_tuple(sizes, strides); } -std::tuple, std::vector > +namespace { +// Named type instead of a pair/tuple so that we can be sure to +// construct the vectors in place and get NRVO. +struct InferUnsqueezeGeometryResult { + c10::SmallVector sizes; + c10::SmallVector strides; + InferUnsqueezeGeometryResult(IntArrayRef tensor_sizes, IntArrayRef tensor_strides) + : sizes(tensor_sizes.begin(), tensor_sizes.end()) + , strides(tensor_strides.begin(), tensor_strides.end()) {} +}; +} +InferUnsqueezeGeometryResult inferUnsqueezeGeometry(const Tensor& tensor, int64_t dim) { - auto sizes = tensor.sizes().vec(); - auto strides = tensor.strides().vec(); - int64_t new_stride = dim >= tensor.dim() ? 1 : sizes[dim] * strides[dim]; - sizes.insert(sizes.begin() + dim, 1); - strides.insert(strides.begin() + dim, new_stride); + InferUnsqueezeGeometryResult result(tensor.sizes(), tensor.strides()); + int64_t new_stride = dim >= tensor.dim() ? 
1 : result.sizes[dim] * result.strides[dim]; + result.sizes.insert(result.sizes.begin() + dim, 1); + result.strides.insert(result.strides.begin() + dim, new_stride); - return std::make_tuple(sizes, strides); + return result; } Tensor squeeze_qtensor(const Tensor& self) { @@ -1624,7 +1641,7 @@ Tensor unsqueeze_qtensor(const Tensor& self, int64_t dim) { axis, quantizer->scalar_type()); } - return make_qtensor(self, std::get<0>(g), std::get<1>(g), quantizer); + return make_qtensor(self, g.sizes, g.strides, quantizer); } Tensor unsqueeze(const Tensor& self, int64_t dim) { @@ -1636,7 +1653,7 @@ Tensor unsqueeze(const Tensor& self, int64_t dim) { return unsqueeze_qtensor(self, dim); } else { auto g = inferUnsqueezeGeometry(self, dim); - return self.as_strided(std::get<0>(g), std::get<1>(g)); + return self.as_strided(g.sizes, g.strides); } } @@ -1644,7 +1661,7 @@ Tensor & unsqueeze_(Tensor& self, int64_t dim) { dim = maybe_wrap_dim(dim, self.dim() + 1); auto g = inferUnsqueezeGeometry(self, dim); - return self.as_strided_(std::get<0>(g), std::get<1>(g)); + return self.as_strided_(g.sizes, g.strides); } Tensor flatten(const Tensor& self, int64_t start_dim, int64_t end_dim) { diff --git a/aten/src/ATen/native/TensorTransformations.cpp b/aten/src/ATen/native/TensorTransformations.cpp index fdee519c4bd0..5c6ab40b0ad4 100644 --- a/aten/src/ATen/native/TensorTransformations.cpp +++ b/aten/src/ATen/native/TensorTransformations.cpp @@ -73,7 +73,7 @@ Tensor flip_cpu(const Tensor& self, IntArrayRef dims) { ); }); } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Bool, + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kHalf, kBFloat16, in_tensor.scalar_type(), "flip_cpu", [&] { flip_cpu_kernel( diff --git a/aten/src/ATen/native/TestOps.cpp b/aten/src/ATen/native/TestOps.cpp index 0ebdce6795aa..7a6f7c6e8e05 100644 --- a/aten/src/ATen/native/TestOps.cpp +++ b/aten/src/ATen/native/TestOps.cpp @@ -2,6 +2,7 @@ #include #include +#include namespace at { namespace native { @@ -50,5 +51,22 @@ Tensor _test_string_default(const Tensor& dummy, std::string a, std::string b) { return dummy; } +// Test that overloads with ambiguity created by defaulted parameters work. 
+// The operator declared first should have priority always + +// Overload a +Tensor _test_ambiguous_defaults(const Tensor& dummy, int64_t a, int64_t b) { + TORCH_CHECK(a == 1); + TORCH_CHECK(b == 1); + return c10::scalar_to_tensor(1); +} + +// Overload b +Tensor _test_ambiguous_defaults(const Tensor& dummy, int64_t a, std::string b) { + TORCH_CHECK(a == 2); + TORCH_CHECK(b == "2"); + return c10::scalar_to_tensor(2); +} + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index e6dd1bc4afde..0f6da7e4292a 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -326,8 +326,12 @@ Tensor& reciprocal_out(Tensor& result, const Tensor& self) { return unary_op_imp Tensor reciprocal(const Tensor& self) { return unary_op_impl_float(self, reciprocal_stub); } Tensor& reciprocal_(Tensor& self) { return unary_op_impl_(self, at::reciprocal_out); } -Tensor& rsqrt_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, rsqrt_stub); } -Tensor rsqrt(const Tensor& self) { return unary_op_impl(self, at::rsqrt_out); } +Tensor& rsqrt_out(Tensor& result, const Tensor& self) { + return unary_op_impl_float_out(result, self, rsqrt_stub); +} +Tensor rsqrt(const Tensor& self) { + return unary_op_impl_float(self, rsqrt_stub); +} Tensor& rsqrt_(Tensor& self) { return unary_op_impl_(self, at::rsqrt_out); } Tensor& sign_out(Tensor& result, const Tensor& self) { diff --git a/aten/src/ATen/native/UnaryOps.h b/aten/src/ATen/native/UnaryOps.h index f732cb9a0141..d92864e6fb2a 100644 --- a/aten/src/ATen/native/UnaryOps.h +++ b/aten/src/ATen/native/UnaryOps.h @@ -77,7 +77,9 @@ DECLARE_DISPATCH(void(*)(TensorIterator&, c10::optional), random_full DECLARE_DISPATCH(void(*)(TensorIterator&, c10::optional), random_stub); DECLARE_DISPATCH(void(*)(TensorIterator&, const int64_t), polygamma_stub); DECLARE_DISPATCH(void(*)(TensorIterator&, Scalar a, Scalar b), clamp_stub); -DECLARE_DISPATCH(void(*)(Tensor&, const Tensor&, int64_t, bool, c10::optional), multinomial_stub); +DECLARE_DISPATCH( + void (*)(Tensor&, const Tensor&, int64_t, c10::optional), + multinomial_with_replacement_stub); DECLARE_DISPATCH( void (*)( TensorIterator&, diff --git a/aten/src/ATen/native/UpSampleNearest1d.cpp b/aten/src/ATen/native/UpSampleNearest1d.cpp index b9dd52dffa5d..6478bbb58eaf 100644 --- a/aten/src/ATen/native/UpSampleNearest1d.cpp +++ b/aten/src/ATen/native/UpSampleNearest1d.cpp @@ -66,19 +66,21 @@ TORCH_META_FUNC(upsample_nearest1d_backward) ( namespace native { TORCH_IMPL_FUNC(upsample_nearest1d_out_cpu) ( - Tensor& output, const Tensor& input, IntArrayRef output_size, - c10::optional scales) { + c10::optional scales, + Tensor& output +) { upsample_nearest1d_kernel(kCPU, output, input, scales); } TORCH_IMPL_FUNC(upsample_nearest1d_backward_out_cpu) ( - Tensor& grad_input, const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales) { + c10::optional scales, + Tensor& grad_input +) { grad_input.zero_(); upsample_nearest1d_backward_kernel(kCPU, grad_input, grad_output, scales); } diff --git a/aten/src/ATen/native/cpu/CatKernel.cpp b/aten/src/ATen/native/cpu/CatKernel.cpp index 299850407da3..f86adb8e6318 100644 --- a/aten/src/ATen/native/cpu/CatKernel.cpp +++ b/aten/src/ATen/native/cpu/CatKernel.cpp @@ -15,18 +15,20 @@ struct InputMeta { InputMeta(const Tensor& t, int64_t dim, int64_t inner) : data_ptr(t.data_ptr()) - , inner_size(t.size(dim) * inner) {} + , inner_size(t.sizes()[dim] * 
inner) {} }; template void cat_serial_kernel_impl(Tensor& result, TensorList tensors, int64_t dim) { - int64_t outer = result.numel() / (result.size(dim) * result.stride(dim)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + dim >= 0 && dim < result.dim(), "dim out of range in cat_serial_kernel_impl"); + int64_t outer = result.numel() / (result.sizes()[dim] * result.strides()[dim]); scalar_t* result_data = result.data_ptr(); int64_t ninputs = tensors.size(); std::vector inputs; inputs.reserve(ninputs); for (auto const &tensor : tensors) { - inputs.emplace_back(tensor, dim, result.stride(dim)); + inputs.emplace_back(tensor, dim, result.strides()[dim]); } using Vec = vec256::Vec256; diff --git a/aten/src/ATen/native/cpu/MultinomialKernel.cpp b/aten/src/ATen/native/cpu/MultinomialKernel.cpp index 1f4a52084962..62f1d7b879ac 100644 --- a/aten/src/ATen/native/cpu/MultinomialKernel.cpp +++ b/aten/src/ATen/native/cpu/MultinomialKernel.cpp @@ -11,8 +11,12 @@ namespace at { namespace native { namespace { -template -void multinomial_apply(Tensor& result, const Tensor& self, const int64_t n_sample, const bool with_replacement, c10::optional generator) { +template +void multinomial_with_replacement_apply( + Tensor& result, + const Tensor& self, + const int64_t n_sample, + c10::optional generator) { auto gen = get_generator_or_default(generator, detail::getDefaultCPUGenerator()); // See Note [Acquire lock when using random generators] std::lock_guard lock(gen->mutex_); @@ -61,8 +65,6 @@ void multinomial_apply(Tensor& result, const Tensor& self, const int64_t n_sampl } TORCH_CHECK(sum > 0, "invalid multinomial distribution (sum of probabilities <= 0)"); - TORCH_CHECK(with_replacement || (n_categories - n_zeros >= n_sample), - "invalid multinomial distribution (with replacement=False, not enough non-negative category to sample)"); /* normalize cumulative probability distribution so that last val is 1 i.e. doesn't assume original self row sums to one */ @@ -100,45 +102,23 @@ void multinomial_apply(Tensor& result, const Tensor& self, const int64_t n_sampl /* store in result tensor (will be incremented for lua compat by wrapper) */ result_ptr[i * result_dist_stride_0 + j * result_dist_stride_1] = sample_idx; - - /* Once a sample is drawn, it cannot be drawn again. ie sample without replacement */ - if (!with_replacement && j < n_sample - 1) { - /* update cumulative distribution so that sample cannot be drawn again */ - scalar_t diff; - scalar_t new_val = 0; - scalar_t sum; - - if (sample_idx != 0) { - new_val = cum_dist_ptr[(sample_idx - 1) * cum_dist_stride_0]; - } - /* marginal cumulative mass (i.e. original probability) of sample */ - diff = cum_dist_ptr[sample_idx * cum_dist_stride_0] - new_val; - /* new sum of marginals is not one anymore... 
*/ - sum = 1.0 - diff; - for (int64_t k = 0; k < n_categories; k++) { - new_val = cum_dist_ptr[k * cum_dist_stride_0]; - if (k >= sample_idx) { - /* remove sampled probability mass from later cumulative probabilities */ - new_val -= diff; - } - /* make total marginals sum to one */ - new_val /= sum; - cum_dist_ptr[k * cum_dist_stride_0] = new_val; - } - } } } } -static void multinomial_kernel_impl(Tensor& result, const Tensor& self, const int64_t n_sample, const bool with_replacement, c10::optional gen) { +static void multinomial_with_replacement_kernel_impl( + Tensor& result, + const Tensor& self, + const int64_t n_sample, + c10::optional gen) { AT_DISPATCH_FLOATING_TYPES_AND_HALF(self.scalar_type(), "multinomial", [&] { - multinomial_apply(result, self, n_sample, with_replacement, gen); + multinomial_with_replacement_apply(result, self, n_sample, gen); }); } - } -REGISTER_DISPATCH(multinomial_stub, &multinomial_kernel_impl); - +REGISTER_DISPATCH( + multinomial_with_replacement_stub, + &multinomial_with_replacement_kernel_impl); } } diff --git a/aten/src/ATen/native/cpu/PowKernel.cpp b/aten/src/ATen/native/cpu/PowKernel.cpp index b7ec099a80da..6f0d153e978a 100644 --- a/aten/src/ATen/native/cpu/PowKernel.cpp +++ b/aten/src/ATen/native/cpu/PowKernel.cpp @@ -63,7 +63,7 @@ void pow_tensor_scalar_kernel(TensorIterator& iter, Scalar exp_scalar) { ); } else if (exp == -0.5) { cpu_kernel_vec(iter, - [](scalar_t base) -> scalar_t { + [](scalar_t base) __ubsan_ignore_float_divide_by_zero__ -> scalar_t { return 1.0 / std::sqrt(base); }, [](Vec base) -> Vec { return base.rsqrt(); } diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index 5f96e01ab319..32033abcd4e2 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -225,7 +225,7 @@ static void norm_kernel_tensor_iterator_impl( binary_kernel_reduce( iter, AbsMaxOps(), - std::numeric_limits::min() + acc_t(0) ); }); } else if (val == -INFINITY) { diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index 049b3eff6b5b..32ebaf7752f7 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -587,10 +587,10 @@ static void random_full_64_bits_range_kernel(TensorIterator& iter, c10::optional } static void rsqrt_kernel(TensorIterator& iter) { - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.dtype(), "rsqrt_cpu", [&] { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(iter.common_dtype(), "rsqrt_cpu", [&] { cpu_kernel_vec( iter, - [=](scalar_t a) -> scalar_t { + [=](scalar_t a) __ubsan_ignore_float_divide_by_zero__ -> scalar_t { return (static_cast(1)) / std::sqrt(a); }, [=](Vec256 a) { return a.rsqrt(); }); diff --git a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu index e5804ba389c5..3fbd693d17b1 100644 --- a/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu +++ b/aten/src/ATen/native/cuda/BatchLinearAlgebra.cu @@ -16,8 +16,8 @@ #include // for USE_MAGMA #ifdef USE_MAGMA -#include #include +#include const bool use_magma_ = true; #else @@ -95,10 +95,18 @@ void magmaCholeskyBatched( magma_uplo_t uplo, magma_int_t n, scalar_t** dA_array, magma_int_t ldda, magma_int_t* info_array, magma_int_t batchsize, const MAGMAQueue& magma_queue); -template +template void magmaTriangularSolve( - magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, - scalar_t* dA, magma_int_t ldda, scalar_t* 
dB, magma_int_t lddb); + magma_uplo_t uplo, + magma_trans_t trans, + magma_diag_t diag, + magma_int_t m, + magma_int_t n, + scalar_t* dA, + magma_int_t ldda, + scalar_t* dB, + magma_int_t lddb, + const MAGMAQueue& magma_queue); template void magmaTriangularSolveBatched( @@ -662,45 +670,117 @@ void magmaCholeskyBatched>( AT_CUDA_CHECK(cudaGetLastError()); } -template<> +template <> void magmaTriangularSolve( - magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, - double* dA, magma_int_t ldda, double* dB, magma_int_t lddb) { - MagmaStreamSyncGuard guard; - magma_dtrsm(MagmaLeft, uplo, trans, diag, m, n, 1, dA, ldda, dB, lddb); + magma_uplo_t uplo, + magma_trans_t trans, + magma_diag_t diag, + magma_int_t m, + magma_int_t n, + double* dA, + magma_int_t ldda, + double* dB, + magma_int_t lddb, + const MAGMAQueue& magma_queue) { + magma_dtrsm( + MagmaLeft, + uplo, + trans, + diag, + m, + n, + 1, + dA, + ldda, + dB, + lddb, + magma_queue.get_queue()); AT_CUDA_CHECK(cudaGetLastError()); } -template<> +template <> void magmaTriangularSolve( - magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, - float* dA, magma_int_t ldda, float* dB, magma_int_t lddb) { - MagmaStreamSyncGuard guard; - magma_strsm(MagmaLeft, uplo, trans, diag, m, n, 1, dA, ldda, dB, lddb); + magma_uplo_t uplo, + magma_trans_t trans, + magma_diag_t diag, + magma_int_t m, + magma_int_t n, + float* dA, + magma_int_t ldda, + float* dB, + magma_int_t lddb, + const MAGMAQueue& magma_queue) { + magma_strsm( + MagmaLeft, + uplo, + trans, + diag, + m, + n, + 1, + dA, + ldda, + dB, + lddb, + magma_queue.get_queue()); AT_CUDA_CHECK(cudaGetLastError()); } -template<> +template <> void magmaTriangularSolve>( - magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, - c10::complex* dA, magma_int_t ldda, c10::complex* dB, magma_int_t lddb) { - MagmaStreamSyncGuard guard; + magma_uplo_t uplo, + magma_trans_t trans, + magma_diag_t diag, + magma_int_t m, + magma_int_t n, + c10::complex* dA, + magma_int_t ldda, + c10::complex* dB, + magma_int_t lddb, + const MAGMAQueue& magma_queue) { magmaDoubleComplex alpha({1, 0}); - magma_ztrsm(MagmaLeft, uplo, trans, diag, m, n, alpha, - reinterpret_cast(dA), ldda, - reinterpret_cast(dB), lddb); + magma_ztrsm( + MagmaLeft, + uplo, + trans, + diag, + m, + n, + alpha, + reinterpret_cast(dA), + ldda, + reinterpret_cast(dB), + lddb, + magma_queue.get_queue()); AT_CUDA_CHECK(cudaGetLastError()); } -template<> +template <> void magmaTriangularSolve>( - magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, - c10::complex* dA, magma_int_t ldda, c10::complex* dB, magma_int_t lddb) { - MagmaStreamSyncGuard guard; + magma_uplo_t uplo, + magma_trans_t trans, + magma_diag_t diag, + magma_int_t m, + magma_int_t n, + c10::complex* dA, + magma_int_t ldda, + c10::complex* dB, + magma_int_t lddb, + const MAGMAQueue& magma_queue) { magmaFloatComplex alpha({1, 0}); - magma_ctrsm(MagmaLeft, uplo, trans, diag, m, n, alpha, - reinterpret_cast(dA), ldda, - reinterpret_cast(dB), lddb); + magma_ctrsm( + MagmaLeft, + uplo, + trans, + diag, + m, + n, + alpha, + reinterpret_cast(dA), + ldda, + reinterpret_cast(dB), + lddb, + magma_queue.get_queue()); AT_CUDA_CHECK(cudaGetLastError()); } @@ -1636,11 +1716,14 @@ AT_ERROR("triangular_solve: MAGMA library not found in " magma_int_t nrhs = magma_int_cast(b.size(-1), "b.size(-1)"); magma_int_t batch_size = magma_int_cast(batchCount(A), "batchCount"); + 
MAGMAQueue magma_queue(b.get_device()); + // batch_size == 1 implies that: // 1. the RHS and LHS tensors have 2 dimensions, or // 2. the RHS and LHS tensors have more than 2 dimensions but all batch dimensions are 1 if (batch_size == 1) { - magmaTriangularSolve(uplo, trans, diag, n, nrhs, A_data, n, b_data, n); + magmaTriangularSolve( + uplo, trans, diag, n, nrhs, A_data, n, b_data, n, magma_queue); } else { auto A_mat_stride = matrixStride(A); auto b_mat_stride = matrixStride(b); @@ -1697,18 +1780,18 @@ std::tuple _triangular_solve_helper_cuda(const Tensor& self, con // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ qr ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template -static void apply_qr(Tensor& Q, Tensor& R, int64_t n_columns, std::vector& infos) { +static void apply_qr(Tensor& Q, Tensor& R, int64_t q_size_minus_2, int64_t r_size_minus_1, int64_t n_columns, + bool compute_q, std::vector& infos) { #ifndef USE_MAGMA AT_ERROR("qr: MAGMA library not found in " "compilation. Please rebuild with MAGMA."); #else - auto q_data = Q.data_ptr(); + + magma_int_t m = magma_int_cast(q_size_minus_2, "Q.size(-2)"); + magma_int_t n = magma_int_cast(r_size_minus_1, "R.size(-1)"); + auto r_data = R.data_ptr(); - auto q_matrix_stride = matrixStride(Q); auto r_matrix_stride = matrixStride(R); - - magma_int_t m = magma_int_cast(Q.size(-2), "Q.size(-2)"); - magma_int_t n = magma_int_cast(R.size(-1), "R.size(-1)"); magma_int_t k = m < n ? m : n; magma_int_t nb = magmaGeqrfOptimalBlocksize(m, n); int64_t batch_size = batchCount(R); @@ -1731,6 +1814,10 @@ AT_ERROR("qr: MAGMA library not found in " return; } } + if (!compute_q) { + // this is for mode='r' + return; + } // This phase computes Q (the raw version) // We require to perform ?geqrf_gpu again due to this bug in MAGMA: @@ -1738,6 +1825,8 @@ AT_ERROR("qr: MAGMA library not found in " // - ?geqrf2_gpu gives correct R, but doesn't allow computation of Q via ?orgqr_gpu // Refer to the below link for more details: // http://icl.cs.utk.edu/magma/forum/viewtopic.php?f=2&t=1015&p=2800&hilit=geqrf_gpu#p2800 + auto q_data = Q.data_ptr(); + auto q_matrix_stride = matrixStride(Q); for (int64_t i = 0; i < batch_size; i++) { scalar_t* q_working_ptr = &q_data[i * q_matrix_stride]; magmaGeqrf(m, n, q_working_ptr, m, tau_data, work_data, &info, /*is_v2=*/false); @@ -1754,36 +1843,43 @@ AT_ERROR("qr: MAGMA library not found in " #endif } -std::tuple _qr_helper_cuda(const Tensor& self, bool some) { +std::tuple _linalg_qr_helper_cuda(const Tensor& self, std::string mode) { + bool compute_q, reduced; + std::tie(compute_q, reduced) = _parse_qr_mode(mode); std::vector infos(batchCount(self), 0); // Setup input geometry and inputs for apply_qr std::vector q_sizes, q_strides; int64_t n_columns_q; - std::tie(q_sizes, q_strides, n_columns_q) = _compute_geometry_for_Q(self, some); + std::tie(q_sizes, q_strides, n_columns_q) = _compute_geometry_for_Q(self, reduced); Tensor q_working_copy, r_working_copy; // If there are no elements, then we simply return a pair of tensors of required dimensions if (self.numel() == 0) { - // Fix the number of columns of q_working_copy appropriately - q_sizes[self.dim() - 1] = n_columns_q; - q_working_copy = at::eye(q_sizes[self.dim() - 2], q_sizes[self.dim() - 1], self.options()); - q_working_copy = q_working_copy.expand_as(q_working_copy); - - // We repurpose the same q_sizes for r_working_copy - // Fix the number of rows and columns of q_working_copy appropriately - q_sizes[self.dim() - 1] = self.size(-1); - q_sizes[self.dim() - 2] = n_columns_q; - 
r_working_copy = at::empty(q_sizes, self.options()); + int64_t n = self.size(-1); + r_working_copy = at::empty({n_columns_q, n}, self.options()); + if (compute_q) { + int64_t n_rows_q = q_sizes[self.dim() - 2]; + q_working_copy = at::eye(n_rows_q, n_columns_q, self.options()); + } else { + q_working_copy = at::empty({0}, self.options()); + } return std::make_tuple(q_working_copy, r_working_copy); } - q_working_copy = at::empty_strided(q_sizes, q_strides, self.options()); - q_working_copy.narrow(-1, 0, self.size(-1)).copy_(self); + if (compute_q) { + q_working_copy = at::empty_strided(q_sizes, q_strides, self.options()); + q_working_copy.narrow(-1, 0, self.size(-1)).copy_(self); + } else { + q_working_copy = at::empty({0}, self.options()); + } r_working_copy = cloneBatchedColumnMajor(self); + int64_t m = q_sizes[self.dim() - 2]; + int64_t n = r_working_copy.size(-1); + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(self.scalar_type(), "qr_cuda", [&]{ - apply_qr(q_working_copy, r_working_copy, n_columns_q, infos); + apply_qr(q_working_copy, r_working_copy, m, n, n_columns_q, compute_q, infos); }); if (self.dim() > 2) { batchCheckErrors(infos, "qr_cuda"); @@ -1791,10 +1887,12 @@ std::tuple _qr_helper_cuda(const Tensor& self, bool some) { singleCheckErrors(infos[0], "qr_cuda"); } - return std::make_tuple(q_working_copy.narrow(-1, 0, n_columns_q), - r_working_copy.narrow(-2, 0, n_columns_q).triu()); + if (compute_q) { + q_working_copy = q_working_copy.narrow(-1, 0, n_columns_q); + } + r_working_copy = r_working_copy.narrow(-2, 0, n_columns_q).triu(); + return std::make_tuple(q_working_copy, r_working_copy); } - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ symeig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template diff --git a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu index 2379877e91ba..bc1884d8d642 100644 --- a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu @@ -32,7 +32,7 @@ void mse_kernel_cuda(TensorIterator& iter) { void xlogy_kernel_cuda(TensorIterator& iter) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.common_dtype(), "xlogy_cuda", [&]() { - gpu_kernel(iter, []GPU_LAMBDA(scalar_t x, scalar_t y) -> scalar_t { + gpu_kernel_with_scalars(iter, []GPU_LAMBDA(scalar_t x, scalar_t y) -> scalar_t { if (at::_isnan(y)){ return NAN; } diff --git a/aten/src/ATen/native/cuda/Dropout.cu b/aten/src/ATen/native/cuda/Dropout.cu index 67adbaabbb84..c3e456d97056 100644 --- a/aten/src/ATen/native/cuda/Dropout.cu +++ b/aten/src/ATen/native/cuda/Dropout.cu @@ -57,6 +57,12 @@ fused_dropout_kernel_vec(at::cuda::detail::TensorInfo a, accscalar_t pinv = accscalar_t(1)/p; + // Helps align the total number of times curand_uniform4 is called by each thread for the same totalElements + // in the vec=2 and vec=4 cases. + bool gridxvec_loop_state = 0; + + float4 rand; + // Note: Vectorized loads means we'll stride each thread by an additional VEC factor, as we'll load VEC elements at a time for (IndexType linearIndex = idx * VEC; linearIndex < totalElements; @@ -69,12 +75,21 @@ fused_dropout_kernel_vec(at::cuda::detail::TensorInfo a, //curand_uniform_double was pure evil anyway, not doing what it promises, and there's nothing for halfs, so generate float for everything // Note: need a new set of random values per 4 elements -- we'll handle VEC elements in this thread, so need ceil(VEC / 4) // sets of rand. 
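Editor note: switching xlogy to gpu_kernel_with_scalars lets a scalar operand be lifted into the CUDA kernel instead of being materialized as a full tensor. A small sketch, assuming a CUDA device; values are illustrative:

```python
import torch

# Hedged sketch: the scalar `other` operand is handled without allocating a
# broadcasted tensor, per the gpu_kernel_with_scalars change above.
if torch.cuda.is_available():
    x = torch.rand(1024, device="cuda")
    y = torch.xlogy(x, 2.0)                                   # scalar second operand
    expected = x * torch.log(torch.tensor(2.0, device="cuda"))
    print(torch.allclose(y, expected))
```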
- float4 rand = curand_uniform4(&state); + if ((VEC == 4) || (gridxvec_loop_state == 0)) { + rand = curand_uniform4(&state); + } else { + // sets up the last two values we generated last iteration to be used this iteration. + rand.x = rand.z; + rand.y = rand.w; + gridxvec_loop_state ^= 1; + } rand.x = rand.x < p; rand.y = rand.y < p; - rand.z = rand.z < p; - rand.w = rand.w < p; + if (VEC == 4) { + rand.z = rand.z < p; + rand.w = rand.w < p; + } // Note: We explicitly check for is_contiguous() before launching the vectorized kernel // and replace IndexToOffset call with linearIndex to allow vectorization of NHWC (or other) diff --git a/aten/src/ATen/native/cuda/IndexKernel.cu b/aten/src/ATen/native/cuda/IndexKernel.cu index cb4aa644fee2..d88f202487af 100644 --- a/aten/src/ATen/native/cuda/IndexKernel.cu +++ b/aten/src/ATen/native/cuda/IndexKernel.cu @@ -190,7 +190,7 @@ static Tensor & masked_select_out_cuda_impl(Tensor & result, const Tensor & self Tensor _mask = (mask.dim() == 0) ? mask.unsqueeze(0) : mask; Tensor _self = (self.dim() == 0) ? self.unsqueeze(0) : self; std::tie(_mask, _self) = expand_outplace(_mask, _self); - at::native::index_out(result, _self, _mask); + at::native::index_out(result, _self, c10::List>({_mask})); return result; } diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu index d630d727019f..2dc04ed4ddef 100644 --- a/aten/src/ATen/native/cuda/Indexing.cu +++ b/aten/src/ATen/native/cuda/Indexing.cu @@ -160,7 +160,7 @@ computeLinearIndex(const Tensor & src, TensorList indices, bool check_range) { } -static std::tuple> makeLinearIndex(Tensor self, TensorList orig, bool check_range) { +static std::tuple> makeLinearIndex(Tensor self, const c10::List>& orig, bool check_range) { checkIndexTensorTypes(orig); // first expand BoolTensor (masks) or ByteTensor (masks) into 1 or more LongTensors auto indices = expandTensors(self, orig); @@ -184,7 +184,7 @@ static std::tuple>& indices, const Tensor & value, bool unsafe) { if (indices.size() > (size_t)self.dim()) { TORCH_CHECK_INDEX(false, "too many indices for tensor of dimension ", self.dim(), " (got ", indices.size(), ")"); } @@ -505,7 +505,7 @@ Tensor& index_add_cuda_(Tensor & self, int64_t dim, const Tensor & index, const if (cuda::detail::canUse32BitIndexMath(self) && cuda::detail::canUse32BitIndexMath(source) && cuda::detail::canUse32BitIndexMath(index)) { - AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "index_add", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "index_add", [&] { cuda::detail::TensorInfo selfInfo = cuda::detail::getTensorInfo(self_); int selfAddDim = selfInfo.collapseDims(dim); diff --git a/aten/src/ATen/native/cuda/MiscUtils.h b/aten/src/ATen/native/cuda/MiscUtils.h index 31e6d69aa0a1..8f78e8d78003 100644 --- a/aten/src/ATen/native/cuda/MiscUtils.h +++ b/aten/src/ATen/native/cuda/MiscUtils.h @@ -6,8 +6,8 @@ #include // for USE_MAGMA #ifdef USE_MAGMA -#include #include +#include #endif namespace at { diff --git a/aten/src/ATen/native/cuda/MultinomialKernel.cu b/aten/src/ATen/native/cuda/MultinomialKernel.cu index 3d59617903b4..cc74848b632a 100644 --- a/aten/src/ATen/native/cuda/MultinomialKernel.cu +++ b/aten/src/ATen/native/cuda/MultinomialKernel.cu @@ -300,7 +300,11 @@ sampleMultinomialOnce(int64_t* dest, } } -void multinomial_kernel_impl(Tensor& result, const Tensor& self, const int64_t n_sample, const bool with_replacement, c10::optional generator) 
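Editor note: the index_add dispatch above is widened from ALL_TYPES_AND2 to ALL_TYPES_AND_COMPLEX_AND2, so the CUDA kernel now covers complex dtypes. A hedged usage sketch with illustrative values:

```python
import torch

# Sketch only: duplicate indices accumulate, now also for complex dtypes on CUDA.
if torch.cuda.is_available():
    t = torch.zeros(5, dtype=torch.complex64, device="cuda")
    idx = torch.tensor([0, 2, 2], device="cuda")
    src = torch.tensor([1 + 1j, 2 + 0j, 3 + 0j], dtype=torch.complex64, device="cuda")
    t.index_add_(0, idx, src)
    print(t.cpu())   # expected: [1+1j, 0, 5+0j, 0, 0]
```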
{ +void multinomial_with_replacement_kernel_impl( + Tensor& result, + const Tensor& self, + const int64_t n_sample, + c10::optional generator) { auto gen = get_generator_or_default(generator, cuda::detail::getDefaultCUDAGenerator()); int inputSize = self.dim(); @@ -371,7 +375,6 @@ void multinomial_kernel_impl(Tensor& result, const Tensor& self, const int64_t n PhiloxCudaState rng_engine_inputs; - if (with_replacement) { // Binary search is warp divergent (so effectively we're running // with just a single thread), but for better utilization, // we need each block to have at least 4 warps. @@ -402,7 +405,6 @@ void multinomial_kernel_impl(Tensor& result, const Tensor& self, const int64_t n prefixSum.data_ptr(), normDist.data_ptr()); C10_CUDA_KERNEL_LAUNCH_CHECK(); - } } }); @@ -412,6 +414,7 @@ void multinomial_kernel_impl(Tensor& result, const Tensor& self, const int64_t n } } -REGISTER_DISPATCH(multinomial_stub, &multinomial_kernel_impl); - +REGISTER_DISPATCH( + multinomial_with_replacement_stub, + &multinomial_with_replacement_kernel_impl); }} diff --git a/aten/src/ATen/native/cuda/ReduceNormKernel.cu b/aten/src/ATen/native/cuda/ReduceNormKernel.cu index 3953f16b69c9..3a24f00f6ebf 100644 --- a/aten/src/ATen/native/cuda/ReduceNormKernel.cu +++ b/aten/src/ATen/native/cuda/ReduceNormKernel.cu @@ -28,7 +28,7 @@ void norm_kernel_cuda_impl(TensorIterator& iter, Scalar val) { } else if (p == static_cast(2)) { gpu_reduce_kernel(iter, NormTwoOps(), 0); } else if (p == static_cast(INFINITY)) { - gpu_reduce_kernel(iter, AbsMaxOps(), std::numeric_limits::min()); + gpu_reduce_kernel(iter, AbsMaxOps(), 0); } else if (p == static_cast(-INFINITY)) { gpu_reduce_kernel(iter, AbsMinOps(), std::numeric_limits::max()); } else { diff --git a/aten/src/ATen/native/cuda/SpectralOps.cu b/aten/src/ATen/native/cuda/SpectralOps.cu index db3e853a9321..e5e91cea4ccc 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cu +++ b/aten/src/ATen/native/cuda/SpectralOps.cu @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -439,10 +440,10 @@ static Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes, // Calculates the normalization constant and applies it in-place to self // sizes is the sizes of a twosided tensor and dims are all transformed dims -void _fft_apply_normalization(const Tensor& self, int64_t normalization, IntArrayRef sizes, IntArrayRef dims) { +double _fft_normalization_scale(int64_t normalization, IntArrayRef sizes, IntArrayRef dims) { auto norm = static_cast(normalization); if (norm == fft_norm_mode::none) { - return; + return 1.0; } int64_t signal_numel = 1; @@ -451,7 +452,17 @@ void _fft_apply_normalization(const Tensor& self, int64_t normalization, IntArra } const double scale_denom = (norm == fft_norm_mode::by_root_n) ? std::sqrt(signal_numel) : static_cast(signal_numel); - self.div_(scale_denom); + return 1.0 / scale_denom; +} + +const Tensor& _fft_apply_normalization(const Tensor& self, int64_t normalization, IntArrayRef sizes, IntArrayRef dims) { + auto scale = _fft_normalization_scale(normalization, sizes, dims); + return (scale == 1.0) ? 
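Editor note: the AbsMaxOps identity change above matters because std::numeric_limits<float>::min() is a tiny positive number rather than the smallest possible |x|; with 0 as the identity, an all-zero slice reduces to exactly 0. A quick check, assuming a CUDA device:

```python
import torch

# Sketch: the inf-norm is max(|x|); with the corrected identity an all-zero row gives 0,
# not a leftover FLT_MIN-sized value.
if torch.cuda.is_available():
    x = torch.zeros(3, 4, device="cuda")
    print(torch.linalg.norm(x, ord=float("inf"), dim=1))   # expected: tensor of zeros
```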
self : self.mul_(scale); +} + +Tensor& _fft_apply_normalization_out(Tensor& out, const Tensor& self, int64_t normalization, IntArrayRef sizes, IntArrayRef dims) { + auto scale = _fft_normalization_scale(normalization, sizes, dims); + return at::mul_out(out, self, c10::scalar_to_tensor(scale)); } } // namespace (anonymous) @@ -522,6 +533,23 @@ Tensor _fft_r2c_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization return output; } +Tensor& _fft_r2c_cufft_out(Tensor& out, const Tensor& self, IntArrayRef dim, + int64_t normalization, bool onesided) { + auto result = _fft_r2c_cufft(self, dim, static_cast(fft_norm_mode::none), /*onesided=*/true); + if (onesided) { + return _fft_apply_normalization_out(out, result, normalization, self.sizes(), dim); + } + + resize_output(out, self.sizes()); + + auto last_dim = dim.back(); + auto last_dim_halfsize = result.sizes()[last_dim]; + auto out_slice = out.slice(last_dim, 0, last_dim_halfsize); + _fft_apply_normalization_out(out_slice, result, normalization, self.sizes(), dim); + at::native::_fft_fill_with_conjugate_symmetry_(out, dim); + return out; +} + // n-dimensional complex to real IFFT Tensor _fft_c2r_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization, int64_t lastdim) { TORCH_CHECK(self.is_complex()); @@ -544,8 +572,13 @@ Tensor _fft_c2r_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization // TODO: could transform up to 2 other dims in the same cuFFT operation auto output = at::empty(out_sizes, self.options().dtype(c10::toValueType(self.scalar_type()))); _exec_fft(output, temp, out_sizes, dim.back(), /*forward=*/false); - _fft_apply_normalization(output, normalization, out_sizes, dim); - return output; + return _fft_apply_normalization(output, normalization, out_sizes, dim); +} + +Tensor& _fft_c2r_cufft_out(Tensor& out, const Tensor& self, IntArrayRef dim, + int64_t normalization, int64_t lastdim) { + auto result = _fft_c2r_cufft(self, dim, static_cast(fft_norm_mode::none), lastdim); + return _fft_apply_normalization_out(out, result, normalization, result.sizes(), dim); } // n-dimensional complex to complex FFT/IFFT @@ -586,8 +619,13 @@ Tensor _fft_c2c_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization } } - _fft_apply_normalization(output, normalization, out_sizes, dim); - return output; + return _fft_apply_normalization(output, normalization, out_sizes, dim); +} + +Tensor& _fft_c2c_cufft_out(Tensor& out, const Tensor& self, IntArrayRef dim, + int64_t normalization, bool forward) { + auto result = _fft_c2c_cufft(self, dim, static_cast(fft_norm_mode::none), forward); + return _fft_apply_normalization_out(out, result, normalization, result.sizes(), dim); } diff --git a/aten/src/ATen/native/cuda/TensorTransformations.cu b/aten/src/ATen/native/cuda/TensorTransformations.cu index a435c7060f45..9dfa4e8759cf 100644 --- a/aten/src/ATen/native/cuda/TensorTransformations.cu +++ b/aten/src/ATen/native/cuda/TensorTransformations.cu @@ -87,7 +87,7 @@ Tensor flip_cuda(const Tensor& self, IntArrayRef dims) { // use kernel_pointwise_flip_apply2 only when to-flip dim is the 1st or last dim, where collapseDims can reduce the amount of work if (flip_dims_size == 1 && in_tensor.is_contiguous() && (flip_dims[0] == 0 || flip_dims[0] == total_dims - 1)) { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::Half, at::ScalarType::Bool, in_tensor.scalar_type(), "flip_cuda", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, kBFloat16, in_tensor.scalar_type(), "flip_cuda", [&] { auto in_tensor_info = 
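Editor note: the *_out wrappers above exist to back out= variants of the cuFFT transforms; the r2c case fills the full spectrum from the onesided half when onesided is false. A hedged sketch, assuming out= is exposed on torch.fft.rfft in this build; the shapes are illustrative:

```python
import torch

# Sketch only: a length-8 real signal has 8 // 2 + 1 == 5 onesided bins.
if torch.cuda.is_available():
    x = torch.randn(8, device="cuda")
    out = torch.empty(5, dtype=torch.complex64, device="cuda")
    torch.fft.rfft(x, out=out)
    print(torch.allclose(out, torch.fft.rfft(x)))
```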
cuda::detail::getTensorInfo(in_tensor); auto out_tensor_info = cuda::detail::getTensorInfo(out_tensor); int flip_dim = in_tensor_info.collapseDims(flip_dims[0]); @@ -123,7 +123,7 @@ Tensor flip_cuda(const Tensor& self, IntArrayRef dims) { } } - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Half, in_tensor.scalar_type(), "flip_cuda", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kHalf, kBool, kBFloat16, in_tensor.scalar_type(), "flip_cuda", [&] { flip_cuda_kernel<<>>( in_tensor.data_ptr(), out_tensor.data_ptr(), N, flip_dims_t.cuda().data_ptr(), diff --git a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu index 99488108ac26..b269bd303e76 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu @@ -197,19 +197,21 @@ static void upsample_nearest1d_backward_out_cuda_template( } // namespace TORCH_IMPL_FUNC(upsample_nearest1d_out_cuda) ( - Tensor& output, const Tensor& input, IntArrayRef output_size, - c10::optional scales) { + c10::optional scales, + Tensor& output +) { upsample_nearest1d_out_cuda_template(output, input, output_size, scales); } TORCH_IMPL_FUNC(upsample_nearest1d_backward_out_cuda) ( - Tensor& grad_input, const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales) { + c10::optional scales, + Tensor& grad_input +) { upsample_nearest1d_backward_out_cuda_template( grad_input, grad_output, output_size, input_size, scales); } diff --git a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu index 834c000fdb05..8ac7abca1824 100644 --- a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu +++ b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu @@ -168,43 +168,43 @@ __global__ void upsample_trilinear3d_backward_out_frame( true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1, h1, w1 + w1p), + idx_3d(nc, depth1, height1, width1, t1, h1, w1 + w1p), i_numel, static_cast(t0lambda * h0lambda * w1lambda * d2val), true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1, h1 + h1p, w1), + idx_3d(nc, depth1, height1, width1, t1, h1 + h1p, w1), i_numel, static_cast(t0lambda * h1lambda * w0lambda * d2val), true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1, h1 + h1p, w1 + w1p), + idx_3d(nc, depth1, height1, width1, t1, h1 + h1p, w1 + w1p), i_numel, static_cast(t0lambda * h1lambda * w1lambda * d2val), true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1 + t1p, h1, w1), + idx_3d(nc, depth1, height1, width1, t1 + t1p, h1, w1), i_numel, static_cast(t1lambda * h0lambda * w0lambda * d2val), true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1 + t1p, h1, w1 + w1p), + idx_3d(nc, depth1, height1, width1, t1 + t1p, h1, w1 + w1p), i_numel, static_cast(t1lambda * h0lambda * w1lambda * d2val), true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1 + t1p, h1 + h1p, w1), + idx_3d(nc, depth1, height1, width1, t1 + t1p, h1 + h1p, w1), i_numel, static_cast(t1lambda * h1lambda * w0lambda * d2val), true); fastAtomicAdd( idata, - idx_3d(nc, depth1, width1, height1, t1 + t1p, h1 + h1p, w1 + w1p), + idx_3d(nc, depth1, height1, width1, t1 + t1p, h1 + h1p, w1 + w1p), i_numel, static_cast(t1lambda * h1lambda * w1lambda * d2val), true); diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp index 8fca9ad9ecdf..d5a39e45941b 100644 --- a/aten/src/ATen/native/mkl/SpectralOps.cpp +++ 
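Editor note: both flip_cuda dispatch macros above are extended to cover bool and bfloat16. A brief hedged sketch of the newly covered dtypes:

```python
import torch

# Illustrative only: dtypes newly handled by the flip_cuda dispatch above.
if torch.cuda.is_available():
    b = torch.tensor([True, False, False], device="cuda")
    print(torch.flip(b, dims=[0]).cpu())                      # [False, False, True]
    h = torch.arange(4.0, device="cuda").to(torch.bfloat16)
    print(torch.flip(h, dims=[0]).cpu())                      # [3., 2., 1., 0.]
```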
b/aten/src/ATen/native/mkl/SpectralOps.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -21,6 +22,21 @@ Tensor _fft_c2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, AT_ERROR("fft: ATen not compiled with MKL support"); } +Tensor& _fft_r2c_mkl_out(Tensor& out, const Tensor& self, IntArrayRef dim, int64_t normalization, + bool onesided) { + AT_ERROR("fft: ATen not compiled with MKL support"); +} + +Tensor& _fft_c2r_mkl_out(Tensor& out, const Tensor& self, IntArrayRef dim, int64_t normalization, + int64_t last_dim_size) { + AT_ERROR("fft: ATen not compiled with MKL support"); +} + +Tensor& _fft_c2c_mkl_out(Tensor& out, const Tensor& self, IntArrayRef dim, int64_t normalization, + bool forward) { + AT_ERROR("fft: ATen not compiled with MKL support"); +} + }} #else // AT_MKL_ENABLED @@ -381,6 +397,13 @@ Tensor _fft_c2r_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, return _exec_fft(out, input, out_sizes, dim, normalization, /*forward=*/false); } +Tensor& _fft_c2r_mkl_out(Tensor& out, const Tensor& self, IntArrayRef dim, int64_t normalization, + int64_t last_dim_size) { + auto result = _fft_c2r_mkl(self, dim, normalization, last_dim_size); + resize_output(out, result.sizes()); + return out.copy_(result); +} + // n-dimensional real to complex FFT Tensor _fft_r2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided) { TORCH_CHECK(self.is_floating_point()); @@ -402,6 +425,24 @@ Tensor _fft_r2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, return out; } +Tensor& _fft_r2c_mkl_out(Tensor& out, const Tensor& self, IntArrayRef dim, int64_t normalization, + bool onesided) { + auto result = _fft_r2c_mkl(self, dim, normalization, /*onesided=*/true); + if (onesided) { + resize_output(out, result.sizes()); + return out.copy_(result); + } + + resize_output(out, self.sizes()); + + auto last_dim = dim.back(); + auto last_dim_halfsize = result.sizes()[last_dim]; + auto out_slice = out.slice(last_dim, 0, last_dim_halfsize); + out_slice.copy_(result); + at::native::_fft_fill_with_conjugate_symmetry_(out, dim); + return out; +} + // n-dimensional complex to complex FFT/IFFT Tensor _fft_c2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, bool forward) { TORCH_CHECK(self.is_complex()); @@ -410,6 +451,13 @@ Tensor _fft_c2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, return _exec_fft(out, self, self.sizes(), sorted_dims, normalization, forward); } +Tensor& _fft_c2c_mkl_out(Tensor& out, const Tensor& self, IntArrayRef dim, int64_t normalization, + bool forward) { + auto result = _fft_c2c_mkl(self, dim, normalization, forward); + resize_output(out, result.sizes()); + return out.copy_(result); +} + }} // namespace at::native #endif diff --git a/aten/src/ATen/native/mkldnn/BinaryOps.cpp b/aten/src/ATen/native/mkldnn/BinaryOps.cpp index 029b1d225d14..3358079f4df5 100644 --- a/aten/src/ATen/native/mkldnn/BinaryOps.cpp +++ b/aten/src/ATen/native/mkldnn/BinaryOps.cpp @@ -8,10 +8,11 @@ namespace at { namespace native { Tensor& mkldnn_add_out( - Tensor& result, const Tensor& self, const Tensor& other, - Scalar alpha) { + Scalar alpha, + Tensor& result + ) { TORCH_CHECK(false, "mkldnn_add_out: ATen not compiled with MKLDNN support"); } @@ -46,10 +47,11 @@ namespace at { namespace native { Tensor& mkldnn_add_out( - Tensor& result, const Tensor& self, const Tensor& other, - Scalar alpha) { + Scalar alpha, + Tensor& result + ) { ideep::tensor& x = itensor_from_mkldnn(self); ideep::tensor& y = 
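Editor note: the onesided r2c out= wrapper above completes the two-sided spectrum via conjugate (Hermitian) symmetry: for real input of length N, X[k] == conj(X[N - k]). A quick numerical check of that identity; no CUDA or MKL assumed, values illustrative:

```python
import torch

# Pure-math check of the symmetry the _fft_fill_with_conjugate_symmetry_ call relies on.
x = torch.randn(8)
X = torch.fft.fft(x)
print(torch.allclose(X[1:], X[1:].flip(0).conj()))   # X[k] == conj(X[8 - k]) for k = 1..7
```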
itensor_from_mkldnn(other); @@ -73,7 +75,7 @@ Tensor mkldnn_add(const Tensor& self, const Tensor& other, Scalar alpha) { } Tensor& mkldnn_add_(Tensor& self, const Tensor& other, Scalar alpha) { - return native::mkldnn_add_out(self, self, other, alpha); + return native::mkldnn_add_out(self, other, alpha, self); } Tensor& mkldnn_mul_out(Tensor& result, const Tensor& self, const Tensor& other) { diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index e6bb52490e7d..b474d435398c 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -7,42 +7,34 @@ # DEPRECATED. DO NOT USE - func: _cast_Byte(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Char(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Double(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Float(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Int(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Long(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Short(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # DEPRECATED. DO NOT USE - func: _cast_Half(Tensor self, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full variants: function # Computes the gradient of current tensor w.r.t. graph leaves. @@ -59,18 +51,15 @@ # where Variables *are* Tensors (as opposed to them containing tensors, which # is what the previous interpretation was.) - func: set_data(Tensor(a!) self, Tensor new_data) -> () - use_c10_dispatcher: full manual_kernel_registration: True variants: method - func: data(Tensor self) -> Tensor - use_c10_dispatcher: full manual_kernel_registration: True variants: method # True if this `Variable` is a leaf and thus does not have a `grad_fn`. - func: is_leaf(Tensor self) -> bool - use_c10_dispatcher: full manual_kernel_registration: True variants: method @@ -85,23 +74,19 @@ # assert y2.output_nr == 2 # - func: output_nr(Tensor self) -> int - use_c10_dispatcher: full manual_kernel_registration: True variants: method - func: _version(Tensor self) -> int - use_c10_dispatcher: full manual_kernel_registration: True variants: method - func: requires_grad_(Tensor(a!) self, bool requires_grad=True) -> Tensor(a!) - use_c10_dispatcher: full manual_kernel_registration: True variants: method # Enables .grad attribute for non-leaf Tensors. - func: retain_grad(Tensor(a!) self) -> () - use_c10_dispatcher: full manual_kernel_registration: True variants: method @@ -120,47 +105,36 @@ variants: function - func: rename_(Tensor(a!) self, Dimname[]? names) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: rename(Tensor(a) self, Dimname[]? 
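Editor note: mkldnn_add_out above moves the out argument to the end of the C++ signature, and mkldnn_add_ updates its call accordingly. A hedged usage sketch, assuming an MKL-DNN-enabled CPU build; values are illustrative:

```python
import torch

# Sketch only: the in-place call routes through the reordered mkldnn_add_out.
a = torch.ones(2, 2).to_mkldnn()
b = torch.full((2, 2), 3.0).to_mkldnn()
c = torch.add(a, b, alpha=2)          # out of place: 1 + 2 * 3 == 7
a.add_(b)                              # in place, via mkldnn_add_
print(c.to_dense())
print(a.to_dense())
```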
names) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: align_to(Tensor(a) self, Dimname[] names) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: align_to.ellipsis_idx(Tensor(a) self, Dimname[] order, int ellipsis_idx) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: align_as(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method - func: align_tensors(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - func: refine_names(Tensor(a) self, Dimname[] names) -> Tensor(a) - use_c10_dispatcher: full variants: method - func: _use_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank) -> bool - use_c10_dispatcher: full dispatch: CUDA: _use_cudnn_ctc_loss - func: _cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: _cudnn_ctc_loss - func: _use_cudnn_rnn_flatten_weight() -> bool - use_c10_dispatcher: full - func: _cudnn_rnn_flatten_weight(Tensor[] weight_arr, int weight_stride0, int input_size, int mode, int hidden_size, int proj_size, int num_layers, bool batch_first, bool bidirectional) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: _cudnn_rnn_flatten_weight @@ -180,71 +154,52 @@ CUDA: _cudnn_init_dropout_state - func: _debug_has_internal_overlap(Tensor self) -> int - use_c10_dispatcher: full variants: function - func: _fused_dropout(Tensor self, float p, Generator? generator=None) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CUDA: fused_dropout_cuda - func: _masked_scale(Tensor self, Tensor mask, float scale) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CUDA: masked_scale_cuda - func: _sobol_engine_draw(Tensor quasi, int n, Tensor sobolstate, int dimension, int num_generated, ScalarType? dtype) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: _sobol_engine_ff_(Tensor(a!) self, int n, Tensor sobolstate, int dimension, int num_generated) -> Tensor(a!) - use_c10_dispatcher: full - func: _sobol_engine_scramble_(Tensor(a!) self, Tensor ltm, int dimension) -> Tensor(a!) - use_c10_dispatcher: full - func: _sobol_engine_initialize_state_(Tensor(a!) self, int dimension) -> Tensor(a!) - use_c10_dispatcher: full - func: _reshape_from_tensor(Tensor self, Tensor shape) -> Tensor - use_c10_dispatcher: full - func: _shape_as_tensor(Tensor self) -> Tensor - use_c10_dispatcher: full - func: dropout(Tensor input, float p, bool train) -> Tensor - use_c10_dispatcher: full - func: dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) - use_c10_dispatcher: full - func: feature_dropout(Tensor input, float p, bool train) -> Tensor - use_c10_dispatcher: full - func: feature_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) - use_c10_dispatcher: full - func: alpha_dropout(Tensor input, float p, bool train) -> Tensor - use_c10_dispatcher: full - func: alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) - use_c10_dispatcher: full - func: feature_alpha_dropout(Tensor input, float p, bool train) -> Tensor - use_c10_dispatcher: full - func: feature_alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!) - use_c10_dispatcher: full - func: abs(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: abs - func: abs_(Tensor(a!) self) -> Tensor(a!) 
- use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: abs_ @@ -281,18 +236,15 @@ # Absolute, alias for abs - func: absolute(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: absolute_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: absolute.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: angle(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: angle @@ -303,19 +255,16 @@ CPU, CUDA: angle_out - func: view_as_real(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: view_as_real - func: view_as_complex(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: view_as_complex - func: sgn(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sgn @@ -332,15 +281,12 @@ CPU, CUDA: sgn_out - func: real(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function - func: imag(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function - func: conj(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: conj.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -349,19 +295,16 @@ CPU, CUDA: conj_out - func: _conj(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function dispatch: DefaultBackend: _conj - func: acos(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: acos - func: acos_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: acos_ @@ -373,28 +316,22 @@ # arccos, alias of acos - func: arccos(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: arccos_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: arccos.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: avg_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, bool ceil_mode=False, bool count_include_pad=True) -> Tensor - use_c10_dispatcher: full - func: adaptive_avg_pool1d(Tensor self, int[1] output_size) -> Tensor - use_c10_dispatcher: full # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool1d(Tensor self, int[1] output_size) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full structured_delegate: add.out variants: function, method dispatch: @@ -403,7 +340,6 @@ MkldnnCPU: mkldnn_add - func: add_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method structured_delegate: add.out dispatch: @@ -412,7 +348,7 @@ MkldnnCPU: mkldnn_add_ - func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) - use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + use_c10_dispatcher: full structured: True structured_inherits: TensorIteratorBase dispatch: @@ -422,13 +358,11 @@ MkldnnCPU: mkldnn_add_out - func: _add_relu.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: add_relu - func: _add_relu_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) 
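Editor note: besides dropping the now-default use_c10_dispatcher lines, add.out above switches to use_c10_dispatcher: full and becomes a structured kernel over TensorIteratorBase. From Python the out= behavior should be unchanged; a brief hedged sketch:

```python
import torch

# Illustrative only: a zero-element `out` is resized to the result shape.
a, b = torch.randn(2, 3), torch.randn(2, 3)
out = torch.empty(0)
torch.add(a, b, alpha=0.5, out=out)
print(out.shape, torch.allclose(out, a + 0.5 * b))
```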
- use_c10_dispatcher: full variants: function dispatch: CPU: add_relu_ @@ -441,25 +375,21 @@ # For C++ only, until we have conversion from C++ numbers to Tensor - func: add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: add - func: add_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: add_ - func: addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: addmv - func: addmv_(Tensor(a!) self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: addmv_ @@ -470,20 +400,17 @@ CPU, CUDA: addmv_out - func: _addmv_impl_(Tensor(a!) self, Tensor self2, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full dispatch: CPU: addmv_impl_cpu CUDA: addmv_impl_cuda - func: addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: addr Math: math_addr - func: addr_(Tensor(a!) self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: addr_ @@ -495,17 +422,14 @@ Math: math_addr_out - func: affine_grid_generator(Tensor theta, int[] size, bool align_corners) -> Tensor - use_c10_dispatcher: full variants: function dispatch: DefaultBackend: affine_grid_generator - func: affine_grid_generator_backward(Tensor grad, int[] size, bool align_corners) -> Tensor - use_c10_dispatcher: full variants: function - func: all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: all @@ -516,18 +440,15 @@ CPU, CUDA: all_out - func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: all.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: allclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> bool - use_c10_dispatcher: full variants: function, method - func: any.dim(Tensor self, int dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: any @@ -538,7 +459,6 @@ CPU, CUDA: any_out - func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: any.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) @@ -568,10 +488,8 @@ # preserve tracing. Get rid of this when arange can directly take tensors for bounds # (so that it can be traced directly). - func: _dim_arange(Tensor like, int dim) -> Tensor - use_c10_dispatcher: full - func: argmax(Tensor self, int? dim=None, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: argmax @@ -582,7 +500,6 @@ CPU, CUDA: argmax_out - func: argmin(Tensor self, int? 
dim=None, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: argmin @@ -593,13 +510,11 @@ CPU, CUDA: argmin_out - func: acosh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: acosh - func: acosh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: acosh_ @@ -611,24 +526,20 @@ # arccosh, alias for acosh - func: arccosh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: arccosh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: arccosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: asinh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: asinh - func: asinh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: asinh_ @@ -640,24 +551,20 @@ # arcsinh, alias for asinh - func: arcsinh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: arcsinh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: arcsinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: atanh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: atanh - func: atanh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: atanh_ @@ -669,18 +576,15 @@ # arctanh, alias for atanh - func: arctanh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: arctanh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: arctanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: as_strided(Tensor(a) self, int[] size, int[] stride, int? storage_offset=None) -> Tensor(a) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: as_strided_tensorimpl @@ -695,14 +599,12 @@ DefaultBackend: as_strided_ - func: asin(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: asin SparseCPU, SparseCUDA: asin_sparse - func: asin_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: asin_ @@ -716,24 +618,20 @@ # arcsin, alias of asin - func: arcsin(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: arcsin_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: arcsin.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: atan(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: atan - func: atan_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: atan_ @@ -745,55 +643,44 @@ # arctan, alias of atan - func: arctan(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: arctan_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: arctan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: atleast_1d(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function - func: atleast_1d.Sequence(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - func: atleast_2d(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function - func: atleast_2d.Sequence(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function - func: atleast_3d(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function - func: atleast_3d.Sequence(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function - func: baddbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: baddbmm_cpu CUDA: baddbmm_cuda - func: baddbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: baddbmm__cpu CUDA: baddbmm__cuda - func: _baddbmm_mkl_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: function - func: baddbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) @@ -825,7 +712,6 @@ # Sample bernoulli with values in `self` as probability. - func: bernoulli(Tensor self, *, Generator? generator=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: bernoulli @@ -837,13 +723,11 @@ CPU, CUDA: bernoulli_out - func: bernoulli_.Tensor(Tensor(a!) self, Tensor p, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: bernoulli_ - func: bernoulli_.float(Tensor(a!) self, float p=0.5, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: bernoulli_ @@ -852,7 +736,6 @@ # There is no default valid on `p` here because it would introduce ambiguity # with `bernoulli(Tensor self, *, Generator? generator=None)` declaration. - func: bernoulli.p(Tensor self, float p, *, Generator? generator=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: bilinear(Tensor input1, Tensor input2, Tensor weight, Tensor? bias) -> Tensor @@ -908,11 +791,9 @@ CUDA: _bincount_cuda - func: bitwise_not(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: bitwise_not_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: bitwise_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -921,13 +802,11 @@ CPU, CUDA: bitwise_not_out - func: copysign.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: copysign - func: copysign_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: copysign_ @@ -938,23 +817,19 @@ CPU, CUDA: copysign_out - func: copysign.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: copysign - func: copysign_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: copysign_ - func: logical_not(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: logical_not_(Tensor(a!) self) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method - func: logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -963,11 +838,9 @@ CPU, CUDA: logical_not_out - func: logical_xor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: logical_xor_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: logical_xor.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -976,11 +849,9 @@ CPU, CUDA: logical_xor_out - func: logical_and(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: logical_and_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: logical_and.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -989,11 +860,9 @@ CPU, CUDA: logical_and_out - func: logical_or(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: logical_or_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: logical_or.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -1008,7 +877,6 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: bmm(Tensor self, Tensor mat2) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: bmm_cpu @@ -1017,7 +885,6 @@ SparseCUDA: bmm_sparse_cuda - func: _bmm(Tensor self, Tensor mat2, *, bool deterministic=False) -> Tensor - use_c10_dispatcher: full variants: function dispatch: SparseCUDA: _bmm_sparse_cuda @@ -1038,7 +905,6 @@ SparseCUDA: _bmm_out_sparse_cuda - func: broadcast_tensors(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full device_guard: False - func: broadcast_to(Tensor(a) self, int[] size) -> Tensor(a) @@ -1048,7 +914,6 @@ Math: broadcast_to - func: cat(Tensor[] tensors, int dim=0) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: cat @@ -1058,23 +923,19 @@ DefaultBackend: cat_out - func: cat.names(Tensor[] tensors, Dimname dim) -> Tensor - use_c10_dispatcher: full - func: cat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: block_diag(Tensor[] tensors) -> Tensor - use_c10_dispatcher: full variants: function - func: ceil(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: ceil - func: ceil_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: ceil_ @@ -1085,25 +946,20 @@ CPU, CUDA: ceil_out - func: chain_matmul(Tensor[] matrices) -> Tensor - use_c10_dispatcher: full variants: function - func: unsafe_chunk(Tensor self, int chunks, int dim=0) -> Tensor[] - use_c10_dispatcher: full variants: function, method device_guard: False - func: chunk(Tensor(a) self, int chunks, int dim=0) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method device_guard: False - func: tensor_split.sections(Tensor(a) self, int sections, int dim=0) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method - func: tensor_split.indices(Tensor(a) self, int[] indices, int dim=0) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method - func: tensor_split.tensor_indices_or_sections(Tensor(a) self, Tensor tensor_indices_or_sections, int dim=0) -> Tensor(a)[] @@ -1111,14 +967,12 @@ variants: function, method - func: clamp(Tensor self, Scalar? min=None, Scalar? 
max=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: clamp QuantizedCPU: clamp_quantized_cpu - func: clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: clamp_ @@ -1129,13 +983,11 @@ CPU, CUDA: clamp_out - func: clamp_max(Tensor self, Scalar max) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: clamp_max - func: clamp_max_(Tensor(a!) self, Scalar max) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: clamp_max_ @@ -1146,13 +998,11 @@ CPU, CUDA: clamp_max_out - func: clamp_min(Tensor self, Scalar min) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: clamp_min - func: clamp_min_(Tensor(a!) self, Scalar min) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: clamp_min_ @@ -1164,7 +1014,6 @@ # clip is an alias for clamp - func: clip(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: clip_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!) @@ -1175,11 +1024,9 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: cudnn_is_acceptable(Tensor self) -> bool - use_c10_dispatcher: full device_guard: False - func: complex(Tensor real, Tensor imag) -> Tensor - use_c10_dispatcher: full variants: function dispatch: DefaultBackend: complex @@ -1190,7 +1037,6 @@ CPU, CUDA: complex_out - func: polar(Tensor abs, Tensor angle) -> Tensor - use_c10_dispatcher: full variants: function dispatch: DefaultBackend: polar @@ -1201,13 +1047,11 @@ CPU, CUDA: polar_out - func: constant_pad_nd(Tensor self, int[] pad, Scalar value=0) -> Tensor - use_c10_dispatcher: full variants: function dispatch: DefaultBackend: constant_pad_nd - func: contiguous(Tensor(a) self, *, MemoryFormat memory_format=contiguous_format) -> Tensor(a) - use_c10_dispatcher: full variants: method manual_cpp_binding: True @@ -1220,7 +1064,6 @@ DefaultBackend: convolution_overrideable - func: convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - use_c10_dispatcher: full dispatch: DefaultBackend: convolution_backward_overrideable @@ -1246,12 +1089,10 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int pad=0) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: conv_tbc - func: conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int pad) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full # NB: we inherit the goofy argument order from PyTorch torch.nn.functional - func: conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, int[1] padding=0, int[1] output_padding=0, int groups=1, int[1] dilation=1) -> Tensor @@ -1264,24 +1105,20 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method device_guard: False dispatch: DefaultBackend: copy_ - func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor - use_c10_dispatcher: full dispatch: {} - func: cos(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cos - func: cos_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cos_ @@ -1292,13 +1129,11 @@ CPU, CUDA: cos_out - func: cosh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cosh - func: cosh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cosh_ @@ -1309,28 +1144,23 @@ CPU, CUDA: cosh_out - func: cosine_embedding_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor - use_c10_dispatcher: full - func: count_nonzero.dim_IntList(Tensor self, int[] dim) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: count_nonzero - func: count_nonzero(Tensor self, int? dim=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: count_nonzero - func: cudnn_affine_grid_generator(Tensor theta, int N, int C, int H, int W) -> Tensor grid - use_c10_dispatcher: full dispatch: CUDA: cudnn_affine_grid_generator_forward # TODO: Why do I have to call this grad?! - func: cudnn_affine_grid_generator_backward(Tensor grad, int N, int C, int H, int W) -> Tensor grad_theta - use_c10_dispatcher: full dispatch: CUDA: cudnn_affine_grid_generator_backward @@ -1351,27 +1181,22 @@ CUDA: cudnn_convolution_deprecated - func: cudnn_convolution.deprecated2(Tensor self, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_deprecated2 - func: cudnn_convolution(Tensor self, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution - func: cudnn_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_backward_input - func: cudnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32, bool[2] output_mask) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_backward - func: cudnn_convolution_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_backward_weight @@ -1381,45 +1206,37 @@ CUDA: cudnn_convolution_transpose_deprecated - func: cudnn_convolution_transpose.deprecated2(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose_deprecated2 - func: cudnn_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, 
int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose # NB: output_padding not strictly needed here, but it's helpful for the float # backwards - func: cudnn_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32, bool[2] output_mask) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose_backward - func: cudnn_convolution_transpose_backward_input(Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose_backward_input - func: cudnn_convolution_transpose_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - use_c10_dispatcher: full dispatch: CUDA: cudnn_convolution_transpose_backward_weight # NB: input is special cased in a way I don't quite understand - func: cudnn_grid_sampler(Tensor self, Tensor grid) -> Tensor output - use_c10_dispatcher: full dispatch: CUDA: cudnn_grid_sampler_forward - func: cudnn_grid_sampler_backward(Tensor self, Tensor grid, Tensor grad_output) -> (Tensor grad_self, Tensor grad_grid) - use_c10_dispatcher: full dispatch: CUDA: cudnn_grid_sampler_backward - func: cummax(Tensor self, int dim) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cummax @@ -1430,7 +1247,6 @@ DefaultBackend: cummax_out - func: cummax.dimname(Tensor self, Dimname dim) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: cummax.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) @@ -1444,7 +1260,6 @@ CUDA: cummax_helper_cuda - func: cummin(Tensor self, int dim) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cummin @@ -1455,7 +1270,6 @@ DefaultBackend: cummin_out - func: cummin.dimname(Tensor self, Dimname dim) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: function, method - func: cummin.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) @@ -1469,18 +1283,15 @@ CUDA: cummin_helper_cuda - func: cummaxmin_backward(Tensor grad, Tensor input, Tensor indices, int dim) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False - func: cumprod(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cumprod - func: cumprod_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: cumprod_ @@ -1491,29 +1302,24 @@ DefaultBackend: cumprod_out - func: cumprod.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: cumprod_.dimname(Tensor(a!) self, Dimname dim, *, ScalarType? dtype=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: cumprod.dimname_out(Tensor self, Dimname dim, *, ScalarType? 
dtype=None, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: cumprod_backward(Tensor grad, Tensor input, int dim) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False - func: cumsum(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: cumsum - func: cumsum_(Tensor(a!) self, int dim, *, ScalarType? dtype=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: cumsum_ @@ -1524,137 +1330,111 @@ DefaultBackend: cumsum_out - func: cumsum.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: cumsum_.dimname(Tensor(a!) self, Dimname dim, *, ScalarType? dtype=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: cumsum.dimname_out(Tensor self, Dimname dim, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: ctc_loss.IntList(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, int reduction=Mean, bool zero_infinity=False) -> Tensor - use_c10_dispatcher: full # convenience function that converts to intlists for you - func: ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank=0, int reduction=Mean, bool zero_infinity=False) -> Tensor - use_c10_dispatcher: full - func: _ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CPU: ctc_loss_cpu CUDA: ctc_loss_gpu - func: _ctc_loss_backward(Tensor grad, Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, Tensor neg_log_likelihood, Tensor log_alpha, int blank, bool zero_infinity=False) -> Tensor - use_c10_dispatcher: full dispatch: CPU: ctc_loss_backward_cpu CUDA: ctc_loss_backward_gpu - func: diag_embed(Tensor self, int offset=0, int dim1=-2, int dim2=-1) -> Tensor - use_c10_dispatcher: full variants: function, method - func: diagflat(Tensor self, int offset=0) -> Tensor - use_c10_dispatcher: full variants: function, method - func: diagonal(Tensor(a) self, int offset=0, int dim1=0, int dim2=1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: diagonal - func: diagonal.Dimname(Tensor(a) self, *, Dimname outdim, Dimname dim1, Dimname dim2, int offset=0) -> Tensor(a) - use_c10_dispatcher: full variants: function, method - func: diagonal_backward(Tensor grad, int[] input_sizes, int offset, int dim1, int dim2) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False - func: fill_diagonal_(Tensor(a!) self, Scalar fill_value, bool wrap=False) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: div.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: div SparseCPU, SparseCUDA: div_sparse - func: div_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: div_ SparseCPU, SparseCUDA: div_sparse_ - func: div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
-  use_c10_dispatcher: full
   dispatch:
     CPU, CUDA: div_out
     SparseCPU, SparseCUDA: div_out_sparse_zerodim
 
 # For C++ only, until we have conversion from C++ numbers to Tensor
 - func: div.Scalar(Tensor self, Scalar other) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: div
 
 - func: div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: method
   dispatch:
     DefaultBackend: div_
 
 # divide, alias for div
 - func: divide.Tensor(Tensor self, Tensor other) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: method
 
 - func: divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
 
 - func: divide.Scalar(Tensor self, Scalar other) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: method
 
 # true_divide, an alias for div
 - func: true_divide.Tensor(Tensor self, Tensor other) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: true_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: method
 
 - func: true_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
 
 - func: true_divide.Scalar(Tensor self, Scalar other) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: true_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: method
 
 - func: dot(Tensor self, Tensor tensor) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU: dot
@@ -1666,7 +1446,6 @@
     DefaultBackend: dot_out
 
 - func: vdot(Tensor self, Tensor other) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU: vdot
@@ -1678,30 +1457,24 @@
     DefaultBackend: vdot_out
 
 - func: einsum(str equation, Tensor[] tensors) -> Tensor
-  use_c10_dispatcher: full
 
 - func: embedding(Tensor weight, Tensor indices, int padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     DefaultBackend: embedding
 
 - func: embedding_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq, bool sparse) -> Tensor
-  use_c10_dispatcher: full
 
 - func: embedding_dense_backward(Tensor grad_output, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     CPU: embedding_dense_backward_cpu
     CUDA: embedding_dense_backward_cuda
 
 - func: embedding_renorm_(Tensor(a!) self, Tensor indices, float max_norm, float norm_type) -> Tensor(a!)
-  use_c10_dispatcher: full
   dispatch:
     CPU: embedding_renorm_cpu_
     CUDA: embedding_renorm_cuda_
 
 - func: embedding_sparse_backward(Tensor grad, Tensor indices, int num_weights, int padding_idx, bool scale_grad_by_freq) -> Tensor
-  use_c10_dispatcher: full
 
 # NOTE [ embedding_bag Native Functions ]
 # The `_embedding_bag.*` variants assume that input tensors except for `weight`,
@@ -1720,11 +1493,9 @@
     CUDA: _embedding_bag_forward_only_cuda
 
 - func: rowwise_prune(Tensor weight, Tensor mask, ScalarType compressed_indices_dtype) -> (Tensor, Tensor)
-  use_c10_dispatcher: full
 
 # row_stack is the alias of vstack
 - func: row_stack(Tensor[] tensors) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     Math: row_stack
@@ -1755,20 +1526,17 @@
     CUDA: _embedding_bag_dense_backward_cuda
 
 - func: _embedding_bag_per_sample_weights_backward(Tensor grad, Tensor weight, Tensor indices, Tensor offsets, Tensor offset2bag, int mode) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     CPU: _embedding_bag_per_sample_weights_backward_cpu
     CUDA: _embedding_bag_per_sample_weights_backward_cuda
 
 - func: empty_meta(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
-  use_c10_dispatcher: full
 
 - func: empty.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
   use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
   device_guard: False
 
 - func: empty.memory_format(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     CPU: empty_cpu
     CUDA: empty_cuda
@@ -1776,7 +1544,6 @@
     SparseCPU, SparseCUDA: empty_sparse
 
 - func: new_empty(Tensor self, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
-  use_c10_dispatcher: full
   variants: method
 
 - func: new_empty_strided(Tensor self, int[] size, int[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
@@ -1808,7 +1575,6 @@
     QuantizedCPU, QuantizedCUDA: empty_per_channel_affine_quantized
 
 - func: resize_(Tensor(a!) self, int[] size, *, MemoryFormat? memory_format=None) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: method
   device_guard: False
   dispatch:
@@ -1818,7 +1584,6 @@
     Meta: resize_meta_
 
 - func: empty_quantized(int[] size, Tensor qtensor) -> Tensor
-  use_c10_dispatcher: full
   variants: function
   dispatch:
     QuantizedCPU, QuantizedCUDA: empty_quantized
@@ -1832,19 +1597,16 @@
   device_guard: False
 
 - func: empty_strided(int[] size, int[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     CPU: empty_strided_cpu
     CUDA: empty_strided_cuda
 
 - func: erf(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: erf
 
 - func: erf_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: erf_
@@ -1855,13 +1617,11 @@
     CPU, CUDA: erf_out
 
 - func: erfc(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: erfc
 
 - func: erfc_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: erfc_
@@ -1872,13 +1632,11 @@
     CPU, CUDA: erfc_out
 
 - func: exp(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: exp
 
 - func: exp_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: exp_
@@ -1889,13 +1647,11 @@
     CPU, CUDA: exp_out
 
 - func: exp2(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: exp2
 
 - func: exp2_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: exp2_
@@ -1906,13 +1662,11 @@
     CPU, CUDA: exp2_out
 
 - func: expm1(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: expm1
 
 - func: expm1_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: expm1_
@@ -1923,14 +1677,12 @@
     CPU, CUDA: expm1_out
 
 - func: expand(Tensor(a) self, int[] size, *, bool implicit=False) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: method  # This is method-only to match the previous tensor API. In the future we could make this a function too.
   device_guard: False
   dispatch:
     DefaultBackend: expand
 
 - func: expand_as(Tensor(a) self, Tensor other) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: method  # This is method-only to match the previous tensor API. In the future we could make this a function too.
   device_guard: False
@@ -1953,49 +1705,39 @@
     CUDA: eye_out_cuda
 
 - func: flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: flatten.named_out_dim(Tensor(a) self, int start_dim, int end_dim, Dimname out_dim) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: flatten.using_names(Tensor(a) self, Dimname start_dim, Dimname end_dim, Dimname out_dim) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: flatten.DimnameList(Tensor(a) self, Dimname[] dims, Dimname out_dim) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: unflatten.int(Tensor(a) self, int dim, int[] sizes, Dimname[]? names=None) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: method
 
 - func: unflatten.Dimname(Tensor(a) self, Dimname dim, int[] sizes, Dimname[] names) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: method
 
 - func: fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: fill_
 
 - func: fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: fill_
 
 - func: floor(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: floor
 
 - func: floor_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: floor_
@@ -2006,14 +1748,12 @@
     CPU, CUDA: floor_out
 
 - func: floor_divide(Tensor self, Tensor other) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU, CUDA: floor_divide
     SparseCPU, SparseCUDA: floor_divide_sparse
 
 - func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: method
   dispatch:
     CPU, CUDA: floor_divide_
@@ -2026,21 +1766,17 @@
     SparseCPU, SparseCUDA: floor_divide_out_sparse_zerodim
 
 - func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: floor_divide_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: method
 
 - func: frac(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: frac
 
 - func: frac_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: frac_
@@ -2074,11 +1810,9 @@
     CPU, CUDA: gcd_out
 
 - func: gcd(Tensor self, Tensor other) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: gcd_(Tensor(a!) self, Tensor other) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: lcm.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -2087,11 +1821,9 @@
     CPU, CUDA: lcm_out
 
 - func: lcm(Tensor self, Tensor other) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: lcm_(Tensor(a!) self, Tensor other) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
 
 # NOTE [ grid_sampler Native Functions ]
@@ -2110,37 +1842,30 @@
 # Nor does it take in `align_corners` because it only supports the mode
 # `align_corners = True`.
 - func: grid_sampler(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor
-  use_c10_dispatcher: full
 
 - func: grid_sampler_2d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     CPU: grid_sampler_2d_cpu
     CUDA: grid_sampler_2d_cuda
 
 - func: grid_sampler_2d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor)
-  use_c10_dispatcher: full
   dispatch:
     CPU: grid_sampler_2d_backward_cpu
     CUDA: grid_sampler_2d_backward_cuda
 
 # See NOTE [ grid_sample CPU fallback ]
 - func: _grid_sampler_2d_cpu_fallback(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     DefaultBackend: _grid_sampler_2d_cpu_fallback
 
 - func: _grid_sampler_2d_cpu_fallback_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor)
-  use_c10_dispatcher: full
 
 - func: grid_sampler_3d(Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     CPU: grid_sampler_3d_cpu
     CUDA: grid_sampler_3d_cuda
 
 - func: grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor)
-  use_c10_dispatcher: full
   dispatch:
     CPU: grid_sampler_3d_backward_cpu
     CUDA: grid_sampler_3d_backward_cuda
@@ -2173,7 +1898,6 @@
   use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
 
 - func: hinge_embedding_loss(Tensor self, Tensor target, float margin=1.0, int reduction=Mean) -> Tensor
-  use_c10_dispatcher: full
 
 - func: group_norm(Tensor input, int num_groups, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enabled=True) -> Tensor
   use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
@@ -2191,39 +1915,53 @@
 # Real to complex forward FFT
 - func: _fft_r2c(Tensor self, int[] dim, int normalization, bool onesided) -> Tensor
-  use_c10_dispatcher: full
   variants: function
   dispatch:
     CPU: _fft_r2c_mkl
     CUDA: _fft_r2c_cufft
 
+- func: _fft_r2c.out(Tensor self, int[] dim, int normalization, bool onesided, *, Tensor(a!) out) -> Tensor(a!)
+  use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
+  variants: function
+  dispatch:
+    CPU: _fft_r2c_mkl_out
+    CUDA: _fft_r2c_cufft_out
+
 # Complex to real inverse FFT
 - func: _fft_c2r(Tensor self, int[] dim, int normalization, int last_dim_size) -> Tensor
-  use_c10_dispatcher: full
   variants: function
   dispatch:
     CPU: _fft_c2r_mkl
     CUDA: _fft_c2r_cufft
 
+- func: _fft_c2r.out(Tensor self, int[] dim, int normalization, int last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
+  use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
+  variants: function
+  dispatch:
+    CPU: _fft_c2r_mkl_out
+    CUDA: _fft_c2r_cufft_out
+
 # Standard complex to complex FFT (forward or backward)
 - func: _fft_c2c(Tensor self, int[] dim, int normalization, bool forward) -> Tensor
-  use_c10_dispatcher: full
   variants: function
   dispatch:
     CPU: _fft_c2c_mkl
     CUDA: _fft_c2c_cufft
 
+- func: _fft_c2c.out(Tensor self, int[] dim, int normalization, bool forward, *, Tensor(a!) out) -> Tensor(a!)
+  use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
+  variants: function
+  dispatch:
+    CPU: _fft_c2c_mkl_out
+    CUDA: _fft_c2c_cufft_out
+
 - func: _cufft_get_plan_cache_size(int device_index) -> int
-  use_c10_dispatcher: full
 
 - func: _cufft_get_plan_cache_max_size(int device_index) -> int
-  use_c10_dispatcher: full
 
 - func: _cufft_set_plan_cache_max_size(int device_index, int max_size) -> ()
-  use_c10_dispatcher: full
 
 - func: _cufft_clear_plan_cache(int device_index) -> ()
-  use_c10_dispatcher: full
 
 - func: index.Tensor(Tensor self, Tensor?[] indices) -> Tensor
   variants: function, method
@@ -2236,21 +1974,17 @@
 #   - Tensor Tensor::index(std::initializer_list indices)
 - func: index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: method
   dispatch:
     DefaultBackend: index_copy_
 
 - func: index_copy(Tensor self, int dim, Tensor index, Tensor source) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: index_copy_.dimname(Tensor(a!) self, Dimname dim, Tensor index, Tensor source) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: method
 
 - func: index_copy.dimname(Tensor self, Dimname dim, Tensor index, Tensor source) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: index_put_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor(a!)
@@ -2276,7 +2010,6 @@
   variants: function
 
 - func: inverse(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: inverse
@@ -2287,18 +2020,15 @@
     DefaultBackend: inverse_out
 
 - func: _inverse_helper(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function
   dispatch:
     CPU: _inverse_helper_cpu
     CUDA: _inverse_helper_cuda
 
 - func: isclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: isnan(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   device_guard: False
   dispatch:
@@ -2306,52 +2036,42 @@
     SparseCPU, SparseCUDA: isnan_sparse
 
 - func: is_distributed(Tensor self) -> bool
-  use_c10_dispatcher: full
   variants: function, method
   device_guard: False
 
 - func: is_floating_point(Tensor self) -> bool
-  use_c10_dispatcher: full
   variants: function, method
   device_guard: False
 
 - func: is_complex(Tensor self) -> bool
-  use_c10_dispatcher: full
   variants: function, method
   device_guard: False
 
 - func: isreal(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: is_nonzero(Tensor self) -> bool
-  use_c10_dispatcher: full
   variants: function, method
   device_guard: False
 
 - func: is_same_size(Tensor self, Tensor other) -> bool
-  use_c10_dispatcher: full
   variants: function, method
   device_guard: False
 
 - func: is_signed(Tensor self) -> bool
-  use_c10_dispatcher: full
   variants: function, method
   device_guard: False
 
 - func: kl_div(Tensor self, Tensor target, int reduction=Mean, *, bool log_target=False) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     DefaultBackend: kl_div
 
 - func: kl_div_backward(Tensor grad_output, Tensor self, Tensor target, int reduction=Mean, *, bool log_target=False) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     CPU: kl_div_backward_cpu
     CUDA: kl_div_backward_cuda
 
 - func: kron(Tensor self, Tensor other) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     Math: kron
@@ -2362,7 +2082,6 @@
     Math: kron_out
 
 - func: kthvalue(Tensor self, int k, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: kthvalue
@@ -2374,7 +2093,6 @@
     CUDA: kthvalue_out_cuda
 
 - func: kthvalue.dimname(Tensor self, int k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: kthvalue.dimname_out(Tensor self, int k, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
@@ -2397,13 +2115,11 @@
     CUDA: layer_norm_backward_cuda
 
 - func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: nan_to_num
 
 - func: nan_to_num_(Tensor(a!) self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: nan_to_num_
@@ -2424,35 +2140,25 @@
     MkldnnCPU: mkldnn_linear
 
 - func: fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor
-  use_c10_dispatcher: full
 
 - func: fbgemm_linear_int8_weight(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor
-  use_c10_dispatcher: full
 
 - func: fbgemm_linear_quantize_weight(Tensor input) -> (Tensor, Tensor, float, int)
-  use_c10_dispatcher: full
 
 - func: fbgemm_pack_gemm_matrix_fp16(Tensor input) -> Tensor
-  use_c10_dispatcher: full
 
 - func: fbgemm_linear_fp16_weight_fp32_activation(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor
-  use_c10_dispatcher: full
 
 - func: fbgemm_linear_fp16_weight(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor
-  use_c10_dispatcher: full
 
 - func: fbgemm_pack_quantized_matrix(Tensor input) -> Tensor
-  use_c10_dispatcher: full
 
 - func: fbgemm_pack_quantized_matrix.KN(Tensor input, int K, int N) -> Tensor
-  use_c10_dispatcher: full
 
 - func: ldexp.Tensor(Tensor self, Tensor other) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: ldexp_(Tensor(a!) self, Tensor other) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: ldexp.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -2468,13 +2174,11 @@
     CUDA: linspace_cuda_out
 
 - func: log(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: log
 
 - func: log_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: log_
@@ -2485,13 +2189,11 @@
     CPU, CUDA: log_out
 
 - func: log10(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: log10
 
 - func: log10_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: log10_
@@ -2502,14 +2204,12 @@
     CPU, CUDA: log10_out
 
 - func: log1p(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU, CUDA: log1p
     SparseCPU, SparseCUDA: log1p_sparse
 
 - func: log1p_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU, CUDA: log1p_
@@ -2522,13 +2222,11 @@
     SparseCPU, SparseCUDA: log1p_out_sparse
 
 - func: log2(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: log2
 
 - func: log2_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: log2_
@@ -2544,7 +2242,6 @@
     CPU, CUDA: logaddexp_out
 
 - func: logaddexp(Tensor self, Tensor other) -> Tensor
-  use_c10_dispatcher: full
   variants: method, function
   dispatch:
     DefaultBackend: logaddexp
@@ -2555,7 +2252,6 @@
     CPU, CUDA: logaddexp2_out
 
 - func: logaddexp2(Tensor self, Tensor other) -> Tensor
-  use_c10_dispatcher: full
   variants: method, function
   dispatch:
     DefaultBackend: logaddexp2
@@ -2611,7 +2307,6 @@
     CPU, CUDA: xlogy_out
 
 - func: logdet(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: logdet
@@ -2627,27 +2322,22 @@
 # log_softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models.
 - func: log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: _log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     CPU: log_softmax_cpu
     CUDA: log_softmax_cuda
 
 - func: _log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     CPU: log_softmax_backward_cpu
     CUDA: log_softmax_backward_cuda
 
 - func: _logcumsumexp(Tensor self, int dim) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     CPU: _logcumsumexp_cpu
     CUDA: _logcumsumexp_cuda
@@ -2659,7 +2349,6 @@
     CUDA: _logcumsumexp_out_cuda
 
 - func: logcumsumexp(Tensor self, int dim) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: logcumsumexp
@@ -2670,14 +2359,12 @@
     DefaultBackend: logcumsumexp_out
 
 - func: logcumsumexp.dimname(Tensor self, Dimname dim) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: logcumsumexp.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)
   use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
 
 - func: logsumexp(Tensor self, int[1] dim, bool keepdim=False) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: logsumexp
@@ -2688,55 +2375,44 @@
     DefaultBackend: logsumexp_out
 
 - func: logsumexp.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: logsumexp.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
   use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
 
 - func: margin_ranking_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor
-  use_c10_dispatcher: full
 
 - func: matmul(Tensor self, Tensor other) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: matmul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
 
 - func: matrix_rank.tol(Tensor self, float tol, bool symmetric=False) -> Tensor
-  use_c10_dispatcher: full
 
 - func: matrix_rank(Tensor self, bool symmetric=False) -> Tensor
-  use_c10_dispatcher: full
 
 - func: matrix_power(Tensor self, int n) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: matrix_exp(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU, CUDA: matrix_exp
 
 - func: matrix_exp_backward(Tensor self, Tensor grad) -> Tensor
-  use_c10_dispatcher: full
 
 - func: _aminmax(Tensor self) -> (Tensor, Tensor)
-  use_c10_dispatcher: full
   variants: function
   dispatch:
     CPU, CUDA: _aminmax_all
 
 - func: _aminmax.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor, Tensor)
-  use_c10_dispatcher: full
   variants: function
   dispatch:
     CPU, CUDA: _aminmax
 
 - func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     CPU, CUDA: _compute_linear_combination
@@ -2746,7 +2422,6 @@
     CPU, CUDA: _compute_linear_combination_out
 
 - func: max.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: max
@@ -2757,19 +2432,16 @@
     CPU, CUDA: max_out
 
 - func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: max.names_dim_max(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices)
   use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
 
 - func: value_selecting_reduction_backward(Tensor grad, int dim, Tensor indices, int[] sizes, bool keepdim) -> Tensor
-  use_c10_dispatcher: full
   variants: function
   device_guard: False
 
 - func: amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: amax
@@ -2781,48 +2453,38 @@
 # Return: (Tensor output, Tensor indices)
 - func: max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
-  use_c10_dispatcher: full
 
 - func: max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor
-  use_c10_dispatcher: full
 
 - func: max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
-  use_c10_dispatcher: full
 
 - func: mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     MkldnnCPU: mkldnn_max_pool2d
 
 - func: mkldnn_max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     MkldnnCPU: mkldnn_max_pool3d
 
 - func: quantized_max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     QuantizedCPU: quantized_max_pool1d
 
 - func: quantized_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     QuantizedCPU: quantized_max_pool2d
 
 - func: max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor
-  use_c10_dispatcher: full
 
 # The CPU and GPU dispatch variants are named weirdly here because otherwise there
 # are namespacing issues in C++
 - func: mean(Tensor self, *, ScalarType? dtype=None) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU, CUDA: mean_cpu_gpu
     QuantizedCPU: mean_quantized_cpu
 
 - func: mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU, CUDA: mean_cpu_gpu
@@ -2835,21 +2497,18 @@
     QuantizedCPU: mean_out_quantized_cpu
 
 - func: mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: mean.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
   use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
 
 - func: median(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU: median_cpu
     CUDA: median_cuda
 
 - func: median.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: median
@@ -2861,21 +2520,18 @@
     CUDA: median_out_cuda
 
 - func: median.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: median.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
   use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
 
 - func: nanmedian(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU: nanmedian_cpu
     CUDA: nanmedian_cuda
 
 - func: nanmedian.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: nanmedian
@@ -2887,14 +2543,12 @@
     CUDA: nanmedian_out_cuda
 
 - func: nanmedian.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: nanmedian.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
   use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
 
 - func: min.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: min
@@ -2905,14 +2559,12 @@
     CPU, CUDA: min_out
 
 - func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: min.names_dim_min(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices)
   use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
 
 - func: amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: amin
@@ -2928,13 +2580,10 @@
     DefaultBackend: mkldnn_convolution
 
 - func: mkldnn_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool bias_defined) -> Tensor
-  use_c10_dispatcher: full
 
 - func: mkldnn_convolution_backward_weights(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool bias_defined) -> (Tensor, Tensor)
-  use_c10_dispatcher: full
 
 - func: mkldnn_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
-  use_c10_dispatcher: full
   dispatch:
     DefaultBackend: mkldnn_convolution_backward
@@ -2954,22 +2603,18 @@
     CUDA: miopen_convolution
 
 - func: miopen_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     CUDA: miopen_convolution_backward_input
 
 - func: miopen_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
-  use_c10_dispatcher: full
   dispatch:
     CUDA: miopen_convolution_backward
 
 - func: miopen_convolution_backward_bias(Tensor grad_output) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     CUDA: miopen_convolution_backward_bias
 
 - func: miopen_convolution_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     CUDA: miopen_convolution_backward_weight
@@ -2981,17 +2626,14 @@
 # NB: output_padding not strictly needed here, but it's helpful for the float
 # backwards
 - func: miopen_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
-  use_c10_dispatcher: full
   dispatch:
     CUDA: miopen_convolution_transpose_backward
 
 - func: miopen_convolution_transpose_backward_input(Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     CUDA: miopen_convolution_transpose_backward_input
 
 - func: miopen_convolution_transpose_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     CUDA: miopen_convolution_transpose_backward_weight
@@ -3001,17 +2643,14 @@
     CUDA: miopen_depthwise_convolution
 
 - func: miopen_depthwise_convolution_backward_input(int[] self_size, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     CUDA: miopen_depthwise_convolution_backward_input
 
 - func: miopen_depthwise_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
-  use_c10_dispatcher: full
   dispatch:
     CUDA: miopen_depthwise_convolution_backward
 
 - func: miopen_depthwise_convolution_backward_weight(int[] weight_size, Tensor grad_output, Tensor self, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     CUDA: miopen_depthwise_convolution_backward_weight
@@ -3026,7 +2665,6 @@
     CUDA: miopen_rnn_backward
 
 - func: mm(Tensor self, Tensor mat2) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU: mm_cpu
@@ -3041,7 +2679,6 @@
     SparseCPU, SparseCUDA: _sparse_mm_out
 
 - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor
-  use_c10_dispatcher: full
 
 - func: _sparse_sparse_matmul(Tensor self, Tensor other) -> Tensor
   use_c10_dispatcher: full
@@ -3055,7 +2692,6 @@
     SparseCUDA: sparse_matrix_mask_helper_cuda
 
 - func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU, CUDA: mode
@@ -3066,14 +2702,12 @@
     DefaultBackend: mode_out
 
 - func: mode.dimname(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: mode.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)
   use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
 
 - func: mul.Tensor(Tensor self, Tensor other) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU, CUDA: mul
@@ -3081,7 +2715,6 @@
     MkldnnCPU: mkldnn_mul
 
 - func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: method
   dispatch:
     CPU, CUDA: mul_
@@ -3098,39 +2731,32 @@
 # For C++ only, until we have conversion from C++ numbers to Tensor
 - func: mul.Scalar(Tensor self, Scalar other) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: mul
 
 - func: mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: method
   dispatch:
     DefaultBackend: mul_
 
 # multiply, alias for mul
 - func: multiply.Tensor(Tensor self, Tensor other) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: multiply_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: method
 
 - func: multiply.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
   use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
 
 - func: multiply.Scalar(Tensor self, Scalar other) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: multiply_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: method
 
 - func: mv(Tensor self, Tensor vec) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU, CUDA: mv
@@ -3142,31 +2768,26 @@
     DefaultBackend: mv_out
 
 - func: mvlgamma(Tensor self, int p) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: mvlgamma
 
 - func: mvlgamma_(Tensor(a!) self, int p) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: method
   dispatch:
     DefaultBackend: mvlgamma_
 
 - func: narrow_copy(Tensor self, int dim, int start, int length) -> Tensor
-  use_c10_dispatcher: full
   variants: method
   dispatch:
     CPU, CUDA: narrow_copy_dense
     SparseCPU, SparseCUDA: narrow_copy_sparse
 
 - func: narrow(Tensor(a) self, int dim, int start, int length) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: function, method
   device_guard: False
 
 - func: narrow.Tensor(Tensor(a) self, int dim, Tensor start, int length) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: function, method
   device_guard: False
@@ -3183,7 +2804,6 @@
     CUDA: batch_norm_cuda_out
 
 - func: batch_norm_stats(Tensor input, float eps) -> (Tensor, Tensor)
-  use_c10_dispatcher: full
   dispatch:
     CUDA: batch_norm_stats_cuda
@@ -3231,10 +2851,8 @@
     CUDA: batch_norm_update_stats_cuda
 
 - func: is_vulkan_available() -> bool
-  use_c10_dispatcher: full
 
 - func: _nnpack_available() -> bool
-  use_c10_dispatcher: full
 
 - func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, int[2] padding, int[2] stride=1) -> Tensor
   use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
@@ -3243,15 +2861,12 @@
     DefaultBackend: _nnpack_spatial_convolution
 
 - func: _nnpack_spatial_convolution_backward(Tensor input, Tensor grad_output, Tensor weight, int[2] padding, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
-  use_c10_dispatcher: full
   variants: function
 
 - func: _nnpack_spatial_convolution_backward_input(Tensor input, Tensor grad_output, Tensor weight, int[2] padding) -> Tensor
-  use_c10_dispatcher: full
   variants: function
 
 - func: _nnpack_spatial_convolution_backward_weight(Tensor input, int[] weightsize, Tensor grad_output, int[2] padding) -> Tensor
-  use_c10_dispatcher: full
   variants: function
 
 - func: ones.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
@@ -3268,64 +2883,50 @@
   use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
 
 - func: pairwise_distance(Tensor x1, Tensor x2, float p=2, float eps=1e-06, bool keepdim=False) -> Tensor
-  use_c10_dispatcher: full
 
 - func: cdist(Tensor x1, Tensor x2, float p=2, int? compute_mode=None) -> Tensor
-  use_c10_dispatcher: full
 
 - func: _euclidean_dist(Tensor x1, Tensor x2) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     DefaultBackend: _euclidean_dist
 
 - func: _cdist_forward(Tensor x1, Tensor x2, float p, int? compute_mode) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     CPU, CUDA: _cdist_forward
 
 - func: _cdist_backward(Tensor grad, Tensor x1, Tensor x2, float p, Tensor cdist) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     CPU, CUDA: _cdist_backward
 
 - func: pdist(Tensor self, float p=2) -> Tensor
-  use_c10_dispatcher: full
 
 - func: _pdist_forward(Tensor self, float p=2) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     CPU, CUDA: _pdist_forward
 
 - func: _pdist_backward(Tensor grad, Tensor self, float p, Tensor pdist) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     CPU, CUDA: _pdist_backward
 
 - func: cosine_similarity(Tensor x1, Tensor x2, int dim=1, float eps=1e-08) -> Tensor
-  use_c10_dispatcher: full
   variants: function
 
 - func: permute(Tensor(a) self, int[] dims) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: method  # This is method-only to match the previous tensor API. In the future we could make this a function too.
   dispatch:
     DefaultBackend: permute
 
 - func: movedim.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: movedim.int(Tensor(a) self, int source, int destination) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: function, method
 
 # moveaxis, alias for movedim
 - func: moveaxis.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: moveaxis.int(Tensor(a) self, int source, int destination) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: function, method
 
 # Only exposed from C++ -- in Python,
@@ -3336,45 +2937,36 @@
 # behavior on Windows, for reasons I don't understand
 # (maybe related to capital letter collation somehow...)
 - func: numpy_T(Tensor(a) self) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: method
 
 - func: pixel_shuffle(Tensor self, int upscale_factor) -> Tensor
-  use_c10_dispatcher: full
 
 - func: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor
   use_c10_dispatcher: full
 
 - func: channel_shuffle(Tensor self, int groups) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     CPU: channel_shuffle
     QuantizedCPU: channel_shuffle_quantized_cpu
 
 - func: is_pinned(Tensor self) -> bool
-  use_c10_dispatcher: full
   variants: method
 
 - func: pin_memory(Tensor(a) self) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: method
 
 - func: pinverse(Tensor self, float rcond=1e-15) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: poisson_nll_loss(Tensor input, Tensor target, bool log_input, bool full, float eps, int reduction) -> Tensor
-  use_c10_dispatcher: full
   variants: function
 
 - func: rad2deg(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: rad2deg
 
 - func: rad2deg_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: rad2deg_
@@ -3385,13 +2977,11 @@
     DefaultBackend: rad2deg_out
 
 - func: deg2rad(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: deg2rad
 
 - func: deg2rad_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: deg2rad_
@@ -3508,17 +3098,14 @@
     CUDA: range_cuda_out
 
 - func: ravel(Tensor(a) self) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: reciprocal(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: reciprocal
 
 - func: reciprocal_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: reciprocal_
@@ -3529,13 +3116,11 @@
     CPU, CUDA: reciprocal_out
 
 - func: neg(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: neg
 
 - func: neg_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU, CUDA: neg_
@@ -3549,61 +3134,50 @@
 # Alias for neg
 - func: negative(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: negative_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: negative.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
   use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
 
 - func: repeat(Tensor self, int[] repeats) -> Tensor
-  use_c10_dispatcher: full
   variants: method  # This is method-only to match the previous tensor API. In the future we could make this a function too.
   dispatch:
     DefaultBackend: repeat
 
 - func: repeat_interleave.Tensor(Tensor repeats) -> Tensor
-  use_c10_dispatcher: full
   variants: function
   dispatch:
     CPU: repeat_interleave_cpu
     CUDA: repeat_interleave_cuda
 
 - func: repeat_interleave.self_Tensor(Tensor self, Tensor repeats, int? dim=None) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: repeat_interleave.self_int(Tensor self, int repeats, int? dim=None) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: reshape(Tensor(a) self, int[] shape) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: function, method
   device_guard: False
 
 - func: _mkldnn_reshape(Tensor self, int[] shape) -> Tensor
-  use_c10_dispatcher: full
   device_guard: False
   dispatch:
     MkldnnCPU: mkldnn_reshape
 
 - func: reshape_as(Tensor(a) self, Tensor other) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: method
   device_guard: False
 
 - func: round(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: round
 
 - func: round_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: round_
@@ -3615,13 +3189,10 @@
     CUDA: round_out
 
 - func: rrelu(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor
-  use_c10_dispatcher: full
 
 - func: rrelu_(Tensor(a!) self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!)
-  use_c10_dispatcher: full
 
 - func: relu(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU, CUDA: relu
@@ -3629,7 +3200,6 @@
     QuantizedCPU: relu_quantized_cpu
 
 - func: relu_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU, CUDA: relu_
@@ -3637,59 +3207,50 @@
     QuantizedCPU: relu_quantized_cpu_
 
 - func: prelu(Tensor self, Tensor weight) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU: prelu_cpu
     CUDA: prelu_cuda
 
 - func: prelu_backward(Tensor grad_output, Tensor self, Tensor weight) -> (Tensor, Tensor)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU: prelu_backward_cpu
     CUDA: prelu_backward_cuda
 
 - func: gelu(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   python_module: nn
   dispatch:
     CPU: gelu_cpu
     CUDA: gelu_cuda
 
 - func: gelu_backward(Tensor grad, Tensor self) -> Tensor
-  use_c10_dispatcher: full
   python_module: nn
   dispatch:
     CPU: gelu_backward_cpu
     CUDA: gelu_backward_cuda
 
 - func: infinitely_differentiable_gelu_backward(Tensor grad, Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function
   python_module: nn
   device_guard: False
 
 - func: hardshrink(Tensor self, Scalar lambd=0.5) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU, CUDA: hardshrink
 
 - func: hardshrink_backward(Tensor grad_out, Tensor self, Scalar lambd) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU, CUDA: hardshrink_backward
 
 - func: rsqrt(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: rsqrt
 
 - func: rsqrt_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: rsqrt_
@@ -3700,46 +3261,37 @@
     CPU, CUDA: rsqrt_out
 
 - func: select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: function, method
   device_guard: False
 
 - func: select.int(Tensor(a) self, int dim, int index) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: function, method
   device_guard: False
   dispatch:
     DefaultBackend: select
 
 - func: select_backward(Tensor grad, int[] input_sizes, int dim, int index) -> Tensor
-  use_c10_dispatcher: full
   variants: function
   device_guard: False
 
 - func: selu(Tensor self) -> Tensor
-  use_c10_dispatcher: full
 
 - func: selu_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
 
 - func: celu(Tensor self, Scalar alpha=1.0) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     DefaultBackend: celu
 
 - func: celu_(Tensor(a!) self, Scalar alpha=1.0) -> Tensor(a!)
-  use_c10_dispatcher: full
   dispatch:
     DefaultBackend: celu_
 
 - func: silu(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   python_module: nn
   dispatch:
     DefaultBackend: silu
 
 - func: silu_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   python_module: nn
   dispatch:
     DefaultBackend: silu_
@@ -3751,14 +3303,12 @@
     CPU, CUDA: silu_out
 
 - func: silu_backward(Tensor grad_output, Tensor self) -> Tensor
-  use_c10_dispatcher: full
   python_module: nn
   dispatch:
     CPU, CUDA: silu_backward
     Math: math_silu_backward
 
 - func: sigmoid(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU, CUDA: sigmoid
@@ -3766,7 +3316,6 @@
     MkldnnCPU: mkldnn_sigmoid
 
 - func: sigmoid_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU, CUDA: sigmoid_
@@ -3778,13 +3327,11 @@
     CPU, CUDA: sigmoid_out
 
 - func: logit(Tensor self, float? eps=None) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU, CUDA: logit
 
 - func: logit_(Tensor(a!) self, float? eps=None) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU, CUDA: logit_
@@ -3795,13 +3342,11 @@
     CPU, CUDA: logit_out
 
 - func: sin(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: sin
 
 - func: sin_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: sin_
@@ -3829,13 +3374,11 @@
     CPU, CUDA: sinc_out
 
 - func: sinh(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: sinh
 
 - func: sinh_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: sinh_
@@ -3857,7 +3400,6 @@
 # changing metadata of the detached tensor and expecting the original tensor to also
 # be updated.
 - func: detach(Tensor(a) self) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: detach
@@ -3866,134 +3408,112 @@
 # only be called on non-view `Variable`s. You can use `is_view()` to check
 # this. If this `Variable` is a view, throws an `std::runtime_error()`.
 - func: detach_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: detach_
 
 - func: size.int(Tensor self, int dim) -> int
-  use_c10_dispatcher: full
   variants: function
   device_guard: False
   manual_cpp_binding: True
 
 - func: size.Dimname(Tensor self, Dimname dim) -> int
-  use_c10_dispatcher: full
   variants: function, method
   device_guard: False
 
 - func: slice.Tensor(Tensor(a) self, int dim=0, int start=0, int end=9223372036854775807, int step=1) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: function, method
   device_guard: False
   dispatch:
     DefaultBackend: slice
 
 - func: slice_backward(Tensor grad, int[] input_sizes, int dim, int start, int end, int step) -> Tensor
-  use_c10_dispatcher: full
   variants: function
   device_guard: False
 
 - func: slogdet(Tensor self) -> (Tensor sign, Tensor logabsdet)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: slogdet
 
 - func: smm(Tensor self, Tensor mat2) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 # softmax allows positional dtype, unlike most operators, because kwonly is BC-breaking when loading jit models.
 - func: softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: _softmax(Tensor self, int dim, bool half_to_float) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     CPU: softmax_cpu
     CUDA: softmax_cuda
     MkldnnCPU: mkldnn_softmax
 
 - func: _softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     CPU: softmax_backward_cpu
     CUDA: softmax_backward_cuda
 
 - func: unsafe_split.Tensor(Tensor self, int split_size, int dim=0) -> Tensor[]
-  use_c10_dispatcher: full
   variants: function, method
   device_guard: False
   dispatch:
     DefaultBackend: unsafe_split
 
 - func: split.Tensor(Tensor(a) self, int split_size, int dim=0) -> Tensor(a)[]
-  use_c10_dispatcher: full
   variants: function, method
   device_guard: False
   dispatch:
     DefaultBackend: split
 
 - func: unsafe_split_with_sizes(Tensor self, int[] split_sizes, int dim=0) -> Tensor[]
-  use_c10_dispatcher: full
   variants: function, method
   device_guard: False
   dispatch:
     DefaultBackend: unsafe_split_with_sizes
 
 - func: split_with_sizes(Tensor(a) self, int[] split_sizes, int dim=0) -> Tensor(a)[]
-  use_c10_dispatcher: full
   variants: function, method
   device_guard: False
   dispatch:
     DefaultBackend: split_with_sizes
 
 - func: squeeze(Tensor(a) self) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: function, method
   device_guard: False
   dispatch:
     DefaultBackend: squeeze
 
 - func: squeeze.dim(Tensor(a) self, int dim) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: function, method
   device_guard: False
   dispatch:
     DefaultBackend: squeeze
 
 - func: squeeze.dimname(Tensor(a) self, Dimname dim) -> Tensor(a)
-  use_c10_dispatcher: full
   variants: function, method
   device_guard: False
 
 - func: squeeze_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: method
   device_guard: False
   dispatch:
     DefaultBackend: squeeze_
 
 - func: squeeze_.dim(Tensor(a!) self, int dim) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: method
   device_guard: False
   dispatch:
     DefaultBackend: squeeze_
 
 - func: squeeze_.dimname(Tensor(a!) self, Dimname dim) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: method
   device_guard: False
 
 - func: sspaddmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: sspaddmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
@@ -4005,7 +3525,6 @@
     SparseCUDA: _sspaddmm_out_cuda
 
 - func: stack(Tensor[] tensors, int dim=0) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     DefaultBackend: stack
@@ -4015,19 +3534,16 @@
     DefaultBackend: stack_out
 
 - func: hstack(Tensor[] tensors) -> Tensor
-  use_c10_dispatcher: full
 
 - func: hstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
   use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
 
 - func: vstack(Tensor[] tensors) -> Tensor
-  use_c10_dispatcher: full
 
 - func: vstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
   use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
 
 - func: dstack(Tensor[] tensors) -> Tensor
-  use_c10_dispatcher: full
 
 - func: dstack.out(Tensor[] tensors, *, Tensor(a!) out) -> Tensor(a!)
   use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
@@ -4045,30 +3561,25 @@
   variants: function, method
 
 - func: stride.int(Tensor self, int dim) -> int
-  use_c10_dispatcher: full
   variants: function
   device_guard: False
   manual_cpp_binding: True
 
 - func: stride.Dimname(Tensor self, Dimname dim) -> int
-  use_c10_dispatcher: full
   variants: function, method
   device_guard: False
 
 - func: sum(Tensor self, *, ScalarType? dtype=None) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU, CUDA: sum
 
 - func: sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU, CUDA: sum
 
 - func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: sum.IntList_out(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
@@ -4080,13 +3591,11 @@
   use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
 
 - func: nansum(Tensor self, *, ScalarType? dtype=None) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU, CUDA: nansum
 
 - func: nansum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     CPU, CUDA: nansum
@@ -4097,18 +3606,15 @@
     CPU, CUDA: nansum_out
 
 - func: sum_to_size(Tensor self, int[] size) -> Tensor
-  use_c10_dispatcher: full
   variants: method
   device_guard: False
 
 - func: sqrt(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: sqrt
 
 - func: sqrt_(Tensor(a!) self) -> Tensor(a!)
-  use_c10_dispatcher: full
   variants: function, method
   dispatch:
     DefaultBackend: sqrt_
@@ -4119,39 +3625,32 @@
     CPU, CUDA: sqrt_out
 
 - func: square(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
 
 - func: square_(Tensor(a!) self) -> Tensor(a!)
- use_c10_dispatcher: full variants: function, method - func: std(Tensor self, bool unbiased=True) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: std - func: std.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: std - func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: std_mean - func: std_mean.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: std_mean - func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function - func: std.out(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) @@ -4160,20 +3659,17 @@ CPU, CUDA: std_out - func: std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: std.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: prod - func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: prod @@ -4184,34 +3680,29 @@ CPU, CUDA: prod_out - func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function, method - func: prod.Dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: t(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full device_guard: False variants: function, method dispatch: DefaultBackend: t - func: t_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full device_guard: False variants: method dispatch: DefaultBackend: t_ - func: tan(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: tan - func: tan_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: tan_ @@ -4222,14 +3713,12 @@ CPU, CUDA: tan_out - func: tanh(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: tanh QuantizedCPU: tanh_quantized_cpu - func: tanh_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: tanh_ @@ -4240,7 +3729,6 @@ CPU, CUDA: tanh_out - func: tensordot(Tensor self, Tensor other, int[] dims_self, int[] dims_other) -> Tensor - use_c10_dispatcher: full variants: function - func: tensordot.out(Tensor self, Tensor other, int[] dims_self, int[] dims_other, *, Tensor(a!) out) -> Tensor(a!) @@ -4251,7 +3739,6 @@ # TODO: namespace threshold in 'nn' - func: threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: threshold @@ -4259,7 +3746,6 @@ QuantizedCPU: threshold_quantized_cpu - func: threshold_(Tensor(a!) self, Scalar threshold, Scalar value) -> Tensor(a!) 
- use_c10_dispatcher: full variants: function dispatch: CPU: threshold_ @@ -4272,69 +3758,57 @@ CUDA: threshold_out_cuda - func: threshold_backward(Tensor grad_output, Tensor self, Scalar threshold) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: threshold_backward CUDA: threshold_backward_cuda - func: tile(Tensor self, int[] dims) -> Tensor - use_c10_dispatcher: full variants: function, method - func: transpose.int(Tensor(a) self, int dim0, int dim1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: transpose - func: transpose.Dimname(Tensor(a) self, Dimname dim0, Dimname dim1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False - func: _mkldnn_transpose(Tensor self, int dim0, int dim1) -> Tensor - use_c10_dispatcher: full device_guard: False dispatch: MkldnnCPU: mkldnn_transpose - func: transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: DefaultBackend: transpose_ - func: _mkldnn_transpose_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!) - use_c10_dispatcher: full device_guard: False dispatch: MkldnnCPU: mkldnn_transpose_ - func: one_hot(Tensor self, int num_classes=-1) -> Tensor - use_c10_dispatcher: full python_module: nn variants: function - func: flip(Tensor self, int[] dims) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, QuantizedCPU: flip_cpu CUDA: flip_cuda - func: fliplr(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: flipud(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: roll(Tensor self, int[1] shifts, int[1] dims=[]) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: roll_cpu @@ -4343,33 +3817,26 @@ # default int[] value [0,1] should not add space after comma, since codegen parser uses ', ' to split args - func: rot90(Tensor self, int k=1, int[] dims=[0,1]) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: rot90 - func: trapz.x(Tensor y, Tensor x, *, int dim=-1) -> Tensor - use_c10_dispatcher: full - func: trapz.dx(Tensor y, *, float dx=1, int dim=-1) -> Tensor - use_c10_dispatcher: full - func: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: _trilinear - func: triplet_margin_loss(Tensor anchor, Tensor positive, Tensor negative, float margin=1.0, float p=2, float eps=1e-06, bool swap=False, int reduction=Mean) -> Tensor - use_c10_dispatcher: full - func: trunc(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: trunc - func: trunc_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: trunc_ @@ -4381,47 +3848,39 @@ # Alias for trunc - func: fix(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method - func: fix_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method - func: fix.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: type_as(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method - func: _has_compatible_shallow_copy_type(Tensor self, Tensor from) -> bool - use_c10_dispatcher: full variants: function - func: _unique(Tensor self, bool sorted=True, bool return_inverse=False) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _unique_cpu CUDA: _unique_cuda - func: unique_dim(Tensor self, int dim, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: unique_dim_cpu CUDA: unique_dim_cuda - func: unique_consecutive(Tensor self, bool return_inverse=False, bool return_counts=False, int? dim=None) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: unique_consecutive_cpu CUDA: unique_consecutive_cuda - func: unique_dim_consecutive(Tensor self, int dim, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: unique_dim_consecutive_cpu @@ -4432,42 +3891,35 @@ # Please don't rely on these two operators, they will be removed soon - func: _unique2(Tensor self, bool sorted=True, bool return_inverse=False, bool return_counts=False) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _unique2_cpu CUDA: _unique2_cuda - func: _unsafe_view(Tensor self, int[] size) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: _unsafe_view - func: unsqueeze(Tensor(a) self, int dim) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False dispatch: DefaultBackend: unsqueeze - func: unsqueeze_(Tensor(a!) self, int dim) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: DefaultBackend: unsqueeze_ - func: vander(Tensor x, int? N=None, bool increasing=False) -> Tensor - use_c10_dispatcher: full - func: var(Tensor self, bool unbiased=True) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: var - func: var.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: var @@ -4478,30 +3930,25 @@ CPU, CUDA: var_out - func: var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: var.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: var_mean - func: var_mean.dim(Tensor self, int[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: var_mean - func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function - func: view_as(Tensor(a) self, Tensor other) -> Tensor(a) - use_c10_dispatcher: full variants: method device_guard: False @@ -4509,55 +3956,44 @@ # this allows us to implicitly calculate the broadcast derivative, while only dealing with the # _s_where derivative. 
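The comment above notes that `where` expands `condition`, `self`, and `other` to a common shape and then defers to `_s_where`, so only the non-broadcast case needs a handwritten derivative. A small illustration of that broadcasting behaviour from the Python side (shapes are arbitrary, chosen only for the example):

    import torch

    cond = torch.tensor([[True], [False]])      # shape (2, 1)
    x = torch.randn(2, 3, requires_grad=True)   # shape (2, 3)
    y = torch.zeros(1, 3)                       # shape (1, 3)

    # condition, self and other are broadcast to (2, 3) before the elementwise
    # select, so autograd only differentiates the already-expanded _s_where call.
    out = torch.where(cond, x, y)
    out.sum().backward()
    print(out.shape, x.grad.shape)   # torch.Size([2, 3]) torch.Size([2, 3])
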
- func: where.self(Tensor condition, Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function - func: where.ScalarOther(Tensor condition, Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function - func: where.Scalar(Tensor condition, Scalar self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: function - func: where(Tensor condition) -> Tensor[] - use_c10_dispatcher: full variants: function - func: _s_where(Tensor condition, Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: _s_where - func: norm_except_dim(Tensor v, int pow=2, int dim=0) -> Tensor - use_c10_dispatcher: full variants: function # VariableType::_weight_norm does not want to be given a gap in the autograd graph, # so we don't define "dispatch" variants for it. - func: _weight_norm(Tensor v, Tensor g, int dim=0) -> Tensor - use_c10_dispatcher: full variants: function - func: _weight_norm_cuda_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CUDA: weight_norm_cuda - func: _weight_norm_cuda_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CUDA: weight_norm_cuda_backward - func: _weight_norm_differentiable_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function - func: zeros.names(int[] size, *, Dimname[]? names, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -4574,40 +4010,34 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: _standard_gamma_grad(Tensor self, Tensor output) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _standard_gamma_grad_cpu CUDA: _standard_gamma_grad_cuda - func: _standard_gamma(Tensor self, Generator? generator=None) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _s_gamma_cpu CUDA: _s_gamma_cuda - func: _dirichlet_grad(Tensor x, Tensor alpha, Tensor total) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _dirichlet_grad_cpu CUDA: _dirichlet_grad_cuda - func: _sample_dirichlet(Tensor self, Generator? generator=None) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: _s_dirichlet_cpu CUDA: _s_dirichlet_cuda - func: poisson(Tensor self, Generator? generator=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _s_poisson_cpu CUDA: _s_poisson_cuda - func: binomial(Tensor count, Tensor prob, Generator? generator=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _s_binomial_cpu CUDA: _s_binomial_cuda @@ -4616,96 +4046,77 @@ # complicated - func: native_norm(Tensor self, Scalar p=2) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU, SparseCUDA: norm_sparse - func: native_norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? 
dtype) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU, SparseCUDA: norm_sparse # TODO: reduce signatures down to one when optional args is available - func: _sparse_sum(Tensor self) -> Tensor - use_c10_dispatcher: full - func: _sparse_sum.dtype(Tensor self, *, ScalarType dtype) -> Tensor - use_c10_dispatcher: full - func: _sparse_sum.dim(Tensor self, int[1] dim) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: _sparse_sum - func: _sparse_sum.dim_dtype(Tensor self, int[1] dim, *, ScalarType dtype) -> Tensor - use_c10_dispatcher: full - func: _sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU: _sparse_sum_backward_cpu SparseCUDA: _sparse_sum_backward_cuda - func: _sparse_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function - func: _sparse_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function - func: _sparse_softmax(Tensor self, int dim, bool half_to_float) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU: softmax_sparse_cpu SparseCUDA: softmax_sparse_cuda - func: _sparse_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU: softmax_backward_sparse_cpu SparseCUDA: softmax_backward_sparse_cuda - func: _sparse_log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function - func: _sparse_log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full variants: function - func: _sparse_log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU: log_softmax_sparse_cpu SparseCUDA: log_softmax_sparse_cuda - func: _sparse_log_softmax_backward_data(Tensor grad_output, Tensor output, int dim, Tensor self) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU: log_softmax_backward_sparse_cpu SparseCUDA: log_softmax_backward_sparse_cuda - func: norm.ScalarOpt_dtype(Tensor self, Scalar? p, *, ScalarType dtype) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: norm - func: norm.Scalar(Tensor self, Scalar p=2) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: norm - func: norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: norm - func: norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: norm @@ -4721,11 +4132,9 @@ CPU, CUDA: norm_out - func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor - use_c10_dispatcher: full variants: function, method - func: norm.names_ScalarOpt_dim(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function, method - func: norm.names_dtype_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!) 
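The several `norm.*` schemas above map to different Python call patterns: roughly, a bare `p` reaches `norm.Scalar`, adding `dim` selects the `ScalarOpt_dim` overloads, and an explicit `dtype` selects the `*_dtype` variants. A minimal sketch, with illustrative values only:

    import torch

    x = torch.randn(4, 5)

    a = torch.norm(x, p=2)                        # full reduction    -> norm.Scalar
    b = torch.norm(x, p=2, dim=1)                 # per-dim reduction -> norm.ScalarOpt_dim
    c = torch.norm(x, p=2, dim=1, keepdim=True,
                   dtype=torch.float64)           # explicit dtype    -> norm.ScalarOpt_dim_dtype
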
@@ -4735,11 +4144,9 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: frobenius_norm(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function - func: frobenius_norm.dim(Tensor self, int[1] dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function - func: frobenius_norm.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) @@ -4747,7 +4154,6 @@ variants: function - func: nuclear_norm(Tensor self, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function - func: nuclear_norm.out(Tensor self, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) @@ -4755,7 +4161,6 @@ variants: function - func: nuclear_norm.dim(Tensor self, int[2] dim, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: function - func: nuclear_norm.dim_out(Tensor self, int[2] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) @@ -4763,7 +4168,6 @@ variants: function - func: clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: clone @@ -4772,13 +4176,11 @@ QuantizedCPU, QuantizedCUDA: quantized_clone - func: resize_as_(Tensor(a!) self, Tensor the_template, *, MemoryFormat? memory_format=None) -> Tensor(a!) - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: resize_as_ - func: zero_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: zero_ @@ -4792,14 +4194,12 @@ SparseCPU, SparseCUDA: sub_out_sparse - func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: sub SparseCPU, SparseCUDA: sub_sparse - func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: sub_ @@ -4807,13 +4207,11 @@ # For C++ only, until we have conversion from C++ numbers to Tensor - func: sub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: sub - func: sub_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: sub_ @@ -4823,24 +4221,19 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: subtract.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method - func: subtract_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method # For C++ only, until we have conversion from C++ numbers to Tensor - func: subtract.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method - func: subtract_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: rsub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: rsub @@ -4851,7 +4244,6 @@ CPU, CUDA: heaviside_out - func: heaviside(Tensor self, Tensor values) -> Tensor - use_c10_dispatcher: full variants: function, method - func: heaviside_(Tensor(a!) self, Tensor values) -> Tensor(a!) 
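`sub` keeps separate Tensor and Scalar schemas plus the `subtract` aliases registered above, and `heaviside` is a plain elementwise op. A short usage sketch, assuming a recent PyTorch build:

    import torch

    x = torch.tensor([1.0, 4.0, 9.0])
    y = torch.ones(3)

    x.sub(y, alpha=2)                          # sub.Tensor : x - 2 * y
    x.sub(3)                                   # sub.Scalar : x - 3
    torch.subtract(x, y)                       # alias registered above for sub
    torch.heaviside(x - 4, torch.tensor(0.5))  # 0 below zero, 0.5 at zero, 1 above -> [0.0, 0.5, 1.0]
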
@@ -4860,7 +4252,6 @@ # For C++ only, until we have conversion from C++ numbers to Tensor - func: rsub.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function dispatch: DefaultBackend: rsub @@ -4868,7 +4259,6 @@ # Functionally the same as addmm, but we give it a different derivative formula # that doesn't propagate gradients to non-present entries on sparse. - func: _sparse_addmm(Tensor self, Tensor sparse, Tensor dense, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full dispatch: DefaultBackend: _sparse_addmm @@ -4881,7 +4271,6 @@ SparseCUDA: addmm_out_sparse_dense_cuda - func: addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU: addmm_cpu @@ -4890,7 +4279,6 @@ SparseCUDA: addmm_sparse_dense_cuda - func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: addmm_cpu_ @@ -5024,49 +4412,40 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: _validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size) -> () - use_c10_dispatcher: full - func: _sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU, SparseCUDA: new_with_dims_sparse - func: _sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, int[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU, SparseCUDA: new_with_dims_and_tensor_sparse - func: sparse_resize_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: sparse_resize_ - func: sparse_resize_and_clear_(Tensor(a!) self, int[] size, int sparse_dim, int dense_dim) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: sparse_resize_and_clear_ - func: sparse_mask(Tensor self, Tensor mask) -> Tensor - use_c10_dispatcher: full variants: method dispatch: SparseCPU: sparse_mask_cpu SparseCUDA: sparse_mask_cuda - func: to_dense(Tensor self, ScalarType? 
dtype=None) -> Tensor - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: sparse_to_dense MkldnnCPU: mkldnn_to_dense - func: to_dense_backward(Tensor grad, Tensor input) -> Tensor - use_c10_dispatcher: full - func: sparse_dim(Tensor self) -> int - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: sparse_dim_sparse @@ -5074,14 +4453,12 @@ # legacy method - func: _dimI(Tensor self) -> int - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: sparse_dim_sparse device_guard: False - func: dense_dim(Tensor self) -> int - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: dense_dim_sparse @@ -5089,42 +4466,36 @@ # legacy method - func: _dimV(Tensor self) -> int - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: dense_dim_sparse device_guard: False - func: _nnz(Tensor self) -> int - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: _nnz_sparse device_guard: False - func: coalesce(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method dispatch: SparseCPU: coalesce_sparse_cpu SparseCUDA: coalesce_sparse_cuda - func: is_coalesced(Tensor self) -> bool - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: is_coalesced_sparse device_guard: False - func: _indices(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: _indices_sparse device_guard: False - func: _values(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: _values_sparse @@ -5134,21 +4505,18 @@ # a bit unsafe. Similar to _indices and _values, this is useful for implementing # custom sparse operations in Python/C++ extension. - func: _coalesced_(Tensor(a!) self, bool coalesced) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: _coalesced_sparse_ device_guard: False - func: indices(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: indices_sparse device_guard: False - func: values(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method dispatch: SparseCPU, SparseCUDA: values_sparse @@ -5161,196 +4529,161 @@ SparseCUDA: hspmm_out_sparse_cuda - func: hspmm(Tensor mat1, Tensor mat2) -> Tensor - use_c10_dispatcher: full dispatch: SparseCPU: hspmm_sparse_cpu SparseCUDA: hspmm_sparse_cuda - func: copy_sparse_to_sparse_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!) - use_c10_dispatcher: full variants: function dispatch: SparseCPU, SparseCUDA: copy_sparse_ - func: unbind.int(Tensor(a) self, int dim=0) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: unbind - func: unbind.Dimname(Tensor(a) self, Dimname dim) -> Tensor(a)[] - use_c10_dispatcher: full variants: function, method - func: to_sparse.sparse_dim(Tensor self, int sparse_dim) -> Tensor - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: dense_to_sparse - func: to_sparse(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: dense_to_sparse - func: to_mkldnn(Tensor self, ScalarType? 
dtype=None) -> Tensor - use_c10_dispatcher: full variants: method dispatch: CPU: dense_to_mkldnn - func: mkldnn_reorder_conv2d_weight(Tensor self, int[2] padding=0, int[2] stride=1, int[2] dilation=1, int groups=1) -> Tensor - use_c10_dispatcher: full variants: function python_module: nn dispatch: MkldnnCPU: mkldnn_reorder_conv2d_weight - func: mkldnn_reorder_conv3d_weight(Tensor self, int[3] padding=0, int[3] stride=1, int[3] dilation=1, int groups=1) -> Tensor - use_c10_dispatcher: full variants: function python_module: nn dispatch: MkldnnCPU: mkldnn_reorder_conv3d_weight - func: to_mkldnn_backward(Tensor grad, Tensor input) -> Tensor - use_c10_dispatcher: full - func: quantize_per_tensor(Tensor self, float scale, int zero_point, ScalarType dtype) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: quantize_per_tensor - func: quantize_per_tensor.tensors(Tensor[] tensors, Tensor scales, Tensor zero_points, ScalarType dtype) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: quantize_per_tensor_list_cpu - func: quantize_per_channel(Tensor self, Tensor scales, Tensor zero_points, int axis, ScalarType dtype) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU: quantize_per_channel_cpu - func: dequantize.self(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU, QuantizedCUDA: dequantize_quant - func: dequantize.tensors(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: QuantizedCPU: dequantize_tensors_quantized_cpu - func: q_scale(Tensor self) -> float - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU, QuantizedCUDA: q_scale_quant - func: q_zero_point(Tensor self) -> int - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU, QuantizedCUDA: q_zero_point_quant - func: q_per_channel_scales(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU, QuantizedCUDA: q_per_channel_scales - func: q_per_channel_zero_points(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU, QuantizedCUDA: q_per_channel_zero_points - func: q_per_channel_axis(Tensor self) -> int - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU, QuantizedCUDA: q_per_channel_axis - func: int_repr(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: QuantizedCPU: int_repr_quantized_cpu QuantizedCUDA: int_repr_quantized_cuda - func: _make_per_tensor_quantized_tensor(Tensor self, float scale, int zero_point) -> Tensor - use_c10_dispatcher: full dispatch: CPU: make_per_tensor_quantized_tensor_cpu CUDA: make_per_tensor_quantized_tensor_cuda - func: _make_per_channel_quantized_tensor(Tensor self, Tensor scale, Tensor zero_point, int axis) -> Tensor - use_c10_dispatcher: full dispatch: CPU: make_per_channel_quantized_tensor_cpu - func: qscheme(Tensor self) -> QScheme - use_c10_dispatcher: full variants: method dispatch: QuantizedCPU, QuantizedCUDA: qscheme_quant - func: fake_quantize_per_tensor_affine(Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: fake_quantize_per_tensor_affine - func: fake_quantize_per_tensor_affine_backward(Tensor grad, Tensor self, float scale, int zero_point, int quant_min, int quant_max) -> Tensor - use_c10_dispatcher: full variants: function - func: 
_fake_quantize_learnable_per_tensor_affine(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: _fake_quantize_learnable_per_tensor_affine - func: _fake_quantize_learnable_per_tensor_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function - func: fake_quantize_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: fake_quantize_per_channel_affine - func: fake_quantize_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor - use_c10_dispatcher: full variants: function - func: _fake_quantize_learnable_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: _fake_quantize_learnable_per_channel_affine - func: _fake_quantize_learnable_per_channel_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function - func: _choose_qparams_per_tensor(Tensor self, bool reduce_range=False) -> (float, int) - use_c10_dispatcher: full variants: function - func: _saturate_weight_to_fp16(Tensor weight) -> Tensor - use_c10_dispatcher: full variants: function - func: choose_qparams_optimized(Tensor input, int numel, int n_bins, float ratio, int bit_width) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function # to(Device) must not exist because all constructors of Device also works for @@ -5362,61 +4695,47 @@ device_guard: False - func: to.device(Tensor self, Device device, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor - use_c10_dispatcher: full variants: method device_guard: False - func: to.dtype(Tensor self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor - use_c10_dispatcher: full variants: method device_guard: False - func: to.other(Tensor self, Tensor other, bool non_blocking=False, bool copy=False, MemoryFormat? 
memory_format=None) -> Tensor - use_c10_dispatcher: full variants: method device_guard: False - func: meshgrid(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - func: cartesian_prod(Tensor[] tensors) -> Tensor - use_c10_dispatcher: full variants: function - func: combinations(Tensor self, int r=2, bool with_replacement=False) -> Tensor - use_c10_dispatcher: full variants: function - func: item(Tensor self) -> Scalar - use_c10_dispatcher: full variants: method - func: result_type.Tensor(Tensor tensor, Tensor other) -> ScalarType - use_c10_dispatcher: full variants: function - func: result_type.Scalar(Tensor tensor, Scalar other) -> ScalarType - use_c10_dispatcher: full variants: function - func: result_type.Scalar_Tensor(Scalar scalar, Tensor tensor) -> ScalarType - use_c10_dispatcher: full variants: function - func: result_type.Scalar_Scalar(Scalar scalar1, Scalar scalar2) -> ScalarType - use_c10_dispatcher: full - func: can_cast(ScalarType from, ScalarType to) -> bool - use_c10_dispatcher: full variants: function - func: promote_types(ScalarType type1, ScalarType type2) -> ScalarType - use_c10_dispatcher: full variants: function # NB: Does NOT check precondition that numel == 1 - func: _local_scalar_dense(Tensor self) -> Scalar - use_c10_dispatcher: full dispatch: CPU: _local_scalar_dense_cpu CUDA: _local_scalar_dense_cuda @@ -5442,7 +4761,6 @@ CUDA: _thnn_fused_gru_cell_cuda - func: _thnn_fused_gru_cell_backward(Tensor grad_hy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor) - use_c10_dispatcher: full dispatch: CUDA: _thnn_fused_gru_cell_backward_cuda @@ -5451,28 +4769,20 @@ # RNN cells and layers - func: lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full - func: lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full - func: gru.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: rnn_tanh.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: rnn_tanh.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: rnn_relu.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: rnn_relu.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? 
b_hh=None) -> (Tensor, Tensor) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures @@ -5490,55 +4800,46 @@ # Quantized RNN layers # - func: quantized_lstm(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, ScalarType? dtype=None, bool use_dynamic=False) -> (Tensor, Tensor, Tensor) -# use_c10_dispatcher: full + # - func: quantized_lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, *, ScalarType? dtype=None, bool use_dynamic=False) -> (Tensor, Tensor, Tensor) -# use_c10_dispatcher: full + # Quantized GRU layers # - func: quantized_gru.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) -# use_c10_dispatcher: full +# # - func: quantized_gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) -# use_c10_dispatcher: full +# # Quantized RNN cells - func: quantized_lstm_cell(Tensor input, Tensor[] hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> (Tensor, Tensor) - use_c10_dispatcher: full - func: quantized_gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor - use_c10_dispatcher: full - func: quantized_rnn_relu_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor - use_c10_dispatcher: full - func: quantized_rnn_tanh_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor - use_c10_dispatcher: full # PackedSequence utilities - func: _pack_padded_sequence(Tensor input, Tensor lengths, bool batch_first) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: DefaultBackend: _pack_padded_sequence - func: _pack_padded_sequence_backward(Tensor grad, int[] input_size, Tensor batch_sizes, bool batch_first) -> Tensor - use_c10_dispatcher: full - func: _pad_packed_sequence(Tensor data, Tensor batch_sizes, bool batch_first, Scalar padding_value, int total_length) -> (Tensor, Tensor) - use_c10_dispatcher: full # wrappers for legacy TH methods - func: set_.source_Storage(Tensor(a!) self, Storage source) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: CPU, CUDA: set_ - func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, int storage_offset, int[] size, int[] stride=[]) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False dispatch: @@ -5547,61 +4848,51 @@ QuantizedCPU, QuantizedCUDA: set_storage_quantized_ - func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method device_guard: False dispatch: CPU, CUDA: set_tensor_ - func: set_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: set_cpu_ CUDA: set_cuda_ - func: is_set_to(Tensor self, Tensor tensor) -> bool - use_c10_dispatcher: full variants: method device_guard: False dispatch: CPU, CUDA: is_set_to - func: masked_fill_.Scalar(Tensor(a!) self, Tensor mask, Scalar value) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: masked_fill__cpu CUDA: masked_fill__cuda - func: masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: masked_fill_.Tensor(Tensor(a!) self, Tensor mask, Tensor value) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: masked_fill__cpu CUDA: masked_fill__cuda - func: masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: masked_scatter_(Tensor(a!) self, Tensor mask, Tensor source) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: masked_scatter__cpu CUDA: masked_scatter__cuda - func: masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor - use_c10_dispatcher: full variants: function, method - func: view(Tensor(a) self, int[] size) -> Tensor(a) - use_c10_dispatcher: full variants: method device_guard: False dispatch: @@ -5609,126 +4900,101 @@ MkldnnCPU: mkldnn_view - func: put_(Tensor(a!) self, Tensor index, Tensor source, bool accumulate=False) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: legacy::cpu::_th_put_ CUDA: legacy::cuda::_th_put_ - func: index_add_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: index_add_cpu_ CUDA: index_add_cuda_ - func: index_add(Tensor self, int dim, Tensor index, Tensor source) -> Tensor - use_c10_dispatcher: full variants: function, method - func: index_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor source) -> Tensor - use_c10_dispatcher: full variants: function, method - func: index_fill_.int_Scalar(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: legacy::cpu::_th_index_fill_ CUDA: legacy::cuda::_th_index_fill_ - func: index_fill.int_Scalar(Tensor self, int dim, Tensor index, Scalar value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: index_fill_.int_Tensor(Tensor(a!) self, int dim, Tensor index, Tensor value) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: index_fill_ - func: index_fill.int_Tensor(Tensor self, int dim, Tensor index, Tensor value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: index_fill_.Dimname_Scalar(Tensor(a!) self, Dimname dim, Tensor index, Scalar value) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: index_fill_.Dimname_Tensor(Tensor(a!) self, Dimname dim, Tensor index, Tensor value) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: index_fill.Dimname_Scalar(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: index_fill.Dimname_Tensor(Tensor self, Dimname dim, Tensor index, Tensor value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: scatter_.src(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: scatter_ - func: scatter.src(Tensor self, int dim, Tensor index, Tensor src) -> Tensor - use_c10_dispatcher: full variants: function, method - func: scatter_.value(Tensor(a!) self, int dim, Tensor index, Scalar value) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: scatter_fill_ - func: scatter.value(Tensor self, int dim, Tensor index, Scalar value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: scatter.dimname_src(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor - use_c10_dispatcher: full variants: function, method - func: scatter.dimname_value(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor - use_c10_dispatcher: full variants: function, method - func: scatter_.reduce(Tensor(a!) self, int dim, Tensor index, Tensor src, *, str reduce) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: scatter_reduce_ - func: scatter_.value_reduce(Tensor(a!) self, int dim, Tensor index, Scalar value, *, str reduce) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: scatter_scalar_reduce_ - func: scatter_add_(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: scatter_add_ - func: scatter_add(Tensor self, int dim, Tensor index, Tensor src) -> Tensor - use_c10_dispatcher: full variants: function, method - func: scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor - use_c10_dispatcher: full variants: function, method - func: eq_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: eq_ - func: eq_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: eq_ @@ -5746,35 +5012,27 @@ CPU, CUDA: bitwise_and_out - func: bitwise_and.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: bitwise_and.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: bitwise_and_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: bitwise_and_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: __and__.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: __and__.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: __iand__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: __iand__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: bitwise_or.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -5790,35 +5048,27 @@ CPU, CUDA: bitwise_or_out - func: bitwise_or.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: bitwise_or.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: bitwise_or_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: bitwise_or_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method - func: __or__.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: __or__.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: __ior__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: __ior__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: bitwise_xor.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -5834,181 +5084,149 @@ CPU, CUDA: bitwise_xor_out - func: bitwise_xor.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: bitwise_xor.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: bitwise_xor_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: bitwise_xor_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: __xor__.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: __xor__.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: __ixor__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: __ixor__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: __lshift__.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: __lshift__ - func: __lshift__.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: __lshift__ - func: __ilshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: __ilshift__ - func: __ilshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: __ilshift__ - func: __rshift__.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: __rshift__ - func: __rshift__.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: __rshift__ - func: __irshift__.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: __irshift__ - func: __irshift__.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: __irshift__ - func: lgamma_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: _lgamma__cpu CUDA: _lgamma__cuda - func: atan2_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: atan2_ - func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: tril_cpu_ CUDA: tril_cuda_ - func: triu_(Tensor(a!) self, int diagonal=0) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: triu_cpu_ CUDA: triu_cuda_ - func: digamma_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: digamma_ - func: polygamma_(Tensor(a!) self, int n) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method - func: renorm_(Tensor(a!) self, Scalar p, int dim, Scalar maxnorm) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: legacy::cpu::_th_renorm_ CUDA: legacy::cuda::_th_renorm_ - func: pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: pow_ - func: pow_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: pow_ - func: lerp_.Scalar(Tensor(a!) self, Tensor end, Scalar weight) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: lerp_cpu_scalar_ CUDA: lerp_cuda_scalar_ - func: lerp_.Tensor(Tensor(a!) self, Tensor end, Tensor weight) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU: lerp_cpu_tensor_ CUDA: lerp_cuda_tensor_ - func: fmod_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: fmod_ - func: fmod_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: fmod_ - func: remainder_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: remainder_ - func: remainder_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: remainder_ - func: addbmm_(Tensor(a!) self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: addbmm_ @@ -6019,61 +5237,51 @@ CPU, CUDA: addbmm_out - func: addbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: addbmm - func: addcdiv_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: addcdiv_ - func: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: random_ - func: random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: random_ - func: random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: random_ - func: uniform_(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: uniform_ - func: cauchy_(Tensor(a!) self, float median=0, float sigma=1, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: cauchy_ - func: log_normal_(Tensor(a!) self, float mean=1, float std=2, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: log_normal_ - func: exponential_(Tensor(a!) self, float lambd=1, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: exponential_ - func: geometric_(Tensor(a!) self, float p, *, Generator? generator=None) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: geometric_ @@ -6087,13 +5295,11 @@ CUDA: diag_cuda_out - func: diag(Tensor self, int diagonal=0) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: diag - func: diag_backward(Tensor grad, int[] input_sizes, int diagonal) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False @@ -6103,7 +5309,6 @@ CPU, CUDA: cross_out - func: cross(Tensor self, Tensor other, int? dim=None) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: cross @@ -6115,7 +5320,6 @@ CUDA: triu_cuda_out - func: triu(Tensor self, int diagonal=0) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: triu @@ -6127,32 +5331,27 @@ CUDA: tril_cuda_out - func: tril(Tensor self, int diagonal=0) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: tril - func: tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU: tril_indices_cpu CUDA: tril_indices_cuda - func: triu_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU: triu_indices_cpu CUDA: triu_indices_cuda - func: trace(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: trace_cpu CUDA: trace_cuda - func: trace_backward(Tensor grad, int[] sizes) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False @@ -6163,7 +5362,6 @@ QuantizedCPU: ne_out_quantized_cpu - func: ne.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: ne @@ -6176,20 +5374,17 @@ QuantizedCPU: ne_out_quantized_cpu - func: ne.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: ne QuantizedCPU: ne_quantized_cpu - func: ne_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: ne_ - func: ne_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: ne_ @@ -6199,22 +5394,18 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: not_equal.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: not_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: not_equal.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: not_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: not_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: eq.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
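The `random_`, `uniform_`, `cauchy_`, `log_normal_`, `exponential_`, and `geometric_` schemas above are the in-place samplers exposed as Tensor methods; a minimal sketch of how they fill an existing tensor:

    import torch

    t = torch.empty(2, 3)

    t.uniform_(0.0, 1.0)        # uniform_     : U(0, 1)
    t.exponential_(lambd=2.0)   # exponential_ : rate-2 exponential
    t.random_(0, 10)            # random_.from : integers drawn from [0, 10)
    t.geometric_(p=0.25)        # geometric_   : success probability 0.25
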
@@ -6224,7 +5415,6 @@ QuantizedCPU: eq_out_quantized_cpu - func: eq.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: eq @@ -6237,7 +5427,6 @@ QuantizedCPU: eq_out_quantized_cpu - func: eq.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: eq @@ -6250,7 +5439,6 @@ QuantizedCPU: ge_out_quantized_cpu - func: ge.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: ge @@ -6263,20 +5451,17 @@ QuantizedCPU: ge_out_quantized_cpu - func: ge.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: ge QuantizedCPU: ge_quantized_cpu - func: ge_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: ge_ - func: ge_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: ge_ @@ -6286,22 +5471,18 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: greater_equal.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: greater_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: greater_equal.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: greater_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: greater_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: le.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) @@ -6311,7 +5492,6 @@ QuantizedCPU: le_out_quantized_cpu - func: le.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: le @@ -6324,20 +5504,17 @@ QuantizedCPU: le_out_quantized_cpu - func: le.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: le QuantizedCPU: le_quantized_cpu - func: le_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: le_ - func: le_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: le_ @@ -6347,22 +5524,18 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: less_equal.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: less_equal.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: less_equal.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: less_equal_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: less_equal_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: gt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
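Each comparison above is registered several ways: a Scalar overload, a Tensor overload, an `out=` variant, and an in-place `_` variant, plus the `greater_equal`/`less_equal` alias schemas. A small sketch of the corresponding Python calls:

    import torch

    x = torch.tensor([1, 2, 3])
    y = torch.tensor([3, 2, 1])

    x.eq(2)                     # eq.Scalar -> tensor([False,  True, False])
    x.eq(y)                     # eq.Tensor -> tensor([False,  True, False])
    torch.ge(x, y, out=torch.empty(3, dtype=torch.bool))   # ge.Tensor_out
    torch.greater_equal(x, y)   # alias schema for ge, registered above
    x.ge_(2)                    # ge_.Scalar, in-place: writes 0/1 into x, keeping its dtype
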
@@ -6372,7 +5545,6 @@ QuantizedCPU: gt_out_quantized_cpu - func: gt.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: gt @@ -6385,20 +5557,17 @@ QuantizedCPU: gt_out_quantized_cpu - func: gt.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: gt QuantizedCPU: gt_quantized_cpu - func: gt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: gt_ - func: gt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: gt_ @@ -6408,22 +5577,18 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: greater.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: greater.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: greater.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: greater_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: greater_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: lt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) @@ -6433,7 +5598,6 @@ QuantizedCPU: lt_out_quantized_cpu - func: lt.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: lt @@ -6446,20 +5610,17 @@ QuantizedCPU: lt_out_quantized_cpu - func: lt.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: lt QuantizedCPU: lt_quantized_cpu - func: lt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: lt_ - func: lt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: DefaultBackend: lt_ @@ -6469,22 +5630,18 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: less.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: less.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: less.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: less_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: less_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method - func: take.out(Tensor self, Tensor index, *, Tensor(a!) out) -> Tensor(a!) 
@@ -6494,14 +5651,12 @@ CUDA: take_out_cuda - func: take(Tensor self, Tensor index) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: take_cpu CUDA: take_cuda - func: take_backward(Tensor grad, Tensor input, Tensor index) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False @@ -6512,7 +5667,6 @@ CUDA: index_select_out_cuda - func: index_select(Tensor self, int dim, Tensor index) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: index_select_cpu_ @@ -6524,11 +5678,9 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: index_select.dimname(Tensor self, Dimname dim, Tensor index) -> Tensor - use_c10_dispatcher: full variants: method, function - func: index_select_backward(Tensor grad, int[] self_sizes, int dim, Tensor index) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False @@ -6539,14 +5691,12 @@ CUDA: masked_select_out_cuda - func: masked_select(Tensor self, Tensor mask) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: masked_select_cpu CUDA: masked_select_cuda - func: masked_select_backward(Tensor grad, Tensor input, Tensor mask) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False @@ -6557,14 +5707,12 @@ CUDA: nonzero_out_cuda - func: nonzero(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_nonzero CUDA: nonzero_cuda - func: nonzero_numpy(Tensor self) -> Tensor[] - use_c10_dispatcher: full variants: method, function - func: gather.out(Tensor self, int dim, Tensor index, *, bool sparse_grad=False, Tensor(a!) out) -> Tensor(a!) @@ -6574,13 +5722,11 @@ CUDA: gather_out_cpu_cuda - func: gather(Tensor self, int dim, Tensor index, *, bool sparse_grad=False) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: gather - func: gather_backward(Tensor grad, Tensor self, int dim, Tensor index, bool sparse_grad) -> Tensor - use_c10_dispatcher: full variants: function device_guard: False @@ -6588,11 +5734,9 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: gather.dimname(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: _gather_sparse_backward(Tensor self, int dim, Tensor index, Tensor grad) -> Tensor - use_c10_dispatcher: full - func: addcmul.out(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures @@ -6600,13 +5744,11 @@ CPU, CUDA: addcmul_out - func: addcmul(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: addcmul - func: addcmul_(Tensor(a!) self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor(a!) 
- use_c10_dispatcher: full variants: method dispatch: DefaultBackend: addcmul_ @@ -6617,7 +5759,6 @@ CPU, CUDA: addcdiv_out - func: addcdiv(Tensor self, Tensor tensor1, Tensor tensor2, *, Scalar value=1) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: addcdiv @@ -6629,7 +5770,6 @@ CUDA: legacy::cuda::_th_gels_out - func: lstsq(Tensor self, Tensor A) -> (Tensor solution, Tensor QR) - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_gels @@ -6641,13 +5781,11 @@ DefaultBackend: triangular_solve_out - func: triangular_solve(Tensor self, Tensor A, bool upper=True, bool transpose=False, bool unitriangular=False) -> (Tensor solution, Tensor cloned_coefficient) - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: triangular_solve - func: _triangular_solve_helper(Tensor self, Tensor A, bool upper, bool transpose, bool unitriangular) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _triangular_solve_helper_cpu @@ -6659,13 +5797,11 @@ DefaultBackend: symeig_out - func: symeig(Tensor self, bool eigenvectors=False, bool upper=True) -> (Tensor eigenvalues, Tensor eigenvectors) - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: symeig - func: _symeig_helper(Tensor self, bool eigenvectors, bool upper) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _symeig_helper_cpu @@ -6677,7 +5813,6 @@ DefaultBackend: eig_out - func: eig(Tensor self, bool eigenvectors=False) -> (Tensor eigenvalues, Tensor eigenvectors) - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: eig @@ -6688,13 +5823,11 @@ DefaultBackend: svd_out - func: svd(Tensor self, bool some=True, bool compute_uv=True) -> (Tensor U, Tensor S, Tensor V) - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: svd - func: _svd_helper(Tensor self, bool some, bool compute_uv) -> (Tensor, Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _svd_helper_cpu @@ -6702,23 +5835,19 @@ # swapaxes, alias for transpose - func: swapaxes(Tensor(a) self, int axis0, int axis1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False - func: swapaxes_(Tensor(a!) self, int axis0, int axis1) -> Tensor(a!) - use_c10_dispatcher: full variants: method device_guard: False # swapdims, alias for transpose - func: swapdims(Tensor(a) self, int dim0, int dim1) -> Tensor(a) - use_c10_dispatcher: full variants: function, method device_guard: False - func: swapdims_(Tensor(a!) self, int dim0, int dim1) -> Tensor(a!) 
- use_c10_dispatcher: full
variants: method
device_guard: False
@@ -6728,13 +5857,11 @@
DefaultBackend: cholesky_out
- func: cholesky(Tensor self, bool upper=False) -> Tensor
- use_c10_dispatcher: full
variants: method, function
dispatch:
DefaultBackend: cholesky
- func: _cholesky_helper(Tensor self, bool upper) -> Tensor
- use_c10_dispatcher: full
variants: function
dispatch:
CPU: _cholesky_helper_cpu
@@ -6746,20 +5873,17 @@
DefaultBackend: cholesky_solve_out
- func: cholesky_solve(Tensor self, Tensor input2, bool upper=False) -> Tensor
- use_c10_dispatcher: full
variants: method, function
dispatch:
DefaultBackend: cholesky_solve
- func: _cholesky_solve_helper(Tensor self, Tensor A, bool upper) -> Tensor
- use_c10_dispatcher: full
variants: function
dispatch:
CPU: _cholesky_solve_helper_cpu
CUDA: _cholesky_solve_helper_cuda
- func: solve(Tensor self, Tensor A) -> (Tensor solution, Tensor LU)
- use_c10_dispatcher: full
variants: function, method
dispatch:
DefaultBackend: solve
@@ -6770,7 +5894,6 @@
DefaultBackend: solve_out
- func: _solve_helper(Tensor self, Tensor A) -> (Tensor, Tensor)
- use_c10_dispatcher: full
variants: function
dispatch:
CPU: _solve_helper_cpu
@@ -6783,7 +5906,6 @@
CUDA: legacy::cuda::_th_potri_out
- func: cholesky_inverse(Tensor self, bool upper=False) -> Tensor
- use_c10_dispatcher: full
variants: method, function
dispatch:
CPU: legacy::cpu::_th_potri
@@ -6792,20 +5914,12 @@
- func: qr.Q(Tensor self, bool some=True, *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R)
use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
dispatch:
- DefaultBackend: qr_out
+ Math: qr_out
- func: qr(Tensor self, bool some=True) -> (Tensor Q, Tensor R)
- use_c10_dispatcher: full
variants: method, function
dispatch:
- DefaultBackend: qr
-
-- func: _qr_helper(Tensor self, bool some) -> (Tensor, Tensor)
- use_c10_dispatcher: full
- variants: function
- dispatch:
- CPU: _qr_helper_cpu
- CUDA: _qr_helper_cuda
+ Math: qr
- func: geqrf.a(Tensor self, *, Tensor(a!) a, Tensor(b!) tau) -> (Tensor(a!) a, Tensor(b!) tau)
use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
@@ -6814,7 +5928,6 @@
CUDA: legacy::cuda::_th_geqrf_out
- func: geqrf(Tensor self) -> (Tensor a, Tensor tau)
- use_c10_dispatcher: full
variants: method, function
dispatch:
CPU: legacy::cpu::_th_geqrf
@@ -6826,7 +5939,6 @@
CPU: legacy::cpu::_th_orgqr_out
- func: orgqr(Tensor self, Tensor input2) -> Tensor
- use_c10_dispatcher: full
variants: method, function
dispatch:
CPU: legacy::cpu::_th_orgqr
@@ -6837,13 +5949,11 @@
CPU: legacy::cpu::_th_ormqr_out
- func: ormqr(Tensor self, Tensor input2, Tensor input3, bool left=True, bool transpose=False) -> Tensor
- use_c10_dispatcher: full
variants: method, function
dispatch:
CPU: legacy::cpu::_th_ormqr
- func: _lu_with_info(Tensor self, bool pivot=True, bool check_errors=True) -> (Tensor, Tensor, Tensor)
- use_c10_dispatcher: full
variants: function
dispatch:
CPU: _lu_with_info_cpu
@@ -6855,13 +5965,11 @@
DefaultBackend: lu_solve_out
- func: lu_solve(Tensor self, Tensor LU_data, Tensor LU_pivots) -> Tensor
- use_c10_dispatcher: full
variants: method, function
dispatch:
DefaultBackend: lu_solve
- func: _lu_solve_helper(Tensor self, Tensor LU_data, Tensor LU_pivots) -> Tensor
- use_c10_dispatcher: full
variants: function
dispatch:
CPU: _lu_solve_helper_cpu
@@ -6874,20 +5982,17 @@
CPU, CUDA: multinomial_out
- func: multinomial(Tensor self, int num_samples, bool replacement=False, *, Generator?
generator=None) -> Tensor
- use_c10_dispatcher: full
variants: method, function
dispatch:
CPU, CUDA: multinomial
- func: _multinomial_alias_setup(Tensor probs) -> (Tensor, Tensor)
- use_c10_dispatcher: full
variants: function
dispatch:
CPU: legacy::cpu::_th_multinomial_alias_setup
CUDA: legacy::cuda::_th_multinomial_alias_setup
- func: _multinomial_alias_draw(Tensor J, Tensor q, int num_samples, *, Generator? generator=None) -> Tensor
- use_c10_dispatcher: full
variants: function
dispatch:
CPU: legacy::cpu::_th_multinomial_alias_draw
@@ -6900,7 +6005,6 @@
CUDA: _lgamma_out_cuda
- func: lgamma(Tensor self) -> Tensor
- use_c10_dispatcher: full
variants: method, function
dispatch:
CPU, CUDA: lgamma
@@ -6911,7 +6015,6 @@
CPU, CUDA: digamma_out
- func: digamma(Tensor self) -> Tensor
- use_c10_dispatcher: full
variants: method, function
dispatch:
CPU, CUDA: digamma
@@ -6922,19 +6025,16 @@
CPU, CUDA: polygamma_out
- func: polygamma(int n, Tensor self) -> Tensor
- use_c10_dispatcher: full
variants: method, function
dispatch:
DefaultBackend: polygamma
- func: erfinv(Tensor self) -> Tensor
- use_c10_dispatcher: full
variants: method, function
dispatch:
CPU, CUDA: erfinv
- func: erfinv_(Tensor(a!) self) -> Tensor(a!)
- use_c10_dispatcher: full
variants: method
dispatch:
CPU, CUDA: erfinv_
@@ -6945,13 +6045,11 @@
CPU, CUDA: erfinv_out
- func: i0(Tensor self) -> Tensor
- use_c10_dispatcher: full
variants: function, method
dispatch:
DefaultBackend: i0
- func: i0_(Tensor(a!) self) -> Tensor(a!)
- use_c10_dispatcher: full
variants: function, method
dispatch:
DefaultBackend: i0_
@@ -6962,13 +6060,11 @@
CPU, CUDA: i0_out
- func: sign(Tensor self) -> Tensor
- use_c10_dispatcher: full
variants: function, method
dispatch:
DefaultBackend: sign
- func: sign_(Tensor(a!) self) -> Tensor(a!)
- use_c10_dispatcher: full
variants: method
dispatch:
DefaultBackend: sign_
@@ -6979,7 +6075,6 @@
CPU, CUDA: sign_out
- func: signbit(Tensor self) -> Tensor
- use_c10_dispatcher: full
variants: function, method
- func: signbit.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
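Unlike the neighboring hunks, the qr hunk above (@@ -6792,20 +5914,12 @@) changes dispatch as well: DefaultBackend: qr_out and DefaultBackend: qr become Math: qr_out and Math: qr, and the _qr_helper entry with its backend-specific _qr_helper_cpu / _qr_helper_cuda kernels is removed. A sketch of the two qr entries as they read after that hunk, assuming no fields beyond those visible in the diff are touched:

    - func: qr.Q(Tensor self, bool some=True, *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R)
      use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
      dispatch:
        Math: qr_out

    - func: qr(Tensor self, bool some=True) -> (Tensor Q, Tensor R)
      variants: method, function
      dispatch:
        Math: qr

Presumably the point of the Math registration is that a single composite kernel now serves every backend, which is what allows the per-backend helper to be dropped.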
@@ -6989,7 +6084,6 @@ CUDA: signbit_out - func: dist(Tensor self, Tensor other, Scalar p=2) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: dist @@ -7000,7 +6094,6 @@ CPU, CUDA: atan2_out - func: atan2(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: atan2 @@ -7018,14 +6111,12 @@ CUDA: lerp_cuda_tensor_out - func: lerp.Scalar(Tensor self, Tensor end, Scalar weight) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: lerp_cpu_scalar CUDA: lerp_cuda_scalar - func: lerp.Tensor(Tensor self, Tensor end, Tensor weight) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: lerp_cpu_tensor @@ -7038,7 +6129,6 @@ CUDA: _histc_out_cuda - func: histc(Tensor self, int bins=100, Scalar min=0, Scalar max=0) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_histc @@ -7050,7 +6140,6 @@ CPU, CUDA: fmod_out - func: fmod.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: fmod @@ -7061,7 +6150,6 @@ CPU, CUDA: fmod_out - func: fmod.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: fmod @@ -7072,7 +6160,6 @@ CPU, CUDA: hypot_out - func: hypot(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: hypot @@ -7089,7 +6176,6 @@ CPU, CUDA: igamma_out - func: igamma(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: igamma @@ -7106,13 +6192,11 @@ CPU, CUDA: igammac_out - func: igammac(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: igammac - func: igammac_(Tensor(a!) self, Tensor other) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: igammac_ @@ -7123,7 +6207,6 @@ CPU, CUDA: nextafter_out - func: nextafter(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: nextafter @@ -7140,7 +6223,6 @@ CPU, CUDA: remainder_out - func: remainder.Scalar(Tensor self, Scalar other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: remainder @@ -7151,27 +6233,23 @@ CPU, CUDA: remainder_out - func: remainder.Tensor(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: remainder - func: min(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: min QuantizedCPU: min_quantized_cpu - func: max(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: max QuantizedCPU: max_quantized_cpu - func: maximum(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: maximum @@ -7184,14 +6262,12 @@ # binary max, alias of maximum # NOTE: max is not an alias for maximum, since there is also unary max - func: max.other(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: max.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: minimum(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: minimum @@ -7207,35 +6283,30 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: min.other(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: method, function - func: quantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: quantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: quantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: quantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: nanquantile.scalar_out(Tensor self, float q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: nanquantile.scalar(Tensor self, float q, int? dim=None, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: nanquantile.out(Tensor self, Tensor q, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: nanquantile(Tensor self, Tensor q, int? dim=None, bool keepdim=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: sort.values(Tensor self, int dim=-1, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) @@ -7245,7 +6316,6 @@ CUDA: legacy::cuda::_th_sort_out - func: sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: method, function dispatch: CPU: sort_cpu @@ -7256,7 +6326,6 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: sort.dimname(Tensor self, Dimname dim, bool descending=False) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: method, function - func: msort.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) @@ -7265,17 +6334,14 @@ Math: msort_out - func: msort(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: Math: msort - func: argsort(Tensor self, int dim=-1, bool descending=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: argsort.dimname(Tensor self, Dimname dim, bool descending=False) -> Tensor - use_c10_dispatcher: full variants: method, function - func: topk.values(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) 
indices) @@ -7285,20 +6351,17 @@ CUDA: legacy::cuda::_th_topk_out - func: topk(Tensor self, int k, int dim=-1, bool largest=True, bool sorted=True) -> (Tensor values, Tensor indices) - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: topk QuantizedCPU: topk_quantized_cpu - func: all(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: all - func: any(Tensor self) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: any @@ -7311,14 +6374,12 @@ CUDA: legacy::cuda::_th_renorm_out - func: renorm(Tensor self, Scalar p, int dim, Scalar maxnorm) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU: legacy::cpu::_th_renorm CUDA: legacy::cuda::_th_renorm - func: unfold(Tensor(a) self, int dimension, int size, int step) -> Tensor(a) - use_c10_dispatcher: full variants: method device_guard: False dispatch: @@ -7326,13 +6387,11 @@ QuantizedCPU, QuantizedCUDA: unfold - func: unfold_backward(Tensor grad_in, int[] input_sizes, int dim, int size, int step) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CPU, CUDA: unfold_backward - func: equal(Tensor self, Tensor other) -> bool - use_c10_dispatcher: full variants: method, function dispatch: CPU: cpu_equal @@ -7345,7 +6404,6 @@ CPU, CUDA: pow_out - func: pow.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor - use_c10_dispatcher: full variants: method, function dispatch: CPU, CUDA: pow @@ -7356,7 +6414,6 @@ CPU, CUDA: pow_out - func: pow.Scalar(Scalar self, Tensor exponent) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: pow @@ -7367,7 +6424,6 @@ SparseCPU, SparseCUDA: pow_out_sparse_scalar - func: pow.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: CPU, CUDA: pow @@ -7379,7 +6435,6 @@ Math: float_power_out - func: float_power.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: Math: float_power @@ -7390,7 +6445,6 @@ Math: float_power_out - func: float_power.Scalar(Scalar self, Tensor exponent) -> Tensor - use_c10_dispatcher: full dispatch: Math: float_power @@ -7400,25 +6454,21 @@ Math: float_power_out - func: float_power.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: Math: float_power - func: float_power_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: Math: float_power_ - func: float_power_.Tensor(Tensor(a!) self, Tensor exponent) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: Math: float_power_ - func: normal_(Tensor(a!) self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor(a!) - use_c10_dispatcher: full variants: method dispatch: CPU, CUDA: normal_ @@ -7429,7 +6479,6 @@ CPU, CUDA: normal_out - func: normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: normal @@ -7439,7 +6488,6 @@ CPU, CUDA: normal_out - func: normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: normal @@ -7449,7 +6497,6 @@ CPU, CUDA: normal_out - func: normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? 
generator=None) -> Tensor - use_c10_dispatcher: full dispatch: CPU, CUDA: normal @@ -7460,19 +6507,16 @@ use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: alias(Tensor(a) self) -> Tensor(a) - use_c10_dispatcher: full variants: method, function dispatch: DefaultBackend: alias - func: _index_copy_(Tensor(a!) self, int dim, Tensor index, Tensor source) -> Tensor(a!) - use_c10_dispatcher: full dispatch: CPU: legacy::cpu::_th_index_copy_ CUDA: legacy::cuda::_th_index_copy_ - func: _cumsum(Tensor self, int dim) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _cumsum_cpu CUDA: _cumsum_cuda @@ -7484,7 +6528,6 @@ CUDA: _cumsum_out_cuda - func: _cumprod(Tensor self, int dim) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _cumprod_cpu CUDA: _cumprod_cuda @@ -7496,29 +6539,24 @@ CUDA: _cumprod_out_cuda - func: _var(Tensor self, bool unbiased=True) -> Tensor - use_c10_dispatcher: full dispatch: CPU: legacy::cpu::_th_var - func: _std(Tensor self, bool unbiased=True) -> Tensor - use_c10_dispatcher: full dispatch: CPU: legacy::cpu::_th_std - func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> () - use_c10_dispatcher: full variants: function dispatch: CUDA: _amp_foreach_non_finite_check_and_unscale_cuda_ - func: _amp_update_scale(Tensor(a!) growth_tracker, Tensor current_scale, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor - use_c10_dispatcher: full variants: function dispatch: CUDA: _amp_update_scale_cuda - func: _cat(Tensor[] tensors, int dim=0) -> Tensor - use_c10_dispatcher: full dispatch: CPU: _cat_cpu CUDA: cat_cuda @@ -7532,644 +6570,552 @@ QuantizedCPU: cat_out_quantized_cpu - func: _foreach_add.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_add_scalar_kernel_slow CUDA: foreach_tensor_add_scalar_kernel_cuda - func: _foreach_add_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_add_scalar_kernel_slow_ CUDA: foreach_tensor_add_scalar_kernel_cuda_ - func: _foreach_sub.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sub_scalar_kernel_slow CUDA: foreach_tensor_sub_scalar_kernel_cuda - func: _foreach_sub_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sub_scalar_kernel_slow_ CUDA: foreach_tensor_sub_scalar_kernel_cuda_ - func: _foreach_mul.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_mul_scalar_kernel_slow CUDA: foreach_tensor_mul_scalar_kernel_cuda - func: _foreach_mul_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_mul_scalar_kernel_slow_ CUDA: foreach_tensor_mul_scalar_kernel_cuda_ - func: _foreach_div.Scalar(Tensor[] tensors, Scalar scalar) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_div_scalar_kernel_slow CUDA: foreach_tensor_div_scalar_kernel_cuda - func: _foreach_div_.Scalar(Tensor(a!)[] self, Scalar scalar) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_div_scalar_kernel_slow_ CUDA: foreach_tensor_div_scalar_kernel_cuda_ - func: _foreach_add.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] - 
use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_add_list_kernel_slow CUDA: foreach_tensor_add_list_kernel_cuda - func: _foreach_add_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_add_list_kernel_slow_ CUDA: foreach_tensor_add_list_kernel_cuda_ - func: _foreach_sub.List(Tensor[] tensors1, Tensor[] tensors2, *, Scalar alpha=1) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sub_list_kernel_slow CUDA: foreach_tensor_sub_list_kernel_cuda - func: _foreach_sub_.List(Tensor(a!)[] self, Tensor[] other, *, Scalar alpha=1) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sub_list_kernel_slow_ CUDA: foreach_tensor_sub_list_kernel_cuda_ - func: _foreach_mul.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_mul_list_kernel_slow CUDA: foreach_tensor_mul_list_kernel_cuda - func: _foreach_mul_.List(Tensor(a!)[] self, Tensor[] other) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_mul_list_kernel_slow_ CUDA: foreach_tensor_mul_list_kernel_cuda_ - func: _foreach_div.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_div_list_kernel_slow CUDA: foreach_tensor_div_list_kernel_cuda - func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_div_list_kernel_slow_ CUDA: foreach_tensor_div_list_kernel_cuda_ - func: _foreach_add.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_add_scalarlist_kernel_slow CUDA: foreach_tensor_add_scalarlist_kernel_cuda - func: _foreach_add_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_add_scalarlist_kernel_slow_ CUDA: foreach_tensor_add_scalarlist_kernel_cuda_ - func: _foreach_sub.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sub_scalarlist_kernel_slow CUDA: foreach_tensor_sub_scalarlist_kernel_cuda - func: _foreach_sub_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sub_scalarlist_kernel_slow_ CUDA: foreach_tensor_sub_scalarlist_kernel_cuda_ - func: _foreach_div.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_div_scalarlist_kernel_slow CUDA: foreach_tensor_div_scalarlist_kernel_cuda - func: _foreach_div_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_div_scalarlist_kernel_slow_ CUDA: foreach_tensor_div_scalarlist_kernel_cuda_ - func: _foreach_mul.ScalarList(Tensor[] tensors, float[] scalars) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_mul_scalarlist_kernel_slow CUDA: foreach_tensor_mul_scalarlist_kernel_cuda - func: _foreach_mul_.ScalarList(Tensor(a!)[] self, float[] scalars) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_mul_scalarlist_kernel_slow_ CUDA: foreach_tensor_mul_scalarlist_kernel_cuda_ - func: _foreach_exp(Tensor[] tensors) -> 
Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_exp_slow CUDA: foreach_tensor_exp_cuda - func: _foreach_zero_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_zero_slow_ CUDA: foreach_tensor_zero_cuda_ - func: _foreach_exp_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_exp_slow_ CUDA: foreach_tensor_exp_cuda_ - func: _foreach_sqrt(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sqrt_slow CUDA: foreach_tensor_sqrt_cuda - func: _foreach_sqrt_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sqrt_slow_ CUDA: foreach_tensor_sqrt_cuda_ - func: _foreach_abs(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_abs_slow CUDA: foreach_tensor_abs_cuda - func: _foreach_abs_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_abs_slow_ CUDA: foreach_tensor_abs_cuda_ - func: _foreach_acos(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_acos_slow CUDA: foreach_tensor_acos_cuda - func: _foreach_acos_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_acos_slow_ CUDA: foreach_tensor_acos_cuda_ - func: _foreach_asin(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_asin_slow CUDA: foreach_tensor_asin_cuda - func: _foreach_asin_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_asin_slow_ CUDA: foreach_tensor_asin_cuda_ - func: _foreach_atan(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_atan_slow CUDA: foreach_tensor_atan_cuda - func: _foreach_atan_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_atan_slow_ CUDA: foreach_tensor_atan_cuda_ - func: _foreach_ceil(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_ceil_slow CUDA: foreach_tensor_ceil_cuda - func: _foreach_ceil_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_ceil_slow_ CUDA: foreach_tensor_ceil_cuda_ - func: _foreach_cos(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_cos_slow CUDA: foreach_tensor_cos_cuda - func: _foreach_cos_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_cos_slow_ CUDA: foreach_tensor_cos_cuda_ - func: _foreach_cosh(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_cosh_slow CUDA: foreach_tensor_cosh_cuda - func: _foreach_cosh_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_cosh_slow_ CUDA: foreach_tensor_cosh_cuda_ - func: _foreach_erf(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_erf_slow CUDA: foreach_tensor_erf_cuda - func: _foreach_erf_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_erf_slow_ CUDA: foreach_tensor_erf_cuda_ - func: _foreach_erfc(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: 
function dispatch: CPU: foreach_tensor_erfc_slow CUDA: foreach_tensor_erfc_cuda - func: _foreach_erfc_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_erfc_slow_ CUDA: foreach_tensor_erfc_cuda_ - func: _foreach_expm1(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_expm1_slow CUDA: foreach_tensor_expm1_cuda - func: _foreach_expm1_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_expm1_slow_ CUDA: foreach_tensor_expm1_cuda_ - func: _foreach_floor(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_floor_slow CUDA: foreach_tensor_floor_cuda - func: _foreach_floor_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_floor_slow_ CUDA: foreach_tensor_floor_cuda_ - func: _foreach_log(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log_slow CUDA: foreach_tensor_log_cuda - func: _foreach_log_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log_slow_ CUDA: foreach_tensor_log_cuda_ - func: _foreach_log10(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log10_slow CUDA: foreach_tensor_log10_cuda - func: _foreach_log10_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log10_slow_ CUDA: foreach_tensor_log10_cuda_ - func: _foreach_log1p(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log1p_slow CUDA: foreach_tensor_log1p_cuda - func: _foreach_log1p_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log1p_slow_ CUDA: foreach_tensor_log1p_cuda_ - func: _foreach_log2(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log2_slow CUDA: foreach_tensor_log2_cuda - func: _foreach_log2_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_log2_slow_ CUDA: foreach_tensor_log2_cuda_ - func: _foreach_neg(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_neg_slow CUDA: foreach_tensor_neg_cuda - func: _foreach_neg_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_neg_slow_ CUDA: foreach_tensor_neg_cuda_ - func: _foreach_tan(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_tan_slow CUDA: foreach_tensor_tan_cuda - func: _foreach_tan_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_tan_slow_ CUDA: foreach_tensor_tan_cuda_ - func: _foreach_tanh(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_tanh_slow CUDA: foreach_tensor_tanh_cuda - func: _foreach_tanh_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_tanh_slow_ CUDA: foreach_tensor_tanh_cuda_ - func: _foreach_sin(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sin_slow CUDA: foreach_tensor_sin_cuda - func: _foreach_sin_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: 
foreach_tensor_sin_slow_ CUDA: foreach_tensor_sin_cuda_ - func: _foreach_sinh(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sinh_slow CUDA: foreach_tensor_sinh_cuda - func: _foreach_sinh_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sinh_slow_ CUDA: foreach_tensor_sinh_cuda_ - func: _foreach_round(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_round_slow CUDA: foreach_tensor_round_cuda - func: _foreach_round_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_round_slow_ CUDA: foreach_tensor_round_cuda_ - func: _foreach_lgamma(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_lgamma_slow CUDA: foreach_tensor_lgamma_cuda - func: _foreach_lgamma_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_lgamma_slow_ CUDA: foreach_tensor_lgamma_cuda_ - func: _foreach_frac(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_frac_slow CUDA: foreach_tensor_frac_cuda - func: _foreach_frac_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_frac_slow_ CUDA: foreach_tensor_frac_cuda_ - func: _foreach_reciprocal(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_reciprocal_slow CUDA: foreach_tensor_reciprocal_cuda - func: _foreach_reciprocal_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_reciprocal_slow_ CUDA: foreach_tensor_reciprocal_cuda_ - func: _foreach_sigmoid(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sigmoid_slow CUDA: foreach_tensor_sigmoid_cuda - func: _foreach_sigmoid_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_sigmoid_slow_ CUDA: foreach_tensor_sigmoid_cuda_ - func: _foreach_trunc(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_trunc_slow CUDA: foreach_tensor_trunc_cuda - func: _foreach_trunc_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_trunc_slow_ CUDA: foreach_tensor_trunc_cuda_ - func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcdiv_scalar_slow_ CUDA: foreach_tensor_addcdiv_scalar_cuda_ - func: _foreach_addcmul_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcmul_scalar_slow_ CUDA: foreach_tensor_addcmul_scalar_cuda_ - func: _foreach_addcdiv_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, float[] scalars) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcdiv_scalarlist_slow_ CUDA: foreach_tensor_addcdiv_scalarlist_cuda_ - func: _foreach_addcmul_.ScalarList(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, float[] scalars) -> () - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcmul_scalarlist_slow_ CUDA: foreach_tensor_addcmul_scalarlist_cuda_ - func: _foreach_addcdiv.Scalar(Tensor[] 
input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcdiv_scalar_slow CUDA: foreach_tensor_addcdiv_scalar_cuda - func: _foreach_addcmul.Scalar(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcmul_scalar_slow CUDA: foreach_tensor_addcmul_scalar_cuda - func: _foreach_addcdiv.ScalarList(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, float[] scalars) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcdiv_scalarlist_slow CUDA: foreach_tensor_addcdiv_scalarlist_cuda - func: _foreach_addcmul.ScalarList(Tensor[] input, Tensor[] tensor1, Tensor[] tensor2, float[] scalars) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_addcmul_scalarlist_slow CUDA: foreach_tensor_addcmul_scalarlist_cuda - func: _foreach_maximum.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_maximum_slow CUDA: foreach_tensor_maximum_cuda - func: _foreach_minimum.List(Tensor[] tensors1, Tensor[] tensors2) -> Tensor[] - use_c10_dispatcher: full variants: function dispatch: CPU: foreach_tensor_minimum_slow CUDA: foreach_tensor_minimum_cuda - func: _mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor, Tensor) - use_c10_dispatcher: full dispatch: CPU: legacy::cpu::_th_mode CUDA: legacy::cuda::_th_mode @@ -8181,7 +7127,6 @@ CUDA: legacy::cuda::_th_mode_out - func: bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor - use_c10_dispatcher: full dispatch: CPU: bucketize_cpu CUDA: bucketize_cuda @@ -8193,13 +7138,11 @@ CUDA: bucketize_out_cuda - func: bucketize.Scalar(Scalar self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor - use_c10_dispatcher: full dispatch: CPU: bucketize_cpu CUDA: bucketize_cuda - func: searchsorted.Tensor(Tensor sorted_sequence, Tensor self, *, bool out_int32=False, bool right=False) -> Tensor - use_c10_dispatcher: full dispatch: CPU: searchsorted_cpu CUDA: searchsorted_cuda @@ -8211,7 +7154,6 @@ CUDA: searchsorted_out_cuda - func: searchsorted.Scalar(Tensor sorted_sequence, Scalar self, *, bool out_int32=False, bool right=False) -> Tensor - use_c10_dispatcher: full dispatch: CPU: searchsorted_cpu CUDA: searchsorted_cuda @@ -8225,7 +7167,6 @@ CPU, CUDA: mse_loss_out - func: mse_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: mse_loss @@ -8237,7 +7178,6 @@ CPU, CUDA: mse_loss_backward_out - func: mse_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: mse_loss_backward @@ -8249,7 +7189,6 @@ DefaultBackend: l1_loss_out - func: l1_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: l1_loss @@ -8261,7 +7200,6 @@ CPU, CUDA: l1_loss_backward_out - func: l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: l1_loss_backward @@ -8299,7 +7237,6 @@ python_module: nn - func: multilabel_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor - use_c10_dispatcher: full python_module: nn - func: 
multilabel_margin_loss_forward.output(Tensor self, Tensor target, int reduction, *, Tensor(a!) output, Tensor(b!) is_target) -> (Tensor(a!), Tensor(b!)) @@ -8310,7 +7247,6 @@ CUDA: legacy::cuda::_thnn_multilabel_margin_loss_forward_out - func: multilabel_margin_loss_forward(Tensor self, Tensor target, int reduction) -> (Tensor output, Tensor is_target) - use_c10_dispatcher: full python_module: nn dispatch: CPU: multilabel_margin_loss_forward_cpu @@ -8324,7 +7260,6 @@ CUDA: legacy::cuda::_thnn_multilabel_margin_loss_backward_out - func: multilabel_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: multilabel_margin_loss_backward_cpu @@ -8410,7 +7345,6 @@ CUDA: smooth_l1_loss_out - func: smooth_l1_loss(Tensor self, Tensor target, int reduction=Mean, float beta=1.0) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: smooth_l1_loss @@ -8423,7 +7357,6 @@ CUDA: smooth_l1_loss_backward_out - func: smooth_l1_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, float beta) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: smooth_l1_loss_backward @@ -8435,7 +7368,6 @@ DefaultBackend: soft_margin_loss_out - func: soft_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: soft_margin_loss @@ -8447,7 +7379,6 @@ DefaultBackend: soft_margin_loss_backward_out - func: soft_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: soft_margin_loss_backward @@ -8459,7 +7390,6 @@ CPU, CUDA: elu_out - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: elu @@ -8471,13 +7401,11 @@ CPU, CUDA: elu_backward_out - func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, Tensor output) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: elu_backward - func: elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!) - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: elu_ @@ -8490,7 +7418,6 @@ CUDA: legacy::cuda::_thnn_glu_forward_out - func: glu(Tensor self, int dim=-1) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: glu @@ -8504,7 +7431,6 @@ CUDA: legacy::cuda::_thnn_glu_backward_out - func: glu_backward(Tensor grad_output, Tensor self, int dim) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: glu_backward @@ -8517,20 +7443,17 @@ CPU, CUDA: hardsigmoid_out - func: hardsigmoid(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardsigmoid QuantizedCPU: hardsigmoid_quantized_cpu - func: hardsigmoid_(Tensor(a!) self) -> Tensor(a!) 
- use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardsigmoid_ - func: hardsigmoid_backward(Tensor grad_output, Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardsigmoid_backward @@ -8543,7 +7466,6 @@ QuantizedCPU: hardtanh_out_quantized_cpu - func: hardtanh(Tensor self, Scalar min_val=-1, Scalar max_val=1) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardtanh @@ -8556,13 +7478,11 @@ CPU, CUDA: hardtanh_backward_out - func: hardtanh_backward(Tensor grad_output, Tensor self, Scalar min_val, Scalar max_val) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardtanh_backward - func: hardtanh_(Tensor(a!) self, Scalar min_val=-1, Scalar max_val=1) -> Tensor(a!) - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardtanh_ @@ -8575,19 +7495,16 @@ CPU, CUDA: hardswish_out - func: hardswish(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardswish - func: hardswish_(Tensor(a!) self) -> Tensor(a!) - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardswish_ - func: hardswish_backward(Tensor grad_output, Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: hardswish_backward @@ -8600,20 +7517,17 @@ QuantizedCPU: leaky_relu_out_quantized_cpu - func: leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: leaky_relu QuantizedCPU: leaky_relu_quantized_cpu - func: leaky_relu_backward(Tensor grad_output, Tensor self, Scalar negative_slope, bool self_is_result) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: leaky_relu_backward - func: leaky_relu_(Tensor(a!) self, Scalar negative_slope=0.01) -> Tensor(a!) - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: leaky_relu_ @@ -8624,7 +7538,6 @@ python_module: nn - func: log_sigmoid(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn - func: log_sigmoid_forward.output(Tensor self, *, Tensor(a!) output, Tensor(b!) buffer) -> (Tensor(a!), Tensor(b!)) @@ -8635,7 +7548,6 @@ CUDA: legacy::cuda::_thnn_log_sigmoid_forward_out - func: log_sigmoid_forward(Tensor self) -> (Tensor output, Tensor buffer) - use_c10_dispatcher: full python_module: nn dispatch: CPU: log_sigmoid_forward_cpu @@ -8649,7 +7561,6 @@ CUDA: legacy::cuda::_thnn_log_sigmoid_backward_out - func: log_sigmoid_backward(Tensor grad_output, Tensor self, Tensor buffer) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: log_sigmoid_backward_cpu @@ -8663,20 +7574,17 @@ CUDA: legacy::cuda::_thnn_rrelu_with_noise_forward_out - func: rrelu_with_noise(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: rrelu_with_noise_cpu CUDA: legacy::cuda::_thnn_rrelu_with_noise_forward - func: rrelu_with_noise_backward(Tensor grad_output, Tensor self, Tensor noise, Scalar lower, Scalar upper, bool training, bool self_is_result) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: rrelu_with_noise_backward - func: rrelu_with_noise_(Tensor(a!) self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!) 
- use_c10_dispatcher: full python_module: nn dispatch: CPU: rrelu_with_noise_cpu_ @@ -8689,7 +7597,6 @@ CPU, CUDA: softplus_out - func: softplus(Tensor self, Scalar beta=1, Scalar threshold=20) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: softplus @@ -8701,7 +7608,6 @@ CPU, CUDA: softplus_backward_out - func: softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, Tensor output) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: softplus_backward @@ -8713,7 +7619,6 @@ CPU, CUDA: softshrink_out - func: softshrink(Tensor self, Scalar lambd=0.5) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: softshrink @@ -8725,7 +7630,6 @@ CPU, CUDA: softshrink_backward_out - func: softshrink_backward(Tensor grad_output, Tensor self, Scalar lambd) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: softshrink_backward @@ -8738,23 +7642,19 @@ MkldnnCPU: mkldnn_adaptive_avg_pool2d_out - func: adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor - use_c10_dispatcher: full python_module: nn - func: mkldnn_adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor - use_c10_dispatcher: full dispatch: MkldnnCPU: mkldnn_adaptive_avg_pool2d - func: _adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor - use_c10_dispatcher: full dispatch: CPU: adaptive_avg_pool2d_cpu CUDA: adaptive_avg_pool2d_cuda QuantizedCPU: adaptive_avg_pool2d_quantized_cpu - func: _adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_avg_pool2d_backward_cpu @@ -8769,7 +7669,6 @@ QuantizedCPU: adaptive_avg_pool3d_out_quantized_cpu - func: adaptive_avg_pool3d(Tensor self, int[3] output_size) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_avg_pool3d_cpu @@ -8784,7 +7683,6 @@ CUDA: adaptive_avg_pool3d_backward_out_cuda - func: adaptive_avg_pool3d_backward(Tensor grad_output, Tensor self) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_avg_pool3d_backward_cpu @@ -8800,7 +7698,6 @@ # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool2d(Tensor self, int[2] output_size) -> (Tensor, Tensor) - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_max_pool2d_cpu @@ -8814,7 +7711,6 @@ CUDA: adaptive_max_pool2d_backward_out_cuda - func: adaptive_max_pool2d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_max_pool2d_backward_cpu @@ -8830,7 +7726,6 @@ # Return: (Tensor output, Tensor indices) - func: adaptive_max_pool3d(Tensor self, int[3] output_size) -> (Tensor, Tensor) - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_max_pool3d_cpu @@ -8844,7 +7739,6 @@ CUDA: adaptive_max_pool3d_backward_out_cuda - func: adaptive_max_pool3d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: adaptive_max_pool3d_backward_cpu @@ -8859,7 +7753,6 @@ MkldnnCPU: mkldnn_avg_pool2d_out - func: avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? 
divisor_override=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: avg_pool2d_cpu @@ -8875,7 +7768,6 @@ CUDA: avg_pool2d_backward_out_cuda - func: avg_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: avg_pool2d_backward_cpu @@ -8890,7 +7782,6 @@ MkldnnCPU: mkldnn_avg_pool3d_out - func: avg_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: avg_pool3d_cpu @@ -8906,7 +7797,6 @@ CUDA: avg_pool3d_backward_out_cuda - func: avg_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, bool ceil_mode, bool count_include_pad, int? divisor_override) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: avg_pool3d_backward_cpu @@ -8922,7 +7812,6 @@ # Return: (Tensor output, Tensor indices) - func: fractional_max_pool2d(Tensor self, int[2] kernel_size, int[2] output_size, Tensor random_samples) -> (Tensor, Tensor) - use_c10_dispatcher: full python_module: nn dispatch: CPU: fractional_max_pool2d_cpu @@ -8936,7 +7825,6 @@ CUDA: fractional_max_pool2d_backward_out_cuda - func: fractional_max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] output_size, Tensor indices) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: fractional_max_pool2d_backward_cpu @@ -8952,7 +7840,6 @@ # Return: (Tensor output, Tensor indices) - func: fractional_max_pool3d(Tensor self, int[3] kernel_size, int[3] output_size, Tensor random_samples) -> (Tensor, Tensor) - use_c10_dispatcher: full python_module: nn dispatch: CPU: fractional_max_pool3d_cpu @@ -8966,7 +7853,6 @@ CUDA: fractional_max_pool3d_backward_out_cuda - func: fractional_max_pool3d_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] output_size, Tensor indices) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: fractional_max_pool3d_backward_cpu @@ -8982,7 +7868,6 @@ # Return: (Tensor output, Tensor indices) - func: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_pool2d_with_indices_cpu @@ -8996,7 +7881,6 @@ CUDA: max_pool2d_with_indices_backward_out_cuda - func: max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_pool2d_with_indices_backward_cpu @@ -9012,7 +7896,6 @@ # Return: (Tensor output, Tensor indices) - func: max_pool3d_with_indices(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_pool3d_with_indices_cpu @@ -9026,7 +7909,6 @@ CUDA: max_pool3d_with_indices_backward_out_cuda - func: max_pool3d_with_indices_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_pool3d_with_indices_backward_cpu @@ 
-9040,7 +7922,6 @@ CUDA: max_unpooling2d_forward_out_cuda - func: max_unpool2d(Tensor self, Tensor indices, int[2] output_size) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_unpooling2d_forward_cpu @@ -9054,7 +7935,6 @@ CUDA: max_unpooling2d_backward_out_cuda - func: max_unpool2d_backward(Tensor grad_output, Tensor self, Tensor indices, int[2] output_size) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_unpooling2d_backward_cpu @@ -9068,7 +7948,6 @@ CUDA: max_unpooling3d_forward_out_cuda - func: max_unpool3d(Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_unpooling3d_forward_cpu @@ -9082,7 +7961,6 @@ CUDA: max_unpooling3d_backward_out_cuda - func: max_unpool3d_backward(Tensor grad_output, Tensor self, Tensor indices, int[3] output_size, int[3] stride, int[3] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: max_unpooling3d_backward_cpu @@ -9096,7 +7974,6 @@ CUDA: reflection_pad1d_out_cuda - func: reflection_pad1d(Tensor self, int[2] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, QuantizedCPU: reflection_pad1d_cpu @@ -9110,7 +7987,6 @@ CUDA: reflection_pad1d_backward_out_cuda - func: reflection_pad1d_backward(Tensor grad_output, Tensor self, int[2] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: reflection_pad1d_backward_cpu @@ -9124,7 +8000,6 @@ CUDA: reflection_pad2d_out_cuda - func: reflection_pad2d(Tensor self, int[4] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, QuantizedCPU: reflection_pad2d_cpu @@ -9138,7 +8013,6 @@ CUDA: reflection_pad2d_backward_out_cuda - func: reflection_pad2d_backward(Tensor grad_output, Tensor self, int[4] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: reflection_pad2d_backward_cpu @@ -9152,7 +8026,6 @@ CUDA: replication_pad1d_out_cuda - func: replication_pad1d(Tensor self, int[2] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: replication_pad1d_cpu @@ -9166,7 +8039,6 @@ CUDA: replication_pad1d_backward_out_cuda - func: replication_pad1d_backward(Tensor grad_output, Tensor self, int[2] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: replication_pad1d_backward_cpu @@ -9180,7 +8052,6 @@ CUDA: replication_pad2d_out_cuda - func: replication_pad2d(Tensor self, int[4] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: replication_pad2d_cpu @@ -9194,7 +8065,6 @@ CUDA: replication_pad2d_backward_out_cuda - func: replication_pad2d_backward(Tensor grad_output, Tensor self, int[4] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: replication_pad2d_backward_cpu @@ -9208,7 +8078,6 @@ CUDA: replication_pad3d_out_cuda - func: replication_pad3d(Tensor self, int[6] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: replication_pad3d_cpu @@ -9222,28 +8091,24 @@ CUDA: replication_pad3d_backward_out_cuda - func: replication_pad3d_backward(Tensor grad_output, Tensor self, int[6] padding) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: replication_pad3d_backward_cpu CUDA: replication_pad3d_backward_cuda - func: upsample_linear1d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? 
scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_linear1d_cpu CUDA: upsample_linear1d_cuda - func: upsample_linear1d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_linear1d_backward_cpu CUDA: upsample_linear1d_backward_cuda - func: upsample_bilinear2d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bilinear2d_cpu @@ -9251,54 +8116,46 @@ QuantizedCPU: upsample_bilinear2d_quantized_cpu - func: upsample_bilinear2d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bilinear2d_backward_cpu CUDA: upsample_bilinear2d_backward_cuda - func: upsample_trilinear3d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_trilinear3d_cpu CUDA: upsample_trilinear3d_cuda - func: upsample_trilinear3d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_trilinear3d_backward_cpu CUDA: upsample_trilinear3d_backward_cuda - func: upsample_bicubic2d.vec(Tensor input, int[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bicubic2d_cpu CUDA: upsample_bicubic2d_cuda - func: upsample_bicubic2d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, bool align_corners, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bicubic2d_backward_cpu CUDA: upsample_bicubic2d_backward_cuda - func: upsample_nearest1d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: upsample_nearest1d - func: upsample_nearest1d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: DefaultBackend: upsample_nearest1d_backward - func: upsample_nearest2d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest2d_cpu @@ -9306,14 +8163,12 @@ QuantizedCPU: upsample_nearest2d_quantized_cpu - func: upsample_nearest2d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest2d_backward_cpu CUDA: upsample_nearest2d_backward_cuda - func: upsample_nearest3d.vec(Tensor input, int[]? output_size, float[]? scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest3d_cpu @@ -9321,7 +8176,6 @@ QuantizedCPU: upsample_nearest3d_quantized_cpu - func: upsample_nearest3d_backward.vec(Tensor grad_output, int[]? output_size, int[] input_size, float[]? 
scale_factors) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest3d_backward_cpu @@ -9336,7 +8190,6 @@ CUDA: upsample_linear1d_out_cuda - func: upsample_linear1d(Tensor self, int[1] output_size, bool align_corners, float? scales=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_linear1d_cpu @@ -9350,7 +8203,6 @@ CUDA: upsample_linear1d_backward_out_cuda - func: upsample_linear1d_backward(Tensor grad_output, int[1] output_size, int[3] input_size, bool align_corners, float? scales=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_linear1d_backward_cpu @@ -9364,7 +8216,6 @@ CUDA: upsample_bilinear2d_out_cuda - func: upsample_bilinear2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bilinear2d_cpu @@ -9379,7 +8230,6 @@ CUDA: upsample_bilinear2d_backward_out_cuda - func: upsample_bilinear2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bilinear2d_backward_cpu @@ -9393,7 +8243,6 @@ CUDA: upsample_bicubic2d_out_cuda - func: upsample_bicubic2d(Tensor self, int[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bicubic2d_cpu @@ -9407,7 +8256,6 @@ CUDA: upsample_bicubic2d_backward_out_cuda - func: upsample_bicubic2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_bicubic2d_backward_cpu @@ -9421,7 +8269,6 @@ CUDA: upsample_trilinear3d_out_cuda - func: upsample_trilinear3d(Tensor self, int[3] output_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_trilinear3d_cpu @@ -9435,14 +8282,13 @@ CUDA: upsample_trilinear3d_backward_out_cuda - func: upsample_trilinear3d_backward(Tensor grad_output, int[3] output_size, int[5] input_size, bool align_corners, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_trilinear3d_backward_cpu CUDA: upsample_trilinear3d_backward_cuda - func: upsample_nearest1d.out(Tensor self, int[1] output_size, float? scales=None, *, Tensor(a!) out) -> Tensor(a!) - use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + use_c10_dispatcher: full python_module: nn structured: True dispatch: @@ -9450,12 +8296,11 @@ CUDA: upsample_nearest1d_out_cuda - func: upsample_nearest1d(Tensor self, int[1] output_size, float? scales=None) -> Tensor - use_c10_dispatcher: full python_module: nn structured_delegate: upsample_nearest1d.out - func: upsample_nearest1d_backward.grad_input(Tensor grad_output, int[1] output_size, int[3] input_size, float? scales=None, *, Tensor(a!) grad_input) -> Tensor(a!) - use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + use_c10_dispatcher: full python_module: nn structured: True dispatch: @@ -9463,7 +8308,6 @@ CUDA: upsample_nearest1d_backward_out_cuda - func: upsample_nearest1d_backward(Tensor grad_output, int[1] output_size, int[3] input_size, float? 
scales=None) -> Tensor - use_c10_dispatcher: full python_module: nn structured_delegate: upsample_nearest1d_backward.grad_input @@ -9475,7 +8319,6 @@ CUDA: upsample_nearest2d_out_cuda - func: upsample_nearest2d(Tensor self, int[2] output_size, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest2d_cpu @@ -9490,7 +8333,6 @@ CUDA: upsample_nearest2d_backward_out_cuda - func: upsample_nearest2d_backward(Tensor grad_output, int[2] output_size, int[4] input_size, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest2d_backward_cpu @@ -9504,7 +8346,6 @@ CUDA: upsample_nearest3d_out_cuda - func: upsample_nearest3d(Tensor self, int[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest3d_cpu @@ -9519,7 +8360,6 @@ CUDA: upsample_nearest3d_backward_out_cuda - func: upsample_nearest3d_backward(Tensor grad_output, int[3] output_size, int[5] input_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: upsample_nearest3d_backward_cpu @@ -9532,7 +8372,6 @@ CPU, CUDA: sigmoid_backward_out - func: sigmoid_backward(Tensor grad_output, Tensor output) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: sigmoid_backward @@ -9544,7 +8383,6 @@ CPU, CUDA: logit_backward_out - func: logit_backward(Tensor grad_output, Tensor self, float? eps=None) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: logit_backward @@ -9556,7 +8394,6 @@ CPU, CUDA: tanh_backward_out - func: tanh_backward(Tensor grad_output, Tensor output) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU, CUDA: tanh_backward @@ -9601,7 +8438,6 @@ CUDA: slow_conv_transpose2d_backward_out_cuda - func: slow_conv_transpose2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] output_padding, int[2] dilation, Tensor columns, Tensor ones, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv_transpose2d_backward_cpu @@ -9629,7 +8465,6 @@ CUDA: slow_conv_transpose3d_backward_out_cuda - func: slow_conv_transpose3d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[3] kernel_size, int[3] stride, int[3] padding, int[3] output_padding, int[3] dilation, Tensor finput, Tensor fgrad_input, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv_transpose3d_backward_cpu @@ -9665,7 +8500,6 @@ CUDA: slow_conv2d_backward_out_cuda - func: thnn_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, Tensor finput, Tensor fgrad_input, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - use_c10_dispatcher: full python_module: nn dispatch: CPU: slow_conv2d_backward_cpu @@ -9698,7 +8532,6 @@ CUDA: thnn_conv_depthwise2d_backward_out - func: thnn_conv_depthwise2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool[2] output_mask) -> (Tensor grad_input, Tensor grad_weight) - 
use_c10_dispatcher: full
   python_module: nn
   dispatch:
     CUDA: thnn_conv_depthwise2d_backward
@@ -9730,7 +8563,6 @@
     CPU: slow_conv3d_backward_out_cpu

 - func: slow_conv3d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[3] kernel_size, int[3] stride, int[3] padding, Tensor finput, Tensor fgrad_input, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)
-  use_c10_dispatcher: full
   python_module: nn
   dispatch:
     CPU: slow_conv3d_backward_cpu
@@ -9743,7 +8575,6 @@
     CUDA: slow_conv_dilated2d_cuda

 - func: slow_conv_dilated2d_backward(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)
-  use_c10_dispatcher: full
   python_module: nn
   dispatch:
     CPU: slow_conv_dilated2d_backward_cpu
@@ -9757,7 +8588,6 @@
     CUDA: slow_conv_dilated3d_cuda

 - func: slow_conv_dilated3d_backward(Tensor grad_output, Tensor self, Tensor weight, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias)
-  use_c10_dispatcher: full
   python_module: nn
   dispatch:
     CPU: slow_conv_dilated3d_backward_cpu
@@ -9771,7 +8601,6 @@
     CUDA: col2im_out_cuda

 - func: col2im(Tensor self, int[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor
-  use_c10_dispatcher: full
   python_module: nn
   dispatch:
     CPU: col2im_cpu
@@ -9785,14 +8614,12 @@
     CUDA: col2im_backward_out_cuda

 - func: col2im_backward(Tensor grad_output, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor
-  use_c10_dispatcher: full
   python_module: nn
   dispatch:
     CPU: col2im_backward_cpu
     CUDA: col2im_backward_cuda

 - func: column_stack(Tensor[] tensors) -> Tensor
-  use_c10_dispatcher: full
   dispatch:
     Math: column_stack

@@ -9809,7 +8636,6 @@
     CUDA: im2col_out_cuda

 - func: im2col(Tensor self, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor
-  use_c10_dispatcher: full
   python_module: nn
   dispatch:
     CPU: im2col_cpu
@@ -9823,30 +8649,25 @@
     CUDA: im2col_backward_out_cuda

 - func: im2col_backward(Tensor grad_output, int[2] input_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor
-  use_c10_dispatcher: full
   python_module: nn
   dispatch:
     CPU: im2col_backward_cpu
     CUDA: im2col_backward_cuda

 - func: isfinite(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   device_guard: False

 - func: isinf(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method
   device_guard: False

 - func: record_stream(Tensor(a!) self, Stream s) -> ()
-  use_c10_dispatcher: full
   variants: method
   dispatch:
     CUDA: record_stream_cuda

 - func: isposinf(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method

 - func: isposinf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -9855,7 +8676,6 @@
     CPU, CUDA: isposinf_out

 - func: isneginf(Tensor self) -> Tensor
-  use_c10_dispatcher: full
   variants: function, method

 - func: isneginf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -9868,12 +8688,10 @@
 # of the vmap frontend API (see torch/_vmap_internals.py). They are not
 # user-facing, hence the leading underscore. Please don't use them anywhere else.
 - func: _add_batch_dim(Tensor self, int batch_dim, int level) -> Tensor
-  use_c10_dispatcher: full
   variants: function

 # See NOTE [_add_batch_dim and _remove_batch_dim]
 - func: _remove_batch_dim(Tensor self, int level, int batch_size, int out_dim) -> Tensor
-  use_c10_dispatcher: full
   variants: function

 ## Functions related to the fast Fourier transform and the torch.fft namespace
@@ -9888,72 +8706,128 @@
 # torch.fft.fft
 # NOTE: NOT an alias for torch.fft, which has different semantics
 - func: fft_fft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor
-  use_c10_dispatcher: full
+  python_module: fft
+  variants: function
+
+- func: fft_fft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
   python_module: fft
   variants: function

 - func: fft_ifft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor
-  use_c10_dispatcher: full
+  python_module: fft
+  variants: function
+
+- func: fft_ifft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
   python_module: fft
   variants: function

 - func: fft_rfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor
-  use_c10_dispatcher: full
+  python_module: fft
+  variants: function
+
+- func: fft_rfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
   python_module: fft
   variants: function

 - func: fft_irfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor
-  use_c10_dispatcher: full
+  python_module: fft
+  variants: function
+
+- func: fft_irfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
   python_module: fft
   variants: function

 - func: fft_hfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor
-  use_c10_dispatcher: full
+  python_module: fft
+  variants: function
+
+- func: fft_hfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
   python_module: fft
   variants: function

 - func: fft_ihfft(Tensor self, int? n=None, int dim=-1, str? norm=None) -> Tensor
-  use_c10_dispatcher: full
+  python_module: fft
+  variants: function
+
+- func: fft_ihfft.out(Tensor self, int? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
   python_module: fft
   variants: function

 - func: fft_fft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
-  use_c10_dispatcher: full
+  python_module: fft
+  variants: function
+
+- func: fft_fft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
   python_module: fft
   variants: function

 - func: fft_ifft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
-  use_c10_dispatcher: full
+  python_module: fft
+  variants: function
+
+- func: fft_ifft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
   python_module: fft
   variants: function

 - func: fft_rfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
-  use_c10_dispatcher: full
+  python_module: fft
+  variants: function
+
+- func: fft_rfft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
   python_module: fft
   variants: function

 - func: fft_irfft2(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor
-  use_c10_dispatcher: full
+  python_module: fft
+  variants: function
+
+- func: fft_irfft2.out(Tensor self, int[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
   python_module: fft
   variants: function

 - func: fft_fftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
-  use_c10_dispatcher: full
+  python_module: fft
+  variants: function
+
+- func: fft_fftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
   python_module: fft
   variants: function

 - func: fft_ifftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
-  use_c10_dispatcher: full
+  python_module: fft
+  variants: function
+
+- func: fft_ifftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
   python_module: fft
   variants: function

 - func: fft_rfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
-  use_c10_dispatcher: full
+  python_module: fft
+  variants: function
+
+- func: fft_rfftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
   python_module: fft
   variants: function

 - func: fft_irfftn(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor
-  use_c10_dispatcher: full
+  python_module: fft
+  variants: function
+
+- func: fft_irfftn.out(Tensor self, int[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+  use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
   python_module: fft
   variants: function

@@ -9962,18 +8836,26 @@
   python_module: fft
   variants: function

+- func: fft_fftfreq.out(int n, float d=1.0, *, Tensor(a!) out) -> Tensor(a!)
+  use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
+  python_module: fft
+  variants: function
+
 - func: fft_rfftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
   use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
   python_module: fft
   variants: function

+- func: fft_rfftfreq.out(int n, float d=1.0, *, Tensor(a!) out) -> Tensor(a!)
+  use_c10_dispatcher: hacky_wrapper_for_legacy_signatures
+  python_module: fft
+  variants: function
+
 - func: fft_fftshift(Tensor self, int[1]? dim=None) -> Tensor
-  use_c10_dispatcher: full
   python_module: fft
   variants: function

 - func: fft_ifftshift(Tensor self, int[1]? dim=None) -> Tensor
-  use_c10_dispatcher: full
   python_module: fft
   variants: function

@@ -9987,7 +8869,6 @@
 # See linalg_det as an example.
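The hunks above give each torch.fft transform, plus fft_fftfreq and fft_rfftfreq, an explicit .out overload registered with use_c10_dispatcher: hacky_wrapper_for_legacy_signatures. A minimal Python sketch of how those schemas surface in the Python API on a build that includes this change (buffer names here are illustrative, not from the patch):

    import torch

    x = torch.randn(8, dtype=torch.complex64)
    buf = torch.empty_like(x)

    # The new .out schema shows up as the usual out= keyword argument.
    torch.fft.fft(x, out=buf)

    # The frequency helpers get the same treatment.
    freqs = torch.empty(8)
    torch.fft.fftfreq(8, d=0.1, out=freqs)
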
- func: linalg_cholesky(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10002,25 +8883,21 @@ # torch.linalg.det, alias for torch.det - func: linalg_det(Tensor self) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function - func: det(Tensor self) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: det - func: _syevd_helper(Tensor self, bool compute_eigenvectors, str uplo) -> (Tensor, Tensor) - use_c10_dispatcher: full variants: function dispatch: CPU: _syevd_helper_cpu CUDA: _syevd_helper_cuda - func: linalg_eigh(Tensor self, str UPLO="L") -> (Tensor eigenvalues, Tensor eigenvectors) - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10033,7 +8910,6 @@ DefaultBackend: linalg_eigh_out - func: linalg_eigvalsh(Tensor self, str UPLO="L") -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10046,7 +8922,6 @@ DefaultBackend: linalg_eigvalsh_out - func: inner(Tensor self, Tensor other) -> Tensor - use_c10_dispatcher: full variants: function, method - func: inner.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) @@ -10054,14 +8929,12 @@ # torch.outer, alias for torch.ger - func: outer(Tensor self, Tensor vec2) -> Tensor - use_c10_dispatcher: full variants: function, method - func: outer.out(Tensor self, Tensor vec2, *, Tensor(a!) out) -> Tensor(a!) use_c10_dispatcher: hacky_wrapper_for_legacy_signatures - func: ger(Tensor self, Tensor vec2) -> Tensor - use_c10_dispatcher: full variants: function, method dispatch: DefaultBackend: ger @@ -10072,12 +8945,10 @@ DefaultBackend: ger_out - func: linalg_norm(Tensor self, Scalar? ord=None, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function - func: linalg_norm.ord_str(Tensor self, str ord, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function @@ -10092,7 +8963,6 @@ variants: function - func: linalg_cond(Tensor self, Scalar? p=None) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10106,7 +8976,6 @@ Math: linalg_cond_out - func: linalg_cond.p_str(Tensor self, str p) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10154,7 +9023,6 @@ Math: linalg_tensorinv_out - func: linalg_tensorsolve(Tensor self, Tensor other, int[]? dims=None) -> Tensor - use_c10_dispatcher: full python_module: linalg variants: function dispatch: @@ -10167,8 +9035,28 @@ dispatch: Math: linalg_tensorsolve_out -- func: linalg_matrix_rank(Tensor self, float? tol=None, bool hermitian=False) -> Tensor +- func: linalg_qr(Tensor self, str mode='reduced') -> (Tensor Q, Tensor R) + python_module: linalg + use_c10_dispatcher: full + variants: function + dispatch: + DefaultBackend: linalg_qr + +- func: linalg_qr.out(Tensor self, str mode='reduced', *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R) + use_c10_dispatcher: hacky_wrapper_for_legacy_signatures + python_module: linalg + variants: function + dispatch: + DefaultBackend: linalg_qr_out + +- func: _linalg_qr_helper(Tensor self, str mode) -> (Tensor, Tensor) use_c10_dispatcher: full + variants: function + dispatch: + CPU: _linalg_qr_helper_cpu + CUDA: _linalg_qr_helper_cuda + +- func: linalg_matrix_rank(Tensor self, float? 
tol=None, bool hermitian=False) -> Tensor python_module: linalg variants: function dispatch: @@ -10184,30 +9072,36 @@ ## Functions that are only for testing # It is undocumented and should not be used outside of tests. - func: _test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor - use_c10_dispatcher: full # Note: this function is only for testing. - func: _test_optional_intlist(Tensor values, int[]? addends) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: _test_optional_intlist # Note: this function is only for testing. - func: _test_optional_filled_intlist(Tensor values, int[2]? addends) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: _test_optional_intlist # Note: this function is only for testing. - func: _test_optional_floatlist(Tensor values, float[]? addends) -> Tensor - use_c10_dispatcher: full python_module: nn dispatch: CPU: _test_optional_floatlist # Note: this function is only for testing. - func: _test_string_default(Tensor dummy, str a="\"'\\", str b='"\'\\') -> Tensor + python_module: nn + +# Note: this function is only for testing. +- func: _test_ambiguous_defaults.a(Tensor dummy, int a=1, int b=1) -> Tensor + use_c10_dispatcher: full + python_module: nn + +# Note: this function is only for testing. +- func: _test_ambiguous_defaults.b(Tensor dummy, int a=2, str b="2") -> Tensor + cpp_no_default_args: ['a', 'b'] use_c10_dispatcher: full python_module: nn diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp index b7d893ad55fc..05762bfb036f 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp @@ -746,7 +746,7 @@ at::Tensor PackedConvWeightsQnnp::apply_impl( run_status == pytorch_qnnp_status_success, "failed to run quantized::conv2d (qnnpack) operator"); - return output.contiguous(act.suggest_memory_format()); + return output; } template at::Tensor PackedConvWeightsQnnp<2>::apply( diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index d621efafee41..fb7e16539c15 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -14,7 +15,6 @@ namespace at { namespace native { using namespace at::sparse; - /****************************************************************************** * access methods ******************************************************************************/ @@ -328,7 +328,7 @@ SparseTensor dense_to_sparse(const Tensor& self, int64_t sparse_dim){ Tensor values; if (self.dim() > 0) { - std::vector ix = indices.chunk(indices.size(0), 0); + auto ix = toListOfOptionalTensors(indices.chunk(indices.size(0), 0)); values = self.index(ix).squeeze(0).clone(at::MemoryFormat::Preserve); } else { AT_ASSERT(nz.sizes().equals({0, 1})); diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 60df74061c7a..6c3298b72e75 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -544,7 +544,7 @@ SparseTensor& add_out_sparse_non_contiguous(SparseTensor& r, const SparseTensor& Tensor& add_out_dense_sparse_cpu(Tensor& r, const Tensor& dense, const SparseTensor& sparse_, Scalar value); -SparseTensor& add_out_sparse_cpu(SparseTensor& r, const SparseTensor& t, const SparseTensor& src, Scalar value) { +SparseTensor& 
add_out_sparse_cpu(const SparseTensor& t, const SparseTensor& src, Scalar value, SparseTensor& r) {
   if (!t.is_sparse()) {
     return add_out_dense_sparse_cpu(r, t, src, value);
   }
@@ -650,7 +650,7 @@ Tensor& add_out_dense_sparse_cpu(Tensor& r, const Tensor& dense, const SparseTen
         dstBuffer.add_(srcBuffer, value);
       }
     } else {
-      AT_DISPATCH_ALL_TYPES(
+      AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Bool,
           commonDtype, "add_dense_sparse", [&] {
             add_dense_sparse_worker_cpu(resultBuffer, value, sparse, indices, valuesBuffer);
           });
diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu
index 753ea9fa4937..fce3446816e7 100644
--- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu
+++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu
@@ -338,8 +338,8 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r_, const Tensor& dense, const SparseT
   if (sparse.dense_dim() == 0) {
     TORCH_CHECK(cuda::getApplyGrid(nnz, grid, curDevice), "add: Argument #0: tensor too large or too many dimensions");

-    AT_DISPATCH_ALL_TYPES_AND2(
-      at::ScalarType::Half, at::ScalarType::BFloat16, commonDtype, "add_out_dense_sparse_cuda", [&] {
+    AT_DISPATCH_ALL_TYPES_AND3(
+      at::ScalarType::Bool, at::ScalarType::Half, at::ScalarType::BFloat16, commonDtype, "add_out_dense_sparse_cuda", [&] {
         apply::sparseElementwiseKernelScalar, uint64_t, scalar_t>
           <<>>(
             TensorCAddOp(value.to()),
@@ -399,7 +399,7 @@ Tensor& add_out_dense_sparse_cuda(Tensor& r, const Tensor& dense, const SparseT

 Tensor& add_out_dense_sparse_cuda(Tensor& r, const Tensor& dense, const SparseTensor& sparse_, Scalar value);

-SparseTensor& add_out_sparse_cuda(SparseTensor& r_, const SparseTensor& t, const SparseTensor& src, Scalar value) {
+SparseTensor& add_out_sparse_cuda(const SparseTensor& t, const SparseTensor& src, Scalar value, SparseTensor& r_) {
   if (!t.is_sparse()) {
     return add_out_dense_sparse_cuda(r_, t, src, value);
   }
diff --git a/aten/src/ATen/native/vulkan/VulkanAten.cpp b/aten/src/ATen/native/vulkan/VulkanAten.cpp
index 4dba9de7d5b0..88c519c09ea3 100644
--- a/aten/src/ATen/native/vulkan/VulkanAten.cpp
+++ b/aten/src/ATen/native/vulkan/VulkanAten.cpp
@@ -548,7 +548,7 @@ TORCH_LIBRARY_IMPL(aten, Vulkan, m) {
   m.impl("view", TORCH_FN(at::native::vulkan::aten::reshape));
   m.impl("select.int", TORCH_FN(at::native::vulkan::aten::select));
   m.impl("transpose.int", TORCH_FN(at::native::vulkan::aten::transpose));
-  m.impl_UNBOXED("transpose_", at::native::vulkan::aten::transpose_);
+  m.impl("transpose_", at::native::vulkan::aten::transpose_);
   m.impl("view", TORCH_FN(at::native::vulkan::aten::view));
   m.impl("unsqueeze", TORCH_FN(at::native::vulkan::aten::unsqueeze));
   m.impl("empty.memory_format", at::native::vulkan::aten::empty);
@@ -569,11 +569,11 @@ TORCH_LIBRARY_IMPL(aten, Vulkan, m) {
   m.impl("_cat", TORCH_FN(at::native::vulkan::aten::cat));
   m.impl("mul.Scalar", TORCH_FN(at::native::vulkan::aten::mul_scalar));
   m.impl("add.Scalar", TORCH_FN(at::native::vulkan::aten::add_scalar));
-  m.impl_UNBOXED(
+  m.impl(
       "convolution_overrideable", at::native::vulkan::aten::convolution);
   m.impl("hardtanh_", at::native::vulkan::aten::hardtanh_);
   m.impl("relu_", at::native::vulkan::aten::relu_);
-  m.impl_UNBOXED("add_.Tensor", at::native::vulkan::aten::add_);
+  m.impl("add_.Tensor", at::native::vulkan::aten::add_);
 }

 #endif /* USE_VULKAN_API */
diff --git a/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl b/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl
index 58394dca19da..2c02e034603e 100644
---
a/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl +++ b/aten/src/ATen/native/vulkan/glsl/KO4C4HW_to_image.glsl @@ -1,7 +1,6 @@ #version 450 core #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput; layout(set = 0, binding = 1) readonly buffer kernel { vec4 data[]; diff --git a/aten/src/ATen/native/vulkan/glsl/adaptive_avg_pool2d.glsl b/aten/src/ATen/native/vulkan/glsl/adaptive_avg_pool2d.glsl index d5b9af843dbe..75243a69bca3 100644 --- a/aten/src/ATen/native/vulkan/glsl/adaptive_avg_pool2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/adaptive_avg_pool2d.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/add.glsl b/aten/src/ATen/native/vulkan/glsl/add.glsl index 8dcff0476edf..361927373a49 100644 --- a/aten/src/ATen/native/vulkan/glsl/add.glsl +++ b/aten/src/ATen/native/vulkan/glsl/add.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/add_.glsl b/aten/src/ATen/native/vulkan/glsl/add_.glsl index ed82d0cbe87b..d6360a376c58 100644 --- a/aten/src/ATen/native/vulkan/glsl/add_.glsl +++ b/aten/src/ATen/native/vulkan/glsl/add_.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/add_scalar.glsl b/aten/src/ATen/native/vulkan/glsl/add_scalar.glsl index 8882ba0d8ff2..735086a8150a 100644 --- a/aten/src/ATen/native/vulkan/glsl/add_scalar.glsl +++ b/aten/src/ATen/native/vulkan/glsl/add_scalar.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/add_scalar_.glsl b/aten/src/ATen/native/vulkan/glsl/add_scalar_.glsl index bffd680669fb..a418a28bb5c3 100644 --- a/aten/src/ATen/native/vulkan/glsl/add_scalar_.glsl +++ b/aten/src/ATen/native/vulkan/glsl/add_scalar_.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/addmm.glsl b/aten/src/ATen/native/vulkan/glsl/addmm.glsl index 61f76fa8cf5d..a8f09252a167 100644 --- a/aten/src/ATen/native/vulkan/glsl/addmm.glsl +++ b/aten/src/ATen/native/vulkan/glsl/addmm.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/avg_pool2d.glsl b/aten/src/ATen/native/vulkan/glsl/avg_pool2d.glsl index df2bbcf18014..5de8cf13225f 100644 --- a/aten/src/ATen/native/vulkan/glsl/avg_pool2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/avg_pool2d.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/clamp.glsl b/aten/src/ATen/native/vulkan/glsl/clamp.glsl index c394dfd26627..52c2d2d96c26 100644 --- a/aten/src/ATen/native/vulkan/glsl/clamp.glsl +++ b/aten/src/ATen/native/vulkan/glsl/clamp.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; 
-layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/clamp_.glsl b/aten/src/ATen/native/vulkan/glsl/clamp_.glsl index b16258685114..3f138bb93ec6 100644 --- a/aten/src/ATen/native/vulkan/glsl/clamp_.glsl +++ b/aten/src/ATen/native/vulkan/glsl/clamp_.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d.glsl index 9646eb8c9f19..547eec7fafef 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ @@ -18,7 +17,7 @@ layout(set = 0, binding = 4) uniform PRECISION restrict Block ivec2 padding; ivec2 dilate; vec2 clamp; - int stacks_per_tower; + ivec3 src_kernel; } uBlock; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -29,9 +28,6 @@ void main() { /* Dynamically Uniform */ const ivec3 size = imageSize(uOutput); const ivec3 isize = textureSize(uInput, 0); - const int tower = pos.z/(uBlock.stacks_per_tower); - const int tower_offset = pos.z % uBlock.stacks_per_tower; - const ivec4 block = tower_offset * uBlock.kernel.z + ivec4(0, 1, 2, 3); if (all(lessThan(pos, size))) { const ivec2 ipos = pos.xy * uBlock.stride - uBlock.padding; @@ -43,16 +39,15 @@ void main() { vec4 sum = uBias.data[pos.z]; for (int z = 0; z < uBlock.kernel.z; z+=4) { - const ivec4 kz = block + z; - - for (int y = start.y, ky = kstart.y; y < end.y; y += uBlock.dilate.y, ++ky) { - for (int x = start.x, kx = kstart.x; x < end.x; x += uBlock.dilate.x, ++kx) { - const vec4 In = texelFetch(uInput, ivec3(x, y, z/4), 0); - - sum = fma(In.xxxx, texelFetch(uKernel, ivec3(kx, (uBlock.kernel.y*tower) + ky, kz.x), 0), sum); - sum = fma(In.yyyy, texelFetch(uKernel, ivec3(kx, (uBlock.kernel.y*tower) + ky, kz.y), 0), sum); - sum = fma(In.zzzz, texelFetch(uKernel, ivec3(kx, (uBlock.kernel.y*tower) + ky, kz.z), 0), sum); - sum = fma(In.wwww, texelFetch(uKernel, ivec3(kx, (uBlock.kernel.y*tower) + ky, kz.w), 0), sum); + const int z4 = z/4; + for (int y = start.y, ky = kstart.y + pos.z * uBlock.src_kernel.y; y < end.y; y += uBlock.dilate.y, ++ky) { + for (int x = start.x, kx = 4*kstart.x + z4*uBlock.src_kernel.z; x < end.x; x += uBlock.dilate.x, kx+=4) { + const vec4 In = texelFetch(uInput, ivec3(x, y, z4), 0); + + sum = fma(In.xxxx, texelFetch(uKernel, ivec3(0 + kx, ky, 0), 0), sum); + sum = fma(In.yyyy, texelFetch(uKernel, ivec3(1 + kx, ky, 0), 0), sum); + sum = fma(In.zzzz, texelFetch(uKernel, ivec3(2 + kx, ky, 0), 0), sum); + sum = fma(In.wwww, texelFetch(uKernel, ivec3(3 + kx, ky, 0), 0), sum); } } } diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl index fe50262f7d46..f8f929461ce7 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_dw.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ @@ -18,6 +17,7 @@ layout(set = 0, binding = 4) uniform PRECISION restrict Block ivec2 padding; ivec2 dilate; vec2 clamp; + ivec2 src_kernel; } uBlock; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -39,10 +39,10 @@ void 
main() { vec4 sum = uBias.data[pos.z]; for (int y = start.y, ky = kstart.y; y < end.y; y += uBlock.dilate.y, ++ky) { - for (int x = start.x, kx = kstart.x; x < end.x; x += uBlock.dilate.x, ++kx) { + for (int x = start.x, kx = kstart.x + ky*uBlock.src_kernel.x; x < end.x; x += uBlock.dilate.x, ++kx) { sum = fma( texelFetch(uInput, ivec3(x, y, pos.z), 0), - texelFetch(uKernel, ivec3(kx, ky, pos.z), 0), + texelFetch(uKernel, ivec3(kx, pos.z, 0), 0), sum); } } diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl index 37a5898b9f10..5155c07669c1 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_dw_clamp.glsl @@ -1,7 +1,6 @@ #version 450 core #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION sampler3D uKernel; diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl index b73c58e0f54d..89411284fed4 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp.glsl @@ -1,7 +1,6 @@ #version 450 core #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION sampler3D uKernel; diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp_1x.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp_1x.glsl index 5cef89c2727f..8baae9b5fcd5 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp_1x.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_nogroup_clamp_1x.glsl @@ -1,7 +1,6 @@ #version 450 core #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; layout(set = 0, rgba32f, binding = 0) writeonly PRECISION uniform image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform PRECISION sampler3D uKernel; diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl index 48d9f785008b..b28f0550132f 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ @@ -17,7 +16,6 @@ layout(set = 0, binding = 4) uniform PRECISION restrict Block ivec2 stride; ivec2 padding; vec2 clamp; - int stacks_per_tower; } uBlock; layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; @@ -28,9 +26,6 @@ void main() { /* Dynamically Uniform */ const ivec3 size = imageSize(uOutput); const ivec3 isize = textureSize(uInput, 0); - const int tower = pos.z/(uBlock.stacks_per_tower); - const int tower_offset = pos.z % uBlock.stacks_per_tower; - const ivec4 block = tower_offset * uBlock.kernel.x + ivec4(0, 1, 2, 3); if (all(lessThan(pos, size))) { const ivec2 ipos = pos.xy * uBlock.stride - uBlock.padding; @@ -39,12 +34,11 @@ void main() { for (int z = 0; z < uBlock.kernel.x; z+=4) { const vec4 In = texelFetch(uInput, ivec3(ipos.x, ipos.y, z/4), 0); - const ivec4 
kz = block + z; - sum = fma(In.xxxx, texelFetch(uKernel, ivec3(0, tower, kz.x), 0), sum); - sum = fma(In.yyyy, texelFetch(uKernel, ivec3(0, tower, kz.y), 0), sum); - sum = fma(In.zzzz, texelFetch(uKernel, ivec3(0, tower, kz.z), 0), sum); - sum = fma(In.wwww, texelFetch(uKernel, ivec3(0, tower, kz.w), 0), sum); + sum = fma(In.xxxx, texelFetch(uKernel, ivec3(z+0, pos.z, 0), 0), sum); + sum = fma(In.yyyy, texelFetch(uKernel, ivec3(z+1, pos.z, 0), 0), sum); + sum = fma(In.zzzz, texelFetch(uKernel, ivec3(z+2, pos.z, 0), 0), sum); + sum = fma(In.wwww, texelFetch(uKernel, ivec3(z+3, pos.z, 0), 0), sum); } imageStore( diff --git a/aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl b/aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl index d19c370ec9bd..01d653bf06de 100644 --- a/aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl +++ b/aten/src/ATen/native/vulkan/glsl/image_to_nchw.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/max_pool2d.glsl b/aten/src/ATen/native/vulkan/glsl/max_pool2d.glsl index 948b797a5207..88373605d010 100644 --- a/aten/src/ATen/native/vulkan/glsl/max_pool2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/max_pool2d.glsl @@ -1,7 +1,6 @@ #version 450 core #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; layout(set = 0, rgba16f, binding = 0) writeonly PRECISION uniform image3D uOutput; layout(set = 0, binding = 1) uniform PRECISION sampler3D uInput; layout(set = 0, binding = 2) uniform constBlock { diff --git a/aten/src/ATen/native/vulkan/glsl/mean.glsl b/aten/src/ATen/native/vulkan/glsl/mean.glsl index 130d716ca9e6..551fd747f103 100644 --- a/aten/src/ATen/native/vulkan/glsl/mean.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mean.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/mean2d.glsl b/aten/src/ATen/native/vulkan/glsl/mean2d.glsl index 266226aa708b..b8d0add329f2 100644 --- a/aten/src/ATen/native/vulkan/glsl/mean2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mean2d.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/mm.glsl b/aten/src/ATen/native/vulkan/glsl/mm.glsl index 00ab5f31e6db..157acfe9c074 100644 --- a/aten/src/ATen/native/vulkan/glsl/mm.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mm.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/mul_scalar.glsl b/aten/src/ATen/native/vulkan/glsl/mul_scalar.glsl index d3a98ba30bea..c0ae48fe3883 100644 --- a/aten/src/ATen/native/vulkan/glsl/mul_scalar.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mul_scalar.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/mul_scalar_.glsl b/aten/src/ATen/native/vulkan/glsl/mul_scalar_.glsl index b49252e128cc..f959052879ad 100644 --- a/aten/src/ATen/native/vulkan/glsl/mul_scalar_.glsl +++ b/aten/src/ATen/native/vulkan/glsl/mul_scalar_.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* 
Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/nchw_to_image.glsl b/aten/src/ATen/native/vulkan/glsl/nchw_to_image.glsl index fb87b5a36918..adbafcbd0438 100644 --- a/aten/src/ATen/native/vulkan/glsl/nchw_to_image.glsl +++ b/aten/src/ATen/native/vulkan/glsl/nchw_to_image.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/glsl/permute.glsl b/aten/src/ATen/native/vulkan/glsl/permute.glsl index af8e33588f78..3d1191ff6eea 100644 --- a/aten/src/ATen/native/vulkan/glsl/permute.glsl +++ b/aten/src/ATen/native/vulkan/glsl/permute.glsl @@ -1,6 +1,5 @@ #version 450 core layout(std430) buffer; -layout(std430) uniform; layout(set = 0, binding = 0) writeonly buffer outputBuffer { float data[]; } diff --git a/aten/src/ATen/native/vulkan/glsl/upsample_nearest2d.glsl b/aten/src/ATen/native/vulkan/glsl/upsample_nearest2d.glsl index efb1c5c7fc9a..b4db9b87dacb 100644 --- a/aten/src/ATen/native/vulkan/glsl/upsample_nearest2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/upsample_nearest2d.glsl @@ -2,7 +2,6 @@ #define PRECISION $precision layout(std430) buffer; -layout(std430) uniform; /* Qualifiers: layout - storage - precision - memory */ diff --git a/aten/src/ATen/native/vulkan/ops/Clamp.cpp b/aten/src/ATen/native/vulkan/ops/Clamp.cpp index 369a47fee93a..9f25d89bca9b 100644 --- a/aten/src/ATen/native/vulkan/ops/Clamp.cpp +++ b/aten/src/ATen/native/vulkan/ops/Clamp.cpp @@ -167,10 +167,10 @@ Tensor& relu_(Tensor& self) { TORCH_LIBRARY_IMPL(aten, Vulkan, m) { m.impl("clamp", TORCH_FN(clamp)); m.impl("clamp_", TORCH_FN(clamp_)); - m.impl_UNBOXED("hardtanh", hardtanh); - m.impl_UNBOXED("hardtanh_", hardtanh_); - m.impl_UNBOXED("relu", relu); - m.impl_UNBOXED("relu_", relu_); + m.impl("hardtanh", hardtanh); + m.impl("hardtanh_", hardtanh_); + m.impl("relu", relu); + m.impl("relu_", relu_); } #endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/ops/Common.h b/aten/src/ATen/native/vulkan/ops/Common.h index 6f7080f71a80..b0bbeeaf34f1 100644 --- a/aten/src/ATen/native/vulkan/ops/Common.h +++ b/aten/src/ATen/native/vulkan/ops/Common.h @@ -36,7 +36,7 @@ struct Layout final { }; struct Experimentation { - static constexpr bool kUseConv2dOldApi = true; + static constexpr bool kUseConv2dOldApi = false; }; struct ConvPrepackLimits final { diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.cpp b/aten/src/ATen/native/vulkan/ops/Convolution.cpp index 5af2c14b80cb..d88545e3a25a 100644 --- a/aten/src/ATen/native/vulkan/ops/Convolution.cpp +++ b/aten/src/ATen/native/vulkan/ops/Convolution.cpp @@ -25,7 +25,7 @@ inline bool is_pointwise(const IntArrayRef filter) { (1 == filter[Layout::Filter::width]); } -vTensor pack_weights( +vTensor pack_weights_dw( api::Resource::Pool& pool, const Tensor& weight_arg, const int64_t groups) { @@ -39,161 +39,201 @@ vTensor pack_weights( const IntArrayRef src_filter = weight.sizes(); const float* const src_weight_ptr = weight.data_ptr(); - // - // Depthwise - // + const int64_t src_kw_sz = src_filter[Layout::Filter::width]; + const int64_t src_kh_sz = src_filter[Layout::Filter::height]; + const int64_t num_stacks = div_up(src_filter[Layout::Filter::output], INT64_C(4)); + vTensor v_weight{ + api::context(), + &pool, + { + 4, + num_stacks, + src_kw_sz * src_kh_sz, + }, + weight.options(), + }; - if (is_depthwise(src_filter, groups)) { - vTensor v_weight{ - api::context(), - &pool, - 
src_filter, - weight.options(), - }; + using Future = vTensor::Future; + Future v_weight_future = v_weight.host(); + Future::Payload v_weight_payload = v_weight_future.wait(); - using Future = vTensor::Future; - Future v_weight_future = v_weight.host(); - Future::Payload v_weight_payload = v_weight_future.wait(); + /* Source */ + const int64_t src_kernel_sz = src_kw_sz * src_kh_sz; + const int64_t src_block_sz = + src_kernel_sz * src_filter[Layout::Filter::input]; - memcpy( - v_weight_payload.get(), - src_weight_ptr, - std::min(weight.nbytes(), v_weight.nbytes())); + /* Destination */ + const int64_t dst_kw_sz = src_kw_sz * src_kh_sz; + const int64_t dst_kh_sz = num_stacks; + const int64_t dst_kernel_sz = dst_kw_sz * dst_kh_sz; - return v_weight; - } + float* const dst_weight_ptr = v_weight_payload.get(); + memset(dst_weight_ptr, 0, v_weight.nbytes()); - // - // General - // + for (int64_t src_oc = 0; src_oc < src_filter[Layout::Filter::output]; ++src_oc) { + /* Source */ + const float* const src_weight_oc_ptr = src_weight_ptr + src_oc * src_block_sz; - if (Experimentation::kUseConv2dOldApi) { - const uint32_t OC = src_filter[Layout::Filter::output]; - const uint32_t OC_4 = at::native::vulkan::api::utils::div_up(OC, 4u); - const uint32_t C = src_filter[Layout::Filter::input]; - const uint32_t C_4 = at::native::vulkan::api::utils::div_up(C, 4u); - const uint32_t KH = src_filter[Layout::Filter::height]; - const uint32_t KW = src_filter[Layout::Filter::width]; - - vTensor v_weight{ - api::context(), - &pool, - { - 1, - 4 * KH * KW, - OC_4, - 4 * C_4 - }, - weight.options(), - }; + /* Destination */ + const int64_t dst_oh = src_oc / 4; + const int64_t dst_c = src_oc % 4; - using Future = vTensor::Future; - Future v_weight_future = v_weight.host(); - Future::Payload v_weight_payload = v_weight_future.wait(); + float* const dst_weight_c_ptr = dst_weight_ptr + dst_c * dst_kernel_sz; - float* const dst_weight_ptr = v_weight_payload.get(); - memset(dst_weight_ptr, 0, v_weight.nbytes()); + for (int64_t src_ih = 0; src_ih < src_filter[Layout::Filter::height]; ++src_ih) { + memcpy( + dst_weight_c_ptr + dst_oh * dst_kw_sz + src_ih * src_kw_sz, + src_weight_oc_ptr + src_ih * src_kw_sz, + sizeof(float) * src_kw_sz); + } + } - const float* src = src_weight_ptr; - float* const dst = dst_weight_ptr; + return v_weight; +} +vTensor pack_weights_old( + api::Resource::Pool& pool, + const Tensor& weight_arg, + const int64_t groups) { + if (weight_arg.is_vulkan()) { + return convert(weight_arg); + } + + const Tensor weight = weight_arg.contiguous(); + const IntArrayRef src_filter = weight.sizes(); + const float* const src_weight_ptr = weight.data_ptr(); + + const uint32_t OC = src_filter[Layout::Filter::output]; + const uint32_t OC_4 = at::native::vulkan::api::utils::div_up(OC, 4u); + const uint32_t C = src_filter[Layout::Filter::input]; + const uint32_t C_4 = at::native::vulkan::api::utils::div_up(C, 4u); + const uint32_t KH = src_filter[Layout::Filter::height]; + const uint32_t KW = src_filter[Layout::Filter::width]; + + vTensor v_weight{ + api::context(), + &pool, { - uint32_t ridx = 0; - const uint32_t oc_4SizeNumel = KW * KH * C_4 * 16; - for (uint32_t oc = 0; oc < OC; ++oc) { - int oc_4 = oc / 4; - int oc_4_i = oc % 4; - float* dst_oc = dst + oc_4 * oc_4SizeNumel; - for (uint32_t ic = 0; ic < C; ++ic) { - int ic_4 = ic / 4; - int ic_4_i = ic % 4; - float* dst_ic = dst_oc + ic_4 * KW * KH * 16; - for (uint32_t ky = 0; ky < KH; ++ky) { - float* dst_ky = dst_ic + ky * KW * 16; - for (uint32_t kx = 0; kx < KW; 
++kx) { - float* dst_kx = dst_ky + kx * 16; - dst_kx[4 * ic_4_i + oc_4_i] = src[ridx++]; - } + 1, + 4 * KH * KW, + OC_4, + 4 * C_4 + }, + weight.options(), + }; + + using Future = vTensor::Future; + Future v_weight_future = v_weight.host(); + Future::Payload v_weight_payload = v_weight_future.wait(); + + float* const dst_weight_ptr = v_weight_payload.get(); + memset(dst_weight_ptr, 0, v_weight.nbytes()); + + const float* src = src_weight_ptr; + float* const dst = dst_weight_ptr; + + { + uint32_t ridx = 0; + const uint32_t oc_4SizeNumel = KW * KH * C_4 * 16; + for (uint32_t oc = 0; oc < OC; ++oc) { + int oc_4 = oc / 4; + int oc_4_i = oc % 4; + float* dst_oc = dst + oc_4 * oc_4SizeNumel; + for (uint32_t ic = 0; ic < C; ++ic) { + int ic_4 = ic / 4; + int ic_4_i = ic % 4; + float* dst_ic = dst_oc + ic_4 * KW * KH * 16; + for (uint32_t ky = 0; ky < KH; ++ky) { + float* dst_ky = dst_ic + ky * KW * 16; + for (uint32_t kx = 0; kx < KW; ++kx) { + float* dst_kx = dst_ky + kx * 16; + dst_kx[4 * ic_4_i + oc_4_i] = src[ridx++]; } } } + } - // shader KO4C4HW_to_image - struct Image3D { - float* data_; - uint32_t dim0_, dim1_, dim2_; - - Image3D(uint32_t dim0, uint32_t dim1, uint32_t dim2) { - dim0_ = dim0; - dim1_ = dim1; - dim2_ = dim2; - data_ = new float[dim0 * dim1 * dim2 * 4]; - memset(data_, 0.f, dim0 * dim1 * dim2 * 4 * sizeof(float)); - } + // shader KO4C4HW_to_image + struct Image3D { + float* data_; + uint32_t dim0_, dim1_, dim2_; + + Image3D(uint32_t dim0, uint32_t dim1, uint32_t dim2) { + dim0_ = dim0; + dim1_ = dim1; + dim2_ = dim2; + data_ = new float[dim0 * dim1 * dim2 * 4]; + memset(data_, 0.f, dim0 * dim1 * dim2 * 4 * sizeof(float)); + } - inline uint32_t idx(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3) { - return i3 + i2 * 4 + i1 * 4 * dim2_ + i0 * 4 * dim2_ * dim1_; - } + inline uint32_t idx(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3) { + return i3 + i2 * 4 + i1 * 4 * dim2_ + i0 * 4 * dim2_ * dim1_; + } - void set(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, float value) { - data_[idx(i0, i1, i2, i3)] = value; - } + void set(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, float value) { + data_[idx(i0, i1, i2, i3)] = value; + } - float get(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3) { - return data_[idx(i0, i1, i2, i3)]; - } - } image{4 * C_4, OC_4, KH * KW}; - - for (uint32_t sx = 0; sx < C_4; ++sx) { - for (uint32_t sy = 0; sy < OC_4; ++sy) { - for (uint32_t sz = 0; sz < (KH * KW); ++sz) { - for (uint32_t vi = 0; vi < 4; ++vi) { - int bufferVIdx = 4 * sx * KH * KW + 4 * sy * C_4 * KH * KW + 4 * sz; - image.set(4 * sx + 0, sy, sz, vi, dst[4 * (bufferVIdx + 0) + vi]); - image.set(4 * sx + 1, sy, sz, vi, dst[4 * (bufferVIdx + 1) + vi]); - image.set(4 * sx + 2, sy, sz, vi, dst[4 * (bufferVIdx + 2) + vi]); - image.set(4 * sx + 3, sy, sz, vi, dst[4 * (bufferVIdx + 3) + vi]); - } + float get(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3) { + return data_[idx(i0, i1, i2, i3)]; + } + } image{4 * C_4, OC_4, KH * KW}; + + for (uint32_t sx = 0; sx < C_4; ++sx) { + for (uint32_t sy = 0; sy < OC_4; ++sy) { + for (uint32_t sz = 0; sz < (KH * KW); ++sz) { + for (uint32_t vi = 0; vi < 4; ++vi) { + int bufferVIdx = 4 * sx * KH * KW + 4 * sy * C_4 * KH * KW + 4 * sz; + image.set(4 * sx + 0, sy, sz, vi, dst[4 * (bufferVIdx + 0) + vi]); + image.set(4 * sx + 1, sy, sz, vi, dst[4 * (bufferVIdx + 1) + vi]); + image.set(4 * sx + 2, sy, sz, vi, dst[4 * (bufferVIdx + 2) + vi]); + image.set(4 * sx + 3, sy, sz, vi, dst[4 * (bufferVIdx + 3) + vi]); } } } + } - // inverse 
function of nchw_to_image - const uint32_t W = 4 * C_4; - const uint32_t H = OC_4; - const uint32_t D = KH * KW; - for (uint32_t sx = 0; sx < W; ++sx) { - for (uint32_t sy = 0; sy < H; ++sy) { - for (uint32_t sz = 0; sz < D; ++sz) { - for (uint32_t szvi = 0; szvi < 4; ++szvi) { - dst_weight_ptr[W * sy + sx + (4 * sz + szvi) * W * H] = image.get(sx, sy, sz, szvi); - } + // inverse function of nchw_to_image + const uint32_t W = 4 * C_4; + const uint32_t H = OC_4; + const uint32_t D = KH * KW; + for (uint32_t sx = 0; sx < W; ++sx) { + for (uint32_t sy = 0; sy < H; ++sy) { + for (uint32_t sz = 0; sz < D; ++sz) { + for (uint32_t szvi = 0; szvi < 4; ++szvi) { + dst_weight_ptr[W * sy + sx + (4 * sz + szvi) * W * H] = image.get(sx, sy, sz, szvi); } } } } + } - return v_weight; + return v_weight; +} + +vTensor pack_weights_2d( + api::Resource::Pool& pool, + const Tensor& weight_arg, + const int64_t groups) { + if (weight_arg.is_vulkan()) { + return convert(weight_arg); } + const Tensor weight = weight_arg.contiguous(); + const IntArrayRef src_filter = weight.sizes(); + const float* const src_weight_ptr = weight.data_ptr(); + + const int64_t src_kw_sz = src_filter[Layout::Filter::width]; + const int64_t src_kh_sz = src_filter[Layout::Filter::height]; const int64_t num_stacks = div_up(src_filter[Layout::Filter::output], INT64_C(4)); - const int64_t stack_depth = - 4 * api::utils::align_up(src_filter[Layout::Filter::input], INT64_C(4)); - const int64_t max_stacks_per_tower = - ConvPrepackLimits::maxStackDepth / stack_depth; - const int64_t num_towers = div_up(num_stacks, max_stacks_per_tower); - int64_t stacks_per_tower = num_stacks; - if (num_towers > 1) { - stacks_per_tower = div_up(num_stacks, num_towers); - } + const int64_t stack_depth = api::utils::align_up(src_filter[Layout::Filter::input], INT64_C(4)); vTensor v_weight{ api::context(), &pool, { - stacks_per_tower, - stack_depth, - src_filter[Layout::Filter::height] * num_towers, - src_filter[Layout::Filter::width], + 4, + src_kh_sz * num_stacks, + src_kw_sz * stack_depth, }, weight.options(), }; @@ -203,53 +243,59 @@ vTensor pack_weights( Future::Payload v_weight_payload = v_weight_future.wait(); /* Source */ - const int64_t src_kw_sz = src_filter[Layout::Filter::width]; - const int64_t src_kh_sz = src_filter[Layout::Filter::height]; const int64_t src_kernel_sz = src_kw_sz * src_kh_sz; const int64_t src_block_sz = src_kernel_sz * src_filter[Layout::Filter::input]; /* Destination */ - const IntArrayRef dst_filter = v_weight.sizes(); - const int64_t dst_kw_sz = src_filter[Layout::Filter::width]; - const int64_t dst_kh_sz = src_filter[Layout::Filter::height] * num_towers; + const int64_t dst_kw_sz = src_kw_sz * stack_depth; + const int64_t dst_kh_sz = src_kh_sz * num_stacks; const int64_t dst_kernel_sz = dst_kw_sz * dst_kh_sz; - const int64_t dst_block_sz = - dst_kernel_sz * dst_filter[Layout::Filter::input]; - - TORCH_INTERNAL_ASSERT(src_kernel_sz*num_towers == dst_kernel_sz, "Internal error!"); float* const dst_weight_ptr = v_weight_payload.get(); memset(dst_weight_ptr, 0, v_weight.nbytes()); for (int64_t src_oc = 0; src_oc < src_filter[Layout::Filter::output]; ++src_oc) { - const int64_t i_tower = src_oc / (stacks_per_tower * 4); /* Source */ - const float* const src_weight_oc_ptr = - src_weight_ptr + src_oc * src_block_sz; + const float* const src_weight_oc_ptr = src_weight_ptr + src_oc * src_block_sz; /* Destination */ - const int64_t local_oc = src_oc % (stacks_per_tower * 4); - const int64_t dst_oc = local_oc / 4; - const int64_t 
dst_oc_offset = local_oc % 4; + const int64_t dst_oh = src_oc / 4; + const int64_t dst_c = src_oc % 4; - float* const dst_weight_oc_ptr = dst_weight_ptr + dst_oc * dst_block_sz + - dst_oc_offset * dst_kernel_sz; + float* const dst_weight_c_ptr = dst_weight_ptr + dst_c * dst_kernel_sz; for (int64_t src_ic = 0; src_ic < src_filter[Layout::Filter::input]; ++src_ic) { - const int64_t dst_ic = 4 * src_ic; - - memcpy( - dst_weight_oc_ptr + dst_ic * dst_kernel_sz + - (i_tower * src_kernel_sz), - src_weight_oc_ptr + src_ic * src_kernel_sz, - sizeof(float) * src_kernel_sz); + const int64_t dst_ic4 = src_ic/4; + for (int64_t src_ih = 0; src_ih < src_kh_sz; ++src_ih) { + for (int64_t src_iw = 0; src_iw < src_kw_sz; ++src_iw) { + memcpy( + dst_weight_c_ptr + (dst_oh * src_kh_sz + src_ih) * dst_kw_sz + + dst_ic4 * src_kw_sz * 4 + src_iw * 4 + src_ic % 4, + src_weight_oc_ptr + src_ic * src_kernel_sz + src_ih * src_kw_sz + src_iw, + sizeof(float)); + } + } } } return v_weight; } +vTensor pack_weights( + api::Resource::Pool& pool, + const Tensor& weight_arg, + const int64_t groups) { + if (is_depthwise(weight_arg.sizes(), groups)) { + return pack_weights_dw(pool, weight_arg, groups); + } + + if (Experimentation::kUseConv2dOldApi) { + return pack_weights_old(pool, weight_arg, groups); + } + return pack_weights_2d(pool, weight_arg, groups); +} + vTensor pack_biases( api::Resource::Pool& pool, const c10::optional& bias, @@ -394,6 +440,7 @@ void conv2d_depthwise( const vTensor& v_weight, const vTensor& v_bias, const IntArrayRef filter, + const IntArrayRef src_filter, const IntArrayRef stride, const IntArrayRef padding, const IntArrayRef dilation, @@ -406,6 +453,7 @@ void conv2d_depthwise( int32_t padding_x, padding_y; int32_t dilate_x, dilate_y; float clamp_x, clamp_y; + int32_t src_filter_w, src_filter_h; } block { safe_downcast(filter[Layout::Filter::width]), safe_downcast(filter[Layout::Filter::height]), @@ -417,6 +465,8 @@ void conv2d_depthwise( safe_downcast(dilation[Layout::Parameter::height]), output_min, output_max, + safe_downcast(src_filter[Layout::Filter::width]), + safe_downcast(src_filter[Layout::Filter::height]), }; context->dispatch( @@ -473,14 +523,12 @@ void conv2d_pointwise( const float output_min, const float output_max) { if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) { - const int64_t stacks_per_tower = v_weight.sizes()[0]; const struct { int32_t kernel_ic, kernel_oc; int32_t stride_x, stride_y; int32_t padding_x, padding_y; float clamp_x, clamp_y; - int32_t stacks_per_tower; } block { safe_downcast(filter[Layout::Filter::input]), safe_downcast(filter[Layout::Filter::output]), @@ -490,7 +538,6 @@ void conv2d_pointwise( safe_downcast(padding[Layout::Parameter::height]), output_min, output_max, - safe_downcast(stacks_per_tower), }; context->dispatch( @@ -542,20 +589,20 @@ void conv2d( const vTensor& v_weight, const vTensor& v_bias, const IntArrayRef filter, + const IntArrayRef src_filter, const IntArrayRef stride, const IntArrayRef padding, const IntArrayRef dilation, const float output_min, const float output_max) { if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) { - const int64_t stacks_per_tower = v_weight.sizes()[0]; const struct { int32_t kernel_x, kernel_y, kernel_ic, kernel_oc; int32_t stride_x, stride_y; int32_t padding_x, padding_y; int32_t dilate_x, dilate_y; float clamp_x, clamp_y; - int32_t stacks_per_tower; + int32_t src_filter_w, src_filter_h, src_filter_w4; } block { safe_downcast(filter[Layout::Filter::width]), 
safe_downcast(filter[Layout::Filter::height]), @@ -569,7 +616,9 @@ void conv2d( safe_downcast(dilation[Layout::Parameter::height]), output_min, output_max, - safe_downcast(stacks_per_tower), + safe_downcast(src_filter[Layout::Filter::width]), + safe_downcast(src_filter[Layout::Filter::height]), + safe_downcast(src_filter[Layout::Filter::width]*4), }; context->dispatch( @@ -639,7 +688,7 @@ Tensor convolution( #ifdef USE_VULKAN_API TORCH_LIBRARY_IMPL(aten, Vulkan, m) { - m.impl_UNBOXED("convolution_overrideable", convolution); + m.impl("convolution_overrideable", convolution); } #endif /* USE_VULKAN_API */ @@ -859,6 +908,7 @@ Tensor Conv2dOpContext::run(const Tensor& input_arg) const { packed_.v_weight, packed_.v_bias, packed_.filter, + unpacked_.filter, packed_.stride, packed_.padding, packed_.dilation, @@ -904,6 +954,7 @@ Tensor Conv2dOpContext::run(const Tensor& input_arg) const { packed_.v_weight, packed_.v_bias, packed_.filter, + unpacked_.filter, packed_.stride, packed_.padding, packed_.dilation, diff --git a/aten/src/ATen/native/vulkan/ops/Factory.cpp b/aten/src/ATen/native/vulkan/ops/Factory.cpp index 6e48ba120c31..14deb30b9888 100644 --- a/aten/src/ATen/native/vulkan/ops/Factory.cpp +++ b/aten/src/ATen/native/vulkan/ops/Factory.cpp @@ -45,7 +45,7 @@ Tensor empty_strided( #ifdef USE_VULKAN_API TORCH_LIBRARY_IMPL(aten, Vulkan, m) { - m.impl_UNBOXED("empty.memory_format", at::native::vulkan::ops::empty_memory_format); + m.impl("empty.memory_format", at::native::vulkan::ops::empty_memory_format); m.impl("empty_strided", TORCH_FN(at::native::vulkan::ops::empty_strided)); } diff --git a/aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp b/aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp index e8442a64d0ad..da13fb9574d5 100644 --- a/aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp +++ b/aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp @@ -73,21 +73,21 @@ TORCH_LIBRARY(xnnpack, m) { } TORCH_LIBRARY(prepacked, m) { - m.def("linear_clamp_prepack(Tensor W, Tensor? B=None, Scalar? output_min=None, Scalar? output_max=None) -> __torch__.torch.classes.xnnpack.LinearOpContext"); - m.def("linear_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.LinearOpContext W_prepack) -> Tensor Y"); - m.def("conv2d_clamp_prepack(Tensor W, Tensor? B, int[2] stride, int[2] padding, int[2] dilation, int groups, Scalar? output_min=None, Scalar? output_max=None) -> __torch__.torch.classes.xnnpack.Conv2dOpContext"); - m.def("conv2d_transpose_clamp_prepack(Tensor W, Tensor? B, int[2] stride, int[2] padding, int[2] output_padding, int[2] dilation, int groups, Scalar? output_min=None, Scalar? output_max=None) -> __torch__.torch.classes.xnnpack.TransposeConv2dOpContext"); - m.def("conv2d_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.Conv2dOpContext W_prepack) -> Tensor Y"); - m.def("conv2d_transpose_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.TransposeConv2dOpContext W_prepack) -> Tensor Y"); + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::linear_clamp_prepack(Tensor W, Tensor? B=None, Scalar? output_min=None, Scalar? output_max=None) -> __torch__.torch.classes.xnnpack.LinearOpContext")); + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::linear_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.LinearOpContext W_prepack) -> Tensor Y")); + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::conv2d_clamp_prepack(Tensor W, Tensor? B, int[2] stride, int[2] padding, int[2] dilation, int groups, Scalar? output_min=None, Scalar? 
output_max=None) -> __torch__.torch.classes.xnnpack.Conv2dOpContext")); + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::conv2d_transpose_clamp_prepack(Tensor W, Tensor? B, int[2] stride, int[2] padding, int[2] output_padding, int[2] dilation, int groups, Scalar? output_min=None, Scalar? output_max=None) -> __torch__.torch.classes.xnnpack.TransposeConv2dOpContext")); + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::conv2d_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.Conv2dOpContext W_prepack) -> Tensor Y")); + m.def(TORCH_SELECTIVE_SCHEMA("prepacked::conv2d_transpose_clamp_run(Tensor X, __torch__.torch.classes.xnnpack.TransposeConv2dOpContext W_prepack) -> Tensor Y")); } TORCH_LIBRARY_IMPL(prepacked, CPU, m) { - m.impl("linear_clamp_prepack", TORCH_FN(createLinearClampPrePackOpContext)); - m.impl("linear_clamp_run", TORCH_FN(internal::linear::linear_clamp_run)); - m.impl("conv2d_clamp_prepack", TORCH_FN(createConv2dClampPrePackOpContext)); - m.impl("conv2d_transpose_clamp_prepack", TORCH_FN(createConv2dTransposeClampPrePackOpContext)); - m.impl("conv2d_clamp_run", TORCH_FN(internal::convolution2d::conv2d_clamp_run)); - m.impl("conv2d_transpose_clamp_run", TORCH_FN(internal::convolution2d::conv2d_transpose_clamp_run)); + m.impl(TORCH_SELECTIVE_NAME("prepacked::linear_clamp_prepack"), TORCH_FN(createLinearClampPrePackOpContext)); + m.impl(TORCH_SELECTIVE_NAME("prepacked::linear_clamp_run"), TORCH_FN(internal::linear::linear_clamp_run)); + m.impl(TORCH_SELECTIVE_NAME("prepacked::conv2d_clamp_prepack"), TORCH_FN(createConv2dClampPrePackOpContext)); + m.impl(TORCH_SELECTIVE_NAME("prepacked::conv2d_transpose_clamp_prepack"), TORCH_FN(createConv2dTransposeClampPrePackOpContext)); + m.impl(TORCH_SELECTIVE_NAME("prepacked::conv2d_clamp_run"), TORCH_FN(internal::convolution2d::conv2d_clamp_run)); + m.impl(TORCH_SELECTIVE_NAME("prepacked::conv2d_transpose_clamp_run"), TORCH_FN(internal::convolution2d::conv2d_transpose_clamp_run)); } } // namespace xnnpack diff --git a/aten/src/ATen/templates/RegisterDispatchKey.cpp b/aten/src/ATen/templates/RegisterDispatchKey.cpp index e923f6d73bd0..ed4359c6883e 100644 --- a/aten/src/ATen/templates/RegisterDispatchKey.cpp +++ b/aten/src/ATen/templates/RegisterDispatchKey.cpp @@ -37,10 +37,13 @@ namespace at { -namespace { - ${dispatch_definitions} +// NB: TORCH_LIBRARY_IMPL must be in an anonymous namespace to avoid +// ambiguity with conflicting identifiers that may have been defined in +// at namespace already. +namespace { + TORCH_LIBRARY_IMPL(aten, ${DispatchKey}, m) { ${dispatch_registrations} } diff --git a/aten/src/ATen/templates/TensorBody.h b/aten/src/ATen/templates/TensorBody.h index d42c8c23fe9c..0dfef701c51b 100644 --- a/aten/src/ATen/templates/TensorBody.h +++ b/aten/src/ATen/templates/TensorBody.h @@ -28,6 +28,7 @@ class Tensor; } namespace c10{ struct TensorOptions; +template class List; } namespace at { struct Generator; @@ -207,10 +208,6 @@ class TORCH_API Tensor { Tensor& operator=(const Tensor&) &&; Tensor& operator=(Tensor&&) &&; - #ifdef _MSC_VER - #pragma warning( pop ) - #endif - bool is_same(const Tensor& other) const noexcept { return impl_ == other.impl_; } @@ -760,6 +757,12 @@ class TORCH_API Tensor { c10::intrusive_ptr impl_; }; +// For "multiple ... 
operators specified" warnings, closing brace of class +// declaration must be included between pragma push & pop +#ifdef _MSC_VER +#pragma warning( pop ) +#endif + int64_t get_device(Tensor self); template diff --git a/aten/src/ATen/test/cpu_rng_test.cpp b/aten/src/ATen/test/cpu_rng_test.cpp index 6d596095d7a0..805ed40557b6 100644 --- a/aten/src/ATen/test/cpu_rng_test.cpp +++ b/aten/src/ATen/test/cpu_rng_test.cpp @@ -28,6 +28,8 @@ struct TestCPUGenerator : public c10::GeneratorImpl { void set_current_seed(uint64_t seed) override { throw std::runtime_error("not implemented"); } uint64_t current_seed() const override { throw std::runtime_error("not implemented"); } uint64_t seed() override { throw std::runtime_error("not implemented"); } + void set_state(const c10::TensorImpl& new_state) override { throw std::runtime_error("not implemented"); } + c10::intrusive_ptr get_state() const override { throw std::runtime_error("not implemented"); } TestCPUGenerator* clone_impl() const override { throw std::runtime_error("not implemented"); } static DeviceType device_type() { return DeviceType::CPU; } diff --git a/aten/src/ATen/test/ivalue_test.cpp b/aten/src/ATen/test/ivalue_test.cpp index 14e75205aa66..a0e2648758ff 100644 --- a/aten/src/ATen/test/ivalue_test.cpp +++ b/aten/src/ATen/test/ivalue_test.cpp @@ -51,6 +51,91 @@ TEST(IValueTest, Basic) { ASSERT_EQ(tv.use_count(), 2); } +static std::array makeSampleIValues() { + return { at::rand({3, 4}), "hello", 42, true, 1.5 }; +} + +static std::array makeMoreSampleIValues() { + return { at::rand({3, 4}), "goodbye", 23, false, 0.5 }; +} + +// IValue::operator== doesn't seem to work on Tensors. +#define EXPECT_IVALUE_EQ(a, b) \ + EXPECT_EQ((a).isTensor(), (b).isTensor()); \ + if ((a).isTensor()) { \ + EXPECT_TRUE(a.toTensor().equal(b.toTensor())); \ + } else { \ + EXPECT_EQ(a, b); \ + } + +TEST(IValueTest, Swap) { + // swap() has the following 3 cases: tensor, intrusive_ptr, or + // neither. Exercise all pairs of the three. 
+ + auto sampleInputs = makeSampleIValues(); + auto sampleTargets = makeMoreSampleIValues(); + for (const auto& input: sampleInputs) { + for (const auto& target: sampleTargets) { + IValue a(input); + IValue b(target); + EXPECT_IVALUE_EQ(a, input); + EXPECT_IVALUE_EQ(b, target); + a.swap(b); + EXPECT_IVALUE_EQ(a, target); + EXPECT_IVALUE_EQ(b, input); + } + } +} + +TEST(IValueTest, CopyConstruct) { + auto sampleInputs = makeSampleIValues(); + for (const IValue& v: sampleInputs) { + IValue copy(v); + EXPECT_IVALUE_EQ(copy, v); + } +} + +TEST(IValueTest, MoveConstruct) { + auto sampleInputs = makeSampleIValues(); + for (const IValue& v: sampleInputs) { + IValue source(v); + IValue target(std::move(source)); + EXPECT_IVALUE_EQ(target, v); + EXPECT_TRUE(source.isNone()); + } +} + +TEST(IValueTest, CopyAssign) { + auto sampleInputs = makeSampleIValues(); + auto sampleTargets = makeMoreSampleIValues(); + + for (const IValue& input: sampleInputs) { + for (const IValue& target: sampleTargets) { + IValue copyTo(target); + IValue copyFrom(input); + copyTo = copyFrom; + EXPECT_IVALUE_EQ(copyTo, input); + EXPECT_IVALUE_EQ(copyFrom, input); + EXPECT_IVALUE_EQ(copyTo, copyFrom); + } + } +} + +TEST(IValueTest, MoveAssign) { + auto sampleInputs = makeSampleIValues(); + auto sampleTargets = makeMoreSampleIValues(); + + for (const IValue& input: sampleInputs) { + for (const IValue& target: sampleTargets) { + IValue moveTo(target); + IValue moveFrom(input); + moveTo = std::move(moveFrom); + EXPECT_IVALUE_EQ(moveTo, input); + EXPECT_TRUE(moveFrom.isNone()); + } + } +} + TEST(IValueTest, Tuple) { std::tuple t = std::make_tuple(123, at::randn({1})); auto iv = IValue(t); @@ -318,5 +403,137 @@ TEST(IValueTest, EnumEquality) { ); } +TEST(IValueTest, isPtrType) { + IValue tensor(at::rand({3, 4})); + IValue undefinedTensor((at::Tensor())); + IValue integer(42); + IValue str("hello"); + + EXPECT_TRUE(tensor.isPtrType()); + EXPECT_FALSE(undefinedTensor.isPtrType()); + EXPECT_FALSE(integer.isPtrType()); + EXPECT_TRUE(str.isPtrType()); +} + +TEST(IValueTest, isAliasOf) { + auto sampleIValues = makeSampleIValues(); + for (auto& iv: sampleIValues) { + for (auto& iv2: sampleIValues) { + if (&iv == &iv2 && iv.isPtrType()) { + EXPECT_TRUE(iv.isAliasOf(iv2)); + } else { + EXPECT_FALSE(iv.isAliasOf(iv2)); + } + } + } +} + +TEST(IValueTest, internalToPointer) { + IValue tensor(at::rand({3, 4})); + IValue str("hello"); + + EXPECT_EQ(tensor.internalToPointer(), tensor.unsafeToTensorImpl()); + EXPECT_NE(str.internalToPointer(), nullptr); + + IValue nullStr((c10::intrusive_ptr())); + ASSERT_TRUE(nullStr.isString()); + EXPECT_EQ(nullStr.internalToPointer(), nullptr); +} + +TEST(IValueTest, IdentityComparisonAndHashing) { + at::Tensor t1 = at::rand({3, 4}); + at::Tensor t2 = at::rand({3, 4}); + IValue tv1(t1), tv2(t2); + IValue tv1b(t1); + + EXPECT_EQ(tv1.hash(), tv1b.hash()); + EXPECT_NE(tv1.hash(), tv2.hash()); + + EXPECT_TRUE(tv1.is(tv1)); + EXPECT_TRUE(tv1.is(tv1b)); + EXPECT_TRUE(tv1b.is(tv1)); + EXPECT_TRUE(tv2.is(tv2)); + + EXPECT_FALSE(tv1.is(tv2)); + EXPECT_FALSE(tv2.is(tv1)); + + IValue none; + IValue undefinedTensor((at::Tensor())); + + EXPECT_TRUE(none.is(undefinedTensor)); + EXPECT_TRUE(undefinedTensor.is(none)); + + // Is this a bug? 
We should probably have a is b => a.hash() == b.hash() + EXPECT_NE(none.hash(), undefinedTensor.hash()); + + auto sampleIValues = makeSampleIValues(); + auto sampleIValues2 = makeSampleIValues(); + auto moreSampleIValues = makeMoreSampleIValues(); + + ASSERT_EQ(sampleIValues.size(), moreSampleIValues.size()); + for (int ii = 0; ii < sampleIValues.size(); ++ii) { + // Constant strings will have the same pointer value. + if (sampleIValues[ii].isPtrType() && !sampleIValues[ii].isString()) { + EXPECT_NE(sampleIValues[ii].hash(), sampleIValues2[ii].hash()); + } else { + EXPECT_EQ(sampleIValues[ii].hash(), sampleIValues2[ii].hash()); + } + EXPECT_NE(sampleIValues[ii].hash(), moreSampleIValues[ii].hash()); + } +} + +TEST(IValueTest, getSubValues) { + // Scalars have no subvalues. + IValue integer(42), float_(1.5); + + IValue::HashAliasedIValues subvalues; + + integer.getSubValues(subvalues); + EXPECT_TRUE(subvalues.empty()); + + subvalues.clear(); + + float_.getSubValues(subvalues); + EXPECT_TRUE(subvalues.empty()); + + subvalues.clear(); + + at::Tensor t1(at::rand({3, 4})), t2(at::rand({3, 4})); + IValue tv1(t1), tv2(t2); + IValue list(std::vector{t1, t2}); + IValue tuple(ivalue::Tuple::create({tv1, tv2})); + + std::unordered_map m; + m[1] = t1; + m[2] = t2; + + IValue dict(std::move(m)); + + auto objType = ClassType::create(nullopt, {}); + objType->addAttribute("t1", tv1.type()); + objType->addAttribute("t2", tv2.type()); + + auto o = ivalue::Object::create(StrongTypePtr(nullptr, objType), 2); + o->setSlot(0, tv1); + o->setSlot(1, tv2); + + IValue object(o); + tv1.getSubValues(subvalues); + EXPECT_EQ(subvalues.size(), 1); + EXPECT_EQ(subvalues.count(tv1), 1); + + subvalues.clear(); + + for (auto& container: {list, tuple, dict, object}) { + container.getSubValues(subvalues); + EXPECT_EQ(subvalues.size(), 3); + EXPECT_EQ(subvalues.count(container), 1); + EXPECT_EQ(subvalues.count(tv1), 1); + EXPECT_EQ(subvalues.count(tv2), 1); + + subvalues.clear(); + } +} + // TODO(gmagogsfm): Add type conversion test? 
} // namespace c10 diff --git a/aten/src/ATen/test/scalar_test.cpp b/aten/src/ATen/test/scalar_test.cpp index 68c0b4f3f71a..3b7bfb47fe62 100644 --- a/aten/src/ATen/test/scalar_test.cpp +++ b/aten/src/ATen/test/scalar_test.cpp @@ -138,3 +138,23 @@ TEST(TestScalar, TestConj) { ASSERT_EQ(float_scalar.conj().toDouble(), 3.0); ASSERT_EQ(complex_scalar.conj().toComplexDouble(), c10::complex(2.3, -3.5)); } + +TEST(TestScalar, TestEqual) { + ASSERT_FALSE(Scalar(1.0).equal(false)); + ASSERT_FALSE(Scalar(1.0).equal(true)); + ASSERT_FALSE(Scalar(true).equal(1.0)); + ASSERT_TRUE(Scalar(true).equal(true)); + + ASSERT_TRUE(Scalar(c10::complex{2.0, 5.0}).equal(c10::complex{2.0, 5.0})); + ASSERT_TRUE(Scalar(c10::complex{2.0, 0}).equal(2.0)); + ASSERT_TRUE(Scalar(c10::complex{2.0, 0}).equal(2)); + + ASSERT_TRUE(Scalar(2.0).equal(c10::complex{2.0, 0.0})); + ASSERT_FALSE(Scalar(2.0).equal(c10::complex{2.0, 4.0})); + ASSERT_FALSE(Scalar(2.0).equal(3.0)); + ASSERT_TRUE(Scalar(2.0).equal(2)); + + ASSERT_TRUE(Scalar(2).equal(c10::complex{2.0, 0})); + ASSERT_TRUE(Scalar(2).equal(2)); + ASSERT_TRUE(Scalar(2).equal(2.0)); +} diff --git a/aten/src/TH/CMakeLists.txt b/aten/src/TH/CMakeLists.txt index a3ed10126b93..5661a697da38 100644 --- a/aten/src/TH/CMakeLists.txt +++ b/aten/src/TH/CMakeLists.txt @@ -79,7 +79,6 @@ install(FILES THHalf.h THTensor.hpp THStorageFunctions.hpp - THGenerator.hpp DESTINATION "${ATEN_INSTALL_INCLUDE_SUBDIR}/TH") install(FILES diff --git a/aten/src/TH/THGenerator.hpp b/aten/src/TH/THGenerator.hpp deleted file mode 100644 index 1a40611f8b5b..000000000000 --- a/aten/src/TH/THGenerator.hpp +++ /dev/null @@ -1,39 +0,0 @@ -#pragma once - -#include - -/** - * THGeneratorState is a POD class needed for memcpys - * in torch.get_rng_state() and torch.set_rng_state(). - * It is a legacy class and even though it is replaced with - * at::CPUGeneratorImpl, we need this class and some of its fields - * to support backward compatibility on loading checkpoints. - */ -struct THGeneratorState { - /* The initial seed. */ - uint64_t the_initial_seed; - int left; /* = 1; */ - int seeded; /* = 0; */ - uint64_t next; - uint64_t state[at::MERSENNE_STATE_N]; /* the array for the state vector */ - - /********************************/ - - /* For normal distribution */ - double normal_x; - double normal_y; - double normal_rho; - int normal_is_valid; /* = 0; */ -}; - -/** - * THGeneratorStateNew is a POD class containing - * new data introduced in at::CPUGeneratorImpl and the legacy state. It is used - * as a helper for torch.get_rng_state() and torch.set_rng_state() - * functions. 
- */ -struct THGeneratorStateNew { - THGeneratorState legacy_pod; - float next_float_normal_sample; - bool is_next_float_normal_sample_valid; -}; diff --git a/aten/src/TH/generic/THTensorRandom.cpp b/aten/src/TH/generic/THTensorRandom.cpp index 399bcc38e1de..c37b0b9bb7f0 100644 --- a/aten/src/TH/generic/THTensorRandom.cpp +++ b/aten/src/TH/generic/THTensorRandom.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) @@ -149,119 +148,4 @@ void THTensor_(multinomialAliasDraw)(THLongTensor *self, THTensor *q, THLongTens } } #endif - -#if defined(TH_REAL_IS_BYTE) -void THTensor_(getRNGState)(at::Generator _generator, THTensor *self) -{ - // See Note [Acquire lock when using random generators] - std::lock_guard lock(_generator.mutex()); - static const size_t size = sizeof(THGeneratorStateNew); - THTensor_(resize1d)(self, size); - THArgCheck(THTensor_(nElement)(self) == size, 1, "RNG state is wrong size"); - THArgCheck(THTensor_(isContiguous)(self), 1, "RNG state needs to be contiguous"); - static_assert(std::is_pod::value, "THGeneratorStateNew is not a PODType"); - - // cast byte tensor to POD type - THGeneratorStateNew* rng_state = (THGeneratorStateNew*)self->data(); - - // accumulate generator data to be copied into byte tensor - auto accum_state = std::make_unique(); - auto cast_generator = at::check_generator(_generator); - auto rng_data = cast_generator->engine().data(); - accum_state->legacy_pod.the_initial_seed = rng_data.seed_; - accum_state->legacy_pod.left = rng_data.left_; - accum_state->legacy_pod.seeded = rng_data.seeded_; - accum_state->legacy_pod.next = rng_data.next_; - std::copy(rng_data.state_.begin(), rng_data.state_.end(), std::begin(accum_state->legacy_pod.state)); - accum_state->legacy_pod.normal_x = 0.0; // we don't use it anymore and this is just a dummy - accum_state->legacy_pod.normal_rho = 0.0; // we don't use it anymore and this is just a dummy - accum_state->legacy_pod.normal_is_valid = false; - accum_state->legacy_pod.normal_y = 0.0; - accum_state->next_float_normal_sample = 0.0f; - accum_state->is_next_float_normal_sample_valid = false; - if(cast_generator->next_double_normal_sample()) { - accum_state->legacy_pod.normal_is_valid = true; - accum_state->legacy_pod.normal_y = *(cast_generator->next_double_normal_sample()); - } - if(cast_generator->next_float_normal_sample()) { - accum_state->is_next_float_normal_sample_valid = true; - accum_state->next_float_normal_sample = *(cast_generator->next_float_normal_sample()); - } - - memcpy(rng_state, accum_state.get(), size); -} - -void THTensor_(setRNGState)(at::Generator _generator, THTensor *self) -{ - // See Note [Acquire lock when using random generators] - std::lock_guard lock(_generator.mutex()); - auto cast_generator = at::check_generator(_generator); - THArgCheck(THTensor_(isContiguous)(self), 1, "RNG state needs to be contiguous"); - static_assert(std::is_pod::value, "THGeneratorState is not a PODType"); - static_assert(std::is_pod::value, "THGeneratorStateNew is not a PODType"); - - static const size_t size_legacy = sizeof(THGeneratorState); - static const size_t size_current = sizeof(THGeneratorStateNew); - static_assert(size_legacy != size_current, "Legacy THGeneratorState and THGeneratorStateNew can't be of the same size"); - - at::mt19937 engine; - auto float_normal_sample = c10::optional(); - auto double_normal_sample = c10::optional(); - - // Construct the state of at::CPUGeneratorImpl based on input byte tensor size. 
- THGeneratorState* legacy_pod; - if (THTensor_(nElement)(self) == size_legacy) { - legacy_pod = (THGeneratorState*)self->data(); - // Note that in legacy THGeneratorState, we didn't have float version - // of normal sample and hence we leave the c10::optional as is - - // Update next_double_normal_sample. - // Note that legacy THGeneratorState stores two uniform values (normal_x, normal_y) - // and a rho value (normal_rho). These three values were redundant and in the new - // DistributionsHelper.h, we store the actual extra normal sample, rather than three - // intermediate values. - if (legacy_pod->normal_is_valid) { - auto r = legacy_pod->normal_rho; - auto theta = 2.0 * M_PI * legacy_pod->normal_x; - // we return the sin version of the normal sample when in caching mode - double_normal_sample = c10::optional(r * ::sin(theta)); - } - } else if (THTensor_(nElement)(self) == size_current) { - auto rng_state = (THGeneratorStateNew*)self->data(); - legacy_pod = &rng_state->legacy_pod; - // update next_float_normal_sample - if (rng_state->is_next_float_normal_sample_valid) { - float_normal_sample = c10::optional(rng_state->next_float_normal_sample); - } - - // Update next_double_normal_sample. - // Note that in getRNGState, we now return the actual normal sample in normal_y - // and if it's valid in normal_is_valid. The redundant normal_x and normal_rho - // are squashed to 0.0. - if (legacy_pod->normal_is_valid) { - double_normal_sample = c10::optional(legacy_pod->normal_y); - } - } else { - AT_ERROR("Expected either a THGeneratorState of size ", size_legacy, - " or a THGeneratorStateNew of size ", size_current, - " but found the input RNG state size to be ", THTensor_(nElement)(self)); - } - - // construct engine_ - // Note that legacy THGeneratorState stored a state array of 64 bit uints, whereas in our - // redefined mt19937, we have changed to a state array of 32 bit uints. Hence, we are - // doing a std::copy. 
- at::mt19937_data_pod rng_data; - std::copy(std::begin(legacy_pod->state), std::end(legacy_pod->state), rng_data.state_.begin()); - rng_data.seed_ = legacy_pod->the_initial_seed; - rng_data.left_ = legacy_pod->left; - rng_data.seeded_ = legacy_pod->seeded; - rng_data.next_ = static_cast(legacy_pod->next); - engine.set_data(rng_data); - THArgCheck(engine.is_valid(), 1, "Invalid mt19937 state"); - cast_generator->set_engine(engine); - cast_generator->set_next_float_normal_sample(float_normal_sample); - cast_generator->set_next_double_normal_sample(double_normal_sample); -} -#endif #endif diff --git a/aten/src/TH/generic/THTensorRandom.h b/aten/src/TH/generic/THTensorRandom.h index ffc52bc69390..ddeb905680cd 100644 --- a/aten/src/TH/generic/THTensorRandom.h +++ b/aten/src/TH/generic/THTensorRandom.h @@ -9,9 +9,4 @@ TH_API void THTensor_(multinomialAliasSetup)(THTensor *prob_dist, THLongTensor * TH_API void THTensor_(multinomialAliasDraw)(THLongTensor *self, THTensor *q, THLongTensor *J, int n_sample, c10::optional _generator); #endif -#if defined(TH_REAL_IS_BYTE) -TH_API void THTensor_(getRNGState)(at::Generator _generator, THTensor *self); -TH_API void THTensor_(setRNGState)(at::Generator _generator, THTensor *self); -#endif - #endif diff --git a/aten/src/THC/THCTensorMathMagma.cu b/aten/src/THC/THCTensorMathMagma.cu index ce6ca38afd2b..36316a6bf2eb 100644 --- a/aten/src/THC/THCTensorMathMagma.cu +++ b/aten/src/THC/THCTensorMathMagma.cu @@ -8,7 +8,7 @@ #include #ifdef USE_MAGMA -#include +#include #endif #ifndef DIVUP diff --git a/aten/src/THC/THCTensorMathMagma.cuh b/aten/src/THC/THCTensorMathMagma.cuh index 5ceac465c317..1fb5821afce5 100644 --- a/aten/src/THC/THCTensorMathMagma.cuh +++ b/aten/src/THC/THCTensorMathMagma.cuh @@ -2,7 +2,7 @@ #define THC_TENSOR_MATH_MAGMA_CUH #ifdef USE_MAGMA -#include +#include #endif #ifdef USE_MAGMA diff --git a/aten/src/THC/THCTensorRandom.cu b/aten/src/THC/THCTensorRandom.cu index aefb427f4e67..8655ea2fb829 100644 --- a/aten/src/THC/THCTensorRandom.cu +++ b/aten/src/THC/THCTensorRandom.cu @@ -12,60 +12,6 @@ #define MAX_NUM_BLOCKS 200 #define BLOCK_SIZE 256 -// NB: ROCm compiler seems to have a bug where __host__ functions must be -// explicitly specified extern "C" otherwise ROCm compiler doesn't respect it. -// See https://github.com/RadeonOpenCompute/hcc/issues/839 -__host__ void THCRandom_getRNGState(at::Generator gen_, THByteTensor *rng_state) -{ - auto gen = at::check_generator(gen_); - std::lock_guard lock(gen->mutex_); - // The RNG state comprises the seed, and an offset used for Philox. - // The following line is just here for BC reason. sizeof curandStateMtgp32 is 4120. - // It used to be static const size_t states_size = MAX_NUM_BLOCKS * sizeof(curandStateMtgp32); - // MAX_NUM_BLOCKS was 200 and sizeof(curandStateMtgp32) is 4120. 
Hardcoding these numbers here - // because this is just host side code and we don't want to worry about linking with cuda - static const size_t states_size = 200 * sizeof(4120); - static const size_t seed_size = sizeof(uint64_t); - static const size_t offset_size = sizeof(int64_t); - static const size_t total_size = states_size + seed_size + offset_size; - THByteTensor_resize1d(rng_state, total_size); - THArgCheck(THByteTensor_nElement(rng_state) == total_size, 1, "RNG state is wrong size"); - THArgCheck(THByteTensor_isContiguous(rng_state), 1, "RNG state must be contiguous"); - // since curandStateMTGP is not used anymore, fill gen_states of THCGenerator with deterministic garbage value of -1 - // gen_states in THCGenerator struct was an array of curandStateMtgp32s. - memset(THByteTensor_data(rng_state), -1, states_size); - auto current_seed = gen->current_seed(); - auto offset = static_cast(gen->philox_offset_per_thread()); // Note that old THCGeneratorState had offset as std::atomic - memcpy(THByteTensor_data(rng_state) + states_size, ¤t_seed, seed_size); - memcpy(THByteTensor_data(rng_state) + states_size + seed_size, &offset, offset_size); -} - -__host__ void THCRandom_setRNGState(at::Generator gen_, THByteTensor *rng_state) -{ - auto gen = at::check_generator(gen_); - std::lock_guard lock(gen->mutex_); - static const size_t states_size = 200 * sizeof(4120); // this line is just here for BC reason - static const size_t seed_size = sizeof(uint64_t); - static const size_t offset_size = sizeof(int64_t); - static const size_t total_size = states_size + seed_size + offset_size; - bool no_philox_seed = false; - if (THByteTensor_nElement(rng_state) == total_size - offset_size) { - no_philox_seed = true; - } - else { - THArgCheck(THByteTensor_nElement(rng_state) == total_size, 1, "RNG state is wrong size"); - } - THArgCheck(THByteTensor_isContiguous(rng_state), 1, "RNG state must be contiguous"); - uint64_t input_seed; - memcpy(&input_seed, THByteTensor_data(rng_state) + states_size, seed_size); - gen->set_current_seed(input_seed); - int64_t philox_offset = 0; - if (!no_philox_seed) { - memcpy(&philox_offset, THByteTensor_data(rng_state) + states_size + seed_size, offset_size); - } - gen->set_philox_offset_per_thread(static_cast(philox_offset)); -} - #include #include diff --git a/aten/src/THC/THCTensorRandom.h b/aten/src/THC/THCTensorRandom.h index b1d7f1ef1797..696e36f70bec 100644 --- a/aten/src/THC/THCTensorRandom.h +++ b/aten/src/THC/THCTensorRandom.h @@ -9,9 +9,4 @@ #include #include -#include - -TORCH_CUDA_API void THCRandom_getRNGState(at::Generator gen_, THByteTensor *rng_state); -TORCH_CUDA_API void THCRandom_setRNGState(at::Generator gen_, THByteTensor *rng_state); - #endif diff --git a/benchmarks/functional_autograd_benchmark/ppl_models.py b/benchmarks/functional_autograd_benchmark/ppl_models.py index 906ebac5d41b..94ba6698a91d 100644 --- a/benchmarks/functional_autograd_benchmark/ppl_models.py +++ b/benchmarks/functional_autograd_benchmark/ppl_models.py @@ -24,8 +24,9 @@ def forward(beta_value: Tensor) -> Tensor: mu = X.mm(beta_value) # We need to compute the first and second gradient of this score with respect - # to beta_value. - score = dist.Bernoulli(logits=mu).log_prob(Y).sum() + beta_prior.log_prob(beta_value).sum() + # to beta_value. We disable Bernoulli validation because Y is a relaxed value. 
+ score = (dist.Bernoulli(logits=mu, validate_args=False).log_prob(Y).sum() + + beta_prior.log_prob(beta_value).sum()) return score return forward, (beta_value.to(device),) @@ -40,7 +41,7 @@ def get_robust_regression(device: torch.device) -> GetterReturnType: Y = torch.rand(N, 1, device=device) # Predefined nu_alpha and nu_beta, nu_alpha.shape: (1, 1), nu_beta.shape: (1, 1) - nu_alpha = torch.randn(1, 1, device=device) + nu_alpha = torch.rand(1, 1, device=device) nu_beta = torch.rand(1, 1, device=device) nu = dist.Gamma(nu_alpha, nu_beta) diff --git a/benchmarks/functional_autograd_benchmark/torchvision_models.py b/benchmarks/functional_autograd_benchmark/torchvision_models.py index 25361af77661..c1d9eaf6105a 100644 --- a/benchmarks/functional_autograd_benchmark/torchvision_models.py +++ b/benchmarks/functional_autograd_benchmark/torchvision_models.py @@ -247,7 +247,7 @@ class IntermediateLayerGetter(nn.ModuleDict): Additionally, it is only able to query submodules that are directly assigned to the model. So if `model` is passed, `model.feature1` can be returned, but not `model.feature1.layer2`. - Arguments: + Args: model (nn.Module): model on which we will extract the features return_layers (Dict[name, new_name]): a dict containing the names of the modules for which the activations will be returned as @@ -324,7 +324,7 @@ def forward(self, x): class FCN(_SimpleSegmentationModel): """ Implements a Fully-Convolutional Network for semantic segmentation. - Arguments: + Args: backbone (nn.Module): the network used to compute the features for the model. The backbone should return an OrderedDict[Tensor], with the key being "out" for the last feature map used, and "aux" if an auxiliary classifier @@ -509,7 +509,7 @@ def box_area(boxes): """ Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. - Arguments: + Args: boxes (Tensor[N, 4]): boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format Returns: diff --git a/c10/CMakeLists.txt b/c10/CMakeLists.txt index 48bceb440954..b175e5bdd6ce 100644 --- a/c10/CMakeLists.txt +++ b/c10/CMakeLists.txt @@ -23,7 +23,7 @@ configure_file( ${CMAKE_BINARY_DIR}/c10/macros/cmake_macros.h) # Note: if you want to add ANY dependency to the c10 library, make sure you -# check with the core PyTorch developers as the dependendency will be +# check with the core PyTorch developers as the dependency will be # transitively passed on to all libraries dependent on PyTorch. file(GLOB C10_SRCS *.cpp diff --git a/c10/core/DispatchKeySet.h b/c10/core/DispatchKeySet.h index 486272ece92e..58d456b950ed 100644 --- a/c10/core/DispatchKeySet.h +++ b/c10/core/DispatchKeySet.h @@ -124,7 +124,7 @@ class DispatchKeySet final { public: // STL iterator for DispatchKeySet. Iterates through all DispatchKeys in the // set. The iterator is only invalidated by the destruction of the underlying - // DispatchKeySet as the iterator stores a pointer to the raw represenation of + // DispatchKeySet as the iterator stores a pointer to the raw representation of // the DispatchKeySet. class iterator { public: @@ -235,7 +235,7 @@ C10_API DispatchKeySet getRuntimeDispatchKeySet(DispatchKey t); C10_API DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t); // This API exists because we have a use case for checking -// getRuntimeDispatchKeySet(alias).has(DispatchKey::Undefind) +// getRuntimeDispatchKeySet(alias).has(DispatchKey::Undefined) // in OperatorEntry.cpp but we disallow it in has() API. 
C10_API bool isIncludedInAlias(DispatchKey k, DispatchKey alias); diff --git a/c10/core/GeneratorImpl.h b/c10/core/GeneratorImpl.h index 3af652a1a3b2..84e620e93a72 100644 --- a/c10/core/GeneratorImpl.h +++ b/c10/core/GeneratorImpl.h @@ -13,6 +13,7 @@ #include #include #include +#include /** * Note [Generator] @@ -71,6 +72,8 @@ struct C10_API GeneratorImpl : public c10::intrusive_ptr_target { virtual void set_current_seed(uint64_t seed) = 0; virtual uint64_t current_seed() const = 0; virtual uint64_t seed() = 0; + virtual void set_state(const c10::TensorImpl& new_state) = 0; + virtual c10::intrusive_ptr get_state() const = 0; Device device() const; // See Note [Acquire lock when using random generators] diff --git a/c10/core/MemoryFormat.h b/c10/core/MemoryFormat.h index e25814cd0717..6528f6c8f110 100644 --- a/c10/core/MemoryFormat.h +++ b/c10/core/MemoryFormat.h @@ -98,7 +98,7 @@ inline std::vector get_channels_last_strides_3d(IntArrayRef sizes) { // 1. Please do not combine these helper functions, each helper function handles // exactly one case of sizes + memory_format, by doing this, the strides indices // will be a constant array and we can access it using constant index number, -// the complier will fully unroll the loop on strides indices to gain a better +// the compiler will fully unroll the loop on strides indices to gain a better // performance. // 2. No error check in helper function, caller ensures the correctness of the input // 3. All helper functions have similar comments, only 1st helper function is commented here. @@ -205,7 +205,7 @@ inline bool is_channels_last_strides_3d_s5(const IntArrayRef sizes, const IntArr // a. we identify corner cases where the implementation compromises on. // // By the time accumulated permutation is enabled to replace implicit -// memory_foramt through strides, we should be updating our tests and fix the +// memory_format through strides, we should be updating our tests and fix the // issues in our tests. // // We use Channels Last 2d as an example above. 
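For readers following the MemoryFormat.h comments above: the channels-last helpers derive the expected strides purely from the tensor sizes. Below is a minimal standalone sketch of the 2d (NHWC) case, with illustrative names that are not the library implementation, only an assumption about how such a helper can be written:

    #include <cstdint>
    #include <vector>

    // Channels-last strides for sizes given in NCHW order {N, C, H, W}:
    // C is the fastest-moving dimension, followed by W, then H, then N.
    std::vector<int64_t> channels_last_strides_2d(const std::vector<int64_t>& sizes) {
      std::vector<int64_t> strides(4);
      strides[1] = 1;                      // stride of C
      strides[3] = sizes[1];               // stride of W = C
      strides[2] = strides[3] * sizes[3];  // stride of H = C * W
      strides[0] = strides[2] * sizes[2];  // stride of N = C * W * H
      return strides;
    }

    // Example: for sizes {2, 3, 4, 5} this yields strides {60, 1, 15, 3},
    // which is the pattern the is_channels_last_strides checks look for.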
diff --git a/c10/core/Scalar.cpp b/c10/core/Scalar.cpp index 35aa5d60f001..203b544924ec 100644 --- a/c10/core/Scalar.cpp +++ b/c10/core/Scalar.cpp @@ -3,7 +3,7 @@ namespace c10 { Scalar Scalar::operator-() const { - TORCH_CHECK(!isBoolean(), "torch boolean negative, the `-` operator, is not suppported."); + TORCH_CHECK(!isBoolean(), "torch boolean negative, the `-` operator, is not supported."); if (isFloatingPoint()) { return Scalar(-v.d); } else if (isComplex()) { @@ -21,4 +21,14 @@ Scalar Scalar::conj() const { } } +Scalar Scalar::log() const { + if (isComplex()) { + return std::log(v.z); + } else if (isFloatingPoint()) { + return std::log(v.d); + } else { + return std::log(v.i); + } +} + } // namespace c10 diff --git a/c10/core/Scalar.h b/c10/core/Scalar.h index 6151f6d2b150..368228e8202e 100644 --- a/c10/core/Scalar.h +++ b/c10/core/Scalar.h @@ -88,6 +88,45 @@ class C10_API Scalar { Scalar operator-() const; Scalar conj() const; + Scalar log() const; + + template::value, int>::type = 0> + bool equal(T num) const { + if (isComplex()) { + auto val = v.z; + return (val.real() == num) && (val.imag() == T()); + } else if (isFloatingPoint()) { + return v.d == num; + } else if (isIntegral(/*includeBool=*/false)) { + return v.i == num; + } else { + // boolean scalar does not equal to a non boolean value + return false; + } + } + + template::value, int>::type = 0> + bool equal(T num) const { + if (isComplex()) { + return v.z == num; + } else if (isFloatingPoint()) { + return (v.d == num.real()) && (num.imag() == T()); + } else if (isIntegral(/*includeBool=*/false)) { + return (v.i == num.real()) && (num.imag() == T()); + } else { + // boolean scalar does not equal to a non boolean value + return false; + } + } + + bool equal(bool num) const { + if (isBoolean()) { + return static_cast(v.i) == num; + } else { + return false; + } + } + ScalarType type() const { if (isComplex()) { return ScalarType::ComplexDouble; diff --git a/c10/core/Stream.cpp b/c10/core/Stream.cpp index 9a5c838c73fe..1a56c9d68567 100644 --- a/c10/core/Stream.cpp +++ b/c10/core/Stream.cpp @@ -2,7 +2,7 @@ namespace c10 { -// Not very parseable, but I don't know a good compact syntax for streams. +// Not very parsable, but I don't know a good compact syntax for streams. // Feel free to change this into something more compact if needed. std::ostream& operator<<(std::ostream& stream, const Stream& s) { stream << "stream " << s.id() << " on device " << s.device(); diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 3326404e1d07..e7f9c1260263 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -19,7 +19,7 @@ #include // A global boolean variable to control whether we free memory when a Tensor -// is shrinked to a smaller size. As a result, a Tensor is always going to +// is shrunk to a smaller size. As a result, a Tensor is always going to // keep the memory allocated for its maximum capacity reshaped to so far. // // This parameter is respected "upper-case" methods which call Resize() @@ -625,7 +625,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * The API is as follows: * - "new_grad" is a Tensor containing the new value of the gradient that should * be set - * - "self" should reprensent the Tensor whose forward grad is accessed. It is + * - "self" should represent the Tensor whose forward grad is accessed. It is * required when dealing with view. * - "level" allows to specify the level of forward AD nesting for which the * gradient should be set. 
Note that since levels are not fully supported @@ -1381,7 +1381,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // error in attempt to invoke TypeMeta::ctor() static_assert( std::is_default_constructible::value, - "Tensor can't hold non-default-constructible types"); + "Tensor can't hold non-default-constructable types"); return static_cast(raw_mutable_data(caffe2::TypeMeta::Make())); } diff --git a/c10/core/impl/DeviceGuardImplInterface.h b/c10/core/impl/DeviceGuardImplInterface.h index 2ef02b57d3be..258f8953f4de 100644 --- a/c10/core/impl/DeviceGuardImplInterface.h +++ b/c10/core/impl/DeviceGuardImplInterface.h @@ -126,7 +126,7 @@ struct C10_API DeviceGuardImplInterface { /** * Increments the event's version and enqueues a job with this version * in the stream's work queue. When the stream process that job - * it nofifies all streams waiting on / blocked by that version of the + * it notifies all streams waiting on / blocked by that version of the * event to continue and marks that version as recorded. * */ virtual void record( diff --git a/c10/core/impl/LocalDispatchKeySet.cpp b/c10/core/impl/LocalDispatchKeySet.cpp index 358e6ef7e1f7..ff3e454eda8a 100644 --- a/c10/core/impl/LocalDispatchKeySet.cpp +++ b/c10/core/impl/LocalDispatchKeySet.cpp @@ -5,10 +5,6 @@ namespace c10 { namespace impl { -C10_DEFINE_bool(disable_variable_dispatch, false, "This flag forcibly disables the Variable code paths from executing, which currently breaks profiling in the process."); - -namespace { - /// In the CAFFE2_FB_LIMITED_MOBILE_CAPABILITY build setting, /// thread_local is not supported. #ifndef CAFFE2_FB_LIMITED_MOBILE_CAPABILITY @@ -18,25 +14,15 @@ thread_local PODLocalDispatchKeySet raw_local_dispatch_key_set; #else // defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY) -static PODLocalDispatchKeySet raw_local_dispatch_key_set; +PODLocalDispatchKeySet raw_local_dispatch_key_set; #endif -} // anonymous namespace - +#ifdef _MSC_VER LocalDispatchKeySet tls_local_dispatch_key_set() { - // Hack until variable performance is fixed - // - // ezyang: I'm pretty unhappy about this implementation, it looks wrong - // to me, as it seems to be performing a mutation on - // raw_local_dispatch_key_set. I can't conveniently test the correct - // version though... - if (FLAGS_disable_variable_dispatch) { - raw_local_dispatch_key_set.set_excluded( - raw_local_dispatch_key_set.excluded() | autograd_dispatch_keyset); - } return raw_local_dispatch_key_set; } +#endif // _MSC_VER void _force_tls_local_dispatch_key_set(LocalDispatchKeySet key_set) { raw_local_dispatch_key_set = PODLocalDispatchKeySet { diff --git a/c10/core/impl/LocalDispatchKeySet.h b/c10/core/impl/LocalDispatchKeySet.h index 5262b1d4d6c0..313dc5ca3508 100644 --- a/c10/core/impl/LocalDispatchKeySet.h +++ b/c10/core/impl/LocalDispatchKeySet.h @@ -23,8 +23,6 @@ namespace c10 { namespace impl { -C10_DECLARE_bool(disable_variable_dispatch); - // POD version of LocalDispatchKeySet. Declared here just so that // we can put it in the guards. struct C10_API PODLocalDispatchKeySet { @@ -54,7 +52,24 @@ struct C10_API LocalDispatchKeySet { DispatchKeySet excluded_; }; +// thread_local variables cannot be C10_API on Windows. +#ifdef _MSC_VER C10_API LocalDispatchKeySet tls_local_dispatch_key_set(); +#else // _MSC_VER +/// In the CAFFE2_FB_LIMITED_MOBILE_CAPABILITY build setting, +/// thread_local is not supported. 
+#ifndef CAFFE2_FB_LIMITED_MOBILE_CAPABILITY + extern C10_API thread_local PODLocalDispatchKeySet raw_local_dispatch_key_set; +#else // defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY) + extern C10_API PODLocalDispatchKeySet raw_local_dispatch_key_set; +#endif + +inline C10_API LocalDispatchKeySet tls_local_dispatch_key_set() { + // Don't let people fiddle with the thread_local directly just + // because they include this header. + return raw_local_dispatch_key_set; +} +#endif // _MSC_VER // Internal, use ThreadLocalStateGuard C10_API void _force_tls_local_dispatch_key_set(LocalDispatchKeySet key_set); diff --git a/c10/cuda/CMakeLists.txt b/c10/cuda/CMakeLists.txt index c8fa53df6f02..256fc54b08a1 100644 --- a/c10/cuda/CMakeLists.txt +++ b/c10/cuda/CMakeLists.txt @@ -13,7 +13,7 @@ configure_file( ${CMAKE_BINARY_DIR}/c10/cuda/impl/cuda_cmake_macros.h) # Note: if you want to add ANY dependency to the c10 library, make sure you -# check with the core PyTorch developers as the dependendency will be +# check with the core PyTorch developers as the dependency will be # transitively passed on to all libraries dependent on PyTorch. # Note: if you add a new source file/header, you will need to update diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index 0b5d2992538c..493296248e5b 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -62,7 +62,7 @@ constexpr size_t kSmallSize = 1048576; // largest "small" allocation is 1 M constexpr size_t kSmallBuffer = 2097152; // "small" allocations are packed in 2 MiB blocks constexpr size_t kLargeBuffer = 20971520; // "large" allocations may be packed in 20 MiB blocks constexpr size_t kMinLargeAlloc = 10485760; // allocations between 1 and 10 MiB may use kLargeBuffer -constexpr size_t kRoundLarge = 2097152; // round up large allocs to 2 MiB +constexpr size_t kRoundLarge = 2097152; // round up large allocations to 2 MiB typedef std::bitset(StatType::NUM_TYPES)> StatTypes; diff --git a/c10/cuda/CUDAStream.cpp b/c10/cuda/CUDAStream.cpp index 457331f4a00d..d1e290c3f02c 100644 --- a/c10/cuda/CUDAStream.cpp +++ b/c10/cuda/CUDAStream.cpp @@ -60,7 +60,7 @@ static LeakyStreamInternals default_streams[C10_COMPILE_TIME_MAX_GPUS]; // in the pool to be returned when a stream is requested (round-robin fashion // , see the note in CUDAStream.h). // -// unique_ptr is used instead of vector because T might be non-moveable +// unique_ptr is used instead of vector because T might be non-movable // and non-copyable. static std::once_flag device_flags[C10_COMPILE_TIME_MAX_GPUS]; static std::atomic low_priority_counters[C10_COMPILE_TIME_MAX_GPUS]; diff --git a/c10/cuda/CUDAStream.h b/c10/cuda/CUDAStream.h index 41802b3bc9ef..05eddf5ce122 100644 --- a/c10/cuda/CUDAStream.h +++ b/c10/cuda/CUDAStream.h @@ -152,7 +152,7 @@ class C10_CUDA_API CUDAStream { static std::tuple priority_range() { // Note: this returns the range of priority **supported by PyTorch**, not // the range of priority **supported by CUDA**. The former is a subset of - // the latter. Curently PyTorch only supports 0 and -1, which are "low" and + // the latter. Currently PyTorch only supports 0 and -1, which are "low" and // "high" priority. 
int least_priority, greatest_priority; C10_CUDA_CHECK( diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h index 46ff50621417..5499a7d8b81c 100644 --- a/c10/macros/Macros.h +++ b/c10/macros/Macros.h @@ -316,7 +316,7 @@ __host__ __device__ #define C10_MOBILE 1 #endif // ANDROID / IOS -// Portably determine if a type T is trivially copyable or not. +// Portable determination of whether type T is trivially copyable. // Warning: __has_trivial_copy for GCC may not always detect the non-POD // correctly. For example, T = std::unique_ptr may evaluate to true and be // treated as POD. This can cause unexpected behavior. diff --git a/c10/mobile/CPUCachingAllocator.cpp b/c10/mobile/CPUCachingAllocator.cpp index bde4067d45dc..0114856ca89b 100644 --- a/c10/mobile/CPUCachingAllocator.cpp +++ b/c10/mobile/CPUCachingAllocator.cpp @@ -61,7 +61,7 @@ void CPUCachingAllocator::record_free(void* ptr) { // is being freed outside the scope of this allocator. // At the moment only way to capture this is to have the allocator, // that uses this CachingAllocator as the backing allocator, - // call this function explicity upon freeing memory while + // call this function explicitly upon freeing memory while // outside the scope of caching allocator. // If the memory is freed in some other way, then we will likely // have undefined behavior or page fault. But this can be diff --git a/c10/mobile/CPUCachingAllocator.h b/c10/mobile/CPUCachingAllocator.h index 2f11e6ea8669..c80fee0682eb 100644 --- a/c10/mobile/CPUCachingAllocator.h +++ b/c10/mobile/CPUCachingAllocator.h @@ -26,7 +26,7 @@ * What are the cons? * There are some cons that were observed where use of caching allocator led to * worse performance on some platforms. Reason being that the caching mechanism - * used by this allocator left us worse off compared to the corresonding platform's + * used by this allocator left us worse off compared to the corresponding platform's * tuned memory allocator. In that case it seemed better to not use this allocator. * Note there are some ideas to fix this in the works. * @@ -63,7 +63,7 @@ class C10_API CPUCachingAllocator { // returned the memory to OS via free_cached. // 1.1. Therefore even when the said memory is "freed" via this // allocator (and thus cached), it will continue to stay - // in allocaiton_map_. Furthermore it will also exist in + // in allocation_map_. Furthermore it will also exist in // available_map_. Thus an allocated memory pointer can be in both // allocation_map_ and available_map_ simultaneously. // 2. Memory pointer maybe removed from allocation_map_, when it diff --git a/c10/mobile/CPUProfilingAllocator.cpp b/c10/mobile/CPUProfilingAllocator.cpp index 5f2b28b4b2d0..0118d0a29587 100644 --- a/c10/mobile/CPUProfilingAllocator.cpp +++ b/c10/mobile/CPUProfilingAllocator.cpp @@ -133,7 +133,7 @@ std::vector formulate_greedy_allocation_plan( ska::flat_hash_map::iterator> free_end_offset_to_size_iter; // Upon free end_ptr = offset + size // If end_ptr exists merge freed allocation - // Also find coresponding offset in size_to_offet + // Also find corresponding offset in size_to_offset // Remove that entry and update with new size and offset // If end_ptr does not exist then just insert offset,size // in map and correspondingly size, offset in the other map. @@ -176,7 +176,7 @@ std::vector formulate_greedy_allocation_plan( } allocation_offsets[mem_event.allocation_id] = alloc_offset; } else { - // 1. Check if freed block is adjancent to an existing free block + // 1. 
Check if freed block is adjacent to an existing free block // at its end boundary. This is done by checking // free_end_offset_to_size_iter. // If we find such a block, remove it and adjust size of @@ -186,7 +186,7 @@ std::vector formulate_greedy_allocation_plan( // free_start_offset_to_size_iter. // If we find such a block, remove it and adjust size of // the block being freed. - // 3. Inser the freed block in map. + // 3. Insert the freed block in map. auto freed_offset = allocation_offsets[mem_event.allocation_id]; auto freed_size = mem_event.size; auto end_offset = freed_offset + freed_size; @@ -223,7 +223,7 @@ std::vector formulate_greedy_allocation_plan( } } TORCH_CHECK(validate_allocation_plan(mem_events, allocation_offsets), - "ProfilingAllocator: Allocation plan invaild."); + "ProfilingAllocator: Allocation plan invalid."); return allocation_offsets; } @@ -394,7 +394,7 @@ CPUProfilingAllocator::~CPUProfilingAllocator() { WithProfileAllocationsGuard::WithProfileAllocationsGuard( AllocationPlan* plan) { - // Nesting of allocation profiling does not seem meanigful. + // Nesting of allocation profiling does not seem meaningful. TORCH_CHECK(allocation_planner == nullptr, "Nesting profiling allocations is not supported."); planner_ = std::make_unique(plan); @@ -409,7 +409,7 @@ WithProfileAllocationsGuard::~WithProfileAllocationsGuard() { WithValidateAllocationPlanGuard::WithValidateAllocationPlanGuard( AllocationPlan* plan, bool* success) { - // Nesting of allocation profiling does not seem meanigful. + // Nesting of allocation profiling does not seem meaningful. TORCH_CHECK(allocation_planner == nullptr, "Nesting profiling allocations is not supported."); planner_ = std::make_unique(plan, true); diff --git a/c10/test/util/bfloat16_test.cpp b/c10/test/util/bfloat16_test.cpp index d08f512053ab..af00bab99c5b 100644 --- a/c10/test/util/bfloat16_test.cpp +++ b/c10/test/util/bfloat16_test.cpp @@ -87,7 +87,7 @@ namespace { } TEST(BFloat16Math, Addition) { - // This test verifies that if only first 7 bits of float's mantisa are + // This test verifies that if only first 7 bits of float's mantissa are // changed after addition, we should have no loss in precision. // input bits @@ -108,8 +108,8 @@ namespace { EXPECT_EQ(res, expected); } - TEST(BFloat16Math, Substraction) { - // This test verifies that if only first 7 bits of float's mantisa are + TEST(BFloat16Math, Subtraction) { + // This test verifies that if only first 7 bits of float's mantissa are // changed after subtraction, we should have no loss in precision. 
// input bits diff --git a/c10/test/util/intrusive_ptr_test.cpp b/c10/test/util/intrusive_ptr_test.cpp index 2ea283d1a4f0..9df5b004a094 100644 --- a/c10/test/util/intrusive_ptr_test.cpp +++ b/c10/test/util/intrusive_ptr_test.cpp @@ -694,21 +694,21 @@ TEST(IntrusivePtrTest, Equality_Nullptr) { EXPECT_FALSE(var1 != var2); } -TEST(IntrusivePtrTest, Nonequality) { +TEST(IntrusivePtrTest, Inequality) { intrusive_ptr var1 = make_intrusive(); intrusive_ptr var2 = make_intrusive(); EXPECT_TRUE(var1 != var2); EXPECT_FALSE(var1 == var2); } -TEST(IntrusivePtrTest, Nonequality_NullptrLeft) { +TEST(IntrusivePtrTest, Inequality_NullptrLeft) { intrusive_ptr var1; intrusive_ptr var2 = make_intrusive(); EXPECT_TRUE(var1 != var2); EXPECT_FALSE(var1 == var2); } -TEST(IntrusivePtrTest, Nonequality_NullptrRight) { +TEST(IntrusivePtrTest, Inequality_NullptrRight) { intrusive_ptr var1 = make_intrusive(); intrusive_ptr var2; EXPECT_TRUE(var1 != var2); @@ -2487,28 +2487,28 @@ TEST(WeakIntrusivePtrTest, Equality_Invalid) { EXPECT_FALSE(var1 != var2); } -TEST(WeakIntrusivePtrTest, Nonequality) { +TEST(WeakIntrusivePtrTest, Inequality) { IntrusiveAndWeak var1 = make_intrusive(); IntrusiveAndWeak var2 = make_intrusive(); EXPECT_TRUE(var1.weak != var2.weak); EXPECT_FALSE(var1.weak == var2.weak); } -TEST(WeakIntrusivePtrTest, Nonequality_InvalidLeft) { +TEST(WeakIntrusivePtrTest, Inequality_InvalidLeft) { weak_intrusive_ptr var1 = make_invalid_weak(); IntrusiveAndWeak var2 = make_intrusive(); EXPECT_TRUE(var1 != var2.weak); EXPECT_FALSE(var1 == var2.weak); } -TEST(WeakIntrusivePtrTest, Nonequality_InvalidRight) { +TEST(WeakIntrusivePtrTest, Inequality_InvalidRight) { IntrusiveAndWeak var1 = make_intrusive(); weak_intrusive_ptr var2 = make_invalid_weak(); EXPECT_TRUE(var1.weak != var2); EXPECT_FALSE(var1.weak == var2); } -TEST(WeakIntrusivePtrTest, Nonequality_WeakOnly) { +TEST(WeakIntrusivePtrTest, Inequality_WeakOnly) { weak_intrusive_ptr var1 = make_weak_only(); weak_intrusive_ptr var2 = make_weak_only(); EXPECT_TRUE(var1 != var2); diff --git a/c10/util/Bitset.h b/c10/util/Bitset.h index e849563e60fe..964146be05e7 100644 --- a/c10/util/Bitset.h +++ b/c10/util/Bitset.h @@ -64,7 +64,7 @@ struct bitset final { bitset cur = *this; size_t index = cur.find_first_set(); while (0 != index) { - // -1 because find_first_set() is not one-indiced. + // -1 because find_first_set() is not one-indexed. index -= 1; func(index); cur.unset(index); @@ -73,7 +73,7 @@ struct bitset final { } private: - // Return the index of the first set bit. The returned index is one-indiced + // Return the index of the first set bit. The returned index is one-indexed // (i.e. if the very first bit is set, this function returns '1'), and a return // of '0' means that there was no bit set. size_t find_first_set() const { diff --git a/c10/util/Flags.h b/c10/util/Flags.h index 6bfe62507fcd..b4352510c997 100644 --- a/c10/util/Flags.h +++ b/c10/util/Flags.h @@ -4,7 +4,7 @@ /* Commandline flags support for C10. * * This is a portable commandline flags tool for c10, so we can optionally - * choose to use gflags or a lightweighted custom implementation if gflags is + * choose to use gflags or a lightweight custom implementation if gflags is * not possible on a certain platform. If you have gflags installed, set the * macro C10_USE_GFLAGS will seamlessly route everything to gflags. 
* diff --git a/c10/util/Logging.h b/c10/util/Logging.h index acab3cfecd23..6fa7e93f26d8 100644 --- a/c10/util/Logging.h +++ b/c10/util/Logging.h @@ -284,7 +284,7 @@ BINARY_COMP_HELPER(LessEquals, <=) * Very lightweight logging for the first time API usage. It's beneficial for * tracking of individual functionality usage in larger applications. * - * In order to ensure light-weightness of logging, we utilize static variable + * In order to ensure light-weightedness of logging, we utilize static variable * trick - LogAPIUsage will be invoked only once and further invocations will * just do an atomic check. * diff --git a/c10/util/Optional.h b/c10/util/Optional.h index 2e901795aed7..440adb6a1654 100644 --- a/c10/util/Optional.h +++ b/c10/util/Optional.h @@ -10,17 +10,17 @@ // From https://github.com/akrzemi1/Optional // // C10 -// - Move to `c10` namespace. -// - Remove macro use in line 478 because the nvcc device compiler cannot handle +// - Move file to `c10` namespace. +// - Remove macro use in line 478 because the nvcc device compiler cannot handle it // it. -// - revise constructor logic so that it is consistent with c++ 17 standard documented -// here in (8): https://en.cppreference.com/w/cpp/utility/optional/optional, and -// could be able to support initialization of optionals from convertible type U, also -// remove two old constructors optional(const T&) and optional(T&&) as it could be -// handled by the template case with default template argument. -// - `constexpr struct in_place_t {} in_place{}` is moved to `c10/util/in_place.h`, +// - Revise constructor logic so that it is 1) consistent with c++ 17 standard documented +// here in (8): https://en.cppreference.com/w/cpp/utility/optional/optional, and 2) +// able to support initialization of optionals from convertible type U. +// - Remove the constructors for `optional(const T&)` and `optional(T&&)`, as they can be +// handled by the template case with the default template argument. +// - Move `constexpr struct in_place_t {} in_place{}` to `c10/util/in_place.h` // so that it can also be used in `c10/util/variant.h`. -// - Remove special cases for pre-c++14 compilers to make code simpler +// - Remove special cases for pre-c++14 compilers to make code simpler. #ifndef C10_UTIL_OPTIONAL_H_ #define C10_UTIL_OPTIONAL_H_ @@ -412,7 +412,7 @@ using OptionalBase = typename std::conditional< constexpr_optional_base::type>, // use base with trivial destructor optional_base::type>>::type; -#endif +#endif template class optional : private OptionalBase { @@ -438,7 +438,7 @@ class optional : private OptionalBase { constexpr_optional_base::type>, // use base with trivial destructor optional_base::type>>::type; -#endif +#endif static_assert( !std::is_same::type, nullopt_t>::value, diff --git a/c10/util/SmallVector.h b/c10/util/SmallVector.h index 076a1d401065..9b32d8edfe7f 100644 --- a/c10/util/SmallVector.h +++ b/c10/util/SmallVector.h @@ -832,7 +832,7 @@ SmallVectorImpl& SmallVectorImpl::operator=( // If we have to grow to have enough elements, destroy the current elements. // This allows us to avoid copying them during the grow. - // FIXME: don't do this if they're efficiently moveable. + // FIXME: don't do this if they're efficiently movable. if (this->capacity() < RHSSize) { // Destroy current elements. 
this->destroy_range(this->begin(), this->end()); diff --git a/c10/util/TypeCast.h b/c10/util/TypeCast.h index df15509d7e0f..85513ecc5e2f 100644 --- a/c10/util/TypeCast.h +++ b/c10/util/TypeCast.h @@ -44,7 +44,7 @@ struct static_cast_with_inter_type { // Note: Converting from negative float values to unsigned integer types is // undefined behavior in C++, and current CPU and GPU compilers exhibit // divergent behavior. Casting from negative float values to signed -// integer types and then to unsigned integer types is not undefiend, +// integer types and then to unsigned integer types is not undefined, // however, so this cast improves the consistency of type conversions // to uint8 across compilers. // Further note: Type conversions across compilers still have other undefined diff --git a/c10/util/complex.h b/c10/util/complex.h index 2578da2957ab..d4d5525170af 100644 --- a/c10/util/complex.h +++ b/c10/util/complex.h @@ -61,7 +61,7 @@ namespace c10 { // Since we only support float and double, on will use `complex& operator=(T x)` // - Copy assignment operator and converting assignment operator // - There is no specialization of converting assignment operators, which type is -// convertible is soly depend on whether the scalar type is convertable +// convertible is solely dependent on whether the scalar type is convertible // // In addition to the standard assignment, we also provide assignment operators with std and thrust // diff --git a/c10/util/intrusive_ptr.h b/c10/util/intrusive_ptr.h index 761dd27d6d46..790d97ee3994 100644 --- a/c10/util/intrusive_ptr.h +++ b/c10/util/intrusive_ptr.h @@ -206,7 +206,7 @@ class intrusive_ptr final { "NullType must have a constexpr singleton() method"); #endif static_assert( - std::is_same::value, + std::is_base_of::type>::value, "NullType::singleton() must return a element_type* pointer"); TTarget* target_; @@ -509,7 +509,7 @@ class weak_intrusive_ptr final { "NullType must have a constexpr singleton() method"); #endif static_assert( - std::is_same::value, + std::is_base_of::type>::value, "NullType::singleton() must return a element_type* pointer"); TTarget* target_; @@ -700,7 +700,7 @@ class weak_intrusive_ptr final { /** * Takes an owning (but must be weakly referenced) pointer to TTarget* and * creates a weak_intrusive_ptr that takes over ownership. - * Thas means the weakcount is not increased. + * This means that the weakcount is not increased. * This is the counter-part to weak_intrusive_ptr::release() and the pointer * passed in *must* have been created using weak_intrusive_ptr::release(). */ diff --git a/c10/util/typeid.cpp b/c10/util/typeid.cpp index f3fe048b4cca..79c093cbeb31 100644 --- a/c10/util/typeid.cpp +++ b/c10/util/typeid.cpp @@ -60,7 +60,7 @@ CAFFE_KNOWN_TYPE(bool*) CAFFE_KNOWN_TYPE(char*) CAFFE_KNOWN_TYPE(int*) -// For some of the compilers, long is definied separately from int32_t and +// For some of the compilers, long is defined separately from int32_t and // int64_t. As a result we will need to actually define them separately. // It is recommended that one does NOT use long - use int32_t and int64_t // explicitly. Explicit long type annotation may go away in the future. 
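The comment fix in c10/util/TypeCast.h above touches a subtle point worth spelling out: casting a negative float directly to an unsigned integer type is undefined behavior in C++, whereas truncating to a sufficiently wide signed integer type first and then converting to the unsigned type is well defined (the second step simply wraps modulo 2^N). A minimal standalone sketch of that pattern, illustrative only and not code from this patch:

```
#include <cstdint>
#include <iostream>

int main() {
  float x = -1.0f;

  // Direct conversion is undefined behavior because the truncated value (-1)
  // is not representable in uint8_t; CPU and GPU compilers may disagree here.
  // uint8_t bad = static_cast<uint8_t>(x);

  // Truncate to a signed type first (well defined, -1 fits in int64_t), then
  // convert to uint8_t, which wraps modulo 256 and yields 255 on every
  // conforming compiler.
  uint8_t good = static_cast<uint8_t>(static_cast<int64_t>(x));

  std::cout << static_cast<int>(good) << "\n";  // prints 255
  return 0;
}
```

This is the reason the cast chain described in that comment improves cross-compiler consistency for conversions to uint8.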
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 4fcf86be55e2..9b934e4831e8 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -340,9 +340,6 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) set(GENERATED_CXX_TORCH "${TORCH_SRC_DIR}/csrc/autograd/generated/Functions.cpp" - "${TORCH_SRC_DIR}/csrc/jit/generated/generated_unboxing_wrappers_0.cpp" - "${TORCH_SRC_DIR}/csrc/jit/generated/generated_unboxing_wrappers_1.cpp" - "${TORCH_SRC_DIR}/csrc/jit/generated/generated_unboxing_wrappers_2.cpp" ) if(NOT INTERN_DISABLE_AUTOGRAD) @@ -434,8 +431,6 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) "${TOOLS_PATH}/autograd/load_derivatives.py" "${TOOLS_PATH}/autograd/nested_dict.py" "${TOOLS_PATH}/autograd/utils.py" - "${TOOLS_PATH}/jit/gen_unboxing_wrappers.py" - "${TOOLS_PATH}/jit/templates/generated_unboxing_wrappers.cpp" WORKING_DIRECTORY "${TORCH_ROOT}") @@ -479,6 +474,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) # This one needs to be unconditionally added as Functions.cpp is also unconditionally added list(APPEND TORCH_SRCS ${TORCH_SRC_DIR}/csrc/autograd/FunctionsManual.cpp + ${TORCH_SRC_DIR}/csrc/utils/out_types.cpp ) if(NOT INTERN_DISABLE_AUTOGRAD) diff --git a/caffe2/contrib/aten/README.md b/caffe2/contrib/aten/README.md index 377a1f780271..593079ef1393 100644 --- a/caffe2/contrib/aten/README.md +++ b/caffe2/contrib/aten/README.md @@ -1,6 +1,6 @@ # An ATen operator for Caffe2 -[ATen](https://github.com/zdevito/aten) is a simple tensor library thats exposes the Tensor operations in Torch +ATen is a simple tensor library thats exposes the Tensor operations in Torch and PyTorch directly in C++14. This library provides a generated wrapper around the ATen API that makes these functions available in Caffe2 as an operator. It also makes it accessible using the ToffeeIR. @@ -8,8 +8,8 @@ ToffeeIR. ### Example Usage in Caffe2 -First identify a function in ATen you want to call in [Functions.h](https://github.com/zdevito/ATen/blob/master/doc/Functions.h), -[Tensor.h](https://github.com/zdevito/ATen/blob/master/doc/Tensor.h), or [Type.h](https://github.com/zdevito/ATen/blob/master/doc/Type.h). +First identify a function in ATen you want to call in Functions.h, +Tensor.h, or Type.h. 
We will call the `pow` operator: diff --git a/caffe2/contrib/aten/aten_op.cc b/caffe2/contrib/aten/aten_op.cc index 9e7479141ad4..dba68d21c2dd 100644 --- a/caffe2/contrib/aten/aten_op.cc +++ b/caffe2/contrib/aten/aten_op.cc @@ -6,13 +6,17 @@ namespace caffe2 { namespace internal { at::Tensor index_with_uint8_handling( const at::Tensor& self, - at::TensorList indices) { + const torch::List>& indices) { // Support BC only for the simplest case of mask indexing - if (indices.size() == 1 && indices[0].scalar_type() == at::kByte) { - TORCH_WARN( - "Indexing with uint8 mask tensor in ATenOp is now deprecated," - " please use a bool mask instead."); - return at::index(self, {indices[0].to(at::kBool)}); + if (indices.size() == 1) { + c10::optional first = indices[0]; + if (first.has_value() + && first->scalar_type() == at::kByte) { + TORCH_WARN( + "Indexing with uint8 mask tensor in ATenOp is now deprecated," + " please use a bool mask instead."); + return at::index(self, {first->to(at::kBool)}); + } } return at::index(self, indices); } diff --git a/caffe2/contrib/aten/aten_op_template.h b/caffe2/contrib/aten/aten_op_template.h index f3a42dbd8f59..cd1ce7651b48 100644 --- a/caffe2/contrib/aten/aten_op_template.h +++ b/caffe2/contrib/aten/aten_op_template.h @@ -21,7 +21,7 @@ using at::Half; // for AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, ...) namespace internal { TORCH_API at::Tensor index_with_uint8_handling( const at::Tensor& self, - at::TensorList indices); + const torch::List>& indices); } template @@ -86,6 +86,16 @@ class ATenOp : public Operator { std::vector peekSlice(size_t i, size_t len, size_t N) { std::vector results; + results.reserve(len); + for (size_t ii = i; ii < i + len; ++ii) { + results.push_back(peek(ii, N)); + } + return results; + } + + torch::List> peekSliceOptionals(size_t i, size_t len, size_t N) { + torch::List> results; + results.reserve(len); for (size_t ii = i; ii < i + len; ++ii) { results.push_back(peek(ii, N)); } diff --git a/caffe2/contrib/aten/docs/pytorch_to_caffe2.md b/caffe2/contrib/aten/docs/pytorch_to_caffe2.md index 85c275bb5178..c3f615ee37b9 100644 --- a/caffe2/contrib/aten/docs/pytorch_to_caffe2.md +++ b/caffe2/contrib/aten/docs/pytorch_to_caffe2.md @@ -6,7 +6,7 @@ operators that haven't been standardized yet, or custom `torch.autograd.Function are specific to a network. To bridge this gap, we provide an experimental operator in ONNX that allows you to directly access PyTorch's tensor functions using the ATen library. -[ATen](https://github.com/zdevito/aten) is the underlying C++ library that PyTorch uses to do tensor operations. Caffe2 has an [ATen operator](https://github.com/caffe2/caffe2/tree/master/caffe2/contrib/aten) +[ATen](https://github.com/pytorch/pytorch/tree/master/aten) is the underlying C++ library that PyTorch uses to do tensor operations. Caffe2 has an [ATen operator](https://github.com/pytorch/pytorch/tree/master/caffe2/contrib/aten) that can run these tensor functions in a Caffe2 network after importing them through ONNX. This guide explains how to configure Caffe2 and modify your PyTorch program to use @@ -61,8 +61,8 @@ We can add a `symbolic` method to it like so: The function `graph.at` adds a new ATen op the computation graph. You can call any ATen function using this facility. 
To do so, -first identify a function in ATen you want to call in [Functions.h](https://github.com/zdevito/ATen/blob/master/doc/Functions.h), -[Tensor.h](https://github.com/zdevito/ATen/blob/master/doc/Tensor.h), or [Type.h](https://github.com/zdevito/ATen/blob/master/doc/Type.h). +first identify a function in ATen you want to call in Functions.h, +Tensor.h, or Type.h. As an example, we might want to call the `pow` operator: @@ -86,9 +86,9 @@ To call methods of ATen's `Type` objects, you provide an additional string attri that determines the type. For instance, `ones` creates a new constant tensor of all ones: ``` class Type { - ... - virtual Tensor ones(IntArrayRef size) const; - ... + ... + virtual Tensor ones(IntArrayRef size) const; + ... }; ``` diff --git a/caffe2/contrib/aten/gen_op.py b/caffe2/contrib/aten/gen_op.py index 2a822058bfdf..64d3de547bb7 100755 --- a/caffe2/contrib/aten/gen_op.py +++ b/caffe2/contrib/aten/gen_op.py @@ -68,7 +68,7 @@ def value_has_tensors(v): def value_is_tensor_type(v): - return value_has_tensors(v) and v['dynamic_type'] != 'TensorList' + return value_has_tensors(v) and v['dynamic_type'] not in ['TensorList', 'const c10::List> &'] # for each aten type, how do we handle a return value of that type? @@ -208,7 +208,7 @@ def self_as_first_argument(arguments): def get_num_inputs(o): args = 0 for a in o['arguments']: - if a['type'] == 'TensorList': + if a['type'] in ['TensorList', 'const c10::List> &']: return '*' elif value_has_tensors(a): args += 1 @@ -236,11 +236,11 @@ def emit_assignments(o, env): decls = yaml.load(read(os.path.join(args.yaml_dir, 'Declarations.yaml')), Loader=Loader) factory_methods = find_factory_methods(decls) filtered = [expanded for o in decls for expanded in expand(o) if supports(expanded, factory_methods)] - top_env = { + top_env: Dict[str, List] = { 'mappings': [], 'implementations': [], 'cases': [], - } # type: Dict[str, List] + } seen: Set[str] = set() key = 0 for o in filtered: @@ -277,23 +277,28 @@ def emit_assignments(o, env): # e.g. "Float" is at::kFloat assert('Type' in o['method_of']) - static_tensor_inputs = sum(arg['type'] != 'TensorList' and value_is_tensor_type(arg) for arg in o['arguments']) - has_tensorlist = any(arg['type'] == 'TensorList' for arg in o['arguments']) + static_tensor_inputs = sum(arg['type'] not in ['TensorList', 'const c10::List> &'] and value_is_tensor_type(arg) for arg in o['arguments']) + has_tensorlist = any(arg['type'] in ['TensorList', 'const c10::List> &'] for arg in o['arguments']) if has_tensorlist: - tensorlist_idx = [i for i, arg in enumerate(o['arguments']) if arg['type'] == 'TensorList'][0] + tensorlist_idx = [i for i, arg in enumerate(o['arguments']) if arg['type'] in ['TensorList', 'const c10::List> &']][0] real_inputs = 0 for i, arg in enumerate(o['arguments']): env['arguments'].append(arg['name']) - # Emulate logic in gen_unboxing_wrappers.py. Pretend the flat argument - # list is a stack where the end is the top. + # Pretend the flat argument list is a stack where the end is the top. view_length = 'InputSize()' if has_tensorlist and i < tensorlist_idx else static_tensor_inputs if arg['type'] == 'TensorList': # NOTE: do not advance real_inputs here. 
After this we will - # switch to indexing the "stack" from the end as if we only had + # switch to indexing the "stack" from the end env['statements'].append( 'auto {} = peekSlice({}, InputSize() - {}, InputSize());' .format(arg['name'], real_inputs, static_tensor_inputs)) + elif arg['type'] == 'const c10::List> &': + # NOTE: do not advance real_inputs here. After this we will + # switch to indexing the "stack" from the end + env['statements'].append( + 'auto {} = peekSliceOptionals({}, InputSize() - {}, InputSize());' + .format(arg['name'], real_inputs, static_tensor_inputs)) elif value_is_tensor_type(arg): # load tensor inputs from Caffe2 env['statements'].append( diff --git a/caffe2/contrib/fakelowp/test/test_chunking.py b/caffe2/contrib/fakelowp/test/test_chunking.py new file mode 100644 index 000000000000..306b5c3b3f02 --- /dev/null +++ b/caffe2/contrib/fakelowp/test/test_chunking.py @@ -0,0 +1,142 @@ +# Must happen before importing caffe2.python.* +import caffe2.python.fakelowp.init_shared_libs # noqa +import datetime +import numpy as np +from hypothesis import given, settings, example +from hypothesis import strategies as st +from caffe2.python import core, workspace +from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net +from caffe2.python.fakelowp.test_utils import print_test_debug_info +import caffe2.python.serialized_test.serialized_test_util as serial + +# Test that parallel chunks behave the same way as the serial one + +workspace.GlobalInit( + [ + "caffe2", + "--glow_global_fp16=1", + "--glow_global_fused_scale_offset_fp16=1", + "--glow_global_force_sls_fp16_accum=1", + "--glow_nnpi_num_parallel_chunks=2", + "--glow_use_dag_optimizer=false", + "--glow_dump_graph=true", + ] +) + +class Fusions(serial.SerializedTestCase): + def _get_scale_zp(self, tensor): + tensor_max = np.max(tensor) + tensor_min = min(0, np.min(tensor)) + scale = np.float32(np.float16((tensor_max - tensor_min) / 255.0)) + if scale < 1e-6: + scale = 1e-6 + zero_point = 0 - tensor_min / scale + zero_point = int(round(np.clip(zero_point, 0, 255.0))) + return (scale, zero_point) + + @given( + scale=st.floats(1e-4, 1e2), + zp=st.integers(-128, 128), + rand_seed=st.integers(0, 65534), + m=st.integers(32, 64), + k=st.integers(1000, 6000), + n=st.integers(200, 600), + ) + # @example(m=64, k=5423, n=553, scale=1e-3, zp=120, rand_seed=1) + @settings(deadline=datetime.timedelta(seconds=1000), max_examples=1) + def test_ParallelFC(self, m, k, n, scale, zp, rand_seed): + np.random.seed(rand_seed) + workspace.ResetWorkspace() + + # Y = W_T * X + b + X_fp32 = np.random.uniform(-1, 1, size=(m, k)).astype(np.float16) \ + .astype(np.float32) + + W_fp32 = np.random.uniform(-1, 1, size=(n, k)).astype(np.float32) + b_fp32 = np.zeros((n,), dtype=np.float32) + + X_scale, X_zero_point = self._get_scale_zp(X_fp32) + + workspace.FeedBlob("X", X_fp32) + workspace.FeedBlob("W", W_fp32) + workspace.FeedBlob("b", b_fp32) + + workspace.RunOperatorOnce( + core.CreateOperator( + "Int8FCPackWeight", + ["W"], + ["W_int8"], + engine="DNNLOWP", + save_unpacked_weights=True, + in_scale=X_scale, + ) + ) + + ref_net = core.Net("net") + ref_net.Int8QuantizeNNPI( + ["X"], + ["X_int8"], + Y_scale=X_scale, + Y_zero_point=X_zero_point + ) + ref_net.Int8FCFakeAcc32NNPI( + ["X_int8", "W_int8", "b"], + ["Y_int8"], + Y_scale=X_scale, + Y_zero_point=X_zero_point, + ) + ref_net.Int8Relu( + ["Y_int8"], + ["Y_relu"], + Y_zero_point=X_zero_point, + Y_scale=X_scale, + ) + ref_net.Int8DequantizeNNPI( + ["Y_relu"], + ["Y"] + ) + 
ref_net.Proto().external_output.append("Y") + + # run ref_net + workspace.RunNetOnce(ref_net) + Y_fbgemm = workspace.FetchBlob("Y") + + # run onnxifi net + ref_net.Proto().op[0].type = "Int8Quantize" + ref_net.Proto().op[1].type = "Int8FC" + ref_net.Proto().op[2].type = "Int8Relu" + ref_net.Proto().op[3].type = "Int8Dequantize" + net_onnxified = onnxifi_caffe2_net( + ref_net.Proto(), + {}, + debug=True, + adjust_batch=False, + use_onnx=False, + weight_names=["W_int8", "b"], + ) + num_onnxified_ops = sum( + 1 if o.type == "Onnxifi" else 0 for o in net_onnxified.op + ) + print(net_onnxified) + np.testing.assert_equal(num_onnxified_ops, 1) + workspace.CreateNet(net_onnxified) + workspace.RunNet(net_onnxified.name) + Y_glow = workspace.FetchBlob("Y") + + if not np.allclose(Y_glow, Y_fbgemm): + diff_Y = np.abs(Y_glow - Y_fbgemm) + print_test_debug_info( + "int8_fc", + { + "seed": rand_seed, + "n": n, + "X": X_fp32, + "W": W_fp32, + "b": b_fp32, + "Y_fbgemm": Y_fbgemm, + "Y_glow": Y_glow, + "diff": diff_Y, + "maxdiff": diff_Y.max(axis=1), + }, + ) + assert 0 diff --git a/caffe2/contrib/fakelowp/test/test_fusions.py b/caffe2/contrib/fakelowp/test/test_fusions.py index 335159c8318e..3e22d7c5937b 100644 --- a/caffe2/contrib/fakelowp/test/test_fusions.py +++ b/caffe2/contrib/fakelowp/test/test_fusions.py @@ -27,7 +27,7 @@ class Fusions(serial.SerializedTestCase): rand_seed=st.integers(0, 65534), ) @settings(deadline=datetime.timedelta(seconds=10)) - def Skip_test_tanhquantize(self, scale, zp, size, rand_seed): + def test_tanhquantize(self, scale, zp, size, rand_seed): np.random.seed(rand_seed) workspace.ResetWorkspace() diff --git a/caffe2/contrib/gloo/gloo_test.py b/caffe2/contrib/gloo/gloo_test.py index fbca9b8fe64c..5ae066f5e3ca 100644 --- a/caffe2/contrib/gloo/gloo_test.py +++ b/caffe2/contrib/gloo/gloo_test.py @@ -27,7 +27,6 @@ op_engine = 'GLOO' - class TemporaryDirectory: def __enter__(self): self.tmpdir = tempfile.mkdtemp() diff --git a/caffe2/opt/fakefp16_transform.cc b/caffe2/opt/fakefp16_transform.cc index 424056bd2c80..cbd3132dfc08 100644 --- a/caffe2/opt/fakefp16_transform.cc +++ b/caffe2/opt/fakefp16_transform.cc @@ -299,8 +299,8 @@ void fakeFp16Transform(NetDef* net) { FLAGS_fake_fp16_conversion_use_fp16_acc, FLAGS_fake_fp16_conversion_use_nnpi); - auto blacklist_pos = glow::ParseNetPositionList(FLAGS_onnxifi_blacklist); - auto blacklist_type = glow::ParseBlackListOps(FLAGS_onnxifi_blacklist_ops); + auto blocklist_pos = glow::ParseNetPositionList(FLAGS_onnxifi_blacklist); + auto blocklist_type = glow::ParseBlackListOps(FLAGS_onnxifi_blacklist_ops); // A hack to only do fakefp16 transformation for operators which will be // lowered to ONNXIFI. 
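For context on the caffe2/contrib/aten changes above: at::index now takes a list of optional tensors rather than a plain TensorList, and the operator keeps a backward-compatibility path for the common single-uint8-mask case by warning and converting the mask to bool. A rough sketch of that handling follows; it is illustrative only, and the element type c10::optional<at::Tensor> is inferred from the surrounding hunks rather than quoted verbatim:

```
#include <ATen/ATen.h>
#include <ATen/core/List.h>

// Accept indices as a list of optional tensors; if the caller passed a single
// uint8 mask, warn about the deprecation and reinterpret it as a bool mask.
at::Tensor index_with_uint8_compat(
    const at::Tensor& self,
    const c10::List<c10::optional<at::Tensor>>& indices) {
  if (indices.size() == 1) {
    c10::optional<at::Tensor> first = indices[0];
    if (first.has_value() && first->scalar_type() == at::kByte) {
      TORCH_WARN(
          "Indexing with a uint8 mask tensor is deprecated; "
          "please use a bool mask instead.");
      return at::index(self, {first->to(at::kBool)});
    }
  }
  return at::index(self, indices);
}
```

The same reasoning explains the new peekSliceOptionals helper and the gen_op.py updates: tensor-list arguments of the optional-tensor flavor have to be collected into a c10::List of optionals instead of a std::vector of tensors.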
@@ -320,7 +320,7 @@ void fakeFp16Transform(NetDef* net) { auto* op = net->mutable_op(i); auto net_pos = ArgumentHelper::GetSingleArgument(*op, "net_pos", -1); - if (blacklist_pos.count(net_pos) || blacklist_type.count(op->type())) { + if (blocklist_pos.count(net_pos) || blocklist_type.count(op->type())) { continue; } auto it = kFakeFp16OpConversionMap.find(op->type()); diff --git a/caffe2/opt/glow_net_transform.cc b/caffe2/opt/glow_net_transform.cc index ee3ce1b27e2c..45ce9a487fbb 100644 --- a/caffe2/opt/glow_net_transform.cc +++ b/caffe2/opt/glow_net_transform.cc @@ -107,7 +107,7 @@ void onnxifi( const std::vector& input_names, const std::vector& output_names, const std::vector& weight_names, - const std::unordered_set& blacklist, + const std::unordered_set& blocklist, const ShapeInfoMap& shape_hints_max_bs, bool use_onnx, size_t max_batch_size, @@ -154,19 +154,19 @@ void onnxifi( // Before applying backlist, make sure the ops in the net all have an net_pos; caffe2::BackendTransformerBase::annotateOpIndex(net); - // Parse the blacklist - auto more_blacklist = ParseNetPositionList(FLAGS_onnxifi_blacklist); - for (const auto& b : blacklist) { - more_blacklist.emplace(b); + // Parse the blocklist + auto more_blocklist = ParseNetPositionList(FLAGS_onnxifi_blacklist); + for (const auto& b : blocklist) { + more_blocklist.emplace(b); } // ONNX mode will change the op order so it doesn't apply here if (!opts.use_onnx) { - auto blacklisted_ops = ParseBlackListOps(FLAGS_onnxifi_blacklist_ops); + auto blocklisted_ops = ParseBlackListOps(FLAGS_onnxifi_blacklist_ops); for (const auto& op : net->op()) { - if (blacklisted_ops.count(op.type())) { + if (blocklisted_ops.count(op.type())) { ArgumentHelper helper(op); - more_blacklist.emplace(helper.GetSingleArgument(op, kNetPos, -1)); + more_blocklist.emplace(helper.GetSingleArgument(op, kNetPos, -1)); } } } @@ -179,7 +179,7 @@ void onnxifi( // 1. for specified op, we find its input and outputs. // 2. for each input and output, we create a new copy op and attach it as an // input to the copy. - // 3. we blacklist these new copy operators from onnxification. This forces + // 3. we blocklist these new copy operators from onnxification. This forces // these intermediate tensors to also become outputs of the onnxifi op. // 4. we put the right arguments on the copy ops so TensorObserver can print // out the values. @@ -213,11 +213,11 @@ void onnxifi( AddArgument(kNetPos, pos, ©_op); AddArgument("observe_input_tensors", 1, ©_op); net->add_op()->CopyFrom(copy_op); - more_blacklist.emplace(pos); + more_blocklist.emplace(pos); } OnnxifiTransformer ts(opts); - ts.transform(ws, net, weight_names, more_shape_hints, more_blacklist); + ts.transform(ws, net, weight_names, more_shape_hints, more_blocklist); // Cleanup the input from the workspace for (const auto& i : input_names) { diff --git a/caffe2/opt/glow_net_transform.h b/caffe2/opt/glow_net_transform.h index e8d1c9b9054f..f6cd975a6e91 100644 --- a/caffe2/opt/glow_net_transform.h +++ b/caffe2/opt/glow_net_transform.h @@ -16,7 +16,7 @@ namespace caffe2 { namespace glow { /// Onnxifi transformation on the net and workspace. We also /// needed the input data/shape to populate the shape. In addition, we take a \p -/// blacklist to control and mask what ops we want to consider in onnxifi +/// blocklist to control and mask what ops we want to consider in onnxifi /// process. We can also set whether to use ONNX proto or C2 proto through /// ONNXIFI interface. 
void onnxifi( @@ -25,7 +25,7 @@ void onnxifi( const std::vector& input_names, const std::vector& output_names, const std::vector& weight_names, - const std::unordered_set& blacklist, + const std::unordered_set& blocklist, const ShapeInfoMap& shape_hints_max_bs, bool use_onnx, size_t max_batch_size = 0, diff --git a/caffe2/opt/onnxifi_transformer.cc b/caffe2/opt/onnxifi_transformer.cc index 8089314c3100..2dd8c8d2d8b4 100644 --- a/caffe2/opt/onnxifi_transformer.cc +++ b/caffe2/opt/onnxifi_transformer.cc @@ -1195,11 +1195,11 @@ void OnnxifiTransformer::applyFilteringRules( blocklistCpuPartition(net, blocklisted_ops); } -void OnnxifiTransformer::getBackendId() { +std::vector OnnxifiTransformer::getBackendId() { idx_ = 0; if (opts_.use_onnx) { - return; + return backend_ids_; } // Try to find a backend that support Caffe2 proto. Note that this is quite // opportunistic as we don't officially support Caffe2 proto. @@ -1214,6 +1214,7 @@ void OnnxifiTransformer::getBackendId() { break; } } + return backend_ids_; } NetDef OnnxifiTransformer::TransformViaC2( diff --git a/caffe2/opt/onnxifi_transformer.h b/caffe2/opt/onnxifi_transformer.h index d88eb739750c..d1af1731013d 100644 --- a/caffe2/opt/onnxifi_transformer.h +++ b/caffe2/opt/onnxifi_transformer.h @@ -61,6 +61,17 @@ class TORCH_API OnnxifiTransformer final : public BackendTransformerBase { const ShapeInfoMap& shape_hints, const std::unordered_set& blocklisted_ops) override; + // Query whether an operator is supported by passing C2 protobuf + bool supportOpC2( + const caffe2::OperatorDef& op, + const ShapeInfoMap& shape_hints, + const std::unordered_set& weights, + const std::unordered_set& blocklisted_ops, + onnxBackendID backend_id) const; + + // Determine backend id + std::vector getBackendId(); + private: // Since we create new tensors during the conversion process, we actually need // into inject them into the original workspace @@ -114,14 +125,6 @@ class TORCH_API OnnxifiTransformer final : public BackendTransformerBase { ShapeInfoMap* shape_hints_max_bs, const std::unordered_map &shape_hints_per_bs); - // Query whether an operator is supported by passing C2 protobuf - bool supportOpC2( - const caffe2::OperatorDef& op, - const ShapeInfoMap& shape_hints, - const std::unordered_set& weights, - const std::unordered_set& blocklisted_ops, - onnxBackendID backend_id) const; - // Query whether an operator is supported by passing ONNX protobuf bool supportOpOnnx( const caffe2::OperatorDef& op, @@ -152,9 +155,6 @@ class TORCH_API OnnxifiTransformer final : public BackendTransformerBase { const std::unordered_set& weights, std::unordered_set* blocklisted_ops) const; - // Determine backend id - void getBackendId(); - // Extract partition info from the original net void extractPartitionInfo(const NetDef& net); diff --git a/caffe2/python/_import_c_extension.py b/caffe2/python/_import_c_extension.py index d6754adc20fd..32b9ec34d1f8 100644 --- a/caffe2/python/_import_c_extension.py +++ b/caffe2/python/_import_c_extension.py @@ -5,16 +5,6 @@ import sys from caffe2.python import extension_loader -# NOTE: we have to import python protobuf here **before** we load cpp extension. -# Otherwise it breaks under certain build conditions if cpp implementation of -# protobuf is used. Presumably there's some registry in protobuf library and -# python side has to initialize the dictionary first, before static -# initialization in python extension does so. 
Otherwise, duplicated protobuf -# descriptors will be created and it can lead to obscure errors like -# "Parameter to MergeFrom() must be instance of same class: -# expected caffe2.NetDef got caffe2.NetDef." -import caffe2.proto - # We will first try to load the gpu-enabled caffe2. If it fails, we will then # attempt to load the cpu version. The cpu backend is the minimum required, so # if that still fails, we will exit loud. diff --git a/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py b/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py index 1b683be0d51e..b4cb8f2da0b4 100644 --- a/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py +++ b/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py @@ -5,7 +5,7 @@ import hypothesis.strategies as st import numpy as np -from caffe2.python import core, dyndep, workspace +from caffe2.python import core, workspace def benchmark_sparse_lengths_sum( diff --git a/caffe2/python/compatibility.py b/caffe2/python/compatibility.py deleted file mode 100644 index 9d615a308333..000000000000 --- a/caffe2/python/compatibility.py +++ /dev/null @@ -1,8 +0,0 @@ -from six import PY2, PY3 - -if PY2: - import collections - container_abcs = collections -elif PY3: - import collections.abc - container_abcs = collections.abc diff --git a/caffe2/python/convert.py b/caffe2/python/convert.py index 18033661a69e..b4b37811de10 100644 --- a/caffe2/python/convert.py +++ b/caffe2/python/convert.py @@ -5,6 +5,3 @@ -from caffe2.proto import caffe2_pb2, torch_pb2 - -import caffe2.python._import_c_extension as C diff --git a/caffe2/python/convert_test.py b/caffe2/python/convert_test.py index a1dc52aad2d9..d9d82bf5e6c4 100644 --- a/caffe2/python/convert_test.py +++ b/caffe2/python/convert_test.py @@ -3,10 +3,8 @@ -from caffe2.python import convert, workspace -from caffe2.proto import caffe2_pb2, torch_pb2 +from caffe2.python import workspace import unittest -import numpy as np class TestOperator(unittest.TestCase): def setUp(self): diff --git a/caffe2/python/core_gradients_test.py b/caffe2/python/core_gradients_test.py index 3674b7aa4585..293eccca0dd4 100644 --- a/caffe2/python/core_gradients_test.py +++ b/caffe2/python/core_gradients_test.py @@ -3,7 +3,6 @@ -from future.utils import bytes_to_native_str from hypothesis import given, settings import hypothesis.strategies as st import unittest diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py index 95abb7159d42..8537e1ee3cf1 100644 --- a/caffe2/python/data_parallel_model.py +++ b/caffe2/python/data_parallel_model.py @@ -1056,7 +1056,7 @@ def sumN(*dev_indices): """Create a Sum op for 2 or more blobs on different devices. Saves the result on the first device. 
- Arguments: + Args: dev_indices -- a list of device indices, which can be translated into CUDA identifiers with model._devices """ diff --git a/caffe2/python/dataio_test.py b/caffe2/python/dataio_test.py index 0c45fb50aed9..ac1c72284fbf 100644 --- a/caffe2/python/dataio_test.py +++ b/caffe2/python/dataio_test.py @@ -6,7 +6,6 @@ from caffe2.python.dataio import ( CompositeReader, CompositeReaderBuilder, - Reader, ReaderBuilder, ReaderWithDelay, ReaderWithLimit, @@ -29,7 +28,6 @@ import shutil import unittest import tempfile -import time def make_source_dataset(ws, size=100, offset=0, name=None): diff --git a/caffe2/python/ideep/conv_op_test.py b/caffe2/python/ideep/conv_op_test.py index ae4473ea4864..7c5a0026c113 100644 --- a/caffe2/python/ideep/conv_op_test.py +++ b/caffe2/python/ideep/conv_op_test.py @@ -4,7 +4,6 @@ import unittest -import sys import hypothesis.strategies as st from hypothesis import given, settings import numpy as np diff --git a/caffe2/python/ideep/convfusion_op_test.py b/caffe2/python/ideep/convfusion_op_test.py index 18ce574b623b..a0a782ab8a03 100644 --- a/caffe2/python/ideep/convfusion_op_test.py +++ b/caffe2/python/ideep/convfusion_op_test.py @@ -5,8 +5,7 @@ import unittest import hypothesis.strategies as st -from hypothesis import given, settings -import copy +from hypothesis import given import numpy as np import math from caffe2.proto import caffe2_pb2 diff --git a/caffe2/python/ideep/dropout_op_test.py b/caffe2/python/ideep/dropout_op_test.py index 33b0a52a7421..5b07333758dd 100644 --- a/caffe2/python/ideep/dropout_op_test.py +++ b/caffe2/python/ideep/dropout_op_test.py @@ -7,8 +7,6 @@ from hypothesis import given import hypothesis.strategies as st import numpy as np - -from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import caffe2.python.ideep_test_util as mu diff --git a/caffe2/python/ideep/order_switch_op_test.py b/caffe2/python/ideep/order_switch_op_test.py index a259e01bab10..39ede0d214fe 100644 --- a/caffe2/python/ideep/order_switch_op_test.py +++ b/caffe2/python/ideep/order_switch_op_test.py @@ -10,7 +10,6 @@ import caffe2.python.ideep_test_util as mu from hypothesis import given, settings -from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace diff --git a/caffe2/python/ideep/shape_op_test.py b/caffe2/python/ideep/shape_op_test.py index 47114832f85d..1beb24bc8803 100644 --- a/caffe2/python/ideep/shape_op_test.py +++ b/caffe2/python/ideep/shape_op_test.py @@ -7,7 +7,6 @@ import hypothesis.strategies as st from hypothesis import given, settings import numpy as np -from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import caffe2.python.ideep_test_util as mu diff --git a/caffe2/python/ideep/spatial_bn_op_test.py b/caffe2/python/ideep/spatial_bn_op_test.py index 618a0e7fbfc3..97efafa72057 100644 --- a/caffe2/python/ideep/spatial_bn_op_test.py +++ b/caffe2/python/ideep/spatial_bn_op_test.py @@ -7,9 +7,8 @@ import hypothesis.strategies as st import numpy as np import unittest -from caffe2.python import brew, core, workspace +from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu -from caffe2.python.model_helper import ModelHelper import caffe2.python.ideep_test_util as mu diff --git a/caffe2/python/ideep/test_ideep_net.py b/caffe2/python/ideep/test_ideep_net.py index aa1c5bc260fa..42feeed00122 100644 --- a/caffe2/python/ideep/test_ideep_net.py +++ 
b/caffe2/python/ideep/test_ideep_net.py @@ -9,7 +9,6 @@ import numpy as np import argparse import time -import os.path def GetArgumentParser(): diff --git a/caffe2/python/ideep/transform_ideep_net.py b/caffe2/python/ideep/transform_ideep_net.py index 962d4051718b..2d0f35a7406f 100644 --- a/caffe2/python/ideep/transform_ideep_net.py +++ b/caffe2/python/ideep/transform_ideep_net.py @@ -6,7 +6,6 @@ import argparse import copy import json -import os.path import numpy as np diff --git a/caffe2/python/ideep/transpose_op_test.py b/caffe2/python/ideep/transpose_op_test.py index 8b324ed964ae..f8b784822a07 100644 --- a/caffe2/python/ideep/transpose_op_test.py +++ b/caffe2/python/ideep/transpose_op_test.py @@ -7,7 +7,6 @@ import hypothesis.strategies as st from hypothesis import given, settings import numpy as np -from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import caffe2.python.ideep_test_util as mu diff --git a/caffe2/python/ideep_test_util.py b/caffe2/python/ideep_test_util.py index 7129ed14ba74..0cc643317c93 100644 --- a/caffe2/python/ideep_test_util.py +++ b/caffe2/python/ideep_test_util.py @@ -14,7 +14,6 @@ import hypothesis.strategies as st from caffe2.proto import caffe2_pb2 -from caffe2.python import workspace from caffe2.python import hypothesis_test_util as hu cpu_do = hu.cpu_do diff --git a/caffe2/python/layer_model_helper.py b/caffe2/python/layer_model_helper.py index 9d825f3827b9..6a5a3c82dd30 100644 --- a/caffe2/python/layer_model_helper.py +++ b/caffe2/python/layer_model_helper.py @@ -17,7 +17,6 @@ from caffe2.python.optimizer import get_param_device, Optimizer from caffe2.python.regularizer import Regularizer, RegularizationBy from caffe2.python.layers import layers -from caffe2.proto import caffe2_pb2 from future.utils import viewitems, viewvalues import logging diff --git a/caffe2/python/layers/merge_id_lists.py b/caffe2/python/layers/merge_id_lists.py index 68c27b587567..b076cd8c5e75 100644 --- a/caffe2/python/layers/merge_id_lists.py +++ b/caffe2/python/layers/merge_id_lists.py @@ -16,7 +16,7 @@ class MergeIdLists(ModelLayer): """Merge multiple ID_LISTs into a single ID_LIST - Arguments: + Args: model: A layer model instance input_record: Tuple (Struct) of ID_LIST features to be merged diff --git a/caffe2/python/mkl/mkl_LRN_op_test.py b/caffe2/python/mkl/mkl_LRN_op_test.py index 2b084bea591b..fddb20e6bb14 100644 --- a/caffe2/python/mkl/mkl_LRN_op_test.py +++ b/caffe2/python/mkl/mkl_LRN_op_test.py @@ -5,7 +5,7 @@ import unittest import hypothesis.strategies as st -from hypothesis import given, settings +from hypothesis import given import numpy as np from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/mkl/mkl_LRN_speed_test.py b/caffe2/python/mkl/mkl_LRN_speed_test.py index ae42902d9102..c192137dc28c 100644 --- a/caffe2/python/mkl/mkl_LRN_speed_test.py +++ b/caffe2/python/mkl/mkl_LRN_speed_test.py @@ -6,7 +6,7 @@ import numpy as np from caffe2.proto import caffe2_pb2 -from caffe2.python import cnn, core, workspace, test_util +from caffe2.python import core, workspace, test_util @unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn.") diff --git a/caffe2/python/mkl/mkl_conv_op_test.py b/caffe2/python/mkl/mkl_conv_op_test.py index f1fe7b062318..74c4f2c6cde9 100644 --- a/caffe2/python/mkl/mkl_conv_op_test.py +++ b/caffe2/python/mkl/mkl_conv_op_test.py @@ -5,7 +5,7 @@ import unittest import hypothesis.strategies as 
st -from hypothesis import given, settings +from hypothesis import given import numpy as np from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/mkl/mkl_fc_op_test.py b/caffe2/python/mkl/mkl_fc_op_test.py index 01786d55c337..180d93f26570 100644 --- a/caffe2/python/mkl/mkl_fc_op_test.py +++ b/caffe2/python/mkl/mkl_fc_op_test.py @@ -5,7 +5,7 @@ import unittest import hypothesis.strategies as st -from hypothesis import given, settings +from hypothesis import given import numpy as np from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/mkl/mkl_fc_speed_test.py b/caffe2/python/mkl/mkl_fc_speed_test.py index 85f5605e9676..243e49c2f8f8 100644 --- a/caffe2/python/mkl/mkl_fc_speed_test.py +++ b/caffe2/python/mkl/mkl_fc_speed_test.py @@ -6,7 +6,7 @@ import numpy as np from caffe2.proto import caffe2_pb2 -from caffe2.python import cnn, core, workspace, test_util +from caffe2.python import core, workspace, test_util @unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn.") diff --git a/caffe2/python/mkl/mkl_fill_op_test.py b/caffe2/python/mkl/mkl_fill_op_test.py index 26a9b7131b0b..f233275786f7 100644 --- a/caffe2/python/mkl/mkl_fill_op_test.py +++ b/caffe2/python/mkl/mkl_fill_op_test.py @@ -5,8 +5,7 @@ import unittest import hypothesis.strategies as st -from hypothesis import given, settings -import numpy as np +from hypothesis import given from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import caffe2.python.mkl_test_util as mu diff --git a/caffe2/python/mkl/mkl_pool_speed_test.py b/caffe2/python/mkl/mkl_pool_speed_test.py index b25e0f915cc7..aa43aed97a09 100644 --- a/caffe2/python/mkl/mkl_pool_speed_test.py +++ b/caffe2/python/mkl/mkl_pool_speed_test.py @@ -6,7 +6,7 @@ import numpy as np from caffe2.proto import caffe2_pb2 -from caffe2.python import cnn, core, workspace, test_util +from caffe2.python import core, workspace, test_util @unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn.") diff --git a/caffe2/python/mkl/mkl_sbn_op_test.py b/caffe2/python/mkl/mkl_sbn_op_test.py index 2ac9080ce670..86856b130d63 100644 --- a/caffe2/python/mkl/mkl_sbn_op_test.py +++ b/caffe2/python/mkl/mkl_sbn_op_test.py @@ -5,7 +5,7 @@ import unittest import hypothesis.strategies as st -from hypothesis import given, settings +from hypothesis import given import numpy as np from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/mkl/mkl_sbn_speed_test.py b/caffe2/python/mkl/mkl_sbn_speed_test.py index 3b3b71d1c997..05885ceca575 100644 --- a/caffe2/python/mkl/mkl_sbn_speed_test.py +++ b/caffe2/python/mkl/mkl_sbn_speed_test.py @@ -6,7 +6,7 @@ import numpy as np from caffe2.proto import caffe2_pb2 -from caffe2.python import cnn, core, workspace, test_util +from caffe2.python import core, workspace, test_util @unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn.") diff --git a/caffe2/python/mkl/mkl_speed_test.py b/caffe2/python/mkl/mkl_speed_test.py index 9a7310a484d1..ab2e4428519a 100644 --- a/caffe2/python/mkl/mkl_speed_test.py +++ b/caffe2/python/mkl/mkl_speed_test.py @@ -6,7 +6,7 @@ import numpy as np from caffe2.proto import caffe2_pb2 -from caffe2.python import cnn, core, workspace, test_util +from caffe2.python import core, workspace, test_util @unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not 
have mkldnn.") diff --git a/caffe2/python/mkl/rewrite_graph.py b/caffe2/python/mkl/rewrite_graph.py index 3a88a3deeccc..b52501584064 100644 --- a/caffe2/python/mkl/rewrite_graph.py +++ b/caffe2/python/mkl/rewrite_graph.py @@ -6,7 +6,6 @@ import copy from caffe2.proto import caffe2_pb2 from caffe2.python import core -import caffe2.python._import_c_extension as C def rewrite_init_net_simple(net): diff --git a/caffe2/python/net_builder.py b/caffe2/python/net_builder.py index 1eb4fe7b2909..fd525ed4766a 100644 --- a/caffe2/python/net_builder.py +++ b/caffe2/python/net_builder.py @@ -13,7 +13,7 @@ class NetBuilder(context.Managed): """ Scope-driven mechanism for building nets, loops and conditional blocks. - Arguments: + Args: name: NetBuilder's name initial_scope: list of blobs that are available for reading/writing Example: diff --git a/caffe2/python/nomnigraph_test.py b/caffe2/python/nomnigraph_test.py index 3d9adc696486..bd9d10fcbae1 100644 --- a/caffe2/python/nomnigraph_test.py +++ b/caffe2/python/nomnigraph_test.py @@ -3,7 +3,7 @@ -from caffe2.python import core, workspace, test_util +from caffe2.python import core, test_util from caffe2.proto import caffe2_pb2 import caffe2.python.nomnigraph as ng diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py index 5d445576b32c..193a6f217f93 100644 --- a/caffe2/python/onnx/backend.py +++ b/caffe2/python/onnx/backend.py @@ -5,14 +5,7 @@ To run this, you will need to have Caffe2 installed as well. """ - - - - - -import os import collections -from subprocess import Popen, PIPE import sys import zipfile import itertools @@ -23,16 +16,13 @@ # importing onnx first, which will cause it to go out and pick up the # system protobuf. import onnx.backend - -import caffe2 from caffe2.python import core, workspace, rnn_cell, gru_cell -from caffe2.python.compatibility import container_abcs from caffe2.python.model_helper import ModelHelper from caffe2.proto import caffe2_pb2 import caffe2.python.utils import numpy as np import onnx -from onnx import checker, GraphProto, TensorProto, AttributeProto, ModelProto +from onnx import TensorProto import onnx.numpy_helper import onnx.defs import onnx.optimizer @@ -42,7 +32,6 @@ from caffe2.python.onnx.workspace import Workspace from caffe2.python.onnx.backend_rep import Caffe2Rep -from caffe2.python.onnx.backend_cpp_rep import Caffe2CppRep import caffe2.python._import_c_extension as C @@ -781,7 +770,7 @@ def _onnx_node_to_caffe2_op(cls, init_model, pred_model, node_def, opset_version ops = translator(init_model, pred_model, OnnxNode(node_def), opset_version) if isinstance(ops, Caffe2Ops): return ops - if not isinstance(ops, container_abcs.Iterable): + if not isinstance(ops, collections.abc.Iterable): ops = [ops] return Caffe2Ops(ops, [], []) diff --git a/caffe2/python/onnx/bin/conversion.py b/caffe2/python/onnx/bin/conversion.py index 126eef8a8470..7e469e514a73 100644 --- a/caffe2/python/onnx/bin/conversion.py +++ b/caffe2/python/onnx/bin/conversion.py @@ -9,8 +9,7 @@ from caffe2.proto import caffe2_pb2 import click -import numpy as np -from onnx import checker, ModelProto +from onnx import ModelProto from caffe2.python.onnx.backend import Caffe2Backend as c2 import caffe2.python.onnx.frontend as c2_onnx diff --git a/caffe2/python/onnx/frontend.py b/caffe2/python/onnx/frontend.py index ee3c30949ff7..b5121602aff5 100644 --- a/caffe2/python/onnx/frontend.py +++ b/caffe2/python/onnx/frontend.py @@ -10,22 +10,18 @@ - +import collections import itertools import logging import re from caffe2.python import 
core as caffe2_core -from caffe2.python.compatibility import container_abcs -from caffe2.proto import caffe2_legacy_pb2 -from enum import Enum -from onnx import (defs, checker, helper, numpy_helper, mapping, - ModelProto, GraphProto, NodeProto, AttributeProto, TensorProto, OperatorSetIdProto) -from onnx.helper import make_tensor, make_tensor_value_info, make_attribute, make_model +from onnx import (checker, helper, numpy_helper, mapping, + GraphProto, NodeProto, TensorProto, OperatorSetIdProto) +from onnx.helper import make_tensor_value_info, make_model import numpy as np from caffe2.python.onnx.helper import c2_native_run_net -from caffe2.python.onnx.error import Unsupported import caffe2.python._import_c_extension as C @@ -156,7 +152,7 @@ def caffe2_op_to_onnx_node(cls, op_def, shapes): const_tensors = [] if isinstance(nodes, tuple): nodes, const_tensors = nodes - if not isinstance(nodes, container_abcs.Iterable): + if not isinstance(nodes, collections.abc.Iterable): nodes = [nodes] return nodes, const_tensors diff --git a/caffe2/python/onnx/helper.py b/caffe2/python/onnx/helper.py index 7f8f1a6d346a..6e73a5d5c95d 100644 --- a/caffe2/python/onnx/helper.py +++ b/caffe2/python/onnx/helper.py @@ -9,9 +9,6 @@ from onnx.backend.base import namedtupledict from caffe2.python.onnx.workspace import Workspace -import caffe2.python._import_c_extension as C - -import io import logging import time diff --git a/caffe2/python/onnx/onnxifi.py b/caffe2/python/onnx/onnxifi.py index a04e7e4554b9..3e67c4948b1f 100644 --- a/caffe2/python/onnx/onnxifi.py +++ b/caffe2/python/onnx/onnxifi.py @@ -11,9 +11,7 @@ from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace import caffe2.python._import_c_extension as C -import numpy as np def onnxifi_caffe2_net( diff --git a/caffe2/python/onnx/test_onnxifi.py b/caffe2/python/onnx/test_onnxifi.py index 7eafccaec9e4..4316149d5bf6 100644 --- a/caffe2/python/onnx/test_onnxifi.py +++ b/caffe2/python/onnx/test_onnxifi.py @@ -3,16 +3,14 @@ -import json import numpy as np -import os import time import unittest import onnx import onnx.defs from onnx.backend.base import namedtupledict -from onnx.helper import make_node, make_graph, make_tensor, make_tensor_value_info, make_model +from onnx.helper import make_node, make_graph, make_tensor_value_info, make_model from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace from caffe2.python.models.download import ModelDownloader diff --git a/caffe2/python/onnx/tests/c2_ref_test.py b/caffe2/python/onnx/tests/c2_ref_test.py index d2efcc79823e..aab5a04a169c 100644 --- a/caffe2/python/onnx/tests/c2_ref_test.py +++ b/caffe2/python/onnx/tests/c2_ref_test.py @@ -6,7 +6,6 @@ -import json import os import unittest @@ -17,7 +16,7 @@ from onnx.helper import make_node, make_graph, make_tensor, make_tensor_value_info, make_model from caffe2.python.onnx.helper import c2_native_run_net, c2_native_run_op -from onnx import defs, mapping +from onnx import mapping import caffe2.python.onnx.frontend as c2_onnx import caffe2.python.onnx.backend as c2 diff --git a/caffe2/python/onnx/tests/onnx_backend_test.py b/caffe2/python/onnx/tests/onnx_backend_test.py index 5166ec3c5083..e8b718a5a2be 100644 --- a/caffe2/python/onnx/tests/onnx_backend_test.py +++ b/caffe2/python/onnx/tests/onnx_backend_test.py @@ -13,7 +13,7 @@ import caffe2.python.onnx.backend as c2 -from caffe2.python import core, workspace +from caffe2.python import core core.SetEnginePref({}, {}) # This is a pytest magic variable to load extra plugins diff 
--git a/caffe2/python/onnx/tests/ssa_test.py b/caffe2/python/onnx/tests/ssa_test.py index d34d4a0e5287..96f954037178 100644 --- a/caffe2/python/onnx/tests/ssa_test.py +++ b/caffe2/python/onnx/tests/ssa_test.py @@ -7,11 +7,10 @@ import copy -import onnx import numpy as np from caffe2.proto import caffe2_pb2 from caffe2.python import core -from onnx import helper, TensorProto +from onnx import TensorProto import caffe2.python.onnx.frontend as c2_onnx from caffe2.python.onnx.helper import c2_native_run_net diff --git a/caffe2/python/onnx/tests/test_utils.py b/caffe2/python/onnx/tests/test_utils.py index d224daf05ba3..bebfc1012957 100644 --- a/caffe2/python/onnx/tests/test_utils.py +++ b/caffe2/python/onnx/tests/test_utils.py @@ -6,7 +6,6 @@ -import os import unittest import numpy as np diff --git a/caffe2/python/operator_fp_exceptions_test.py b/caffe2/python/operator_fp_exceptions_test.py index 3a1ebcd4ec67..f039ef09f637 100644 --- a/caffe2/python/operator_fp_exceptions_test.py +++ b/caffe2/python/operator_fp_exceptions_test.py @@ -3,7 +3,6 @@ from caffe2.python import core, workspace -from caffe2.proto import caffe2_pb2 from caffe2.python.test_util import TestCase import numpy as np diff --git a/caffe2/python/operator_test/blobs_queue_db_test.py b/caffe2/python/operator_test/blobs_queue_db_test.py index 6cf8170b34f8..88197d16d70b 100644 --- a/caffe2/python/operator_test/blobs_queue_db_test.py +++ b/caffe2/python/operator_test/blobs_queue_db_test.py @@ -3,7 +3,6 @@ -import unittest import numpy as np import caffe2.proto.caffe2_pb2 as caffe2_pb2 diff --git a/caffe2/python/operator_test/boolean_mask_test.py b/caffe2/python/operator_test/boolean_mask_test.py index 05b8212242e4..38fe43899990 100644 --- a/caffe2/python/operator_test/boolean_mask_test.py +++ b/caffe2/python/operator_test/boolean_mask_test.py @@ -2,7 +2,6 @@ -from caffe2.proto import caffe2_pb2 from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/bucketize_op_test.py b/caffe2/python/operator_test/bucketize_op_test.py index bf9af112a5b0..2eb2acf87902 100644 --- a/caffe2/python/operator_test/bucketize_op_test.py +++ b/caffe2/python/operator_test/bucketize_op_test.py @@ -2,10 +2,9 @@ -from caffe2.python import core, dyndep +from caffe2.python import core from hypothesis import given import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st import numpy as np diff --git a/caffe2/python/operator_test/concat_split_op_test.py b/caffe2/python/operator_test/concat_split_op_test.py index 1927b4eac78f..ac83681f08bf 100644 --- a/caffe2/python/operator_test/concat_split_op_test.py +++ b/caffe2/python/operator_test/concat_split_op_test.py @@ -3,8 +3,7 @@ -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace +from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given, settings diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py index ae54cd37a91d..e600aa2c9ee9 100644 --- a/caffe2/python/operator_test/conv_test.py +++ b/caffe2/python/operator_test/conv_test.py @@ -2,7 +2,6 @@ import collections import functools -import os import unittest import caffe2.python._import_c_extension as C diff --git a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py 
b/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py index 04bfbbe6f4f6..d979407321a4 100644 --- a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py +++ b/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py @@ -3,7 +3,6 @@ -from hypothesis import given import hypothesis.strategies as st import numpy as np diff --git a/caffe2/python/operator_test/crf_test.py b/caffe2/python/operator_test/crf_test.py index b75e7b7b1a10..4d7b90c431a6 100644 --- a/caffe2/python/operator_test/crf_test.py +++ b/caffe2/python/operator_test/crf_test.py @@ -9,7 +9,6 @@ import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st from hypothesis import given, settings -import unittest class TestCRFOp(hu.HypothesisTestCase): diff --git a/caffe2/python/operator_test/cross_entropy_ops_test.py b/caffe2/python/operator_test/cross_entropy_ops_test.py index d1852e7dd9e8..c88f93503a15 100644 --- a/caffe2/python/operator_test/cross_entropy_ops_test.py +++ b/caffe2/python/operator_test/cross_entropy_ops_test.py @@ -9,7 +9,6 @@ import numpy as np import unittest -import os def sigmoid(x): return 1.0 / (1.0 + np.exp(-x)) diff --git a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py b/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py index 1dda7166e65a..29440c00a4b3 100644 --- a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py +++ b/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from caffe2.python.test_util import caffe2_flaky from collections import defaultdict, Counter from hypothesis import given, settings import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/cudnn_recurrent_test.py b/caffe2/python/operator_test/cudnn_recurrent_test.py index db1b826cfe41..ef4433a41a18 100644 --- a/caffe2/python/operator_test/cudnn_recurrent_test.py +++ b/caffe2/python/operator_test/cudnn_recurrent_test.py @@ -4,7 +4,6 @@ from caffe2.python import model_helper, workspace, core, rnn_cell -from caffe2.proto import caffe2_pb2 from future.utils import viewitems import numpy as np diff --git a/caffe2/python/operator_test/deform_conv_test.py b/caffe2/python/operator_test/deform_conv_test.py index f6ad0e38e73c..67289de5e924 100644 --- a/caffe2/python/operator_test/deform_conv_test.py +++ b/caffe2/python/operator_test/deform_conv_test.py @@ -1,6 +1,5 @@ -import os import unittest import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/depthwise_3x3_conv_test.py b/caffe2/python/operator_test/depthwise_3x3_conv_test.py index 2d6d6429f833..cdfffce288dd 100644 --- a/caffe2/python/operator_test/depthwise_3x3_conv_test.py +++ b/caffe2/python/operator_test/depthwise_3x3_conv_test.py @@ -5,7 +5,7 @@ import numpy as np import caffe2.python.hypothesis_test_util as hu -from caffe2.python import core, dyndep, utils, workspace +from caffe2.python import core, utils from hypothesis import given, settings import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/distance_op_test.py b/caffe2/python/operator_test/distance_op_test.py index e948fdae9673..5b46548e072b 100644 --- a/caffe2/python/operator_test/distance_op_test.py +++ b/caffe2/python/operator_test/distance_op_test.py @@ -6,7 +6,6 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial -from hypothesis import given import hypothesis.strategies as st import numpy 
as np diff --git a/caffe2/python/operator_test/elementwise_linear_op_test.py b/caffe2/python/operator_test/elementwise_linear_op_test.py index ac0dc3dd0975..2bd85625a3d9 100644 --- a/caffe2/python/operator_test/elementwise_linear_op_test.py +++ b/caffe2/python/operator_test/elementwise_linear_op_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/elementwise_ops_test.py b/caffe2/python/operator_test/elementwise_ops_test.py index 8dbfdc1871e8..31f70086de7b 100644 --- a/caffe2/python/operator_test/elementwise_ops_test.py +++ b/caffe2/python/operator_test/elementwise_ops_test.py @@ -10,7 +10,6 @@ import numpy as np import unittest -import os class TestElementwiseOps(hu.HypothesisTestCase): diff --git a/caffe2/python/operator_test/enforce_finite_op_test.py b/caffe2/python/operator_test/enforce_finite_op_test.py index b843bfdc95b9..8150977945a2 100644 --- a/caffe2/python/operator_test/enforce_finite_op_test.py +++ b/caffe2/python/operator_test/enforce_finite_op_test.py @@ -8,7 +8,6 @@ from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st class TestEnforceFinite(hu.HypothesisTestCase): diff --git a/caffe2/python/operator_test/expand_op_test.py b/caffe2/python/operator_test/expand_op_test.py index 0d198b1aff14..aba2c1106da3 100644 --- a/caffe2/python/operator_test/expand_op_test.py +++ b/caffe2/python/operator_test/expand_op_test.py @@ -3,7 +3,7 @@ -from caffe2.python import core, workspace +from caffe2.python import core from hypothesis import given, settings import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/feature_maps_ops_test.py b/caffe2/python/operator_test/feature_maps_ops_test.py index 19fa329c9389..5a20b63166be 100644 --- a/caffe2/python/operator_test/feature_maps_ops_test.py +++ b/caffe2/python/operator_test/feature_maps_ops_test.py @@ -2,7 +2,7 @@ -from caffe2.python import core, workspace, dyndep +from caffe2.python import core, workspace from caffe2.python.test_util import TestCase import numpy as np diff --git a/caffe2/python/operator_test/glu_op_test.py b/caffe2/python/operator_test/glu_op_test.py index f38df09ec9fb..7b7a33dcd90a 100644 --- a/caffe2/python/operator_test/glu_op_test.py +++ b/caffe2/python/operator_test/glu_op_test.py @@ -6,7 +6,7 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial -from hypothesis import assume, given, settings, HealthCheck +from hypothesis import given, settings import hypothesis.strategies as st import numpy as np diff --git a/caffe2/python/operator_test/group_conv_test.py b/caffe2/python/operator_test/group_conv_test.py index 62aba236d5ba..8e864bb42152 100644 --- a/caffe2/python/operator_test/group_conv_test.py +++ b/caffe2/python/operator_test/group_conv_test.py @@ -12,7 +12,6 @@ import caffe2.python.hypothesis_test_util as hu import unittest -import os class TestGroupConvolution(hu.HypothesisTestCase): diff --git a/caffe2/python/operator_test/gru_test.py b/caffe2/python/operator_test/gru_test.py index 99444f39ac26..1a7db2634989 100644 --- a/caffe2/python/operator_test/gru_test.py +++ b/caffe2/python/operator_test/gru_test.py @@ -16,7 +16,6 @@ import hypothesis.strategies as st import numpy as np import unittest -import 
os def gru_unit(*args, **kwargs): diff --git a/caffe2/python/operator_test/hyperbolic_ops_test.py b/caffe2/python/operator_test/hyperbolic_ops_test.py index 90a8197e7ccf..c0a1e8f49f5a 100644 --- a/caffe2/python/operator_test/hyperbolic_ops_test.py +++ b/caffe2/python/operator_test/hyperbolic_ops_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/im2col_col2im_test.py b/caffe2/python/operator_test/im2col_col2im_test.py index 760228382bc6..42cb1deaf8ae 100644 --- a/caffe2/python/operator_test/im2col_col2im_test.py +++ b/caffe2/python/operator_test/im2col_col2im_test.py @@ -10,9 +10,6 @@ import hypothesis.strategies as st import numpy as np -import unittest -import os - class TestReduceFrontSum(hu.HypothesisTestCase): @given(batch_size=st.integers(1, 3), diff --git a/caffe2/python/operator_test/instance_norm_test.py b/caffe2/python/operator_test/instance_norm_test.py index fb4f3c935ba8..efce9d7001fe 100644 --- a/caffe2/python/operator_test/instance_norm_test.py +++ b/caffe2/python/operator_test/instance_norm_test.py @@ -11,7 +11,6 @@ import caffe2.python.serialized_test.serialized_test_util as serial import unittest -import os class TestInstanceNorm(serial.SerializedTestCase): diff --git a/caffe2/python/operator_test/jsd_ops_test.py b/caffe2/python/operator_test/jsd_ops_test.py index 6ed2db2e88c2..f205d8e650b2 100644 --- a/caffe2/python/operator_test/jsd_ops_test.py +++ b/caffe2/python/operator_test/jsd_ops_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/layer_norm_op_test.py b/caffe2/python/operator_test/layer_norm_op_test.py index 62e94afe9e7d..d402cce4c4f9 100644 --- a/caffe2/python/operator_test/layer_norm_op_test.py +++ b/caffe2/python/operator_test/layer_norm_op_test.py @@ -13,7 +13,6 @@ import hypothesis.strategies as st import numpy as np -import os import torch import unittest diff --git a/caffe2/python/operator_test/lengths_pad_op_test.py b/caffe2/python/operator_test/lengths_pad_op_test.py index 626ec0542b7d..cda2f7da323e 100644 --- a/caffe2/python/operator_test/lengths_pad_op_test.py +++ b/caffe2/python/operator_test/lengths_pad_op_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py b/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py index fc4e89e2545b..49b0ba7ec22c 100644 --- a/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py +++ b/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py @@ -3,7 +3,7 @@ import caffe2.python.hypothesis_test_util as hu import hypothesis.strategies as st import numpy as np -from caffe2.python import core, dyndep, workspace +from caffe2.python import core, workspace from hypothesis import given diff --git a/caffe2/python/operator_test/lengths_tile_op_test.py b/caffe2/python/operator_test/lengths_tile_op_test.py index e0a5f9609588..441fcc747835 100644 --- 
a/caffe2/python/operator_test/lengths_tile_op_test.py +++ b/caffe2/python/operator_test/lengths_tile_op_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/loss_ops_test.py b/caffe2/python/operator_test/loss_ops_test.py index 24cb65ac96f8..f6a07ead3cf9 100644 --- a/caffe2/python/operator_test/loss_ops_test.py +++ b/caffe2/python/operator_test/loss_ops_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/matmul_op_test.py b/caffe2/python/operator_test/matmul_op_test.py index b8cef19b24df..8b4001a574ac 100644 --- a/caffe2/python/operator_test/matmul_op_test.py +++ b/caffe2/python/operator_test/matmul_op_test.py @@ -9,8 +9,6 @@ from hypothesis import assume, given, settings import hypothesis.strategies as st - -from caffe2.proto import caffe2_pb2 from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/mean_op_test.py b/caffe2/python/operator_test/mean_op_test.py index 5830089f8e9b..ee2c6fc8fbf7 100644 --- a/caffe2/python/operator_test/mean_op_test.py +++ b/caffe2/python/operator_test/mean_op_test.py @@ -6,8 +6,6 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial - -from hypothesis import given import hypothesis.strategies as st import numpy as np import unittest diff --git a/caffe2/python/operator_test/moments_op_test.py b/caffe2/python/operator_test/moments_op_test.py index 3b270df254ce..bee44e360e3f 100644 --- a/caffe2/python/operator_test/moments_op_test.py +++ b/caffe2/python/operator_test/moments_op_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/numpy_tile_op_test.py b/caffe2/python/operator_test/numpy_tile_op_test.py index a202581f808c..c32aa99470db 100644 --- a/caffe2/python/operator_test/numpy_tile_op_test.py +++ b/caffe2/python/operator_test/numpy_tile_op_test.py @@ -9,7 +9,7 @@ import hypothesis.strategies as st import unittest -from caffe2.python import core, workspace +from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial diff --git a/caffe2/python/operator_test/onnx_while_test.py b/caffe2/python/operator_test/onnx_while_test.py index 4cff53b87d6e..5ad9c277239d 100644 --- a/caffe2/python/operator_test/onnx_while_test.py +++ b/caffe2/python/operator_test/onnx_while_test.py @@ -3,7 +3,7 @@ from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace +from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial from hypothesis import given, settings diff --git a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py b/caffe2/python/operator_test/pack_rnn_sequence_op_test.py index 9a76e6b847a5..eceb1e5ba6a9 100644 --- 
a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py +++ b/caffe2/python/operator_test/pack_rnn_sequence_op_test.py @@ -4,7 +4,6 @@ from caffe2.python import core -from hypothesis import given import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/pad_test.py b/caffe2/python/operator_test/pad_test.py index 6d4e6bbdcd08..788c4035dd5f 100644 --- a/caffe2/python/operator_test/pad_test.py +++ b/caffe2/python/operator_test/pad_test.py @@ -5,8 +5,6 @@ from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial - -from hypothesis import given import hypothesis.strategies as st import numpy as np import unittest diff --git a/caffe2/python/operator_test/percentile_op_test.py b/caffe2/python/operator_test/percentile_op_test.py index d81b0a963185..40c4192e21e9 100644 --- a/caffe2/python/operator_test/percentile_op_test.py +++ b/caffe2/python/operator_test/percentile_op_test.py @@ -3,7 +3,7 @@ -from caffe2.python import core, workspace, dyndep +from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import numpy as np diff --git a/caffe2/python/operator_test/rand_quantization_op_test.py b/caffe2/python/operator_test/rand_quantization_op_test.py index e244f77149e1..a702ab41577f 100644 --- a/caffe2/python/operator_test/rand_quantization_op_test.py +++ b/caffe2/python/operator_test/rand_quantization_op_test.py @@ -6,7 +6,6 @@ import numpy as np import struct import unittest -import os from hypothesis import given, example import hypothesis.strategies as st diff --git a/caffe2/python/operator_test/recurrent_network_test.py b/caffe2/python/operator_test/recurrent_network_test.py index 13650e6cad4e..33ada4d6881c 100644 --- a/caffe2/python/operator_test/recurrent_network_test.py +++ b/caffe2/python/operator_test/recurrent_network_test.py @@ -11,9 +11,6 @@ import hypothesis.strategies as st import numpy as np -import os -import unittest - class RecurrentNetworkTest(serial.SerializedTestCase): @given(T=st.integers(1, 4), n=st.integers(1, 5), diff --git a/caffe2/python/operator_test/reduce_ops_test.py b/caffe2/python/operator_test/reduce_ops_test.py index 727631befe89..7b79b3b81aed 100644 --- a/caffe2/python/operator_test/reduce_ops_test.py +++ b/caffe2/python/operator_test/reduce_ops_test.py @@ -11,7 +11,6 @@ import hypothesis.strategies as st import numpy as np import itertools as it -import unittest class TestReduceOps(serial.SerializedTestCase): diff --git a/caffe2/python/operator_test/reduction_ops_test.py b/caffe2/python/operator_test/reduction_ops_test.py index 7d4287df6609..6a99f2b27d42 100644 --- a/caffe2/python/operator_test/reduction_ops_test.py +++ b/caffe2/python/operator_test/reduction_ops_test.py @@ -3,7 +3,6 @@ -from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace from hypothesis import assume, given, settings import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/roi_align_rotated_op_test.py b/caffe2/python/operator_test/roi_align_rotated_op_test.py index c74157a039b0..ea835acead61 100644 --- a/caffe2/python/operator_test/roi_align_rotated_op_test.py +++ b/caffe2/python/operator_test/roi_align_rotated_op_test.py @@ -3,7 +3,6 @@ -from caffe2.proto import caffe2_pb2 from caffe2.python import core, workspace from hypothesis import given import caffe2.python.hypothesis_test_util as 
hu diff --git a/caffe2/python/operator_test/sequence_ops_test.py b/caffe2/python/operator_test/sequence_ops_test.py index 4609473f91f0..65c0669abfb0 100644 --- a/caffe2/python/operator_test/sequence_ops_test.py +++ b/caffe2/python/operator_test/sequence_ops_test.py @@ -11,7 +11,6 @@ import hypothesis.strategies as st import numpy as np import unittest -import os def _gen_test_add_padding(with_pad_data=True, diff --git a/caffe2/python/operator_test/spatial_bn_op_test.py b/caffe2/python/operator_test/spatial_bn_op_test.py index 35f7bd2a5e29..21a530346329 100644 --- a/caffe2/python/operator_test/spatial_bn_op_test.py +++ b/caffe2/python/operator_test/spatial_bn_op_test.py @@ -3,7 +3,6 @@ -from caffe2.proto import caffe2_pb2 from caffe2.python import brew, core, utils, workspace import caffe2.python.hip_test_util as hiputl import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/operator_test/square_root_divide_op_test.py b/caffe2/python/operator_test/square_root_divide_op_test.py index 5bd6cb1d08f8..51f328c95f5f 100644 --- a/caffe2/python/operator_test/square_root_divide_op_test.py +++ b/caffe2/python/operator_test/square_root_divide_op_test.py @@ -5,7 +5,6 @@ from caffe2.python import core from functools import partial -from hypothesis import given from hypothesis import strategies as st import caffe2.python.hypothesis_test_util as hu diff --git a/caffe2/python/optimizer_test_util.py b/caffe2/python/optimizer_test_util.py index 02276b08c176..beb8a3781832 100644 --- a/caffe2/python/optimizer_test_util.py +++ b/caffe2/python/optimizer_test_util.py @@ -8,7 +8,6 @@ import unittest import numpy as np from caffe2.python import brew, core, workspace, cnn, optimizer -from caffe2.proto import caffe2_pb2 from caffe2.python.modeling.initializers import ( Initializer, PseudoFP16Initializer) diff --git a/caffe2/python/pybind_state.h b/caffe2/python/pybind_state.h index b3926e941194..6513f216a9be 100644 --- a/caffe2/python/pybind_state.h +++ b/caffe2/python/pybind_state.h @@ -232,7 +232,6 @@ class TensorFeeder : public BlobFeederBase { for (int i = 0; i < tensor.numel(); ++i) { char* str; Py_ssize_t strSize; -#if PY_MAJOR_VERSION > 2 if (PyBytes_Check(input[i])) { CAFFE_ENFORCE( PyBytes_AsStringAndSize(input[i], &str, &strSize) != -1, @@ -246,11 +245,6 @@ class TensorFeeder : public BlobFeederBase { } else { CAFFE_THROW("Unsupported python object type passed into ndarray."); } -#else - CAFFE_ENFORCE( - PyBytes_AsStringAndSize(input[i], &str, &strSize) != -1, - "Unsupported python object type passed into ndarray."); -#endif // PY_MAJOR_VERSION > 2 outPtr[i] = std::string(str, strSize); } break; @@ -342,18 +336,12 @@ class PythonOpBase : public Operator { try { builder_call = loads(py::bytes(pickled)).cast(); } catch (const py::error_already_set& e) { -#if PY_MAJOR_VERSION >= 3 LOG(INFO) << "Cannot unpickle python operator: " << e.what(); LOG(INFO) << "Try latin1 encoding for python3 run"; // to use the `_a` literal for arguments using namespace pybind11::literals; builder_call = loads(py::bytes(pickled), "encoding"_a = "latin1") .template cast(); -#else - // for py2, simply re-throw the exception, as there is no encoding - // argument for pickle.loads - throw; -#endif } CAFFE_ENFORCE(builder_call); CAFFE_ENFORCE_EQ(py::len(builder_call), 3); diff --git a/caffe2/python/rnn/lstm_comparison.py b/caffe2/python/rnn/lstm_comparison.py index dee96413dbe5..34fddbc1a66e 100644 --- a/caffe2/python/rnn/lstm_comparison.py +++ b/caffe2/python/rnn/lstm_comparison.py @@ -2,7 +2,6 @@ -from caffe2.proto 
import caffe2_pb2 from caffe2.python import workspace, core, lstm_benchmark, utils from copy import copy diff --git a/caffe2/python/rnn_cell.py b/caffe2/python/rnn_cell.py index 9c85d0efd2a5..f6da5e126119 100644 --- a/caffe2/python/rnn_cell.py +++ b/caffe2/python/rnn_cell.py @@ -7,7 +7,6 @@ import functools import inspect -import itertools import logging import numpy as np import random diff --git a/caffe2/python/scope_test.py b/caffe2/python/scope_test.py index 9bd69eb32902..bf3c8e9a0d06 100644 --- a/caffe2/python/scope_test.py +++ b/caffe2/python/scope_test.py @@ -4,7 +4,6 @@ from caffe2.python import scope, core, workspace -from caffe2.proto import caffe2_pb2 import unittest import threading diff --git a/caffe2/python/test/executor_test_util.py b/caffe2/python/test/executor_test_util.py index ba10247eaa2e..abf63626a7fa 100644 --- a/caffe2/python/test/executor_test_util.py +++ b/caffe2/python/test/executor_test_util.py @@ -14,7 +14,6 @@ import time import numpy as np -from hypothesis import settings CI_MAX_EXAMPLES = 2 diff --git a/caffe2/python/test/inference_lstm_op_test.py b/caffe2/python/test/inference_lstm_op_test.py index 20caab9ba78b..768827bd8876 100644 --- a/caffe2/python/test/inference_lstm_op_test.py +++ b/caffe2/python/test/inference_lstm_op_test.py @@ -1,10 +1,9 @@ #!/usr/bin/env python3 -import inspect import hypothesis.strategies as st import numpy as np import torch -from caffe2.python import core, workspace +from caffe2.python import core from caffe2.python.test_util import TestCase from hypothesis import given, settings from torch import nn diff --git a/caffe2/python/test/python_protobuf_test.py b/caffe2/python/test/python_protobuf_test.py index 7790e0f6d8f5..a407f33fe253 100644 --- a/caffe2/python/test/python_protobuf_test.py +++ b/caffe2/python/test/python_protobuf_test.py @@ -5,9 +5,6 @@ # make sure we use cpp implementation of protobuf import os os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "cpp" - -# import cpp extension first -from caffe2.python import core # then import protobuf from caffe2.proto import caffe2_pb2, metanet_pb2 diff --git a/caffe2/python/trt/test_pt_onnx_trt.py b/caffe2/python/trt/test_pt_onnx_trt.py index 96f1ad76f6b7..5e6abb5c4d0b 100644 --- a/caffe2/python/trt/test_pt_onnx_trt.py +++ b/caffe2/python/trt/test_pt_onnx_trt.py @@ -15,17 +15,13 @@ import os import unittest -from typing import List, Any from PIL import Image import numpy as np import torch -from torch.onnx import OperatorExportTypes import torchvision.models as models import pycuda.driver as cuda -# This import causes pycuda to automatically manage CUDA context creation and cleanup. 
-import pycuda.autoinit import tensorrt as trt TRT_LOGGER = trt.Logger(trt.Logger.WARNING) diff --git a/caffe2/python/trt/test_trt.py b/caffe2/python/trt/test_trt.py index 39d37ca9fa0a..2782cca7c13f 100644 --- a/caffe2/python/trt/test_trt.py +++ b/caffe2/python/trt/test_trt.py @@ -7,7 +7,7 @@ from caffe2.python import core, workspace import onnx import onnx.defs -from onnx.helper import make_node, make_graph, make_tensor, make_tensor_value_info, make_model +from onnx.helper import make_node, make_graph, make_tensor_value_info, make_model from onnx.backend.base import namedtupledict from caffe2.python.models.download import ModelDownloader import caffe2.python.onnx.backend as c2 @@ -16,7 +16,6 @@ from caffe2.python.onnx.tests.test_utils import TestCase import numpy as np import os.path -import json import time import unittest import tarfile diff --git a/caffe2/python/trt/transform.py b/caffe2/python/trt/transform.py index 0936941aac03..1b201007daab 100644 --- a/caffe2/python/trt/transform.py +++ b/caffe2/python/trt/transform.py @@ -12,9 +12,7 @@ from caffe2.proto import caffe2_pb2 -from caffe2.python.onnx.helper import c2_native_run_net, c2_native_run_op -from caffe2.python import core, workspace -import caffe2.python.onnx.frontend as c2_front +from caffe2.python import workspace import caffe2.python._import_c_extension as C import numpy as np diff --git a/caffe2/python/utils.py b/caffe2/python/utils.py index 947dd9bf296d..289d107303fa 100644 --- a/caffe2/python/utils.py +++ b/caffe2/python/utils.py @@ -6,12 +6,12 @@ from caffe2.proto import caffe2_pb2 -from caffe2.python.compatibility import container_abcs from future.utils import viewitems from google.protobuf.message import DecodeError, Message from google.protobuf import text_format import sys +import collections import copy import functools import numpy as np @@ -126,7 +126,7 @@ def MakeArgument(key, value): """Makes an argument based on the value type.""" argument = caffe2_pb2.Argument() argument.name = key - iterable = isinstance(value, container_abcs.Iterable) + iterable = isinstance(value, collections.abc.Iterable) # Fast tracking common use case where a float32 array of tensor parameters # needs to be serialized. The entire array is guaranteed to have the same diff --git a/caffe2/serialize/inline_container.h b/caffe2/serialize/inline_container.h index a34a6db70115..87c3151bbb76 100644 --- a/caffe2/serialize/inline_container.h +++ b/caffe2/serialize/inline_container.h @@ -12,6 +12,7 @@ #include "caffe2/serialize/istream_adapter.h" #include "caffe2/serialize/read_adapter_interface.h" +#include "caffe2/serialize/versions.h" extern "C" { typedef struct mz_zip_archive mz_zip_archive; @@ -90,68 +91,6 @@ typedef struct mz_zip_archive mz_zip_archive; namespace caffe2 { namespace serialize { -constexpr uint64_t kMinSupportedFileFormatVersion = 0x1L; -constexpr uint64_t kMaxSupportedFileFormatVersion = 0x5L; - -// Versions (i.e. why was the version number bumped?) - -// Note [Dynamic Versions and torch.jit.save vs. torch.save] -// -// Our versioning scheme has a "produced file format version" which -// describes how an archive is to be read. The version written in an archive -// is at least this current produced file format version, but may be greater -// if it includes certain symbols. We refer to these conditional versions -// as "dynamic," since they are identified at runtime. -// -// Dynamic versioning is useful when an operator's semantics are updated. -// When using torch.jit.save we want those semantics to be preserved. 
If -// we bumped the produced file format version on every change, however, -// then older versions of PyTorch couldn't read even simple archives, like -// a single tensor, from newer versions of PyTorch. Instead, we -// assign dynamic versions to these changes that override the -// produced file format version as needed. That is, when the semantics -// of torch.div changed it was assigned dynamic version 4, and when -// torch.jit.saving modules that use torch.div those archives also have -// (at least) version 4. This prevents earlier versions of PyTorch -// from accidentally performing the wrong kind of division. Modules -// that don't use torch.div or other operators with dynamic versions -// can write the produced file format version, and these programs will -// run as expected on earlier versions of PyTorch. -// -// While torch.jit.save attempts to preserve operator semantics, -// torch.save does not. torch.save is analogous to pickling Python, so -// a function that uses torch.div will have different behavior if torch.saved -// and torch.loaded across PyTorch versions. From a technical perspective, -// torch.save ignores dynamic versioning. - -// 1. Initial version -// 2. Removed op_version_set version numbers -// 3. Added type tags to pickle serialization of container types -// 4. (Dynamic) Stopped integer division using torch.div -// (a versioned symbol preserves the historic behavior of versions 1--3) -// 5. (Dynamic) Stops torch.full inferring a floating point dtype -// when given bool or integer fill values. -constexpr uint64_t kProducedFileFormatVersion = 0x3L; - -// the version we write when the archive contains bytecode. -// It must be higher or eq to kProducedFileFormatVersion. -// Because torchscript changes is likely introduce bytecode change. -// If kProducedFileFormatVersion is increased, kProducedBytecodeVersion -// should be increased too. The relationship is: -// kMaxSupportedFileFormatVersion >= (most likely ==) kProducedBytecodeVersion -// >= kProducedFileFormatVersion -constexpr uint64_t kProducedBytecodeVersion = 0x4L; - -static_assert(kProducedBytecodeVersion >= kProducedFileFormatVersion, - "kProducedBytecodeVersion must be higher or equal to kProducedFileFormatVersion."); - -// Introduce kMinSupportedBytecodeVersion for limited backward compatibility -// support of bytecode. If -// kMinSupportedBytecodeVersion <= model_version <= kProducedBytecodeVersion (in loader), -// we should support this model_version. For example, we provide a wrapper to -// handle an updated operator. -constexpr uint64_t kMinSupportedBytecodeVersion = 0x3L; - class TORCH_API PyTorchStreamReader final { public: explicit PyTorchStreamReader(const std::string& file_name); diff --git a/caffe2/serialize/versions.h b/caffe2/serialize/versions.h new file mode 100644 index 000000000000..4da4b2c50305 --- /dev/null +++ b/caffe2/serialize/versions.h @@ -0,0 +1,68 @@ +#pragma once + +namespace caffe2 { +namespace serialize { + +constexpr uint64_t kMinSupportedFileFormatVersion = 0x1L; +constexpr uint64_t kMaxSupportedFileFormatVersion = 0x5L; + +// Versions (i.e. why was the version number bumped?) + +// Note [Dynamic Versions and torch.jit.save vs. torch.save] +// +// Our versioning scheme has a "produced file format version" which +// describes how an archive is to be read. The version written in an archive +// is at least this current produced file format version, but may be greater +// if it includes certain symbols. 
We refer to these conditional versions +// as "dynamic," since they are identified at runtime. +// +// Dynamic versioning is useful when an operator's semantics are updated. +// When using torch.jit.save we want those semantics to be preserved. If +// we bumped the produced file format version on every change, however, +// then older versions of PyTorch couldn't read even simple archives, like +// a single tensor, from newer versions of PyTorch. Instead, we +// assign dynamic versions to these changes that override the +// produced file format version as needed. That is, when the semantics +// of torch.div changed it was assigned dynamic version 4, and when +// torch.jit.saving modules that use torch.div, those archives also have +// (at least) version 4. This prevents earlier versions of PyTorch +// from accidentally performing the wrong kind of division. Modules +// that don't use torch.div or other operators with dynamic versions +// can write the produced file format version, and these programs will +// run as expected on earlier versions of PyTorch. +// +// While torch.jit.save attempts to preserve operator semantics, +// torch.save does not. torch.save is analogous to pickling Python, so +// a function that uses torch.div will have different behavior if torch.saved +// and torch.loaded across PyTorch versions. From a technical perspective, +// torch.save ignores dynamic versioning. + +// 1. Initial version +// 2. Removed op_version_set version numbers +// 3. Added type tags to pickle serialization of container types +// 4. (Dynamic) Stopped integer division using torch.div +// (a versioned symbol preserves the historic behavior of versions 1--3) +// 5. (Dynamic) Stops torch.full inferring a floating point dtype +// when given bool or integer fill values. +constexpr uint64_t kProducedFileFormatVersion = 0x3L; + +// the version we write when the archive contains bytecode. +// It must be higher than or equal to kProducedFileFormatVersion, +// because TorchScript changes are likely to introduce bytecode changes. +// If kProducedFileFormatVersion is increased, kProducedBytecodeVersion +// should be increased too. The relationship is: +// kMaxSupportedFileFormatVersion >= (most likely ==) kProducedBytecodeVersion +// >= kProducedFileFormatVersion +constexpr uint64_t kProducedBytecodeVersion = 0x4L; + +static_assert(kProducedBytecodeVersion >= kProducedFileFormatVersion, + "kProducedBytecodeVersion must be higher or equal to kProducedFileFormatVersion."); + +// Introduce kMinSupportedBytecodeVersion for limited backward compatibility +// support of bytecode. If +// kMinSupportedBytecodeVersion <= model_version <= kProducedBytecodeVersion (in loader), +// we should support this model_version. For example, we provide a wrapper to +// handle an updated operator. +constexpr uint64_t kMinSupportedBytecodeVersion = 0x3L; +} // namespace serialize +} // namespace caffe2 diff --git a/docker/caffe2/jenkins/common/install_python.sh b/docker/caffe2/jenkins/common/install_python.sh index 48a47b271107..19633d451ab3 100755 --- a/docker/caffe2/jenkins/common/install_python.sh +++ b/docker/caffe2/jenkins/common/install_python.sh @@ -135,11 +135,6 @@ if [ -z "${INSTALL_SETUPTOOLS}" ]; then pip install -U pip setuptools!=38.5.2 fi -# tornado 5.0 requires Python 2.7.9+ or 3.4+ -if [[ $($PYTHON -c 'import sys; print(int(sys.version_info <= (2, 7, 9) or sys.version_info <= (3, 4)))' == 1) ]]; then - pip install 'tornado<5' -fi - # Need networkx 2.0 because bellmand_ford was moved in 2.1 .
Scikit-image by # defaults installs the most recent networkx version, so we install this lower # version explicitly before scikit-image pulls it in as a dependency diff --git a/docs/caffe2/process.py b/docs/caffe2/process.py index 9fa37e5fbb5a..3b94b9d38502 100644 --- a/docs/caffe2/process.py +++ b/docs/caffe2/process.py @@ -1,20 +1,21 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 ## @package process # Module doxygen.process # Script to insert preamble for doxygen and regen API docs -import glob, os, shutil +import os +import shutil # Module caffe2...caffe2.python.control_test -def insert(originalfile,first_line,description): - with open(originalfile,'r') as f: +def insert(originalfile, first_line, description): + with open(originalfile, 'r') as f: f1 = f.readline() - if(f1.find(first_line)<0): + if(f1.find(first_line) < 0): docs = first_line + description + f1 - with open('newfile.txt','w') as f2: + with open('newfile.txt', 'w') as f2: f2.write(docs) f2.write(f.read()) - os.rename('newfile.txt',originalfile) + os.rename('newfile.txt', originalfile) else: print('already inserted') @@ -29,15 +30,15 @@ def insert(originalfile,first_line,description): for file in files: if (file.endswith(".py") and not file.endswith("_test.py") and not file.endswith("__.py")): filepath = os.path.join(root, file) - print("filepath: " + filepath) + print(("filepath: " + filepath)) directory = os.path.dirname(filepath)[2:] - directory = directory.replace("/",".") - print "directory: " + directory + directory = directory.replace("/", ".") + print("directory: " + directory) name = os.path.splitext(file)[0] first_line = "## @package " + name description = "\n# Module " + directory + "." + name + "\n" - print first_line,description - insert(filepath,first_line,description) + print(first_line, description) + insert(filepath, first_line, description) if os.path.exists("doxygen/doxygen-python"): print("Looks like you ran this before, so we need to cleanup those old files...") diff --git a/docs/source/linalg.rst b/docs/source/linalg.rst index 36f91627d522..d6de2373ad57 100644 --- a/docs/source/linalg.rst +++ b/docs/source/linalg.rst @@ -22,3 +22,4 @@ Functions .. autofunction:: solve .. autofunction:: tensorinv .. autofunction:: tensorsolve +.. autofunction:: qr diff --git a/docs/source/onnx.rst b/docs/source/onnx.rst index 49bbc1df45a0..a1f88a64c83d 100644 --- a/docs/source/onnx.rst +++ b/docs/source/onnx.rst @@ -755,7 +755,7 @@ but intuitively the interface they provide looks like this:: ONNX outputs whose values correspond to the original PyTorch return values of the autograd Function (or None if an output is not supported by ONNX). - Arguments: + Args: g (Graph): graph to write the ONNX representation into inputs (Value...): list of values representing the variables which contain the inputs for this function @@ -782,7 +782,7 @@ but intuitively the interface they provide looks like this:: The set of operators and the inputs/attributes they take is documented at https://github.com/onnx/onnx/blob/master/docs/Operators.md - Arguments: + Args: opname (string): The ONNX operator name, e.g., `Abs` or `Add`. args (Value...): The inputs to the operator; usually provided as arguments to the `symbolic` definition. diff --git a/docs/source/quantization.rst b/docs/source/quantization.rst index a389de60416a..1cac90ffab86 100644 --- a/docs/source/quantization.rst +++ b/docs/source/quantization.rst @@ -530,6 +530,71 @@ Best Practices ``fbgemm`` backend. 
This argument prevents overflow on some int8 instructions by reducing the range of quantized data type by 1 bit. +Common Errors +--------------------------------------- + +Passing a non-quantized Tensor into a quantized kernel +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If you see an error similar to:: + + RuntimeError: Could not run 'quantized::some_operator' with arguments from the 'CPU' backend... + +This means that you are trying to pass a non-quantized Tensor to a quantized +kernel. A common workaround is to use ``torch.quantization.QuantStub`` to +quantize the tensor. This needs to be done manually in Eager mode quantization. +An e2e example:: + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = torch.quantization.QuantStub() + self.conv = torch.nn.Conv2d(1, 1, 1) + + def forward(self, x): + # during the convert step, this will be replaced with a + # `quantize_per_tensor` call + x = self.quant(x) + x = self.conv(x) + return x + +Passing a quantized Tensor into a non-quantized kernel +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If you see an error similar to:: + + RuntimeError: Could not run 'aten::thnn_conv2d_forward' with arguments from the 'QuantizedCPU' backend. + +This means that you are trying to pass a quantized Tensor to a non-quantized +kernel. A common workaround is to use ``torch.quantization.DeQuantStub`` to +dequantize the tensor. This needs to be done manually in Eager mode quantization. +An e2e example:: + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.quant = torch.quantization.QuantStub() + self.conv1 = torch.nn.Conv2d(1, 1, 1) + # this module will not be quantized (see `qconfig = None` logic below) + self.conv2 = torch.nn.Conv2d(1, 1, 1) + self.dequant = torch.quantization.DeQuantStub() + + def forward(self, x): + # during the convert step, this will be replaced with a + # `quantize_per_tensor` call + x = self.quant(x) + x = self.conv1(x) + # during the convert step, this will be replaced with a + # `dequantize` call + x = self.dequant(x) + x = self.conv2(x) + return x + + m = M() + m.qconfig = some_qconfig + # turn off quantization for conv2 + m.conv2.qconfig = None + Modules that provide quantization functions and classes ------------------------------------------------------- diff --git a/mypy-strict.ini b/mypy-strict.ini index 42fc73abf1cc..7cc6fff83577 100644 --- a/mypy-strict.ini +++ b/mypy-strict.ini @@ -31,9 +31,11 @@ strict_equality = True files = tools/codegen/gen.py, tools/autograd/gen_annotated_fn_args.py, + tools/autograd/gen_autograd.py, tools/autograd/gen_python_functions.py, tools/autograd/gen_trace_type.py, tools/autograd/gen_variable_factories.py, + tools/autograd/gen_variable_type.py, tools/autograd/load_derivatives.py, torch/utils/benchmark/utils/common.py, torch/utils/benchmark/utils/timer.py, diff --git a/mypy.ini b/mypy.ini index ad15421ed168..2873c0fc6cea 100644 --- a/mypy.ini +++ b/mypy.ini @@ -17,8 +17,13 @@ check_untyped_defs = True files = torch, caffe2, + test/test_bundled_images.py, + test/test_bundled_inputs.py, test/test_complex.py, + test/test_dataset.py, + test/test_expecttest.py, test/test_futures.py, + test/test_numpy_interop.py, test/test_torch.py, test/test_type_hints.py, test/test_type_info.py @@ -95,24 +100,6 @@ ignore_errors = True [mypy-torch._utils] ignore_errors = True -[mypy-torch._overrides] -ignore_errors = True - -[mypy-torch.utils.tensorboard._caffe2_graph] -ignore_errors = True - -[mypy-torch.contrib._tensorboard_vis] 
-ignore_errors = True - -[mypy-torch.nn.utils.prune] -ignore_errors = True - -[mypy-torch.utils.show_pickle] -ignore_errors = True - -[mypy-torch.utils.hipify.hipify_python] -ignore_errors = True - [mypy-torch.utils.benchmark.examples.*] ignore_errors = True @@ -128,6 +115,12 @@ ignore_errors = True [mypy-torch.overrides] ignore_errors = True +# +# Adding type annotations to caffe2 is probably not worth the effort +# only work on this if you have a specific reason for it, otherwise +# leave these ignores as they are. +# + [mypy-caffe2.python.*] ignore_errors = True diff --git a/scripts/model_zoo/update-models-from-caffe2.py b/scripts/model_zoo/update-models-from-caffe2.py index fb582a047bc6..d3e46e449d8a 100644 --- a/scripts/model_zoo/update-models-from-caffe2.py +++ b/scripts/model_zoo/update-models-from-caffe2.py @@ -6,15 +6,12 @@ import caffe2.python.workspace as c2_workspace import glob import json -import math import numpy as np import onnx import caffe2.python.onnx.frontend import caffe2.python.onnx.backend import os import shutil -import subprocess -import sys import tarfile import tempfile @@ -25,7 +22,6 @@ from caffe2.python.models.download import downloadFromURLToFile, getURLFromName, deleteDirectory from caffe2.proto import caffe2_pb2 from onnx import numpy_helper -from filechunkio import FileChunkIO """A script converting Caffe2 models to ONNX, and updating ONNX model zoos. diff --git a/scripts/release_notes/categorize.py b/scripts/release_notes/categorize.py index b72eb9094b7b..985d11f2e2bd 100644 --- a/scripts/release_notes/categorize.py +++ b/scripts/release_notes/categorize.py @@ -1,8 +1,7 @@ -import json import argparse import os import textwrap -from common import dict_to_features, categories, topics, get_features, CommitDataCache +from common import categories, topics, CommitDataCache from commitlist import CommitList class Categorizer: diff --git a/scripts/release_notes/commitlist.py b/scripts/release_notes/commitlist.py index fda7c913addd..0a76f896f217 100644 --- a/scripts/release_notes/commitlist.py +++ b/scripts/release_notes/commitlist.py @@ -1,6 +1,6 @@ import argparse from common import run, topics -from collections import namedtuple, defaultdict +from collections import defaultdict import os import csv import pprint diff --git a/setup.py b/setup.py index 01f173d6825b..50983a89ad55 100644 --- a/setup.py +++ b/setup.py @@ -186,7 +186,7 @@ python_min_version_str)) sys.exit(-1) -from setuptools import setup, Extension, distutils, find_packages +from setuptools import setup, Extension, find_packages from collections import defaultdict from distutils import core from distutils.core import Distribution @@ -892,6 +892,7 @@ def print_box(msg): 'include/torch/csrc/jit/serialization/*.h', 'include/torch/csrc/jit/python/*.h', 'include/torch/csrc/jit/testing/*.h', + 'include/torch/csrc/jit/tensorexpr/*.h', 'include/torch/csrc/onnx/*.h', 'include/torch/csrc/utils/*.h', 'include/pybind11/*.h', diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index ce8d691562b7..2d5d50096c81 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -32,6 +32,7 @@ # Internal, profiler-specific ops ("profiler::_call_end_callbacks_on_jit_fut*", datetime.date(9999, 1, 1)), ("profiler::_record_function_enter", datetime.date(9999, 1, 1)), + ("aten::_qr_helper", datetime.date(2021, 1, 31)), ("aten::fft", datetime.date(2021, 1, 31)), 
("aten::ifft", datetime.date(2021, 1, 31)), ("aten::irfft", datetime.date(2021, 1, 31)), @@ -71,7 +72,7 @@ def allow_listed(schema, allow_list): dont_parse_list = [ ("_TorchScriptTesting.*", datetime.date(2099, 9, 17)), ("test_backend", datetime.date(2099, 9, 17)), - ("c10d.frontend", datetime.date(2020, 12, 30)), + ("dist_c10d", datetime.date(2021, 1, 30)), ] diff --git a/test/cpp/api/autograd.cpp b/test/cpp/api/autograd.cpp index e4bb96ece6fb..3f79c771c2be 100644 --- a/test/cpp/api/autograd.cpp +++ b/test/cpp/api/autograd.cpp @@ -175,7 +175,7 @@ TEST(AutogradAPITests, AnomalyMode) { auto y = x.pow(1.5); auto gr = grad({y}, {x}, {}, /*retain_graph=*/true, /*create_backward=*/true); - ASSERT_THROWS_WITH(grad({gr[0]}, {x});, "returned nan"); + ASSERT_THROWS_WITH(grad({gr[0]}, {x}, {torch::tensor({0.0})});, "returned nan"); auto msgs = warnings.messages(); ASSERT_EQ(msgs.size(), 2); ASSERT_TRUE( diff --git a/test/cpp/api/misc.cpp b/test/cpp/api/misc.cpp index 160075d0d268..a8d6320e9533 100644 --- a/test/cpp/api/misc.cpp +++ b/test/cpp/api/misc.cpp @@ -82,3 +82,11 @@ TEST_F(AutogradTest, CanPassCustomGradientInputs) { z.sum().backward(torch::ones({}) * 2); ASSERT_TRUE(x.grad().allclose(y * 2)); } + +TEST(UtilsTest, AmbiguousOperatorDefaults) { + auto tmp = at::empty({}, at::kCPU); + at::_test_ambiguous_defaults(tmp); + at::_test_ambiguous_defaults(tmp, 1); + at::_test_ambiguous_defaults(tmp, 1, 1); + at::_test_ambiguous_defaults(tmp, 2, "2"); +} diff --git a/test/cpp/api/tensor_indexing.cpp b/test/cpp/api/tensor_indexing.cpp index efb153fbf481..03600c5c882e 100644 --- a/test/cpp/api/tensor_indexing.cpp +++ b/test/cpp/api/tensor_indexing.cpp @@ -83,27 +83,27 @@ TEST(TensorIndexingTest, TestNoIndices) { ASSERT_THROWS_WITH(tensor.index_put_(indices, value), "Passing an empty index list to Tensor::index_put_() is not valid syntax"); } -TEST(TensorIndexingTest, TestAdvancedIndexingWithArrayRefOfTensor) { +TEST(TensorIndexingTest, TestAdvancedIndexingWithListOfTensor) { { torch::Tensor tensor = torch::randn({20, 20}); torch::Tensor index = torch::arange(10, torch::kLong).cpu(); - torch::Tensor result_with_array_ref = tensor.index(at::ArrayRef({index})); + torch::Tensor result = at::index(tensor, {index}); torch::Tensor result_with_init_list = tensor.index({index}); - ASSERT_TRUE(result_with_array_ref.equal(result_with_init_list)); + ASSERT_TRUE(result.equal(result_with_init_list)); } { torch::Tensor tensor = torch::randn({20, 20}); torch::Tensor index = torch::arange(10, torch::kLong).cpu(); - torch::Tensor result_with_array_ref = tensor.index_put_(at::ArrayRef({index}), torch::ones({20})); + torch::Tensor result = at::index_put_(tensor, {index}, torch::ones({20})); torch::Tensor result_with_init_list = tensor.index_put_({index}, torch::ones({20})); - ASSERT_TRUE(result_with_array_ref.equal(result_with_init_list)); + ASSERT_TRUE(result.equal(result_with_init_list)); } { torch::Tensor tensor = torch::randn({20, 20}); torch::Tensor index = torch::arange(10, torch::kLong).cpu(); - torch::Tensor result_with_array_ref = tensor.index_put_(at::ArrayRef({index}), torch::ones({1, 20})); + torch::Tensor result = at::index_put_(tensor, {index}, torch::ones({1, 20})); torch::Tensor result_with_init_list = tensor.index_put_({index}, torch::ones({1, 20})); - ASSERT_TRUE(result_with_array_ref.equal(result_with_init_list)); + ASSERT_TRUE(result.equal(result_with_init_list)); } } @@ -173,7 +173,7 @@ TEST(TensorIndexingTest, TestBoolIndices) { TEST(TensorIndexingTest, TestBoolIndicesAccumulate) { auto mask = 
torch::zeros({10}, torch::kBool); auto y = torch::ones({10, 10}); - y.index_put_({mask}, y.index({mask}), /*accumulate=*/true); + y.index_put_({mask}, {y.index({mask})}, /*accumulate=*/true); assert_tensor_equal(y, torch::ones({10, 10})); } diff --git a/test/cpp/jit/test_save_load.cpp b/test/cpp/jit/test_save_load.cpp index 2e59358b4e00..e102a6ff767c 100644 --- a/test/cpp/jit/test_save_load.cpp +++ b/test/cpp/jit/test_save_load.cpp @@ -120,5 +120,33 @@ TEST(SerializationTest, TypeTags) { } } +TEST(SerializationTest, TestJitStream_CUDA) { + torch::jit::Module model; + std::vector inputs; + // Deserialize the ScriptModule from a file using torch::jit::load(). + // Load the scripted model. This should have been generated by tests_setup.py + // Refer: TorchSaveJitStream_CUDA in test/cpp/jit/tests_setup.py + model = torch::jit::load("saved_stream_model.pt"); + + auto output = model.forward(inputs); + auto list_of_elements = output.toTuple()->elements(); + auto is_stream_s = list_of_elements[0].toBool(); + + // a, b: These are the two input tensors + // c: This is the output tensor generated by the operation torch.cat(a, b) + auto a = list_of_elements[1].toTensor(); + auto b = list_of_elements[2].toTensor(); + auto c = list_of_elements[3].toTensor(); + // op: this is used to verify that the cat operation produced the same results + // as that on the GPU with torch.cat + auto op = at::cat({a, b}, 0); + + // Check if the stream is set + ASSERT_TRUE(is_stream_s); + // Check if the sizes of the outputs (op and c) are the same on the GPU and CPU + ASSERT_EQ(op.sizes(), c.sizes()); + // Check if both the output tensors are equal + ASSERT_TRUE(op.equal(c)); +} } // namespace jit } // namespace torch diff --git a/test/cpp/jit/tests_setup.py b/test/cpp/jit/tests_setup.py index 68871d1c21d2..928a06d9b5a0 100644 --- a/test/cpp/jit/tests_setup.py +++ b/test/cpp/jit/tests_setup.py @@ -63,11 +63,38 @@ def setup(self): torch.save(value, self.path, _use_new_zipfile_serialization=False) +class TorchSaveJitStream_CUDA(FileSetup): + path = 'saved_stream_model.pt' + + def setup(self): + if not torch.cuda.is_available(): + return + + class Model(torch.nn.Module): + def forward(self): + device_index = torch.cuda._current_device() + s = torch.jit.cuda.Stream(device_index, 0) + a = torch.rand(3, 4, device="cuda") + b = torch.rand(3, 4, device="cuda") + + with torch.jit.cuda.stream(s): + is_stream_s = torch.cuda.current_stream(s.device_index()).id() == s.id() + c = torch.cat((a, b), 0).to("cuda") + s.synchronize() + return is_stream_s, a, b, c + + model = Model() + + # Script the model and save + script_model = torch.jit.script(model) + torch.jit.save(script_model, self.path) + tests = [ EvalModeForLoadedModule(), SerializationInterop(), TorchSaveError(), + TorchSaveJitStream_CUDA() ] def setup(): diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp index cf658ad488f6..902c2a701197 100644 --- a/test/cpp/tensorexpr/test_kernel.cpp +++ b/test/cpp/tensorexpr/test_kernel.cpp @@ -19,6 +19,65 @@ namespace jit { using namespace torch::indexing; using namespace torch::jit::tensorexpr; +TEST(Kernel, InliningIntermediates) { + // here, each mul has only one use, so it should be completely inlined + { + const auto graph_string = R"IR( + graph(%0 : Float(5, 3, strides=[3, 1], device=cpu), + %1 : Float(5, 3, strides=[3, 1], device=cpu)): + %2 : Float(5, 3, strides=[3, 1]) = aten::mul(%0, %1) + %one : int = prim::Constant[value=1]() + %4 : Float(5, 3, strides=[3, 1]) = aten::mul(%0, %2) + %5: Float(5, 3,
strides=[3, 1]) = aten::add(%4, %1, %one) + return (%5))IR"; + KernelScope kernel_scope; + auto graph = std::make_shared(); + parseIR(graph_string, &*graph); + TensorExprKernel k(graph); + auto stmt = k.getCodeGenStmt(); + std::ostringstream oss; + oss << *stmt; + torch::jit::testing::FileCheck().check_not("aten_mul")->run(oss.str()); + } + { + const auto graph_template = R"IR( + graph(%0 : Float(5, 3, strides=[3, 1], device=${device}), + %1 : Float(5, 3, strides=[3, 1], device=${device})): + %2 : Float(5, 3, strides=[3, 1]) = aten::mul(%0, %1) + %one : int = prim::Constant[value=1]() + %3 : Float(5, 3, strides=[3, 1]) = aten::sub(%0, %2, %one) + %4 : Float(5, 3, strides=[3, 1]) = aten::add(%3, %0, %one) + %5 : Float(5, 3, strides=[3, 1]) = aten::div(%3, %0) + return (%4, %5))IR"; + for (bool use_cuda : {false, true}) { + if (!torch::cuda::is_available() && use_cuda) { + continue; + } + + KernelScope kernel_scope; + TemplateEnv env; + env.s("device", use_cuda ? "cuda:0" : "cpu"); + const auto graph_string = format(graph_template, env); + auto graph = std::make_shared(); + parseIR(graph_string, &*graph); + auto device = use_cuda ? kCUDA : kCPU; + TensorExprKernel k(graph); + auto stmt = k.getCodeGenStmt(); + std::ostringstream oss; + oss << *stmt; + // aten_mul only has one use, inlined completely + torch::jit::testing::FileCheck().check_not("aten_mul")->run(oss.str()); + + // aten_sub should be removed on CUDA but exist on CPU + // 5 uses: allocate, initialize, free and two reads + size_t num_out1_uses = use_cuda ? 0 : 5; + torch::jit::testing::FileCheck() + .check_count("aten_sub", num_out1_uses, /*exactly*/ true) + ->run(oss.str()); + } + } +} + TEST(Kernel, _1) { KernelScope kernel_scope; @@ -714,7 +773,10 @@ TEST(Kernel, Softmax2D) { ver_env.d("softmax_dim", softmax_dim); ver_env.d("softmax_dim_size", softmax_dim_size); const auto verification_pattern = format(verification_template, ver_env); - torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); + + // verification string temporarily disabled until + // inlining of exp() is benchmarked and a decision is made + // torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); std::vector stack = fmap(inputs); k.run(stack); @@ -789,7 +851,10 @@ TEST(Kernel, Softmax3D) { ver_env.d("softmax_dim", softmax_dim); ver_env.d("softmax_dim_size", softmax_dim_size); const auto verification_pattern = format(verification_template, ver_env); - torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); + + // verification string temporarily disabled until + // inlining of exp() is benchmarked and a decision is made + // torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); std::vector stack = fmap(inputs); k.run(stack); @@ -870,7 +935,10 @@ TEST(Kernel, Softmax4D) { ver_env.d("softmax_dim", softmax_dim); ver_env.d("softmax_dim_size", softmax_dim_size); const auto verification_pattern = format(verification_template, ver_env); - torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); + + // verification string temporarily disabled until + // inlining of exp() is benchmarked and a decision is made + // torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); std::vector stack = fmap(inputs); k.run(stack); diff --git a/test/cpp/tensorexpr/test_loopnest.cpp b/test/cpp/tensorexpr/test_loopnest.cpp index af95f9971513..7294aa53c803 100644 --- a/test/cpp/tensorexpr/test_loopnest.cpp +++ b/test/cpp/tensorexpr/test_loopnest.cpp @@ -3649,45 +3649,6 @@ TEST(LoopNest, DeadStoreEliminationWithIntermediates) {
torch::jit::testing::FileCheck().run(expected_ir2, oss.str()); } -TEST(LoopNest, InlineOutputBuffers) { - KernelScope kernel_scope; - const int M = 4; - const int N = 5; - const int K = 6; - Placeholder a_buf("a", kFloat, {M, N}); - Placeholder b_buf("b", kFloat, {N, K}); - Tensor* c = Compute( - "broadcast_add", - {{M, "m"}, {N, "n"}, {K, "k"}}, - [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return a_buf.load(m, n) + b_buf.load(n, k); - }); - Tensor* out1 = Compute( - "out1", - {{M, "m"}, {N, "n"}, {K, "k"}}, - [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return c->call(m, n, k) + 1; - }); - - Tensor* out2 = Compute( - "out2", - {{M, "m"}, {N, "n"}, {K, "k"}}, - [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { - return out1->call(m, n, k) / c->call(m, n, k) * 4; - }); - for (const bool inline_outputs : {true, false}) { - LoopNest l({out1, out2}); - l.inlineIntermediateBufs(inline_outputs); - Stmt* stmt1 = l.root_stmt(); - std::ostringstream oss; - oss << *stmt1; - size_t num_out1_uses = inline_outputs ? 1 : 2; - torch::jit::testing::FileCheck() - .check_count("out1", num_out1_uses, /*exactly*/ true) - ->run(oss.str()); - } -} - TEST(LoopNest, CompoundTensorSimple) { KernelScope kernel_scope; diff --git a/test/cpp_extensions/msnpu_extension.cpp b/test/cpp_extensions/msnpu_extension.cpp index 88c1d509b34c..ea67910f96da 100644 --- a/test/cpp_extensions/msnpu_extension.cpp +++ b/test/cpp_extensions/msnpu_extension.cpp @@ -53,10 +53,10 @@ std::tuple fake_convolution_backward( } TORCH_LIBRARY_IMPL(aten, MSNPU, m) { - m.impl_UNBOXED("empty.memory_format", empty_override); - m.impl_UNBOXED("add.Tensor", add_override); - m.impl_UNBOXED("convolution_overrideable", fake_convolution); - m.impl_UNBOXED("convolution_backward_overrideable", fake_convolution_backward); + m.impl("empty.memory_format", empty_override); + m.impl("add.Tensor", add_override); + m.impl("convolution_overrideable", fake_convolution); + m.impl("convolution_backward_overrideable", fake_convolution_backward); } // TODO: Extend this to exercise multi-device setting. 
In that case, diff --git a/test/cpp_extensions/rng_extension.cpp b/test/cpp_extensions/rng_extension.cpp index bf16a840dfc9..f3ab91fb3cab 100644 --- a/test/cpp_extensions/rng_extension.cpp +++ b/test/cpp_extensions/rng_extension.cpp @@ -22,6 +22,8 @@ struct TestCPUGenerator : public c10::GeneratorImpl { void set_current_seed(uint64_t seed) override { throw std::runtime_error("not implemented"); } uint64_t current_seed() const override { throw std::runtime_error("not implemented"); } uint64_t seed() override { throw std::runtime_error("not implemented"); } + void set_state(const c10::TensorImpl& new_state) override { throw std::runtime_error("not implemented"); } + c10::intrusive_ptr get_state() const override { throw std::runtime_error("not implemented"); } TestCPUGenerator* clone_impl() const override { throw std::runtime_error("not implemented"); } static DeviceType device_type() { return DeviceType::CPU; } @@ -54,9 +56,9 @@ size_t getInstanceCount() { } TORCH_LIBRARY_IMPL(aten, CustomRNGKeyId, m) { - m.impl_UNBOXED("aten::random_.from", random_from_to); - m.impl_UNBOXED("aten::random_.to", random_to); - m.impl_UNBOXED("aten::random_", random_); + m.impl("aten::random_.from", random_from_to); + m.impl("aten::random_.to", random_to); + m.impl("aten::random_", random_); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { diff --git a/test/distributed/test_c10d.py b/test/distributed/test_c10d.py index 5ffd4b4fb088..93e26be7ee98 100644 --- a/test/distributed/test_c10d.py +++ b/test/distributed/test_c10d.py @@ -4641,6 +4641,43 @@ def test_nccl_barrier_timeout_new_group_non_member(self): with self.assertRaisesRegex(RuntimeError, "Timed out initializing process group"): c10d.new_group([0], timeout=timedelta(seconds=1)) + @requires_nccl() + @skip_if_not_multigpu + def test_nccl_barrier_device_ids(self): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="nccl", + rank=self.rank, + world_size=self.world_size, + store=store) + + c10d.barrier(device_ids=[self.rank]) + + @requires_nccl() + @skip_if_not_multigpu + def test_nccl_barrier_device_ids_function_argument(self): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="nccl", + rank=self.rank, + world_size=self.world_size, + store=store) + + with self.assertRaisesRegex(RuntimeError, "Invalid function argument"): + c10d.barrier(device_ids=self.rank) + + @requires_gloo() + def test_gloo_barrier_device_ids(self): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="gloo", + rank=self.rank, + world_size=self.world_size, + store=store) + + with self.assertRaisesRegex(RuntimeError, "device_ids not supported"): + c10d.barrier(device_ids=[self.rank]) + if __name__ == "__main__": assert ( not torch.cuda._initialized diff --git a/test/distributed/test_jit_c10d.py b/test/distributed/test_jit_c10d.py index 85788b914059..182a405d0e78 100644 --- a/test/distributed/test_jit_c10d.py +++ b/test/distributed/test_jit_c10d.py @@ -4,6 +4,7 @@ import torch import torch.distributed as c10d import time +from datetime import timedelta from typing import List import torch.testing._internal.common_utils as common @@ -31,6 +32,14 @@ def unique_process_group_name(prefix): now = int(time.time() * 1000) return "%s_%d" % (prefix, now) +def _create_tcp_store(): + addr = "localhost" + port = common.find_free_port() + timeout = timedelta(minutes=5) + timeout_millisecond = int(timeout / timedelta(milliseconds=1)) + return torch.classes.dist_c10d.TCPStore(addr, 
port, 1, True, timeout_millisecond) + + @unittest.skipIf( TEST_WITH_TSAN, "TSAN is not fork-safe since we're forking in a multi-threaded environment", @@ -48,19 +57,15 @@ def setUp(self): raise unittest.SkipTest("NCCL test requires 2+ GPUs") def _create_nccl_pg(self, name_prefix): - addr = "localhost" - port = common.find_free_port() - tcp_store = torch.classes.dist_c10d.TCPStore(addr, port, 1, True) + tcp_store = _create_tcp_store() opts = torch.classes.dist_c10d.ProcessGroupNCCLOptions(0, True) name = unique_process_group_name(name_prefix) - return torch.classes.dist_c10d.ProcessGroupNCCL(tcp_store, self.rank, self.world_size, opts, name) + return torch.classes.dist_c10d.ProcessGroupNCCL(tcp_store, self.rank, self.world_size, opts, name) def _create_nccl_pg_as_base_process_group(self, name): - addr = "localhost" - port = common.find_free_port() - tcp_store = torch.classes.dist_c10d.TCPStore(addr, port, 1, True) + tcp_store = _create_tcp_store() return torch.classes.dist_c10d.frontend().new_process_group_helper( self.world_size, self.rank, [], "nccl", tcp_store, name, 0) @@ -155,9 +160,7 @@ def test_frontend_singleton(self): frontend1 = torch.classes.dist_c10d.frontend() frontend2 = torch.classes.dist_c10d.frontend() - addr = "localhost" - port = common.find_free_port() - tcp_store = torch.classes.dist_c10d.TCPStore(addr, port, 1, True) + tcp_store = _create_tcp_store() pg_name = unique_process_group_name("singleton_test_process_group") @@ -180,9 +183,7 @@ def test_process_group_as_module_member(self): class TestModule(torch.nn.Module): def __init__(self): super(TestModule, self).__init__() - addr = "localhost" - port = common.find_free_port() - tcp_store = torch.classes.dist_c10d.TCPStore(addr, port, 1, True) + tcp_store = _create_tcp_store() name = unique_process_group_name("module_member_process_group") self.pg = torch.classes.dist_c10d.frontend().new_process_group_helper( diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py index b057d12a285d..8c927f35fd2e 100644 --- a/test/distributions/test_distributions.py +++ b/test/distributions/test_distributions.py @@ -727,7 +727,7 @@ def _gradcheck_log_prob(self, dist_ctor, ctor_params): # performs gradient checks on log_prob distribution = dist_ctor(*ctor_params) s = distribution.sample() - if s.is_floating_point(): + if not distribution.support.is_discrete: s = s.detach().requires_grad_() expected_shape = distribution.batch_shape + distribution.event_shape @@ -1422,7 +1422,7 @@ def test_uniform(self): self.assertEqual(Uniform(0.0, 1.0).sample((1,)).size(), (1,)) # Check log_prob computation when value outside range - uniform = Uniform(low_1d, high_1d) + uniform = Uniform(low_1d, high_1d, validate_args=False) above_high = torch.tensor([4.0]) below_low = torch.tensor([-1.0]) self.assertEqual(uniform.log_prob(above_high).item(), -inf) @@ -1517,7 +1517,7 @@ def test_halfcauchy(self): def test_halfnormal(self): std = torch.randn(5, 5).abs().requires_grad_() - std_1d = torch.randn(1, requires_grad=True) + std_1d = torch.randn(1).abs().requires_grad_() std_delta = torch.tensor([1e-5, 1e-5]) self.assertEqual(HalfNormal(std).sample().size(), (5, 5)) self.assertEqual(HalfNormal(std).sample((7,)).size(), (7, 5, 5)) @@ -1978,6 +1978,8 @@ def gradcheck_func(samples, mu, sigma, prec, scale_tril): sigma = 0.5 * (sigma + sigma.transpose(-1, -2)) # Ensure symmetry of covariance if prec is not None: prec = 0.5 * (prec + prec.transpose(-1, -2)) # Ensure symmetry of precision + if scale_tril is not None: + scale_tril 
= scale_tril.tril() return MultivariateNormal(mu, sigma, prec, scale_tril).log_prob(samples) gradcheck(gradcheck_func, (mvn_samples, mean, covariance, precision, scale_tril), raise_exception=True) @@ -2643,7 +2645,7 @@ def test_cdf_log_prob(self): for i, param in enumerate(params): dist = Dist(**param) samples = dist.sample() - if samples.dtype.is_floating_point: + if not dist.support.is_discrete: samples.requires_grad_() try: cdfs = dist.cdf(samples) @@ -3050,11 +3052,9 @@ def setUp(self): self.scalar_sample = 1 self.tensor_sample_1 = torch.ones(3, 2) self.tensor_sample_2 = torch.ones(3, 2, 3) - Distribution.set_default_validate_args(True) def tearDown(self): super(TestDistributionShapes, self).tearDown() - Distribution.set_default_validate_args(False) def test_entropy_shape(self): for Dist, params in EXAMPLES: @@ -3186,23 +3186,23 @@ def test_one_hot_categorical_shape(self): self.assertEqual(dist.sample().size(), torch.Size((3,))) self.assertEqual(dist.sample((3, 2)).size(), torch.Size((3, 2, 3))) self.assertRaises(ValueError, dist.log_prob, self.tensor_sample_1) - simplex_sample = self.tensor_sample_2 / self.tensor_sample_2.sum(-1, keepdim=True) - self.assertEqual(dist.log_prob(simplex_sample).size(), torch.Size((3, 2,))) + sample = torch.tensor([0., 1., 0.]).expand(3, 2, 3) + self.assertEqual(dist.log_prob(sample).size(), torch.Size((3, 2,))) self.assertEqual(dist.log_prob(dist.enumerate_support()).size(), torch.Size((3,))) - simplex_sample = torch.ones(3, 3) / 3 - self.assertEqual(dist.log_prob(simplex_sample).size(), torch.Size((3,))) + sample = torch.eye(3) + self.assertEqual(dist.log_prob(sample).size(), torch.Size((3,))) # batched dist = OneHotCategorical(torch.tensor([[0.6, 0.3], [0.6, 0.3], [0.6, 0.3]])) self.assertEqual(dist._batch_shape, torch.Size((3,))) self.assertEqual(dist._event_shape, torch.Size((2,))) self.assertEqual(dist.sample().size(), torch.Size((3, 2))) self.assertEqual(dist.sample((3, 2)).size(), torch.Size((3, 2, 3, 2))) - simplex_sample = self.tensor_sample_1 / self.tensor_sample_1.sum(-1, keepdim=True) - self.assertEqual(dist.log_prob(simplex_sample).size(), torch.Size((3,))) + sample = torch.tensor([0., 1.]) + self.assertEqual(dist.log_prob(sample).size(), torch.Size((3,))) self.assertRaises(ValueError, dist.log_prob, self.tensor_sample_2) self.assertEqual(dist.log_prob(dist.enumerate_support()).size(), torch.Size((2, 3))) - simplex_sample = torch.ones(3, 1, 2) / 2 - self.assertEqual(dist.log_prob(simplex_sample).size(), torch.Size((3, 3))) + sample = torch.tensor([0., 1.]).expand(3, 1, 2) + self.assertEqual(dist.log_prob(sample).size(), torch.Size((3, 3))) def test_cauchy_shape_scalar_params(self): cauchy = Cauchy(0, 1) @@ -3531,12 +3531,15 @@ def __init__(self, probs): [0.2, 0.7, 0.1], [0.33, 0.33, 0.34], [0.2, 0.2, 0.6]]) - pareto = pairwise(Pareto, [2.5, 4.0, 2.5, 4.0], [2.25, 3.75, 2.25, 3.75]) + pareto = (Pareto(torch.tensor([2.5, 4.0, 2.5, 4.0]).expand(4, 4), + torch.tensor([2.25, 3.75, 2.25, 3.75]).expand(4, 4)), + Pareto(torch.tensor([2.25, 3.75, 2.25, 3.8]).expand(4, 4), + torch.tensor([2.25, 3.75, 2.25, 3.75]).expand(4, 4))) poisson = pairwise(Poisson, [0.3, 1.0, 5.0, 10.0]) - uniform_within_unit = pairwise(Uniform, [0.15, 0.95, 0.2, 0.8], [0.1, 0.9, 0.25, 0.75]) + uniform_within_unit = pairwise(Uniform, [0.1, 0.9, 0.2, 0.75], [0.15, 0.95, 0.25, 0.8]) uniform_positive = pairwise(Uniform, [1, 1.5, 2, 4], [1.2, 2.0, 3, 7]) uniform_real = pairwise(Uniform, [-2., -1, 0, 2], [-1., 1, 1, 4]) - uniform_pareto = pairwise(Uniform, [6.5, 8.5, 6.5, 8.5], 
[7.5, 7.5, 9.5, 9.5]) + uniform_pareto = pairwise(Uniform, [6.5, 7.5, 6.5, 8.5], [7.5, 8.5, 9.5, 9.5]) continuous_bernoulli = pairwise(ContinuousBernoulli, [0.1, 0.2, 0.5, 0.9]) # These tests should pass with precision = 0.01, but that makes tests very expensive. @@ -4148,8 +4151,8 @@ def test_lazy_logits_initialization(self): probs = param.pop('probs') param['logits'] = probs_to_logits(probs) dist = Dist(**param) - shape = (1,) if not dist.event_shape else dist.event_shape - dist.log_prob(torch.ones(shape)) + # Create new instance to generate a valid sample + dist.log_prob(Dist(**param).sample()) message = 'Failed for {} example 0/{}'.format(Dist.__name__, len(params)) self.assertFalse('probs' in vars(dist), msg=message) try: @@ -4455,7 +4458,6 @@ def test_stack_transform(self): class TestValidation(TestCase): def setUp(self): super(TestCase, self).setUp() - Distribution.set_default_validate_args(True) def test_valid(self): for Dist, params in EXAMPLES: @@ -4475,7 +4477,6 @@ def test_invalid(self): def tearDown(self): super(TestValidation, self).tearDown() - Distribution.set_default_validate_args(False) class TestJit(TestCase): diff --git a/test/elias.py b/test/elias.py deleted file mode 100644 index 74dbc3cbaa09..000000000000 --- a/test/elias.py +++ /dev/null @@ -1,12 +0,0 @@ -import torch - -@torch.jit.script -def foo(x): - return x + x + x - -torch._C._jit_override_can_fuse_on_cpu(True) - -foo(torch.rand([2], requires_grad=False)) -foo(torch.rand([2], requires_grad=False)) -foo(torch.rand([2], requires_grad=False)) -print(torch.jit.last_executed_optimized_graph()) diff --git a/test/jit/test_cuda.py b/test/jit/test_cuda.py new file mode 100644 index 000000000000..f7af8e3a2efc --- /dev/null +++ b/test/jit/test_cuda.py @@ -0,0 +1,476 @@ +import os +import sys +import gc +import unittest + +import torch +from typing import NamedTuple +from torch.testing._internal.jit_utils import JitTestCase +from torch.testing._internal.common_utils import skipIfRocm, skipCUDANonDefaultStreamIf + +# Make the helper files in test/ importable +pytorch_test_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) +sys.path.append(pytorch_test_dir) + +# Check if GPU is available +TEST_CUDA = torch.cuda.is_available() +# Check if multiple GPU's are available +TEST_MULTIGPU = TEST_CUDA and torch.cuda.device_count() >= 2 + +# If GPU is not available, then do not run the tests +if not TEST_CUDA: + print('CUDA not available, skipping tests', file=sys.stderr) + JitTestCase = object # noqa: F811 + +TEST_LARGE_TENSOR = TEST_CUDA + +# If GPU is available, then initialize the cuda context and check +# if there is memory available to allocate for LARGE Tensors. +if TEST_CUDA: + torch.ones(1).cuda() # initialize cuda context + TEST_LARGE_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 5e9 + +if __name__ == "__main__": + raise RuntimeError( + "This test file is not meant to be run directly, use:\n\n" + "\tpython test/test_jit.py TESTNAME\n\n" + "instead." + ) + +class TestCUDA(JitTestCase): + """ + A suite of tests for the CUDA API in TorchScript. 
+ """ + def setUp(self): + super(TestCUDA, self).setUp() + + def tearDown(self): + gc.collect() + torch.cuda.empty_cache() + super(TestCUDA, self).tearDown() + + @skipIfRocm + @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") + def test_current_stream(self): + # Test current stream on the device and check if the stream device index + # matches with the device ID + @torch.jit.script + def fn(): + device_index = torch.cuda._current_device() + s0 = torch.cuda.current_stream(device_index) + s1 = torch.cuda.current_stream(1) + s2 = torch.cuda.current_stream(0) + + return s0.device_index(), s1.device_index(), s2.device_index() + + d0, d1, d2 = fn() + + # By default, the current device ID is 0. + self.assertEqual(0, d0) + self.assertEqual(1, d1) + self.assertEqual(0, d2) + self.assertEqual(d0, d2) + + @skipIfRocm + @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") + @unittest.skipIf(not TEST_LARGE_TENSOR, "not enough memory") + @skipCUDANonDefaultStreamIf(True) + def test_streams_and_events(self): + # This test checks for the default stream ID is set to 0 on the device + @torch.jit.script + def test_default_streams(): + s0 = torch.cuda.default_stream(0) + s1 = torch.cuda.default_stream(1) + + d = torch.device('cuda:1') + + # Check the current stream id and default id are same + # on the current device. The current device id by default is 0 + s2 = torch.cuda.current_stream(0) + check_s2 = s2.id() == s0.id() + check_d0 = torch.cuda._current_device() == s2.device_index() + + # Set the current device to d1 and check if the stream + # has been set to the default stream on d1 + with torch.jit.cuda.device(d): + s3 = torch.cuda.current_stream(1) + check_s3 = s3.id() == s1.id() + check_d1 = torch.cuda._current_device() == s3.device_index() + + # Check if the current device was reset to 0 + is_device_d0 = torch.cuda._current_device() == s2.device_index() + + return s0.device_index(), s1.device_index(), check_s2, check_s3, check_d0, check_d1, is_device_d0 + + d0, d1, check_s2, check_s3, check_d0, check_d1, is_device_d0 = test_default_streams() + + self.assertEqual(d0, 0) + self.assertEqual(d1, 1) + self.assertTrue(check_s2) + self.assertTrue(check_s3) + self.assertTrue(check_d0) + self.assertTrue(check_d1) + self.assertTrue(is_device_d0) + + # This test checks if the Stream Context manager is a no op + # when the stream is none for `with torch.jit.cuda.stream` + @torch.jit.script + def test_set_none_stream(): + device_index = torch.cuda._current_device() + current_stream = torch.cuda.current_stream(device_index) + default_stream = torch.cuda.default_stream(device_index) + + # When stream is none, check if this operation is a no-op + with torch.jit.cuda.stream(None): + cur_device_index = torch.cuda._current_device() + is_device_index_same = cur_device_index == device_index + is_current_stream_same = torch.cuda.current_stream(cur_device_index).id() == current_stream.id() + is_default_stream_same = torch.cuda.default_stream(device_index).id() == default_stream.id() + + # Check if the device index, current stream and default streams have not changed + are_streams_same = is_device_index_same and is_current_stream_same and is_default_stream_same + return are_streams_same + self.assertTrue(test_set_none_stream()) + + # This test checks if the Device Context manager is a no op + # when the device is none for `with torch.jit.cuda.device` + @torch.jit.script + def test_set_device_none(): + device_index = torch.cuda._current_device() + # When device is none, check if this operation is a no-op + 
with torch.jit.cuda.device(None): + # Check if the current device is the same + is_device_same = torch.cuda._current_device() == device_index + return is_device_same + self.assertTrue(test_set_device_none()) + + # Check if a CUDA JIT stream is created + # on the _current_device + @torch.jit.script + def test_simple_stream(): + device_index = torch.cuda._current_device() + s = torch.jit.cuda.Stream(device_index, 0) + return device_index == s.device_index() + + self.assertTrue(test_simple_stream(), "Could not create Stream!") + + # Class used to store results for the test: test_get_stream. + class Result(NamedTuple): + t1 : torch.Tensor + t2 : torch.Tensor + is_current_and_default_stream_same : bool + is_default_and_user_stream_not_same : bool + is_stream_set : bool + is_stream_reset : bool + default_stream_query : bool + default_stream_id : int + user_stream_id : int + + # The test aims at checking different stream proporties. + @torch.jit.script + def test_get_stream(): + device_index = torch.cuda._current_device() + current_stream = torch.cuda.current_stream(device_index) + default_stream = torch.cuda.default_stream(device_index) + user_stream = torch.jit.cuda.Stream(device_index, 0) + + # Check if the current and default streams are the same on the device + is_current_and_default_stream_same = current_stream.id() == default_stream.id() + # Check if user stream and default stream are not the same on the device + is_default_and_user_stream_not_same = default_stream.id() != user_stream.id() + + with torch.jit.cuda.stream(user_stream): + is_stream_set = torch.cuda.current_stream(device_index).id() == user_stream.id() + + # Check if the stream was reset to current_stream + is_stream_reset = torch.cuda.current_stream(device_index).id() == current_stream.id() + + tensor1 = torch.rand(10000, 10000, device="cuda") + tensor2 = torch.mm(tensor1, tensor1).to("cuda") + default_stream.synchronize() + default_stream_query = default_stream.query() + + # Capture all the results in the class Result + res = Result( + tensor1, tensor2, is_current_and_default_stream_same, + is_default_and_user_stream_not_same, is_stream_set, + is_stream_reset, default_stream_query, default_stream.id(), user_stream.id()) + return res + + result = test_get_stream() + + self.assertEqual(torch.matmul(result.t1, result.t1), result.t2) + self.assertTrue(result.is_current_and_default_stream_same) + self.assertTrue(result.is_default_and_user_stream_not_same) + self.assertTrue(result.is_stream_set) + self.assertTrue(result.is_stream_reset) + self.assertTrue(result.default_stream_query) + self.assertEqual(result.default_stream_id, 0) # Check if the default stream ID is always 0 + self.assertNotEqual(result.user_stream_id, 0) # Check if the user stream is always non zero + + # Test the stream context manager. This test checks if the stream is switched + # to the user stream on using the stream context manager. 
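For reference, the stream-switching pattern exercised by the scripted test below can also be written against the eager CUDA API; a minimal sketch, assuming a CUDA-capable device is available:

import torch

if torch.cuda.is_available():
    s = torch.cuda.Stream()                        # user-created stream on the current device
    prev = torch.cuda.current_stream()             # remember the stream active before the block
    a = torch.rand(1000, 1000, device="cuda")
    with torch.cuda.stream(s):                     # work queued inside the block runs on s
        assert torch.cuda.current_stream() == s
        b = torch.mm(a, a)
    s.synchronize()                                # wait for the matmul queued on s
    assert torch.cuda.current_stream() == prev     # previous stream is restored on exit

The scripted test that follows checks the equivalent behaviour through the torch.jit.cuda wrappers.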
+ @torch.jit.script + def test_stream_context(): + device_index = torch.cuda._current_device() + current_stream = torch.cuda.current_stream(device_index) + user_stream = torch.jit.cuda.Stream(device_index, 0) + A = torch.rand(1000, 1000, device="cuda") + + with torch.jit.cuda.stream(user_stream): + check = torch.cuda.current_stream(device_index).id() == user_stream.id() + B = torch.mm(A, A).to("cuda") + # Wait for B to be computed + user_stream.synchronize() + # Check if the stream has been reset on the current device + is_stream_reset = torch.cuda.current_stream(device_index).id() == current_stream.id() + + return A, B, check, is_stream_reset + + A, B, is_stream_set, is_stream_reset = test_stream_context() + self.assertEqual(torch.matmul(A, A), B) + self.assertTrue(is_stream_set, "Error: Current stream was not set to user stream!") + self.assertTrue(is_stream_reset, "Error: The stream was not restored to previous stream!") + + # Test multiple nested streams. Check if the operations are computed as expected on the streams + # This test has been adapted from the eager mode tests available at test/test_cuda.py + @torch.jit.script + def test_multiple_stream(): + prev_device_index = torch.cuda._current_device() + prev_current_stream = torch.cuda.current_stream(prev_device_index) + s1 = torch.jit.cuda.Stream(0, 0) + s2 = torch.jit.cuda.Stream(1, 0) + + A = torch.rand(1000, 1000, device="cuda") + B = torch.rand(1000, 1000, device="cuda") + with torch.jit.cuda.stream(s1): + C = torch.mm(A, A).to("cuda") + # Check if the stream and device have been set to s1 + is_stream_s1 = torch.cuda.current_stream(s1.device_index()).id() == s1.id() + is_device_s1 = torch.cuda._current_device() == s1.device_index() + with torch.jit.cuda.stream(s2): + # Check if the stream and device have been set to s2 + is_stream_s2 = torch.cuda.current_stream(s2.device_index()).id() == s2.id() + is_device_s2 = torch.cuda._current_device() == s2.device_index() + D = torch.mm(B, B).to("cuda") + # Check if the stream and device have been set to s1 + is_stream_s1_after = torch.cuda.current_stream(s1.device_index()).id() == s1.id() + is_device_s1_after = torch.cuda._current_device() == s1.device_index() + # Wait for D to be computed + s2.synchronize() + # Wait for C to be computed on S1 + s1.synchronize() + + # Check if the stream and device has been restored to previous stream and device + is_device_current = torch.cuda._current_device() == prev_device_index + is_stream_current = torch.cuda.current_stream(prev_device_index).id() == prev_current_stream.id() + + check_stream = is_stream_s1 and is_stream_s2 and is_stream_s1_after and is_stream_current + check_device = is_device_s1 and is_device_s2 and is_device_s1_after and is_device_current + return A, B, C, D, check_stream, check_device + A, B, C, D, check_stream, check_device = test_multiple_stream() + + self.assertEqual(torch.matmul(A, A), C) + self.assertEqual(torch.matmul(B, B), D) + self.assertTrue(check_stream) + self.assertTrue(check_device) + + # Test multiple streams waiting on each other for the operations to be completed. 
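The data-dependency test below orders work across two streams with a recorded event; the same producer/consumer pattern in eager mode looks roughly like this (a minimal sketch, assuming a CUDA-capable device is available):

import torch

if torch.cuda.is_available():
    s1, s2 = torch.cuda.Stream(), torch.cuda.Stream()
    done = torch.cuda.Event()
    a = torch.rand(1000, 1000, device="cuda")
    with torch.cuda.stream(s1):
        b = torch.mm(a, a)          # producer work queued on s1
        s1.record_event(done)       # mark the completion point on s1
    s2.wait_event(done)             # s2 will not run past this point before 'done' fires
    with torch.cuda.stream(s2):
        c = torch.mm(b, b)          # consumer work queued on s2, safe to read b
    s2.synchronize()                # block the host until s2 has finished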
+ @torch.jit.script + def test_data_dependency_between_streams(): + device_index = torch.cuda._current_device() + prev_current_stream = torch.cuda.current_stream(device_index) + s1 = torch.jit.cuda.Stream(0, 0) + s2 = torch.jit.cuda.Stream(0, 0) + event = torch.jit.cuda.Event(False, False, False) + + A = torch.rand(1000, 1000, device="cuda") + with torch.jit.cuda.stream(s1): + is_stream_s1 = torch.cuda.current_stream(device_index).id() == s1.id() + B = torch.mm(A, A).to("cuda") + s1.record_event(event) + # Check if the current_stream is reset + is_current_stream_1 = torch.cuda.current_stream(device_index).id() == prev_current_stream.id() + # Wait for ops on s1 to be computed + s2.wait_event(event) + with torch.jit.cuda.stream(s2): + is_stream_s2 = torch.cuda.current_stream(device_index).id() == s2.id() + C = torch.mm(B, B).to("cuda") + # Wait for C to be computed + s2.synchronize() + # Check if the current_stream is reset + is_current_stream_2 = torch.cuda.current_stream(device_index).id() == prev_current_stream.id() + + check_stream = is_current_stream_1 and is_current_stream_2 and is_stream_s1 and is_stream_s2 + return A, B, C, check_stream + + A, B, C, check_stream = test_data_dependency_between_streams() + self.assertEqual(torch.matmul(A, A), B) + self.assertEqual(torch.matmul(B, B), C) + self.assertTrue(check_stream) + + # Test a simple CUDA event. Test if the CUDA event was created successfully + @torch.jit.script + def test_simple_event(): + e = torch.jit.cuda.Event(True, False, False) + return e is not None + self.assertTrue(test_simple_event(), "Could not create CUDA Event!") + + # Record the CUDA event for operation torch.mm on the current stream + # and then test if the elapsed time is greater than 0. This test is also + # an adaption from eager mdoe CUDA tests available at test/test_cuda.py + @torch.jit.script + def test_event(): + device_index = torch.cuda._current_device() + stream = torch.cuda.current_stream(device_index) + event = torch.jit.cuda.Event(True, False, False) + is_true_event_query = event.query() + start_event = torch.jit.cuda.Event(True, False, False) + stream.record_event(start_event) + tensor1 = torch.rand(1000000000, 1000000000, device="cuda") + tensor2 = torch.mm(tensor1, tensor1).to("cuda") + stream.record_event(event) + event.synchronize() + is_again_true_event_query = event.query() + + if not (is_true_event_query and is_again_true_event_query): + return -1.0 + return start_event.elapsed_time(event) + + self.assertGreater(test_event(), 0) + + # Check for stream synchronization , when a large tensor multiplication is + # computed on the stream. The stream.query should be true once the synchroniztion is done + @torch.jit.script + def test_stream_synchronize() -> float: + device_index = torch.cuda._current_device() + s = torch.jit.cuda.Stream(device_index, 0) + e_tik = torch.jit.cuda.Event(True, False, False) + e_tok = torch.jit.cuda.Event(True, False, False) + + e_tik.record(s) + tensor1 = torch.rand(1000000000, 1000000000, device="cuda") + with torch.jit.cuda.stream(s): + tensor2 = torch.mm(tensor1, tensor1).to("cuda") + s.synchronize() + e_tok.record(s) + e_tok.synchronize() + + if not s.query(): + return -1.0 + + # not necessary to check e_tik and e_tok, as elapsed_time would throw + # exception if otherwise. + return e_tik.elapsed_time(e_tok) + self.assertGreater(test_stream_synchronize(), 0) + + # Test event synchronization for the event that records a stream doing + # a large tensor multiplication. 
Check if the elapsed time is greater than 0 + # and the stream.query evaluates to true. + @torch.jit.script + def test_event_synchronize() -> float: + device_index = torch.cuda._current_device() + s = torch.jit.cuda.Stream(device_index, 0) + e_tik = torch.jit.cuda.Event(True, False, False) + e_tok = torch.jit.cuda.Event(True, False, False) + + e_tik.record(s) + tensor1 = torch.rand(1000000000, 1000000000, device="cuda") + with torch.jit.cuda.stream(s): + tensor = torch.mm(tensor1, tensor1).to("cuda") + s.record_event(e_tok) + e_tok.synchronize() + s.synchronize() + + if not s.query(): + return -1.0 + + # not necessary to check e_tik and e_tok, as elapsed_time would throw + # exception if otherwise. + return e_tik.elapsed_time(e_tok) + + self.assertGreater(test_event_synchronize(), 0) + + # Test for event wait. Check if event waits for the all the operations on + # the stream to be done. Check for synchronizations and query on the streams + # and events. This test is adapted from eager mode tests for CUDA. Please refer + # test/test_cuda.py + @torch.jit.script + def test_event_wait() -> float: + device_index = torch.cuda._current_device() + s0 = torch.cuda.current_stream(device_index) + s1 = torch.jit.cuda.Stream(device_index, 0) + e_tik = torch.jit.cuda.Event(True, True, False) + e_tok = torch.jit.cuda.Event(True, True, False) + + e_tik.record(s0) + tensor1 = torch.rand(1000000000, 1000000000, device="cuda") + with torch.jit.cuda.stream(s0): + tensor2 = torch.mm(tensor1, tensor1).cuda() + e_sync = torch.jit.cuda.Event(True, False, False) + e_sync.record(torch.cuda.current_stream(device_index)) + e_sync.wait(s1) + with torch.jit.cuda.stream(s1): + tensor3 = torch.rand(1000000000, 1000000000, device="cuda") + tensor4 = torch.mm(tensor3, tensor3).cuda() + s1.synchronize() + e_tok.record(torch.cuda.current_stream(device_index)) + e_tok.synchronize() + s0.synchronize() + + if not s0.query() or not s1.query() or not e_sync.query(): + return -1.0 + + # not necessary to check e_tik and e_tok, as elapsed_time would throw + # exception if otherwise. + return e_tik.elapsed_time(e_tok) + self.assertGreater(test_event_wait(), 0) + + # Test for stream wait_event. 
Checks if the stream waits on the event + @torch.jit.script + def test_wait_event(): + d1 = torch.device('cuda:1') + + with torch.jit.cuda.device(d1): + s0 = torch.cuda.current_stream(1) + tensor1 = torch.rand(1000000000, 1000000000, device="cuda") + tensor2 = torch.mm(tensor1, tensor1).to("cuda") + e0 = torch.jit.cuda.Event(False, False, False) + s0.record_event(e0) + + s1 = torch.cuda.current_stream(0) + s1.wait_event(e0) + s1.synchronize() + + return e0.query() and s0.query() and s1.query() + self.assertTrue(test_wait_event()) + + # Test if a scripted module with cuda streams can be saved, loaded and executed + def test_save_load(self): + class Model(torch.nn.Module): + def forward(self): + device_index = torch.cuda._current_device() + s = torch.jit.cuda.Stream(device_index, 0) + a = torch.rand(3, 4, device="cuda") + b = torch.rand(3, 4, device="cuda") + + with torch.jit.cuda.stream(s): + is_stream_s = torch.cuda.current_stream(s.device_index()).id() == s.id() + c = torch.cat((a, b), 0).cuda() + s.synchronize() + return is_stream_s, a, b, c + + model = Model() + + # Script the model and save + script_model = torch.jit.script(model) + is_stream_s, a, b, c = script_model() + # Verify if the output is correct + self.assertTrue(is_stream_s) + self.assertEqual(torch.cat((a, b), 0), c) + + # Save and load scripted model + load_model = self.getExportImportCopy(script_model) + is_stream_s, a_load, b_load, c_load = load_model() + self.assertTrue(is_stream_s) + self.assertEqual(torch.cat((a_load, b_load), 0), c_load) diff --git a/test/jit/test_enum.py b/test/jit/test_enum.py index aa34c22413ad..b39732d0e9bc 100644 --- a/test/jit/test_enum.py +++ b/test/jit/test_enum.py @@ -356,6 +356,6 @@ def iterate_enum(x: Color): .check_same("Color.BLUE") \ .run(str(scripted.graph)) - # PURPLE always appear last because we follow Python's Enum definition order. + # PURPLE always appears last because we follow Python's Enum definition order. self.assertEqual(scripted(Color.RED), [Color.GREEN.value, Color.BLUE.value]) self.assertEqual(scripted(Color.GREEN), [Color.RED.value, Color.BLUE.value]) diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py index 7b7490e80ec5..28fc594744a7 100644 --- a/test/jit/test_freezing.py +++ b/test/jit/test_freezing.py @@ -1331,3 +1331,33 @@ def forward(self, x: torch.Tensor, key: str) -> Any: m.eval() with self.assertRaisesRegex(RuntimeError, "Freezing modules containing prim::ModuleDictIndex is not supported"): mf = torch._C._freeze_module(m._c) + + + def test_freeze_non_module_class_getattr(self): + class BoxCoder(object): + def __init__(self, bbox_xform_clip): + # type: (float) -> None + self.bbox_xform_clip = bbox_xform_clip + + def decode(self, input): + return input * self.bbox_xform_clip + + class MyModule(torch.nn.Module): + __annotations__ = { + 'box_coder': BoxCoder, + } + + def __init__(self): + super(MyModule, self).__init__() + self.box_coder = BoxCoder(50.) 
+ + def forward(self, input): + return self.box_coder.decode(input) + + model = MyModule() + model.eval() + script_model = torch.jit.freeze(torch.jit.script(model)) + inp = torch.randn([4, 4]) + output_eager = model(inp) + self.assertEqual(model(inp), script_model(inp)) + FileCheck().check_not("GetAttr").run(script_model.graph) diff --git a/test/jit/test_recursive_script.py b/test/jit/test_recursive_script.py index bd9a2bb32b89..a0dc99a4e463 100644 --- a/test/jit/test_recursive_script.py +++ b/test/jit/test_recursive_script.py @@ -495,6 +495,59 @@ def forward(self, x): self.checkModule(M(), (torch.randn(5, 5),)) + def test_prepare_scriptable_basic(self): + class SeluButReluWhenScripted(torch.nn.SELU): + def __prepare_scriptable__(self): + return nn.ReLU() + + t = torch.randn(5, 5) + m = SeluButReluWhenScripted() + sm = torch.jit.script(m) + eager_out = m(t) + script_out = sm(t) + self.assertNotEqual(eager_out, script_out) + + def test_prepare_scriptable_iterable_modules(self): + class SeluButReluWhenScripted(torch.nn.SELU): + def __prepare_scriptable__(self): + return nn.ReLU() + + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + shared = SeluButReluWhenScripted() + self.sequential = nn.Sequential( + SeluButReluWhenScripted(), + SeluButReluWhenScripted(), + nn.Sequential(SeluButReluWhenScripted(), shared, SeluButReluWhenScripted()), + shared, + ) + self.module_list = nn.ModuleList([SeluButReluWhenScripted(), + shared, + SeluButReluWhenScripted()]) + + def forward(self, x): + for mod in self.module_list: + x += mod(x) + x += self.sequential(x) + return x + + t = torch.randn(5, 5) + m = M() + eager_out = m(t.clone()) + sm = torch.jit.script(m) + script_out = sm(t.clone()) + self.assertNotEqual(eager_out, script_out) + + def test_prepare_scriptable_cycle(self): + t = torch.randn(5, 5) + c = torch.nn.Module() + p = torch.nn.Module() + c.__dict__["_p"] = p + p.__dict__["_c"] = c + + sm = torch.jit.script(p) + def test_attributes(self): @torch.jit.script class Inner2(object): diff --git a/test/jit/test_torchbind.py b/test/jit/test_torchbind.py index 31eec81d480a..7f43b31fe6ec 100644 --- a/test/jit/test_torchbind.py +++ b/test/jit/test_torchbind.py @@ -62,6 +62,32 @@ def f(): return ss1.pop() + ss2.pop() test_equality(f, lambda x: x) + # test nn module with prepare_scriptable function + class NonJitableClass(object): + def __init__(self, int1, int2): + self.int1 = int1 + self.int2 = int2 + + def return_vals(self): + return self.int1, self.int2 + + class CustomWrapper(torch.nn.Module): + def __init__(self, foo): + super(CustomWrapper, self).__init__() + self.foo = foo + + def forward(self) -> None: + self.foo.increment(1) + return + + def __prepare_scriptable__(self): + int1, int2 = self.foo.return_vals() + foo = torch.classes._TorchScriptTesting._Foo(int1, int2) + return CustomWrapper(foo) + + foo = CustomWrapper(NonJitableClass(1, 2)) + jit_foo = torch.jit.script(foo) + def test_torchbind_take_as_arg(self): global StackString # see [local resolution in python] StackString = torch.classes._TorchScriptTesting._StackString diff --git a/test/jit/test_tracer.py b/test/jit/test_tracer.py index 059f59ff8702..366ca1af69e6 100644 --- a/test/jit/test_tracer.py +++ b/test/jit/test_tracer.py @@ -15,7 +15,7 @@ sys.path.append(pytorch_test_dir) from torch.testing._internal.common_utils import suppress_warnings, \ skipIfCompiledWithoutNumpy, enable_profiling_mode_for_profiling_tests, \ - IS_SANDCASTLE, IS_WINDOWS + IS_SANDCASTLE, TemporaryFileName from 
torch.testing._internal.jit_utils import JitTestCase, enable_cpu_fuser, \ _tmp_donotuse_dont_inline_everything, _trace, RUN_CUDA, RUN_CUDA_MULTI_GPU from torch.testing._internal.common_cuda import with_tf32_off @@ -25,7 +25,6 @@ # Standard library from collections import namedtuple from itertools import chain -import tempfile from typing import Dict import warnings @@ -1215,15 +1214,14 @@ def foo(x): self.run_pass('inline', traced_tensor_size.graph) FileCheck().check("prim::device").run(traced_tensor_size.graph) - @unittest.skipIf(IS_WINDOWS, "temp file name on windows") def test_trace_save(self): def fn(x): return x + 2 def check(func): - with tempfile.NamedTemporaryFile() as f: - func.save(f.name) - loaded = torch.jit.load(f.name) + with TemporaryFileName() as fname: + func.save(fname) + loaded = torch.jit.load(fname) input = torch.randn(2, 2) self.assertEqual(func(input), loaded(input)) diff --git a/test/mobile/op_deps/simple_ops.cpp b/test/mobile/op_deps/simple_ops.cpp index 3651d1b05353..a76c58838a72 100644 --- a/test/mobile/op_deps/simple_ops.cpp +++ b/test/mobile/op_deps/simple_ops.cpp @@ -80,7 +80,7 @@ namespace { // cares about the name TORCH_LIBRARY(_test, m) { m.def("AA(Tensor self) -> Tensor"); - m.impl("AA", torch::CppFunction::makeUnboxedOnly(AA_op)); + m.impl("AA", torch::CppFunction::makeFromUnboxedFunction(AA_op)); m.def("BB(Tensor self) -> Tensor"); m.impl("BB", TORCH_FN(BB_op)); @@ -97,10 +97,10 @@ TORCH_LIBRARY_FRAGMENT(_test, m) { } TORCH_LIBRARY_IMPL(_test, CPU, m) { - m.impl_UNBOXED("EE", EE_op); + m.impl("EE", EE_op); m.impl("FF", torch::dispatch(DispatchKey::CPU, - torch::CppFunction::makeUnboxedOnly(FF_op)) + torch::CppFunction::makeFromUnboxedFunction(FF_op)) ); m.impl("GG", torch::dispatch(DispatchKey::CPU, diff --git a/test/onnx/pytorch_helper.py b/test/onnx/pytorch_helper.py index 17afa53b57e4..e027e0393ba7 100644 --- a/test/onnx/pytorch_helper.py +++ b/test/onnx/pytorch_helper.py @@ -23,7 +23,7 @@ def PyTorchModule(helper, model, sample_arguments, caffe2_inputs, prefix_name=No """ Embed an ONNX-exportable PyTorch Model into a Caffe2 model being built. 
- Arguments: + Args: helper (caffe2.python.core.ModelHelder): the model helper where this imported network should be inserted model (torch.nn.Module): the model to be exported diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index b0c7143d0129..26896bc17863 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -1554,6 +1554,35 @@ def forward(self, x, update): update = torch.randn(4, 1, 3, 2) self.run_test(IndexPutModel2(), (x, update)) + @skipIfUnsupportedMinOpsetVersion(11) + def test_index_put_loop(self): + @torch.jit.script + def ngram_attention_bias(sequence_length: int, ngram: int, device: torch.device, dtype: torch.dtype): + bias = torch.ones((ngram, sequence_length), device=device, dtype=dtype) * float("-inf") + for stream_idx in range(ngram): + for i in range(sequence_length): + bias[stream_idx, i] = 5 + return bias + + class ScriptModel(torch.nn.Module): + def __init__(self): + super(ScriptModel, self).__init__() + self.ngram = 2 + self.max_target_positions = 512 + + def forward(self, hidden_states): + seq_length, batch_size = hidden_states.shape[:2] + predict_causal_mask = ngram_attention_bias( + self.max_target_positions, self.ngram, hidden_states.device, hidden_states.dtype + ) + predict_causal_mask = predict_causal_mask[:, :seq_length] + return predict_causal_mask + + x = torch.randn(6, 2) + y = torch.randn(4, 1) + self.run_test(ScriptModel(), x, input_names=['x'], + dynamic_axes={'x': {0: 'seq_length', 1: 'batch_size'}}, test_with_inputs=[y]) + @skipIfUnsupportedMinOpsetVersion(11) def test_copy_(self): class CopyModel(torch.nn.Module): @@ -2105,6 +2134,31 @@ def forward(self, input): model = VarianceUnbiased() self.run_test(model, x) + def test_var_mean_mixed_dims(self): + class ReverseDims(torch.nn.Module): + def forward(self, input): + return torch.var_mean(input, dim=(2, 1), unbiased=False) + + x = torch.randn(2, 3, 4) + model = ReverseDims() + self.run_test(model, x) + + class SkipDims(torch.nn.Module): + def forward(self, input): + return torch.var_mean(input, dim=(0, 2), unbiased=False) + + x = torch.randn(2, 3, 4) + model = SkipDims() + self.run_test(model, x) + + class NonZeroDims(torch.nn.Module): + def forward(self, input): + return torch.var_mean(input, dim=(1, 2), unbiased=False) + + x = torch.randn(2, 3, 4) + model = NonZeroDims() + self.run_test(model, x) + def test_var_mean_keepdim(self): class Variance(torch.nn.Module): def forward(self, input): diff --git a/test/onnx/verify.py b/test/onnx/verify.py index d32c074e617f..1e57afdd8d41 100644 --- a/test/onnx/verify.py +++ b/test/onnx/verify.py @@ -246,7 +246,7 @@ def verify(model, args, backend, verbose=False, training=torch.onnx.TrainingMode For reproducibility, we recommend explicitly setting PyTorch's seed before invoking this function. 
- Arguments: + Args: model (torch.nn.Module): the model to be exported and verified args (tuple of arguments): the inputs to the model, e.g., such that ``model(*args)`` is a valid diff --git a/test/quantization/test_quantize.py b/test/quantization/test_quantize.py index 067c35bd3c64..c47982f0c0cc 100644 --- a/test/quantization/test_quantize.py +++ b/test/quantization/test_quantize.py @@ -726,6 +726,20 @@ def forward(self, x): ref_res = ref_m(data) self.assertEqual(res, ref_res) + @skipIfNoFBGEMM + def test_convtranspose_per_channel_fails_early(self): + r""" + Verifies that attempting to quantize a ConvTranspose module with per-Channel + weight observers fails in the prepare step, as opposed to the convert step. + """ + m = torch.nn.Sequential(torch.nn.ConvTranspose2d(1, 1, 1)) + m.qconfig = torch.quantization.get_default_qconfig('fbgemm') + with self.assertRaises(AssertionError) as context: + mp = torch.quantization.prepare(m) + self.assertTrue( + str(context.exception) == + 'Per channel weight observer is not supported yet for ConvTranspose{n}d.') + @skipIfNoFBGEMM class TestPostTrainingDynamic(QuantizationTestCase): diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index 66324f928f04..7965b3cc88a4 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -64,7 +64,9 @@ ) from torch.testing._internal.common_quantized import ( + supported_qengines, override_qengines, + override_quantized_engine, ) from torch.testing._internal.common_distributed import skip_if_not_multigpu @@ -78,6 +80,7 @@ import operator import unittest import io +from typing import Callable class TestFuseFx(QuantizationTestCase): def test_fuse_conv_bn_relu(self): @@ -570,7 +573,16 @@ def forward(self, x): m = convert_fx(m) m(tensor_input) - def test_standalone_module(self): + def _test_standalone_module( + self, + interface_config, + prepare_count_check, + standalone_prepare_count_check, + convert_count_check, + standalone_convert_count_check): + """ Test standalone module with different quantized input/quantized output + configurations + """ class StandaloneModule(torch.nn.Module): def __init__(self): super().__init__() @@ -610,45 +622,32 @@ def forward(self, x): original_ref_m.conv2.weight = torch.nn.Parameter(original_m.standalone.conv.weight.detach()) original_ref_m.conv2.bias = torch.nn.Parameter(original_m.standalone.conv.bias.detach()) - qconfig_dict = {"": default_qconfig} - config_name = {"standalone_module_name": [("standalone", None, None)]} - config_class = {"standalone_module_class": [(StandaloneModule, None, None)]} - for prepare_config in [config_name, config_class]: + for is_name in [True, False]: + if is_name: + prepare_config = { + "standalone_module_name": [("standalone", None, interface_config)] + } + else: + prepare_config = { + "standalone_module_class": [(StandaloneModule, None, interface_config)] + } + original_m_copy = copy.deepcopy(original_m) original_ref_m_copy = copy.deepcopy(original_ref_m) + + qconfig_dict = {"": default_qconfig} # check prepared model m = prepare_fx( original_m_copy, qconfig_dict, prepare_custom_config_dict=prepare_config) # calibration m(data) - # input and output of first conv, observer for standalone module - # will be inserted in the standalone module itself - count_check = { - ns.call_module(torch.quantization.MinMaxObserver): 2 - } - self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) - # for input and output of conv in the standalone module - count_check = { - 
ns.call_module(torch.quantization.MinMaxObserver): 2 - } - self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) + self.checkGraphModuleNodes(m, expected_node_occurrence=prepare_count_check) + self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=standalone_prepare_count_check) # check converted/quantized model m = convert_fx(m) - count_check = { - ns.call_function(torch.quantize_per_tensor) : 1, - ns.call_module(nnq.Conv2d) : 1, - ns.call_method('dequantize') : 1, - } - self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) - count_check = { - # standalone module will take float as input and output - # so we'll see quantize and dequantize in the modoule - ns.call_function(torch.quantize_per_tensor) : 1, - ns.call_module(nnq.Conv2d): 1, - ns.call_method('dequantize') : 1, - } - self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) + self.checkGraphModuleNodes(m, expected_node_occurrence=convert_count_check) + self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=standalone_convert_count_check) res = m(data) # quantize the reference model @@ -658,6 +657,76 @@ def forward(self, x): ref_res = ref_m(data) self.assertEqual(res, ref_res) + def test_standalone_module_float_interface(self): + float_interface_config = { + "input_quantized_idxs": [], # float input + "output_quantized_idxs": [], # float output + } + interface_config = float_interface_config + # input and output of first conv, observer for standalone module + # will be inserted in the standalone module itself + prepare_count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 2 + } + # for input and output of conv in the standalone module + standalone_prepare_count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 2 + } + convert_count_check = { + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d) : 1, + ns.call_method("dequantize") : 1, + } + standalone_convert_count_check = { + # standalone module will take float as input and output + # so we'll see quantize and dequantize in the modoule + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d): 1, + ns.call_method("dequantize") : 1, + } + self._test_standalone_module( + interface_config, + prepare_count_check, + standalone_prepare_count_check, + convert_count_check, + standalone_convert_count_check) + + def test_standalone_module_quantized_interface(self): + quantized_interface_config = { + "input_quantized_idxs": [0], # quantized input + "output_quantized_idxs": [0], # quantized output + } + interface_config = quantized_interface_config + # observer for input and output of first conv + prepare_count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 2 + } + # for output of conv in the standalone module + standalone_prepare_count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 1 + } + convert_count_check = { + # quantizing input for conv + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d) : 1, + # dequantizing output of standalone module + ns.call_method("dequantize") : 1, + } + standalone_convert_count_check = { + # quantization of input happens in parent module + # quantization of output happens in the quantized conv module + ns.call_function(torch.quantize_per_tensor) : 0, + ns.call_module(nnq.Conv2d): 1, + # dequantization for output happens in parent module + ns.call_method("dequantize") : 0, + } + self._test_standalone_module( + interface_config, + 
prepare_count_check, + standalone_prepare_count_check, + convert_count_check, + standalone_convert_count_check) + @skipIfNoFBGEMM def test_qconfig_none(self): class M(torch.nn.Module): @@ -1275,12 +1344,27 @@ def test_fp32_input_fp32_output(self): self._test_quantized_inputs_outputs( prepare_custom_config_dict, prepare_count_check, convert_count_check) + @skipIfNoFBGEMM + def test_convtranspose_per_channel_fails_early(self): + r""" + Verifies that attempting to quantize a ConvTranspose module with per-Channel + weight observers fails in the prepare step, as opposed to the convert step. + """ + m = torch.nn.Sequential(torch.nn.ConvTranspose2d(1, 1, 1)) + m.eval() + qconfig_dict = {'': torch.quantization.get_default_qconfig('fbgemm')} + with self.assertRaises(AssertionError) as context: + mp = prepare_fx(m, qconfig_dict) + self.assertTrue( + str(context.exception) == + 'Per channel weight observer is not supported yet for ConvTranspose{n}d.') + @skipIfNoFBGEMM class TestQuantizeFxOps(QuantizationTestCase): """Unit tests for individual ops """ @skipIfNoFBGEMM - def test_linear(self): + def test_linear_module(self): class ModuleLinear(torch.nn.Module): def __init__(self, has_relu=False, f_relu=False): super(ModuleLinear, self).__init__() @@ -1296,27 +1380,9 @@ def __init__(self, has_relu=False, f_relu=False): def forward(self, x): return self.relu(self.linear(x)) - class FuncLinear(torch.nn.Module): - def __init__(self, has_relu=False, f_relu=False): - super(FuncLinear, self).__init__() - self.w = torch.randn(4, 30) - self.b = torch.randn(4) - if has_relu: - if f_relu: - self.relu = F.relu - else: - self.relu = torch.nn.ReLU() - else: - self.relu = torch.nn.Identity() - - def forward(self, x): - return self.relu(F.linear(x, self.w, self.b)) - data = (torch.rand((1, 30), dtype=torch.float),) options = itertools.product( [(ModuleLinear(has_relu=False), True)], - # TODO: enable after raw `tensor` is supported in fx - # (FuncLinear(has_relu=False), False)], self.all_quant_types) quantized_nodes = { # is_module @@ -1327,12 +1393,6 @@ def forward(self, x): # note that we are checking the final result QuantType.QAT: ns.call_module(nnq.Linear), }, - False: { - # quant_type: - QuantType.DYNAMIC: ns.call_function(torch.ops.quantized.linear_dynamic), - QuantType.STATIC: ns.call_function(torch.ops.quantized.linear), - QuantType.QAT: ns.call_function(torch.ops.quantized.linear), - } } for (model, is_module), quant_type in options: self.checkGraphModeFxOp( @@ -1341,10 +1401,58 @@ def forward(self, x): for f_relu, quant_type in itertools.product([True, False], [QuantType.STATIC, QuantType.QAT]): for model, quantized_node in [ (ModuleLinear(has_relu=True, f_relu=f_relu), ns.call_module(nniq.LinearReLU))]: - # TODO: support functional linear + relu fusion - # (FuncLinear(has_relu=True, f_relu=f_relu), ns.call_function(torch.ops.quantized.linear_relu))]: self.checkGraphModeFxOp(model, data, quant_type, quantized_node) + @skipIfNoFBGEMM + def test_linear_functional(self): + + class FuncLinear(torch.nn.Module): + def __init__(self, use_bias): + super(FuncLinear, self).__init__() + self.w = torch.randn(4, 30) + self.b = torch.randn(4) + self.use_bias = use_bias + + def forward(self, x): + if self.use_bias: + x = F.linear(x, self.w, self.b) + else: + x = F.linear(x, self.w) + return x + + data = (torch.rand((1, 30), dtype=torch.float),) + quant_type_to_qlinear_fun = { + QuantType.DYNAMIC: ns.call_function(torch.ops.quantized.linear_dynamic), + QuantType.STATIC: ns.call_function(torch.ops.quantized.linear), + 
QuantType.QAT: ns.call_function(torch.ops.quantized.linear), + } + quant_type_to_prepare_expected_node_occurrence = { + QuantType.DYNAMIC: {}, + # There should be 3 observers: after input, weight and activation. + QuantType.STATIC: { + ns.call_module(torch.quantization.HistogramObserver): 2, + ns.call_module(torch.quantization.PerChannelMinMaxObserver): 1, + }, + # There should be 3 observers: after input, weight and activation. + QuantType.QAT: { + ns.call_module(torch.quantization.FakeQuantize): 3, + }, + } + options = itertools.product( + (QuantType.DYNAMIC, QuantType.STATIC, QuantType.QAT), + (True, False), # use_bias + ) + for quant_type, use_bias in options: + model = FuncLinear(use_bias) + qlinear_fun = quant_type_to_qlinear_fun[quant_type] + prepare_expected_node_occurrence = \ + quant_type_to_prepare_expected_node_occurrence[quant_type] + self.checkGraphModeFxOp( + model, data, quant_type, qlinear_fun, + prepare_expected_node_occurrence=prepare_expected_node_occurrence) + + # TODO(future PR): test for Linear + ReLU fusion + @skipIfNoFBGEMM def test_conv_module(self): conv_module = {1 : torch.nn.Conv1d, 2 : torch.nn.Conv2d, 3 : torch.nn.Conv3d} @@ -1387,20 +1495,6 @@ def test_conv2d_functional(self): expected_node_occurrence=expected_node_occurrence, ) - def test_linear_functional_bias_not_observed(self): - data = (torch.rand((1, 4), dtype=torch.float),) - for bias in [True, False]: - linear = torch.nn.Linear(4, 4, bias=bias) - # There should be 3 observers: after input, weight and activation. - expected_node_occurrence = { - ns.call_module(torch.quantization.HistogramObserver): 2, - ns.call_module(torch.quantization.PerChannelMinMaxObserver): 1, - } - self.checkGraphModeFxOp( - linear, data, QuantType.STATIC, - prepare_expected_node_occurrence=expected_node_occurrence, - ) - @skipIfNoFBGEMM def test_quantized_conv_relu(self): """tests for conv1d_relu/conv2d_relu/conv3d_relu""" @@ -2365,6 +2459,42 @@ def test_rnn(self): [100, -155]], dtype=torch.float).unsqueeze(0).repeat(niter, 1, 1) self._test_rnn_impl(qconfigs, RNNDynamicModel, module_type_strs, module_types, sample_input) + def _test_conv_transpose_impl( + self, float_cls: Callable, q_cls: Callable, data: torch.Tensor): + with override_quantized_engine('qnnpack'): + # Create fp32 versions of FX and Eager models + m1 = torch.nn.Sequential(float_cls(1, 1, 1)) + m2 = torch.nn.Sequential(float_cls(1, 1, 1)) + m2.load_state_dict(m1.state_dict()) + m2 = torch.quantization.QuantWrapper(m2) + # FX graph + q_result1 = self.checkGraphModeFxOp( + m1, (data,), QuantType.STATIC, + expected_node_occurrence={ + ns.call_module(q_cls): 1, + }) + # Eager + m2.qconfig = get_default_qconfig(torch.backends.quantized.engine) + m2.eval() + m2p = torch.quantization.prepare(m2) + m2p(data) + m2q = torch.quantization.convert(m2p) + q_result2 = m2q(data) + # verify results match + self.assertTrue(torch.allclose(q_result1, q_result2)) + + @unittest.skipUnless('qnnpack' in supported_qengines, + "This Pytorch Build has not been built with or does not support QNNPACK") + def test_conv_transpose_1d(self): + self._test_conv_transpose_impl( + torch.nn.ConvTranspose1d, nnq.ConvTranspose1d, torch.randn(4, 1, 4)) + + @unittest.skipUnless('qnnpack' in supported_qengines, + "This Pytorch Build has not been built with or does not support QNNPACK") + def test_conv_transpose_2d(self): + self._test_conv_transpose_impl( + torch.nn.ConvTranspose2d, nnq.ConvTranspose2d, torch.randn(4, 1, 4, 4)) + class TestQuantizeFxModels(QuantizationTestCase): def _test_model_impl( diff 
--git a/test/quantization/test_quantized_op.py b/test/quantization/test_quantized_op.py index c676ccc0f793..a192eddca234 100644 --- a/test/quantization/test_quantized_op.py +++ b/test/quantization/test_quantized_op.py @@ -23,7 +23,7 @@ from torch.testing._internal.common_utils import IS_PPC, TEST_WITH_UBSAN, IS_MACOS from torch.testing._internal.common_quantization import skipIfNoFBGEMM from torch.testing._internal.common_quantized import _quantize, _dequantize, _calculate_dynamic_qparams, \ - override_quantized_engine, supported_qengines, override_qengines + override_quantized_engine, supported_qengines, override_qengines, _snr from torch.testing._internal.common_quantized import qengine_is_qnnpack from torch.quantization import PerChannelMinMaxObserver @@ -2314,6 +2314,87 @@ def test_advanced_indexing(self): torch.quantize_per_tensor(x_fp32_s4, scale, zp, dtype) self.assertEqual(x_q_s4, x_fp32_s4_ref) + @override_qengines + def test_custom_module_lstm(self): + qengine = torch.backends.quantized.engine + + batch_size = 4 + seq_len = 8 + input_size = 12 + + hidden_size = 8 + num_layers = 2 + + dropout = 0 # This is not supported + + Bias = [False, True] + Batch_first = [False, True] + Bidirectional = [False, True] + + dtype = np.uint8 + qtype = torch.quint8 + + custom_module_config = { + 'float_to_observed_custom_module_class': { + torch.nn.LSTM: torch.nn.quantizable.LSTM + } + } + + x = np.random.randn(seq_len, batch_size, input_size) + scale, zero_point = _calculate_dynamic_qparams(x, dtype=dtype) + x = torch.from_numpy(x).to(torch.float) + qx = torch.quantize_per_tensor(x, scale=scale, zero_point=zero_point, + dtype=qtype) + x = qx.dequantize() + + with torch.no_grad(): + for bias, batch_first, bidirectional in itertools.product( + Bias, Batch_first, Bidirectional): + # Assume 12dB is sufficient for functional equivalence + # Without the bias, linear performs poorly + min_power = 10 if bias else 5 + max_mse = 5e-6 if bias else 5e-1 + + if batch_first: + x = x.reshape(batch_size, seq_len, input_size) + qx = qx.reshape(batch_size, seq_len, input_size) + else: + x = x.reshape(seq_len, batch_size, input_size) + qx = qx.reshape(seq_len, batch_size, input_size) + + lstm = torch.nn.Sequential( + torch.nn.LSTM(input_size, hidden_size, + num_layers=num_layers, + bias=bias, batch_first=batch_first, + dropout=dropout, + bidirectional=bidirectional)) + lstm.eval() + y_ref = lstm(x) + + # Prepare + lstm.qconfig = torch.quantization.get_default_qconfig(qengine) + lstm_prepared = torch.quantization.prepare( + lstm, prepare_custom_config_dict=custom_module_config) + self.assertTrue(hasattr(lstm_prepared[0], 'layers')) + self.assertEqual(num_layers, len(lstm_prepared[0].layers)) + + # Calibrate + y = lstm_prepared(x) + self.assertEqual(y_ref, y) + + # Quantize + lstm_quantized = torch.quantization.convert(lstm_prepared) + qy = lstm_quantized(qx) + + snr = _snr(y, qy) + snr = [snr[0]] + snr[1] + + for signal, mse, power in snr: + self.assertTrue( + power > min_power or mse < max_mse, + msg=(f"Error is too high: SNR(dB): {power}, " + f"Signal: {signal}, MSE: {mse}")) + class TestDynamicQuantizedLinear(TestCase): """Tests the correctness of the dynamic quantized linear and linear_relu op.""" @@ -3346,7 +3427,7 @@ def _make_qconv_tensors( self, batch_size, input_channels_per_group, input_feature_map_shape, output_channels_per_group, groups, kernels, strides, pads, dilations, X_scale, X_zero_point, W_scale, W_zero_point, - use_bias, use_channelwise, use_transpose, memory_format=torch.contiguous_format + 
use_bias, use_channelwise, use_transpose ): assert not (use_channelwise and use_transpose), \ "Cannot generate channelwise qconv_transpose_tensors " @@ -3394,7 +3475,6 @@ def _make_qconv_tensors( (batch_size, input_channels,) + input_feature_map_shape, ) X = X_scale * (X_init - X_zero_point).float() - X = X.to(memory_format=memory_format) if use_channelwise: W_shape = (-1, 1) + (1,) * len(kernels) @@ -3427,15 +3507,13 @@ def _test_qconv_impl( input_channels_per_group, input_feature_map_shape, output_channels_per_group, groups, kernels, strides, pads, o_pads, dilations, X_scale, X_zero_point, W_scale, W_zero_point, Y_scale, - Y_zero_point, use_bias, use_relu, use_channelwise, use_transpose, - memory_format=torch.contiguous_format + Y_zero_point, use_bias, use_relu, use_channelwise, use_transpose ): (X, W), (X_q, W_q), bias_float = self._make_qconv_tensors( batch_size, input_channels_per_group, input_feature_map_shape, output_channels_per_group, groups, kernels, strides, pads, dilations, X_scale, X_zero_point, W_scale, - W_zero_point, use_bias, use_channelwise, use_transpose, - memory_format) + W_zero_point, use_bias, use_channelwise, use_transpose) # Assign weights W = W_q.dequantize() X = X_q.dequantize() @@ -3483,14 +3561,6 @@ def _test_qconv_impl( pads: {pads}, o_pads: {o_pads}, dilations: {dilations}, groups: {groups}, y_s: {Y_scale}, y_zp: {Y_zero_point}''') - # fbgemm for now forces output to be NHWC (channels last) to opportunistically - # improve performance - if torch.backends.quantized.engine == 'qnnpack': - # Make sure memory format is preserved - self.assertEqual( - X_q.is_contiguous(memory_format=memory_format), - Y_q.is_contiguous(memory_format=memory_format)) - # Return the quantized data for later reuse return X_q, W_q, bias_float @@ -3563,14 +3633,12 @@ def test_qconv2d( dilations, groups, ) - for memory_format in (torch.contiguous_format, torch.channels_last): - self._test_qconv_impl( - qconv, qconv_prepack, conv_op, batch_size, - input_channels_per_group, (height, width), - output_channels_per_group, groups, kernels, strides, pads, None, - dilations, X_scale, X_zero_point, W_scale, W_zero_point, - Y_scale, Y_zero_point, use_bias, use_relu, use_channelwise, False, - memory_format) + self._test_qconv_impl( + qconv, qconv_prepack, conv_op, batch_size, + input_channels_per_group, (height, width), + output_channels_per_group, groups, kernels, strides, pads, None, + dilations, X_scale, X_zero_point, W_scale, W_zero_point, + Y_scale, Y_zero_point, use_bias, use_relu, use_channelwise, False) """Tests the correctness of quantized convolution op.""" @given(batch_size=st.integers(1, 3), @@ -4163,7 +4231,6 @@ def test_qconv3d_unpack( (stride_d, stride_h, stride_w), (pad_d, pad_h, pad_w), (o_pad, o_pad, o_pad), channelwise) - class TestPadding(TestCase): @given(batch_size=st.integers(1, 64), channels=st.integers(1, 64), diff --git a/test/quantization/test_workflow_module.py b/test/quantization/test_workflow_module.py index 22751697cd1d..8a70ae149c29 100644 --- a/test/quantization/test_workflow_module.py +++ b/test/quantization/test_workflow_module.py @@ -10,6 +10,7 @@ PlaceholderObserver, NoopObserver, FakeQuantize, + FixedQParamsFakeQuantize, default_debug_qconfig, default_observer, default_per_channel_weight_observer, @@ -504,6 +505,20 @@ def test_observer_qparams_respects_device_affinity(self): self.assertEqual(x.device, scale.device) self.assertEqual(x.device, zero_point.device) + def test_zero_numel(self): + obs_list = [MinMaxObserver, MovingAverageMinMaxObserver, + 
PerChannelMinMaxObserver, + MovingAveragePerChannelMinMaxObserver, HistogramObserver, + FakeQuantize, FixedQParamsFakeQuantize] + for obs_cls in obs_list: + if obs_cls is FixedQParamsFakeQuantize: + obs = obs_cls(0.1, 0) + else: + obs = obs_cls() + x = torch.Tensor() + # verify no crash + x = obs(x) + # HistogramObserver that works like it does on master class _ReferenceHistogramObserver(HistogramObserver): diff --git a/test/run_test.py b/test/run_test.py index 843346824036..93484f7a583e 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -19,6 +19,7 @@ from typing import Dict, Optional TESTS = [ + 'test_type_hints', 'test_autograd', 'benchmark_utils/test_benchmark_utils', 'test_binary_ufuncs', @@ -72,7 +73,6 @@ 'test_testing', 'test_torch', 'test_type_info', - 'test_type_hints', 'test_unary_ufuncs', 'test_utils', 'test_view_ops', @@ -617,7 +617,7 @@ def find_test_index(test, selected_tests, find_last_index=False): If :attr:`test`='torch' and :attr:`find_last_index`=False, result should be **2**. If :attr:`test`='torch' and :attr:`find_last_index`=True, result should be **4**. - Arguments: + Args: test (str): Name of test to lookup selected_tests (list): List of tests find_last_index (bool, optional): should we lookup the index of first or last diff --git a/test/test_autograd.py b/test/test_autograd.py index a8a130596855..9f5925212757 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -32,7 +32,8 @@ from torch.testing._internal.common_utils import (TestCase, run_tests, skipIfNoLapack, suppress_warnings, slowTest, load_tests, random_symmetric_matrix, - IS_WINDOWS, IS_MACOS, CudaMemoryLeakCheck) + IS_WINDOWS, IS_MACOS, CudaMemoryLeakCheck, + TemporaryFileName) from torch.autograd import Variable, Function, detect_anomaly, kineto_available from torch.autograd.function import InplaceFunction import torch.autograd.forward_ad as fwAD @@ -1945,60 +1946,6 @@ def test_slice_expanded_v(self): expected[3:5] = v_expanded self.assertEqual(result, expected) - def test_stack(self): - x = torch.randn(10, 10, requires_grad=True) - y = torch.randn(10, 10, requires_grad=True) - z = torch.randn(10, 10, requires_grad=True) - stacked = torch.stack([x, y, z], 0) - grad = torch.randn(3, 10, 10) - stacked.backward(grad) - self.assertEqual(x.grad, grad[0]) - self.assertEqual(y.grad, grad[1]) - self.assertEqual(z.grad, grad[2]) - - def test_hstack(self): - x = torch.randn(10, 10, requires_grad=True) - y = torch.randn(10, 10, requires_grad=True) - z = torch.randn(10, 10, requires_grad=True) - stacked = torch.hstack([x, y, z]) - grad = torch.randn(10, 30) - stacked.backward(grad) - self.assertEqual(x.grad, grad[:, 0:10]) - self.assertEqual(y.grad, grad[:, 10:20]) - self.assertEqual(z.grad, grad[:, 20:30]) - - x = torch.randn(10, requires_grad=True) - y = torch.randn(10, requires_grad=True) - z = torch.randn(10, requires_grad=True) - stacked = torch.hstack([x, y, z]) - grad = torch.randn(30) - stacked.backward(grad) - self.assertEqual(x.grad, grad[0:10]) - self.assertEqual(y.grad, grad[10:20]) - self.assertEqual(z.grad, grad[20:30]) - - def test_vstack(self): - x = torch.randn(10, 10, requires_grad=True) - y = torch.randn(10, 10, requires_grad=True) - z = torch.randn(10, 10, requires_grad=True) - stacked = torch.vstack([x, y, z]) - grad = torch.randn(30, 10) - stacked.backward(grad) - self.assertEqual(x.grad, grad[0:10]) - self.assertEqual(y.grad, grad[10:20]) - self.assertEqual(z.grad, grad[20:30]) - - def test_dstack(self): - x = torch.randn(10, 10, requires_grad=True) - y = torch.randn(10, 10, 
requires_grad=True) - z = torch.randn(10, 10, requires_grad=True) - stacked = torch.dstack([x, y, z]) - grad = torch.randn(10, 10, 3) - stacked.backward(grad) - self.assertEqual(x.grad, grad[:, :, 0]) - self.assertEqual(y.grad, grad[:, :, 1]) - self.assertEqual(z.grad, grad[:, :, 2]) - def test_unbind(self): stacked = torch.randn(3, 10, 10, requires_grad=True) x, y, z = stacked.unbind() @@ -2970,6 +2917,20 @@ def run_test(input_size, norm_deg): run_test((10,), 3) run_test((10,), 1) run_test((10,), 1.5) + run_test((10,), inf) + + def test_norm_inf_subgradient(self): + def run_test(input, expected, dim=None): + x = torch.tensor(input, requires_grad=True) + out = x.norm(inf, dim=dim, keepdim=True) + out.backward(torch.ones(out.size())) + self.assertEqual(x.grad, expected) + + run_test([0., 0., 0.], [0., 0., 0.]) + run_test([1., 0., 1.], [0.5, 0., 0.5]) + run_test([[1., 0., 1.], [0., 1., 1.]], [[0.25, 0., 0.25], [0., 0.25, 0.25]]) + run_test([[1., 0., 1.], [0., 1., 0.]], [[0.5, 0., 0.5], [0., 1., 0.]], (1,)) + run_test(torch.ones((2, 2, 2)), torch.full((2, 2, 2), 0.25), (0, 2)) def test_pow_zero_tensor_gradient(self): def run_test(input_size, exponent): @@ -3015,18 +2976,17 @@ def gen_matrices(p): gradgradcheck(torch.chain_matmul, gen_matrices([3, 5, 2, 6])) gradgradcheck(torch.chain_matmul, gen_matrices([6, 2, 4, 8, 10])) - @unittest.skipIf(IS_WINDOWS, """File open permission error on Windows, - https://github.com/pytorch/pytorch/issues/34086""") def test_profiler_tracing(self): t1, t2 = torch.ones(1), torch.ones(1) with torch.autograd.profiler.profile(use_kineto=kineto_available()) as prof: torch.add(t1, t2) - with tempfile.NamedTemporaryFile(mode="w+") as f: - prof.export_chrome_trace(f.name) + with TemporaryFileName(mode="w+") as fname: + prof.export_chrome_trace(fname) # read the trace and expect valid json # if the JSON generated by export_chrome_trace is not valid, this will throw and fail the test. - json.load(f) + with io.open(fname, 'r') as f: + json.load(f) # Same test but for cuda. if not torch.cuda.is_available(): @@ -3037,10 +2997,11 @@ def test_profiler_tracing(self): with torch.autograd.profiler.profile(use_cuda=True, use_kineto=kineto_available()) as prof: torch.add(t1, t2) - with tempfile.NamedTemporaryFile(mode="w+") as f: - prof.export_chrome_trace(f.name) + with TemporaryFileName(mode="w+") as fname: + prof.export_chrome_trace(fname) # Now validate the json - json.load(f) + with io.open(fname, 'r') as f: + json.load(f) def test_profiler(self): x = torch.randn(10, 10) @@ -4980,14 +4941,17 @@ def assert_only_first_requires_grad(res): return_counts=return_counts) assert_only_first_requires_grad(res) - - -def index_variable(shape, max_indices): - if not isinstance(shape, tuple): - shape = (shape,) - index = torch.rand(*shape).mul_(max_indices).floor_().long() - return index - + def test_linalg_qr_r(self): + # torch.linalg.qr(mode='r') returns only 'r' and discards 'q', but + # without 'q' you cannot compute the backward pass. Check that + # linalg_qr_backward complains cleanly in that case. 
+ inp = torch.randn((5, 7), requires_grad=True) + q, r = torch.linalg.qr(inp, mode='r') + assert q.shape == (0,) # empty tensor + b = torch.sum(r) + with self.assertRaisesRegex(RuntimeError, + "linalg_qr_backward: cannot compute backward"): + b.backward() def index_perm_variable(shape, max_indices): if not isinstance(shape, tuple): @@ -4996,20 +4960,6 @@ def index_perm_variable(shape, max_indices): index = torch.randperm(max_indices).narrow(0, 0, reduce(mul, shape)).view(shape) return index - -def gather_variable(shape, index_dim, max_indices, duplicate=False): - assert len(shape) == 2 - assert index_dim < 2 - batch_dim = 1 - index_dim - index = torch.LongTensor(*shape) - for i in range(shape[index_dim]): - index.select(index_dim, i).copy_( - torch.randperm(max_indices)[:shape[batch_dim]]) - if duplicate: - index.select(batch_dim, 0).copy_(index.select(batch_dim, 1)) - return index - - def bernoulli_scalar(): return torch.tensor(0, dtype=torch.uint8).bernoulli_() @@ -5085,7 +5035,8 @@ def run_functional_checks(test_case, test_name, name, apply_fn, run_grad_checks, 'cosh', '__rmul__', 'sgn', 'abs', 'dot', 'vdot', 'tensor_split', 'matmul', 'bmm', 'mv', 'ger', 'diagonal', 'atan', 'angle', 'tanh', 'fill_', 'sub', 'exp', 'mean', 'inverse', 'triangular_solve', 'solve', 'addcmul', - 'addcdiv', 'linalg.tensorinv', 'matrix_exp', 'qr', ] + separate_complex_tests + 'addcdiv', 'linalg.tensorinv', 'matrix_exp', 'qr', + 'narrow', 'swapaxes', 'swapdims', 'tensor_split', 'tile'] + separate_complex_tests def add_test( name, @@ -7357,18 +7308,6 @@ def test_strided_leaf_grad_layout(self, device): (c * d).sum().backward() self.assertEqual(c.grad.stride(), (2, 1)) - def test_movedim(self, device): - for fn in [torch.movedim, torch.moveaxis]: - x = torch.randn(4, 3, 2, 1, dtype=torch.double, device=device, requires_grad=True) - - # Positive axis - gradcheck(lambda x: fn(x, (0, 1, 2, 3), (3, 2, 1, 0)), x) - gradgradcheck(lambda x: fn(x, (0, 1, 2, 3), (3, 2, 1, 0)), x) - - # Negative axis - gradcheck(lambda x: fn(x, (0, -1, -2, -3), (-3, -2, -1, -0)), x) - gradgradcheck(lambda x: fn(x, (0, -1, -2, -3), (-3, -2, -1, -0)), x) - def _test_atleast(self, device, torch_fn): # 0-dim s = torch.tensor(0.5, dtype=torch.double, requires_grad=True) diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py index 5739fb569628..2ff12396701e 100644 --- a/test/test_binary_ufuncs.py +++ b/test/test_binary_ufuncs.py @@ -1082,13 +1082,13 @@ def test_maximum_minimum_cross_device(self, device): ops = (torch.maximum, torch.minimum) for torch_op in ops: - with self.assertRaisesRegex(RuntimeError, + with self.assertRaisesRegex(RuntimeError, "Expected all tensors to be on the same device"): torch_op(a, b) - with self.assertRaisesRegex(RuntimeError, + with self.assertRaisesRegex(RuntimeError, "Expected all tensors to be on the same device"): - torch_op(b, a) + torch_op(b, a) # test cuda tensor and cpu scalar ops = ((torch.maximum, np.maximum), (torch.minimum, np.minimum)) @@ -2560,6 +2560,17 @@ def inplace_variant_helper(x, y): self.compare_with_numpy(torch_fn, reference_fn, t, exact_dtype=False) out_variant_helper(torch.xlogy, 0, t) + def test_xlogy_scalar_type_promotion(self, device): + # Test that python numbers don't participate in type promotion at the same + # priority level as 0-dim tensors + t = torch.randn((), dtype=torch.float32, device=device) + + self.assertEqual(t.dtype, torch.xlogy(t, 5).dtype) + self.assertEqual(t.dtype, torch.xlogy(t, 5.).dtype) + + self.assertEqual(t.dtype, torch.xlogy(5, t).dtype) + 
self.assertEqual(t.dtype, torch.xlogy(5., t).dtype) + @skipIf(not TEST_SCIPY, "Scipy required for the test.") def test_xlogy_bfloat16(self, device): def _compare_helper(x, y): diff --git a/test/test_bundled_inputs.py b/test/test_bundled_inputs.py index f57407c9b1d1..e12339f3acea 100644 --- a/test/test_bundled_inputs.py +++ b/test/test_bundled_inputs.py @@ -1,5 +1,7 @@ #!/usr/bin/env python3 import io +from typing import List + import torch import torch.utils.bundled_inputs from torch.testing._internal.common_utils import TestCase, run_tests @@ -27,7 +29,7 @@ def forward(self, arg): sm = torch.jit.script(SingleTensorModel()) original_size = model_size(sm) - get_expr = [] + get_expr : List[str] = [] samples = [ # Tensor with small numel and small storage. (torch.tensor([1]),), diff --git a/test/test_dataloader.py b/test/test_dataloader.py index 047297c438b7..c257dd8a2fd7 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -3,6 +3,7 @@ import errno import os import ctypes +import faulthandler import torch import gc import time @@ -34,18 +35,6 @@ else: warnings.warn(err_msg) -try: - import faulthandler - HAS_FAULTHANDLER = True -except ImportError: - HAS_FAULTHANDLER = False - err_msg = ("faulthandler not found. Some data loader tests use it for error " - "reporting (e.g., TestDataLoader.test_proper_exit).") - if IS_PYTORCH_CI: - raise ImportError(err_msg) from None - else: - warnings.warn(err_msg) - # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for # sharding on sandcastle. This line silences flake warnings @@ -86,9 +75,7 @@ JOIN_TIMEOUT = 60.0 # seconds -supported_multiprocessing_contexts = [None] -if torch.multiprocessing._supports_context: - supported_multiprocessing_contexts += list(torch.multiprocessing.get_all_start_methods()) +supported_multiprocessing_contexts = [None] + list(torch.multiprocessing.get_all_start_methods()) @unittest.skipIf( @@ -312,29 +299,25 @@ def test_iterable_dataset_err(self): # takes in dummy var so this can also be used as a `worker_init_fn` def set_faulthander_if_available(_=None): - if HAS_FAULTHANDLER: - faulthandler.enable(sys.__stderr__) - if not IS_WINDOWS: - # windows does not have faulthandler.register - # chain=False prevents the default behavior of killing the process - faulthandler.register(signal.SIGUSR1, file=sys.__stderr__, chain=False) + faulthandler.enable(sys.__stderr__) + if not IS_WINDOWS: + # windows does not have faulthandler.register + # chain=False prevents the default behavior of killing the process + faulthandler.register(signal.SIGUSR1, file=sys.__stderr__, chain=False) set_faulthander_if_available() # Process `pid` must have called `set_faulthander_if_available` def print_traces_of_all_threads(pid): - if HAS_FAULTHANDLER: - if not IS_WINDOWS: - # use the custom signal if available - os.kill(pid, signal.SIGUSR1) - else: - # otherwise we can still use the handler given by faulthandler.enable() - # at the cost of killing the process. - os.kill(pid, signal.SIGSEGV) + if not IS_WINDOWS: + # use the custom signal if available + os.kill(pid, signal.SIGUSR1) else: - # if there is no faulthandler, use SIGINT otherwise and hope for the best - os.kill(pid, signal.SIGINT) + # otherwise we can still use the handler given by faulthandler.enable() + # at the cost of killing the process. 
+ os.kill(pid, signal.SIGSEGV) + # wait in parent process to give subprocess some time to print time.sleep(5) @@ -1037,17 +1020,13 @@ def test_invalid_ctor_args_combinations(self): "batch_size=None option disables auto-batching and is mutually exclusive"): self._get_data_loader(self.dataset, batch_size=None, drop_last=True) - if torch.multiprocessing._supports_context: - valid_ctx = list(torch.multiprocessing.get_all_start_methods())[-1] - with self.assertRaisesRegex(ValueError, r"multi-process loading \(num_workers > 0\), but got"): - self._get_data_loader(self.dataset, num_workers=0, multiprocessing_context=valid_ctx) - with self.assertRaisesRegex(ValueError, "should specify a valid start method in"): - self._get_data_loader(self.dataset, num_workers=1, multiprocessing_context='bad') - with self.assertRaisesRegex(TypeError, "multiprocessing_context option should be a valid context "): - self._get_data_loader(self.dataset, num_workers=1, multiprocessing_context=object()) - else: - with self.assertRaisesRegex(ValueError, "multiprocessing_context relies on Python >= 3.4"): - self._get_data_loader(self.dataset, num_workers=1, multiprocessing_context='fork') + valid_ctx = list(torch.multiprocessing.get_all_start_methods())[-1] + with self.assertRaisesRegex(ValueError, r"multi-process loading \(num_workers > 0\), but got"): + self._get_data_loader(self.dataset, num_workers=0, multiprocessing_context=valid_ctx) + with self.assertRaisesRegex(ValueError, "should specify a valid start method in"): + self._get_data_loader(self.dataset, num_workers=1, multiprocessing_context='bad') + with self.assertRaisesRegex(TypeError, "multiprocessing_context option should be a valid context "): + self._get_data_loader(self.dataset, num_workers=1, multiprocessing_context=object()) # map-style sampler = torch.utils.data.SequentialSampler(self.dataset) @@ -1504,7 +1483,7 @@ def _test_sampler(self, **kwargs): def test_sampler(self): self._test_sampler() self._test_sampler(num_workers=4) - if not NO_MULTIPROCESSING_SPAWN and torch.multiprocessing._supports_context: + if not NO_MULTIPROCESSING_SPAWN: self._test_batch_sampler(num_workers=4, multiprocessing_context='spawn') def _test_batch_sampler(self, **kwargs): @@ -1529,7 +1508,7 @@ def _test_batch_sampler(self, **kwargs): def test_batch_sampler(self): self._test_batch_sampler() self._test_batch_sampler(num_workers=4) - if not NO_MULTIPROCESSING_SPAWN and torch.multiprocessing._supports_context: + if not NO_MULTIPROCESSING_SPAWN: self._test_batch_sampler(num_workers=4, multiprocessing_context='spawn') @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") diff --git a/test/test_dataset.py b/test/test_dataset.py index 2caa1a248435..a72b87cca555 100644 --- a/test/test_dataset.py +++ b/test/test_dataset.py @@ -90,7 +90,7 @@ def _collate_fn(batch): y = next(ds_iter) self.assertEqual(x, torch.tensor(sum(y), dtype=torch.float)) - collate_ds_nolen = CollateIterableDataset(ds_nolen) + collate_ds_nolen = CollateIterableDataset(ds_nolen) # type: ignore with self.assertRaises(NotImplementedError): len(collate_ds_nolen) ds_nolen_iter = iter(ds_nolen) @@ -144,7 +144,7 @@ def test_sampler_dataset(self): arrs = range(10) ds = IterDatasetWithLen(arrs) # Default SequentialSampler - sampled_ds = SamplerIterableDataset(ds) + sampled_ds = SamplerIterableDataset(ds) # type: ignore self.assertEqual(len(sampled_ds), 10) i = 0 for x in sampled_ds: @@ -152,7 +152,7 @@ def test_sampler_dataset(self): i += 1 # RandomSampler - random_sampled_ds = SamplerIterableDataset(ds, sampler=RandomSampler, 
replacement=True) + random_sampled_ds = SamplerIterableDataset(ds, sampler=RandomSampler, replacement=True) # type: ignore # Requires `__len__` to build SamplerDataset ds_nolen = IterDatasetWithoutLen(arrs) diff --git a/test/test_expecttest.py b/test/test_expecttest.py index 652a33c41869..5e2461797705 100644 --- a/test/test_expecttest.py +++ b/test/test_expecttest.py @@ -4,6 +4,7 @@ import string import textwrap import doctest +from typing import Dict, Any import hypothesis from hypothesis.strategies import text, integers, composite, sampled_from, booleans @@ -38,7 +39,7 @@ def test_replace_string_literal_roundtrip(self, t, raw, quote): r3 = {r}{quote}placeholder3{quote} """.format(r='r' if raw else '', quote=quote * 3) new_prog = expecttest.replace_string_literal(textwrap.dedent(prog), 2, t)[0] - ns = {} + ns : Dict[str, Any] = {} exec(new_prog, ns) msg = "program was:\n{}".format(new_prog) self.assertEqual(ns['r'], 'placeholder', msg=msg) # noqa: F821 diff --git a/test/test_fx.py b/test/test_fx.py index 5e285039a6dd..2511adc52c62 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -861,6 +861,11 @@ def forward(self, x, w): x, w = torch.rand(3, 4), torch.rand(4, 4) self.assertTrue(any(n.target == torch.relu for n in traced.graph.nodes)) + def test_empty_graph_codegen(self): + graph = torch.fx.Graph() + gm = torch.fx.GraphModule(torch.nn.Module(), graph) + self.assertEqual(gm(), None) + def test_sequential(self): m = torch.nn.Sequential(torch.nn.Conv2d(1, 1, 1)) gm = torch.fx.symbolic_trace(m) @@ -1196,5 +1201,18 @@ def forward(self, x): input = torch.rand(3, 4) self.assertEqual(traced(input), Pair(input, input)) + def test_return_type_exists(self): + class ReturnTypeModule(torch.nn.Module): + def other(self, x: List[str]) -> List[str]: + return x + + def forward(self, x: List[str]) -> List[str]: + return self.other(x) + + traced = symbolic_trace(ReturnTypeModule()) + self.assertIn("-> typing.List[str]", traced._code) + scripted = torch.jit.script(traced) + self.assertIn("-> List[str]", scripted.code) + if __name__ == '__main__': run_tests() diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py index 6e9c877b8de6..ac71d6037591 100644 --- a/test/test_fx_experimental.py +++ b/test/test_fx_experimental.py @@ -21,6 +21,7 @@ PartitionMode ) from torch.fx.experimental.fuser import fuse +from torch.fx.experimental import merge_matmul try: from torchvision.models import resnet18 @@ -844,6 +845,128 @@ def forward(self, a): for p_name in para_list: assert p_name in node.attrs_for_lowering + def test_merge_matmuls(self): + """ + A collection of test cases for torch.fx.experimental.merge_matmul, + a graph transformation that merges matrix multiplication operations. + """ + # Utility function for counting matmuls for test assertions. + def _count_matmuls(mod): + gm = torch.fx.symbolic_trace(mod) + + num_matmuls = 0 + for node in gm.graph.nodes: + if node.target == torch.matmul: + num_matmuls += 1 + + return num_matmuls + + # Simple test case in which there are two matmuls of the same size to merge. + class SimpleMergeMatmulModule(torch.nn.Module): + def __init__(self, rhs): + super().__init__() + self.rhs = rhs + + def forward(self, x, y): + a = torch.matmul(x, self.rhs) + b = torch.matmul(y, self.rhs) + return a + b + + # Initialize inputs. + a = torch.randn(3, 3) + b = torch.randn(3, 3) + + # Initialize RHS for matmuls. + rhs = torch.randn(3, 4) + + # Construct SimpleMergeMatmulModule and call merge_matmul on it. 
+ module = SimpleMergeMatmulModule(rhs) + opt_module = merge_matmul.merge_matmul(module) + + # Numerical correctness check. + before = module(a, b) + after = opt_module(a, b) + before.allclose(after) + + # Basic graph structure check; original module should have 2 matmuls + # and optimized module should have 1. + self.assertEqual(_count_matmuls(module), 2) + self.assertEqual(_count_matmuls(opt_module), 1) + + # Test case in which there are multiple matmuls of different sizes to merge. + class FiveMergeMatmulModule(torch.nn.Module): + def __init__(self, rhs): + super().__init__() + self.rhs = rhs + + def forward(self, a, b, c, d, e): + s = torch.Tensor((0)) + matmuls = [] + + # For some reason using a list comprehension or for-loop for this + # doesn't work. + matmuls.append(torch.matmul(a, self.rhs)) + matmuls.append(torch.matmul(b, self.rhs)) + matmuls.append(torch.matmul(c, self.rhs)) + matmuls.append(torch.matmul(d, self.rhs)) + matmuls.append(torch.matmul(e, self.rhs)) + + for m in matmuls: + s += torch.sum(m) + + return s + + # Initialize inputs. + inputs = [torch.randn(2 * i + 1, 5) for i in range(5)] + + # Initialize RHS. + rhs = torch.randn(5, 4) + + # Construct FiveMergeMatmulModule and call merge_matmul on it. + module = FiveMergeMatmulModule(rhs) + opt_module = merge_matmul.merge_matmul(module) + + # Numerical correctness check. + before = module(*inputs) + after = opt_module(*inputs) + before.allclose(after) + + # Basic graph structure check; original module should have len(inputs) matmuls + # and optimized module should have 1. + self.assertEqual(_count_matmuls(module), len(inputs)) + self.assertEqual(_count_matmuls(opt_module), 1) + + # Simple test case in which two matmuls cannot be merged due to a data dependency between + # the LHS operands. + class UnmergeableMatmulModule(torch.nn.Module): + def __init__(self, rhs): + super().__init__() + self.rhs = rhs + + def forward(self, x): + a = torch.matmul(x, self.rhs) + a_abs = torch.abs(a) + b = torch.matmul(a_abs.transpose(1, 0), self.rhs) + return b + + # Initialize inputs. + a = torch.randn(3, 3) + + # Initialize RHS for matmuls. + rhs = torch.randn(3, 4) + + # Construct UnmergeableMatmulModule and call merge_matmul on it. + module = UnmergeableMatmulModule(rhs) + opt_module = merge_matmul.merge_matmul(module) + + # Numerical correctness check. + before = module(a) + after = opt_module(a) + before.allclose(after) + + # Basic graph structure check; the number of matrix multiplcations should not have changed. 
+ self.assertEqual(_count_matmuls(module), 2) + self.assertEqual(_count_matmuls(opt_module), 2) if __name__ == "__main__": run_tests() diff --git a/test/test_jit.py b/test/test_jit.py index 169164610e96..a683a8eb0b8c 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -35,6 +35,7 @@ from jit.test_slice import TestSlice # noqa: F401 from jit.test_warn import TestWarn # noqa: F401 from jit.test_isinstance import TestIsinstance # noqa: F401 +from jit.test_cuda import TestCUDA # noqa: F401 from jit.test_hash import TestHash # noqa: F401 # Torch @@ -2393,8 +2394,7 @@ def fn(x): warns = [str(w.message) for w in warns] self.assertEqual(len(warns), 0) - @unittest.skipIf(IS_WINDOWS or True, "TODO: need to fix this test case for " - "Windows, re-enable with https://github.com/pytorch/pytorch/pull/29339") + @unittest.skipIf(True, "TODO: re-enable with https://github.com/pytorch/pytorch/pull/29339") def test_torch_load_error(self): class J(torch.jit.ScriptModule): def __init__(self): @@ -2405,20 +2405,20 @@ def forward(self, input): return input + 100 j = J() - with tempfile.NamedTemporaryFile() as f: - j.save(f.name) + with TemporaryFileName() as fname: + j.save(fname) with self.assertRaisesRegex(RuntimeError, "is a zip"): - torch.load(f.name) + torch.load(fname) - @unittest.skipIf(IS_WINDOWS, "TODO: need to fix this test case for Windows") def test_torch_load_zipfile_check(self): @torch.jit.script def fn(x): return x + 10 - with tempfile.NamedTemporaryFile() as f: - fn.save(f.name) - self.assertTrue(torch.serialization._is_zipfile(f)) + with TemporaryFileName() as fname: + fn.save(fname) + with io.open(fname, 'rb') as f: + self.assertTrue(torch.serialization._is_zipfile(f)) def test_python_bindings(self): lstm_cell = torch.jit.script(LSTMCellS) @@ -6753,6 +6753,11 @@ def complicated_arithmetic_operation(): self.checkScript(complicated_arithmetic_operation, ()) + def test_in_operator_with_two_strings(self): + def fn() -> bool: + return "a" in "abcd" + self.checkScript(fn, ()) + def test_bitwise_ops(self): def int_test(): diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index 091b2eca4c9b..4886abc58758 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -1281,22 +1281,8 @@ def forward(self, x): self.assertEqual(ref, mod.forward(x)) self.assertLastGraphAllFused() - @unittest.skip("temp disabled") + @unittest.skip("Temporarily disabled") def test_masked_fill(self): - # check scalar overload - def foo(x, mask): - return torch.masked_fill(x, mask, .6), torch.masked_fill(x, mask, 2) - - mask = torch.tensor([True, False]) - foo.__disable_jit_function_caching__ = True - for inp in (torch.rand([2, 2]).to(torch.int), mask), (torch.rand([2, 2]), mask): - ref = foo(*inp) - foo_s = torch.jit.script(foo) - warmup_forward(foo_s, *inp) - self.assertEqual(foo_s(*inp), ref) - self.assertLastGraphAllFused() - - # check tensor overload dtypes = [ torch.int8, torch.int16, @@ -1308,27 +1294,21 @@ def foo(x, mask): torch.bool, ] sizes = [(2,), (4, 4)] - for self_dtype, mask_dtype, device, size in product(dtypes, dtypes, self.devices, sizes): - try: - input_v = self.data_for(self_dtype, device, size=size) - val = self.data_for(val_dtype, device, size=size) - mask = self.data_for(torch.bool, device, size=size) + for self_dtype, device, scalar_val, size in product(dtypes, self.devices, [0.4, 3], sizes): + input_v = self.data_for(self_dtype, device, size=size) + mask = self.data_for(torch.bool, device, size=size) - def fn(input_v, val, mask): - return torch.masked_fill(input_v, mask, 
val) - ref = fn(input_v, val, mask) - except Exception: - # If eager mode doesn't support a dtype/op/device combo, - # neither does the fuser. Catch everything to avoid needing to - # guess what errors might be thrown by eager. - continue + def fn(input_v, mask): + return torch.masked_fill(input_v, mask, scalar_val) + ref = fn(input_v, mask) try: - t = torch.jit.trace(fn, (input_v, val, mask)) - torch.testing.assert_allclose(ref, t((input_v, val, mask))) - self.assertAllFused(t.graph_for(x)) + t = torch.jit.trace(fn, (input_v, mask)) + torch.testing.assert_allclose(ref, t(input_v, mask)) + print(torch.jit.last_executed_optimized_graph()) + self.assertLastGraphAllFused() except Exception as e: raise RuntimeError( - " ".join(["Failed:", str(dtype), op.__name__, device, str(size)]) + " ".join(["Failed:", str(self_dtype), op.__name__, device, str(size)]) ) def test_isnan(self): diff --git a/test/test_jit_profiling.py b/test/test_jit_profiling.py index dc6bb2fbf878..1cf67f87ded9 100644 --- a/test/test_jit_profiling.py +++ b/test/test_jit_profiling.py @@ -4,7 +4,6 @@ if __name__ == '__main__': run_tests() - if not PY2: - import test_jit_py3 - suite = unittest.findTestCases(test_jit_py3) - unittest.TextTestRunner().run(suite) + import test_jit_py3 + suite = unittest.findTestCases(test_jit_py3) + unittest.TextTestRunner().run(suite) diff --git a/test/test_jit_simple.py b/test/test_jit_simple.py index 23da6602c572..23c7f3b4b6f6 100644 --- a/test/test_jit_simple.py +++ b/test/test_jit_simple.py @@ -4,7 +4,6 @@ if __name__ == '__main__': run_tests() - if not PY2: - import test_jit_py3 - suite = unittest.findTestCases(test_jit_py3) - unittest.TextTestRunner().run(suite) + import test_jit_py3 + suite = unittest.findTestCases(test_jit_py3) + unittest.TextTestRunner().run(suite) diff --git a/test/test_linalg.py b/test/test_linalg.py index 8402be471a88..6ebfb03256f3 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -2716,6 +2716,135 @@ def test_old_matrix_rank(self, device, dtype): self.assertEqual(torch.matrix_rank(aaT, True), np.linalg.matrix_rank(aaT.cpu().numpy(), True)) self.assertEqual(torch.matrix_rank(aaT, 0.01, True), np.linalg.matrix_rank(aaT.cpu().numpy(), 0.01, True)) + @precisionOverride({torch.float32: 5e-6, torch.complex64: 5e-6}) + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) + def test_qr(self, device, dtype): + def run_test(tensor_dims, some): + A = torch.randn(*tensor_dims, dtype=dtype, device=device) + Q, R = torch.qr(A, some=some) + + # Check0: Q[-2:] = (m, n_columns), R[-2:] = (n_columns, n) + m, n = tensor_dims[-2:] + n_columns = m if (not some) and m > n else min(m, n) + self.assertEqual(Q.size(-2), m) + self.assertEqual(R.size(-1), n) + self.assertEqual(Q.size(-1), n_columns) + + A_ = A.cpu().numpy() + Q_ = Q.cpu().numpy() + R_ = R.cpu().numpy() + + # Check1: A = QR + self.assertEqual(A_, np.matmul(Q_, R_)) + + # Check2: A = QR (with out) + Q_out, R_out = torch.full_like(Q, math.nan), torch.full_like(R, math.nan) + torch.qr(A, some=some, out=(Q_out, R_out)) + Q_out_ = Q_out.cpu().numpy() + R_out_ = R_out.cpu().numpy() + self.assertEqual(A_, np.matmul(Q_out_, R_out_)) + + # Check3: Q == Q_out, R == R_out + self.assertEqual(Q_, Q_out_) + self.assertEqual(R_, R_out_) + + # Check4: Q^{T}Q = I, triu(R) = R + eye = torch.eye(n_columns, device=device, dtype=dtype).expand(Q.shape[:-2] + (n_columns, n_columns)).cpu().numpy() + self.assertEqual(np.matmul(Q_.swapaxes(-1, -2).conj(), Q_), eye) + 
self.assertEqual(R.triu(), R) + + tensor_dims_list = [(3, 5), (5, 5), (5, 3), # Single matrix + (7, 3, 5), (7, 5, 5), (7, 5, 3), # 3-dim Tensors + (7, 5, 3, 5), (7, 5, 5, 5), (7, 5, 5, 3)] # 4-dim Tensors + for tensor_dims, some in itertools.product(tensor_dims_list, [True, False]): + run_test(tensor_dims, some) + + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble) + def test_qr_vs_numpy(self, device, dtype): + """ + test torch.linalg.qr vs numpy.linalg.qr + """ + sizes_to_test = [ + (7, 5), + (5, 7), + (5, 0), # empty + (0, 5), # empty + ] + for size in sizes_to_test: + t = torch.randn(size, device=device, dtype=dtype) + np_t = t.cpu().numpy() + for mode in ['reduced', 'complete']: + exp_q, exp_r = np.linalg.qr(np_t, mode=mode) + q, r = torch.linalg.qr(t, mode=mode) + self.assertEqual(q, exp_q) + self.assertEqual(r, exp_r) + # + # for mode='r' we need a special logic because numpy returns only r + exp_r = np.linalg.qr(np_t, mode='r') + q, r = torch.linalg.qr(t, mode='r') + # check that q is empty + assert q.shape == (0,) + assert q.dtype == t.dtype + assert q.device == t.device + # check r + self.assertEqual(r, exp_r) + + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble) + def test_qr_batched(self, device, dtype): + """ + test torch.linalg.qr vs numpy.linalg.qr. We need some special logic + because numpy does not support batched qr + """ + def np_qr_batched(a, mode): + """poor's man batched version of np.linalg.qr""" + all_q = [] + all_r = [] + for matrix in a: + q, r = np.linalg.qr(matrix, mode=mode) + all_q.append(q) + all_r.append(r) + return np.array(all_q), np.array(all_r) + + t = torch.randn((3, 7, 5), device=device, dtype=dtype) + np_t = t.cpu().numpy() + for mode in ['reduced', 'complete']: + exp_q, exp_r = np_qr_batched(np_t, mode=mode) + q, r = torch.linalg.qr(t, mode=mode) + self.assertEqual(q, exp_q) + self.assertEqual(r, exp_r) + + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @dtypes(torch.float, torch.double, torch.cfloat, torch.cdouble) + def test_qr_out(self, device, dtype): + """ + test torch.linalg.qr(out=...) 
vs torch.lingalg.qr + """ + sizes_to_test = [ + (7, 5), + (5, 7), + (5, 0), # empty + (0, 5), # empty + ] + for size in sizes_to_test: + t = torch.randn(size, device=device, dtype=dtype) + np_t = t.cpu().numpy() + for mode in ['reduced', 'complete', 'r']: + q, r = torch.linalg.qr(t, mode=mode) + out = (torch.empty((0), dtype=dtype, device=device), + torch.empty((0), dtype=dtype, device=device)) + q2, r2 = torch.linalg.qr(t, mode=mode, out=out) + assert q2 is out[0] + assert r2 is out[1] + self.assertEqual(q2, q) + self.assertEqual(r2, r) + @dtypes(torch.double, torch.cdouble) def test_einsum(self, device, dtype): def check(equation, *operands): @@ -5835,51 +5964,6 @@ def check_norm(a, b, expected_norm, gels_result): torch.lstsq(b, a, out=(tb, ta)) self.assertEqual((torch.mm(a, tb) - b).norm(), expectedNorm, atol=1e-8, rtol=0) - @precisionOverride({torch.float32: 5e-6, torch.complex64: 5e-6}) - @skipCUDAIfNoMagma - @skipCPUIfNoLapack - @dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128) - def test_qr(self, device, dtype): - def run_test(tensor_dims, some): - A = torch.randn(*tensor_dims, dtype=dtype, device=device) - Q, R = torch.qr(A, some=some) - - # Check0: Q[-2:] = (m, n_columns), R[-2:] = (n_columns, n) - m, n = tensor_dims[-2:] - n_columns = m if (not some) and m > n else min(m, n) - self.assertEqual(Q.size(-2), m) - self.assertEqual(R.size(-1), n) - self.assertEqual(Q.size(-1), n_columns) - - A_ = A.cpu().numpy() - Q_ = Q.cpu().numpy() - R_ = R.cpu().numpy() - - # Check1: A = QR - self.assertEqual(A_, np.matmul(Q_, R_)) - - # Check2: A = QR (with out) - Q_out, R_out = torch.full_like(Q, math.nan), torch.full_like(R, math.nan) - torch.qr(A, some=some, out=(Q_out, R_out)) - Q_out_ = Q_out.cpu().numpy() - R_out_ = R_out.cpu().numpy() - self.assertEqual(A_, np.matmul(Q_out_, R_out_)) - - # Check3: Q == Q_out, R == R_out - self.assertEqual(Q_, Q_out_) - self.assertEqual(R_, R_out_) - - # Check4: Q^{T}Q = I, triu(R) = R - eye = torch.eye(n_columns, device=device, dtype=dtype).expand(Q.shape[:-2] + (n_columns, n_columns)).cpu().numpy() - self.assertEqual(np.matmul(Q_.swapaxes(-1, -2).conj(), Q_), eye) - self.assertEqual(R.triu(), R) - - tensor_dims_list = [(3, 5), (5, 5), (5, 3), # Single matrix - (7, 3, 5), (7, 5, 5), (7, 5, 3), # 3-dim Tensors - (7, 5, 3, 5), (7, 5, 5, 5), (7, 5, 5, 3)] # 4-dim Tensors - for tensor_dims, some in itertools.product(tensor_dims_list, [True, False]): - run_test(tensor_dims, some) - @skipCUDAIfNoMagma @skipCPUIfNoLapack def test_lapack_empty(self, device): diff --git a/test/test_multiprocessing.py b/test/test_multiprocessing.py index 75b486043c42..81b33c5900db 100644 --- a/test/test_multiprocessing.py +++ b/test/test_multiprocessing.py @@ -3,7 +3,6 @@ import os import sys import time -import subprocess import unittest import copy from sys import platform @@ -525,7 +524,7 @@ def test_cuda_bad_call(self): @unittest.skipIf(IS_WINDOWS, 'not applicable to Windows (only fails with fork)') @unittest.skipIf(not torch.cuda.is_available(), 'CUDA not available') def test_wrong_cuda_fork(self): - results = self.run_process_no_exception("""\ + stderr = TestCase.runWithPytorchAPIUsageStderr("""\ import torch from torch.multiprocessing import Process def run(rank): @@ -542,7 +541,7 @@ def run(rank): for p in processes: p.join() """) - self.assertRegex(results[1].decode('ascii'), "Cannot re-initialize CUDA in forked subprocess.") + self.assertRegex(stderr, "Cannot re-initialize CUDA in forked subprocess.") @unittest.skipIf(NO_MULTIPROCESSING_SPAWN, 
"Disabled for environments that \ don't support multiprocessing with spawn start method") @@ -831,15 +830,6 @@ def test_cuda_parameter_sharing(self): param = Parameter(torch.arange(1., 26, device='cuda').view(5, 5)) self._test_autograd_sharing(param, mp.get_context('spawn'), is_parameter=True) - @staticmethod - def run_process_no_exception(code): - popen = subprocess.Popen( - [sys.executable, '-c', code], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - pipes = popen.communicate() - return pipes - @unittest.skipIf(NO_MULTIPROCESSING_SPAWN, "Disabled for environments that \ don't support multiprocessing with spawn start method") def test_integer_parameter_serialization(self): diff --git a/test/test_namedtuple_return_api.py b/test/test_namedtuple_return_api.py index bbb69f6e147f..1906b83ca8d6 100644 --- a/test/test_namedtuple_return_api.py +++ b/test/test_namedtuple_return_api.py @@ -12,7 +12,7 @@ all_operators_with_namedtuple_return = { 'max', 'min', 'median', 'nanmedian', 'mode', 'kthvalue', 'svd', 'symeig', 'eig', 'qr', 'geqrf', 'solve', 'slogdet', 'sort', 'topk', 'lstsq', - 'triangular_solve', 'cummax', 'cummin', 'linalg_eigh', "unpack_dual" + 'triangular_solve', 'cummax', 'cummin', 'linalg_eigh', "unpack_dual", 'linalg_qr', } @@ -58,7 +58,7 @@ def test_namedtuple_return(self): names=('values', 'indices'), hasout=True), op(operators=['svd'], input=(), names=('U', 'S', 'V'), hasout=True), op(operators=['slogdet'], input=(), names=('sign', 'logabsdet'), hasout=False), - op(operators=['qr'], input=(), names=('Q', 'R'), hasout=True), + op(operators=['qr', 'linalg_qr'], input=(), names=('Q', 'R'), hasout=True), op(operators=['solve'], input=(a,), names=('solution', 'LU'), hasout=True), op(operators=['geqrf'], input=(), names=('a', 'tau'), hasout=True), op(operators=['symeig', 'eig'], input=(True,), names=('eigenvalues', 'eigenvectors'), hasout=True), diff --git a/test/test_nn.py b/test/test_nn.py index 1d63be6e3075..386ba369dca6 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -9283,18 +9283,19 @@ def test_flatten(self): def test_unflatten(self): tensor_input = torch.randn(2, 50) - # Unflatten Tensor + # Unflatten Tensor (unflattened_size as a tuple of ints and list of ints) - unflatten = nn.Unflatten(dim=1, unflattened_size=(2, 5, 5)) - tensor_output = unflatten(tensor_input) - self.assertEqual(tensor_output.size(), torch.Size([2, 2, 5, 5])) + for us in ((2, 5, 5), [2, 5, 5]): + unflatten = nn.Unflatten(dim=1, unflattened_size=us) + tensor_output = unflatten(tensor_input) + self.assertEqual(tensor_output.size(), torch.Size([2, 2, 5, 5])) # Unflatten NamedTensor unflatten = nn.Unflatten(dim='features', unflattened_size=(('C', 2), ('H', 5), ('W', 5))) named_tensor_input = tensor_input.refine_names('N', 'features') named_tensor_output = unflatten(named_tensor_input) - self.assertEqual(tensor_output.size(), torch.Size([2, 2, 5, 5])) + self.assertEqual(named_tensor_output.size(), torch.Size([2, 2, 5, 5])) def test_unflatten_invalid_arg(self): # Wrong type for unflattened_size (tuple of floats) @@ -9304,6 +9305,13 @@ def test_unflatten_invalid_arg(self): r"unflattened_size must be tuple of ints, but found element of type float at pos 2"): nn.Unflatten(dim=1, unflattened_size=(2, 5, 5.0)) + # Wrong type for unflattened_size (list of lists and list of tuples) + for us in ([['C', 2], ['W', 5], ['H', 5]], [('C', 2), ('W', 5), ('H', 5)]): + with self.assertRaisesRegex( + TypeError, + r"unflattened_size must be a tuple of tuples, but found type list"): + nn.Unflatten(dim='features', 
unflattened_size=us) + # Wrong type for unflattened_size (tuple of lists) with self.assertRaisesRegex( @@ -9311,19 +9319,12 @@ def test_unflatten_invalid_arg(self): r"unflattened_size must be tuple of tuples, but found element of type list at pos 0"): nn.Unflatten(dim='features', unflattened_size=(['C', 2], ['W', 5], ['H', 5])) - # Wrong type for unflattened_size (list of ints) - - with self.assertRaisesRegex( - TypeError, - r"unflattened_size must be a tuple of ints, but found type list"): - nn.Unflatten(dim=1, unflattened_size=[2, 5, 5]) - - # Wrong type for unflattened_size (list of lists) + # Wrong type for unflattened_size (tuple of dicts) with self.assertRaisesRegex( TypeError, - r"unflattened_size must be a tuple of tuples, but found type list"): - nn.Unflatten(dim='features', unflattened_size=[['C', 2], ['W', 5], ['H', 5]]) + r"unflattened_size must be tuple of tuples, but found element of type dict at pos 0"): + nn.Unflatten(dim='features', unflattened_size=({'C': 2}, {'W': 5}, {'H': 5})) def test_layer_norm_grads_with_create_graph_flag(self): atol = 1e-5 diff --git a/test/test_numpy_interop.py b/test/test_numpy_interop.py index 35ac4eb94889..81c385ae90a2 100644 --- a/test/test_numpy_interop.py +++ b/test/test_numpy_interop.py @@ -47,10 +47,8 @@ def get_castable_tensor(shape, dtype): else: # can't directly use min and max, because for int64_t, max - min # is greater than int64_t range and triggers UB. - dtype_info = torch.iinfo(dtype) - low = max(dtype_info.min, int(-1e10)) - high = min(dtype_info.max, int(1e10)) - dtype_info = torch.iinfo(dtype) + low = max(torch.iinfo(dtype).min, int(-1e10)) + high = min(torch.iinfo(dtype).max, int(1e10)) t = torch.empty(shape, dtype=torch.int64).random_(low, high) return t.to(dtype) @@ -272,10 +270,12 @@ def test_numpy_array_interface(self, device): ] for tp, dtype in zip(types, dtypes): if np.dtype(dtype).kind == 'u': - x = torch.Tensor([1, 2, 3, 4]).type(tp) + # .type expects a XxxTensor, which have no type hints on + # purpose, so ignore during mypy type checking + x = torch.Tensor([1, 2, 3, 4]).type(tp) # type: ignore array = np.array([1, 2, 3, 4], dtype=dtype) else: - x = torch.Tensor([1, -2, 3, -4]).type(tp) + x = torch.Tensor([1, -2, 3, -4]).type(tp) # type: ignore array = np.array([1, -2, 3, -4], dtype=dtype) # Test __array__ w/o dtype argument @@ -309,7 +309,7 @@ def test_numpy_array_interface(self, device): float_types = [torch.DoubleTensor, torch.FloatTensor] float_dtypes = [np.float64, np.float32] for tp, dtype in zip(float_types, float_dtypes): - x = torch.Tensor([1, 2, 3, 4]).type(tp) + x = torch.Tensor([1, 2, 3, 4]).type(tp) # type: ignore array = np.array([1, 2, 3, 4], dtype=dtype) for func in ['sin', 'sqrt', 'ceil']: ufunc = getattr(np, func) @@ -321,7 +321,7 @@ def test_numpy_array_interface(self, device): # Test functions with boolean return value for tp, dtype in zip(types, dtypes): - x = torch.Tensor([1, 2, 3, 4]).type(tp) + x = torch.Tensor([1, 2, 3, 4]).type(tp) # type: ignore array = np.array([1, 2, 3, 4], dtype=dtype) geq2_x = np.greater_equal(x, 2) geq2_array = np.greater_equal(array, 2).astype('uint8') @@ -360,7 +360,7 @@ def test_parse_numpy_int(self, device): self.assertEqual(torch.ones([2, 2, 2, 2]).mean(scalar), torch.ones([2, 2, 2, 2]).mean(np_val)) # numpy integral type parses like a python int in custom python bindings: - self.assertEqual(torch.Storage(np_val).size(), scalar) + self.assertEqual(torch.Storage(np_val).size(), scalar) # type: ignore tensor = torch.tensor([2], dtype=torch.int) tensor[0] = np_val 
diff --git a/test/test_overrides.py b/test/test_overrides.py index 95f94504d84e..f32b04cb2e53 100644 --- a/test/test_overrides.py +++ b/test/test_overrides.py @@ -563,6 +563,8 @@ def instance_gen(): func_args.append(instance_gen()) elif t == 'TensorList': func_args.append([instance_gen(), instance_gen()]) + elif t == 'c10::List>': + func_args.append([instance_gen(), instance_gen()]) elif t == 'IntArrayRef': size = arg.get('size', 2) if size == 1: diff --git a/test/test_profiler.py b/test/test_profiler.py index d24fabe76998..826a9f5d0b57 100644 --- a/test/test_profiler.py +++ b/test/test_profiler.py @@ -1,14 +1,14 @@ import collections import gc +import io import unittest -import tempfile import torch import torch.nn as nn import torch.optim import torch.utils.data from torch.testing._internal.common_utils import ( - TestCase, run_tests, TEST_WITH_ASAN, IS_WINDOWS) + TestCase, run_tests, TEST_WITH_ASAN, IS_WINDOWS, TemporaryFileName) import torch.autograd.profiler as profiler from torch.autograd.profiler import profile from torch.autograd import kineto_available @@ -282,7 +282,6 @@ def trace_handler(p): print(p.key_averages().table( sort_by="self_cuda_time_total", row_limit=-1)) - @unittest.skipIf(IS_WINDOWS, "Disabled on windows (permissions)") def test_export_stacks(self): with profile(with_stack=True, use_kineto=kineto_available()) as p: x = torch.randn(10, 10) @@ -290,9 +289,10 @@ def test_export_stacks(self): z = torch.mm(x, y) z = z + y - with tempfile.NamedTemporaryFile(mode="w+") as f: - p.export_stacks(f.name) - lines = f.readlines() + with TemporaryFileName(mode="w+") as fname: + p.export_stacks(fname) + with io.open(fname, 'r') as f: + lines = f.readlines() assert len(lines) > 0, "Empty stacks file" for line in lines: is_int = False diff --git a/test/test_quantization.py b/test/test_quantization.py index f68bfcd058b6..1c370913c6d0 100644 --- a/test/test_quantization.py +++ b/test/test_quantization.py @@ -15,6 +15,7 @@ from quantization.test_quantized_op import TestPadding # noqa: F401 from quantization.test_quantized_op import TestQuantizedEmbeddingOps # noqa: F401 from quantization.test_quantized_op import TestDynamicQuantizedRNNOp # noqa: F401 + # Quantized Functional from quantization.test_quantized_functional import TestQuantizedFunctional # noqa: F401 diff --git a/test/test_serialization.py b/test/test_serialization.py index 8fd5926caa82..916f133c3fe1 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -17,7 +17,7 @@ from torch.serialization import check_module_version_greater_or_equal from torch.testing._internal.common_utils import TestCase, IS_WINDOWS, \ - TEST_DILL, run_tests, download_file, BytesIOContext + TEST_DILL, run_tests, download_file, BytesIOContext, TemporaryFileName from torch.testing._internal.common_device_type import instantiate_device_type_tests # These tests were all copied from `test/test_torch.py` at some point, so see @@ -137,25 +137,22 @@ def test(name_or_buffer): with tempfile.NamedTemporaryFile() as f: test(f) - if sys.platform != "win32": - with tempfile.NamedTemporaryFile() as f: - test(f.name) + with TemporaryFileName() as fname: + test(fname) test(io.BytesIO()) def test_serialization(self): # Test serialization with a real file b = self._test_serialization_data() - for use_name in (False, True): - # Passing filename to torch.save(...) 
will cause the file to be opened twice, - # which is not supported on Windows - if sys.platform == "win32" and use_name: - continue - with tempfile.NamedTemporaryFile() as f: - handle = f if not use_name else f.name - torch.save(b, handle) - f.seek(0) - c = torch.load(handle) + with tempfile.NamedTemporaryFile() as f: + torch.save(b, f) + f.seek(0) + c = torch.load(f) + self._test_serialization_assert(b, c) + with TemporaryFileName() as fname: + torch.save(b, fname) + c = torch.load(fname) self._test_serialization_assert(b, c) # test non-ascii encoding of bytes arrays/strings # The following bytes are produced by serializing @@ -716,9 +713,8 @@ def test(name_or_buffer): with tempfile.NamedTemporaryFile() as f: test(f) - if sys.platform != "win32": - with tempfile.NamedTemporaryFile() as f: - test(f.name) + with TemporaryFileName() as fname: + test(fname) test(io.BytesIO()) @@ -737,12 +733,11 @@ def test_serialization_2gb_file(self): f.seek(0) state = torch.load(f) - @unittest.skipIf(IS_WINDOWS, "torch.save with filename will open file twice, not supported in Windows.") def test_pathlike_serialization(self): model = torch.nn.Conv2d(20, 3200, kernel_size=3) - with tempfile.NamedTemporaryFile() as f: - path = pathlib.Path(f.name) + with TemporaryFileName() as fname: + path = pathlib.Path(fname) torch.save(model, path) torch.load(path) diff --git a/test/test_shape_ops.py b/test/test_shape_ops.py index 43321508e0e2..f7da08eb24d7 100644 --- a/test/test_shape_ops.py +++ b/test/test_shape_ops.py @@ -378,21 +378,31 @@ def test_flip(self, device): self.assertEqual(size, list(data.flip(ds).size())) # test rectangular case - data = torch.tensor([1, 2, 3, 4, 5, 6]).view(2, 3).to(device) - flip0_result = torch.tensor([[4, 5, 6], [1, 2, 3]]).to(device) - flip1_result = torch.tensor([[3, 2, 1], [6, 5, 4]]).to(device) + data = torch.tensor([1, 2, 3, 4, 5, 6], device=device).view(2, 3) + flip0_result = torch.tensor([[4, 5, 6], [1, 2, 3]], device=device) + flip1_result = torch.tensor([[3, 2, 1], [6, 5, 4]], device=device) self.assertEqual(flip0_result, data.flip(0)) self.assertEqual(flip1_result, data.flip(1)) # test empty tensor, should just return an empty tensor of the same shape - data = torch.tensor([]) + data = torch.tensor((), device=device) self.assertEqual(data, data.flip(0)) # test bool tensor - a = torch.tensor([False, True]) + a = torch.tensor([False, True], device=device) self.assertEqual(a.flip(0), torch.tensor([True, False])) + # case: dims=() + a = torch.randn(3, 2, 1, device=device) + if device == 'cpu': + self.assertEqual(a.flip(dims=()), a) + else: + # Reference: https://github.com/pytorch/pytorch/issues/49982 + with self.assertRaisesRegex(IndexError, + "flip dims size out of range, got flip dims size=0"): + a.flip(dims=()) + def _rand_shape(self, dim, min_size, max_size): shape = [] for i in range(dim): diff --git a/test/test_sparse.py b/test/test_sparse.py index 6daf3f1931d2..228c66aa403e 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -356,6 +356,11 @@ def test_to_sparse(self): sp, _, _ = self._gen_sparse(2, 10, [3, 3, 3]) self.assertRaises(RuntimeError, lambda: sp.to_sparse()) + def test_sparse_bool(self): + a = self.value_tensor([True, False]).to(torch.bool) + b = a.to_sparse().to_dense() + self.assertEqual(a, b) + def test_scalar(self): # tensor with value a = self.sparse_tensor(self.index_tensor([]).unsqueeze(1), 12.3, []) @@ -3168,6 +3173,14 @@ def different_dtypes(): test_sparse_matmul(2, 0, [0, 10], [10, 0]) test_error_cases() + def test_assign(self): + def 
assign_to(a): + a, i_a, v_a = self._gen_sparse(2, 5, [2, 3]) + a[0] = 100 + + self.assertRaises(TypeError, assign_to) + + class TestUncoalescedSparse(TestSparse): def setUp(self): super(TestUncoalescedSparse, self).setUp() diff --git a/test/test_spectral_ops.py b/test/test_spectral_ops.py index 6192d6c4d6b6..085af5294a04 100644 --- a/test/test_spectral_ops.py +++ b/test/test_spectral_ops.py @@ -225,13 +225,13 @@ def test_empty_fft(self, device, dtype): def test_fft_invalid_dtypes(self, device): t = torch.randn(64, device=device, dtype=torch.complex128) - with self.assertRaisesRegex(RuntimeError, "Expected a real input tensor"): + with self.assertRaisesRegex(RuntimeError, "rfft expects a real input tensor"): torch.fft.rfft(t) with self.assertRaisesRegex(RuntimeError, "rfftn expects a real-valued input tensor"): torch.fft.rfftn(t) - with self.assertRaisesRegex(RuntimeError, "Expected a real input tensor"): + with self.assertRaisesRegex(RuntimeError, "ihfft expects a real input tensor"): torch.fft.ihfft(t) @skipCUDAIfRocm @@ -332,6 +332,27 @@ def test_fft_backward(self, device, dtype): args = args[1:] self._fft_grad_check_helper(fname, input, args) + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + def test_fft_invalid_out_types(self, device): + + complex_fft_funcs = [torch.fft.fft, torch.fft.ifft, torch.fft.fftn, torch.fft.ifftn, + torch.fft.rfft, torch.fft.rfftn, torch.fft.ihfft] + real_fft_funcs = [torch.fft.irfft, torch.fft.irfftn, torch.fft.hfft] + fft_funcs = complex_fft_funcs + real_fft_funcs + + # Test errors on invalid out dtypes + x = torch.rand(10, device=device, dtype=torch.float32) + for out_dtype, funcs in [(torch.int16, fft_funcs), + (torch.float32, complex_fft_funcs), + (torch.complex64, real_fft_funcs)]: + out = torch.empty((), device=device, dtype=out_dtype) + + for func in funcs: + with self.assertRaisesRegex(RuntimeError, "expects a .* output tensor"): + func(x, out=out) + # nd-fft tests @skipCPUIfNoMkl @@ -463,10 +484,10 @@ def test_fftn_invalid(self, device): torch.fft.rfftn, torch.fft.irfftn) for func in fft_funcs: - with self.assertRaisesRegex(RuntimeError, "FFT dims must be unique"): + with self.assertRaisesRegex(RuntimeError, "dims must be unique"): func(a, dim=(0, 1, 0)) - with self.assertRaisesRegex(RuntimeError, "FFT dims must be unique"): + with self.assertRaisesRegex(RuntimeError, "dims must be unique"): func(a, dim=(2, -1)) with self.assertRaisesRegex(RuntimeError, "dim and shape .* same length"): @@ -578,10 +599,10 @@ def test_fft2_invalid(self, device): torch.fft.rfft2, torch.fft.irfft2) for func in fft_funcs: - with self.assertRaisesRegex(RuntimeError, "FFT dims must be unique"): + with self.assertRaisesRegex(RuntimeError, "dims must be unique"): func(a, dim=(0, 0)) - with self.assertRaisesRegex(RuntimeError, "FFT dims must be unique"): + with self.assertRaisesRegex(RuntimeError, "dims must be unique"): func(a, dim=(2, -1)) with self.assertRaisesRegex(RuntimeError, "dim and shape .* same length"): @@ -623,6 +644,19 @@ def test_fftfreq_numpy(self, device, dtype): actual = torch_fn(*args, device=device, dtype=dtype) self.assertEqual(actual, expected, exact_dtype=False) + @skipCPUIfNoMkl + @skipCUDAIfRocm + @onlyOnCPUAndCUDA + @dtypes(torch.float, torch.double) + def test_fftfreq_out(self, device, dtype): + for func in (torch.fft.fftfreq, torch.fft.rfftfreq): + expect = func(n=100, d=.5, device=device, dtype=dtype) + actual = torch.empty((), device=device, dtype=dtype) + with self.assertWarnsRegex(UserWarning, "out tensor will be resized"): + func(n=100, 
d=.5, out=actual) + self.assertEqual(actual, expect) + + @skipCPUIfNoMkl @skipCUDAIfRocm @onlyOnCPUAndCUDA @@ -1066,10 +1100,12 @@ def test_complex_stft_onesided(self, device): with self.assertRaisesRegex(RuntimeError, 'complex'): x.stft(10, pad_mode='constant', onesided=True) + # stft is currently warning that it requires return-complex while an upgrader is written def test_stft_requires_complex(self, device): x = torch.rand(100) - with self.assertRaisesRegex(RuntimeError, 'stft requires the return_complex parameter'): - y = x.stft(10, pad_mode='constant') + y = x.stft(10, pad_mode='constant') + # with self.assertRaisesRegex(RuntimeError, 'stft requires the return_complex parameter'): + # y = x.stft(10, pad_mode='constant') @skipCUDAIfRocm @skipCPUIfNoMkl diff --git a/test/test_testing.py b/test/test_testing.py index b87345186cb3..4ff215233fe2 100644 --- a/test/test_testing.py +++ b/test/test_testing.py @@ -3,9 +3,9 @@ import math from torch.testing._internal.common_utils import \ - (TestCase, run_tests, make_tensor) + (TestCase, make_tensor, run_tests, slowTest) from torch.testing._internal.common_device_type import \ - (instantiate_device_type_tests, onlyOnCPUAndCUDA, dtypes) + (instantiate_device_type_tests, onlyCUDA, onlyOnCPUAndCUDA, dtypes) # For testing TestCase methods and torch.testing functions class TestTesting(TestCase): @@ -438,6 +438,53 @@ def test_assert_messages(self, device): self.assertEqual("no_user_msg", self._get_assert_msg(msg=None, debug_msg="no_user_msg")) self.assertEqual("debug_msg\nuser_msg", self._get_assert_msg(msg="user_msg", debug_msg="debug_msg")) + @onlyCUDA + @slowTest + def test_cuda_assert_should_stop_test_suite(self, device): + # This test is slow because it spawn another process to run another test suite. + + # Test running of cuda assert test suite should early terminate. + stderr = TestCase.runWithPytorchAPIUsageStderr("""\ +#!/usr/bin/env python + +import torch + +from torch.testing._internal.common_utils import (TestCase, run_tests, slowTest) +from torch.testing._internal.common_device_type import instantiate_device_type_tests + +# This test is added to ensure that test suite terminates early when +# CUDA assert was thrown since all subsequent test will fail. +# See: https://github.com/pytorch/pytorch/issues/49019 +# This test file should be invoked from test_testing.py +class TestThatContainsCUDAAssertFailure(TestCase): + + @slowTest + def test_throw_unrecoverable_cuda_exception(self, device): + x = torch.rand(10, device=device) + # cause unrecoverable CUDA exception, recoverable on CPU + y = x[torch.tensor([25])].cpu() + + @slowTest + def test_trivial_passing_test_case_on_cpu_cuda(self, device): + x1 = torch.tensor([0., 1.], device=device) + x2 = torch.tensor([0., 1.], device='cpu') + self.assertEqual(x1, x2) + +instantiate_device_type_tests( + TestThatContainsCUDAAssertFailure, + globals(), + only_for='cuda' +) + +if __name__ == '__main__': + run_tests() +""") + # should capture CUDA error + self.assertIn('CUDA error: device-side assert triggered', stderr) + # should run only 1 test because it throws unrecoverable error. 
+ self.assertIn('Ran 1 test', stderr) + + instantiate_device_type_tests(TestTesting, globals()) if __name__ == '__main__': diff --git a/test/test_throughput_benchmark.py b/test/test_throughput_benchmark.py index d2f993ddaa3a..9d60344b5912 100644 --- a/test/test_throughput_benchmark.py +++ b/test/test_throughput_benchmark.py @@ -1,10 +1,9 @@ import torch -import tempfile from torch.utils import ThroughputBenchmark from torch.testing import assert_allclose -from torch.testing._internal.common_utils import run_tests, TestCase +from torch.testing._internal.common_utils import run_tests, TestCase, TemporaryFileName class TwoLayerNet(torch.jit.ScriptModule): def __init__(self, D_in, H, D_out): @@ -76,8 +75,8 @@ def test_module(self): self.linear_test(TwoLayerNetModule) def test_profiling(self): - with tempfile.NamedTemporaryFile(delete=False) as f: - self.linear_test(TwoLayerNetModule, profiler_output_path=f.name) + with TemporaryFileName() as fname: + self.linear_test(TwoLayerNetModule, profiler_output_path=fname) if __name__ == '__main__': diff --git a/test/test_torch.py b/test/test_torch.py index 6532c2e5e17d..72fa853e2e7c 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -937,10 +937,6 @@ def test_index_add_all_dtypes(self): # index_add calls atomicAdd on cuda. zeros = torch.zeros(size, dtype=dtype, device=device) - # index_add is not supported for complex dtypes on cuda yet - if device.startswith('cuda') and dtype.is_complex: - continue - added = zeros.index_add(0, torch.arange(0, size[0], dtype=idx_dtype, device=device), tensor) self.assertEqual(added, tensor) @@ -5693,7 +5689,8 @@ def test_storage_multigpu(self, devices): x = torch.tensor([], device=device) self.assertEqual(x.dtype, x.storage().dtype) - @dtypes(torch.float, torch.double, torch.half) + @dtypesIfCUDA(torch.float, torch.double, torch.half) + @dtypes(torch.float, torch.double) def test_multinomial(self, device, dtype): def make_prob_dist(shape, is_contiguous): if is_contiguous: @@ -6870,7 +6867,6 @@ def inner(self, device, dtype): ('rot90', 'k1_d12', _small_3d, lambda t, d: [1, [1, 2]], 1e-5, 1e-5, 1e-5, _types + _complex_types, _cpu_types, False), ('rot90', 'k1_neg_d', _small_3d, lambda t, d: [1, [1, -1]], 1e-5, 1e-5, 1e-5, _types + _complex_types, _cpu_types, False), ('rot90', 'default', _small_3d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types + _complex_types, _cpu_types, False), - ('rsqrt', '', lambda t, d: _small_3d(t, d) + 1, lambda t, d: [], 1e-2, 1e-5, 1e-4, _float_types_no_half), ('sinh', '', lambda t, d: _small_3d(t, d).clamp(-1, 1), lambda t, d: [], 1e-3, 1e-5, 1e-5, _float_types), ('tan', '', lambda t, d: _small_3d(t, d).clamp(-1, 1), lambda t, d: [], 1e-3, 1e-5, 1e-5, _float_types), ('tan', 'complex', lambda t, d: _small_3d(t, d), lambda t, d: [], 1e-3, 1e-5, 1e-5, _complex_types), diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py index 776482306f4d..960991a4820b 100644 --- a/test/test_unary_ufuncs.py +++ b/test/test_unary_ufuncs.py @@ -642,14 +642,6 @@ def test_sign_complex_assert_raises(self, device): size = [5, 5] tensor = torch.rand(size, dtype=dtype, device=device) - # index_add calls atomicAdd on cuda. 
- zeros = torch.zeros(size, dtype=dtype, device=device) - - # index_add is not supported for complex dtypes on cuda yet - if device.startswith('cuda') and dtype.is_complex: - self.assertRaises(RuntimeError, - lambda: zeros.index_add(0, torch.arange(0, size[0], dtype=torch.long, device=device), tensor)) - with self.assertRaisesRegex(RuntimeError, (r'Unlike NumPy, torch.sign is not intended to support complex numbers\. ' r'Please use torch.sgn instead\.')): @@ -1715,7 +1707,6 @@ def _medium_2d(dtype, device): _TorchMathTestMeta('ceil'), _TorchMathTestMeta('rad2deg'), _TorchMathTestMeta('deg2rad'), - _TorchMathTestMeta('rsqrt', reffn=lambda x: np.reciprocal(np.sqrt(x))), _TorchMathTestMeta('frac', reffn='fmod', refargs=lambda x: (x.numpy(), 1)), _TorchMathTestMeta('trunc'), _TorchMathTestMeta('round'), diff --git a/test/test_view_ops.py b/test/test_view_ops.py index 3a1411d1a167..be33aa1ab44a 100644 --- a/test/test_view_ops.py +++ b/test/test_view_ops.py @@ -100,6 +100,12 @@ def is_view_of(self, base, other): return True + # Returns true if v1 and v2 are views of the same base + def is_view_of_same_base(self, v1, v2): + if (not v1._is_view() or v1 is v2): + return False + return self.is_view_of(v1._base, v2) + # Performs transpose if contiguous=True, else returns the input tensor as is def _do_transpose(self, x, contiguous=False, dim0=0, dim1=1): if contiguous: @@ -457,6 +463,64 @@ def test_reshape_nonview(self, device): nv[6] = 0 self.assertNotEqual(t[1, 1], nv[6]) + def test_flatten_view(self, device): + def test_writes_propagate(t, v): + idx_t = (0,) * t.ndim + idx_v = (0,) * v.ndim + v[idx_v] = 0 + self.assertEqual(t[idx_t], v[idx_v]) + + t = torch.ones(1, 2, 3, 4, device=device) + v = t.flatten() + self.assertTrue(self.is_view_of(t, v)) + test_writes_propagate(t, v) + + # zero-dimensional tensor + t = torch.tensor(1, device=device) + v = t.flatten() + test_writes_propagate(t, v) + self.assertTrue(self.is_view_of(t, v)) + + t = torch.ones(1, 2, 3, 4, device=device).transpose(2, 3) + v = t.flatten(0, 1) + test_writes_propagate(t, v) + self.assertTrue(self.is_view_of_same_base(t, v)) + + # stride[i] = stride[i + 1] * size[i + 1] is satisfied for 3 groups: + t = torch.ones(720, device=device) \ + .as_strided((2, 3, 2, 3, 5, 4), (6, 2, 15, 5, 1, 0)) + # [--1--|---2---|-3-] [--1--|----2---|-3-] + v1 = t.flatten(0, 1) + v2 = v1.flatten(1, 3) + v3 = v2.flatten(2, 2) + test_writes_propagate(t, v1) + self.assertTrue(self.is_view_of_same_base(t, v1)) + test_writes_propagate(t, v2) + self.assertTrue(self.is_view_of_same_base(t, v2)) + test_writes_propagate(t, v3) + self.assertTrue(self.is_view_of_same_base(t, v3)) + + @onlyOnCPUAndCUDA + def test_flatten_nonview(self, device): + def assert_is_nonview(t, nv): + idx_t = (0,) * t.ndim + idx_nv = (0,) * nv.ndim + self.assertTrue(not nv._is_view()) + nv[idx_nv] = 0 + self.assertNotEqual(t[idx_t], nv[idx_nv]) + t = torch.ones(2, 3, 2, 3, device=device).transpose(2, 3) + nv = t.flatten(1, 3) + assert_is_nonview(t, nv) + + t = torch.ones(2, 2, device=device).T + nv = t.flatten() + assert_is_nonview(t, nv) + + # flatten returns the original object if start_dim=end_dim + t = t = torch.ones(2, 2, device=device) + nv = t.flatten(1, 1) + self.assertTrue(t is nv) + def test_basic_indexing_slice_view(self, device): t = torch.ones(5, 5, device=device) v = t[:2, :3] diff --git a/test/test_vmap.py b/test/test_vmap.py index cc25dff3b306..b722fc126b24 100644 --- a/test/test_vmap.py +++ b/test/test_vmap.py @@ -1907,6 +1907,16 @@ def test_split(self): 
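
[Editor note: the flatten view/nonview tests above come down to whether the flattened dims are already laid out consecutively in memory. A simplified sketch of that merge condition, not the actual ATen computeStride logic:]

def can_flatten_as_view(sizes, strides, start_dim, end_dim):
    # Dims can be merged without a copy when each stride equals the next
    # stride times the next size (size-1 dims are trivially mergeable).
    for i in range(start_dim, end_dim):
        if sizes[i] == 1 or sizes[i + 1] == 1:
            continue
        if strides[i] != strides[i + 1] * sizes[i + 1]:
            return False
    return True

# Mirrors the transposed case in test_flatten_nonview: merging dims 1..3 forces a copy.
print(can_flatten_as_view((2, 3, 3, 2), (18, 6, 1, 3), 1, 3))  # False
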
test(vmap(vmap(lambda t: op(t, [4] * 8 + [8] * 4, 1), in_dims=2)), (torch.rand(B1, 2, B0, 64, B2),), in_dims=2) + def test_trace(self): + op = torch.trace + test = self._vmap_test + B0, B1, B2 = 7, 11, 13 + + test(op, (torch.rand(B0, 2, 5),)) + test(op, (torch.rand(2, B0, 5),), in_dims=1) + test(vmap(op), (torch.rand(B1, 2, B0, 5),), in_dims=2) + test(vmap(vmap(op, in_dims=2)), (torch.rand(B1, 2, B0, 5, B2),), in_dims=2) + def test_transpose(self): op = torch.transpose test = self._vmap_view_test @@ -2313,6 +2323,10 @@ def test_slice(self, device): self._batched_grad_test(lambda x: x[:, 1:3], (x,)) self._batched_grad_test(lambda x: x[..., 1:3], (x,)) + def test_trace(self, device): + x = torch.randn(2, 3, device=device, requires_grad=True) + self._batched_grad_test(Tensor.trace, (x,)) + @allowVmapFallbackUsage def test_symeig(self, device): def op(x): @@ -2322,6 +2336,11 @@ def op(x): self._batched_grad_test(op, (x,), {}) self._batched_grad_grad_test(op, (x,), {}) + def test_threshold(self, device): + x = torch.randn(2, 3, device=device, requires_grad=True) + self._batched_grad_test(lambda x: F.threshold(x, 0.5, 0.0), (x,)) + + @allowVmapFallbackUsage def test_inplace_view(self, device): leaf = torch.randn(4, 5, requires_grad=True) diff --git a/test/type_hint_tests/opt_size.py b/test/type_hint_tests/opt_size.py new file mode 100644 index 000000000000..f24e57e6e56f --- /dev/null +++ b/test/type_hint_tests/opt_size.py @@ -0,0 +1,6 @@ +import torch.nn as nn + +avg_pool1 = nn.AdaptiveAvgPool2d((1, None)) +avg_pool2 = nn.AdaptiveAvgPool2d((None, 1)) +max_pool1 = nn.AdaptiveMaxPool2d((1, None)) +max_pool2 = nn.AdaptiveMaxPool2d((None, 1)) diff --git a/tools/amd_build/build_amd.py b/tools/amd_build/build_amd.py index 026293a9281a..9d4fa54c93b3 100755 --- a/tools/amd_build/build_amd.py +++ b/tools/amd_build/build_amd.py @@ -131,6 +131,20 @@ def is_hip_clang(): sources.write(line) print("%s updated" % gloo_cmake_file) +gloo_cmake_file = "third_party/gloo/cmake/Modules/Findrccl.cmake" +if os.path.exists(gloo_cmake_file): + do_write = False + with open(gloo_cmake_file, "r") as sources: + lines = sources.readlines() + newlines = [line.replace('RCCL_LIBRARY', 'RCCL_LIBRARY_PATH') for line in lines] + if lines == newlines: + print("%s skipped" % gloo_cmake_file) + else: + with open(gloo_cmake_file, "w") as sources: + for line in newlines: + sources.write(line) + print("%s updated" % gloo_cmake_file) + hipify_python.hipify( project_directory=proj_dir, output_directory=out_dir, diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 9f68622e7691..9bf266da394d 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -896,8 +896,8 @@ index: non_differentiable source: grad.take(index) -- name: qr(Tensor self, bool some=True) -> (Tensor Q, Tensor R) - self: qr_backward(grads, self, some, Q, R) +- name: linalg_qr(Tensor self, str mode='reduced') -> (Tensor Q, Tensor R) + self: linalg_qr_backward(grads, self, mode, Q, R) - name: rad2deg(Tensor self) -> Tensor self: rad2deg_backward(grad) diff --git a/tools/autograd/gen_autograd.py b/tools/autograd/gen_autograd.py index 88c00e0ba71a..b930aca504df 100644 --- a/tools/autograd/gen_autograd.py +++ b/tools/autograd/gen_autograd.py @@ -23,9 +23,6 @@ import argparse import os -import yaml -import re -from .utils import YamlLoader, op_name_with_overload from tools.codegen.selective_build.selector import SelectiveBuilder # See NOTE [ Autograd View Variables ] in variable.h for details. 
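
[Editor note: on the new vmap coverage for torch.trace above -- mathematically a batched trace is just a diagonal followed by a sum over the last dim, which is the equivalence these tests exercise. Quick illustrative check:]

import torch

x = torch.rand(7, 4, 4)  # batch of square matrices
expected = torch.stack([m.trace() for m in x])
batched = x.diagonal(dim1=-2, dim2=-1).sum(-1)
assert torch.allclose(expected, batched)
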
@@ -89,84 +86,14 @@ 'tensor_split', 'swapdims', 'swapaxes' }) -def format_return_type(returns): - if len(returns) == 0: - return 'void' - elif len(returns) == 1: - return returns[0]['type'] - else: - return_types = [r['type'] for r in returns] - return 'std::tuple<{}>'.format(','.join(return_types)) - - -def get_simple_type(arg): - simple_type = arg['type'] - simple_type = simple_type.replace(' &', '').replace('const ', '') - simple_type = simple_type.replace('Generator *', 'Generator') - - opt_match = re.match(r'c10::optional<(.+)>', simple_type) - if opt_match: - simple_type = '{}?'.format(opt_match.group(1)) - return simple_type - -def has_tensoroptions_argument(declaration): - for argument in declaration['arguments']: - if 'TensorOptions' == argument['dynamic_type']: - return True - return False - - -def load_aten_declarations(path): - with open(path, 'r') as f: - declarations = yaml.load(f, Loader=YamlLoader) - - # enrich declarations with additional information - selected_declarations = [] - for declaration in declarations: - if declaration.get('deprecated'): - continue - - for arg in declaration['arguments']: - arg['simple_type'] = get_simple_type(arg) - for arg in declaration['schema_order_arguments']: - arg['simple_type'] = get_simple_type(arg) - for ret in declaration['returns']: - ret['simple_type'] = get_simple_type(ret) - - declaration['formals'] = [arg['type'] + ' ' + arg['name'] - for arg in declaration['arguments']] - declaration['schema_order_formals'] = [arg['type'] + ' ' + arg['name'] - for arg in declaration['schema_order_arguments']] - declaration['args'] = [arg['name'] for arg in declaration['arguments']] - declaration['schema_order_args'] = [arg['name'] for arg in declaration['schema_order_arguments']] - declaration['api_name'] = declaration['name'] - if declaration.get('overload_name'): - declaration['type_wrapper_name'] = "{}_{}".format( - declaration['name'], declaration['overload_name']) - else: - declaration['type_wrapper_name'] = declaration['name'] - declaration['operator_name_with_overload'] = declaration['schema_string'].split('(')[0] - declaration['unqual_operator_name_with_overload'] = declaration['operator_name_with_overload'].split('::')[1] - declaration['return_type'] = format_return_type(declaration['returns']) - - declaration['base_name'] = declaration['name'] - selected_declarations.append(declaration) - - return selected_declarations - - -def gen_autograd(aten_path, native_functions_path, out, autograd_dir, operator_selector: SelectiveBuilder, disable_autograd=False): - full_aten_decls = load_aten_declarations(aten_path) - - def filter_decls(aten_decls, operator_selector): - def is_operator_selected_for_training(decl): - op_name = op_name_with_overload(decl) - return operator_selector.is_operator_selected_for_training(op_name) - - return [decl for decl in aten_decls if is_operator_selected_for_training(decl)] - - aten_decls = filter_decls(full_aten_decls, operator_selector) - +def gen_autograd( + aten_path: str, + native_functions_path: str, + out: str, + autograd_dir: str, + operator_selector: SelectiveBuilder, + disable_autograd: bool = False, +) -> None: # Parse and load derivatives.yaml from .load_derivatives import load_derivatives differentiability_infos = load_derivatives( @@ -175,13 +102,13 @@ def is_operator_selected_for_training(decl): template_path = os.path.join(autograd_dir, 'templates') # Generate VariableType.h/cpp + from .gen_trace_type import gen_trace_type + from .gen_variable_type import gen_variable_type if not disable_autograd: 
-        from .gen_variable_type import gen_variable_type
-        gen_variable_type(out, aten_decls, differentiability_infos, template_path)
+        gen_variable_type(out, native_functions_path, differentiability_infos, template_path, operator_selector)
 
-    from . import gen_trace_type
     # operator filter not applied as tracing sources are excluded in selective build
-    gen_trace_type.gen_trace_type(out, native_functions_path, template_path)
+    gen_trace_type(out, native_functions_path, template_path)
 
     # Generate Functions.h/cpp
     from .gen_autograd_functions import gen_autograd_functions_lib
@@ -193,7 +120,12 @@ def is_operator_selected_for_training(decl):
     gen_variable_factories(out, native_functions_path, template_path)
 
 
-def gen_autograd_python(aten_path, native_functions_path, out, autograd_dir):
+def gen_autograd_python(
+    aten_path: str,
+    native_functions_path: str,
+    out: str,
+    autograd_dir: str,
+) -> None:
     from .load_derivatives import load_derivatives
     differentiability_infos = load_derivatives(
         os.path.join(autograd_dir, 'derivatives.yaml'), native_functions_path)
@@ -212,7 +144,7 @@ def gen_autograd_python(aten_path, native_functions_path, out, autograd_dir):
         out, native_functions_path, deprecated_path, template_path)
 
 
-def main():
+def main() -> None:
     parser = argparse.ArgumentParser(
         description='Generate autograd C++ files script')
     parser.add_argument('declarations', metavar='DECL',
diff --git a/tools/autograd/gen_autograd_functions.py b/tools/autograd/gen_autograd_functions.py
index a22154b5c01d..4724b99a8742 100644
--- a/tools/autograd/gen_autograd_functions.py
+++ b/tools/autograd/gen_autograd_functions.py
@@ -141,7 +141,7 @@ def process_function(info: DifferentiabilityInfo, template: CodeTemplate) -> str
     compute_index_ranges: List[str] = []
     for arg in info.args_with_derivatives:
-        if arg.type == 'TensorList':
+        if arg.type == 'TensorList' or arg.type == 'const c10::List<c10::optional<Tensor>> &':
             size = f'{arg.name}_size_'
             saved_list_sizes.append(f'size_t {arg.name}_size_;')
         else:
@@ -166,6 +166,15 @@ def save_var(var: SavedAttribute, is_output: bool) -> None:
             release_variables.append(f'{name}_released_ = true;')
             unpack.append(f'auto {name} = unpack_list({name}_);')
             asserts.append(f'TORCH_CHECK(!{name}_released_, ERR_BACKWARD_TWICE);')
+        elif var.type == 'c10::List<c10::optional<Tensor>>':
+            saved_variables.append(f'std::vector<SavedVariable> {name}_;')
+            saved_variables.append(f'bool {name}_released_ = false;')
+            # Just clear() is sufficient, we don't need to loop and clear each variable.
+            # Because the SavedVariable owns a tensor and a grad_fn, removing the SavedVariable makes them go away as well.
+            release_variables.append(f'{name}_.clear();')
+            release_variables.append(f'{name}_released_ = true;')
+            unpack.append(f'auto {name} = unpack_opt_list({name}_);')
+            asserts.append(f'TORCH_CHECK(!{name}_released_, ERR_BACKWARD_TWICE);')
         elif var.type == 'IntArrayRef':
             saved_variables.append(f'std::vector<int64_t> {name};')
         elif var.type == 'c10::optional<IntArrayRef>':
diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py
index 1f61ce3dfa20..0450983a8e41 100644
--- a/tools/autograd/gen_python_functions.py
+++ b/tools/autograd/gen_python_functions.py
@@ -230,7 +230,7 @@ def signature_original(f: NativeFunction) -> str:
         opname += '_out'
     if f.func.name.name.inplace and pyi:
         opname += '_'
-    args = CppSignatureGroup.from_schema(f.func, method=False).signature.arguments()
+    args = CppSignatureGroup.from_native_function(f, method=False).signature.arguments()
     # Simply ignore TensorOptionsArguments as it does not exist in deprecated.yaml.
types = ', '.join(argument_type_str(a.argument.type) for a in args if isinstance(a.argument, Argument)) diff --git a/tools/autograd/gen_trace_type.py b/tools/autograd/gen_trace_type.py index b2dfe2667128..d8e68606e6ba 100644 --- a/tools/autograd/gen_trace_type.py +++ b/tools/autograd/gen_trace_type.py @@ -112,19 +112,12 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen ] else: name = arg.name - # XXX: For arg that have type of Tensor?[], tracer will pass allow_undefined to addInputs if str(arg.type) == 'Tensor?[]': - return [f'jit::tracer::addInputs(node, "{name}", {name}, true);'] + return [f'jit::tracer::addInputs(node, "{name}", {name});'] else: return [ADD_TRACE_INPUT.substitute(name=name, input=name)] - args: List[Union[Argument, TensorOptionsArguments]] = [] - if f.use_c10_dispatcher.dispatcher_uses_new_style(): - args = list(f.func.schema_order_arguments()) - else: - sig_group = CppSignatureGroup.from_schema(f.func, method=False) - args = [cpp_args.argument for cpp_args in sig_group.signature.arguments() - if not isinstance(cpp_args.argument, SelfArgument)] + args: List[Union[Argument, TensorOptionsArguments]] = list(f.func.schema_order_arguments()) if f.func.is_out_fn(): # *_out functions take the result as a separate argument, but we don't want to @@ -132,12 +125,7 @@ def dispatch_trace_input(arg: Union[Argument, TensorOptionsArguments]) -> Sequen # So first, we need to remove the out argument from the list of arguments to trace. # TODO: byte-for-byte compatible with old codegen behavior - it's incorrect to assume # there is only one output argument. - if f.use_c10_dispatcher.dispatcher_uses_new_style(): - # for c10-full ops, the out argument is in the end - args = args[:-1] - else: - # for legacy ops, the out argument is in the beginning. 
- args = args[1:] + args = args[:-1] trace_inputs = itertools.chain.from_iterable(dispatch_trace_input(arg) for arg in args) @@ -375,14 +363,10 @@ def method_definition(f: NativeFunction) -> Optional[str]: if cpp.name(f.func) in MANUAL_TRACER: return None - if f.use_c10_dispatcher.dispatcher_uses_new_style(): - formals = ', '.join( - f'{cpp.argument_type(a, binds="__placeholder__").cpp_type()} {a.name}' - for a in f.func.schema_order_arguments() - ) - else: - sig_group = CppSignatureGroup.from_schema(f.func, method=False) - formals = ', '.join(f'{a.type} {a.name}' for a in sig_group.signature.arguments()) + formals = ', '.join( + f'{cpp.argument_type(a, binds="__placeholder__").cpp_type()} {a.name}' + for a in f.func.schema_order_arguments() + ) return METHOD_DEFINITION.substitute( return_type=cpp.returns_type(f.func.returns), @@ -397,33 +381,22 @@ def method_definition(f: NativeFunction) -> Optional[str]: ); """) -UNBOXEDONLY_WRAPPER_REGISTRATION = CodeTemplate("""\ -m.impl_UNBOXED("${name}", &${class_type}::${type_wrapper_name}); -""") - @with_native_function def method_registration(f: NativeFunction) -> Optional[str]: if cpp.name(f.func) in MANUAL_TRACER: return None - if f.use_c10_dispatcher.dispatcher_uses_new_style(): - return WRAPPER_REGISTRATION.substitute( - name=f.func.name, - type_wrapper_name=type_wrapper_name(f), - class_type='TraceType', - ) - else: - return UNBOXEDONLY_WRAPPER_REGISTRATION.substitute( - name=f.func.name, - type_wrapper_name=type_wrapper_name(f), - class_type='TraceType', - ) + return WRAPPER_REGISTRATION.substitute( + name=f.func.name, + type_wrapper_name=type_wrapper_name(f), + class_type='TraceType', + ) def gen_trace_type_shard( fm: FileManager, native_functions: Sequence[NativeFunction], suffix: str ) -> None: fm.write_with_template('TraceType%s.cpp' % suffix, 'TraceType.cpp', lambda: { - 'generated_comment': f'@generated from {fm.template_dir}/TraceType.cpp', + 'generated_comment': '@' + f'generated from {fm.template_dir}/TraceType.cpp', 'trace_method_definitions': list(mapMaybe(method_definition, native_functions)), 'trace_wrapper_registrations': list(mapMaybe(method_registration, native_functions)), }) diff --git a/tools/autograd/gen_variable_factories.py b/tools/autograd/gen_variable_factories.py index a8c07aef4181..f8ab30dc4580 100644 --- a/tools/autograd/gen_variable_factories.py +++ b/tools/autograd/gen_variable_factories.py @@ -48,7 +48,7 @@ def process_function(f: NativeFunction) -> Optional[str]: if Variant.function not in f.variants or not is_factory: return None - sig = CppSignatureGroup.from_schema(f.func, method=False).signature + sig = CppSignatureGroup.from_native_function(f, method=False).signature formals: List[str] = [] exprs: List[str] = [] requires_grad = 'false' diff --git a/tools/autograd/gen_variable_type.py b/tools/autograd/gen_variable_type.py index 03fbf34034ea..c78e1e5f66cc 100644 --- a/tools/autograd/gen_variable_type.py +++ b/tools/autograd/gen_variable_type.py @@ -22,20 +22,24 @@ # which will in turn dispatch back to VariableType for its # differentiable subcomponents. 
 #
+from dataclasses import dataclass
 
-from .utils import CodeTemplate, nested_dict, write, make_out_api_name_faithful
 from .gen_autograd import VIEW_FUNCTIONS, VIEW_FUNCTIONS_WITH_METADATA_CHANGE, \
     MULTI_OUTPUT_SAFE_FUNCTIONS, RETURNS_VIEWS_OF_INPUT
 from .gen_autograd_functions import uses_single_grad
-from .gen_trace_type import MANUAL_BACKEND, MANUAL_AUTOGRAD_AND_TRACER, MANUAL_AUTOGRAD
+from .gen_trace_type import (
+    MANUAL_BACKEND, MANUAL_AUTOGRAD_AND_TRACER, MANUAL_AUTOGRAD,
+    declare_returned_variables, tie_return_values, get_return_value, type_wrapper_name,
+)
 from tools.codegen.api.types import *
 from tools.codegen.api.autograd import *
 import tools.codegen.api.cpp as cpp
-import tools.codegen.api.python as python
-from tools.codegen.gen import with_native_function
+from tools.codegen.code_template import CodeTemplate
+from tools.codegen.gen import with_native_function, parse_native_yaml, FileManager, mapMaybe
 from tools.codegen.model import *
-from typing import Dict, Optional, List, Sequence, Any, Callable
+from tools.codegen.selective_build.selector import SelectiveBuilder
+from typing import Callable, List, Optional, Sequence, Tuple, Union
 
 # We don't set or modify grad_fn on these methods. Generally, they return
 # tensors that have requires_grad=False. In-place functions listed here will
@@ -78,8 +82,8 @@
     'bmm', 'diagonal', 'alias', 'atan', 'log', 'log10', 'log1p', 'log2', 'reciprocal',
     'tan', 'pow', 'rsqrt', 'tanh', 'tanh_backward', 'asinh', 'acosh', 'take', 'fill_',
     'exp', 'nonzero', 'mean', 'inverse', 'solve', 'linalg_cholesky', 'addcmul', 'addcdiv',
-    'matrix_exp', 'linalg_eigh', 'cholesky_solve', 'qr', 'svd',
-    '_fft_c2c', '_fft_r2c', 'linalg_solve', 'sqrt'
+    'matrix_exp', 'linalg_eigh', 'cholesky_solve', 'linalg_qr', 'svd', '_fft_c2c', '_fft_r2c',
+    'linalg_solve', 'sqrt', 'stack', 'gather', 'index_select', 'index_add_'
 }
 
 # Some operators invalidate the grad_accumulator. Let's reset it.
@@ -118,6 +122,21 @@
 }
 """)
 
+SAVE_OPTIONALTENSORLIST_STORAGE = CodeTemplate("""\
+std::vector<c10::optional<Storage>> ${tensorlist_name}_storage_saved(${tensorlist_name}.size());
+for (const c10::optional<Tensor>& tensor : ${tensorlist_name})
+  ${tensorlist_name}_storage_saved.push_back(
+    tensor.has_value() && tensor->has_storage() ? c10::optional<Storage>(tensor->storage()) : c10::nullopt);
+""")
+
+ENFORCE_SAME_OPTIONALTENSORLIST_STORAGE = CodeTemplate("""\
+for (size_t i=0; i<${tensorlist_name}.size(); i++) {
+  if (${tensorlist_name}_storage_saved[i].has_value())
+    AT_ASSERT(${tensorlist_name}_storage_saved[i].value().is_alias_of(
+        static_cast<c10::optional<Tensor>>(${tensorlist_name}[i])->storage()));
+}
+""")
+
 SAVE_TENSOR_IMPL = CodeTemplate("""\
 c10::intrusive_ptr<TensorImpl> ${tensor_name}_impl_saved;
 if (${tensor_name}.defined()) ${tensor_name}_impl_saved = ${tensor_name}.getIntrusivePtr();
@@ -140,6 +159,21 @@
 }
 """)
 
+SAVE_OPTIONALTENSORLIST_IMPL = CodeTemplate("""\
+std::vector<c10::intrusive_ptr<TensorImpl>> ${tensorlist_name}_impl_saved(${tensorlist_name}.size());
+for (size_t i=0; i<${tensorlist_name}.size(); i++) {
+  c10::optional<Tensor> t = ${tensorlist_name}[i];
+  if (t.has_value() && t->defined()) ${tensorlist_name}_impl_saved[i] = t->getIntrusivePtr();
+}
+""")
+
+ENFORCE_SAME_OPTIONALTENSORLIST_IMPL = CodeTemplate("""\
+for (size_t i=0; i<${tensorlist_name}.size(); i++) {
+  if (${tensorlist_name}_impl_saved[i])
+    AT_ASSERT(${tensorlist_name}_impl_saved[i] == static_cast<c10::optional<Tensor>>(${tensorlist_name}[i])->getIntrusivePtr());
+}
+""")
+
 # The following list contains functions that we don't enforce the invariant on.
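
[Editor note: the optional-tensor-list templates added above mirror the existing TensorList ones. A quick way to see the C++ they emit, assuming the repo root is on sys.path; the substituted name `weights` is made up:]

from tools.autograd.gen_variable_type import SAVE_OPTIONALTENSORLIST_STORAGE

print(SAVE_OPTIONALTENSORLIST_STORAGE.substitute(tensorlist_name='weights'))
# std::vector<c10::optional<Storage>> weights_storage_saved(weights.size());
# for (const c10::optional<Tensor>& tensor : weights)
#   weights_storage_saved.push_back(
#     tensor.has_value() && tensor->has_storage() ? c10::optional<Storage>(tensor->storage()) : c10::nullopt);
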
DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE = { # These functions are expected to change impl or storage of input tensors @@ -157,19 +191,6 @@ } """) -# NOTE[UnboxedOnly] Many of our codegen templates currently exist twice, once -# in an _UNBOXEDONLY_ variant and once without _UNBOXEDONLY_. This is because -# ops that are `use_c10_dispatcher: full` need different c++ code than ops -# that aren't `use_c10_dispatcher: full` yet. The _UNBOXEDONLY_ variants -# are for ops that aren't `use_c10_dispatcher: full` yet and those code templates -# can be deleted once all ops are `use_c10_dispatcher: full`. -# If you update one of the templates, you likely also have to update the other. - -# See NOTE[UnboxedOnly] -UNBOXEDONLY_WRAPPER_REGISTRATION = CodeTemplate("""\ -m.impl_UNBOXED("${unqual_operator_name_with_overload}", &${class_type}::${type_wrapper_name}); -""") - WRAPPER_REGISTRATION = CodeTemplate("""\ m.impl("${unqual_operator_name_with_overload}", TORCH_FN(${class_type}::${type_wrapper_name}) @@ -179,9 +200,6 @@ UNPACK_TENSOR = CodeTemplate("""\ auto${ref} ${arg_name}_ = unpack${suffix}(${arg_name}, "${arg_name}", ${arg_pos});""") -LEGACY_WRAP_OPTIONS = CodeTemplate("""\ -auto ${arg_name}_ = TensorOptions(${arg_name});""") - DECLARE_GRAD_FN = CodeTemplate("""\ std::shared_ptr<${op}> grad_fn; """) @@ -274,49 +292,18 @@ #endif """) -# Methods shared by TraceType and VariableType to handle return variable declaration, tie and tuple. -def format_return_variables(declaration): - name = declaration['name'] - arguments = declaration['arguments'] - inplace = declaration['inplace'] - is_out_fn = name.endswith('_out') - modifies_arguments = inplace or is_out_fn - - def declare_returned_variables(): - if modifies_arguments: - return '' - if len(declaration['returns']) == 1: - return '' - # TODO: this will be ugly - names = [ret['type'] + ' ' + ret['name'] + ';' for ret in declaration['returns']] - return '\n'.join(names) - - def tie_return_values(): - if len(declaration['returns']) == 1: - return 'auto {}'.format(declaration['returns'][0]['name']) - names = [ret['name'] for ret in declaration['returns']] - return 'std::tie({})'.format(', '.join(names)) - - def get_return_value(): - if inplace: - return 'self' - if is_out_fn: - return_names = [arg['name'] for arg in arguments - if arg.get('output', False)] - if len(return_names) == 1: - return return_names[0] - return 'std::forward_as_tuple({})'.format(', '.join(return_names)) - - returns = declaration['returns'] - if len(returns) == 1: - return returns[0]['name'] - moved = ['std::move({})'.format(r['name']) for r in returns] - return 'std::make_tuple({})'.format(', '.join(moved)) - - return (declare_returned_variables(), tie_return_values(), get_return_value()) +@dataclass(frozen=True) +class NativeFunctionWithDifferentiabilityInfo: + func: NativeFunction + info: Optional[DifferentiabilityInfo] - -def gen_variable_type(out, aten_declarations, differentiability_infos, template_path): +def gen_variable_type( + out: str, + native_yaml_path: str, + differentiability_infos: Sequence[DifferentiabilityInfo], + template_path: str, + operator_selector: SelectiveBuilder, +) -> None: """VariableType.h and VariableType.cpp body @@ -324,153 +311,190 @@ def gen_variable_type(out, aten_declarations, differentiability_infos, template_ implementation of each function dispatches to the base tensor type to compute the output. The grad_fn is attached to differentiable functions. 
""" + fns = list(sorted(filter( + operator_selector.is_native_function_selected_for_training, + parse_native_yaml(native_yaml_path)), key=lambda f: cpp.name(f.func))) + fns_with_infos = match_differentiability_info(fns, differentiability_infos) - aten_declarations = list(sorted(aten_declarations, key=lambda decl: decl['name'])) - match_declarations_with_differentiability_info(aten_declarations, differentiability_infos) - - gen_variable_type_shard(out, aten_declarations, template_path, None, True) + fm = FileManager(install_dir=out, template_dir=template_path, dry_run=False) + gen_variable_type_shard(fm, fns_with_infos, 'VariableType.h', 'VariableType.h') # NOTE: see Note [Sharded File] at the top of the VariableType.cpp # template regarding sharding of the generated files. num_shards = 5 - shards = [[] for _ in range(num_shards)] + shards: List[List[NativeFunctionWithDifferentiabilityInfo]] = [[] for _ in range(num_shards)] # functions are assigned arbitrarily but stably to a file based on hash - for decl in aten_declarations: - x = sum(ord(c) for c in decl['name']) % num_shards - shards[x].append(decl) + for fn in fns_with_infos: + x = sum(ord(c) for c in cpp.name(fn.func.func)) % num_shards + shards[x].append(fn) for i, shard in enumerate(shards): - gen_variable_type_shard(out, shard, template_path, '_%d' % i, False) - gen_variable_type_shard(out, aten_declarations, template_path, 'Everything', False) - + gen_variable_type_shard(fm, shard, 'VariableType.cpp', f'VariableType_{i}.cpp') -def gen_variable_type_shard(out, aten_declarations, template_path, suffix, header): - VARIABLE_TYPE_H = CodeTemplate.from_file(template_path + '/VariableType.h') - VARIABLE_TYPE_CPP = CodeTemplate.from_file(template_path + '/VariableType.cpp') + gen_variable_type_shard(fm, fns_with_infos, 'VariableType.cpp', 'VariableTypeEverything.cpp') - type_declarations = [] - type_definitions = [] - wrapper_registrations = [] - - for declaration in aten_declarations: - if declaration['use_c10_dispatcher'] in ['full', 'hacky_wrapper_for_legacy_signatures']: - formals = declaration['schema_order_formals'] - else: - assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - formals = declaration['formals'] - type_declarations.append(METHOD_DECLARATION.substitute(declaration, formals=formals)) - strategy = dispatch_strategy(declaration) - if declaration['name'] not in MANUAL_AUTOGRAD and strategy == 'use_derived': - body = emit_body(declaration) +@with_native_function +def gen_formals(f: NativeFunction) -> str: + return ', '.join( + f'{cpp.argument_type(a, binds="__placeholder__").cpp_type()} {a.name}' + for a in f.func.schema_order_arguments() + ) +@with_native_function +def gen_wrapper_registration(f: NativeFunction) -> str: + return WRAPPER_REGISTRATION.substitute( + unqual_operator_name_with_overload=f.func.name, + type_wrapper_name=type_wrapper_name(f), + class_type='VariableType', + ) + +def gen_variable_type_shard( + fm: FileManager, + fns_with_infos: List[NativeFunctionWithDifferentiabilityInfo], + template_name: str, + output_name: str, +) -> None: + type_declarations: List[str] = [] + type_definitions: List[str] = [] + wrapper_registrations: List[str] = [] + + for fn in fns_with_infos: + f = fn.func + name = cpp.name(f.func) + formals = gen_formals(f) + + type_declarations.append(METHOD_DECLARATION.substitute( + return_type=cpp.returns_type(f.func.returns), + type_wrapper_name=type_wrapper_name(f), + formals=formals, + )) + + if name not in MANUAL_AUTOGRAD and dispatch_strategy(fn) == 
'use_derived': type_definitions.append(METHOD_DEFINITION.substitute( - declaration, type_definition_body=body, formals=formals)) - if declaration['use_c10_dispatcher'] in ['full', 'hacky_wrapper_for_legacy_signatures']: - wrapper_registrations.append(WRAPPER_REGISTRATION.substitute( - declaration, class_type='VariableType')) - else: - assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - wrapper_registrations.append(UNBOXEDONLY_WRAPPER_REGISTRATION.substitute( - declaration, class_type='VariableType')) + return_type=cpp.returns_type(f.func.returns), + type_wrapper_name=type_wrapper_name(f), + type_definition_body=emit_body(fn), + formals=formals, + )) + wrapper_registrations.append(gen_wrapper_registration(f)) # See Note [Manual Backend kernels] - assert (declaration['name'] in MANUAL_BACKEND) == declaration['manual_kernel_registration'] + assert (name in MANUAL_BACKEND) == f.manual_kernel_registration # If you want to register a kernel to Autograd, you must make the op abstract. # In other words, this op must have dispatch section in native_functions.yaml. - if declaration['name'] in MANUAL_AUTOGRAD_AND_TRACER or declaration['derivative']: - msg = (f'There\'s a formula for {declaration["name"]}(or its functional variant) in derivatives.yaml. ' + if name in MANUAL_AUTOGRAD_AND_TRACER or (fn.info and fn.info.has_derivatives): + msg = (f'There\'s a formula for {name}(or its functional variant) in derivatives.yaml. ' f'It\'s required to add a dispatch section for it with explicit supported backends e.g CPU/CUDA ' f'or DefaultBackend in native_functions.yaml. Please see ' f'https://github.com/pytorch/pytorch/tree/master/aten/src/ATen/native#choosing-the-right-dispatch-keyword ' f'for instructions to choose the right dispatch keyword.') - assert declaration['abstract'], msg + assert f.is_abstract, msg - env = { + fm.write_with_template(output_name, template_name, lambda: { + 'generated_comment': '@' + f'generated from {fm.template_dir}/{template_name}', 'type_derived_method_declarations': type_declarations, 'type_derived_method_definitions': type_definitions, 'wrapper_registrations': wrapper_registrations, - } - if header: - write(out, 'VariableType.h', VARIABLE_TYPE_H, env) - else: - write(out, 'VariableType%s.cpp' % suffix, VARIABLE_TYPE_CPP, env) - - -def emit_body(declaration): - assert dispatch_strategy(declaration) == 'use_derived' - - arguments = declaration['arguments'] - returns = declaration['returns'] - func = declaration['derivative'] - name = declaration['name'] - inplace = declaration['inplace'] - is_out_fn = name.endswith('_out') - modifies_arguments = inplace or is_out_fn - returns_void = len(returns) == 0 - - base_name = name[:-1] if inplace else name[:-4] if is_out_fn else name + }) + +def emit_body(fn: NativeFunctionWithDifferentiabilityInfo) -> List[str]: + assert dispatch_strategy(fn) == 'use_derived' + f = fn.func + info = fn.info + + name = cpp.name(f.func) + inplace = f.func.kind() == SchemaKind.inplace + is_out_fn = f.func.kind() == SchemaKind.out + returns_void = len(f.func.returns) == 0 + base_name = f.func.name.name.base # TODO: should be str(f.func.name.name)? 
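
[Editor note: the shard assignment in gen_variable_type above is just a stable checksum of the op name modulo the shard count, so every op always lands in the same VariableType_<k>.cpp regardless of iteration order; tiny illustration:]

def shard_index(op_name: str, num_shards: int = 5) -> int:
    # Deterministic across runs and independent of the order ops are visited in.
    return sum(ord(c) for c in op_name) % num_shards

print(shard_index('add'), shard_index('linalg_qr'))
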
view_info = VIEW_FUNCTIONS.get(base_name, None) if view_info is None and base_name in RETURNS_VIEWS_OF_INPUT: view_info = "self" - def is_differentiable(arg): - if 'TensorOptions' in arg['type']: - return False - if 'Tensor' not in arg['type']: - return False - if arg['name'] in declaration.get('non_differentiable_arg_names', []): - return False - return True - - def find_args_with_derivatives(differentiable_inputs): + def is_differentiable(name: str, type: Type) -> bool: + return type.is_tensor_like() and (info is None or name not in info.non_differentiable_arg_names) + + def gen_differentiable_input( + arg: Union[Argument, SelfArgument, TensorOptionsArguments] + ) -> Optional[DifferentiableInput]: + if isinstance(arg, TensorOptionsArguments): + return None + a: Argument = arg.argument if isinstance(arg, SelfArgument) else arg + + # TODO: `cpp_type` is only to keep it byte-for-byte compatible with the old codegen, should remove. + # NB: This is not a clone of cpp.argument() - TensorOptionsArguments / faithful / binds are + # not handled properly as they are irrelevant for this codegen. + cpp_type = cpp.argument_type(a, binds=a.name).cpp_type() + + if not is_differentiable(a.name, a.type): + return None + return DifferentiableInput( + name=a.name, + type=a.type, + cpp_type=cpp_type, + ) + + @with_native_function + def gen_differentiable_inputs(f: NativeFunction) -> List[DifferentiableInput]: + return list(mapMaybe(gen_differentiable_input, f.func.arguments.non_out)) + + def find_args_with_derivatives(differentiable_inputs: List[DifferentiableInput]) -> List[DifferentiableInput]: """Find arguments that have derivative definitions""" - if func is None: + if info is None or not info.has_derivatives: return differentiable_inputs - names = set(name for d in func.derivatives for name in d.var_names) - differentiable = [arg for arg in differentiable_inputs if arg['name'] in names] + names = set(name for d in info.derivatives for name in d.var_names) + differentiable = [arg for arg in differentiable_inputs if arg.name in names] if len(differentiable) != len(names): - missing = names - set(arg['name'] for arg in differentiable) - raise RuntimeError(f'Missing arguments for derivatives: {missing} in {func.name}') + missing = names - set(arg.name for arg in differentiable) + raise RuntimeError(f'Missing arguments for derivatives: {missing} in {info.name}') return differentiable - inputs = [arg for arg in arguments if not arg.get('output', False)] - differentiable_inputs = list(filter(is_differentiable, inputs)) + def gen_differentiable_outputs(f: NativeFunction) -> List[DifferentiableOutput]: + outputs: List[DifferentiableOutput] = [ + DifferentiableOutput(name=name, type=ret.type, cpp_type=cpp.return_type(ret)) + for name, ret in zip(cpp.return_names(f), f.func.returns)] + + output_differentiability = info.output_differentiability if info else None + if output_differentiability is not None: + differentiable_outputs: List[DifferentiableOutput] = [] + if False in output_differentiability and f.func.kind() == SchemaKind.inplace: + raise RuntimeError("output_differentiability=False for inplace operation (version_counter won't get updated)") + for differentiable, output in zip(output_differentiability, outputs): + if differentiable: + differentiable_outputs.append(output) + return differentiable_outputs + + candidate_differentiable_outputs = list(filter(lambda r: is_differentiable(r.name, r.type), outputs)) + + if uses_single_grad(info): + return candidate_differentiable_outputs[:1] + else: + return 
candidate_differentiable_outputs + + differentiable_inputs = gen_differentiable_inputs(f) args_with_derivatives = find_args_with_derivatives(differentiable_inputs) - non_differentiable_arg_names = declaration.get('non_differentiable_arg_names', []) - candidate_differentiable_outputs = list(filter(is_differentiable, returns)) - - if declaration['output_differentiability'] is not None: - differentiable_outputs = [] - output_differentiability = declaration['output_differentiability'] - if False in output_differentiability and inplace: - raise RuntimeError("output_differentiability=False for inplace operation (version_counter won't get updated)") - for differentiable, output in zip(output_differentiability, returns): - if differentiable: - differentiable_outputs.append(output) - elif uses_single_grad(func): - differentiable_outputs = candidate_differentiable_outputs[:1] - else: - differentiable_outputs = candidate_differentiable_outputs + differentiable_outputs = gen_differentiable_outputs(f) requires_derivative = ( base_name not in DONT_REQUIRE_DERIVATIVE and name not in DONT_REQUIRE_DERIVATIVE and len(differentiable_inputs) > 0 and len(differentiable_outputs) > 0) - if func is not None and not requires_derivative: - raise RuntimeError('ERROR: derivative ignored for {} -- specified an autograd function without derivative' - .format(name)) + if info is not None and info.has_derivatives and not requires_derivative: + raise RuntimeError(f'ERROR: derivative ignored for {name} -- specified an autograd function without derivative') - def emit_save_inputs(): - setup = [] - if func is None: + def emit_save_inputs() -> List[str]: + setup: List[str] = [] + if info is None or not info.has_derivatives: return setup - has_tensorlist_arg = any(arg.type == 'TensorList' for arg in func.args_with_derivatives) + has_tensorlist_arg = any(is_tensor_list_type(arg.type) for arg in args_with_derivatives) # We don't want to save tensors if we know that they will never be used # when computing the derivative, so we add guards to those statements def guard_for(arg: SavedAttribute) -> Optional[str]: + assert info is not None + # It's hard to determine the edge offset if we have TensorLists if has_tensorlist_arg: return None @@ -481,12 +505,12 @@ def guard_for(arg: SavedAttribute) -> Optional[str]: # require_grad if the backward function even gets executed. I don't # have any good ideas for detecting those cases, so I simply disabled the # checks. 
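
[Editor note: guard_for above only emits a `grad_fn->should_compute_output(i)` guard when exactly one derivative, of exactly one input, consumes the saved value; a toy model of that decision (the data shapes here are made up, not the real SavedAttribute/Derivative classes):]

def guard_expr(saved_name, derivatives, args_with_derivatives):
    used_in = [d for d in derivatives if saved_name in d['saved_inputs']]
    if len(used_in) != 1 or len(used_in[0]['var_names']) != 1:
        return None  # no cheap single-edge guard; save unconditionally
    edge = args_with_derivatives.index(used_in[0]['var_names'][0])
    return f'grad_fn->should_compute_output({edge})'

print(guard_expr('other',
                 [{'saved_inputs': {'other'}, 'var_names': ['self']}],
                 ['self', 'other']))  # grad_fn->should_compute_output(0)
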
- if 'backward' in func.name: + if 'backward' in info.name: return None # If there's a single derivative we could compute, we already have # a requires_grad check that is sufficient - if len(func.args_with_derivatives) <= 1: + if len(args_with_derivatives) <= 1: return None # We really only care about trimming down the amount of tensors we save @@ -495,7 +519,7 @@ def guard_for(arg: SavedAttribute) -> Optional[str]: # We want to emit simple guards, so we only allow that if checking one # input is enough to determine whether we need that value - used_in = [d for d in func.derivatives if arg in d.saved_inputs] + used_in = [d for d in info.derivatives if arg in d.saved_inputs] assert len(used_in) > 0 if len(used_in) != 1: return None @@ -505,75 +529,76 @@ def guard_for(arg: SavedAttribute) -> Optional[str]: derivative_var_name = derivative.var_names[0] # Figure out the offset of the edge that uses this variable - for edge_off, arg in enumerate(func.args_with_derivatives): - if arg.name == derivative_var_name: + for edge_off, a in enumerate(args_with_derivatives): + if a.name == derivative_var_name: break else: raise AssertionError() return f'grad_fn->should_compute_output({edge_off})' - setup.extend(save_variables(func.all_saved_inputs, False, guard_for)) - for arg in func.args_with_derivatives: - if arg.type == 'TensorList': + setup.extend(save_variables(info.all_saved_inputs, False, guard_for)) + for arg in args_with_derivatives: + if is_tensor_list_type(arg.type): setup.append(f'grad_fn->{arg.name}_size_ = {arg.name}.size();') return setup - def setup_derivative(differentiable_inputs): - env = {} - env['args_with_derivatives'] = [arg['name'] for arg in args_with_derivatives] - env['op'] = func.op if func is not None else 'NotImplemented' - env['op_ctor'] = '' if func is not None else '"{}"'.format(declaration['api_name']) - + def setup_derivative(differentiable_inputs: List[DifferentiableInput]) -> List[str]: + body: List[str] = [] if is_out_fn: # For out functions, ensure that no input or output requires grad - body = [] body.append(DECLARE_GRAD_FN.substitute(op='Node')) body.append(SETUP_NONE_REQUIRES_GRAD.substitute( base_name=base_name, - args_to_check=[arg['name'] for arg in differentiable_inputs])) + args_to_check=[arg.name for arg in differentiable_inputs])) body.append(SETUP_NONE_REQUIRES_GRAD.substitute( base_name=base_name, - args_to_check=[arg['name'] for arg in differentiable_outputs])) + args_to_check=[arg.name for arg in differentiable_outputs])) return body + op = info.op if info is not None and info.has_derivatives else 'NotImplemented' setup = [] - setup.extend(ASSIGN_GRAD_FN.substitute(env).split('\n')) + setup.extend(ASSIGN_GRAD_FN.substitute( + op=op, + op_ctor='' if info is not None and info.has_derivatives else f'"{cpp.name(f.func)}"', + args_with_derivatives=[arg.name for arg in args_with_derivatives], + ).split('\n')) setup.extend(emit_save_inputs()) - body = [] body.extend(emit_check_no_requires_grad(differentiable_inputs, args_with_derivatives)) - body.append(DECLARE_GRAD_FN.substitute(env)) + body.append(DECLARE_GRAD_FN.substitute(op=op)) body.append(SETUP_DERIVATIVE.substitute(setup=setup)) return body - def emit_check_if_in_complex_autograd_allowlist(): - body = [] + def emit_check_if_in_complex_autograd_allowlist() -> List[str]: + body: List[str] = [] if base_name in GRADIENT_IMPLEMENTED_FOR_COMPLEX: return body for arg in differentiable_outputs: - name = arg['name'] - if arg['type'] == 'Tensor' or arg['type'] == 'TensorList': - 
body.append('throw_error_for_complex_autograd({}, "{}");'.format(name, base_name))
+            name = arg.name
+            # TODO: should be `arg.type.is_tensor_like()`?
+            if arg.cpp_type in ['Tensor', 'TensorList', 'const c10::List<c10::optional<Tensor>> &']:
+                body.append(f'throw_error_for_complex_autograd({name}, "{base_name}");')
         return body
 
-    def emit_check_no_requires_grad(tensor_args, args_with_derivatives):
+    def emit_check_no_requires_grad(
+        tensor_args: List[DifferentiableInput],
+        args_with_derivatives: List[DifferentiableInput],
+    ) -> List[str]:
         """Checks that arguments without derivatives don't require grad"""
-        body = []
+        body: List[str] = []
         for arg in tensor_args:
             if arg in args_with_derivatives:
                 continue
-            name = arg['name']
-            if name in non_differentiable_arg_names:
+            name = arg.name
+            if info and name in info.non_differentiable_arg_names:
                 continue
             if name == 'output':
                 # Double-backwards definitions sometimes take in 'input' and
                 # 'output', but only define the derivative for input.
                 continue
-            if arg['dynamic_type'] in {'IndexTensor', 'ByteTensor', 'BoolTensor'}:
-                continue
-            body.append('check_no_requires_grad({}, "{}");'.format(name, name))
+            body.append(f'check_no_requires_grad({name}, "{name}");')
         return body
 
     def save_variables(
@@ -599,7 +624,7 @@ def save_variables(
                 expr = f'SavedVariable({var}, {str(is_output).lower()}, {is_inplace_view})'
             else:
                 expr = f'SavedVariable({var}, {str(is_output).lower()})'
-        elif arg.type == 'TensorList':
+        elif arg.type in ['TensorList', 'c10::List<c10::optional<Tensor>>']:
             name += '_'
             expr = f'make_saved_variable_list({arg.name})'
         elif arg.type == 'IntArrayRef':
@@ -613,42 +638,40 @@ def save_variables(
         stmts.append('}')
         return stmts
 
-    def emit_dispatch_call(api_name, input_base, unpacked_args):
+    def emit_dispatch_call(f: NativeFunction, input_base: str, unpacked_args: Sequence[str]) -> str:
         """ Dispatch call via function in a namespace or method on Tensor."""
-        if 'namespace' in declaration['method_of']:
-            if declaration['use_c10_dispatcher'] in ['hacky_wrapper_for_legacy_signatures', 'full']:
-                dispatcher_api_name = make_out_api_name_faithful(api_name)
-            else:
-                assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper'
-                dispatcher_api_name = api_name
+        if Variant.function in f.variants:
             call = CALL_DISPATCH_VIA_NAMESPACE.substitute(
-                api_name=dispatcher_api_name,
+                api_name=cpp.name(
+                    f.func,
+                    faithful_name_for_out_overloads=True,
+                ),
                 unpacked_args=unpacked_args)
         else:
             call = CALL_DISPATCH_VIA_METHOD.substitute(
-                api_name=api_name,
+                api_name=cpp.name(f.func),
                 var=input_base,
                 unpacked_method_args=unpacked_args[1:])
         return call
 
-    def emit_view_lambda():
+    def emit_view_lambda(unpacked_bindings: List[Binding]) -> str:
         """ Generate an additional lambda function to recover views in backward when as_strided is not supported.
        See Note [View + Inplace update for base tensor]
        and [View + Inplace update for view tensor] for more details."""
         input_base = 'input_base'
         replay_view_func = ''
-        updated_unpacked_args = []
-        combined = nested_dict(env, declaration)
-        known_view_arg_simple_types = ['int64_t', 'int64_t?', 'bool', 'IntArrayRef']
-        for arg in combined['unpacked_args']:
+        updated_unpacked_args: List[str] = []
+        known_view_arg_simple_types: List[str] = ['int64_t', 'c10::optional<int64_t>', 'bool', 'IntArrayRef']
+        for unpacked_binding in unpacked_bindings:
+            arg, arg_type = unpacked_binding.name, unpacked_binding.type
             if arg == 'self_':
                 updated_unpacked_args.append(input_base)
                 continue
-            arg_type = combined['unpacked_args_simple_type'][arg]
             if arg_type not in known_view_arg_simple_types:
-                raise TypeError('You are adding an {} {} argument to op {} in addition to known types: {}. '
-                                'Please update the list or materialize it so that it can be closed over by value, '
-                                'also add a test in pytorch/xla/test/test_operations.py where this code is exercised.'
-                                .format(arg_type, arg, declaration['name'], ', '.join(known_view_arg_simple_types)))
+                known_types_str = ', '.join(known_view_arg_simple_types)
+                raise TypeError(f'You are adding an {arg_type} {arg} argument to op {cpp.name(f.func)} in addition to known types: '
+                                f'{known_types_str}. Please update the list or materialize it so that it can be closed '
+                                'over by value, also add a test in pytorch/xla/test/test_operations.py where this code '
+                                'is exercised.')
 
             if arg_type == 'IntArrayRef':
                 # It's not safe to close over IntArrayRef by value, since this is a
@@ -656,7 +679,7 @@ def emit_view_lambda():
                 arg_vec = arg + '_vec'
                 replay_view_func += ARRAYREF_TO_VEC.substitute(arg=arg, vec=arg_vec)
                 updated_unpacked_args.append(arg_vec)
-            elif arg_type == 'int64_t?':
+            elif arg_type == 'c10::optional<int64_t>':
                 # Materialize int64_t? to int64_t
                 arg_value = arg + '_val'
                 replay_view_func += OPTIONAL_TO_VAL.substitute(arg=arg, val=arg_value, default='0')
@@ -664,7 +687,7 @@ def emit_view_lambda():
             else:
                 updated_unpacked_args.append(arg)
 
-        replay_view_call = emit_dispatch_call(combined['api_name'], input_base, updated_unpacked_args)
+        replay_view_call = emit_dispatch_call(f, input_base, updated_unpacked_args)
         replay_view_func += REPLAY_VIEW_LAMBDA_FUNC.substitute(
             input_base=input_base,
             replay_view_call=replay_view_call)
@@ -675,17 +698,17 @@ def emit_view_lambda():
             is_view_with_metadata_change=is_view_with_metadata_change,
             replay_view_func=replay_view_func)
 
-    def wrap_output(return_values, var):
+    def wrap_output(f: NativeFunction, unpacked_bindings: List[Binding], var: str) -> str:
         call = ''
-        rhs_value = None
-        if 'Tensor' not in declaration['return_type']:
+        rhs_value: Optional[str] = None
+        if not any(r.type.is_tensor_like() for r in f.func.returns):
             rhs_value = var
         elif view_info is not None:
             # See NOTE [ Autograd View Variables ] in variable.h for details.
- differentiable_output_vars = {r['name'] for r in differentiable_outputs} + differentiable_output_vars = {r.name for r in differentiable_outputs} if not isinstance(view_info, str): - raise TypeError("The view info should be a string for {}, but it is: {}".format(base_name, view_info)) + raise TypeError(f'The view info should be a string for {base_name}, but it is: {view_info}') if len(differentiable_output_vars) == 0: # no output is differentiable (.indices() for SparseTensors for example) @@ -694,49 +717,55 @@ def wrap_output(return_values, var): # Single differentiable output (Tensor or Tensor[]) return_info = differentiable_outputs[0] # We only support simple Tensor or a TensorList for functions that return views - if not return_info['dynamic_type'] in ['Tensor', 'TensorList']: - raise RuntimeError("{} that return differentiable views can only return Tensor or Tensor[]".format(base_name)) + if not is_tensor_type(return_info.type) and not is_tensor_list_type(return_info.type): + raise RuntimeError(f'{base_name} that return differentiable views can only return Tensor or Tensor[]') # Only allow rebasing of the history if we return a single Tensor # If we are in a no grad block, raise a warning # See NOTE [ View + Inplace detection ] for more details about this logic - if return_info['dynamic_type'] == 'TensorList': + if is_tensor_list_type(return_info.type): if base_name in MULTI_OUTPUT_SAFE_FUNCTIONS: - creation_meta = "CreationMeta::MULTI_OUTPUT_SAFE" + creation_meta = 'CreationMeta::MULTI_OUTPUT_SAFE' else: - creation_meta = "CreationMeta::MULTI_OUTPUT_NODE" - call += ("as_view(/* base */ {}, /* output */ {}, /* is_bw_differentiable */ true, " - "/* is_fw_differentiable */ true, " - "/* creation_meta */ {});").format(view_info, var, creation_meta) - rhs_value = 'std::move({})'.format(var) + creation_meta = 'CreationMeta::MULTI_OUTPUT_NODE' + call += (f'as_view(/* base */ {view_info}, /* output */ {var}, /* is_bw_differentiable */ true, ' + '/* is_fw_differentiable */ true, ' + f'/* creation_meta */ {creation_meta});') + rhs_value = f'std::move({var})' else: - call += emit_view_lambda() - creation_meta = "GradMode::is_enabled() ? CreationMeta::DEFAULT: CreationMeta::NO_GRAD_MODE" - rhs_value = ("as_view(/* base */ {}, /* output */ {}, /* is_bw_differentiable */ true, " - "/* is_fw_differentiable */ true, " - "/* view_func */ func, /* creation_meta */ {})").format(view_info, var, creation_meta) + call += emit_view_lambda(unpacked_bindings) + creation_meta = 'GradMode::is_enabled() ? CreationMeta::DEFAULT: CreationMeta::NO_GRAD_MODE' + rhs_value = (f'as_view(/* base */ {view_info}, /* output */ {var}, /* is_bw_differentiable */ true, ' + '/* is_fw_differentiable */ true, ' + f'/* view_func */ func, /* creation_meta */ {creation_meta})') else: # This could be supported but we don't need it at the moment, so keeping things simple. 
-                raise RuntimeError("Function that return multiple differentiable output "
-                                   "when at least one of them is view is not supported.")
+                raise RuntimeError('Function that return multiple differentiable output '
+                                   'when at least one of them is view is not supported.')
         else:
-            rhs_value = 'std::move({})'.format(var)
+            rhs_value = f'std::move({var})'
         assert rhs_value is not None
-        call += ASSIGN_RETURN_VALUE.substitute(return_values=return_values,
+        call += ASSIGN_RETURN_VALUE.substitute(return_values=tie_return_values(f),
                                                rhs_value=rhs_value)
         return call
 
-    def enforce_same_tensorimpl_and_storage(env, call):
-        save_ptrs_stmts = []
-        enforce_same_ptrs_stmts = []
-        if declaration['name'] not in DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE:
-            for arg in env.get('unpacked_args', []):
-                simple_type = env['unpacked_args_simple_type'][arg]
-                if simple_type == 'TensorList':
+    def enforce_same_tensorimpl_and_storage(call: str, unpacked_bindings: List[Binding]) -> str:
+        save_ptrs_stmts: List[str] = []
+        enforce_same_ptrs_stmts: List[str] = []
+        if cpp.name(f.func) not in DONT_ENFORCE_SAME_TENSOR_IMPL_OR_STORAGE:
+            for unpacked_binding in unpacked_bindings:
+                arg = unpacked_binding.name
+                noref_cpp_type = unpacked_binding.ctype.cpp_type(strip_ref=True)
+                if noref_cpp_type == 'TensorList':
                     save_ptrs_stmts += [SAVE_TENSORLIST_STORAGE.substitute(tensorlist_name=arg),
                                         SAVE_TENSORLIST_IMPL.substitute(tensorlist_name=arg)]
                     enforce_same_ptrs_stmts += [ENFORCE_SAME_TENSORLIST_STORAGE.substitute(tensorlist_name=arg),
                                                 ENFORCE_SAME_TENSORLIST_IMPL.substitute(tensorlist_name=arg)]
-                elif simple_type == 'Tensor':
+                elif noref_cpp_type == 'c10::List<c10::optional<Tensor>>':
+                    save_ptrs_stmts += [SAVE_OPTIONALTENSORLIST_STORAGE.substitute(tensorlist_name=arg),
+                                        SAVE_OPTIONALTENSORLIST_IMPL.substitute(tensorlist_name=arg)]
+                    enforce_same_ptrs_stmts += [ENFORCE_SAME_OPTIONALTENSORLIST_STORAGE.substitute(tensorlist_name=arg),
+                                                ENFORCE_SAME_OPTIONALTENSORLIST_IMPL.substitute(tensorlist_name=arg)]
+                elif noref_cpp_type == 'Tensor':
                     save_ptrs_stmts += [SAVE_TENSOR_STORAGE.substitute(tensor_name=arg),
                                         SAVE_TENSOR_IMPL.substitute(tensor_name=arg)]
                     enforce_same_ptrs_stmts += [ENFORCE_SAME_TENSOR_STORAGE.substitute(tensor_name=arg),
@@ -748,74 +777,69 @@ def enforce_same_tensorimpl_and_storage(env, call):
                 RUN_ONLY_IN_DEBUG_MODE.substitute(statements=enforce_same_ptrs_stmts)
         return call
 
-    def emit_call(env, tie_return_values):
-        combined = nested_dict(env, declaration)
+    def emit_call(f: NativeFunction, unpacked_bindings: List[Binding]) -> str:
         # We only care about adding `at::AutoNonVariableTypeMode` guard for non-variable dispatch
         # (which corresponds to 'use_derived' strategy). The purpose of this guard is to make sure
         # the baseType operations still dispatch to non-Variable type, even if the arguments passed
         # in are now Variables.
         # See NOTE [ Treating Variables as non-Variables in type dispatch ] for details.
- base_type_call = emit_dispatch_call(combined['api_name'], 'self_', combined['unpacked_args']) - if not modifies_arguments and not returns_void: + unpacked_args = [b.name for b in unpacked_bindings] + base_type_call = emit_dispatch_call(f, 'self_', unpacked_args) + if not modifies_arguments(f) and not returns_void: call = DISPATCH_TO_NON_VAR_TYPE_WITH_TMP_RETURN_VALUES.substitute( base_type_call=base_type_call) - call += wrap_output(tie_return_values, 'tmp') + call += wrap_output(f, unpacked_bindings, 'tmp') else: call = DISPATCH_TO_NON_VAR_TYPE_WITHOUT_RETURN_VALUES.substitute( base_type_call=base_type_call) - call = enforce_same_tensorimpl_and_storage(env, call) + call = enforce_same_tensorimpl_and_storage(call, unpacked_bindings) return call - def emit_history(): - fn = 'rebase' if modifies_arguments and view_info is None else 'set' - output_names = [r['name'] for r in differentiable_outputs] + def emit_history() -> str: + fn = 'rebase' if modifies_arguments(f) and view_info is None else 'set' + output_names = [r.name for r in differentiable_outputs] # TODO: flatten allocates a std::vector, which could be expensive outs = CodeTemplate("flatten_tensor_args( ${outs} )").substitute(outs=output_names) return SET_HISTORY.substitute(fn=fn, differentiable_outputs=outs) - def emit_save_outputs(): + def emit_save_outputs() -> str: if is_out_fn: # out functions don't currently support differentiation return '' - func = declaration['derivative'] - if func is not None: - stmts = save_variables(func.all_saved_outputs, True) + if info is not None and info.has_derivatives: + stmts = save_variables(info.all_saved_outputs, True) if len(stmts) == 0: return '' return CONDITIONAL.substitute(cond='grad_fn', statements=stmts) return '' - def emit_any_requires_grad(): + def emit_any_requires_grad() -> List[str]: return [SETUP_ANY_REQUIRES_GRAD.substitute( - args_with_derivatives=[arg['name'] for arg in args_with_derivatives]), ] + args_with_derivatives=[arg.name for arg in args_with_derivatives]), ] - def emit_check_inplace(): + def emit_check_inplace() -> List[str]: if not inplace: return [] - return ['check_inplace({}, _any_requires_grad);'.format(arg['name']) for arg in differentiable_outputs] + return [f'check_inplace({arg.name}, _any_requires_grad);' for arg in differentiable_outputs] - def emit_increment_version(): - if not modifies_arguments: + def emit_increment_version(f: NativeFunction) -> List[str]: + if not modifies_arguments(f): return [] - return ['increment_version({});'.format(arg['name']) for arg in returns] - - env = {} - combined = nested_dict(env, declaration) - - body = [] + return [f'increment_version({r});' for r in cpp.return_names(f)] - declare_returned_variables, tie_return_values, get_return_value = format_return_variables(declaration) + body: List[str] = [] + unpack_args_stats, unpacked_bindings = unpack_args(f) - body.extend(unpack_args(env, declaration)) + body.extend(unpack_args_stats) if requires_derivative: body.extend(emit_any_requires_grad()) body.extend(emit_check_inplace()) body.extend(setup_derivative(differentiable_inputs)) - body.append(declare_returned_variables) + body.append(declare_returned_variables(f)) - body.append(emit_call(env, tie_return_values)) - body.extend(emit_increment_version()) + body.append(emit_call(f, unpacked_bindings)) + body.extend(emit_increment_version(f)) if requires_derivative: # set_flags has to appear after version_counter, because rebase_history # requires that the counter is incremented before it is called @@ -830,57 +854,50 @@ def 
emit_increment_version(): assert inplace body.append('reset_grad_accumulator(self);') if not returns_void: - body.append('return {};'.format(get_return_value)) + body.append(f'return {get_return_value(f)};') return body - -def unpack_args(env, declaration): - def requires_unpack(arg): - return 'Tensor' in arg['dynamic_type'] - - body = [] - unpacked_args = [] - unpacked_args_simple_type = {} - if declaration['use_c10_dispatcher'] in ['full', 'hacky_wrapper_for_legacy_signatures']: - arguments = declaration['schema_order_arguments'] - else: - assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper' - arguments = declaration['arguments'] - for i, arg in enumerate(arguments): - if not requires_unpack(arg): - unpacked_args.append(arg['name']) - unpacked_args_simple_type[arg['name']] = arg['simple_type'] +@with_native_function +def unpack_args(f: NativeFunction) -> Tuple[List[str], List[Binding]]: + body: List[str] = [] + unpacked_bindings: List[Binding] = [] + + bindings = [r for a in f.func.schema_order_arguments() + for r in cpp.argument(a, + method=False, + cpp_no_default_args=set(), + faithful=False, + has_tensor_options=False)] + + for i, binding in enumerate(bindings): + assert not isinstance(binding.argument, SelfArgument) + if isinstance(binding.argument, TensorOptionsArguments): + raise RuntimeError("VariableKernel shouldn't take TensorOptions") + + is_nullable = binding.argument.type.is_nullable() + if not binding.argument.type.is_tensor_like() or is_nullable: + unpacked_bindings.append(binding) continue - dynamic_type = arg['dynamic_type'] - if 'TensorOptions' not in dynamic_type: - is_nullable = arg.get('is_nullable', False) - ref = (not is_nullable) and dynamic_type not in ['TensorList'] - suffix = '_opt' if is_nullable and dynamic_type != 'TensorList' else '' - - body.append(UNPACK_TENSOR.substitute( - arg_name=arg['name'], - arg_pos=i, - suffix=suffix, - ref='&' if ref else '', - )) - else: - # Okay, we are abusing the definition of 'unpack' here a bit, - # although it's still getting the non-variable from the variable - # (in this case via TensorOptions rather than Variable/Tensor). - assert declaration['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper', \ - "VariableKernel shouldn't take TensorOptions if the op is c10-full" - body.append(LEGACY_WRAP_OPTIONS.substitute(arg_name=arg['name'])) - - unpacked_args.append(arg['name'] + '_') - unpacked_args_simple_type[arg['name'] + '_'] = arg['simple_type'] - - env['unpacked_args'] = unpacked_args - env['unpacked_args_simple_type'] = unpacked_args_simple_type - return body - - -def dispatch_strategy(declaration): + is_tensor_list = is_tensor_list_type(binding.argument.type) + ref = (not is_nullable) and not is_tensor_list + suffix = '_opt' if is_nullable and not is_tensor_list else '' + body.append(UNPACK_TENSOR.substitute( + arg_name=binding.name, + arg_pos=i, + suffix=suffix, + ref='&' if ref else '', + )) + unpacked_bindings.append(Binding( + name=binding.name + '_', + ctype=binding.ctype, + argument=binding.argument, + default=binding.default, + )) + + return body, unpacked_bindings + +def dispatch_strategy(fn: NativeFunctionWithDifferentiabilityInfo) -> str: """How are we going to call the underlying implementation of a declaration? There are two strategies: @@ -900,7 +917,7 @@ def dispatch_strategy(declaration): get dispatched back to VariableType (which will ensure that they are differentiable.) 
""" - if declaration['abstract'] or declaration['derivative'] is not None: + if fn.func.is_abstract or (fn.info is not None and fn.info.has_derivatives): # If the function is abstract (not implemented on at::Type), we must # call the implementation on the derived type with unpacked tensors. @@ -924,62 +941,47 @@ def dispatch_strategy(declaration): # assumption might not hold, but then you'll see gradcheck fail.) return 'use_type' -def get_decl_signature(declaration: Dict[Any, Any], use_base_variant: bool = False) -> str: - name = declaration['name'] - arguments = declaration['arguments'] - if use_base_variant: - if declaration['inplace']: - assert name.endswith('_') - name = name[:-1] - elif name.endswith('_out'): - name = name[:-4] - arguments = [arg for arg in arguments if not arg.get('output', False)] - simple_types = ', '.join(arg['simple_type'] for arg in arguments) - return f'{name}({simple_types})' +def is_tensor_type(t: Type) -> bool: + # TODO: Should handle optional here? + return t.is_tensor_like() and t.is_list_like() is None -@with_native_function -def get_func_signature(f: NativeFunction) -> str: - args = CppSignatureGroup.from_schema(f.func, method=False).signature.arguments() - types = ', '.join(python.argument_type_str(a.argument.type, simple_type=True) - if isinstance(a.argument, Argument) else 'TensorOptions' - for a in args) - return f'{cpp.name(f.func)}({types})' - -def match_declarations_with_differentiability_info( - declarations: Dict[Any, Any], +def is_tensor_list_type(t: Type) -> bool: + # TODO: Should handle optional here? + return t.is_tensor_like() and t.is_list_like() is not None + +def modifies_arguments(f: NativeFunction) -> bool: + return f.func.kind() in [SchemaKind.inplace, SchemaKind.out] + +def match_differentiability_info( + native_functions: List[NativeFunction], differentiability_infos: Sequence[DifferentiabilityInfo], -) -> None: +) -> List[NativeFunctionWithDifferentiabilityInfo]: """Sets the "derivative" key on declarations to matching autograd function In-place functions will use the out-of-place derivative definition if there is no in-place specific derivative. """ - info_by_signature = {get_func_signature(info.func): info for info in differentiability_infos} + info_by_schema = {info.func.func: info for info in differentiability_infos} + functional_info_by_signature = { + info.func.func.signature(strip_default=True): info + for info in differentiability_infos + if info.func.func.kind() == SchemaKind.functional} - def find_info(declaration: Dict[Any, Any]) -> Optional[DifferentiabilityInfo]: - signature = get_decl_signature(declaration) - if signature in info_by_signature: - return info_by_signature[signature] + def find_info(f: NativeFunction) -> Tuple[Optional[DifferentiabilityInfo], bool]: + if f.func in info_by_schema: + return info_by_schema[f.func], True # if there is no exact match look for the out-of-place signature. # i.e mul() for mul_() or mul_out() - signature = get_decl_signature(declaration, use_base_variant=True) - return info_by_signature.get(signature) - - for declaration in declarations: - info = find_info(declaration) - declaration['derivative'] = info if info and info.args_with_derivatives else None - - # Currently, the '.strides()' to 'strides_or_error' replacement does not support - # 'self' derivatives of an inplace function, so we must check for this case. 
- if declaration['inplace'] and (info is not None): - for derivative in info.derivatives: - if 'self' in derivative.var_names: - for saved_input in derivative.saved_inputs: - assert 'strides_or_error' not in saved_input.expr, ( - "Calling '.strides()' in the 'self' derivative formula of an " - f"in-place function is not supported: {declaration['name']}") - - declaration['non_differentiable_arg_names'] = info.non_differentiable_arg_names if info else [] - declaration['output_differentiability'] = info.output_differentiability if info else None + return functional_info_by_signature.get(f.func.signature(strip_default=True)), False + + result: List[NativeFunctionWithDifferentiabilityInfo] = [] + for f in native_functions: + info, is_exact_match = find_info(f) + result.append(NativeFunctionWithDifferentiabilityInfo( + func=f, + info=info, + )) + + return result diff --git a/tools/autograd/load_derivatives.py b/tools/autograd/load_derivatives.py index bc2de6bb14d7..d5c742bb6fa5 100644 --- a/tools/autograd/load_derivatives.py +++ b/tools/autograd/load_derivatives.py @@ -62,7 +62,7 @@ def load_derivatives(derivatives_yaml_path: str, native_yaml_path: str) -> Seque @with_native_function def cpp_arguments(f: NativeFunction) -> Sequence[Binding]: - return CppSignatureGroup.from_schema(f.func, method=False).signature.arguments() + return CppSignatureGroup.from_native_function(f, method=False).signature.arguments() def create_derivative(f: NativeFunction, formula: str, var_names: Tuple[str, ...]) -> Derivative: arguments = cpp_arguments(f) diff --git a/tools/autograd/templates/Functions.h b/tools/autograd/templates/Functions.h index 03240e2a5a2b..0540bb65b33b 100644 --- a/tools/autograd/templates/Functions.h +++ b/tools/autograd/templates/Functions.h @@ -32,6 +32,15 @@ inline std::vector unpack_list(at::ArrayRef xs) { }); } +inline c10::List> unpack_opt_list(at::ArrayRef xs) { + torch::List> result; + result.reserve(xs.size()); + for (const SavedVariable& v : xs) { + result.push_back(v.unpack()); + } + return result; +} + struct TypeAndSize { TypeAndSize() : options(at::TensorOptions()) {} /* implicit */ diff --git a/tools/autograd/templates/VariableType.h b/tools/autograd/templates/VariableType.h index 9062a4d08e34..fc8ffa5799c1 100644 --- a/tools/autograd/templates/VariableType.h +++ b/tools/autograd/templates/VariableType.h @@ -49,7 +49,6 @@ namespace VariableType { at::Tensor & unpack(Tensor & t, const char * name, int pos); const at::Tensor & unpack(const Tensor & t, const char * name, int pos); at::Tensor unpack_opt(const Tensor & t, const char * name, int pos); - c10::optional unpack_opt(const c10::optional & t, const char * name, int pos); std::vector unpack(at::TensorList tl, const char *name, int pos); }; diff --git a/tools/autograd/templates/python_fft_functions.cpp b/tools/autograd/templates/python_fft_functions.cpp index 49be92d30d35..a77547a6cc07 100644 --- a/tools/autograd/templates/python_fft_functions.cpp +++ b/tools/autograd/templates/python_fft_functions.cpp @@ -8,6 +8,7 @@ #include "torch/csrc/autograd/utils/wrap_outputs.h" #include "torch/csrc/autograd/utils/python_arg_parsing.h" #include "torch/csrc/autograd/generated/variable_factories.h" +#include "torch/csrc/utils/out_types.h" #include "torch/csrc/utils/pycfunction_helpers.h" #include "torch/csrc/utils/python_arg_parser.h" #include "torch/csrc/utils/structseq.h" @@ -30,6 +31,7 @@ using at::TensorList; using at::Dimname; using at::DimnameList; +using torch::utils::check_out_type_matches; using namespace torch::autograd::utils; 
namespace torch { namespace autograd { diff --git a/tools/autograd/templates/python_torch_functions.cpp b/tools/autograd/templates/python_torch_functions.cpp index e05e6fbe1975..c42a869b3a98 100644 --- a/tools/autograd/templates/python_torch_functions.cpp +++ b/tools/autograd/templates/python_torch_functions.cpp @@ -19,6 +19,7 @@ #include "torch/csrc/Dtype.h" #include "torch/csrc/DynamicTypes.h" #include "torch/csrc/Exceptions.h" +#include "torch/csrc/utils/out_types.h" #include "torch/csrc/utils/pybind.h" #include "torch/csrc/utils/pycfunction_helpers.h" #include "torch/csrc/utils/python_arg_parser.h" @@ -53,43 +54,13 @@ using at::Dimname; using at::DimnameList; using at::ArrayRef; +using torch::utils::check_out_type_matches; using namespace torch::autograd::utils; namespace torch { namespace autograd { static PyObject* THPVariableFunctionsModule = NULL; -static void check_out_type_matches(Tensor result, - ScalarType scalarType, bool scalarType_is_none, - c10::optional layout, - const Device& device, bool device_is_none) { - if (scalarType_is_none && !layout && device_is_none) { // common case - return; - } - if (!scalarType_is_none && result.scalar_type() != scalarType) { - AT_ERROR( - "dtype ", scalarType, - " does not match dtype of out parameter (", result.scalar_type(), ")"); - } - auto scalarType_arg = scalarType_is_none ? result.scalar_type() : scalarType; - auto device_type_arg = device_is_none ? result.device().type() : device.type(); - if (result.scalar_type() != scalarType_arg) { - AT_ERROR( - "scalar type ", scalarType_arg, - " does not match scalar type of out parameter (", result.scalar_type(), ")"); - } - if (layout && result.layout() != *layout) { - AT_ERROR( - "layout ", *layout, - " does not match layout of out parameter (", result.layout(), ")"); - } - if (result.device().type() != device_type_arg) { - AT_ERROR( - "device type ", device_type_arg, - " does not match device type of out parameter (", result.device().type(), ")"); - } -} - inline Tensor dispatch_arange(Scalar end, Tensor result) { pybind11::gil_scoped_release no_gil; return at::arange_out(result, end); diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index a214684ab29c..5ed0b1340811 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -7,9 +7,6 @@ GENERATED_CPP = [ "autograd/generated/VariableType_2.cpp", "autograd/generated/VariableType_3.cpp", "autograd/generated/VariableType_4.cpp", - "jit/generated/generated_unboxing_wrappers_0.cpp", - "jit/generated/generated_unboxing_wrappers_1.cpp", - "jit/generated/generated_unboxing_wrappers_2.cpp", "autograd/generated/TraceType_0.cpp", "autograd/generated/TraceType_1.cpp", "autograd/generated/TraceType_2.cpp", @@ -39,9 +36,6 @@ libtorch_nvfuser_generated_headers = ["{}.h".format(name[36:-3]) for name in lib def libtorch_generated_sources(gencode_pattern): return [gencode_pattern.format(name) for name in [ "autograd/generated/Functions.cpp", - "jit/generated/generated_unboxing_wrappers_0.cpp", - "jit/generated/generated_unboxing_wrappers_1.cpp", - "jit/generated/generated_unboxing_wrappers_2.cpp", "autograd/generated/VariableType_0.cpp", "autograd/generated/VariableType_1.cpp", "autograd/generated/VariableType_2.cpp", @@ -351,6 +345,7 @@ libtorch_extra_sources = libtorch_core_jit_sources + [ "torch/csrc/jit/serialization/export_module.cpp", "torch/csrc/jit/serialization/import_legacy.cpp", "torch/csrc/utils/byte_order.cpp", + "torch/csrc/utils/out_types.cpp", ] def libtorch_sources(gencode_pattern = ":generate-code[{}]"): @@ 
-408,6 +403,7 @@ libtorch_cuda_core_sources = [ "torch/csrc/jit/codegen/cuda/transform_rfactor.cpp", "torch/csrc/jit/codegen/cuda/type.cpp", "torch/csrc/jit/tensorexpr/cuda_codegen.cpp", + "torch/csrc/jit/runtime/register_cuda_ops.cpp", ] libtorch_cuda_sources = libtorch_cuda_core_sources + [ @@ -503,7 +499,6 @@ libtorch_python_core_sources = [ "torch/csrc/MemoryFormat.cpp", "torch/csrc/QScheme.cpp", "torch/csrc/Module.cpp", - "torch/csrc/PtrWrapper.cpp", "torch/csrc/python_dimname.cpp", "torch/csrc/Size.cpp", "torch/csrc/Storage.cpp", diff --git a/tools/code_analyzer/run_analyzer.sh b/tools/code_analyzer/run_analyzer.sh index 79b366fb1a0d..dc8705cc39f7 100755 --- a/tools/code_analyzer/run_analyzer.sh +++ b/tools/code_analyzer/run_analyzer.sh @@ -15,7 +15,7 @@ echo "Analyze: ${INPUT}" # to operate, so for safety we match a more expansive set. "${ANALYZER_BIN}" \ -op_schema_pattern="^(_aten|_prim|aten|quantized|_quantized|prepacked|profiler|_test)::[a-zA-Z0-9_.]+(\(.*)?$" \ - -op_register_pattern="c10::RegisterOperators::(op|checkSchemaAndRegisterOp_)|c10::Module::(_?def|_?impl|impl_UNBOXED)|torch::Library::(_?def|_?impl|_?impl_UNBOXED)" \ + -op_register_pattern="c10::RegisterOperators::(op|checkSchemaAndRegisterOp_)|c10::Module::(_?def|_?impl)|torch::Library::(_?def|_?impl)" \ -op_invoke_pattern="c10::Dispatcher::findSchema" \ -root_symbol_pattern="torch::jit::[^(]" \ -torch_library_init_pattern="^.*TORCH_LIBRARY_init_([^(]+)(\(.*)?$" \ diff --git a/tools/codegen/api/autograd.py b/tools/codegen/api/autograd.py index 58fb75bb7c07..6f58eea6d1ea 100644 --- a/tools/codegen/api/autograd.py +++ b/tools/codegen/api/autograd.py @@ -87,3 +87,36 @@ class DifferentiabilityInfo: # Raw data read from derivatives.yaml. output_differentiability: Optional[List[bool]] + + @property + def has_derivatives(self) -> bool: + return len(self.args_with_derivatives) > 0 + +# Represents a differentiable `Argument`. +# How is it different from the `Argument` type? +# - It's processed Arguments which are differentiable and only used in the +# context of the autograd codegen; +# - It can represent SelfArgument or regular Argument but not TensorOptionsArgument; +@dataclass(frozen=True) +class DifferentiableInput: + name: str + type: Type + + # TODO: only to keep it byte-for-byte compatible with the old codegen, should remove. + cpp_type: str + +# Represents a differentiable `Return`. +# How it it different from the `Return` type? +# - The name in `Return` is optional. Here it is always populated using the same +# `cpp.return_names()` method. +# TODO: some cpp naming logic (e.g. resolving name conflict) might be irrelevant? +# - It's processed Returns which are differentiable, in compliance with the +# `output_differentiability` field defined in derivatives.yaml (if specified), +# and are only used in the context of the autograd codegen; +@dataclass(frozen=True) +class DifferentiableOutput: + name: str + type: Type + + # TODO: only to keep it byte-for-byte compatible with the old codegen, should remove. 
+ cpp_type: str diff --git a/tools/codegen/api/cpp.py b/tools/codegen/api/cpp.py index ffd9626601a0..0debd52ca896 100644 --- a/tools/codegen/api/cpp.py +++ b/tools/codegen/api/cpp.py @@ -1,7 +1,6 @@ from tools.codegen.model import * from tools.codegen.api.types import * -import tools.codegen.local as local -from typing import Optional, Sequence, Union, List +from typing import Optional, Sequence, Union, List, Set # This file describes the translation of JIT schema to the public C++ # API, which is what people use when they call functions like at::add. @@ -88,10 +87,7 @@ def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName) -> CType: if mutable: return MutRefCType(BaseCType('Tensor', binds)) # TODO: fix this discrepancy else: - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - return ConstRefCType(OptionalCType(BaseCType('Tensor', binds))) - else: - return ConstRefCType(BaseCType('Tensor', binds)) + return ConstRefCType(OptionalCType(BaseCType('Tensor', binds))) elem = argumenttype_type(t.elem, mutable=mutable, binds=binds) return OptionalCType(elem) elif isinstance(t, ListType): @@ -104,9 +100,8 @@ def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName) -> CType: return BaseCType("TensorList", binds) elif str(t.elem) == 'Dimname': return BaseCType("DimnameList", binds) - # TODO: do something reasonable about lists of optional tensors - elif (not local.use_c10_dispatcher().dispatcher_uses_new_style()) and str(t.elem) == 'Tensor?': - return BaseCType("TensorList", binds) + elif str(t.elem) == 'Tensor?': + return ConstRefCType(BaseCType("c10::List>", binds)) elem = argumenttype_type(t.elem, mutable=mutable, binds=binds) # TODO: explicitly qualify namespace here return BaseCType(f"ArrayRef<{elem.cpp_type()}>", binds) @@ -237,26 +232,37 @@ def default_expr(d: str, t: Type) -> str: def argument( a: Union[Argument, TensorOptionsArguments, SelfArgument], - *, method: bool = False, faithful: bool = False, - has_tensor_options: bool = False + *, cpp_no_default_args: Set[str], method: bool, faithful: bool, + has_tensor_options: bool ) -> List[Binding]: + def sub_argument(a: Union[Argument, TensorOptionsArguments, SelfArgument]) -> List[Binding]: + return argument( + a, cpp_no_default_args=cpp_no_default_args, method=method, faithful=faithful, + has_tensor_options=has_tensor_options) + if isinstance(a, Argument): binds: ArgName if a.name == "memory_format" and has_tensor_options: binds = SpecialArgName.possibly_redundant_memory_format else: binds = a.name + default: Optional[str] = None + if a.name not in cpp_no_default_args and a.default is not None: + default = default_expr(a.default, a.type) return [Binding( ctype=argument_type(a, binds=binds), name=a.name, - default=default_expr(a.default, a.type) if a.default is not None else None, + default=default, argument=a, )] elif isinstance(a, TensorOptionsArguments): if faithful: - return argument(a.dtype) + argument(a.layout) + argument(a.device) + argument(a.pin_memory) + return sub_argument(a.dtype) + sub_argument(a.layout) + \ + sub_argument(a.device) + sub_argument(a.pin_memory) else: default = None + # Enforced by NativeFunction.__post_init__ + assert 'options' not in cpp_no_default_args if all(x.default == "None" for x in a.all()): default = '{}' elif a.dtype.default == "long": @@ -272,13 +278,13 @@ def argument( # Caller is responsible for installing implicit this in context! 
return [] else: - return argument(a.argument) + return sub_argument(a.argument) else: assert_never(a) def arguments( arguments: Arguments, - *, faithful: bool, method: bool + *, faithful: bool, method: bool, cpp_no_default_args: Set[str] ) -> List[Binding]: args: List[Union[Argument, TensorOptionsArguments, SelfArgument]] = [] if faithful: @@ -289,5 +295,8 @@ def arguments( args.extend(arguments.non_out) return [ r.no_default() if faithful else r for a in args - for r in argument(a, faithful=faithful, method=method, has_tensor_options=arguments.tensor_options is not None) + for r in argument( + a, faithful=faithful, method=method, + has_tensor_options=arguments.tensor_options is not None, + cpp_no_default_args=cpp_no_default_args) ] diff --git a/tools/codegen/api/dispatcher.py b/tools/codegen/api/dispatcher.py index 3adc2465b607..bb65bc386e64 100644 --- a/tools/codegen/api/dispatcher.py +++ b/tools/codegen/api/dispatcher.py @@ -2,8 +2,6 @@ from tools.codegen.api.types import * import tools.codegen.api.cpp as cpp -import tools.codegen.api.native as native -import tools.codegen.local as local import itertools from typing import Sequence, List, Union @@ -31,17 +29,11 @@ def name(func: FunctionSchema) -> str: return cpp.name(func) def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName) -> CType: - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - # This is a faux amis. If it makes sense in the future to add - # more special cases here, or invert things so cpp.argument_type - # calls this, or just completely inline the function, please do - # it. - return cpp.argumenttype_type(t, mutable=mutable, binds=binds) - else: - # This is real sharing. If you're modifying this path, ask - # yourself why you are changing the native functions protocol - # here and not in native. - return native.argumenttype_type(t, mutable=mutable, binds=binds) + # This is a faux amis. If it makes sense in the future to add + # more special cases here, or invert things so cpp.argument_type + # calls this, or just completely inline the function, please do + # it. 
+ return cpp.argumenttype_type(t, mutable=mutable, binds=binds) def argument_type(a: Argument, *, binds: ArgName) -> CType: return argumenttype_type(a.type, mutable=a.is_write, binds=binds) @@ -53,10 +45,6 @@ def returns_type(rs: Sequence[Return]) -> str: def argument( a: Union[Argument, TensorOptionsArguments, SelfArgument] ) -> List[Binding]: - # We could forward to native.argument but it is a bit suspect because - # the grouping may not be set correctly - assert local.use_c10_dispatcher().dispatcher_uses_new_style() - if isinstance(a, Argument): return [Binding( ctype=argument_type(a, binds=a.name), @@ -71,13 +59,10 @@ def argument( assert_never(a) def arguments(func: FunctionSchema) -> List[Binding]: - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - return [ - r for a in itertools.chain( - func.arguments.positional, - func.arguments.kwarg_only, - func.arguments.out - ) for r in argument(a) - ] - else: - return native.arguments(func) + return [ + r for a in itertools.chain( + func.arguments.positional, + func.arguments.kwarg_only, + func.arguments.out + ) for r in argument(a) + ] diff --git a/tools/codegen/api/native.py b/tools/codegen/api/native.py index 3b793527edd9..af82210b20f4 100644 --- a/tools/codegen/api/native.py +++ b/tools/codegen/api/native.py @@ -4,7 +4,7 @@ import tools.codegen.api.cpp as cpp from tools.codegen import local -from typing import Union, Sequence, List +from typing import Union, Sequence, List, Optional # This file describes the translation of JIT schema to the native functions API. # This looks a lot like the C++ API (which makes historical sense, because the @@ -34,7 +34,7 @@ def argumenttype_type(t: Type, *, mutable: bool, binds: ArgName) -> CType: else: return ConstRefCType(BaseCType('Tensor', binds)) elif str(t) == 'Tensor?[]': - return BaseCType('TensorList', binds) + return BaseCType('const c10::List> &', binds) return cpp.argumenttype_type(t, mutable=mutable, binds=binds) def returns_type(rs: Sequence[Return]) -> str: @@ -43,26 +43,35 @@ def returns_type(rs: Sequence[Return]) -> str: def argument_type(a: Argument, *, binds: ArgName) -> CType: return argumenttype_type(a.type, mutable=a.is_write, binds=binds) -def argument(a: Union[Argument, SelfArgument, TensorOptionsArguments]) -> List[Binding]: +def argument(a: Union[Argument, SelfArgument, TensorOptionsArguments], *, is_out: bool) -> List[Binding]: + # Ideally, we NEVER default native functions. However, there are a number + # of functions that call native:: directly and rely on the defaulting + # existing. 
So for BC, we generate defaults for non-out variants (but not + # for out variants, where it is impossible to generate an appropriate + # default) + should_default = not is_out or local.use_c10_dispatcher() is not UseC10Dispatcher.full if isinstance(a, Argument): + default: Optional[str] = None + if should_default and a.default is not None: + default = cpp.default_expr(a.default, a.type) return [Binding( ctype=argument_type(a, binds=a.name), name=a.name, - default=cpp.default_expr(a.default, a.type) if a.default is not None else None, + default=default, argument=a, )] elif isinstance(a, SelfArgument): # Erase SelfArgument from the distinction - return argument(a.argument) + return argument(a.argument, is_out=is_out) elif isinstance(a, TensorOptionsArguments): - if local.use_c10_dispatcher() in [UseC10Dispatcher.hacky_wrapper_for_legacy_signatures, - UseC10Dispatcher.with_codegenerated_unboxing_wrapper]: + if local.use_c10_dispatcher() == UseC10Dispatcher.hacky_wrapper_for_legacy_signatures: # TODO: expunge this logic entirely default = None - if all(x.default == "None" for x in a.all()): - default = '{}' - elif a.dtype.default == "long": - default = 'at::kLong' # TODO: this is wrong + if should_default: + if all(x.default == "None" for x in a.all()): + default = '{}' + elif a.dtype.default == "long": + default = 'at::kLong' # TODO: this is wrong return [Binding( ctype=ConstRefCType(BaseCType('TensorOptions', 'options')), name='options', @@ -71,29 +80,35 @@ def argument(a: Union[Argument, SelfArgument, TensorOptionsArguments]) -> List[B )] else: assert local.use_c10_dispatcher() == UseC10Dispatcher.full + default = None + if should_default: + default = '{}' + # TODO: Not sure why the arguments assigned here are for + # TensorOptionsArguments and not the constituent pieces. It seems + # to matter return [ Binding( ctype=OptionalCType(BaseCType('ScalarType', 'dtype')), name='dtype', - default='{}', + default=default, argument=a, ), Binding( ctype=OptionalCType(BaseCType('Layout', 'layout')), name='layout', - default='{}', + default=default, argument=a, ), Binding( ctype=OptionalCType(BaseCType('Device', 'device')), name='device', - default='{}', + default=default, argument=a, ), Binding( ctype=OptionalCType(BaseCType('bool', 'pin_memory')), name='pin_memory', - default='{}', + default=default, argument=a, )] else: @@ -107,4 +122,4 @@ def arguments(func: FunctionSchema) -> List[Binding]: else: args.extend(func.arguments.out) args.extend(func.arguments.non_out) - return [r for arg in args for r in argument(arg)] + return [r for arg in args for r in argument(arg, is_out=func.is_out_fn())] diff --git a/tools/codegen/api/python.py b/tools/codegen/api/python.py index 059032869675..749513cb5c0d 100644 --- a/tools/codegen/api/python.py +++ b/tools/codegen/api/python.py @@ -3,7 +3,6 @@ from tools.codegen.api.types import * import tools.codegen.api.cpp as cpp -import tools.codegen.local as local from tools.codegen.gen import pythonify_default from tools.codegen.model import * @@ -228,7 +227,7 @@ class PythonArgument: # Compute argument formal for python argument parsing. # Needs to be consistent with torch/csrc/utils/python_arg_parser.h. 
def argument_str(self, *, method: bool = False) -> str: - type_str = argument_type_str(self.type) + type_str = argument_type_str(self.type).replace('const ', '').replace(' &', '') name = self.name # s/self/input/ outside method bindings @@ -566,7 +565,7 @@ class DispatchLambdaArgumentExprs: # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # def _cpp_signature(f: NativeFunction, *, method: bool = False) -> CppSignature: - return CppSignatureGroup.from_schema(f.func, method=method).signature + return CppSignatureGroup.from_native_function(f, method=method).signature def has_tensor_options(f: NativeFunction) -> bool: return f.func.arguments.tensor_options is not None @@ -599,11 +598,8 @@ def argument_type_str(t: Type, *, simple_type: bool = False) -> str: elif isinstance(t, OptionalType): if str(t.elem) == 'Tensor': - if not simple_type or local.use_c10_dispatcher().dispatcher_uses_new_style(): - # Is it desired to keep '?' for simple_type with new style dispatcher? - return 'Tensor?' - else: - return 'Tensor' + # Is it desired to keep '?' for simple_type with new style dispatcher? + return 'Tensor?' elem = argument_type_str(t.elem, simple_type=simple_type) if elem == 'Layout': # TODO: fix this special case in PythonArgParser? @@ -624,10 +620,9 @@ def argument_type_str(t: Type, *, simple_type: bool = False) -> str: return f'ScalarList[{size}]' if size is not None else 'ScalarList' elif str(t.elem) == 'Tensor?': if simple_type: - return 'TensorList' + return 'c10::List>' else: - # TODO: clone the old codegen behavior but does it make sense? - return 'TensorList?' + return 'const c10::List> &' elif str(t.elem) == 'Dimname': return f'DimnameList[{size}]' if size is not None else 'DimnameList' elem = argument_type_str(t.elem, simple_type=simple_type) @@ -1023,10 +1018,7 @@ def arg_parser_unpack_method(t: Type, has_default: bool) -> str: elif isinstance(t, OptionalType): if str(t.elem) == 'Tensor': - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - return 'optionalTensor' - else: - return 'tensor' + return 'optionalTensor' elif isinstance(t.elem, BaseType): if t.elem.name in [BaseTy.ScalarType, BaseTy.Scalar, @@ -1051,12 +1043,14 @@ def arg_parser_unpack_method(t: Type, has_default: bool) -> str: return 'toDimnameListOptional' elif isinstance(t, ListType): - if str(t.elem) == 'Tensor' or str(t.elem) == 'Tensor?': + if str(t.elem) == 'Tensor': # accept and use definite size if t.size is not None: return f'tensorlist_n<{t.size}>' else: return 'tensorlist' + elif str(t.elem) == 'Tensor?': + return 'list_of_optional_tensors' elif str(t.elem) == 'Dimname': # accept definite size return 'dimnamelist' diff --git a/tools/codegen/api/types.py b/tools/codegen/api/types.py index 5532c35b4ed2..39fb8bef3846 100644 --- a/tools/codegen/api/types.py +++ b/tools/codegen/api/types.py @@ -1,6 +1,6 @@ from tools.codegen.model import * from dataclasses import dataclass -from typing import Optional, Union, Sequence, TypeVar, List +from typing import Optional, Union, Sequence, TypeVar, List, Set from enum import Enum _T = TypeVar('_T') @@ -31,14 +31,16 @@ class BaseCType: type: str name: ArgName - def cpp_type(self) -> str: + def cpp_type(self, *, strip_ref: bool = False) -> str: return self.type @dataclass(frozen=True) class ConstRefCType: elem: 'CType' - def cpp_type(self) -> str: + def cpp_type(self, *, strip_ref: bool = False) -> str: + if strip_ref: + return self.elem.cpp_type(strip_ref=strip_ref) return f'const {self.elem.cpp_type()} &' @property @@ -49,7 +51,9 @@ def name(self) -> 
ArgName: class MutRefCType: elem: 'CType' - def cpp_type(self) -> str: + def cpp_type(self, *, strip_ref: bool = False) -> str: + if strip_ref: + return self.elem.cpp_type(strip_ref=strip_ref) return f'{self.elem.cpp_type()} &' @property @@ -60,7 +64,8 @@ def name(self) -> ArgName: class OptionalCType: elem: 'CType' - def cpp_type(self) -> str: + def cpp_type(self, *, strip_ref: bool = False) -> str: + # Do not pass `strip_ref` recursively. return f'c10::optional<{self.elem.cpp_type()}>' @property @@ -128,13 +133,22 @@ class CppSignature: # (i.e. with a potential TensorOptions argument and out arguments in the front) faithful: bool + # The set of C++ arguments which should not have defaults applied to them + cpp_no_default_args: Set[str] + + # Is this a fallback C++ binding? Fallback bindings are enabled by + # manual_cpp_binding: True and are alternate, non-public API that + # lets manual C++ binding implementors access the binding that would + # have been automatically generated fallback_binding: bool = False # Return the unpacked argument structure of this signature, # discarding information about which arguments are semantically # related to each other. def arguments(self) -> Sequence[Binding]: - return cpp.arguments(self.func.arguments, faithful=self.faithful, method=self.method) + return cpp.arguments( + self.func.arguments, faithful=self.faithful, + method=self.method, cpp_no_default_args=self.cpp_no_default_args) def name(self) -> str: n = cpp.name(self.func, faithful_name_for_out_overloads=self.faithful) @@ -168,13 +182,26 @@ class CppSignatureGroup: faithful_signature: Optional[CppSignature] @staticmethod - def from_schema(func: FunctionSchema, *, method: bool, fallback_binding: bool = False) -> 'CppSignatureGroup': + def from_native_function(f: NativeFunction, *, method: bool, fallback_binding: bool = False) -> 'CppSignatureGroup': + func = f.func faithful_signature: Optional[CppSignature] if func.arguments.tensor_options is not None or len(func.arguments.out) > 0: - faithful_signature = CppSignature(func=func, faithful=True, method=method, fallback_binding=fallback_binding) + faithful_signature = CppSignature( + func=func, + faithful=True, + method=method, + fallback_binding=fallback_binding, + cpp_no_default_args=f.cpp_no_default_args + ) else: faithful_signature = None - signature = CppSignature(func=func, faithful=False, method=method, fallback_binding=fallback_binding) + signature = CppSignature( + func=func, + faithful=False, + method=method, + fallback_binding=fallback_binding, + cpp_no_default_args=f.cpp_no_default_args + ) return CppSignatureGroup( func=func, signature=signature, diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 37f4ea7cc174..08e9572131e3 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -203,8 +203,7 @@ class RegisterSchema: @method_with_native_function def __call__(self, f: NativeFunction) -> Optional[str]: - op_name = f"aten::{f.func.name}" - if not self.selector.is_operator_selected(op_name): + if not self.selector.is_native_function_selected(f): return None return f'm.def({cpp_string(str(f.func))});\n' @@ -388,6 +387,7 @@ def gen_structured(self, g: StructuredNativeFunctions) -> List[str]: @with_native_function def gen_one(f: NativeFunction) -> Optional[str]: assert self.target is not Target.DECLARATION + assert not f.manual_kernel_registration # TODO: put this into StructuredNativeFunctions itself functional_func = g.out.func.signature() @@ -398,8 +398,7 @@ def gen_one(f: NativeFunction) -> Optional[str]: e.expr for e in 
translate(functional_sig.arguments(), dispatcher.arguments(functional_func), method=False) ) - op_name = f"aten::{f.func.name}" - if self.target is Target.REGISTRATION and not self.selector.is_operator_selected(op_name): + if self.target is Target.REGISTRATION and not self.selector.is_native_function_selected(f): return None k = f.func.kind() @@ -431,11 +430,13 @@ def gen_one(f: NativeFunction) -> Optional[str]: if self.dispatch_key == 'Meta': impl_call = "" else: - impl_call = f"op.impl({out_expr}, {functional_exprs});" + impl_call = f"op.impl({functional_exprs}, {out_expr});" # For an overview of what this template code looks like, see # https://github.com/pytorch/rfcs/pull/9 return f"""\ +namespace {{ + {self.gen_structured_class( f, k, class_name=class_name, @@ -449,24 +450,15 @@ def gen_one(f: NativeFunction) -> Optional[str]: {impl_call} return {ret_expr}; }} + +}} // anonymous namespace """ elif self.target is Target.REGISTRATION: dispatcher_sig = DispatcherSignature.from_schema(f.func) - if local.use_c10_dispatcher() is UseC10Dispatcher.full: - payload = f"TORCH_FN({sig.name()})" - elif local.use_c10_dispatcher() is UseC10Dispatcher.hacky_wrapper_for_legacy_signatures: - payload = f""" -c10::impl::hacky_wrapper_for_legacy_signatures< - {dispatcher_sig.type()}, - {len(f.func.arguments.out)} ->(TORCH_FN({sig.name()})) -""" - else: - assert local.use_c10_dispatcher() is UseC10Dispatcher.with_codegenerated_unboxing_wrapper - payload = f"torch::CppFunction::makeUnboxedOnly(&{sig.name()})" - return f'm.impl("{f.func.name}", {payload});' + assert local.use_c10_dispatcher() is UseC10Dispatcher.full + return f'm.impl("{f.func.name}", TORCH_FN({sig.name()}));' else: assert_never(self.target) # Silence mypy's "Missing return statement" error @@ -479,17 +471,12 @@ def gen_unstructured(self, f: NativeFunction) -> Optional[str]: # for mypy type refinement; would be fixed by TODO on target assert self.target is not Target.DECLARATION - if f.func.is_out_fn(): - assert local.use_c10_dispatcher().dispatcher_uses_new_style(), \ - ("{} takes out arguments and has to be written in the new style. 
" + - "Please add `use_c10_dispatcher: full` to your operator in native_functions.yaml " + - "and write the C++ implementation to take out arguments in the end.").format(f.func.name) - if self.dispatch_key not in f.dispatch: return None + if f.manual_kernel_registration: + return None - op_name = f"aten::{f.func.name}" - if self.target is Target.REGISTRATION and not self.selector.is_operator_selected(op_name): + if self.target is Target.REGISTRATION and not self.selector.is_native_function_selected(f): return None name = native.name(f.func) @@ -526,8 +513,7 @@ def gen_unstructured(self, f: NativeFunction) -> Optional[str]: const DeviceGuard device_guard(device_or_default(device)); """ else: - assert local.use_c10_dispatcher() in [UseC10Dispatcher.with_codegenerated_unboxing_wrapper, - UseC10Dispatcher.hacky_wrapper_for_legacy_signatures] + assert local.use_c10_dispatcher() is UseC10Dispatcher.hacky_wrapper_for_legacy_signatures cuda_guard_from_tensor_options = """\ const DeviceGuard device_guard(options.device()); """ @@ -551,9 +537,13 @@ def gen_unstructured(self, f: NativeFunction) -> Optional[str]: """ return f"""\ +namespace {{ + {returns_type} {name}({args_str}) {{ {cuda_guard}{return_kw}{impl_name}({args_exprs_str}); }} + +}} // anonymous namespace """ elif self.target is Target.REGISTRATION: @@ -565,16 +555,14 @@ def gen_unstructured(self, f: NativeFunction) -> Optional[str]: # Figure out which signature the function is if local.use_c10_dispatcher() is UseC10Dispatcher.full: payload = f"TORCH_FN({name})" - elif local.use_c10_dispatcher() is UseC10Dispatcher.hacky_wrapper_for_legacy_signatures: + else: + assert local.use_c10_dispatcher() is UseC10Dispatcher.hacky_wrapper_for_legacy_signatures payload = f""" c10::impl::hacky_wrapper_for_legacy_signatures< {dispatcher_sig.type()}, {len(f.func.arguments.out)} >(TORCH_FN({name})) """ - else: - assert local.use_c10_dispatcher() is UseC10Dispatcher.with_codegenerated_unboxing_wrapper - payload = f"torch::CppFunction::makeUnboxedOnly(&{name})" return f'm.impl("{f.func.name}",\n{payload});\n' else: @@ -589,14 +577,12 @@ class ComputeFunction: @method_with_native_function def __call__(self, f: NativeFunction) -> Optional[str]: - if f.manual_kernel_registration: - return None if Variant.function not in f.variants: return None name = cpp.name(f.func) - sig_group = CppSignatureGroup.from_schema(f.func, method=False, fallback_binding=f.manual_cpp_binding) + sig_group = CppSignatureGroup.from_native_function(f, method=False, fallback_binding=f.manual_cpp_binding) if self.target is Target.DECLARATION: result = f"TORCH_API {sig_group.signature.decl()};\n" @@ -650,7 +636,7 @@ def __call__(self, f: NativeFunction) -> Optional[str]: name = cpp.name(f.func) - sig_group = CppSignatureGroup.from_schema(f.func, method=True, fallback_binding=f.manual_cpp_binding) + sig_group = CppSignatureGroup.from_native_function(f, method=True, fallback_binding=f.manual_cpp_binding) if self.target is Target.DECLARATION: result = f"{sig_group.signature.decl()} const;\n" @@ -729,17 +715,7 @@ def compute_native_function_declaration(g: Union[StructuredNativeFunctions, Nati if is_structured_dispatch_key(k): continue seen.add(n) - if f.func.is_out_fn() and local.use_c10_dispatcher() is UseC10Dispatcher.full: - # out overloads don't get default arguments because - # defaulted arguments would be before the out argument - # in the argument list and that doesn't work. 
- # TODO We should consider if we just want to remove - # default arguments from all at::native functions - # but that would be a larger change because we need - # to change a lot of call sites - args_str = ', '.join(a.defn() for a in args) - else: - args_str = ', '.join(a.decl() for a in args) + args_str = ', '.join(a.decl() for a in args) rs.append(f"TORCH_API {returns_type} {n}({args_str});") return rs @@ -769,7 +745,7 @@ def compute_meta_function_declaration(g: StructuredNativeFunctions) -> str: sig = g.signature() name = meta.name(g) args = native.arguments(sig) - args_str = ', '.join(a.defn() for a in args) + args_str = ', '.join(a.decl() for a in args) parent_class = g.out.structured_inherits if parent_class is None: parent_class = "at::impl::MetaBase" @@ -805,14 +781,9 @@ def __call__(self, f: NativeFunction) -> Optional[str]: dispatcher_sig = DispatcherSignature.from_schema(f.func) sig: Union[NativeSignature, DispatcherSignature] - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - sig = dispatcher_sig - dispatcher_exprs = dispatcher_sig.exprs() - dispatch_key = "c10::computeDispatchKey(dtype, layout, device)" - else: - sig = native_sig - dispatcher_exprs = native_sig.dispatcher_exprs() - dispatch_key = "options.computeDispatchKey()" + sig = dispatcher_sig + dispatcher_exprs = dispatcher_sig.exprs() + dispatch_key = "c10::computeDispatchKey(dtype, layout, device)" if self.target is Target.DEFINITION: # I don't think there's actually a good reason to generate @@ -838,11 +809,7 @@ def __call__(self, f: NativeFunction) -> Optional[str]: }} """ elif self.target is Target.REGISTRATION: - if local.use_c10_dispatcher().dispatcher_uses_new_style(): - return f"""m.impl("aten::{f.func.name}", TORCH_FN({name}));""" - else: - assert local.use_c10_dispatcher() is UseC10Dispatcher.with_codegenerated_unboxing_wrapper - return f"""m.impl_UNBOXED("aten::{f.func.name}", {name});""" + return f"""m.impl("aten::{f.func.name}", TORCH_FN({name}));""" elif self.target is Target.DECLARATION: raise AssertionError() else: @@ -1032,7 +999,7 @@ def compute_declaration_yaml(f: NativeFunction) -> object: kwarg_only_set = set(a.name for a in f.func.arguments.flat_kwarg_only) out_arg_set = set(a.name for a in f.func.arguments.out) - sig_group = CppSignatureGroup.from_schema(f.func, method=False, fallback_binding=False) + sig_group = CppSignatureGroup.from_native_function(f, method=False, fallback_binding=False) cpp_args = sig_group.signature.arguments() arguments = [ compute_cpp_argument_yaml( @@ -1052,7 +1019,9 @@ def compute_declaration_yaml(f: NativeFunction) -> object: cpp_schema_order_types = [ # NB: method here doesn't matter - r.type for a in schema_order_jit_arguments for r in cpp.argument(a, method=False) + r.type for a in schema_order_jit_arguments + for r in cpp.argument( + a, method=False, cpp_no_default_args=set(), faithful=False, has_tensor_options=False) ] cpp_returns = cpp.returns_type(f.func.returns) @@ -1065,7 +1034,6 @@ def compute_declaration_yaml(f: NativeFunction) -> object: ('name', cpp.name(f.func)), ('operator_name', str(f.func.name.name)), ('overload_name', str(f.func.name.overload_name)), - ('use_c10_dispatcher', f.use_c10_dispatcher.name), ('manual_kernel_registration', f.manual_kernel_registration), ('category_override', f.category_override if f.category_override is not None else ''), ('matches_jit_signature', True), @@ -1091,7 +1059,7 @@ def compute_registration_declarations(f: NativeFunction) -> str: name = dispatcher.name(f.func) returns_type = 
dispatcher.returns_type(f.func.returns) args = dispatcher.arguments(f.func) - args_str = ', '.join(a.defn() for a in args) + args_str = ', '.join(a.no_default().decl() for a in args) comment_data : Dict[str, str] = { 'schema': f'aten::{f.func}', # TODO: What exactly is the semantics of the 'dispatch' field? diff --git a/tools/codegen/model.py b/tools/codegen/model.py index a007e1a76f7c..1128878fe45c 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -49,12 +49,8 @@ def __str__(self) -> str: class UseC10Dispatcher(Enum): full = 0 - with_codegenerated_unboxing_wrapper = 1 hacky_wrapper_for_legacy_signatures = 2 - def dispatcher_uses_new_style(self) -> bool: - return self in [UseC10Dispatcher.full, UseC10Dispatcher.hacky_wrapper_for_legacy_signatures] - # The basic input to the code generation is native_functions.yaml. # The name "native", BTW, comes from the distinction between native # functions and legacy TH functions. The legacy TH functions are gone, @@ -77,7 +73,7 @@ class NativeFunction: func: 'FunctionSchema' # Corresponds to the 'use_c10_dispatcher' field. The default - # is 'with_codegenerated_unboxing_wrapper' + # is 'full' use_c10_dispatcher: UseC10Dispatcher # Whether or not to omit automatic generation of a DeviceGuard @@ -137,6 +133,10 @@ class NativeFunction: # changes the semantics of set_output to call the parent class. structured_inherits: Optional[str] + # Argument names whose default should be excluded from the C++ interface. + # Intended for resolving overload ambiguities between signatures. + cpp_no_default_args: Set[str] + # Note [Abstract ATen methods] # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # An abstract ATen method is one whose dispatch differs between @@ -169,16 +169,18 @@ def from_yaml(ei: Dict[str, object], loc: 'Location') -> 'NativeFunction': assert isinstance(funcs, str), f'not a str: {funcs}' func = FunctionSchema.parse(funcs) - use_c10_dispatcher_s = e.pop('use_c10_dispatcher', None) - if use_c10_dispatcher_s is None: - use_c10_dispatcher = UseC10Dispatcher.with_codegenerated_unboxing_wrapper - elif use_c10_dispatcher_s == 'full': + cpp_no_default_args_list = e.pop('cpp_no_default_args', []) + assert isinstance(cpp_no_default_args_list, list) + cpp_no_default_args = set(cpp_no_default_args_list) + + use_c10_dispatcher_s = e.pop('use_c10_dispatcher', 'full') + if use_c10_dispatcher_s == 'full': use_c10_dispatcher = UseC10Dispatcher.full elif use_c10_dispatcher_s == 'hacky_wrapper_for_legacy_signatures': use_c10_dispatcher = UseC10Dispatcher.hacky_wrapper_for_legacy_signatures else: raise AssertionError( - f'use_c10_dispatcher must be unset or set to full, got {use_c10_dispatcher}') + f'use_c10_dispatcher must be full or hacky_wrapper_for_legacy_signatures, got {use_c10_dispatcher}') variants_s = e.pop('variants', 'function') assert isinstance(variants_s, str) @@ -222,6 +224,9 @@ def from_yaml(ei: Dict[str, object], loc: 'Location') -> 'NativeFunction': assert raw_dispatch is None or isinstance(raw_dispatch, dict), e dispatch: Dict[str, str] = {} if raw_dispatch is not None: + assert not manual_kernel_registration, \ + "cannot specify both manual_kernel_registration and dispatch; with " \ + "manual registration, dispatch has no effect!" 
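The cpp_no_default_args list parsed above is threaded through to the cpp_no_default_args parameter this diff adds to tools/codegen/api/cpp.argument. A minimal sketch of its effect, using a made-up operator schema:

    from tools.codegen.model import FunctionSchema
    import tools.codegen.api.cpp as cpp

    func = FunctionSchema.parse('my_op(Tensor self, int dim=-1) -> Tensor')
    keep = cpp.arguments(func.arguments, faithful=False, method=False,
                         cpp_no_default_args=set())
    drop = cpp.arguments(func.arguments, faithful=False, method=False,
                         cpp_no_default_args={'dim'})
    assert keep[-1].default is not None  # dim keeps its '=-1' in the generated C++ declaration
    assert drop[-1].default is None      # only the C++ default is suppressed; the schema default remains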
for ks, v in raw_dispatch.items(): if ks == '__line__': continue # not worth tracking line numbers for dispatch entries @@ -255,6 +260,7 @@ def from_yaml(ei: Dict[str, object], loc: 'Location') -> 'NativeFunction': dispatch=dispatch, device_guard=device_guard, loc=loc, + cpp_no_default_args=cpp_no_default_args, ) def validate_unstructured(self) -> None: @@ -290,6 +296,13 @@ def __post_init__(self) -> None: # happen assert not (self.structured and self.structured_delegate), \ "Cannot have both structured and structured_delegate on function" + defaulted_arguments = {a.name for a in self.func.schema_order_arguments() + if a.default is not None} + invalid_args = set.difference(self.cpp_no_default_args, defaulted_arguments) + assert len(invalid_args) == 0, f'Invalid cpp_no_default_args: {invalid_args}' + if self.structured or self.structured_delegate: + assert self.use_c10_dispatcher is UseC10Dispatcher.full, \ + "Structured kernels MUST be use_c10_dispatcher: full; port your argument order" SchemaKind = Enum('SchemaKind', ('functional', 'inplace', 'out')) @@ -548,7 +561,7 @@ def kind(self) -> SchemaKind: else: return SchemaKind.functional - def signature(self) -> 'FunctionSchema': + def signature(self, *, strip_default: bool = False) -> 'FunctionSchema': """ Certain schemas are 'related', in that they are simply inplace/out/functional versions of the same function. This method @@ -563,11 +576,13 @@ def signature(self) -> 'FunctionSchema': - Out arguments are stripped - Mutability annotations are stripped (this is sound because you cannot overload on mutability annotation) + - Return names are stripped since they are not overloadable and + some variants have return names but some not """ def strip_ret_annotation(r: Return) -> Return: return Return( - name=r.name, + name=None, type=r.type, annotation=None, ) @@ -581,7 +596,7 @@ def strip_ret_annotation(r: Return) -> Return: ), overload_name="", # stripped ), - arguments=self.arguments.signature(), + arguments=self.arguments.signature(strip_default=strip_default), returns=tuple(map(strip_ret_annotation, self.returns)), ) @@ -964,14 +979,14 @@ def kwarg_only(self) -> Sequence[Union[Argument, TensorOptionsArguments]]: ret.extend(self.post_tensor_options_kwarg_only) return ret - def signature(self) -> 'Arguments': + def signature(self, *, strip_default: bool = False) -> 'Arguments': # dataclasses.replace could be used here, but it is less # type safe so for now I've opted to type everything out def strip_arg_annotation(a: Argument) -> Argument: return Argument( name=a.name, type=a.type, - default=a.default, # hmmm + default=a.default if not strip_default else None, annotation=None, ) diff --git a/tools/codegen/selective_build/selector.py b/tools/codegen/selective_build/selector.py index 24e387128b6c..eeb15049075e 100644 --- a/tools/codegen/selective_build/selector.py +++ b/tools/codegen/selective_build/selector.py @@ -1,8 +1,9 @@ -from typing import Dict, Set, Optional, Tuple +from typing import Dict, Set, Optional, Tuple, List import yaml from dataclasses import dataclass +from tools.codegen.model import NativeFunction from tools.codegen.selective_build.operator import * # A SelectiveBuilder holds information extracted from the selective build @@ -25,6 +26,20 @@ class SelectiveBuilder: # A dictionary of operator -> operator metadata. operators: Dict[str, SelectiveBuildOperator] + # A dictionary of selected kernel tags and dtypes. 
Typically a + # PyTorch Operator Kernel (function) may have many code paths + # that are specialized for many many Tensor dtypes, so it's not + # one per kernel function, but there could be many per kernel + # function. The tag isn't a kernel function name, but some fragment + # of the kernel function implementation itself. + kernel_metadata: Dict[str, List[str]] + + # If true, then fragments for all dtypes for all kernel functions + # are included. This is typically set when any one of the + # operator lists is generated from a mechanism other than + # tracing based selective build. + include_all_kernel_dtypes: bool + @staticmethod def get_nop_selector() -> 'SelectiveBuilder': return SelectiveBuilder.from_yaml_dict({'include_all_operators': True}) @@ -32,9 +47,11 @@ def get_nop_selector() -> 'SelectiveBuilder': @staticmethod def from_yaml_dict(data: Dict[str, object]) -> 'SelectiveBuilder': valid_top_level_keys = { + 'include_all_kernel_dtypes', 'include_all_operators', 'debug_info', 'operators', + 'kernel_metadata', } top_level_keys = set(data.keys()) if len(top_level_keys - valid_top_level_keys) > 0: @@ -57,7 +74,24 @@ def from_yaml_dict(data: Dict[str, object]) -> 'SelectiveBuilder': for (k, v) in operators_dict.items(): operators[k] = SelectiveBuildOperator.from_yaml_dict(k, v) - return SelectiveBuilder(include_all_operators, debug_info, operators) + + kernel_metadata = {} + kernel_metadata_dict = data.get('kernel_metadata', {}) + assert isinstance(kernel_metadata_dict, dict) + + for (k, v) in kernel_metadata_dict.items(): + kernel_metadata[str(k)] = list(map(lambda dtype: str(dtype), v)) + + include_all_kernel_dtypes = data.get('include_all_kernel_dtypes', False) + assert isinstance(include_all_kernel_dtypes, bool) + + return SelectiveBuilder( + include_all_operators, + debug_info, + operators, + kernel_metadata, + include_all_kernel_dtypes, + ) @staticmethod def from_yaml_str(config_contents: str) -> 'SelectiveBuilder': @@ -85,6 +119,7 @@ def from_legacy_op_registration_allow_list( } return SelectiveBuilder.from_yaml_dict({ 'operators': operators, + 'include_all_kernel_dtypes': True, }) def is_operator_selected(self, name: str) -> bool: @@ -96,6 +131,10 @@ def is_operator_selected(self, name: str) -> bool: name = strip_operator_overload_name(name) return name in self.operators and self.operators[name].include_all_overloads + def is_native_function_selected(self, func: NativeFunction) -> bool: + op_name = op_name_from_native_function(func) + return self.is_operator_selected(op_name) + def is_operator_selected_for_training(self, name: str) -> bool: if not self.is_operator_selected(name): return False @@ -123,6 +162,10 @@ def is_operator_selected_for_training(self, name: str) -> bool: (base_op.include_all_overloads and base_op.is_used_for_training) ) + def is_native_function_selected_for_training(self, func: NativeFunction) -> bool: + op_name = op_name_from_native_function(func) + return self.is_operator_selected_for_training(op_name) + def is_root_operator(self, name: str) -> bool: if not self.is_operator_selected(name): return False @@ -138,8 +181,15 @@ def is_root_operator(self, name: str) -> bool: base_op: SelectiveBuildOperator = self.operators[name] return base_op.include_all_overloads and base_op.is_root_operator + def is_kernel_dtype_selected(self, kernel_tag: str, dtype: str) -> bool: + if self.include_all_operators or self.include_all_kernel_dtypes: + return True + + return kernel_tag in self.kernel_metadata and dtype in self.kernel_metadata[kernel_tag] + def to_dict(self) -> 
Dict[str, object]: ret: Dict[str, object] = { + 'include_all_kernel_dtypes': self.include_all_kernel_dtypes, 'include_all_operators': self.include_all_operators, } operators = {} @@ -150,11 +200,41 @@ def to_dict(self) -> Dict[str, object]: if self._debug_info is not None: ret['debug_info'] = self._debug_info + ret['kernel_metadata'] = {k: list(v) for (k, v) in self.kernel_metadata.items()} + return ret +def merge_kernel_metadata( + lhs: Dict[str, List[str]], + rhs: Dict[str, List[str]], +) -> Dict[str, List[str]]: + kernel_metadata: Dict[str, List[str]] = {} + for (tag_name, dtypes) in list(lhs.items()) + list(rhs.items()): + dtypes_copy = set(dtypes) + if tag_name in kernel_metadata: + dtypes_copy |= set(kernel_metadata[tag_name]) + + kernel_metadata[tag_name] = list(dtypes_copy) + + return kernel_metadata + def combine_selective_builders(lhs: SelectiveBuilder, rhs: SelectiveBuilder) -> SelectiveBuilder: include_all_operators = lhs.include_all_operators or rhs.include_all_operators debug_info = merge_debug_info(lhs._debug_info, rhs._debug_info) operators = merge_operator_dicts(lhs.operators, rhs.operators) - return SelectiveBuilder(include_all_operators, debug_info, operators) + kernel_metadata = merge_kernel_metadata(lhs.kernel_metadata, rhs.kernel_metadata) + include_all_kernel_dtypes = lhs.include_all_kernel_dtypes or rhs.include_all_kernel_dtypes + return SelectiveBuilder( + include_all_operators, + debug_info, + operators, + kernel_metadata, + include_all_kernel_dtypes, + ) + + +def op_name_from_native_function(f: NativeFunction) -> str: + # This was originally read from the 'operator_name_with_overload' field in the + # declaration dict, which was the part before the first '(' in 'schema_string'. + return f'aten::{f.func.name}' diff --git a/tools/jit/gen_unboxing_wrappers.py b/tools/jit/gen_unboxing_wrappers.py deleted file mode 100644 index 267b5a3b221a..000000000000 --- a/tools/jit/gen_unboxing_wrappers.py +++ /dev/null @@ -1,545 +0,0 @@ -""" -To run this file by hand from the root of the PyTorch -repository, run: - -python -m tools.jit.gen_unboxing_wrappers \ - build/aten/src/ATen/Declarations.yaml \ - $OUTPUT_DIR \ - tools/jit/templates - -Where $OUTPUT_DIR is where you would like the files to be -generated. In the full build system, OUTPUT_DIR is -torch/csrc/jit/generated/ -""" - -# This file generates generated_unboxing_wrappers, which contains -# manual unboxing wrappers for ops that aren't use_c10_dispatcher: full -# because the templated unboxing logic in c10 doesn't support them yet. -# The ultimate goal is to make all ops use the templated unboxing and -# delete this codegen file. - -import argparse -import re -from itertools import groupby -from functools import reduce -from ..autograd.gen_autograd import load_aten_declarations -from ..autograd.gen_autograd import RETURNS_VIEWS_OF_INPUT -from ..autograd.utils import CodeTemplate, write, is_out_variant, op_name_with_overload -from tools.codegen.selective_build.selector import SelectiveBuilder - -# JIT has a type system of -# Scalar = int | float | bool # int is the largest int (int64_t), -# float is the largest float (double) we don't have the others because they are never held in tensors -# Type = Scalar # primitive numbers -# | Tensor # any tensor, as defined by at::Tensor -# | Type[] # a dynamically sized list[ of a type -# | Scalar[N] # a homogenous fixed size scalar list, single scalars can expand to this list -# | (Type1, Type2, ...) 
# a heterogeneous tuple -# | Layout | ScalarType | Device | Generator # special singleton types for built-in concepts in tensor lib - -# clean up the variety of C++ types in the ATen declarations -# to be in the restricted set of types that the IR represents -# note: no default values for this map, to make it clear what types -# can be passedthrough - -TYPE_MAP = { - 'std::array': 'bool[2]', - 'std::array': 'bool[3]', - 'std::array': 'bool[4]', - 'std::string': 'str', - 'std::string?': 'str?', - 'Scalar': 'Scalar', - 'ScalarList': 'Scalar[]', - 'MemoryFormat': 'MemoryFormat', - 'MemoryFormat?': 'MemoryFormat?', - 'QScheme': 'QScheme', - 'Scalar?': 'Scalar?', - 'Tensor': 'Tensor', - 'Tensor?': 'Tensor?', - 'TensorList': 'Tensor[]', - # this appears in return values instead of TensorList - # since TensorList is a ArrayRef in arguments but a vector - # in returns - 'std::vector': 'Tensor[]', - 'IntArrayRef': 'int[]', - 'IntArrayRef?': 'int[]?', - 'ArrayRef?': 'float[]?', - 'Layout': 'Layout', - 'Layout?': 'Layout?', - 'Device': 'Device', - 'Device?': 'Device?', - 'ScalarType': 'ScalarType', - 'ScalarType?': 'ScalarType?', - 'int64_t': 'int', - 'int64_t?': 'int?', - 'double': 'float', - 'double?': 'float?', - 'bool': 'bool', - 'bool?': 'bool?', - 'Generator': 'Generator?', - 'Generator?': 'Generator?', -} - - -def optional_type_of(arg, typ): - # optional type special handling for Tensor?[] and Tensor - # types that is missing a optional annotation - if arg.get('is_nullable') and '?' not in typ: - if typ == 'TensorList' or typ == 'Tensor[]': - typ = 'Tensor?[]' - else: - typ = '{}?'.format(typ) - return typ - - -def annotated_type_of(arg, typ): - anno = arg.get('annotation') - if anno: - typ = '{}({})'.format(typ, anno) - return typ - - -def jit_type_of(arg): - jit_type = arg.get('jit_type') - if not jit_type: - jit_type = TYPE_MAP[arg['simple_type']] - if is_sized_intlist_arg(arg): - jit_type = 'int[{}]'.format(arg['size']) - jit_type = optional_type_of(arg, jit_type) - jit_type = annotated_type_of(arg, jit_type) - arg['jit_type'] = jit_type - return jit_type - - -# map from aten 'simple_type' to the function that will turn a tensor into -# that type -FROM_IVALUE = { - 'Device': '{}.toDevice()', - 'Device?': '{}.toOptional()', - 'IntArrayRef': '{}.toIntVector()', - 'IntArrayRef?': '{}.toOptionalIntArray()', - 'ArrayRef?': '{}.toOptionalDoubleArray()', - 'Layout': '{}.toLayout()', - 'Layout?': '{}.toOptional()', - 'MemoryFormat': '{}.toMemoryFormat()', - 'MemoryFormat?': '{}.toOptional()', - 'QScheme': '{}.toQScheme()', - 'Scalar': '{}.toScalar()', - 'Scalar?': '{}.toOptional()', - 'ScalarType': '{}.toScalarType()', - 'ScalarType?': '{}.toOptional()', - 'Tensor': '{}.toTensor()', - 'Tensor?': 'toOptionalTensor({})', - 'Tensor?[]': 'toListOfOptionalTensor({})', - 'TensorList': '{}.toTensorVector()', - 'ScalarList': '{}.toScalarVector()', - 'bool': '{}.toBool()', - 'bool?': '{}.toOptional()', - 'double': '{}.toDouble()', - 'double?': '{}.toOptional()', - 'int64_t': '{}.toInt()', - 'int64_t?': '{}.toOptional()', - 'std::string': '{}.toStringRef()', - 'std::string?': '{}.toOptional()', - 'Generator?': '{}.toOptional()', - 'std::array': 'as_bool_array<2>({}.toBoolList())', - 'std::array': 'as_bool_array<3>({}.toBoolList())', - 'std::array': 'as_bool_array<4>({}.toBoolList())', -} - - -def from_ivalue(arg, value): - typ = optional_type_of(arg, arg['simple_type']) - return FROM_IVALUE[typ].format(value) - - -CALL_UNBOXED_KERNEL = CodeTemplate("""\ -auto result_ = 
callUnboxedKernel<${return_type}${formals_types_with_leading_comma}>(unboxedKernel${args_with_leading_comma}); -""") -CALL_NAMESPACE = CodeTemplate("""\ -auto result_ = at::${name}( - ${args} -); -""") -CALL_METHOD = CodeTemplate("""\ -auto result_ = (${first}).${name}( - ${args} -); -""") -CALL_NAMESPACE_WITH_TENSOR_OPTIONS = CodeTemplate("""\ -const auto options = TensorOptions() - .dtype(${dtype}) - .layout(${layout}) - .device(${device}) - .pinned_memory(${pin_memory}); - auto result_ = torch::${name}(${args_with_tensor_options}); -""") -CALL_METHOD_WITH_TENSOR_OPTIONS = CodeTemplate("""\ -const auto options = TensorOptions() - .dtype(${dtype}) - .layout(${layout}) - .device(${device}) - .pinned_memory(${pin_memory}); -auto result_ = (${first}).${name}(${args_with_tensor_options}); -""") - -CONSTRUCTOR = CodeTemplate("""\ -[](OperatorKernel* unboxedKernel, const OperatorHandle&, Stack* stack) { - using namespace at; - ${lvalues} - ${call} - drop(*stack, ${num_inputs}); - pack(*stack, std::move(result_)); -} -""") - -OPERATOR = CodeTemplate("""\ - .op("${signature}", - ${op}) -""") - - -disallowed_types = { - 'Storage', - 'DimnameList?', - 'ConstQuantizerPtr', - 'Dimname', - 'DimnameList', -} - -default_only_types = {'Generator'} - - -def is_jit_arg(i, arg): - simple_type = arg['simple_type'] - if simple_type in disallowed_types: - return False - if simple_type in default_only_types and 'default' not in arg: - return False - if simple_type == 'Type': - return False - return True - - -def is_jit_op(decl): - # We currently don't support functions that return nothing - assert all(r['type'] != 'void' for r in decl['returns']) - if len(decl['returns']) == 0: - return False - - arguments = decl['arguments'] - - # there must be a single out variant - if is_out_variant(decl) and sum([not not arg.get('output') for arg in arguments]) > 1: - return False - - return (('namespace' in decl['method_of'] or 'Tensor' in decl['method_of']) and - all(is_jit_arg(i, arg) for i, arg in enumerate(decl['arguments'])) and - all(is_jit_arg(i, arg) for i, arg in enumerate(decl['returns']))) - - -def is_tensor_arg(arg): - return arg['simple_type'] in {'Tensor', 'TensorList'} - - -def is_sized_intlist_arg(arg): - """Returns True for arguments declared as IntArrayRef[k], but False for IntArrayRef.""" - return (arg['simple_type'] == 'IntArrayRef') and ('size' in arg) - - -def base_name(decl): - name = decl['name'] - return name[:-1] if decl.get('inplace', False) else name[:-4] if name.endswith('_out') else name - - -def is_view(decl): - return base_name(decl) in RETURNS_VIEWS_OF_INPUT - - -# Copied from ..autograd.gen_python_functions.SKIP_PYTHON_BINDINGS -BACKWARD_OP_PATTERNS = [ - '.*_backward', - '.*_backward_(out|input|weight|bias)', -] - -def is_backward_op(decl): - for pattern in BACKWARD_OP_PATTERNS: - if re.match('^' + pattern + '$', decl['name']): - return True - return False - - -# for each argument in decl, the location it should appear in the -# jit schema declaration. e.g. 
-# arguments = [x, y, z] # the order in aten -# jit_argument_order = [2, 0, 1] -# aten::my_arg(Tensor y, Tensor z, Tensor x) # the order in schema -# used to move 'out' arguments to the end of the list -def argument_order(decl): - return decl.get('jit_argument_order') or list(range(len(decl['arguments']))) - - -def gen_unboxing_wrappers( - declarations, - out, - template_path, - operator_selector: SelectiveBuilder, - disable_autograd=False, - force_schema_registration=False, -): - GENERATED_UNBOXING_WRAPPERS_CPP = CodeTemplate.from_file(template_path + '/generated_unboxing_wrappers.cpp') - - ops = [] - - def get_invocation(decl, args, num_inputs): - - # because the arg list can get lengthy we put them on a separate line - def pack_arguments(args): - return ',\n'.join(args) - is_namespace_function = 'namespace' in decl['method_of'] - tensor_options_arg_index = decl.get('tensor_options_arg_index', None) - if tensor_options_arg_index is not None: - dtype = args[tensor_options_arg_index] - layout = args[tensor_options_arg_index + 1] - device = args[tensor_options_arg_index + 2] - pin_memory = args[tensor_options_arg_index + 3] - args_with_tensor_options = args[:tensor_options_arg_index] + \ - ['options'] + args[(tensor_options_arg_index + 4):] - if is_namespace_function: - return CALL_NAMESPACE_WITH_TENSOR_OPTIONS.substitute( - name=decl['name'], dtype=dtype, layout=layout, - device=device, pin_memory=pin_memory, - args_with_tensor_options=pack_arguments(args_with_tensor_options)) - else: - return CALL_METHOD_WITH_TENSOR_OPTIONS.substitute( - name=decl['name'], dtype=dtype, layout=layout, - device=device, pin_memory=pin_memory, - args_with_tensor_options=pack_arguments(args_with_tensor_options[1:]), - first=args_with_tensor_options[0], num_inputs=num_inputs) - elif decl['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper': - if len(decl['returns']) == 0: - return_type = "void" - elif len(decl['returns']) == 1: - return_type = decl['returns'][0]['type'] - else: - return_type = "std::tuple<{}>".format(", ".join([r['type'] for r in decl['returns']])) - for a in decl['arguments']: - if 'type' not in a: - raise Exception(decl) - argument_types_with_leading_comma = ", ".join([a['type'] for a in decl['arguments']]) - if argument_types_with_leading_comma != "": - argument_types_with_leading_comma = ", " + argument_types_with_leading_comma - args_with_leading_comma = pack_arguments(args) - if args_with_leading_comma != "": - args_with_leading_comma = ", " + args_with_leading_comma - return CALL_UNBOXED_KERNEL.substitute(name=decl['name'], - args_with_leading_comma=args_with_leading_comma, - num_inputs=num_inputs, - return_type=return_type, - formals_types_with_leading_comma=argument_types_with_leading_comma) - else: - assert decl['use_c10_dispatcher'] in ['full', 'hacky_wrapper_for_legacy_signatures'] - if is_namespace_function: - return CALL_NAMESPACE.substitute(name=decl['name'], - args=pack_arguments(args), - num_inputs=num_inputs) - else: - return CALL_METHOD.substitute( - name=decl['name'], first=args[0], - args=pack_arguments(args[1:]), num_inputs=num_inputs) - - def requires_lvalue(arg): - jit_type = jit_type_of(arg) - return jit_type.startswith('Tensor') and '!' 
in jit_type - - def emit_decl_variant(decl): - if ('emit_dummy_placeholder' in decl): - return "DUMMY_OPERATION" - kw_assignments = [] - - # mutable arguments in aten are passed as non const references - # these must be lvalues, so we have to put them in variables - # before calling the function - lvalues = [] - - arguments = [] - num_inputs = len(decl['arguments']) - op_capture = '' - order = argument_order(decl) - for i, arg in enumerate(decl['arguments']): - value = from_ivalue(arg, '(std::move(peek(*stack, {}, {})))'.format(order[i], num_inputs)) - if requires_lvalue(arg): - lvalues.append('auto {} = {};\n'.format(arg['name'], value)) - value = arg['name'] - arguments.append(value) - - call = get_invocation(decl, arguments, num_inputs) - - returns = decl['returns'] - - if decl['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper': - constructor = CONSTRUCTOR.substitute(name=decl['name'], - call=call, - kw_assignments=kw_assignments, - num_inputs=num_inputs, - op_capture=op_capture, - lvalues=lvalues) - else: - assert decl['use_c10_dispatcher'] in ['full', 'hacky_wrapper_for_legacy_signatures'] - - return constructor - - def filter_decls(jit_decls, disable_autograd, operator_selector: SelectiveBuilder, force_schema_registration): - result = [] - for decl in jit_decls: - if disable_autograd and is_backward_op(decl): - continue - op_name = op_name_with_overload(decl) - if operator_selector.is_root_operator(op_name): - result.append(decl) - else: - if force_schema_registration: - decl['emit_dummy_placeholder'] = True - result.append(decl) - - return result - - # This function declares an order on declarations. This is necessary because - # there is some ambiguity in the choice of overload: if an argument is overloaded - # to accept both Scalar and Tensor, the schema with the Tensor should come first - # TODO: this can (probably) be removed when we remove the implicit conversion - # from Tensor -> Number. - def sort_decls(jit_decls): - def declkey(decl): - # key = sum_{i < len(args)} {1 if arg is tensor else 2} * (3 ** i) - # This is a ternary encoding where - # 0: No argument at this position - # 1: Tensor argument at this position - # 2: Some other argument at this position. - args = decl['arguments'] - result = 0 - for i in range(len(args)): - result += (3 ** i) * (1 if args[i]['simple_type'] == 'Tensor' else 2) - return result - - # NB: itertools.groupby requires the list be sorted. - sorted_decls = sorted(jit_decls, key=lambda decl: decl['name']) - grouped_decls = [list(g) for _, g in - groupby(sorted_decls, key=lambda decl: decl['name'])] - return [sorted(g, key=declkey) for g in grouped_decls] - - aten_decls = load_aten_declarations(declarations) - jit_decls = [d for d in aten_decls if is_jit_op(d)] - - # add arguments dtype and device for functions like zeros - def expand_options(decl, i, arg): - if arg['simple_type'] != 'TensorOptions': - return [arg] - assert decl.get('tensor_options_arg_index') != i - decl['tensor_options_arg_index'] = i - tensor_options_expansion = [ - # XXX - until we actually have first-class interpreter types for these - # concepts, the default values to be encoded in Tensors - # If you change this, you also need to update [TensorOptions in script] - # in the tracer code. 
- # dtype is specified as an int64_t of at::ScalarType - {'name': 'dtype', 'simple_type': 'ScalarType'}, - # layout is specified as an int64_t of at::Layout - {'name': 'layout', 'simple_type': 'Layout'}, - # device is specified as an IntArrayRef of { at::Device::Type, device_id } - {'name': 'device', 'simple_type': 'Device'}, - # pin_memory is specified as a boolean - {'name': 'pin_memory', 'simple_type': 'bool', 'default': False}, - ] - # TODO: Don't repack this into TensorOptions. Needs various changes in downstream code. - if 'default' in arg: - for el in tensor_options_expansion: - el['simple_type'] += '?' - el['default'] = 'None' - if 'default' in arg and arg['default'] == 'at::kLong': - tensor_options_expansion[0]['default'] = 'long' - if 'kwarg_only' in arg and arg['kwarg_only']: - for el in tensor_options_expansion: - el['kwarg_only'] = True - return tensor_options_expansion - - additional_jit_decls = [] - - for decl in jit_decls: - decl['arguments'] = [a for i, arg in enumerate(decl['arguments']) for a in expand_options(decl, i, arg)] - if is_out_variant(decl): - reorder_out_args(decl) - - jit_decls.extend(additional_jit_decls) - jit_decls = filter_decls(jit_decls, disable_autograd, operator_selector, force_schema_registration) - - # generation is deterministic - jit_decl_groups = sort_decls(jit_decls) - - # NOTE: see Note [Sharded File] at the top of the generated_unboxing_wrappers.cpp - # template regarding sharding of the generated files. - # - # If you edit the number of shards here, you will also have to - # modify generate_code.py, torch/CMakeLists.txt, and the TARGETS - # files. - num_shards = 3 - shards = [[] for _ in range(num_shards)] - - # ops are assigned arbitrarily but stably to a file based on hash - for group in jit_decl_groups: - x = sum(ord(c) for c in group[0]['name']) % num_shards - for decl in group: - if decl['use_c10_dispatcher'] == 'with_codegenerated_unboxing_wrapper': - shards[x].append(OPERATOR.substitute(signature=decl['schema_string'], - op=emit_decl_variant(decl))) - else: - assert decl['use_c10_dispatcher'] in ['full', 'hacky_wrapper_for_legacy_signatures'] - - for i, shard in enumerate(shards): - env = { - 'constructors': shard, - } - write(out, 'generated_unboxing_wrappers_%d.cpp' % i, GENERATED_UNBOXING_WRAPPERS_CPP, env) - - all_shards = reduce( - lambda lhs, rhs: lhs + rhs, - shards, - ) - env = { - 'constructors': all_shards, - } - write(out, 'generated_unboxing_wrappers_everything.cpp', GENERATED_UNBOXING_WRAPPERS_CPP, env) - - -default_map = {'{}': 'None', 'nullptr': 'None', 'c10::nullopt': 'None'} - - -def reorder_out_args(decl): - first_arg = decl['arguments'][0] - assert(first_arg['output']) - # the output variant must go at the end - # note: this is an annoying side effect of using a single '*' - # to denote kwarg_only - nargs = len(decl['arguments']) - decl['jit_argument_order'] = [nargs - 1] + list(range(nargs - 1)) - - -def is_kwarg_only(a): - return a.get('kwarg_only') or a.get('output') - -def main(): - parser = argparse.ArgumentParser( - description='Generate JIT op dispatch') - parser.add_argument('declarations', metavar='DECL', - help='path to Declarations.yaml') - parser.add_argument('out', metavar='OUT', - help='path to output directory') - parser.add_argument('template_path', metavar='TEMPLATE_PATH', - help='path to templates directory') - args = parser.parse_args() - gen_unboxing_wrappers(args.declarations, args.out, args.template_path, - SelectiveBuilder.get_nop_selector()) - - -if __name__ == '__main__': - main() diff --git 
a/tools/jit/templates/generated_unboxing_wrappers.cpp b/tools/jit/templates/generated_unboxing_wrappers.cpp deleted file mode 100644 index cd8d12f6b15e..000000000000 --- a/tools/jit/templates/generated_unboxing_wrappers.cpp +++ /dev/null @@ -1,132 +0,0 @@ -#include "torch/csrc/jit/runtime/operator.h" -#include "torch/csrc/jit/runtime/custom_operator.h" -#include "torch/csrc/jit/frontend/function_schema_parser.h" - -#include "torch/csrc/autograd/profiler.h" -#include "torch/csrc/autograd/generated/variable_factories.h" - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// ${generated_comment} - -// This file contains manual unboxing wrappers for ops that aren't -// use_c10_dispatcher: full because the templated unboxing logic in c10 doesn't -// support them yet. The ultimate goal is to make all ops use the templated -// unboxing and delete this codegen file. - -// NOTE [Sharded File]: This file is generated in a sharded fashion to speed up -// incremental rebuilds. See the comment at the top of -// templates/VariableType.cpp for an analogous, in-depth discussion. - -namespace torch { namespace jit { - -using autograd::Variable; -using autograd::variable_list; -using at::Scalar; -using at::ScalarType; -using at::Tensor; -using at::TensorOptions; -using at::DeviceGuard; -using at::MemoryFormat; - -using ::c10::fmap; -using ::c10::filter; -using c10::OperatorKernel; -using c10::OperatorHandle; -using c10::KernelFunction; -using c10::RegistrationHandleRAII; -using c10::Stack; - -namespace { - -template -Return callUnboxedKernel(OperatorKernel* unboxedKernel, Args... args) { - using FuncType = Return (Args...); - auto* typedUnboxedKernel = static_cast*>(unboxedKernel); - return (*typedUnboxedKernel)(std::forward(args)...); -} - -// TODO: remove the toOptionalTensor and toListOfOptionalTensor -// when we remove the undefined tensor semantic from TH - -// XXX: This function is to specialize IValue for tensor type in -// interpreter, it should only be used in this file -at::Tensor toOptionalTensor(const IValue& v) { - if (v.isNone()) { - return at::Tensor(); - } - return v.toTensor(); -} - -// XXX: This function is to specialize IValue for list of optional -// tensor type in interpreter, it should only be used in this file -std::vector toListOfOptionalTensor(const IValue& v) { - // v is a list of optional tensor, loop over as generic list - auto vlist = v.toListRef(); - std::vector res; - - for (const IValue &v: vlist) { - res.emplace_back(toOptionalTensor(v)); - } - return res; -} - -template -std::array as_bool_array(const c10::List& list) { - std::array res; - AT_ASSERT(list.size() == N); - std::copy(list.begin(), list.end(), res.begin()); - return res; -} - -KernelFunction::InternalBoxedKernelFunction *DUMMY_OPERATION = - [](c10::OperatorKernel *, const c10::OperatorHandle &, std::vector *) -> void { - TORCH_CHECK(false, "Operator has been stripped in the custom build.") - }; - -class Registerer final { -public: - Registerer&& op(const std::string& schemaStr, KernelFunction::InternalBoxedKernelFunction* boxed_kernel_wrapper) && { - static auto& dispatcher = c10::Dispatcher::singleton(); - auto schema = parseSchema(schemaStr); - schema.setAliasAnalysis(AliasAnalysisKind::FROM_SCHEMA); - c10::OperatorName name = schema.operator_name(); - RegistrationHandleRAII registration = dispatcher.registerName(name); - auto op = dispatcher.findOp(name).value(); - 
registrationHandles_.push_back(std::move(registration)); - dispatcher.setManuallyBoxedKernelFor_(op, boxed_kernel_wrapper); - return std::move(*this); - } - - Registerer() = default; - Registerer(const Registerer&) = delete; - Registerer& operator=(const Registerer&) = delete; - Registerer(Registerer&&) noexcept = default; - Registerer& operator=(Registerer&&) noexcept = default; -private: - std::vector registrationHandles_; -}; - -static auto registry = Registerer() - // Generated operators - ${constructors} - ; - -} // anon namespace - - -}} // namespace torch::jit diff --git a/tools/nightly.py b/tools/nightly.py index 1fecc67e72f3..55a90e3fd9fb 100755 --- a/tools/nightly.py +++ b/tools/nightly.py @@ -322,10 +322,10 @@ def pytorch_install(url): def _site_packages(dirname, platform): if platform.startswith("win"): - os.path.join(pytdir.name, "Lib", "site-packages") + template = os.path.join(dirname, "Lib", "site-packages") else: template = os.path.join(dirname, "lib", "python*.*", "site-packages") - spdir = glob.glob(template)[0] + spdir = glob.glob(template)[0] return spdir diff --git a/tools/setup_helpers/cmake.py b/tools/setup_helpers/cmake.py index 62ab12a5854f..f1809552cd40 100644 --- a/tools/setup_helpers/cmake.py +++ b/tools/setup_helpers/cmake.py @@ -7,7 +7,6 @@ import re from subprocess import check_call, check_output import sys -import distutils import distutils.sysconfig from distutils.version import LooseVersion @@ -32,7 +31,7 @@ def _mkdir_p(d): def convert_cmake_value_to_python_value(cmake_value, cmake_type): r"""Convert a CMake value in a string form to a Python value. - Arguments: + Args: cmake_value (string): The CMake value in a string form (e.g., "ON", "OFF", "1"). cmake_type (string): The CMake type of :attr:`cmake_value`. @@ -56,7 +55,7 @@ def convert_cmake_value_to_python_value(cmake_value, cmake_type): def get_cmake_cache_variables_from_file(cmake_cache_file): r"""Gets values in CMakeCache.txt into a dictionary. - Arguments: + Args: cmake_cache_file: A CMakeCache.txt file object. Returns: dict: A ``dict`` containing the value of cached CMake variables. diff --git a/tools/setup_helpers/env.py b/tools/setup_helpers/env.py index df6bea52b427..f04f10cc287c 100644 --- a/tools/setup_helpers/env.py +++ b/tools/setup_helpers/env.py @@ -43,7 +43,7 @@ class BuildType(object): is ``None``, then the build type will be inferred from ``CMakeCache.txt``. If ``CMakeCache.txt`` does not exist, os.environ['CMAKE_BUILD_TYPE'] will be used. - Arguments: + Args: cmake_build_type_env (str): The value of os.environ['CMAKE_BUILD_TYPE']. If None, the actual build type will be inferred. 
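Editor's note: the `SelectiveBuilder` changes at the top of this section add a `kernel_metadata` map (kernel tag -> list of dtype strings) and an `include_all_kernel_dtypes` flag, queried through the new `is_kernel_dtype_selected` method. A minimal sketch of how those fields are consumed, assuming the PyTorch source tree is on `PYTHONPATH`; the kernel tag name below is made up for illustration (real tags come from tracing-based selective build output):

```python
# Sketch only: exercises the kernel_metadata plumbing added to
# tools/codegen/selective_build/selector.py in this patch.
from tools.codegen.selective_build.selector import SelectiveBuilder

config = {
    'include_all_operators': False,
    'include_all_kernel_dtypes': False,
    'operators': {},  # operator entries elided for brevity
    'kernel_metadata': {
        # hypothetical tag -> dtypes observed for that kernel fragment
        'add_kernel_cpu': ['Float', 'Int'],
    },
}

selector = SelectiveBuilder.from_yaml_dict(config)
assert selector.is_kernel_dtype_selected('add_kernel_cpu', 'Float')       # listed dtype
assert not selector.is_kernel_dtype_selected('add_kernel_cpu', 'Double')  # not listed

# The no-op selector sets include_all_operators=True, so every dtype query passes.
nop = SelectiveBuilder.get_nop_selector()
assert nop.is_kernel_dtype_selected('anything', 'Half')
```

When two selectors are combined, `merge_kernel_metadata` (added further up in the same file) unions the dtype lists per tag, mirroring how the operator dictionaries are merged.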
diff --git a/tools/setup_helpers/generate_code.py b/tools/setup_helpers/generate_code.py index 9ca843abc69f..10bbc33c352f 100644 --- a/tools/setup_helpers/generate_code.py +++ b/tools/setup_helpers/generate_code.py @@ -30,7 +30,6 @@ def generate_code(ninja_global=None, operator_selector=None): from tools.autograd.gen_autograd import gen_autograd, gen_autograd_python from tools.autograd.gen_annotated_fn_args import gen_annotated - from tools.jit.gen_unboxing_wrappers import gen_unboxing_wrappers from tools.codegen.selective_build.selector import SelectiveBuilder @@ -70,13 +69,6 @@ def generate_code(ninja_global=None, disable_autograd=disable_autograd, operator_selector=operator_selector, ) - gen_unboxing_wrappers( - declarations_path or DECLARATIONS_PATH, - jit_gen_dir, - tools_jit_templates, - disable_autograd=disable_autograd, - operator_selector=operator_selector, - force_schema_registration=force_schema_registration) if subset == "python" or not subset: gen_annotated( diff --git a/tools/shared/module_loader.py b/tools/shared/module_loader.py index c24a19678c39..51c57aa161c9 100644 --- a/tools/shared/module_loader.py +++ b/tools/shared/module_loader.py @@ -1,5 +1,3 @@ - - def import_module(name, path): import importlib.util spec = importlib.util.spec_from_file_location(name, path) diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 79c93cb191f1..6427a4a4ed80 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -344,6 +344,10 @@ def _propagate_and_assign_input_shapes( propagate: _bool ) -> Graph: ... +# Defined in torch/csrc/jit/runtime/graph_executor.h +class GraphExecutorState: + ... + # Defined in torch/torch/csrc/jit/ir/ir.h class Graph: def eraseInput(self, i: _int) -> None: ... diff --git a/torch/_C/_autograd.pyi b/torch/_C/_autograd.pyi index cfcb66896ad7..15a286f2370c 100644 --- a/torch/_C/_autograd.pyi +++ b/torch/_C/_autograd.pyi @@ -25,7 +25,8 @@ class ProfilerConfig: state: ProfilerState, report_input_shapes: bool, profile_memory: bool, - with_stack: bool + with_stack: bool, + with_flops: bool ) -> None: ... ... diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi index cd9a0f7d46a9..5ac2c0a8315d 100644 --- a/torch/_C/_distributed_c10d.pyi +++ b/torch/_C/_distributed_c10d.pyi @@ -77,6 +77,7 @@ class ReduceScatterOptions: timeout: timedelta class BarrierOptions: + device_ids: List[int] timeout: timedelta class AllToAllOptions: diff --git a/torch/__init__.py b/torch/__init__.py index 04955623ab2a..9ae1010a3ba8 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -574,6 +574,7 @@ def _assert(condition, message): import torch.futures import torch.nn import torch.nn.intrinsic +import torch.nn.quantizable import torch.nn.quantized import torch.optim import torch.optim._multi_tensor diff --git a/torch/_classes.py b/torch/_classes.py index ecf9987c69b9..90cc6c7672f6 100644 --- a/torch/_classes.py +++ b/torch/_classes.py @@ -40,7 +40,7 @@ def load_library(self, path): ``torch.classes.loaded_libraries`` attribute, a set that may be inspected for the paths of all libraries loaded using this function. - Arguments: + Args: path (str): A path to a shared library to load. """ torch.ops.load_library(path) diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py index 906439320a2c..fca1c45377e7 100644 --- a/torch/_jit_internal.py +++ b/torch/_jit_internal.py @@ -254,7 +254,7 @@ def get_type_hint_captures(fn): for the literal annotations on 'fn'. 
These are not considered to be closed-over by fn and must be obtained separately (e.g. using this function). - Arguments: + Args: fn: A callable. Returns: A Dict[str, Any] containing a mapping from the literal annotations used on diff --git a/torch/_lobpcg.py b/torch/_lobpcg.py index 02b666493f9a..dfe52774f8ca 100644 --- a/torch/_lobpcg.py +++ b/torch/_lobpcg.py @@ -76,7 +76,7 @@ def _polynomial_value(poly, x, zero_power, transition): """ A generic method for computing poly(x) using the Horner's rule. - Arguments: + Args: poly (Tensor): the (possibly batched) 1D Tensor representing polynomial coefficients such that poly[..., i] = (a_{i_0}, ..., a{i_n} (==1)), and @@ -384,7 +384,7 @@ def lobpcg(A: Tensor, we do the following symmetrization map: `A -> (A + A.t()) / 2`. The map is performed only when the `A` requires gradients. - Arguments: + Args: A (Tensor): the input tensor of size :math:`(*, m, m)` @@ -925,7 +925,7 @@ def _get_rayleigh_ritz_transform(self, S): matrix product `D M` with element-wise product `M * d`. Also, creating the diagonal matrix `D` is avoided. - Arguments: + Args: S (Tensor): the matrix basis for the search subspace, size is :math:`(m, n)`. @@ -957,7 +957,7 @@ def _get_svqb(self, modification of the corresponding algorithm introduced in [StathopolousWu2002]. - Arguments: + Args: U (Tensor) : initial approximation, size is (m, n) drop (bool) : when True, drop columns that @@ -1023,7 +1023,7 @@ def _get_ortho(self, U, V): .. note:: If all U columns are B-collinear to V then the returned tensor U will be empty. - Arguments: + Args: U (Tensor) : initial approximation, size is (m, n) V (Tensor) : B-orthogonal external basis, size is (m, k) diff --git a/torch/_lowrank.py b/torch/_lowrank.py index ade7c59286d8..a6e1dd10e570 100644 --- a/torch/_lowrank.py +++ b/torch/_lowrank.py @@ -37,7 +37,7 @@ def get_approximate_basis(A, # type: Tensor .. note:: To obtain repeatable results, reset the seed for the pseudorandom number generator - Arguments:: + Args:: A (Tensor): the input tensor of size :math:`(*, m, n)` q (int): the dimension of subspace spanned by :math:`Q` @@ -103,7 +103,7 @@ def svd_lowrank(A, q=6, niter=2, M=None): will be useful for huge sparse matrices that ``torch.svd`` cannot handle. - Arguments:: + Args:: A (Tensor): the input tensor of size :math:`(*, m, n)` q (int, optional): a slightly overestimated rank of A. @@ -211,7 +211,7 @@ def pca_lowrank(A, q=None, center=True, niter=2): .. note:: To obtain repeatable results, reset the seed for the pseudorandom number generator - Arguments: + Args: A (Tensor): the input tensor of size :math:`(*, m, n)` diff --git a/torch/_ops.py b/torch/_ops.py index 70edc2822025..dd0c8cd19fde 100644 --- a/torch/_ops.py +++ b/torch/_ops.py @@ -94,7 +94,7 @@ def load_library(self, path): ``torch.ops.loaded_libraries`` attribute, a set that may be inspected for the paths of all libraries loaded using this function. - Arguments: + Args: path (str): A path to a shared library to load. 
""" path = torch._utils_internal.resolve_library_path(path) diff --git a/torch/_six.py b/torch/_six.py index c53feed94cce..00f9fa6b7f95 100644 --- a/torch/_six.py +++ b/torch/_six.py @@ -33,7 +33,6 @@ FileNotFoundError = builtins.FileNotFoundError StringIO = io.StringIO container_abcs = collections.abc -PY3 = sys.version_info[0] == 3 PY37 = sys.version_info[0] == 3 and sys.version_info[1] >= 7 def with_metaclass(meta: type, *bases) -> type: diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index e05784cbcc22..4a1c36df7497 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -1026,7 +1026,6 @@ def merge_dicts(*dicts): tensor([ 0, 1, -4], dtype=torch.int8) """.format(**common_args)) -# TODO: see https://github.com/pytorch/pytorch/issues/43667 add_docstr(torch.bmm, r""" bmm(input, mat2, *, deterministic=False, out=None) -> Tensor @@ -2548,7 +2547,7 @@ def merge_dicts(*dicts): .. note:: This function is similar to SciPy's `scipy.special.digamma`. -.. note:: From PyTorch 1.8 onwards, the digamma function returns `-Inf` for `0`. +.. note:: From PyTorch 1.8 onwards, the digamma function returns `-Inf` for `0`. Previously it returned `NaN` for `0`. Example:: @@ -2934,7 +2933,6 @@ def merge_dicts(*dicts): tensor([ 0., 1.]) """.format(**common_args)) -# TODO: see https://github.com/pytorch/pytorch/issues/43667 add_docstr(torch.eye, r""" eye(n, m=None, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor @@ -2944,6 +2942,8 @@ def merge_dicts(*dicts): Args: n (int): the number of rows m (int, optional): the number of columns with default being :attr:`n` + +Keyword arguments: {out} {dtype} {layout} @@ -3095,7 +3095,17 @@ def merge_dicts(*dicts): r""" flatten(input, start_dim=0, end_dim=-1) -> Tensor -Flattens a contiguous range of dims in a tensor. +Flattens :attr:`input` by reshaping it into a one-dimensional tensor. If :attr:`start_dim` or :attr:`end_dim` +are passed, only dimensions starting with :attr:`start_dim` and ending with :attr:`end_dim` are flattened. +The order of elements in :attr:`input` is unchanged. + +Unlike NumPy's flatten, which always copies input's data, this function may return the original object, a view, +or copy. If no dimensions are flattened, then the original object :attr:`input` is returned. Otherwise, if input can +be viewed as the flattened shape, then that view is returned. Finally, only if the input cannot be viewed as the +flattened shape is input's data copied. See :meth:`torch.Tensor.view` for details on when a view will be returned. + +.. note:: + Flattening a zero-dimensional tensor will return a one-dimensional view. Args: {input} @@ -3127,12 +3137,10 @@ def merge_dicts(*dicts): out[i][j][k] = input[i][index[i][j][k]][k] # if dim == 1 out[i][j][k] = input[i][j][index[i][j][k]] # if dim == 2 -If :attr:`input` is an n-dimensional tensor with size -:math:`(x_0, x_1..., x_{i-1}, x_i, x_{i+1}, ..., x_{n-1})` -and ``dim = i``, then :attr:`index` must be an :math:`n`-dimensional tensor with -size :math:`(x_0, x_1, ..., x_{i-1}, y, x_{i+1}, ..., x_{n-1})` where :math:`y \geq 1` -and :attr:`out` will have the same size as :attr:`index`. Note that ``input`` -and ``index`` do not broadcast against each other. +:attr:`input` and :attr:`index` must have the same number of dimensions. +It is also required that ``index.size(d) <= input.size(d)`` for all +dimensions ``d != dim``. :attr:`out` will have the same shape as :attr:`index`. +Note that ``input`` and ``index`` do not broadcast against each other. 
Args: input (Tensor): the source tensor @@ -3738,7 +3746,7 @@ def merge_dicts(*dicts): Complex values are infinite when their real or imaginary part is infinite. - Arguments: + Args: {input} Returns: @@ -3823,7 +3831,7 @@ def merge_dicts(*dicts): Real values are finite when they are not NaN, negative infinity, or infinity. Complex values are finite when both their real and imaginary parts are finite. - Arguments: + Args: {input} Returns: @@ -4176,7 +4184,6 @@ def merge_dicts(*dicts): tensor([ 0.5724, 0.0000, -0.1208]) """.format(**common_args)) -# TODO: update kwargs formatting (see https://github.com/pytorch/pytorch/issues/43667) add_docstr(torch.linspace, r""" linspace(start, end, steps, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor @@ -4203,6 +4210,8 @@ def merge_dicts(*dicts): start (float): the starting value for the set of points end (float): the ending value for the set of points steps (int): size of the constructed tensor + +Keyword arguments: {out} {dtype} {layout} @@ -4539,7 +4548,6 @@ def merge_dicts(*dicts): tensor([ True, True, False, False]) """.format(**common_args)) -# TODO: update kwargs formatting (see https://github.com/pytorch/pytorch/issues/43667) add_docstr(torch.logspace, """ logspace(start, end, steps, base=10.0, *, \ out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor @@ -4570,7 +4578,9 @@ def merge_dicts(*dicts): start (float): the starting value for the set of points end (float): the ending value for the set of points steps (int): size of the constructed tensor - base (float): base of the logarithm function. Default: ``10.0``. + base (float, optional): base of the logarithm function. Default: ``10.0``. + +Keyword arguments: {out} {dtype} {layout} @@ -5471,36 +5481,15 @@ def merge_dicts(*dicts): add_docstr(torch.argmin, r""" -argmin(input) -> LongTensor +argmin(input, dim=None, keepdim=False) -> LongTensor -Returns the indices of the minimum value of all elements in the :attr:`input` tensor. +Returns the indices of the minimum value(s) of the flattened tensor or along a dimension This is the second value returned by :meth:`torch.min`. See its documentation for the exact semantics of this method. .. note:: If there are multiple minimal values then the indices of the first minimal value are returned. -Args: - {input} - -Example:: - - >>> a = torch.randn(4, 4) - >>> a - tensor([[ 0.1139, 0.2254, -0.1381, 0.3687], - [ 1.0100, -1.1975, -0.0102, -0.4732], - [-0.9240, 0.1207, -0.7506, -1.0213], - [ 1.7809, -1.2960, 0.9384, 0.1438]]) - >>> torch.argmin(a) - tensor(13) - -.. function:: argmin(input, dim, keepdim=False) -> LongTensor - -Returns the indices of the minimum values of a tensor across a dimension. - -This is the second value returned by :meth:`torch.min`. See its -documentation for the exact semantics of this method. - Args: {input} {dim} If ``None``, the argmin of the flattened input is returned. 
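Editor's note: a short illustration of the view-vs-copy semantics spelled out in the rewritten `torch.flatten` docstring above (sketch only; behavior as documented, shown for a contiguous tensor and a zero-dimensional tensor):

```python
import torch

t = torch.arange(6).reshape(2, 3)   # contiguous, so the flattened shape is view-able
f = torch.flatten(t)
f[0] = 100
print(t[0, 0])                      # tensor(100): flatten returned a view, not a copy

z = torch.tensor(3.14)              # zero-dimensional tensor
print(torch.flatten(z).shape)       # torch.Size([1]): one-dimensional view, per the new note
```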
@@ -5514,8 +5503,15 @@ def merge_dicts(*dicts): [ 1.0100, -1.1975, -0.0102, -0.4732], [-0.9240, 0.1207, -0.7506, -1.0213], [ 1.7809, -1.2960, 0.9384, 0.1438]]) + >>> torch.argmin(a) + tensor(13) >>> torch.argmin(a, dim=1) tensor([ 2, 1, 3, 1]) + >>> torch.argmin(a, dim=1, keepdim=True) + tensor([[2], + [1], + [3], + [1]]) """.format(**single_dim_common)) add_docstr(torch.mm, @@ -6330,7 +6326,6 @@ def merge_dicts(*dicts): """.format(**common_args)) -# TODO: see https://github.com/pytorch/pytorch/issues/43667 add_docstr(torch.ones, r""" ones(*size, *, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor @@ -6341,6 +6336,8 @@ def merge_dicts(*dicts): Args: size (int...): a sequence of integers defining the shape of the output tensor. Can be a variable number of arguments or a collection like a list or tuple. + +Keyword arguments: {out} {dtype} {layout} @@ -6358,7 +6355,6 @@ def merge_dicts(*dicts): """.format(**factory_common_args)) -# TODO: see https://github.com/pytorch/pytorch/issues/43667 add_docstr(torch.ones_like, r""" ones_like(input, *, dtype=None, layout=None, device=None, requires_grad=False, memory_format=torch.preserve_format) -> Tensor @@ -6374,6 +6370,8 @@ def merge_dicts(*dicts): Args: {input} + +Keyword arguments: {dtype} {layout} {device} @@ -6678,18 +6676,30 @@ def merge_dicts(*dicts): If :attr:`some` is ``True``, then this function returns the thin (reduced) QR factorization. Otherwise, if :attr:`some` is ``False``, this function returns the complete QR factorization. +.. warning:: ``torch.qr`` is deprecated. Please use ``torch.linalg.`` :meth:`~torch.linalg.qr` + instead, which provides a better compatibility with + ``numpy.linalg.qr``. + + **Differences with** ``torch.linalg.`` :meth:`~torch.linalg.qr`: + + * ``torch.linalg.qr`` takes a string parameter ``mode`` instead of ``some``: + + - ``some=True`` is equivalent of ``mode='reduced'``: both are the + default + + - ``some=False`` is equivalent of ``mode='complete'``. + + .. warning:: If you plan to backpropagate through QR, note that the current backward implementation is only well-defined when the first :math:`\min(input.size(-1), input.size(-2))` columns of :attr:`input` are linearly independent. This behavior will propably change once QR supports pivoting. -.. note:: precision may be lost if the magnitudes of the elements of :attr:`input` - are large - -.. note:: While it should always give you a valid decomposition, it may not - give you the same one across platforms - it will depend on your - LAPACK implementation. +.. note:: This function uses LAPACK for CPU inputs and MAGMA for CUDA inputs, + and may produce different (valid) decompositions on different device types + and different platforms, depending on the precise version of the + underlying library. Args: input (Tensor): the input tensor of size :math:`(*, m, n)` where `*` is zero or more @@ -8250,7 +8260,7 @@ def merge_dicts(*dicts): Args: input (Tensor): the input tensor of size :math:`(*, n, n)` where `*` is zero or more batch dimensions consisting of symmetric matrices. 
- eigenvectors(boolean, optional): controls whether eigenvectors have to be computed + eigenvectors(bool, optional): controls whether eigenvectors have to be computed upper(boolean, optional): controls whether to consider upper-triangular or lower-triangular region Keyword args: @@ -9260,7 +9270,7 @@ def merge_dicts(*dicts): add_docstr(torch.full_like, """ -full_like(input, fill_value, \\*, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False, \ +full_like(input, fill_value, \\*, dtype=None, layout=torch.strided, device=None, requires_grad=False, \ memory_format=torch.preserve_format) -> Tensor Returns a tensor with the same size as :attr:`input` filled with :attr:`fill_value`. @@ -9479,9 +9489,10 @@ def merge_dicts(*dicts): Batched version for complex inputs is only supported on the CPU. Arguments: - input (Tensor): The input tensor of size :math:`(*, m, n)` where :math:`*` is zero or more batch dimensions - rcond (float): A floating point value to determine the cutoff for small singular values. - Default: 1e-15 + input (Tensor): The input tensor of size :math:`(*, m, n)` where :math:`*` is + zero or more batch dimensions. + rcond (float, optional): A floating point value to determine the cutoff for + small singular values. Default: ``1e-15``. Returns: The pseudo-inverse of :attr:`input` of dimensions :math:`(*, n, m)` @@ -9877,6 +9888,8 @@ def merge_dicts(*dicts): Arguments: y (Tensor): The values of the function to integrate + +Keyword args: dx (float): The distance between points at which `y` is sampled. dim (int): The dimension along which to integrate. By default, use the last dimension. diff --git a/torch/_utils.py b/torch/_utils.py index fbee17167b56..796e88a3cc2d 100644 --- a/torch/_utils.py +++ b/torch/_utils.py @@ -248,7 +248,7 @@ def _flatten_dense_tensors(tensors): buffer. Element-wise operation on this buffer will be equivalent to operating individually. - Arguments: + Args: tensors (Iterable[Tensor]): dense tensors to flatten. Returns: @@ -264,7 +264,7 @@ def _flatten_sparse_tensors(tensors): """Flatten sparse tensors into two contiguous 1D buffers, one of indices and one of values. Assume tensors are of same sparse type. - Arguments: + Args: tensors (Iterable[Tensor]): sparse tensors to flatten. Returns: @@ -280,7 +280,7 @@ def _unflatten_dense_tensors(flat, tensors): """View a flat buffer using the sizes of tensors. Assume that tensors are of same dense type, and that flat is given by _flatten_dense_tensors. - Arguments: + Args: flat (Tensor): flattened dense tensors to unflatten. tensors (Iterable[Tensor]): dense tensors whose sizes will be used to unflatten flat. @@ -303,7 +303,7 @@ def _unflatten_sparse_tensors(flat, tensors): tensors. Assume that tensors are of same sparse type, and that flat is given by _flatten_sparse_tensors. - Arguments: + Args: flat (tuple(Tensor, Tensor)): flattened indices and values of sparse tensors to unflatten. tensors (Iterable[Tensor]): sparse tensors whose sizes will be used to @@ -327,7 +327,7 @@ def _reorder_tensors_as(tensors, ordered_tensors): types, e.g., from _take_tensors. Reorder them to be of same order as ordered_tensors. - Arguments: + Args: tensors (Iterable[Tensor]): tensors to be reordered. They should be of the same order as ordered_tensors within their own types. 
ordered_tensors (Iterable[Tensor]): tensors whose order will be the diff --git a/torch/_vmap_internals.py b/torch/_vmap_internals.py index 67e2ec1a2cd9..26f32cfd9ffd 100644 --- a/torch/_vmap_internals.py +++ b/torch/_vmap_internals.py @@ -137,7 +137,7 @@ def _get_name(func: Callable): # Not all callables have __name__, in fact, only static functions/methods do. # A callable created via functools.partial or an nn.Module, to name some # examples, don't have a __name__. - fn_name = repr(func) + return repr(func) # vmap(func)(inputs) wraps all Tensor inputs to be batched in BatchedTensors, # sends those into func, and then unwraps the output BatchedTensors. Operations diff --git a/torch/autograd/__init__.py b/torch/autograd/__init__.py index 0bcf70e10e61..a013c9eb7326 100644 --- a/torch/autograd/__init__.py +++ b/torch/autograd/__init__.py @@ -102,7 +102,7 @@ def backward( in a user-specified CUDA stream context, see :ref:`Stream semantics of backward passes`. - Arguments: + Args: tensors (sequence of Tensor): Tensors of which the derivative will be computed. grad_tensors (sequence of (Tensor or None)): The "vector" in the Jacobian-vector @@ -174,7 +174,7 @@ def grad( in a user-specified CUDA stream context, see :ref:`Stream semantics of backward passes`. - Arguments: + Args: outputs (sequence of Tensor): outputs of the differentiated function. inputs (sequence of Tensor): Inputs w.r.t. which the gradient will be returned (and not accumulated into ``.grad``). diff --git a/torch/autograd/anomaly_mode.py b/torch/autograd/anomaly_mode.py index 6a33f1780ef1..97def2dea3d0 100644 --- a/torch/autograd/anomaly_mode.py +++ b/torch/autograd/anomaly_mode.py @@ -89,7 +89,7 @@ class set_detect_anomaly(object): See ``detect_anomaly`` above for details of the anomaly detection behaviour. - Arguments: + Args: mode (bool): Flag whether to enable anomaly detection (``True``), or disable (``False``). diff --git a/torch/autograd/grad_mode.py b/torch/autograd/grad_mode.py index 6e0f6f1469c5..ebbb7dea41e9 100644 --- a/torch/autograd/grad_mode.py +++ b/torch/autograd/grad_mode.py @@ -172,7 +172,7 @@ class set_grad_enabled(object): This context manager is thread local; it will not affect computation in other threads. - Arguments: + Args: mode (bool): Flag whether to enable grad (``True``), or disable (``False``). This can be used to conditionally enable gradients. diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 208281d99d2b..a3d0da1aef9d 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -172,7 +172,7 @@ def self_cpu_time_total(self): def table(self, sort_by=None, row_limit=100, max_src_column_width=75, header=None, top_level_events_only=False): """Prints an EventList as a nicely formatted table. - Arguments: + Args: sort_by (str, optional): Attribute used to sort entries. By default they are printed in the same order as they were registered. Valid keys include: ``cpu_time``, ``cuda_time``, ``cpu_time_total``, @@ -203,7 +203,7 @@ def export_chrome_trace(self, path): The checkpoint can be later loaded and inspected under ``chrome://tracing`` URL. - Arguments: + Args: path (str): Path where the trace will be written. """ import os @@ -288,7 +288,7 @@ def export_stacks(self, path: str, metric: str): def key_averages(self, group_by_input_shapes=False, group_by_stack_n=0): """Averages all function events over their keys. - Arguments: + Args: group_by_input_shapes: group entries by (event name, input shapes) rather than just event name. 
This is useful to see which input shapes contribute to the runtime @@ -345,7 +345,7 @@ class profile(object): only report runtime of PyTorch functions. Note: profiler is thread local and is automatically propagated into the async tasks - Arguments: + Args: enabled (bool, optional): Setting this to False makes this context manager a no-op. Default: ``True``. @@ -468,7 +468,8 @@ def config(self): self.profiler_kind, self.record_shapes, self.profile_memory, - self.with_stack) + self.with_stack, + self.with_flops) def __enter__(self): if not self.enabled: @@ -574,7 +575,7 @@ class record_function(ContextDecorator): Python code (or function) when running autograd profiler. It is useful when tracing the code profile. - Arguments: + Args: name (str): Label assigned to the block of code. node_id (int): ID of node, for distributed profiling. Unset in non-distributed cases. @@ -628,7 +629,7 @@ def _call_end_callbacks_on_future(self, fut: Future[Any]) -> Future[Any]: once to attach the callback onto the future, and will throw if called multiple times. - Arguments: + Args: fut: (torch._C.Future): future for which to schedule callback for. @@ -666,7 +667,7 @@ class emit_nvtx(object): This context manager should not be called recursively, i.e. at most one instance should be enabled at any given time. - Arguments: + Args: enabled (bool, optional, default=True): Setting ``enabled=False`` makes this context manager a no-op. Default: ``True``. record_shapes (bool, optional, default=False): If ``record_shapes=True``, the nvtx range wrapping @@ -746,6 +747,7 @@ def __enter__(self): torch.autograd.ProfilerState.NVTX, self.record_shapes, False, + False, False) ) return self @@ -761,7 +763,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): def load_nvprof(path): """Opens an nvprof trace file and parses autograd annotations. - Arguments: + Args: path (str): path to nvprof trace """ return EventList(parse_nvprof_trace(path)) diff --git a/torch/contrib/_tensorboard_vis.py b/torch/contrib/_tensorboard_vis.py index 8f4ca71ff202..b1b8d35a511d 100644 --- a/torch/contrib/_tensorboard_vis.py +++ b/torch/contrib/_tensorboard_vis.py @@ -1,6 +1,7 @@ import time from collections import defaultdict from functools import partial +from typing import DefaultDict import torch @@ -52,7 +53,7 @@ def visualize(graph, name_prefix='', pb_graph=None, executors_it=None): def visualize_graph_executor(state, name_prefix, pb_graph, inline_graph): """Appends the state of a given GraphExecutor to the graph protobuf. - Arguments: + Args: state (GraphExecutor or GraphExecutorState): GraphExecutor to display. name_prefix (str): Name prefix of the containing subgraph. pb_graph (GraphDef): graph to append to. 
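Editor's note: `ProfilerConfig` now carries a fifth boolean, `with_flops`, matching the `_autograd.pyi` stub change earlier in this patch and the extra argument threaded through `profiler.py` above. A sketch of the constructor call with the argument order made explicit; this is an internal, version-specific API, and `torch.autograd.profiler.profile` normally builds this object itself via its `config()` method:

```python
import torch

# Sketch only: internal API, shown to make the new positional argument explicit.
cfg = torch.autograd.ProfilerConfig(
    torch.autograd.ProfilerState.CPU,  # state
    True,    # report_input_shapes
    False,   # profile_memory
    False,   # with_stack
    False,   # with_flops  <- new in this patch
)
```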
@@ -104,7 +105,7 @@ def inline_graph(subgraph, name, node): for out, val in zip(subgraph.outputs(), node.outputs()): value_map[val.unique()] = rec_value_map[out.unique()] - op_id_counter = defaultdict(int) + op_id_counter: DefaultDict[str, int] = defaultdict(int) def name_for(node): kind = node.kind()[node.kind().index('::') + 2:] diff --git a/torch/csrc/Generator.cpp b/torch/csrc/Generator.cpp index 55e5abc29ef9..2bc478f36007 100644 --- a/torch/csrc/Generator.cpp +++ b/torch/csrc/Generator.cpp @@ -15,7 +15,6 @@ #include #ifdef USE_CUDA -#include #include #endif @@ -78,45 +77,32 @@ static PyObject * THPGenerator_getState(PyObject *_self, PyObject *noargs) { using namespace torch::autograd; HANDLE_TH_ERRORS - auto self = (THPGenerator*)_self; - Variable var = torch::empty({0}, at::device(at::kCPU).dtype(at::kByte)); - if (self->cdata.device().type() == at::kCPU) { - THByteTensor_getRNGState(self->cdata, (THByteTensor*)(var.unsafeGetTensorImpl())); - } else { -#ifdef USE_CUDA - TORCH_INTERNAL_ASSERT(self->cdata.device().type() == at::kCUDA); - THCRandom_getRNGState(self->cdata, (THByteTensor*)(var.unsafeGetTensorImpl())); -#else - TORCH_INTERNAL_ASSERT(false, "PyTorch not compiled with CUDA"); -#endif - } - return THPVariable_Wrap(std::move(var)); + auto& gen = ((THPGenerator*)_self)->cdata; + + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen.mutex()); + auto state_tensor = gen.get_state(); + + return THPVariable_Wrap(std::move(state_tensor)); END_HANDLE_TH_ERRORS } static PyObject * THPGenerator_setState(PyObject *_self, PyObject *_new_state) { using namespace torch::autograd; - auto self = (THPGenerator*)_self; + HANDLE_TH_ERRORS if (!THPVariable_Check(_new_state)) { throw torch::TypeError("expected a torch.ByteTensor, but got %s", Py_TYPE(_new_state)->tp_name); } - auto& tensor = ((THPVariable*)_new_state)->cdata; - if (tensor.layout() != kStrided || tensor.device().type() != kCPU || tensor.scalar_type() != kByte) { - auto type_name = torch::utils::options_to_string(tensor.options()); - throw torch::TypeError("expected a torch.ByteTensor, but got %s", type_name.c_str()); - } - if (self->cdata.device().type() == at::kCPU) { - THByteTensor_setRNGState(self->cdata, (THByteTensor*)tensor.unsafeGetTensorImpl()); - } else { -#ifdef USE_CUDA - TORCH_INTERNAL_ASSERT(self->cdata.device().type() == at::kCUDA); - THCRandom_setRNGState(self->cdata, (THByteTensor*)tensor.unsafeGetTensorImpl()); -#else - TORCH_INTERNAL_ASSERT(false, "PyTorch not compiled with CUDA"); -#endif - } + auto self = (THPGenerator*)_self; + auto& gen = self->cdata; + auto& new_state_tensor = ((THPVariable*)_new_state)->cdata; + + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen.mutex()); + gen.set_state(new_state_tensor); + Py_INCREF(self); return (PyObject*)self; END_HANDLE_TH_ERRORS diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index aeac5bafd56f..f70bd1a0ad95 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -722,7 +722,6 @@ PyObject* initModule() { methods.data() }; ASSERT_TRUE(module = PyModule_Create(&torchmodule)); - ASSERT_TRUE(THPWrapper_init(module)); ASSERT_TRUE(THPGenerator_init(module)); ASSERT_TRUE(THPException_init(module)); THPSize_init(module); diff --git a/torch/csrc/PtrWrapper.cpp b/torch/csrc/PtrWrapper.cpp deleted file mode 100644 index aa48c49949b9..000000000000 --- a/torch/csrc/PtrWrapper.cpp +++ /dev/null @@ -1,102 +0,0 @@ -#include -#include -#include - -static PyObject* THPWrapperClass = nullptr; - 
-struct THPWrapper { - PyObject_HEAD - void *data; - void (*destructor)(void*); -}; - -PyObject * THPWrapper_New(void *data, void (*destructor)(void*)) -{ - PyObject *args = PyTuple_New(0); - if (!args) { - return nullptr; - } - PyObject *result = PyObject_Call(THPWrapperClass, args, nullptr); - if (result) { - THPWrapper* wrapper = (THPWrapper*) result; - wrapper->data = data; - wrapper->destructor = destructor; - } - Py_DECREF(args); - return result; -} - -bool THPWrapper_check(PyObject * obj) -{ - return (PyObject*)Py_TYPE(obj) == THPWrapperClass; -} - -void * THPWrapper_get(PyObject * obj) -{ - return ((THPWrapper*)obj)->data; -} - -static PyObject * THPWrapper_pynew(PyTypeObject *type, PyObject *args, PyObject *kwargs) -{ - PyObject* self = type->tp_alloc(type, 0); - THPWrapper* wrapper = (THPWrapper*) self; - wrapper->data = nullptr; - wrapper->destructor = nullptr; - return self; -} - -static void THPWrapper_dealloc(THPWrapper* self) -{ - self->destructor(self->data); - Py_TYPE(self)->tp_free((PyObject*)self); -} - -PyTypeObject THPWrapperType = { - PyVarObject_HEAD_INIT(nullptr, 0) - "torch._C._PtrWrapper", /* tp_name */ - sizeof(THPWrapper), /* tp_basicsize */ - 0, /* tp_itemsize */ - (destructor)THPWrapper_dealloc, /* tp_dealloc */ - 0, /* tp_vectorcall_offset */ - nullptr, /* tp_getattr */ - nullptr, /* tp_setattr */ - nullptr, /* tp_reserved */ - nullptr, /* tp_repr */ - nullptr, /* tp_as_number */ - nullptr, /* tp_as_sequence */ - nullptr, /* tp_as_mapping */ - nullptr, /* tp_hash */ - nullptr, /* tp_call */ - nullptr, /* tp_str */ - nullptr, /* tp_getattro */ - nullptr, /* tp_setattro */ - nullptr, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT, /* tp_flags */ - nullptr, /* tp_doc */ - nullptr, /* tp_traverse */ - nullptr, /* tp_clear */ - nullptr, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - nullptr, /* tp_iter */ - nullptr, /* tp_iternext */ - nullptr, /* tp_methods */ - nullptr, /* tp_members */ - nullptr, /* tp_getset */ - nullptr, /* tp_base */ - nullptr, /* tp_dict */ - nullptr, /* tp_descr_get */ - nullptr, /* tp_descr_set */ - 0, /* tp_dictoffset */ - nullptr, /* tp_init */ - nullptr, /* tp_alloc */ - THPWrapper_pynew, /* tp_new */ -}; - -bool THPWrapper_init(PyObject *module) -{ - THPWrapperClass = (PyObject*)&THPWrapperType; - if (PyType_Ready(&THPWrapperType) < 0) - return false; - Py_INCREF(&THPWrapperType); - return true; -} diff --git a/torch/csrc/PtrWrapper.h b/torch/csrc/PtrWrapper.h deleted file mode 100644 index 985193c74c9b..000000000000 --- a/torch/csrc/PtrWrapper.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef THP_PTR_WRAPPER_H -#define THP_PTR_WRAPPER_H - -#include - -/** - * Python wrapper around arbitrary opaque C++ class - */ - -bool THPWrapper_init(PyObject *module); - -PyObject * THPWrapper_New(void *data, void (*destructor)(void*)); -void * THPWrapper_get(PyObject * obj); -bool THPWrapper_check(PyObject * obj); - -#endif diff --git a/torch/csrc/THP.h b/torch/csrc/THP.h index edf4621765f8..26f6c06b3d20 100644 --- a/torch/csrc/THP.h +++ b/torch/csrc/THP.h @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include diff --git a/torch/csrc/api/include/torch/cuda.h b/torch/csrc/api/include/torch/cuda.h index 5f6f2a9eb8a9..a7e063b90af9 100644 --- a/torch/csrc/api/include/torch/cuda.h +++ b/torch/csrc/api/include/torch/cuda.h @@ -23,5 +23,8 @@ void TORCH_API manual_seed(uint64_t seed); /// Sets the seed for all available GPUs. 
void TORCH_API manual_seed_all(uint64_t seed); +/// Waits for all kernels in all streams on a CUDA device to complete. +void TORCH_API synchronize(int64_t device_index = -1); + } // namespace cuda } // namespace torch diff --git a/torch/csrc/api/src/cuda.cpp b/torch/csrc/api/src/cuda.cpp index d40cd8611c42..b8f3ffa0ee0a 100644 --- a/torch/csrc/api/src/cuda.cpp +++ b/torch/csrc/api/src/cuda.cpp @@ -1,6 +1,7 @@ #include #include +#include #include @@ -49,5 +50,13 @@ void manual_seed_all(uint64_t seed) { } } +void synchronize(int64_t device_index) { + TORCH_CHECK(is_available(), "No CUDA GPUs are available"); + int64_t num_gpus = cuda::device_count(); + TORCH_CHECK(device_index == -1 || device_index < num_gpus, + "Device index out of range: ", device_index); + at::detail::getCUDAHooks().deviceSynchronize(device_index); +} + } // namespace cuda } // namespace torch diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 891d66e8f282..6558295d58cb 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -12,7 +12,9 @@ #include #include #include +#include #include +#include #include #include @@ -159,10 +161,21 @@ std::tuple _euclidean_dist_backward(const Tensor & grad, const T x2 * ratio.sum(-2, false).unsqueeze(-1) - ratio.transpose(-2, -1).matmul(x1)}; } -Tensor norm_backward(const Tensor & grad, const Tensor & self, const optional & p_, const Tensor & norm) { +Tensor norm_backward(const Tensor& grad, const Tensor& self, const optional & p_, const Tensor& norm) { + return norm_backward(grad, self, p_, norm, {}, true); +} + +Tensor norm_backward(Tensor grad, const Tensor& self, const optional & p_, Tensor norm, IntArrayRef dim, bool keepdim) { + size_t ndim = self.sizes().size(); double p = p_.value_or(2.0).toDouble(); Tensor self_scaled; Tensor scale_v; + + if (!keepdim && self.dim() != 0) { + grad = unsqueeze_multiple(grad, dim, ndim); + norm = unsqueeze_multiple(norm, dim, ndim); + } + if (p == 0.0) { return at::zeros_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); } else if (p == 1.0) { @@ -171,8 +184,13 @@ Tensor norm_backward(const Tensor & grad, const Tensor & self, const optional & p_, Tensor norm, IntArrayRef dim, bool keepdim) { - IntArrayRef sizes = self.sizes(); - if (!keepdim && self.dim() != 0) { - if (dim.size()==1) { - grad = grad.unsqueeze(dim[0]); - norm = norm.unsqueeze(dim[0]); - } else { - auto dims_to_unsqueeze = at::dim_list_to_bitset(dim, sizes.size()); - for (size_t i = 0; i < sizes.size(); i++){ - if (dims_to_unsqueeze[i]) { - grad = grad.unsqueeze(i); - norm = norm.unsqueeze(i); - } - } - } - } - return norm_backward(grad, self, p_, norm); -} - -Tensor pow_backward(Tensor grad, const Tensor & self, const Scalar & exponent_) { - auto exponent = (exponent_.isComplex()) ? exponent_.toComplexDouble() : exponent_.toDouble(); - if (exponent == 0.0) { +Tensor pow_backward(Tensor grad, const Tensor & self, const Scalar & exponent) { + if (exponent.equal(0.0)) { return at::zeros_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT); } else { - auto out = grad * (exponent * self.pow(exponent - 1)).conj(); + auto grad_lambda = [&](auto exp) { return grad * (exp * self.pow(exp - 1)).conj(); }; + Tensor out = (exponent.isComplex()) ? 
grad_lambda(exponent.toComplexDouble()) : grad_lambda(exponent.toDouble()); return handle_r_to_c(self, out); } } @@ -242,9 +241,8 @@ Tensor pow_backward_exponent(Tensor grad, const Tensor& self, const Tensor& expo } Tensor pow_backward_exponent(Tensor grad, const Scalar & base, const Tensor& exponent, Tensor result) { - auto base_ = base.isComplex() ? base.toComplexDouble() : base.toDouble(); - auto grad_lambda = [](auto a, auto b) { return (a * std::log(b)).conj(); }; - if (base_ == 0.0) { + auto grad_lambda = [](Tensor a, Scalar b) { return (a * b.log()).conj(); }; + if (base.equal(0.0)) { auto cond = [](auto exp) { if (exp.is_complex()) { return at::logical_and(at::imag(exp) == 0, at::real(exp) >= 0); @@ -254,10 +252,10 @@ Tensor pow_backward_exponent(Tensor grad, const Scalar & base, const Tensor& exp }; auto out = grad * at::where(cond(exponent), at::zeros({}, grad.options()), - grad_lambda(result, base_)); + grad_lambda(result, base)); return handle_r_to_c(exponent, out); } else { - auto out = grad * grad_lambda(result, base_); + auto out = grad * grad_lambda(result, base); return handle_r_to_c(exponent, out); } } @@ -2076,8 +2074,13 @@ Tensor symeig_backward(const std::vector &grads, cons return result.add(result.conj().transpose(-2, -1)).mul_(0.5); } -Tensor qr_backward(const std::vector &grads, const Tensor& self, - bool some, const Tensor& q, const Tensor& r){ +Tensor linalg_qr_backward(const std::vector &grads, const Tensor& self, + std::string mode, const Tensor& q, const Tensor& r){ + bool compute_q, reduced; + std::tie(compute_q, reduced) = at::native::_parse_qr_mode(mode); + TORCH_CHECK(compute_q, "linalg_qr_backward: cannot compute backward if mode='r'. " + "Please use torch.linalg.qr(..., mode='reduced')"); + auto square_deep_case_backward = [](const Tensor& grad_Q, const Tensor& grad_R, const Tensor& A, @@ -2141,7 +2144,7 @@ Tensor qr_backward(const std::vector &grads, const Te auto n = self.size(-1); TORCH_CHECK( - ((m <= n && (!some)) || some), + ((m <= n && (!reduced)) || reduced), "The derivative is not implemented when nrows > ncols and complete QR. 
"); auto grad_Q = grads[0]; @@ -2209,15 +2212,17 @@ Tensor det_backward(const Tensor & grad, const Tensor& self, const Tensor& det) return nonsingular_case_backward(grad, self, det); } } else { - auto nonzero_det_indices = at::where(det); + auto nonzero_det_indices = at::native::toListOfOptionalTensors(at::where(det)); + c10::optional first_nonzero_det_index = nonzero_det_indices[0]; - if (nonzero_det_indices[0].size(0) == det.numel()) { // all determinants are nonzero (non-singular) + if (first_nonzero_det_index->size(0) == det.numel()) { // all determinants are nonzero (non-singular) return nonsingular_case_backward(grad, self, det); } - auto zero_det_indices = at::where(det == 0); + auto zero_det_indices = at::native::toListOfOptionalTensors(at::where(det == 0)); + c10::optional first_zero_det_index = zero_det_indices[0]; - if (zero_det_indices[0].size(0) == det.numel()) { // all determinants are zero (singular) + if (first_zero_det_index->size(0) == det.numel()) { // all determinants are zero (singular) return singular_case_backward(grad, self, det); } @@ -2259,15 +2264,17 @@ Tensor logdet_backward(const Tensor & grad, const Tensor& self, const Tensor& lo return singular_case_backward(grad, self); } } else { - auto finite_logdet_indices = at::where(logdet != -INFINITY); + auto finite_logdet_indices = at::native::toListOfOptionalTensors(at::where(logdet != -INFINITY)); + c10::optional first_finite_logdet_index = finite_logdet_indices[0]; - if (finite_logdet_indices[0].size(0) == logdet.numel()) { // all log determinants are finite (non-singular) + if (first_finite_logdet_index->size(0) == logdet.numel()) { // all log determinants are finite (non-singular) return nonsingular_case_backward(grad, self); } - auto neginf_logdet_indices = at::where(logdet == -INFINITY); + auto neginf_logdet_indices = at::native::toListOfOptionalTensors(at::where(logdet == -INFINITY)); + c10::optional first_neginf_logdet_index = neginf_logdet_indices[0]; - if (neginf_logdet_indices[0].size(0) == logdet.numel()) { // all log determinants are -inf (singular) + if (first_neginf_logdet_index->size(0) == logdet.numel()) { // all log determinants are -inf (singular) return singular_case_backward(grad, self); } @@ -2311,15 +2318,17 @@ Tensor slogdet_backward(const Tensor& grad_logabsdet, return nonsingular_case_backward(grad_logabsdet, self); } } else { - auto nonzero_signdet_indices = at::where(signdet); + auto nonzero_signdet_indices = at::native::toListOfOptionalTensors(at::where(signdet)); + c10::optional first_nonzero_signdet_index = nonzero_signdet_indices[0]; - if (nonzero_signdet_indices[0].size(0) == logabsdet.numel()) { // all log determinants are finite (non-singular) + if (first_nonzero_signdet_index->size(0) == logabsdet.numel()) { // all log determinants are finite (non-singular) return nonsingular_case_backward(grad_logabsdet, self); } - auto zero_signdet_indices = at::where(signdet == 0); + auto zero_signdet_indices = at::native::toListOfOptionalTensors(at::where(signdet == 0)); + c10::optional first_zero_signdet_index = zero_signdet_indices[0]; - if (zero_signdet_indices[0].size(0) == logabsdet.numel()) { // all log determinants are -inf (singular) + if (first_zero_signdet_index->size(0) == logabsdet.numel()) { // all log determinants are -inf (singular) return singular_case_backward(grad_logabsdet, self); } @@ -2871,8 +2880,8 @@ Tensor embedding_dense_double_backward(const Tensor & grad, const Tensor & indic return gg_weight.view(size); } -Tensor index_backward(Tensor zeros_like_self, TensorList 
indices, const Tensor& grad) { - return at::_index_put_impl_(zeros_like_self, indices, grad, true, true); +Tensor index_backward(Tensor zeros_like_self, const torch::List>& indices, const Tensor& grad) { + return at::_index_put_impl_(zeros_like_self, indices, grad, true, true); } Tensor _cudnn_ctc_loss_backward(const Tensor& grad_out, const Tensor& loss, const Tensor& raw_grad, bool zero_infinity) { diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h index 73bd02960d7f..30736e13f58a 100644 --- a/torch/csrc/autograd/FunctionsManual.h +++ b/torch/csrc/autograd/FunctionsManual.h @@ -124,7 +124,7 @@ at::Tensor slogdet_backward(const at::Tensor& grad_logabsdet, const at::Tensor& at::Tensor log1p_backward(const at::Tensor& grad, const at::Tensor& self); at::Tensor sparse_constructor_values_backward(const at::Tensor& sparse_grad_out, const at::Tensor& indices, at::IntArrayRef values_shape); at::Tensor embedding_dense_double_backward(const at::Tensor & grad, const at::Tensor & indices, int64_t padding_idx); -at::Tensor index_backward(at::Tensor zeros_like_self, at::TensorList indices, const at::Tensor& grad); +at::Tensor index_backward(at::Tensor zeros_like_self, const torch::List>& indices, const at::Tensor& grad); at::Tensor _cudnn_ctc_loss_backward(const at::Tensor& grad_out, const at::Tensor& loss, const at::Tensor& raw_grad, bool zero_infinity); Tensor svd_backward(const std::vector &grads, const Tensor& self, @@ -139,8 +139,8 @@ std::tuple triangular_solve_backward( std::tuple _trilinear_backward(const Tensor& grad_out, const Tensor& i1, const Tensor& i2, const Tensor& i3, IntArrayRef expand1, IntArrayRef expand2, IntArrayRef expand3, IntArrayRef sumdim, int64_t unroll_dim, std::array grad_mask); -Tensor qr_backward(const std::vector &grads, const Tensor& self, - bool some, const Tensor& Q, const Tensor& R); +Tensor linalg_qr_backward(const std::vector &grads, const Tensor& self, + std::string mode, const Tensor& Q, const Tensor& R); Tensor eig_backward(const std::vector &grads, const Tensor& self, bool eigenvectors, const Tensor& lambda, const Tensor& v); Tensor det_backward(const Tensor & grad, const Tensor& self, const Tensor& det); diff --git a/torch/csrc/autograd/VariableTypeManual.cpp b/torch/csrc/autograd/VariableTypeManual.cpp index 0663d7f46fa8..f6c3f23cd0f7 100644 --- a/torch/csrc/autograd/VariableTypeManual.cpp +++ b/torch/csrc/autograd/VariableTypeManual.cpp @@ -66,10 +66,6 @@ Tensor unpack_opt(const Tensor & t, const char * name, int pos) { return unpack(t, name, pos); } -c10::optional unpack_opt(const c10::optional & t, const char * name, int pos) { - return t; -} - std::vector unpack(at::TensorList tl, const char *name, int pos) { std::vector ret(tl.size()); for (size_t i = 0; i < tl.size(); ++i) { @@ -94,7 +90,7 @@ void _backward( // instead of us having to unwrap it to Tensor _gradient here. Tensor _gradient = gradient.has_value() ? 
*gradient : Tensor(); std::vector input_vars(inputs.begin(), inputs.end()); - torch::autograd::backward({self}, {_gradient}, std::move(keep_graph), create_graph, input_vars); + torch::autograd::backward({self}, {_gradient}, keep_graph, create_graph, input_vars); } void set_data(Tensor & self, const Tensor & new_data) { @@ -230,7 +226,6 @@ Tensor _fw_primal(const Tensor & self, int64_t level) { // We don't have an outplace copy, so this can't be generated automatically Tensor & copy_(Tensor & self, const Tensor & src, bool non_blocking) { - jit::Value* output = nullptr; // TODO: once copy is exposed in Declarations.yaml we may be able to bind // it automatically auto& self_ = unpack(self, "self", 0); @@ -282,7 +277,7 @@ Tensor& resize_( } { at::AutoNonVariableTypeMode non_var_type_mode(true); - self_.resize_(size, std::move(optional_memory_format)); + self_.resize_(size, optional_memory_format); } if (self.fw_grad(/* level */ 0).defined()) { @@ -303,7 +298,7 @@ Tensor& resize_as_( } { at::AutoNonVariableTypeMode non_var_type_mode(true); - at::resize_as_(self_, the_template_, std::move(optional_memory_format)); + at::resize_as_(self_, the_template_, optional_memory_format); } // Handle fw grad @@ -392,14 +387,6 @@ TORCH_LIBRARY_IMPL(aten, Autograd, m) { m.impl("detach", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::detach))); m.impl("detach_", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::detach_))); m.impl("copy_", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::copy_))); - // For backward() and requires_grad_(), we need the DefaultBackend kernel, but we also need the Autograd backend - // kernel, because when called with a VariableTensorId tensor, it goes through the variable fallback kernel, - // which calls callBoxed(), which doesn't support optional tensor arguments yet and backward() has an optional - // tensor argument. - // TODO Once callBoxed() supports optional tensor arguments, we can enable `use_c10_dispatcher: full` for backward() - // and requires_grad_(), then remove the backend Autograd kernel here, only leaving the Math kernel. 
- m.impl("_backward", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::_backward))); - m.impl("requires_grad_", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::requires_grad_))); m.impl("_fw_primal", torch::dispatch(DispatchKey::Autograd, TORCH_FN(VariableType::_fw_primal))); } diff --git a/torch/csrc/autograd/VariableTypeUtils.h b/torch/csrc/autograd/VariableTypeUtils.h index af02de68fc27..509a12e01140 100644 --- a/torch/csrc/autograd/VariableTypeUtils.h +++ b/torch/csrc/autograd/VariableTypeUtils.h @@ -266,12 +266,31 @@ inline void check_no_requires_grad(TensorList tensors, const char* name) { } } +inline void check_no_requires_grad(const c10::List>& tensors, const char* name) { + for (c10::optional tensor : tensors) { + if (tensor.has_value()) { + check_no_requires_grad(*tensor, name); + } + } +} + // Assumed that saved tensor lists are never inplace outputs inline std::vector make_saved_variable_list(TensorList tensors) { return fmap(tensors, [](const Tensor& tensor) -> SavedVariable { return SavedVariable{tensor, false /* is output */}; }); } +// Assumed that saved tensor lists are never inplace outputs +inline std::vector make_saved_variable_list(const c10::List>& tensors) { + return fmap(tensors, [](const c10::optional& tensor) -> SavedVariable { + if (tensor.has_value()) { + return SavedVariable{*tensor, false /* is output */}; + } else { + return SavedVariable{Tensor(), false /* is output */}; + } + }); +} + inline std::vector> to_args_sizes(TensorList tensors) { std::vector> args_sizes(tensors.size()); for (size_t i = 0; i < tensors.size(); ++i) { diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index dbfd129c87bb..f906efb187ef 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -853,7 +853,7 @@ auto Engine::execute(const edge_list& roots, // A frech first time Engine::execute call should start on the CPU device, initialize // a new thread local ready queue on CPU or reuse the existing one (if there is one // allocated already, i.e. consecutive backward calls, re-entrant backward calls), - // then memorize the local_ready_queue in GraphTask + // then memoize the local_ready_queue in GraphTask init_local_ready_queue(); bool not_reentrant_backward_call = worker_device == NO_DEVICE; diff --git a/torch/csrc/autograd/engine.h b/torch/csrc/autograd/engine.h index b3c28b233014..7892f47521c5 100644 --- a/torch/csrc/autograd/engine.h +++ b/torch/csrc/autograd/engine.h @@ -144,7 +144,7 @@ struct GraphTask: std::enable_shared_from_this { // CPU threads are dedicated to processing CPU work for the backward they invoked. // So any given graph task maintains its own cpu_ready_queue_ where you should send - // work for it to be done. We memorize the cpu_ready_queue_ per GraphTask so that + // work for it to be done. We memoize the cpu_ready_queue_ per GraphTask so that // we know which ready queue we should push to if we are on device thread (i.e. GPU) // and but next NodeTask should be run on CPU. 
std::shared_ptr cpu_ready_queue_; diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index ca419522dff8..d86073a7af79 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -52,7 +52,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { .value("CUDA", ActivityType::CUDA); py::class_(m, "ProfilerConfig") - .def(py::init()); + .def(py::init()); py::class_(m, "ProfilerEvent") .def("kind", &LegacyEvent::kindStr) diff --git a/torch/csrc/autograd/profiler_legacy.cpp b/torch/csrc/autograd/profiler_legacy.cpp index 3b1d254e985b..85272677a06b 100644 --- a/torch/csrc/autograd/profiler_legacy.cpp +++ b/torch/csrc/autograd/profiler_legacy.cpp @@ -226,8 +226,10 @@ void ProfilerThreadLocalState::pushRange( evt.setSequenceNr(fn.seqNr()); evt.setFwdThreadId(fn.forwardThreadId()); evt.setScope((uint8_t)fn.scope()); - evt.setExtraArgs(saveExtraArgs(fn)); - evt.setFlops(computeFlops(std::string(fn.name().str()), evt.extraArgs())); + if (config_.with_flops) { + evt.setExtraArgs(saveExtraArgs(fn)); + evt.setFlops(computeFlops(std::string(fn.name().str()), evt.extraArgs())); + } #ifndef C10_MOBILE // backward nodes source range corresponds to the forward node // TODO: consider using C++ stack trace diff --git a/torch/csrc/autograd/profiler_legacy.h b/torch/csrc/autograd/profiler_legacy.h index 3e07c8cb541b..23169cd33450 100644 --- a/torch/csrc/autograd/profiler_legacy.h +++ b/torch/csrc/autograd/profiler_legacy.h @@ -387,16 +387,19 @@ struct TORCH_API ProfilerConfig { ProfilerState state, bool report_input_shapes = false, bool profile_memory = false, - bool with_stack = false) + bool with_stack = false, + bool with_flops = false) : state(state), report_input_shapes(report_input_shapes), profile_memory(profile_memory), - with_stack(with_stack) {} + with_stack(with_stack), + with_flops(with_flops) {} ~ProfilerConfig() = default; ProfilerState state; bool report_input_shapes; bool profile_memory; bool with_stack; + bool with_flops; // Returns IValues corresponding to ProfilerConfig struct, to be used for // serialization. 
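Editorial aside (not part of the patch): the profiler hunks above add a `with_flops` flag to `ProfilerConfig` and gate the extra-argument capture and FLOP computation in `pushRange` on that flag. Below is a minimal caller-side sketch of the extended constructor; the `ProfilerState::CPU` value and the commented-out `enableProfilerLegacy`/`disableProfilerLegacy` entry points are assumptions taken from the existing legacy profiler API rather than from this diff.

#include <torch/csrc/autograd/profiler_legacy.h>

void profile_with_flops_example() {
  using namespace torch::autograd::profiler;
  // The first four arguments are unchanged; `with_flops` is the new fifth flag.
  ProfilerConfig cfg(
      ProfilerState::CPU,           // assumed existing enum value, not shown in this diff
      /*report_input_shapes=*/true,
      /*profile_memory=*/false,
      /*with_stack=*/false,
      /*with_flops=*/true);         // new: saveExtraArgs/computeFlops run only when this is set
  // enableProfilerLegacy(cfg);               // assumed entry point from the legacy profiler API
  // ... run the workload to be profiled ...
  // auto events = disableProfilerLegacy();   // assumed counterpart
}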
diff --git a/torch/csrc/autograd/python_engine.cpp index eee29481bea5..a9c7d709466e 100644 --- a/torch/csrc/autograd/python_engine.cpp +++ b/torch/csrc/autograd/python_engine.cpp @@ -1,7 +1,6 @@ #include #include -#include #include #include #include diff --git a/torch/csrc/autograd/python_variable_indexing.cpp index 4b38d924c91b..285161a49ef2 100644 --- a/torch/csrc/autograd/python_variable_indexing.cpp +++ b/torch/csrc/autograd/python_variable_indexing.cpp @@ -351,6 +351,10 @@ int THPVariable_setitem(PyObject* self, PyObject* index, PyObject* py_value) { } auto& self_ = reinterpret_cast(self)->cdata; + if (self_.is_sparse()) + { + throw TypeError("Cannot assign to a sparse tensor"); + } OptionalDeviceGuard device_guard(device_of(self_)); at::Device self_device = self_.device(); Variable value; diff --git a/torch/csrc/distributed/c10d/init.cpp index b31d44a1d295..0d4250eddd13 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -345,6 +345,7 @@ They are used in specifying strategies for reduction collectives, e.g., py::class_<::c10d::BarrierOptions>(module, "BarrierOptions") .def(py::init<>()) + .def_readwrite("device_ids", &::c10d::BarrierOptions::device_ids) .def_readwrite("timeout", &::c10d::BarrierOptions::timeout); py::class_<::c10d::AllToAllOptions>(module, "AllToAllOptions") @@ -1259,11 +1260,25 @@ static const auto TCPStoreTorchBind = .def(torch::init([](const std::string& host_name, int64_t port, int64_t world_size, - bool is_master) { + bool is_master, + int64_t timeout) { + auto timeout_milliseconds = std::chrono::milliseconds(timeout); return c10::make_intrusive<::c10d::TCPStore>( - host_name, port, world_size, is_master); + host_name, port, world_size, is_master, timeout_milliseconds); })); +// TODO: This should really take a Store as the constructor argument instead of +// a TCPStore, but the fact that TorchScript does not support polymorphism +// forced us to cast in C++ instead of automatic casting +static const auto PrefixStoreTorchBind = + torch::class_<::c10d::PrefixStore>("dist_c10d", "PrefixStore") + .def(torch::init([](const std::string& prefix, + const c10::intrusive_ptr<::c10d::TCPStore>& store) { + return c10::make_intrusive<::c10d::PrefixStore>( + prefix, store); + })); + + // Torchbind the ProcessGroup to make it available in TorchScript static const auto ProcessGroupWorkTorchBind = torch::class_<::c10d::ProcessGroup::Work>("dist_c10d", "Work") @@ -1623,7 +1638,14 @@ static const auto ProcessGroupNCCLTorchBind = outputSplitSizes, inputSplitSizes, ::c10d::AllToAllOptions()); - }); + + }) + .def("size", [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self) { + return (int64_t) self->getSize(); + }) + .def("rank", [](const c10::intrusive_ptr<::c10d::ProcessGroupNCCL>& self) { + return (int64_t) self->getRank(); + }); #endif static const auto DistributedC10dFrontendTorchBind = diff --git a/torch/csrc/distributed/rpc/init.cpp index 9b28ecbdd4bb..6204f6e343ed 100644 --- a/torch/csrc/distributed/rpc/init.cpp +++ b/torch/csrc/distributed/rpc/init.cpp @@ -245,7 +245,7 @@ PyObject* rpc_init(PyObject* _unused, PyObject* noargs) { to the local node and returns it. If the current node is the owner, returns a reference to the local value. - Arguments: + Args: timeout (float, optional): Timeout for ``to_here``.
If the call does not complete within this timeframe, an exception indicating so will be raised. If this @@ -437,7 +437,7 @@ PyObject* rpc_init(PyObject* _unused, PyObject* noargs) { the node calling this API has to be the owner of the RRef. The value of the RRef is expected to be a scalar Tensor. - Arguments: + Args: dist_autograd_ctx_id (int, optional): The distributed autograd context id for which we should retrieve the gradients (default: -1). @@ -464,7 +464,7 @@ PyObject* rpc_init(PyObject* _unused, PyObject* noargs) { The backend options class for ``ProcessGroupAgent``, which is derived from ``RpcBackendOptions``. - Arguments: + Args: num_send_recv_threads (int, optional): The number of threads in the thread-pool used by ``ProcessGroupAgent`` (default: 4). rpc_timeout (float, optional): The default timeout, in seconds, @@ -782,7 +782,7 @@ PyObject* rpc_init(PyObject* _unused, PyObject* noargs) { Set whether GIL wait times should be enabled or not. This incurs a slight overhead cost. Default is disabled for performance reasons. - Arguments: + Args: flag (bool): True to set GIL profiling, False to disable. )"); @@ -801,7 +801,7 @@ PyObject* rpc_init(PyObject* _unused, PyObject* noargs) { :meth:`~torch.distributed.rpc.rpc_sync` and :meth:`~torch.distributed.rpc.rpc_async`. - Arguments: + Args: rpcTimeoutSeconds (float): Timeout value in seconds. )"); diff --git a/torch/csrc/jit/OVERVIEW.md b/torch/csrc/jit/OVERVIEW.md index ee267db74403..262878f6b6ef 100644 --- a/torch/csrc/jit/OVERVIEW.md +++ b/torch/csrc/jit/OVERVIEW.md @@ -180,7 +180,7 @@ For Nodes representing built-in Operators, the method `Node::schema` can also lo All of the strings correspond to different `FunctionSchema` objects. A `Node` can be queried for its schema using the `schema()` method (it will check the argument types, and will try to match one of the options for its `kind()`). -Note that the chosen overload is not shown in any way in the textual output. If you're unsure which function does a node resolve to, you might need to check the type annotations of its input values. +Note that the chosen overload is not shown in any way in the textual output. If you're unsure which function a node resolves to, you might need to check the type annotations of its input values. Each node also has a set of attributes which are named integers, strings, floats, Tensors, and subgraphs, or lists of these types. These are used by special primitive operators to encode additional data in the Node. For instance `prim::Constant` defines a compile-time constant value. For Tensor constants, it will have a single Tensor attribute with the name `attr::value` which contains the value of the constant. @@ -204,7 +204,7 @@ Iterators for the `nodes()` list are invalided when the current Node they point Block also contain a list of input and output values. The meaning of these values depends on where the block is used. For the Graph's top-level block, these are inputs and outputs to the Graph, and line up with the FunctionSchema associated with a Method. -**Control-flow** is represented with using sub-blocks rather than a control-flow graph representation. A `prim::If` has one block for the true branch and one block for the else.A `prim:Loop` has a block for the loop body (there is no condition block, instead the end of the loop body computes whether to re-enter the loop body). This representation ensures we have structured control-flow. This limitation makes a lot of optimizations easier and is true for the vast majority of networks. 
A Node can lookup what Block it is in, and a Block and can look up its parent (either the Node that has it as a subblock, or `nullptr` for the main Block). +**Control-flow** is represented using sub-blocks rather than a control-flow graph representation. A `prim::If` has one block for the true branch and one block for the else. A `prim::Loop` has a block for the loop body (there is no condition block; instead, the end of the loop body computes whether to re-enter the loop body). This representation ensures we have structured control-flow. This limitation makes a lot of optimizations easier and is true for the vast majority of networks. A Node can look up what Block it is in, and a Block can look up its parent (either the Node that has it as a subblock, or `nullptr` for the main Block). ### If ### For if-statements (`prim::If`) the Blocks have no inputs, and the outputs are the new values of variables in the outer block whose values were altered in an if-statement. @@ -1209,7 +1209,7 @@ a = torch.rand(2, 3) b = a # At this point, `a` and `b` share their storage. c = b[0] -# `c` is shares storage with `a` and `b`, but only sees a slice of the allocated memory. +# `c` shares storage with `a` and `b`, but only sees a slice of the allocated memory. ``` Some operators will *mutate* one or more of their operands in-place. These are typically denoted with a trailing underscore, or by taking an `out` argument as input: @@ -1293,7 +1293,7 @@ So to determine whether `a` and `b` may alias, we traverse the `AliasTracker` D ### Writing optimization passes with `AliasDb` `AliasDb` provides a high-level interface to help people write mutability-safe optimization passes. -In particular, `moveAfterTopologicallyValid()` (and it's `moveBefore` variant) will reorder nodes in a way that preserves data dependencies and avoids any data hazards. The rules for this are that all mutable *writes* to a given memory location must occur in the same order (avoid WAW hazards), and that no reads can be reordered before or after any write (WAR, RAW hazards). +In particular, `moveAfterTopologicallyValid()` (and its `moveBefore` variant) will reorder nodes in a way that preserves data dependencies and avoids any data hazards. The rules for this are that all mutable *writes* to a given memory location must occur in the same order (avoid WAW hazards), and that no reads can be reordered before or after any write (WAR, RAW hazards). However, reordering of reads across writes *is allowed* if we can prove that the read cannot alias the thing being written. This happens whenever we have tensors that come from functions that produce fresh results (common) inside of the function. It also happens whenever the creation of the mutable tensor is seen in the function (so it gets assigned a fresh variable), and all of its writes occur in that function. diff --git a/torch/csrc/jit/api/compilation_unit.h index aa8b32ce58d3..96a7f0a463bc 100644 --- a/torch/csrc/jit/api/compilation_unit.h +++ b/torch/csrc/jit/api/compilation_unit.h @@ -40,9 +40,9 @@ struct Self { }; // A CompilationUnit is a list of named Functions -// with helper methods to iterate the list, or invoke the function. -// Classes have a CompilationUnit holding the class methods -// and Modules also have a CompilationUnit holding the Functions that +// with helper methods to iterate the list or invoke the function.
+// Classes have a CompilationUnit holding the class methods, +// and Modules have a CompilationUnit holding the Functions that // are used to implement their Methods struct TORCH_API CompilationUnit { @@ -85,7 +85,7 @@ struct TORCH_API CompilationUnit { } // for historic reasons, these are defined in ir_emitter.cpp - // Returns the list of Function's just defined. + // Returns the list of Functions just defined. std::vector define( const c10::optional& prefix, const std::vector& properties, @@ -100,7 +100,7 @@ struct TORCH_API CompilationUnit { bool shouldMangle = false); // same as above but parse the definitions from source - // Returns the list of Function's just defined. + // Returns the list of Functions just defined. std::vector define( // prefix namespace to put all the defined functions into const c10::optional& prefix, diff --git a/torch/csrc/jit/backends/backend_detail.h b/torch/csrc/jit/backends/backend_detail.h index 2d19f2ed8950..00f0f2f9eb44 100644 --- a/torch/csrc/jit/backends/backend_detail.h +++ b/torch/csrc/jit/backends/backend_detail.h @@ -1,5 +1,6 @@ #pragma once +#include #include namespace torch { diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp index f1a0a634727a..5bddc510fe56 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp @@ -209,7 +209,7 @@ InputsIdLookup::IdLookupReturn InputsIdLookup::lookupId( std::stringstream encoded_inputs; for (const auto& input : inputs) { if (input.isTensor()) { - auto input_tensor = input.toTensor(); + auto& input_tensor = input.toTensor(); encoded_inputs << ";"; auto sep = ""; diff --git a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp index 4e76dc23e55d..4f4aa0d1536b 100644 --- a/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp +++ b/torch/csrc/jit/codegen/fuser/cpu/fused_kernel.cpp @@ -45,11 +45,17 @@ constexpr int so_suffix_len = 3; constexpr int cpp_suffix_len = 4; #endif +intptr_t run(const std::string& cmd); + static bool programExists(const std::string& program) { TemplateEnv env; env.s("program", program); std::string cmd = format(check_exists_string, env); +#ifdef _MSC_VER + return (run(cmd.c_str()) == 0); +#else return (system(cmd.c_str()) == 0); +#endif } #ifdef _MSC_VER diff --git a/torch/csrc/jit/cuda/cuda.h b/torch/csrc/jit/cuda/cuda.h new file mode 100644 index 000000000000..fa92ce22d6e4 --- /dev/null +++ b/torch/csrc/jit/cuda/cuda.h @@ -0,0 +1,179 @@ +#include +#include +#include +#include + +namespace torch { +namespace jit { + +class CUDAEvent; +// This class is a wrapper around c10::cuda::CUDAStream. +// It is needed because TorchBind does not support all of the argument types +// for c10::cuda::CUDAStream. For more details, please refer to +// c10/cuda/CUDAStream.h. +class CUDAStream final : public CustomClassHolder { + public: + CUDAStream(int64_t device = -1, int64_t priority = 0) { + constexpr int64_t PRIORITY_INDEX = 0; + stream_ = std::make_unique( + c10::cuda::getStreamFromPool(priority < PRIORITY_INDEX, device)); + } + + CUDAStream(c10::cuda::CUDAStream s) { + stream_ = std::make_unique(s); + } + + bool query() { + return stream_->query(); + } + + c10::intrusive_ptr recordEvent( + c10::intrusive_ptr event); + + void synchronize() { + stream_->synchronize(); + } + + void waitEvent(c10::intrusive_ptr event); + + void waitStream(c10::intrusive_ptr stream); + + /// Get the CUDA device index that this stream is associated with. 
+ int64_t device_index() const { + return stream_->device_index(); + } + + /// Get the full Device that this stream is associated with. The Device + /// is guaranteed to be a CUDA device. + c10::Device device() const { + return stream_->device(); + } + + /// Return the stream ID corresponding to this particular stream. + int64_t id() const { + return stream_->id(); + } + + /// Pack a CUDAStream to uint64_t representation. + /// The CUDAStream can be unpacked using unpack(). The format of + /// the uint64_t is unspecified and may be changed. + int64_t pack() const { + return stream_->pack(); + } + + private: + std::unique_ptr stream_; + friend class CUDAEvent; +}; + +// This class is a wrapper around at::cuda::CUDAStream. +// It is needed because TorchBind does not support all of the argument types +// for at::cuda::CUDAEvent. For more details, please refer to +// aten/src/ATen/cuda/CUDAEvent.h. +class CUDAEvent final : public CustomClassHolder { + public: + CUDAEvent( + bool enable_timing = false, + bool blocking = false, + bool interprocess = false) { + int flags = cudaEventDisableTiming; + if (enable_timing) { + flags = cudaEventDefault; + } + if (blocking) { + flags |= cudaEventBlockingSync; + } + if (interprocess) { + TORCH_CHECK(!enable_timing); + flags |= cudaEventInterprocess; + } + + event_ = std::make_unique(flags); + } + + double elapsedTime(c10::intrusive_ptr end) { + return event_->elapsed_time(*end->event_); + } + + std::string ipcHandle() { + cudaIpcEventHandle_t handle; + event_->ipc_handle(&handle); + std::string str_handle((const char*)&handle, sizeof(handle)); + return str_handle; + } + + bool query() { + return event_->query(); + } + + void record(c10::intrusive_ptr stream); + + void synchronize() { + event_->synchronize(); + } + void wait(c10::intrusive_ptr stream); + + private: + void recordInternal(CUDAStream* stream); + std::unique_ptr event_; + + friend class CUDAStream; +}; + +c10::intrusive_ptr CUDAStream::recordEvent( + c10::intrusive_ptr event) { + if (!event) { + event = c10::make_intrusive(); + } + + event->recordInternal(this); + return event; +} + +void CUDAStream::waitEvent(c10::intrusive_ptr event) { + event->event_->block(*stream_); +} + +void CUDAStream::waitStream(c10::intrusive_ptr stream) { + auto ev = c10::make_intrusive(); + stream->recordEvent(ev); + waitEvent(ev); +} + +void CUDAEvent::record(c10::intrusive_ptr stream) { + event_->record(*stream->stream_); +} + +void CUDAEvent::recordInternal(CUDAStream* stream) { + event_->record(*stream->stream_); +} + +void CUDAEvent::wait(c10::intrusive_ptr stream) { + event_->block(*stream->stream_); +} + +TORCH_LIBRARY(cuda, m) { + auto stream_class = m.class_("Stream").def( + torch::init()); + auto event_class = m.class_("Event").def( + torch::init()); + + stream_class.def("query", &CUDAStream::query) + .def("record_event", &CUDAStream::recordEvent) + .def("synchronize", &CUDAStream::synchronize) + .def("wait_event", &CUDAStream::waitEvent) + .def("wait_stream", &CUDAStream::waitStream) + .def("device_index", &CUDAStream::device_index) + .def("device", &CUDAStream::device) + .def("pack", &CUDAStream::pack) + .def("id", &CUDAStream::id); + + event_class.def("elapsed_time", &CUDAEvent::elapsedTime) + .def("query", &CUDAEvent::query) + .def("record", &CUDAEvent::record) + .def("synchronize", &CUDAEvent::synchronize) + .def("wait", &CUDAEvent::wait); +}; + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/docs/serialization.md b/torch/csrc/jit/docs/serialization.md index 
9f40618e0e4d..2fc3c9abb429 100644 --- a/torch/csrc/jit/docs/serialization.md +++ b/torch/csrc/jit/docs/serialization.md @@ -112,7 +112,7 @@ serialized as well. `PythonPrint` works by walking a `Graph` (the IR representation of either a `ClassType`'s method or raw `Function`) and emitting Python code that corresponds to it. The rules for emitting Python code are mostly -straightforward uninteresting. There are some extra pieces of information +straightforward and uninteresting. There are some extra pieces of information that `PythonPrint` tracks, however: **Class dependencies**. While walking the graph, `PythonPrint` keeps track of diff --git a/torch/csrc/jit/frontend/builtin_functions.cpp b/torch/csrc/jit/frontend/builtin_functions.cpp index b011f3505496..22bd5b695388 100644 --- a/torch/csrc/jit/frontend/builtin_functions.cpp +++ b/torch/csrc/jit/frontend/builtin_functions.cpp @@ -72,6 +72,8 @@ const auto aten_ops_additional = R"SCRIPT( def _assert(condition : Tensor, message : str): assert bool(condition), message +def __contains__(self: str, key: str): + return self.find(key, 0, len(self)) != -1 )SCRIPT"; // Implementations of historic symbol behaviors are defined here diff --git a/torch/csrc/jit/frontend/convert_to_ssa.cpp b/torch/csrc/jit/frontend/convert_to_ssa.cpp index df1a922248b8..9d97e1067e74 100644 --- a/torch/csrc/jit/frontend/convert_to_ssa.cpp +++ b/torch/csrc/jit/frontend/convert_to_ssa.cpp @@ -183,8 +183,8 @@ struct ControlFlowLoadStores { std::shared_ptr environment_stack = nullptr; }; -// Given a graph where outputs have been added to control flow nodes, and -// loads and stores are represented in the graph, erases the Loads & Stores. +// Given a graph where 1) outputs have been added to control flow nodes and +// 2) loads and stores are represented in the graph, erase the Loads & Stores. struct EraseLoadStores { void eraseBlockLoadStores(Block* block) { pushFrame(block); diff --git a/torch/csrc/jit/frontend/exit_transforms.cpp b/torch/csrc/jit/frontend/exit_transforms.cpp index 28fc95ac0d73..60e3bb4bbfbb 100644 --- a/torch/csrc/jit/frontend/exit_transforms.cpp +++ b/torch/csrc/jit/frontend/exit_transforms.cpp @@ -273,8 +273,8 @@ struct ExitTransformer { return constructWontExitPair(); } - // for the block that is not exitting, its' exit values will not get - // used so we create uninitialized values of the same type as the other + // The exit values of the block that is not exiting will not get + // used, so we create uninitialized values of the same type as the other // block. if (then_status == ExitStatus::WONT || then_status == ExitStatus::THROWS) { std::vector exit_vals = diff --git a/torch/csrc/jit/frontend/ir_emitter.cpp b/torch/csrc/jit/frontend/ir_emitter.cpp index 6db46e0dcd65..f8b6202f4b54 100644 --- a/torch/csrc/jit/frontend/ir_emitter.cpp +++ b/torch/csrc/jit/frontend/ir_emitter.cpp @@ -1128,8 +1128,7 @@ struct to_ir { // propagate further in all loaded models. The handling of // unwrap_optional will fail in these cases since export did // not expect that the input would be none and an unannotated None. - // cannot be passed to unwrapoptional To enable this, - // we need to (1) implement a real casting operator + // To enable this, we need to (1) implement a real casting operator // annotated(T, X) that stays in the graph and does the cast // and (2) only enable this OPTIONAL_NONE when loading newer // graphs because it is incompatible with older graphs. @@ -1282,7 +1281,7 @@ struct to_ir { type_set = true; } - // comprehension introduces it's own scope. 
no variable assigned + // comprehension introduces its own scope. no variable assigned // leaks into the rest of the graph Node* n = graph->insertNode(create(prim::ComprehensionScope, lc.range(), 0)); diff --git a/torch/csrc/jit/frontend/script_type_parser.cpp b/torch/csrc/jit/frontend/script_type_parser.cpp index 8b1aa58b5aff..f4c1fa2c920d 100644 --- a/torch/csrc/jit/frontend/script_type_parser.cpp +++ b/torch/csrc/jit/frontend/script_type_parser.cpp @@ -211,6 +211,13 @@ TypePtr ScriptTypeParser::parseTypeFromExprImpl(const Expr& expr) const { } } + // Check if the type is a custom class. This is done by checking + // if type_name starts with "torch.classes." + if (type_name.find("torch.classes.") == 0) { + auto custom_class_type = getCustomClass("__torch__." + type_name); + return custom_class_type; + } + throw ErrorReport(expr) << "Unknown type name '" << type_name << "'"; } else if (auto name = parseBaseTypeName(expr)) { auto itr = string_to_type_lut().find(*name); diff --git a/torch/csrc/jit/frontend/tracer.cpp b/torch/csrc/jit/frontend/tracer.cpp index 72ccd77f2220..0c88371399de 100644 --- a/torch/csrc/jit/frontend/tracer.cpp +++ b/torch/csrc/jit/frontend/tracer.cpp @@ -103,6 +103,9 @@ void TracingState::delValue(const IValue& var) { Value* getValueTrace(const IValue& var) { return getTracingState()->getValue(var); } +Value* getOptTensorValueTrace(const c10::optional& var) { + return getValueTrace(IValue(var)); +} Value* TracingState::getValue(const IValue& var) { // allow tracing of tuples passed to List[Tensor] or Tuple[Tensor...] // arguments @@ -134,7 +137,7 @@ Value* TracingState::getValue(const IValue& var) { return graph->insertNode(dict_node)->output(); } if (var.isTensor()) { - auto ten = var.toTensor(); + auto& ten = var.toTensor(); if (!ten.defined()) { Node* n = graph->createNone(); return graph->insertNode(n)->output(); @@ -234,7 +237,7 @@ bool TracingState::hasValue(const IValue& var) const { Value* TracingState::getOutput(const IValue& iv, size_t i) { bool tracing_mode_strict = getTracingState()->strict; if (iv.isTensor()) { - at::Tensor var = iv.toTensor(); + const at::Tensor& var = iv.toTensor(); if (!var.defined()) { Node* n = graph->createNone(); return graph->insertNode(n)->output(); @@ -503,7 +506,7 @@ void setValueTrace(const IValue& v, Value* value) { } void TracingState::setValue(const IValue& v, Value* value) { if (v.isTensor()) { - auto var = v.toTensor(); + auto& var = v.toTensor(); AT_ASSERT(var.defined()); env_stack.back()[v] = value; } else if (v.isTensorList()) { @@ -686,6 +689,16 @@ void addInputs( } n->addInput(list_node->output()); } +TORCH_API void addInputs( + Node* n, + const char* name, + const List>& value) { + Graph* g = n->owningGraph(); + Node* list_node = nullptr; + list_node = g->insertNode(g->createList( + OptionalType::ofTensor(), fmap(value, getOptTensorValueTrace))); + n->addInput(list_node->output()); +} void addInputs( Node* n, diff --git a/torch/csrc/jit/frontend/tracer.h b/torch/csrc/jit/frontend/tracer.h index 61d79cb3efd2..f5cbd821bda4 100644 --- a/torch/csrc/jit/frontend/tracer.h +++ b/torch/csrc/jit/frontend/tracer.h @@ -255,6 +255,10 @@ TORCH_API void addInputs( const char* name, ArrayRef value, bool allow_undefined = false); +TORCH_API void addInputs( + Node* n, + const char* name, + const List>& value); TORCH_API void addInputs( Node* n, const char* name, diff --git a/torch/csrc/jit/ir/alias_analysis.cpp b/torch/csrc/jit/ir/alias_analysis.cpp index 000bea53e0fc..1ca0f48f9e17 100644 --- a/torch/csrc/jit/ir/alias_analysis.cpp 
+++ b/torch/csrc/jit/ir/alias_analysis.cpp @@ -572,7 +572,8 @@ void AliasDb::analyzeImpl(Node* node) { !aliasAnalysisHasSpecialCaseFor(node->kind()), "Special cases should be handled already if we're here."); - if (node->kind().is_aten() || node->kind().is_prim()) { + if (node->kind().is_aten() || node->kind().is_prim() || + node->kind().is_cuda()) { // TODO There is nothing in the system that relies on aten:: and prim:: // ops using AliasAnalysisKind::FROM_SCHEMA or // AliasAnalysisKind::INTERNAL_SPECIAL_CASE, but this is the intended @@ -994,7 +995,7 @@ void AliasDb::makePointerTo(const Value* from, const Value* to) { // the contained types of immutable type containers (optional, tuple, future) // are unified, so these types can be mutable or immutable // and point to a type which is mutable or immutable. - // Any is mutable but can point to a immutable type through refinement + // Any is mutable but can point to an immutable type through refinement if (isMutableTypeInternal(from) != isMutableTypeInternal(to)) { bool expected_kind = false; for (auto kind : {from->type()->kind(), to->type()->kind()}) { diff --git a/torch/csrc/jit/ir/ir.cpp b/torch/csrc/jit/ir/ir.cpp index 65b410d82069..eb75928e5952 100644 --- a/torch/csrc/jit/ir/ir.cpp +++ b/torch/csrc/jit/ir/ir.cpp @@ -1079,6 +1079,11 @@ bool Node::hasSideEffects() const { case prim::rpc_sync: // It represents RPC message sent. case prim::rpc_remote: // It represents RPC message sent. case aten::wait: // It can represent RPC message received. +#ifndef __HIP_PLATFORM_HCC__ + case cuda::set_stream: + case cuda::_set_device: + case cuda::_current_device: +#endif case prim::Enter: case prim::Exit: return true; @@ -1094,7 +1099,7 @@ bool Node::hasSideEffects() const { return false; } - if (kind_.is_prim() || kind_.is_aten()) { + if (kind_.is_prim() || kind_.is_aten() || kind_.is_cuda()) { // TODO There is nothing in the system that relies on aten:: and prim:: // ops using AliasAnalysisKind::FROM_SCHEMA, // AliasAnalysisKind::INTERNAL_SPECIAL_CASE, or diff --git a/torch/csrc/jit/ir/ir.h b/torch/csrc/jit/ir/ir.h index 7587451d9fd4..02867b8639cd 100644 --- a/torch/csrc/jit/ir/ir.h +++ b/torch/csrc/jit/ir/ir.h @@ -72,6 +72,11 @@ using namespace ::c10::attr; namespace aten { using namespace ::c10::aten; } +namespace cuda { +#ifndef __HIP_PLATFORM_HCC__ +using namespace ::c10::cuda; +#endif +} // namespace cuda struct Function; struct MatchedSchema; @@ -1171,7 +1176,7 @@ struct Graph { const std::string& cconv, pyobj_list&& scalar_args); // clone n, making a new node in _this_ graph. - // use node_map to translate inputs of n to inputs of the cloned node + // use value_map to translate inputs of n to inputs of the cloned node // if copy_blocks is false, it will not recursively clone the nested blocks // this node contains. TORCH_API Node* createClone( @@ -1267,7 +1272,7 @@ struct Graph { /** \brief An utility class for setting temporary insertion points. * * When an object of this class is created, it stores the current insertion - * point, sets the new one, and restores the original insertion point when the + * point, sets the new one, and restores the original insertion point when the * object is destroyed. */ struct WithInsertPoint { diff --git a/torch/csrc/jit/ir/subgraph_matcher.h b/torch/csrc/jit/ir/subgraph_matcher.h index a6aa46b1bdcc..cd8d733da755 100644 --- a/torch/csrc/jit/ir/subgraph_matcher.h +++ b/torch/csrc/jit/ir/subgraph_matcher.h @@ -12,9 +12,9 @@ namespace jit { * \brief A structure describing a match of a pattern in a graph. 
* * The structure contains an anchor node, from which the match was found, and - * match-maps for nodes and values. A match-map specifies correspondance between - * nodes in the pattern graph (match-map keys) with nodes in the actual graph - * (match-map values). We keep such maps for both nodes and values. + * match-maps for nodes and values. A match-map specifies the correspondence + * between nodes in the pattern graph (match-map keys) and nodes in the actual + * graph (match-map values). We keep such maps for both nodes and values. */ struct Match { Node* anchor; @@ -33,34 +33,34 @@ struct Match { * - Matched subgraphs do not span across different blocks. * - No uses outside the match are allowed, except for Param and Return nodes. * Basically, we're matching hammocks, not arbitrary subgraphs. - * - Pattern graph must return only one value (i.e. it must have a single + * - The pattern graph must return only one value (i.e. it must have a single * node leading to return). * - Nodes that are not used in computation of the return value in the pattern * graph are ignored during matching (IOW, we're essentially performing DCE on * the pattern). * - Pattern graph nodes cannot alias. TODO: the check not implemented yet. - * - Aliasing nodes in the graph can not consitute a match (i.e. in all found - * matches no nodes in the subgraph alias with each other). TODO: the check not - * implemented yet. - * - The matcher will not mutate either the pattern graph or the matched graph, - * but the latter is taken as non-const so that Match may contain non-const + * - Aliasing nodes in the graph cannot constitute a match (i.e. through all + * found matches, no nodes in the subgraph alias with each other). TODO: check + * not implemented yet. + * - The matcher will not mutate either the pattern graph or the matched graph. + * The matched graph is taken as non-const so that Match may contain non-const * pointers. This enables clients of this API to use Match to drive mutations. * * Note [Multi-output Patterns] * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * Subgraph matcher provides a limited support for multi-output patterns. With a - * single output pattern it a single scan through the graph is sufficient to - * find all the matcher: given a starting, an anchor, node we deterministically - * could check whether a pattern matches a subgraph corresponding to this anchor - * node. For a general case of multi-output patterns we would have N anchors, - * which would result in M^N comparisons we need to perform (M is the size of - * the graph), which is computationally prohibitive. + * Subgraph matcher provides limited support for multi-output patterns. With a + * single output pattern, a single scan through the graph is sufficient to + * find all the matches: given a starting node (an "anchor"), we can + * deterministically check whether a pattern matches a subgraph corresponding to + * this anchor node. For a general case of multi-output patterns, we would have + * N anchors, which would result in M^N comparisons (M is the size of the + * graph). Clearly this is computationally prohibitive. + * + * To overcome this, we impose some constraints on the multi-output patterns + * that we accept. We require that checking whether the pattern matches a + * subgraph would still be fully determined by a single node in the graph.
To - * achieve this we designate the first output in the pattern as the 'main' - * output and assume that it is sufficient to traverse up from it to match the + * achieve this, we designate the first output in the pattern as the "main" + * output and assume that we can traverse up from this node to match the * entire pattern. * * Corrolary 1: the order of outputs in the pattern matters! diff --git a/torch/csrc/jit/mobile/export_data.cpp b/torch/csrc/jit/mobile/export_data.cpp index 3bd28fbbac5c..a6eff3192bfd 100644 --- a/torch/csrc/jit/mobile/export_data.cpp +++ b/torch/csrc/jit/mobile/export_data.cpp @@ -36,7 +36,7 @@ class ScriptModuleSerializer { void writeArchive(const std::string& archive_name, const IValue& value) { std::vector data; // Vector to capture the run-time class types during pickling the IValues - std::vector memorizedClassTypes; + std::vector memoizedClassTypes; Pickler data_pickle( [&](const char* buf, size_t size) { data.insert(data.end(), buf, buf + size); @@ -45,7 +45,7 @@ class ScriptModuleSerializer { [&](const c10::ClassTypePtr& t) { return type_name_uniquer_.getUniqueName(t); }, - &memorizedClassTypes); + &memoizedClassTypes); data_pickle.protocol(); data_pickle.pushIValue(value); data_pickle.stop(); diff --git a/torch/csrc/jit/mobile/interpreter.cpp b/torch/csrc/jit/mobile/interpreter.cpp index 031c21474618..681eddfaa832 100644 --- a/torch/csrc/jit/mobile/interpreter.cpp +++ b/torch/csrc/jit/mobile/interpreter.cpp @@ -148,7 +148,7 @@ bool InterpreterState::run(Stack& stack) { case RET: return false; case LIST_CONSTRUCT: { - auto type = code_->types_[inst.X]->expect(); + const auto& type = code_->types_[inst.X]->expectRef(); listConstruct(stack, type, inst.N); ++pc; } break; diff --git a/torch/csrc/jit/mobile/module.h b/torch/csrc/jit/mobile/module.h index 8b7da739df9a..2be75c61b6b5 100644 --- a/torch/csrc/jit/mobile/module.h +++ b/torch/csrc/jit/mobile/module.h @@ -1,5 +1,6 @@ #pragma once //#include +#include #include #include diff --git a/torch/csrc/jit/passes/constant_propagation.cpp b/torch/csrc/jit/passes/constant_propagation.cpp index b2f314c23770..75be7e86acab 100644 --- a/torch/csrc/jit/passes/constant_propagation.cpp +++ b/torch/csrc/jit/passes/constant_propagation.cpp @@ -45,7 +45,9 @@ c10::optional> runNodeIfInputsAreConstant( } break; case prim::ListConstruct: { listConstruct( - stack, n->output()->type()->expect(), n->inputs().size()); + stack, + n->output()->type()->expectRef(), + n->inputs().size()); } break; case prim::DictConstruct: { dictConstruct( @@ -54,6 +56,10 @@ c10::optional> runNodeIfInputsAreConstant( case prim::CreateObject: { createObject(stack, n->output()->type()->expect()); } break; + case prim::GetAttr: { + auto attr = pop(stack).toObject()->getAttr(n->s(attr::name)); + push(stack, attr); + } break; case prim::isinstance: { isinstance(stack, n->tys(attr::types)); } break; diff --git a/torch/csrc/jit/passes/freeze_module.cpp b/torch/csrc/jit/passes/freeze_module.cpp index 2778c7712f23..f66f54eeb567 100644 --- a/torch/csrc/jit/passes/freeze_module.cpp +++ b/torch/csrc/jit/passes/freeze_module.cpp @@ -289,11 +289,11 @@ class AttributePropagator { IValue overrideGradient(IValue attr) { if (attr.isTensor()) { - auto t = attr.toTensor(); + auto& t = attr.toTensor(); if (t.requires_grad()) { - t = t.detach(); - t.set_requires_grad(false); - attr = IValue(t); + auto detached = t.detach(); + detached.set_requires_grad(false); + attr = IValue(std::move(detached)); } } else if (attr.isTuple()) { auto tuple = std::move(attr).toTuple(); diff 
--git a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp index 162e596eb6a7..bc26183a25bb 100644 --- a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp +++ b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp @@ -271,6 +271,93 @@ std::vector ReshapeToAdvancedIndexingFormat( return indices; } +// Register index_put inputs/outputs through the blocks. +// Eg. The IR before updating: +// = prim::Loop(%10, %27) +// block0(%stream_idx.1 : int): +// = prim::Loop(%9, %27) +// block0(%i.1 : int): +// %36 : Tensor = aten::select(%bias.1, %26, %stream_idx.1) +// %41 : Tensor = aten::copy_(%37, %40, %25) +// -> (%27) +// -> (%27) +// After updating: +// %62 : Tensor = prim::Loop(%10, %27, %bias.2) +// block0(%stream_idx.1 : int, %bias.3 : Tensor): +// %61 : Tensor = prim::Loop(%9, %27, %bias.3) +// block0(%i.1 : int, %bias.1 : Tensor): +// %36 : Tensor = aten::select(%bias.1, %26, %stream_idx.1) +// %59 : Tensor?[] = prim::ListConstruct(%55, %58) +// %60 : Tensor = aten::index_put(%bias.1, %59, %45, %25) +// -> (%27, %60) +// -> (%27, %61) +void RegisterIndexPutInBlocks( + Value* orig_data, + Value* new_index_put, + Node* block_node, + Block* outer_block, + Node* next_node) { + auto cur_node = next_node; + while (nullptr != cur_node) { + if (cur_node->kind() != prim::Loop) + return; + cur_node = cur_node->owningBlock()->owningNode(); + } + + for (auto block_input : outer_block->inputs()) { + if (block_input->debugName() == orig_data->debugName()) { + AT_ERROR( + "More than one aten::index_put in a subblock are not supported."); + } + } + + // Register index_put outputs through the blocks. + for (auto block_output : outer_block->outputs()) { + if (block_output->debugName() == new_index_put->debugName()) + return; + } + outer_block->registerOutput(new_index_put); + std::vector> node_list = { + std::make_pair(outer_block, next_node)}; + next_node->addOutput()->copyMetadata(new_index_put); + auto next_block = next_node->owningBlock(); + while (nullptr != next_block->owningNode()) { + outer_block = next_block; + outer_block->registerOutput(next_node->output(0)); + next_node = outer_block->owningNode(); + next_node->addOutput()->copyMetadata(new_index_put); + next_block = next_node->owningBlock(); + node_list.emplace_back(std::make_pair(outer_block, next_node)); + } + + // Register index_put inputs through the blocks. + auto next_data = orig_data; + while (!node_list.empty()) { + auto cur_pair = node_list.back(); + // Add input to current node. + cur_pair.second->addInput(next_data); + // Add input to current block. + auto cur_input = cur_pair.first->addInput(); + cur_input->copyMetadata(next_data); + next_data = cur_input; + node_list.pop_back(); + } + // Update index_put inputs inside the inner most block. + auto prev_data = block_node->input(0); + for (auto node : block_node->owningBlock()->nodes()) { + size_t idx = 0; + for (auto inputs_ : node->inputs()) { + if (inputs_ == prev_data) { + node->replaceInput(idx, next_data); + idx++; + break; + } + } + } + orig_data->replaceAllUsesAfterNodeWith( + next_node->output(0)->node(), next_node->output(0)); +} + // Trace back all the slice & select nodes associated with the index_put node, // and convert them to associated indices. // E.g. 
The IR for x[1:3, 0] = update @@ -336,7 +423,16 @@ void SquashSliceAndSelect(Node* index_put_node) { new_index_put->copyMetadata(index_put_node->output()); index_put_node->output()->replaceAllUsesWith(new_index_put); - orig_data->replaceAllUsesAfterNodeWith(new_index_put->node(), new_index_put); + auto block_node = new_index_put->node(); + auto outer_block = block_node->owningBlock(); + auto next_node = outer_block->owningNode(); + if (nullptr == next_node) { + orig_data->replaceAllUsesAfterNodeWith( + new_index_put->node(), new_index_put); + return; + } + RegisterIndexPutInBlocks( + orig_data, new_index_put, block_node, outer_block, next_node); } void PrepareCopyForONNX(Block* block) { diff --git a/torch/csrc/jit/passes/subgraph_rewrite.h b/torch/csrc/jit/passes/subgraph_rewrite.h index 8b90840e2227..ee84f58ad087 100644 --- a/torch/csrc/jit/passes/subgraph_rewrite.h +++ b/torch/csrc/jit/passes/subgraph_rewrite.h @@ -4,8 +4,8 @@ * the corresponding subgraphs with another subgraph. A special case of such * rewrites is fusion, where the new subgraph consists of just a single node. * - * There is a default set of most-common patterns that everyone could use, or - * alternatively an arbitrary pattern can be registered. + * There is a default set of the most common patterns that everyone could use. + * Alternatively, an arbitrary pattern can be registered. */ #pragma once @@ -37,13 +37,13 @@ TORCH_API Module PatternBasedRewrite(const Module& module); /** A class implementing API for pattern-based subgraph rewrites. * * To perform pattern-based subgraph rewrites on a module using this API, one - * needs to crete an object of such class, register rewrite patterns and run the - * transformation pass (`runOnModule`). + * needs to create an object of such class, register rewrite patterns and run + * the transformation pass (`runOnModule`). * * To use standard patterns, one could use `RegisterDefaultPatterns`. * - * To enable rewrites of custom patterns, they must be registered with - * `RegisterRewritePattern`. + * To enable rewrites of custom patterns, the custom patterns must be registered + * with `RegisterRewritePattern`. */ class TORCH_API SubgraphRewriter { public: @@ -51,12 +51,12 @@ class TORCH_API SubgraphRewriter { Module runOnModule(const Module& module); // Run pattern-based subgraph rewrite pass on the graph (used in testing). - // filter is a function that does extra filtering on the match, if it returns - // false for a given Match, we'll skip the match - // filter function takes a `Match` and a value map from parsing the pattern - // graph since we need to do extra filtering on the matched result but we need - // to refer to the values in the matched result through the values in pattern - // graph. + // `filter` is a function that does extra filtering on the match. If it + // returns false for a given Match, we'll skip the Match. The filter + // function's arguments consist of a Match and a value map from parsing the + // pattern graph. Both the Match and the value map are necessary because we + // need to 1) do extra filtering on the matched result as well as 2) refer to + // the values in the matched result through the values in the pattern graph. void runOnGraph( std::shared_ptr& graph, const std::vector& filters); @@ -99,8 +99,8 @@ class TORCH_API SubgraphRewriter { /** Rewrite pattern descriptor. * - * This structure is used in implementation of `SubgraphRewriter` and not - * supposed to be used externally. 
+ * This structure is used in the implementation of `SubgraphRewriter` and + * is not supposed to be used externally. */ struct RewritePatternDescr { std::string pattern; diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp index 09258e259e45..166238cebe17 100644 --- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp +++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp @@ -15,6 +15,11 @@ #include #include +// NOLINTNEXTLINE +C10_DEFINE_bool( + torch_jit_disable_cat, + false, + "disable aten::cat in TE fusion groups"); namespace torch { namespace jit { @@ -123,7 +128,7 @@ bool isSupported(Node* node) { "aten::trunc(Tensor self) -> Tensor", "aten::threshold(Tensor self, Scalar threshold, Scalar value) -> Tensor", // "aten::masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor", - // "aten::masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor", + // "aten::masked_fill.Tensor(Tensor self, Tensor mask, Tensor value) -> Tensor", TODO: requires 0-dim Tensor "aten::remainder.Scalar(Tensor self, Scalar other) -> Tensor", "aten::remainder.Tensor(Tensor self, Tensor other) -> Tensor", "aten::cat(Tensor[] tensors, int dim=0) -> Tensor", @@ -202,6 +207,10 @@ bool isSupported(Node* node) { } } + if (FLAGS_torch_jit_disable_cat && node->kind() == aten::cat) { + return false; + } + return true; } diff --git a/torch/csrc/jit/python/python_sugared_value.cpp b/torch/csrc/jit/python/python_sugared_value.cpp index 933d3bb1a867..056e23d06f02 100644 --- a/torch/csrc/jit/python/python_sugared_value.cpp +++ b/torch/csrc/jit/python/python_sugared_value.cpp @@ -217,6 +217,32 @@ std::shared_ptr PythonModuleValue::attr( return toSugaredValue(member, m, loc, /*is_constant=*/true); } +#ifndef __HIP_PLATFORM_HCC__ +std::shared_ptr CUDAPythonModuleValue::attr( + const SourceRange& loc, + Function& m, + const std::string& field) { + // List of all the cuda operators which are supported in JIT + const std::unordered_set cuda_ops = {"current_stream", + "default_stream", + "_current_device", + "_set_device", + "device_index", + "device_count", + "set_stream"}; + + if (cuda_ops.find(field) != cuda_ops.end()) { + return std::make_shared(Symbol::cuda(field), c10::nullopt); + } + + py::object member = getattr(loc, field); + // note: is_constant = true because we consider that global properties + // on modules like math.pi or torch.float to be constants + // even though it is possible, though rare, for someone to mutate them + return toSugaredValue(member, m, loc, /*is_constant=*/true); +} +#endif + Value* ModuleValue::asValue(const SourceRange& loc, Function& m) { return self_; } @@ -938,6 +964,12 @@ std::shared_ptr toSugaredValue( if (auto callee = as_function(obj)) { return std::make_shared(callee->function_); } else if (py::isinstance(obj)) { +#ifndef USE_ROCM + std::string obj_name = py::cast(py::getattr(obj, "__name__")); + if (obj_name.compare("torch.cuda") == 0) { + return std::make_shared(obj); + } +#endif return std::make_shared(obj); } else if ( obj.ptr() == py::module::import("torch.jit").attr("_fork").ptr() || diff --git a/torch/csrc/jit/python/python_sugared_value.h b/torch/csrc/jit/python/python_sugared_value.h index b5d8f4490b3e..1edbc6c15cad 100644 --- a/torch/csrc/jit/python/python_sugared_value.h +++ b/torch/csrc/jit/python/python_sugared_value.h @@ -91,6 +91,20 @@ struct VISIBILITY_HIDDEN PythonModuleValue : public PythonValue { const std::string& field) override; }; +// Used for desugaring uses of the torch.cuda module. 
All the CUDA APIs with +// torch.cuda.* are resolved using CUDAPythonModuleValue. +#ifndef __HIP_PLATFORM_HCC__ +struct VISIBILITY_HIDDEN CUDAPythonModuleValue : public PythonValue { + explicit CUDAPythonModuleValue(py::object mod) + : PythonValue(std::move(mod)) {} + + std::shared_ptr attr( + const SourceRange& loc, + Function& m, + const std::string& field) override; +}; +#endif + // Represents all the parameters of a module as a List[Tensor] struct VISIBILITY_HIDDEN ConstantParameterList : public SugaredValue { ConstantParameterList(Value* the_list) : the_list_(the_list) {} diff --git a/torch/csrc/jit/runtime/argument_spec.h b/torch/csrc/jit/runtime/argument_spec.h index 401933c6d67e..a0e60e879146 100644 --- a/torch/csrc/jit/runtime/argument_spec.h +++ b/torch/csrc/jit/runtime/argument_spec.h @@ -237,7 +237,7 @@ struct CompleteArgumentSpec { for (int32_t i = 0; i < num_inputs; i++) { if (!inputs[i].isTensor()) continue; - auto tensor = inputs[i].toTensor(); + auto& tensor = inputs[i].toTensor(); all_dims += tensor.defined() ? tensor.ndimension() : 0; } // allocate enough room for all TensorPODs and dimensions diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index 24ca9dbf9793..7d588b6d96e7 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -1418,7 +1418,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { // Check every input's shape against profiled (expected) shape. for (i = 0; i < num_inputs; i++) { auto& input = peek(stack, i, num_inputs); - auto t = input.toTensor(); + auto& t = input.toTensor(); const TypePtr& expected = frame.function->type_table_[inst.X + i]; auto expected_type = expected->cast(); if (t.defined() && !expected_type->matchTensor(t)) { @@ -1439,7 +1439,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { // so it's safe to pass this guard check push(stack, true); } else { - auto t = stack.back().toTensor(); + auto& t = stack.back().toTensor(); const TypePtr& expected = frame.function->type_table_[inst.X]; auto expected_type = expected->cast(); if (t.defined() && @@ -1495,7 +1495,8 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { ++frame.pc; } break; case LIST_CONSTRUCT: { - auto type = frame.function->type_table_[inst.X]->expect(); + const auto& type = + frame.function->type_table_[inst.X]->expectRef(); listConstruct(stack, type, inst.N); ++frame.pc; } break; diff --git a/torch/csrc/jit/runtime/interpreter.h b/torch/csrc/jit/runtime/interpreter.h index 120a3ffb7507..a4bb209cd17e 100644 --- a/torch/csrc/jit/runtime/interpreter.h +++ b/torch/csrc/jit/runtime/interpreter.h @@ -5,6 +5,7 @@ #include #include +#include #include #include diff --git a/torch/csrc/jit/runtime/profiling_record.cpp b/torch/csrc/jit/runtime/profiling_record.cpp index 8d276dd58b50..d233f089f187 100644 --- a/torch/csrc/jit/runtime/profiling_record.cpp +++ b/torch/csrc/jit/runtime/profiling_record.cpp @@ -165,7 +165,7 @@ void ProfilingRecord::insertShapeProfile(Node* n, size_t offset) { if (v.isTensor()) { std::lock_guard lock(this->mutex_); auto& profiled_types = profiled_types_per_frame_[frame_id]; - auto t = v.toTensor(); + auto& t = v.toTensor(); if (t.defined()) { auto pttp = tensorTypeInCurrentExecutionContext(t); GRAPH_DEBUG( diff --git a/torch/csrc/jit/runtime/register_cuda_ops.cpp b/torch/csrc/jit/runtime/register_cuda_ops.cpp new file mode 100644 index 000000000000..5cf31d626dd0 --- /dev/null +++ b/torch/csrc/jit/runtime/register_cuda_ops.cpp @@ -0,0 +1,87 @@ +// 
This file registers special JIT operators used to implement the PyTorch CUDA +// API in TorchScript. +#ifndef __HIP_PLATFORM_HCC__ +#include +#include +#include +#include +#include + +namespace torch { +namespace jit { + +namespace { + +c10::AliasAnalysisKind aliasAnalysisFromSchema() { + return c10::AliasAnalysisKind::FROM_SCHEMA; +} + +RegisterOperators const reg({ + Operator( + "cuda::current_stream(int64_t val) -> __torch__.torch.classes.cuda.Stream", + [](Stack* stack) { + auto idx = uint16_t(pop(stack).toInt()); + auto s = c10::cuda::getCurrentCUDAStream(idx); + auto st = make_custom_class(s); + push(stack, IValue(st)); + }, + aliasAnalysisFromSchema()), + Operator( + "cuda::default_stream(int64_t val) -> __torch__.torch.classes.cuda.Stream", + [](Stack* stack) { + auto idx = uint16_t(pop(stack).toInt()); + auto s = c10::cuda::getDefaultCUDAStream(idx); + auto st = make_custom_class(s); + push(stack, IValue(st)); + }, + aliasAnalysisFromSchema()), + Operator( + "cuda::_current_device() -> int", + [](Stack* stack) { + auto v = c10::cuda::current_device(); + push(stack, static_cast(v)); + }, + aliasAnalysisFromSchema()), + Operator( + "cuda::_set_device(int64_t val) -> ()", + [](Stack* stack) { + int64_t idx = -1; + pop(stack, idx); + c10::cuda::set_device(static_cast(idx)); + }, + aliasAnalysisFromSchema()), + Operator( + "cuda::device_index(Device device) -> int", + [](Stack* stack) { + auto device = pop(stack); + auto idx = device.toDevice().index(); + push(stack, idx); + }, + aliasAnalysisFromSchema()), + Operator( + "cuda::device_count() -> int", + [](Stack* stack) { push(stack, at::cuda::device_count()); }, + aliasAnalysisFromSchema()), + Operator( + "cuda::set_stream(__torch__.torch.classes.cuda.Stream stream) -> ()", + [](Stack* stack) { + auto v = pop(stack); + auto s = v.toCustomClass(); + // To set the current CUDA stream using + // c10::cuda::setCurrentCUDAStream, the jit::CUDAStream object needs + // to be converted to c10::cuda::CUDAStream. Since the latter cannot + // be returned from a class registered via TorchBind, this can only be + // achieved by packing the c10::cuda::CUDAStream instance contained + // inside the jit::CUDAStream object to a uint64_t representation, and + // unpacking it inside this operator. The unpacked stream is then used + // to set the current CUDA stream. 
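For context, the CUDAPythonModuleValue sugared value (in python_sugared_value.cpp above) and these cuda:: operators are two halves of the same feature: torch.cuda.* calls in scripted code are meant to be desugared to the operators registered in this file. A minimal usage sketch, assuming that resolution path works as described (illustrative only, not part of the patch):

import torch

# Hypothetical scripted function: torch.cuda.device_count() is assumed to be
# desugared to the cuda::device_count operator registered in this file.
@torch.jit.script
def visible_devices() -> int:
    return torch.cuda.device_count()

if torch.cuda.is_available():
    print(visible_devices())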
+ auto packed = s->pack(); + auto unpacked = c10::cuda::CUDAStream::unpack(packed); + c10::cuda::setCurrentCUDAStream(unpacked); + }, + aliasAnalysisFromSchema()), +}); +} // namespace +} // namespace jit +} // namespace torch +#endif diff --git a/torch/csrc/jit/runtime/register_prim_ops.cpp b/torch/csrc/jit/runtime/register_prim_ops.cpp index f23b09dc0e74..fe75ec52046e 100644 --- a/torch/csrc/jit/runtime/register_prim_ops.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops.cpp @@ -908,7 +908,7 @@ RegisterOperators reg( TORCH_SELECTIVE_SCHEMA( "aten::index.Tensor_hacked_twin(Tensor self, Tensor[] indices) -> Tensor"), [](Stack* stack) { - auto indices = pop(stack).toTensorVector(); + auto indices = pop(stack).to>>(); auto self = pop(stack).toTensor(); auto result = at::index(self, indices); push(stack, std::move(result)); @@ -921,7 +921,7 @@ RegisterOperators reg( auto unsafe = pop(stack).toBool(); auto accumulate = pop(stack).toBool(); auto values = pop(stack).toTensor(); - auto indices = pop(stack).toTensorVector(); + auto indices = pop(stack).to>>(); auto self = pop(stack).toTensor(); auto result = at::_index_put_impl_(self, indices, values, accumulate, unsafe); @@ -934,7 +934,7 @@ RegisterOperators reg( [](Stack* stack) { auto accumulate = pop(stack).toBool(); auto values = pop(stack).toTensor(); - auto indices = pop(stack).toTensorVector(); + auto indices = pop(stack).to>>(); auto self = pop(stack).toTensor(); auto result = at::index_put_(self, indices, values, accumulate); push(stack, std::move(result)); @@ -946,7 +946,7 @@ RegisterOperators reg( [](Stack* stack) { auto accumulate = pop(stack).toBool(); auto values = pop(stack).toTensor(); - auto indices = pop(stack).toTensorVector(); + auto indices = pop(stack).to>>(); auto self = pop(stack).toTensor(); auto result = at::index_put_(self, indices, values, accumulate); push(stack, std::move(result)); diff --git a/torch/csrc/jit/runtime/static/ops.cpp b/torch/csrc/jit/runtime/static/ops.cpp index 11fb5dae2d6c..4d66c6382c2d 100644 --- a/torch/csrc/jit/runtime/static/ops.cpp +++ b/torch/csrc/jit/runtime/static/ops.cpp @@ -79,27 +79,27 @@ struct static_add final : public at::native::structured_add_out { REGISTER_OPERATOR_FUNCTOR(aten::add, aten_add, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); - auto in1_t = p_node->Input(1, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); + auto& in1_t = p_node->Input(1, reg).toTensor(); auto in2_s = p_node->Input(2, reg).toScalar(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); static_add op{out_t}; op.meta(in0_t, in1_t, in2_s); - op.impl(out_t, in0_t, in1_t, in2_s); + op.impl(in0_t, in1_t, in2_s, out_t); }; }); REGISTER_OPERATOR_FUNCTOR(aten::mul, aten_mul, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); - auto in1_t = p_node->Input(1, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); + auto& in1_t = p_node->Input(1, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::mul_out(out_t, in0_t, in1_t); }; @@ -107,15 +107,15 @@ REGISTER_OPERATOR_FUNCTOR(aten::mul, 
aten_mul, [](Node* n) -> SROperator { REGISTER_OPERATOR_FUNCTOR(aten::addmm, aten_addmm, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); - auto in1_t = p_node->Input(1, reg).toTensor(); - auto in2_t = p_node->Input(2, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); + auto& in1_t = p_node->Input(1, reg).toTensor(); + auto& in2_t = p_node->Input(2, reg).toTensor(); auto in3_s = p_node->Input(3, reg).toScalar(); auto in4_s = p_node->Input(4, reg).toScalar(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::addmm_cpu_out(out_t, in0_t, in1_t, in2_t, in3_s, in4_s); }; @@ -123,13 +123,13 @@ REGISTER_OPERATOR_FUNCTOR(aten::addmm, aten_addmm, [](Node* n) -> SROperator { REGISTER_OPERATOR_FUNCTOR(aten::clamp, aten_clamp, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_s = p_node->Input(1, reg).toScalar(); auto in2_s = p_node->Input(2, reg).toScalar(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::clamp_out(out_t, in0_t, in1_s, in2_s); }; @@ -137,12 +137,12 @@ REGISTER_OPERATOR_FUNCTOR(aten::clamp, aten_clamp, [](Node* n) -> SROperator { REGISTER_OPERATOR_FUNCTOR(aten::bmm, aten_bmm, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); - auto in1_t = p_node->Input(1, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); + auto& in1_t = p_node->Input(1, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::bmm_out_cpu(out_t, in0_t, in1_t); }; @@ -154,7 +154,7 @@ REGISTER_OPERATOR_FUNCTOR( [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { auto input_size = p_node->input_regs().size(); - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); double in1_d = input_size > 1 ? p_node->Input(1, reg).toDouble() : 0; double in2_d = input_size > 2 ? 
p_node->Input(2, reg).toDouble() : std::numeric_limits::infinity(); @@ -164,7 +164,7 @@ REGISTER_OPERATOR_FUNCTOR( if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::nan_to_num_out(out_t, in0_t, in1_d, in2_d, in3_d); }; @@ -176,18 +176,18 @@ REGISTER_OPERATOR_FUNCTOR(aten::cat, aten_cat, [](Node* n) -> SROperator { if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_tl[0]); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::_cat_out_cpu(out_t, in0_tl, in1_i); }; }); REGISTER_OPERATOR_FUNCTOR(aten::tanh, aten_tanh, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::tanh_out(out_t, in0_t); }; @@ -217,7 +217,7 @@ SROperator aten_stack(Node* n) { for (auto i = 0; i < inputs.size(); i++) { inputs[i] = inputs[i].unsqueeze(dim); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::_cat_out_cpu(out_t, inputs, dim); }; @@ -230,11 +230,11 @@ REGISTER_OPERATOR_FUNCTOR( aten_sigmoid, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::sigmoid_out(out_t, in0_t); }; @@ -247,57 +247,57 @@ REGISTER_OPERATOR_FUNCTOR( if (in1) { auto in1_s = in1->toScalar(); return [=](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); at::native::leaky_relu_out(out_t, in0_t, in1_s); }; } else { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_s = p_node->Input(1, reg).toScalar(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); at::native::leaky_relu_out(out_t, in0_t, in1_s); }; } }); REGISTER_OPERATOR_FUNCTOR(aten::relu, aten_relu, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::threshold_out(out_t, in0_t, 0, 0); }; }); 
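The functors above all follow the same pattern: on the first run the output IValue is filled with an empty tensor (create_empty_from), and every run writes into that preallocated buffer through the corresponding *_out kernel instead of allocating a new tensor. A rough eager-mode analogue of that buffer reuse, for illustration only:

import torch

a = torch.randn(8, 8)
b = torch.randn(8, 8)
out = torch.empty_like(a)      # allocated once, like create_empty_from(in0_t)
for _ in range(3):
    torch.add(a, b, out=out)   # writes into the existing buffer, like the *_out call
print(out.sum())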
REGISTER_OPERATOR_FUNCTOR(aten::logit, aten_logit, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); double in1_d = p_node->input_regs().size() > 1 ? p_node->Input(1, reg).toDouble() : -1.0; if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); out_t.resize_({0}); at::native::logit_out(out_t, in0_t, in1_d); }; }); REGISTER_OPERATOR_FUNCTOR(aten::clone, aten_clone, [](Node* n) -> SROperator { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); if (p_node->Output(0, reg).isNone()) { p_node->Output(0, reg) = create_empty_from(in0_t); } - auto out_t = p_node->Output(0, reg).toTensor(); + auto& out_t = p_node->Output(0, reg).toTensor(); at::native::resize_as_(out_t, in0_t, c10::nullopt); at::native::copy_(out_t, in0_t, false); }; @@ -317,14 +317,14 @@ std::function&)> getNativeOperation(Node* n) { if (n->kind() == c10::Symbol::fromQualString("aten::transpose")) { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_i = p_node->Input(1, reg).toInt(); auto in2_i = p_node->Input(2, reg).toInt(); p_node->Output(0, reg) = at::native::transpose(in0_t, in1_i, in2_i); }; } else if (n->kind() == c10::Symbol::fromQualString("aten::flatten")) { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_i = p_node->Input(1, reg).toInt(); auto in2_i = p_node->Input(2, reg).toInt(); p_node->Output(0, reg) = at::native::flatten(in0_t, in1_i, in2_i); @@ -361,7 +361,7 @@ getNativeOperation(Node* n) { // run op listConstruct( stack, - p_node->get_node()->output()->type()->expect(), + p_node->get_node()->output()->type()->expectRef(), p_node->input_regs().size()); // put output back p_node->Output(0, reg) = std::move(stack[0]); @@ -386,19 +386,19 @@ getNativeOperation(Node* n) { }; } else if (n->kind() == c10::Symbol::fromQualString("aten::permute")) { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_iv = p_node->Input(1, reg).toIntVector(); p_node->Output(0, reg) = at::native::permute(in0_t, in1_iv); }; } else if (n->kind() == c10::Symbol::fromQualString("aten::reshape")) { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_iv = p_node->Input(1, reg).toIntVector(); p_node->Output(0, reg) = at::native::reshape(in0_t, in1_iv); }; } else if (n->kind() == c10::Symbol::fromQualString("aten::slice")) { return [](const ProcessedNode* p_node, std::vector& reg) { - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_i = p_node->Input(1, reg).toInt(); auto in2_i = p_node->Input(2, reg).toInt(); auto in3_i = p_node->Input(3, reg).toInt(); @@ -408,13 +408,13 @@ getNativeOperation(Node* n) { }; } else if (n->kind() == c10::Symbol::fromQualString("aten::narrow")) { return [](const ProcessedNode* p_node, std::vector& reg) { - auto self = 
p_node->Input(0, reg).toTensor(); // self + auto& self = p_node->Input(0, reg).toTensor(); // self auto dim = p_node->Input(1, reg).toInt(); // dim int64_t start = 0; if (p_node->Input(2, reg).isScalar()) { start = p_node->Input(2, reg).toInt(); } else { - auto t = p_node->Input(2, reg).toTensor(); + auto& t = p_node->Input(2, reg).toTensor(); start = t.item(); } auto length = p_node->Input(3, reg).toInt(); // length @@ -440,7 +440,7 @@ getNativeOperation(Node* n) { } else if (n->kind() == c10::Symbol::fromQualString("aten::to")) { return [](const ProcessedNode* p_node, std::vector& reg) { DCHECK(p_node->input_regs().size() == 5); - auto in0_t = p_node->Input(0, reg).toTensor(); + auto& in0_t = p_node->Input(0, reg).toTensor(); auto in1_i = p_node->Input(1, reg).toScalarType(); auto in2_i = p_node->Input(2, reg).toBool(); auto in3_i = p_node->Input(3, reg).toBool(); diff --git a/torch/csrc/jit/runtime/static/passes.cpp b/torch/csrc/jit/runtime/static/passes.cpp index 75688cfa8880..bbaaa6683bbd 100644 --- a/torch/csrc/jit/runtime/static/passes.cpp +++ b/torch/csrc/jit/runtime/static/passes.cpp @@ -105,6 +105,23 @@ void ClipRangesGatherSigridHash(std::shared_ptr& graph) { fuse.runOnGraph(graph); } +void ClipRangesGatherRangesSigridHash( + std::shared_ptr& graph) { + std::string pattern = R"IR( + graph(%a, %b, %c, %d, %e, %f): + %y0 : Tensor = fb::clip_ranges(%b, %c) + %y1 : Tensor, %y2 : Tensor = fb::gather_ranges(%a, %y0) + %y3 : Tensor = fb::sigrid_hash(%y1, %d, %e, %f) + return (%y3, %y2))IR"; + std::string fused_pattern = R"IR( + graph(%a, %b, %c, %d, %e, %f): + %off : Tensor, %out : Tensor = fb::clip_ranges_gather_sigrid_hash_v3(%b, %a, %c, %d, %e, %f) + return (%out, %off))IR"; + SubgraphRewriter fuse; + fuse.RegisterRewritePattern(pattern, fused_pattern); + fuse.runOnGraph(graph); +} + void FuseInferenceOpsForSparseNN(std::shared_ptr& graph) { #ifdef FBCODE_CAFFE2 ConcatAddMulReplaceNaNClip(graph); @@ -112,6 +129,7 @@ void FuseInferenceOpsForSparseNN(std::shared_ptr& graph) { ConcatBatchMatMulBatchGather(graph); ClipRangesGatherRangesLengthsToOffsets(graph); ClipRangesGatherSigridHash(graph); + ClipRangesGatherRangesSigridHash(graph); #endif } diff --git a/torch/csrc/jit/runtime/vararg_functions.cpp b/torch/csrc/jit/runtime/vararg_functions.cpp index 44bc56206eaf..220a5e67f723 100644 --- a/torch/csrc/jit/runtime/vararg_functions.cpp +++ b/torch/csrc/jit/runtime/vararg_functions.cpp @@ -204,16 +204,13 @@ void namedTupleConstruct( c10::ivalue::Tuple::createNamed(std::move(elems), std::move(type))); } -void listConstruct( - Stack& stack, - const at::ListTypePtr& type, - size_t num_inputs) { +void listConstruct(Stack& stack, const at::ListType& type, size_t num_inputs) { // Structuring the implementation this way allows NRVO to avoid // move-constructing vals on its way onto the stack. Moving a List // isn't free. 
auto makeList = - [](Stack& stack, const at::ListTypePtr& type, size_t num_inputs) { - c10::List vals(type->getElementType()); + [](Stack& stack, const at::ListType& type, size_t num_inputs) { + c10::List vals(type.getElementType()); vals.reserve(num_inputs); for (size_t i = stack.size() - num_inputs; i < stack.size(); ++i) { vals.emplace_back(std::move(stack[i])); diff --git a/torch/csrc/jit/runtime/vararg_functions.h b/torch/csrc/jit/runtime/vararg_functions.h index 36bef721d626..e9580411212a 100644 --- a/torch/csrc/jit/runtime/vararg_functions.h +++ b/torch/csrc/jit/runtime/vararg_functions.h @@ -2,6 +2,7 @@ #include #include #include +#include #include namespace torch { @@ -24,7 +25,7 @@ void namedTupleConstruct( void listConstruct( Stack& stack, - const at::ListTypePtr& list_type, + const at::ListType& list_type, size_t num_inputs); void dictConstruct( diff --git a/torch/csrc/jit/serialization/export_module.cpp b/torch/csrc/jit/serialization/export_module.cpp index 4808ab65145b..01b2a1c2ac4a 100644 --- a/torch/csrc/jit/serialization/export_module.cpp +++ b/torch/csrc/jit/serialization/export_module.cpp @@ -354,7 +354,7 @@ class ScriptModuleSerializer { void writeArchive(const std::string& archive_name, const IValue& value) { std::vector data; // Vector to capture the run-time class types during pickling the IValues - std::vector memorizedClassTypes; + std::vector memoizedClassTypes; Pickler data_pickle( [&](const char* buf, size_t size) { data.insert(data.end(), buf, buf + size); @@ -363,7 +363,7 @@ class ScriptModuleSerializer { [&](const c10::ClassTypePtr& t) { return type_name_uniquer_.getUniqueName(t); }, - &memorizedClassTypes); + &memoizedClassTypes); data_pickle.protocol(); data_pickle.pushIValue(value); data_pickle.stop(); @@ -378,7 +378,7 @@ class ScriptModuleSerializer { writer_.writeRecord(fname, data.data(), data.size()); // serialize all the captured run-time class types - for (const c10::ClassTypePtr& wroteType : memorizedClassTypes) { + for (const c10::ClassTypePtr& wroteType : memoizedClassTypes) { convertNamedType(wroteType); } } diff --git a/torch/csrc/jit/serialization/pickler.cpp b/torch/csrc/jit/serialization/pickler.cpp index a11ad9aeb3c5..811569485888 100644 --- a/torch/csrc/jit/serialization/pickler.cpp +++ b/torch/csrc/jit/serialization/pickler.cpp @@ -90,11 +90,11 @@ void Pickler::pushIValueImpl(const IValue& ivalue) { } else if (ivalue.isObject()) { auto obj = ivalue.toObject(); auto type = obj->type(); - if (memorized_class_types_ != nullptr) { - // Memorize every class type the Pickler encountered + if (memoized_class_types_ != nullptr) { + // memoize every class type the Pickler encountered // This is used to make sure we capture all the run-time types // and serialize them properly for class/interface polymorphism - memorized_class_types_->emplace_back(type); + memoized_class_types_->emplace_back(type); } auto type_name = type->name().value(); if (type_renamer_) { @@ -121,8 +121,8 @@ void Pickler::pushIValueImpl(const IValue& ivalue) { } else if (ivalue.isCapsule()) { std::stringstream err; err << "Cannot serialize custom bound C++ class"; - if (memorized_class_types_ && memorized_class_types_->size()) { - if (auto qualname = memorized_class_types_->back()->name()) { + if (memoized_class_types_ && memoized_class_types_->size()) { + if (auto qualname = memoized_class_types_->back()->name()) { err << " " << qualname->qualifiedName(); } } @@ -354,7 +354,7 @@ void Pickler::pushLiteralTensor(const IValue& ivalue) { // // The format here is the same one used 
by `torch.save()`. The code for the // format can be found in `torch/serialization.py`. - auto tensor = ivalue.toTensor(); + auto& tensor = ivalue.toTensor(); bool quantized = tensor.is_quantized(); // The arguments to this function are: // storage, storage_offset, size, stride, requires_grad, backward_hooks diff --git a/torch/csrc/jit/serialization/pickler.h b/torch/csrc/jit/serialization/pickler.h index 6a557e6e53f3..21d0f61a18eb 100644 --- a/torch/csrc/jit/serialization/pickler.h +++ b/torch/csrc/jit/serialization/pickler.h @@ -127,11 +127,11 @@ class TORCH_API Pickler { std::function writer, std::vector* tensor_table, std::function type_renamer, - std::vector* memorized_class_types) + std::vector* memoized_class_types) : writer_(std::move(writer)), tensor_table_(tensor_table), type_renamer_(std::move(type_renamer)), - memorized_class_types_(memorized_class_types) {} + memoized_class_types_(memoized_class_types) {} ~Pickler(); // Push protocol onto the stack @@ -253,7 +253,7 @@ class TORCH_API Pickler { std::function type_renamer_; // List of all the types that it wrote, inspect from the IValues it wrote. - std::vector* memorized_class_types_; + std::vector* memoized_class_types_; // List of tensor storages to serialize in the same binary as the pickle data // similar to ivalues, they are memoized using BINPUT diff --git a/torch/csrc/jit/serialization/python_print.cpp b/torch/csrc/jit/serialization/python_print.cpp index e203a03a2e24..18d656c98f32 100644 --- a/torch/csrc/jit/serialization/python_print.cpp +++ b/torch/csrc/jit/serialization/python_print.cpp @@ -309,12 +309,12 @@ struct PythonPrintImpl { // because it doesn't hash any information about the tensors. // We will probably need to optimize this at some point using hashing. if (val.isTensor()) { - auto t = val.toTensor(); + auto& t = val.toTensor(); for (size_t i = 0; i < constant_table_.size(); ++i) { if (!constant_table_[i].isTensor()) { continue; } - auto t2 = constant_table_[i].toTensor(); + auto& t2 = constant_table_[i].toTensor(); if (t.options().type_equal(t2.options()) && t.equal(t2)) { return i; } @@ -1339,15 +1339,13 @@ struct PythonPrintImpl { body_ << "\"" << param << "\", "; } body_ << "]\n"; -#ifndef FBCODE_CAFFE2 - // Note: Forward compat gated. TODO: @voznesenskym to remove when ready. + indent(); body_ << "__buffers__ = ["; for (const auto& buffer : buffers) { body_ << "\"" << buffer << "\", "; } body_ << "]\n"; -#endif } for (size_t i = 0; i < numAttrs; i++) { diff --git a/torch/csrc/jit/serialization/unpickler.cpp b/torch/csrc/jit/serialization/unpickler.cpp index 15c4a89c2f1e..841e87592be9 100644 --- a/torch/csrc/jit/serialization/unpickler.cpp +++ b/torch/csrc/jit/serialization/unpickler.cpp @@ -29,7 +29,7 @@ static void restoreAccurateTypeTagsIfPossible(const IValue& root) { // objects it contains as attributes. // `IfPossible` - we can only do this recovery when we have an object as // the top-level unpickled thing (which is guaranteed for Modules, but -// not for torch.load/torch,save). Otherwise we do not know the types +// not for torch.load/torch.save). Otherwise we do not know the types // of the contained objects and cannot restore the tags. 
void restoreAccurateTypeTags(const IValue& root, const TypePtr& type_tag) { struct Work { @@ -632,7 +632,7 @@ void Unpickler::rebuildTensor(bool quantized) { auto tup = pop(stack_).toTuple(); const auto& elements = tup->elements(); size_t idx = 0; - auto storage_tensor = elements.at(idx++).toTensor(); + auto& storage_tensor = elements.at(idx++).toTensor(); int64_t storage_offset = elements.at(idx++).toInt(); std::vector size = tupleToIntList(elements.at(idx++)); std::vector stride = tupleToIntList(elements.at(idx++)); diff --git a/torch/csrc/jit/tensorexpr/eval.cpp b/torch/csrc/jit/tensorexpr/eval.cpp index e60a0bd704bf..186af3ca822f 100644 --- a/torch/csrc/jit/tensorexpr/eval.cpp +++ b/torch/csrc/jit/tensorexpr/eval.cpp @@ -834,8 +834,12 @@ class SimpleIREvaluatorImpl : public IRVisitor { return std::erfc(v); case kSqrt: return std::sqrt(v); - case kRsqrt: - return 1.0f / std::sqrt(v); + case kRsqrt: { + auto rsqrt = [](TInput v) __ubsan_ignore_float_divide_by_zero__ { + return 1.0f / std::sqrt(v); + }; + return rsqrt(v); + } case kCeil: return std::ceil(v); case kFloor: diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index 86f76b58d747..e6e31ba4d96c 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -964,7 +964,7 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { case aten::masked_fill: { return computeThreeOperand( - "aten::masked_fill", + "aten_masked_fill", v, [](const ExprHandle& input, const ExprHandle& mask, @@ -1282,8 +1282,9 @@ Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) { } break; case aten::rsqrt: { - return computeOneOperand( - "aten_rsqrt", v, [](const ExprHandle& a) { return rsqrt(a); }); + return computeOneOperand("aten_rsqrt", v, [](const ExprHandle& a) { + return rsqrt(promoteIntegerToDefaultType(a)); + }); } break; case aten::abs: { @@ -1531,12 +1532,12 @@ Stmt* TensorExprKernel::generateStmt(BackendType backendType) { root_stmt->accept(block_analysis.get()); } - // inlining output buffers duplicates computation. it slows down - // cpu code generation but is enabled on gpu because it avoids difficult - // synchronization logic across blocks. - bool inline_output_buffers = + // inlining output & intermediate buffers can duplicate computation. + // it slows down cpu code generation but is enabled on gpu because it avoids + // difficult synchronization logic across blocks. 
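On the eval.cpp change above: rsqrt(0.0) is 1.0f / sqrt(0.0f), a float division by zero that IEEE-754 defines as +inf, so the __ubsan_ignore_float_divide_by_zero__ attribute only silences the sanitizer report for a well-defined case rather than changing the result. A quick eager-mode check of that value (illustrative only):

import torch

# rsqrt(0.0) = 1/sqrt(0.0) = +inf; rsqrt(4.0) = 0.5
print(torch.rsqrt(torch.tensor([0.0, 4.0])))   # tensor([inf, 0.5000])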
+ bool allow_duplicated_work = (backendType == kCudaCodeGen || backendType == kBlockCodeGen); - l.inlineIntermediateBufs(inline_output_buffers); + l.inlineIntermediateBufs(allow_duplicated_work); if (backendType == kCudaCodeGen) { for (auto tensor : tensorOutputs_) { diff --git a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp index c5f94f16783d..adc3be984216 100644 --- a/torch/csrc/jit/tensorexpr/llvm_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/llvm_codegen.cpp @@ -14,7 +14,13 @@ #include #include #include + +#if LLVM_VERSION_MAJOR >= 10 +#include +#else #include +#endif + #include #include #include @@ -533,7 +539,11 @@ void LLVMCodeGenImpl::emitKernel( PM, asmStream, nullptr, +#if LLVM_VERSION_MAJOR >= 10 + llvm::CodeGenFileType::CGFT_AssemblyFile); +#else llvm::TargetMachine::CodeGenFileType::CGFT_AssemblyFile); +#endif PM.run(*module_); } GRAPH_DEBUG( diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index 2fed242cf4c6..c2b274a3c9bb 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -23,6 +24,28 @@ namespace torch { namespace jit { namespace tensorexpr { +class FunctionCallUseCount : public IRVisitor { + public: + std::unordered_map findUses(Stmt* s) { + s->accept(this); + return uses_; + } + + private: + void visit(const FunctionCall* v) override { + if (function_calls_[v->tensor()->buf()].insert(v).second) { + uses_[v->tensor()->buf()] = uses_[v->tensor()->buf()] + 1; + } + IRVisitor::visit(v); + } + + std::unordered_map uses_; + + // Sets of FunctionCalls in order to keep the results unique + std::unordered_map> + function_calls_; +}; + class IndexFlattener : public IRMutator { public: Stmt* flatten(Stmt* s) { @@ -751,28 +774,67 @@ bool LoopNest::computeInline(const Buf* b) { return true; } -void LoopNest::inlineIntermediateBufs(bool inline_output_buffers) { +// inlining buffers with multiple uses can create duplicated work, which can +// slow down cpu code generation but is enabled on gpu because it avoids +// difficult synchronization logic across blocks. Inlining trivial reads does +// not duplicate work +void LoopNest::inlineIntermediateBufs(bool allow_duplicated_work) { // We need to collect all intermediate buffers as the buffers to be inlined // before calling 'computeInline' since the buffers that are inlined are // erased from the set 'intermediate_bufs_' in that function. 
- std::unordered_set bufs_to_inline( - intermediate_bufs_.begin(), intermediate_bufs_.end()); + std::unordered_set bufs_to_inline; + + if (allow_duplicated_work) { + bufs_to_inline.insert(intermediate_bufs_.begin(), intermediate_bufs_.end()); + } else { + FunctionCallUseCount fcu; + auto function_call_uses = fcu.findUses(root_stmt_); + auto buf_load_store_uses = findLoadOrStoreUses(root_stmt_); + auto input_bufs = getInputBufs(); + + for (auto buf : intermediate_bufs_) { + TORCH_INTERNAL_ASSERT(buf_load_store_uses.count(buf)); + std::vector& uses = buf_load_store_uses[buf]; + auto stores = c10::filter( + uses, [](const BufLoadOrStoreUse& use) { return use.isStore; }); + + // if the intermediate is the buffer formed from reading in the input + // tensors, always inline, because we are not duplicating any work + // and avoiding an intermediary buffer + if (stores.size() == 1) { + auto store = dynamic_cast(stores[0].s); + auto input_as_load = dynamic_cast(store->value()); + if (input_as_load && input_bufs.count(input_as_load->buf())) { + bufs_to_inline.insert(buf); + continue; + } + } - // inlining output buffers duplicates computation. it slows down - // cpu code generation but is enabled on gpu because it avoids difficult - // synchronization logic across blocks. - if (inline_output_buffers) { + // all bufs will have at least one store (if they have > 1 they can't be + // inlined anyway) + size_t reads = uses.size() - 1; + size_t function_call_reads = function_call_uses[buf]; + // if only one read, we can inline it without duplicating work + if ((reads + function_call_reads) <= 1) { + bufs_to_inline.insert(buf); + } + } + } + + if (allow_duplicated_work) { bufs_to_inline.insert(output_bufs_.begin(), output_bufs_.end()); } + for (auto b : bufs_to_inline) { computeInline(b); } } // TODO: Unify with DepTracker -class UseFinder : public IRVisitor { +class LoadOrStoreUseFinder : public IRVisitor { public: - std::unordered_map> findUses(Stmt* s) { + std::unordered_map> findUses( + Stmt* s) { uses_.clear(); s->accept(this); return uses_; @@ -794,15 +856,16 @@ class UseFinder : public IRVisitor { } Stmt* last_stmt_ = nullptr; - std::unordered_map> uses_; + std::unordered_map> uses_; // Sets of loads and stores in order to keep the results unique std::unordered_map> loads_; std::unordered_map> stores_; }; -std::unordered_map> findUses(Stmt* s) { - UseFinder uf; +std::unordered_map> +findLoadOrStoreUses(Stmt* s) { + LoadOrStoreUseFinder uf; return uf.findUses(s); } @@ -828,7 +891,7 @@ class ContainedStmtsFinder : public IRVisitor { std::unordered_set contained_; }; -bool containsAll(const std::vector& uses, Block* b) { +bool containsAll(const std::vector& uses, Block* b) { std::unordered_set not_found; for (auto use : uses) { not_found.insert(use.s); @@ -852,7 +915,7 @@ Block* findParentBlock(Stmt* s) { return nullptr; } -Block* findLowestContainingBlock(const std::vector& uses) { +Block* findLowestContainingBlock(const std::vector& uses) { // TODO: we're not using the most efficient algorithm here for simplicity. // Replace with something more performant in case it becomes a bottleneck. Block* b = findParentBlock(uses[0].s); @@ -872,7 +935,8 @@ Stmt* LoopNest::insertAllocFree(Stmt* stmt) { b = new Block({stmt}); } - std::unordered_map> uses = findUses(stmt); + std::unordered_map> uses = + findLoadOrStoreUses(stmt); // Insert allocations and frees for temporary buffers in the innermost // possible scope.
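The new else-branch above only inlines an intermediate buffer when doing so cannot duplicate work. A self-contained restatement of that decision rule, using hypothetical dict-based bookkeeping rather than the real BufLoadOrStoreUse/IR classes:

def should_inline(buf, load_store_uses, function_call_reads, input_bufs):
    # load_store_uses[buf] is a list of {"is_store": bool, "loads_from": ...} records;
    # this stands in for the BufLoadOrStoreUse bookkeeping and is not the real API.
    uses = load_store_uses[buf]
    stores = [u for u in uses if u["is_store"]]
    # A buffer whose single store just forwards a load of an input buffer is
    # always safe to inline: nothing is recomputed, one copy is removed.
    if len(stores) == 1 and stores[0].get("loads_from") in input_bufs:
        return True
    # Every buffer considered here has exactly one store, so the remaining uses
    # are reads; inline only if there is at most one read in total.
    reads = len(uses) - 1
    return reads + function_call_reads.get(buf, 0) <= 1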
for (const Buf* buf : intermediate_bufs_) { diff --git a/torch/csrc/jit/tensorexpr/loopnest.h b/torch/csrc/jit/tensorexpr/loopnest.h index 0a588f6a95e4..962d69f0458d 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.h +++ b/torch/csrc/jit/tensorexpr/loopnest.h @@ -53,7 +53,7 @@ class TORCH_API LoopNest { bool computeInline(Stmt* s); bool computeInline(const Buf* b); - void inlineIntermediateBufs(bool inline_output_buffers); + void inlineIntermediateBufs(bool allow_duplicated_work); static void splitWithTail(For* f, int factor); static void splitWithTail( @@ -141,7 +141,7 @@ TORCH_API Stmt* FlattenIndexes(Stmt* s); // TODO: Revisit this once we decide on how dependencies analysis should look // like. Maybe we would choose to use a different API and BufUse would be // removed, or if we decide to keep it we need to properly document its API. -struct BufUse { +struct BufLoadOrStoreUse { Stmt* s; bool isStore; }; @@ -152,7 +152,8 @@ struct BufUse { * in the vectors reflects the order in which the uses appear in the given * statement. */ -std::unordered_map> findUses(Stmt* s); +std::unordered_map> +findLoadOrStoreUses(Stmt* s); } // namespace tensorexpr } // namespace jit diff --git a/torch/csrc/jit/testing/file_check.cpp b/torch/csrc/jit/testing/file_check.cpp index 83b54397b01c..f5e96a501bfd 100644 --- a/torch/csrc/jit/testing/file_check.cpp +++ b/torch/csrc/jit/testing/file_check.cpp @@ -548,7 +548,11 @@ FileCheck* FileCheck::check_count( const std::string& str, size_t count, bool exactly) { - fcImpl->addCheck(CHECK_COUNT, str, count); + TORCH_INTERNAL_ASSERT( + count != 0 || exactly, "Count == 0 && !exactly doesn't do anything"); + if (count) { + fcImpl->addCheck(CHECK_COUNT, str, count); + } if (exactly) { fcImpl->addCheck(CHECK_NOT, str); } diff --git a/torch/csrc/utils/out_types.cpp b/torch/csrc/utils/out_types.cpp new file mode 100644 index 000000000000..0ceeb43bd1f8 --- /dev/null +++ b/torch/csrc/utils/out_types.cpp @@ -0,0 +1,39 @@ +#include + +namespace torch { +namespace utils { + +// Used by python binding codegen to ensure any TensorOptions arguments are consistent +// with the out tensor's options +void check_out_type_matches(const at::Tensor& result, + at::ScalarType scalarType, bool scalarType_is_none, + c10::optional layout, + const at::Device& device, bool device_is_none) { + if (scalarType_is_none && !layout && device_is_none) { // common case + return; + } + if (!scalarType_is_none && result.scalar_type() != scalarType) { + AT_ERROR( + "dtype ", scalarType, + " does not match dtype of out parameter (", result.scalar_type(), ")"); + } + auto scalarType_arg = scalarType_is_none ? result.scalar_type() : scalarType; + auto device_type_arg = device_is_none ? 
result.device().type() : device.type(); + if (result.scalar_type() != scalarType_arg) { + AT_ERROR( + "scalar type ", scalarType_arg, + " does not match scalar type of out parameter (", result.scalar_type(), ")"); + } + if (layout && result.layout() != *layout) { + AT_ERROR( + "layout ", *layout, + " does not match layout of out parameter (", result.layout(), ")"); + } + if (result.device().type() != device_type_arg) { + AT_ERROR( + "device type ", device_type_arg, + " does not match device type of out parameter (", result.device().type(), ")"); + } +} + +}} diff --git a/torch/csrc/utils/out_types.h b/torch/csrc/utils/out_types.h new file mode 100644 index 000000000000..adc3686a6b97 --- /dev/null +++ b/torch/csrc/utils/out_types.h @@ -0,0 +1,14 @@ +#pragma once + +#include + +namespace torch { +namespace utils { + +TORCH_API void check_out_type_matches( + const at::Tensor& result, + at::ScalarType scalarType, bool scalarType_is_none, + c10::optional layout, + const at::Device& device, bool device_is_none); + +}} diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp index c7fdf844945e..ee3a0bc71f2f 100644 --- a/torch/csrc/utils/python_arg_parser.cpp +++ b/torch/csrc/utils/python_arg_parser.cpp @@ -24,6 +24,7 @@ static std::unordered_map type_map = { {"double", ParameterType::DOUBLE}, {"complex", ParameterType::COMPLEX}, {"TensorList", ParameterType::TENSOR_LIST}, + {"c10::List>", ParameterType::TENSOR_LIST}, {"IntArrayRef", ParameterType::INT_LIST}, {"ArrayRef", ParameterType::FLOAT_LIST}, {"Generator", ParameterType::GENERATOR}, @@ -390,7 +391,7 @@ bool is_float_or_complex_list(PyObject* obj) { } auto size = tuple ? PyTuple_GET_SIZE(obj) : PyList_GET_SIZE(obj); - if (size > 0) { + if (size > 0) { PyObject* iobj = tuple ? PyTuple_GET_ITEM(obj, 0) : PyList_GET_ITEM(obj, 0); if (!THPUtils_checkDouble(iobj) && !PyComplex_Check(iobj)) { return false; diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index ccf3ba6b42c4..9fa490139cbd 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -160,6 +160,7 @@ struct PythonArgs { inline at::Scalar scalarWithDefault(int i, at::Scalar default_scalar); inline std::vector scalarlist(int i); inline std::vector tensorlist(int i); + inline torch::List> list_of_optional_tensors(int i); template inline std::array tensorlist_n(int i); inline std::vector intlist(int i); @@ -327,6 +328,22 @@ inline std::vector PythonArgs::tensorlist(int i) { return res; } +inline torch::List> PythonArgs::list_of_optional_tensors(int i) { + if (!args[i]) return torch::List>(); + auto tuple = six::isTuple(args[i]); + THPObjectPtr arg = six::maybeAsTuple(args[i]); + auto size = tuple ? PyTuple_GET_SIZE(arg.get()) : PyList_GET_SIZE(arg.get()); + torch::List> res; + res.reserve(size); + for (int idx = 0; idx < size; idx++) { + PyObject* obj = tuple ? 
PyTuple_GET_ITEM(arg.get(), idx) : PyList_GET_ITEM(arg.get(), idx); + // This is checked by the argument parser so it's safe to cast without checking + // if this is a tensor first + res.push_back(reinterpret_cast(obj)->cdata); + } + return res; +} + template inline std::array PythonArgs::tensorlist_n(int i) { auto res = std::array(); diff --git a/torch/csrc/utils/python_compat.h b/torch/csrc/utils/python_compat.h index 28d990c64c42..7e1cb0c4f92d 100644 --- a/torch/csrc/utils/python_compat.h +++ b/torch/csrc/utils/python_compat.h @@ -63,20 +63,5 @@ __PySlice_Unpack(PyObject *_r, (PySlice_Unpack(SLICE, START, STOP, STEP) == 0) #endif -// https://bugsfiles.kde.org/attachment.cgi?id=61186 -#if PY_VERSION_HEX >= 0x03020000 #define THPUtils_parseSlice(SLICE, LEN, START, STOP, LENGTH, STEP) \ (PySlice_GetIndicesEx(SLICE, LEN, START, STOP, LENGTH, STEP) == 0) -#else -#define THPUtils_parseSlice(SLICE, LEN, START, STOP, LENGTH, STEP) \ - (PySlice_GetIndicesEx((PySliceObject*)SLICE, LEN, START, STOP, LENGTH, STEP) == 0) -#endif - -// This function was introduced in Python 3.4 -#if PY_VERSION_HEX < 0x03040000 -inline int -PyGILState_Check() { - PyThreadState * tstate = _PyThreadState_Current; - return tstate && (tstate == PyGILState_GetThisThreadState()); -} -#endif diff --git a/torch/csrc/utils/six.h b/torch/csrc/utils/six.h index 932f0bf61a29..b83e60c77cf3 100644 --- a/torch/csrc/utils/six.h +++ b/torch/csrc/utils/six.h @@ -23,11 +23,7 @@ inline bool isTuple(pybind11::handle input) { if (PyTuple_Check(input.ptr())) { return true; } -#if PY_MAJOR_VERSION == 2 - return isStructSeq(input); -#else return false; -#endif } inline bool isTuple(PyObject* obj) { @@ -40,12 +36,8 @@ inline bool isTuple(PyObject* obj) { // But on Python 2, structseq is not a subtype of tuple, so we need to manually create a // new tuple object from structseq. inline THPObjectPtr maybeAsTuple(PyStructSequence *obj) { -#if PY_MAJOR_VERSION == 2 - return THPObjectPtr(torch::utils::structseq_slice(obj, 0, Py_SIZE(obj))); -#else Py_INCREF(obj); return THPObjectPtr((PyObject *)obj); -#endif } inline THPObjectPtr maybeAsTuple(PyObject *obj) { diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index a4068ac6d7f3..e59c798a59be 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -153,15 +153,9 @@ def _lazy_init(): # immediately, while we are still guaranteed to have the GIL, because some # of the C calls we make below will release the GIL if _is_in_bad_fork(): - from sys import version_info - if version_info < (3, 4): - msg = ("To use CUDA with multiprocessing, you must use Python " - "3.4+ and the 'spawn' start method") - else: - msg = ("To use CUDA with multiprocessing, you must use the " - "'spawn' start method") raise RuntimeError( - "Cannot re-initialize CUDA in forked subprocess. " + msg) + "Cannot re-initialize CUDA in forked subprocess. To use CUDA with " + "multiprocessing, you must use the 'spawn' start method") if not hasattr(torch._C, '_cuda_getDeviceCount'): raise AssertionError("Torch not compiled with CUDA enabled") if _cudart is None: @@ -210,7 +204,7 @@ def check_error(res: int) -> None: class device(object): r"""Context-manager that changes the selected device. - Arguments: + Args: device (torch.device or int): device index to select. It's a no-op if this argument is a negative integer or ``None``. """ @@ -239,7 +233,7 @@ class device_of(device): You can use both tensors and storages as arguments. If a given object is not allocated on a GPU, this is a no-op. 
- Arguments: + Args: obj (Tensor or Storage): object allocated on the selected device. """ @@ -254,7 +248,7 @@ def set_device(device: _device_t) -> None: Usage of this function is discouraged in favor of :any:`device`. In most cases it's better to use ``CUDA_VISIBLE_DEVICES`` environmental variable. - Arguments: + Args: device (torch.device or int): selected device. This function is a no-op if this argument is negative. """ @@ -266,11 +260,14 @@ def set_device(device: _device_t) -> None: def get_device_name(device: Optional[_device_t] = None) -> str: r"""Gets the name of a device. - Arguments: + Args: device (torch.device or int, optional): device for which to return the name. This function is a no-op if this argument is a negative integer. It uses the current device, given by :func:`~torch.cuda.current_device`, if :attr:`device` is ``None`` (default). + + Returns: + str: the name of the device """ return get_device_properties(device).name @@ -278,7 +275,7 @@ def get_device_name(device: Optional[_device_t] = None) -> str: def get_device_capability(device: Optional[_device_t] = None) -> Tuple[int, int]: r"""Gets the cuda capability of a device. - Arguments: + Args: device (torch.device or int, optional): device for which to return the device capability. This function is a no-op if this argument is a negative integer. It uses the current device, given by @@ -293,6 +290,15 @@ def get_device_capability(device: Optional[_device_t] = None) -> Tuple[int, int] def get_device_properties(device: _device_t) -> _CudaDeviceProperties: + r"""Gets the properties of a device. + + Args: + device (torch.device or int or str): device for which to return the + properties of the device. + + Returns: + _CudaDeviceProperties: the properties of the device + """ _lazy_init() # will define _get_device_properties device = _get_device_index(device, optional=True) if device < 0 or device >= device_count(): @@ -307,7 +313,7 @@ def stream(stream): All CUDA kernels queued within its context will be enqueued on a selected stream. - Arguments: + Args: stream (Stream): selected stream. This manager is a no-op if it's ``None``. @@ -370,7 +376,7 @@ def current_device() -> int: def synchronize(device: _device_t = None) -> None: r"""Waits for all kernels in all streams on a CUDA device to complete. - Arguments: + Args: device (torch.device or int, optional): device for which to synchronize. It uses the current device, given by :func:`~torch.cuda.current_device`, if :attr:`device` is ``None`` (default). @@ -396,7 +402,7 @@ def ipc_collect(): def current_stream(device: Optional[_device_t] = None) -> Stream: r"""Returns the currently selected :class:`Stream` for a given device. - Arguments: + Args: device (torch.device or int, optional): selected device. Returns the currently selected :class:`Stream` for the current device, given by :func:`~torch.cuda.current_device`, if :attr:`device` is ``None`` @@ -410,7 +416,7 @@ def current_stream(device: Optional[_device_t] = None) -> Stream: def default_stream(device: Optional[_device_t] = None) -> Stream: r"""Returns the default :class:`Stream` for a given device. - Arguments: + Args: device (torch.device or int, optional): selected device. 
Returns the default :class:`Stream` for the current device, given by :func:`~torch.cuda.current_device`, if :attr:`device` is ``None`` diff --git a/torch/cuda/amp/autocast_mode.py b/torch/cuda/amp/autocast_mode.py index ce2d38c9d4f0..99fdf6e03e83 100644 --- a/torch/cuda/amp/autocast_mode.py +++ b/torch/cuda/amp/autocast_mode.py @@ -109,7 +109,7 @@ def forward(self, input): :class:`torch.nn.parallel.DistributedDataParallel` when used with more than one GPU per process (see :ref:`Working with Multiple GPUs`). - Arguments: + Args: enabled(bool, optional, default=True): Whether autocasting should be enabled in the region. """ def __init__(self, enabled=True): @@ -176,7 +176,7 @@ def custom_fwd(fwd=None, **kwargs): Helper decorator for ``forward`` methods of custom autograd functions (subclasses of :class:`torch.autograd.Function`). See the :ref:`example page` for more detail. - Arguments: + Args: cast_inputs (:class:`torch.dtype` or None, optional, default=None): If not ``None``, when ``forward`` runs in an autocast-enabled region, casts incoming floating-point CUDA Tensors to the target dtype (non-floating-point Tensors are not affected), diff --git a/torch/cuda/amp/grad_scaler.py b/torch/cuda/amp/grad_scaler.py index dc35d21daf00..522b44cbf246 100644 --- a/torch/cuda/amp/grad_scaler.py +++ b/torch/cuda/amp/grad_scaler.py @@ -94,7 +94,7 @@ class GradScaler(object): value calibrates. ``scaler.step`` will skip the underlying ``optimizer.step()`` for these iterations. After that, step skipping should occur rarely (once every few hundred or thousand iterations). - Arguments: + Args: init_scale (float, optional, default=2.**16): Initial scale factor. growth_factor (float, optional, default=2.0): Factor by which the scale is multiplied during :meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations. @@ -150,7 +150,7 @@ def scale(self, outputs): Returns scaled outputs. If this instance of :class:`GradScaler` is not enabled, outputs are returned unmodified. - Arguments: + Args: outputs (Tensor or iterable of Tensors): Outputs to scale. """ if not self._enabled: @@ -245,7 +245,7 @@ def unscale_(self, optimizer): scaler.step(optimizer) scaler.update() - Arguments: + Args: optimizer (torch.optim.Optimizer): Optimizer that owns the gradients to be unscaled. .. note:: @@ -292,7 +292,7 @@ def step(self, optimizer, *args, **kwargs): Returns the return value of ``optimizer.step(*args, **kwargs)``. - Arguments: + Args: optimizer (torch.optim.Optimizer): Optimizer that applies the gradients. args: Any arguments. kwargs: Any keyword arguments. @@ -346,7 +346,7 @@ def update(self, new_scale=None): Passing ``new_scale`` sets the scale directly. - Arguments: + Args: new_scale (float or :class:`torch.cuda.FloatTensor`, optional, default=None): New scale factor. .. warning:: @@ -415,7 +415,7 @@ def get_growth_factor(self): def set_growth_factor(self, new_factor): r""" - Arguments: + Args: new_scale (float): Value to use as the new scale growth factor. """ self._growth_factor = new_factor @@ -428,7 +428,7 @@ def get_backoff_factor(self): def set_backoff_factor(self, new_factor): r""" - Arguments: + Args: new_scale (float): Value to use as the new scale backoff factor. """ self._backoff_factor = new_factor @@ -441,7 +441,7 @@ def get_growth_interval(self): def set_growth_interval(self, new_interval): r""" - Arguments: + Args: new_interval (int): Value to use as the new growth interval. 
""" self._growth_interval = new_interval @@ -484,7 +484,7 @@ def load_state_dict(self, state_dict): r""" Loads the scaler state. If this instance is disabled, :meth:`load_state_dict` is a no-op. - Arguments: + Args: state_dict(dict): scaler state. Should be an object returned from a call to :meth:`state_dict`. """ if not self._enabled: diff --git a/torch/cuda/memory.py b/torch/cuda/memory.py index c67742820076..c0bde95de741 100644 --- a/torch/cuda/memory.py +++ b/torch/cuda/memory.py @@ -29,7 +29,7 @@ def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None frameworks. Allocated memory is released through :func:`~torch.cuda.caching_allocator_delete`. - Arguments: + Args: size (int): number of bytes to be allocated. device (torch.device or int, optional): selected device. If it is ``None`` the default CUDA device is used. @@ -62,7 +62,7 @@ def caching_allocator_delete(mem_ptr): is freed here. The associated device and stream are tracked inside the allocator. - Arguments: + Args: mem_ptr (int): memory address to be freed by the allocator. .. note:: @@ -79,7 +79,7 @@ def set_per_process_memory_fraction(fraction, device: Union[Device, int] = None) If trying to allocate more than the allowed value in a process, will raise an out of memory error in allocator. - Arguments: + Args: fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. device (torch.device or int, optional): selected device. If it is ``None`` the default CUDA device is used. @@ -164,7 +164,7 @@ def memory_stats(device: Union[Device, int] = None) -> Dict[str, Any]: result in a cache flush and retry. - ``"num_ooms"``: number of out-of-memory errors thrown. - Arguments: + Args: device (torch.device or int, optional): selected device. Returns statistics for the current device, given by :func:`~torch.cuda.current_device`, if :attr:`device` is ``None`` (default). @@ -204,7 +204,7 @@ def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: the `"allocated"` and `"freed"` keys in each individual stat dict, as well as `"num_alloc_retries"` and `"num_ooms"`. - Arguments: + Args: device (torch.device or int, optional): selected device. Returns statistic for the current device, given by :func:`~torch.cuda.current_device`, if :attr:`device` is ``None`` (default). @@ -223,7 +223,7 @@ def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the `"peak"` key in each individual stat dict. - Arguments: + Args: device (torch.device or int, optional): selected device. Returns statistic for the current device, given by :func:`~torch.cuda.current_device`, if :attr:`device` is ``None`` (default). @@ -242,7 +242,7 @@ def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: See :func:`~torch.cuda.max_memory_allocated` for details. - Arguments: + Args: device (torch.device or int, optional): selected device. Returns statistic for the current device, given by :func:`~torch.cuda.current_device`, if :attr:`device` is ``None`` (default). @@ -268,7 +268,7 @@ def reset_max_memory_cached(device: Union[Device, int] = None) -> None: See :func:`~torch.cuda.max_memory_cached` for details. - Arguments: + Args: device (torch.device or int, optional): selected device. Returns statistic for the current device, given by :func:`~torch.cuda.current_device`, if :attr:`device` is ``None`` (default). 
@@ -292,7 +292,7 @@ def memory_allocated(device: Union[Device, int] = None) -> int: r"""Returns the current GPU memory occupied by tensors in bytes for a given device. - Arguments: + Args: device (torch.device or int, optional): selected device. Returns statistic for the current device, given by :func:`~torch.cuda.current_device`, if :attr:`device` is ``None`` (default). @@ -316,7 +316,7 @@ def max_memory_allocated(device: Union[Device, int] = None) -> int: functions can measure the peak allocated memory usage of each iteration in a training loop. - Arguments: + Args: device (torch.device or int, optional): selected device. Returns statistic for the current device, given by :func:`~torch.cuda.current_device`, if :attr:`device` is ``None`` (default). @@ -332,7 +332,7 @@ def memory_reserved(device: Union[Device, int] = None) -> int: r"""Returns the current GPU memory managed by the caching allocator in bytes for a given device. - Arguments: + Args: device (torch.device or int, optional): selected device. Returns statistic for the current device, given by :func:`~torch.cuda.current_device`, if :attr:`device` is ``None`` (default). @@ -354,7 +354,7 @@ def max_memory_reserved(device: Union[Device, int] = None) -> int: can measure the peak cached memory amount of each iteration in a training loop. - Arguments: + Args: device (torch.device or int, optional): selected device. Returns statistic for the current device, given by :func:`~torch.cuda.current_device`, if :attr:`device` is ``None`` (default). @@ -402,7 +402,7 @@ def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) This can be useful to display periodically during training, or when handling out-of-memory exceptions. - Arguments: + Args: device (torch.device or int, optional): selected device. Returns printout for the current device, given by :func:`~torch.cuda.current_device`, if :attr:`device` is ``None`` (default). @@ -503,7 +503,7 @@ def list_gpu_processes(device: Union[Device, int] = None) -> str: This can be useful to display periodically during training, or when handling out-of-memory exceptions. - Arguments: + Args: device (torch.device or int, optional): selected device. Returns printout for the current device, given by :func:`~torch.cuda.current_device`, if :attr:`device` is ``None`` (default). diff --git a/torch/cuda/nvtx.py b/torch/cuda/nvtx.py index f2cb1625e388..4265eac9707d 100644 --- a/torch/cuda/nvtx.py +++ b/torch/cuda/nvtx.py @@ -22,7 +22,7 @@ def range_push(msg): Pushes a range onto a stack of nested range span. Returns zero-based depth of the range that is started. - Arguments: + Args: msg (string): ASCII message to associate with range """ return _nvtx.rangePushA(msg) @@ -40,7 +40,7 @@ def mark(msg): """ Describe an instantaneous event that occurred at some point. - Arguments: + Args: msg (string): ASCII message to associate with the event. """ return _nvtx.markA(msg) @@ -53,7 +53,7 @@ def range(msg, *args, **kwargs): of its scope, and pops it at the end. If extra arguments are given, they are passed as arguments to msg.format(). - Arguments: + Args: msg (string): message to associate with the range """ range_push(msg.format(*args, **kwargs)) diff --git a/torch/cuda/streams.py b/torch/cuda/streams.py index 9c9c30a7ff29..5371f9ca3463 100644 --- a/torch/cuda/streams.py +++ b/torch/cuda/streams.py @@ -17,7 +17,7 @@ class Stream(torch._C._CudaStreamBase): device, independent from other streams. See :ref:`cuda-semantics` for details. 
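A minimal sketch of submitting work on a side stream with the ``Stream`` API described here, assuming a CUDA device is available; shapes are illustrative::

    s = torch.cuda.Stream()
    with torch.cuda.stream(s):
        a = torch.randn(64, 64, device='cuda')
        b = torch.mm(a, a)                         # issued on stream s
    torch.cuda.current_stream().wait_stream(s)     # order later work after s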
- Arguments: + Args: device(torch.device or int, optional): a device on which to allocate the stream. If :attr:`device` is ``None`` (default) or a negative integer, this will use the current device. @@ -36,7 +36,7 @@ def __new__(cls, device=None, priority=0, **kwargs): def wait_event(self, event): r"""Makes all future work submitted to the stream wait for an event. - Arguments: + Args: event (Event): an event to wait for. .. note:: This is a wrapper around ``cudaStreamWaitEvent()``: see @@ -56,7 +56,7 @@ def wait_stream(self, stream): All future work submitted to this stream will wait until all kernels submitted to a given stream at the time of call complete. - Arguments: + Args: stream (Stream): a stream to synchronize. .. note:: This function returns without waiting for currently enqueued @@ -67,7 +67,7 @@ def wait_stream(self, stream): def record_event(self, event=None): r"""Records an event. - Arguments: + Args: event (Event, optional): event to record. If not given, a new one will be allocated. @@ -123,7 +123,7 @@ class Event(torch._C._CudaEventBase): same device may record the event. However, streams on any device can wait on the event. - Arguments: + Args: enable_timing (bool, optional): indicates if the event should measure time (default: ``False``) blocking (bool, optional): if ``True``, :meth:`wait` will be blocking (default: ``False``) diff --git a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py index 590ce2054c03..b1be0b52c9e6 100644 --- a/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py +++ b/torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py @@ -111,7 +111,7 @@ def powerSGD_hook( one left multiplication and one right multiplication. For warm start, can take one such step at a time, and alternate between them. - Arguments: + Args: state (PowerSGDState): State information to configure the compression rate and support error feedback, warm start, etc. bucket (dist._GradBucket): Bucket that stores a 1D flattened gradient tensor that batches multiple per-variable tensors. Note that since DDP comm hook only supports single process single device mode at this time, @@ -305,7 +305,7 @@ def decompress(fut): torch.cuda.synchronize(device) if state.use_error_feedback: - # Memorize the local errors. + # memoize the local errors. state.error_dict[bucket_index] = input_tensor_cp - input_tensor if not state.warm_start: state.p_memory_dict.clear() @@ -346,7 +346,7 @@ def batched_powerSGD_hook( one left multiplication and one right multiplication. For warm start, can take one such step at a time, and alternate between them. - Arguments: + Args: state (PowerSGDState): State information to configure the compression rate and support error feedback, warm start, etc. bucket (dist._GradBucket): Bucket that stores a 1D flattened gradient tensor that batches multiple per-variable tensors. Note that since DDP comm hook only supports single process single device mode at this time, @@ -487,7 +487,7 @@ def decompress(fut): ) if state.use_error_feedback: - # Memorize the local errors. + # memoize the local errors. 
state.error_dict[bucket_index] = input_tensor_cp - input_tensor if torch.cuda.is_available(): torch.cuda.synchronize(device) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index a11a12c6bf82..5b300452f6d3 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1,8 +1,8 @@ +import contextlib +import logging import pickle import torch import warnings -import contextlib -import sys import time from torch._six import string_classes from datetime import timedelta @@ -17,8 +17,8 @@ AllreduceOptions, AllreduceCoalescedOptions, AllToAllOptions, + BarrierOptions, BroadcastOptions, - FileStore, GatherOptions, PrefixStore, ProcessGroup, @@ -27,15 +27,8 @@ ReduceScatterOptions, ScatterOptions, Store, - TCPStore, ) -if sys.platform != 'win32': - from torch._C._distributed_c10d import ( - HashStore, - ) - - _MPI_AVAILABLE = True _NCCL_AVAILABLE = True _GLOO_AVAILABLE = True @@ -114,7 +107,7 @@ def register_backend(cls, name, func): This class method is used by 3rd party cpp extension to register new backend. - Arguments: + Args: name (str): Backend name matching with the one in `init_process_group()`. func (function): Function handler that instantiates the backend. The function should be implemented in the backend cpp extension @@ -191,16 +184,35 @@ def _store_based_barrier(rank, store, timeout): """ store_key = "{}:{}".format(STORE_BASED_BARRIER_PREFIX, _group_count) store.add(store_key, 1) + logging.info('Added key: {} to store for rank: {}'.format(store_key, rank)) # Now wait for all workers to check in with the store. world_size = get_world_size() - worker_count = int(store.get(store_key)) + # Use 'add' instead of 'get' since for some store implementations 'add' + # doesn't work well with 'get'. Ideally the store implementations should + # be fixed, but for backward compatiblity reasons it is risky to change + # the store implementations. Once, we completely migrate away from these + # legacy stores, we can use 'get' here instead. + worker_count = store.add(store_key, 0) start = time.time() + log_time = time.time() while worker_count != world_size: time.sleep(0.01) - worker_count = int(store.get(store_key)) + worker_count = store.add(store_key, 0) + + # Print status periodically to keep track. + if timedelta(seconds=(time.time() - log_time)) > timedelta(seconds=10): + logging.info( + "Waiting in store based barrier to initialize process group for " + "rank: {}, key: {} (world_size={}, worker_count={}, timeout={})".format( + rank, store_key, world_size, worker_count, timeout)) + log_time = time.time() + if timedelta(seconds=(time.time() - start)) > timeout: - raise RuntimeError("Timed out initializing process group") + raise RuntimeError( + "Timed out initializing process group in store based barrier on " + "rank: {}, for key: {} (world_size={}, worker_count={}, timeout={})".format( + rank, store_key, world_size, worker_count, timeout)) def _rank_not_in_group(group: ProcessGroup): """ @@ -356,7 +368,7 @@ def get_backend(group=None): """ Returns the backend of the given process group. - Arguments: + Args: group (ProcessGroup, optional): The process group to work on. The default is the general main process group. If another specific group is specified, the calling process must be part of :attr:`group`. @@ -396,7 +408,7 @@ def init_process_group(backend, If neither is specified, ``init_method`` is assumed to be "env://". - Arguments: + Args: backend (str or Backend): The backend to use. 
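The store-based barrier rewritten above reduces to a shared counter: every rank increments the key once, then polls it with ``add(key, 0)`` (a read that works uniformly across store implementations) until all ranks have checked in. A hedged sketch of that idea, with an illustrative key name::

    store.add("store_barrier_key", 1)                        # this rank checks in
    while store.add("store_barrier_key", 0) != world_size:   # read without modifying
        time.sleep(0.01)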
Depending on build-time configurations, valid values include ``mpi``, ``gloo``, and ``nccl``. This field should be given as a lowercase string @@ -504,12 +516,8 @@ def init_process_group(backend, # barrier at the end to ensure that once we return from this method, all # process groups including global variables are updated correctly on all # ranks. - if backend == Backend.MPI or not ( - isinstance(store, TCPStore) or - isinstance(store, FileStore) or - (sys.platform != 'win32' and isinstance(store, HashStore)) - ): - # MPI doesn't have store. + if backend == Backend.MPI: + # MPI backend doesn't use store. barrier() else: # Use store based barrier here since barrier() used a bunch of @@ -611,7 +619,7 @@ def destroy_process_group(group=None): """ Destroy a given process group, and deinitialize the distributed package - Arguments: + Args: group (ProcessGroup, optional): The process group to be destroyed, if group.WORLD is given, all process groups including the default one will @@ -665,7 +673,7 @@ def get_rank(group=None): process group. They are always consecutive integers ranging from 0 to ``world_size``. - Arguments: + Args: group (ProcessGroup, optional): The process group to work on. If None, the default process group will be used. @@ -688,7 +696,7 @@ def get_world_size(group=None): """ Returns the number of processes in the current process group - Arguments: + Args: group (ProcessGroup, optional): The process group to work on. If None, the default process group will be used. @@ -710,7 +718,7 @@ def isend(tensor, """ Sends a tensor asynchronously. - Arguments: + Args: tensor (Tensor): Tensor to send. dst (int): Destination rank. group (ProcessGroup, optional): The process group to work on. If None, @@ -741,7 +749,7 @@ def irecv(tensor, """ Receives a tensor asynchronously. - Arguments: + Args: tensor (Tensor): Tensor to fill with received data. src (int, optional): Source rank. Will receive from any process if unspecified. @@ -780,7 +788,7 @@ def send(tensor, """ Sends a tensor synchronously. - Arguments: + Args: tensor (Tensor): Tensor to send. dst (int): Destination rank. group (ProcessGroup, optional): The process group to work on. If None, @@ -807,7 +815,7 @@ def recv(tensor, """ Receives a tensor synchronously. - Arguments: + Args: tensor (Tensor): Tensor to fill with received data. src (int, optional): Source rank. Will receive from any process if unspecified. @@ -854,7 +862,7 @@ class P2POp(object): Process Group group, and tag. Instances of this class will be passed to ``batch_isend_irecv`` for point-to-point communications. - Arguments: + Args: op (callable): A function to send data to or receive data from a peer process. The type of ``op`` is either ``torch.distributed.isend`` or ``torch.distributed.irecv``. @@ -895,7 +903,7 @@ def batch_isend_irecv(p2p_op_list): Process each of the operations in p2p_op_list and return the corresponding requests. NCCL and Gloo backend are currently supported. - Arguments: + Args: p2p_op_list: A list of point-to-point operations(type of each operator is ``torch.distributed.P2POp``). The order of the isend/irecv in the list matters and it needs to match with corresponding isend/irecv on the @@ -955,7 +963,7 @@ def broadcast_multigpu(tensor_list, Only nccl and gloo backend are currently supported tensors should only be GPU tensors - Arguments: + Args: tensor_list (List[Tensor]): Tensors that participate in the collective operation. 
If ``src`` is the rank, then the specified ``src_tensor`` element of ``tensor_list`` (``tensor_list[src_tensor]``) will be @@ -1005,7 +1013,7 @@ def broadcast(tensor, ``tensor`` must have the same number of elements in all processes participating in the collective. - Arguments: + Args: tensor (Tensor): Data to be sent if ``src`` is the rank of current process, and tensor to be used to save received data otherwise. src (int): Source rank. @@ -1058,7 +1066,7 @@ def all_reduce_multigpu(tensor_list, Only nccl and gloo backend is currently supported tensors should only be GPU tensors - Arguments: + Args: tensor list (List[Tensor]): List of input and output tensors of the collective. The function operates in-place and requires that each tensor to be a GPU tensor on different GPUs. @@ -1107,7 +1115,7 @@ def all_reduce(tensor, Complex tensors are supported. - Arguments: + Args: tensor (Tensor): Input and output of the collective. The function operates in-place. op (optional): One of the values from @@ -1189,7 +1197,7 @@ def all_reduce_coalesced(tensors, Complex tensors are supported. - Arguments: + Args: tensors (List[Tensor]): Input and output of the collective. The function operates in-place. op (Optional[ReduceOp]): One of the values from @@ -1243,7 +1251,7 @@ def reduce_multigpu(tensor_list, Only nccl backend is currently supported tensors should only be GPU tensors - Arguments: + Args: tensor_list (List[Tensor]): Input and output GPU tensors of the collective. The function operates in-place. You also need to make sure that ``len(tensor_list)`` is the same for @@ -1295,7 +1303,7 @@ def reduce(tensor, Only the process with rank ``dst`` is going to receive the final result. - Arguments: + Args: tensor (Tensor): Input and output of the collective. The function operates in-place. dst (int): Destination rank @@ -1346,7 +1354,7 @@ def all_gather_multigpu(output_tensor_lists, Complex tensors are supported. - Arguments: + Args: output_tensor_lists (List[List[Tensor]]): Output lists. It should contain correctly-sized tensors on each GPU to be used for output of the collective, e.g. ``output_tensor_lists[i]`` contains the @@ -1417,7 +1425,7 @@ def all_gather_object(object_list, obj, group=None): :func:`all_gather`, but Python objects can be passed in. Note that the object must be picklable in order to be gathered. - Arguments: + Args: object_list (list[Any]): Output list. It should be correctly sized as the size of the group for this collective and will contain the output. object (Any): Pickable Python object to be broadcast from current process. @@ -1505,7 +1513,7 @@ def gather_object(obj, object_gather_list=None, dst=0, group=None): Similar to :func:`gather`, but Python objects can be passed in. Note that the object must be picklable in order to be gathered. - Arguments: + Args: obj (Any): Input object. Must be picklable. object_gather_list (list[Any]): Output list. On the ``dst`` rank, it should be correctly sized as the size of the group for this @@ -1606,7 +1614,7 @@ def broadcast_object_list(object_list, src, group=None): Note that all objects in ``object_list`` must be picklable in order to be broadcasted. - Arguments: + Args: object_list (List[Any]): List of input objects to broadcast. Each object must be picklable. Only objects on the ``src`` rank will be broadcast, but each rank must provide lists of equal sizes. @@ -1701,7 +1709,7 @@ def scatter_object_list( ``scatter_object_output_list``. Note that all objects in ``scatter_object_input_list`` must be picklable in order to be scattered. 
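For the object collectives documented above, a hedged usage sketch; the payload is illustrative::

    world_size = torch.distributed.get_world_size()
    gathered = [None] * world_size
    torch.distributed.all_gather_object(gathered, {"rank": torch.distributed.get_rank()})
    # gathered[i] now holds the object contributed by rank i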
- Arguments: + Args: scatter_object_output_list (List[Any]): Non-empty list whose first element will store the object scattered to this rank. scatter_object_input_list (List[Any]): List of input objects to scatter. @@ -1799,7 +1807,7 @@ def all_gather(tensor_list, Complex tensors are supported. - Arguments: + Args: tensor_list (list[Tensor]): Output list. It should contain correctly-sized tensors to be used for output of the collective. tensor (Tensor): Tensor to be broadcast from current process. @@ -1869,7 +1877,7 @@ def all_gather_coalesced(output_tensor_lists, Complex tensors are supported. - Arguments: + Args: output_tensor_lists (list[list[Tensor]]): Output list. It should contain correctly-sized tensors to be used for output of the collective. input_tensor_list (list[Tensor]): Tensors to be broadcast from @@ -1954,7 +1962,7 @@ def gather(tensor, """ Gathers a list of tensors in a single process. - Arguments: + Args: tensor (Tensor): Input tensor. gather_list (list[Tensor], optional): List of appropriately-sized tensors to use for gathered data (default is None, must be specified @@ -2013,7 +2021,7 @@ def scatter(tensor, Each process will receive exactly one tensor and store its data in the ``tensor`` argument. - Arguments: + Args: tensor (Tensor): Output tensor. scatter_list (list[Tensor]): List of tensors to scatter (default is None, must be specified on the source rank) @@ -2081,7 +2089,7 @@ def reduce_scatter_multigpu(output_tensor_list, Each tensor in ``output_tensor_list`` should reside on a separate GPU, as should each list of tensors in ``input_tensor_lists``. - Arguments: + Args: output_tensor_list (List[Tensor]): Output tensors (on different GPUs) to receive the result of the operation. @@ -2149,7 +2157,7 @@ def reduce_scatter(output, """ Reduces, then scatters a list of tensors to all processes in a group. - Arguments: + Args: output (Tensor): Output tensor. input_list (list[Tensor]): List of tensors to reduce and scatter. group (ProcessGroup, optional): The process group to work on. If None, @@ -2192,7 +2200,7 @@ def all_to_all_single(output, to all processes in a group. Then concatenate the received tensors from all the processes in the group and return single output tensor. - Arguments: + Args: output (Tensor): Gathered cancatenated output tensor. input (Tensor): Input tensor to scatter. output_split_sizes: (list[Int], optional): Output split sizes for dim 0 @@ -2285,7 +2293,7 @@ def all_to_all(output_tensor_list, Each process scatters list of input tensors to all processes in a group and return gathered list of tensors in output list. - Arguments: + Args: output_tensor_list (list[Tensor]): List of tensors to be gathered one per rank. input_tensor_list (list[Tensor]): List of tensors to scatter one per rank. @@ -2370,18 +2378,23 @@ def all_to_all(output_tensor_list, work.wait() + def barrier(group=GroupMember.WORLD, - async_op=False): + async_op=False, + device_ids=None): + """ Synchronizes all processes. This collective blocks processes until the whole group enters this function, if async_op is False, or if async work handle is called on wait(). - Arguments: + Args: group (ProcessGroup, optional): The process group to work on. If None, the default process group will be used. async_op (bool, optional): Whether this op should be an async op + device_ids ([int], optional): List of device/GPU ids. + Valid only for NCCL backend. Returns: Async work handle, if async_op is set to True. 
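A hedged usage sketch for the new ``device_ids`` argument to ``barrier``, which is only honored by the NCCL backend; passing the current device keeps NCCL from guessing one::

    if torch.distributed.get_backend() == torch.distributed.Backend.NCCL:
        torch.distributed.barrier(device_ids=[torch.cuda.current_device()])
    else:
        torch.distributed.barrier()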
@@ -2390,11 +2403,22 @@ def barrier(group=GroupMember.WORLD, if _rank_not_in_group(group): return + opts = BarrierOptions() + if device_ids is not None: + if get_backend(group) != Backend.NCCL: + raise RuntimeError("Function argument device_ids not supported " + "for the selected backend {}".format(get_backend(group))) + if isinstance(device_ids, list): + opts.device_ids = device_ids + else: + raise RuntimeError("Invalid function argument: " + "device_ids type should be List[int]") + if group is None: default_pg = _get_default_group() - work = default_pg.barrier() + work = default_pg.barrier(opts=opts) else: - work = group.barrier() + work = group.barrier(opts=opts) if async_op: return work @@ -2422,7 +2446,7 @@ def new_group(ranks=None, timeout=default_pg_timeout, backend=None): ia.com/deeplearning/nccl/user-guide/docs/usage/communicators.html#using -multiple-nccl-communicators-concurrently>`_ for more details. - Arguments: + Args: ranks (list[int]): List of ranks of group members. If ``None``, will be set to all ranks. Default is ``None``. timeout (timedelta, optional): Timeout for operations executed against @@ -2491,16 +2515,12 @@ def new_group(ranks=None, timeout=default_pg_timeout, backend=None): # barrier at the end to ensure that once we return from this method, all # process groups including global variables are updated correctly on all # ranks. - if backend == Backend.MPI or not ( - isinstance(default_store, TCPStore) or - isinstance(default_store, FileStore) or - (sys.platform != 'win32' and isinstance(default_store, HashStore)) - ): + if backend == Backend.MPI: # MPI doesn't have store. barrier() else: # Use store based barrier here since barrier() used a bunch of # default devices and messes up NCCL internal state. - _store_based_barrier(group_rank, default_store, timeout) + _store_based_barrier(global_rank, default_store, timeout) return pg diff --git a/torch/distributed/nn/api/remote_module.py b/torch/distributed/nn/api/remote_module.py index 678bbf6a96de..e6faf6d7accb 100644 --- a/torch/distributed/nn/api/remote_module.py +++ b/torch/distributed/nn/api/remote_module.py @@ -100,7 +100,7 @@ def __init__( ``def forward(input: Tensor) -> Tensor:`` and ``def forward_async(input: Tensor) -> Future[Tensor]:``. - Arguments: + Args: remote_device (str): Device on the destination worker where we‘d like to place this module. The format should be "/", where the device field can be parsed as torch.device type. E.g., "trainer0/cpu", "trainer0", "ps0/cuda:0". @@ -339,7 +339,7 @@ class RemoteModule(_RemoteModule): ``def forward(input: Tensor) -> Tensor:`` and ``def forward_async(input: Tensor) -> Future[Tensor]:``. - Arguments: + Args: remote_device (str): Device on the destination worker where we‘d like to place this module. The format should be "/", where the device field can be parsed as torch.device type. E.g., "trainer0/cpu", "trainer0", "ps0/cuda:0". diff --git a/torch/distributed/pipeline/sync/pipe.py b/torch/distributed/pipeline/sync/pipe.py index 6ee95f28df8b..d191d880d299 100644 --- a/torch/distributed/pipeline/sync/pipe.py +++ b/torch/distributed/pipeline/sync/pipe.py @@ -95,7 +95,7 @@ def _retrieve_device(module: nn.Module) -> torch.device: """Validates all parameters in the Module have the same device and returns the appropriate device. - Arguments: + Args: An ``nn.Module`` to process. Returns: @@ -152,7 +152,7 @@ class Pipe(Module): into an :class:`nn.Sequential ` module defining the desired order of execution. 
- Arguments: + Args: module (:class:`nn.Sequential `): sequential module to be parallelized using pipelining. Each module in the sequence has to have all of its parameters on a single @@ -339,7 +339,7 @@ def forward(self, input) -> RRef: # type: ignore size is less than ``chunks``, the number of micro-batches is equal to the batch size. - Arguments: + Args: input (torch.Tensor or sequence of :class:`~torch.Tensor`): input mini-batch Returns: diff --git a/torch/distributed/rendezvous.py b/torch/distributed/rendezvous.py index b3bedfbc76ca..3f04fa142a4b 100644 --- a/torch/distributed/rendezvous.py +++ b/torch/distributed/rendezvous.py @@ -31,7 +31,7 @@ def register_rendezvous_handler(scheme, handler): Pick a unique name and use the URL scheme to identify it when calling the `rendezvous()` function. - Arguments: + Args: scheme (str): URL scheme to identify your rendezvous handler. handler (function): Handler that is invoked when the `rendezvous()` function is called with a URL that uses diff --git a/torch/distributed/rpc/__init__.py b/torch/distributed/rpc/__init__.py index c405ac987f0a..e65c6efcd7bd 100644 --- a/torch/distributed/rpc/__init__.py +++ b/torch/distributed/rpc/__init__.py @@ -85,7 +85,7 @@ def init_rpc( and distributed autograd, which immediately makes the current process ready to send and receive RPCs. - Arguments: + Args: backend (BackendType, optional): The type of RPC backend implementation. Supported values include ``BackendType.TENSORPIPE`` (the default) and diff --git a/torch/distributed/rpc/api.py b/torch/distributed/rpc/api.py index e88ced794454..0a99df67269e 100644 --- a/torch/distributed/rpc/api.py +++ b/torch/distributed/rpc/api.py @@ -273,7 +273,7 @@ def shutdown(graceful=True): :meth:`~torch.distributed.rpc.rpc_async`, ``future.wait()`` should not be called after ``shutdown()``. - Arguments: + Args: graceful (bool): Whether to do a graceful shutdown or not. If True, this will 1) wait until there is no pending system messages for ``UserRRefs`` and delete them; 2) block @@ -337,7 +337,7 @@ def get_worker_info(worker_name=None): Use this :class:`~torch.distributed.rpc.WorkerInfo` to avoid passing an expensive string on every invocation. - Arguments: + Args: worker_name (str): the string name of a worker. If ``None``, return the the id of the current worker. (default ``None``) @@ -446,7 +446,7 @@ def remote(to, func, args=None, kwargs=None, timeout=UNSET_RPC_TIMEOUT): :class:`~torch.distributed.rpc.RRef` is only destructed when globally there are no living references to it. - Arguments: + Args: to (str or WorkerInfo or int): name/rank/``WorkerInfo`` of the destination worker. func (callable): a callable function, such as Python callables, builtin operators (e.g. :meth:`~torch.add`) and annotated @@ -703,7 +703,7 @@ def rpc_sync(to, func, args=None, kwargs=None, timeout=UNSET_RPC_TIMEOUT): messages are sent and received in parallel to execution of Python code. This method is thread-safe. - Arguments: + Args: to (str or WorkerInfo or int): name/rank/``WorkerInfo`` of the destination worker. func (callable): a callable function, such as Python callables, builtin operators (e.g. :meth:`~torch.add`) and annotated @@ -782,7 +782,7 @@ def rpc_async(to, func, args=None, kwargs=None, timeout=UNSET_RPC_TIMEOUT): method is thread-safe. This method will immediately return a :class:`~torch.futures.Future` that can be awaited on. - Arguments: + Args: to (str or WorkerInfo or int): name/rank/``WorkerInfo`` of the destination worker. 
func (callable): a callable function, such as Python callables, builtin operators (e.g. :meth:`~torch.add`) and annotated diff --git a/torch/distributed/rpc/backend_registry.py b/torch/distributed/rpc/backend_registry.py index fe2acbe807b1..d8bd3963033b 100644 --- a/torch/distributed/rpc/backend_registry.py +++ b/torch/distributed/rpc/backend_registry.py @@ -38,7 +38,7 @@ def backend_registered(backend_name): """ Checks if backend_name is registered as an RPC backend. - Arguments: + Args: backend_name (str): string to identify the RPC backend. Returns: True if the backend has been registered with ``register_backend``, else @@ -52,7 +52,7 @@ def register_backend( ): """Registers a new RPC backend. - Arguments: + Args: backend_name (str): backend string to identify the handler. construct_rpc_backend_options_handler (function): Handler that is invoked when diff --git a/torch/distributed/rpc/internal.py b/torch/distributed/rpc/internal.py index 42847896c136..3c1c913b38f8 100644 --- a/torch/distributed/rpc/internal.py +++ b/torch/distributed/rpc/internal.py @@ -211,7 +211,7 @@ def _build_rpc_profiling_key( Builds the key that RPC calls are profiled with using the autograd profiler. This will be the name of the corresponding Event recorded in the profiler. - Arguments: + Args: exec_type (RPCExecMode): Type of RPC/RRef call func_name (str): Name of function being profiled. current_worker_name (str): Name of current worker. @@ -236,7 +236,7 @@ def _start_record_function(exec_type, func_name, current_worker_name, dest_worke callbacks that start the profiling, though the user is responsible for running the appropriate callbacks when the function to be profiled finishes. - Arguments: + Args: exec_type (RPCExecMode): Type of RPC/RRef call func_name (str): Name of function being profiled. current_worker_name (str): Name of current worker. diff --git a/torch/distributed/rpc/options.py b/torch/distributed/rpc/options.py index edec5778da6e..19092c283efb 100644 --- a/torch/distributed/rpc/options.py +++ b/torch/distributed/rpc/options.py @@ -12,7 +12,7 @@ class TensorPipeRpcBackendOptions(_TensorPipeRpcBackendOptionsBase): :class:`~torch.distributed.rpc.TensorPipeAgent`, derived from :class:`~torch.distributed.rpc.RpcBackendOptions`. - Arguments: + Args: num_worker_threads (int, optional): The number of threads in the thread-pool used by :class:`~torch.distributed.rpc.TensorPipeAgent` to execute @@ -58,7 +58,7 @@ def set_device_map(self, to: str, device_map: Dict): function can be called multiple times to incrementally add device placement configurations. - Arguments: + Args: worker_name (str): Callee name. device_map (Dict of int, str, or torch.device): Device placement mappings from this worker to the callee. This map must be diff --git a/torch/distributed/rpc/server_process_global_profiler.py b/torch/distributed/rpc/server_process_global_profiler.py index f113ef609e2b..d8de89bfc937 100644 --- a/torch/distributed/rpc/server_process_global_profiler.py +++ b/torch/distributed/rpc/server_process_global_profiler.py @@ -22,7 +22,7 @@ class _server_process_global_profile(profile): only report runtime of PyTorch functions. Note: profiler is thread local and is automatically propagated into the async tasks - Arguments: + Args: enabled (bool, optional): Setting this to False makes this context manager a no-op. Default: ``True``. 
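A hedged sketch of configuring the TensorPipe options documented above before ``init_rpc``; the worker names, ranks and device indices are illustrative::

    opts = torch.distributed.rpc.TensorPipeRpcBackendOptions(num_worker_threads=8)
    opts.set_device_map("worker1", {0: 1})   # this worker's cuda:0 pairs with worker1's cuda:1
    torch.distributed.rpc.init_rpc("worker0", rank=0, world_size=2,
                                   rpc_backend_options=opts)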
@@ -116,6 +116,7 @@ def __enter__(self): profiler_kind, self.record_shapes, self.profile_memory, + False, False) _enable_server_process_global_profiler(profiler_config) return self diff --git a/torch/distributed/rpc/utils.py b/torch/distributed/rpc/utils.py index 15924c4a72f0..40585a73521d 100644 --- a/torch/distributed/rpc/utils.py +++ b/torch/distributed/rpc/utils.py @@ -2,7 +2,7 @@ def _parse_remote_device(remote_device: str): r""" Parses the remote device. - Arguments: + Args: remote_device (str): Device on the destination worker where we‘d like to place this module. The format should be "/", where the device field can be parsed as torch.device type. E.g., "trainer0/cpu", "trainer0", "ps0/cuda:0". diff --git a/torch/distributions/cauchy.py b/torch/distributions/cauchy.py index 50be941e073a..63181a2a6733 100644 --- a/torch/distributions/cauchy.py +++ b/torch/distributions/cauchy.py @@ -69,8 +69,6 @@ def cdf(self, value): return torch.atan((value - self.loc) / self.scale) / math.pi + 0.5 def icdf(self, value): - if self._validate_args: - self._validate_sample(value) return torch.tan(math.pi * (value - 0.5)) * self.scale + self.loc def entropy(self): diff --git a/torch/distributions/constraints.py b/torch/distributions/constraints.py index 630c192ffed0..87d72d52d26b 100644 --- a/torch/distributions/constraints.py +++ b/torch/distributions/constraints.py @@ -3,13 +3,17 @@ - ``constraints.boolean`` - ``constraints.cat`` +- ``constraints.corr_cholesky`` - ``constraints.dependent`` - ``constraints.greater_than(lower_bound)`` +- ``constraints.greater_than_eq(lower_bound)`` - ``constraints.integer_interval(lower_bound, upper_bound)`` - ``constraints.interval(lower_bound, upper_bound)`` +- ``constraints.less_than(upper_bound)`` - ``constraints.lower_cholesky`` - ``constraints.lower_triangular`` - ``constraints.nonnegative_integer`` +- ``constraints.one_hot`` - ``constraints.positive`` - ``constraints.positive_definite`` - ``constraints.positive_integer`` @@ -57,6 +61,8 @@ class Constraint(object): A constraint object represents a region over which a variable is valid, e.g. within which a variable can be optimized. """ + is_discrete = False + def check(self, value): """ Returns a byte tensor of `sample_shape + batch_shape` indicating @@ -103,14 +109,30 @@ class _Boolean(Constraint): """ Constrain to the two values `{0, 1}`. """ + is_discrete = True + def check(self, value): return (value == 0) | (value == 1) +class _OneHot(Constraint): + """ + Constrain to one-hot vectors. + """ + is_discrete = True + + def check(self, value): + is_boolean = (value == 0) | (value == 1) + is_normalized = value.sum(-1).eq(1) + return is_boolean.all(-1) & is_normalized + + class _IntegerInterval(Constraint): """ Constrain to an integer interval `[lower_bound, upper_bound]`. """ + is_discrete = True + def __init__(self, lower_bound, upper_bound): self.lower_bound = lower_bound self.upper_bound = upper_bound @@ -128,6 +150,8 @@ class _IntegerLessThan(Constraint): """ Constrain to an integer interval `(-inf, upper_bound]`. """ + is_discrete = True + def __init__(self, upper_bound): self.upper_bound = upper_bound @@ -144,6 +168,8 @@ class _IntegerGreaterThan(Constraint): """ Constrain to an integer interval `[lower_bound, inf)`. 
""" + is_discrete = True + def __init__(self, lower_bound): self.lower_bound = lower_bound @@ -358,6 +384,7 @@ def check(self, value): dependent = _Dependent() dependent_property = _DependentProperty boolean = _Boolean() +one_hot = _OneHot() nonnegative_integer = _IntegerGreaterThan(0) positive_integer = _IntegerGreaterThan(1) integer_interval = _IntegerInterval diff --git a/torch/distributions/continuous_bernoulli.py b/torch/distributions/continuous_bernoulli.py index 180fbd8187ee..5d3d48840203 100644 --- a/torch/distributions/continuous_bernoulli.py +++ b/torch/distributions/continuous_bernoulli.py @@ -168,8 +168,6 @@ def cdf(self, value): torch.where(torch.ge(value, 1.0), torch.ones_like(value), unbounded_cdfs)) def icdf(self, value): - if self._validate_args: - self._validate_sample(value) cut_probs = self._cut_probs() return torch.where( self._outside_unstable_region(), diff --git a/torch/distributions/distribution.py b/torch/distributions/distribution.py index f16eb154e2dd..bc61e0b0584e 100644 --- a/torch/distributions/distribution.py +++ b/torch/distributions/distribution.py @@ -12,10 +12,21 @@ class Distribution(object): has_rsample = False has_enumerate_support = False - _validate_args = False + _validate_args = __debug__ @staticmethod def set_default_validate_args(value): + """ + Sets whether validation is enabled or disabled. + + The default behavior mimics Python's ``assert`` statement: validation + is on by default, but is disabled if Python is run in optimized mode + (via ``python -O``). Validation may be expensive, so you may want to + disable it once a model is working. + + Args: + value (bool): Whether to enable validation. + """ if value not in [True, False]: raise ValueError Distribution._validate_args = value diff --git a/torch/distributions/exponential.py b/torch/distributions/exponential.py index 41d7cd9f9787..ac18980c778b 100644 --- a/torch/distributions/exponential.py +++ b/torch/distributions/exponential.py @@ -68,8 +68,6 @@ def cdf(self, value): return 1 - torch.exp(-self.rate * value) def icdf(self, value): - if self._validate_args: - self._validate_sample(value) return -torch.log(1 - value) / self.rate def entropy(self): diff --git a/torch/distributions/laplace.py b/torch/distributions/laplace.py index d7ec01c65b35..a505d60c8f38 100644 --- a/torch/distributions/laplace.py +++ b/torch/distributions/laplace.py @@ -75,8 +75,6 @@ def cdf(self, value): return 0.5 - 0.5 * (value - self.loc).sign() * torch.expm1(-(value - self.loc).abs() / self.scale) def icdf(self, value): - if self._validate_args: - self._validate_sample(value) term = value - 0.5 return self.loc - self.scale * (term).sign() * torch.log1p(-2 * term.abs()) diff --git a/torch/distributions/negative_binomial.py b/torch/distributions/negative_binomial.py index 051725db19ca..4a8babb34a7c 100644 --- a/torch/distributions/negative_binomial.py +++ b/torch/distributions/negative_binomial.py @@ -77,8 +77,10 @@ def param_shape(self): @lazy_property def _gamma(self): + # Note we avoid validating because self.total_count can be zero. 
return torch.distributions.Gamma(concentration=self.total_count, - rate=torch.exp(-self.logits)) + rate=torch.exp(-self.logits), + validate_args=False) def sample(self, sample_shape=torch.Size()): with torch.no_grad(): diff --git a/torch/distributions/normal.py b/torch/distributions/normal.py index 2468e2f225dc..1f14f0ae015f 100644 --- a/torch/distributions/normal.py +++ b/torch/distributions/normal.py @@ -82,8 +82,6 @@ def cdf(self, value): return 0.5 * (1 + torch.erf((value - self.loc) * self.scale.reciprocal() / math.sqrt(2))) def icdf(self, value): - if self._validate_args: - self._validate_sample(value) return self.loc + self.scale * torch.erfinv(2 * value - 1) * math.sqrt(2) def entropy(self): diff --git a/torch/distributions/one_hot_categorical.py b/torch/distributions/one_hot_categorical.py index c661a245f716..64f696802d76 100644 --- a/torch/distributions/one_hot_categorical.py +++ b/torch/distributions/one_hot_categorical.py @@ -29,7 +29,7 @@ class OneHotCategorical(Distribution): """ arg_constraints = {'probs': constraints.simplex, 'logits': constraints.real} - support = constraints.simplex + support = constraints.one_hot has_enumerate_support = True def __init__(self, probs=None, logits=None, validate_args=None): diff --git a/torch/distributions/uniform.py b/torch/distributions/uniform.py index b212c52695c2..edaf5abf77a5 100644 --- a/torch/distributions/uniform.py +++ b/torch/distributions/uniform.py @@ -81,8 +81,6 @@ def cdf(self, value): return result.clamp(min=0, max=1) def icdf(self, value): - if self._validate_args: - self._validate_sample(value) result = value * (self.high - self.low) + self.low return result diff --git a/torch/functional.py b/torch/functional.py index 10fb6b1e41b7..1442ab54c1c5 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -118,7 +118,7 @@ def split(tensor, split_size_or_sections, dim=0): into ``len(split_size_or_sections)`` chunks with sizes in :attr:`dim` according to :attr:`split_size_or_sections`. - Arguments: + Args: tensor (Tensor): tensor to split. split_size_or_sections (int) or (list(int)): size of a single chunk or list of sizes for each chunk @@ -189,7 +189,7 @@ def lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True): Returns a tuple of tensors as ``(the pivots, the L tensor, the U tensor)``. - Arguments: + Args: LU_data (Tensor): the packed LU factorization data LU_pivots (Tensor): the packed LU factorization pivots unpack_data (bool): flag indicating if the data should be unpacked @@ -540,7 +540,7 @@ def stft(input: Tensor, n_fft: int, hop_length: Optional[int] = None, This function changed signature at version 0.4.1. Calling with the previous signature may cause error or return incorrect result. - Arguments: + Args: input (Tensor): the input tensor n_fft (int): size of Fourier transform hop_length (int, optional): the distance between neighboring sliding window @@ -617,7 +617,7 @@ def istft(input: Tensor, n_fft: int, hop_length: Optional[int] = None, [1] D. W. Griffin and J. S. Lim, "Signal estimation from modified short-time Fourier transform," IEEE Trans. ASSP, vol.32, no.2, pp.236-243, Apr. 1984. - Arguments: + Args: input (Tensor): The input tensor. 
Expected to be output of :func:`~torch.stft`, can either be complex (``channel``, ``fft_size``, ``n_frame``), or real (``channel``, ``fft_size``, ``n_frame``, 2) where the ``channel`` @@ -685,7 +685,7 @@ def _unique_impl(input: Tensor, sorted: bool = True, Sorting could be slow, so if your input tensor is already sorted, it is recommended to use :func:`torch.unique_consecutive` which avoids the sorting. - Arguments: + Args: input (Tensor): the input tensor sorted (bool): Whether to sort the unique elements in ascending order before returning as output. @@ -766,7 +766,7 @@ def _unique_consecutive_impl(input: Tensor, return_inverse: bool = False, only eliminates consecutive duplicate values. This semantics is similar to `std::unique` in C++. - Arguments: + Args: input (Tensor): the input tensor return_inverse (bool): Whether to also return the indices for where elements in the original input ended up in the returned unique list. @@ -1021,7 +1021,7 @@ def cartesian_prod(*tensors): """Do cartesian product of the given sequence of tensors. The behavior is similar to python's `itertools.product`. - Arguments: + Args: *tensors: any number of 1 dimensional tensors. Returns: @@ -1053,7 +1053,7 @@ def cartesian_prod(*tensors): def block_diag(*tensors): """Create a block diagonal matrix from provided tensors. - Arguments: + Args: *tensors: One or more tensors with 0, 1, or 2 dimensions. Returns: @@ -1515,7 +1515,7 @@ def _lu_impl(A, pivot=True, get_infos=False, out=None): The LU factorization does have backward support, but only for square inputs of full rank. - Arguments: + Args: A (Tensor): the tensor to factor of size :math:`(*, m, n)` pivot (bool, optional): controls whether pivoting is done. Default: ``True`` get_infos (bool, optional): if set to ``True``, returns an info IntTensor. diff --git a/torch/futures/__init__.py b/torch/futures/__init__.py index 515214dfa872..f1da04803b07 100644 --- a/torch/futures/__init__.py +++ b/torch/futures/__init__.py @@ -51,7 +51,7 @@ def then(self, callback): # type: (Callable[[Future[T]], S]) -> Future[S] this ``Future``. The callback function can use the ``Future.wait()`` API to get the value. - Arguments: + Args: callback(``Callable``): a ``Callable`` that takes this ``Future`` as the only argument. @@ -97,7 +97,7 @@ def _add_done_callback(self, callback): # type: (Callable[[Future[T]], None]) - use the same callback registration API under the hood, and thus the order of their callbacks will be maintained even if their calls are interleaved. - Arguments: + Args: callback(``None``): a ``Callable`` that takes in no arguments Example:: @@ -121,7 +121,7 @@ def set_result(self, result: T) -> None: completed and trigger all attached callbacks. Note that a ``Future`` cannot be marked completed twice. - Arguments: + Args: result (object): the result object of this ``Future``. Example:: @@ -152,7 +152,7 @@ def collect_all(futures: List[Future]) -> Future[List[Future]]: combined :class:`~torch.futures.Future` that is completed when all of the sub-futures are completed. - Arguments: + Args: futures (list): a list of :class:`~torch.futures.Future` objects. Returns: @@ -185,7 +185,7 @@ def wait_all(futures: List[Future]) -> List: Waits for all provided futures to be complete, and returns the list of completed values. - Arguments: + Args: futures (list): a list of :class:`~torch.futures.Future` object. 
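A hedged, self-contained sketch of the ``Future`` callback chaining documented above::

    fut = torch.futures.Future()
    chained = fut.then(lambda f: f.wait() + 1)   # runs once fut completes
    fut.set_result(41)
    assert chained.wait() == 42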
Returns: diff --git a/torch/fx/experimental/merge_matmul.py b/torch/fx/experimental/merge_matmul.py new file mode 100644 index 000000000000..b72bbe633dd9 --- /dev/null +++ b/torch/fx/experimental/merge_matmul.py @@ -0,0 +1,220 @@ +import torch + +from torch.fx.graph import Graph +from torch.fx.graph_module import GraphModule +from torch.fx.node import Node +from torch.fx.symbolic_trace import symbolic_trace + +import itertools +import operator + +from typing import Dict, List + + +def get_first_dim(t: torch.Tensor) -> int: + """ + A free function primarily for use in the merge_matmul graph transformation below + that returns the first dimension of a Tensor. This is necessary because torch.Tensor.shape + is an attribute (and cannot be the target of a call_function node) and also helps save + a getitem op in the graph. + + Arguments: + t: The tensor to get the first dimension of. + + Returns: + The first dimension of t. + """ + return t.shape[0] + + +def legalize_graph(gm: GraphModule): + """ + Replace the graph of the given GraphModule with one that contains the same nodes as the + original, but in topologically sorted order. + + This is used by the merge_matmul transformation below, which disturbs the topologically sorted + order of its input GraphModule, so that this order is restored before further transformation. + + Arguments: + gm: The graph module to topologically sort. It is modified in-place. + + """ + # Build an adjacency list representation of node dependencies in the graph. This also + # serves as a list of nodes that still need to be inserted into the new, topologically + # sorted graph. + dependencies = {node: node.all_input_nodes.copy() for node in gm.graph.nodes} + + # Construct a new graph that will contain all nodes in topologically sorted order. + new_graph = Graph() + value_remap: Dict[Node, Node] = {} + + # Copy over all nodes with no dependencies. + for node, deps in dependencies.items(): + if not deps: + value_remap[node] = new_graph.node_copy(node, lambda n: value_remap[n]) + + # Remove the copied over nodes from the adjacency list. + for copied_node in value_remap.keys(): + del dependencies[copied_node] + + # While there are still nodes to insert into the new graph: + while dependencies: + copied_this_round = [] + + # Copy over all nodes whose dependencies already exist in the new graph. + for node, deps in dependencies.items(): + all_deps_copied = True + for dep in deps: + if dep not in value_remap: + all_deps_copied = False + + if all_deps_copied: + value_remap[node] = new_graph.node_copy(node, lambda n: value_remap[n]) + copied_this_round.append(node) + + # Delete all nodes copied over in this iteration from dependencies. + for copied_node in copied_this_round: + del dependencies[copied_node] + + # Replace the old graph with the new, topologically sorted one. + gm.graph = new_graph + + +def may_depend_on(a: Node, b: Node, search_depth: int = 6): + """ + Determine if one node depends on another in a torch.fx.Graph. + + Arguments: + a: The node that may have a dependency on b. + b: The node that a may have a dependency on. + search_depth: In the case of an indirect dependency, this function + searches upto this many nodes away in search of a + data dependency. If none is found, the function + makes the conservative assumption that there is a + dependency. + + Returns: + True if a may depend on b, False if it definitely does not. + """ + # Equivalence is defined as dependence. + if a == b: + return True + + # If a has no inputs, it cannot depend on b. 
+ if len(a.all_input_nodes) == 0: + return False + + # If the search depth has been exhausted and no conclusion has been + # reached, assume that there is a data dependency. + if search_depth == 0: + return True + + # Recursively check all inputs of a. + for inp in a.all_input_nodes: + if may_depend_on(inp, b, search_depth - 1): + return True + + return False + + +def are_nodes_independent(nodes: List[Node]): + """ + Check if all of the given nodes are pairwise-data independent. + + Arguments: + nodes: The nodes to check for data dependencies. + + Returns: + True if any pair in nodes has a data dependency. + """ + # For each pair in nodes: + for i, j in itertools.combinations(nodes, 2): + if may_depend_on(i, j) or may_depend_on(j, i): + return False + + return True + + +def merge_matmul(in_mod: torch.nn.Module): + """ + A graph transformation that merges matrix multiplication operations that share the same right-hand + side operand into one large matrix multiplication. + ____ _________ _________ + ---- | | | | M| A * C | + M| A | T| B | * K| C | = |---------| + ---- , | | | | T| B * C | + K ---- --------- --------- + K R R + """ + gm = symbolic_trace(in_mod) + + rhs_users: Dict[Node, List[Node]] = {} + lhs_users: Dict[Node, List[Node]] = {} + + # Populate rhs_users and lhs_users - maps from LHS/RHS matrix multiply operands to + # the matmul of which they are the LHS/RHS. + for node in gm.graph.nodes: + if node.op != "call_function" or node.target is not torch.matmul: + continue + + lhs, rhs = node.args + + # TODO: Properly handle aliasing caused by get_attr. For now, + # use the attribute name as the operand if the node is a + # get_attr. + lhs = lhs.target if lhs.op == "get_attr" else lhs + rhs = rhs.target if rhs.op == "get_attr" else rhs + + lhs_users.setdefault(lhs, []).append(node) + rhs_users.setdefault(rhs, []).append(node) + + for rhs, mms in rhs_users.items(): + # There must be at least matmuls for a merge to make sense. + if len(mms) < 2: + continue + + # All matmuls must not depend on each other directly or indirectly + # in order for the merge to be possible. + if not are_nodes_independent(mms): + continue + + lhs_vals = [mm.args[0] for mm in mms] + + # Merge the matmul. + # Collect a list of LHS operands and the single RHS operand. + lhs = [gm.graph.get_attr(l) if isinstance(l, str) else l for l in lhs_vals] + rhs = gm.graph.get_attr(rhs) if isinstance(rhs, str) else rhs + + # Concatenate all the LHS operands. + merge_mm_cat = gm.graph.call_function(torch.cat, (lhs,), {}) + + # Multiply the concatenated LHS operands with the one RHS. This will produce + # the same results as all the individual matmuls involving rhs in the original graph, + # but they will all be concatenated together. + merge_mm = gm.graph.call_function(torch.matmul, (merge_mm_cat, rhs,), {}) + + # Split the result of the merged matmul using the shapes of the LHS operands + # to ascertain how large each chunk should be. + merge_mm_sizes = [ + gm.graph.call_function(get_first_dim, (l,), {}) for l in lhs + ] + merge_mm_split = gm.graph.call_function( + torch.split, (merge_mm, merge_mm_sizes), {} + ) + merge_mm_res = [ + gm.graph.call_function(operator.getitem, (merge_mm_split, out), {}) + for out in range(len(lhs)) + ] + + # Replace all uses of the original, unmerged matmuls with the equivalent split chunk from the merged matmul. 
+ for old, new in zip(mms, merge_mm_res): + old.replace_all_uses_with(new) + gm.graph.erase_node(old) + + # All of the new nodes created above were inserted at the end, so we need to sort + # the nodes topologically to make sure all definitions precede uses. + legalize_graph(gm) + + gm.recompile() + gm.graph.lint(in_mod) + return gm diff --git a/torch/fx/graph.py b/torch/fx/graph.py index 8f07f42529aa..6e493676f8c2 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -577,7 +577,9 @@ def python_code(self, root_module: str) -> str: free_vars: List[str] = [] modules_used : Set[str] = set() body: List[str] = [] - maybe_return_annotation : str = '' + + # Wrap string in list to pass by reference + maybe_return_annotation : List[str] = [''] def register_modules_used(qualified_name : str): if '.' in qualified_name: @@ -675,7 +677,7 @@ def emit_node(node : Node): return elif node.op == 'output': if node.type is not None: - maybe_return_annotation = f" -> {type_repr(node.type)}" + maybe_return_annotation[0] = f" -> {type_repr(node.type)}" body.append(f'return {repr(node.args[0])}') return raise NotImplementedError(f'node: {node.op} {node.target}') @@ -691,13 +693,18 @@ def emit_node(node : Node): import_strs = [f'import {name}' for name in sorted(modules_used)] import_block = '\n'.join(import_strs) + if len(body) == 0: + # If the Graph has no non-placeholder nodes, no lines for the body + # have been emitted. To continue to have valid Python code, emit a + # single pass statement + body.append('pass\n') + code = ''.join(body) - code = '\n'.join(' ' + line for line in code.split('\n')) + '\n' + code = '\n'.join(' ' + line for line in code.split('\n')) fn_code = f"""\ {import_block} -def forward(self, {', '.join(free_vars)}){maybe_return_annotation}: -{code} -""" +def forward(self, {', '.join(free_vars)}){maybe_return_annotation[0]}: +{code}""" return fn_code diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index 6f5faa2f3020..cfd327165899 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -44,6 +44,7 @@ from torch.jit._serialization import save, load from torch.jit._fuser import optimized_execution, fuser, last_executed_optimized_graph +from torch.jit.cuda import stream from torch.jit._freeze import freeze # For backwards compatibility @@ -81,7 +82,7 @@ def script_if_tracing(fn): ``@torch.jit.script_if_tracing`` to substitute for ``torch.jit.script``. - Arguments: + Args: fn: A function to compile. Returns: @@ -100,7 +101,7 @@ def isinstance(obj, target_type): ``Dict[str, List[torch.Tensor]]``, ``Optional[Tuple[int,str,int]]``. It can also refine basic types such as bools and ints that are available in TorchScript. - Arguments: + Args: obj: object to refine the type of target_type: type to try to refine obj to Returns: diff --git a/torch/jit/_async.py b/torch/jit/_async.py index 5e67167bd41a..ae9684a0e229 100644 --- a/torch/jit/_async.py +++ b/torch/jit/_async.py @@ -17,7 +17,7 @@ def fork(func, *args, **kwargs): - """ + r""" Creates an asynchronous task executing `func` and a reference to the value of the result of this execution. `fork` will return immediately, so the return value of `func` may not have been computed yet. To force completion @@ -31,7 +31,7 @@ def fork(func, *args, **kwargs): `fork` tasks will execute non-deterministicly. We recommend only spawning parallel fork tasks for pure functions that do not modify their inputs, module attributes, or global state. 
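For the ``merge_matmul`` transform added earlier in this patch, a hedged usage sketch; the module, shapes, and import path (inferred from the new file's location) are illustrative::

    from torch.fx.experimental.merge_matmul import merge_matmul

    class TwoMatmuls(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.w = torch.nn.Parameter(torch.randn(4, 4))

        def forward(self, x, y):
            # Two matmuls that share the same right-hand side operand.
            return torch.matmul(x, self.w), torch.matmul(y, self.w)

    merged = merge_matmul(TwoMatmuls())   # both products now route through one concatenated matmul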
- Arguments: + Args: func (callable or torch.nn.Module): A Python function or `torch.nn.Module` that will be invoked. If executed in TorchScript, it will execute asynchronously, otherwise it will not. Traced invocations of fork will be captured in the IR. @@ -42,7 +42,8 @@ def fork(func, *args, **kwargs): Example (fork a free function): - .. testcode:: + .. code-block:: python + import torch from torch import Tensor def foo(a : Tensor, b : int) -> Tensor: @@ -60,16 +61,17 @@ def bar(a): Example (fork a module method): - .. testcode:: + .. code-block:: python + import torch from torch import Tensor - class SubMod(torch.nn.Module): + class AddMod(torch.nn.Module): def forward(self, a: Tensor, b : int): return a + b class Mod(torch.nn.Module): def __init__(self): super(self).__init__() - self.mod = SubMod() + self.mod = AddMod() def forward(self, input): fut = torch.jit.fork(self.mod, a, b=2) return torch.jit.wait(fut) @@ -81,10 +83,10 @@ def forward(self, input): def wait(future): - """ + r""" Forces completion of a `torch.jit.Future[T]` asynchronous task, returning the result of the task. See :func:`~fork` for docs and examples. - Arguments: + Args: func (torch.jit.Future[T]): an asynchronous task reference, created through `torch.jit.fork` Returns: `T`: the return value of the the completed task diff --git a/torch/jit/_freeze.py b/torch/jit/_freeze.py index 5c217ea17c1f..5db57f0b4dfe 100644 --- a/torch/jit/_freeze.py +++ b/torch/jit/_freeze.py @@ -20,7 +20,7 @@ def freeze(mod, preserved_attrs: Optional[List[str]] = None): Freezing currently only accepts ScriptModules that are in eval mode. - Arguments: + Args: mod (:class:`ScriptModule`): a module to be frozen preserved_attrs (Optional[List[str]]): a list of attributes to preserve in addition to the forward method. diff --git a/torch/jit/_recursive.py b/torch/jit/_recursive.py index aa319778140b..2d8d0c31cbbf 100644 --- a/torch/jit/_recursive.py +++ b/torch/jit/_recursive.py @@ -328,7 +328,7 @@ def get_module_concrete_type(nn_module, share_types=True): type is fetched from concrete_type_store. If it is False, a new concrete type is created without first searching concrete_type_store. - Arguments: + Args: nn_module: The original Python nn.Module that we are creating a ScriptModule for. share_types = Whether to share underlying JIT types between modules (if possible). @@ -356,7 +356,7 @@ def create_script_module(nn_module, stubs_fn, share_types=True): """ Creates a new ScriptModule from an nn.Module - Arguments: + Args: nn_module: The original Python nn.Module that we are creating a ScriptModule for. stubs_fn: Lambda that takes an nn.Module and generates a list of ScriptMethodStubs to compile. share_types: Whether to share underlying JIT types between modules (if possible). @@ -373,7 +373,7 @@ def create_script_module_impl(nn_module, concrete_type, stubs_fn): """ Convert an nn.Module to a RecursiveScriptModule. - Arguments: + Args: nn_module: The original Python nn.Module that we are creating a ScriptModule for. concrete_type: The fully initialized ConcreteType of the module. stubs_fn: Lambda that takes an nn.Module and generates a list of ScriptMethodStubs to compile. @@ -645,7 +645,7 @@ def interface_script(mod_interface, nn_module): Makes a ScriptModule from an nn.Module, using the interface methods rule for determining which methods to compile. - Arguments: + Args: mod_interface: the interface type that the module have nn_module: The original Python nn.Module that we are creating a ScriptModule for. 
""" diff --git a/torch/jit/_script.py b/torch/jit/_script.py index d4f6f96c3da2..8bc8c6117c1b 100644 --- a/torch/jit/_script.py +++ b/torch/jit/_script.py @@ -379,7 +379,7 @@ def _construct(cpp_module, init_fn): object is properly finalized (and in the future we may take control of how the RecursiveScriptModule instance is created). - Arguments: + Args: cpp_module: The C++ Module that will hold the actual state of this RecursiveScriptModule instance. init_fn: Lambda that initializes the RecursiveScriptModule passed to it. @@ -409,7 +409,7 @@ def _reconstruct(self, cpp_module): """ Re-construct an instance of RecursiveScriptModule using an instance of a C++ module. - Arguments: + Args: cpp_module: The C++ module that this RecursiveScriptModule will be rebuilt around. """ self.__init__(cpp_module) # type: ignore @@ -493,7 +493,7 @@ def _save_for_lite_interpreter(self, *args, **kwargs): Add (or update) the bytecode session to the script model. The updated model is used in lite interpreter for mobile applications. - Arguments: + Args: f: a string containing a file name. _extra_files: Map from filename to contents which will be stored as part of 'f'. @@ -741,6 +741,43 @@ class RecursiveScriptModule(ScriptModule): # type: ignore def __init__(self, arg=None): super().__init__() +def call_prepare_scriptable_func_impl(obj, memo): + if not isinstance(obj, torch.nn.Module): + return obj + + obj_id = id(obj) + + # If obj_id is in memo, obj has already been prepared or is being + # prepared in another call up the stack. + if obj_id in memo: + return memo[id(obj)] + + obj = obj.__prepare_scriptable__() if hasattr(obj, '__prepare_scriptable__') else obj # type: ignore + # Record obj in memo to avoid infinite recursion in the case of cycles in the module + # hierarchy when recursing below. + memo[obj_id] = obj + + new_obj_dict = {} + + for name in obj.__dict__: + sub_module = obj.__dict__.get(name) + if name == '_modules': + for k, v in sub_module.items(): + sub_module[k] = call_prepare_scriptable_func_impl(v, memo) + new_obj_dict[name] = sub_module + elif isinstance(sub_module, torch.nn.Module) and not isinstance(sub_module, ScriptModule): + new_obj_dict[name] = call_prepare_scriptable_func_impl(sub_module, memo) + else: + new_obj_dict[name] = sub_module + + for k, v in new_obj_dict.items(): + obj.__dict__[name] = v + + return obj + +def call_prepare_scriptable_func(obj): + memo: Dict[int, torch.nn.Module] = {} + return call_prepare_scriptable_func_impl(obj, memo) def script(obj, optimize=None, _frames_up=0, _rcb=None): r""" @@ -754,7 +791,7 @@ def script(obj, optimize=None, _frames_up=0, _rcb=None): ``torch.jit.script`` can be used as a function for modules and functions, and as a decorator ``@torch.jit.script`` for :ref:`torchscript-classes` and functions. - Arguments: + Args: obj (callable, class, or ``nn.Module``): The ``nn.Module``, function, or class type to compile. @@ -894,6 +931,7 @@ def forward(self, input): return obj if isinstance(obj, torch.nn.Module): + obj = call_prepare_scriptable_func(obj) return torch.jit._recursive.create_script_module( obj, torch.jit._recursive.infer_methods_to_compile ) diff --git a/torch/jit/_serialization.py b/torch/jit/_serialization.py index fd93cc13aeb6..7a551bb7da76 100644 --- a/torch/jit/_serialization.py +++ b/torch/jit/_serialization.py @@ -33,7 +33,7 @@ def save(m, f, _extra_files=None): during loading. This is different from :func:`torch.load`'s semantics and may change in the future. - Arguments: + Args: m: A :class:`ScriptModule` to save. 
f: A file-like object (has to implement write and flush) or a string containing a file name. @@ -94,7 +94,7 @@ def load(f, map_location=None, _extra_files=None): because the run time system doesn't have certain devices), an exception is raised. - Arguments: + Args: f: a file-like object (has to implement read, readline, tell, and seek), or a string containing a file name map_location (string or torch.device): A simplified version of diff --git a/torch/jit/_trace.py b/torch/jit/_trace.py index c424d23e050a..17be5c1ffd40 100644 --- a/torch/jit/_trace.py +++ b/torch/jit/_trace.py @@ -194,7 +194,7 @@ def verify(model, args, loss_fn=torch.sum, devices=None): parameters), so don't expect the model to come out exactly the same as what you passed in. - Arguments: + Args: model (compiled torch.nn.Module or function): the module/function to be verified. The module/function definition MUST have been decorated with `@torch.jit.compile`. @@ -626,7 +626,7 @@ def trace( invocations of the model. The tracer will try to emit warnings when doing something that may cause an incorrect trace to be produced. - Arguments: + Args: func (callable or torch.nn.Module): A Python function or `torch.nn.Module` that will be run with `example_inputs`. `func` arguments and return values must be tensors or (possibly nested) tuples that contain @@ -830,7 +830,7 @@ def trace_module( See :func:`torch.jit.trace ` for more information on tracing. - Arguments: + Args: mod (torch.nn.Module): A ``torch.nn.Module`` containing methods whose names are specified in ``inputs``. The given methods will be compiled as a part of a single `ScriptModule`. @@ -1077,7 +1077,7 @@ def _reconstruct(self, cpp_module): """ Re-construct an instance of TopLevelTracedModule using an instance of a C++ module. - Arguments: + Args: cpp_module: The C++ module that this TopLevelTracedModule will be rebuilt around. """ self.__dict__["_actual_script_module"]._reconstruct(cpp_module) @@ -1117,7 +1117,7 @@ def _get_trace_graph(f, args=(), kwargs=None, strict=True, _force_outplace=False Tracing is guaranteed not to change the semantics of the function/module that is traced. - Arguments: + Args: f (torch.nn.Module or function): the function or module to be traced. args (tuple or Tensor): the positional arguments to pass to the diff --git a/torch/jit/cuda.py b/torch/jit/cuda.py new file mode 100644 index 000000000000..16805301600b --- /dev/null +++ b/torch/jit/cuda.py @@ -0,0 +1,182 @@ +# mypy: ignore-errors + +r""" +This package adds support for JIT compilation for CUDA Streams and events, +This is similar to API's available in the eager mode +:ref:`cuda-semantics` has more details about working with CUDA. +""" + +import torch +from typing import Optional, Any +from torch import device as _device + +def get_current_device_index() -> int: + r"""Checks if there are CUDA devices available and + returns the device index of the current default CUDA device. + Returns -1 in case there are no CUDA devices available. + + Arguments: ``None`` + """ + if torch.cuda.device_count() > 0: + return torch.cuda._current_device() + return -1 + +def get_device_index(device: Optional[_device] = None, optional: bool = False, allow_cpu: bool = False) -> int: + r"""Gets the device index from :attr:`device`, which can be a torch.device + object, a Python integer, or ``None``. + + If :attr:`device` is a torch.device object, returns the device index if it + is a CUDA device. 
Note that for a CUDA device without a specified index,
+    this will return the current default CUDA device if :attr:`optional` is ``True``.
+    If :attr:`allow_cpu` is ``True``, CPU devices will be accepted and ``-1`` will be
+    returned in this case.
+
+    If :attr:`device` is a Python integer, it is returned as is.
+
+    If :attr:`device` is ``None``, this will return the current default CUDA
+    device if :attr:`optional` is ``True``.
+    """
+    if device is None:
+        if optional:
+            return get_current_device_index()
+        else:
+            raise ValueError('Expected a torch.device with a specified index '
+                             f'or an integer, but got: {device}')
+    device_index = -1
+    if isinstance(device, str):
+        device = torch.device(device)
+
+    if isinstance(device, torch.device):
+        if not allow_cpu and device.type == 'cpu':
+            raise ValueError(f'Expected a non cpu device, but got: {device}')
+        device_index = -1 if device.type == 'cpu' else torch.cuda.device_index(device)
+
+    if isinstance(device, int):
+        device_index = device
+
+    return device_index
+
+class device(object):
+    r"""Context-manager that changes the selected device.
+    This is similar to device (torch.device or int), but has been
+    introduced for JIT compatibility.
+    Arguments:
+        device (torch.device or int): device index to select. It's a no-op if
+            this argument is a negative integer or ``None``.
+    """
+    def __init__(self, device: Optional[_device]):
+        self.idx = -1
+        self.prev_idx = -1
+        self.device = device
+
+    def __enter__(self):
+        self.idx = get_device_index(self.device, optional=True)
+
+        if self.idx == -1:
+            return
+        self.prev_idx = torch.cuda._current_device()
+
+        if self.prev_idx != self.idx:
+            torch.cuda._set_device(self.idx)
+
+    def __exit__(self, type: Any, value: Any, traceback: Any):
+        if self.prev_idx != self.idx:
+            torch.cuda._set_device(self.prev_idx)
+
+class StreamContext(object):
+    r"""Context-manager that selects a given stream.
+    All CUDA kernels queued within its context will be enqueued on a selected
+    stream.
+    Arguments:
+        StreamContext (Stream): selected stream. This manager is a no-op if it's
+            ``None``.
+    .. note:: Streams are per-device. If the selected stream is not on the
+        current device, this function will also change the current device to
+        match the stream.
+ """ + cur_stream : Optional['torch.classes.cuda.Stream'] + + def __init__(self, stream: Optional['torch.classes.cuda.Stream']): + self.idx = -1 + self.stream = stream + # Initialize the below streams to default stream on the current device + self.device_index = get_current_device_index() + self.src_prev_stream = torch.cuda.default_stream(self.device_index) + self.dst_prev_stream = torch.cuda.default_stream(self.device_index) + + def __enter__(self): + self.idx = get_device_index(device=None, optional=True) + # If there is no CUDA device available, return + if self.idx == -1: + return + + # Local cur_stream variable for type refinement + cur_stream = self.stream + # Return if stream is None + if cur_stream is None: + return + self.src_prev_stream = torch.cuda.current_stream(self.idx) + # If the stream is not on the current device, then change the device + # and set the current stream on the device + if self.src_prev_stream.device_index() != cur_stream.device_index(): + with device(cur_stream.device()): + self.dst_prev_stream = torch.cuda.current_stream(cur_stream.device_index()) + torch.cuda._set_device(cur_stream.device_index()) + torch.cuda.set_stream(cur_stream) + + def __exit__(self, type: Any, value: Any, traceback: Any): + # Local cur_stream variable for type refinement + cur_stream = self.stream + # If stream is None or no CUDA device available, return + if cur_stream is None or self.idx == -1: + return + # If the stream was not on the current device, restore the previous stream on + # the destination device and also reset the current device to the previous device. + # Set the current stream on the device to the src_prev_stream + if self.src_prev_stream.device_index() != cur_stream.device_index(): + torch.cuda.set_stream(self.dst_prev_stream) + torch.cuda._set_device(self.idx) + torch.cuda.set_stream(self.src_prev_stream) + +def stream(stream: Optional['torch.classes.cuda.Stream']) -> StreamContext: + r"""Wrapper around the Context-manager that selects a given stream. + All CUDA kernels queued within its context will be enqueued on a selected + stream. + Arguments: + stream (Stream): selected stream. This manager is a no-op if it's + ``None``. + """ + return StreamContext(stream) + +def Stream(device: int = -1, priority: int = 0) -> 'torch.classes.cuda.Stream': + r"""Wrapper around a CUDA stream. + A CUDA stream is a linear sequence of execution that belongs to a specific + device, independent from other streams. See :ref:`cuda-semantics` for + details. + Arguments: + device(int, optional): a device on which to allocate + the stream. If :attr:`device` is ``None`` (default) or a negative + integer, this will use the current device. + priority(int, optional): priority of the stream. Can be either + -1 (high priority) or 0 (low priority). By default, streams have + priority 0. + .. note:: Although CUDA versions >= 11 support more than two levels of + priorities, in PyTorch, we only support two levels of priorities. + """ + return torch.classes.cuda.Stream(device, priority) + +def Event(enable_timing: bool = False, blocking: bool = False, interprocess: bool = False) -> 'torch.classes.cuda.Event': + r"""Wrapper around a CUDA event. + CUDA events are synchronization markers that can be used to monitor the + device's progress, to accurately measure timing, and to synchronize CUDA + streams. 
+ Arguments: + enable_timing (bool, optional): indicates if the event should measure time + (default: ``False``) + blocking (bool, optional): if ``True``, :meth:`wait` will be blocking (default: ``False``) + interprocess (bool): if ``True``, the event can be shared between processes + (default: ``False``) + .. _CUDA Event Documentation: + https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html + """ + return torch.classes.cuda.Event(enable_timing, blocking, interprocess) diff --git a/torch/jit/frontend.py b/torch/jit/frontend.py index b4d3438f99ee..da3b3c2301a6 100644 --- a/torch/jit/frontend.py +++ b/torch/jit/frontend.py @@ -134,7 +134,7 @@ def get_class_properties(cls, self_name): """ Get a list of Property objects representing the properties of a class. - Arguments: + Args: cls: The class to get properties of. self_name: The name of the class that the properties should belong to. Returns: @@ -188,7 +188,7 @@ def normalize_source_lines(sourcelines: List[str]) -> List[str]: all lines in the function body to a point at or greater than that level. This allows for comments and continued string literals that are at a lower indentation than the rest of the code. - Arguments: + Args: sourcelines: function source code, separated into lines by the '\n' character Returns: @@ -221,7 +221,7 @@ def get_jit_def(fn, def_name, self_name=None): """ Build a JIT AST (TreeView) from the given function. - Arguments: + Args: fn: A function object to compile def_name: The name to give to the resulting AST object. This is not always the same as `fn.__name__`, for example: diff --git a/torch/jit/mobile/__init__.py b/torch/jit/mobile/__init__.py index 6afbf6f2cdb6..4356400cb447 100644 --- a/torch/jit/mobile/__init__.py +++ b/torch/jit/mobile/__init__.py @@ -10,7 +10,7 @@ def _load_for_lite_interpreter(f, map_location=None): Load a :class:`LiteScriptModule` saved with :func:`torch.jit._save_for_lite_interpreter` - Arguments: + Args: f: a file-like object (has to implement read, readline, tell, and seek), or a string containing a file name map_location: a string or torch.device used to dynamically remap diff --git a/torch/jit/quantized.py b/torch/jit/quantized.py index 615741f38da7..d853a55b3933 100644 --- a/torch/jit/quantized.py +++ b/torch/jit/quantized.py @@ -130,8 +130,7 @@ def check_forward_input(self, input): input.size(1), self.input_size)) @torch.jit.script_method - def check_forward_hidden(self, input, hx, hidden_label=''): - # type: (Tensor, Tensor, str) -> None + def check_forward_hidden(self, input: Tensor, hx: Tensor, hidden_label: str = '') -> None: if input.size(0) != hx.size(0): raise RuntimeError( "Input batch size {} doesn't match hidden{} batch size {}".format( @@ -169,8 +168,7 @@ def __init__(self, other): self.nonlinearity = other.nonlinearity @torch.jit.script_method - def forward(self, input, hx=None): - # type: (Tensor, Optional[Tensor]) -> Tensor + def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor: self.check_forward_input(input) if hx is None: hx = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device) @@ -201,8 +199,7 @@ def __init__(self, other): super(QuantizedLSTMCell, self).__init__(other) @torch.jit.script_method - def forward(self, input, hx=None): - # type: (Tensor, Optional[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tensor] + def forward(self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]] = None) -> Tuple[Tensor, Tensor]: self.check_forward_input(input) if hx is None: zeros = torch.zeros(input.size(0), 
self.hidden_size, dtype=input.dtype, device=input.device) @@ -222,8 +219,7 @@ def __init__(self, other): super(QuantizedGRUCell, self).__init__(other) @torch.jit.script_method - def forward(self, input, hx=None): - # type: (Tensor, Optional[Tensor]) -> Tensor + def forward(self, input: Tensor, hx: Optional[Tensor] = None) -> Tensor: self.check_forward_input(input) if hx is None: hx = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device) @@ -236,8 +232,7 @@ def forward(self, input, hx=None): ) -def apply_permutation(tensor, permutation, dim=1): - # type: (Tensor, Tensor, int) -> Tensor +def apply_permutation(tensor: Tensor, permutation: Tensor, dim: int = 1) -> Tensor: return tensor.index_select(dim, permutation) @@ -303,8 +298,7 @@ def get_weight_bias(ihhh): self.all_weights.append(cell_params) @torch.jit.script_method - def check_input(self, input, batch_sizes): - # type: (Tensor, Optional[Tensor]) -> None + def check_input(self, input: Tensor, batch_sizes: Optional[Tensor]) -> None: expected_input_dim = 2 if batch_sizes is not None else 3 if input.dim() != expected_input_dim: raise RuntimeError( @@ -316,8 +310,7 @@ def check_input(self, input, batch_sizes): self.input_size, input.size(-1))) @torch.jit.script_method - def get_expected_hidden_size(self, input, batch_sizes): - # type: (Tensor, Optional[Tensor]) -> Tuple[int, int, int] + def get_expected_hidden_size(self, input: Tensor, batch_sizes: Optional[Tensor]) -> Tuple[int, int, int]: if batch_sizes is not None: mini_batch = int(batch_sizes[0]) else: @@ -328,21 +321,19 @@ def get_expected_hidden_size(self, input, batch_sizes): return expected_hidden_size @torch.jit.script_method - def check_hidden_size(self, hx, expected_hidden_size, msg='Expected hidden size {}, got {}'): - # type: (Tensor, Tuple[int, int, int], str) -> None + def check_hidden_size(self, hx: Tensor, expected_hidden_size: Tuple[int, int, int], + msg: str = 'Expected hidden size {}, got {}') -> None: if hx.size() != expected_hidden_size: raise RuntimeError(msg.format(expected_hidden_size, list(hx.size()))) @torch.jit.script_method - def check_forward_args(self, input, hidden, batch_sizes): - # type: (Tensor, Tensor, Optional[Tensor]) -> None + def check_forward_args(self, input: Tensor, hidden: Tensor, batch_sizes: Optional[Tensor]) -> None: self.check_input(input, batch_sizes) expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes) self.check_hidden_size(hidden, expected_hidden_size, msg='Expected hidden size {}, got {}') @torch.jit.script_method - def permute_hidden(self, hx, permutation): - # type: (Tensor, Optional[Tensor]) -> Tensor + def permute_hidden(self, hx: Tensor, permutation: Optional[Tensor]) -> Tensor: if permutation is None: return hx return apply_permutation(hx, permutation) @@ -355,8 +346,9 @@ def __init__(self, other, dtype): super(QuantizedLSTM, self).__init__(other, dtype) @torch.jit.script_method - def forward_impl(self, input, hx, batch_sizes, max_batch_size, sorted_indices): - # type: (Tensor, Optional[Tuple[Tensor, Tensor]], Optional[Tensor], int, Optional[Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]] # noqa + def forward_impl(self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]], batch_sizes: Optional[Tensor], + max_batch_size: int, sorted_indices: Optional[Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]]: + # noqa if hx is None: num_directions = 2 if self.bidirectional else 1 zeros = torch.zeros(self.num_layers * num_directions, @@ -379,8 +371,7 @@ def forward_impl(self, input, hx, 
batch_sizes, max_batch_size, sorted_indices): return output, hidden @torch.jit.script_method - def forward_tensor(self, input, hx=None): - # type: (Tensor, Optional[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tuple[Tensor, Tensor]] + def forward_tensor(self, input: Tensor, hx: Optional[Tuple[Tensor, Tensor]] = None) -> Tuple[Tensor, Tuple[Tensor, Tensor]]: batch_sizes = None max_batch_size = input.size(0) if self.batch_first else input.size(1) sorted_indices = None @@ -391,8 +382,8 @@ def forward_tensor(self, input, hx=None): return output, self.permute_hidden(hidden, unsorted_indices) @torch.jit.script_method - def forward_packed(self, input, hx=None): - # type: (PackedSequence, Optional[Tuple[Tensor, Tensor]]) -> Tuple[PackedSequence, Tuple[Tensor, Tensor]] # noqa + def forward_packed(self, input: PackedSequence, hx: Optional[Tuple[Tensor, Tensor]] = None + ) -> Tuple[PackedSequence, Tuple[Tensor, Tensor]]: input, batch_sizes, sorted_indices, unsorted_indices = input max_batch_size = batch_sizes[0] max_batch_size = int(max_batch_size) @@ -404,15 +395,13 @@ def forward_packed(self, input, hx=None): @torch.jit.script_method - def permute_hidden(self, hx, permutation): - # type: (Tuple[Tensor, Tensor], Optional[Tensor]) -> Tuple[Tensor, Tensor] + def permute_hidden(self, hx: Tuple[Tensor, Tensor], permutation: Optional[Tensor]) -> Tuple[Tensor, Tensor]: if permutation is None: return hx return apply_permutation(hx[0], permutation), apply_permutation(hx[1], permutation) @torch.jit.script_method - def check_forward_args(self, input, hidden, batch_sizes): - # type: (Tensor, Tuple[Tensor, Tensor], Optional[Tensor]) -> None + def check_forward_args(self, input: Tensor, hidden: Tuple[Tensor, Tensor], batch_sizes: Optional[Tensor]) -> None: self.check_input(input, batch_sizes) expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes) @@ -432,8 +421,9 @@ class QuantizedGRU(QuantizedRNNBase): __overloads__ = {'forward': ['forward_packed', 'forward_tensor']} @torch.jit.script_method - def forward_impl(self, input, hx, batch_sizes, max_batch_size, sorted_indices): - # type: (Tensor, Optional[Tensor], Optional[Tensor], int, Optional[Tensor]) -> Tuple[Tensor, Tensor] # noqa + def forward_impl(self, input: Tensor, hx: Optional[Tensor], batch_sizes: Optional[Tensor], max_batch_size: int, + sorted_indices: Optional[Tensor]) -> Tuple[Tensor, Tensor]: + # noqa if hx is None: num_directions = 2 if self.bidirectional else 1 hx = torch.zeros(self.num_layers * num_directions, @@ -459,8 +449,7 @@ def forward_impl(self, input, hx, batch_sizes, max_batch_size, sorted_indices): return output, hidden @torch.jit.script_method - def forward_tensor(self, input, hx=None): - # type: (Tensor, Optional[Tensor]) -> Tuple[Tensor, Tensor] + def forward_tensor(self, input: Tensor, hx: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]: batch_sizes = None max_batch_size = input.size(0) if self.batch_first else input.size(1) sorted_indices = None @@ -470,8 +459,7 @@ def forward_tensor(self, input, hx=None): return output, self.permute_hidden(hidden, unsorted_indices) @torch.jit.script_method - def forward_packed(self, input, hx=None): - # type: (PackedSequence, Optional[Tensor]) -> Tuple[PackedSequence, Tensor] + def forward_packed(self, input: PackedSequence, hx: Optional[Tensor] = None) -> Tuple[PackedSequence, Tensor]: input, batch_sizes, sorted_indices, unsorted_indices = input max_batch_size = batch_sizes[0] max_batch_size = int(max_batch_size) diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp 
b/torch/lib/c10d/ProcessGroupNCCL.cpp index 01ce71afd388..b9ac5aa77150 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.cpp +++ b/torch/lib/c10d/ProcessGroupNCCL.cpp @@ -1409,7 +1409,13 @@ c10::intrusive_ptr ProcessGroupNCCL::reduce_scatter( c10::intrusive_ptr ProcessGroupNCCL::barrier( const BarrierOptions& opts) { std::vector devices; - if (usedDeviceIdxs_.empty()) { + + // Use user defined GPU device ids if provided + if (!opts.device_ids.empty()) { + for (auto device : opts.device_ids) { + devices.push_back(at::Device(at::DeviceType::CUDA, device)); + } + } else if (usedDeviceIdxs_.empty()) { // This means there is not yet a NCCL collective being called // Here we have to use the best guesses and will use a single GPU to call // allreduce to achieve barrier. diff --git a/torch/lib/c10d/Types.hpp b/torch/lib/c10d/Types.hpp index 03b2e59e4295..a5a0d5fa20df 100644 --- a/torch/lib/c10d/Types.hpp +++ b/torch/lib/c10d/Types.hpp @@ -62,6 +62,7 @@ struct AllToAllOptions { }; struct BarrierOptions { + std::vector device_ids; std::chrono::milliseconds timeout = kUnsetTimeout; }; diff --git a/torch/library.h b/torch/library.h index d86c1afbd50e..fee98abb2b81 100644 --- a/torch/library.h +++ b/torch/library.h @@ -116,19 +116,6 @@ class TORCH_API CppFunction final { , debug_() {} - /// This static factory lets you create CppFunctions that (1) don't have boxing - /// wrappers (because we don't support it yet) and (2) don't have schema - /// inference (because some ops don't support it). - template - static CppFunction makeUnboxedOnly(Func* f) { - // TODO: Eliminate the necessity for this function entirely. - return CppFunction( - c10::KernelFunction::makeFromUnboxedOnlyRuntimeFunction(f), - /* cpp_signature */ c10::impl::CppSignature::make(), - /* schema */ nullptr - ); - } - /// This creates a fallthrough function. Fallthrough functions /// immediately redispatch to the next available dispatch key, /// but are implemented more efficiently than a hand written @@ -170,6 +157,22 @@ class TORCH_API CppFunction final { ); } + /// Create a function from an unboxed kernel function. + /// This is typically used to register common operators. + template::value, std::nullptr_t> = nullptr> + static CppFunction makeFromUnboxedFunction(FuncPtr* f) { + return CppFunction(f); + } + + /// Create a function from a compile time unboxed kernel function pointer. + /// This is typically used to register common operators. + /// Compile time function pointers can be used to allow the compiler + /// to optimize (e.g. inline) calls to it. + template::value, std::nullptr_t> = nullptr> + static CppFunction makeFromUnboxedFunction(FuncPtr f) { + return CppFunction(f); + } + CppFunction&& debug(std::string d) && { debug_ = std::move(d); return std::move(*this); @@ -496,20 +499,10 @@ class TORCH_API Library final { return impl(name, dispatch(std::forward(key), std::forward(raw_f))); } - /// \private - /// - /// Convenience overload for unboxed only kernels; kernels whose type - /// signatures are not supported by our template based metaprogramming - /// system. These are currently quite common but will be eventually - /// eliminated. - /// - /// This is equivalent to calling CppFunction::makeUnboxedOnly() on - /// the function, but this name for the function makes it easy to grep for. 
template Library& impl_UNBOXED(Name name, Func* raw_f) & { - // TODO: Remove this overload once the makeUnboxedOnly incidence rate - // goes way down - return impl(name, CppFunction::makeUnboxedOnly(raw_f)); + static_assert(c10::guts::false_t(), ".impl_UNBOXED(...) was removed. Please use .impl(...) instead."); + return *this; } // These overloads cover cases when a SelectiveStr (see Note [Selective build]) @@ -531,7 +524,10 @@ class TORCH_API Library final { template Library& impl(detail::SelectiveStr, Dispatch&& key, Func&& raw_f) & { return *this; } template - Library& impl_UNBOXED(detail::SelectiveStr name, Func* raw_f) & { return *this; } + Library& impl_UNBOXED(detail::SelectiveStr name, Func* raw_f) & { + static_assert(c10::guts::false_t(), ".impl_UNBOXED(...) was removed. Please use .impl(...) instead."); + return *this; + } template Library& impl(detail::SelectiveStr name, Func&& raw_f) & { @@ -543,7 +539,8 @@ class TORCH_API Library final { } template Library& impl_UNBOXED(detail::SelectiveStr name, Func* raw_f) & { - return impl(name.operator const char*(), CppFunction::makeUnboxedOnly(raw_f)); + static_assert(c10::guts::false_t(), ".impl_UNBOXED(...) was removed. Please use .impl(...) instead."); + return *this; } /// Register a fallback implementation for all operators which will be used diff --git a/torch/linalg/__init__.py b/torch/linalg/__init__.py index 575bc92534be..0f99def6c7fe 100644 --- a/torch/linalg/__init__.py +++ b/torch/linalg/__init__.py @@ -620,3 +620,82 @@ >>> torch.allclose(torch.tensordot(a, x, dims=x.ndim), b, atol=1e-6) True """) + + +qr = _add_docstr(_linalg.linalg_qr, r""" +qr(input, mode='reduced', *, out=None) -> (Tensor, Tensor) + +Computes the QR decomposition of a matrix or a batch of matrices :attr:`input`, +and returns a namedtuple (Q, R) of tensors such that :math:`\text{input} = Q R` +with :math:`Q` being an orthogonal matrix or batch of orthogonal matrices and +:math:`R` being an upper triangular matrix or batch of upper triangular matrices. + +Depending on the value of :attr:`mode` this function returns the reduced or +complete QR factorization. See below for a list of valid modes. + +.. note:: **Differences with** ``numpy.linalg.qr``: + + * ``mode='raw'`` is not implemented + + * unlike ``numpy.linalg.qr``, this function always returns a + tuple of two tensors. When ``mode='r'``, the `Q` tensor is an + empty tensor. + +.. note:: + Backpropagation is not supported for ``mode='r'``. Use ``mode='reduced'`` instead. + + If you plan to backpropagate through QR, note that the current backward implementation + is only well-defined when the first :math:`\min(input.size(-1), input.size(-2))` + columns of :attr:`input` are linearly independent. + This behavior may change in the future. + +.. note:: This function uses LAPACK for CPU inputs and MAGMA for CUDA inputs, + and may produce different (valid) decompositions on different device types + and different platforms, depending on the precise version of the + underlying library. + +Args: + input (Tensor): the input tensor of size :math:`(*, m, n)` where `*` is zero or more + batch dimensions consisting of matrices of dimension :math:`m \times n`. 
+ mode (str, optional): if `k = min(m, n)` then: + + * ``'reduced'`` : returns `(Q, R)` with dimensions (m, k), (k, n) (default) + + * ``'complete'``: returns `(Q, R)` with dimensions (m, m), (m, n) + + * ``'r'``: computes only `R`; returns `(Q, R)` where `Q` is empty and `R` has dimensions (k, n) + +Keyword args: + out (tuple, optional): tuple of `Q` and `R` tensors + satisfying :code:`input = torch.matmul(Q, R)`. + The dimensions of `Q` and `R` are :math:`(*, m, k)` and :math:`(*, k, n)` + respectively, where :math:`k = \min(m, n)` if :attr:`mode` is `'reduced'` and + :math:`k = m` if :attr:`mode` is `'complete'`. + +Example:: + + >>> a = torch.tensor([[12., -51, 4], [6, 167, -68], [-4, 24, -41]]) + >>> q, r = torch.linalg.qr(a) + >>> q + tensor([[-0.8571, 0.3943, 0.3314], + [-0.4286, -0.9029, -0.0343], + [ 0.2857, -0.1714, 0.9429]]) + >>> r + tensor([[ -14.0000, -21.0000, 14.0000], + [ 0.0000, -175.0000, 70.0000], + [ 0.0000, 0.0000, -35.0000]]) + >>> torch.mm(q, r).round() + tensor([[ 12., -51., 4.], + [ 6., 167., -68.], + [ -4., 24., -41.]]) + >>> torch.mm(q.t(), q).round() + tensor([[ 1., 0., 0.], + [ 0., 1., -0.], + [ 0., -0., 1.]]) + >>> a = torch.randn(3, 4, 5) + >>> q, r = torch.linalg.qr(a, mode='complete') + >>> torch.allclose(torch.matmul(q, r), a) + True + >>> torch.allclose(torch.matmul(q.transpose(-2, -1), q), torch.eye(5)) + True +""") diff --git a/torch/multiprocessing/__init__.py b/torch/multiprocessing/__init__.py index 6a95c4e916b0..039ddf2a1b09 100644 --- a/torch/multiprocessing/__init__.py +++ b/torch/multiprocessing/__init__.py @@ -35,7 +35,7 @@ """Add helper function to spawn N processes and wait for completion of any of them. This depends `mp.get_context` which was added in Python 3.4.""" -from .spawn import spawn, SpawnContext, _supports_context, start_processes, ProcessContext, \ +from .spawn import spawn, SpawnContext, start_processes, ProcessContext, \ ProcessRaisedException, ProcessExitedException @@ -50,7 +50,7 @@ def set_sharing_strategy(new_strategy): """Sets the strategy for sharing CPU tensors. - Arguments: + Args: new_strategy (str): Name of the selected strategy. Should be one of the values returned by :func:`get_all_sharing_strategies()`. """ diff --git a/torch/multiprocessing/_atfork.py b/torch/multiprocessing/_atfork.py index de7b77ef7901..b9d59bc30604 100644 --- a/torch/multiprocessing/_atfork.py +++ b/torch/multiprocessing/_atfork.py @@ -23,7 +23,7 @@ def register_after_fork(func): ``multiprocessing`` module. In python >= 3.7 it also works with ``os.fork()``. - Arguments: + Args: func (function): Function taking no arguments to be called in the child after fork """ diff --git a/torch/multiprocessing/spawn.py b/torch/multiprocessing/spawn.py index 595f49243faf..9ad17c94ccf8 100644 --- a/torch/multiprocessing/spawn.py +++ b/torch/multiprocessing/spawn.py @@ -66,24 +66,8 @@ def _wrap(fn, i, args, error_queue): sys.exit(1) -# Multiprocessing contexts are introduced at Python 3.4 -_supports_context = sys.version_info >= (3, 4) - - -def _python_version_check(): - if not _supports_context: - raise RuntimeError("Requires python 3.4 or higher to use " - "torch.multiprocessing.spawn and " - "torch.multiprocessing.ProcessContext helper " - "to launch multiple processes. 
If you are using " - "this for distributed training and have a lower " - "version of python, please use " - "torch.distributed.launch instead.") - - class ProcessContext: def __init__(self, processes, error_queues): - _python_version_check() self.error_queues = error_queues self.processes = processes self.sentinels = { @@ -104,7 +88,7 @@ def join(self, timeout=None): Returns ``True`` if all processes have been joined successfully, ``False`` if there are more processes that need to be joined. - Arguments: + Args: timeout (float): Wait this long before giving up on waiting. """ # Ensure this function can be called even when we're done. @@ -182,7 +166,6 @@ def __init__(self, processes, error_queues): # Currently we only add this API first, we can consider adding it to documentation as # needed in the future. def start_processes(fn, args=(), nprocs=1, join=True, daemon=False, start_method='spawn'): - _python_version_check() mp = multiprocessing.get_context(start_method) error_queues = [] processes = [] @@ -215,7 +198,7 @@ def spawn(fn, args=(), nprocs=1, join=True, daemon=False, start_method='spawn'): child process, it is forwarded and its traceback is included in the exception raised in the parent process. - Arguments: + Args: fn (function): Function is called as the entrypoint of the spawned process. This function must be defined at the top level of a module so it can be pickled and spawned. This diff --git a/torch/nn/common_types.py b/torch/nn/common_types.py index fa9d5bb1eb00..884f739e2781 100644 --- a/torch/nn/common_types.py +++ b/torch/nn/common_types.py @@ -1,4 +1,4 @@ -from typing import TypeVar, Union, Tuple +from typing import TypeVar, Union, Tuple, Optional from .. import Tensor # Create some useful type aliases @@ -24,6 +24,11 @@ _size_5_t = _scalar_or_tuple_5_t[int] _size_6_t = _scalar_or_tuple_6_t[int] +# For arguments which represent optional size parameters (eg, adaptive pool parameters) +_size_any_opt_t = _scalar_or_tuple_any_t[Optional[int]] +_size_2_opt_t = _scalar_or_tuple_2_t[Optional[int]] +_size_3_opt_t = _scalar_or_tuple_3_t[Optional[int]] + # For arguments that represent a ratio to adjust each dimension of an input with (eg, upsampling parameters) _ratio_2_t = _scalar_or_tuple_2_t[float] _ratio_3_t = _scalar_or_tuple_3_t[float] diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 2563d4b0ba29..fc71e4a0c449 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -1430,7 +1430,7 @@ def softmin(input, dim=None, _stacklevel=3, dtype=None): See :class:`~torch.nn.Softmin` for more details. - Arguments: + Args: input (Tensor): input dim (int): A dimension along which softmin will be computed (so every slice along dim will sum to 1). @@ -1464,7 +1464,7 @@ def softmax(input, dim=None, _stacklevel=3, dtype=None): See :class:`~torch.nn.Softmax` for more details. - Arguments: + Args: input (Tensor): input dim (int): A dimension along which softmax will be computed. dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. @@ -1563,7 +1563,7 @@ def log_softmax(input, dim=None, _stacklevel=3, dtype=None): See :class:`~torch.nn.LogSoftmax` for more details. - Arguments: + Args: input (Tensor): input dim (int): A dimension along which log_softmax will be computed. dtype (:class:`torch.dtype`, optional): the desired data type of returned tensor. 
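The ``_size_any_opt_t`` / ``_size_2_opt_t`` / ``_size_3_opt_t`` aliases added above, together with the stub updates that follow, reflect that adaptive pooling layers accept ``None`` entries in ``output_size``, meaning "keep this input dimension as is". A short sketch of that behavior::

    import torch
    import torch.nn as nn

    # None keeps the corresponding input dimension unchanged.
    pool = nn.AdaptiveAvgPool2d((None, 7))
    x = torch.randn(1, 8, 13, 21)
    print(pool(x).shape)  # torch.Size([1, 8, 13, 7])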
diff --git a/torch/nn/functional.pyi.in b/torch/nn/functional.pyi.in index 94071556e144..208dc7c2df40 100644 --- a/torch/nn/functional.pyi.in +++ b/torch/nn/functional.pyi.in @@ -1,7 +1,7 @@ from torch import Tensor from torch.types import _size from typing import Any, Optional, Tuple, Dict, List, Callable, Sequence, Union -from .common_types import _ratio_any_t, _size_1_t, _size_2_t, _size_3_t +from .common_types import _ratio_any_t, _size_1_t, _size_2_t, _size_3_t, _size_2_opt_t, _size_3_opt_t # 'TypedDict' is a new accepted type that represents a dictionary with a fixed set of allowed keys. # It is standards-track but not in `typing` yet. We leave this hear to be uncommented once the feature @@ -75,21 +75,21 @@ def adaptive_max_pool1d_with_indices(input: Tensor, output_size: _size, return_i Tensor, Tensor]: ... -def adaptive_max_pool2d_with_indices(input: Tensor, output_size: _size, return_indices: bool = ...) -> Tuple[ +def adaptive_max_pool2d_with_indices(input: Tensor, output_size: _size_2_opt_t, return_indices: bool = ...) -> Tuple[ Tensor, Tensor]: ... -def adaptive_max_pool3d_with_indices(input: Tensor, output_size: _size, return_indices: bool = ...) -> Tuple[ +def adaptive_max_pool3d_with_indices(input: Tensor, output_size: _size_3_opt_t, return_indices: bool = ...) -> Tuple[ Tensor, Tensor]: ... def adaptive_avg_pool1d(input: Tensor, output_size: _size_1_t) -> Tensor: ... -def adaptive_avg_pool2d(input: Tensor, output_size: _size_2_t) -> Tensor: ... +def adaptive_avg_pool2d(input: Tensor, output_size: _size_2_opt_t) -> Tensor: ... -def adaptive_avg_pool3d(input: Tensor, output_size: _size_3_t) -> Tensor: ... +def adaptive_avg_pool3d(input: Tensor, output_size: _size_3_opt_t) -> Tensor: ... def dropout(input: Tensor, p: float = ..., training: bool = ..., inplace: bool = ...) -> Tensor: ... diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py index 4b07682b1af7..837ecca6fe9d 100644 --- a/torch/nn/modules/activation.py +++ b/torch/nn/modules/activation.py @@ -365,11 +365,11 @@ class SiLU(Module): \text{silu}(x) = x * \sigma(x), \text{where } \sigma(x) \text{ is the logistic sigmoid.} .. note:: - See `Gaussian Error Linear Units (GELUs) `_ - where the SiLU (Sigmoid Linear Unit) was originally coined, and see - `Sigmoid-Weighted Linear Units for Neural Network Function Approximation - in Reinforcement Learning `_ and `Swish: - a Self-Gated Activation Function `_ + See `Gaussian Error Linear Units (GELUs) `_ + where the SiLU (Sigmoid Linear Unit) was originally coined, and see + `Sigmoid-Weighted Linear Units for Neural Network Function Approximation + in Reinforcement Learning `_ and `Swish: + a Self-Gated Activation Function `_ where the SiLU was experimented with later. Shape: @@ -848,8 +848,9 @@ class MultiheadAttention(Module): kdim: total number of features in key. Default: None. vdim: total number of features in value. Default: None. - Note: if kdim and vdim are None, they will be set to embed_dim such that - query, key, and value have the same number of features. + Note that if :attr:`kdim` and :attr:`vdim` are None, they will be set + to :attr:`embed_dim` such that query, key, and value have the same + number of features. 
Examples:: @@ -921,9 +922,8 @@ def __setstate__(self, state): super(MultiheadAttention, self).__setstate__(state) - def forward(self, query, key, value, key_padding_mask=None, - need_weights=True, attn_mask=None): - # type: (Tensor, Tensor, Tensor, Optional[Tensor], bool, Optional[Tensor]) -> Tuple[Tensor, Optional[Tensor]] + def forward(self, query: Tensor, key: Tensor, value: Tensor, key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, attn_mask: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]: r""" Args: query, key, value: map a query and a set of key-value pairs to an output. @@ -937,8 +937,7 @@ def forward(self, query, key, value, key_padding_mask=None, attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all the batches while a 3D mask allows to specify a different mask for the entries of each batch. - Shape: - - Inputs: + Shapes for inputs: - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is the embedding dimension. - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is @@ -949,15 +948,17 @@ def forward(self, query, key, value, key_padding_mask=None, If a ByteTensor is provided, the non-zero positions will be ignored while the position with the zero positions will be unchanged. If a BoolTensor is provided, the positions with the value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged. - - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length. - 3D mask :math:`(N*\text{num_heads}, L, S)` where N is the batch size, L is the target sequence length, - S is the source sequence length. attn_mask ensure that position i is allowed to attend the unmasked - positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend + - attn_mask: if a 2D mask: :math:`(L, S)` where L is the target sequence length, S is the + source sequence length. + + If a 3D mask: :math:`(N\cdot\text{num\_heads}, L, S)` where N is the batch size, L is the target sequence + length, S is the source sequence length. ``attn_mask`` ensure that position i is allowed to attend + the unmasked positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True`` is not allowed to attend while ``False`` values will be unchanged. If a FloatTensor is provided, it will be added to the attention weight. - - Outputs: + Shapes for outputs: - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is the embedding dimension. - attn_output_weights: :math:`(N, L, S)` where N is the batch size, @@ -1112,7 +1113,7 @@ class Softmin(Module): dimensions - Output: :math:`(*)`, same shape as the input - Arguments: + Args: dim (int): A dimension along which Softmin will be computed (so every slice along dim will sum to 1). @@ -1166,7 +1167,7 @@ class Softmax(Module): a Tensor of the same dimension and shape as the input with values in the range [0, 1] - Arguments: + Args: dim (int): A dimension along which Softmax will be computed (so every slice along dim will sum to 1). @@ -1240,7 +1241,7 @@ class LogSoftmax(Module): dimensions - Output: :math:`(*)`, same shape as the input - Arguments: + Args: dim (int): A dimension along which LogSoftmax will be computed. 
Returns: diff --git a/torch/nn/modules/container.py b/torch/nn/modules/container.py index f95e5b144d38..2ead84fe73e8 100644 --- a/torch/nn/modules/container.py +++ b/torch/nn/modules/container.py @@ -125,7 +125,7 @@ class ModuleList(Module): modules it contains are properly registered, and will be visible by all :class:`~torch.nn.Module` methods. - Arguments: + Args: modules (iterable, optional): an iterable of modules to add Example:: @@ -197,7 +197,7 @@ def __dir__(self): def insert(self, index: int, module: Module) -> None: r"""Insert a given module before a given index in the list. - Arguments: + Args: index (int): index to insert. module (nn.Module): module to insert """ @@ -208,7 +208,7 @@ def insert(self, index: int, module: Module) -> None: def append(self: T, module: Module) -> T: r"""Appends a given module to the end of the list. - Arguments: + Args: module (nn.Module): module to append """ self.add_module(str(len(self)), module) @@ -217,7 +217,7 @@ def append(self: T, module: Module) -> T: def extend(self: T, modules: Iterable[Module]) -> T: r"""Appends modules from a Python iterable to the end of the list. - Arguments: + Args: modules (iterable): iterable of modules to append """ if not isinstance(modules, container_abcs.Iterable): @@ -252,7 +252,7 @@ class ModuleDict(Module): types (e.g., Python's plain ``dict`` before Python version 3.6) does not preserve the order of the merged mapping. - Arguments: + Args: modules (iterable, optional): a mapping (dictionary) of (string: module) or an iterable of key-value pairs of type (string, module) @@ -311,7 +311,7 @@ def clear(self) -> None: def pop(self, key: str) -> Module: r"""Remove key from the ModuleDict and return its module. - Arguments: + Args: key (string): key to pop from the ModuleDict """ v = self[key] @@ -344,7 +344,7 @@ def update(self, modules: Mapping[str, Module]) -> None: If :attr:`modules` is an ``OrderedDict``, a :class:`~torch.nn.ModuleDict`, or an iterable of key-value pairs, the order of new elements in it is preserved. - Arguments: + Args: modules (iterable): a mapping (dictionary) from string to :class:`~torch.nn.Module`, or an iterable of key-value pairs of type (string, :class:`~torch.nn.Module`) """ @@ -379,7 +379,7 @@ class ParameterList(Module): list, but parameters it contains are properly registered, and will be visible by all :class:`~torch.nn.Module` methods. - Arguments: + Args: parameters (iterable, optional): an iterable of :class:`~torch.nn.Parameter` to add Example:: @@ -458,7 +458,7 @@ def __dir__(self): def append(self: T, parameter: 'Parameter') -> T: """Appends a given parameter at the end of the list. - Arguments: + Args: parameter (nn.Parameter): parameter to append """ self.register_parameter(str(len(self)), parameter) @@ -467,7 +467,7 @@ def append(self: T, parameter: 'Parameter') -> T: def extend(self: T, parameters: Iterable['Parameter']) -> T: """Appends parameters from a Python iterable to the end of the list. - Arguments: + Args: parameters (iterable): iterable of parameters to append """ if not isinstance(parameters, container_abcs.Iterable): @@ -518,7 +518,7 @@ class ParameterDict(Module): types (e.g., Python's plain ``dict``) does not preserve the order of the merged mapping. 
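The ``ModuleList`` methods documented above (``insert``, ``append``, ``extend``) register their arguments as proper submodules under string indices; a quick usage sketch::

    import torch.nn as nn

    layers = nn.ModuleList(nn.Linear(10, 10) for _ in range(3))
    layers.append(nn.ReLU())         # registered under the name "3"
    layers.insert(0, nn.Flatten())   # existing entries shift up by one
    print(len(layers))               # 5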
- Arguments: + Args: parameters (iterable, optional): a mapping (dictionary) of (string : :class:`~torch.nn.Parameter`) or an iterable of key-value pairs of type (string, :class:`~torch.nn.Parameter`) @@ -581,7 +581,7 @@ def clear(self) -> None: def pop(self, key: str) -> 'Parameter': r"""Remove key from the ParameterDict and return its parameter. - Arguments: + Args: key (string): key to pop from the ParameterDict """ v = self[key] @@ -611,7 +611,7 @@ def update(self, parameters: Mapping[str, 'Parameter']) -> None: If :attr:`parameters` is an ``OrderedDict``, a :class:`~torch.nn.ParameterDict`, or an iterable of key-value pairs, the order of new elements in it is preserved. - Arguments: + Args: parameters (iterable): a mapping (dictionary) from string to :class:`~torch.nn.Parameter`, or an iterable of key-value pairs of type (string, :class:`~torch.nn.Parameter`) diff --git a/torch/nn/modules/conv.py b/torch/nn/modules/conv.py index f22c35fa39ff..6a9c4dcd2ef6 100644 --- a/torch/nn/modules/conv.py +++ b/torch/nn/modules/conv.py @@ -530,8 +530,9 @@ def __init__(self, in_channels, out_channels, kernel_size, stride, # dilation being an optional parameter is for backwards # compatibility - def _output_padding(self, input, output_size, stride, padding, kernel_size, dilation=None): - # type: (Tensor, Optional[List[int]], List[int], List[int], List[int], Optional[List[int]]) -> List[int] + def _output_padding(self, input: Tensor, output_size: Optional[List[int]], + stride: List[int], padding: List[int], kernel_size: List[int], + dilation: Optional[List[int]] = None) -> List[int]: if output_size is None: ret = _single(self.output_padding) # converting to list if was not already else: diff --git a/torch/nn/modules/flatten.py b/torch/nn/modules/flatten.py index c06b7a5534f6..dd491ba99620 100644 --- a/torch/nn/modules/flatten.py +++ b/torch/nn/modules/flatten.py @@ -2,7 +2,7 @@ from typing import Tuple, Union from torch import Tensor -from torch import Size +from torch.types import _size class Flatten(Module): @@ -53,8 +53,8 @@ class Unflatten(Module): be either `int` or `str` when `Tensor` or `NamedTensor` is used, respectively. * :attr:`unflattened_size` is the new shape of the unflattened dimension of the tensor and it can be - a `tuple` of ints or `torch.Size` for `Tensor` input or a `NamedShape` (tuple of `(name, size)` tuples) - for `NamedTensor` input. + a `tuple` of ints or a `list` of ints or `torch.Size` for `Tensor` input; a `NamedShape` + (tuple of `(name, size)` tuples) for `NamedTensor` input. 
Shape: - Input: :math:`(N, *dims)` @@ -62,7 +62,7 @@ class Unflatten(Module): Args: dim (Union[int, str]): Dimension to be unflattened - unflattened_size (Union[torch.Size, NamedShape]): New shape of the unflattened dimension + unflattened_size (Union[torch.Size, Tuple, List, NamedShape]): New shape of the unflattened dimension Examples: >>> input = torch.randn(2, 50) @@ -71,7 +71,7 @@ class Unflatten(Module): >>> nn.Linear(50, 50), >>> nn.Unflatten(1, (2, 5, 5)) >>> ) - >>> output = m(output) + >>> output = m(input) >>> output.size() torch.Size([2, 2, 5, 5]) >>> # With torch.Size @@ -79,15 +79,13 @@ class Unflatten(Module): >>> nn.Linear(50, 50), >>> nn.Unflatten(1, torch.Size([2, 5, 5])) >>> ) - >>> output = m(output) + >>> output = m(input) >>> output.size() torch.Size([2, 2, 5, 5]) >>> # With namedshape (tuple of tuples) - >>> m = nn.Sequential( - >>> nn.Linear(50, 50), - >>> nn.Unflatten('features', (('C', 2), ('H', 50), ('W',50))) - >>> ) - >>> output = m(output) + >>> input = torch.randn(2, 50, names=('N', 'features')) + >>> unflatten = nn.Unflatten('features', (('C', 2), ('H', 5), ('W', 5))) + >>> output = unflatten(input) >>> output.size() torch.Size([2, 2, 5, 5]) """ @@ -95,9 +93,9 @@ class Unflatten(Module): __constants__ = ['dim', 'unflattened_size'] dim: Union[int, str] - unflattened_size: Union[Size, NamedShape] + unflattened_size: Union[_size, NamedShape] - def __init__(self, dim: Union[int, str], unflattened_size: Union[Size, NamedShape]) -> None: + def __init__(self, dim: Union[int, str], unflattened_size: Union[_size, NamedShape]) -> None: super(Unflatten, self).__init__() if isinstance(dim, int): @@ -121,7 +119,7 @@ def _require_tuple_tuple(self, input): "but found type {}".format(type(input).__name__)) def _require_tuple_int(self, input): - if (isinstance(input, tuple)): + if (isinstance(input, (tuple, list))): for idx, elem in enumerate(input): if not isinstance(elem, int): raise TypeError("unflattened_size must be tuple of ints, " + diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py index 7fad378bc7b9..f054590da66a 100644 --- a/torch/nn/modules/module.py +++ b/torch/nn/modules/module.py @@ -489,7 +489,7 @@ def cuda(self: T, device: Optional[Union[int, device]] = None) -> T: it should be called before constructing optimizer if the module will live on GPU while being optimized. - Arguments: + Args: device (int, optional): if specified, all parameters will be copied to that device @@ -509,7 +509,7 @@ def cpu(self: T) -> T: def type(self: T, dst_type: Union[dtype, str]) -> T: r"""Casts all parameters and buffers to :attr:`dst_type`. - Arguments: + Args: dst_type (type or string): the desired type Returns: @@ -843,7 +843,6 @@ def _slow_forward(self, *input, **kwargs): if recording_scopes: name = torch.jit._trace._trace_module_map[self] if self in torch.jit._trace._trace_module_map else None if name: - cur_scope_name = tracing_state.current_scope() tracing_state.push_scope(name) else: recording_scopes = False @@ -1015,7 +1014,7 @@ def _save_to_state_dict(self, destination, prefix, keep_vars): In rare cases, subclasses can achieve class-specific behavior by overriding this method with custom logic. - Arguments: + Args: destination (dict): a dict where state will be stored prefix (str): the prefix for parameters and buffers used in this module @@ -1096,7 +1095,7 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, :attr:`state_dict` to :meth:`~torch.nn.Module.load_state_dict`. So it can be modified. 
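Besides the corrected examples above, the ``_require_tuple_int`` change lets ``unflattened_size`` be given as a plain ``list`` of ints in addition to a tuple or ``torch.Size``; a short sketch of the new form::

    import torch
    import torch.nn as nn

    unflatten = nn.Unflatten(1, [2, 5, 5])          # a list now works like (2, 5, 5)
    print(unflatten(torch.randn(2, 50)).shape)      # torch.Size([2, 2, 5, 5])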
- Arguments: + Args: state_dict (dict): a dict containing parameters and persistent buffers. prefix (str): the prefix for parameters and buffers used in this @@ -1166,7 +1165,7 @@ def load_state_dict(self, state_dict: Union[Dict[str, Tensor], Dict[str, Tensor] the keys of :attr:`state_dict` must exactly match the keys returned by this module's :meth:`~torch.nn.Module.state_dict` function. - Arguments: + Args: state_dict (dict): a dict containing parameters and persistent buffers. strict (bool, optional): whether to strictly enforce that the keys @@ -1479,7 +1478,7 @@ def zero_grad(self, set_to_none: bool = False) -> None: r"""Sets gradients of all model parameters to zero. See similar function under :class:`torch.optim.Optimizer` for more context. - Arguments: + Args: set_to_none (bool): instead of setting to zero, set the grads to None. See :meth:`torch.optim.Optimizer.zero_grad` for details. """ diff --git a/torch/nn/modules/pooling.py b/torch/nn/modules/pooling.py index e8f68307f230..78aae504083b 100644 --- a/torch/nn/modules/pooling.py +++ b/torch/nn/modules/pooling.py @@ -5,7 +5,8 @@ from .utils import _single, _pair, _triple from .. import functional as F -from ..common_types import _size_any_t, _size_1_t, _size_2_t, _size_3_t, _ratio_3_t, _ratio_2_t +from ..common_types import (_size_any_t, _size_1_t, _size_2_t, _size_3_t, + _ratio_3_t, _ratio_2_t, _size_any_opt_t, _size_2_opt_t, _size_3_opt_t) class _MaxPoolNd(Module): @@ -953,7 +954,7 @@ class _AdaptiveMaxPoolNd(Module): __constants__ = ['output_size', 'return_indices'] return_indices: bool - def __init__(self, output_size: _size_any_t, return_indices: bool = False) -> None: + def __init__(self, output_size: _size_any_opt_t, return_indices: bool = False) -> None: super(_AdaptiveMaxPoolNd, self).__init__() self.output_size = output_size self.return_indices = return_indices @@ -1020,7 +1021,7 @@ class AdaptiveMaxPool2d(_AdaptiveMaxPoolNd): """ - output_size: _size_2_t + output_size: _size_2_opt_t def forward(self, input: Tensor) -> Tensor: return F.adaptive_max_pool2d(input, self.output_size, self.return_indices) @@ -1057,7 +1058,7 @@ class AdaptiveMaxPool3d(_AdaptiveMaxPoolNd): """ - output_size: _size_3_t + output_size: _size_3_opt_t def forward(self, input: Tensor) -> Tensor: return F.adaptive_max_pool3d(input, self.output_size, self.return_indices) @@ -1066,7 +1067,7 @@ def forward(self, input: Tensor) -> Tensor: class _AdaptiveAvgPoolNd(Module): __constants__ = ['output_size'] - def __init__(self, output_size: _size_any_t) -> None: + def __init__(self, output_size: _size_any_opt_t) -> None: super(_AdaptiveAvgPoolNd, self).__init__() self.output_size = output_size @@ -1125,7 +1126,7 @@ class AdaptiveAvgPool2d(_AdaptiveAvgPoolNd): """ - output_size: _size_2_t + output_size: _size_2_opt_t def forward(self, input: Tensor) -> Tensor: return F.adaptive_avg_pool2d(input, self.output_size) @@ -1159,7 +1160,7 @@ class AdaptiveAvgPool3d(_AdaptiveAvgPoolNd): """ - output_size: _size_3_t + output_size: _size_3_opt_t def forward(self, input: Tensor) -> Tensor: return F.adaptive_avg_pool3d(input, self.output_size) diff --git a/torch/nn/modules/utils.py b/torch/nn/modules/utils.py index 3e0b93c7afc0..97e4195619cb 100644 --- a/torch/nn/modules/utils.py +++ b/torch/nn/modules/utils.py @@ -26,8 +26,7 @@ def _reverse_repeat_tuple(t, n): return tuple(x for x in reversed(t) for _ in range(n)) -def _list_with_default(out_size, defaults): - # type: (List[int], List[int]) -> List[int] +def _list_with_default(out_size: List[int], defaults: List[int]) 
-> List[int]: if isinstance(out_size, int): return out_size if len(defaults) <= len(out_size): diff --git a/torch/nn/parallel/comm.py b/torch/nn/parallel/comm.py index dacd74a2fba0..587f88eb1b99 100644 --- a/torch/nn/parallel/comm.py +++ b/torch/nn/parallel/comm.py @@ -8,7 +8,7 @@ def broadcast(tensor, devices=None, *, out=None): r"""Broadcasts a tensor to specified GPU devices. - Arguments: + Args: tensor (Tensor): tensor to broadcast. Can be on CPU or GPU. devices (Iterable[torch.device, str or int], optional): an iterable of GPU devices, among which to broadcast. @@ -43,7 +43,7 @@ def broadcast_coalesced(tensors, devices, buffer_size=10485760): Small tensors are first coalesced into a buffer to reduce the number of synchronizations. - Arguments: + Args: tensors (sequence): tensors to broadcast. Must be on the same device, either CPU or GPU. devices (Iterable[torch.device, str or int]): an iterable of GPU @@ -64,7 +64,7 @@ def reduce_add(inputs, destination=None): All inputs should have matching shapes, dtype, and layout. The output tensor will be of the same shape, dtype, and layout. - Arguments: + Args: inputs (Iterable[Tensor]): an iterable of tensors to add. destination (int, optional): a device on which the output will be placed (default: current device). @@ -110,7 +110,7 @@ def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760): Small tensors are first coalesced into a buffer to reduce the number of synchronizations. - Arguments: + Args: inputs (Iterable[Iterable[Tensor]]): iterable of iterables that contain tensors from a single device. destination (int, optional): a device on which the output will be @@ -152,7 +152,7 @@ def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760): def scatter(tensor, devices=None, chunk_sizes=None, dim=0, streams=None, *, out=None): """Scatters tensor across multiple GPUs. - Arguments: + Args: tensor (Tensor): tensor to scatter. Can be on CPU or GPU. devices (Iterable[torch.device, str or int], optional): an iterable of GPU devices, among which to scatter. @@ -202,7 +202,7 @@ def scatter(tensor, devices=None, chunk_sizes=None, dim=0, streams=None, *, out= def gather(tensors, dim=0, destination=None, *, out=None): r"""Gathers tensors from multiple GPU devices. - Arguments: + Args: tensors (Iterable[Tensor]): an iterable of tensors to gather. Tensor sizes in all dimensions other than :attr:`dim` have to match. dim (int, optional): a dimension along which the tensors will be diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py index 1cb446f1155d..064ae6a86b3a 100644 --- a/torch/nn/parallel/distributed.py +++ b/torch/nn/parallel/distributed.py @@ -999,7 +999,7 @@ def register_comm_hook(self, state: object, hook: callable): and gradient compression which involve different communication strategies for parameter syncs while running Distributed DataParallel training. - Arguments: + Args: state (object): state is passed to the hook and can be used to maintain and update any state information that users would like to maintain as part of the training process. Examples: error @@ -1084,7 +1084,7 @@ def _register_builtin_comm_hook( The built-in hooks aim to provide efficient C++ implementations for certain hooks, which might not be as efficient if implemented in Python using a Python communication hook. - Arguments: + Args: comm_hook_type (dist.BuiltinCommHookType): type of communication hook, such as ALLREDUCE, FP16_COMPRESS, etc. 
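The ``register_comm_hook`` documentation above describes the hook contract: a user ``state`` object plus a callable that receives the gradient bucket and returns a future holding the communicated tensors. A minimal no-op hook sketch; the ``bucket.get_tensors()`` accessor name is an assumption and may differ between releases::

    import torch

    def noop_hook(state, bucket) -> torch.futures.Future:
        # Hand the bucket's gradients back unchanged in an already-completed
        # future; DDP uses the future's value as the "reduced" gradients.
        fut = torch.futures.Future()
        fut.set_result(bucket.get_tensors())  # assumed GradBucket accessor
        return fut

    # Assuming `ddp_model` is an existing DistributedDataParallel instance:
    # ddp_model.register_comm_hook(state=None, hook=noop_hook)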
diff --git a/torch/nn/parallel/replicate.py b/torch/nn/parallel/replicate.py index 73aac9545507..8effeece5908 100644 --- a/torch/nn/parallel/replicate.py +++ b/torch/nn/parallel/replicate.py @@ -44,7 +44,7 @@ def descendant_modules(module): if memo is None: memo = set() - # memorize visited modules + # memoize visited modules memo.add(module) if _is_script_module(module): memo.update(descendant_modules(module)) @@ -108,7 +108,6 @@ def replicate(network, devices, detach=False): modules = list(network.modules()) module_copies = [[] for device in devices] module_indices = {} - scriptmodule_skip_attr = {"_parameters", "_buffers", "_modules", "forward", "_c"} for i, module in enumerate(modules): module_indices[module] = i diff --git a/torch/nn/parameter.py b/torch/nn/parameter.py index 04a3f6d91b29..c5d63708e908 100644 --- a/torch/nn/parameter.py +++ b/torch/nn/parameter.py @@ -15,7 +15,7 @@ class Parameter(torch.Tensor): the model. If there was no such class as :class:`Parameter`, these temporaries would get registered too. - Arguments: + Args: data (Tensor): parameter tensor. requires_grad (bool, optional): if the parameter requires gradient. See :ref:`excluding-subgraphs` for more details. Default: `True` diff --git a/torch/nn/quantizable/__init__.py b/torch/nn/quantizable/__init__.py new file mode 100644 index 000000000000..270dcebaa5f4 --- /dev/null +++ b/torch/nn/quantizable/__init__.py @@ -0,0 +1 @@ +from .modules import * diff --git a/torch/nn/quantizable/modules/__init__.py b/torch/nn/quantizable/modules/__init__.py new file mode 100644 index 000000000000..b3480b717a2d --- /dev/null +++ b/torch/nn/quantizable/modules/__init__.py @@ -0,0 +1,7 @@ +from .rnn import LSTM +from .rnn import LSTMCell + +__all__ = [ + 'LSTM', + 'LSTMCell', +] diff --git a/torch/nn/quantizable/modules/rnn.py b/torch/nn/quantizable/modules/rnn.py new file mode 100644 index 000000000000..cfe076fac16c --- /dev/null +++ b/torch/nn/quantizable/modules/rnn.py @@ -0,0 +1,403 @@ +import numbers +from typing import Optional, Tuple +import warnings + +import torch +from torch import Tensor + +""" +We will recreate all the RNN modules, as we require the modules to be decomposed +into their building blocks so that they can be observed. +""" + +class LSTMCell(torch.nn.Module): + r"""A quantizable long short-term memory (LSTM) cell.
+ + For the description and the argument types, please, refer to :class:`~torch.nn.LSTMCell` + + Examples:: + + >>> import torch.nn.quantizable as nnqa + >>> rnn = nnqa.LSTMCell(10, 20) + >>> input = torch.randn(6, 3, 10) + >>> hx = torch.randn(3, 20) + >>> cx = torch.randn(3, 20) + >>> output = [] + >>> for i in range(6): + hx, cx = rnn(input[i], (hx, cx)) + output.append(hx) + """ + _FLOAT_MODULE = torch.nn.LSTMCell + + def __init__(self, input_dim: int, hidden_dim: int, bias: bool = True): + super().__init__() + self.input_size = input_dim + self.hidden_size = hidden_dim + self.bias = bias + + self.igates = torch.nn.Linear(input_dim, 4 * hidden_dim, bias=bias) + self.hgates = torch.nn.Linear(hidden_dim, 4 * hidden_dim, bias=bias) + self.gates = torch.nn.quantized.FloatFunctional() + + self.fgate_cx = torch.nn.quantized.FloatFunctional() + self.igate_cgate = torch.nn.quantized.FloatFunctional() + self.fgate_cx_igate_cgate = torch.nn.quantized.FloatFunctional() + + self.ogate_cy = torch.nn.quantized.FloatFunctional() + + def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None) -> Tuple[Tensor, Tensor]: + if hidden is None or hidden == (None, None): + hidden = self.initialize_hidden(x.shape[0], x.is_quantized) + hx, cx = hidden + + igates = self.igates(x) + hgates = self.hgates(hx) + gates = self.gates.add(igates, hgates) + + input_gate, forget_gate, cell_gate, out_gate = gates.chunk(4, 1) + + input_gate = torch.sigmoid(input_gate) + forget_gate = torch.sigmoid(forget_gate) + cell_gate = torch.tanh(cell_gate) + out_gate = torch.sigmoid(out_gate) + + fgate_cx = self.fgate_cx.mul(forget_gate, cx) + igate_cgate = self.igate_cgate.mul(input_gate, cell_gate) + fgate_cx_igate_cgate = self.fgate_cx_igate_cgate.add(fgate_cx, igate_cgate) + cy = fgate_cx_igate_cgate + + tanh_cy = torch.tanh(cy) + hy = self.ogate_cy.mul(out_gate, tanh_cy) + return hy, cy + + def initialize_hidden(self, batch_size: int, is_quantized: bool = False) -> Tuple[Tensor, Tensor]: + h, c = torch.zeros((batch_size, self.hidden_size)), torch.zeros((batch_size, self.hidden_size)) + if is_quantized: + h = torch.quantize_per_tensor(h, scale=1.0, zero_point=0, dtype=torch.quint8) + c = torch.quantize_per_tensor(c, scale=1.0, zero_point=0, dtype=torch.quint8) + return h, c + + def _get_name(self): + return 'QuantizableLSTMCell' + + @classmethod + def from_params(cls, wi, wh, bi=None, bh=None): + """Uses the weights and biases to create a new LSTM cell. + + Args: + wi, wh: Weights for the input and hidden layers + bi, bh: Biases for the input and hidden layers + """ + assert (bi is None) == (bh is None) # Either both None or both have values + input_size = wi.shape[1] + hidden_size = wh.shape[1] + cell = cls(input_dim=input_size, hidden_dim=hidden_size, + bias=(bi is not None)) + cell.igates.weight = torch.nn.Parameter(wi) + if bi is not None: + cell.igates.bias = torch.nn.Parameter(bi) + cell.hgates.weight = torch.nn.Parameter(wh) + if bh is not None: + cell.hgates.bias = torch.nn.Parameter(bh) + return cell + + @classmethod + def from_float(cls, other): + assert type(other) == cls._FLOAT_MODULE + assert hasattr(other, 'qconfig'), "The float module must have 'qconfig'" + observed = cls.from_params(other.weight_ih, other.weight_hh, + other.bias_ih, other.bias_hh) + observed.qconfig = other.qconfig + observed.igates.qconfig = other.qconfig + observed.hgates.qconfig = other.qconfig + return observed + + +class _LSTMSingleLayer(torch.nn.Module): + r"""A single one-directional LSTM layer.
+ + The difference between a layer and a cell is that the layer can process a + sequence, while the cell only expects an instantaneous value. + """ + def __init__(self, input_dim: int, hidden_dim: int, bias: bool = True): + super().__init__() + self.cell = LSTMCell(input_dim, hidden_dim, bias=bias) + + def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None): + result = [] + for xx in x: + hidden = self.cell(xx, hidden) + result.append(hidden[0]) # type: ignore + result_tensor = torch.stack(result, 0) + return result_tensor, hidden + + @classmethod + def from_params(cls, *args, **kwargs): + cell = LSTMCell.from_params(*args, **kwargs) + layer = cls(cell.input_size, cell.hidden_size, cell.bias) + layer.cell = cell + return layer + + +class _LSTMLayer(torch.nn.Module): + r"""A single bi-directional LSTM layer.""" + def __init__(self, input_dim: int, hidden_dim: int, bias: bool = True, + batch_first: bool = False, bidirectional: bool = False): + super().__init__() + self.batch_first = batch_first + self.bidirectional = bidirectional + self.layer_fw = _LSTMSingleLayer(input_dim, hidden_dim, bias=bias) + if self.bidirectional: + self.layer_bw = _LSTMSingleLayer(input_dim, hidden_dim, bias=bias) + + def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None): + if self.batch_first: + x = x.transpose(0, 1) + if hidden is None: + hx_fw, cx_fw = (None, None) + else: + hx_fw, cx_fw = hidden + if self.bidirectional: + if hx_fw is None: + hx_bw = None + else: + hx_bw = hx_fw[1] + hx_fw = hx_fw[0] + if cx_fw is None: + cx_bw = None + else: + cx_bw = cx_fw[1] + cx_fw = cx_fw[0] + hidden_bw = hx_bw, cx_bw + hidden_fw = hx_fw, cx_fw + result_fw, hidden_fw = self.layer_fw(x, hidden_fw) + + if self.bidirectional: + x_reversed = x.flip(0) + result_bw, hidden_bw = self.layer_bw(x_reversed, hidden_bw) + result_bw = result_bw.flip(0) + + result = torch.cat([result_fw, result_bw], result_fw.dim() - 1) + h = torch.stack([hidden_fw[0], hidden_bw[0]], 0) # type: ignore + c = torch.stack([hidden_fw[1], hidden_bw[1]], 0) # type: ignore + else: + result = result_fw + h, c = hidden_fw # type: ignore + + if self.batch_first: + result.transpose_(0, 1) + + return result, (h, c) + + @classmethod + def from_float(cls, other, layer_idx=0, qconfig=None, **kwargs): + r""" + There is no FP equivalent of this class. This function is here just to + mimic the behavior of the `prepare` within the `torch.quantization` + flow. 
+ """ + assert hasattr(other, 'qconfig') or (qconfig is not None) + + input_size = kwargs.get('input_size', other.input_size) + hidden_size = kwargs.get('hidden_size', other.hidden_size) + bias = kwargs.get('bias', other.bias) + batch_first = kwargs.get('batch_first', other.batch_first) + bidirectional = kwargs.get('bidirectional', other.bidirectional) + + layer = cls(input_size, hidden_size, bias, batch_first, bidirectional) + layer.qconfig = getattr(other, 'qconfig', qconfig) + wi = getattr(other, f'weight_ih_l{layer_idx}') + wh = getattr(other, f'weight_hh_l{layer_idx}') + bi = getattr(other, f'bias_ih_l{layer_idx}', None) + bh = getattr(other, f'bias_hh_l{layer_idx}', None) + + layer.layer_fw = _LSTMSingleLayer.from_params(wi, wh, bi, bh) + + if other.bidirectional: + wi = getattr(other, f'weight_ih_l{layer_idx}_reverse') + wh = getattr(other, f'weight_hh_l{layer_idx}_reverse') + bi = getattr(other, f'bias_ih_l{layer_idx}_reverse', None) + bh = getattr(other, f'bias_hh_l{layer_idx}_reverse', None) + layer.layer_bw = _LSTMSingleLayer.from_params(wi, wh, bi, bh) + return layer + + # Getters for the weights and biases + # Note that jit currently doesn't support the `porperty`, so if you need to + # access the weights/biases you would need to navigate manually to the + # `layer_fw.cell.igates.*`: https://github.com/pytorch/pytorch/issues/37883 + @property + def weight_ih(self): + return self.layer_fw.cell.igates.weight + + @property + def weight_hh(self): + return self.layer_fw.cell.hgates.weight + + @property + def bias_ih(self): + return self.layer_fw.cell.igates.bias + + @property + def bias_hh(self): + return self.layer_fw.cell.hgates.bias + + @property + def weight_ih_reverse(self): + assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer' + return self.layer_bw.cell.igates.weight + + @property + def weight_hh_reverse(self): + assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer' + return self.layer_bw.cell.hgates.weight + + @property + def bias_ih_reverse(self): + assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer' + return self.layer_bw.cell.igates.bias + + @property + def bias_hh_reverse(self): + assert self.bidirectional, 'There is no reverse path in the non-bidirectional layer' + return self.layer_bw.cell.hgates.bias + + +class LSTM(torch.nn.Module): + r"""A quantizable long short-term memory (LSTM). + + For the description and the argument types, please, refer to :class:`~torch.nn.LSTM` + + Attributes: + layers : instances of the `_LSTMLayer` + + .. note:: + To access the weights and biases, you need to access them per layer. + See examples below. 
+ + Examples:: + + >>> import torch.nn.quantizable as nnqa + >>> rnn = nnqa.LSTM(10, 20, 2) + >>> input = torch.randn(5, 3, 10) + >>> h0 = torch.randn(2, 3, 20) + >>> c0 = torch.randn(2, 3, 20) + >>> output, (hn, cn) = rnn(input, (h0, c0)) + >>> # To get the weights: + >>> print(rnn.layers[0].weight_ih) + tensor([[...]]) + >>> print(rnn.layers[0].weight_hh) + AssertionError: There is no reverse path in the non-bidirectional layer + """ + _FLOAT_MODULE = torch.nn.LSTM + + def __init__(self, input_size: int, hidden_size: int, + num_layers: int = 1, bias: bool = True, + batch_first: bool = False, dropout: float = 0., + bidirectional: bool = False): + super().__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.bias = bias + self.batch_first = batch_first + self.dropout = float(dropout) + self.bidirectional = bidirectional + self.training = False # We don't want to train using this module + num_directions = 2 if bidirectional else 1 + + if not isinstance(dropout, numbers.Number) or not 0 <= dropout <= 1 or \ + isinstance(dropout, bool): + raise ValueError("dropout should be a number in range [0, 1] " + "representing the probability of an element being " + "zeroed") + if dropout > 0: + warnings.warn("dropout option for quantizable LSTM is ignored. " + "If you are training, please, use nn.LSTM version " + "followed by `prepare` step.") + if num_layers == 1: + warnings.warn("dropout option adds dropout after all but last " + "recurrent layer, so non-zero dropout expects " + "num_layers greater than 1, but got dropout={} " + "and num_layers={}".format(dropout, num_layers)) + + layers = [_LSTMLayer(self.input_size, self.hidden_size, + self.bias, batch_first=False, + bidirectional=self.bidirectional)] + for layer in range(1, num_layers): + layers.append(_LSTMLayer(self.hidden_size, self.hidden_size, + self.bias, batch_first=False, + bidirectional=self.bidirectional)) + self.layers = torch.nn.ModuleList(layers) + + def forward(self, x: Tensor, hidden: Optional[Tuple[Tensor, Tensor]] = None): + if self.batch_first: + x = x.transpose(0, 1) + + max_batch_size = x.size(1) + num_directions = 2 if self.bidirectional else 1 + if hidden is None: + zeros = torch.zeros(num_directions, max_batch_size, + self.hidden_size, dtype=torch.float, + device=x.device) + zeros.squeeze_(0) + if x.is_quantized: + zeros = torch.quantize_per_tensor(zeros, scale=1.0, + zero_point=0, dtype=x.dtype) + hxcx = [(zeros, zeros) for _ in range(self.num_layers)] + else: + hidden_non_opt = torch.jit._unwrap_optional(hidden) + if isinstance(hidden_non_opt[0], Tensor): + hx = hidden_non_opt[0].reshape(self.num_layers, num_directions, + max_batch_size, + self.hidden_size).unbind(0) + cx = hidden_non_opt[1].reshape(self.num_layers, num_directions, + max_batch_size, + self.hidden_size).unbind(0) + hxcx = [] + for idx in range(self.num_layers): + hxcx.append((hx[idx].squeeze_(0), cx[idx].squeeze_(0))) + else: + hxcx = hidden_non_opt + + for idx in range(self.num_layers): + x, hxcx[idx] = self.layers[idx](x, hxcx[idx]) + + hx_list = [] + cx_list = [] + for idx in range(self.num_layers): + hx_list.append(hxcx[idx][0]) + cx_list.append(hxcx[idx][1]) + hx_tensor = torch.stack(hx_list) + cx_tensor = torch.stack(cx_list) + + # We are creating another dimension for bidirectional case + # need to collapse it + hx_tensor = hx_tensor.reshape(-1, *hx_tensor.shape[-2:]) + cx_tensor = cx_tensor.reshape(-1, *cx_tensor.shape[-2:]) + + if self.batch_first: + x = x.transpose(0, 1) + + return x, 
(hx_tensor, cx_tensor) + + def _get_name(self): + return 'QuantizableLSTM' + + @classmethod + def from_float(cls, other, qconfig=None): + assert isinstance(other, cls._FLOAT_MODULE) + assert (hasattr(other, 'qconfig') or qconfig) + observed = cls(other.input_size, other.hidden_size, other.num_layers, + other.bias, other.batch_first, other.dropout, + other.bidirectional) + observed.qconfig = getattr(other, 'qconfig', qconfig) + for idx in range(other.num_layers): + observed.layers[idx] = _LSTMLayer.from_float(other, idx, qconfig, + batch_first=False) + observed.eval() + observed = torch.quantization.prepare(observed, inplace=True) + return observed + + def from_observed(self, other): + return torch.quantization.convert(self, inplace=False, + remove_qconfig=True) diff --git a/torch/nn/quantized/dynamic/modules/rnn.py b/torch/nn/quantized/dynamic/modules/rnn.py index df88169471ca..59c0195d7858 100644 --- a/torch/nn/quantized/dynamic/modules/rnn.py +++ b/torch/nn/quantized/dynamic/modules/rnn.py @@ -239,8 +239,6 @@ def from_float(cls, mod): _all_weight_values = [] for layer in range(qRNNBase.num_layers): for direction in range(num_directions): - layer_input_size = qRNNBase.input_size if layer == 0 else qRNNBase.hidden_size * num_directions - suffix = '_reverse' if direction == 1 else '' def retrieve_weight_bias(ihhh): diff --git a/torch/nn/quantized/modules/conv.py b/torch/nn/quantized/modules/conv.py index 8e5ae317f650..b3bc78ff6941 100644 --- a/torch/nn/quantized/modules/conv.py +++ b/torch/nn/quantized/modules/conv.py @@ -258,8 +258,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def _get_name(self): return 'QuantizedConv1d' - def set_weight_bias(self, w, b): - # type: (torch.Tensor, Optional[torch.Tensor]) -> None + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: self._packed_params = torch.ops.quantized.conv1d_prepack( w, b, self.stride, self.padding, self.dilation, self.groups) @@ -347,8 +346,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def _get_name(self): return 'QuantizedConv2d' - def set_weight_bias(self, w, b): - # type: (torch.Tensor, Optional[torch.Tensor]) -> None + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: self._packed_params = torch.ops.quantized.conv2d_prepack( w, b, self.stride, self.padding, self.dilation, self.groups) @@ -434,8 +432,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def _get_name(self): return 'QuantizedConv3d' - def set_weight_bias(self, w, b): - # type: (torch.Tensor, Optional[torch.Tensor]) -> None + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: self._packed_params = torch.ops.quantized.conv3d_prepack( w, b, self.stride, self.padding, self.dilation, self.groups) @@ -493,8 +490,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, transposed, output_padding, groups, bias, padding_mode) - def _input_padding(self, kernel_size, dilation, padding): - # type: (List[int], List[int], List[int]) -> List[int] + def _input_padding(self, kernel_size: List[int], dilation: List[int], padding: List[int]) -> List[int]: res = torch.jit.annotate(List[int], []) for kdx in range(len(kernel_size)): pad = (dilation[kdx] * (kernel_size[kdx] - 1) - padding[kdx]) @@ -590,8 +586,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def _get_name(self): return 'QuantizedConvTranpose1d' - def set_weight_bias(self, w, b): - # type: (torch.Tensor, 
Optional[torch.Tensor]) -> None + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: self._packed_params = torch.ops.quantized.conv_transpose1d_prepack( w, b, self.stride, self.padding, self.output_padding, self.dilation, self.groups) @@ -674,8 +669,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def _get_name(self): return 'QuantizedConvTranpose2d' - def set_weight_bias(self, w, b): - # type: (torch.Tensor, Optional[torch.Tensor]) -> None + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: self._packed_params = torch.ops.quantized.conv_transpose2d_prepack( w, b, self.stride, self.padding, self.output_padding, self.dilation, self.groups) @@ -759,8 +753,7 @@ def __init__(self, in_channels, out_channels, kernel_size, stride=1, def _get_name(self): return 'QuantizedConvTranpose3d' - def set_weight_bias(self, w, b): - # type: (torch.Tensor, Optional[torch.Tensor]) -> None + def set_weight_bias(self, w: torch.Tensor, b: Optional[torch.Tensor]) -> None: self._packed_params = torch.ops.quantized.conv_transpose3d_prepack( w, b, self.stride, self.padding, self.output_padding, self.dilation, self.groups) diff --git a/torch/nn/quantized/modules/embedding_ops.py b/torch/nn/quantized/modules/embedding_ops.py index d16748b3baf7..523994b364c8 100644 --- a/torch/nn/quantized/modules/embedding_ops.py +++ b/torch/nn/quantized/modules/embedding_ops.py @@ -22,8 +22,7 @@ def __init__(self, num_embeddings, embedding_dim, dtype=torch.quint8): raise NotImplementedError('Unsupported dtype on quantized embedding! Supports quint8 and quint4x2.') @torch.jit.export - def set_weight(self, weight): - # type: (torch.Tensor) -> None + def set_weight(self, weight: torch.Tensor) -> None: if self.dtype in [torch.quint8, torch.quint4x2]: self._packed_weight = torch.ops.quantized.embedding_bag_prepack(weight) else: @@ -52,7 +51,6 @@ def _save_to_state_dict(self, destination, prefix, keep_vars): def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs): - version = local_metadata.get('version', None) self.dtype = state_dict[prefix + 'dtype'] state_dict.pop(prefix + 'dtype') @@ -126,8 +124,7 @@ def extra_repr(self): return extra_repr_str - def set_weight(self, w): - # type: (torch.Tensor) -> None + def set_weight(self, w: torch.Tensor) -> None: self._packed_params.set_weight(w) def weight(self): diff --git a/torch/nn/quantized/modules/functional_modules.py b/torch/nn/quantized/modules/functional_modules.py index b9fab962d563..08b5447bb925 100644 --- a/torch/nn/quantized/modules/functional_modules.py +++ b/torch/nn/quantized/modules/functional_modules.py @@ -40,45 +40,39 @@ def forward(self, x): "'forward'. Please use the underlying operation") r"""Operation equivalent to ``torch.add(Tensor, Tensor)``""" - def add(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def add(self, x: Tensor, y: Tensor) -> Tensor: r = torch.add(x, y) r = self.activation_post_process(r) return r r"""Operation equivalent to ``torch.add(Tensor, float)``""" - def add_scalar(self, x, y): - # type: (Tensor, float) -> Tensor + def add_scalar(self, x: Tensor, y: float) -> Tensor: r = torch.add(x, y) # Note: this operation is not observed because the observation is not # needed for the quantized op. 
return r r"""Operation equivalent to ``torch.mul(Tensor, Tensor)``""" - def mul(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def mul(self, x: Tensor, y: Tensor) -> Tensor: r = torch.mul(x, y) r = self.activation_post_process(r) return r r"""Operation equivalent to ``torch.mul(Tensor, float)``""" - def mul_scalar(self, x, y): - # type: (Tensor, float) -> Tensor + def mul_scalar(self, x: Tensor, y: float) -> Tensor: r = torch.mul(x, y) # Note: this operation is not observed because the observation is not # needed for the quantized op. return r r"""Operation equivalent to ``torch.cat``""" - def cat(self, x, dim=0): - # type: (List[Tensor], int) -> Tensor + def cat(self, x: List[Tensor], dim: int = 0) -> Tensor: r = torch.cat(x, dim=dim) r = self.activation_post_process(r) return r r"""Operation equivalent to ``relu(torch.add(x,y))``""" - def add_relu(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def add_relu(self, x: Tensor, y: Tensor) -> Tensor: r = torch.add(x, y) r = torch.nn.functional.relu(r) r = self.activation_post_process(r) @@ -101,38 +95,32 @@ def forward(self, x): "'forward'. Please use the underlying operation") r"""Operation equivalent to ``torch.add(Tensor, Tensor)``""" - def add(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def add(self, x: Tensor, y: Tensor) -> Tensor: r = torch.add(x, y) return r r"""Operation equivalent to ``torch.add(Tensor, float)``""" - def add_scalar(self, x, y): - # type: (Tensor, float) -> Tensor + def add_scalar(self, x: Tensor, y: float) -> Tensor: r = torch.add(x, y) return r r"""Operation equivalent to ``torch.mul(Tensor, Tensor)``""" - def mul(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def mul(self, x: Tensor, y: Tensor) -> Tensor: r = torch.mul(x, y) return r r"""Operation equivalent to ``torch.mul(Tensor, float)``""" - def mul_scalar(self, x, y): - # type: (Tensor, float) -> Tensor + def mul_scalar(self, x: Tensor, y: float) -> Tensor: r = torch.mul(x, y) return r r"""Operation equivalent to ``torch.cat``""" - def cat(self, x, dim=0): - # type: (List[Tensor], int) -> Tensor + def cat(self, x: List[Tensor], dim: int = 0) -> Tensor: r = torch.cat(x, dim=dim) return r r"""Operation equivalent to ``relu(torch.add(x,y))``""" - def add_relu(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def add_relu(self, x: Tensor, y: Tensor) -> Tensor: r = torch.add(x, y) r = torch.nn.functional.relu(r) return r @@ -195,45 +183,39 @@ def forward(self, x): "'forward'. Please use the underlying operation") r"""Operation equivalent to ``torch.ops.quantized.add``""" - def add(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def add(self, x: Tensor, y: Tensor) -> Tensor: r = ops.quantized.add(x, y, scale=self.scale, zero_point=self.zero_point) r = self.activation_post_process(r) return r r"""Operation equivalent to ``torch.ops.quantized.add(Tensor, float)``""" - def add_scalar(self, x, y): - # type: (Tensor, float) -> Tensor + def add_scalar(self, x: Tensor, y: float) -> Tensor: r = ops.quantized.add_scalar(x, y) # Note: this operation is not observed because the observation is not # needed for the quantized op. 
return r r"""Operation equivalent to ``torch.ops.quantized.mul(Tensor, Tensor)``""" - def mul(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def mul(self, x: Tensor, y: Tensor) -> Tensor: r = ops.quantized.mul(x, y, scale=self.scale, zero_point=self.zero_point) r = self.activation_post_process(r) return r r"""Operation equivalent to ``torch.ops.quantized.mul(Tensor, float)``""" - def mul_scalar(self, x, y): - # type: (Tensor, float) -> Tensor + def mul_scalar(self, x: Tensor, y: float) -> Tensor: r = ops.quantized.mul_scalar(x, y) # Note: this operation is not observed because the observation is not # needed for the quantized op. return r r"""Operation equivalent to ``torch.ops.quantized.cat``""" - def cat(self, x, dim=0): - # type: (List[Tensor], int) -> Tensor + def cat(self, x: List[Tensor], dim: int = 0) -> Tensor: r = ops.quantized.cat(x, scale=self.scale, zero_point=self.zero_point, dim=dim) r = self.activation_post_process(r) return r r"""Operation equivalent to ``torch.ops.quantized.add_relu``""" - def add_relu(self, x, y): - # type: (Tensor, Tensor) -> Tensor + def add_relu(self, x: Tensor, y: Tensor) -> Tensor: r = ops.quantized.add_relu(x, y, scale=self.scale, zero_point=self.zero_point) r = self.activation_post_process(r) return r diff --git a/torch/nn/quantized/modules/normalization.py b/torch/nn/quantized/modules/normalization.py index 4664120ec8b5..c12f74374863 100644 --- a/torch/nn/quantized/modules/normalization.py +++ b/torch/nn/quantized/modules/normalization.py @@ -29,7 +29,6 @@ def _get_name(self): @classmethod def from_float(cls, mod): - activation_post_process = mod.activation_post_process scale, zero_point = mod.activation_post_process.calculate_qparams() new_mod = cls( mod.normalized_shape, mod.weight, mod.bias, float(scale), @@ -63,7 +62,6 @@ def _get_name(self): @classmethod def from_float(cls, mod): - activation_post_process = mod.activation_post_process scale, zero_point = mod.activation_post_process.calculate_qparams() new_mod = cls( mod.num_groups, mod.num_channels, mod.weight, mod.bias, float(scale), int(zero_point), @@ -98,7 +96,6 @@ def _get_name(self): @classmethod def from_float(cls, mod): - activation_post_process = mod.activation_post_process scale, zero_point = mod.activation_post_process.calculate_qparams() new_mod = cls( mod.num_features, mod.weight, mod.bias, float(scale), int(zero_point), @@ -133,7 +130,6 @@ def _get_name(self): @classmethod def from_float(cls, mod): - activation_post_process = mod.activation_post_process scale, zero_point = mod.activation_post_process.calculate_qparams() new_mod = cls( mod.num_features, mod.weight, mod.bias, float(scale), int(zero_point), @@ -168,7 +164,6 @@ def _get_name(self): @classmethod def from_float(cls, mod): - activation_post_process = mod.activation_post_process scale, zero_point = mod.activation_post_process.calculate_qparams() new_mod = cls( mod.num_features, mod.weight, mod.bias, float(scale), int(zero_point), diff --git a/torch/nn/utils/clip_grad.py b/torch/nn/utils/clip_grad.py index d7caf8311095..8c7da1034669 100644 --- a/torch/nn/utils/clip_grad.py +++ b/torch/nn/utils/clip_grad.py @@ -12,7 +12,7 @@ def clip_grad_norm_(parameters: _tensor_or_tensors, max_norm: float, norm_type: The norm is computed over all gradients together, as if they were concatenated into a single vector. Gradients are modified in-place. 
- Arguments: + Args: parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a single Tensor that will have gradients normalized max_norm (float or int): max norm of the gradients @@ -58,7 +58,7 @@ def clip_grad_value_(parameters: _tensor_or_tensors, clip_value: float) -> None: Gradients are modified in-place. - Arguments: + Args: parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a single Tensor that will have gradients normalized clip_value (float or int): maximum allowed value of the gradients. diff --git a/torch/nn/utils/convert_parameters.py b/torch/nn/utils/convert_parameters.py index c51893de400d..f5b286263387 100644 --- a/torch/nn/utils/convert_parameters.py +++ b/torch/nn/utils/convert_parameters.py @@ -5,7 +5,7 @@ def parameters_to_vector(parameters: Iterable[torch.Tensor]) -> torch.Tensor: r"""Convert parameters to one vector - Arguments: + Args: parameters (Iterable[Tensor]): an iterator of Tensors that are the parameters of a model. @@ -27,7 +27,7 @@ def parameters_to_vector(parameters: Iterable[torch.Tensor]) -> torch.Tensor: def vector_to_parameters(vec: torch.Tensor, parameters: Iterable[torch.Tensor]) -> None: r"""Convert one vector to the parameters - Arguments: + Args: vec (Tensor): a single vector represents the parameters of a model. parameters (Iterable[Tensor]): an iterator of Tensors that are the parameters of a model. @@ -60,7 +60,7 @@ def _check_param_device(param: torch.Tensor, old_param_device: Optional[int]) -> and single vector form is not supported for multiple allocations, e.g. parameters in different GPUs, or mixture of CPU/GPU. - Arguments: + Args: param ([Tensor]): a Tensor of a parameter of a model old_param_device (int): the device where the first parameter of a model is allocated. diff --git a/torch/nn/utils/prune.py b/torch/nn/utils/prune.py index 84fa30021ed1..851a551da0d8 100644 --- a/torch/nn/utils/prune.py +++ b/torch/nn/utils/prune.py @@ -587,7 +587,6 @@ def compute_mask(self, t, default_mask): # Compute number of units to prune: amount if int, # else amount * tensor_size nparams_toprune = _compute_nparams_toprune(self.amount, tensor_size) - nparams_tokeep = tensor_size - nparams_toprune # This should raise an error if the number of units to prune is larger # than the number of units in the tensor _validate_pruning_amount(nparams_toprune, tensor_size) diff --git a/torch/nn/utils/rnn.py b/torch/nn/utils/rnn.py index 33b0b6f23985..eab89d8189ca 100644 --- a/torch/nn/utils/rnn.py +++ b/torch/nn/utils/rnn.py @@ -213,7 +213,7 @@ def pack_padded_sequence(input, lengths, batch_first=False, enforce_sorted=True) them to compute the loss directly. A Tensor can be retrieved from a :class:`PackedSequence` object by accessing its ``.data`` attribute. - Arguments: + Args: input (Tensor): padded batch of variable length sequences. lengths (Tensor or list(int)): list of sequence lengths of each batch element (must be on the CPU if provided as a tensor). @@ -279,7 +279,7 @@ def pad_packed_sequence(sequence, batch_first=False, padding_value=0.0, total_le See :ref:`this FAQ section ` for details. - Arguments: + Args: sequence (PackedSequence): batch to pad batch_first (bool, optional): if ``True``, the output will be in ``B x T x *`` format. @@ -343,7 +343,7 @@ def pad_sequence(sequences, batch_first=False, padding_value=0.0): where `T` is the length of the longest sequence. This function assumes trailing dimensions and type of all the Tensors in sequences are same. 
- Arguments: + Args: sequences (list[Tensor]): list of variable length sequences. batch_first (bool, optional): output will be in ``B x T x *`` if True, or in ``T x B x *`` otherwise @@ -398,7 +398,7 @@ def pack_sequence(sequences, enforce_sorted=True): PackedSequence(data=tensor([ 1, 4, 6, 2, 5, 3]), batch_sizes=tensor([ 3, 2, 1])) - Arguments: + Args: sequences (list[Tensor]): A list of sequences of decreasing length. enforce_sorted (bool, optional): if ``True``, checks that the input contains sequences sorted by length in a decreasing order. If diff --git a/torch/onnx/__init__.py b/torch/onnx/__init__.py index d2d7e5591fb7..e1ee00dbed12 100644 --- a/torch/onnx/__init__.py +++ b/torch/onnx/__init__.py @@ -40,7 +40,7 @@ def export(model, args, f, export_params=True, verbose=False, training=TrainingM once in order to get a trace of its execution to be exported; at the moment, it supports a limited set of dynamic models (e.g., RNNs.) - Arguments: + Args: model (torch.nn.Module): the model to be exported. args (tuple of arguments or torch.Tensor, a dictionary consisting of named arguments (optional)): a dictionary to specify the input to the corresponding named parameter: diff --git a/torch/onnx/symbolic_opset11.py b/torch/onnx/symbolic_opset11.py index 6e9fe3f27060..8a729d2b7568 100644 --- a/torch/onnx/symbolic_opset11.py +++ b/torch/onnx/symbolic_opset11.py @@ -518,7 +518,7 @@ def unbind(g, self, dim=0, _outputs=None): # Generate paddings in ONNX order based on pad in pytorch. -# Arguments: +# Args: # dim: the dimension of the tensor. # pad: the paddings in pytorch. # The order is dim_n_begin, dim_n_end, dim_n-1_begin, dim_n-1_end, ..., dim_m_begin, dim_m_end, diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index a65cea494529..a9edb9ca32ed 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -932,7 +932,7 @@ def symbolic_fn(g, input, output_size): # Generate paddings in ONNX order based on pad in pytorch. -# Arguments: +# Args: # dim: the dimension of the tensor. # pad: the paddings in pytorch. # The order is dim_n_begin, dim_n_end, dim_n-1_begin, dim_n-1_end, ... @@ -2315,6 +2315,9 @@ def log2(g, self): def prim_shape(g, self): return g.op('Shape', self) +def prim_max(g, self, other): + return g.op('Max', self, other) + def prim_data(g, self): return self @@ -2365,14 +2368,16 @@ def gather(g, self, dim, index, sparse_grad=False): def _var_mean(g, input, dim, unbiased, keepdim): if dim is None: mean = g.op("ReduceMean", input, keepdims_i=0) + t_mean = mean num_elements = numel(g, input) else: mean = g.op("ReduceMean", input, axes_i=dim, keepdims_i=keepdim) + t_mean = g.op("ReduceMean", input, axes_i=dim, keepdims_i=1) redudced_dims = g.op("Shape", input) # dim could contain one or multiple dimensions redudced_dims = g.op("Gather", redudced_dims, g.op("Constant", value_t=torch.tensor(dim)), axis_i=0) num_elements = g.op("ReduceProd", redudced_dims, keepdims_i=0) - sub_v = g.op("Sub", input, mean) + sub_v = g.op("Sub", input, t_mean) sqr_sub = g.op("Mul", sub_v, sub_v) keepdim_mean = 0 if dim is None else keepdim var = g.op("ReduceMean", sqr_sub, axes_i=dim, keepdims_i=keepdim_mean) diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index 479f874819f2..7872e7800b44 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -831,7 +831,7 @@ def _graph_op(g, opname, *raw_args, **kwargs): This function is monkey-patched onto Graph. - Arguments: + Args: opname (string): The ONNX operator name, e.g., `Abs` or `Add`. 
args (Node...): The inputs to the operator; usually provided as arguments to the `symbolic` definition. diff --git a/torch/optim/_multi_tensor/adadelta.py b/torch/optim/_multi_tensor/adadelta.py index c1e58232641c..7c600fafd45e 100644 --- a/torch/optim/_multi_tensor/adadelta.py +++ b/torch/optim/_multi_tensor/adadelta.py @@ -7,7 +7,7 @@ class Adadelta(Optimizer): It has been proposed in `ADADELTA: An Adaptive Learning Rate Method`__. - Arguments: + Args: params (iterable): iterable of parameters to optimize or dicts defining parameter groups rho (float, optional): coefficient used for computing a running average @@ -38,7 +38,7 @@ def __init__(self, params, lr=1.0, rho=0.9, eps=1e-6, weight_decay=0): def step(self, closure=None): """Performs a single optimization step. - Arguments: + Args: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ diff --git a/torch/optim/_multi_tensor/adam.py b/torch/optim/_multi_tensor/adam.py index eed6635a074e..d539e865d478 100644 --- a/torch/optim/_multi_tensor/adam.py +++ b/torch/optim/_multi_tensor/adam.py @@ -10,7 +10,7 @@ class Adam(Optimizer): The implementation of the L2 penalty follows changes proposed in `Decoupled Weight Decay Regularization`_. - Arguments: + Args: params (iterable): iterable of parameters to optimize or dicts defining parameter groups lr (float, optional): learning rate (default: 1e-3) @@ -56,7 +56,7 @@ def __setstate__(self, state): def step(self, closure=None): """Performs a single optimization step. - Arguments: + Args: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ diff --git a/torch/optim/_multi_tensor/adamax.py b/torch/optim/_multi_tensor/adamax.py index 1d7f1a2edbc4..a866a3e02f38 100644 --- a/torch/optim/_multi_tensor/adamax.py +++ b/torch/optim/_multi_tensor/adamax.py @@ -7,7 +7,7 @@ class Adamax(Optimizer): It has been proposed in `Adam: A Method for Stochastic Optimization`__. - Arguments: + Args: params (iterable): iterable of parameters to optimize or dicts defining parameter groups lr (float, optional): learning rate (default: 2e-3) @@ -40,7 +40,7 @@ def __init__(self, params, lr=2e-3, betas=(0.9, 0.999), eps=1e-8, def step(self, closure=None): """Performs a single optimization step. - Arguments: + Args: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ diff --git a/torch/optim/_multi_tensor/adamw.py b/torch/optim/_multi_tensor/adamw.py index cf63e336e8ce..3670c786b68d 100644 --- a/torch/optim/_multi_tensor/adamw.py +++ b/torch/optim/_multi_tensor/adamw.py @@ -9,7 +9,7 @@ class AdamW(Optimizer): The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_. The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_. - Arguments: + Args: params (iterable): iterable of parameters to optimize or dicts defining parameter groups lr (float, optional): learning rate (default: 1e-3) @@ -55,7 +55,7 @@ def __setstate__(self, state): def step(self, closure=None): """Performs a single optimization step. - Arguments: + Args: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ diff --git a/torch/optim/_multi_tensor/asgd.py b/torch/optim/_multi_tensor/asgd.py index 5d31a72e1f7e..351e4324908f 100644 --- a/torch/optim/_multi_tensor/asgd.py +++ b/torch/optim/_multi_tensor/asgd.py @@ -9,7 +9,7 @@ class ASGD(Optimizer): It has been proposed in `Acceleration of stochastic approximation by averaging`_. 
- Arguments: + Args: params (iterable): iterable of parameters to optimize or dicts defining parameter groups lr (float, optional): learning rate (default: 1e-2) @@ -36,7 +36,7 @@ def __init__(self, params, lr=1e-2, lambd=1e-4, alpha=0.75, t0=1e6, weight_decay def step(self, closure=None): """Performs a single optimization step. - Arguments: + Args: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ diff --git a/torch/optim/_multi_tensor/rmsprop.py b/torch/optim/_multi_tensor/rmsprop.py index ef020970055d..ac918307e7c0 100644 --- a/torch/optim/_multi_tensor/rmsprop.py +++ b/torch/optim/_multi_tensor/rmsprop.py @@ -17,7 +17,7 @@ class RMSprop(Optimizer): is the scheduled learning rate and :math:`v` is the weighted moving average of the squared gradient. - Arguments: + Args: params (iterable): iterable of parameters to optimize or dicts defining parameter groups lr (float, optional): learning rate (default: 1e-2) @@ -56,7 +56,7 @@ def __setstate__(self, state): def step(self, closure=None): """Performs a single optimization step. - Arguments: + Args: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ diff --git a/torch/optim/_multi_tensor/rprop.py b/torch/optim/_multi_tensor/rprop.py index af11157b848e..d2a3eca755db 100644 --- a/torch/optim/_multi_tensor/rprop.py +++ b/torch/optim/_multi_tensor/rprop.py @@ -5,7 +5,7 @@ class Rprop(Optimizer): """Implements the resilient backpropagation algorithm. - Arguments: + Args: params (iterable): iterable of parameters to optimize or dicts defining parameter groups lr (float, optional): learning rate (default: 1e-2) @@ -29,7 +29,7 @@ def __init__(self, params, lr=1e-2, etas=(0.5, 1.2), step_sizes=(1e-6, 50)): def step(self, closure=None): """Performs a single optimization step. - Arguments: + Args: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ diff --git a/torch/optim/_multi_tensor/sgd.py b/torch/optim/_multi_tensor/sgd.py index 73df0033b10a..a1f5772871f5 100644 --- a/torch/optim/_multi_tensor/sgd.py +++ b/torch/optim/_multi_tensor/sgd.py @@ -76,7 +76,7 @@ def __setstate__(self, state): def step(self, closure=None): """Performs a single optimization step. - Arguments: + Args: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ diff --git a/torch/optim/adadelta.py b/torch/optim/adadelta.py index 11cdf5e28b65..ae9286a5cb2f 100644 --- a/torch/optim/adadelta.py +++ b/torch/optim/adadelta.py @@ -8,7 +8,7 @@ class Adadelta(Optimizer): It has been proposed in `ADADELTA: An Adaptive Learning Rate Method`__. - Arguments: + Args: params (iterable): iterable of parameters to optimize or dicts defining parameter groups rho (float, optional): coefficient used for computing a running average @@ -39,7 +39,7 @@ def __init__(self, params, lr=1.0, rho=0.9, eps=1e-6, weight_decay=0): def step(self, closure=None): """Performs a single optimization step. - Arguments: + Args: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ diff --git a/torch/optim/adagrad.py b/torch/optim/adagrad.py index a4bffc0efbc6..6299924c5aa3 100644 --- a/torch/optim/adagrad.py +++ b/torch/optim/adagrad.py @@ -9,7 +9,7 @@ class Adagrad(Optimizer): It has been proposed in `Adaptive Subgradient Methods for Online Learning and Stochastic Optimization`_. 
- Arguments: + Args: params (iterable): iterable of parameters to optimize or dicts defining parameter groups lr (float, optional): learning rate (default: 1e-2) @@ -54,7 +54,7 @@ def share_memory(self): def step(self, closure=None): """Performs a single optimization step. - Arguments: + Args: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ diff --git a/torch/optim/adam.py b/torch/optim/adam.py index c5ae8849aba7..2e0611dae6dc 100644 --- a/torch/optim/adam.py +++ b/torch/optim/adam.py @@ -10,7 +10,7 @@ class Adam(Optimizer): The implementation of the L2 penalty follows changes proposed in `Decoupled Weight Decay Regularization`_. - Arguments: + Args: params (iterable): iterable of parameters to optimize or dicts defining parameter groups lr (float, optional): learning rate (default: 1e-3) @@ -56,7 +56,7 @@ def __setstate__(self, state): def step(self, closure=None): """Performs a single optimization step. - Arguments: + Args: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ diff --git a/torch/optim/adamax.py b/torch/optim/adamax.py index 1bb842376349..fa028f469b43 100644 --- a/torch/optim/adamax.py +++ b/torch/optim/adamax.py @@ -7,7 +7,7 @@ class Adamax(Optimizer): It has been proposed in `Adam: A Method for Stochastic Optimization`__. - Arguments: + Args: params (iterable): iterable of parameters to optimize or dicts defining parameter groups lr (float, optional): learning rate (default: 2e-3) @@ -40,7 +40,7 @@ def __init__(self, params, lr=2e-3, betas=(0.9, 0.999), eps=1e-8, def step(self, closure=None): """Performs a single optimization step. - Arguments: + Args: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ diff --git a/torch/optim/adamw.py b/torch/optim/adamw.py index 07d58313cfb8..e350566d7f5b 100644 --- a/torch/optim/adamw.py +++ b/torch/optim/adamw.py @@ -9,7 +9,7 @@ class AdamW(Optimizer): The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_. The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_. - Arguments: + Args: params (iterable): iterable of parameters to optimize or dicts defining parameter groups lr (float, optional): learning rate (default: 1e-3) @@ -55,7 +55,7 @@ def __setstate__(self, state): def step(self, closure=None): """Performs a single optimization step. - Arguments: + Args: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ diff --git a/torch/optim/asgd.py b/torch/optim/asgd.py index e9f7ca2b2255..887bf71e0029 100644 --- a/torch/optim/asgd.py +++ b/torch/optim/asgd.py @@ -9,7 +9,7 @@ class ASGD(Optimizer): It has been proposed in `Acceleration of stochastic approximation by averaging`_. - Arguments: + Args: params (iterable): iterable of parameters to optimize or dicts defining parameter groups lr (float, optional): learning rate (default: 1e-2) @@ -36,7 +36,7 @@ def __init__(self, params, lr=1e-2, lambd=1e-4, alpha=0.75, t0=1e6, weight_decay def step(self, closure=None): """Performs a single optimization step. - Arguments: + Args: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ diff --git a/torch/optim/lbfgs.py b/torch/optim/lbfgs.py index 3571975b1388..4329b6fd8bae 100644 --- a/torch/optim/lbfgs.py +++ b/torch/optim/lbfgs.py @@ -197,7 +197,7 @@ class LBFGS(Optimizer): ``param_bytes * (history_size + 1)`` bytes). 
If it doesn't fit in memory try reducing the history size, or use a different algorithm. - Arguments: + Args: lr (float): learning rate (default: 1) max_iter (int): maximal number of iterations per optimization step (default: 20) @@ -283,7 +283,7 @@ def _directional_evaluate(self, closure, x, t, d): def step(self, closure): """Performs a single optimization step. - Arguments: + Args: closure (callable): A closure that reevaluates the model and returns the loss. """ diff --git a/torch/optim/lr_scheduler.py b/torch/optim/lr_scheduler.py index 706d12683c27..043a8213b4c2 100644 --- a/torch/optim/lr_scheduler.py +++ b/torch/optim/lr_scheduler.py @@ -87,7 +87,7 @@ def state_dict(self): def load_state_dict(self, state_dict): """Loads the schedulers state. - Arguments: + Args: state_dict (dict): scheduler state. Should be an object returned from a call to :meth:`state_dict`. """ @@ -227,7 +227,7 @@ def load_state_dict(self, state_dict): When saving or loading the scheduler, please make sure to also save or load the state of the optimizer. - Arguments: + Args: state_dict (dict): scheduler state. Should be an object returned from a call to :meth:`state_dict`. """ @@ -305,7 +305,7 @@ def state_dict(self): def load_state_dict(self, state_dict): """Loads the schedulers state. - Arguments: + Args: state_dict (dict): scheduler state. Should be an object returned from a call to :meth:`state_dict`. """ diff --git a/torch/optim/optimizer.py b/torch/optim/optimizer.py index 0a302008cd22..b3e38c613fe0 100644 --- a/torch/optim/optimizer.py +++ b/torch/optim/optimizer.py @@ -24,7 +24,7 @@ class Optimizer(object): ordering that is consistent between runs. Examples of objects that don't satisfy those properties are sets and iterators over values of dictionaries. - Arguments: + Args: params (iterable): an iterable of :class:`torch.Tensor` s or :class:`dict` s. Specifies what Tensors should be optimized. defaults: (dict): a dict containing default values of optimization @@ -127,7 +127,7 @@ def pack_group(group): def load_state_dict(self, state_dict): r"""Loads the optimizer state. - Arguments: + Args: state_dict (dict): optimizer state. Should be an object returned from a call to :meth:`state_dict`. """ @@ -189,7 +189,7 @@ def update_group(group, new_group): def zero_grad(self, set_to_none: bool = False): r"""Sets the gradients of all optimized :class:`torch.Tensor` s to zero. - Arguments: + Args: set_to_none (bool): instead of setting to zero, set the grads to None. This is will in general have lower memory footprint, and can modestly improve performance. However, it changes certain behaviors. For example: @@ -219,7 +219,7 @@ def zero_grad(self, set_to_none: bool = False): def step(self, closure): r"""Performs a single optimization step (parameter update). - Arguments: + Args: closure (callable): A closure that reevaluates the model and returns the loss. Optional for most optimizers. @@ -235,7 +235,7 @@ def add_param_group(self, param_group): This can be useful when fine tuning a pre-trained network as frozen layers can be made trainable and added to the :class:`Optimizer` as training progresses. - Arguments: + Args: param_group (dict): Specifies what Tensors should be optimized along with group specific optimization options. 
""" diff --git a/torch/optim/rmsprop.py b/torch/optim/rmsprop.py index ea02096b5460..d19f87fda0b6 100644 --- a/torch/optim/rmsprop.py +++ b/torch/optim/rmsprop.py @@ -17,7 +17,7 @@ class RMSprop(Optimizer): is the scheduled learning rate and :math:`v` is the weighted moving average of the squared gradient. - Arguments: + Args: params (iterable): iterable of parameters to optimize or dicts defining parameter groups lr (float, optional): learning rate (default: 1e-2) @@ -56,7 +56,7 @@ def __setstate__(self, state): def step(self, closure=None): """Performs a single optimization step. - Arguments: + Args: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ diff --git a/torch/optim/rprop.py b/torch/optim/rprop.py index 071f104b958f..ec2a5f1f222a 100644 --- a/torch/optim/rprop.py +++ b/torch/optim/rprop.py @@ -5,7 +5,7 @@ class Rprop(Optimizer): """Implements the resilient backpropagation algorithm. - Arguments: + Args: params (iterable): iterable of parameters to optimize or dicts defining parameter groups lr (float, optional): learning rate (default: 1e-2) @@ -29,7 +29,7 @@ def __init__(self, params, lr=1e-2, etas=(0.5, 1.2), step_sizes=(1e-6, 50)): def step(self, closure=None): """Performs a single optimization step. - Arguments: + Args: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ diff --git a/torch/optim/sgd.py b/torch/optim/sgd.py index 9c6af5c3aa8b..5b071f820ad6 100644 --- a/torch/optim/sgd.py +++ b/torch/optim/sgd.py @@ -76,7 +76,7 @@ def __setstate__(self, state): def step(self, closure=None): """Performs a single optimization step. - Arguments: + Args: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ diff --git a/torch/optim/sparse_adam.py b/torch/optim/sparse_adam.py index 909aa0c6cc62..05a576d6544c 100644 --- a/torch/optim/sparse_adam.py +++ b/torch/optim/sparse_adam.py @@ -9,7 +9,7 @@ class SparseAdam(Optimizer): In this variant, only moments that show up in the gradient get updated, and only those portions of the gradient get applied to the parameters. - Arguments: + Args: params (iterable): iterable of parameters to optimize or dicts defining parameter groups lr (float, optional): learning rate (default: 1e-3) @@ -54,7 +54,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8): def step(self, closure=None): """Performs a single optimization step. - Arguments: + Args: closure (callable, optional): A closure that reevaluates the model and returns the loss. """ diff --git a/torch/optim/swa_utils.py b/torch/optim/swa_utils.py index d27597406c50..65b694a27cf2 100644 --- a/torch/optim/swa_utils.py +++ b/torch/optim/swa_utils.py @@ -17,7 +17,7 @@ class AveragedModel(Module): on the device :attr:`device` and allows to compute running averages of the parameters of the :attr:`model`. - Arguments: + Args: model (torch.nn.Module): model to use with SWA device (torch.device, optional): if provided, the averaged model will be stored on the :attr:`device` @@ -117,7 +117,7 @@ def update_bn(loader, model, device=None): It performs one pass over data in `loader` to estimate the activation statistics for BatchNorm layers in the model. - Arguments: + Args: loader (torch.utils.data.DataLoader): dataset loader to compute the activation statistics on. 
Each data batch should be either a tensor, or a list/tuple whose first element is a tensor @@ -172,7 +172,7 @@ class SWALR(_LRScheduler): This learning rate scheduler is meant to be used with Stochastic Weight Averaging (SWA) method (see `torch.optim.swa_utils.AveragedModel`). - Arguments: + Args: optimizer (torch.optim.Optimizer): wrapped optimizer swa_lrs (float or list): the learning rate value for all param groups together or separately for each group. diff --git a/torch/overrides.py b/torch/overrides.py index 6c193b273344..f8d9f2e152f6 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -720,6 +720,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.q_scale: lambda input: -1, torch.q_zero_point: lambda input: -1, torch.qr: lambda input, some=True, out=None: -1, + torch.linalg.qr: lambda input, mode='reduced', out=None: -1, torch.quantile: lambda input, q, dim=None, keepdim=False, out=None: -1, torch.nanquantile: lambda input, q, dim=None, keepdim=False, out=None: -1, torch.quantize_per_channel: lambda input, scales, zero_points, axis, dtype: -1, diff --git a/torch/profiler/profiler.py b/torch/profiler/profiler.py index 1d407fcd8bbb..25bee1c2019f 100644 --- a/torch/profiler/profiler.py +++ b/torch/profiler/profiler.py @@ -50,7 +50,7 @@ class profile(object): """ Profiler context manager. - Arguments: + Args: - ``activities`` - list of activity groups (CPU, CUDA) to use in profiling; - ``schedule`` - callable that takes step (int) as a single parameter and returns @@ -249,7 +249,7 @@ def export_stacks(self, path: str, metric: str = "self_cpu_time_total"): """ Save stack traces in a file in a format suitable for visualization. - Arguments: + Args: - ``path`` - save stacks file to this location; - ``metric`` - metric to use: "self_cpu_time_total" or "self_cuda_time_total" diff --git a/torch/quantization/_numeric_suite_fx.py b/torch/quantization/_numeric_suite_fx.py index eb1596832c4d..aeba95bb4e8f 100644 --- a/torch/quantization/_numeric_suite_fx.py +++ b/torch/quantization/_numeric_suite_fx.py @@ -21,7 +21,7 @@ def remove_qconfig_observer_fx(model): # remove activation post process act_post_process_removed_graph = Graph() - env = {} # type: Dict[str, Any] + env: Dict[str, Any] = {} modules = dict(model.named_modules()) diff --git a/torch/quantization/fake_quantize.py b/torch/quantization/fake_quantize.py index f0ee8453557d..460b1c277a93 100644 --- a/torch/quantization/fake_quantize.py +++ b/torch/quantization/fake_quantize.py @@ -41,8 +41,7 @@ def calculate_qparams(self, **kwargs): pass @torch.jit.export - def enable_fake_quant(self, enabled=True): - # type: (bool) -> None + def enable_fake_quant(self, enabled: bool = True) -> None: self.fake_quant_enabled[0] = 1 if enabled else 0 @torch.jit.export @@ -50,8 +49,7 @@ def disable_fake_quant(self): self.enable_fake_quant(False) @torch.jit.export - def enable_observer(self, enabled=True): - # type: (bool) -> None + def enable_observer(self, enabled: bool = True) -> None: self.observer_enabled[0] = 1 if enabled else 0 @torch.jit.export diff --git a/torch/quantization/fuse_modules.py b/torch/quantization/fuse_modules.py index d3c767819f5f..5ccebd973fd3 100644 --- a/torch/quantization/fuse_modules.py +++ b/torch/quantization/fuse_modules.py @@ -93,7 +93,7 @@ def fuse_modules(model, modules_to_fuse, inplace=False, fuser_func=fuse_known_mo with the fused module, replacing the rest of the modules with identity. 
- Arguments: + Args: model: Model containing the modules to be fused modules_to_fuse: list of list of module names to fuse. Can also be a list of strings if there is only a single list of modules to fuse. diff --git a/torch/quantization/fx/fuse.py b/torch/quantization/fx/fuse.py index 5aabbd66c4b1..59e3851dcd57 100644 --- a/torch/quantization/fx/fuse.py +++ b/torch/quantization/fx/fuse.py @@ -21,7 +21,7 @@ from .quantization_types import Pattern -from typing import Callable, Tuple, Optional +from typing import Callable, Tuple class Fuser: @@ -59,11 +59,12 @@ def load_arg(a): model = GraphModule(input_root, self.fused_graph) return model - def _find_matches(self, root: GraphModule, graph: Graph, - patterns: Dict[Pattern, Callable] - ) -> Dict[str, Tuple[Node, Optional[Any]]]: + def _find_matches( + self, root: GraphModule, graph: Graph, + patterns: Dict[Pattern, Callable] + ) -> Dict[str, Tuple[Node, FuseHandler]]: modules = dict(root.named_modules()) - match_map = {} # node name -> (root_node, match_value?) + match_map : Dict[str, Tuple[Node, FuseHandler]] = {} # node name -> (root_node, match_value) def apply_match(pattern, node, match): if isinstance(pattern, tuple): diff --git a/torch/quantization/fx/fusion_patterns.py b/torch/quantization/fx/fusion_patterns.py index b7af6008b3f3..1749484fccec 100644 --- a/torch/quantization/fx/fusion_patterns.py +++ b/torch/quantization/fx/fusion_patterns.py @@ -6,12 +6,25 @@ from .utils import _parent_name from .quantization_types import QuantizerCls from ..fuser_method_mappings import get_fuser_method +from abc import ABC, abstractmethod from typing import Any, Callable, Dict # --------------------- -# Fusion Patterns +# Fusion Pattern Registrations # --------------------- +# Base Pattern Handler +class FuseHandler(ABC): + """ Base handler class for the fusion patterns + """ + def __init__(self, quantizer: QuantizerCls, node: Node): + pass + + @abstractmethod + def fuse(self, quantizer: QuantizerCls, load_arg: Callable, + fuse_custom_config_dict: Dict[str, Any] = None) -> Node: + pass + @register_fusion_pattern((torch.nn.ReLU, torch.nn.Conv1d)) @register_fusion_pattern((torch.nn.ReLU, torch.nn.Conv2d)) @register_fusion_pattern((torch.nn.ReLU, torch.nn.Conv3d)) @@ -27,9 +40,9 @@ @register_fusion_pattern((torch.nn.functional.relu, (torch.nn.BatchNorm1d, torch.nn.Conv1d))) @register_fusion_pattern((torch.nn.functional.relu, (torch.nn.BatchNorm2d, torch.nn.Conv2d))) @register_fusion_pattern((torch.nn.functional.relu, (torch.nn.BatchNorm3d, torch.nn.Conv3d))) -class ConvBNReLUFusion(): +class ConvBNReLUFusion(FuseHandler): def __init__(self, quantizer: QuantizerCls, node: Node): - super().__init__() + super().__init__(quantizer, node) self.relu_node = None self.bn_node = None if (node.op == 'call_function' and node.target is torch.nn.functional.relu) or \ @@ -94,9 +107,9 @@ def fuse(self, quantizer: QuantizerCls, load_arg: Callable, @register_fusion_pattern((torch.nn.ReLU, torch.nn.BatchNorm2d)) @register_fusion_pattern((torch.nn.functional.relu, torch.nn.BatchNorm3d)) @register_fusion_pattern((torch.nn.ReLU, torch.nn.BatchNorm3d)) -class ModuleReLUFusion(): +class ModuleReLUFusion(FuseHandler): def __init__(self, quantizer: QuantizerCls, node: Node): - super().__init__() + super().__init__(quantizer, node) self.relu_node = node assert isinstance(node.args[0], Node) node = node.args[0] diff --git a/torch/quantization/fx/observed_module.py b/torch/quantization/fx/observed_module.py index a95bc184fa10..808a3b36fb4a 100644 --- 
a/torch/quantization/fx/observed_module.py +++ b/torch/quantization/fx/observed_module.py @@ -2,11 +2,11 @@ import copy from torch.fx import GraphModule # type: ignore from torch.fx.graph import Graph -from typing import Union, Dict, Any +from typing import Union, Dict, Any, List class ObservedGraphModule(GraphModule): - def get_preserved_attr_names(self): + def get_preserved_attr_names(self) -> List[str]: return ['_activation_post_process_map', '_patterns', '_qconfig_map', @@ -35,6 +35,12 @@ def is_observed_module(module: Any) -> bool: return isinstance(module, ObservedGraphModule) class ObservedStandaloneGraphModule(ObservedGraphModule): + def get_preserved_attr_names(self) -> List[str] : + return super().get_preserved_attr_names() + [ + "_standalone_module_input_quantized_idxs", + "_standalone_module_output_quantized_idxs" + ] + def __deepcopy__(self, memo): fake_mod = torch.nn.Module() fake_mod.__dict__ = copy.deepcopy(self.__dict__) diff --git a/torch/quantization/fx/quantization_patterns.py b/torch/quantization/fx/quantization_patterns.py index a1e601332d4a..fb5bef0bd0ad 100644 --- a/torch/quantization/fx/quantization_patterns.py +++ b/torch/quantization/fx/quantization_patterns.py @@ -537,6 +537,8 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, torch._ops.ops.quantized.instance_norm: ['running_mean', 'running_var', 'use_input_stats', 'momentum'], } +@register_quant_pattern(torch.nn.ConvTranspose1d) +@register_quant_pattern(torch.nn.ConvTranspose2d) @register_quant_pattern(torch.nn.ELU) @register_quant_pattern(torch.nn.LeakyReLU) @register_quant_pattern(torch.nn.Hardswish) @@ -753,10 +755,10 @@ def convert(self, quantizer: QuantizerCls, node: Node, load_arg: Callable, qconfig = quantizer.qconfig_map[node.name] convert = torch.quantization.quantize_fx._convert_standalone_module_fx # type: ignore observed_standalone_module = quantizer.modules[node.target] + input_quantized_idxs = observed_standalone_module._standalone_module_input_quantized_idxs.tolist() quantized_standalone_module = convert(observed_standalone_module, debug=debug) parent_name, name = _parent_name(node.target) # update the modules dict setattr(quantizer.modules[parent_name], name, quantized_standalone_module) quantizer.modules[node.target] = quantized_standalone_module - # standalone module takes float input - return quantizer.quantized_graph.node_copy(node, load_arg(quantized=False)) + return quantizer.quantized_graph.node_copy(node, load_arg(quantized=input_quantized_idxs)) diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py index af9496a66a63..318295270b61 100644 --- a/torch/quantization/fx/quantize.py +++ b/torch/quantization/fx/quantize.py @@ -102,14 +102,15 @@ def insert_observer( 'call_module', observer_name, (load_arg(node),), {}) observed_node_names_set.add(node.name) -def insert_observer_for_special_module( +def maybe_insert_observer_for_special_module( quantize_handler: QuantizeHandler, modules: Dict[str, torch.nn.Module], - prepare_custom_config_dict: Any, qconfig: Any, node: Node): + prepare_custom_config_dict: Any, qconfig: Any, node: Node) -> Optional[List[int]]: """ Insert observer for custom module and standalone module Returns: standalone_module_input_idxs: the indexes for inputs that need to be observed by the parent module """ assert modules is not None + standalone_module_input_idxs = None if isinstance(quantize_handler, CustomModuleQuantizeHandler): custom_module = modules[node.target] # type: ignore custom_module_class_mapping =
prepare_custom_config_dict.get( @@ -129,19 +130,22 @@ def insert_observer_for_special_module( class_config_map = {x[0]: (x[1], x[2]) for x in standalone_module_class_configs} name_config_map = {x[0]: (x[1], x[2]) for x in standalone_module_name_configs} config = class_config_map.get(type(standalone_module), (None, None)) - config = name_config_map.get(node.target, (None, None)) - standalone_module_qconfig_dict = {"": qconfig} if config[0] is None else config[0] - standalone_prepare_config_dict = {} if config[1] is None else config[1] + config = name_config_map.get(node.target, config) + sm_qconfig_dict = {"": qconfig} if config[0] is None else config[0] + sm_prepare_config_dict = {} if config[1] is None else config[1] prepare = \ torch.quantization.quantize_fx._prepare_standalone_module_fx # type: ignore observed_standalone_module = \ - prepare(standalone_module, standalone_module_qconfig_dict, standalone_prepare_config_dict) + prepare(standalone_module, sm_qconfig_dict, sm_prepare_config_dict) + standalone_module_input_idxs = observed_standalone_module.\ + _standalone_module_input_quantized_idxs.int().tolist() observed_standalone_module = mark_observed_standalone_module( observed_standalone_module) parent_name, name = _parent_name(node.target) setattr(modules[parent_name], name, observed_standalone_module) modules[node.target] = observed_standalone_module # type: ignore + return standalone_module_input_idxs def insert_observer_for_output_of_the_node( node: Node, @@ -155,7 +159,8 @@ def insert_observer_for_output_of_the_node( observed_graph: Graph, load_arg: Callable, observed_node_names_set: Set[str], - matched_nodes: Optional[List[Node]]): + matched_nodes: Optional[List[Node]], + standalone_module_input_idxs: Optional[List[int]]): """ Insert observer/fake_quantize module for output of the observed module if needed """ @@ -215,8 +220,13 @@ def input_is_observed(arg): observed_node_names_set.add(node.name) elif isinstance(quantize_handler, StandaloneModuleQuantizeHandler): - # output is observed in the standalone module - return + assert node.op == "call_module" + assert isinstance(node.target, str) + sm_out_qidxs = modules[node.target]._standalone_module_output_quantized_idxs.tolist() # type: ignore + output_is_quantized = 0 in sm_out_qidxs + + if output_is_quantized: + observed_node_names_set.add(node.name) elif (quantize_handler.all_node_args and input_output_observed(quantize_handler)): # observer for outputs @@ -226,6 +236,16 @@ def input_is_observed(arg): activation_post_process_map, env, observed_graph, load_arg, observed_node_names_set) + # insert observer for input of standalone module + if standalone_module_input_idxs is not None: + for idx in standalone_module_input_idxs: + if node.args[idx].name not in observed_node_names_set: # type: ignore + new_observer = qconfig.activation() + insert_observer( + node, new_observer, model, + activation_post_process_map, env, observed_graph, + load_arg, observed_node_names_set) + def insert_observer_for_input_arg_of_observed_node( node: Node, observed_node_names_set: Set[str], quants: Dict[str, Tuple[DefaultQuantizeHandler, Callable]], @@ -373,10 +393,19 @@ def _prepare(self, model: GraphModule, qconfig_dict: Any, """ standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. 
- When we are preparing a standalone module: - both input and output are observed in prepared standalone module + How the standalone module is observed is specified by `input_quantized_idxs` and + `output_quantized_idxs` in the prepare_custom_config for the standalone module Returns: model(GraphModule): prepared standalone module + attributes: + _standalone_module_input_quantized_idxs(List[Int]): a list of + indexes for the graph input that is expected to be quantized, + same as input_quantized_idxs configuration provided + for the standalone module + _standalone_module_output_quantized_idxs(List[Int]): a list of + indexes for the graph output that is quantized + same as output_quantized_idxs configuration provided + for the standalone module """ if prepare_custom_config_dict is None: prepare_custom_config_dict = {} @@ -430,8 +459,6 @@ def _prepare(self, model: GraphModule, qconfig_dict: Any, def load_arg(a): return map_arg(a, lambda node: env[node.name]) - # indexes for the inputs that needs to be observed - standalone_module_observed_input_idxs: List[int] = [] graph_inputs = [] for node in model.graph.nodes: if node.op == 'placeholder': @@ -487,14 +514,15 @@ def load_arg(a): # parent if qconfig is not None: assert obj is not None - insert_observer_for_special_module( - obj, self.modules, prepare_custom_config_dict, qconfig, - node) + standalone_module_input_idxs = \ + maybe_insert_observer_for_special_module( + obj, self.modules, prepare_custom_config_dict, qconfig, + node) insert_observer_for_output_of_the_node( node, obj, qconfig, self.modules, model, pattern, self.activation_post_process_map, env, observed_graph, load_arg, observed_node_names_set, - matched_nodes) + matched_nodes, standalone_module_input_idxs) else: env[node.name] = observed_graph.node_copy(node, load_arg) @@ -516,6 +544,21 @@ def load_arg(a): model = GraphModule(model, observed_graph) self.save_state(model) model = mark_observed_module(model) + if is_standalone_module: + assert result_node is not None + assert isinstance(result_node.args[0], Node), \ + "standalone module only supports returning simple value currently"\ + "(not tuple, dict etc.)" + # indicator for whether output is observed or not. + # This is used to correctly quantize standalone modules + output_is_observed = \ + result_node.args[0].name in observed_node_names_set + # these inputs are observed in parent + # converting List[int] to Tensor since module attribute is + # Union[Tensor, Module] + model._standalone_module_input_quantized_idxs = \ + torch.Tensor(input_quantized_idxs) + model._standalone_module_output_quantized_idxs = torch.Tensor(output_quantized_idxs) return model def save_state(self, observed: GraphModule) -> None: @@ -569,8 +612,10 @@ def _convert(self, model: GraphModule, debug: bool = False, """ standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. - Returns a quantized standalone module which accepts float input - and produces float output.
+ Returns a quantized standalone module; whether the input/output is quantized is + specified by `input_quantized_idxs` and `output_quantized_idxs` in + prepare_custom_config_dict. See the docs for prepare_fx for details """ if convert_custom_config_dict is None: convert_custom_config_dict = {} @@ -627,36 +672,50 @@ def load_x(n: Node) -> Node: else: return env[n.name] - def load_arg(quantized: Optional[Union[List[Any], bool, Tuple[Any, ...]]] + def load_arg(quantized: Optional[Union[List[int], bool, Tuple[int, ...]]] ) -> Callable[[Node], Argument]: """ Input: quantized, which can be None, list, boolean or tuple - - if quantized is a list or tuple, then arg should be a list and - the args with corresponding indexes will be quantized - - if quantized is a boolean, then all args will be - quantized/not quantized - if quantized is None, then we'll load the node as long as it exists + - if quantized is a boolean, then all args will be + quantized/not quantized + - if quantized is an empty list or tuple, then it is the same as load_arg(quantized=False) + - if quantized is a list or tuple, then arg should be a list and + the args with corresponding indexes will be quantized Output: fn which takes arg_or_args, and loads them from the corresponding environment depending on the value of quantized. """ assert quantized is None or \ isinstance(quantized, (tuple, list, bool)), type(quantized) + if isinstance(quantized, (tuple, list)) and len(quantized) == 0: + # empty tuple or list means nothing is quantized + quantized = False def load_arg_impl(arg_or_args): - if quantized is None: + # we'll update the format of `quantized` + # to better match arg_or_args + updated_quantized: Optional[Union[List[int], bool, Tuple[int, ...]]] = quantized + + if isinstance(quantized, (tuple, list)) and \ + len(quantized) == 1 and isinstance(arg_or_args, Node): + # when argument is one Node instead of tuple, we just need to check + # 0 is in the quantized list + updated_quantized = 0 in quantized + + if updated_quantized is None: return map_arg(arg_or_args, load_x) - if isinstance(quantized, bool): + if isinstance(updated_quantized, bool): return map_arg( arg_or_args, - load_quantized if quantized else load_non_quantized) - elif isinstance(quantized, (tuple, list)): + load_quantized if updated_quantized else load_non_quantized) + elif isinstance(updated_quantized, (tuple, list)): assert isinstance(arg_or_args, (tuple, list)), arg_or_args loaded_args = [] # for now, we only support quantizing positional arguments for i, a in enumerate(arg_or_args): - if i in quantized: + if i in updated_quantized: loaded_args.append(map_arg(a, load_quantized)) else: loaded_args.append(map_arg(a, load_non_quantized)) @@ -690,10 +749,10 @@ def node_arg_is_quantized(node_arg: Any) -> bool: def is_output_quantized(node: Node, obj: QuantizeHandler) -> bool: """ Check if output node is quantized or not """ assert self.modules is not None - # by default the output is expected to be quantized + # by default the output for a quantizable node is expected to be quantized quantized = True - # Need to get correct quantized/non-quantized state for the output + # Need to get correct quantized/non-quantized state for the output # of CopyNode if type(obj) in [ CopyNode, @@ -750,7 +809,7 @@ def insert_quantize_node(node: Node) -> None: "output_quantized_idxs", []) for node in model.graph.nodes: - if node.op == 'output': + if node.op == "output": cur_output_node_idx = output_node_seen_cnt output_node_seen_cnt += 1 if cur_output_node_idx in
output_quantized_idxs: @@ -775,12 +834,19 @@ def insert_quantize_node(node: Node) -> None: quantized = False else: assert obj is not None + # We determine whether the output is quantized before convert for a + # standalone module and after convert for a non-standalone module, since + # _standalone_module_output_quantized_idxs + # is only available on the observed standalone module + if is_observed_standalone_module_node: + out_quant_idxs = self.modules[node.target]._standalone_module_output_quantized_idxs.tolist() # type: ignore + assert len(out_quant_idxs) <= 1, "Currently standalone modules only support one output" + quantized = 0 in out_quant_idxs + result = obj.convert( self, node, load_arg, debug=debug, convert_custom_config_dict=convert_custom_config_dict) - if is_observed_standalone_module_node: - quantized = False - else: + if not is_observed_standalone_module_node: quantized = is_output_quantized(node, obj) if quantized: @@ -929,7 +995,7 @@ def _find_matches( standalone_module_names = [] match_map: Dict[str, MatchResult] = {} - all_matched = set() + all_matched : Set[str] = set() def record_match(pattern, node, matched): if isinstance(pattern, tuple): diff --git a/torch/quantization/fx/utils.py b/torch/quantization/fx/utils.py index c1f849803342..8285e204b1ed 100644 --- a/torch/quantization/fx/utils.py +++ b/torch/quantization/fx/utils.py @@ -9,7 +9,7 @@ Node, ) -from typing import Callable, Optional, List, Dict, Any +from typing import Callable, Optional, List, Dict, Any, Set # turn foo.bar -> ['foo', 'bar'] def _parent_name(target): @@ -140,7 +140,7 @@ def get_next_qparams_idx(module, qparams): inputs.append(graph.create_node('get_attr', qparam_full_path)) return graph.create_node('call_function', quantize_op, tuple(inputs), {}) -def get_custom_module_class_keys(custom_config_dict, custom_config_dict_key): +def get_custom_module_class_keys(custom_config_dict, custom_config_dict_key) -> List[Any]: r""" Get all the unique custom module keys in the custom config dict e.g.
Input: @@ -163,7 +163,7 @@ def get_custom_module_class_keys(custom_config_dict, custom_config_dict_key): [CustomModule1, CustomModule2, CustomModule3] """ # using set to dedup - float_custom_module_classes = set() + float_custom_module_classes : Set[Any] = set() custom_module_mapping = custom_config_dict.get(custom_config_dict_key, {}) for quant_mode in ["static", "dynamic", "weight_only"]: quant_mode_custom_module_config = custom_module_mapping.get(quant_mode, {}) diff --git a/torch/quantization/observer.py b/torch/quantization/observer.py index 32d07c939695..2cc579f66087 100644 --- a/torch/quantization/observer.py +++ b/torch/quantization/observer.py @@ -390,6 +390,8 @@ def __init__(self, dtype=torch.quint8, qscheme=torch.per_tensor_affine, def forward(self, x_orig): r"""Records the running minimum and maximum of ``x``.""" + if x_orig.numel() == 0: + return x_orig x = x_orig.detach() # avoid keeping autograd tape x = x.to(self.min_val.dtype) min_val_cur, max_val_cur = torch._aminmax(x) @@ -463,6 +465,8 @@ def __init__(self, averaging_constant=0.01, dtype=torch.quint8, quant_max=quant_max) def forward(self, x_orig): + if x_orig.numel() == 0: + return x_orig x = x_orig.detach() # avoid keeping autograd tape x = x.to(self.min_val.dtype) min_val = self.min_val @@ -532,6 +536,8 @@ def forward(self, x_orig): return self._forward(x_orig) def _forward(self, x_orig): + if x_orig.numel() == 0: + return x_orig x = x_orig.detach() # avoid keeping autograd tape min_vals = self.min_vals max_vals = self.max_vals @@ -638,6 +644,8 @@ def __init__(self, averaging_constant=0.01, ch_axis=0, dtype=torch.quint8, self.averaging_constant = averaging_constant def forward(self, x_orig): + if x_orig.numel() == 0: + return x_orig x = x_orig.detach() # avoid keeping autograd tape x = x.to(self.min_vals.dtype) min_vals = self.min_vals @@ -877,8 +885,9 @@ def _combine_histograms(self, orig_hist = orig_hist + interpolated_histogram.to(torch.float) return orig_hist - def forward(self, x_orig): - # type: (torch.Tensor) -> torch.Tensor + def forward(self, x_orig: torch.Tensor) -> torch.Tensor: + if x_orig.numel() == 0: + return x_orig x = x_orig.detach() min_val = self.min_val max_val = self.max_val diff --git a/torch/quantization/qconfig.py b/torch/quantization/qconfig.py index 8da4ad6bb182..2d91d8ab6b3e 100644 --- a/torch/quantization/qconfig.py +++ b/torch/quantization/qconfig.py @@ -3,6 +3,8 @@ from .fake_quantize import * import torch.nn as nn +from typing import Union + class QConfig(namedtuple('QConfig', ['activation', 'weight'])): """ Describes how to quantize a layer or a part of the network by providing @@ -109,3 +111,18 @@ def get_default_qat_qconfig(backend='fbgemm'): else: qconfig = default_qat_qconfig return qconfig + +def assert_valid_qconfig(qconfig: Union[QConfig, QConfigDynamic], + mod: torch.nn.Module) -> None: + is_conv_transpose_mod = ( + isinstance(mod, torch.nn.ConvTranspose1d) or + isinstance(mod, torch.nn.ConvTranspose2d) or + isinstance(mod, torch.nn.ConvTranspose3d)) + if is_conv_transpose_mod: + example_observer = qconfig.weight() + is_per_channel = ( + isinstance(example_observer, torch.quantization.PerChannelMinMaxObserver) or + isinstance(example_observer, torch.quantization.MovingAveragePerChannelMinMaxObserver) + ) + assert not is_per_channel, \ + 'Per channel weight observer is not supported yet for ConvTranspose{n}d.' 
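The assert_valid_qconfig helper added above rejects per-channel weight observers for ConvTranspose{n}d modules. Below is a minimal, illustrative sketch of that behavior, not part of the patch itself; it assumes the default observers exported by torch.quantization and that the helper is importable from torch.quantization.qconfig as shown in the hunk above.

import torch.nn as nn
from torch.quantization import (
    QConfig,
    default_observer,
    default_weight_observer,
    default_per_channel_weight_observer,
)
from torch.quantization.qconfig import assert_valid_qconfig

mod = nn.ConvTranspose2d(3, 3, kernel_size=3)

# A per-tensor weight observer passes the new check unchanged.
assert_valid_qconfig(
    QConfig(activation=default_observer, weight=default_weight_observer), mod)

# A per-channel weight observer trips the new assertion, since per-channel
# weights are not supported yet for ConvTranspose{n}d.
try:
    assert_valid_qconfig(
        QConfig(activation=default_observer,
                weight=default_per_channel_weight_observer),
        mod)
except AssertionError as err:
    print(err)

The same check is wired into _propagate_qconfig_helper in the quantize.py hunk that follows, so an unsupported qconfig fails fast during prepare rather than at conversion time.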
diff --git a/torch/quantization/quantize.py b/torch/quantization/quantize.py index a9417ecb80f3..77752a8af9c9 100644 --- a/torch/quantization/quantize.py +++ b/torch/quantization/quantize.py @@ -5,6 +5,7 @@ import torch import torch.nn as nn import torch.nn.quantized as nnq +import torch.nn.quantizable as nnqa from torch.nn.intrinsic import _FusedModule from .quantization_mappings import ( @@ -49,6 +50,8 @@ def _propagate_qconfig_helper(module, qconfig_dict, allow_list=None, module_qconfig = qconfig_dict.get(prefix, module_qconfig) module_qconfig = getattr(module, 'qconfig', module_qconfig) + torch.quantization.qconfig.assert_valid_qconfig(module_qconfig, module) + module.qconfig = module_qconfig for name, child in module.named_children(): module_prefix = prefix + '.' + name if prefix else name @@ -152,7 +155,10 @@ def insert_activation_post_process(m, special_act_post_process=None): elif needs_observation(child) and type(child) in custom_module_class_mapping: observed_child = custom_module_class_mapping[type(child)].from_float(child) setattr(module, name, observed_child) - insert_activation_post_process(observed_child) + # TODO: These are the modules that cannot be observed + # Once there are more, we should move them to a separate list + if custom_module_class_mapping[type(child)] != nnqa.LSTM: + insert_activation_post_process(observed_child) else: add_observer_(child, qconfig_propagation_list, non_leaf_module_list, device, custom_module_class_mapping) @@ -252,9 +258,12 @@ def _remove_activation_post_process(module): delattr(module, 'activation_post_process') # remove activation_post_proceess hook + handle_ids_to_remove = set() for handle_id, hook_fn in module._forward_hooks.items(): if hook_fn is _observer_forward_hook: - module._forward_hooks.pop(handle_id) + handle_ids_to_remove.add(handle_id) + for handle_id in handle_ids_to_remove: + module._forward_hooks.pop(handle_id) # TODO: rename to something more general def _remove_qconfig(module): diff --git a/torch/quantization/quantize_fx.py b/torch/quantization/quantize_fx.py index cba104b8f783..89ba877ffe78 100644 --- a/torch/quantization/quantize_fx.py +++ b/torch/quantization/quantize_fx.py @@ -107,8 +107,20 @@ def _prepare_standalone_module_fx( standalone_module means it a submodule that is not inlined in parent module, and will be quantized separately as one unit. - Both input and output of the module are observed in the - standalone module. + How the standalone module is observed is specified by `input_quantized_idxs` and + `output_quantized_idxs` in the prepare_custom_config for the standalone module + + Returns: + model(GraphModule): prepared standalone module + attributes: + _standalone_module_input_quantized_idxs(List[Int]): a list of + indexes for the graph input that is expected to be quantized, + same as input_quantized_idxs configuration provided + for the standalone module + _standalone_module_output_quantized_idxs(List[Int]): a list of + indexes for the graph output that is quantized + same as output_quantized_idxs configuration provided + for the standalone module """ return _prepare_fx(model, qconfig_dict, prepare_custom_config_dict, is_standalone_module=True) @@ -378,8 +390,9 @@ def _convert_standalone_module_fx( r""" [Internal use only] Convert a model produced by :func:`~torch.quantization.prepare_standalone_module_fx` and convert it to a quantized model - Return: - A quantized standalone module which accepts float input - and produces float output.
+ Returns a quantized standalone module, whether input/output is quantized is + specified by prepare_custom_config_dict, with + input_quantized_idxs, output_quantized_idxs, please + see docs for prepare_fx for details """ return _convert_fx(graph_module, debug, convert_custom_config_dict, is_standalone_module=True) diff --git a/torch/random.py b/torch/random.py index 30af86065907..31e2643845d2 100644 --- a/torch/random.py +++ b/torch/random.py @@ -67,7 +67,7 @@ def fork_rng(devices=None, enabled=True, _caller="fork_rng", _devices_kw="device Forks the RNG, so that when you return, the RNG is reset to the state that it was previously in. - Arguments: + Args: devices (iterable of CUDA IDs): CUDA devices for which to fork the RNG. CPU RNG state is always forked. By default, :meth:`fork_rng` operates on all devices, but will emit a warning if your machine has a lot diff --git a/torch/serialization.py b/torch/serialization.py index ebc5d0a08541..3b6f5828d858 100644 --- a/torch/serialization.py +++ b/torch/serialization.py @@ -192,7 +192,7 @@ def storage_to_tensor_type(storage): def _is_path(name_or_buffer): return isinstance(name_or_buffer, str) or \ - (sys.version_info[0] == 3 and isinstance(name_or_buffer, pathlib.Path)) + isinstance(name_or_buffer, pathlib.Path) class _opener(object): diff --git a/torch/sparse/__init__.py b/torch/sparse/__init__.py index e7abab351017..9ed1b0dc02ac 100644 --- a/torch/sparse/__init__.py +++ b/torch/sparse/__init__.py @@ -174,7 +174,7 @@ def softmax(input: Tensor, dim: int, dtype: Optional[DType] = None) -> Tensor: It is applied to all slices along `dim`, and will re-scale them so that the elements lie in the range `[0, 1]` and sum to 1. - Arguments: + Args: input (Tensor): input dim (int): A dimension along which softmax will be computed. dtype (:class:`torch.dtype`, optional): the desired data type @@ -191,7 +191,7 @@ def log_softmax(input: Tensor, dim: int, dtype: Optional[DType] = None) -> Tenso See :class:`~torch.sparse.softmax` for more details. - Arguments: + Args: input (Tensor): input dim (int): A dimension along which softmax will be computed. dtype (:class:`torch.dtype`, optional): the desired data type diff --git a/torch/tensor.py b/torch/tensor.py index e3e640da5c81..06a8ee4144c9 100644 --- a/torch/tensor.py +++ b/torch/tensor.py @@ -198,7 +198,7 @@ def backward(self, gradient=None, retain_graph=None, create_graph=False, inputs= in a user-specified CUDA stream context, see :ref:`Stream semantics of backward passes`. - Arguments: + Args: gradient (Tensor or None): Gradient w.r.t. the tensor. If it is a tensor, it will be automatically converted to a Tensor that does not require grad unless ``create_graph`` is True. @@ -671,7 +671,7 @@ def __array_wrap__(self, array): def __contains__(self, element): r"""Check if `element` is present in tensor - Arguments: + Args: element (Tensor or scalar): element to be checked for presence in current tensor" """ @@ -772,7 +772,7 @@ def refine_names(self, *names): Python 2 does not support Ellipsis but one may use a string literal instead (``'...'``). - Arguments: + Args: names (iterable of str): The desired names of the output tensor. May contain up to one Ellipsis. @@ -818,7 +818,7 @@ def align_to(self, *names): Python 2 does not support Ellipsis but one may use a string literal instead (``'...'``). - Arguments: + Args: names (iterable of str): The desired dimension ordering of the output tensor. May contain up to one Ellipsis that is expanded to all unmentioned dim names of :attr:`self`. 
@@ -855,7 +855,7 @@ def unflatten(self, dim, sizes): if :attr:`self` is a `NamedTensor`. The total number of elements in sizes must match the number of elements in the original dim being unflattened. - Arguments: + Args: dim (Union[int, str]): Dimension to unflatten sizes (Union[Tuple[int] or torch.Size, Tuple[Tuple[str, int]]]): New shape of the unflattened dimension diff --git a/torch/testing/_internal/common_device_type.py b/torch/testing/_internal/common_device_type.py index 36f02eff0c0f..4986e5a9bcbb 100644 --- a/torch/testing/_internal/common_device_type.py +++ b/torch/testing/_internal/common_device_type.py @@ -187,6 +187,9 @@ def _construct_test_name(test_name, op, device_type, dtype): class DeviceTypeTestBase(TestCase): device_type: str = 'generic_device_type' + # Flag to disable test suite early due to unrecoverable error such as CUDA error. + _stop_test_suite = False + # Precision is a thread-local setting since it may be overridden per test _tls = threading.local() _tls.precision = TestCase._precision @@ -227,6 +230,9 @@ def _get_precision_override(self, test, dtype): return self.precision return test.precision_overrides.get(dtype, self.precision) + def _should_stop_test_suite(self, rte): + return False + # Creates device-specific tests. @classmethod def instantiate_test(cls, name, test, *, generic_cls=None): @@ -271,6 +277,11 @@ def instantiated_test(self, name=name, test=test_fn, dtype=dtype, op=op): self.precision = self._get_precision_override(test_fn, dtype) args = (arg for arg in (device_arg, dtype, op) if arg is not None) result = test_fn(self, *args) + except RuntimeError as rte: + # check if rte should stop entire test suite. + self._stop_test_suite = self._should_stop_test_suite(rte) + # raise the runtime error as is for the test suite to record. + raise rte finally: self.precision = guard_precision @@ -313,6 +324,12 @@ def instantiated_test(self, name=name, test=test_fn, dtype=dtype, op=op): for dtype in dtypes: instantiate_test_helper(cls, name, test=test, dtype=dtype, op=None) + def run(self, result=None): + super().run(result=result) + # Early terminate test if _stop_test_suite is set. + if self._stop_test_suite: + result.stop() + class CPUTestBase(DeviceTypeTestBase): device_type = 'cpu' @@ -327,6 +344,14 @@ class CUDATestBase(DeviceTypeTestBase): no_magma: ClassVar[bool] no_cudnn: ClassVar[bool] + def _should_stop_test_suite(self, rte): + # A CUDA device-side error will cause subsequent test cases to fail. + # stop the entire test suite if it catches a RuntimeError during torch.cuda.synchronize().
+ try: + torch.cuda.synchronize() + except RuntimeError as rte: + return True + return False def has_cudnn(self): return not self.no_cudnn diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 87d0baa895e8..119750396f1e 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -340,6 +340,77 @@ def sample_inputs_broadcast_to(op_info, device, dtype, requires_grad): requires_grad=requires_grad), shape)) for size, shape in test_cases) +def sample_inputs_stack(op_info, device, dtype, requires_grad): + return (SampleInput((make_tensor((S, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + make_tensor((S, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + make_tensor((S, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad)), kwargs=dict(idx=0)),) + +def sample_inputs_hstack_dstack_vstack(op_info, device, dtype, requires_grad): + return (SampleInput((make_tensor((S, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + make_tensor((S, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + make_tensor((S, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad))),) + +def sample_inputs_gather(op_info, device, dtype, requires_grad): + return (SampleInput((make_tensor((M, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + 0, gather_variable((S, S), 1, M, True, device=device))), + SampleInput((make_tensor((M, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + 1, gather_variable((M, S // 2), 0, S, True, device=device))), + SampleInput((make_tensor((), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + 0, torch.tensor([0], dtype=torch.int64, device=device))), + SampleInput((make_tensor((S,), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + 0, torch.tensor(0, dtype=torch.int64, device=device))), + SampleInput((make_tensor((), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + 0, torch.tensor(0, dtype=torch.int64, device=device))), + ) + + +def sample_inputs_index_select(op_info, device, dtype, requires_grad): + return (SampleInput((make_tensor((S, S, S), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + 0, index_variable(2, S, device=device))), + SampleInput((make_tensor((), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + 0, torch.tensor([0], dtype=torch.int64, device=device))), + SampleInput((make_tensor((), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + 0, torch.tensor(0, dtype=torch.int64, device=device))), + ) + +def sample_movedim_moveaxis(op_info, device, dtype, requires_grad): + return (SampleInput((make_tensor((4, 3, 2, 1), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + (0, 1, 2, 3), (3, 2, 1, 0))), + SampleInput((make_tensor((4, 3, 2, 1), device, dtype, + low=None, high=None, + requires_grad=requires_grad), + (0, -1, -2, -3), (-3, -2, -1, -0)))) + def np_unary_ufunc_integer_promotion_wrapper(fn): # Wrapper that passes PyTorch's default scalar # type as an argument to the wrapped NumPy @@ -546,6 +617,30 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): return out +def sample_inputs_flip(op_info, device, dtype, requires_grad): + tensors = ( + make_tensor((S, M, S), device, dtype, low=None, 
high=None, requires_grad=requires_grad), + make_tensor((S, 0, M), device, dtype, low=None, high=None, requires_grad=requires_grad) + ) + + dims = ((0, 1, 2), (0,), (0, 2), (-1,)) + + # On CUDA, `dims=()` errors out with IndexError + # Reference: https://github.com/pytorch/pytorch/issues/49982 + if device == 'cpu': + dims = dims + ((),) # type: ignore + + samples = [SampleInput(tensor, kwargs={'dims': dim}) for tensor, dim in product(tensors, dims)] + + return samples + +def sample_inputs_fliplr_flipud(op_info, device, dtype, requires_grad): + tensors = ( + make_tensor((S, M, S), device, dtype, low=None, high=None, requires_grad=requires_grad), + make_tensor((S, 0, M), device, dtype, low=None, high=None, requires_grad=requires_grad) + ) + return [SampleInput(tensor) for tensor in tensors] + # Operator database (sorted alphabetically) op_db: List[OpInfo] = [ # NOTE: CPU complex acos produces incorrect outputs (https://github.com/pytorch/pytorch/issues/42952) @@ -717,7 +812,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): ndimensional=False, dtypes=all_types_and_complex_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False,), SpectralFuncInfo('fft.fftn', aten_name='fft_fftn', @@ -725,7 +820,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): ndimensional=True, dtypes=all_types_and_complex_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False, decorators=[precisionOverride( {torch.float: 1e-4, torch.cfloat: 1e-4})],), @@ -735,7 +830,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): ndimensional=False, dtypes=all_types_and_complex_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False,), SpectralFuncInfo('fft.rfft', aten_name='fft_rfft', @@ -743,7 +838,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): ndimensional=False, dtypes=all_types_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False,), SpectralFuncInfo('fft.rfftn', aten_name='fft_rfftn', @@ -751,7 +846,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): ndimensional=True, dtypes=all_types_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False, decorators=[precisionOverride({torch.float: 1e-4})],), SpectralFuncInfo('fft.ifft', @@ -760,7 +855,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): ndimensional=False, dtypes=all_types_and_complex_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False,), SpectralFuncInfo('fft.ifftn', aten_name='fft_ifftn', @@ -768,7 +863,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): ndimensional=True, dtypes=all_types_and_complex_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False,), SpectralFuncInfo('fft.ihfft', aten_name='fft_ihfft', @@ -776,7 +871,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): ndimensional=False, 
dtypes=all_types_and(torch.bool), default_test_dtypes=floating_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False,), SpectralFuncInfo('fft.irfft', aten_name='fft_irfft', @@ -784,7 +879,7 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): ndimensional=False, dtypes=all_types_and_complex_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False,), SpectralFuncInfo('fft.irfftn', aten_name='fft_irfftn', @@ -792,8 +887,26 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): ndimensional=True, dtypes=all_types_and_complex_and(torch.bool), default_test_dtypes=floating_and_complex_types(), - supports_tensor_out=False, + supports_tensor_out=True, test_inplace_grad=False,), + OpInfo('flip', + op=torch.flip, + dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), + sample_inputs_func=sample_inputs_flip, + test_inplace_grad=False, + supports_tensor_out=False), + OpInfo('fliplr', + op=torch.fliplr, + dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), + sample_inputs_func=sample_inputs_fliplr_flipud, + test_inplace_grad=False, + supports_tensor_out=False), + OpInfo('flipud', + op=torch.flipud, + dtypes=all_types_and_complex_and(torch.bool, torch.half, torch.bfloat16), + sample_inputs_func=sample_inputs_fliplr_flipud, + test_inplace_grad=False, + supports_tensor_out=False), UnaryUfuncInfo('log', ref=np.log, domain=(0, float('inf')), @@ -1001,6 +1114,16 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): SkipInfo('TestUnaryUfuncs', 'test_reference_numerics', dtypes=[torch.bfloat16]), )), + UnaryUfuncInfo('rsqrt', + ref=lambda x: np.reciprocal(np.sqrt(x)), + domain=(0, float('inf')), + dtypes=all_types_and_complex_and(torch.bool), + dtypesIfCPU=all_types_and_complex_and(torch.bool), + dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.half), + decorators=(precisionOverride({torch.half: 5e-2}),), + promotes_integers_to_float=True, + assert_autodiffed=True, + handles_complex_extremals=False), UnaryUfuncInfo('sqrt', ref=np.sqrt, domain=(0, float('inf')), @@ -1059,6 +1182,75 @@ def sample_inputs_pinverse(op_info, device, dtype, requires_grad=False): supports_tensor_out=False, sample_inputs_func=sample_inputs_pinverse, decorators=[skipCUDAIfNoMagma, skipCPUIfNoLapack]), + OpInfo('gather', + dtypes=all_types_and_complex_and(torch.bool, torch.float16), + dtypesIfCUDA=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + test_inplace_grad=False, + sample_inputs_func=sample_inputs_gather), + OpInfo('index_select', + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + test_inplace_grad=False, + skips=( + # https://github.com/pytorch/pytorch/issues/49707 + SkipInfo('TestCommon', 'test_variant_consistency_eager', + dtypes=[torch.float16, torch.bfloat16]), + SkipInfo('TestCommon', 'test_variant_consistency_jit', dtypes=[torch.float16, torch.bfloat16]), + ), + sample_inputs_func=sample_inputs_index_select), + OpInfo('stack', + # gradcheck expects the input arguments as a flat list + op=lambda *args, idx: torch.stack([*args], idx), + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + test_inplace_grad=False, + supports_tensor_out=False, + skips=( + SkipInfo('TestCommon', 'test_variant_consistency_jit', + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16)), + ), + 
sample_inputs_func=sample_inputs_stack), + OpInfo('hstack', + # gradcheck expects the input arguments as a flat list + op=lambda *args: torch.hstack([*args]), + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + test_inplace_grad=False, + supports_tensor_out=False, + skips=( + SkipInfo('TestCommon', 'test_variant_consistency_jit', + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16)), + ), + sample_inputs_func=sample_inputs_hstack_dstack_vstack), + OpInfo('vstack', + # gradcheck expects the input arguments as a flat list + op=lambda *args: torch.vstack([*args]), + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + test_inplace_grad=False, + supports_tensor_out=False, + skips=( + SkipInfo('TestCommon', 'test_variant_consistency_jit', + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16)), + ), + sample_inputs_func=sample_inputs_hstack_dstack_vstack), + OpInfo('dstack', + # gradcheck expects the input arguments as a flat list + op=lambda *args: torch.dstack([*args]), + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + test_inplace_grad=False, + supports_tensor_out=False, + skips=( + SkipInfo('TestCommon', 'test_variant_consistency_jit', + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16)), + ), + sample_inputs_func=sample_inputs_hstack_dstack_vstack), + OpInfo('movedim', + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + test_inplace_grad=False, + supports_tensor_out=False, + sample_inputs_func=sample_movedim_moveaxis), + OpInfo('moveaxis', + dtypes=all_types_and_complex_and(torch.bool, torch.float16, torch.bfloat16), + test_inplace_grad=False, + supports_tensor_out=False, + sample_inputs_func=sample_movedim_moveaxis), ] if TEST_SCIPY: @@ -1161,10 +1353,10 @@ def reference_sigmoid(x): spectral_funcs = [op for op in op_db if isinstance(op, SpectralFuncInfo)] sparse_unary_ufuncs = [op for op in op_db if isinstance(op, UnaryUfuncInfo) and op.supports_sparse is True] -def index_variable(shape, max_indices): +def index_variable(shape, max_indices, device=torch.device('cpu')): if not isinstance(shape, tuple): shape = (shape,) - index = torch.rand(*shape).mul_(max_indices).floor_().long() + index = torch.rand(*shape, device=device).mul_(max_indices).floor_().long() return index @@ -1176,14 +1368,14 @@ def index_perm_variable(shape, max_indices): return index -def gather_variable(shape, index_dim, max_indices, duplicate=False): +def gather_variable(shape, index_dim, max_indices, duplicate=False, device=torch.device('cpu')): assert len(shape) == 2 assert index_dim < 2 batch_dim = 1 - index_dim - index = torch.LongTensor(*shape) + index = torch.zeros(*shape, dtype=torch.long, device=device) for i in range(shape[index_dim]): index.select(index_dim, i).copy_( - torch.randperm(max_indices)[:shape[batch_dim]]) + torch.randperm(max_indices, device=device)[:shape[batch_dim]]) if duplicate: index.select(batch_dim, 0).copy_(index.select(batch_dim, 1)) return index @@ -1387,13 +1579,6 @@ def method_tests(): ('reshape_as', (S, S, S), (non_differentiable(torch.rand(S * S, S)),)), ('reshape_as', (), (non_differentiable(torch.tensor(42.)),), 'scalar'), ('reshape_as', (), (non_differentiable(torch.rand(1, 1)),), 'scalar_to_dims'), - ('flip', (S, S, S), ([0],), 'd0'), - ('flip', (S, S, S), ([0, 1, 2],), 'd012'), - ('flip', (S, S, S), ([0, 2],), 'd02'), - ('flip', (S, S, S), ([2, 0],), 'd20'), - ('flip', (S, S, S), ([-1],), 
'neg_d'), - ('fliplr', (S, S, S), ()), - ('flipud', (S, S, S), ()), ('roll', (S, S, S), (0, 0), 'd0'), ('roll', (S, S, S), (1, 2), 'd12'), ('roll', (S, S, S), (0, 2,), 'd02'), @@ -1466,6 +1651,10 @@ def method_tests(): ('ceil', (), NO_ARGS, 'scalar', (True,)), ('rad2deg', (S, S, S), NO_ARGS), ('deg2rad', (S, S, S), NO_ARGS), + # Removing the 'rsqrt' entries leads to failure in + # test_index_fill_variable_dim_* + # TODO: Remove when fixed. + # Reference: https://github.com/pytorch/pytorch/issues/48230 ('rsqrt', torch.rand(S, S, S) + 1e-2, NO_ARGS, '', (True,)), ('rsqrt', uniform_scalar(1e-2, requires_grad=True), NO_ARGS, 'scalar', (True,)), ('rsqrt', torch.rand(S, S, S, dtype=torch.cfloat) + 1e-2, NO_ARGS, 'complex', (True,)), @@ -1865,10 +2054,10 @@ def method_tests(): ('diagonal', (M, M, M), (1, 1, 2), '3d_1'), ('diagonal', (M, M, M), (2, 0, 1), '3d_2'), ('diagonal', (M, M, M), (-2, 0, 1), '3d_3'), - ('tile', (S, S, S), ([S, S, S, S],), 'more_reps_dims', (False,)), - ('tile', (S, S, S), ([S, S, S],), 'same_reps_dims', (False,)), - ('tile', (S, S, S), ([S, M],), 'less_reps_dims', (False,)), - ('tile', (S, S, S), ([S, S, 0],), 'zero_rep_dim', (False,)), + ('tile', (2, 2), ([2, 2, 2],), 'more_reps_dims', (False,)), + ('tile', (2, 2), ([2, 2],), 'same_reps_dims', (False,)), + ('tile', (2, 2), ([2, 3],), 'less_reps_dims', (False,)), + ('tile', (2, 2, 2), ([2, 2, 0],), 'zero_rep_dim', (False,)), ('tile', (), ([S, S, S],), 'empty_tensor', (False,)), ('tril', (M, M), NO_ARGS), ('tril', (M, M), (2,), 'idx'), @@ -1883,9 +2072,6 @@ def method_tests(): ('trace', (M, M), NO_ARGS), ('cross', (S, 3), ((S, 3),)), ('cross', (S, 3, S), ((S, 3, S), 1), 'dim'), - ('index_select', (S, S, S), (0, index_variable(2, S)), 'dim', (), [0]), - ('index_select', (), (0, torch.tensor([0], dtype=torch.int64)), 'scalar_mixed_dim', (), [0]), - ('index_select', (), (0, torch.tensor(0, dtype=torch.int64)), 'scalar_dim', (), [0]), ('index_add', (S, S), (0, index_variable(2, S), (2, S)), 'dim', (), [0]), ('index_add', (), (0, torch.tensor([0], dtype=torch.int64), (1,)), 'scalar_input_dim', (), [0]), ('index_add', (), (0, torch.tensor(0, dtype=torch.int64), ()), 'scalar_all_dim', (), [0]), @@ -2084,11 +2270,6 @@ def method_tests(): ('tensor_split', (S, S, S), (3, 1), 'sections_dim', (False,), [1]), ('tensor_split', (S, S, S), ([2, 4],), 'indices', (False,)), ('tensor_split', (S, S, S), ([2, 4], 1), 'indices_dim', (False,), [1]), - ('gather', (M, S), (0, gather_variable((S, S), 1, M, True)), 'dim0', (), [0]), - ('gather', (M, S), (1, gather_variable((M, S // 2), 0, S, True)), 'dim1', (), [0]), - ('gather', (), (0, torch.tensor([0], dtype=torch.int64)), 'scalar_input', (), [0]), - ('gather', (S,), (0, torch.tensor(0, dtype=torch.int64)), 'scalar_index', (), [0]), - ('gather', (), (0, torch.tensor(0, dtype=torch.int64)), 'scalar_both', (), [0]), ('scatter', (M, S), (0, gather_variable((S, S), 1, M), (S, S)), 'dim0', (), [0]), ('scatter', (M, S), (1, gather_variable((M, S // 2), 0, S), (M, S // 2)), 'dim1', (), [0]), ('scatter', (), (0, torch.tensor(0, dtype=torch.int64), ()), 'scalartensor_all_dim0', (), [0]), diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py index c588f69c2875..022255a5298b 100644 --- a/torch/testing/_internal/common_nn.py +++ b/torch/testing/_internal/common_nn.py @@ -2988,7 +2988,7 @@ def fractional_max_pool3d_test(test_case): .scale_factor(std::vector({3., 3., 3.})) .mode(torch::kTrilinear) .align_corners(false)''', - input_size=(1, 2, 3, 4, 4), + input_size=(1, 2, 3, 
4, 5), fullname='interpolate_trilinear_scale_3d', # See https://github.com/pytorch/pytorch/issues/5006 precision=3e-4, @@ -4866,7 +4866,7 @@ def __call__(self, test_case): if self.should_test_pickle: # TODO: do this with in-memory files as soon as torch.save will support it - with TemporaryFile() as f: + with tempfile.TemporaryFile() as f: test_case._forward(module, input) torch.save(module, f) f.seek(0) diff --git a/torch/testing/_internal/common_quantized.py b/torch/testing/_internal/common_quantized.py index 243cd964b96d..f14556597128 100644 --- a/torch/testing/_internal/common_quantized.py +++ b/torch/testing/_internal/common_quantized.py @@ -102,6 +102,35 @@ def _calculate_dynamic_per_channel_qparams(X, dtype): return scale, zero_point +def _snr(x, x_hat): + """Calculates the signal to noise ratio and returns the signal and noise + power, as well as the SNR in dB. + If the input is a list/tuple this function is called recursively on each + element. The result will have the same nested structure as the inputs. + + Args: + x, x_hat: Either a tensor or a nested list/tuple of tensors. + Returns: + signal, noise, SNR(in dB): Either floats or a nested list of floats + """ + if isinstance(x, (list, tuple)): + assert(len(x) == len(x_hat)) + res = [] + for idx in range(len(x)): + res.append(_snr(x[idx], x_hat[idx])) + return res + if x_hat.is_quantized: + x_hat = x_hat.dequantize() + if x.is_quantized: + x = x.dequantize() + noise = (x - x_hat).norm() + if noise == 0: + return 0.0, float('inf'), float('inf') + signal = x.norm() + snr = signal / noise + snr_db = 20 * snr.log10() + return signal, noise, snr_db + @contextmanager def override_quantized_engine(qengine): previous = torch.backends.quantized.engine diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index df6919cf65b0..9f70551eb3b2 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -303,11 +303,16 @@ def run_tests(argv=UNITTEST_ARGS): if IS_WINDOWS: @contextmanager - def TemporaryFileName(dir=None): + def TemporaryFileName(*args, **kwargs): # Ideally we would like to not have to manually delete the file, but NamedTemporaryFile # opens the file, and it cannot be opened multiple times in Windows. To support Windows, # close the file after creation and try to remove it manually - f = tempfile.NamedTemporaryFile(delete=False, dir=dir) + if 'delete' in kwargs: + if kwargs['delete'] is not False: + raise UserWarning("only TemporaryFileName with delete=False is supported on Windows.") + else: + kwargs['delete'] = False + f = tempfile.NamedTemporaryFile(*args, **kwargs) try: f.close() yield f.name @@ -315,8 +320,8 @@ def TemporaryFileName(dir=None): os.unlink(f.name) else: @contextmanager # noqa: T484 - def TemporaryFileName(dir=None): - with tempfile.NamedTemporaryFile(dir=dir) as f: + def TemporaryFileName(*args, **kwargs): + with tempfile.NamedTemporaryFile(*args, **kwargs) as f: yield f.name if IS_WINDOWS: @@ -1381,19 +1386,26 @@ def assertExpectedStripMangled(self, s, subname=None): s = re.sub(r'__torch__[^ ]+', '', s) self.assertExpected(s, subname) - # returns captured stderr + # run code in subprocess and capture exceptions. 
@staticmethod - def runWithPytorchAPIUsageStderr(code): + def run_process_no_exception(code, env=None): import subprocess - env = os.environ.copy() - env["PYTORCH_API_USAGE_STDERR"] = "1" - pipes = subprocess.Popen( + popen = subprocess.Popen( [sys.executable, '-c', code], stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) - return pipes.communicate()[1].decode('ascii') + (stdout, stderr) = popen.communicate() + return (stdout, stderr) + + # returns captured stderr + @staticmethod + def runWithPytorchAPIUsageStderr(code): + env = os.environ.copy() + env["PYTORCH_API_USAGE_STDERR"] = "1" + (stdout, stderr) = TestCase.run_process_no_exception(code, env=env) + return stderr.decode('ascii') def download_file(url, binary=True): diff --git a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py index c7fdbe536061..15d5cfeca214 100644 --- a/torch/testing/_internal/distributed/rpc/dist_autograd_test.py +++ b/torch/testing/_internal/distributed/rpc/dist_autograd_test.py @@ -11,6 +11,7 @@ import torch.testing._internal.dist_utils from torch.autograd import Function from torch.autograd.function import once_differentiable +from torch.distributed.rpc import RRef from torch.testing._internal.common_utils import IS_MACOS from torch.testing._internal.dist_utils import ( dist_init, @@ -70,8 +71,7 @@ def create_tensor(): @torch.jit.script -def create_torchscript_tensor(): - # type: () -> Tensor +def create_torchscript_tensor() -> torch.Tensor: return torch.ones((3, 3)).requires_grad_() @@ -94,8 +94,7 @@ def my_script_add(t1, t2): @torch.jit.script -def my_script_ref_add(ref_t1, t2): - # type: (RRef[Tensor], Tensor) -> Tensor +def my_script_ref_add(ref_t1: RRef[torch.Tensor], t2: torch.Tensor) -> torch.Tensor: t1 = ref_t1.to_here() return torch.add(t1, t2) diff --git a/torch/testing/_internal/distributed/rpc/jit/dist_autograd_test.py b/torch/testing/_internal/distributed/rpc/jit/dist_autograd_test.py index ee3ebdb33eff..5ae40cdea065 100644 --- a/torch/testing/_internal/distributed/rpc/jit/dist_autograd_test.py +++ b/torch/testing/_internal/distributed/rpc/jit/dist_autograd_test.py @@ -34,8 +34,7 @@ def test_get_gradients(self): dst_rank = self.rank @torch.jit.script - def dist_get_gradients(context_id): - # type: (int) -> (Dict[Tensor, Tensor]) + def dist_get_gradients(context_id: int) -> (Dict[Tensor, Tensor]): return dist_autograd.get_gradients(context_id) FileCheck().check("get_gradients").run(str(dist_get_gradients.graph)) diff --git a/torch/testing/_internal/distributed/rpc/jit/rpc_test_faulty.py b/torch/testing/_internal/distributed/rpc/jit/rpc_test_faulty.py index 656f25322274..96ede7231a97 100644 --- a/torch/testing/_internal/distributed/rpc/jit/rpc_test_faulty.py +++ b/torch/testing/_internal/distributed/rpc/jit/rpc_test_faulty.py @@ -3,6 +3,7 @@ import torch import torch.distributed.rpc as rpc from torch import Tensor +from torch.distributed.rpc import RRef from torch.testing._internal.dist_utils import ( dist_init, worker_name, @@ -63,18 +64,15 @@ def rpc_async_call_future_ret( return fut @torch.jit.script -def rref_to_here(rref_var): - # type: (RRef[Tensor]) -> Tensor +def rref_to_here(rref_var: RRef[Tensor]) -> Tensor: return rref_var.to_here() @torch.jit.script -def rref_to_here_with_timeout(rref_var, timeout): - # type: (RRef[Tensor], float) -> Tensor +def rref_to_here_with_timeout(rref_var: RRef[Tensor], timeout: float) -> Tensor: return rref_var.to_here(timeout) @torch.jit.script -def 
rpc_async_with_rref_arg(dst_worker_name, args): - # type: (str, Tuple[RRef[Tensor]]) -> Tensor +def rpc_async_with_rref_arg(dst_worker_name: str, args: Tuple[RRef[Tensor]]) -> Tensor: fut = rpc.rpc_async(dst_worker_name, rref_to_here, args) ret = fut.wait() return ret diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py index 8eec8100270b..ede2471aa3a2 100644 --- a/torch/testing/_internal/distributed/rpc/rpc_test.py +++ b/torch/testing/_internal/distributed/rpc/rpc_test.py @@ -1335,7 +1335,11 @@ def convert_remote_to_local(event_name): for event in events if convert_remote_to_local(event.name) in EXPECTED_REMOTE_EVENTS ] - self.assertEqual(remote_events_list, EXPECTED_REMOTE_EVENTS) + self.assertEqual( + set(remote_events_list), + set(EXPECTED_REMOTE_EVENTS), + f"Mismatch between profiled events: {set(remote_events_list)} and expected events: {set(EXPECTED_REMOTE_EVENTS)}", + ) @dist_init def test_profiler_remote_events_profiled(self): @@ -1579,8 +1583,8 @@ def _profiler_test_with_rpc(self, rpc_exec_mode, func, args, use_record_function scope_event = get_function_event(events, "foo") # Since RPC call is within the scope, its CPU interval should be # contained within foo's interval. - self.assertTrue(scope_event.time_range.start < rpc_event.time_range.start) - self.assertTrue(scope_event.time_range.end > rpc_event.time_range.end) + self.assertLessEqual(scope_event.time_range.start, rpc_event.time_range.start) + self.assertGreaterEqual(scope_event.time_range.end, rpc_event.time_range.end) # the sender, dest worker, function run, and type of RPC should all # be recorded. self_worker_name = worker_name(self.rank) @@ -1776,7 +1780,13 @@ def _assert_top_level_events(self, process_global_events, expected_top_level_eve if time_range.start > last_end_time: top_level_event_names.append(event_name) last_end_time = time_range.end - self.assertEqual(sorted(top_level_event_names), sorted(expected_top_level_event_names)) + top_level_event_names = sorted(top_level_event_names) + expected_top_level_event_names = sorted(expected_top_level_event_names) + self.assertEqual( + top_level_event_names, + expected_top_level_event_names, + f"Expected events {expected_top_level_event_names}, but got {top_level_event_names}", + ) @dist_init def test_server_process_global_profiler(self): @@ -1799,9 +1809,12 @@ def test_server_process_global_profiler(self): outer_profile_rref.rpc_sync().__exit__(None, None, None) inner_events = rpc.rpc_sync(dst_worker_name, get_events_from_profile, (inner_profile_rref,)) - self._assert_top_level_events(inner_events, ['aten::sub']) + expected_inner_events = ['aten::sub'] + expected_outer_events = expected_inner_events + ['aten::add'] + + self._assert_top_level_events(inner_events, expected_inner_events) outer_events = rpc.rpc_sync(dst_worker_name, get_events_from_profile, (outer_profile_rref,)) - self._assert_top_level_events(outer_events, ['aten::add', 'aten::sub']) + self._assert_top_level_events(outer_events, expected_outer_events) inner_profile_rref.rpc_sync().key_averages() outer_profile_rref.rpc_sync().key_averages() diff --git a/torch/testing/_internal/expecttest.py b/torch/testing/_internal/expecttest.py index 9e46a9a84a37..4dae7ebf03dc 100644 --- a/torch/testing/_internal/expecttest.py +++ b/torch/testing/_internal/expecttest.py @@ -3,6 +3,7 @@ import traceback import os import string +from typing import Tuple # This file implements expect tests (also known as "golden" tests). 
@@ -139,7 +140,8 @@ def ok_for_raw_triple_quoted_string(s, quote): r"(?Pr?)", re.DOTALL) -def replace_string_literal(src, lineno, new_string): +def replace_string_literal(src : str, lineno : int, + new_string : str) -> Tuple[str, int]: r""" Replace a triple quoted string literal with new contents. Only handles printable ASCII correctly at the moment. This diff --git a/torch/utils/benchmark/utils/timer.py b/torch/utils/benchmark/utils/timer.py index 5bf37a3c9dec..730f8e32a9d1 100644 --- a/torch/utils/benchmark/utils/timer.py +++ b/torch/utils/benchmark/utils/timer.py @@ -103,7 +103,7 @@ class Timer(object): `label`, `sub_label`, `description`, `env`, `num_threads` - Arguments: + Args: stmt: Code snippet to be run in a loop and timed. setup: Optional setup code. Used to define variables used in `stmt` diff --git a/torch/utils/bundled_inputs.py b/torch/utils/bundled_inputs.py index c5d603885e4a..741c0841778a 100644 --- a/torch/utils/bundled_inputs.py +++ b/torch/utils/bundled_inputs.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -from typing import Any, TypeVar, Optional, Tuple, List, NamedTuple, Union +from typing import Any, TypeVar, Optional, Tuple, List, NamedTuple, Union, Sequence import textwrap import torch from torch._C import TupleType, OptionalType, ListType @@ -17,7 +17,7 @@ class InflatableArg(NamedTuple): def augment_model_with_bundled_inputs( model: torch.jit.ScriptModule, - inputs: Optional[List[Tuple[Any, ...]]] = None, + inputs: Optional[Sequence[Tuple[Any, ...]]] = None, _receive_inflate_expr: Optional[List[str]] = None, # For debugging. ) -> None: """Add bundled sample inputs to a model. diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 17354c0e1867..7ca849d48de0 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -226,7 +226,7 @@ def check_compiler_ok_for_platform(compiler: str) -> bool: r''' Verifies that the compiler is the expected one for the current platform. - Arguments: + Args: compiler (str): The compiler executable to check. Returns: @@ -261,7 +261,7 @@ def check_compiler_abi_compatibility(compiler) -> bool: r''' Verifies that the given compiler is ABI-compatible with PyTorch. - Arguments: + Args: compiler (str): The compiler executable name to check (e.g. ``g++``). Must be executable in a shell process. diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py index 4224e6bb146a..d4ef1a99a2df 100644 --- a/torch/utils/data/dataloader.py +++ b/torch/utils/data/dataloader.py @@ -55,7 +55,7 @@ class _InfiniteConstantSampler(Sampler): r"""Analogous to ``itertools.repeat(None, None)``. Used as sampler for :class:`~torch.utils.data.IterableDataset`. - Arguments: + Args: data_source (Dataset): dataset to sample from """ @@ -78,7 +78,7 @@ class DataLoader(Generic[T_co]): See :py:mod:`torch.utils.data` documentation page for more details. - Arguments: + Args: dataset (Dataset): dataset from which to load the data. batch_size (int, optional): how many samples per batch to load (default: ``1``). 
@@ -308,10 +308,6 @@ def multiprocessing_context(self):
     def multiprocessing_context(self, multiprocessing_context):
         if multiprocessing_context is not None:
             if self.num_workers > 0:
-                if not multiprocessing._supports_context:
-                    raise ValueError('multiprocessing_context relies on Python >= 3.4, with '
-                                     'support for different start methods')
-
                 if isinstance(multiprocessing_context, string_classes):
                     valid_start_methods = multiprocessing.get_all_start_methods()
                     if multiprocessing_context not in valid_start_methods:
diff --git a/torch/utils/data/dataset.py b/torch/utils/data/dataset.py
index 7c45c10dd812..0bef57c97629 100644
--- a/torch/utils/data/dataset.py
+++ b/torch/utils/data/dataset.py
@@ -158,7 +158,7 @@ class TensorDataset(Dataset[Tuple[Tensor, ...]]):

     Each sample will be retrieved by indexing tensors along the first dimension.

-    Arguments:
+    Args:
         *tensors (Tensor): tensors that have the same size of the first dimension.
     """
     tensors: Tuple[Tensor, ...]
@@ -179,7 +179,7 @@ class ConcatDataset(Dataset[T_co]):

     This class is useful to assemble different existing datasets.

-    Arguments:
+    Args:
         datasets (sequence): List of datasets to be concatenated
     """
     datasets: List[Dataset[T_co]]
@@ -232,7 +232,7 @@ class ChainDataset(IterableDataset):
     chainning operation is done on-the-fly, so concatenating large-scale
     datasets with this class will be efficient.

-    Arguments:
+    Args:
         datasets (iterable of IterableDataset): datasets to be chained together
     """
     def __init__(self, datasets: Iterable[Dataset]) -> None:
@@ -284,7 +284,7 @@ class BufferedShuffleDataset(IterableDataset[T_co]):
         ...     random.seed(...)
         >>> print(list(torch.utils.data.DataLoader(ds, ..., num_workers=n, worker_init_fn=init_fn)))

-    Arguments:
+    Args:
         dataset (IterableDataset): The original IterableDataset.
         buffer_size (int): The buffer size for shuffling.
     """
@@ -315,7 +315,7 @@ class Subset(Dataset[T_co]):
     r"""
     Subset of a dataset at specified indices.

-    Arguments:
+    Args:
         dataset (Dataset): The whole Dataset
         indices (sequence): Indices in the whole set selected for subset
     """
@@ -341,7 +341,7 @@ def random_split(dataset: Dataset[T], lengths: Sequence[int],

         >>> random_split(range(10), [3, 7], generator=torch.Generator().manual_seed(42))

-    Arguments:
+    Args:
         dataset (Dataset): Dataset to be split
         lengths (sequence): lengths of splits to be produced
         generator (Generator): Generator used for the random permutation.
diff --git a/torch/utils/data/distributed.py b/torch/utils/data/distributed.py
index e048b54a462c..8ce61d9948c5 100644
--- a/torch/utils/data/distributed.py
+++ b/torch/utils/data/distributed.py
@@ -21,7 +21,7 @@ class DistributedSampler(Sampler[T_co]):
     .. note::
         Dataset is assumed to be of constant size.

-    Arguments:
+    Args:
         dataset: Dataset used for sampling.
         num_replicas (int, optional): Number of processes participating in
             distributed training. By default, :attr:`world_size` is retrieved from the
@@ -129,7 +129,7 @@ def set_epoch(self, epoch: int) -> None:
         use a different random ordering for each epoch. Otherwise, the next iteration of this
         sampler will yield the same ordering.

-        Arguments:
+        Args:
             epoch (int): Epoch number.
         """
         self.epoch = epoch
diff --git a/torch/utils/data/sampler.py b/torch/utils/data/sampler.py
index 8cc650e89948..e48ad64fdc9b 100644
--- a/torch/utils/data/sampler.py
+++ b/torch/utils/data/sampler.py
@@ -55,7 +55,7 @@ def __iter__(self) -> Iterator[T_co]:
 class SequentialSampler(Sampler[int]):
     r"""Samples elements sequentially, always in the same order.

-    Arguments:
+    Args:
         data_source (Dataset): dataset to sample from
     """
     data_source: Sized
@@ -74,7 +74,7 @@ class RandomSampler(Sampler[int]):
     r"""Samples elements randomly. If without replacement, then sample from a shuffled dataset.
     If with replacement, then user can specify :attr:`num_samples` to draw.

-    Arguments:
+    Args:
         data_source (Dataset): dataset to sample from
         replacement (bool): samples are drawn on-demand with replacement if ``True``, default=``False``
         num_samples (int): number of samples to draw, default=`len(dataset)`. This argument
@@ -131,7 +131,7 @@ def __len__(self):
 class SubsetRandomSampler(Sampler[int]):
     r"""Samples elements randomly from a given list of indices, without replacement.

-    Arguments:
+    Args:
         indices (sequence): a sequence of indices
         generator (Generator): Generator used in sampling.
     """
diff --git a/torch/utils/hipify/hipify_python.py b/torch/utils/hipify/hipify_python.py
index d1639d20adba..adc480793d82 100755
--- a/torch/utils/hipify/hipify_python.py
+++ b/torch/utils/hipify/hipify_python.py
@@ -782,7 +782,9 @@ def repl(m):
                     os.path.relpath(header_filepath, output_directory),
                     all_files, includes, stats, hip_clang_launch, is_pytorch_extension, clean_ctx, show_progress)
-                return templ.format(os.path.relpath(HIPIFY_FINAL_RESULT[header_filepath]["hipified_path"], header_dir))
+                value = HIPIFY_FINAL_RESULT[header_filepath]["hipified_path"]
+                assert value is not None
+                return templ.format(os.path.relpath(value, header_dir))

         return m.group(0)
     return repl
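
Note on the torch/utils/bundled_inputs.py hunk above: relaxing `inputs` from `Optional[List[...]]` to `Optional[Sequence[...]]` lets callers pass any sequence of argument tuples, since the function only iterates over `inputs` and never mutates it. A minimal sketch of a now-valid call, assuming a small scripted module (the model, tensor shapes, and values below are illustrative only and not taken from this patch):

import torch
from torch.utils.bundled_inputs import augment_model_with_bundled_inputs

# Hypothetical tiny model; any torch.jit.ScriptModule with a forward() works.
model = torch.jit.script(torch.nn.Linear(4, 2))

# Each entry is the full argument tuple for one forward() call. A tuple of
# tuples satisfies Optional[Sequence[Tuple[Any, ...]]] but not the old List hint.
sample_inputs = (
    (torch.zeros(1, 4),),
    (torch.ones(2, 4),),
)

augment_model_with_bundled_inputs(model, inputs=sample_inputs)

# The augmentation attaches accessor methods such as get_all_bundled_inputs().
print(model.get_all_bundled_inputs())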