diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt index cd62eb5c5b752..93b2882ef475d 100644 --- a/.github/ci_commit_pins/vision.txt +++ b/.github/ci_commit_pins/vision.txt @@ -1 +1 @@ -e3fb8c09b1ae675832424d7bf5f3484f697efd39 +68161e98aaeaeca02166063d19de92e81ea00c3b diff --git a/.github/scripts/tryrebase.py b/.github/scripts/tryrebase.py index 21d43a24bd275..264c3737914ec 100755 --- a/.github/scripts/tryrebase.py +++ b/.github/scripts/tryrebase.py @@ -85,11 +85,13 @@ def rebase_ghstack_onto( ) -> None: if ( subprocess.run( - [sys.executable, "-m", "ghstack", "--help"], capture_output=True + [sys.executable, "-m", "ghstack", "--help"], + capture_output=True, + check=False, ).returncode != 0 ): - subprocess.run([sys.executable, "-m", "pip", "install", "ghstack"]) + subprocess.run([sys.executable, "-m", "pip", "install", "ghstack"], check=True) orig_ref = f"{re.sub(r'/head$', '/orig', pr.head_ref())}" repo.fetch(orig_ref, orig_ref) @@ -116,7 +118,7 @@ def rebase_ghstack_onto( if dry_run: print("Don't know how to dry-run ghstack") else: - ghstack_result = subprocess.run(["ghstack"], capture_output=True) + ghstack_result = subprocess.run(["ghstack"], capture_output=True, check=True) push_result = ghstack_result.stdout.decode("utf-8") print(push_result) if ghstack_result.returncode != 0: diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml index fd5989b631462..ef263c5a3d656 100644 --- a/.github/workflows/create_release.yml +++ b/.github/workflows/create_release.yml @@ -30,6 +30,8 @@ jobs: run: | tag_or_branch="${PT_GITHUB_REF#refs/tags/}" tag_or_branch="${tag_or_branch#refs/heads/}" + # replace directory separators with _ in branch name + tag_or_branch="${tag_or_branch//\//_}" echo "PT_RELEASE_NAME=pytorch-$tag_or_branch" >> "$GITHUB_ENV" echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV" - name: Create source distribution diff --git a/aten/src/ATen/Utils.cpp b/aten/src/ATen/Utils.cpp index a0fbc499378e2..0ed5f60161ab6 100644 --- a/aten/src/ATen/Utils.cpp +++ b/aten/src/ATen/Utils.cpp @@ -1,13 +1,10 @@ #include -#include #include #include #include #include -// NOLINTNEXTLINE(modernize-deprecated-headers) -#include #include #include #include diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index e0f803871114a..569a820042c0e 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -1,6 +1,5 @@ #include -#include #include #include #include diff --git a/aten/src/ATen/core/Formatting.cpp b/aten/src/ATen/core/Formatting.cpp index d8438bb4ce1b1..957b89c7a1f16 100644 --- a/aten/src/ATen/core/Formatting.cpp +++ b/aten/src/ATen/core/Formatting.cpp @@ -57,6 +57,7 @@ struct FormatGuard { out.copyfmt(saved); } private: + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) std::ostream & out; std::ios saved; }; @@ -65,7 +66,7 @@ std::ostream& operator<<(std::ostream & out, const DeprecatedTypeProperties& t) return out << t.toString(); } -static std::tuple __printFormat(std::ostream& stream, const Tensor& self) { +static std::tuple __printFormat(std::ostream& stream, const Tensor& self) { auto size = self.numel(); if(size == 0) { return std::make_tuple(1., 0); @@ -116,13 +117,13 @@ static std::tuple __printFormat(std::ostream& stream, const Ten } } double scale = 1; - int64_t sz = 11; + int sz = 11; if(intMode) { if(expMax > 9) { sz = 11; stream << std::scientific << std::setprecision(4); } else { - sz = expMax + 1; + sz = static_cast(expMax) + 1; 
stream << defaultfloat; } } else { @@ -141,7 +142,7 @@ static std::tuple __printFormat(std::ostream& stream, const Ten if(expMax == 0) { sz = 7; } else { - sz = expMax+6; + sz = static_cast(expMax) + 6; } stream << std::fixed << std::setprecision(4); } @@ -163,9 +164,7 @@ static void printScale(std::ostream & stream, double scale) { } static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t linesize, int64_t indent) { - double scale = 0.0; - int64_t sz = 0; - std::tie(scale, sz) = __printFormat(stream, self); + auto [scale, sz] = __printFormat(stream, self); __printIndent(stream, indent); int64_t nColumnPerLine = (linesize-indent)/(sz+1); @@ -284,9 +283,7 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi stream << "[ " << tensor_.toString() << "{}"; } else if(tensor.ndimension() == 1) { if (tensor.numel() > 0) { - double scale = 0.0; - int64_t sz = 0; - std::tie(scale, sz) = __printFormat(stream, tensor); + auto [scale, sz] = __printFormat(stream, tensor); if(scale != 1) { printScale(stream, scale); } diff --git a/aten/src/ATen/core/List_test.cpp b/aten/src/ATen/core/List_test.cpp index 825aca6a04556..cf15c44b7f3a5 100644 --- a/aten/src/ATen/core/List_test.cpp +++ b/aten/src/ATen/core/List_test.cpp @@ -3,6 +3,7 @@ using namespace c10; +// NOLINTBEGIN(performance-move-const-arg) TEST(ListTestIValueBasedList, givenEmptyList_whenCallingEmpty_thenReturnsTrue) { List list; EXPECT_TRUE(list.empty()); @@ -1159,3 +1160,4 @@ TEST(ListTest, toTypedList) { genericList = impl::toList(std::move(stringList)); EXPECT_THROW(c10::impl::toTypedList(std::move(genericList)), c10::Error); } +// NOLINTEND(performance-move-const-arg) diff --git a/aten/src/ATen/core/NamedTensor.cpp b/aten/src/ATen/core/NamedTensor.cpp index 40fd58a73ab18..dcf55dfa273cd 100644 --- a/aten/src/ATen/core/NamedTensor.cpp +++ b/aten/src/ATen/core/NamedTensor.cpp @@ -2,7 +2,6 @@ #include #include -#include namespace at { diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 7edecf5d9aa82..14b15b00e77b3 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -1210,7 +1210,7 @@ TORCH_API intrusive_ptr collectAny( TypePtr typePtr, std::vector devices) : srcFutures(srcs), - dstFuture(make_intrusive(typePtr, std::move(devices))) {} + dstFuture(make_intrusive(std::move(typePtr), std::move(devices))) {} std::atomic done{false}; List> srcFutures; intrusive_ptr dstFuture; diff --git a/aten/src/ATen/core/library.cpp b/aten/src/ATen/core/library.cpp index 92cb179631af1..fef96198717e2 100644 --- a/aten/src/ATen/core/library.cpp +++ b/aten/src/ATen/core/library.cpp @@ -44,7 +44,7 @@ namespace { CppFunction::CppFunction(c10::KernelFunction func, c10::optional cpp_signature, std::unique_ptr schema) : func_(std::move(func)) - , cpp_signature_(std::move(cpp_signature)) + , cpp_signature_(cpp_signature) , schema_(std::move(schema)) , debug_() {} @@ -172,7 +172,7 @@ Library& Library::_def(std::variant&& na std::move(name), dispatch_key, std::move(f.func_), - std::move(f.cpp_signature_), + f.cpp_signature_, std::move(f.schema_), debugString(std::move(f.debug_), file_, line_) ) @@ -223,7 +223,7 @@ Library& Library::_impl(const char* name_str, CppFunction&& f, _RegisterOrVerify std::move(name), dispatch_key, std::move(f.func_), - std::move(f.cpp_signature_), + f.cpp_signature_, std::move(f.schema_), debugString(std::move(f.debug_), file_, line_) ) diff --git a/aten/src/ATen/cuda/CUDAGraph.cpp b/aten/src/ATen/cuda/CUDAGraph.cpp index 
368311abd408b..3ea84cc2b6752 100644 --- a/aten/src/ATen/cuda/CUDAGraph.cpp +++ b/aten/src/ATen/cuda/CUDAGraph.cpp @@ -5,9 +5,13 @@ #include #include +#include +#include + namespace at::cuda { static bool _cuda_graphs_debug = false; +constexpr int kSynchronizeBusyWaitMillis = 10; MempoolId_t graph_pool_handle() { #if !defined(USE_ROCM) || ROCM_VERSION >= 50300 @@ -55,6 +59,25 @@ CaptureId_t capture_sequence_id() { * describes memory management for captures. */ +std::atomic CUDAGraph::pending_event_queries = 0; + +// Track any outstanding event queries that could happen e.g., in a NCCL watchdog so that they +// can be resolved before the capture begins. Note that event queries are not allowed during a +// graph capture in the default capture mode. +void CUDAGraph::inc_pending_event_queries() { + pending_event_queries++; +} + +void CUDAGraph::dec_pending_event_queries() { + TORCH_INTERNAL_ASSERT(pending_event_queries > 0, + "Attempted to decrement the number of outstanding events to be queried, but it was <= 0."); + pending_event_queries--; +} + +int CUDAGraph::num_pending_event_queries() { + return pending_event_queries; +} + CUDAGraph::CUDAGraph() // CUDAStreams may not be default-constructed. : capture_stream_(at::cuda::getCurrentCUDAStream()) { @@ -115,6 +138,15 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capt // due to the capture status being updated _after_ a capture had already started. c10::cuda::CUDACachingAllocator::beginAllocateStreamToPool(capture_dev_, capture_stream_, mempool_id_); + // At this point, any NCCL watchdogs should be aware that we are in capture mode + // and therefore should not enqueue any additional work that could be event-queried. + // We still must wait on any existing work that has not been cleaned up. + while (num_pending_event_queries()) { + TORCH_WARN_ONCE("Waiting for pending NCCL work to finish before starting graph capture."); + std::this_thread::sleep_for( + std::chrono::milliseconds(kSynchronizeBusyWaitMillis)); + } + // cudaStreamCaptureModeGlobal is the most conservative option to // prevent potentially unsafe CUDA API calls during capture. 
See // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85 diff --git a/aten/src/ATen/cuda/CUDAGraph.h b/aten/src/ATen/cuda/CUDAGraph.h index 00113180e3fa0..804067560a6ea 100644 --- a/aten/src/ATen/cuda/CUDAGraph.h +++ b/aten/src/ATen/cuda/CUDAGraph.h @@ -5,6 +5,8 @@ #include #include +#include + namespace at { struct CUDAGeneratorImpl; @@ -19,6 +21,9 @@ struct TORCH_CUDA_CPP_API CUDAGraph { CUDAGraph(); ~CUDAGraph(); + static void inc_pending_event_queries(); + static void dec_pending_event_queries(); + static int num_pending_event_queries(); void capture_begin(MempoolId_t pool={0, 0}, cudaStreamCaptureMode capture_mode = cudaStreamCaptureModeGlobal); void capture_end(); void replay(); @@ -33,6 +38,8 @@ struct TORCH_CUDA_CPP_API CUDAGraph { cudaGraphExec_t graph_exec_ = NULL; #endif + static std::atomic pending_event_queries; + // internal states so reset() can do its best cleaning up // Set to true in capture_end if cudaStreamEndCapture succeeded // Set back to false soon after, when graph_ is consumed by cudaGraphInstantiate diff --git a/aten/src/ATen/cuda/CachingHostAllocator.cpp b/aten/src/ATen/cuda/CachingHostAllocator.cpp index 36531b6412771..22dbb661f18b4 100644 --- a/aten/src/ATen/cuda/CachingHostAllocator.cpp +++ b/aten/src/ATen/cuda/CachingHostAllocator.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -397,7 +398,7 @@ class CUDAHostAllocator { (void*)devptr == (void*)ptr, "Host and device pointer dont match with cudaHostRegister. " "Please dont use this feature by setting " - "PYTORCH_PINNED_ALLOC_CONF=use_cuda_host_register:False (default)", + "PYTORCH_CUDA_ALLOC_CONF=use_cuda_host_register:False (default)", ""); } @@ -412,18 +413,31 @@ class CUDAHostAllocator { size_t numMapThreads = c10::cuda::CUDACachingAllocator:: CUDAAllocatorConfig::pinned_num_register_threads(); if ((numMapThreads > 1) && (roundSize >= (pageSize * numMapThreads))) { + // parallelize the mapping of pages with a threadpool auto* pool = getThreadPool(); + std::vector> promises; + std::vector> futures; + promises.reserve(numMapThreads); + futures.reserve(numMapThreads); + for (size_t i = 0; i < numMapThreads; i++) { - pool->run(std::bind( - &CUDAHostAllocator::mapPagesForRegister, - this, + promises.emplace_back(); + futures.push_back(promises[i].get_future()); + auto task = [this, i, ptr, roundSize, numMapThreads, pageSize, &promises]() mutable { + mapPagesForRegister( *ptr, roundSize, i, // thread task-id numMapThreads, - pageSize)); + pageSize); + // set the promise when mapping pages are done + promises[i].set_value(); + }; + pool->run(task); + } + for (auto& future : futures) { + future.wait(); } - pool->waitWorkComplete(); } else { // Map pages in the same thread mapPagesForRegister(*ptr, roundSize, 0, 1, pageSize); diff --git a/aten/src/ATen/functorch/BatchRulesConvolution.cpp b/aten/src/ATen/functorch/BatchRulesConvolution.cpp index 2db15c8028969..c25c4972da25d 100644 --- a/aten/src/ATen/functorch/BatchRulesConvolution.cpp +++ b/aten/src/ATen/functorch/BatchRulesConvolution.cpp @@ -17,7 +17,7 @@ namespace at { namespace functorch { // we do not support batch_group_count (which is needed for convolution backwards). // Instead, there's a convolution_backward op that needs a batching rule. 
static std::tuple> -convolution_batch_rule(const Tensor& lhs, optional lhs_bdim, const Tensor& rhs, optional rhs_bdim, const optional& bias, optional bias_bdim, IntArrayRef stride, c10::SymIntArrayRef padding, IntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, int64_t groups) { +convolution_batch_rule(const Tensor& lhs, optional lhs_bdim, const Tensor& rhs, optional rhs_bdim, const optional& bias, optional bias_bdim, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups) { DimVector lhs_spec(stride.size() + 2); std::iota(lhs_spec.begin(), lhs_spec.end(), 0); DimVector rhs_spec = lhs_spec; @@ -43,13 +43,13 @@ convolution_batch_rule(const Tensor& lhs, optional lhs_bdim, const Tens if (lhs_bdim && !rhs_bdim) { auto new_x = reshape_dim_into(*lhs_bdim, lhs_spec[0], lhs); auto out = at::convolution_symint(new_x, rhs, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups); - out = reshape_dim_outof(out_spec[0], lhs.sizes()[*lhs_bdim], out); + out = reshape_dim_outof_symint(out_spec[0], lhs.sizes()[*lhs_bdim], out); result = std::make_tuple(out, out_spec[0]); } else if (!lhs_bdim && rhs_bdim) { if (groups == 1) { auto new_w = reshape_dim_into(*rhs_bdim, rhs_spec[0], rhs); auto out = at::convolution_symint(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups); - out = reshape_dim_outof(out_spec[1], rhs.size(*rhs_bdim), out); + out = reshape_dim_outof_symint(out_spec[1], rhs.size(*rhs_bdim), out); result = std::make_tuple(out, out_spec[1]); } else { if (transposed) { @@ -64,9 +64,9 @@ convolution_batch_rule(const Tensor& lhs, optional lhs_bdim, const Tens // NIHW, I(BO)HW -> N(GBO)HW auto out = at::convolution_symint(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups); // N(GBO)HW -> NG(BO)HW - out = reshape_dim_outof(1, groups, out); + out = reshape_dim_outof_symint(1, groups, out); // NG(BO)HW -> NGBOHW - out = reshape_dim_outof(2, rhs.size(*rhs_bdim), out); + out = reshape_dim_outof_symint(2, rhs.size(*rhs_bdim), out); // NGBOHW -> NB(GO)HW out = reshape_dim_into(1, 2, out); result = std::make_tuple(out, 1); @@ -78,7 +78,7 @@ convolution_batch_rule(const Tensor& lhs, optional lhs_bdim, const Tens // (PyTorch convs have a fixed dimension order) // B(GO)IHW -> BGOIHW - auto new_w = reshape_dim_outof(0 + (*rhs_bdim == 0), groups, rhs); + auto new_w = reshape_dim_outof_symint(0 + (*rhs_bdim == 0), groups, rhs); // BGOIHW -> G(BO)IHW new_w = reshape_dim_into(*rhs_bdim + (*rhs_bdim > 0), 1, new_w); // G(BO)IHW -> (GBO)IHW @@ -86,9 +86,9 @@ convolution_batch_rule(const Tensor& lhs, optional lhs_bdim, const Tens // N(GI)HW, (GBO)IHW -> N(GBO)HW auto out = at::convolution_symint(lhs, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups); // N(GBO)HW -> NG(BO)HW - out = reshape_dim_outof(1, groups, out); + out = reshape_dim_outof_symint(1, groups, out); // NG(BO)HW -> NGBOHW - out = reshape_dim_outof(2, rhs.size(*rhs_bdim), out); + out = reshape_dim_outof_symint(2, rhs.size(*rhs_bdim), out); // NGBOHW -> NB(GO)HW out = reshape_dim_into(1, 2, out); result = std::make_tuple(out, 1); @@ -100,7 +100,7 @@ convolution_batch_rule(const Tensor& lhs, optional lhs_bdim, const Tens auto dim_with_groups = transposed ? 
1 : 0; auto new_w = reshape_dim_into(*rhs_bdim, rhs_spec[dim_with_groups], rhs); auto out = at::convolution_symint(new_x, new_w, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups); - out = reshape_dim_outof(out_spec[1], lhs.sizes()[*lhs_bdim], out); + out = reshape_dim_outof_symint(out_spec[1], lhs.sizes()[*lhs_bdim], out); result = std::make_tuple(out, out_spec[1]); } else { result = std::make_tuple(at::convolution_symint(lhs, rhs, unbatched_bias, stride, padding, dilation, transposed, output_padding, groups), nullopt); @@ -244,8 +244,8 @@ convolution_backward_input_batch_rule( const Tensor& grad_output, optional grad_output_bdim, const Tensor& input, optional input_bdim, const Tensor& weight, optional weight_bdim, - IntArrayRef stride, c10::SymIntArrayRef padding, IntArrayRef dilation, bool transposed, - c10::SymIntArrayRef output_padding, int64_t groups) { + c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, + c10::SymIntArrayRef output_padding, c10::SymInt groups) { const std::array mask = {true, false, false}; if (grad_output_bdim && weight_bdim) { // regular: BNO, BOI -> N(BO), (BO)I -> N(BI) @@ -296,7 +296,7 @@ convolution_backward_input_batch_rule( } else { // N(GO), B(GI)O -> N(GO), (GBI)O -> N(GBI) auto weight_ = moveBatchDimToFront(weight, weight_bdim); // B(GI)O - weight_ = reshape_dim_outof(1, groups, weight_); // BGIO + weight_ = reshape_dim_outof_symint(1, groups, weight_); // BGIO weight_ = weight_.transpose(0, 1); // GBIO weight_ = weight_.flatten(0, 2); // (GBI)O const auto dummy_input = make_dummy(input, input_bdim, 1, batch_size); @@ -306,8 +306,8 @@ convolution_backward_input_batch_rule( grad_input = std::get<0>(result); // N(GBI) } // N(GBI) -> NG(BI) -> NGBI -> NBGI -> NB(GI) - grad_input = reshape_dim_outof(1, groups, grad_input); - grad_input = reshape_dim_outof(2, batch_size, grad_input); + grad_input = reshape_dim_outof_symint(1, groups, grad_input); + grad_input = reshape_dim_outof_symint(2, batch_size, grad_input); grad_input = grad_input.transpose(1, 2); grad_input = reshape_dim_into(2, 2, grad_input); return std::make_tuple(grad_input, 1); @@ -325,8 +325,8 @@ convolution_backward_weight_batch_rule( const Tensor& grad_output, optional grad_output_bdim, const Tensor& input, optional input_bdim, const Tensor& weight, optional weight_bdim, - IntArrayRef stride, c10::SymIntArrayRef padding, IntArrayRef dilation, bool transposed, - c10::SymIntArrayRef output_padding, int64_t groups) { + c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, + c10::SymIntArrayRef output_padding, c10::SymInt groups) { const std::array mask = {false, true, false}; if (grad_output_bdim && input_bdim) { // BNO, BNI -> N(BO), N(BI) -> (BO)I (regular) (BI)O (transposed) @@ -338,7 +338,7 @@ convolution_backward_weight_batch_rule( grad_output_, input_, dummy_weight, nullopt, stride, padding, dilation, transposed, output_padding, groups * batch_size, mask); auto grad_weight = std::get<1>(result); - grad_weight = reshape_dim_outof(0, batch_size, grad_weight); + grad_weight = reshape_dim_outof_symint(0, batch_size, grad_weight); return std::make_tuple(grad_weight, 0); } else if (grad_output_bdim && !input_bdim) { const auto batch_size = grad_output.size(*grad_output_bdim); @@ -352,11 +352,11 @@ convolution_backward_weight_batch_rule( grad_output_, input, dummy_weight, nullopt, stride, padding, dilation, transposed, output_padding, groups, mask); auto grad_weight = 
std::get<1>(result); - grad_weight = reshape_dim_outof(out_ch_dim, batch_size, grad_weight); + grad_weight = reshape_dim_outof_symint(out_ch_dim, batch_size, grad_weight); return std::make_tuple(grad_weight, out_ch_dim); } else { auto grad_output_ = moveBatchDimToFront(grad_output, grad_output_bdim); // BN(GO) - grad_output_ = reshape_dim_outof(2, groups, grad_output_); // BNGO + grad_output_ = reshape_dim_outof_symint(2, groups, grad_output_); // BNGO grad_output_ = grad_output_.movedim(0, 2); // NGBO grad_output_ = grad_output_.flatten(1, 3); // N(GBO) if (!transposed) { @@ -366,7 +366,7 @@ convolution_backward_weight_batch_rule( grad_output_, input, dummy_weight, nullopt, stride, padding, dilation, transposed, output_padding, groups, mask); auto grad_weight = std::get<1>(result); - grad_weight = grad_weight.unflatten(0, { groups, batch_size, -1 }); // GBOI + grad_weight = grad_weight.unflatten_symint(0, { groups, batch_size, -1 }); // GBOI grad_weight = grad_weight.transpose(0, 1); // BGOI grad_weight = grad_weight.flatten(1, 2); // B(GO)I return std::make_tuple(grad_weight, 0); @@ -377,7 +377,7 @@ convolution_backward_weight_batch_rule( grad_output_, input, dummy_weight, nullopt, stride, padding, dilation, transposed, output_padding, groups, mask); auto grad_weight = std::get<1>(result); - grad_weight = reshape_dim_outof(1, batch_size, grad_weight); + grad_weight = reshape_dim_outof_symint(1, batch_size, grad_weight); return std::make_tuple(grad_weight, 1); } } @@ -393,11 +393,11 @@ convolution_backward_weight_batch_rule( grad_output, input_, dummy_weight, nullopt, stride, padding, dilation, transposed, output_padding, groups, mask); auto grad_weight = std::get<1>(result); - grad_weight = reshape_dim_outof(in_ch_dim, batch_size, grad_weight); + grad_weight = reshape_dim_outof_symint(in_ch_dim, batch_size, grad_weight); return std::make_tuple(grad_weight, in_ch_dim); } else { auto input_ = moveBatchDimToFront(input, input_bdim); // BN(GI) - input_ = reshape_dim_outof(2, groups, input_); // BNGI + input_ = reshape_dim_outof_symint(2, groups, input_); // BNGI input_ = input_.movedim(0, 2); // NGBI input_ = input_.flatten(1, 3); // N(GBI) if (!transposed) { @@ -407,7 +407,7 @@ convolution_backward_weight_batch_rule( grad_output, input_, dummy_weight, nullopt, stride, padding, dilation, transposed, output_padding, groups, mask); auto grad_weight = std::get<1>(result); - grad_weight = reshape_dim_outof(1, batch_size, grad_weight); + grad_weight = reshape_dim_outof_symint(1, batch_size, grad_weight); return std::make_tuple(grad_weight, 1); } else { // transposed: N(GO), BN(GI) -> N(GO), N(GBI) -> (GBI)O @@ -416,7 +416,7 @@ convolution_backward_weight_batch_rule( grad_output, input_, dummy_weight, nullopt, stride, padding, dilation, transposed, output_padding, groups, mask); auto grad_weight = std::get<1>(result); - grad_weight = grad_weight.unflatten(0, { groups, batch_size, -1 }); // GBIO + grad_weight = grad_weight.unflatten_symint(0, { groups, batch_size, -1 }); // GBIO grad_weight = grad_weight.transpose(0, 1); // BGIO grad_weight = grad_weight.flatten(1, 2); // B(GI)O return std::make_tuple(grad_weight, 0); @@ -436,8 +436,8 @@ convolution_backward_weight_batch_rule( static std::tuple convolution_backward_plumbing( const Tensor& grad_output_, const Tensor& input_, const Tensor& weight_, const c10::OptionalArrayRef bias_sizes_opt, - IntArrayRef stride, c10::SymIntArrayRef padding, IntArrayRef dilation, bool transposed, - c10::SymIntArrayRef output_padding, int64_t groups, std::array 
output_mask) { + c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, + c10::SymIntArrayRef output_padding, c10::SymInt groups, std::array output_mask) { const auto maybe_layer = maybeCurrentDynamicLayer(); vmap_check_escaped(maybe_layer, "convolution_backward_plumbing"); int64_t cur_level = maybe_layer->layerId(); diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp index 9d1a681bf2c1f..64cefc90e0d85 100644 --- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp +++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp @@ -314,10 +314,10 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) { m.impl("conv1d", native::conv1d_symint); m.impl("conv2d", native::conv2d_symint); m.impl("conv3d", native::conv3d_symint); - OP_DECOMPOSE2(conv1d, padding); - OP_DECOMPOSE2(conv2d, padding); - OP_DECOMPOSE2(conv3d, padding); - OP_DECOMPOSE(_convolution_mode); + m.impl("conv1d.padding", native::conv1d_padding_symint); + m.impl("conv2d.padding", native::conv2d_padding_symint); + m.impl("conv3d.padding", native::conv3d_padding_symint); + m.impl("_convolution_mode", native::_convolution_mode_symint); OP_DECOMPOSE(type_as); OP_DECOMPOSE(linalg_diagonal); OP_DECOMPOSE(diagonal_copy); diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 5d4cb697979e8..5d2691b9761ee 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -118,8 +118,8 @@ enum class ConvBackend { // This overload is exposed to python for testing, etc. TORCH_API ConvBackend select_conv_backend( const Tensor& input, const Tensor& weight, const c10::optional& bias_opt, - IntArrayRef stride, SymIntArrayRef padding, IntArrayRef dilation, - bool transposed, SymIntArrayRef output_padding, int64_t groups, const at::OptionalSymIntArrayRef bias_sizes_opt); + SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef dilation, + bool transposed, SymIntArrayRef output_padding, c10::SymInt groups, const at::OptionalSymIntArrayRef bias_sizes_opt); TORCH_API at::MemoryFormat _determine_backend_memory_format(const Tensor& input, const Tensor& weight, @@ -211,7 +211,7 @@ static void convolution_shape_check( template static inline std::vector _conv_output_size( ArrayRef input_size, ArrayRef weight_size, - ArrayRef padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef() + ArrayRef padding, ArrayRef stride, ArrayRef dilation = ArrayRef() ) { // ASSERT(input_size.size() > 2) // ASSERT(input_size.size() == weight_size.size()) @@ -237,7 +237,7 @@ static inline std::vector conv_output_size( static inline std::vector conv_output_size( SymIntArrayRef input_size, SymIntArrayRef weight_size, - SymIntArrayRef padding, IntArrayRef stride, IntArrayRef dilation = IntArrayRef() + SymIntArrayRef padding, SymIntArrayRef stride, SymIntArrayRef dilation = SymIntArrayRef() ) { return _conv_output_size(input_size, weight_size, padding, stride, dilation); } @@ -245,7 +245,7 @@ static inline std::vector conv_output_size( template std::vector _conv_input_size( ArrayRef output_size, ArrayRef weight_size, - ArrayRef padding, ArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups + ArrayRef padding, ArrayRef output_padding, ArrayRef stride, ArrayRef dilation, T groups ) { // ASSERT(output_size.size() > 2) // ASSERT(output_size.size() == weight_size.size()) @@ -263,7 +263,7 @@ std::vector _conv_input_size( static inline std::vector conv_input_size( 
SymIntArrayRef output_size, SymIntArrayRef weight_size, - SymIntArrayRef padding, SymIntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups + SymIntArrayRef padding, SymIntArrayRef output_padding, SymIntArrayRef stride, SymIntArrayRef dilation, c10::SymInt groups ) { return _conv_input_size(output_size, weight_size, padding, output_padding, stride, dilation, groups); } diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index ef7c3bfd98f27..9c31026af54cf 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -93,7 +93,7 @@ static bool conv_benchmark_empty_cache = true; // Check workload to activate fast depthwise FP16 cudnn conv kernels template -bool check_cudnn_depthwise_workload(const at::Tensor& input, int stride) { +bool check_cudnn_depthwise_workload(const at::Tensor& input, T stride) { auto w = at::symint::size(input, 3); // same as h auto ch = at::symint::size(input, 1); auto bs = at::symint::size(input, 0); @@ -216,7 +216,7 @@ bool check_cudnn_depthwise_workload(const at::Tensor& input, int stride) { // simplified version for cudnn 8.2 and above template -bool check_cudnn_depthwise_workload_with_filter(const at::Tensor& input, int stride, const at::Tensor& weight) { +bool check_cudnn_depthwise_workload_with_filter(const at::Tensor& input, T stride, const at::Tensor& weight) { // 1D conv if(at::symint::size(input, 2) == 1 && stride == 1){ return true; @@ -269,9 +269,9 @@ static bool xnnpack_use_convolution2d( const Tensor& weight, const at::OptionalSymIntArrayRef bias_sizes_opt, const SymIntArrayRef padding, - const IntArrayRef stride, - const IntArrayRef dilation, - const int64_t groups, + const SymIntArrayRef stride, + const SymIntArrayRef dilation, + const c10::SymInt groups, const bool transposed) { // Never use xnnpack for symbolic tracing return false; @@ -284,12 +284,12 @@ static bool xnnpack_use_convolution2d( // int64_t template struct ConvParams { - std::vector stride; + std::vector stride; std::vector padding; - std::vector dilation; + std::vector dilation; bool transposed; std::vector output_padding; - int groups; + T groups; bool benchmark; bool deterministic; bool cudnn_enabled; @@ -644,7 +644,7 @@ static void check_shape_forward(const at::Tensor& input, const ConvParams& params) { int64_t k = input.ndimension(); int64_t weight_dim = weight_sizes.size(); - int64_t groups = params.groups; + auto groups = params.groups; const auto& padding = params.padding; const auto& dilation = params.dilation; bool transposed = params.transposed; @@ -831,12 +831,12 @@ at::Tensor complex_convolution( const Tensor& input, const Tensor& weight, const Tensor& bias, - IntArrayRef stride, + SymIntArrayRef stride, SymIntArrayRef padding, - IntArrayRef dilation, + SymIntArrayRef dilation, bool transposed, SymIntArrayRef output_padding, - int64_t groups) { + c10::SymInt groups) { check_input_same_type_as_parameters(input, weight, bias); Tensor i_r, i_i, w_r, w_i; std::tie(i_r, i_i) = complex_to_real(input.resolve_conj()); @@ -871,10 +871,10 @@ at::Tensor complex_convolution_mode( const at::Tensor& input, const at::Tensor& weight, const c10::optional& bias_opt, - at::IntArrayRef stride, + c10::SymIntArrayRef stride, c10::string_view padding, - at::IntArrayRef dilation, - int64_t groups) { + c10::SymIntArrayRef dilation, + c10::SymInt groups) { auto bias = bias_opt.value_or(Tensor()); check_input_same_type_as_parameters(input, weight, bias); Tensor i_r, i_i, w_r, w_i; @@ -884,15 +884,15 
@@ at::Tensor complex_convolution_mode( // See [NOTE] Complex Convolution Tensor a, b, c; if (!bias.defined()) { - a = at::_convolution_mode(i_r, w_r, bias, stride, padding, dilation, groups); - b = at::_convolution_mode(i_i, w_i, bias, stride, padding, dilation, groups); - c = at::_convolution_mode(i_r + i_i, w_r + w_i, bias, stride, padding, dilation, groups); + a = at::_convolution_mode_symint(i_r, w_r, bias, stride, padding, dilation, groups); + b = at::_convolution_mode_symint(i_i, w_i, bias, stride, padding, dilation, groups); + c = at::_convolution_mode_symint(i_r + i_i, w_r + w_i, bias, stride, padding, dilation, groups); } else { Tensor b_r, b_i; std::tie(b_r, b_i) = complex_to_real(bias.resolve_conj()); - a = at::_convolution_mode(i_r, w_r, b_r, stride, padding, dilation, groups); - b = at::_convolution_mode(i_i, w_i, Tensor(), stride, padding, dilation, groups); - c = at::_convolution_mode(i_r + i_i, w_r + w_i, b_r + b_i, stride, padding, dilation, groups); + a = at::_convolution_mode_symint(i_r, w_r, b_r, stride, padding, dilation, groups); + b = at::_convolution_mode_symint(i_i, w_i, Tensor(), stride, padding, dilation, groups); + c = at::_convolution_mode_symint(i_r + i_i, w_r + w_i, b_r + b_i, stride, padding, dilation, groups); } auto i = c10::Scalar(c10::complex(0, 1)); @@ -903,7 +903,7 @@ at::Tensor complex_convolution_mode( at::Tensor conv1d_symint( const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, - IntArrayRef stride, SymIntArrayRef padding, IntArrayRef dilation, int64_t groups) { + SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef dilation, c10::SymInt groups) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; @@ -922,7 +922,7 @@ at::Tensor conv1d_symint( at::Tensor conv2d_symint( const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, - IntArrayRef stride, SymIntArrayRef padding, IntArrayRef dilation, int64_t groups) { + SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef dilation, c10::SymInt groups) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; @@ -949,7 +949,7 @@ at::Tensor conv2d_symint( at::Tensor conv3d_symint( const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, - IntArrayRef stride, SymIntArrayRef padding, IntArrayRef dilation, int64_t groups) { + SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef dilation, c10::SymInt groups) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; @@ -969,7 +969,7 @@ at::Tensor conv3d_symint( static Tensor convolution_same( const Tensor &input, const Tensor &weight, const Tensor &bias, - IntArrayRef stride, IntArrayRef dilation, int64_t groups) { + SymIntArrayRef stride, SymIntArrayRef dilation, c10::SymInt groups) { auto k = weight.dim(); TORCH_CHECK(k > 2, "weight should have at least three dimensions"); @@ -1031,10 +1031,10 @@ static Tensor convolution_same( dilation, false, output_padding, groups); } -Tensor _convolution_mode( +Tensor _convolution_mode_symint( const Tensor& input, const Tensor& weight, const c10::optional& bias_opt, - IntArrayRef stride, c10::string_view padding, IntArrayRef dilation, - int64_t groups) { + SymIntArrayRef stride, 
c10::string_view padding, SymIntArrayRef dilation, + c10::SymInt groups) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; @@ -1043,18 +1043,16 @@ Tensor _convolution_mode( return at::native::convolution_same( input, weight, bias, stride, dilation, groups); } else if (padding == "valid") { - // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) - const int64_t padding_[] = {0}; - return at::convolution( - input, weight, bias, stride, padding_, dilation, false, padding_, groups); + return at::convolution_symint( + input, weight, bias, stride, {{0}}, dilation, false, {{0}}, groups); } TORCH_CHECK(false, "Invalid padding string: '", padding, "'"); } -at::Tensor conv1d( +at::Tensor conv1d_padding_symint( const Tensor& input_, const Tensor& weight, const c10::optional& bias, - IntArrayRef stride, c10::string_view padding, IntArrayRef dilation, - int64_t groups) { + c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, + c10::SymInt groups) { Tensor input; bool is_batched; std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 1, "conv1d"); @@ -1062,15 +1060,15 @@ at::Tensor conv1d( if (at::isComplexType(input_.scalar_type())) { output = complex_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); } else { - output = at::_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); + output = at::_convolution_mode_symint(input, weight, bias, stride, std::move(padding), dilation, groups); } return is_batched ? std::move(output) : output.squeeze(0); } -at::Tensor conv2d( +at::Tensor conv2d_padding_symint( const Tensor& input_, const Tensor& weight, const c10::optional& bias, - IntArrayRef stride, c10::string_view padding, IntArrayRef dilation, - int64_t groups) { + c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, + c10::SymInt groups) { Tensor input; bool is_batched; std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 2, "conv2d"); @@ -1078,15 +1076,15 @@ at::Tensor conv2d( if (at::isComplexType(input_.scalar_type())) { output = complex_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); } else { - output = at::_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); + output = at::_convolution_mode_symint(input, weight, bias, stride, std::move(padding), dilation, groups); } return is_batched ? std::move(output) : output.squeeze(0); } -at::Tensor conv3d( +at::Tensor conv3d_padding_symint( const Tensor& input_, const Tensor& weight, const c10::optional& bias, - IntArrayRef stride, c10::string_view padding, IntArrayRef dilation, - int64_t groups) { + c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, + c10::SymInt groups) { Tensor input; bool is_batched; std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 3, "conv3d"); @@ -1094,14 +1092,14 @@ at::Tensor conv3d( if (at::isComplexType(input_.scalar_type())) { output = complex_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); } else { - output = at::_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); + output = at::_convolution_mode_symint(input, weight, bias, stride, std::move(padding), dilation, groups); } return is_batched ? 
std::move(output) : output.squeeze(0); } at::Tensor conv_transpose1d_symint( const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, - IntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef output_padding, int64_t groups, IntArrayRef dilation) { + SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef output_padding, c10::SymInt groups, SymIntArrayRef dilation) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; @@ -1122,7 +1120,7 @@ at::Tensor conv_transpose1d_symint( at::Tensor conv_transpose2d_symint( const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, - IntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef output_padding, int64_t groups, IntArrayRef dilation) { + SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef output_padding, c10::SymInt groups, SymIntArrayRef dilation) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; @@ -1143,7 +1141,7 @@ at::Tensor conv_transpose2d_symint( at::Tensor conv_transpose3d_symint( const Tensor& input_, const Tensor& weight, const c10::optional& bias_opt, - IntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef output_padding, int64_t groups, IntArrayRef dilation) { + SymIntArrayRef stride, SymIntArrayRef padding, SymIntArrayRef output_padding, c10::SymInt groups, SymIntArrayRef dilation) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; @@ -1302,8 +1300,8 @@ ConvBackend _select_conv_backend( // Selects a backend for convolution based on the inputs and params. ConvBackend select_conv_backend( const Tensor& input_r, const Tensor& weight_r, const c10::optional& bias_opt, - IntArrayRef stride_, SymIntArrayRef padding_, IntArrayRef dilation_, - bool transposed_, SymIntArrayRef output_padding_, int64_t groups_, const at::OptionalSymIntArrayRef bias_sizes_opt) { + SymIntArrayRef stride_, SymIntArrayRef padding_, SymIntArrayRef dilation_, + bool transposed_, SymIntArrayRef output_padding_, c10::SymInt groups_, const at::OptionalSymIntArrayRef bias_sizes_opt) { c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp index bc3e3505185ad..cceb8baf9b063 100644 --- a/aten/src/ATen/native/EmbeddingBag.cpp +++ b/aten/src/ATen/native/EmbeddingBag.cpp @@ -234,7 +234,7 @@ index_select_add( offsets_data = offsets_include_last.data(); } #if defined(USE_FBGEMM) - constexpr bool isbf16 = std::is_same::value ? false : true; + constexpr bool isbf16 = std::is_same_v ? false : true; auto kernel_16bit_index_t = fbgemm_kernel_cache ? fbgemm_kernel_cache ->getCallback(ddim) @@ -245,7 +245,8 @@ index_select_add( /* prefetch */ 16, /* is_weight_positional */ false, /* use_offsets */ true, - /* isbf16*/ isbf16); + /* is_bf16_out */ isbf16, + /* is_bf16_in */ isbf16); at::parallel_for( 0, output_size, 1, [&](index_t start_idx, index_t end_idx) { bool success = kernel_16bit_index_t( @@ -607,7 +608,7 @@ index_select_scale_add( auto* scale_data_fp32 = scale_fp32.mutable_data_ptr(); #if defined(USE_FBGEMM) - constexpr bool isbf16 = std::is_same::value ? 
false : true; + constexpr bool isbf16 = std::is_same_v ? false : true; if constexpr (isbf16) { fbgemm::Bfloat16ToFloat_simd( reinterpret_cast(scale_data), @@ -629,7 +630,8 @@ index_select_scale_add( /* prefetch */ 16, /* is_weight_positional */ false, /* use_offsets */ true, - /* isbf16*/ isbf16); + /* is_bf16_out */ isbf16, + /* is_bf16_in */ isbf16); at::parallel_for( 0, output_size, 1, [&](index_t start_idx, index_t end_idx) { bool success = kernel_16bit_index_t( diff --git a/aten/src/ATen/native/Pool.h b/aten/src/ATen/native/Pool.h index 4bb6c258988a3..33a733273a80a 100644 --- a/aten/src/ATen/native/Pool.h +++ b/aten/src/ATen/native/Pool.h @@ -76,7 +76,7 @@ static inline T pooling_output_shape( template std::pair _pooling_same_mode_padding_lr( - T inputSize, T kernelSize, int64_t stride, int64_t dilation) { + T inputSize, T kernelSize, T stride, T dilation) { // NOTE: with strides, the output shape is ceil(inputSize/stride) auto total_padding = T(dilation) * (kernelSize - 1); @@ -99,8 +99,8 @@ inline std::pair pooling_same_mode_padding_lr( } inline std::pair pooling_same_mode_padding_lr( - c10::SymInt inputSize, c10::SymInt kernelSize, int64_t stride, int64_t dilation) { - return _pooling_same_mode_padding_lr(std::move(inputSize), std::move(kernelSize), stride, dilation); + c10::SymInt inputSize, c10::SymInt kernelSize, c10::SymInt stride, c10::SymInt dilation) { + return _pooling_same_mode_padding_lr(std::move(inputSize), std::move(kernelSize), std::move(stride), std::move(dilation)); } // AveragePool2d/DilatedMaxPool2d (forward) diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index d84e7adc27a14..adf23e53fa0d7 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -2096,7 +2096,27 @@ bool cpu_equal(const Tensor& self, const Tensor& other) { && self.layout() == other.layout() && self.is_neg() == other.is_neg() && self.is_conj() == other.is_conj()) { - return true; + if (c10::isIntegralType(self.scalar_type(), /*includeBool=*/true)) { + return true; + } + std::atomic result{true}; + auto iter = TensorIteratorConfig().add_input(self).build(); + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "equal_notnan_cpu", [&] { + iter.for_each([&](char** data, const int64_t *strides, int64_t dim_size) { + if (!result) { + return; + } + char* self_data = data[0]; + for (C10_UNUSED const auto i : c10::irange(dim_size)) { + if (isnan_(c10::load(self_data))) { + result = false; + return; + } + self_data += strides[0]; + } + }); + }); + return result.load(); } std::atomic result{true}; diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 5522368ff64a5..bc932f05d7d0e 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1615,59 +1615,67 @@ variants: method manual_cpp_binding: True -- func: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups) -> Tensor +- func: convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups) -> Tensor dispatch: CompositeExplicitAutograd: convolution autogen: convolution.out tags: core -- func: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? 
bias_sizes, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) +- func: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) dispatch: CompositeExplicitAutograd, CUDA: convolution_backward autogen: convolution_backward.out tags: core -- func: convolution_overrideable(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor +- func: convolution_overrideable(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups) -> Tensor dispatch: CompositeExplicitAutograd: convolution_overrideable autogen: convolution_overrideable.out -- func: convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) +- func: convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) dispatch: CompositeExplicitAutograd: convolution_backward_overrideable autogen: convolution_backward_overrideable.out -- func: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor +- func: _convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor dispatch: CompositeExplicitAutograd: _convolution autogen: _convolution.out -- func: _convolution.deprecated(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor +- func: _convolution.deprecated(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, int[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor -- func: _convolution_mode(Tensor input, Tensor weight, Tensor? bias, int[] stride, str padding, int[] dilation, int groups) -> Tensor +- func: _convolution_mode(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, str padding, SymInt[] dilation, SymInt groups) -> Tensor + dispatch: + CompositeImplicitAutograd: _convolution_mode_symint -- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) +- func: _convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? 
ggb, Tensor gO, Tensor weight, Tensor self, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) -- func: conv1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, SymInt[1] padding=0, int[1] dilation=1, int groups=1) -> Tensor +- func: conv1d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, SymInt[1] padding=0, SymInt[1] dilation=1, SymInt groups=1) -> Tensor dispatch: CompositeImplicitAutograd: conv1d_symint -- func: conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor +- func: conv2d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1, SymInt groups=1) -> Tensor dispatch: CompositeImplicitAutograd: conv2d_symint -- func: conv3d(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, int[3] dilation=1, int groups=1) -> Tensor +- func: conv3d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1, SymInt groups=1) -> Tensor dispatch: CompositeImplicitAutograd: conv3d_symint -- func: conv1d.padding(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, str padding="valid", int[1] dilation=1, int groups=1) -> Tensor +- func: conv1d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, str padding="valid", SymInt[1] dilation=1, SymInt groups=1) -> Tensor cpp_no_default_args: ['bias', 'stride', 'padding'] + dispatch: + CompositeImplicitAutograd: conv1d_padding_symint -- func: conv2d.padding(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, str padding="valid", int[2] dilation=1, int groups=1) -> Tensor +- func: conv2d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, str padding="valid", SymInt[2] dilation=1, SymInt groups=1) -> Tensor cpp_no_default_args: ['bias', 'stride', 'padding'] + dispatch: + CompositeImplicitAutograd: conv2d_padding_symint -- func: conv3d.padding(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, str padding="valid", int[3] dilation=1, int groups=1) -> Tensor +- func: conv3d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[3] stride=1, str padding="valid", SymInt[3] dilation=1, SymInt groups=1) -> Tensor cpp_no_default_args: ['bias', 'stride', 'padding'] + dispatch: + CompositeImplicitAutograd: conv3d_padding_symint - func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int pad=0) -> Tensor dispatch: @@ -1677,15 +1685,15 @@ - func: conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int pad) -> (Tensor, Tensor, Tensor) # NB: we inherit the goofy argument order from PyTorch torch.nn.functional -- func: conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, SymInt[1] padding=0, SymInt[1] output_padding=0, int groups=1, int[1] dilation=1) -> Tensor +- func: conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, SymInt[1] padding=0, SymInt[1] output_padding=0, SymInt groups=1, SymInt[1] dilation=1) -> Tensor dispatch: CompositeImplicitAutograd: conv_transpose1d_symint -- func: conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int groups=1, int[2] dilation=1) -> Tensor +- func: conv_transpose2d.input(Tensor input, Tensor weight, Tensor? 
bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt groups=1, SymInt[2] dilation=1) -> Tensor dispatch: CompositeImplicitAutograd: conv_transpose2d_symint -- func: conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int groups=1, int[3] dilation=1) -> Tensor +- func: conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt groups=1, SymInt[3] dilation=1) -> Tensor dispatch: CompositeImplicitAutograd: conv_transpose3d_symint @@ -1807,32 +1815,32 @@ CUDA: cudnn_batch_norm_backward autogen: cudnn_batch_norm_backward.out -- func: cudnn_convolution(Tensor self, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor +- func: cudnn_convolution(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor dispatch: CUDA: cudnn_convolution autogen: cudnn_convolution.out -- func: cudnn_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor +- func: cudnn_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor dispatch: CUDA: cudnn_convolution_transpose autogen: cudnn_convolution_transpose.out -- func: _mps_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups) -> Tensor +- func: _mps_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor dispatch: MPS: _mps_convolution_transpose autogen: _mps_convolution_transpose.out -- func: mps_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool[2] output_mask) -> (Tensor, Tensor) +- func: mps_convolution_transpose_backward(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[2] output_mask) -> (Tensor, Tensor) dispatch: MPS: mps_convolution_transpose_backward autogen: mps_convolution_transpose_backward.out -- func: cudnn_convolution_relu(Tensor self, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor +- func: cudnn_convolution_relu(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor dispatch: CUDA: cudnn_convolution_relu autogen: cudnn_convolution_relu.out -- func: cudnn_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor +- func: cudnn_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? 
bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor dispatch: CUDA: cudnn_convolution_add_relu autogen: cudnn_convolution_add_relu.out @@ -3923,17 +3931,17 @@ # TODO: Add this function to MPS dispatch key so that we avoid declaring it in # native_functions.yaml # https://github.com/pytorch/pytorch/issues/77394 -- func: _mps_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor +- func: _mps_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor dispatch: MPS: _mps_convolution autogen: _mps_convolution.out -- func: mps_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) +- func: mps_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) dispatch: MPS: mps_convolution_backward autogen: mps_convolution_backward.out -- func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups) -> Tensor +- func: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor dispatch: CompositeExplicitAutograd: mkldnn_convolution autogen: mkldnn_convolution.out @@ -3959,26 +3967,26 @@ CUDA: miopen_batch_norm_backward autogen: miopen_batch_norm_backward.out -- func: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- func: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor dispatch: CUDA: miopen_convolution autogen: miopen_convolution.out -- func: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- func: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor dispatch: CUDA: miopen_convolution_transpose autogen: miopen_convolution_transpose.out -- func: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- func: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor dispatch: CUDA: miopen_depthwise_convolution autogen: miopen_depthwise_convolution.out -- func: miopen_convolution_relu(Tensor self, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor +- func: miopen_convolution_relu(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor dispatch: CUDA: miopen_convolution_relu -- func: miopen_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? 
bias, int[] stride, int[] padding, int[] dilation, int groups) -> Tensor +- func: miopen_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor dispatch: CUDA: miopen_convolution_add_relu @@ -4284,7 +4292,7 @@ - func: _nnpack_available() -> bool -- func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, int[2] stride=1) -> Tensor +- func: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, SymInt[2] stride=1) -> Tensor variants: function dispatch: CompositeExplicitAutograd: _nnpack_spatial_convolution @@ -4411,13 +4419,13 @@ CompositeExplicitAutogradNonFunctional: math_pixel_unshuffle autogen: pixel_unshuffle.out -- func: channel_shuffle(Tensor self, int groups) -> Tensor +- func: channel_shuffle(Tensor self, SymInt groups) -> Tensor dispatch: CPU, CUDA: channel_shuffle QuantizedCPU: channel_shuffle_quantized_cpu autogen: channel_shuffle.out -- func: native_channel_shuffle(Tensor self, int groups) -> Tensor +- func: native_channel_shuffle(Tensor self, SymInt groups) -> Tensor dispatch: CPU: channel_shuffle_cpu CompositeImplicitAutograd: math_channel_shuffle @@ -7239,14 +7247,14 @@ CPU: dense_to_mkldnn autogen: to_mkldnn.out -- func: mkldnn_reorder_conv2d_weight(Tensor self, int[2] padding=0, int[2] stride=1, int[2] dilation=1, int groups=1, int[]? input_size=None) -> Tensor +- func: mkldnn_reorder_conv2d_weight(Tensor self, SymInt[2] padding=0, SymInt[2] stride=1, SymInt[2] dilation=1, SymInt groups=1, SymInt[]? input_size=None) -> Tensor variants: function python_module: nn dispatch: MkldnnCPU: mkldnn_reorder_conv2d_weight autogen: mkldnn_reorder_conv2d_weight.out -- func: mkldnn_reorder_conv3d_weight(Tensor self, int[3] padding=0, int[3] stride=1, int[3] dilation=1, int groups=1) -> Tensor +- func: mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1) -> Tensor variants: function python_module: nn dispatch: @@ -12611,101 +12619,101 @@ # make the operational distinction clear. tags: pointwise -- func: slow_conv_transpose2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!) +- func: slow_conv_transpose2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt[2] dilation=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn structured: True dispatch: CPU: slow_conv_transpose2d_structured_cpu CUDA: slow_conv_transpose2d_structured_cuda -- func: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int[2] dilation=1) -> Tensor +- func: slow_conv_transpose2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt[2] dilation=1) -> Tensor python_module: nn structured_delegate: slow_conv_transpose2d.out -- func: slow_conv_transpose3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!) +- func: slow_conv_transpose3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? 
bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!) python_module: nn dispatch: CPU: slow_conv_transpose3d_out_cpu CUDA: slow_conv_transpose3d_out_cuda -- func: slow_conv_transpose3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int[3] dilation=1) -> Tensor +- func: slow_conv_transpose3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt[3] dilation=1) -> Tensor python_module: nn dispatch: CPU: slow_conv_transpose3d_cpu CUDA: slow_conv_transpose3d_cuda -- func: thnn_conv2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0, *, Tensor(a!) out) -> Tensor(a!) +- func: thnn_conv2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -- func: thnn_conv2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, int[2] padding=0) -> Tensor +- func: thnn_conv2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0) -> Tensor python_module: nn -- func: _slow_conv2d_forward.output(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding, *, Tensor(a!) output) -> Tensor(a!) +- func: _slow_conv2d_forward.output(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, *, Tensor(a!) output) -> Tensor(a!) python_module: nn dispatch: CPU: slow_conv2d_forward_out_cpu CUDA: slow_conv2d_forward_out_cuda -- func: _slow_conv2d_forward(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding) -> Tensor +- func: _slow_conv2d_forward(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding) -> Tensor python_module: nn dispatch: CPU: slow_conv2d_forward_cpu CUDA: slow_conv2d_forward_cuda -- func: _slow_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, *, Tensor(a!) grad_input, Tensor(b!) grad_weight, Tensor(c!) grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!)) +- func: _slow_conv2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, *, Tensor(a!) grad_input, Tensor(b!) grad_weight, Tensor(c!) grad_bias) -> (Tensor(a!), Tensor(b!), Tensor(c!)) python_module: nn dispatch: CPU: slow_conv2d_backward_out_cpu CUDA: slow_conv2d_backward_out_cuda -- func: _slow_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) +- func: _slow_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) python_module: nn dispatch: CPU: slow_conv2d_backward_cpu CUDA: slow_conv2d_backward_cuda autogen: _slow_conv2d_backward.output_mask_out -- func: _conv_depthwise2d.out(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, SymInt[2] padding, int[2] dilation, *, Tensor(a!) out) -> Tensor(a!) 
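The schema changes in this region migrate convolution arguments from int/int[] to SymInt/SymInt[], so sizes traced under dynamic shapes stay symbolic instead of being specialized to concrete integers. A minimal sketch of the effect, assuming the existing public torch.compile and torch._dynamo.mark_dynamic APIs; the function name pad_and_conv is illustrative only and not part of the patch:

import torch
import torch.nn.functional as F

def pad_and_conv(x, w):
    # Under dynamic-shape tracing, x.shape[-1] is a SymInt, so the derived
    # padding value flowing into the convolution is symbolic as well.
    pad = x.shape[-1] // 8
    return F.conv2d(x, w, stride=1, padding=pad)

x = torch.randn(2, 3, 16, 16)
w = torch.randn(4, 3, 3, 3)
torch._dynamo.mark_dynamic(x, 3)  # mark the last spatial dim as dynamic
compiled = torch.compile(pad_and_conv, backend="eager", dynamic=True)
print(compiled(x, w).shape)
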
+- func: _conv_depthwise2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, SymInt[2] dilation, *, Tensor(a!) out) -> Tensor(a!) use_const_ref_for_mutable_tensors: True python_module: nn dispatch: CUDA: conv_depthwise2d_cuda_out -- func: _conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, SymInt[2] padding, int[2] dilation) -> Tensor +- func: _conv_depthwise2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, SymInt[2] dilation) -> Tensor python_module: nn dispatch: CUDA: conv_depthwise2d_cuda -- func: conv_depthwise3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding, int[3] dilation) -> Tensor +- func: conv_depthwise3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, SymInt[3] dilation) -> Tensor python_module: nn dispatch: CUDA: conv_depthwise3d_cuda autogen: conv_depthwise3d.out -- func: slow_conv3d.out(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, *, Tensor(a!) out) -> Tensor(a!) +- func: slow_conv3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, *, Tensor(a!) out) -> Tensor(a!) python_module: nn -- func: slow_conv3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0) -> Tensor +- func: slow_conv3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0) -> Tensor python_module: nn -- func: slow_conv3d_forward.output(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding, *, Tensor(a!) output) -> Tensor(a!) +- func: slow_conv3d_forward.output(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, *, Tensor(a!) output) -> Tensor(a!) python_module: nn dispatch: CPU: slow_conv3d_forward_out_cpu -- func: slow_conv3d_forward(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding) -> Tensor +- func: slow_conv3d_forward(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding) -> Tensor python_module: nn dispatch: CPU: slow_conv3d_forward_cpu -- func: slow_conv_dilated2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, int[2] dilation=1) -> Tensor +- func: slow_conv_dilated2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1) -> Tensor python_module: nn dispatch: CPU: slow_conv_dilated2d_cpu CUDA: slow_conv_dilated2d_cuda autogen: slow_conv_dilated2d.out -- func: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, int[3] dilation=1) -> Tensor +- func: slow_conv_dilated3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? 
bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1) -> Tensor python_module: nn dispatch: CPU: slow_conv_dilated3d_cpu diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 400d57b7c1f8f..0511b8fd0f63c 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -936,8 +936,7 @@ if(USE_ROCM) torch_compile_options(torch_hip) # see cmake/public/utils.cmake # TODO: Not totally sure if this is live or not if(USE_NCCL) - # fmt::fmt-header-only is needed by the NCCL code in torch/csrc/distributed/c10d - target_link_libraries(torch_hip PRIVATE __caffe2_nccl fmt::fmt-header-only) + target_link_libraries(torch_hip PRIVATE __caffe2_nccl) target_compile_definitions(torch_hip PRIVATE USE_NCCL) endif() @@ -972,8 +971,7 @@ elseif(USE_CUDA) target_compile_definitions(torch_cuda PRIVATE USE_CUSPARSELT) endif() if(USE_NCCL) - # fmt::fmt-header-only is needed by the NCCL code in torch/csrc/distributed/c10d - target_link_libraries(torch_cuda PRIVATE __caffe2_nccl fmt::fmt-header-only) + target_link_libraries(torch_cuda PRIVATE __caffe2_nccl) target_compile_definitions(torch_cuda PRIVATE USE_NCCL) endif() if(USE_UCC) diff --git a/setup.py b/setup.py index 6ef6666014836..04eaf8ccc5282 100644 --- a/setup.py +++ b/setup.py @@ -926,16 +926,10 @@ def configure_extension_build(): "-Wno-unused-parameter", "-Wno-missing-field-initializers", "-Wno-unknown-pragmas", - # This is required for Python 2 declarations that are deprecated in 3. - "-Wno-deprecated-declarations", # Python 2.6 requires -fno-strict-aliasing, see # http://legacy.python.org/dev/peps/pep-3123/ # We also depend on it in our code (even Python 3). "-fno-strict-aliasing", - # Clang has an unfixed bug leading to spurious missing - # braces warnings, see - # https://bugs.llvm.org/show_bug.cgi?id=21629 - "-Wno-missing-braces", ] library_dirs.append(lib_path) diff --git a/test/distributed/_tensor/test_api.py b/test/distributed/_tensor/test_api.py index d15ea96d6beea..e187279ec4c20 100644 --- a/test/distributed/_tensor/test_api.py +++ b/test/distributed/_tensor/test_api.py @@ -57,6 +57,12 @@ def test_distribute_tensor(self): self.assertTrue(dist_tensor.requires_grad) self.assertTrue(dist_tensor.is_leaf) + # test negative dim + shard_minus_spec = [Shard(-1)] + tensor_to_shard = torch.randn(3, 3 * self.world_size) + dist_tensor = distribute_tensor(tensor_to_shard, device_mesh, shard_minus_spec) + self.assertEqual(dist_tensor.placements[0].dim, 1) + @with_comms def test_distribute_tensor_errors(self): device_mesh = DeviceMesh( diff --git a/test/distributed/_tensor/test_dtensor.py b/test/distributed/_tensor/test_dtensor.py index bc2315468c81a..afd63b080417f 100644 --- a/test/distributed/_tensor/test_dtensor.py +++ b/test/distributed/_tensor/test_dtensor.py @@ -194,6 +194,14 @@ def test_from_local(self): expected_grad = torch.ones(3, 3) * 9 self.assertEqual(local_tensor_with_grad.grad, expected_grad) + @with_comms + def test_from_local_negative_dim(self): + device_mesh = DeviceMesh(self.device_type, list(range(self.world_size))) + shard_spec = [Shard(-1)] + local_tensor = torch.randn(3, 3) + sharded_tensor = DTensor.from_local(local_tensor, device_mesh, shard_spec) + self.assertEqual(sharded_tensor.placements[0].dim, 1) + @with_comms def test_to_local(self): device_mesh = DeviceMesh(self.device_type, list(range(self.world_size))) diff --git a/test/distributed/_tensor/test_math_ops.py b/test/distributed/_tensor/test_math_ops.py index 72bfd9c9d6d05..8c3ba342b087f 100644 --- a/test/distributed/_tensor/test_math_ops.py +++ 
b/test/distributed/_tensor/test_math_ops.py @@ -67,6 +67,7 @@ def test_softmax_fwd(self): dist_y = torch.nn.functional.softmax( dist_x, dim=softmax_dim, dtype=torch.float32 ) + shard_dim = shard_dim + dist_y.ndim if shard_dim < 0 else shard_dim self.assertTrue(dist_y.placements[0].is_shard(dim=shard_dim)) dist_y = dist_y.redistribute(device_mesh, [Replicate()]) self.assertEqual(dist_y.to_local(), local_y) @@ -102,6 +103,7 @@ def test_softmax_with_bwd(self): dist_softmax = dist_x.softmax(dim=softmax_dim) else: dist_softmax = dist_x.softmax(dim=softmax_dim) + shard_dim = shard_dim + dist_x.ndim if shard_dim < 0 else shard_dim self.assertTrue(dist_softmax.placements[0].is_shard(dim=shard_dim)) dist_y = dist_softmax.sum() dist_y = dist_y.redistribute(device_mesh, [Replicate()]) diff --git a/test/distributed/_tensor/test_redistribute.py b/test/distributed/_tensor/test_redistribute.py index b6ffdbd996740..77ace6f79d71a 100644 --- a/test/distributed/_tensor/test_redistribute.py +++ b/test/distributed/_tensor/test_redistribute.py @@ -222,6 +222,18 @@ def test_partial_to_shard(self): torch.ones(local_shape) * self.world_size, ) + @with_comms + def test_redistribute_negative_shard_dim(self): + device_mesh = DeviceMesh(self.device_type, list(range(self.world_size))) + local_tensor = torch.randn(12, 3, device=self.device_type, requires_grad=True) + shard_spec = [Shard(1)] + shard_minus_spec = [Shard(-1)] + + shard_tensor = distribute_tensor(local_tensor, device_mesh, shard_spec) + self.assertEqual(shard_tensor.placements[0].dim, 1) + reshard_tensor = shard_tensor.redistribute(device_mesh, shard_minus_spec) + self.assertEqual(shard_tensor.placements[0].dim, 1) + class MultiDimRedistributeTest(DTensorTestBase): @property diff --git a/test/distributed/_tensor/test_xla_integration.py b/test/distributed/_tensor/test_xla_integration.py new file mode 100644 index 0000000000000..9d4e26fc3f02a --- /dev/null +++ b/test/distributed/_tensor/test_xla_integration.py @@ -0,0 +1,120 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +# Owner(s): ["oncall: distributed"] + +import os +import unittest +from functools import wraps +from typing import Any, Callable, Dict, Tuple + +import numpy as np +import torch +from torch.distributed._tensor import DeviceMesh, distribute_tensor, Replicate, Shard +from torch.testing._internal.common_utils import run_tests, TestCase + + +# wrapper to check xla test requirements +def with_xla(func: Callable) -> Callable: + assert func is not None + + @wraps(func) # pyre-ignore[6] + def wrapper( + self, *args: Tuple[object], **kwargs: Dict[str, Any] # type: ignore[misc] + ) -> None: + # TODO(yeounoh) replace this with xr.use_spmd() when we deprecate the flag. 
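The DTensor tests above (test_api.py, test_dtensor.py, test_redistribute.py) expect Shard(-1) to be normalized against the tensor's rank so that placements report a non-negative dim. A minimal sketch of that normalization, using only standard Python/PyTorch; normalize_shard_dim is an illustrative helper, not an API from the patch:

import torch

def normalize_shard_dim(dim: int, ndim: int) -> int:
    # Same convention the tests assert: -1 on a rank-2 tensor maps to dim 1.
    return dim + ndim if dim < 0 else dim

t = torch.randn(3, 3)
assert normalize_shard_dim(-1, t.ndim) == 1
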
+ os.environ["XLA_USE_SPMD"] = "1" + try: + import torch_xla # type:ignore[import] # noqa: F401 + except ImportError as exc: + raise unittest.SkipTest("torch_xla is not installed.") from exc + self.device_type = "xla" + func(self, *args, **kwargs) # type: ignore[misc] + os.environ["XLA_USE_SPMD"] = "0" + + return wrapper + + +class DTensorXLAIntegrationTest(TestCase): + @with_xla + def test_xla_distribute_tensor_1d_shard(self): + import torch_xla.runtime as xr # type:ignore[import] + + device_count = xr.global_runtime_device_count() + if device_count > 1: + device_mesh = DeviceMesh("xla", list(range(device_count))) + shard_spec = [Shard(0)] + + for requires_grad in [True, False]: + tensor_to_shard = torch.randn( + 3 * device_count, 3, requires_grad=requires_grad + ) + dist_tensor = distribute_tensor( + tensor_to_shard, device_mesh, shard_spec + ) + # TODO(yeounoh) switch to DTensor API when XLAShardedTensor inherits DTensor + assert type(dist_tensor).__name__ == "XLAShardedTensor" + global_tensor = dist_tensor.global_tensor # type:ignore[attr-defined] + self.assertEqual( + global_tensor.size(), torch.Size([3 * device_count, 3]) + ) + local_tensor = dist_tensor.local_shards[0].data + self.assertEqual(local_tensor.size(), torch.Size([3, 3])) + if requires_grad: + self.assertTrue(dist_tensor.global_tensor.requires_grad) + self.assertTrue(dist_tensor.is_leaf) + + @with_xla + def test_xla_distribute_tensor_1d_replicate(self): + import torch_xla.runtime as xr # type:ignore[import] + + device_count = xr.global_runtime_device_count() + device_mesh = DeviceMesh("xla", list(range(device_count))) + shard_spec = [Replicate()] + + for requires_grad in [True, False]: + tensor_to_shard = torch.randn( + 3 * device_count, 3, requires_grad=requires_grad + ) + dist_tensor = distribute_tensor(tensor_to_shard, device_mesh, shard_spec) + # TODO(yeounoh) switch to DTensor API when XLAShardedTensor inherits DTensor + assert type(dist_tensor).__name__ == "XLAShardedTensor" + global_tensor = dist_tensor.global_tensor # type:ignore[attr-defined] + self.assertEqual(global_tensor.size(), torch.Size([3 * device_count, 3])) + local_tensor = dist_tensor.local_shards[0].data + self.assertEqual(local_tensor.size(), torch.Size([3 * device_count, 3])) + if requires_grad: + self.assertTrue(dist_tensor.global_tensor.requires_grad) + self.assertTrue(dist_tensor.is_leaf) + + @with_xla + def test_xla_distribute_tensor_2d(self): + import torch_xla.runtime as xr # type:ignore[import] + + device_count = xr.global_runtime_device_count() + if device_count > 1: + device_mesh = DeviceMesh( + "xla", np.array(range(device_count)).reshape(2, device_count // 2) + ) + shard_spec = [Replicate(), Shard(0)] + + for requires_grad in [True, False]: + tensor_to_shard = torch.randn( + 3 * device_count // 2, 3, requires_grad=requires_grad + ) + dist_tensor = distribute_tensor( + tensor_to_shard, device_mesh, shard_spec + ) + # TODO(yeounoh) switch to DTensor API when XLAShardedTensor inherits DTensor + assert type(dist_tensor).__name__ == "XLAShardedTensor" + global_tensor = dist_tensor.global_tensor # type:ignore[attr-defined] + self.assertEqual( + global_tensor.size(), torch.Size([3 * device_count // 2, 3]) + ) + local_tensor = dist_tensor.local_shards[0].data + self.assertEqual(local_tensor.size(), torch.Size([3, 3])) + if requires_grad: + self.assertTrue(dist_tensor.global_tensor.requires_grad) + self.assertTrue(dist_tensor.is_leaf) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/checkpoint/e2e/test_fine_tuning.py 
b/test/distributed/checkpoint/e2e/test_fine_tuning.py index 207c6ad869386..fd948e191e102 100644 --- a/test/distributed/checkpoint/e2e/test_fine_tuning.py +++ b/test/distributed/checkpoint/e2e/test_fine_tuning.py @@ -7,6 +7,7 @@ import torch.distributed as dist import torch.distributed.checkpoint as dist_cp import torch.nn as nn +from torch.distributed._tensor import init_device_mesh from torch.distributed.checkpoint.state_dict import ( get_state_dict, set_state_dict, @@ -14,8 +15,11 @@ ) from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from torch.testing._internal.common_distributed import skip_if_lt_x_gpu -from torch.testing._internal.common_fsdp import FSDPTest -from torch.testing._internal.common_utils import TEST_WITH_DEV_DBG_ASAN +from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN +from torch.testing._internal.distributed._tensor.common_dtensor import ( + DTensorTestBase, + with_comms, +) from torch.testing._internal.distributed.checkpoint_utils import with_temp_dir @@ -73,14 +77,20 @@ def forward(self, batch): return x -class TestFineTuning(FSDPTest): +class TestFineTuning(DTensorTestBase): @property def world_size(self) -> int: return min(4, torch.cuda.device_count()) + @property + def backend(self): + return "cpu:gloo,cuda:nccl" + def pretrain(self, pretrain_dir: str) -> None: + device_mesh = init_device_mesh(self.device_type, (self.world_size,)) + model = PreTrainedModel().cuda() - model = FSDP(model) + model = FSDP(model, device_mesh=device_mesh) optim = torch.optim.Adam(model.parameters(), lr=1e-3) # Trainining @@ -100,9 +110,11 @@ def pretrain(self, pretrain_dir: str) -> None: ) def finetune(self, pretrain_dir: str, finetune_dir: str) -> None: + device_mesh = init_device_mesh(self.device_type, (self.world_size,)) + model = FineTuningModel().cuda() # TODO: make the parallelism more complicated, e.g., using 2D + DDP. 
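The FSDP wrapping in this file (and in test_state_dict.py further below) now threads a 1-D device mesh from init_device_mesh into FSDP so that checkpointing can produce DTensor-based state dicts. A minimal sketch of that wiring, assuming a process group is already initialized and CUDA devices are visible; wrap_with_mesh is an illustrative helper, not part of the patch:

import torch
from torch.distributed._tensor import init_device_mesh
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

def wrap_with_mesh(model: torch.nn.Module, world_size: int) -> FSDP:
    # Build a 1-D mesh over all ranks and hand it to FSDP, mirroring the
    # pretrain()/finetune() changes in this test.
    mesh = init_device_mesh("cuda", (world_size,))
    return FSDP(model.cuda(), device_mesh=mesh, use_orig_params=True)
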
- model = FSDP(model, use_orig_params=True) + model = FSDP(model, use_orig_params=True, device_mesh=device_mesh) optim = torch.optim.Adam(model.parameters(), lr=1e-3) # Simulate that the fine tuning restart after 3 iterations @@ -167,6 +179,7 @@ def finetune(self, pretrain_dir: str, finetune_dir: str) -> None: ) @skip_if_lt_x_gpu(4) + @with_comms @with_temp_dir def test_fine_tuning(self) -> None: self.assertTrue(os.path.exists(self.temp_dir)) @@ -183,3 +196,7 @@ def test_fine_tuning(self) -> None: self.pretrain(pretrain_dir) self.finetune(pretrain_dir, finetune_dir) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/checkpoint/test_state_dict.py b/test/distributed/checkpoint/test_state_dict.py index 5c3c4464f50bb..41d74014839f1 100644 --- a/test/distributed/checkpoint/test_state_dict.py +++ b/test/distributed/checkpoint/test_state_dict.py @@ -10,7 +10,7 @@ import torch.nn as nn from torch.distributed._composable import fully_shard, replicate from torch.distributed._shard.sharded_tensor import ShardedTensor -from torch.distributed._tensor import DTensor +from torch.distributed._tensor import DTensor, init_device_mesh from torch.distributed.checkpoint.state_dict import ( _patch_model_state_dict, _patch_optimizer_state_dict, @@ -31,7 +31,7 @@ ) from torch.testing._internal.common_distributed import skip_if_lt_x_gpu from torch.testing._internal.common_fsdp import FSDPTest -from torch.testing._internal.common_utils import TEST_WITH_DEV_DBG_ASAN +from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN if not dist.is_available(): @@ -221,11 +221,20 @@ def _test_save_load( self._verify_osd_by_load(model, optim, copy_optim, dist_osd) self._verify_osd(model, optim, osd, dist_osd) - def _test_fsdp(self, use_orig_params: bool, use_composable: bool) -> None: + def _test_fsdp( + self, use_orig_params: bool, use_composable: bool, use_dtensor: bool + ) -> None: if not use_orig_params and use_composable: return + # TODO: remove this return after we complete the composable API side change for device_mesh + if use_composable and use_dtensor: + return + def init_model_optim(): + if use_dtensor: + device_mesh = init_device_mesh("cuda", (self.world_size,)) + orig_model = CompositeParamModel(device=torch.device("cuda")) orig_optim = torch.optim.Adam(orig_model.parameters(), lr=1e-3) copy_optim = torch.optim.Adam(orig_model.parameters(), lr=1e-3) @@ -234,11 +243,21 @@ def init_model_optim(): copy.deepcopy(orig_model), policy=ModuleWrapPolicy({UnitModule}) ) else: - dist_model = FSDP( - copy.deepcopy(orig_model), - auto_wrap_policy=ModuleWrapPolicy({UnitModule}), - use_orig_params=use_orig_params, - ) + if use_dtensor: + device_mesh = init_device_mesh("cuda", (self.world_size,)) + dist_model = FSDP( + copy.deepcopy(orig_model), + auto_wrap_policy=ModuleWrapPolicy({UnitModule}), + use_orig_params=use_orig_params, + device_mesh=device_mesh, + ) + else: + dist_model = FSDP( + copy.deepcopy(orig_model), + auto_wrap_policy=ModuleWrapPolicy({UnitModule}), + use_orig_params=use_orig_params, + ) + dist_optim = torch.optim.Adam(dist_model.parameters(), lr=1e-3) return orig_model, orig_optim, copy_optim, dist_model, dist_optim @@ -247,7 +266,11 @@ def init_model_optim(): @skip_if_lt_x_gpu(2) def test_fsdp(self) -> None: self.run_subtests( - {"use_orig_params": [True, False], "use_composable": [True, False]}, + { + "use_orig_params": [True, False], + "use_composable": [True, False], + "use_dtensor": [True, False], + }, self._test_fsdp, ) @@ -418,3 +441,7 @@ def 
test_partial(self) -> None: ) self.assertEqual(model.l.weight, model_state_dict1["l.weight"]) self.assertEqual(model.l.bias, model_state_dict1["l.bias"]) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index f82909a2222c9..0c85af718904a 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -47,6 +47,7 @@ TestCase, run_tests, retry_on_connect_failures, + skipIfRocm, TEST_WITH_DEV_DBG_ASAN, TEST_WITH_ROCM, skip_but_pass_in_sandcastle, @@ -457,6 +458,32 @@ def test_allreduce_in_cudagraph(self): graph.replay() self.assertEqual(xs[0].item(), 8) + @requires_nccl() + @skip_but_pass_in_sandcastle_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs") + @skipIfRocm() + def test_nccl_watchdog_cudagraph(self): + # test that the watchdog does not crash graphs with disallowed event query + store = c10d.FileStore(self.file_name, self.world_size) + pg = self._create_process_group_nccl(store, self.opts()) + rank = self.rank_to_GPU[self.rank][0] + with torch.cuda.device(rank): + for i in range(100): + xs = [torch.FloatTensor([1]).cuda(rank)] + ys = [torch.FloatTensor([4]).cuda(rank)] + for _ in range(30): + pg.allreduce(xs[0]).wait() + + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph): + xs[0] += 0.0 + pg.allreduce(xs[0]).wait() + pg.allreduce(xs[0]).wait() + pg.allreduce(xs[0]).wait() + xs[0] += 0.0 + + for _ in range(1400): + graph.replay() + @requires_nccl() @skip_but_pass_in_sandcastle_if(torch.cuda.device_count() < 2, "NCCL test requires 2+ GPUs") def test_reduce_ops(self): diff --git a/test/dynamo/test_allow_inline_skip.py b/test/dynamo/test_allow_inline_skip.py index b8c1a5e357843..e65dbdb12dd2d 100644 --- a/test/dynamo/test_allow_inline_skip.py +++ b/test/dynamo/test_allow_inline_skip.py @@ -6,9 +6,9 @@ import torch import torch._dynamo.test_case from torch._dynamo.skipfiles import ( - FILE_INLINELIST, FUNC_INLINELIST, - SUBMODULE_INLINELIST, + LEGACY_MOD_INLINELIST, + MOD_INLINELIST, ) from torch._dynamo.utils import istype @@ -36,7 +36,7 @@ class AllowInlineSkipTests(torch._dynamo.test_case.TestCase): # this unit test is to make sure the functions/modules can be correctly imported # or loaded in case there is typo in the strings. 
def test_skipfiles_inlinelist_correctness(self): - for m in FILE_INLINELIST.union(SUBMODULE_INLINELIST): + for m in LEGACY_MOD_INLINELIST.union(MOD_INLINELIST): self.assertTrue(isinstance(importlib.import_module(m), types.ModuleType)) for f in FUNC_INLINELIST: module_name, fn_name = f.rsplit(".", 1) @@ -54,11 +54,9 @@ def fn(x): func_inlinelist.add("torch._dynamo.utils.istype") self.assertTrue( - "torch._dynamo.utils" not in torch._dynamo.skipfiles.FILE_INLINELIST - ) - self.assertTrue( - "torch._dynamo" not in torch._dynamo.skipfiles.SUBMODULE_INLINELIST + "torch._dynamo" not in torch._dynamo.skipfiles.LEGACY_MOD_INLINELIST ) + self.assertTrue("torch._dynamo" not in torch._dynamo.skipfiles.MOD_INLINELIST) with unittest.mock.patch( "torch._dynamo.skipfiles.get_func_inlinelist", diff --git a/test/dynamo/test_functions.py b/test/dynamo/test_functions.py index 9cb83b8e08dcb..7ad5f4e11feb0 100644 --- a/test/dynamo/test_functions.py +++ b/test/dynamo/test_functions.py @@ -1962,6 +1962,157 @@ def fn(x): res = fn(x) self.assertEqual(ref, res) + def test_cast_tensor_single_elem(self): + with torch._dynamo.config.patch({"capture_scalar_outputs": True}): + for t, val in [ + (float, 1.0), + (float, 1), + (float, True), + (int, 1), + (int, False), + # (int, 1.0), # fails due to a >= 0 comparison in sym_int + ]: # , bool, complex]: no casting for sym_bool, no sym_complex + + def fn(x): + x = x + 1 + return t(x) + + opt_fn = torch.compile( + fn, backend="eager", fullgraph=True, dynamic=False + ) + x = torch.tensor([val]) + res = fn(x) + ref = opt_fn(x) + self.assertEqual(ref, res) + + # Cannot handle non single-elem + with self.assertRaises(ValueError): + fn(torch.tensor([val] * 2)) + with self.assertRaises(torch._dynamo.exc.TorchRuntimeError): + opt_fn(torch.tensor([val] * 2)) + + def test_set_construction(self): + def fn(x): + y = x.add_(1) + s = set({x}) + s.add(y) + return len(s) + + opt_fn = torch.compile(fn, backend="eager", fullgraph=True) + x = torch.randn(4) + res = fn(x) + ref = opt_fn(x) + self.assertEqual(ref, res) + + def test_is_tensor_tensor(self): + def fn(x, y): + if x is y: + return x * 2 + else: + return x + y + + fn_opt = torch.compile(backend="eager", fullgraph=True, dynamic=True)(fn) + + x = torch.zeros(2) + y = torch.ones(2) + + self.assertEqual(fn(x, y), fn_opt(x, y)) + self.assertEqual(fn(x, x), fn_opt(x, x)) + + def test_is_mutated_tensor_tensor(self): + def fn(x): + y = x.add_(1) + return x is y + + fn_opt = torch.compile(backend="eager", fullgraph=True, dynamic=True)(fn) + + z = torch.ones(4) + + self.assertEqual(fn(z), fn_opt(z)) + + def test_is_mutated_tensor_tensor_across_graph_break(self): + def fn(x): + y = x.add_(1) + cond = x is y + x.add_(1) + # The real tensor values are recovered when graph breaking. + # Hence we recover the invariant. 
+ torch._dynamo.graph_break() + x.add_(1) + return x is y, cond + + fn_opt = torch.compile(backend="eager", dynamic=True)(fn) + + z = torch.ones(4) + + self.assertEqual(fn(z), fn_opt(z)) + + def test_is_mutated_tensor_tensor(self): + def fn(x): + y = x.add_(1) + return y is x + + fn_opt = torch.compile(backend="eager", fullgraph=True, dynamic=True)(fn) + + z = torch.ones(4, 1) + + self.assertEqual(fn(z), fn_opt(z)) + + def test_is_init_in_compile_mutated_tensor_tensor(self): + def fn(x): + z = x.clone() + y = z.add_(1) + return y is z + + fn_opt = torch.compile(backend="eager", fullgraph=True, dynamic=True)(fn) + + z = torch.ones(4, 1) + + self.assertEqual(fn(z), fn_opt(z)) + + def test_is_init_in_compile_vmapped_mutated_tensor_tensor(self): + def fn(z): + x = z.clone() + y = torch.vmap(torch.Tensor.acos_)(x) + _ = y is z + return y is x + + fn_opt = torch.compile(backend="eager", fullgraph=True, dynamic=True)(fn) + + z = torch.ones(4, 1) + + self.assertEqual(fn(z), fn_opt(z)) + + def test_is_vmapped_mutated_tensor_tensor(self): + def fn(x): + y = torch.vmap(torch.Tensor.acos_)(x) + return y is x + + fn_opt = torch.compile(backend="eager", fullgraph=True, dynamic=True)(fn) + + z = torch.ones(4, 1) + + self.assertEqual(fn(z), fn_opt(z)) + + def test_is_init_in_compile_vmapped_mutated_tensor_tensor_multi_arg(self): + def fn(y, z): + a = y.clone() + b = z.clone() + + def g(a, b): + return a.acos_(), b.acos_() + + c, d = torch.vmap(g)(a, b) + return a is c is b is d + + fn_opt = torch.compile(backend="eager", fullgraph=True, dynamic=True)(fn) + + y = torch.ones(4, 2) + z = torch.ones(4, 10) + + self.assertEqual(fn(y, z), fn_opt(y, z)) + self.assertEqual(fn(y, y), fn_opt(y, y)) + common_utils.instantiate_parametrized_tests(DefaultsTests) diff --git a/test/dynamo/test_modules.py b/test/dynamo/test_modules.py index b3183c029fec7..c9f6df2f7cf81 100644 --- a/test/dynamo/test_modules.py +++ b/test/dynamo/test_modules.py @@ -2221,10 +2221,10 @@ def foo(mod, x): mod = Mod() foo(mod, torch.rand([4])) - self.assertEqual(compiles_without_buffers, 0) + self.assertEqual(compiles_without_buffers, 1) foo(mod, torch.rand([4], dtype=torch.half)) - self.assertEqual(compiles_without_buffers, 1) + self.assertEqual(compiles_without_buffers, 2) class Mod2(Mod): def __setattr__(self, name, value): @@ -2232,7 +2232,7 @@ def __setattr__(self, name, value): foo(Mod2(), torch.rand([4])) # causes two compilations, bc unimplemented custom setattr - self.assertTrue(compiles_without_buffers >= 2) + self.assertTrue(compiles_without_buffers >= 4) def test_unspec_non_inlinable_module(self): mod = UnspecNonInlinableModule() diff --git a/test/dynamo/test_unspec.py b/test/dynamo/test_unspec.py index 9d9c6036239ef..19dcf8b609094 100644 --- a/test/dynamo/test_unspec.py +++ b/test/dynamo/test_unspec.py @@ -343,6 +343,17 @@ def fn(inputs, dim): compl_fn = torch.compile(fn, dynamic=True, backend="eager", fullgraph=True) self.assertEqual(compl_fn(inputs, dim), fn(inputs, dim)) + # https://github.com/pytorch/pytorch/issues/104812 + def test_argmin_coerces_symint_to_intlist_spec(self): + def fn(x, dim): + # the python arg parser coerces dim into a vector + return torch.amin(x, dim=dim, keepdim=True) + + x = torch.randn(4, 4, 4) + dim = 2 + compl_fn = torch.compile(fn, dynamic=True, backend="eager", fullgraph=True) + self.assertEqual(compl_fn(x, dim), fn(x, dim)) + def test_exponential(self): def fn(inputs, op_inputs_dict): res = inputs.exponential_(**op_inputs_dict) diff --git a/test/functorch/test_aotdispatch.py 
b/test/functorch/test_aotdispatch.py index 29cc6a87212cd..62a423be91e87 100644 --- a/test/functorch/test_aotdispatch.py +++ b/test/functorch/test_aotdispatch.py @@ -620,23 +620,6 @@ def f(a): inp = [torch.ones(3, 3, requires_grad=False)] self.verify_aot_autograd(f, inp, test_mutation=True) - def test_input_mutation_resize_smaller(self): - def f(a, b): - a.resize_(2, 2) - return a + b - # tenors that require gradients cannot be resized, so only test requires_grad=False case - inp = [ - torch.ones(3, 3), - torch.ones(2, 2, requires_grad=True), - ] - self.verify_aot_autograd(f, inp, test_mutation=True) - - inp = [ - torch.ones(3, 3), - torch.ones(2, 2), - ] - self.verify_aot_autograd(f, inp, test_mutation=True) - def test_input_mutation_batchnorm(self): def f(inpt, weight, bias, running_mean, running_var): # This is additionally a good test, because the input tensors that we mutate @@ -1885,45 +1868,6 @@ def forward(self, x, y): """At compilation time, graph 1 was compiled under the assumption that input 1 would not require grad, but at runtime this was not the case. This indicates a guard bug in AOTAutograd or Dynamo, please file a bug to PyTorch.""" # noqa: B950 ) - def test_resize_input(self): - def f(x, y): - y.resize_(4) - y.zero_() - self.assertEqual(x.shape, (4,)) - return y - - # NB: don't use verify_aot_autograd as the inputs get - # mutated and I don't trust verify to do it right - - compiled_f = aot_function(f, nop) - ref_x = torch.randn(0) - ref_out = f(ref_x, ref_x) - - test_x = torch.randn(0) - test_out = compiled_f(test_x, test_x) - - self.assertEqual(ref_out, test_out) - - def test_resize_input_smaller(self): - def f(x, y): - y.resize_(4) - y.zero_() - self.assertEqual(x.shape, (4,)) - return y - - # NB: don't use verify_aot_autograd as the inputs get - # mutated and I don't trust verify to do it right - - compiled_f = aot_function(f, nop) - ref_x = torch.randn(5) - ref_out = f(ref_x, ref_x) - - test_x = torch.randn(5) - test_out = compiled_f(test_x, test_x) - - self.assertEqual(ref_out, test_out) - - def test_custom_autograd(self): class CustomFn(torch.autograd.Function): @staticmethod diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py index 9b0fdaec8ccc5..e3d14b522c534 100644 --- a/test/functorch/test_ops.py +++ b/test/functorch/test_ops.py @@ -318,9 +318,13 @@ def is_inplace(op, variant): vjp_fail = { xfail('tensor_split'), # data_ptr composite compliance + # https://github.com/pytorch/pytorch/issues/96560 decorate('nn.functional.batch_norm', decorator=skipIfRocm), + # https://github.com/pytorch/pytorch/issues/96560 decorate('nn.functional.instance_norm', decorator=skipIfRocm), # https://github.com/pytorch/pytorch/issues/96560 + decorate('nn.functional.layer_norm', decorator=skipIfRocm), + # https://github.com/pytorch/pytorch/issues/96560 decorate('nn.functional.scaled_dot_product_attention', decorator=skipIfRocm), } @@ -478,10 +482,11 @@ def abs_if_complex(t): xfail('NumpyExpMarkDirtyAutogradFunction'), # TODO: https://github.com/pytorch/pytorch/issues/91280 # https://github.com/pytorch/pytorch/issues/96560 - # ROCm: NotImplementedError decorate('nn.functional.batch_norm', decorator=skipIfRocm), - # ROCm: NotImplementedError + # https://github.com/pytorch/pytorch/issues/96560 decorate('nn.functional.instance_norm', decorator=skipIfRocm), + # https://github.com/pytorch/pytorch/issues/96560 + decorate('nn.functional.layer_norm', decorator=skipIfRocm), # --- Non-Contiguous Failures! 
--- # This is expected to fail as the operator @@ -1030,8 +1035,11 @@ def test_vmapvjp(self, device, dtype, op): xfail("_native_batch_norm_legit"), # https://github.com/pytorch/pytorch/issues/96560 - # ROCm: NotImplementedError + decorate('nn.functional.batch_norm', decorator=skipIfRocm), + # https://github.com/pytorch/pytorch/issues/96560 decorate('nn.functional.instance_norm', decorator=skipIfRocm), + # https://github.com/pytorch/pytorch/issues/96560 + decorate('nn.functional.layer_norm', decorator=skipIfRocm), # ---------------------------------------------------------------------- } @@ -1547,6 +1555,8 @@ def reference(primals, cotangents, primals_tangents, cotangents_tangents): xfail("native_batch_norm"), xfail("_native_batch_norm_legit"), xfail('native_dropout_backward'), + decorate('linalg.svd', decorator=skipIfRocm), # https://github.com/pytorch/pytorch/issues/97256 + decorate('svd', decorator=skipIfRocm), # Flaky tensor-likes are not close error on ROCm, adjust tolerance? })) @ops(op_db + additional_op_db + autograd_function_db, allowed_dtypes=(torch.float,)) @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)}) @@ -1771,7 +1781,13 @@ def fn(input, weight, bias): xfail('nn.functional.max_unpool2d'), # contiguous call xfail('to_sparse'), # dispatch key issue - # https://github.com/pytorch/pytorch/issues/96560 + # https://github.com/pytorch/pytorch/issues/96560 + decorate('nn.functional.batch_norm', decorator=skipIfRocm), + # https://github.com/pytorch/pytorch/issues/96560 + decorate('nn.functional.instance_norm', decorator=skipIfRocm), + # https://github.com/pytorch/pytorch/issues/96560 + decorate('nn.functional.layer_norm', decorator=skipIfRocm), + # https://github.com/pytorch/pytorch/issues/96560 decorate('xlogy', decorator=skipIfRocm), # numerical inconsistencies, look like bugs diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py index a3df80e6eb807..5b1f6c0e7634b 100644 --- a/test/functorch/test_vmap.py +++ b/test/functorch/test_vmap.py @@ -3598,6 +3598,8 @@ def test(): # https://github.com/pytorch/pytorch/issues/96560 decorate('nn.functional.batch_norm', decorator=skipIfRocm), + decorate('nn.functional.instance_norm', decorator=skipIfRocm), + decorate('nn.functional.layer_norm', decorator=skipIfRocm), # RuntimeError: output with shape [4, 4] doesn't match the broadcast shape [1, 4, 4] xfail('addcdiv'), @@ -3741,6 +3743,8 @@ def test_vmap_exhaustive(self, device, dtype, op): skip('_softmax_backward_data'), # https://github.com/pytorch/pytorch/issues/96560 decorate('nn.functional.batch_norm', decorator=skipIfRocm), + decorate('nn.functional.instance_norm', decorator=skipIfRocm), + decorate('nn.functional.layer_norm', decorator=skipIfRocm), # One or more of the overload doesn't have a Batch rule. 
xfail('bincount'), diff --git a/test/inductor/test_cpu_repro.py b/test/inductor/test_cpu_repro.py index a6fbb2ce6b6ce..3b1ff31c80178 100644 --- a/test/inductor/test_cpu_repro.py +++ b/test/inductor/test_cpu_repro.py @@ -2267,8 +2267,10 @@ def fn(x): metrics.reset() x = torch.randn(1, 32, 16, 68) opt_fn = torch._dynamo.optimize("inductor")(fn) + code = run_and_get_cpp_code(opt_fn, x) self.assertTrue(same(fn(x), opt_fn(x))) - assert metrics.generated_cpp_vec_kernel_count == 2 + # def and use + FileCheck().check_count("cpp_fused", 2, exactly=True).run(code) def test_invalid_index_of_empty_tensor(self): def fn(a): diff --git a/test/inductor/test_max_autotune.py b/test/inductor/test_max_autotune.py index f3405ab2ad511..dc5cb7b722c5e 100644 --- a/test/inductor/test_max_autotune.py +++ b/test/inductor/test_max_autotune.py @@ -223,7 +223,7 @@ def mm(a, b): @unittest.skipIf(not SM75OrLater, "need sm_75") @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") @parametrize("dynamic", (False,)) - @parametrize("max_autotune_gemm_backends", ("CUTLASS", "ATen, Triton, CUTLASS")) + @parametrize("max_autotune_gemm_backends", ("CUTLASS", "ATen,Triton,CUTLASS")) @unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()}) def test_max_autotune_cutlass_backend_regular_mm( self, dynamic: bool, max_autotune_gemm_backends: str @@ -260,7 +260,7 @@ def mm(a, b): @unittest.skipIf(not SM75OrLater, "need sm_75") @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") @parametrize("dynamic", (False,)) - @parametrize("max_autotune_gemm_backends", ("CUTLASS", "ATen, Triton, CUTLASS")) + @parametrize("max_autotune_gemm_backends", ("CUTLASS", "ATen,Triton,CUTLASS")) @unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()}) def test_max_autotune_cutlass_backend_mm_bias( self, dynamic: bool, max_autotune_gemm_backends: str @@ -331,7 +331,7 @@ def addmm(x, a, b): @unittest.skipIf(not SM75OrLater, "need sm_75") @unittest.skipIf(config.is_fbcode(), "fbcode requires different CUTLASS path setup") @parametrize("dynamic", (False,)) - @parametrize("max_autotune_gemm_backends", ("CUTLASS", "ATen, Triton, CUTLASS")) + @parametrize("max_autotune_gemm_backends", ("CUTLASS", "ATen,Triton,CUTLASS")) @unittest.mock.patch.dict(os.environ, {"PATH": _get_path_without_sccache()}) def test_max_autotune_cutlass_backend_addmm( self, dynamic, max_autotune_gemm_backends diff --git a/test/inductor/test_perf.py b/test/inductor/test_perf.py index 8677351a95baf..65978e241e52f 100644 --- a/test/inductor/test_perf.py +++ b/test/inductor/test_perf.py @@ -189,6 +189,24 @@ def f(a, b): inp = (T(10, 10), T(10, 10)) self.assertExpectedInline(count_numel(f, *inp), """400""") + def f(a, b, c): + return torch.cat((a + 1, b + 2, c + 3)) + 10 + + inp = (T(10, 10), T(10, 10), T(10, 10)) + self.assertExpectedInline(count_numel(f, *inp), """600""") + + def f(a, b, c, d, e): + return torch.cat((a + 1, b + 2, c + 3, d + 4, e + 5)) + 10 + + inp = [T(10, 10) for _ in range(5)] + self.assertExpectedInline(count_numel(f, *inp), """2000""") + + def f(a, b): + return torch.cat([a.sum(dim=0), b.sum(dim=0)]) + 10 + + inp = [T(10, 10, 10), T(10, 10, 10)] + self.assertExpectedInline(count_numel(f, *inp), """2600""") + def test_index(self): def f(a, b): return a[b] diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py index ebd4a27386172..5659aee7777fc 100644 --- a/test/inductor/test_torchinductor_opinfo.py +++ 
b/test/inductor/test_torchinductor_opinfo.py @@ -203,6 +203,8 @@ def format_op(op): "cholesky": {f32, f64}, "complex": {f16}, "exponential": {f16}, + "resize_": {b8, f16, f32, f64, i32, i64}, + "resize_as_": {b8, f16, f32, f64, i32, i64}, "geometric": {f16}, "log_normal": {f16}, "masked_scatter": {f16, f32, f64}, @@ -234,6 +236,8 @@ def format_op(op): "cauchy": {f16}, "cholesky": {f32, f64}, "exponential": {f16}, + "resize_": {b8, f16, f32, f64, i32, i64}, + "resize_as_": {b8, f16, f32, f64, i32, i64}, "geometric": {f16}, "log_normal": {f16}, "masked_scatter": {f16, f32, f64}, diff --git a/test/mobile/custom_build/prepare_model.py b/test/mobile/custom_build/prepare_model.py index 039ec2bec76be..feb044e3bfc03 100644 --- a/test/mobile/custom_build/prepare_model.py +++ b/test/mobile/custom_build/prepare_model.py @@ -5,11 +5,11 @@ """ import torch -import torchvision import yaml +from torchvision import models # Download and trace the model. -model = torchvision.models.mobilenet_v2(pretrained=True) +model = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.IMAGENET1K_V1) model.eval() example = torch.rand(1, 3, 224, 224) traced_script_module = torch.jit.trace(model, example) diff --git a/test/mobile/model_test/torchvision_models.py b/test/mobile/model_test/torchvision_models.py index 25c4ab15c5d04..e86fe2fdbf948 100644 --- a/test/mobile/model_test/torchvision_models.py +++ b/test/mobile/model_test/torchvision_models.py @@ -1,12 +1,12 @@ import torch -import torchvision from torch.utils.bundled_inputs import augment_model_with_bundled_inputs from torch.utils.mobile_optimizer import optimize_for_mobile +from torchvision import models class MobileNetV2Module: def getModule(self): - model = torchvision.models.mobilenet_v2(pretrained=True) + model = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.IMAGENET1K_V1) model.eval() example = torch.zeros(1, 3, 224, 224) traced_script_module = torch.jit.trace(model, example) @@ -23,7 +23,7 @@ def getModule(self): class MobileNetV2VulkanModule: def getModule(self): - model = torchvision.models.mobilenet_v2(pretrained=True) + model = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.IMAGENET1K_V1) model.eval() example = torch.zeros(1, 3, 224, 224) traced_script_module = torch.jit.trace(model, example) @@ -40,7 +40,7 @@ def getModule(self): class Resnet18Module: def getModule(self): - model = torchvision.models.resnet18(pretrained=True) + model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1) model.eval() example = torch.zeros(1, 3, 224, 224) traced_script_module = torch.jit.trace(model, example) diff --git a/test/onnx/test_fx_to_onnx_with_onnxruntime.py b/test/onnx/test_fx_to_onnx_with_onnxruntime.py index f8439ef210565..a5f90cb61f62d 100644 --- a/test/onnx/test_fx_to_onnx_with_onnxruntime.py +++ b/test/onnx/test_fx_to_onnx_with_onnxruntime.py @@ -261,7 +261,7 @@ def test_resnet18(self): # So we are explicitly calling `model.eval()` for any model that contains # batch norm. 
# Ref: https://github.com/pytorch/pytorch/issues/99662#issuecomment-1528178221 - model = torchvision.models.resnet18(pretrained=False).eval() + model = torchvision.models.resnet18(weights=None).eval() dummy_input = torch.randn(1, 3, 224, 224) self.run_test_with_fx_to_onnx_exporter_and_onnx_runtime( @@ -276,7 +276,7 @@ def test_resnet18(self): @skip_if_no_torchvision def test_shufflenet_v2(self): # TODO(bowbao): see Note [training vs eval in dynamo_export] - model = torchvision.models.shufflenet_v2_x0_5(pretrained=False).eval() + model = torchvision.models.shufflenet_v2_x0_5(weights=None).eval() dummy_input = torch.randn(1, 3, 224, 224, requires_grad=False) test_inputs = torch.randn(3, 3, 224, 224, requires_grad=False) diff --git a/test/onnx/test_models.py b/test/onnx/test_models.py index b50e8e903c7ba..f56d9cd9f23c2 100644 --- a/test/onnx/test_models.py +++ b/test/onnx/test_models.py @@ -253,7 +253,7 @@ def test_shufflenet(self): def test_fcn(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest( - toC(fcn_resnet101(pretrained=False, pretrained_backbone=False)), + toC(fcn_resnet101(weights=None, weights_backbone=None)), toC(x), rtol=1e-3, atol=1e-5, @@ -263,7 +263,7 @@ def test_fcn(self): def test_deeplab(self): x = Variable(torch.randn(BATCH_SIZE, 3, 224, 224).fill_(1.0)) self.exportTest( - toC(deeplabv3_resnet101(pretrained=False, pretrained_backbone=False)), + toC(deeplabv3_resnet101(weights=None, weights_backbone=None)), toC(x), rtol=1e-3, atol=1e-5, diff --git a/test/onnx/test_models_onnxruntime.py b/test/onnx/test_models_onnxruntime.py index af259b4e1d67a..f9afca487f52b 100644 --- a/test/onnx/test_models_onnxruntime.py +++ b/test/onnx/test_models_onnxruntime.py @@ -420,7 +420,7 @@ def test_mobilenet_v3(self): @skipIfUnsupportedMinOpsetVersion(11) @skipScriptTest() def test_shufflenet_v2_dynamic_axes(self): - model = torchvision.models.shufflenet_v2_x0_5(pretrained=False) + model = torchvision.models.shufflenet_v2_x0_5(weights=None) dummy_input = torch.randn(1, 3, 224, 224, requires_grad=True) test_inputs = torch.randn(3, 3, 224, 224, requires_grad=True) self.run_test( diff --git a/test/onnx/test_utility_funs.py b/test/onnx/test_utility_funs.py index 15f1b5abf2d3c..2a57f043c700f 100644 --- a/test/onnx/test_utility_funs.py +++ b/test/onnx/test_utility_funs.py @@ -1668,7 +1668,7 @@ def forward(self, x): self.assertEqual(len(list(graph.nodes())), 1) def test_fuse_resnet18(self): - model = torchvision.models.resnet18(pretrained=False) + model = torchvision.models.resnet18(weights=None) x = torch.randn(2, 3, 224, 224, requires_grad=True) graph, _, __ = self._model_to_graph( model, diff --git a/test/quantization/pt2e/test_quantize_pt2e.py b/test/quantization/pt2e/test_quantize_pt2e.py index 684ba89eef89b..879c0224b177d 100644 --- a/test/quantization/pt2e/test_quantize_pt2e.py +++ b/test/quantization/pt2e/test_quantize_pt2e.py @@ -811,6 +811,255 @@ def validate(self, model: torch.fx.GraphModule) -> None: m, expected_node_list=node_list, expected_node_occurrence=node_occurrence ) + def _test_transitive_sharing_with_cat_helper(self, quantizer): + m = TestHelperModules.Conv2dWithTwoCat().eval() + example_inputs = (torch.randn(1, 3, 5, 5), torch.randn(1, 3, 5, 5), torch.randn(1, 6, 3, 3), torch.randn(1, 6, 3, 3)) + + # program capture + m = capture_pre_autograd_graph( + m, + example_inputs, + ) + m = prepare_pt2e(m, quantizer) + m(*example_inputs) + # make sure the two input observers and output are shared + conv_output_obs = [] + for n in m.graph.nodes: + if n.op 
== "call_function" and n.target == torch.ops.aten.conv2d.default: + conv_output_obs.append(getattr(m, list(n.users)[0].target)) + if n.op == "call_function" and n.target == torch.ops.aten.cat.default: + inputs = n.args[0] + input0 = inputs[0] + input1 = inputs[1] + assert input0.op == "call_module" + assert input1.op == "call_module" + obs_ins0 = getattr(m, input0.target) + obs_ins1 = getattr(m, input1.target) + assert obs_ins0 == obs_ins1 + + output_obs = list(n.users)[0] + assert output_obs.op == "call_module" + obs_ins2 = getattr(m, output_obs.target) + assert obs_ins0 == obs_ins2, "input observer does not match output" + + assert len(conv_output_obs) == 2, "expecting two observer that follows conv2d ops" + # checking that the output observers for the two convs are shared as well + assert conv_output_obs[0] == conv_output_obs[1] + + m(*example_inputs) + m = convert_pt2e(m, fold_quantize=True) + + node_occurrence = { + # two for input of the first conv, one for output for the first conv + ns.call_function( + torch.ops.quantized_decomposed.quantize_per_tensor.default + ): 7, + ns.call_function( + torch.ops.quantized_decomposed.dequantize_per_tensor.default + ): 9, + } + node_list = [ + ns.call_function( + torch.ops.quantized_decomposed.dequantize_per_tensor.default + ), + ns.call_function( + torch.ops.quantized_decomposed.dequantize_per_tensor.default + ), + ns.call_function(torch.ops.aten.cat.default), + ns.call_function( + torch.ops.quantized_decomposed.quantize_per_tensor.default + ), + ns.call_function( + torch.ops.quantized_decomposed.dequantize_per_tensor.default + ), + ns.call_function(torch.ops.aten.cat.default), + ns.call_function( + torch.ops.quantized_decomposed.quantize_per_tensor.default + ), + ] + self.checkGraphModuleNodes( + m, expected_node_list=node_list, expected_node_occurrence=node_occurrence + ) + + def test_shared_qspec_transitivity(self): + """This tests the transitivity of SharedQuantizationSpec, that is + if A is shared with B, B is shared with C, then C should be shared with A as well + + x1 -> conv1 -> cat1 -----> cat2 + x2 -> conv2 -/ / + x3 -> add / + x4 / + + both cat has shared input and output, and because of cat and (cat1 -> cat2) is the same Tensor + so there is an implicit sharing here, all tensors connect to cat1 and cat2 are in the same + sharing group after transitive sharing + """ + # TODO: refactor this to a common util + class BackendAQuantizer(Quantizer): + def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: + for node in model.graph.nodes: + if ( + node.op == "call_function" + and node.target == torch.ops.aten.conv2d.default + ): + input_act = node.args[0] + assert isinstance(input_act, Node) + weight = node.args[1] + assert isinstance(weight, Node) + bias = node.args[2] + assert isinstance(bias, Node) + act_qspec = QuantizationSpec( + dtype=torch.uint8, + quant_min=0, + quant_max=255, + qscheme=torch.per_tensor_affine, + is_dynamic=False, + observer_or_fake_quant_ctr=observer.default_observer, + ) + weight_qspec = QuantizationSpec( + dtype=torch.int8, + quant_min=-128, + quant_max=127, + qscheme=torch.per_tensor_affine, + is_dynamic=False, + observer_or_fake_quant_ctr=observer.default_weight_observer, + ) + bias_qspec = QuantizationSpec( + dtype=torch.float32, + is_dynamic=False, + observer_or_fake_quant_ctr=observer.PlaceholderObserver, + ) + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map={ + input_act: act_qspec, + weight: weight_qspec, + bias: bias_qspec, + }, + output_qspec=act_qspec, + 
_annotated=True, + ) + elif node.target is torch.ops.aten.cat.default: + cat_node = node + input_nodes = cat_node.args[0] + first_input_node = input_nodes[0] + input_qspec_map = {} + act_qspec = QuantizationSpec( + dtype=torch.uint8, + quant_min=0, + quant_max=255, + qscheme=torch.per_tensor_affine, + is_dynamic=False, + observer_or_fake_quant_ctr=observer.default_observer, + ) + input_qspec_map[first_input_node] = act_qspec + share_qparams_with_input_act0_qspec = SharedQuantizationSpec((first_input_node, cat_node)) + for input_node in input_nodes[1:]: + input_qspec_map[input_node] = share_qparams_with_input_act0_qspec + + cat_node.meta[ + "quantization_annotation" + ] = QuantizationAnnotation( + input_qspec_map=input_qspec_map, + output_qspec=share_qparams_with_input_act0_qspec, + _annotated=True, + ) + + def validate(self, model: torch.fx.GraphModule) -> None: + pass + + self._test_transitive_sharing_with_cat_helper(BackendAQuantizer()) + + def test_shared_qspec_transitivity_case_2(self): + """This tests the transitivity of SharedQuantizationSpec, that is + if A is shared with B, B is shared with C, then C should be shared with A as well + + x1 -> conv1 -> cat1 -----> cat2 + x2 -> conv2 -/ / + x3 -> add / + x4 / + + both cat has shared input and output, and because of cat and (cat1 -> cat2) is the same Tensor + so there is an implicit sharing here, all tensors connect to cat1 and cat2 are in the same + sharing group after transitive sharing + + the difference is that for this one, all edges and nodes are shared with the second input edge of cat + instead of the first input edge of cat as in previous example + """ + # TODO: refactor this to a common util + class BackendAQuantizer(Quantizer): + def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: + for node in model.graph.nodes: + if ( + node.op == "call_function" + and node.target == torch.ops.aten.conv2d.default + ): + input_act = node.args[0] + assert isinstance(input_act, Node) + weight = node.args[1] + assert isinstance(weight, Node) + bias = node.args[2] + assert isinstance(bias, Node) + act_qspec = QuantizationSpec( + dtype=torch.uint8, + quant_min=0, + quant_max=255, + qscheme=torch.per_tensor_affine, + is_dynamic=False, + observer_or_fake_quant_ctr=observer.default_observer, + ) + weight_qspec = QuantizationSpec( + dtype=torch.int8, + quant_min=-128, + quant_max=127, + qscheme=torch.per_tensor_affine, + is_dynamic=False, + observer_or_fake_quant_ctr=observer.default_weight_observer, + ) + bias_qspec = QuantizationSpec( + dtype=torch.float32, + is_dynamic=False, + observer_or_fake_quant_ctr=observer.PlaceholderObserver, + ) + node.meta["quantization_annotation"] = QuantizationAnnotation( + input_qspec_map={ + input_act: act_qspec, + weight: weight_qspec, + bias: bias_qspec, + }, + output_qspec=act_qspec, + _annotated=True, + ) + elif node.target is torch.ops.aten.cat.default: + cat_node = node + input_nodes = cat_node.args[0] + first_input_node = input_nodes[0] + second_input_node = input_nodes[1] + input_qspec_map = {} + act_qspec = QuantizationSpec( + dtype=torch.uint8, + quant_min=0, + quant_max=255, + qscheme=torch.per_tensor_affine, + is_dynamic=False, + observer_or_fake_quant_ctr=observer.default_observer, + ) + input_qspec_map[second_input_node] = act_qspec + share_qparams_with_input_act1_qspec = SharedQuantizationSpec((second_input_node, cat_node)) + input_qspec_map[first_input_node] = share_qparams_with_input_act1_qspec + + cat_node.meta[ + "quantization_annotation" + ] = QuantizationAnnotation( + 
input_qspec_map=input_qspec_map, + output_qspec=share_qparams_with_input_act1_qspec, + _annotated=True, + ) + + def validate(self, model: torch.fx.GraphModule) -> None: + pass + + self._test_transitive_sharing_with_cat_helper(BackendAQuantizer()) + def test_int16(self): class Int16ActQuantizer(Quantizer): def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule: diff --git a/test/test_cuda.py b/test/test_cuda.py index e265e094f26f1..e4bb3145d56a1 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -80,8 +80,23 @@ def test_pinned_memory_with_cudaregister(self): torch.cuda.memory._set_allocator_settings("pinned_use_cuda_host_register:True,pinned_num_register_threads:8") t = torch.ones(20) self.assertFalse(t.is_pinned()) - pinned_t = torch.ones(1 << 21).pin_memory() - self.assertTrue(pinned_t.is_pinned()) + try: + pinned_t = torch.ones(1 << 21).pin_memory() + self.assertTrue(pinned_t.is_pinned()) + pinned_t = torch.ones(1 << 24).pin_memory() + self.assertTrue(pinned_t.is_pinned()) + except RuntimeError as e: + # Some GPUs don't support same address space on host and device side + pass + + def test_pinned_memory_with_cudaregister_multithread(self): + num_threads = 4 + threads = [threading.Thread(target=self.test_pinned_memory_with_cudaregister) + for t in range(num_threads)] + for thread in threads: + thread.start() + for thread in threads: + thread.join() def test_cudart_register(self): t = torch.ones(20) diff --git a/test/test_jit.py b/test/test_jit.py index 6ffca95e33f1d..0131619f4a9d5 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -142,6 +142,7 @@ import unittest import warnings import zipfile +import tracemalloc def canonical(graph): @@ -12813,7 +12814,7 @@ def some_func(x): x = torch.rand(3, 4) self.assertEqual(some_func(x), x) - def test_file_format_serialization(self): + def _make_filereader_test_file(self): filename = tempfile.mktemp() writer = torch._C.PyTorchFileWriter(filename) buffers = [os.urandom(size) for size in [random.randint(1, 100) for i in range(20)]] @@ -12824,6 +12825,10 @@ def test_file_format_serialization(self): serialized_offsets = pickle.dumps(offsets) writer.write_record("meta", serialized_offsets, len(serialized_offsets)) writer.write_end_of_file() + return filename, buffers, serialized_offsets + + def test_file_format_serialization(self): + filename, buffers, serialized_offsets = self._make_filereader_test_file() reader = torch._C.PyTorchFileReader(filename) serialized_offsets_read = reader.get_record("meta") @@ -12833,6 +12838,29 @@ def test_file_format_serialization(self): data = reader.get_record(str(offset)) assert(data == buffers[i]) + def test_file_reader_no_memory_leak(self): + num_iters = 10000 + filename, _, _ = self._make_filereader_test_file() + + # Load from filename + tracemalloc.start() + for i in range(num_iters): + torch._C.PyTorchFileReader(filename) + _, peak_from_string = tracemalloc.get_traced_memory() + tracemalloc.stop() + + # Load from stream + tracemalloc.start() + with open(filename, 'rb') as f: + for i in range(num_iters): + f.seek(0) + torch._C.PyTorchFileReader(f) + _, peak_from_file = tracemalloc.get_traced_memory() + tracemalloc.stop() + + # Check if the peak sizes at most differ by an empirically obtained factor + assert peak_from_file < peak_from_string * 500 + # for each type, the input type annotation and corresponding return type annotation def type_input_return_pairs(self): return [ diff --git a/test/test_mkldnn.py b/test/test_mkldnn.py index 683003474ba02..7c39d36ec1649 100644 --- 
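The test_file_reader_no_memory_leak test added to test_jit.py above compares the peak allocations of two code paths with tracemalloc rather than asserting an absolute byte count. A self-contained sketch of that measurement pattern follows; the workloads are stand-ins rather than PyTorchFileReader, and the 500x factor merely echoes the empirically chosen bound in the test.

import tracemalloc

def peak_python_alloc(fn, iters=10_000):
    # peak Python-heap usage (in bytes) observed while running fn repeatedly
    tracemalloc.start()
    for _ in range(iters):
        fn()
    _, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    return peak

# stand-in workloads; the real test compares PyTorchFileReader(filename)
# against PyTorchFileReader(open(filename, "rb"))
peak_a = peak_python_alloc(lambda: bytes(1024))
peak_b = peak_python_alloc(lambda: bytearray(1024))
print(peak_a, peak_b)  # the test asserts peak_from_file < peak_from_string * 500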
a/test/test_mkldnn.py +++ b/test/test_mkldnn.py @@ -1440,12 +1440,12 @@ def _test_imagenet_model(self, model): @skipIfNoTorchVision def test_resnet18(self): - model = torchvision.models.resnet.resnet18(pretrained=False) + model = torchvision.models.resnet.resnet18(weights=None) self._test_imagenet_model(model) @skipIfNoTorchVision def test_resnext50_32x4d(self): - model = torchvision.models.resnet.resnext50_32x4d(pretrained=False) + model = torchvision.models.resnet.resnext50_32x4d(weights=None) self._test_imagenet_model(model) def _lstm_params_list(self): diff --git a/test/test_sort_and_select.py b/test/test_sort_and_select.py index 08b62cc1476ba..d3b04617d2c1b 100644 --- a/test/test_sort_and_select.py +++ b/test/test_sort_and_select.py @@ -1122,6 +1122,20 @@ def test_isin_different_devices(self, device, dtype): with self.assertRaises(RuntimeError): torch.isin(c, d) + @dtypes(*integral_types()) + def test_sort_overflow(self, device, dtype): + " Regression test for https://github.com/pytorch/pytorch/issues/111189 " + prev_num_threads = torch.get_num_threads() + try: + low = 0 if dtype == torch.uint8 else -1 + x = torch.full((32768,), low, dtype=dtype, device=device) + x[:100] = torch.iinfo(x.dtype).max + torch.set_num_threads(1) + uv = x.sort().values.unique() + self.assertEqual(uv.size(0), 2) + finally: + torch.set_num_threads(prev_num_threads) + instantiate_device_type_tests(TestSortAndSelect, globals()) diff --git a/test/test_torch.py b/test/test_torch.py index f090b6b523e69..2798fee12da26 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -6351,6 +6351,11 @@ def test_equal(self): self.assertNotEqual(t_0.size(), t_1.size()) self.assertFalse(torch.equal(t_0, t_1)) + # Fast path: tensor containing `nan` is not equal to self + for dtype in floating_and_complex_types(): + t = torch.tensor([1., float('nan')], dtype=dtype) + self.assertFalse(torch.equal(t, t)) + def test_element_size(self): byte = torch.ByteStorage().element_size() char = torch.CharStorage().element_size() diff --git a/third_party/fbgemm b/third_party/fbgemm index d0ee798b1f198..70c6e83c29f67 160000 --- a/third_party/fbgemm +++ b/third_party/fbgemm @@ -1 +1 @@ -Subproject commit d0ee798b1f198cc51b6ddae20cf6063f6380ba3f +Subproject commit 70c6e83c29f67278751abd0e28433c50743ccbe9 diff --git a/third_party/gloo b/third_party/gloo index 2cbcef29a6aff..cf1e1abc95d0b 160000 --- a/third_party/gloo +++ b/third_party/gloo @@ -1 +1 @@ -Subproject commit 2cbcef29a6aff241896a86c719195f1757bfd1b8 +Subproject commit cf1e1abc95d0b961222ee82b6935f76250fbcf16 diff --git a/third_party/kineto b/third_party/kineto index 5d3c309049e30..49e854d805d91 160000 --- a/third_party/kineto +++ b/third_party/kineto @@ -1 +1 @@ -Subproject commit 5d3c309049e30193bfae03720a2e665f09d65447 +Subproject commit 49e854d805d916b2031e337763928d2f8d2e1fbf diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 9601cc67126ba..f3c577bc531a1 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -2260,11 +2260,11 @@ - name: max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor self: max_pool2d_backward(grad, self, kernel_size, stride, padding, dilation, ceil_mode) -- name: _mps_convolution(Tensor self, Tensor weight, Tensor? bias, int[] padding, int[] stride, int[] dilation, int groups) -> Tensor - self, weight, bias: "grad.defined() ? 
mps_convolution_backward(self, grad, weight, padding, stride, dilation, groups, grad_input_mask) : std::tuple()" +- name: _mps_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor + self, weight, bias: "grad.defined() ? mps_convolution_backward_symint(self, grad, weight, padding, stride, dilation, groups, grad_input_mask) : std::tuple()" -- name: mps_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) - grad_output, self, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, dilation, false, std::vector(padding.size(), 0), groups, grad_input_mask) +- name: mps_convolution_backward(Tensor self, Tensor grad_output, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) + grad_output, self, weight: _convolution_double_backward_symint(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, dilation, false, std::vector(padding.size(), 0), groups, grad_input_mask) - name: max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor) self: max_pool2d_with_indices_backward(grad, self, kernel_size, stride, padding, dilation, ceil_mode, result1) @@ -2286,54 +2286,54 @@ indices: non_differentiable result: auto_linear -- name: convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups) -> Tensor +- name: convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups) -> Tensor input, weight, bias: "grad.defined() ? convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) : std::tuple()" result: convolution_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, stride, padding, dilation, transposed, output_padding, groups) # TorchScript serializes calls to _convolution so this entry is present until that is changed to use convolution. # Note that the benchmark, deterministic, cudnn_enabled, and allow_tf32 flags are queried from the global context # by convolution_backward instead of being passed along from the forward pass. -- name: _convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor +- name: _convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor input, weight, bias: "grad.defined() ? 
convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) : std::tuple()" result: _convolution_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled, allow_tf32) -- name: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, int[] stride, SymInt[] padding, int[] dilation, bool transposed, SymInt[] output_padding, int groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) +- name: convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor) grad_output, input, weight: _convolution_double_backward_symint(grads[0], grads[1], grads[2], grad_output, weight, input, stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) result0: std::get<0>(convolution_backward_symint(grad_output_p, input_p, weight_t, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {true, false, false})) + std::get<0>(convolution_backward_symint(grad_output_t, input_p, weight_p, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {true, false, false})) result1: std::get<1>(convolution_backward_symint(grad_output_p, input_t, weight_p, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {false, true, false})) + std::get<1>(convolution_backward_symint(grad_output_t, input_p, weight_p, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, {false, true, false})) result2: convolution_backward_jvp_grad_bias(grad_output_t, result2) -- name: convolution_overrideable(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor - input, weight, bias: "grad.defined() ? convolution_backward_overrideable(grad, input, weight, stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) : std::tuple()" +- name: convolution_overrideable(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups) -> Tensor + input, weight, bias: "grad.defined() ? 
convolution_backward_overrideable_symint(grad, input, weight, stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) : std::tuple()" -- name: convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - grad_output, input, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, input, stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) +- name: convolution_backward_overrideable(Tensor grad_output, Tensor input, Tensor weight, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) + grad_output, input, weight: _convolution_double_backward_symint(grads[0], grads[1], grads[2], grad_output, weight, input, stride, padding, dilation, transposed, output_padding, groups, grad_input_mask) -- name: slow_conv_transpose2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, int[2] dilation=1) -> Tensor +- name: slow_conv_transpose2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt[2] dilation=1) -> Tensor self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, true, output_padding, 1, grad_input_mask) : std::tuple()" -- name: slow_conv_transpose3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, int[3] dilation=1) -> Tensor +- name: slow_conv_transpose3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] output_padding=0, SymInt[3] dilation=1) -> Tensor self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, true, output_padding, 1, grad_input_mask) : std::tuple()" -- name: _slow_conv2d_forward(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, int[2] padding) -> Tensor - self, weight, bias: "grad.defined() ? _slow_conv2d_backward(grad, self, weight, kernel_size, stride, padding, grad_input_mask) : std::tuple()" +- name: _slow_conv2d_forward(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding) -> Tensor + self, weight, bias: "grad.defined() ? 
_slow_conv2d_backward_symint(grad, self, weight, kernel_size, stride, padding, grad_input_mask) : std::tuple()" -- name: _slow_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, int[2] kernel_size, int[2] stride, int[2] padding, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) - grad_output, self, weight: _convolution_double_backward(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, {{1, 1}}, false, {{0, 0}}, 1, grad_input_mask) +- name: _slow_conv2d_backward.output_mask(Tensor grad_output, Tensor self, Tensor weight, SymInt[2] kernel_size, SymInt[2] stride, SymInt[2] padding, bool[3] output_mask) -> (Tensor grad_input, Tensor grad_weight, Tensor grad_bias) + grad_output, self, weight: _convolution_double_backward_symint(grads[0], grads[1], grads[2], grad_output, weight, self, stride, padding, {{1, 1}}, false, {{0, 0}}, 1, grad_input_mask) -- name: _conv_depthwise2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias, int[2] stride, SymInt[2] padding, int[2] dilation) -> Tensor +- name: _conv_depthwise2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, SymInt[2] dilation) -> Tensor self, weight, bias: "grad.defined() ? convolution_backward_symint(grad.contiguous(), self, weight, bias->sym_sizes(), stride, padding, dilation, /*transposed=*/ false, /*output_padding=*/ {{0, 0}}, /*groups=*/ 1, grad_input_mask) : std::tuple()" -- name: conv_depthwise3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding, int[3] dilation) -> Tensor +- name: conv_depthwise3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding, SymInt[3] dilation) -> Tensor self, weight, bias: "grad.defined() ? convolution_backward_symint(grad.contiguous(), self, weight, bias->sym_sizes(), stride, padding, dilation, /*transposed=*/ false, /*output_padding=*/ {{0, 0, 0}}, /*groups=*/ 1, grad_input_mask) : std::tuple()" -- name: slow_conv3d_forward(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias, int[3] stride, SymInt[3] padding) -> Tensor +- name: slow_conv3d_forward(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias, SymInt[3] stride, SymInt[3] padding) -> Tensor self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, /*dilation=*/ {{1, 1, 1}}, false, /*output_padding=*/ {{0, 0, 0}}, 1, grad_input_mask) : std::tuple()" -- name: slow_conv_dilated2d(Tensor self, Tensor weight, int[2] kernel_size, Tensor? bias=None, int[2] stride=1, SymInt[2] padding=0, int[2] dilation=1) -> Tensor +- name: slow_conv_dilated2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1) -> Tensor self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector(padding.size(), 0), 1, grad_input_mask) : std::tuple()" -- name: slow_conv_dilated3d(Tensor self, Tensor weight, int[3] kernel_size, Tensor? bias=None, int[3] stride=1, SymInt[3] padding=0, int[3] dilation=1) -> Tensor +- name: slow_conv_dilated3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1) -> Tensor self, weight, bias: "grad.defined() ? 
convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector(padding.size(), 0), 1, grad_input_mask) : std::tuple()" - name: col2im(Tensor self, SymInt[2] output_size, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor @@ -2618,14 +2618,14 @@ - name: _cudnn_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor) log_probs: _cudnn_ctc_loss_backward(grad, result0, result1, zero_infinity) -- name: cudnn_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor +- name: cudnn_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor self, weight: "_cudnn_convolution_backward(self, grad, weight, padding, output_padding, stride, dilation, true, groups, {grad_input_mask[0], grad_input_mask[1]})" -- name: _mps_convolution_transpose(Tensor self, Tensor weight, int[] padding, int[] output_padding, int[] stride, int[] dilation, int groups) -> Tensor - self, weight: "grad.defined() ? mps_convolution_transpose_backward(self, grad, weight, padding, output_padding, stride, dilation, groups, grad_input_mask) : std::tuple()" +- name: _mps_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor + self, weight: "grad.defined() ? mps_convolution_transpose_backward_symint(self, grad, weight, padding, output_padding, stride, dilation, groups, grad_input_mask) : std::tuple()" -- name: cudnn_convolution(Tensor self, Tensor weight, int[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor - self, weight: "_cudnn_convolution_backward(self, grad, weight, padding, std::vector(padding.size(), 0), stride, dilation, false, groups, {grad_input_mask[0], grad_input_mask[1]})" +- name: cudnn_convolution(Tensor self, Tensor weight, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor + self, weight: "_cudnn_convolution_backward(self, grad, weight, padding, std::vector(padding.size(), 0), stride, dilation, false, groups, {grad_input_mask[0], grad_input_mask[1]})" - name: cudnn_grid_sampler(Tensor self, Tensor grid) -> Tensor output self, grid: "grad.defined() ? cudnn_grid_sampler_backward(self, grid, grad) : std::tuple()" @@ -2655,9 +2655,9 @@ # nnpack -- name: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, int[2] stride=1) -> Tensor +- name: _nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, SymInt[2] stride=1) -> Tensor # NNPACK does not support strided convolutions in the backwards path, which is the reason why we are using the closest available function that does here. - input, weight, bias: "grad.defined() ? convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, std::vector(padding.size(), 1), false, std::vector(padding.size(), 0), 1, grad_input_mask) : std::tuple()" + input, weight, bias: "grad.defined() ? 
convolution_backward_symint(grad, input, weight, bias->sym_sizes(), stride, padding, std::vector(padding.size(), 1), false, std::vector(padding.size(), 0), 1, grad_input_mask) : std::tuple()" #LSTM MPS - name: _lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) @@ -2688,13 +2688,13 @@ # miopen -- name: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- name: miopen_convolution_transpose(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, true, output_padding, groups, grad_input_mask) : std::tuple()" -- name: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- name: miopen_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector(padding.size(), 0), groups, grad_input_mask) : std::tuple()" -- name: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups, bool benchmark, bool deterministic) -> Tensor +- name: miopen_depthwise_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic) -> Tensor self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector(padding.size(), 0), groups, grad_input_mask) : std::tuple()" - name: miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor) @@ -2720,7 +2720,7 @@ - name: mkldnn_rnn_layer_backward(Tensor input, Tensor weight1, Tensor weight2, Tensor weight3, Tensor weight4, Tensor hx_, Tensor cx_tmp, Tensor output, Tensor hy_, Tensor cy_, Tensor? grad_output, Tensor? grad_hy, Tensor? grad_cy, bool reverse, int mode, int hidden_size, int num_layers, bool has_biases, bool train, bool bidirectional, int[] batch_sizes, bool batch_first, Tensor workspace) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) # mkldnn -- name: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, int[] stride, int[] dilation, int groups) -> Tensor +- name: mkldnn_convolution(Tensor self, Tensor weight, Tensor? bias, SymInt[] padding, SymInt[] stride, SymInt[] dilation, SymInt groups) -> Tensor self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, /*transposed=*/ false, /*output_padding=*/ std::vector(padding.size(), 0), groups, grad_input_mask) : std::tuple()" - name: mkldnn_linear(Tensor self, Tensor weight, Tensor? 
bias=None) -> Tensor diff --git a/torch/_dynamo/allowed_functions.py b/torch/_dynamo/allowed_functions.py index bb45d22236049..5d54b1033efb1 100644 --- a/torch/_dynamo/allowed_functions.py +++ b/torch/_dynamo/allowed_functions.py @@ -120,6 +120,7 @@ def _disallowed_function_ids(): warnings.warn, torch._C._dynamo.eval_frame.unsupported, torch.Tensor.__init__, + torch.resize_as_, ] # extract all dtypes from torch diff --git a/torch/_dynamo/bytecode_transformation.py b/torch/_dynamo/bytecode_transformation.py index 20bd96513d681..722adce01f1e1 100644 --- a/torch/_dynamo/bytecode_transformation.py +++ b/torch/_dynamo/bytecode_transformation.py @@ -819,7 +819,7 @@ def maybe_pop_n(n): if output and output[-1].opcode == dis.EXTENDED_ARG: output.pop() - for i, inst in enumerate(instructions): + for inst in instructions: if inst.opcode == dis.EXTENDED_ARG: # Leave this instruction alone for now so we never shrink code inst.arg = 0 diff --git a/torch/_dynamo/codegen.py b/torch/_dynamo/codegen.py index f6de25657bc3b..0af74c8101ddf 100644 --- a/torch/_dynamo/codegen.py +++ b/torch/_dynamo/codegen.py @@ -25,9 +25,9 @@ NumpyNdarrayVariable, SymNodeVariable, TensorVariable, - TensorWithTFOverrideVariable, UnspecializedPythonVariable, ) +from .variables.torch_function import TensorWithTFOverrideVariable @dataclasses.dataclass diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py index 2ddc2407fb0c8..e8ec24c43813d 100644 --- a/torch/_dynamo/guards.py +++ b/torch/_dynamo/guards.py @@ -980,7 +980,7 @@ def source_ref(source): ): continue - guard.create(builder, builder) + guard.create(builder) self.check_fn = self.compile_check_fn(builder, guards, guard_fail_fn) self._weakrefs.clear() # Keep track of weak references of objects with ID_MATCH guard. This @@ -1004,6 +1004,7 @@ def compile_check_fn(self, builder, guards_out, guard_fail_fn): code_parts = ["___guarded_code.valid", "___check_global_state()"] def add_code_part(code, guard, log_only=False): + extra = "" if guard.user_stack: for fs in reversed(guard.user_stack): if fs.filename not in uninteresting_files(): @@ -1013,7 +1014,7 @@ def add_code_part(code, guard, log_only=False): elif guard.stack: extra = f" # {format_frame(guard.stack.summary()[-1])}" - guards_log.debug("%s", f"{code:<60}{extra}") + guards_log.debug("%s", f"{code:<60}{extra}") if verbose_guards_log.isEnabledFor(logging.DEBUG): maybe_stack = "" diff --git a/torch/_dynamo/resume_execution.py b/torch/_dynamo/resume_execution.py index f0352312a5daa..6d4bd6bb1a8b8 100644 --- a/torch/_dynamo/resume_execution.py +++ b/torch/_dynamo/resume_execution.py @@ -519,7 +519,7 @@ def remap_block_offsets( # to determine where in the original code the PUSH_EXC_INFO offset # replaced. prefix_blocks = [] - for idx, inst in enumerate(instructions): + for inst in instructions: if len(prefix_blocks) == len( meta.prefix_block_target_offset_remap ): diff --git a/torch/_dynamo/skipfiles.py b/torch/_dynamo/skipfiles.py index 52bd1f906295e..068345f694fd9 100644 --- a/torch/_dynamo/skipfiles.py +++ b/torch/_dynamo/skipfiles.py @@ -64,10 +64,10 @@ * BUILTIN_SKIPLIST contains builtin python modules, such as abc, collections, etc. * THIRDPARTY_SKIPLIST contains common third party libraries, such as numpy, pandas, etc. * Functions in these two SKIPLISTs are always skipped, except when they are explicitly - put into the three INLINELIST: FUNC_INLINELIST, FILE_INLINELIST and SUBMODULE_INLINELIST. + put into the two INLINELIST: FUNC_INLINELIST and MOD_INLINELIST. 
* PyTorch(torch) is in the BUILTIN_SKIPLIST by default, but there are many cases where we want inline the functions under torch namespace. We should add them - into one of the three *_INLINELIST to make dynamo inline those functions. + into one of the two *_INLINELIST to make dynamo inline those functions. * If you call functions under skipped modules/files, Dynamo will wrap these functions as SkipFilesVariable. There are a few functions(e.g, collections.OrderedDict) that we have special handling at SkipFilesVariable.call_function. @@ -76,17 +76,14 @@ To figure out what the behavior is, check the following list in order: * FUNC_INLINELIST (Inline if YES) -* FILE_INLINELIST (Inline if YES) -* SUBMODULE_INLINELIST (Inline if YES) +* MOD_INLINELIST (Inline if YES) * BUILTIN_SKIPLIST & THIRDPARTY_SKIPLIST (Skip if YES) * Inline by default In general, if you want to force inline a function or module, please consider adding -the function's file or python module to FILE_INLINELIST first. -Use the FUNC_INLINELIST only when there are other functions under the same file that -you don't want to inline. -In the future, we will consolidate FILE_INLINELIST and SUBMODULE_INLINELIST into one list -as we use the same logic (filename.startswith) to determine if a file or module is skipped. +the function's python module to MOD_INLINELIST first. +Use the FUNC_INLINELIST only when there are other functions under the same module that +you don't want to inline them. """ @@ -114,7 +111,7 @@ tempfile, threading, tokenize, - torch, # torch/* is skipped by default unless specified in FILE_INLINELIST or SUBMODULE_INLINELIST + torch, # torch/* is skipped by default unless specified in FUNC_INLINELIST or MOD_INLINELIST traceback, types, typing, @@ -165,36 +162,25 @@ def _module_dir(m: types.ModuleType): } -# Force inline functions in these files or directories, even they are in *_SKIPLIST. -# We are using python module name instead of file or directory object to avoid circular dependency. -# Please keep this sorted alphabetically. -# TODO: Merge FILE_INLINELIST into SUBMODULE_INLINELIST. -FILE_INLINELIST = { - "torch._dynamo._trace_wrapped_higher_order_op", - "torch._dynamo.comptime", +# These are legacy workarounds, don't add new modules to this list. +# Please use the MOD_INLINELIST instead to force inline functions under particular modules. 
+LEGACY_MOD_INLINELIST = { "torch._dynamo.external_utils", - "torch._dynamo.polyfill", "torch._export.db.examples", "torch._export.wrappers", "torch._functorch.apis", "torch._functorch.deprecated", "torch._higher_order_ops.cond", - "torch._inductor.test_operators", "torch.ao.quantization.pt2e.eval_utils", "torch.ao.quantization.pt2e.qat_utils", "torch.ao.quantization.pt2e.representation.rewrite", "torch.ao.quantization.pt2e.utils", "torch.ao.quantization.quantizer.xnnpack_quantizer", - "torch.nn.modules.container", - "torch.optim._functional", - "torch.random", - "torch.utils._content_store", - "torch.utils._foreach_utils", + "torch.optim", } - if torch.distributed.is_available(): - FILE_INLINELIST |= { + LEGACY_MOD_INLINELIST |= { "torch.distributed._tensor.api", "torch.distributed._tensor.device_mesh", "torch.distributed.algorithms._checkpoint.checkpoint_wrapper", @@ -203,31 +189,35 @@ def _module_dir(m: types.ModuleType): "torch.distributed.tensor.parallel.style", } -# Include optimizer code for tracing -FILE_INLINELIST |= { - str(obj.__module__) for obj in torch.optim.__dict__.values() if inspect.isclass(obj) -} -# TODO: consolidate SUBMODULE_INLINELIST and FILE_INLINELIST into one list -# Force inline functions under these modules, even the modules is in *_SKIPLIST. -SUBMODULE_INLINELIST = { +# Force inline functions under these modules, even they are in *_SKIPLIST. +# We are using python module name instead of file or directory object to avoid circular dependency. +# Please keep this sorted alphabetically. +MOD_INLINELIST = { "torch._refs", "torch._prims", "torch._decomp", + "torch._dynamo._trace_wrapped_higher_order_op", + "torch._dynamo.comptime", + "torch._dynamo.polyfill", + "torch._inductor.test_operators", "torch.ao.nn", "torch.distributions", "torch.fx._pytree", "torch.nn", + "torch.random", "torch.sparse", "torch.testing", + "torch.utils._content_store", "torch.utils._contextlib", + "torch.utils._foreach_utils", "torch.utils._pytree", } if torch.distributed.is_available(): - SUBMODULE_INLINELIST.add("torch.distributed") - SUBMODULE_INLINELIST.add("torch.distributed._functional_collectives") + MOD_INLINELIST.add("torch.distributed") + MOD_INLINELIST.add("torch.distributed._functional_collectives") # TODO: support adding bound method into this list @@ -243,17 +233,17 @@ def get_func_inlinelist(): @functools.lru_cache(None) -def get_file_inlinelist(): +def get_legacy_mod_inlinelist(): inlinelist = set() - for f in FILE_INLINELIST: - inlinelist.add(_module_dir(torch) + f[len("torch.") :].replace(".", "/")) + for m in LEGACY_MOD_INLINELIST: + inlinelist.add(_module_dir(torch) + m[len("torch.") :].replace(".", "/")) return inlinelist @functools.lru_cache(None) -def get_submodule_inlinelist(): +def get_mod_inlinelist(): inlinelist = set() - for m in SUBMODULE_INLINELIST: + for m in MOD_INLINELIST: inlinelist.add(_module_dir(torch) + m[len("torch.") :].replace(".", "/")) return inlinelist @@ -303,21 +293,19 @@ class SkipResult: reason: Optional[str] -# TODO(ybliang): This is a temp function, we should consolidate this with check_file. 
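To make the MOD_INLINELIST / LEGACY_MOD_INLINELIST machinery above concrete, here is a condensed sketch of the two pieces it relies on: turning a dotted module name into a path prefix (what _module_dir plus the string slicing in get_mod_inlinelist do) and the lookup order that the rewritten check_file follows. The helper names are illustrative only, and the real code also consults FBCODE_SKIP_DIRS_RE and returns a SkipResult carrying a reason string.

import os
import torch

def module_prefix(mod_name: str) -> str:
    # "torch.ao.nn" -> "<torch install dir>/ao/nn", the prefix used by the
    # filename.startswith() checks
    torch_dir = os.path.dirname(torch.__file__) + "/"
    return torch_dir + mod_name[len("torch."):].replace(".", "/")

def toy_should_skip(filename, allow_torch, legacy_prefixes, mod_prefixes, skip_prefixes):
    if any(filename.startswith(p) for p in legacy_prefixes):
        return False  # inlined via LEGACY_MOD_INLINELIST
    if allow_torch and any(filename.startswith(p) for p in mod_prefixes):
        return False  # inlined via MOD_INLINELIST
    if any(filename.startswith(p) for p in skip_prefixes):
        return True   # skipped (BUILTIN/THIRDPARTY skip dirs)
    return False      # inlined by default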
-def _check_file_inner(filename, allow_torch=False): +def check_file(filename, allow_torch=False): """Should skip this file?""" if filename is None: return SkipResult(True, "filename is None") - if any(filename.startswith(d) for d in get_file_inlinelist()): + if any(filename.startswith(d) for d in get_legacy_mod_inlinelist()): return SkipResult( False, - "inlined according skipfiles.FILE_INLINELIST", + "inlined according skipfiles.LEGACY_MOD_INLINELIST", ) - # TODO(ybliang): the is_torch check should be consolidate with is_torch_inline_allowed - if allow_torch and is_torch(filename): + if allow_torch and is_torch_inline_allowed(filename): return SkipResult( False, - "inlined according skipfiles.is_torch", + "inlined according skipfiles.MOD_INLINELIST", ) if is_fbcode and bool(FBCODE_SKIP_DIRS_RE.match(filename)): return SkipResult( @@ -330,17 +318,6 @@ def _check_file_inner(filename, allow_torch=False): return SkipResult(False, "inlined by default") -def check_file(filename, allow_torch=False, extra_check=False): - result = _check_file_inner(filename, allow_torch) - if extra_check and result.skipped and is_torch_inline_allowed(filename): - return SkipResult( - False, - "inlined according skipfiles.is_torch_inline_allowed returning True", - ) - else: - return result - - """ This is the main entry point to determine whether an object (function) should be inlined or skipped. Let's illustrate the logic with an example: @@ -366,10 +343,13 @@ def f3(x, y): * If f2 is skipped by Dynamo, when evaluating the frame of f3, Dynamo need the inline/skip check again and the call site is in catch_errors_wrapper.catch_errors of eval_frame.py. * For global variables and function arguments, Dynamo needs to decide if they are wrapped as SkipFilesVariable in builder.py. + +allow_torch is used to indicate whether we are checking the MOD_INLINELIST (torch modules), we only do this check when +f2 is not skipped. 
""" -def check_verbose(obj, allow_torch=False, extra_check=False): +def check_verbose(obj, allow_torch=False): if isinstance( obj, (UserFunctionVariable, UserMethodVariable, NestedUserFunctionVariable) ): @@ -387,11 +367,11 @@ def check_verbose(obj, allow_torch=False, extra_check=False): False, "inlined according skipfiles.FUNC_INLINELIST", ) - return check_file(filename, allow_torch, extra_check) + return check_file(filename, allow_torch) -def check(obj, allow_torch=False, extra_check=False): - return check_verbose(obj, allow_torch, extra_check).skipped +def check(obj, allow_torch=False): + return check_verbose(obj, allow_torch).skipped # skip common third party libs @@ -402,7 +382,7 @@ def check(obj, allow_torch=False, extra_check=False): def is_torch_inline_allowed(filename): - return any(filename.startswith(d) for d in get_submodule_inlinelist()) + return any(filename.startswith(d) for d in get_mod_inlinelist()) @functools.lru_cache(None) diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py index 148e7fa202d34..404c4b8972f90 100644 --- a/torch/_dynamo/symbolic_convert.py +++ b/torch/_dynamo/symbolic_convert.py @@ -1189,6 +1189,21 @@ def CALL_FUNCTION_KW(self, inst): assert len(kwargs) == len(argnames) self.call_function(fn, args, kwargs) + def LOAD_METHOD_SUPER(self, inst): + self.CALL_FUNCTION(dataclasses.replace(inst, argval=2)) + arg = inst.argval[0] + argval = self.code_options["co_names"][arg] + if sys.version_info < (3, 11): + self.LOAD_ATTR(dataclasses.replace(inst, argval=argval)) + else: + self.LOAD_METHOD(dataclasses.replace(inst, argval=argval)) + + def LOAD_ATTR_SUPER(self, inst): + self.CALL_FUNCTION(dataclasses.replace(inst, argval=2)) + arg = inst.argval[0] + argval = self.code_options["co_names"][arg] + self.LOAD_ATTR(dataclasses.replace(inst, argval=argval)) + def LOAD_METHOD(self, inst): self.LOAD_ATTR(inst) obj = self.pop() @@ -2242,7 +2257,7 @@ def check_inlineable(func): except NotImplementedError: pass # closures - result = skipfiles.check_verbose(func, extra_check=True) + result = skipfiles.check_verbose(func, allow_torch=True) if result.skipped: from torch._dynamo.variables.misc import ( produce_trampoline_autograd_apply, diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py index 5eaa3d022a26c..88b33fa28c8c2 100644 --- a/torch/_dynamo/utils.py +++ b/torch/_dynamo/utils.py @@ -1337,6 +1337,15 @@ def get_debug_dir(): return _get_debug_dir(debug_root) +def extract_fake_example_value(node, required=True): + if "example_value" in node.meta and is_fake(node.meta["example_value"]): + return node.meta["example_value"] + elif required: + unimplemented("`FakeTensor` example value was required but not available") + else: + return None + + def get_fake_value(node, tx): """ Run the computation represented by `node` using fake tensors and return the result. 
@@ -1351,6 +1360,10 @@ def get_fake_value(node, tx): op = node.op + # FX Node should always return the same value + if "example_value" in node.meta and is_fake(node.meta["example_value"]): + return node.meta["example_value"] + def fake_wrapper(e): if isinstance(e, torch.Tensor): assert is_fake(e) @@ -2184,7 +2197,7 @@ def is_tensor_base_attr_getter(value): def has_torch_function(vt: "torch._dynamo.variables.base.VariableTracker") -> bool: from torch._dynamo.variables import UserDefinedObjectVariable - from torch._dynamo.variables.tensor import TensorWithTFOverrideVariable + from torch._dynamo.variables.torch_function import TensorWithTFOverrideVariable return isinstance(vt, TensorWithTFOverrideVariable) or ( isinstance(vt, UserDefinedObjectVariable) diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py index 9acd49790d48d..7962d52e988c7 100644 --- a/torch/_dynamo/variables/builder.py +++ b/torch/_dynamo/variables/builder.py @@ -133,10 +133,10 @@ SymNodeVariable, TensorSubclassVariable, TensorVariable, - TensorWithTFOverrideVariable, UnspecializedPythonVariable, ) from .torch import tensor_dunder_fns, torch_special_class_types, TorchVariable +from .torch_function import TensorWithTFOverrideVariable from .user_defined import ( KeyedJaggedTensorVariable, UserDefinedClassVariable, @@ -727,6 +727,7 @@ def index_source(key): istype(value, (type, types.FunctionType)) and skipfiles.check(value, allow_torch=True) and not inspect.getattr_static(value, "_torchdynamo_inline", False) + and not inspect.getattr_static(value, "__script_if_tracing_wrapper", False) ): return SkipFilesVariable( value, diff --git a/torch/_dynamo/variables/builtin.py b/torch/_dynamo/variables/builtin.py index 761434cc137c4..7fb525f9c9863 100644 --- a/torch/_dynamo/variables/builtin.py +++ b/torch/_dynamo/variables/builtin.py @@ -27,6 +27,7 @@ check_constant_args, check_numpy_ndarray_args, check_unspec_python_args, + extract_fake_example_value, get_fake_value, guard_if_dyn, is_utils_checkpoint, @@ -575,14 +576,20 @@ def call_function( # Handle cases like int(torch.seed()) # Also handle sym_float to sym_int cases - if self.fn in (int, float) and isinstance(args[0], SymNodeVariable): + if self.fn in (int, float) and isinstance( + args[0], (SymNodeVariable, variables.TensorVariable) + ): + if isinstance(args[0], variables.TensorVariable): + item = args[0].call_method(tx, "item", [], {}) + else: + item = args[0] fn_ = sym_int if self.fn is int else sym_float out = wrap_fx_proxy( tx=tx, proxy=tx.output.create_proxy( "call_function", fn_, - (args[0].as_proxy(),), + (item.as_proxy(),), {}, ), **options, @@ -1425,6 +1432,12 @@ def _unimplemented(): if isinstance(left, TensorVariable): from .builder import wrap_fx_proxy_cls + if op is operator.is_ and isinstance(right, TensorVariable): + return ConstantVariable.create( + id(extract_fake_example_value(left.as_proxy().node)) + == id(extract_fake_example_value(right.as_proxy().node)) + ) + if op not in supported_tensor_comparison_ops.values(): _unimplemented() if ( @@ -1466,7 +1479,6 @@ def _unimplemented(): right, UserDefinedObjectVariable ): return ConstantVariable.create(op(left.value, right.value)) - if op.__name__ == "is_": # If the two objects are of different type, we can safely return False if type(left) is not type(right): diff --git a/torch/_dynamo/variables/lists.py b/torch/_dynamo/variables/lists.py index 095c046a1f1fd..9d1d8675ff9a4 100644 --- a/torch/_dynamo/variables/lists.py +++ b/torch/_dynamo/variables/lists.py @@ -805,8 +805,12 @@ def 
_as_set_element(self, vt): assert isinstance(vt, VariableTracker) if isinstance(vt, TensorVariable): - tensor_node = vt.as_proxy().node - return SetVariable.SetElement(vt, tensor_node) + fake_tensor = vt.as_proxy().node.meta.get("example_value") + if fake_tensor is None: + unimplemented( + "Cannot check Tensor object identity without its fake value" + ) + return SetVariable.SetElement(vt, fake_tensor) if isinstance(vt, ConstantVariable): return SetVariable.SetElement(vt, vt.value) diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py index 5bb16debf35ca..cd5367f64b907 100644 --- a/torch/_dynamo/variables/tensor.py +++ b/torch/_dynamo/variables/tensor.py @@ -589,37 +589,8 @@ def has_bool_key(v): ) return ConstantVariable.create(None, **options) elif name in ("resize_", "resize_as_"): - if "memory_format" in kwargs: - memory_format = kwargs["memory_format"].as_python_constant() - else: - memory_format = torch.contiguous_format - - if name == "resize_": - self.size = args[0].as_python_constant() - self.is_contiguous = (memory_format,) - else: - assert isinstance(args[0], TensorVariable) - if self.size and args[0].size: - if ( - self.size == args[0].size - or memory_format is torch.preserve_format - ): - self.is_contiguous = args[0].is_contiguous - else: - self.size = args[0].size - self.stride = args[0].stride - self.ndim = args[0].ndim - self.is_contiguous = (memory_format,) - - return wrap_fx_proxy( - tx, - tx.output.create_proxy( - "call_method", - name, - *proxy_args_kwargs([self] + list(args), kwargs), - ), - **options, - ) + # Handling resizing in its full generality is difficult. + unimplemented(f"Tensor.{name}") elif ( name == "add_" and len(args) == 1 and len(kwargs) == 1 and "alpha" in kwargs ): @@ -845,95 +816,6 @@ def call_method( ) -class TensorWithTFOverrideVariable(VariableTracker): - """ - Represents a tensor subclass instance with a __torch_function__ override. 
- """ - - @staticmethod - def create( - tx, - tensor_variable, - torch_function_fn, - subclass_type, - **kwargs, - ): - var = TensorWithTFOverrideVariable( - tensor_variable, - torch_function_fn, - subclass_type, - **kwargs, - ) - # stash the subclass type to rewrap an output tensor if needed - if var.global_mangled_class_name() not in tx.output.global_scope: - tx.output.install_global(var.global_mangled_class_name(), subclass_type) - - return var - - def __init__( - self, - tensor_variable, - torch_function_fn, - subclass_type, - **kwargs, - ): - super().__init__(**kwargs) - self.tensor_variable = tensor_variable - self.torch_function_fn = torch_function_fn - self.subclass_type = subclass_type - - def as_proxy(self): - return self.tensor_variable.as_proxy() - - def python_type(self): - return self.subclass_type - - def subclass_type_var(self): - from ..source import GlobalSource - from .user_defined import UserDefinedClassVariable - - return UserDefinedClassVariable( - self.subclass_type, source=GlobalSource(self.global_mangled_class_name()) - ) - - def global_mangled_class_name(self): - return f"__subclass_{self.subclass_type.__name__}_{id(self.subclass_type)}" - - def call_torch_function(self, tx, fn, types, args, kwargs): - from .torch_function import call_torch_function - - return call_torch_function( - tx, - self.subclass_type_var(), - self.torch_function_fn, - fn, - types, - args, - kwargs, - ) - - def call_method( - self, - tx, - name, - args: "List[VariableTracker]", - kwargs: "Dict[str, VariableTracker]", - ) -> "VariableTracker": - # This code block implements inlining the __torch_function__ override - # of `call_method`. - if tx.output.torch_function_enabled: - import torch - from .builder import SourcelessBuilder - from .torch_function import dispatch_torch_function - - # [Note: __torch_function__] Currently we only support methods that are defined on tensor - # we will graph break in other cases this will need a bigger overhaul of extracting methods/comparing them for equality - func_var = SourcelessBuilder()(tx, getattr(torch.Tensor, name)) - return dispatch_torch_function(tx, func_var, [self] + args, kwargs) - else: - return self.tensor_variable.call_method(tx, name, args, kwargs) - - class NumpyNdarrayVariable(TensorVariable): """ Represents an np.ndarray, but backed by torch Tensor via torch._numpy.ndarray. 
@@ -1093,6 +975,7 @@ def call_function( ) -> VariableTracker: if len(args) == 1 and isinstance(args[0], TensorVariable): from .builder import VariableBuilder + from .torch_function import TensorWithTFOverrideVariable torch_fn = VariableBuilder( tx, AttrSource(self.source, "__torch_function__") diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py index cba3a86ac59da..dc71535745daa 100644 --- a/torch/_dynamo/variables/torch.py +++ b/torch/_dynamo/variables/torch.py @@ -41,8 +41,11 @@ from .distributed import is_constant_pg_functions, is_from_local, ProcessGroupVariable from .higher_order_ops import TorchHigherOrderOperatorVariable from .lists import ListVariable, TupleVariable -from .tensor import TensorWithTFOverrideVariable -from .torch_function import can_dispatch_torch_function, dispatch_torch_function +from .torch_function import ( + can_dispatch_torch_function, + dispatch_torch_function, + TensorWithTFOverrideVariable, +) log = logging.getLogger(__name__) diff --git a/torch/_dynamo/variables/torch_function.py b/torch/_dynamo/variables/torch_function.py index c317e39d682ec..a54907a71697a 100644 --- a/torch/_dynamo/variables/torch_function.py +++ b/torch/_dynamo/variables/torch_function.py @@ -2,6 +2,7 @@ from torch.utils._pytree import tree_flatten from ..exc import unimplemented from ..utils import is_tensor_base_attr_getter +from .base import VariableTracker from .constant import ConstantVariable from .lists import TupleVariable @@ -48,8 +49,6 @@ def call_torch_function( def can_dispatch_torch_function(tx, args, kwargs): - from .tensor import TensorWithTFOverrideVariable - if tx.output.torch_function_enabled: all_args = tree_flatten(args)[0] + tree_flatten(kwargs)[0] return any(isinstance(arg, TensorWithTFOverrideVariable) for arg in all_args) @@ -59,7 +58,6 @@ def can_dispatch_torch_function(tx, args, kwargs): def dispatch_torch_function(tx, fn, args, kwargs): """Gathers all args that are TensorWithTFOverrideVariable and dispatches based on the ordering in _get_overloaded_args""" - from .tensor import TensorWithTFOverrideVariable all_args = tree_flatten(args)[0] + tree_flatten(kwargs)[0] overloaded_args = _get_overloaded_args( @@ -82,3 +80,89 @@ def dispatch_torch_function(tx, fn, args, kwargs): unimplemented( f"All __torch_function__ overrides for call {fn} with args {args} and kwargs {kwargs} returned NotImplemented" ) + + +class TensorWithTFOverrideVariable(VariableTracker): + """ + Represents a tensor subclass instance with a __torch_function__ override. 
+ """ + + @staticmethod + def create( + tx, + tensor_variable, + torch_function_fn, + subclass_type, + **kwargs, + ): + var = TensorWithTFOverrideVariable( + tensor_variable, + torch_function_fn, + subclass_type, + **kwargs, + ) + # stash the subclass type to rewrap an output tensor if needed + if var.global_mangled_class_name() not in tx.output.global_scope: + tx.output.install_global(var.global_mangled_class_name(), subclass_type) + + return var + + def __init__( + self, + tensor_variable, + torch_function_fn, + subclass_type, + **kwargs, + ): + super().__init__(**kwargs) + self.tensor_variable = tensor_variable + self.torch_function_fn = torch_function_fn + self.subclass_type = subclass_type + + def as_proxy(self): + return self.tensor_variable.as_proxy() + + def python_type(self): + return self.subclass_type + + def subclass_type_var(self): + from ..source import GlobalSource + from .user_defined import UserDefinedClassVariable + + return UserDefinedClassVariable( + self.subclass_type, source=GlobalSource(self.global_mangled_class_name()) + ) + + def global_mangled_class_name(self): + return f"__subclass_{self.subclass_type.__name__}_{id(self.subclass_type)}" + + def call_torch_function(self, tx, fn, types, args, kwargs): + return call_torch_function( + tx, + self.subclass_type_var(), + self.torch_function_fn, + fn, + types, + args, + kwargs, + ) + + def call_method( + self, + tx, + name, + args: "List[VariableTracker]", + kwargs: "Dict[str, VariableTracker]", + ) -> "VariableTracker": + # This code block implements inlining the __torch_function__ override + # of `call_method`. + if tx.output.torch_function_enabled: + import torch + from .builder import SourcelessBuilder + + # [Note: __torch_function__] Currently we only support methods that are defined on tensor + # we will graph break in other cases this will need a bigger overhaul of extracting methods/comparing them for equality + func_var = SourcelessBuilder()(tx, getattr(torch.Tensor, name)) + return dispatch_torch_function(tx, func_var, [self] + args, kwargs) + else: + return self.tensor_variable.call_method(tx, name, args, kwargs) diff --git a/torch/_functorch/aot_autograd.py b/torch/_functorch/aot_autograd.py index 541a6e1600450..82d897f4596f2 100644 --- a/torch/_functorch/aot_autograd.py +++ b/torch/_functorch/aot_autograd.py @@ -1601,7 +1601,7 @@ def inner_fn(*args): # This is annoying: our joint function needs to be aware of functionalization # (syncing mutated inputs before calling autograd.grad()) # In theory, we could make the autograd engine do this automatically, although that probably isn't any cleaner. - for i, arg in enumerate(args_maybe_cloned): + for arg in args_maybe_cloned: if not isinstance(arg, Tensor): continue sync_functional_tensor(arg) @@ -2999,22 +2999,6 @@ def runtime_wrapper(*args): continue original_inpt = args[inpt_idx] updated_inpt = updated_inputs[i] - # TODO: add better resize_() support for autograd case. - # Check for the case when an input has been resized. - # Note: One important thing to check for is user code that calls inpt.storage().resize_(). - # We can't trace operations on storage into the graph, so we should get dynamo to graph break. - # TODO: handle resize_() on inputs to a larger size. - # This is actually non-trivial to detect, so we should probably just handle it - # (or make dynamo detect). 
- # We can't just check of original_inpt.storage_size != updated_inpt.storage_size, - # Because the original_inpt might be a view of some larger tensor, - # and updated_inpt is always densely packed. - if not trace_joint and original_inpt.untyped_storage().size() != updated_inpt.untyped_storage().size(): - # It actually isn't enough just to see if the storage sizes are different between old and new inputs. - # If the original input was a slice into some larger storage, the same will not be true for the updated input. - # So before doing the resize_(), we **also** check that functionalization detected a metadata mutation. - if meta.mutates_metadata: - original_inpt.resize_(updated_inpt.size()) if meta.mutates_metadata and not meta.mutates_data: if trace_joint: assert isinstance(updated_inpt, TensorAlias) diff --git a/torch/_guards.py b/torch/_guards.py index 5dc85af1a79c5..e532a32cdd2ac 100644 --- a/torch/_guards.py +++ b/torch/_guards.py @@ -77,29 +77,6 @@ class GuardSource(enum.Enum): LOCAL_FSDP_MODULE = 7 GLOBAL_FSDP_MODULE = 8 - def select(self, locals_, globals_): - # SHAPE_ENV counts as locals, because the guard expressions - # created by shape env can reference f_locals - # - # RANDOM_VALUE counts as locals, because what we do is we run - # Python RNG and assign it to a temporary, and then perform - # guard tests on that temporary - if self in ( - GuardSource.LOCAL, - GuardSource.LOCAL_NN_MODULE, - GuardSource.LOCAL_FSDP_MODULE, - GuardSource.SHAPE_ENV, - GuardSource.RANDOM_VALUE, - ): - return locals_ - if self in ( - GuardSource.GLOBAL, - GuardSource.GLOBAL_NN_MODULE, - GuardSource.GLOBAL_FSDP_MODULE, - ): - return globals_ - raise NotImplementedError(str(self)) - def is_fsdp_module(self) -> bool: return self in (GuardSource.GLOBAL_FSDP_MODULE, GuardSource.LOCAL_FSDP_MODULE) @@ -255,8 +232,8 @@ def __str__(self): output += f" Guarded Class Weakref: {self.guarded_class_weakref}\n" return output - def create(self, local_builder: GuardBuilderBase, global_builder: GuardBuilderBase): - return self.create_fn(self.source.select(local_builder, global_builder), self) + def create(self, builder: GuardBuilderBase): + return self.create_fn(builder, self) def is_nn_module(self): return self.source.is_nn_module() diff --git a/torch/_inductor/bounds.py b/torch/_inductor/bounds.py index 9896c499bc987..82f85b031c7d0 100644 --- a/torch/_inductor/bounds.py +++ b/torch/_inductor/bounds.py @@ -67,9 +67,18 @@ def swap_submodules( subblock = self.loop_body.subblocks[key] # The result within the lambda will reference to the final # set of modules at the end of the for-loop as it stores a reference to it - result[key] = lambda mask, value: self.masked_subblock( - subblock, self._bounds, mask, value, result - ) + + # bind subblock in a function because python lambdas close over by reference + # moving the lambda out of make_fn would close over the reference to subblock, + # so all lambdas would have the same subblock reference that is the final + # subblock in the loop + def make_fn(subblock): + return lambda mask, value: self.masked_subblock( + subblock, self._bounds, mask, value, result + ) + + result[key] = make_fn(subblock) + else: assert "set_indirect" in key idx = int(key[len("set_indirect") :]) diff --git a/torch/_inductor/codegen/wrapper.py b/torch/_inductor/codegen/wrapper.py index 010f48f52bdb6..f8e81b7cb0621 100644 --- a/torch/_inductor/codegen/wrapper.py +++ b/torch/_inductor/codegen/wrapper.py @@ -6,7 +6,7 @@ import os import re from itertools import count -from typing import Any, Dict, List, 
Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Union import sympy from sympy import Expr @@ -354,6 +354,7 @@ def write_header(self): from torch._inductor.select_algorithm import extern_kernels aten = torch.ops.aten + inductor_ops = torch.ops.inductor assert_size_stride = torch._C._dynamo.guards.assert_size_stride reinterpret_tensor = torch.ops.inductor._reinterpret_tensor async_compile = AsyncCompile() @@ -776,7 +777,7 @@ def define_user_defined_triton_kernel(self, kernel, kwargs): from ..ir import Buffer from .common import SizeArg, TensorArg - signature = [] + signature: List[Union[TensorArg, SizeArg]] = [] constants = {} for key, arg in kwargs.items(): # Not a real argument diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py index 6167a7c917bf2..3970c9c6dd63e 100644 --- a/torch/_inductor/config.py +++ b/torch/_inductor/config.py @@ -201,6 +201,9 @@ # how many nodes to allow into a single fusion max_fusion_size = 64 +# max number of inputs to generate cat as a pointwise op with masked laods +max_pointwise_cat_inputs = 4 + # replace small reductions with pointwise, disable with `= 1` unroll_reductions_threshold = 8 diff --git a/torch/_inductor/fx_passes/post_grad.py b/torch/_inductor/fx_passes/post_grad.py index 890d504ffc81a..3e7f31621f8e6 100644 --- a/torch/_inductor/fx_passes/post_grad.py +++ b/torch/_inductor/fx_passes/post_grad.py @@ -35,7 +35,7 @@ register_graph_pattern, stable_topological_sort, ) -from ..utils import decode_device, is_view +from ..utils import decode_device, is_pointwise_use from ..virtualized import V from .group_batch_fusion import group_batch_fusion_post_grad_passes @@ -784,21 +784,6 @@ def view_to_reshape(gm): nd.target = torch.ops.aten.reshape.default -def is_pointwise_use(use): - if not use.op == "call_function": - return False - - if not ( - isinstance(use.target, torch._ops.OpOverload) or use.target is operator.getitem - ): - return False - - if use.target is operator.getitem or is_view(use.target): - return all(is_pointwise_use(u) for u in use.users) - - return torch.Tag.pointwise in use.target.tags - - def should_prefer_unfused_addmm(match): inp = match.kwargs["inp"] if not inp.meta["val"].is_cuda: diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py index 0d8ef4f452c72..02113274b48f2 100644 --- a/torch/_inductor/graph.py +++ b/torch/_inductor/graph.py @@ -732,7 +732,9 @@ def run_node(self, n: torch.fx.Node): if n.op == "call_function": args, kwargs = self.fetch_args_kwargs_from_env(n) origins |= gather_origins(args, kwargs) - with ir.IRNode.current_origins(origins), self.set_current_node(n): + with ir.IRNode.current_origins(origins), self.set_current_node( + n + ), V.set_current_node(n): if ( n.op == "call_function" and n.target is not operator.getitem diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py index 1f54546bc290e..990ced46c3c3f 100644 --- a/torch/_inductor/ir.py +++ b/torch/_inductor/ir.py @@ -3238,6 +3238,16 @@ def create(cls, inputs, dim): return kernel + @classmethod + def can_realize_into_without_copy(cls, src): + if isinstance(src, TensorBox): + # unwrap a TensorBox + return cls.can_realize_into_without_copy(src.data) + + return isinstance(src.data.layout, FlexibleLayout) and not isinstance( + src.data, ExternKernelAlloc + ) + @classmethod def realize_into(cls, src, dst): # Attempt to turn this into a ReinterpretView rather than assert. 
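The make_fn indirection added in torch/_inductor/bounds.py earlier in this patch works around Python's late-binding closures. A tiny standalone illustration of the pitfall and of the fix:

# every lambda below closes over the *variable* i, not its value at creation time
late = [lambda: i for i in range(3)]
print([f() for f in late])   # [2, 2, 2]

# passing the value through an extra function binds it per iteration,
# which is what make_fn does for `subblock`
def make_fn(i):
    return lambda: i

bound = [make_fn(i) for i in range(3)]
print([f() for f in bound])  # [0, 1, 2]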
@@ -3255,9 +3265,7 @@ def realize_into(cls, src, dst): src.realize() # ExternKernelAlloc has specific requirements for output layout, should create a copy assert hasattr(src.data, "layout") - if isinstance(src.data.layout, FlexibleLayout) and not isinstance( - src.data, ExternKernelAlloc - ): + if cls.can_realize_into_without_copy(src): src.data.layout = AliasedLayout(dst) return src.data # introduce a copy @@ -3815,6 +3823,33 @@ def __init__(self, x, *constant_args): self.name = V.graph.register_buffer(self) +class AccumulateGrad(ExternKernel): + """ + This needs to be a custom class to handle mutation properly + """ + + kernel = "inductor_ops.accumulate_grad_" + + def codegen(self, wrapper): + (variable, new_grad) = (t.codegen_reference() for t in self.inputs) + wrapper.writeline(f"{self.kernel}({variable}, {new_grad})") + + def should_allocate(self): + return False + + def get_mutation_names(self): + assert isinstance(self.layout, MutationLayout) + return (self.layout.target.get_name(),) + + def __init__(self, variable, new_grad): + super().__init__( + None, + MutationLayout(variable), + self.unwrap_storage([variable, new_grad]), + ) + self.name = V.graph.register_buffer(self) + + class ScatterFallback(ExternKernel): """ This needs to be a custom class to handle mutation properly. diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py index 9bc59ea32b7d5..972bfeb255cde 100644 --- a/torch/_inductor/lowering.py +++ b/torch/_inductor/lowering.py @@ -48,7 +48,14 @@ validate_ir, View, ) -from .utils import ceildiv, decode_device, is_dynamic, pad_listlike, sympy_product +from .utils import ( + ceildiv, + decode_device, + is_dynamic, + is_pointwise_use, + pad_listlike, + sympy_product, +) from .virtualized import ops, V log = logging.getLogger(__name__) @@ -947,6 +954,76 @@ def as_strided_copy(x, size, stride, storage_offset=None): return clone(result) +def pointwise_cat(inputs, dim=0): + # (inclusive, exclusive) + inputs_ranges: List[Tuple[sympy.Expr, sympy.Expr]] = [] + prev_end = 0 + for inp in inputs: + inputs_ranges.append((prev_end, prev_end + inp.get_size()[dim])) + prev_end = inputs_ranges[-1][-1] + + inputs_loaders = [inp.make_loader() for inp in inputs] + + def inner_fn(idx): + idx_dim = ops.index_expr(idx[dim], torch.int64) + + masks = [] + masked_loads = [] + for i in range(len(inputs)): + start = ( + ops.constant(0, torch.int64) + if i == 0 + else ops.index_expr(inputs_ranges[i][0], torch.int64) + ) + end = ops.index_expr(inputs_ranges[i][1], torch.int64) + + start_cond = ops.ge(idx_dim, start) + end_cond = ops.lt(idx_dim, end) + if i == 0: + mask = end_cond + elif i == len(inputs) - 1: + mask = start_cond + else: + mask = ops.and_(start_cond, end_cond) + + masks.append(mask) + idx_load = list(idx) + + # if we're concatting [4], [2] + # when we index the second tensor for 5 we want to index 5 - 4 + idx_load[dim] -= inputs_ranges[i][0] + + masked_loads.append( + ops.masked( + mask, + lambda: inputs_loaders[i](idx_load), + 0.0, # this value should be unused + ), + ) + + def get_masked_val(i): + if i != len(inputs) - 1: + return ops.where( + masks[i], + masked_loads[i], + get_masked_val(i + 1), + ) + else: + return masked_loads[-1] + + return get_masked_val(0) + + new_size = list(inputs[0].get_size()) + new_size[dim] = inputs_ranges[-1][-1] + + return Pointwise.create( + device=inputs[0].get_device(), + dtype=inputs[0].get_dtype(), + inner_fn=inner_fn, + ranges=new_size, + ) + + @register_lowering(aten.cat) def cat(inputs, dim=0): if all(input.get_dtype() is 
torch.uint8 for input in inputs): @@ -966,6 +1043,36 @@ def cat(inputs, dim=0): *inputs, type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT ) inputs = [to_dtype(inp, dtype) for inp in inputs] + + def should_lower_cat_input(x) -> bool: + # Unrealized inputs will not be storage and layouts, and we dont want to realize + # them in case we want to fuse + if ir.is_storage_and_layout(x): + storage, _ = ir.as_storage_and_layout(x, freeze=False) + return not ir.ConcatKernel.can_realize_into_without_copy(storage) + + if isinstance(x, TensorBox): + if isinstance(x.data, ir.BaseView): + return should_lower_cat_input(x.data.unwrap_view()) + else: + return should_lower_cat_input(x.data) + + if isinstance(x, ir.StorageBox): + return should_lower_cat_input(x.data) + + if isinstance(x, ir.Pointwise): + return True + + return False + + if len(inputs) <= config.max_pointwise_cat_inputs: + pointwise_uses = all(is_pointwise_use(use) for use in V.current_node.users) + all_pointwise_inputs = all(should_lower_cat_input(inp) for inp in inputs) + any_pointwise_inputs = any(should_lower_cat_input(inp) for inp in inputs) + + if all_pointwise_inputs or (any_pointwise_inputs and pointwise_uses): + return pointwise_cat(inputs, dim) + return TensorBox(ir.ConcatKernel.create(inputs, dim)) @@ -4214,7 +4321,6 @@ def loader(index, reduction_index): assert len(reduction_index) == len(reduced_idx) if keepdims: assert len(index) == len(size) - assert all(index[i] == 0 for i in reduced_idx) index = [index[i] for i in kept_idx] assert len(index) == len(kept_idx) new_index = [None] * (len(index) + len(reduction_index)) @@ -4864,6 +4970,15 @@ def _realize(x): return clone(x) +@register_lowering(torch.ops.inductor.accumulate_grad_) +def accumulate_grad_(variable, new_grad): + # TODO(jansel): decompose into `variable.grad += new_grad` when variable.grad is defined + variable.realize() + new_grad.realize() + ir.AccumulateGrad(variable, new_grad) + return variable + + @register_lowering(triton_kernel_wrapper_mutation) def triton_kernel_wrap_(*, kernel_idx, grid, kwargs): ir.UserDefinedTritonKernel(kernel_idx=kernel_idx, grid=grid, kernel_args=kwargs) diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py index 801f6ab3ac6ea..66ab71646ac11 100644 --- a/torch/_inductor/utils.py +++ b/torch/_inductor/utils.py @@ -264,6 +264,21 @@ def is_view(op: torch._ops.OpOverload): return any(a.alias_info is not None for a in op._schema.arguments) +def is_pointwise_use(use): + if not use.op == "call_function": + return False + + if not ( + isinstance(use.target, torch._ops.OpOverload) or use.target is operator.getitem + ): + return False + + if use.target is operator.getitem or is_view(use.target): + return all(is_pointwise_use(u) for u in use.users) + + return torch.Tag.pointwise in use.target.tags + + def gen_gm_and_inputs(target, args, kwargs): g = torch.fx.Graph() g_args = [] diff --git a/torch/_inductor/virtualized.py b/torch/_inductor/virtualized.py index 6c35928a4bf77..ce6438e52e979 100644 --- a/torch/_inductor/virtualized.py +++ b/torch/_inductor/virtualized.py @@ -173,6 +173,7 @@ def __getattr__(self, item): _debug = Virtualized("debug", NullHandler) _interpreter = Virtualized("interpreter", NullHandler) _aot_compilation = Virtualized("aot_compilation", NullHandler) +_current_node = Virtualized("current_node", NullHandler) class OpsValue: @@ -281,6 +282,8 @@ class _V: set_interpreter_handler: Callable[[Any], Any] = _interpreter._set_handler set_aot_compilation: Callable[[Any], Any] = _aot_compilation._set_handler 
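The `pointwise_cat` lowering above turns concatenation into a single pointwise kernel: each output index along `dim` selects, through a chain of masks and `where`s, a masked load from whichever input owns that index, shifted into that input's local coordinates. A rough eager-mode sketch of the same idea using plain PyTorch ops instead of inductor's `ops.*` handlers (the gather-based load and the helper name are illustrative, not the generated code):

```python
import torch

def pointwise_cat_sketch(inputs, dim=0):
    # (inclusive, exclusive) ranges each input occupies along `dim`
    ranges, prev = [], 0
    for t in inputs:
        ranges.append((prev, prev + t.shape[dim]))
        prev = ranges[-1][1]

    out_shape = list(inputs[0].shape)
    out_shape[dim] = prev
    # output indices along `dim`, shaped to broadcast against the output
    idx = torch.arange(prev).view([-1 if d == dim else 1 for d in range(len(out_shape))])

    # Fold inputs together back-to-front with `where`, shifting the output index
    # into each input's local coordinates -- the masked-load structure above.
    result = None
    for t, (start, end) in reversed(list(zip(inputs, ranges))):
        local = (idx - start).clamp(0, t.shape[dim] - 1)
        gathered = torch.gather(t, dim, local.expand(*out_shape))
        mask = (idx >= start) & (idx < end)
        result = gathered if result is None else torch.where(mask, gathered, result)
    return result

a, b = torch.randn(2, 4), torch.randn(2, 3)
assert torch.equal(pointwise_cat_sketch([a, b], dim=1), torch.cat([a, b], dim=1))
```

Per the new `cat` lowering, this path is only taken for at most `max_pointwise_cat_inputs` inputs, and only when every input would lower pointwise or when some do and every user of the cat is pointwise, so the masked loads can fuse instead of forcing a copy through `ConcatKernel`.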
get_aot_compilation: Callable[[], Any] = _aot_compilation._get_handler + set_current_node: Callable[[Any], Any] = _current_node._set_handler + get_current_node: Callable[[], Any] = _current_node._get_handler @property def ops(self) -> _MockHandler: @@ -319,5 +322,9 @@ def interpreter(self): def aot_compilation(self): return _aot_compilation._get_handler() + @property + def current_node(self): + return _current_node._get_handler() + V = _V() diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py index d4e1d68091a63..c20ccc0f7aeaf 100644 --- a/torch/_meta_registrations.py +++ b/torch/_meta_registrations.py @@ -352,9 +352,10 @@ def meta_copy_(self, src, non_blocking=False): "more than one element of the written-to tensor refers to a single memory location" ) - intermediate = src.to(self, non_blocking) - if self.size() != intermediate.size(): - aten.expand_copy.default(intermediate, self.size()) + if isinstance(src, Tensor): + intermediate = src.to(self, non_blocking) + if self.size() != intermediate.size(): + aten.expand_copy.default(intermediate, self.size()) return self diff --git a/torch/ao/quantization/fx/_model_report/model_report_visualizer.py b/torch/ao/quantization/fx/_model_report/model_report_visualizer.py index 8e04338446dab..5463862aa1cd4 100644 --- a/torch/ao/quantization/fx/_model_report/model_report_visualizer.py +++ b/torch/ao/quantization/fx/_model_report/model_report_visualizer.py @@ -252,7 +252,7 @@ def _generate_channels_table( if len(channel_features) > 0: # now we add all channel data - for index, module_fqn in enumerate(filtered_data): + for module_fqn in filtered_data: # we iterate over all channels for channel in range(num_channels): # we make a new row for the channel @@ -644,7 +644,7 @@ def generate_histogram_visualization(self, feature_filter: str, module_fqn_filte # set the legend as well # combine all the data all_data = [] - for index, channel_info in enumerate(y_data): + for channel_info in y_data: all_data.extend(channel_info) val, bins, _ = plt.hist( diff --git a/torch/ao/quantization/pt2e/prepare.py b/torch/ao/quantization/pt2e/prepare.py index bae416b6394e0..32ed0b4bd8b1a 100644 --- a/torch/ao/quantization/pt2e/prepare.py +++ b/torch/ao/quantization/pt2e/prepare.py @@ -5,13 +5,13 @@ _get_output_act_obs_or_fq, _get_dtype_and_is_dynamic, _insert_obs_or_fq, - _maybe_insert_output_observer_for_node, _save_state, _is_activation_post_process_node, - _get_qspec_for_arg, + _create_obs_or_fq_from_qspec, ) from torch.fx import ( GraphModule, + Graph, Node, ) from torch.fx.node import Argument @@ -19,14 +19,217 @@ from torch.ao.quantization import QConfigMapping from torch.ao.quantization.qconfig import QConfigAny from torch.ao.quantization.fx.custom_config import PrepareCustomConfig -from typing import Dict, Tuple, Union, Any +from typing import Dict, Tuple, Union, Any, Optional from torch.ao.quantization.quantizer import ( QuantizationAnnotation, EdgeOrNode, SharedQuantizationSpec, + QuantizationSpecBase, ) from torch.ao.quantization import ObserverOrFakeQuantize +# TODO: make pt2e folder private? 
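For orientation, the observer-sharing rework in `torch/ao/quantization/pt2e/prepare.py` below sits inside the usual PT2E quantization flow. A hedged end-to-end sketch, where the capture entry point (`capture_pre_autograd_graph`) and the `XNNPACKQuantizer` are assumed from the same-era API and may differ in other builds:

```python
import torch
from torch._export import capture_pre_autograd_graph  # assumed capture entry point
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
from torch.ao.quantization.quantizer.xnnpack_quantizer import (  # assumed quantizer
    XNNPACKQuantizer,
    get_symmetric_quantization_config,
)

class M(torch.nn.Module):
    def forward(self, x, y):
        return torch.cat([x, y], dim=1)

example_inputs = (torch.randn(1, 3, 8, 8), torch.randn(1, 3, 8, 8))
m = capture_pre_autograd_graph(M(), example_inputs)

quantizer = XNNPACKQuantizer()
quantizer.set_global(get_symmetric_quantization_config())
m = prepare_pt2e(m, quantizer)  # builds the edge/node -> observer map described below
m(*example_inputs)              # calibration pass
m = convert_pt2e(m)
```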
+__all__ = [ + "prepare", +] + +def _find_root(edge_or_node: EdgeOrNode, shared_with_map: Dict[EdgeOrNode, EdgeOrNode]) -> EdgeOrNode: + """Find the root node for the sharing tree + Args: + edge_or_node: edge/node that we want to find the root + shared_with_map: each edge/node points to the parent, the root node will points to itself + + Returns: + root edge/node + """ + parent = shared_with_map[edge_or_node] + if parent == edge_or_node: + return edge_or_node + root = _find_root(parent, shared_with_map) + # path compression + shared_with_map[edge_or_node] = root + return root + +def _union(parent: EdgeOrNode, child: EdgeOrNode, shared_with_map: Dict[EdgeOrNode, EdgeOrNode]) -> None: + """Merge the subtree for `child` with `parent`, the order is important here + """ + root_parent = _find_root(parent, shared_with_map) + root_child = _find_root(child, shared_with_map) + # union the two trees by pointing the root of child to root of parent + shared_with_map[root_child] = root_parent + +def _update_shared_with(edge_or_node: EdgeOrNode, qspec: QuantizationSpecBase, shared_with_map: Dict[EdgeOrNode, EdgeOrNode]): + """Update the `shared_with_map` based on the qspec, this applies the `SharedQuantizationSpec` + configuration and established the relationship between `edge_or_node` with the edge/node that it + is pointing to, we'll use this information in the end to get the group id + """ + if isinstance(qspec, SharedQuantizationSpec): + sharing_with = qspec.edge_or_node + # we point from edge_or_node to the node that it is sharing_with, e.g. + # qspec for a = SharedQuantizationSpec(b) means `a` points to `b` + _union(sharing_with, edge_or_node, shared_with_map) + +def _find_root_qspec( + qspec: QuantizationSpecBase, + edge_or_node_to_qspec: Dict[EdgeOrNode, QuantizationSpecBase], + shared_with_map: Dict[EdgeOrNode, EdgeOrNode] +) -> QuantizationSpecBase: + """Unwraps qspec to get the final root qspec (non SharedQuantizationSpec) + if qspec is SharedQuantizationSpec + (1). tries to find the root node for the node that the qspec points to + (2). 
recursively find the root qspec based on the qspec for the root node + """ + if isinstance(qspec, SharedQuantizationSpec): + sharing_with = qspec.edge_or_node + root = _find_root(sharing_with, shared_with_map) + qspec = edge_or_node_to_qspec[root] + return _find_root_qspec(qspec, edge_or_node_to_qspec, shared_with_map) + return qspec + +def _has_same_dtype(qspec_a: QuantizationSpecBase, qspec_b: QuantizationSpecBase): + return ( + hasattr(qspec_a, "dtype") and + hasattr(qspec_b, "dtype") and + qspec_a.dtype == qspec_b.dtype + ) + +def _has_same_is_dynamic(qspec_a: QuantizationSpecBase, qspec_b: QuantizationSpecBase): + return ( + hasattr(qspec_a, "is_dynamic") and + hasattr(qspec_b, "is_dynamic") and + qspec_a.is_dynamic == qspec_b.is_dynamic + ) + +def _get_edge_or_node_to_qspec(model: torch.fx.GraphModule) -> Dict[EdgeOrNode, QuantizationSpecBase]: + """Get a map from EdgeOrNode to quantization spec based on annotations on the nodes + """ + edge_or_node_to_qspec: Dict[EdgeOrNode, QuantizationSpecBase] = {} + for n in model.graph.nodes: + if hasattr(n, "meta") and "quantization_annotation" in n.meta: + qa = n.meta["quantization_annotation"] + for input_to_n, qspec in qa.input_qspec_map.items(): + input_edge = (input_to_n, n) + edge_or_node_to_qspec[input_edge] = qspec + if qa.output_qspec is not None: + output_node = n + qspec = qa.output_qspec + edge_or_node_to_qspec[output_node] = qspec + return edge_or_node_to_qspec + +def _get_edge_or_node_to_group_id(edge_or_node_to_qspec: Dict[EdgeOrNode, QuantizationSpecBase]) -> Dict[EdgeOrNode, int]: + """Map from edge/node to the group ID, generated from quantization annotations, + edge/node with the same group ID should use the same observer/fake_quant instance + + This is applying SharedQuantizationSpec configuration and map each edge/node to a group + There is another implicit sharing that's built in the quantization, when we have the following: + * op1 -> op2 + * output of op1: int8_qspec + * (op1 -> op2) input edge: int8_qspec + we'll assume sharing between the output of op1 and input of (op1 -> op2) since these are the same Tensor. 
+ + Figuring out the correct group ID for all edge/node is a standard union find problem: + https://www.geeksforgeeks.org/introduction-to-disjoint-set-data-structure-or-union-find-algorithm/ + + Args: + edge_or_node_to_qspec: Dictionary from edge_or_node to the qspec, derived from annotations + Returns: + edge_or_node_to_group_id: Dictionary from edge_or_node to group_id (int), all edge or node that + belongs to the same group should have the same id + + Example: + op2 -> cat1 -> cat2 + op1 / / + op3 + edge_or_node_to_qspec: { + op1: int8_qspec, + op2: int8_qspec, + (op1, cat1): int8_qspc, + (op2, cat1): SharedQuantizationSpec((op1, cat1)), + cat1: SharedQuantizationSpec((op1, cat1)), + (op3, cat2): int8_qspec, + (cat1, cat2): SharedQuantizationSpec((op3, cat2)), + cat2: SharedQuantizationSpec((op3, cat2)), + } + + edge_or_node_to_group_id = _get_edge_or_node_to_group_id(edge_or_node_to_qspec) + edge_or_node_to_group_id: { + op1: 1, + op2: 1, + (op1, cat1): 1, + (op2, cat1): 1, + cat1: 1, + (op3, cat2): 1, + (cat1, cat2): 1, + cat2: 1, + } + # everything are in the same group because (cat1) and (cat1, cat2) are implicitly shared, which + # connects the two sharing group around cat1 and cat2 op due to transitive sharing + """ + # means the observer of key should be shared with observer with value, by default it will + # be shared with itself + shared_with_map: Dict[EdgeOrNode, EdgeOrNode] = {k: k for k in edge_or_node_to_qspec.keys()} + for edge_or_node, qspec in edge_or_node_to_qspec.items(): + if isinstance(edge_or_node, torch.fx.Node): + output_node = edge_or_node + _update_shared_with(output_node, qspec, shared_with_map) + else: + input_edge = edge_or_node + input_edge_root = _find_root(input_edge, shared_with_map) + input_edge_root_qspec = edge_or_node_to_qspec[input_edge_root] + input_edge_root_qspec = _find_root_qspec(input_edge_root_qspec, edge_or_node_to_qspec, shared_with_map) + + # find root_qspec for `arg` Node (the output of previous node) + assert isinstance(input_edge, tuple) + arg, n = input_edge + arg_as_output_root_qspec = None + if arg in edge_or_node_to_qspec: + arg_as_output_qspec = edge_or_node_to_qspec[arg] + arg_as_output_root_qspec = _find_root_qspec(arg_as_output_qspec, edge_or_node_to_qspec, shared_with_map) + # TODO: add assertions for types of root qspecs + if ( + arg_as_output_root_qspec is not None and + _has_same_dtype(arg_as_output_root_qspec, input_edge_root_qspec) and + _has_same_is_dynamic(arg_as_output_root_qspec, input_edge_root_qspec) + ): + # the input arg to the node should reuse the existing output observer for arg + # since dtype is the same (we may want to extend this to be a more strict check + # in the future) + # so we point from `input_edge` to `arg` (output of the argument) + _union(arg, input_edge, shared_with_map) + _update_shared_with(input_edge, qspec, shared_with_map) + + # now that we get the sharing relations between all edges and nodes, we can assingn group ids + cur_group_id = 0 + edge_or_node_to_group_id: Dict[EdgeOrNode, int] = {} + for edge_or_node in shared_with_map.keys(): + root = _find_root(edge_or_node, shared_with_map) + if root not in edge_or_node_to_group_id: + edge_or_node_to_group_id[root] = cur_group_id + cur_group_id += 1 + edge_or_node_to_group_id[edge_or_node] = edge_or_node_to_group_id[root] + + return edge_or_node_to_group_id + +def _get_obs_or_fq_map( + edge_or_node_to_group_id: Dict[EdgeOrNode, int], + edge_or_node_to_qspec: Dict[EdgeOrNode, QuantizationSpecBase], + is_qat: bool +) -> Dict[EdgeOrNode, 
ObserverOrFakeQuantize]: + """Generates the EdgeOrNode to observer/fake_quant instances + Makes sure that for EdgeOrNode that has the same group_id should have the same observer or fake quant + instances + """ + obs_or_fq_map: Dict[EdgeOrNode, ObserverOrFakeQuantize] = {} + group_id_to_obs_or_fq: Dict[int, ObserverOrFakeQuantize] = {} + for edge_or_node, qspec in edge_or_node_to_qspec.items(): + group_id = edge_or_node_to_group_id[edge_or_node] + if group_id not in group_id_to_obs_or_fq: + # TODO: maybe edge_or_node_to_qspec should be edge_or_node_to_root_qspec, this will simplify + # the implementation for _create_obs_or_fq_from_qspec + group_id_to_obs_or_fq[group_id] = _create_obs_or_fq_from_qspec(qspec, obs_or_fq_map, is_qat) + obs_or_fq_map[edge_or_node] = group_id_to_obs_or_fq[group_id] + return obs_or_fq_map + def _maybe_insert_input_observer_for_arg_or_kwarg( node: Union[Node, Any], arg: Argument, @@ -72,21 +275,11 @@ def _maybe_insert_input_observer_for_arg_or_kwarg( observed_arg = arg.args[0] assert isinstance(observed_arg, Node), f"expect observed argument to be a Node, but got: {type(observed_arg)}" assert observed_arg in obs_or_fq_map, \ - f"can't refer to a node that does not have observer/fake_quant inserted yet: {observed_arg}" - input_qspec_map = quantization_annotation.input_qspec_map - input_arg_qspec = _get_qspec_for_arg(arg, input_qspec_map, named_modules) - if isinstance(input_arg_qspec, SharedQuantizationSpec): - # if the argument is set to use SharedQuantizationSpec, we will - # reset the observer instance to align with the configured edge/node - obs_or_fq_name = arg.target - setattr(model, obs_or_fq_name, arg_as_input_act_obs_or_fq) - named_modules[obs_or_fq_name] = arg_as_input_act_obs_or_fq - else: - # otherwise reuse the existing obs/fq - arg_as_input_act_obs_or_fq = obs_or_fq_map[observed_arg] + f"can't find a sharing group for node: {observed_arg}" + # reuse the existing obs/fq + arg_as_input_act_obs_or_fq = obs_or_fq_map[observed_arg] # we don't need to insert new observer node new_arg = arg - obs_or_fq_map[(observed_arg, node)] = arg_as_input_act_obs_or_fq else: # skip inserting new observers if there is an observer inserted for the arg before # that has the same dtype that we want to insert here @@ -113,23 +306,24 @@ def _maybe_insert_input_observer_for_arg_or_kwarg( assert arg_as_input_act_obs_or_fq is not None if existing_obs_node is None: + maybe_observed_arg = arg + # When quantizing two layers with different configs we can have + # conv2d (int8) -> avgpool(uint8) + # In this case observer insertion for avgpool will come here but the input + # to avgpool will be output observer of conv2d + # Now the obs map that we update must correspond to the original input of + # avgpool and not the output obs of conv2d + # This is because when referring to the edge, quantizer would refer to + # original input and not the observed one. 
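The grouping performed by `_find_root`, `_union`, and `_get_edge_or_node_to_group_id` above is standard union-find with path compression. A dependency-free sketch of the same data structure, using strings in place of real `EdgeOrNode` keys:

```python
from typing import Dict, Hashable

def find(x: Hashable, parent: Dict[Hashable, Hashable]) -> Hashable:
    # Walk up to the root; compress the path so later lookups are near O(1).
    if parent[x] == x:
        return x
    root = find(parent[x], parent)
    parent[x] = root
    return root

def union(a: Hashable, b: Hashable, parent: Dict[Hashable, Hashable]) -> None:
    # Merge b's tree into a's tree, mirroring _union(parent, child, ...) above.
    parent[find(b, parent)] = find(a, parent)

# Toy version of the docstring example: shared edges/nodes collapse into one group.
keys = ["op1", "op2", "(op1, cat1)", "(op2, cat1)", "cat1"]
parent = {k: k for k in keys}
union("(op1, cat1)", "(op2, cat1)", parent)  # SharedQuantizationSpec((op1, cat1))
union("(op1, cat1)", "cat1", parent)
groups: Dict[Hashable, int] = {}
group_ids = {k: groups.setdefault(find(k, parent), len(groups)) for k in keys}
# op1 and op2 keep their own groups; the two cat edges and the cat node share one id.
```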
+ while _is_activation_post_process_node(arg, named_modules): + arg = arg.args[0] # type: ignore[assignment] + arg_as_input_act_obs_or_fq = obs_or_fq_map[(arg, node)] new_obs_node = _insert_obs_or_fq( - arg, arg_as_input_act_obs_or_fq, model, named_modules, model.graph) + maybe_observed_arg, arg_as_input_act_obs_or_fq, model, named_modules, model.graph) # override this arg to be the observed arg new_arg = new_obs_node else: new_arg = existing_obs_node - # When quantizing two layers with different configs we can have - # conv2d (int8) -> avgpool(uint8) - # In this case observer insertion for avgpool will come here but the input - # to avgpool will be output observer of conv2d - # Now the obs map that we update must correspond to the original input of - # avgpool and not the output obs of conv2d - # This is because when referring to the edge, quantizer would refer to - # original input and not the observed one. - while _is_activation_post_process_node(arg, named_modules): - arg = arg.args[0] # type: ignore[assignment] - obs_or_fq_map[(arg, node)] = arg_as_input_act_obs_or_fq return new_arg @@ -172,6 +366,19 @@ def _maybe_insert_input_observers_for_node( # assign the new args to the node, inplace node.args = tuple(new_args) +def _maybe_insert_output_observer_for_node( + node: Node, + model: torch.nn.Module, + named_modules: Dict[str, torch.nn.Module], + graph: Graph, + obs_or_fq_map: Dict[EdgeOrNode, ObserverOrFakeQuantize], + is_qat: bool, +) -> Optional[Node]: + if node in obs_or_fq_map: + output_act_obs_or_fq = obs_or_fq_map[node] + return _insert_obs_or_fq(node, output_act_obs_or_fq, model, named_modules, graph) + return None + def _maybe_insert_input_and_output_observers_for_node( node: Node, model: torch.fx.GraphModule, @@ -213,7 +420,8 @@ def _maybe_insert_input_and_output_observers_for_node( return # this returns the new observer node if it was needed - maybe_output_obs_node = _maybe_insert_output_observer_for_node(node, model, named_modules, model.graph, obs_or_fq_map, is_qat) + maybe_output_obs_node = _maybe_insert_output_observer_for_node( + node, model, named_modules, model.graph, obs_or_fq_map, is_qat) if maybe_output_obs_node is None: return @@ -246,9 +454,17 @@ def prepare( # Since we are mutating the graph as we go, we iterate over the original # nodes before observer insertion, instead of model.graph.nodes. 
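Once group ids exist, `_get_obs_or_fq_map` simply memoises one observer/fake-quant instance per group so every member of a group aliases the same object. A toy sketch with a stand-in observer class (`FakeObserver` and `build_obs_map` are hypothetical, for illustration only):

```python
from typing import Dict, Hashable

class FakeObserver:
    """Stand-in for an ObserverOrFakeQuantize instance (illustrative only)."""

def build_obs_map(group_ids: Dict[Hashable, int]) -> Dict[Hashable, FakeObserver]:
    per_group: Dict[int, FakeObserver] = {}
    obs_map: Dict[Hashable, FakeObserver] = {}
    for key, gid in group_ids.items():
        if gid not in per_group:
            per_group[gid] = FakeObserver()  # one instance per sharing group
        obs_map[key] = per_group[gid]        # group members alias the same object
    return obs_map

obs_map = build_obs_map({"cat1": 0, "(op1, cat1)": 0, "(op2, cat1)": 0, "op3": 1})
assert obs_map["cat1"] is obs_map["(op1, cat1)"]
assert obs_map["cat1"] is not obs_map["op3"]
```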
nodes_before_observation = list(model.graph.nodes) - obs_or_fq_map: Dict[EdgeOrNode, ObserverOrFakeQuantize] = {} + + # At the high level we construct a map from EdgeOrNode to a observer_or_fake_quant instance + # all edge/nodes that belongs to the same group will use the same instance + # and when we insert observers we'll just query this map to get the correct observer_or_fake_quant + # instance + edge_or_node_to_qspec = _get_edge_or_node_to_qspec(model) + edge_or_node_to_group_id = _get_edge_or_node_to_group_id(edge_or_node_to_qspec) + obs_or_fq_map = _get_obs_or_fq_map(edge_or_node_to_group_id, edge_or_node_to_qspec, is_qat) for node in nodes_before_observation: + # TODO: simplify logic for inserting observers _maybe_insert_input_and_output_observers_for_node(node, model, obs_or_fq_map, is_qat) model = GraphModule(model, model.graph) diff --git a/torch/ao/quantization/quantize_pt2e.py b/torch/ao/quantization/quantize_pt2e.py index 765cb9446bfd1..85cd839f46a38 100644 --- a/torch/ao/quantization/quantize_pt2e.py +++ b/torch/ao/quantization/quantize_pt2e.py @@ -231,8 +231,8 @@ def convert_pt2e( model = _convert_to_reference_decomposed_fx(model) model = _fold_conv_bn_qat(model) pm = PassManager([DuplicateDQPass()]) - model = pm(model).graph_module + model = pm(model).graph_module pm = PassManager([PortNodeMetaForQDQ()]) model = pm(model).graph_module diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index b6983a1c25e78..80e069bcfb43f 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -1517,12 +1517,12 @@ Call this whenever a new thread is created in order to propagate values from [](const at::Tensor& input, const at::Tensor& weight, const c10::optional& bias_opt, - at::IntArrayRef stride_, + at::SymIntArrayRef stride_, at::SymIntArrayRef padding_, - at::IntArrayRef dilation_, + at::SymIntArrayRef dilation_, bool transposed_, at::SymIntArrayRef output_padding_, - int64_t groups_) { + c10::SymInt groups_) { return at::native::select_conv_backend( input, weight, @@ -1532,7 +1532,7 @@ Call this whenever a new thread is created in order to propagate values from dilation_, transposed_, output_padding_, - groups_, + std::move(groups_), c10::nullopt); }, py::arg("input"), @@ -1551,12 +1551,12 @@ Call this whenever a new thread is created in order to propagate values from [](const at::Tensor& input, const at::Tensor& weight, const c10::optional& bias, - at::IntArrayRef stride_, + at::SymIntArrayRef stride_, at::SymIntArrayRef padding_, - at::IntArrayRef dilation_, + at::SymIntArrayRef dilation_, bool transposed_, at::SymIntArrayRef output_padding_, - int64_t groups_, + c10::SymInt groups_, c10::optional> bias_sizes_opt) { c10::OptionalArrayRef ref = c10::nullopt; if (bias_sizes_opt) { @@ -1571,7 +1571,7 @@ Call this whenever a new thread is created in order to propagate values from dilation_, transposed_, output_padding_, - groups_, + std::move(groups_), ref); }, py::arg("input"), diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 7b0f14eeeb830..5d212c6ce9a0f 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -1274,12 +1274,12 @@ Tensor convolution_jvp( const Tensor& weight_t, const Tensor& bias_p, const Tensor& bias_t, - IntArrayRef stride, + at::SymIntArrayRef stride, at::SymIntArrayRef padding, - IntArrayRef dilation, + at::SymIntArrayRef dilation, bool transposed, at::SymIntArrayRef output_padding, - int64_t groups) { + const c10::SymInt& groups) { auto bias_t_opt = 
bias_t.defined() ? c10::optional(bias_t) : c10::nullopt; return ( @@ -1312,12 +1312,12 @@ Tensor _convolution_jvp( const Tensor& weight_t, const Tensor& bias_p, const Tensor& bias_t, - IntArrayRef stride, + at::SymIntArrayRef stride, at::SymIntArrayRef padding, - IntArrayRef dilation, + at::SymIntArrayRef dilation, bool transposed, at::SymIntArrayRef output_padding, - int64_t groups, + const c10::SymInt& groups, bool benchmark, bool deterministic, bool cudnn_enabled, @@ -6670,30 +6670,31 @@ std::tuple _cudnn_convolution_backward( const at::Tensor& self, const at::Tensor& grad_output, const at::Tensor& weight, - at::IntArrayRef padding, - at::IntArrayRef output_padding, - at::IntArrayRef stride, - at::IntArrayRef dilation, + at::SymIntArrayRef padding, + at::SymIntArrayRef output_padding, + at::SymIntArrayRef stride, + at::SymIntArrayRef dilation, bool transposed, - int64_t groups, + c10::SymInt groups, ::std::array output_mask) { if (!grad_output.defined()) { return std::tuple(); } // Just call the general backward and ignore the bias gradient part. - std::tuple grad_inputs = at::convolution_backward( - grad_output, - self, - weight, - c10::nullopt, - stride, - padding, - dilation, - transposed, - output_padding, - groups, - {output_mask[0], output_mask[1], false}); + std::tuple grad_inputs = + at::convolution_backward_symint( + grad_output, + self, + weight, + c10::nullopt, + stride, + padding, + dilation, + transposed, + output_padding, + std::move(groups), + {output_mask[0], output_mask[1], false}); std::tuple result = std::make_tuple(std::get<0>(grad_inputs), std::get<1>(grad_inputs)); return result; diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h index 571879f7c4d93..3d232a41f468a 100644 --- a/torch/csrc/autograd/FunctionsManual.h +++ b/torch/csrc/autograd/FunctionsManual.h @@ -971,12 +971,12 @@ Tensor convolution_jvp( const Tensor& weight_t, const Tensor& bias_p, const Tensor& bias_t, - IntArrayRef stride, + at::SymIntArrayRef stride, at::SymIntArrayRef padding, - IntArrayRef dilation, + at::SymIntArrayRef dilation, bool transposed, at::SymIntArrayRef output_padding, - int64_t groups); + const c10::SymInt& groups); Tensor _convolution_jvp( const Tensor& input_p, @@ -985,12 +985,12 @@ Tensor _convolution_jvp( const Tensor& weight_t, const Tensor& bias_p, const Tensor& bias_t, - IntArrayRef stride, + at::SymIntArrayRef stride, at::SymIntArrayRef padding, - IntArrayRef dilation, + at::SymIntArrayRef dilation, bool transposed, at::SymIntArrayRef output_padding, - int64_t groups, + const c10::SymInt& groups, bool benchmark, bool deterministic, bool cudnn_enabled, @@ -1023,12 +1023,12 @@ std::tuple _cudnn_convolution_backward( const at::Tensor& self, const at::Tensor& grad_output, const at::Tensor& weight, - at::IntArrayRef padding, - at::IntArrayRef output_padding, - at::IntArrayRef stride, - at::IntArrayRef dilation, + at::SymIntArrayRef padding, + at::SymIntArrayRef output_padding, + at::SymIntArrayRef stride, + at::SymIntArrayRef dilation, bool transposed, - int64_t groups, + c10::SymInt groups, ::std::array output_mask); Tensor scatter_reduce_jvp( diff --git a/torch/csrc/autograd/functions/accumulate_grad.cpp b/torch/csrc/autograd/functions/accumulate_grad.cpp index c7f0923752c93..265d9fa5594f0 100644 --- a/torch/csrc/autograd/functions/accumulate_grad.cpp +++ b/torch/csrc/autograd/functions/accumulate_grad.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -83,17 +84,19 @@ variable_list AccumulateGrad::apply_with_saved( 
at::Tensor grad_copy = variable.grad(); saved.before(variable_copy); saved.before(grad_copy); - accumulateGrad( - variable_copy, - grad_copy, - grads[0], - 0 /* num_expected_refs, 0 disables aliased reuse */, - [&saved, this](const at::Tensor& grad_update) { - saved.assign_mutable_grad(variable, grad_update); - }); + variable_copy.mutable_grad() = grad_copy; + // op is intentionally static + static auto op = c10::Dispatcher::singleton() + .findSchemaOrThrow("inductor::accumulate_grad_", "") + .typed(); + op.call(variable_copy, grads[0]); saved.after(variable_copy); saved.after(grad_copy); + TORCH_CHECK( + tensor_post_acc_grad_hooks() == nullptr, + "compiled_autograd does not support tensor_post_acc_grad_hooks"); + return variable_list(); } diff --git a/torch/csrc/distributed/c10d/NCCLUtils.cpp b/torch/csrc/distributed/c10d/NCCLUtils.cpp index b14517523f521..a016644f4febc 100644 --- a/torch/csrc/distributed/c10d/NCCLUtils.cpp +++ b/torch/csrc/distributed/c10d/NCCLUtils.cpp @@ -7,22 +7,20 @@ #include -#include - namespace c10d { ncclComm_t NCCLComm::getNcclComm() { std::unique_lock lock(mutex_); if (aborted_) { auto commFailureMsg = commFailureReason_ != c10::nullopt - ? fmt::format( - " Original reason for failure was: {}", *commFailureReason_) + ? c10::str(" Original reason for failure was: ", *commFailureReason_) : ""; TORCH_CHECK( false, - fmt::format( - "NCCL communicator was aborted on rank {}. {}", + c10::str( + "NCCL communicator was aborted on rank ", rank_, + ". ", commFailureMsg)); } return ncclComm_; diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index 45dc9110053fe..231882a9beeee 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -1,4 +1,3 @@ -#include #include #include #include @@ -14,6 +13,7 @@ #include #include +#include #include #include #include @@ -288,20 +288,24 @@ const int64_t ProcessGroupNCCL::kWatchdogThreadSleepMillis = 1000; constexpr int64_t kSynchronizeBusyWaitMillis = 10; thread_local uint64_t ProcessGroupNCCL::ncclActiveGroupCounter_ = 0; -std::string getWorkInfo(const ProcessGroupNCCL::WorkNCCL& workNCCL) { - return fmt::format( - "WorkNCCL(SeqNum={}, OpType={}, NumelIn={}, NumelOut={}, Timeout(ms)={})", +std::ostream& operator<<( + std::ostream& output, + const ProcessGroupNCCL::WorkNCCL& workNCCL) { + std::string workInfo; + workInfo = c10::str( + "WorkNCCL(", + "SeqNum=", workNCCL.seq_, + ", OpType=", opTypeToString(workNCCL.opType_), + ", NumelIn=", workNCCL.numelIn_, + ", NumelOut=", workNCCL.numelOut_, - workNCCL.opTimeout_.count()); -} - -std::ostream& operator<<( - std::ostream& output, - const ProcessGroupNCCL::WorkNCCL& workNCCL) { - return output << getWorkInfo(workNCCL); + ", Timeout(ms)=", + workNCCL.opTimeout_.count(), + ")"); + return output << workInfo; } ProcessGroupNCCL::WorkNCCL::WorkNCCL( @@ -447,12 +451,15 @@ bool ProcessGroupNCCL::WorkNCCL::checkTimeout( if (exception()) return true; - std::string exceptionMsg = fmt::format( - "[Rank {}] Watchdog caught collective operation timeout: {} ran for {}" - " milliseconds before timing out.", + std::string exceptionMsg = c10::str( + "[Rank ", rank_, - getWorkInfo(*this), - timeElapsed.count()); + "] ", + "Watchdog caught collective operation timeout: ", + *this, + " ran for ", + timeElapsed.count(), + " milliseconds before timing out."); LOG(ERROR) << exceptionMsg; std::exception_ptr exception_ptr = @@ -514,10 +521,12 @@ void 
ProcessGroupNCCL::WorkNCCL::synchronizeInternal( // here, it was observed that CUDA GPU will have 100% utilization and // can not run new events successfully. if (timedOut) { - std::string exceptionMsg = fmt::format( - "[Rank {}] Work {} timed out in blocking wait (NCCL_BLOCKING_WAIT=1).", + std::string exceptionMsg = c10::str( + "[Rank ", rank_, - getWorkInfo(*this)); + "] Work ", + (*this), + " timed out in blocking wait (NCCL_BLOCKING_WAIT=1)."); LOG(ERROR) << exceptionMsg; break; } @@ -881,8 +890,7 @@ ProcessGroupNCCL::~ProcessGroupNCCL() { onCompletionHookThread_.join(); // Abort all NCCL Communicators on Process Group Destruction - std::string abortReason = - fmt::format("Process Group destroyed on rank {}", rank_); + std::string abortReason = c10::str("Process Group destroyed on rank ", rank_); abort(abortReason); } @@ -895,17 +903,17 @@ void ProcessGroupNCCL::ncclCommWatchdog() { } catch (std::exception& e) { if (std::string(e.what()).find("driver shutting down") != std::string::npos) { - LOG(INFO) << fmt::format( - "[Rank {}] " - "main process destroyed cuda before watchdog loop exited, terminating watchdog. " - "(Watchdog caught exception: {})", - rank_, - e.what()); + LOG(INFO) + << "[Rank " << rank_ + << "] main process destroyed cuda before watchdog loop exited, terminating watchdog." + << " (Watchdog caught exception: " << e.what(); + } else { // Append error message reported from workCleanupLoop - const auto exitMsg = fmt::format( - "[Rank {}] NCCL watchdog thread terminated with exception: {}", + const auto exitMsg = c10::str( + "[Rank ", rank_, + "] NCCL watchdog thread terminated with exception: ", e.what()); LOG(ERROR) << exitMsg; // TODO(whc) clean up the rethrow - why is it stored in a class var and @@ -914,9 +922,10 @@ void ProcessGroupNCCL::ncclCommWatchdog() { std::rethrow_exception(watchDogException_); } } catch (...) { - const auto exitMsg = fmt::format( - "[Rank {}] NCCL watchdog thread terminated with exception: unknown", - rank_); + const auto exitMsg = c10::str( + "[Rank ", + rank_, + "] NCCL watchdog thread terminated with exception: unknown"); LOG(ERROR) << exitMsg; watchDogException_ = std::make_exception_ptr(std::runtime_error(exitMsg)); std::rethrow_exception(watchDogException_); @@ -1018,6 +1027,7 @@ void ProcessGroupNCCL::workCleanupLoop() { } else { it = workMetaList_.erase(it); } + at::cuda::CUDAGraph::dec_pending_event_queries(); } else { // Increment the iterator if the current WorkNCCL object is not // completed. @@ -1113,8 +1123,8 @@ std::exception_ptr ProcessGroupNCCL::checkForNCCLErrorsInternal( // commFailureReason is set. 
auto commFailureReason = ncclComm->getNcclCommFailureReason(); if (commFailureReason != c10::nullopt) { - return std::make_exception_ptr(std::runtime_error(fmt::format( - "NCCL communicator encountered error set by ProcessGroupNCCL: {}", + return std::make_exception_ptr(std::runtime_error(c10::str( + "NCCL communicator encountered error set by ProcessGroupNCCL: ", *commFailureReason))); } ncclResult_t ncclAsyncErr = ncclComm->checkForNcclError(); @@ -1164,12 +1174,15 @@ void ProcessGroupNCCL::broadcastUniqueNCCLID( TORCH_CHECK(vec.size() == NCCL_UNIQUE_ID_BYTES); std::memcpy(ncclID, vec.data(), vec.size()); } catch (const std::exception& e) { - std::string exceptionMsg = fmt::format( - "[{}] is setting up NCCL communicator and retrieving ncclUniqueId " - "from [0] via c10d key-value store by key '{}', but store->get('{}') got error: ", + std::string exceptionMsg = c10::str( + "[", rank_, + "] is setting up NCCL communicator and " + "retrieving ncclUniqueId from [0] via c10d key-value store by key '", + storeKey, + "', but store->get('", storeKey, - storeKey); + "') got error: "); TORCH_CHECK( false, exceptionMsg + e.what() + @@ -1177,12 +1190,14 @@ void ProcessGroupNCCL::broadcastUniqueNCCLID( } catch (...) { TORCH_CHECK( false, - fmt::format( - "Unknown exception while [{}] is setting up NCCL communicator and " - "retrieving ncclUniqueId from [0] via c10d key-value store by key '{}" - "'. This may indicate a possible application crash on rank 0 or a network set up issue.", + c10::str( + "Unknown exception while [", rank_, - storeKey)); + "] is setting up NCCL communicator and " + "retrieving ncclUniqueId from [0] via c10d key-value store by key '", + storeKey, + "'", + ". This may indicate a possible application crash on rank 0 or a network set up issue.")); } } } @@ -1834,8 +1849,13 @@ c10::intrusive_ptr ProcessGroupNCCL::collective( work->numelIn_ = inputs[0].numel(); work->numelOut_ = outputs[0].numel(); + // Notify graphs before we check the capture status preemptively + at::cuda::CUDAGraph::inc_pending_event_queries(); + if (!coalescing_state_ && capture_status == c10::cuda::CaptureStatus::None) { workEnqueue(work); + } else { + at::cuda::CUDAGraph::dec_pending_event_queries(); } return work; @@ -2013,8 +2033,14 @@ c10::intrusive_ptr ProcessGroupNCCL::pointToPoint( // Enqueue P2P op so that it can be cancelled by NCCL watchdog c10::cuda::CaptureStatus capture_status = c10::cuda::currentStreamCaptureStatusMayInitCtx(); + + // Notify graphs before we check the capture status preemptively + at::cuda::CUDAGraph::inc_pending_event_queries(); + if (!coalescing_state_ && capture_status == c10::cuda::CaptureStatus::None) { workEnqueue(work); + } else { + at::cuda::CUDAGraph::dec_pending_event_queries(); } return work; @@ -2813,13 +2839,14 @@ c10::intrusive_ptr ProcessGroupNCCL::barrier(const BarrierOptions& opts) { // ensure that each process is on a different GPU auto numGPUs = at::cuda::getNumGPUs(); int16_t deviceIdx = static_cast(rank_ % numGPUs); - LOG(INFO) << fmt::format( - "Rank {} using GPU {}" - " to perform barrier as devices used by this process are currently unknown. " - "This can potentially cause a hang if this rank to GPU mapping is incorrect." - "Specify device_ids in barrier() to force use of a particular device.", + LOG(INFO) << c10::str( + "Rank ", this->getRank(), - deviceIdx); + " using GPU ", + deviceIdx, + " to perform barrier as devices used by this process are currently unknown. 
", + "This can potentially cause a hang if this rank to GPU mapping is incorrect.", + "Specify device_ids in barrier() to force use of a particular device."); devices.emplace_back(getDeviceForRank(rank_)); } else { for (auto usedDeviceIdx : usedDeviceIdxs_) { diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp index fd0fe5c8e85ce..782b55fec7b8e 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp @@ -233,7 +233,6 @@ class TORCH_API ProcessGroupNCCL : public Backend { friend std::ostream& operator<<( std::ostream& output, const WorkNCCL& workNCCL); - friend std::string getWorkInfo(const WorkNCCL& workNCCL); private: // Helper function for synchronize diff --git a/torch/csrc/dynamo/compiled_autograd.h b/torch/csrc/dynamo/compiled_autograd.h index 18c7723ac11c9..f44c2185a9959 100644 --- a/torch/csrc/dynamo/compiled_autograd.h +++ b/torch/csrc/dynamo/compiled_autograd.h @@ -469,7 +469,6 @@ struct TraceState { size_t sym_sizes_index; std::vector> sym_sizes; variable_list outputs; - std::vector output_grad_targets; }; class SwapSavedVariables { @@ -620,17 +619,6 @@ class SwapSavedVariables { NO_OP_VISIT(double); #undef NO_OP_VISIT - // record the need to run `dst.mutable_grad() = src` after the graph - // dst is a real tensor, src is a fake tensor - void assign_mutable_grad(const at::Tensor& dst, const at::Tensor& src) { - const TensorArg& arg = compiler.tensor_args.lookup(dst); - TORCH_INTERNAL_ASSERT(arg.defined()); - TORCH_INTERNAL_ASSERT( - state.outputs.size() == state.output_grad_targets.size()); - state.outputs.emplace_back(src); - state.output_grad_targets.emplace_back(arg.index()); - } - SwapSavedVariables(AutogradCompilerCall& c, TraceState& s) : compiler(c), state(s) {} diff --git a/torch/csrc/dynamo/python_compiled_autograd.cpp b/torch/csrc/dynamo/python_compiled_autograd.cpp index e5c6b51238f10..515115e0a49f6 100644 --- a/torch/csrc/dynamo/python_compiled_autograd.cpp +++ b/torch/csrc/dynamo/python_compiled_autograd.cpp @@ -104,7 +104,6 @@ struct CacheNode { next.clear(); key_storage.clear(); expected_sizes.clear(); - output_grad_targets.clear(); compiled_fn = nullptr; } @@ -208,9 +207,6 @@ struct CacheNode { std::vector expected_sizes; THPObjectPtr compiled_fn; - // Maps each return value of compiled_fn to an input index. After the graph - // runs we do: `inputs[output_grad_targets[i]].mutable_grad() = outputs[i]` - std::vector output_grad_targets; }; struct InputBuffers : public std::unordered_map { @@ -463,7 +459,6 @@ variable_list compiled_autograd( } cache->compiled_fn = check(call_end_capture(py_compiler, state.outputs)); - cache->output_grad_targets = std::move(state.output_grad_targets); state.debug_asserts(); } // End cache miss region @@ -482,21 +477,8 @@ variable_list compiled_autograd( THPObjectPtr pyresult(check(PyObject_CallFunctionObjArgs( cache->compiled_fn.get(), inputs.get(), sizes.get(), hooks.get(), NULL))); variable_list outputs = THPVariable_UnpackList(pyresult); - if (accumulate_grad) { - TORCH_INTERNAL_ASSERT(outputs.size() == cache->output_grad_targets.size()); - for (const auto i : c10::irange(outputs.size())) { - // Here we set the `var.grad = ...` for each call to - // `saved.assign_mutable_grad(var, ...)`. For the case on inplace grad - // accumuation there will be an `add_` op in the graph and no return - // value. 
- compiler_call.tensor_args.inputs[cache->output_grad_targets[i]] - .mutable_grad() = outputs[i]; - } - return variable_list(); - } else { - TORCH_INTERNAL_ASSERT(outputs.size() == output_edges.size()); - return outputs; - } + TORCH_INTERNAL_ASSERT(outputs.size() == output_edges.size()); + return outputs; } static PyObject* set_autograd_compiler(PyObject* dummy, PyObject* args) { diff --git a/torch/csrc/inductor/inductor_ops.cpp b/torch/csrc/inductor/inductor_ops.cpp index 7f72773956cfd..7dc48f0702098 100644 --- a/torch/csrc/inductor/inductor_ops.cpp +++ b/torch/csrc/inductor/inductor_ops.cpp @@ -4,6 +4,7 @@ #include #endif +#include #include #include @@ -38,13 +39,34 @@ Tensor _reinterpret_tensor( return self_; } +static void accumulate_grad_(const Tensor& variable, const Tensor& new_grad) { + at::Tensor& grad = variable.mutable_grad(); + if (new_grad.device() != kMeta) { + torch::autograd::AccumulateGrad::accumulateGrad( + variable, + grad, + new_grad, + 1 /* num_expected_refs */, + [&grad](at::Tensor&& grad_update) { grad = std::move(grad_update); }); + } else { + // no shape checking for `device="meta"` to workaround FSDP inplace mutation + if (!grad.defined()) { + grad = new_grad; + } + } +} + TORCH_LIBRARY_FRAGMENT(inductor, m) { m.def( "_mm_plus_mm(Tensor a, Tensor b, Tensor c, Tensor d, Tensor(t!) out) -> Tensor(t!)", - _mm_plus_mm); + dispatch(c10::DispatchKey::CompositeExplicitAutograd, _mm_plus_mm)); m.def( "_reinterpret_tensor(Tensor self, int[] size, int[] stride, int offset_increment=0) -> Tensor", - _reinterpret_tensor); + dispatch( + c10::DispatchKey::CompositeExplicitAutograd, _reinterpret_tensor)); + m.def( + "accumulate_grad_(Tensor variable, Tensor new_grad) -> ()", + dispatch(c10::DispatchKey::CompositeExplicitAutograd, accumulate_grad_)); } } // namespace inductor diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index 4dc1710923d6b..762d7b9f3305f 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -1455,6 +1455,7 @@ void initJITBindings(PyObject* module) { PyObject_CallMethod(buffer_.ptr(), "readinto", "O", memview.get()); if (res) { int64_t i = static_cast(PyLong_AsLongLong(res)); + Py_DECREF(res); if (i > 0) { return i; } diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index c5526ab58c5dc..abf1371e290a1 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -629,6 +629,11 @@ inline std::vector PythonArgs::intlistWithDefault( if (size1 > 0 && THPUtils_checkLong(arg)) { return std::vector(size1, THPUtils_unpackLong(arg)); } + if (size1 > 0 && torch::is_symint(py::handle(arg))) { + return std::vector( + size1, + py::handle(arg).cast().guard_int(__FILE__, __LINE__)); + } auto tuple = PyTuple_Check(arg); // NOLINTNEXTLINE(bugprone-branch-clone) const auto size2 = tuple ? PyTuple_GET_SIZE(arg) : PyList_GET_SIZE(arg); diff --git a/torch/distributed/_shard/sharded_tensor/utils.py b/torch/distributed/_shard/sharded_tensor/utils.py index 950b3d2b87d5e..9ca0ee0eba743 100644 --- a/torch/distributed/_shard/sharded_tensor/utils.py +++ b/torch/distributed/_shard/sharded_tensor/utils.py @@ -109,7 +109,7 @@ def build_metadata_from_local_shards( first_shard_is_pinned = local_shards[0].tensor.is_pinned() # 1). 
Validate local tensors and associated metadatas - for i, local_shard in enumerate(local_shards): + for local_shard in local_shards: local_shard_tensor = local_shard.tensor local_shard_meta = local_shard.metadata local_shard_metadatas.append(local_shard_meta) diff --git a/torch/distributed/_tensor/_utils.py b/torch/distributed/_tensor/_utils.py index 592cb6bd6e25e..9aac9e01d4732 100644 --- a/torch/distributed/_tensor/_utils.py +++ b/torch/distributed/_tensor/_utils.py @@ -143,7 +143,12 @@ def compute_global_tensor_info( for idx, placement in enumerate(placements): mesh_dim_size = mesh.size(idx) if placement.is_shard(): - shard_dim = cast(Shard, placement).dim + shard_placement = cast(Shard, placement) + if shard_placement.dim < 0: + # normalize shard dim to be positive + shard_placement.dim += len(tensor_shape) + shard_dim = shard_placement.dim + local_dim_size = tensor_shape[shard_dim] tensor_shape[shard_dim] = local_dim_size * mesh_dim_size diff --git a/torch/distributed/_tensor/_xla.py b/torch/distributed/_tensor/_xla.py new file mode 100644 index 0000000000000..80d93c7652fae --- /dev/null +++ b/torch/distributed/_tensor/_xla.py @@ -0,0 +1,200 @@ +import logging +import os +from functools import wraps +from typing import Any, Callable, Dict, Optional, Sequence, Tuple, Union + +import torch + +import torch.nn as nn +from torch.distributed._tensor.device_mesh import DeviceMesh +from torch.distributed._tensor.placement_types import Placement, Replicate + +log = logging.getLogger(__name__) + +TORCH_XLA_INITIALIZED = False +try: + import torch_xla.core.xla_model as xm # type:ignore[import] # noqa: F401 + import torch_xla.runtime as xr # type:ignore[import] + from torch_xla.experimental.xla_sharded_tensor import ( # type:ignore[import] + XLAShardedTensor, + ) + from torch_xla.experimental.xla_sharding import ( # type:ignore[import] + mark_sharding, + Mesh, + ShardingType, + ) + + TORCH_XLA_INITIALIZED = True +except ImportError as e: + log.warning(e.msg) + + +# wrapper to check xla test requirements +def with_xla(func: Callable) -> Callable: + assert func is not None + + @wraps(func) # pyre-ignore[6] + def wrapper( + self, *args: Tuple[object], **kwargs: Dict[str, Any] # type: ignore[misc] + ) -> None: + if TORCH_XLA_INITIALIZED: + # TODO(yeounoh) replace this with xr.use_spmd() when we deprecate the flag. + os.environ["XLA_USE_SPMD"] = "1" + return func(self, *args, **kwargs) # type: ignore[misc] + else: + raise ImportError( + "torch.distributed._tensor._xla API requires torch_xla package installation." + ) + + return wrapper + + +@with_xla +def convert_to_xla_mesh(dt_mesh: DeviceMesh) -> "Mesh": + """ + Convert DTensor `dt_mesh` to XLAShardedTensor `partition_spec`. + + Example (1x4 logical device mesh topology): + ``` + dt_mesh = DeviceMesh("xla", [[1, 2, 3, 4]]) + dt_mesh.mesh.shape + >> torch.Size([1, 4]) + + mesh = convert_to_xla_mesh(dt_mesh) + mesh_shape + >> [1, 4] + ``` + """ + assert dt_mesh.size() == xr.global_runtime_device_count() + return Mesh( + dt_mesh.mesh.flatten(), tuple(dt_mesh.mesh.size()), dt_mesh.mesh_dim_names + ) + + +@with_xla +def convert_to_xla_partition_spec( + tensor: torch.Tensor, placements: Sequence[Placement] +) -> Tuple[Union[Tuple, int, None]]: + """ + Convert DTensor `placements` to XLAShardedTensor `partitoin_spec`. + This supports Shard and Replicate Placement types. + + Example: + ``` + # Mesh partitioning, 1/4-th of the input with replicated overlaps. 
+ # The first input tensor dimension is sharded across the second mesh + # dimension, and the rest is replicated over the first mesh dimension. + t = torch.randn(4, 8, 8) + dt_mesh = DeviceMesh("xla", torch.arange(8).reshape(2,4)) + placements = [Replicate(), Shard(0)] + my_dtensor = distribute_tensor(t, dt_mesh, placements) + + # `placements = [Replicate(), Shard(0)]` describes sharding per mesh dim, + # and this is equivalent to `partition_spec = (1, None, None)` which is + # sharding per input tensor dimension. + partition_spec = convert_to_xla_partition_spec(t, placements) + >> (1, None, None) + ``` + """ + # per tensor dimension sharding + sharding_spec = [None] * len(tensor.shape) + for mesh_idx, spec in enumerate(placements): + if spec.is_shard(): # type:ignore[truthy-function] + # mesh_idx to tensor_idx (spec.dim) + tensor_idx = spec.dim # type:ignore[attr-defined] + sharding_spec[tensor_idx] = mesh_idx # type:ignore[call-overload] + elif spec.is_replicate(): + # spec.dim is already set to None by default + continue + else: + raise ValueError(f"Unsupported placement type: {type(spec).__name__}") + return tuple(sharding_spec) # type:ignore[return-value] + + +@with_xla +def xla_distribute_tensor( + tensor: torch.Tensor, + device_mesh: DeviceMesh, + placements: Optional[Sequence[Placement]] = None, +) -> "XLAShardedTensor": + """ + Distribute a torch.Tensor to the `device_mesh` according to the `placements` + specified. The rank of `device_mesh` and `placements` must be the same. + + Args: + tensor (torch.Tensor): torch.Tensor to be distributed. Note that if you + want to shard a tensor on a dimension that is not evenly divisible by + the number of devices in that mesh dimension, we use `torch.chunk` + semantic to shard the tensor and scatter the shards. + device_mesh (:class:`DeviceMesh`, optional): DeviceMesh to distribute the + tensor, if not specified, must be called under a DeviceMesh context + manager, default: None + placements (List[:class:`Placement`], optional): the placements that + describes how to place the tensor on DeviceMesh, must have the same + number of elements as `device_mesh.ndim`. If not specified, we will + by default replicate the tensor across the `device_mesh` from the + first rank of each dimension of the `device_mesh`. + + Returns: + A :class:`XLAShardedTensor` object + + .. note:: We return a XLAShardedTensor with a global view and access to local shards. + The successive ops would be programmed as if on a single-device and without calling + any explicit collective ops. The actual sharded computation on the sharding annotated tensor + happens lazily, is transparent to the user. In the future, we will introduce + a new DTensor type for this kind of programming-mode (single-controller) and return. + """ + # device_mesh is not optional in xla_distribute_tensor + dt_mesh = device_mesh + assert dt_mesh.device_type == "xla" + + # convert to XLA device mesh + xla_mesh = convert_to_xla_mesh(dt_mesh) + assert xla_mesh.mesh_shape == tuple(dt_mesh.mesh.size()) + + # convert tensor to the corresponding device type if it's not in that device type + if not tensor.is_meta: + tensor = tensor.to(dt_mesh.device_type) + # set default placements to replicated if not specified + if placements is None: + placements = [Replicate() for _ in range(dt_mesh.ndim)] + assert ( + len(placements) == dt_mesh.ndim + ), "`placements` must have the same length as `device_mesh.ndim`! " + f"Found placements length: {len(placements)}, and device_mesh.ndim: {dt_mesh.ndim}." 
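The placement-to-partition-spec conversion above is the only dimension bookkeeping involved: `placements` is indexed by mesh dim, while `partition_spec` is indexed by tensor dim. A dependency-free restatement of that mapping (no torch_xla required; `to_partition_spec` is a hypothetical helper, not the function above):

```python
from typing import Optional, Sequence, Tuple

from torch.distributed._tensor.placement_types import Placement, Replicate, Shard

def to_partition_spec(
    tensor_ndim: int, placements: Sequence[Placement]
) -> Tuple[Optional[int], ...]:
    # One entry per *tensor* dim, naming the *mesh* dim that shards it
    # (None = replicated) -- the inverse orientation of `placements`.
    spec: list = [None] * tensor_ndim
    for mesh_dim, placement in enumerate(placements):
        if placement.is_shard():
            spec[placement.dim] = mesh_dim
    return tuple(spec)

# Matches the docstring example: Shard(0) on mesh dim 1 -> (1, None, None)
assert to_partition_spec(3, [Replicate(), Shard(0)]) == (1, None, None)
```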
+ # convert placements to xla partition spec + partition_spec = convert_to_xla_partition_spec(tensor, placements) + assert len(tensor.shape) == len( + partition_spec + ), "`partition_spec` from `placements` must have the same length as `tensor.length`! " + f"Found tensor shape length: {len(tensor.shape)}, and partition_spec length: {len(partition_spec)}." + + global_tensor = tensor + if type(tensor).__name__ == "DTensor": + raise ValueError( + "Cannot distribute a DTensor with local tensor on xla devices." + "The input tensor must be global." + ) + if type(tensor).__name__ == "XLAShardedTensor": + sharding_type = tensor.sharding_type # type:ignore[attr-defined] + assert ( + sharding_type is None or sharding_type == ShardingType.REPLICATED + ), "XLAShardedTensor `tensor` is already annotated with non-replication sharding. " + "Clear the existing sharding annotation first, by callling torch_xla.experimental.xla_sharding.clear_sharding API." + global_tensor = tensor.global_tensor # type:ignore[attr-defined] + assert global_tensor is not None, "distributing a tensor should not be None" + + # Annotates sharding and returns an XLAShardedTensor + xla_tensor = mark_sharding(global_tensor, xla_mesh, partition_spec) + return xla_tensor + + +@with_xla +def xla_distribute_module( + module: nn.Module, + device_mesh: Optional[DeviceMesh] = None, + partition_fn: Optional[Callable[[str, nn.Module, DeviceMesh], None]] = None, + input_fn: Optional[Callable[..., None]] = None, + output_fn: Optional[Callable[..., None]] = None, +) -> nn.Module: + raise NotImplementedError diff --git a/torch/distributed/_tensor/api.py b/torch/distributed/_tensor/api.py index 26d1d52c2c61a..e2bd0d60ad7e1 100644 --- a/torch/distributed/_tensor/api.py +++ b/torch/distributed/_tensor/api.py @@ -237,7 +237,6 @@ def __tensor_unflatten__(inner_tensors, flatten_spec): assert ( flatten_spec is not None ), "Expecting spec to be not None from `__tensor_flatten__` return value!" - assert isinstance(inner_tensors, dict) and len(inner_tensors) == 1 local_tensor = inner_tensors["_local_tensor"] spec, requires_grad = flatten_spec return DTensor( @@ -394,15 +393,18 @@ def redistribute( if placements is None: raise RuntimeError("placements is needed for redistribute!") - # Early return the original DTensor if the placements are the same. - if self._spec.placements == placements: - return self - for placement in placements: if placement.is_partial(): raise RuntimeError( "Can not redistribute to _Partial, _Partial is for internal use only!" ) + elif isinstance(placement, Shard) and placement.dim < 0: + # normalize shard dim to be positive + placement.dim += self.ndim + + # Early return the original DTensor if the placements are the same. + if self._spec.placements == placements: + return self # pyre-fixme[16]: `Redistribute` has no attribute `apply`. return Redistribute.apply(self, device_mesh, placements) @@ -451,7 +453,12 @@ def distribute_tensor( first rank of each dimension of the `device_mesh`. Returns: - A :class:`DTensor` object + A :class:`DTensor` or `XLAShardedTensor` object. + + Note: + When initialize the DeviceMesh with the `xla` device_type, `distribute_tensor` + return `XLAShardedTensor` instead. see [link](https://github.com/pytorch/pytorch/issues/92909) + for more details. The XLA integration is experimental and subject to change. 
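Several DTensor call sites in this change (`compute_global_tensor_info`, `redistribute`, `distribute_tensor`) now normalize negative `Shard` dims in place via `placement.dim += ndim`. A small sketch of that convention, assuming ordinary Python negative-index semantics (`normalize_shard_dim` is illustrative, not an API):

```python
import torch
from torch.distributed._tensor.placement_types import Shard

def normalize_shard_dim(placement: Shard, ndim: int) -> Shard:
    # Shard(-1) on a 3-d tensor means "shard the last dim", i.e. Shard(2),
    # mirroring the `placement.dim += tensor.ndim` normalization above.
    dim = placement.dim + ndim if placement.dim < 0 else placement.dim
    return Shard(dim)

t = torch.randn(2, 3, 4)
assert normalize_shard_dim(Shard(-1), t.ndim).dim == 2
assert normalize_shard_dim(Shard(1), t.ndim).dim == 1
```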
""" torch._C._log_api_usage_once("torch.dtensor.distribute_tensor") @@ -459,6 +466,14 @@ def distribute_tensor( # get default device mesh if there's nothing specified device_mesh = device_mesh or mesh_resources.get_current_mesh() device_type = device_mesh.device_type + if device_type == "xla": + # call PyTorch/XLA SPMD for `xla` backend type device mesh. + # This returns XLAShardedTensor + from torch.distributed._tensor._xla import xla_distribute_tensor + + return xla_distribute_tensor( + tensor, device_mesh, placements + ) # type:ignore[return-value] # instantiate a RNG tracker if haven't. By default DTensor uses an # OffsetBasedRNGTracker to perform random operators. @@ -485,7 +500,6 @@ def distribute_tensor( f"`placements` must have the same length as `device_mesh.ndim`! " f"Found placements length: {len(placements)}, and device_mesh.ndim: {device_mesh.ndim}." ) - if isinstance(tensor, DTensor): # if the tensor is already a DTensor, we just need to check if the # device mesh and placements are the same @@ -508,6 +522,9 @@ def distribute_tensor( for idx, placement in enumerate(placements): if placement.is_shard(): placement = cast(Shard, placement) + if placement.dim < 0: + # normalize shard placement dim + placement.dim += tensor.ndim local_tensor = placement._shard_tensor(local_tensor, device_mesh, idx) elif placement.is_replicate(): placement = cast(Replicate, placement) diff --git a/torch/distributed/_tensor/device_mesh.py b/torch/distributed/_tensor/device_mesh.py index 614d560119e7c..e023d4dfccb33 100644 --- a/torch/distributed/_tensor/device_mesh.py +++ b/torch/distributed/_tensor/device_mesh.py @@ -174,12 +174,16 @@ def __init__( # private field to pre-generate DeviceMesh's hash self._flatten_mesh_list = tuple(self.mesh.flatten().tolist()) self._hash = hash((self._flatten_mesh_list, self.mesh.shape)) - # always try to create default (world) pg, even if it is not initialized - # already. The world pg is used for device mesh identity (rank) on each - # process (we need to know if the current global rank is in the mesh or not) - self._get_or_create_default_group() - if _init_process_groups: - self._init_process_groups(_validate_mesh) + + # Skip process group initialization if xla device. + # TODO(yeounoh) implement DeviceMesh backend and register XLA backend. + if device_type != "xla": + # always try to create default (world) pg, even if it is not initialized + # already. The world pg is used for device mesh identity (rank) on each + # process (we need to know if the current global rank is in the mesh or not). 
diff --git a/torch/distributed/_tensor/sharding_prop.py b/torch/distributed/_tensor/sharding_prop.py
index a73c877525d29..5f472915f9fde 100644
--- a/torch/distributed/_tensor/sharding_prop.py
+++ b/torch/distributed/_tensor/sharding_prop.py
@@ -88,7 +88,7 @@ def _propagate_tensor_meta(self, op_schema: OpSchema) -> object:
 
         elif isinstance(fake_out, (tuple, list)):
             tensor_meta_list = []
-            for i, fake_out_item in enumerate(fake_out):
+            for fake_out_item in fake_out:
                 if isinstance(fake_out_item, torch.Tensor):
                     tensor_meta_list.append(
                         TensorMeta(
diff --git a/torch/fx/node.py b/torch/fx/node.py
index e400ceeb165aa..075040dc308bf 100644
--- a/torch/fx/node.py
+++ b/torch/fx/node.py
@@ -39,7 +39,9 @@
     _ops.aten.sym_constrain_range_for_size.default,
     _ops.profiler._record_function_enter,
     _ops.profiler._record_function_enter_new,
-    _ops.profiler._record_function_exit}
+    _ops.profiler._record_function_exit,
+    _ops.inductor.accumulate_grad_.default,
+}
 
 
 @compatibility(is_backward_compatible=False)
diff --git a/torch/testing/_internal/common_quantization.py b/torch/testing/_internal/common_quantization.py
index 15de3bb55e52c..496cd16523b19 100644
--- a/torch/testing/_internal/common_quantization.py
+++ b/torch/testing/_internal/common_quantization.py
@@ -2571,6 +2571,20 @@ def forward(self, x, y):
             z = torch.cat([x, y], dim=1)
             return z
 
+    class Conv2dWithTwoCat(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv1 = torch.nn.Conv2d(3, 3, 3)
+            self.conv2 = torch.nn.Conv2d(3, 3, 3)
+
+        def forward(self, x1, x2, x3, x4):
+            x1 = self.conv1(x1)
+            x2 = self.conv2(x2)
+            y = torch.cat([x1, x2], dim=1)
+            z = x3 + x4
+            w = torch.cat([z, y])
+            return w
+
     class EmbeddingModule(torch.nn.Module):
         def __init__(self):
             super().__init__()
diff --git a/torch/testing/_internal/optests/aot_autograd.py b/torch/testing/_internal/optests/aot_autograd.py
index ed99640a6a8aa..4d2897c07385a 100644
--- a/torch/testing/_internal/optests/aot_autograd.py
+++ b/torch/testing/_internal/optests/aot_autograd.py
@@ -52,7 +52,7 @@ def aot_autograd_check(
     def func_no_tensors(args):
         reconstructed_flat_args = []
         args = iter(args)
-        for idx, v in enumerate(flat_args):
+        for v in flat_args:
            if isinstance(v, torch.Tensor):
                 reconstructed_flat_args.append(next(args))
             else:
diff --git a/torch/utils/benchmark/utils/timer.py b/torch/utils/benchmark/utils/timer.py
index ee99964184f2b..b101c38f4ccdb 100644
--- a/torch/utils/benchmark/utils/timer.py
+++ b/torch/utils/benchmark/utils/timer.py
@@ -328,36 +328,6 @@ def _estimate_block_size(self, min_run_time: float) -> int:
             number *= 10
         return number
 
-    def adaptive_autorange(
-            self,
-            threshold: float = 0.1,
-            *,
-            min_run_time: float = 0.01,
-            max_run_time: float = 10.0,
-            callback: Optional[Callable[[int, float], NoReturn]] = None,
-    ) -> common.Measurement:
-        number = self._estimate_block_size(min_run_time=0.05)
-
-        def time_hook() -> float:
-            return self._timeit(number)
-
-        def stop_hook(times: List[float]) -> bool:
-            if len(times) > 3:
-                return common.Measurement(
-                    number_per_run=number,
-                    raw_times=times,
-                    task_spec=self._task_spec
-                ).meets_confidence(threshold=threshold)
-            return False
-        times = self._threaded_measurement_loop(
-            number, time_hook, stop_hook, min_run_time, max_run_time, callback=callback)
-
-        return common.Measurement(
-            number_per_run=number,
-            raw_times=times,
-            task_spec=self._task_spec
-        )
-
     def blocked_autorange(
         self,
         callback: Optional[Callable[[int, float], NoReturn]] = None,
@@ -418,6 +388,69 @@ def stop_hook(times: List[float]) -> bool:
             task_spec=self._task_spec
         )
 
+    def adaptive_autorange(
+            self,
+            threshold: float = 0.1,
+            *,
+            min_run_time: float = 0.01,
+            max_run_time: float = 10.0,
+            callback: Optional[Callable[[int, float], NoReturn]] = None,
+    ) -> common.Measurement:
+        """Similar to `blocked_autorange` but also checks for variability in measurements
+        and repeats until iqr/median is smaller than `threshold` or `max_run_time` is reached.
+
+
+        At a high level, adaptive_autorange executes the following pseudo-code::
+
+            `setup`
+
+            times = []
+            while times.sum < max_run_time
+                start = timer()
+                for _ in range(block_size):
+                    `stmt`
+                times.append(timer() - start)
+
+                enough_data = len(times)>3 and times.sum > min_run_time
+                small_iqr=times.iqr/times.mean<threshold
+
+                if enough_data and small_iqr:
+                    break
+
+        Args:
+            threshold: value of max IQR/median threshold for stopping
+
+            min_run_time: total runtime needed before checking `threshold`
+
+            max_run_time: total runtime for all measurements regardless of `threshold`
+
+        Returns:
+            A `Measurement` object that contains measured runtimes and
+            repetition counts, and can be used to compute statistics.
+            (mean, median, etc.)
+        """
+        number = self._estimate_block_size(min_run_time=0.05)
+
+        def time_hook() -> float:
+            return self._timeit(number)
+
+        def stop_hook(times: List[float]) -> bool:
+            if len(times) > 3:
+                return common.Measurement(
+                    number_per_run=number,
+                    raw_times=times,
+                    task_spec=self._task_spec
+                ).meets_confidence(threshold=threshold)
+            return False
+        times = self._threaded_measurement_loop(
+            number, time_hook, stop_hook, min_run_time, max_run_time, callback=callback)
+
+        return common.Measurement(
+            number_per_run=number,
+            raw_times=times,
+            task_spec=self._task_spec
+        )
+
     @overload
     def collect_callgrind(
         self,
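A hedged usage sketch for the relocated `Timer.adaptive_autorange`: it keeps adding measurement blocks until IQR/median falls below `threshold` or `max_run_time` is exhausted, as the new docstring describes. The statement and matrix sizes are arbitrary choices for illustration.

```python
import torch
from torch.utils.benchmark import Timer

timer = Timer(
    stmt="torch.mm(a, b)",
    setup="a = torch.randn(256, 256); b = torch.randn(256, 256)",
)

# Stops early once iqr/median < 0.1, or after at most 5 seconds of measurement.
m = timer.adaptive_autorange(threshold=0.1, max_run_time=5.0)
print(m.median, m.iqr, m.number_per_run, len(m.raw_times))
```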
diff --git a/torchgen/api/unboxing.py b/torchgen/api/unboxing.py
index 0a3aad42864ed..df4430c49b745 100644
--- a/torchgen/api/unboxing.py
+++ b/torchgen/api/unboxing.py
@@ -114,7 +114,7 @@ def convert_arguments(f: NativeFunction) -> Tuple[List[Binding], List[str]]:
         for i in range(len(args))
     ] + [""]
     binding_list = []
-    for i, arg in enumerate(args):
+    for arg in args:
         # expecting only Argument
         if not isinstance(arg.argument, Argument):
             raise Exception(