From 2245d7d3b90162ae2958929a22c140537cfc4b42 Mon Sep 17 00:00:00 2001
From: Yuanyuan Chen
Date: Sun, 16 Nov 2025 07:19:16 +0000
Subject: [PATCH 01/47] Improve char printing (#167899)

This PR writes single characters to output streams directly instead of building temporary strings. The changes were generated by running (in fish):

```
sed -i -e 's/<< "\([^\\\']\)"/<< \'\1\'/g' (grep '<< "."' -r torch c10 aten -l)
```

and then reverting some invalid changes.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167899
Approved by: https://github.com/Skylion007
---
 aten/src/ATen/LegacyBatchedTensorImpl.h | 2 +-
 aten/src/ATen/TensorIndexing.cpp | 6 +-
 aten/src/ATen/TensorNames.cpp | 2 +-
 aten/src/ATen/TensorUtils.cpp | 8 +-
 aten/src/ATen/Version.cpp | 26 ++--
 aten/src/ATen/code_template.h | 2 +-
 aten/src/ATen/core/Dimname.cpp | 2 +-
 aten/src/ATen/core/Range.cpp | 2 +-
 aten/src/ATen/core/Tensor.cpp | 2 +-
 aten/src/ATen/core/Vitals.cpp | 4 +-
 aten/src/ATen/core/alias_info.h | 10 +-
 aten/src/ATen/core/blob.h | 2 +-
 aten/src/ATen/core/class_type.cpp | 4 +-
 aten/src/ATen/core/class_type.h | 2 +-
 .../core/dispatch/DispatchKeyExtractor.cpp | 6 +-
 aten/src/ATen/core/dispatch/Dispatcher.cpp | 4 +-
 aten/src/ATen/core/dispatch/OperatorEntry.cpp | 12 +-
 aten/src/ATen/core/function_schema.cpp | 14 +--
 aten/src/ATen/core/function_schema.h | 8 +-
 aten/src/ATen/core/ivalue.cpp | 40 +++---
 aten/src/ATen/core/jit_type.h | 20 +--
 aten/src/ATen/core/operator_name.cpp | 2 +-
 aten/src/ATen/core/tensor_type.cpp | 20 +--
 aten/src/ATen/core/type.cpp | 34 ++---
 aten/src/ATen/core/union_type.cpp | 4 +-
 aten/src/ATen/cpu/vec/vec256/vec256.h | 2 +-
 aten/src/ATen/cpu/vec/vec512/vec512.h | 2 +-
 aten/src/ATen/cuda/detail/CUDAHooks.cpp | 28 ++---
 aten/src/ATen/cuda/jiterator.cu | 4 +-
 aten/src/ATen/cuda/tunable/Tunable.cpp | 12 +-
 aten/src/ATen/cudnn/Descriptors.cpp | 24 ++--
 aten/src/ATen/functorch/DynamicLayer.cpp | 6 +-
 aten/src/ATen/functorch/TensorWrapper.cpp | 4 +-
 aten/src/ATen/miopen/Descriptors.cpp | 12 +-
 aten/src/ATen/mps/MPSProfiler.h | 2 +-
 aten/src/ATen/mps/MPSProfiler.mm | 4 +-
 aten/src/ATen/native/ConvUtils.h | 2 +-
 aten/src/ATen/native/Convolution.cpp | 2 +-
 aten/src/ATen/native/SpectralOps.cpp | 4 +-
 aten/src/ATen/native/TensorCompare.cpp | 2 +-
 aten/src/ATen/native/cuda/Reduce.cu | 8 +-
 aten/src/ATen/native/cuda/ScaledGroupMM.cu | 4 +-
 aten/src/ATen/native/cuda/jit_utils.cpp | 16 +--
 aten/src/ATen/native/cudnn/ConvShared.cpp | 28 ++---
 aten/src/ATen/native/cudnn/Conv_v7.cpp | 9 +-
 .../native/metal/MetalTensorImplStorage.mm | 2 +-
 aten/src/ATen/native/mkldnn/xpu/Conv.cpp | 2 +-
 .../qnnpack/test/avgpool-microkernel-tester.h | 8 +-
 .../qnnpack/test/maxpool-microkernel-tester.h | 2 +-
 aten/src/ATen/native/utils/ParamUtils.h | 2 +-
 aten/src/ATen/native/vulkan/api/Adapter.cpp | 8 +-
 aten/src/ATen/native/vulkan/api/Exception.cpp | 6 +-
 aten/src/ATen/native/vulkan/api/QueryPool.cpp | 4 +-
 aten/src/ATen/native/vulkan/api/Runtime.cpp | 2 +-
 aten/src/ATen/native/vulkan/api/Utils.h | 2 +-
 aten/src/ATen/test/basic.cpp | 2 +-
 aten/src/ATen/test/scalar_test.cpp | 10 +-
 aten/src/ATen/test/test_install/main.cpp | 2 +-
 aten/src/ATen/test/vec_test_all_types.cpp | 12 +-
 aten/src/ATen/test/vitals.cpp | 2 +-
 aten/src/ATen/test/vulkan_api_test.cpp | 12 +-
 .../ATen/test/vulkan_quantized_api_test.cpp | 46 +++----
 c10/core/DispatchKeySet.cpp | 2 +-
 c10/core/TensorOptions.cpp | 2 +-
 c10/cuda/CUDADeviceAssertionHost.cpp | 16 +--
 c10/test/core/DispatchKeySet_test.cpp | 2 +-
 c10/test/util/Half_test.cpp | 4 +-
 c10/test/util/logging_test.cpp | 2 +-
 c10/util/ArrayRef.h | 4 +-
c10/util/Backtrace.cpp | 12 +- c10/util/Exception.cpp | 6 +- c10/util/Logging.cpp | 12 +- c10/util/SmallVector.h | 4 +- c10/util/StringUtil.cpp | 2 +- c10/util/StringUtil.h | 4 +- c10/util/signal_handler.cpp | 2 +- c10/util/sparse_bitset.h | 4 +- torch/csrc/DataLoader.cpp | 2 +- torch/csrc/Device.cpp | 4 +- torch/csrc/Module.cpp | 8 +- torch/csrc/TypeInfo.cpp | 4 +- .../torch/detail/TensorDataContainer.h | 8 +- .../api/include/torch/nn/modules/batchnorm.h | 2 +- .../nn/modules/container/parameterdict.h | 8 +- .../nn/modules/container/parameterlist.h | 8 +- .../csrc/api/include/torch/nn/modules/conv.h | 12 +- .../include/torch/nn/modules/instancenorm.h | 2 +- .../api/include/torch/nn/modules/pooling.h | 8 +- torch/csrc/api/src/nn/module.cpp | 4 +- torch/csrc/api/src/nn/modules/activation.cpp | 34 ++--- torch/csrc/api/src/nn/modules/distance.cpp | 4 +- torch/csrc/api/src/nn/modules/dropout.cpp | 10 +- torch/csrc/api/src/nn/modules/embedding.cpp | 4 +- torch/csrc/api/src/nn/modules/fold.cpp | 4 +- torch/csrc/api/src/nn/modules/linear.cpp | 6 +- torch/csrc/api/src/nn/modules/loss.cpp | 10 +- .../csrc/api/src/nn/modules/normalization.cpp | 8 +- torch/csrc/api/src/nn/modules/padding.cpp | 16 +-- .../csrc/api/src/nn/modules/pixelshuffle.cpp | 4 +- torch/csrc/api/src/nn/modules/pooling.cpp | 14 +-- torch/csrc/api/src/nn/modules/rnn.cpp | 6 +- torch/csrc/api/src/nn/modules/upsampling.cpp | 2 +- torch/csrc/autograd/saved_variable.cpp | 8 +- torch/csrc/cuda/Module.cpp | 2 +- .../distributed/c10d/FlightRecorderDetail.hpp | 4 +- .../distributed/c10d/ProcessGroupNCCL.cpp | 2 +- .../distributed/c10d/ProcessGroupWrapper.cpp | 2 +- torch/csrc/distributed/c10d/UCCTracing.cpp | 2 +- torch/csrc/distributed/c10d/UCCUtils.cpp | 6 +- torch/csrc/distributed/c10d/Utils.hpp | 4 +- .../c10d/control_plane/WorkerServer.cpp | 8 +- torch/csrc/distributed/c10d/logger.cpp | 4 +- torch/csrc/distributed/c10d/reducer.cpp | 4 +- .../symm_mem/CUDASymmetricMemoryUtils.hpp | 2 +- .../c10d/symm_mem/DMAConnectivity.cpp | 2 +- .../c10d/symm_mem/NCCLSymmetricMemory.cu | 2 +- .../c10d/symm_mem/NVSHMEMSymmetricMemory.cu | 2 +- .../c10d/symm_mem/intra_node_comm.cpp | 4 +- .../c10d/symm_mem/nvshmem_extension.cu | 2 +- torch/csrc/distributed/rpc/rpc_agent.cpp | 2 +- torch/csrc/distributed/rpc/rref_impl.cpp | 4 +- torch/csrc/distributed/rpc/types.cpp | 2 +- .../csrc/dynamo/python_compiled_autograd.cpp | 6 +- torch/csrc/export/upgrader.cpp | 2 +- .../inductor/aoti_eager/kernel_meta_info.cpp | 4 +- .../aoti_package/model_package_loader.cpp | 2 +- torch/csrc/inductor/aoti_runtime/model_base.h | 2 +- .../csrc/inductor/aoti_torch/shim_common.cpp | 6 +- torch/csrc/jit/api/module.cpp | 2 +- torch/csrc/jit/api/module.h | 2 +- torch/csrc/jit/backends/backend_detail.cpp | 12 +- torch/csrc/jit/codegen/fuser/tensor_desc.h | 6 +- .../jit/frontend/concrete_module_type.cpp | 24 ++-- torch/csrc/jit/frontend/error_report.cpp | 2 +- torch/csrc/jit/frontend/ir_emitter.cpp | 10 +- torch/csrc/jit/frontend/parser.cpp | 2 +- torch/csrc/jit/frontend/schema_matching.cpp | 4 +- torch/csrc/jit/frontend/source_range.cpp | 6 +- torch/csrc/jit/frontend/tree.h | 16 +-- torch/csrc/jit/ir/alias_analysis.cpp | 20 +-- torch/csrc/jit/ir/ir.cpp | 46 +++---- torch/csrc/jit/jit_log.cpp | 6 +- torch/csrc/jit/mobile/debug_info.cpp | 2 +- torch/csrc/jit/mobile/import_data.cpp | 2 +- torch/csrc/jit/mobile/interpreter.cpp | 4 +- torch/csrc/jit/mobile/model_tracer/tracer.cpp | 4 +- torch/csrc/jit/passes/check_strict_fusion.cpp | 6 +- torch/csrc/jit/passes/liveness.cpp | 8 +- 
torch/csrc/jit/passes/onnx.cpp | 2 +- torch/csrc/jit/passes/onnx/constant_map.cpp | 10 +- .../jit/passes/onnx/function_extraction.cpp | 2 +- .../onnx/remove_inplace_ops_for_onnx.cpp | 2 +- .../jit/passes/symbolic_shape_analysis.cpp | 4 +- .../csrc/jit/passes/utils/subgraph_utils.cpp | 4 +- torch/csrc/jit/python/init.cpp | 2 +- torch/csrc/jit/python/python_arg_flatten.h | 8 +- torch/csrc/jit/python/python_ir.cpp | 12 +- torch/csrc/jit/python/python_tracer.cpp | 6 +- torch/csrc/jit/python/script_init.cpp | 10 +- torch/csrc/jit/runtime/argument_spec.cpp | 2 +- torch/csrc/jit/runtime/argument_spec.h | 12 +- torch/csrc/jit/runtime/instruction.cpp | 4 +- torch/csrc/jit/runtime/interpreter.cpp | 8 +- .../csrc/jit/runtime/interpreter/code_impl.h | 6 +- torch/csrc/jit/runtime/register_prim_ops.cpp | 10 +- torch/csrc/jit/runtime/static/impl.cpp | 10 +- torch/csrc/jit/serialization/onnx.cpp | 42 +++---- torch/csrc/jit/serialization/pickler.cpp | 2 +- torch/csrc/jit/serialization/python_print.cpp | 119 +++++++++--------- torch/csrc/jit/tensorexpr/block_codegen.cpp | 41 +++--- .../csrc/jit/tensorexpr/bounds_inference.cpp | 8 +- torch/csrc/jit/tensorexpr/bounds_overlap.cpp | 2 +- torch/csrc/jit/tensorexpr/codegen.cpp | 2 +- torch/csrc/jit/tensorexpr/cpp_codegen.cpp | 48 +++---- torch/csrc/jit/tensorexpr/cuda_codegen.cpp | 54 ++++---- torch/csrc/jit/tensorexpr/ir_printer.cpp | 104 +++++++-------- torch/csrc/jit/tensorexpr/loopnest.cpp | 4 +- .../jit/tensorexpr/loopnest_randomization.cpp | 2 +- .../jit/tensorexpr/mem_dependency_checker.cpp | 24 ++-- torch/csrc/jit/tensorexpr/registerizer.cpp | 8 +- torch/csrc/jit/tensorexpr/types.cpp | 2 +- torch/csrc/jit/testing/file_check.cpp | 8 +- torch/csrc/lazy/core/debug_util.cpp | 6 +- torch/csrc/lazy/core/ir.cpp | 2 +- torch/csrc/lazy/core/ir_dump_util.cpp | 18 +-- torch/csrc/lazy/core/ir_metadata.cpp | 6 +- torch/csrc/lazy/core/lazy_graph_executor.cpp | 2 +- torch/csrc/lazy/core/shape_inference.cpp | 2 +- torch/csrc/lazy/core/trie.cpp | 2 +- torch/csrc/monitor/counters.h | 2 +- torch/csrc/profiler/kineto_shim.cpp | 12 +- .../standalone/execution_trace_observer.cpp | 8 +- torch/csrc/profiler/stubs/cuda.cpp | 2 +- torch/csrc/profiler/unwind/action.h | 8 +- torch/csrc/profiler/unwind/eh_frame_hdr.h | 2 +- torch/csrc/profiler/unwind/fde.h | 24 ++-- torch/csrc/profiler/unwind/unwind.cpp | 2 +- torch/csrc/profiler/util.cpp | 10 +- torch/csrc/tensor/python_tensor.cpp | 2 +- torch/csrc/utils/python_arg_parser.cpp | 14 +-- torch/csrc/utils/python_dispatch.cpp | 2 +- torch/csrc/utils/structseq.cpp | 2 +- torch/csrc/utils/tensor_types.cpp | 4 +- torch/csrc/xpu/Module.cpp | 2 +- torch/nativert/executor/OpKernel.cpp | 2 +- .../executor/memory/FunctionSchema.cpp | 4 +- torch/nativert/graph/Graph.cpp | 16 +-- torch/nativert/graph/GraphSignature.cpp | 22 ++-- .../graph/passes/pass_manager/PassManager.cpp | 4 +- 209 files changed, 920 insertions(+), 927 deletions(-) diff --git a/aten/src/ATen/LegacyBatchedTensorImpl.h b/aten/src/ATen/LegacyBatchedTensorImpl.h index 798e3535af3fb..f051e7b1f6531 100644 --- a/aten/src/ATen/LegacyBatchedTensorImpl.h +++ b/aten/src/ATen/LegacyBatchedTensorImpl.h @@ -144,7 +144,7 @@ inline std::bitset createVmapLevelsBitset(BatchDimsRef bdims) { } inline std::ostream& operator<<(std::ostream& out, const BatchDim& bdim) { - out << "(lvl=" << bdim.level() << ", dim=" << bdim.dim() << ")"; + out << "(lvl=" << bdim.level() << ", dim=" << bdim.dim() << ')'; return out; } diff --git a/aten/src/ATen/TensorIndexing.cpp b/aten/src/ATen/TensorIndexing.cpp index 
1fa852686656f..8618a67259c9c 100644 --- a/aten/src/ATen/TensorIndexing.cpp +++ b/aten/src/ATen/TensorIndexing.cpp @@ -9,7 +9,7 @@ namespace indexing { const EllipsisIndexType Ellipsis = EllipsisIndexType(); std::ostream& operator<<(std::ostream& stream, const Slice& slice) { - stream << slice.start() << ":" << slice.stop() << ":" << slice.step(); + stream << slice.start() << ':' << slice.stop() << ':' << slice.step(); return stream; } @@ -31,12 +31,12 @@ std::ostream& operator<<(std::ostream& stream, const TensorIndex& tensor_index) } std::ostream& operator<<(std::ostream& stream, const std::vector& tensor_indices) { - stream << "("; + stream << '('; for (const auto i : c10::irange(tensor_indices.size())) { stream << tensor_indices[i]; if (i < tensor_indices.size() - 1) stream << ", "; } - stream << ")"; + stream << ')'; return stream; } diff --git a/aten/src/ATen/TensorNames.cpp b/aten/src/ATen/TensorNames.cpp index bff12aa8de65f..ac6857b95c1d6 100644 --- a/aten/src/ATen/TensorNames.cpp +++ b/aten/src/ATen/TensorNames.cpp @@ -113,7 +113,7 @@ void TensorNames::checkUnique(const char* op_name) const { std::ostream& operator<<(std::ostream& out, const TensorName& tensorname) { out << tensorname.name_ << " (index "; out << tensorname.origin_idx_ << " of "; - out << tensorname.origin_ << ")"; + out << tensorname.origin_ << ')'; return out; } diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index 8236751679f06..2752ff792e485 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -13,9 +13,9 @@ std::ostream& operator<<(std::ostream & out, const TensorGeometryArg& t) { if (t.pos == 0) { // 0 is distinguished; it usually indicates 'self' or the return // tensor - out << "'" << t.name << "'"; + out << '\'' << t.name << '\''; } else { - out << "argument #" << t.pos << " '" << t.name << "'"; + out << "argument #" << t.pos << " '" << t.name << '\''; } return out; } @@ -154,7 +154,7 @@ void checkSameGPU(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) { oss << "Tensor for " << t2 << " is on CPU, "; } oss << "but expected " << ((!t1->is_cpu() && !t2->is_cpu()) ? "them" : "it") - << " to be on GPU (while checking arguments for " << c << ")"; + << " to be on GPU (while checking arguments for " << c << ')'; TORCH_CHECK(false, oss.str()); } TORCH_CHECK( @@ -199,7 +199,7 @@ void checkScalarTypes(CheckedFrom c, const TensorArg& t, i++; } oss << "; but got " << t->toString() - << " instead (while checking arguments for " << c << ")"; + << " instead (while checking arguments for " << c << ')'; TORCH_CHECK(false, oss.str()); } } diff --git a/aten/src/ATen/Version.cpp b/aten/src/ATen/Version.cpp index 7239f357fdd64..a6335d9e11304 100644 --- a/aten/src/ATen/Version.cpp +++ b/aten/src/ATen/Version.cpp @@ -43,8 +43,8 @@ std::string get_mkldnn_version() { // https://github.com/intel/ideep/issues/29 { const dnnl_version_t* ver = dnnl_version(); - ss << "Intel(R) MKL-DNN v" << ver->major << "." << ver->minor << "." << ver->patch - << " (Git Hash " << ver->hash << ")"; + ss << "Intel(R) MKL-DNN v" << ver->major << '.' << ver->minor << '.' << ver->patch + << " (Git Hash " << ver->hash << ')'; } #else ss << "MKLDNN not found"; @@ -81,7 +81,7 @@ std::string get_openmp_version() { break; } if (ver_str) { - ss << " (a.k.a. OpenMP " << ver_str << ")"; + ss << " (a.k.a. OpenMP " << ver_str << ')'; } } #else @@ -135,38 +135,38 @@ std::string show_config() { #if defined(__GNUC__) { - ss << " - GCC " << __GNUC__ << "." 
<< __GNUC_MINOR__ << "\n"; + ss << " - GCC " << __GNUC__ << '.' << __GNUC_MINOR__ << '\n'; } #endif #if defined(__cplusplus) { - ss << " - C++ Version: " << __cplusplus << "\n"; + ss << " - C++ Version: " << __cplusplus << '\n'; } #endif #if defined(__clang_major__) { - ss << " - clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__ << "\n"; + ss << " - clang " << __clang_major__ << '.' << __clang_minor__ << '.' << __clang_patchlevel__ << '\n'; } #endif #if defined(_MSC_VER) { - ss << " - MSVC " << _MSC_FULL_VER << "\n"; + ss << " - MSVC " << _MSC_FULL_VER << '\n'; } #endif #if AT_MKL_ENABLED() - ss << " - " << get_mkl_version() << "\n"; + ss << " - " << get_mkl_version() << '\n'; #endif #if AT_MKLDNN_ENABLED() - ss << " - " << get_mkldnn_version() << "\n"; + ss << " - " << get_mkldnn_version() << '\n'; #endif #ifdef _OPENMP - ss << " - " << get_openmp_version() << "\n"; + ss << " - " << get_openmp_version() << '\n'; #endif #if AT_BUILD_WITH_LAPACK() @@ -183,7 +183,7 @@ std::string show_config() { ss << " - Cross compiling on MacOSX\n"; #endif - ss << " - "<< used_cpu_capability() << "\n"; + ss << " - "<< used_cpu_capability() << '\n'; if (hasCUDA()) { ss << detail::getCUDAHooks().showConfig(); @@ -200,10 +200,10 @@ std::string show_config() { ss << " - Build settings: "; for (const auto& pair : caffe2::GetBuildOptions()) { if (!pair.second.empty()) { - ss << pair.first << "=" << pair.second << ", "; + ss << pair.first << '=' << pair.second << ", "; } } - ss << "\n"; + ss << '\n'; // TODO: do HIP // TODO: do XLA diff --git a/aten/src/ATen/code_template.h b/aten/src/ATen/code_template.h index 2026795fc0a3d..2cde802dac172 100644 --- a/aten/src/ATen/code_template.h +++ b/aten/src/ATen/code_template.h @@ -209,7 +209,7 @@ struct CodeTemplate { // to indent correctly in the context. 
void emitIndent(std::ostream& out, size_t indent) const { for ([[maybe_unused]] const auto i : c10::irange(indent)) { - out << " "; + out << ' '; } } void emitStringWithIndents( diff --git a/aten/src/ATen/core/Dimname.cpp b/aten/src/ATen/core/Dimname.cpp index c78d554732b9e..66aa8cb69e1ed 100644 --- a/aten/src/ATen/core/Dimname.cpp +++ b/aten/src/ATen/core/Dimname.cpp @@ -10,7 +10,7 @@ std::ostream& operator<<(std::ostream& out, const Dimname& dimname) { if (dimname.type() == NameType::WILDCARD) { out << "None"; } else { - out << "'" << dimname.symbol().toUnqualString() << "'"; + out << '\'' << dimname.symbol().toUnqualString() << '\''; } return out; } diff --git a/aten/src/ATen/core/Range.cpp b/aten/src/ATen/core/Range.cpp index 06a79a9c7d063..b5f4c7b6f85bc 100644 --- a/aten/src/ATen/core/Range.cpp +++ b/aten/src/ATen/core/Range.cpp @@ -5,7 +5,7 @@ namespace at { std::ostream& operator<<(std::ostream& out, const Range& range) { - out << "Range[" << range.begin << ", " << range.end << "]"; + out << "Range[" << range.begin << ", " << range.end << ']'; return out; } diff --git a/aten/src/ATen/core/Tensor.cpp b/aten/src/ATen/core/Tensor.cpp index c5f887f096cd1..090e77e703736 100644 --- a/aten/src/ATen/core/Tensor.cpp +++ b/aten/src/ATen/core/Tensor.cpp @@ -71,7 +71,7 @@ void TensorBase::enforce_invariants() { void TensorBase::print() const { if (defined()) { - std::cerr << "[" << toString() << " " << sizes() << "]" << '\n'; + std::cerr << '[' << toString() << ' ' << sizes() << ']' << '\n'; } else { std::cerr << "[UndefinedTensor]" << '\n'; } diff --git a/aten/src/ATen/core/Vitals.cpp b/aten/src/ATen/core/Vitals.cpp index 1cfc720aca52b..ac1ee45d58345 100644 --- a/aten/src/ATen/core/Vitals.cpp +++ b/aten/src/ATen/core/Vitals.cpp @@ -9,8 +9,8 @@ APIVitals VitalsAPI; std::ostream& operator<<(std::ostream& os, TorchVital const& tv) { for (const auto& m : tv.attrs) { - os << "[TORCH_VITAL] " << tv.name << "." << m.first << "\t\t " - << m.second.value << "\n"; + os << "[TORCH_VITAL] " << tv.name << '.' 
<< m.first << "\t\t " + << m.second.value << '\n'; } return os; } diff --git a/aten/src/ATen/core/alias_info.h b/aten/src/ATen/core/alias_info.h index bf0ff6ee72d3b..6a3335c328be2 100644 --- a/aten/src/ATen/core/alias_info.h +++ b/aten/src/ATen/core/alias_info.h @@ -100,18 +100,18 @@ inline bool operator==(const AliasInfo& lhs, const AliasInfo& rhs) { // this does match the way things are represented in the schema inline std::ostream& operator<<(std::ostream& out, const AliasInfo& aliasInfo) { - out << "("; + out << '('; bool first = true; for (const auto& set : aliasInfo.beforeSets()) { if (first) { first = false; } else { - out << "|"; + out << '|'; } out << set.toUnqualString(); } if (aliasInfo.isWrite()) { - out << "!"; + out << '!'; } if (aliasInfo.beforeSets() != aliasInfo.afterSets()) { out << " -> "; @@ -120,12 +120,12 @@ inline std::ostream& operator<<(std::ostream& out, const AliasInfo& aliasInfo) { if (first) { first = false; } else { - out << "|"; + out << '|'; } out << set.toUnqualString(); } } - out << ")"; + out << ')'; return out; } } // namespace c10 diff --git a/aten/src/ATen/core/blob.h b/aten/src/ATen/core/blob.h index 251da65e0896f..617d6a982ab4e 100644 --- a/aten/src/ATen/core/blob.h +++ b/aten/src/ATen/core/blob.h @@ -198,7 +198,7 @@ inline void swap(Blob& lhs, Blob& rhs) noexcept { } inline std::ostream& operator<<(std::ostream& out, const Blob& v) { - return out << "Blob[" << v.TypeName() << "]"; + return out << "Blob[" << v.TypeName() << ']'; } } // namespace caffe2 diff --git a/aten/src/ATen/core/class_type.cpp b/aten/src/ATen/core/class_type.cpp index 800d9ea0ef9f6..a65124e80979e 100644 --- a/aten/src/ATen/core/class_type.cpp +++ b/aten/src/ATen/core/class_type.cpp @@ -456,8 +456,8 @@ bool ClassType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const { *why_not << "Method on class '" << repr_str() << "' (1) is not compatible with interface '" << rhs.repr_str() << "' (2)\n" - << " (1) " << self_method->getSchema() << "\n" - << " (2) " << schema << "\n"; + << " (1) " << self_method->getSchema() << '\n' + << " (2) " << schema << '\n'; } return false; } diff --git a/aten/src/ATen/core/class_type.h b/aten/src/ATen/core/class_type.h index ea537400ef73d..f6f6bade9c90d 100644 --- a/aten/src/ATen/core/class_type.h +++ b/aten/src/ATen/core/class_type.h @@ -100,7 +100,7 @@ struct TORCH_API ClassType : public NamedType { std::string repr_str() const override { std::stringstream ss; ss << str() - << " (of Python compilation unit at: " << compilation_unit().get() << ")"; + << " (of Python compilation unit at: " << compilation_unit().get() << ')'; return ss.str(); } diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp index 9180d0d19e644..369bd374747ad 100644 --- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp +++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp @@ -58,12 +58,12 @@ std::string DispatchKeyExtractor::dumpState() const { std::ostringstream oss; for (const auto i : c10::irange(c10::utils::bitset::NUM_BITS())) { if (dispatch_arg_indices_reverse_.get(i)) { - oss << "1"; + oss << '1'; } else { - oss << "0"; + oss << '0'; } } - oss << " " << nonFallthroughKeys_ << "\n"; + oss << ' ' << nonFallthroughKeys_ << '\n'; return oss.str(); } diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp index afcaf51f231ae..5facca30a54f3 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.cpp +++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp @@ -69,8 
+69,8 @@ class RegistrationListenerList final { void _print_dispatch_trace(const std::string& label, const std::string& op_name, const DispatchKeySet& dispatchKeySet) { auto nesting_value = dispatch_trace_nesting_value(); - for (int64_t i = 0; i < nesting_value; ++i) std::cerr << " "; - std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << "]" << std::endl; + for (int64_t i = 0; i < nesting_value; ++i) std::cerr << ' '; + std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << ']' << std::endl; } } // namespace detail diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index 928474ec3336d..e2627354971a0 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -570,7 +570,7 @@ void OperatorEntry::checkInvariants() const { std::string OperatorEntry::listAllDispatchKeys() const { std::ostringstream str; - str << "["; + str << '['; bool has_kernels = false; for (auto k : allDispatchKeysInFullSet()) { @@ -584,7 +584,7 @@ std::string OperatorEntry::listAllDispatchKeys() const { str << k; has_kernels = true; } - str << "]"; + str << ']'; return str.str(); } @@ -683,12 +683,12 @@ void OperatorEntry::setReportErrorCallback_(std::unique_ptr c // This WON'T report backend fallbacks. std::string OperatorEntry::dumpState() const { std::ostringstream oss; - oss << "name: " << name_ << "\n"; + oss << "name: " << name_ << '\n'; if (schema_) { - oss << "schema: " << schema_->schema << "\n"; - oss << "debug: " << schema_->debug << "\n"; + oss << "schema: " << schema_->schema << '\n'; + oss << "debug: " << schema_->debug << '\n'; oss << "alias analysis kind: " << toString(schema_->schema.aliasAnalysis()) - << (schema_->schema.isDefaultAliasAnalysisKind() ? " (default)" : "") << "\n"; + << (schema_->schema.isDefaultAliasAnalysisKind() ? " (default)" : "") << '\n'; } else { oss << "schema: (none)\n"; } diff --git a/aten/src/ATen/core/function_schema.cpp b/aten/src/ATen/core/function_schema.cpp index 6587af0f9ccc0..ffccbe282ddd2 100644 --- a/aten/src/ATen/core/function_schema.cpp +++ b/aten/src/ATen/core/function_schema.cpp @@ -7,7 +7,7 @@ namespace c10 { void FunctionSchema::dump() const { - std::cout << *this << "\n"; + std::cout << *this << '\n'; } const std::vector& FunctionSchema::getCorrectList(SchemaArgType type) const { @@ -210,9 +210,9 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) { out << schema.name(); if (!schema.overload_name().empty()) { - out << "." << schema.overload_name(); + out << '.' 
<< schema.overload_name(); } - out << "("; + out << '('; bool seen_kwarg_only = false; for (const auto i : c10::irange(schema.arguments().size())) { @@ -273,7 +273,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) { } if (need_paren) { - out << "("; + out << '('; } for (const auto i : c10::irange(returns.size())) { if (i > 0) { @@ -288,7 +288,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) { out << "..."; } if (need_paren) { - out << ")"; + out << ')'; } return out; } @@ -471,7 +471,7 @@ bool FunctionSchema::isForwardCompatibleWith( if (!arguments().at(i).isForwardCompatibleWith(old.arguments().at(i))) { if (why_not) { why_not - << "'" << arguments().at(i).name() << "'" + << '\'' << arguments().at(i).name() << '\'' << " is not forward compatible with the older version of the schema"; } return false; @@ -511,7 +511,7 @@ bool FunctionSchema::isForwardCompatibleWith( .isForwardCompatibleWith(old.arguments().at(i))) { if (why_not) { why_not << "Out argument '" - << "'" << arguments().at(i).name() + << '\'' << arguments().at(i).name() << " is not FC with the older version of the schema"; } return false; diff --git a/aten/src/ATen/core/function_schema.h b/aten/src/ATen/core/function_schema.h index c3e1520dc9868..f349567c26478 100644 --- a/aten/src/ATen/core/function_schema.h +++ b/aten/src/ATen/core/function_schema.h @@ -571,7 +571,7 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) { if (arg.N()) { N = std::to_string(*arg.N()); } - out << "[" << N << "]"; + out << '[' << N << ']'; } else { out << unopt_type->str(); } @@ -582,15 +582,15 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) { } if (is_opt) { - out << "?"; + out << '?'; } if (!arg.name().empty()) { - out << " " << arg.name(); + out << ' ' << arg.name(); } if (arg.default_value()) { - out << "="; + out << '='; if ((type->kind() == c10::TypeKind::StringType || unopt_type->kind() == c10::TypeKind::StringType) && arg.default_value().value().isString()) { diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 1ff8dd0410949..6e4ee82ab1137 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -66,7 +66,7 @@ bool operator==(const ivalue::Tuple& lhs, const ivalue::Tuple& rhs) { } std::ostream& operator<<(std::ostream& out, const ivalue::EnumHolder& v) { - out << v.qualifiedClassName() << "." << v.name(); + out << v.qualifiedClassName() << '.' 
<< v.name(); return out; } @@ -526,7 +526,7 @@ std::ostream& printMaybeAnnotatedList( !elementTypeCanBeInferredFromMembers(list_elem_type)) { out << "annotate(" << the_list.type()->annotation_str() << ", "; printList(out, the_list.toListRef(), "[", "]", formatter); - out << ")"; + out << ')'; return out; } else { return printList(out, the_list.toListRef(), "[", "]", formatter); @@ -538,7 +538,7 @@ std::ostream& printDict( std::ostream& out, const Dict& v, const IValueFormatter& formatter) { - out << "{"; + out << '{'; bool first = true; for (const auto& pair : v) { @@ -552,7 +552,7 @@ std::ostream& printDict( first = false; } - out << "}"; + out << '}'; return out; } } @@ -565,8 +565,8 @@ static std::ostream& printMaybeAnnotatedDict( auto value_type = the_dict.type()->castRaw()->getValueType(); if (the_dict.toGenericDict().empty() || !elementTypeCanBeInferredFromMembers(value_type)) { - out << "annotate(" << the_dict.type()->annotation_str() << ","; - printDict(out, the_dict.toGenericDict(), formatter) << ")"; + out << "annotate(" << the_dict.type()->annotation_str() << ','; + printDict(out, the_dict.toGenericDict(), formatter) << ')'; } else { return printDict(out, the_dict.toGenericDict(), formatter); } @@ -577,7 +577,7 @@ static std::ostream& printComplex(std::ostream & out, const IValue & v) { c10::complex d = v.toComplexDouble(); IValue real(d.real()), imag(std::abs(d.imag())); auto sign = d.imag() >= 0 ? '+' : '-'; - return out << real << sign << imag << "j"; + return out << real << sign << imag << 'j'; } std::ostream& IValue::repr( @@ -605,9 +605,9 @@ std::ostream& IValue::repr( if (static_cast(i) == d) { // -0.0 (signed zero) needs to be parsed as -0. if (i == 0 && std::signbit(d)) { - return out << "-" << i << "."; + return out << '-' << i << '.'; } - return out << i << "."; + return out << i << '.'; } } auto orig_prec = out.precision(); @@ -643,20 +643,20 @@ std::ostream& IValue::repr( device_stream << v.toDevice(); out << "torch.device("; c10::printQuotedString(out, device_stream.str()); - return out << ")"; + return out << ')'; } case IValue::Tag::Generator: { auto generator = v.toGenerator(); out << "torch.Generator(device="; c10::printQuotedString(out, generator.device().str()); - out << ", seed=" << generator.current_seed() << ")"; + out << ", seed=" << generator.current_seed() << ')'; return out; } case IValue::Tag::GenericDict: return printMaybeAnnotatedDict(out, v, formatter); case IValue::Tag::Enum: { auto enum_holder = v.toEnumHolder(); - return out << enum_holder->qualifiedClassName() << "." << + return out << enum_holder->qualifiedClassName() << '.' << enum_holder->name(); } case IValue::Tag::Object: { @@ -801,7 +801,7 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) { if (c == FP_NORMAL || c == FP_ZERO) { int64_t i = static_cast(d); if (static_cast(i) == d) { - return out << i << "."; + return out << i << '.'; } } auto orig_prec = out.precision(); @@ -852,7 +852,7 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) { return printDict(out, v.toGenericDict(), formatter); case IValue::Tag::PyObject: { auto py_obj = v.toPyObject(); - return out << ""; + return out << "'; } case IValue::Tag::Generator: return out << "Generator"; @@ -862,22 +862,22 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) { // TODO we should attempt to call __str__ if the object defines it. 
auto obj = v.toObject(); // print this out the way python would do it - return out << "<" << obj->name() << " object at " << obj.get() << ">"; + return out << '<' << obj->name() << " object at " << obj.get() << '>'; } case IValue::Tag::Enum: { auto enum_holder = v.toEnumHolder(); - return out << "Enum<" << enum_holder->unqualifiedClassName() << "." << - enum_holder->name() << ">"; + return out << "Enum<" << enum_holder->unqualifiedClassName() << '.' << + enum_holder->name() << '>'; } } - return out << ""; + return out << " ivalue::Object::type() const { @@ -1050,7 +1050,7 @@ c10::intrusive_ptr ivalue::Object::deepcopy( std::stringstream err; err << "Cannot serialize custom bound C++ class"; if (auto qualname = type()->name()) { - err << " " << qualname->qualifiedName(); + err << ' ' << qualname->qualifiedName(); } err << ". Please define serialization methods via def_pickle() for " "this class."; diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index 666d1ade5789c..535831ea11d6e 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -211,7 +211,7 @@ struct TORCH_API OptionalType : public UnionType { std::string str() const override { std::stringstream ss; - ss << getElementType()->str() << "?"; + ss << getElementType()->str() << '?'; return ss.str(); } @@ -240,7 +240,7 @@ struct TORCH_API OptionalType : public UnionType { std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "Optional[" << getElementType()->annotation_str(printer) << "]"; + ss << "Optional[" << getElementType()->annotation_str(printer) << ']'; return ss.str(); } }; @@ -906,7 +906,7 @@ struct TORCH_API ListType std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "List[" << getElementType()->annotation_str(printer) << "]"; + ss << "List[" << getElementType()->annotation_str(printer) << ']'; return ss.str(); } }; @@ -946,7 +946,7 @@ struct TORCH_API DictType : public SharedType { std::string str() const override { std::stringstream ss; ss << "Dict(" << getKeyType()->str() << ", " << getValueType()->str() - << ")"; + << ')'; return ss.str(); } @@ -1018,7 +1018,7 @@ struct TORCH_API FutureType std::string str() const override { std::stringstream ss; - ss << "Future(" << getElementType()->str() << ")"; + ss << "Future(" << getElementType()->str() << ')'; return ss.str(); } TypePtr createWithContained( @@ -1041,7 +1041,7 @@ struct TORCH_API FutureType std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "Future[" << getElementType()->annotation_str(printer) << "]"; + ss << "Future[" << getElementType()->annotation_str(printer) << ']'; return ss.str(); } }; @@ -1060,7 +1060,7 @@ struct TORCH_API AwaitType std::string str() const override { std::stringstream ss; - ss << "Await(" << getElementType()->str() << ")"; + ss << "Await(" << getElementType()->str() << ')'; return ss.str(); } TypePtr createWithContained( @@ -1083,7 +1083,7 @@ struct TORCH_API AwaitType std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "Await[" << getElementType()->annotation_str(printer) << "]"; + ss << "Await[" << getElementType()->annotation_str(printer) << ']'; return ss.str(); } }; @@ -1102,7 +1102,7 @@ struct TORCH_API RRefType std::string str() const override { std::stringstream ss; - ss << "RRef(" << getElementType()->str() << ")"; + ss << 
"RRef(" << getElementType()->str() << ')'; return ss.str(); } TypePtr createWithContained( @@ -1115,7 +1115,7 @@ struct TORCH_API RRefType std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "RRef[" << getElementType()->annotation_str(printer) << "]"; + ss << "RRef[" << getElementType()->annotation_str(printer) << ']'; return ss.str(); } }; diff --git a/aten/src/ATen/core/operator_name.cpp b/aten/src/ATen/core/operator_name.cpp index 43a1fd24749a7..e55a84a4d305a 100644 --- a/aten/src/ATen/core/operator_name.cpp +++ b/aten/src/ATen/core/operator_name.cpp @@ -11,7 +11,7 @@ std::string toString(const OperatorName& opName) { std::ostream& operator<<(std::ostream& os, const OperatorName& opName) { os << opName.name; if (!opName.overload_name.empty()) { - os << "." << opName.overload_name; + os << '.' << opName.overload_name; } return os; } diff --git a/aten/src/ATen/core/tensor_type.cpp b/aten/src/ATen/core/tensor_type.cpp index 9d8080cb8f317..d428aceb3d04c 100644 --- a/aten/src/ATen/core/tensor_type.cpp +++ b/aten/src/ATen/core/tensor_type.cpp @@ -65,7 +65,7 @@ VaryingShape VaryingShape::merge(const VaryingShape& other) const { template std::ostream& operator<<(std::ostream& out, const VaryingShape& vs) { - out << "("; + out << '('; if (!vs.size()) { out << "*)"; return out; @@ -79,10 +79,10 @@ std::ostream& operator<<(std::ostream& out, const VaryingShape& vs) { if (v.has_value()) { out << v.value(); } else { - out << "*"; + out << '*'; } } - out << ")"; + out << ')'; return out; } @@ -105,7 +105,7 @@ std::ostream& operator<<( } auto sizes_opt = ss.sizes(); - os << "("; + os << '('; for (size_t i = 0; i < rank_opt.value(); i++) { if (i > 0) { os << ", "; @@ -113,10 +113,10 @@ std::ostream& operator<<( if(sizes_opt.has_value() && sizes_opt.value()[i].is_static()) { os << sizes_opt.value()[i]; } else { - os << "*"; + os << '*'; } } - os << ")"; + os << ')'; return os; } @@ -131,17 +131,17 @@ std::ostream& operator<<(std::ostream& os, const ShapeSymbol& s) { } std::ostream& operator<<(std::ostream& os, const Stride& s) { - os << "{"; + os << '{'; if (s.stride_index_.has_value()) { os << *s.stride_index_; } else { - os << "*"; + os << '*'; } - os << ":"; + os << ':'; if (s.stride_.has_value()) { os << *s.stride_; } else { - os << "*"; + os << '*'; } os << '}'; return os; diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index abba4e14583a3..46dc550b1f37b 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -67,7 +67,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { bool has_valid_strides_info = ndim > 0 && value->strides().isComplete() && value->strides().size() == ndim; - out << "("; + out << '('; size_t i = 0; bool symbolic = type_verbosity() == TypeVerbosity::Symbolic; for (i = 0; i < *ndim; ++i) { @@ -79,7 +79,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { } else if (symbolic) { out << value->symbolic_sizes().at(i); } else { - out << "*"; + out << '*'; } } if (has_valid_strides_info && @@ -91,7 +91,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { } out << value->strides()[i].value(); } - out << "]"; + out << ']'; } if (type_verbosity() >= TypeVerbosity::Full) { if (value->requiresGrad()) { @@ -107,12 +107,12 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { out << "device=" << *value->device(); } } - out << ")"; + out << ')'; } else { if (type_verbosity() >= TypeVerbosity::Full) { size_t i = 0; if 
(value->requiresGrad()) { - out << "(" + out << '(' << "requires_grad=" << *value->requiresGrad(); i++; } @@ -120,7 +120,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { out << ((i++ > 0) ? ", " : "(") << "device=" << *value->device(); } if (i > 0) { - out << ")"; + out << ')'; } } } @@ -133,18 +133,18 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { out << *prim << "[]"; } else if (t.kind() == TypeKind::OptionalType) { auto prim = t.castRaw()->getElementType(); - out << *prim << "?"; + out << *prim << '?'; } else if(t.kind() == TypeKind::FutureType) { auto elem = t.castRaw()->getElementType(); - out << "Future[" << *elem << "]"; + out << "Future[" << *elem << ']'; } else if(t.kind() == TypeKind::RRefType) { auto elem = t.castRaw()->getElementType(); - out << "RRef[" << *elem << "]"; + out << "RRef[" << *elem << ']'; } else if(auto tup = t.cast()) { if (tup->schema()) { out << "NamedTuple"; } - out << "("; + out << '('; for(size_t i = 0; i < tup->elements().size(); ++i) { if(i > 0) out << ", "; @@ -160,7 +160,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { out << *(tup->elements()[i]); } } - out << ")"; + out << ')'; } else if (t.kind() == TypeKind::FunctionType) { out << "Function"; } else { @@ -475,7 +475,7 @@ std::optional unifyTypeList( why_not << "Could not unify type list since element " << i << " of type " << elements.at(i)->repr_str() << " did not match the types before it (" - << ret_type->repr_str() << ")"; + << ret_type->repr_str() << ')'; return std::nullopt; } ret_type = *maybe_unified; @@ -907,13 +907,13 @@ std::string TupleType::str() const { // NOLINTNEXTLINE(bugprone-unchecked-optional-access) ss << name()->qualifiedName(); } else { - ss << "("; + ss << '('; for(size_t i = 0; i < elements().size(); ++i) { if(i > 0) ss << ", "; ss << elements()[i]->str(); } - ss << ")"; + ss << ')'; } return ss.str(); } @@ -1003,8 +1003,8 @@ bool InterfaceType::isSubTypeImpl( *why_not << "Method on interface '" << lhs.repr_str() << "' (1) is not compatible with interface '" << rhs.repr_str() << "' (2)\n" - << " (1) " << *self_schema << "\n" - << " (2) " << schema << "\n"; + << " (1) " << *self_schema << '\n' + << " (2) " << schema << '\n'; return false; } return false; @@ -1078,7 +1078,7 @@ SymbolicShape SymbolicShape::merge(const SymbolicShape& other) const { } void SymbolicShape::dump() const { - std::cout << *this << "\n"; + std::cout << *this << '\n'; } bool EnumType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const { diff --git a/aten/src/ATen/core/union_type.cpp b/aten/src/ATen/core/union_type.cpp index dc4cb78872182..8731c2cbc4952 100644 --- a/aten/src/ATen/core/union_type.cpp +++ b/aten/src/ATen/core/union_type.cpp @@ -205,9 +205,9 @@ UnionType::UnionType(std::vector reference, TypeKind kind) : SharedType for (const auto i : c10::irange(reference.size())) { msg << reference[i]->repr_str(); if (i > 0) { - msg << ","; + msg << ','; } - msg << " "; + msg << ' '; } msg << "} has the single type " << types_[0]->repr_str() << ". 
Use the common supertype instead of creating a Union" diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h index 50c3cc31a6c48..a2eb9e5f45104 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256.h @@ -80,7 +80,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { } stream << buf[i]; } - stream << "]"; + stream << ']'; return stream; } diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h index 975b71ce9a867..623971454df8b 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512.h @@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { } stream << buf[i]; } - stream << "]"; + stream << ']'; return stream; } diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 594045a1b41d2..b2b9be4498e5b 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -411,16 +411,16 @@ std::string CUDAHooks::showConfig() const { // HIP_VERSION value format was changed after ROCm v4.2 to include the patch number if(v < 500) { // If major=xx, minor=yy then format -> xxyy - oss << (v / 100) << "." << (v % 10); + oss << (v / 100) << '.' << (v % 10); } else { // If major=xx, minor=yy & patch=zzzzz then format -> xxyyzzzzz - oss << (v / 10000000) << "." << (v / 100000 % 100) << "." << (v % 100000); + oss << (v / 10000000) << '.' << (v / 100000 % 100) << '.' << (v % 100000); } #else - oss << (v / 1000) << "." << (v / 10 % 100); + oss << (v / 1000) << '.' << (v / 10 % 100); if (v % 10 != 0) { - oss << "." << (v % 10); + oss << '.' << (v % 10); } #endif }; @@ -431,16 +431,16 @@ std::string CUDAHooks::showConfig() const { oss << " - HIP Runtime "; #endif printCudaStyleVersion(runtimeVersion); - oss << "\n"; + oss << '\n'; // TODO: Make HIPIFY understand CUDART_VERSION macro #if !defined(USE_ROCM) if (runtimeVersion != CUDART_VERSION) { oss << " - Built with CUDA Runtime "; printCudaStyleVersion(CUDART_VERSION); - oss << "\n"; + oss << '\n'; } - oss << " - NVCC architecture flags: " << NVCC_FLAGS_EXTRA << "\n"; + oss << " - NVCC architecture flags: " << NVCC_FLAGS_EXTRA << '\n'; #endif #if !defined(USE_ROCM) @@ -448,9 +448,9 @@ std::string CUDAHooks::showConfig() const { auto printCudnnStyleVersion = [&](size_t v) { - oss << (v / 1000) << "." << (v / 100 % 10); + oss << (v / 1000) << '.' << (v / 100 % 10); if (v % 100 != 0) { - oss << "." << (v % 100); + oss << '.' << (v % 100); } }; @@ -461,22 +461,22 @@ std::string CUDAHooks::showConfig() const { if (cudnnCudartVersion != CUDART_VERSION) { oss << " (built against CUDA "; printCudaStyleVersion(cudnnCudartVersion); - oss << ")"; + oss << ')'; } - oss << "\n"; + oss << '\n'; if (cudnnVersion != CUDNN_VERSION) { oss << " - Built with CuDNN "; printCudnnStyleVersion(CUDNN_VERSION); - oss << "\n"; + oss << '\n'; } #endif #else // TODO: Check if miopen has the functions above and unify - oss << " - MIOpen " << MIOPEN_VERSION_MAJOR << "." << MIOPEN_VERSION_MINOR << "." << MIOPEN_VERSION_PATCH << "\n"; + oss << " - MIOpen " << MIOPEN_VERSION_MAJOR << '.' << MIOPEN_VERSION_MINOR << '.' << MIOPEN_VERSION_PATCH << '\n'; #endif #if AT_MAGMA_ENABLED() - oss << " - Magma " << MAGMA_VERSION_MAJOR << "." << MAGMA_VERSION_MINOR << "." << MAGMA_VERSION_MICRO << "\n"; + oss << " - Magma " << MAGMA_VERSION_MAJOR << '.' << MAGMA_VERSION_MINOR << '.' 
<< MAGMA_VERSION_MICRO << '\n'; #endif return oss.str(); diff --git a/aten/src/ATen/cuda/jiterator.cu b/aten/src/ATen/cuda/jiterator.cu index 3af5104288d21..d664c828bdad6 100644 --- a/aten/src/ATen/cuda/jiterator.cu +++ b/aten/src/ATen/cuda/jiterator.cu @@ -42,7 +42,7 @@ static inline void launch_jitted_vectorized_kernel_dynamic( // The cache key includes all the parameters to generate_code + vec_size + dev_idx std::stringstream ss; - ss << nInputs << "_" << nOutputs << f; + ss << nInputs << '_' << nOutputs << f; ss << f_inputs_type_str << compute_type_str << result_type_str; ss << static_cast(at::cuda::jit::BinaryFuncVariant::NoScalar); ss << extra_args_types; @@ -144,7 +144,7 @@ static inline void launch_jitted_unrolled_kernel_dynamic( // The cache key includes all the parameters to generate_code + dev_idx std::stringstream ss; - ss << nInputs << "_" << nOutputs << f; + ss << nInputs << '_' << nOutputs << f; ss << f_inputs_type_str << compute_type_str << result_type_str; ss << contiguous << dynamic_casting; ss << static_cast(at::cuda::jit::BinaryFuncVariant::NoScalar); diff --git a/aten/src/ATen/cuda/tunable/Tunable.cpp b/aten/src/ATen/cuda/tunable/Tunable.cpp index 9fb04b40d30f6..eb7e381d27766 100644 --- a/aten/src/ATen/cuda/tunable/Tunable.cpp +++ b/aten/src/ATen/cuda/tunable/Tunable.cpp @@ -52,10 +52,10 @@ TuningContext* getTuningContext() { std::ostream& operator<<(std::ostream& stream, const ResultEntry& entry) { static const bool blaslog = c10::utils::get_env("PYTORCH_TUNABLEOP_BLAS_LOG") == "1"; if (!blaslog) { - return stream << entry.key_ << "," << entry.time_; + return stream << entry.key_ << ',' << entry.time_; } else { - return stream << entry.key_ << "," << entry.time_ << ",BLAS_PARAMS: " << entry.blas_sig_; + return stream << entry.key_ << ',' << entry.time_ << ",BLAS_PARAMS: " << entry.blas_sig_; } } @@ -156,10 +156,10 @@ void TuningResultsManager::RecordUntuned( std::ofstream& untuned_file, const std if (isNew) { static const bool blaslog = c10::utils::get_env("PYTORCH_TUNABLEOP_BLAS_LOG") == "1"; if (!blaslog) { - untuned_file << op_signature << "," << params_signature << std::endl; + untuned_file << op_signature << ',' << params_signature << std::endl; } else { - untuned_file << op_signature << "," << params_signature << ",BLAS_PARAMS: " << blas_signature << std::endl; + untuned_file << op_signature << ',' << params_signature << ",BLAS_PARAMS: " << blas_signature << std::endl; } TUNABLE_LOG3("Untuned,", op_signature, ",", params_signature); } @@ -201,7 +201,7 @@ void TuningResultsManager::InitRealtimeAppend(const std::string& filename, const if(!file_exists || file_empty) { for(const auto& [key, val] : validators) { - (*realtime_out_) << "Validator," << key << "," << val << std::endl; + (*realtime_out_) << "Validator," << key << ',' << val << std::endl; realtime_out_->flush(); } validators_written_ = true; @@ -219,7 +219,7 @@ void TuningResultsManager::AppendResultLine(const std::string& op_sig, const std return; } - (*realtime_out_) << op_sig << "," << param_sig << "," << result << std::endl; + (*realtime_out_) << op_sig << ',' << param_sig << ',' << result << std::endl; realtime_out_->flush(); //ensure immediate write to disk TUNABLE_LOG3("Realtime append: ", op_sig, "(", param_sig, ") -> ", result); diff --git a/aten/src/ATen/cudnn/Descriptors.cpp b/aten/src/ATen/cudnn/Descriptors.cpp index 8636d267209e9..a2cb0cb0a1025 100644 --- a/aten/src/ATen/cudnn/Descriptors.cpp +++ b/aten/src/ATen/cudnn/Descriptors.cpp @@ -93,31 +93,31 @@ std::string 
cudnnTypeToString(cudnnDataType_t dtype) { return "CUDNN_DATA_UINT8x4"; default: std::ostringstream oss; - oss << "(unknown data-type " << static_cast(dtype) << ")"; + oss << "(unknown data-type " << static_cast(dtype) << ')'; return oss.str(); } } std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) { - out << "TensorDescriptor " << static_cast(d.desc()) << "\n"; + out << "TensorDescriptor " << static_cast(d.desc()) << '\n'; int nbDims = 0; int dimA[CUDNN_DIM_MAX]; int strideA[CUDNN_DIM_MAX]; cudnnDataType_t dtype{}; cudnnGetTensorNdDescriptor(d.desc(), CUDNN_DIM_MAX, &dtype, &nbDims, dimA, strideA); - out << " type = " << cudnnTypeToString(dtype) << "\n"; - out << " nbDims = " << nbDims << "\n"; + out << " type = " << cudnnTypeToString(dtype) << '\n'; + out << " nbDims = " << nbDims << '\n'; // Read out only nbDims of the arrays! out << " dimA = "; for (auto i : ArrayRef{dimA, static_cast(nbDims)}) { out << i << ", "; } - out << "\n"; + out << '\n'; out << " strideA = "; for (auto i : ArrayRef{strideA, static_cast(nbDims)}) { out << i << ", "; } - out << "\n"; + out << '\n'; return out; } @@ -168,27 +168,27 @@ std::string cudnnMemoryFormatToString(cudnnTensorFormat_t tformat) { return "CUDNN_TENSOR_NHWC"; default: std::ostringstream oss; - oss << "(unknown cudnn tensor format " << static_cast(tformat) << ")"; + oss << "(unknown cudnn tensor format " << static_cast(tformat) << ')'; return oss.str(); } } std::ostream& operator<<(std::ostream & out, const FilterDescriptor& d) { - out << "FilterDescriptor " << static_cast(d.desc()) << "\n"; + out << "FilterDescriptor " << static_cast(d.desc()) << '\n'; int nbDims = 0; int dimA[CUDNN_DIM_MAX]; cudnnDataType_t dtype{}; cudnnTensorFormat_t tformat{}; cudnnGetFilterNdDescriptor(d.desc(), CUDNN_DIM_MAX, &dtype, &tformat, &nbDims, dimA); - out << " type = " << cudnnTypeToString(dtype) << "\n"; - out << " tensor_format = " << cudnnMemoryFormatToString(tformat) << "\n"; - out << " nbDims = " << nbDims << "\n"; + out << " type = " << cudnnTypeToString(dtype) << '\n'; + out << " tensor_format = " << cudnnMemoryFormatToString(tformat) << '\n'; + out << " nbDims = " << nbDims << '\n'; // Read out only nbDims of the arrays! 
out << " dimA = "; for (auto i : ArrayRef{dimA, static_cast(nbDims)}) { out << i << ", "; } - out << "\n"; + out << '\n'; return out; } diff --git a/aten/src/ATen/functorch/DynamicLayer.cpp b/aten/src/ATen/functorch/DynamicLayer.cpp index 69af08a7bd7ce..518098a8b4a80 100644 --- a/aten/src/ATen/functorch/DynamicLayer.cpp +++ b/aten/src/ATen/functorch/DynamicLayer.cpp @@ -346,15 +346,15 @@ void foreachTensorInplaceWithFlag(std::vector& args, int64_t begin, int6 } std::ostream& operator<< (std::ostream& os, const DynamicLayer& layer) { - os << layer.layerId() << ":" << layer.key(); + os << layer.layerId() << ':' << layer.key(); return os; } std::ostream& operator<< (std::ostream& os, const std::vector& dls) { os << "DynamicLayerStack[ "; for (const auto& layer : dls) { - os << layer << " "; + os << layer << ' '; } - os << "]"; + os << ']'; return os; } diff --git a/aten/src/ATen/functorch/TensorWrapper.cpp b/aten/src/ATen/functorch/TensorWrapper.cpp index 65de9268927f0..ba5dcfc923878 100644 --- a/aten/src/ATen/functorch/TensorWrapper.cpp +++ b/aten/src/ATen/functorch/TensorWrapper.cpp @@ -22,7 +22,7 @@ void dumpTensor(std::ostream& ss, const Tensor& tensor) { if (batched) { ss << "Batched[lvl=" << batched->level() << " dim=" << batched->bdim() << ", "; dumpTensor(ss, batched->value()); - ss << "]"; + ss << ']'; return; } ss << "Tensor" << tensor.sizes(); @@ -36,7 +36,7 @@ void dumpTensor(std::ostream& ss, const Tensor& tensor) { ss << "dead, "; } dumpTensor(ss, wrapped->value()); - ss << "]"; + ss << ']'; } void TensorWrapper::refreshMetadata() { diff --git a/aten/src/ATen/miopen/Descriptors.cpp b/aten/src/ATen/miopen/Descriptors.cpp index 86e42ee3b66dc..3fe27c7a0825b 100644 --- a/aten/src/ATen/miopen/Descriptors.cpp +++ b/aten/src/ATen/miopen/Descriptors.cpp @@ -73,32 +73,32 @@ std::string miopenTypeToString(miopenDataType_t dtype) { return "miopenBFloat16"; default: std::ostringstream oss; - oss << "(unknown data-type " << static_cast(dtype) << ")"; + oss << "(unknown data-type " << static_cast(dtype) << ')'; return oss.str(); } } std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) { - out << "TensorDescriptor " << static_cast(d.desc()) << "\n"; + out << "TensorDescriptor " << static_cast(d.desc()) << '\n'; int nbDims = 0; int dimA[MIOPEN_DIM_MAX]; int strideA[MIOPEN_DIM_MAX]; miopenDataType_t dtype; miopenGetTensorDescriptorSize(d.desc(), &nbDims); miopenGetTensorDescriptor(d.desc(), &dtype, dimA, strideA); - out << " type = " << miopenTypeToString(dtype) << "\n"; - out << " nbDims = " << nbDims << "\n"; + out << " type = " << miopenTypeToString(dtype) << '\n'; + out << " nbDims = " << nbDims << '\n'; // Read out only nbDims of the arrays! 
out << " dimA = "; for (auto i : ArrayRef{dimA, static_cast(nbDims)}) { out << i << ", "; } - out << "\n"; + out << '\n'; out << " strideA = "; for (auto i : ArrayRef{strideA, static_cast(nbDims)}) { out << i << ", "; } - out << "\n"; + out << '\n'; return out; } diff --git a/aten/src/ATen/mps/MPSProfiler.h b/aten/src/ATen/mps/MPSProfiler.h index c1cb9090fc4af..187e86d92e1bf 100644 --- a/aten/src/ATen/mps/MPSProfiler.h +++ b/aten/src/ATen/mps/MPSProfiler.h @@ -91,7 +91,7 @@ struct OperationInfo : BaseInfo { std::stringstream kernelStr; kernelStr << kernelName; for (const Tensor& tensor : tensors) { - kernelStr << ":" << BaseInfo::buildTensorString(tensor, includeBufferId); + kernelStr << ':' << BaseInfo::buildTensorString(tensor, includeBufferId); } return kernelStr.str(); } diff --git a/aten/src/ATen/mps/MPSProfiler.mm b/aten/src/ATen/mps/MPSProfiler.mm index a91574c56c52d..1d0408b8089c9 100644 --- a/aten/src/ATen/mps/MPSProfiler.mm +++ b/aten/src/ATen/mps/MPSProfiler.mm @@ -39,9 +39,9 @@ // see comments for INCLUDE_BUFFER_ID if (includeBufferId && deviceType == at::kMPS) { id buffer = __builtin_bit_cast(id, tensor.storage().data()); - tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer)) << ":" << buffer.retainCount << ")"; + tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer)) << ':' << buffer.retainCount << ')'; } - tensorStr << ":" << tensor.scalar_type() << tensor.sizes(); + tensorStr << ':' << tensor.scalar_type() << tensor.sizes(); return tensorStr.str(); } else { return "undefined"; diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 892144ac663a6..2a3388a052685 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -167,7 +167,7 @@ static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, co std::stringstream ss; ss << arg_name << " should be greater than zero but got ("; std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); - ss << args.back() << ")" << " (while checking arguments for " << c << ")"; + ss << args.back() << ")" << " (while checking arguments for " << c << ')'; TORCH_CHECK(false, ss.str()); } } diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index ca3a4f5f3faba..cb37f6f1030d3 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -639,7 +639,7 @@ static std::ostream& operator<<(std::ostream & out, const ConvParams& params) << " deterministic = " << params.deterministic << " cudnn_enabled = " << params.cudnn_enabled << " allow_tf32 = " << params.allow_tf32 - << "}"; + << '}'; return out; } diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index 79aaac48034ac..975e237c468d6 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -847,7 +847,7 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const std::optional( // stride_output_h + group_count); - // std::cout << "PTRS " << mat_a.data_ptr() << " " << mat_b.data_ptr() << " + // std::cout << "PTRS " << mat_a.data_ptr() << ' ' << mat_b.data_ptr() << " // " - // << out.data_ptr() << " " << scale_a.data_ptr() << " " + // << out.data_ptr() << ' ' << scale_a.data_ptr() << ' ' // << scale_b.data_ptr() << "\n"; // for (int i = 0; i < group_count; i++) { // std::cout << "A " << (void*)inputA_ptrs_h[i] << "\n"; diff --git a/aten/src/ATen/native/cuda/jit_utils.cpp b/aten/src/ATen/native/cuda/jit_utils.cpp index 
09c8e74d4b2cf..e65fa4ceb38e9 100644 --- a/aten/src/ATen/native/cuda/jit_utils.cpp +++ b/aten/src/ATen/native/cuda/jit_utils.cpp @@ -1057,14 +1057,14 @@ std::string generate_code( // TODO these arrays are potentially of the different types, use function // traits to determine the types declare_load_arrays << f_inputs_type << " arg" << std::to_string(i) - << "[" << std::to_string(thread_work_size) << "];\n"; + << '[' << std::to_string(thread_work_size) << "];\n"; } env.s("declare_load_arrays", declare_load_arrays.str()); std::stringstream declare_store_arrays; for (int i = 0; i < nOutputs; i++) { declare_store_arrays << result_type << " out" << std::to_string(i) - << "[" << std::to_string(thread_work_size) << "];\n"; + << '[' << std::to_string(thread_work_size) << "];\n"; } env.s("declare_store_arrays", declare_store_arrays.str()); @@ -1217,7 +1217,7 @@ std::string generate_code( for (const auto i : c10::irange(nInputs)){ auto i_string = std::to_string(i); vector_inputs << "auto * input" << i_string << - " = reinterpret_cast(data[" << i_string << "+" << nOutputs << "])" << + " = reinterpret_cast(data[" << i_string << '+' << nOutputs << "])" << " + block_work_size * idx;\n"; } env.s("vector_inputs", vector_inputs.str()); @@ -1543,17 +1543,17 @@ NvrtcFunction jit_pwise_function( // Constructs file path by appending constructed cubin name to cache path std::stringstream ss; - ss << *cache_dir << "/"; + ss << *cache_dir << '/'; ss << kernel_name; #ifdef USE_ROCM ss << "_arch" << prop->gcnArchName; #else - ss << "_arch" << cuda_major << "." << cuda_minor; + ss << "_arch" << cuda_major << '.' << cuda_minor; #endif - ss << "_nvrtc" << nvrtc_major << "." << nvrtc_minor; + ss << "_nvrtc" << nvrtc_major << '.' << nvrtc_minor; ss << (compile_to_sass ? "_sass" : "_ptx"); - ss << "_" << code.length(); - ss << "_" << hash_code; + ss << '_' << code.length(); + ss << '_' << hash_code; file_path = ss.str(); std::ifstream readin{file_path, std::ios::in | std::ifstream::binary}; diff --git a/aten/src/ATen/native/cudnn/ConvShared.cpp b/aten/src/ATen/native/cudnn/ConvShared.cpp index 325b082f314d9..1584d5e9acd38 100644 --- a/aten/src/ATen/native/cudnn/ConvShared.cpp +++ b/aten/src/ATen/native/cudnn/ConvShared.cpp @@ -82,15 +82,15 @@ namespace native { std::ostream& operator<<(std::ostream& out, const ConvolutionParams& params) { out << "ConvolutionParams \n" - << " memory_format = " << params.memory_format << "\n" - << " data_type = " << cudnnTypeToString(params.dataType) << "\n" - << " padding = " << ArrayRef{params.padding} << "\n" - << " stride = " << ArrayRef{params.stride} << "\n" - << " dilation = " << ArrayRef{params.dilation} << "\n" - << " groups = " << params.groups << "\n" + << " memory_format = " << params.memory_format << '\n' + << " data_type = " << cudnnTypeToString(params.dataType) << '\n' + << " padding = " << ArrayRef{params.padding} << '\n' + << " stride = " << ArrayRef{params.stride} << '\n' + << " dilation = " << ArrayRef{params.dilation} << '\n' + << " groups = " << params.groups << '\n' << " deterministic = " << (params.deterministic ? "true" : "false") - << "\n" - << " allow_tf32 = " << (params.allow_tf32 ? "true" : "false") << "\n"; + << '\n' + << " allow_tf32 = " << (params.allow_tf32 ? 
"true" : "false") << '\n'; return out; } @@ -173,16 +173,16 @@ std::string repro_from_args(const ConvolutionParams& params) { at::globalContext().float32Precision( at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) - << "\n"; + << '\n'; ss << "torch.backends.cudnn.benchmark = " - << pybool(at::globalContext().benchmarkCuDNN()) << "\n"; + << pybool(at::globalContext().benchmarkCuDNN()) << '\n'; ss << "torch.backends.cudnn.deterministic = " << pybool(params.deterministic) - << "\n"; + << '\n'; ss << "torch.backends.cudnn.allow_tf32 = " << pybool(params.allow_tf32) - << "\n"; + << '\n'; ss << "data = torch.randn(" << ArrayRef(params.input_size, dim) << ", dtype=" << full_dtype << ", "; - ss << "device='cuda', requires_grad=True)" << to_channels_last << "\n"; + ss << "device='cuda', requires_grad=True)" << to_channels_last << '\n'; ss << "net = torch.nn.Conv" << dim - 2 << "d(" << in_channels << ", " << out_channels << ", "; ss << "kernel_size=" << ArrayRef(¶ms.weight_size[2], dim - 2) @@ -192,7 +192,7 @@ std::string repro_from_args(const ConvolutionParams& params) { ss << "dilation=" << ArrayRef(params.dilation, dim - 2) << ", "; ss << "groups=" << params.groups << ")\n"; ss << "net = net.cuda()." << partial_dtype << "()" << to_channels_last - << "\n"; + << '\n'; ss << "out = net(data)\n"; ss << "out.backward(torch.randn_like(out))\n"; ss << "torch.cuda.synchronize()\n\n"; diff --git a/aten/src/ATen/native/cudnn/Conv_v7.cpp b/aten/src/ATen/native/cudnn/Conv_v7.cpp index bc064e3ad3167..d5102910c6471 100644 --- a/aten/src/ATen/native/cudnn/Conv_v7.cpp +++ b/aten/src/ATen/native/cudnn/Conv_v7.cpp @@ -93,11 +93,10 @@ std::ostream& operator<<(std::ostream& out, const ConvolutionArgs& args) { << "input: " << args.idesc // already has a trailing newline << "output: " << args.odesc // already has a trailing newline << "weight: " << args.wdesc // already has a trailing newline - << "Pointer addresses: " - << "\n" - << " input: " << args.input.const_data_ptr() << "\n" - << " output: " << args.output.const_data_ptr() << "\n" - << " weight: " << args.weight.const_data_ptr() << "\n"; + << "Pointer addresses: " << '\n' + << " input: " << args.input.const_data_ptr() << '\n' + << " output: " << args.output.const_data_ptr() << '\n' + << " weight: " << args.weight.const_data_ptr() << '\n'; return out; } diff --git a/aten/src/ATen/native/metal/MetalTensorImplStorage.mm b/aten/src/ATen/native/metal/MetalTensorImplStorage.mm index f614429eefddf..20a942a9e2573 100644 --- a/aten/src/ATen/native/metal/MetalTensorImplStorage.mm +++ b/aten/src/ATen/native/metal/MetalTensorImplStorage.mm @@ -115,7 +115,7 @@ void copy_data_to_host(float* host) { std::copy( strides.begin(), strides.end() - 1, std::ostream_iterator(oss, ",")); oss << sizes.back(); - output << oss.str() << "}"; + output << oss.str() << '}'; return output; } diff --git a/aten/src/ATen/native/mkldnn/xpu/Conv.cpp b/aten/src/ATen/native/mkldnn/xpu/Conv.cpp index 1555eed558e29..6827e02cc3f42 100644 --- a/aten/src/ATen/native/mkldnn/xpu/Conv.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/Conv.cpp @@ -53,7 +53,7 @@ std::ostream& operator<<(std::ostream& out, const ConvParams& params) { << " transposed = " << params.transposed << " output_padding = " << IntArrayRef{params.output_padding} << " groups = " << params.groups << " benchmark = " << params.benchmark - << " deterministic = " << params.deterministic << "}"; + << " deterministic = " << params.deterministic << '}'; return out; } diff --git 
a/aten/src/ATen/native/quantized/cpu/qnnpack/test/avgpool-microkernel-tester.h b/aten/src/ATen/native/quantized/cpu/qnnpack/test/avgpool-microkernel-tester.h index 1a425146ad6c2..ac6370f8df29f 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/test/avgpool-microkernel-tester.h +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/test/avgpool-microkernel-tester.h @@ -301,12 +301,12 @@ class AvgPoolMicrokernelTester { ASSERT_NEAR( float(int32_t(y[i * yStride() + k])), yFP[i * kc() + k], 0.5001f) << "at pixel " << i << ", channel " << k << ", n = " << n() - << ", ks = " << kh() << "x" << kw() << " (" << ks() + << ", ks = " << kh() << 'x' << kw() << " (" << ks() << "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k]; ASSERT_EQ( uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k])) << "at pixel " << i << ", channel " << k << ", n = " << n() - << ", ks = " << kh() << "x" << kw() << " (" << ks() + << ", ks = " << kh() << 'x' << kw() << " (" << ks() << "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k]; } } @@ -396,12 +396,12 @@ class AvgPoolMicrokernelTester { ASSERT_NEAR( float(int32_t(y[i * yStride() + k])), yFP[i * kc() + k], 0.5001f) << "at pixel " << i << ", channel " << k << ", n = " << n() - << ", ks = " << kh() << "x" << kw() << " (" << ks() + << ", ks = " << kh() << 'x' << kw() << " (" << ks() << "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k]; ASSERT_EQ( uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k])) << "at pixel " << i << ", channel " << k << ", n = " << n() - << ", ks = " << kh() << "x" << kw() << " (" << ks() + << ", ks = " << kh() << 'x' << kw() << " (" << ks() << "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k]; } } diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/test/maxpool-microkernel-tester.h b/aten/src/ATen/native/quantized/cpu/qnnpack/test/maxpool-microkernel-tester.h index e1583a2c058ef..fc94f9666d9d0 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/test/maxpool-microkernel-tester.h +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/test/maxpool-microkernel-tester.h @@ -232,7 +232,7 @@ class MaxPoolMicrokernelTester { ASSERT_EQ( uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k])) << "at pixel " << i << ", channel " << k << ", n = " << n() - << ", ks = " << kh() << "x" << kw() << " (" << ks() + << ", ks = " << kh() << 'x' << kw() << " (" << ks() << "), kc = " << kc(); } } diff --git a/aten/src/ATen/native/utils/ParamUtils.h b/aten/src/ATen/native/utils/ParamUtils.h index c9088c03d81c1..8887664df1ce3 100644 --- a/aten/src/ATen/native/utils/ParamUtils.h +++ b/aten/src/ATen/native/utils/ParamUtils.h @@ -17,7 +17,7 @@ inline std::vector _expand_param_if_needed( std::ostringstream ss; ss << "expected " << param_name << " to be a single integer value or a " << "list of " << expected_dim << " values to match the convolution " - << "dimensions, but got " << param_name << "=" << list_param; + << "dimensions, but got " << param_name << '=' << list_param; TORCH_CHECK(false, ss.str()); } else { return list_param.vec(); diff --git a/aten/src/ATen/native/vulkan/api/Adapter.cpp b/aten/src/ATen/native/vulkan/api/Adapter.cpp index 173479a0c2de0..350df39ea3684 100644 --- a/aten/src/ATen/native/vulkan/api/Adapter.cpp +++ b/aten/src/ATen/native/vulkan/api/Adapter.cpp @@ -358,9 +358,9 @@ std::string Adapter::stringize() const { std::string device_type = get_device_type_str(properties.deviceType); VkPhysicalDeviceLimits limits = properties.limits; - ss << "{" << std::endl; + ss << '{' << std::endl; ss << " Physical Device Info {" 
<< std::endl; - ss << " apiVersion: " << v_major << "." << v_minor << std::endl; + ss << " apiVersion: " << v_major << '.' << v_minor << std::endl; ss << " driverversion: " << properties.driverVersion << std::endl; ss << " deviceType: " << device_type << std::endl; ss << " deviceName: " << properties.deviceName << std::endl; @@ -371,7 +371,7 @@ std::string Adapter::stringize() const { #define PRINT_LIMIT_PROP_VEC3(name) \ ss << " " << std::left << std::setw(36) << #name << limits.name[0] \ - << "," << limits.name[1] << "," << limits.name[2] << std::endl; + << ',' << limits.name[1] << ',' << limits.name[2] << std::endl; ss << " Physical Device Limits {" << std::endl; PRINT_LIMIT_PROP(maxImageDimension1D); @@ -425,7 +425,7 @@ std::string Adapter::stringize() const { ; } ss << " ]" << std::endl; - ss << "}"; + ss << '}'; return ss.str(); } diff --git a/aten/src/ATen/native/vulkan/api/Exception.cpp b/aten/src/ATen/native/vulkan/api/Exception.cpp index 9b8b653e0619e..436b38cbba6c6 100644 --- a/aten/src/ATen/native/vulkan/api/Exception.cpp +++ b/aten/src/ATen/native/vulkan/api/Exception.cpp @@ -33,7 +33,7 @@ std::ostream& operator<<(std::ostream& out, const VkResult result) { VK_RESULT_CASE(VK_ERROR_FORMAT_NOT_SUPPORTED) VK_RESULT_CASE(VK_ERROR_FRAGMENTED_POOL) default: - out << "VK_ERROR_UNKNOWN (VkResult " << result << ")"; + out << "VK_ERROR_UNKNOWN (VkResult " << result << ')'; break; } return out; @@ -46,7 +46,7 @@ std::ostream& operator<<(std::ostream& out, const VkResult result) { // std::ostream& operator<<(std::ostream& out, const SourceLocation& loc) { - out << loc.function << " at " << loc.file << ":" << loc.line; + out << loc.function << " at " << loc.file << ':' << loc.line; return out; } @@ -66,7 +66,7 @@ Error::Error(SourceLocation source_location, const char* cond, std::string msg) : msg_(std::move(msg)), source_location_{source_location} { std::ostringstream oss; oss << "Exception raised from " << source_location_ << ": "; - oss << "(" << cond << ") is false! "; + oss << '(' << cond << ") is false! 
"; oss << msg_; what_ = oss.str(); } diff --git a/aten/src/ATen/native/vulkan/api/QueryPool.cpp b/aten/src/ATen/native/vulkan/api/QueryPool.cpp index bfa92357daeed..63c163aa44aa9 100644 --- a/aten/src/ATen/native/vulkan/api/QueryPool.cpp +++ b/aten/src/ATen/native/vulkan/api/QueryPool.cpp @@ -173,8 +173,8 @@ void QueryPool::extract_results() { static std::string stringize(const VkExtent3D& extents) { std::stringstream ss; - ss << "{" << extents.width << ", " << extents.height << ", " << extents.depth - << "}"; + ss << '{' << extents.width << ", " << extents.height << ", " << extents.depth + << '}'; return ss.str(); } diff --git a/aten/src/ATen/native/vulkan/api/Runtime.cpp b/aten/src/ATen/native/vulkan/api/Runtime.cpp index cf8402e40a0b8..a7485b706c54e 100644 --- a/aten/src/ATen/native/vulkan/api/Runtime.cpp +++ b/aten/src/ATen/native/vulkan/api/Runtime.cpp @@ -149,7 +149,7 @@ VKAPI_ATTR VkBool32 VKAPI_CALL debug_report_callback_fn( (void)flags; std::stringstream stream; - stream << layer_prefix << " " << message_code << " " << message << std::endl; + stream << layer_prefix << ' ' << message_code << ' ' << message << std::endl; const std::string log = stream.str(); std::cout << log; diff --git a/aten/src/ATen/native/vulkan/api/Utils.h b/aten/src/ATen/native/vulkan/api/Utils.h index 3172c9c461079..8cd6a74c1c467 100644 --- a/aten/src/ATen/native/vulkan/api/Utils.h +++ b/aten/src/ATen/native/vulkan/api/Utils.h @@ -253,7 +253,7 @@ using vec4 = vec<4u>; // uvec3 is the type representing tensor extents. Useful for debugging. inline std::ostream& operator<<(std::ostream& os, const uvec3& v) { - os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ")"; + os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ')'; return os; } diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index 0937de4552821..33fe4121a040e 100644 --- a/aten/src/ATen/test/basic.cpp +++ b/aten/src/ATen/test/basic.cpp @@ -246,7 +246,7 @@ void TestToCFloat() { void TestToString() { Tensor b = ones({3, 7}) * .0000001f; std::stringstream s; - s << b << "\n"; + s << b << '\n'; std::string expect = "1e-07 *"; ASSERT_EQ_RESOLVED(s.str().substr(0, expect.size()), expect); } diff --git a/aten/src/ATen/test/scalar_test.cpp b/aten/src/ATen/test/scalar_test.cpp index 0d7b62b44d214..a22fb0d16adf8 100644 --- a/aten/src/ATen/test/scalar_test.cpp +++ b/aten/src/ATen/test/scalar_test.cpp @@ -33,7 +33,7 @@ struct Foo { static void apply(Tensor a, Tensor b) { scalar_type s = 1; std::stringstream ss; - ss << "hello, dispatch: " << a.toString() << s << "\n"; + ss << "hello, dispatch: " << a.toString() << s << '\n'; auto data = (scalar_type*)a.data_ptr(); (void)data; } @@ -73,8 +73,8 @@ TEST(TestScalar, TestScalar) { Scalar bar = 3.0; Half h = bar.toHalf(); Scalar h2 = h; - cout << "H2: " << h2.toDouble() << " " << what.toFloat() << " " - << bar.toDouble() << " " << what.isIntegral(false) << "\n"; + cout << "H2: " << h2.toDouble() << ' ' << what.toFloat() << ' ' + << bar.toDouble() << ' ' << what.isIntegral(false) << '\n'; auto gen = at::detail::getDefaultCPUGenerator(); { // See Note [Acquire lock when using random generators] @@ -84,7 +84,7 @@ TEST(TestScalar, TestScalar) { } if (at::hasCUDA()) { auto t2 = zeros({4, 4}, at::kCUDA); - cout << &t2 << "\n"; + cout << &t2 << '\n'; } auto t = ones({4, 4}); @@ -129,7 +129,7 @@ TEST(TestScalar, TestScalar) { std::stringstream ss; // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) ASSERT_NO_THROW( - ss << "hello, dispatch" << 
x.toString() << s << "\n"); + ss << "hello, dispatch" << x.toString() << s << '\n'); auto data = (scalar_t*)x.data_ptr(); (void)data; }); diff --git a/aten/src/ATen/test/test_install/main.cpp b/aten/src/ATen/test/test_install/main.cpp index e9a03d2303a39..3a57e0c6212bf 100644 --- a/aten/src/ATen/test/test_install/main.cpp +++ b/aten/src/ATen/test/test_install/main.cpp @@ -1,5 +1,5 @@ #include int main() { - std::cout << at::ones({3,4}, at::CPU(at::kFloat)) << "\n"; + std::cout << at::ones({3,4}, at::CPU(at::kFloat)) << '\n'; } diff --git a/aten/src/ATen/test/vec_test_all_types.cpp b/aten/src/ATen/test/vec_test_all_types.cpp index da0da76109569..c0c05c1484175 100644 --- a/aten/src/ATen/test/vec_test_all_types.cpp +++ b/aten/src/ATen/test/vec_test_all_types.cpp @@ -1828,9 +1828,9 @@ namespace { #endif EXPECT_EQ(u16, c10::detail::fp16_ieee_from_fp32_value(f32s[i])) - << "Test failed for float to uint16 " << f32s[i] << "\n"; + << "Test failed for float to uint16 " << f32s[i] << '\n'; EXPECT_EQ(x, c10::detail::fp16_ieee_to_fp32_value(u16)) - << "Test failed for uint16 to float " << u16 << "\n"; + << "Test failed for uint16 to float " << u16 << '\n'; } } TEST(FP8E4M3Test, FP8E4M3ConversionFloat) { @@ -1848,10 +1848,10 @@ namespace { EXPECT_TRUE(std::isnan(f32)); } else { EXPECT_EQ(f32, c10::detail::fp8e4m3fn_to_fp32_value(input)) - << "Test failed for u8 to float " << input << "\n"; + << "Test failed for u8 to float " << input << '\n'; } EXPECT_EQ(u8, c10::detail::fp8e4m3fn_from_fp32_value(f32)) - << "Test failed for float to u8 " << f32 << "\n"; + << "Test failed for float to u8 " << f32 << '\n'; } } TEST(FP8E4M3Test, FP8E4M3BinaryAdd) { @@ -2015,10 +2015,10 @@ namespace { EXPECT_TRUE(std::isnan(f32)); } else { EXPECT_EQ(f32, c10::detail::fp8e5m2_to_fp32_value(input)) - << "Test failed for u8 to float " << input << "\n"; + << "Test failed for u8 to float " << input << '\n'; } EXPECT_EQ(u8, c10::detail::fp8e5m2_from_fp32_value(f32)) - << "Test failed for float to u8 " << f32 << "\n"; + << "Test failed for float to u8 " << f32 << '\n'; } } TEST(FP8E5M2Test, FP8E5M2BinaryAdd) { diff --git a/aten/src/ATen/test/vitals.cpp b/aten/src/ATen/test/vitals.cpp index cc93775bb5383..eaf1cc152bc37 100644 --- a/aten/src/ATen/test/vitals.cpp +++ b/aten/src/ATen/test/vitals.cpp @@ -19,7 +19,7 @@ TEST(Vitals, Basic) { c10::utils::set_env("TORCH_VITAL", "1"); TORCH_VITAL_DEFINE(Testing); TORCH_VITAL(Testing, Attribute0) << 1; - TORCH_VITAL(Testing, Attribute1) << "1"; + TORCH_VITAL(Testing, Attribute1) << '1'; TORCH_VITAL(Testing, Attribute2) << 1.0f; TORCH_VITAL(Testing, Attribute3) << 1.0; auto t = at::ones({1, 1}); diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp index 396ea59d2f008..29f01fbd78c51 100644 --- a/aten/src/ATen/test/vulkan_api_test.cpp +++ b/aten/src/ATen/test/vulkan_api_test.cpp @@ -129,14 +129,14 @@ void showRtol(const at::Tensor& a, const at::Tensor& b) { std::cout << "Max Diff allowed: " << maxDiff << std::endl; if (diff.sizes().size() == 2) { for (const auto y : c10::irange(diff.sizes()[0])) { - std::cout << y << ":"; + std::cout << y << ':'; for (const auto x : c10::irange(diff.sizes()[1])) { float diff_xy = diff[y][x].item(); if (diff_xy > maxDiff) { std::cout << std::setw(5) << x; } else { - std::cout << std::setw(5) << " "; + std::cout << std::setw(5) << ' '; } } std::cout << std::endl; @@ -3276,7 +3276,7 @@ TEST_F(VulkanAPITest, masked_fill_invalidinputs_exceptions) { void print_shape(const std::vector& shape) { for (const auto& num : shape) { - 
std::cout << num << " "; + std::cout << num << ' '; } } @@ -3367,7 +3367,7 @@ void test_masked_fill_scalar( print_shape(tmp_curr_input_shape); std::cout << "], and mask of shape ["; print_shape(tmp_curr_mask_shape); - std::cout << "]" << std::endl; + std::cout << ']' << std::endl; } ASSERT_TRUE(check); @@ -4542,9 +4542,9 @@ void test_softmax(const at::IntArrayRef shape, bool log_softmax = false) { if (!check) { std::cout << "Softmax test failed on axis " << dim << "for tensor dims {"; for (uint32_t place = 0; place < shape.size() - 1; place++) { - std::cout << shape[place] << " "; + std::cout << shape[place] << ' '; } - std::cout << shape.back() << "}" << std::endl; + std::cout << shape.back() << '}' << std::endl; showRtol(out_cpu, out_vulkan.cpu()); } ASSERT_TRUE(check); diff --git a/aten/src/ATen/test/vulkan_quantized_api_test.cpp b/aten/src/ATen/test/vulkan_quantized_api_test.cpp index 2829aed94def9..2eff421a64ced 100644 --- a/aten/src/ATen/test/vulkan_quantized_api_test.cpp +++ b/aten/src/ATen/test/vulkan_quantized_api_test.cpp @@ -95,7 +95,7 @@ void showRtol( std::cout << "Max Diff found is: " << diff.max().item() << std::endl; if (diff.sizes().size() == 2) { for (const auto y : c10::irange(diff.sizes()[0])) { - std::cout << y << ":"; + std::cout << y << ':'; for (const auto x : c10::irange(diff.sizes()[1])) { double diff_xy = diff[y][x].item(); if (diff_xy > maxDiff) { @@ -109,7 +109,7 @@ void showRtol( } } } else { - std::cout << std::setw(5) << " "; + std::cout << std::setw(5) << ' '; } } std::cout << std::endl; @@ -148,19 +148,19 @@ using at::native::vulkan::api::utils::ivec4; using at::native::vulkan::api::utils::vec4; std::ostream& operator<<(std::ostream& os, const vec4& v) { - os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", " - << v.data[3u] << ")"; + os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", " + << v.data[3u] << ')'; return os; } std::ostream& operator<<(std::ostream& os, const ivec3& v) { - os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ")"; + os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ')'; return os; } std::ostream& operator<<(std::ostream& os, const ivec4& v) { - os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", " - << v.data[3u] << ")"; + os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", " + << v.data[3u] << ')'; return os; } @@ -3379,51 +3379,51 @@ bool _test_quantized_linear( showRtol(out_cpu_dequant, out_vk_to_cpu_dequant); } if (xpos != -1 && ypos != -1) { - std::cout << "\nFailure caused on row/col: " << ypos << "/" << xpos - << "\n"; + std::cout << "\nFailure caused on row/col: " << ypos << '/' << xpos + << '\n'; std::cout << "Input tensor scale: " << scale << " zerop: " << zero_point - << "\n"; - std::cout << "Input tensor row " << ypos << "\n"; + << '\n'; + std::cout << "Input tensor row " << ypos << '\n'; for (int i = 0; i < input_cpu.sizes()[1]; i++) { std::cout << input_cpu[ypos][i].item() << ", "; } - std::cout << "\n"; + std::cout << '\n'; std::cout << "Weight tensor scale: " << w_scale - << " zerop: " << w_zero_point << "\n"; - std::cout << "Weight tensor col " << xpos << "\n"; + << " zerop: " << w_zero_point << '\n'; + std::cout << "Weight tensor col " << xpos << '\n'; for (int i = 0; i < weight.sizes()[1]; i++) { std::cout << weight[xpos][i].item() << ", "; } - std::cout << "\n"; + std::cout << '\n'; std::cout << "Input tensor quantized row " << ypos << " with dtype " - << (input_quant_dtype_int8 ? 
"QInt8" : "QUInt8") << "\n"; + << (input_quant_dtype_int8 ? "QInt8" : "QUInt8") << '\n'; for (int i = 0; i < input_cpu.sizes()[1]; i++) { std::cout << input_cpu_quantized[ypos][i].item() << ", "; } - std::cout << "\n"; + std::cout << '\n'; std::cout << "Weight tensor quantized col " << xpos << " with dtype " - << (weight_quant_dtype_int8 ? "QInt8" : "QUInt8") << "\n"; + << (weight_quant_dtype_int8 ? "QInt8" : "QUInt8") << '\n'; for (int i = 0; i < weight.sizes()[1]; i++) { std::cout << weight_cpu_quantized[xpos][i].item() << ", "; } - std::cout << "\n"; + std::cout << '\n'; std::cout << "bias tensor\n"; for (int i = 0; i < bias.sizes()[0]; i++) { std::cout << bias[i].item() << ", "; } - std::cout << "\n"; + std::cout << '\n'; std::cout << "out_scale: " << out_scale - << " out_zero_point: " << out_zero_point << "\n"; + << " out_zero_point: " << out_zero_point << '\n'; std::cout << "cpu unmatched output: " - << out_cpu_dequant[ypos][xpos].item() << "\n"; + << out_cpu_dequant[ypos][xpos].item() << '\n'; std::cout << "vk unmatched output: " - << out_vk_to_cpu_dequant[ypos][xpos].item() << "\n"; + << out_vk_to_cpu_dequant[ypos][xpos].item() << '\n'; } } return check; diff --git a/c10/core/DispatchKeySet.cpp b/c10/core/DispatchKeySet.cpp index 107530e9e28a2..d1ec51b6a47d6 100644 --- a/c10/core/DispatchKeySet.cpp +++ b/c10/core/DispatchKeySet.cpp @@ -176,7 +176,7 @@ std::ostream& operator<<(std::ostream& os, DispatchKeySet ts) { os << k; first = false; } - os << ")"; + os << ')'; return os; } diff --git a/c10/core/TensorOptions.cpp b/c10/core/TensorOptions.cpp index d3282ae7114e5..b1a90cce30edc 100644 --- a/c10/core/TensorOptions.cpp +++ b/c10/core/TensorOptions.cpp @@ -33,7 +33,7 @@ std::ostream& operator<<(std::ostream& stream, const TensorOptions& options) { } else { stream << "(nullopt)"; } - stream << ")"; + stream << ')'; return stream; } diff --git a/c10/cuda/CUDADeviceAssertionHost.cpp b/c10/cuda/CUDADeviceAssertionHost.cpp index 9b7c3568a9833..08e657a411614 100644 --- a/c10/cuda/CUDADeviceAssertionHost.cpp +++ b/c10/cuda/CUDADeviceAssertionHost.cpp @@ -136,7 +136,7 @@ std::string c10_retrieve_device_side_assertion_info() { // Something failed, let's talk about that oss << failures_found << " CUDA device-side assertion failures were found on GPU #" - << device_num << "!" << std::endl; + << device_num << '!' 
<< std::endl; if (assertion_data_for_device.assertion_count > C10_CUDA_DSA_ASSERTION_COUNT) { oss << "But at least " << assertion_data_for_device.assertion_count @@ -151,17 +151,17 @@ std::string c10_retrieve_device_side_assertion_info() { oss << "Assertion failure " << i << std::endl; oss << " GPU assertion failure message = " << self.assertion_msg << std::endl; - oss << " File containing assertion = " << self.filename << ":" + oss << " File containing assertion = " << self.filename << ':' << self.line_number << std::endl; oss << " Device function containing assertion = " << self.function_name << std::endl; - oss << " Thread ID that failed assertion = [" << self.thread_id[0] << "," - << self.thread_id[1] << "," << self.thread_id[2] << "]" << std::endl; - oss << " Block ID that failed assertion = [" << self.block_id[0] << "," - << self.block_id[1] << "," << self.block_id[2] << "]" << std::endl; + oss << " Thread ID that failed assertion = [" << self.thread_id[0] << ',' + << self.thread_id[1] << ',' << self.thread_id[2] << ']' << std::endl; + oss << " Block ID that failed assertion = [" << self.block_id[0] << ',' + << self.block_id[1] << ',' << self.block_id[2] << ']' << std::endl; if (launch_info.generation_number == self.caller) { oss << " File containing kernel launch = " - << launch_info.launch_filename << ":" << launch_info.launch_linenum + << launch_info.launch_filename << ':' << launch_info.launch_linenum << std::endl; oss << " Function containing kernel launch = " << launch_info.launch_function << std::endl; @@ -175,7 +175,7 @@ std::string c10_retrieve_device_side_assertion_info() { if (launch_registry.gather_launch_stacktrace) { oss << "Launch stacktracing disabled." << std::endl; } else { - oss << "\n" << launch_info.launch_stacktrace << std::endl; + oss << '\n' << launch_info.launch_stacktrace << std::endl; } } else { oss << " CPU launch site info: Unavailable, the circular queue wrapped around. Increase `CUDAKernelLaunchRegistry::max_size`." 
diff --git a/c10/test/core/DispatchKeySet_test.cpp b/c10/test/core/DispatchKeySet_test.cpp index a93461a041c39..cdbdc150167e0 100644 --- a/c10/test/core/DispatchKeySet_test.cpp +++ b/c10/test/core/DispatchKeySet_test.cpp @@ -435,7 +435,7 @@ TEST(DispatchKeySet, TestFunctionalityDispatchKeyToString) { if (i > 0) { ASSERT_TRUE(res.find("Unknown") == std::string::npos) << i << " (before is " << toString(static_cast(i - 1)) - << ")"; + << ')'; } else { ASSERT_TRUE(res.find("Unknown") == std::string::npos) << i; } diff --git a/c10/test/util/Half_test.cpp b/c10/test/util/Half_test.cpp index a76814615101b..33c77ead61fc8 100644 --- a/c10/test/util/Half_test.cpp +++ b/c10/test/util/Half_test.cpp @@ -96,10 +96,10 @@ TEST(HalfConversionTest, TestPorableConversion) { for (auto x : inputs) { auto target = c10::detail::fp16_ieee_to_fp32_value(x); EXPECT_EQ(halfbits2float(x), target) - << "Test failed for uint16 to float " << x << "\n"; + << "Test failed for uint16 to float " << x << '\n'; EXPECT_EQ( float2halfbits(target), c10::detail::fp16_ieee_from_fp32_value(target)) - << "Test failed for float to uint16" << target << "\n"; + << "Test failed for float to uint16" << target << '\n'; } } diff --git a/c10/test/util/logging_test.cpp b/c10/test/util/logging_test.cpp index b8fc81ddc6bbe..4587130564dfc 100644 --- a/c10/test/util/logging_test.cpp +++ b/c10/test/util/logging_test.cpp @@ -98,7 +98,7 @@ struct Noncopyable { }; std::ostream& operator<<(std::ostream& out, const Noncopyable& nc) { - out << "Noncopyable(" << nc.x << ")"; + out << "Noncopyable(" << nc.x << ')'; return out; } } // namespace diff --git a/c10/util/ArrayRef.h b/c10/util/ArrayRef.h index bbbb1d7288fdd..55900b6ee43c6 100644 --- a/c10/util/ArrayRef.h +++ b/c10/util/ArrayRef.h @@ -204,13 +204,13 @@ ArrayRef(const std::initializer_list&) -> ArrayRef; template std::ostream& operator<<(std::ostream& out, ArrayRef list) { int i = 0; - out << "["; + out << '['; for (const auto& e : list) { if (i++ > 0) out << ", "; out << e; } - out << "]"; + out << ']'; return out; } diff --git a/c10/util/Backtrace.cpp b/c10/util/Backtrace.cpp index 8838cafb029e4..29dbfe427ae01 100644 --- a/c10/util/Backtrace.cpp +++ b/c10/util/Backtrace.cpp @@ -106,8 +106,8 @@ class GetBacktraceImpl { /*length*/ &length, /*status*/ &status); - os << " frame #" << idx++ << "\t" - << ((demangled != NULL && status == 0) ? demangled : symbol) << "[" + os << " frame #" << idx++ << '\t' + << ((demangled != NULL && status == 0) ? demangled : symbol) << '[' << addr << "]\t" << std::endl; } free(demangled); @@ -274,7 +274,7 @@ class GetBacktraceImpl { } else { // In the edge-case where we couldn't parse the frame string, we can // just use it directly (it may have a different format). - stream << symbols[frame_number] << "\n"; + stream << symbols[frame_number] << '\n'; } } @@ -413,8 +413,8 @@ class GetBacktraceImpl { << back_trace_[i_frame] << std::dec; if (with_symbol) { stream << std::setfill('0') << std::setw(16) << std::uppercase - << std::hex << p_symbol->Address << std::dec << " " << module - << "!" << p_symbol->Name; + << std::hex << p_symbol->Address << std::dec << ' ' << module + << '!' 
<< p_symbol->Name; } else { stream << " " << module << "!"; } @@ -424,7 +424,7 @@ class GetBacktraceImpl { } else { stream << " @ "; } - stream << "]" << std::endl; + stream << ']' << std::endl; } return stream.str(); diff --git a/c10/util/Exception.cpp b/c10/util/Exception.cpp index cccdb28607141..c8470893d9f57 100644 --- a/c10/util/Exception.cpp +++ b/c10/util/Exception.cpp @@ -45,7 +45,7 @@ std::string Error::compute_what(bool include_backtrace) const { if (context_.size() == 1) { // Fold error and context in one line - oss << " (" << context_[0] << ")"; + oss << " (" << context_[0] << ')'; } else { for (const auto& c : context_) { oss << "\n " << c; @@ -53,7 +53,7 @@ std::string Error::compute_what(bool include_backtrace) const { } if (include_backtrace && backtrace_) { - oss << "\n" << backtrace_->get(); + oss << '\n' << backtrace_->get(); } return oss.str(); @@ -248,7 +248,7 @@ void WarningHandler::process(const Warning& warning) { LOG_AT_FILE_LINE( WARNING, warning.source_location().file, warning.source_location().line) << "Warning: " << warning.msg() << " (function " - << warning.source_location().function << ")"; + << warning.source_location().function << ')'; } std::string GetExceptionString(const std::exception& e) { diff --git a/c10/util/Logging.cpp b/c10/util/Logging.cpp index b95eaec9d3ebb..0ae1e78637588 100644 --- a/c10/util/Logging.cpp +++ b/c10/util/Logging.cpp @@ -474,12 +474,12 @@ MessageLogger::MessageLogger( if (GLOBAL_RANK != -1) { stream_ << "[rank" << GLOBAL_RANK << "]:"; } - stream_ << "[" << CAFFE2_SEVERITY_PREFIX[std::min(4, GLOG_FATAL - severity_)] + stream_ << '[' << CAFFE2_SEVERITY_PREFIX[std::min(4, GLOG_FATAL - severity_)] << (timeinfo->tm_mon + 1) * 100 + timeinfo->tm_mday - << std::setfill('0') << " " << std::setw(2) << timeinfo->tm_hour - << ":" << std::setw(2) << timeinfo->tm_min << ":" << std::setw(2) - << timeinfo->tm_sec << "." << std::setw(9) << ns << " " - << c10::filesystem::path(file).filename() << ":" << line << "] "; + << std::setfill('0') << ' ' << std::setw(2) << timeinfo->tm_hour + << ':' << std::setw(2) << timeinfo->tm_min << ':' << std::setw(2) + << timeinfo->tm_sec << '.' << std::setw(9) << ns << ' ' + << c10::filesystem::path(file).filename() << ':' << line << "] "; } // Output the contents of the stream to the proper channel on destruction. @@ -488,7 +488,7 @@ MessageLogger::~MessageLogger() noexcept(false) { // Nothing needs to be logged. 
return; } - stream_ << "\n"; + stream_ << '\n'; #ifdef ANDROID static const int android_log_levels[] = { ANDROID_LOG_FATAL, // LOG_FATAL diff --git a/c10/util/SmallVector.h b/c10/util/SmallVector.h index d02c9380a563d..d47f37cdf7eca 100644 --- a/c10/util/SmallVector.h +++ b/c10/util/SmallVector.h @@ -1412,13 +1412,13 @@ inline size_t capacity_in_bytes(const SmallVector& X) { template std::ostream& operator<<(std::ostream& out, const SmallVector& list) { int i = 0; - out << "["; + out << '['; for (auto e : list) { if (i++ > 0) out << ", "; out << e; } - out << "]"; + out << ']'; return out; } diff --git a/c10/util/StringUtil.cpp b/c10/util/StringUtil.cpp index 063a8fc93ea7a..6fae2f004cc93 100644 --- a/c10/util/StringUtil.cpp +++ b/c10/util/StringUtil.cpp @@ -79,7 +79,7 @@ std::ostream& _str(std::ostream& ss, const std::wstring& wString) { } // namespace detail std::ostream& operator<<(std::ostream& out, const SourceLocation& loc) { - out << loc.function << " at " << loc.file << ":" << loc.line; + out << loc.function << " at " << loc.file << ':' << loc.line; return out; } diff --git a/c10/util/StringUtil.h b/c10/util/StringUtil.h index cbc6f4ec336bb..de241bc9f7c45 100644 --- a/c10/util/StringUtil.h +++ b/c10/util/StringUtil.h @@ -170,7 +170,7 @@ inline bool isPrint(char s) { } inline void printQuotedString(std::ostream& stmt, const std::string_view str) { - stmt << "\""; + stmt << '"'; for (auto s : str) { switch (s) { case '\\': @@ -224,7 +224,7 @@ inline void printQuotedString(std::ostream& stmt, const std::string_view str) { break; } } - stmt << "\""; + stmt << '"'; } template diff --git a/c10/util/signal_handler.cpp b/c10/util/signal_handler.cpp index 831c0d0245245..bfb04e1ccbc36 100644 --- a/c10/util/signal_handler.cpp +++ b/c10/util/signal_handler.cpp @@ -223,7 +223,7 @@ void FatalSignalHandler::fatalSignalHandler(int signum) { // a single thread that wouldn't receive the SIGUSR2 if (std::cv_status::timeout == writingCond.wait_for(ul, 2s)) { if (!signalReceived) { - std::cerr << "signal lost waiting for stacktrace " << pid << ":" + std::cerr << "signal lost waiting for stacktrace " << pid << ':' << tid << '\n'; break; } diff --git a/c10/util/sparse_bitset.h b/c10/util/sparse_bitset.h index c8eb0df47f6ae..e7ad1db06d6f7 100644 --- a/c10/util/sparse_bitset.h +++ b/c10/util/sparse_bitset.h @@ -877,7 +877,7 @@ std::ostream& operator<<( std::ostream& stream, const SparseBitVector& vec) { bool first = true; - stream << "{"; + stream << '{'; for (auto el : vec) { if (first) { first = false; @@ -886,7 +886,7 @@ std::ostream& operator<<( } stream << el; } - stream << "}"; + stream << '}'; return stream; } diff --git a/torch/csrc/DataLoader.cpp b/torch/csrc/DataLoader.cpp index a6ad3f00b2782..31cec72d8a1c3 100644 --- a/torch/csrc/DataLoader.cpp +++ b/torch/csrc/DataLoader.cpp @@ -61,7 +61,7 @@ static void setSignalHandler( sigaction(signal, &sa, old_sa_ptr) != 0) { std::ostringstream oss; oss << "An error occurred while setting handler for " << strsignal(signal) - << "."; + << '.'; TORCH_CHECK(false, oss.str()); } } diff --git a/torch/csrc/Device.cpp b/torch/csrc/Device.cpp index f3babe4cd72bb..da7b287369dab 100644 --- a/torch/csrc/Device.cpp +++ b/torch/csrc/Device.cpp @@ -29,14 +29,14 @@ PyObject* THPDevice_New(const at::Device& device) { static PyObject* THPDevice_repr(THPDevice* self) { std::ostringstream oss; - oss << "device(type=\'" << self->device.type() << "\'"; + oss << "device(type=\'" << self->device.type() << '\''; if (self->device.has_index()) { // `self->device.index()` returns 
uint8_t which is treated as ascii while // printing, hence casting it to uint16_t. // https://stackoverflow.com/questions/19562103/uint8-t-cant-be-printed-with-cout oss << ", index=" << static_cast(self->device.index()); } - oss << ")"; + oss << ')'; return THPUtils_packString(oss.str().c_str()); } diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index 7d3f9ecc4d007..e2d0e17738dc2 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -212,8 +212,8 @@ static PyObject* THPModule_initExtension( } auto frame_id = s_tb[idx]; const auto& frame = s_tbs.all_frames.at(frame_id); - oss << "#" << idx << " " << frame.funcname << " from " << frame.filename - << ":" << frame.lineno << '\n'; + oss << '#' << idx << ' ' << frame.funcname << " from " << frame.filename + << ':' << frame.lineno << '\n'; } return oss.str(); }); @@ -2781,8 +2781,8 @@ Call this whenever a new thread is created in order to propagate values from py_module.def("_dump_local_tls_set", []() { auto local_keyset = c10::impl::tls_local_dispatch_key_set(); - std::cout << "Included: " << toString(local_keyset.included_) << "\n"; - std::cout << "Excluded: " << toString(local_keyset.excluded_) << "\n"; + std::cout << "Included: " << toString(local_keyset.included_) << '\n'; + std::cout << "Excluded: " << toString(local_keyset.excluded_) << '\n'; }); py_module.def( diff --git a/torch/csrc/TypeInfo.cpp b/torch/csrc/TypeInfo.cpp index 6874374eff768..de23b79536033 100644 --- a/torch/csrc/TypeInfo.cpp +++ b/torch/csrc/TypeInfo.cpp @@ -254,7 +254,7 @@ static PyObject* THPFInfo_str(THPFInfo* self) { << PyFloat_AsDouble(THPFInfo_smallest_normal(self, nullptr)); oss << ", tiny=" << PyFloat_AsDouble(THPFInfo_tiny(self, nullptr)); if (dtypeStr != nullptr) { - oss << ", dtype=" << PyUnicode_AsUTF8(dtypeStr) << ")"; + oss << ", dtype=" << PyUnicode_AsUTF8(dtypeStr) << ')'; } return !PyErr_Occurred() ? THPUtils_packString(oss.str().c_str()) : nullptr; } @@ -266,7 +266,7 @@ static PyObject* THPIInfo_str(THPIInfo* self) { oss << "iinfo(min=" << PyLong_AsDouble(THPIInfo_min(self, nullptr)); oss << ", max=" << PyLong_AsDouble(THPIInfo_max(self, nullptr)); if (dtypeStr) { - oss << ", dtype=" << PyUnicode_AsUTF8(dtypeStr) << ")"; + oss << ", dtype=" << PyUnicode_AsUTF8(dtypeStr) << ')'; } return !PyErr_Occurred() ? 
THPUtils_packString(oss.str().c_str()) : nullptr; diff --git a/torch/csrc/api/include/torch/detail/TensorDataContainer.h b/torch/csrc/api/include/torch/detail/TensorDataContainer.h index 9485af1d297d2..152672c7f3f21 100644 --- a/torch/csrc/api/include/torch/detail/TensorDataContainer.h +++ b/torch/csrc/api/include/torch/detail/TensorDataContainer.h @@ -271,7 +271,7 @@ struct TensorDataContainer { "TensorDataContainer_pretty_print_scalar", [&] { stream << scalar_.to(); }); } else if (is_init_list()) { - stream << "{"; + stream << '{'; for (const TensorDataContainer* it = init_list_.begin(); it != init_list_.end(); it++) { @@ -279,9 +279,9 @@ struct TensorDataContainer { if (std::next(it) != init_list_.end()) stream << ", "; } - stream << "}"; + stream << '}'; } else if (is_tensor()) { - stream << "{"; + stream << '{'; for (const auto i : c10::irange(tensor_.sizes()[0])) { AT_DISPATCH_ALL_TYPES_AND3( at::kBool, @@ -293,7 +293,7 @@ struct TensorDataContainer { if (i != tensor_.sizes()[0] - 1) stream << ", "; } - stream << "}"; + stream << '}'; } else { TORCH_INTERNAL_ASSERT(false, "Invalid TensorDataContainer type"); } diff --git a/torch/csrc/api/include/torch/nn/modules/batchnorm.h b/torch/csrc/api/include/torch/nn/modules/batchnorm.h index 8437ffd7afb8e..a0456578da0e7 100644 --- a/torch/csrc/api/include/torch/nn/modules/batchnorm.h +++ b/torch/csrc/api/include/torch/nn/modules/batchnorm.h @@ -145,7 +145,7 @@ class BatchNormImplBase : public NormImplBase { stream << ", " << "affine=" << this->options.affine() << ", " << "track_running_stats=" << this->options.track_running_stats() - << ")"; + << ')'; } }; diff --git a/torch/csrc/api/include/torch/nn/modules/container/parameterdict.h b/torch/csrc/api/include/torch/nn/modules/container/parameterdict.h index 008d790fdece1..72cc777cd5c0e 100644 --- a/torch/csrc/api/include/torch/nn/modules/container/parameterdict.h +++ b/torch/csrc/api/include/torch/nn/modules/container/parameterdict.h @@ -28,13 +28,13 @@ class ParameterDictImpl : public Cloneable { void pretty_print(std::ostream& stream) const override { stream << "torch::nn::ParameterDict(" << '\n'; for (const auto& pair : parameters_) { - stream << "(" << pair.key() << ")" - << ": Parameter containing: [" << pair.value().scalar_type() - << " of size " << pair.value().sizes() << "]"; + stream << '(' << pair.key() << ')' << ": Parameter containing: [" + << pair.value().scalar_type() << " of size " + << pair.value().sizes() << ']'; ; stream << '\n'; } - stream << ")"; + stream << ')'; } /// Insert the parameter along with the key into ParameterDict diff --git a/torch/csrc/api/include/torch/nn/modules/container/parameterlist.h b/torch/csrc/api/include/torch/nn/modules/container/parameterlist.h index 198172ab56489..c42215715406d 100644 --- a/torch/csrc/api/include/torch/nn/modules/container/parameterlist.h +++ b/torch/csrc/api/include/torch/nn/modules/container/parameterlist.h @@ -36,13 +36,13 @@ class ParameterListImpl : public Cloneable { void pretty_print(std::ostream& stream) const override { stream << "torch::nn::ParameterList(" << '\n'; for (const auto& pair : parameters_) { - stream << "(" << pair.key() << ")" - << ": Parameter containing: [" << pair.value().scalar_type() - << " of size " << pair.value().sizes() << "]"; + stream << '(' << pair.key() << ')' << ": Parameter containing: [" + << pair.value().scalar_type() << " of size " + << pair.value().sizes() << ']'; ; stream << '\n'; } - stream << ")"; + stream << ')'; } /// push the a given parameter at the end of the list diff --git 
a/torch/csrc/api/include/torch/nn/modules/conv.h b/torch/csrc/api/include/torch/nn/modules/conv.h index 8c5f1f3e39182..56fb6023ed4b5 100644 --- a/torch/csrc/api/include/torch/nn/modules/conv.h +++ b/torch/csrc/api/include/torch/nn/modules/conv.h @@ -113,8 +113,8 @@ class ConvNdImpl : public torch::nn::Cloneable { /// Pretty prints the `Conv{1,2,3}d` module into the given `stream`. void pretty_print(std::ostream& stream) const override { - stream << "torch::nn::Conv" << D << "d" - << "(" << options.in_channels() << ", " << options.out_channels() + stream << "torch::nn::Conv" << D << 'd' << '(' << options.in_channels() + << ", " << options.out_channels() << ", kernel_size=" << options.kernel_size() << ", stride=" << options.stride(); std::visit( @@ -143,7 +143,7 @@ class ConvNdImpl : public torch::nn::Cloneable { stream << ", padding_mode=" << enumtype::get_enum_name(options.padding_mode()); } - stream << ")"; + stream << ')'; } /// The options with which this `Module` was constructed. @@ -278,8 +278,8 @@ class ConvTransposeNdImpl : public ConvNdImpl { /// Pretty prints the `ConvTranspose{1,2,3}d` module into the given `stream`. void pretty_print(std::ostream& stream) const override { - stream << "torch::nn::ConvTranspose" << D << "d" - << "(" << this->options.in_channels() << ", " + stream << "torch::nn::ConvTranspose" << D << 'd' << '(' + << this->options.in_channels() << ", " << this->options.out_channels() << ", kernel_size=" << this->options.kernel_size() << ", stride=" << this->options.stride(); @@ -303,7 +303,7 @@ class ConvTransposeNdImpl : public ConvNdImpl { stream << ", padding_mode=" << enumtype::get_enum_name(this->options.padding_mode()); } - stream << ")"; + stream << ')'; } protected: diff --git a/torch/csrc/api/include/torch/nn/modules/instancenorm.h b/torch/csrc/api/include/torch/nn/modules/instancenorm.h index 228f181715fc7..492aba8e4e234 100644 --- a/torch/csrc/api/include/torch/nn/modules/instancenorm.h +++ b/torch/csrc/api/include/torch/nn/modules/instancenorm.h @@ -53,7 +53,7 @@ class InstanceNormImpl << "momentum=" << this->options.momentum() << ", " << "affine=" << this->options.affine() << ", " << "track_running_stats=" << this->options.track_running_stats() - << ")"; + << ')'; } }; diff --git a/torch/csrc/api/include/torch/nn/modules/pooling.h b/torch/csrc/api/include/torch/nn/modules/pooling.h index 17ed12f4cc037..4f08bf31031e6 100644 --- a/torch/csrc/api/include/torch/nn/modules/pooling.h +++ b/torch/csrc/api/include/torch/nn/modules/pooling.h @@ -232,8 +232,8 @@ class TORCH_API AdaptiveMaxPoolImpl : public torch::nn::Cloneable { /// Pretty prints the `AdaptiveMaxPool{1,2,3}d` module into the given /// `stream`. void pretty_print(std::ostream& stream) const override { - stream << "torch::nn::AdaptiveMaxPool" << D << "d" - << "(output_size=" << options.output_size() << ")"; + stream << "torch::nn::AdaptiveMaxPool" << D << 'd' + << "(output_size=" << options.output_size() << ')'; } /// The options with which this `Module` was constructed. @@ -365,8 +365,8 @@ class TORCH_API AdaptiveAvgPoolImpl : public torch::nn::Cloneable { /// Pretty prints the `AdaptiveAvgPool{1,2,3}d` module into the given /// `stream`. void pretty_print(std::ostream& stream) const override { - stream << "torch::nn::AdaptiveAvgPool" << D << "d" - << "(output_size=" << options.output_size() << ")"; + stream << "torch::nn::AdaptiveAvgPool" << D << 'd' + << "(output_size=" << options.output_size() << ')'; } /// The options with which this `Module` was constructed. 
diff --git a/torch/csrc/api/src/nn/module.cpp b/torch/csrc/api/src/nn/module.cpp index 563ed4789cb12..5dbc36b7dd5f2 100644 --- a/torch/csrc/api/src/nn/module.cpp +++ b/torch/csrc/api/src/nn/module.cpp @@ -355,11 +355,11 @@ void Module::pretty_print_recursive( stream << "(\n"; const std::string next_indentation = indentation + " "; for (const auto& child : children_) { - stream << next_indentation << "(" << child.key() << "): "; + stream << next_indentation << '(' << child.key() << "): "; child.value()->pretty_print_recursive(stream, next_indentation); stream << '\n'; } - stream << indentation << ")"; + stream << indentation << ')'; } } diff --git a/torch/csrc/api/src/nn/modules/activation.cpp b/torch/csrc/api/src/nn/modules/activation.cpp index 68949f3fb496e..5144ea51ecee7 100644 --- a/torch/csrc/api/src/nn/modules/activation.cpp +++ b/torch/csrc/api/src/nn/modules/activation.cpp @@ -21,7 +21,7 @@ void ELUImpl::pretty_print(std::ostream& stream) const { if (options.inplace()) { stream << std::boolalpha << ", inplace=" << options.inplace(); } - stream << ")"; + stream << ')'; } // ============================================================================ @@ -39,7 +39,7 @@ void SELUImpl::pretty_print(std::ostream& stream) const { if (options.inplace()) { stream << std::boolalpha << "inplace=" << options.inplace(); } - stream << ")"; + stream << ')'; } // ============================================================================ @@ -55,7 +55,7 @@ void HardshrinkImpl::reset() {} void HardshrinkImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::Hardshrink(" << options.lambda() - << ")"; + << ')'; } // ============================================================================ @@ -86,7 +86,7 @@ void HardtanhImpl::pretty_print(std::ostream& stream) const { if (options.inplace()) { stream << std::boolalpha << ", inplace=" << options.inplace(); } - stream << ")"; + stream << ')'; } // ============================================================================ @@ -107,7 +107,7 @@ void LeakyReLUImpl::pretty_print(std::ostream& stream) const { if (options.inplace()) { stream << std::boolalpha << ", inplace=" << options.inplace(); } - stream << ")"; + stream << ')'; } // ============================================================================ @@ -129,7 +129,7 @@ SoftmaxImpl::SoftmaxImpl(const SoftmaxOptions& options_) : options(options_) {} void SoftmaxImpl::reset() {} void SoftmaxImpl::pretty_print(std::ostream& stream) const { - stream << "torch::nn::Softmax(dim=" << options.dim() << ")"; + stream << "torch::nn::Softmax(dim=" << options.dim() << ')'; } Tensor SoftmaxImpl::forward(const Tensor& input) { @@ -143,7 +143,7 @@ SoftminImpl::SoftminImpl(const SoftminOptions& options_) : options(options_) {} void SoftminImpl::reset() {} void SoftminImpl::pretty_print(std::ostream& stream) const { - stream << "torch::nn::Softmin(dim=" << options.dim() << ")"; + stream << "torch::nn::Softmin(dim=" << options.dim() << ')'; } Tensor SoftminImpl::forward(const Tensor& input) { @@ -158,7 +158,7 @@ LogSoftmaxImpl::LogSoftmaxImpl(const LogSoftmaxOptions& options_) void LogSoftmaxImpl::reset() {} void LogSoftmaxImpl::pretty_print(std::ostream& stream) const { - stream << "torch::nn::LogSoftmax(dim=" << options.dim() << ")"; + stream << "torch::nn::LogSoftmax(dim=" << options.dim() << ')'; } Tensor LogSoftmaxImpl::forward(const Tensor& input) { @@ -197,7 +197,7 @@ void PReLUImpl::reset() { void PReLUImpl::pretty_print(std::ostream& stream) const { stream << 
"torch::nn::PReLU(num_parameters=" << options.num_parameters() - << ")"; + << ')'; } // ============================================================================ @@ -215,7 +215,7 @@ void ReLUImpl::pretty_print(std::ostream& stream) const { if (options.inplace()) { stream << std::boolalpha << "inplace=" << options.inplace(); } - stream << ")"; + stream << ')'; } // ============================================================================ @@ -233,7 +233,7 @@ void ReLU6Impl::pretty_print(std::ostream& stream) const { if (options.inplace()) { stream << std::boolalpha << "inplace=" << options.inplace(); } - stream << ")"; + stream << ')'; } // ============================================================================ @@ -257,7 +257,7 @@ void RReLUImpl::pretty_print(std::ostream& stream) const { if (options.inplace()) { stream << std::boolalpha << ", inplace=" << options.inplace(); } - stream << ")"; + stream << ')'; } // ============================================================================ @@ -275,7 +275,7 @@ void CELUImpl::pretty_print(std::ostream& stream) const { if (options.inplace()) { stream << std::boolalpha << ", inplace=" << options.inplace(); } - stream << ")"; + stream << ')'; } // ============================================================================ @@ -289,7 +289,7 @@ Tensor GLUImpl::forward(const Tensor& input) { void GLUImpl::reset() {} void GLUImpl::pretty_print(std::ostream& stream) const { - stream << "torch::nn::GLU(dim=" << options.dim() << ")"; + stream << "torch::nn::GLU(dim=" << options.dim() << ')'; } // ============================================================================ @@ -355,7 +355,7 @@ void SoftplusImpl::reset() {} void SoftplusImpl::pretty_print(std::ostream& stream) const { stream << "torch::nn::Softplus(beta=" << options.beta() - << ", threshold=" << options.threshold() << ")"; + << ", threshold=" << options.threshold() << ')'; } // ============================================================================ @@ -370,7 +370,7 @@ Tensor SoftshrinkImpl::forward(const Tensor& input) { void SoftshrinkImpl::reset() {} void SoftshrinkImpl::pretty_print(std::ostream& stream) const { - stream << "torch::nn::Softshrink(" << options.lambda() << ")"; + stream << "torch::nn::Softshrink(" << options.lambda() << ')'; } // ============================================================================ @@ -430,7 +430,7 @@ void ThresholdImpl::pretty_print(std::ostream& stream) const { if (options.inplace()) { stream << std::boolalpha << ", inplace=" << options.inplace(); } - stream << ")"; + stream << ')'; } // ============================================================================ diff --git a/torch/csrc/api/src/nn/modules/distance.cpp b/torch/csrc/api/src/nn/modules/distance.cpp index d8e7fa8ac4003..7b45deadac947 100644 --- a/torch/csrc/api/src/nn/modules/distance.cpp +++ b/torch/csrc/api/src/nn/modules/distance.cpp @@ -12,7 +12,7 @@ void CosineSimilarityImpl::reset() {} void CosineSimilarityImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::CosineSimilarity" - << "(dim=" << options.dim() << ", eps=" << options.eps() << ")"; + << "(dim=" << options.dim() << ", eps=" << options.eps() << ')'; } Tensor CosineSimilarityImpl::forward(const Tensor& x1, const Tensor& x2) { @@ -30,7 +30,7 @@ void PairwiseDistanceImpl::reset() {} void PairwiseDistanceImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::PairwiseDistance" << "(p=" << options.p() << ", eps=" << options.eps() - << ", 
keepdim=" << options.keepdim() << ")"; + << ", keepdim=" << options.keepdim() << ')'; } Tensor PairwiseDistanceImpl::forward(const Tensor& x1, const Tensor& x2) { diff --git a/torch/csrc/api/src/nn/modules/dropout.cpp b/torch/csrc/api/src/nn/modules/dropout.cpp index 2b7c5aa3a289e..08433bf363128 100644 --- a/torch/csrc/api/src/nn/modules/dropout.cpp +++ b/torch/csrc/api/src/nn/modules/dropout.cpp @@ -19,7 +19,7 @@ Tensor DropoutImpl::forward(Tensor input) { void DropoutImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::Dropout(p=" << options.p() - << ", inplace=" << options.inplace() << ")"; + << ", inplace=" << options.inplace() << ')'; } // ============================================================================ @@ -31,7 +31,7 @@ Tensor Dropout2dImpl::forward(Tensor input) { void Dropout2dImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::Dropout2d(p=" << options.p() - << ", inplace=" << options.inplace() << ")"; + << ", inplace=" << options.inplace() << ')'; } // ============================================================================ @@ -43,7 +43,7 @@ Tensor Dropout3dImpl::forward(Tensor input) { void Dropout3dImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::Dropout3d(p=" << options.p() - << ", inplace=" << options.inplace() << ")"; + << ", inplace=" << options.inplace() << ')'; } // ============================================================================ @@ -55,7 +55,7 @@ Tensor AlphaDropoutImpl::forward(const Tensor& input) { void AlphaDropoutImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::AlphaDropout(p=" << options.p() - << ", inplace=" << options.inplace() << ")"; + << ", inplace=" << options.inplace() << ')'; } // ============================================================================ @@ -67,7 +67,7 @@ Tensor FeatureAlphaDropoutImpl::forward(const Tensor& input) { void FeatureAlphaDropoutImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::FeatureAlphaDropout(p=" << options.p() - << ", inplace=" << options.inplace() << ")"; + << ", inplace=" << options.inplace() << ')'; } } // namespace torch::nn diff --git a/torch/csrc/api/src/nn/modules/embedding.cpp b/torch/csrc/api/src/nn/modules/embedding.cpp index b9fededfd7372..e704e71c97e65 100644 --- a/torch/csrc/api/src/nn/modules/embedding.cpp +++ b/torch/csrc/api/src/nn/modules/embedding.cpp @@ -76,7 +76,7 @@ void EmbeddingImpl::pretty_print(std::ostream& stream) const { if (options.sparse()) { stream << ", sparse=" << std::boolalpha << options.sparse(); } - stream << ")"; + stream << ')'; } torch::Tensor EmbeddingImpl::forward(const Tensor& input) { @@ -181,6 +181,6 @@ void EmbeddingBagImpl::pretty_print(std::ostream& stream) const { if (padding_idx_opt.has_value()) { stream << ", padding_idx=" << padding_idx_opt.value(); } - stream << ")"; + stream << ')'; } } // namespace torch::nn diff --git a/torch/csrc/api/src/nn/modules/fold.cpp b/torch/csrc/api/src/nn/modules/fold.cpp index 32c83ca6e1b7f..43b07b84fcf27 100644 --- a/torch/csrc/api/src/nn/modules/fold.cpp +++ b/torch/csrc/api/src/nn/modules/fold.cpp @@ -17,7 +17,7 @@ void FoldImpl::pretty_print(std::ostream& stream) const { << ", kernel_size=" << options.kernel_size() << ", dilation=" << options.dilation() << ", padding=" << options.padding() << ", stride=" << options.stride() - << ")"; + << ')'; } Tensor FoldImpl::forward(const Tensor& input) { @@ -40,7 +40,7 @@ void 
UnfoldImpl::pretty_print(std::ostream& stream) const { stream << "torch::nn::Unfold(kernel_size=" << options.kernel_size() << ", dilation=" << options.dilation() << ", padding=" << options.padding() << ", stride=" << options.stride() - << ")"; + << ')'; } Tensor UnfoldImpl::forward(const Tensor& input) { diff --git a/torch/csrc/api/src/nn/modules/linear.cpp b/torch/csrc/api/src/nn/modules/linear.cpp index 0b31e3aa03730..6ed92d2998c24 100644 --- a/torch/csrc/api/src/nn/modules/linear.cpp +++ b/torch/csrc/api/src/nn/modules/linear.cpp @@ -55,7 +55,7 @@ void LinearImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::Linear(in_features=" << options.in_features() << ", out_features=" << options.out_features() - << ", bias=" << options.bias() << ")"; + << ", bias=" << options.bias() << ')'; } Tensor LinearImpl::forward(const Tensor& input) { @@ -70,7 +70,7 @@ void FlattenImpl::reset() {} void FlattenImpl::pretty_print(std::ostream& stream) const { stream << "torch::nn::Flatten(start_dim=" << options.start_dim() - << ", end_dim=" << options.end_dim() << ")"; + << ", end_dim=" << options.end_dim() << ')'; } Tensor FlattenImpl::forward(const Tensor& input) { @@ -161,7 +161,7 @@ void BilinearImpl::pretty_print(std::ostream& stream) const { << "torch::nn::Bilinear(in1_features=" << options.in1_features() << ", in2_features=" << options.in2_features() << ", out_features=" << options.out_features() - << ", bias=" << options.bias() << ")"; + << ", bias=" << options.bias() << ')'; } Tensor BilinearImpl::forward(const Tensor& input1, const Tensor& input2) { diff --git a/torch/csrc/api/src/nn/modules/loss.cpp b/torch/csrc/api/src/nn/modules/loss.cpp index 7cae60ac99251..6ea9d76af8128 100644 --- a/torch/csrc/api/src/nn/modules/loss.cpp +++ b/torch/csrc/api/src/nn/modules/loss.cpp @@ -74,7 +74,7 @@ HingeEmbeddingLossImpl::HingeEmbeddingLossImpl( void HingeEmbeddingLossImpl::reset() {} void HingeEmbeddingLossImpl::pretty_print(std::ostream& stream) const { - stream << "torch::nn::HingeEmbeddingLoss(margin=" << options.margin() << ")"; + stream << "torch::nn::HingeEmbeddingLoss(margin=" << options.margin() << ')'; } Tensor HingeEmbeddingLossImpl::forward( @@ -104,7 +104,7 @@ void MultiMarginLossImpl::pretty_print(std::ostream& stream) const { stream << "torch::nn::MultiMarginLoss(p=" << options.p() << ", margin=" << options.margin() << ", weight=" << options.weight() << ", reduction=" << enumtype::get_enum_name(options.reduction()) - << ")"; + << ')'; } Tensor MultiMarginLossImpl::forward(const Tensor& input, const Tensor& target) { @@ -126,7 +126,7 @@ CosineEmbeddingLossImpl::CosineEmbeddingLossImpl( void CosineEmbeddingLossImpl::reset() {} void CosineEmbeddingLossImpl::pretty_print(std::ostream& stream) const { - stream << "torch::nn::CosineEmbeddingLoss(margin=" << options.margin() << ")"; + stream << "torch::nn::CosineEmbeddingLoss(margin=" << options.margin() << ')'; } Tensor CosineEmbeddingLossImpl::forward( @@ -169,7 +169,7 @@ void TripletMarginLossImpl::reset() {} void TripletMarginLossImpl::pretty_print(std::ostream& stream) const { stream << "torch::nn::TripletMarginLoss(margin=" << options.margin() << ", p=" << options.p() << ", eps=" << options.eps() << std::boolalpha - << ", swap=" << options.swap() << ")"; + << ", swap=" << options.swap() << ')'; } Tensor TripletMarginLossImpl::forward( @@ -199,7 +199,7 @@ void TripletMarginWithDistanceLossImpl::pretty_print( std::ostream& stream) const { stream << "torch::nn::TripletMarginWithDistanceLoss(margin=" << 
options.margin() << std::boolalpha << ", swap=" << options.swap() - << ")"; + << ')'; } Tensor TripletMarginWithDistanceLossImpl::forward( diff --git a/torch/csrc/api/src/nn/modules/normalization.cpp b/torch/csrc/api/src/nn/modules/normalization.cpp index 41129c8990923..72957356a3da9 100644 --- a/torch/csrc/api/src/nn/modules/normalization.cpp +++ b/torch/csrc/api/src/nn/modules/normalization.cpp @@ -40,7 +40,7 @@ void LayerNormImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::LayerNorm(" << torch::IntArrayRef(options.normalized_shape()) << ", eps=" << options.eps() - << ", elementwise_affine=" << options.elementwise_affine() << ")"; + << ", elementwise_affine=" << options.elementwise_affine() << ')'; } torch::Tensor LayerNormImpl::forward(const Tensor& input) { @@ -64,7 +64,7 @@ void LocalResponseNormImpl::reset() {} void LocalResponseNormImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::LocalResponseNorm(" << options.size() << ", alpha=" << options.alpha() << ", beta=" << options.beta() - << ", k=" << options.k() << ")"; + << ", k=" << options.k() << ')'; } // ============================================================================ @@ -74,7 +74,7 @@ void CrossMapLRN2dImpl::reset() {} void CrossMapLRN2dImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::CrossMapLRN2d(" << options.size() << ", alpha=" << options.alpha() << ", beta=" << options.beta() - << ", k=" << options.k() << ")"; + << ", k=" << options.k() << ')'; } torch::Tensor CrossMapLRN2dImpl::forward(const torch::Tensor& input) { @@ -115,7 +115,7 @@ torch::Tensor GroupNormImpl::forward(const Tensor& input) { void GroupNormImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::GroupNorm(" << options.num_groups() << ", " << options.num_channels() << ", eps=" << options.eps() - << ", affine=" << options.affine() << ")"; + << ", affine=" << options.affine() << ')'; } } // namespace torch::nn diff --git a/torch/csrc/api/src/nn/modules/padding.cpp b/torch/csrc/api/src/nn/modules/padding.cpp index d992bf696d0ca..2e3212f7c94fe 100644 --- a/torch/csrc/api/src/nn/modules/padding.cpp +++ b/torch/csrc/api/src/nn/modules/padding.cpp @@ -21,8 +21,8 @@ Tensor ReflectionPadImpl::forward(const Tensor& input) { template void ReflectionPadImpl::pretty_print(std::ostream& stream) const { - stream << "torch::nn::ReflectionPad" << D << "d" - << "(padding=" << options.padding() << ")"; + stream << "torch::nn::ReflectionPad" << D << 'd' + << "(padding=" << options.padding() << ')'; } template class ReflectionPadImpl<1, ReflectionPad1dImpl>; @@ -46,8 +46,8 @@ Tensor ReplicationPadImpl::forward(const Tensor& input) { template void ReplicationPadImpl::pretty_print(std::ostream& stream) const { - stream << "torch::nn::ReplicationPad" << D << "d" - << "(padding=" << options.padding() << ")"; + stream << "torch::nn::ReplicationPad" << D << 'd' + << "(padding=" << options.padding() << ')'; } template class ReplicationPadImpl<1, ReplicationPad1dImpl>; @@ -70,8 +70,8 @@ Tensor ZeroPadImpl::forward(const Tensor& input) { template void ZeroPadImpl::pretty_print(std::ostream& stream) const { - stream << "torch::nn::ZeroPad" << D << "d" - << "(padding=" << options.padding() << ")"; + stream << "torch::nn::ZeroPad" << D << 'd' << "(padding=" << options.padding() + << ')'; } template class ZeroPadImpl<1, ZeroPad1dImpl>; @@ -96,9 +96,9 @@ Tensor ConstantPadImpl::forward(const Tensor& input) { template void 
ConstantPadImpl::pretty_print(std::ostream& stream) const { - stream << "torch::nn::ConstantPad" << D << "d" + stream << "torch::nn::ConstantPad" << D << 'd' << "(padding=" << options.padding() << ", value=" << options.value() - << ")"; + << ')'; } template class ConstantPadImpl<1, ConstantPad1dImpl>; diff --git a/torch/csrc/api/src/nn/modules/pixelshuffle.cpp b/torch/csrc/api/src/nn/modules/pixelshuffle.cpp index b11a99eea4e47..bae89d1964961 100644 --- a/torch/csrc/api/src/nn/modules/pixelshuffle.cpp +++ b/torch/csrc/api/src/nn/modules/pixelshuffle.cpp @@ -9,7 +9,7 @@ PixelShuffleImpl::PixelShuffleImpl(const PixelShuffleOptions& options_) void PixelShuffleImpl::pretty_print(std::ostream& stream) const { stream << "torch::nn::PixelShuffle(upscale_factor=" - << options.upscale_factor() << ")"; + << options.upscale_factor() << ')'; } void PixelShuffleImpl::reset() {} @@ -23,7 +23,7 @@ PixelUnshuffleImpl::PixelUnshuffleImpl(const PixelUnshuffleOptions& options_) void PixelUnshuffleImpl::pretty_print(std::ostream& stream) const { stream << "torch::nn::PixelUnshuffle(downscale_factor=" - << options.downscale_factor() << ")"; + << options.downscale_factor() << ')'; } void PixelUnshuffleImpl::reset() {} diff --git a/torch/csrc/api/src/nn/modules/pooling.cpp b/torch/csrc/api/src/nn/modules/pooling.cpp index f42cfe6b20294..3d6aeb6dffb6c 100644 --- a/torch/csrc/api/src/nn/modules/pooling.cpp +++ b/torch/csrc/api/src/nn/modules/pooling.cpp @@ -15,10 +15,10 @@ void AvgPoolImpl::reset() {} template void AvgPoolImpl::pretty_print(std::ostream& stream) const { - stream << "torch::nn::AvgPool" << D << "d" + stream << "torch::nn::AvgPool" << D << 'd' << "(kernel_size=" << options.kernel_size() << ", stride=" << options.stride() << ", padding=" << options.padding() - << ")"; + << ')'; } Tensor AvgPool1dImpl::forward(const Tensor& input) { @@ -68,11 +68,11 @@ void MaxPoolImpl::reset() {} template void MaxPoolImpl::pretty_print(std::ostream& stream) const { - stream << std::boolalpha << "torch::nn::MaxPool" << D << "d" + stream << std::boolalpha << "torch::nn::MaxPool" << D << 'd' << "(kernel_size=" << options.kernel_size() << ", stride=" << options.stride() << ", padding=" << options.padding() << ", dilation=" << options.dilation() - << ", ceil_mode=" << options.ceil_mode() << ")"; + << ", ceil_mode=" << options.ceil_mode() << ')'; } Tensor MaxPool1dImpl::forward(const Tensor& input) { @@ -219,10 +219,10 @@ void MaxUnpoolImpl::reset() {} template void MaxUnpoolImpl::pretty_print(std::ostream& stream) const { - stream << std::boolalpha << "torch::nn::MaxUnpool" << D << "d" + stream << std::boolalpha << "torch::nn::MaxUnpool" << D << 'd' << "(kernel_size=" << options.kernel_size() << ", stride=" << options.stride() << ", padding=" << options.padding() - << ")"; + << ')'; } Tensor MaxUnpool1dImpl::forward( @@ -401,7 +401,7 @@ void LPPoolImpl::pretty_print(std::ostream& stream) const { << "norm_type=" << options.norm_type() << ", " << "kernel_size=" << options.kernel_size() << ", " << "stride=" << options.stride() << ", " - << "ceil_mode=" << options.ceil_mode() << ")"; + << "ceil_mode=" << options.ceil_mode() << ')'; } Tensor LPPool1dImpl::forward(const Tensor& input) { diff --git a/torch/csrc/api/src/nn/modules/rnn.cpp b/torch/csrc/api/src/nn/modules/rnn.cpp index be7c5ded2fc52..7ee864bc8ea94 100644 --- a/torch/csrc/api/src/nn/modules/rnn.cpp +++ b/torch/csrc/api/src/nn/modules/rnn.cpp @@ -374,7 +374,7 @@ void RNNImplBase::pretty_print(std::ostream& stream) const { if (options_base.proj_size() > 0) { stream << 
", proj_size=" << options_base.proj_size(); } - stream << ")"; + stream << ')'; } template @@ -837,7 +837,7 @@ template void RNNCellImplBase::pretty_print(std::ostream& stream) const { const std::string name = this->name(); const std::string name_without_impl = name.substr(0, name.size() - 4); - stream << name_without_impl << "(" << options_base.input_size() << ", " + stream << name_without_impl << '(' << options_base.input_size() << ", " << options_base.hidden_size(); if (!options_base.bias()) { stream << ", bias=" << std::boolalpha << false; @@ -846,7 +846,7 @@ void RNNCellImplBase::pretty_print(std::ostream& stream) const { if (!nonlinearity_str.empty() && nonlinearity_str != "kTanh") { stream << ", nonlinearity=" << nonlinearity_str; } - stream << ")"; + stream << ')'; } template diff --git a/torch/csrc/api/src/nn/modules/upsampling.cpp b/torch/csrc/api/src/nn/modules/upsampling.cpp index 420ffe5a8813d..e29f1034fa51c 100644 --- a/torch/csrc/api/src/nn/modules/upsampling.cpp +++ b/torch/csrc/api/src/nn/modules/upsampling.cpp @@ -18,7 +18,7 @@ void UpsampleImpl::pretty_print(std::ostream& stream) const { // NOLINTNEXTLINE(bugprone-unchecked-optional-access) stream << "size=" << at::ArrayRef(options.size().value()); } - stream << ", mode=" << enumtype::get_enum_name(options.mode()) << ")"; + stream << ", mode=" << enumtype::get_enum_name(options.mode()) << ')'; } Tensor UpsampleImpl::forward(const Tensor& input) { diff --git a/torch/csrc/autograd/saved_variable.cpp b/torch/csrc/autograd/saved_variable.cpp index 0124a0212bc61..55def20af786f 100644 --- a/torch/csrc/autograd/saved_variable.cpp +++ b/torch/csrc/autograd/saved_variable.cpp @@ -172,15 +172,15 @@ Variable SavedVariable::unpack(std::shared_ptr saved_for) const { message << "one of the variables needed for gradient computation has been " "modified by an inplace operation: [" - << data_.toString() << " "; + << data_.toString() << ' '; if (data_.is_nested()) { - message << data_._nested_tensor_size() << "]"; + message << data_._nested_tensor_size() << ']'; } else { - message << data_.sizes() << "]"; + message << data_.sizes() << ']'; } if (grad_fn) { message << ", which is output " << output_nr_ << " of " - << grad_fn->name() << ","; + << grad_fn->name() << ','; } message << " is at version " << current_version << "; expected version " << saved_version_ << " instead."; diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp index b14323a47bf35..a8ae82b1b66ea 100644 --- a/torch/csrc/cuda/Module.cpp +++ b/torch/csrc/cuda/Module.cpp @@ -1114,7 +1114,7 @@ static void registerCudaDeviceProperties(PyObject* module) { stream << "_CudaDeviceProperties(name='" << prop.name << "', major=" << prop.major << ", minor=" << prop.minor #if USE_ROCM - << ", gcnArchName='" << prop.gcnArchName << "'" + << ", gcnArchName='" << prop.gcnArchName << '\'' #endif // USE_ROCM << ", total_memory=" << prop.totalGlobalMem / (1024ull * 1024) << "MB, multi_processor_count=" << prop.multiProcessorCount diff --git a/torch/csrc/distributed/c10d/FlightRecorderDetail.hpp b/torch/csrc/distributed/c10d/FlightRecorderDetail.hpp index 88205c171941c..28647b8c50f5a 100644 --- a/torch/csrc/distributed/c10d/FlightRecorderDetail.hpp +++ b/torch/csrc/distributed/c10d/FlightRecorderDetail.hpp @@ -24,8 +24,8 @@ std::string FlightRecorder::Entry::getTraceback() { for (auto idx : c10::irange(s_tb.size())) { auto frame_id = s_tb[idx]; const auto& frame = s_tbs.all_frames.at(frame_id); - oss << "#" << idx << " " << frame.funcname << " from " << frame.filename - << ":" << 
frame.lineno << '\n'; + oss << '#' << idx << ' ' << frame.funcname << " from " << frame.filename + << ':' << frame.lineno << '\n'; } /* Resulted format is like: #0 all_reduce from pytorch/torch/distributed/distributed_c10d.py:2696 diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index e99d9b0cf8558..8ae3bf3b314f3 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -2016,7 +2016,7 @@ void ProcessGroupNCCL::HeartbeatMonitor::runLoop() { << pg_->logPrefix() << "ProcessGroupNCCL monitor thread is disabled, but would have terminated the process" << "after attempting to dump debug info, due to " << exitReason - << "."; + << '.'; } } } diff --git a/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp b/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp index 624a8fc11b615..fa40ff15ec74f 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp @@ -174,7 +174,7 @@ struct CollectiveFingerPrint { ss << "Detected mismatch between collectives on ranks. Rank " << backend->getRank() << " is running collective: " << *this << ", but Rank " << rank - << " is running collective: " << rank_fingerprint << "."; + << " is running collective: " << rank_fingerprint << '.'; auto diff_result = compute_collective_diff(rank_fingerprint); if (std::get<0>(diff_result)) { ss << std::get<1>(diff_result); diff --git a/torch/csrc/distributed/c10d/UCCTracing.cpp b/torch/csrc/distributed/c10d/UCCTracing.cpp index 66d62d662c259..78fac30d2ab9f 100644 --- a/torch/csrc/distributed/c10d/UCCTracing.cpp +++ b/torch/csrc/distributed/c10d/UCCTracing.cpp @@ -51,7 +51,7 @@ void ProcessGroupUCCLogger::flushComms(int rank, int world_size) { _outfile.open(trace_filename, std::ofstream::out | std::ofstream::trunc); // flush the traced comms if (_outfile.is_open()) { - _outfile << "[" << c10::Join(",", trace_generator->getCommsTrace()) + _outfile << '[' << c10::Join(",", trace_generator->getCommsTrace()) << "\n]"; _outfile.flush(); _outfile.close(); diff --git a/torch/csrc/distributed/c10d/UCCUtils.cpp b/torch/csrc/distributed/c10d/UCCUtils.cpp index 6794c4eaa594f..9e297ad339fa6 100644 --- a/torch/csrc/distributed/c10d/UCCUtils.cpp +++ b/torch/csrc/distributed/c10d/UCCUtils.cpp @@ -35,7 +35,7 @@ ucc_status_t oob_allgather( *req = coll_info; } catch (std::exception& ex) { LOG(ERROR) << "(oob_allgather) Caught exception in Store Operation .. " - << "[" << ex.what() << "]"; + << '[' << ex.what() << ']'; return UCC_ERR_NO_MESSAGE; } return UCC_OK; @@ -61,7 +61,7 @@ ucc_status_t oob_allgather_test(void* req) { } } catch (std::exception& ex) { LOG(ERROR) << "(oob_allgather) Caught exception in Store Operation .. " - << "[" << ex.what() << "]"; + << '[' << ex.what() << ']'; return UCC_ERR_NO_MESSAGE; } return UCC_OK; @@ -91,7 +91,7 @@ ucc_status_t oob_allgather_free(void* req) { info->getKey(kAllGatherFree + std::to_string(info->rank))); } catch (std::exception& ex) { LOG(ERROR) << "(oob_allgather) Caught exception in Store Operation .. 
" - << "[" << ex.what() << "]"; + << '[' << ex.what() << ']'; return UCC_ERR_NO_MESSAGE; } return UCC_OK; diff --git a/torch/csrc/distributed/c10d/Utils.hpp b/torch/csrc/distributed/c10d/Utils.hpp index fc9d735401c73..25193b54af9fd 100644 --- a/torch/csrc/distributed/c10d/Utils.hpp +++ b/torch/csrc/distributed/c10d/Utils.hpp @@ -48,14 +48,14 @@ TORCH_API std::vector getTensorShapes( // Turns at::IntArrayRef into "(1, 2, 3, 4)". inline std::string toString(at::IntArrayRef l) { std::stringstream ss; - ss << "("; + ss << '('; for (const auto i : c10::irange(l.size())) { if (i > 0) { ss << ", "; } ss << l[i]; } - ss << ")"; + ss << ')'; return ss.str(); } diff --git a/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp b/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp index 2f77bb119a956..8bbe857620790 100644 --- a/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp +++ b/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp @@ -87,17 +87,17 @@ WorkerServer::WorkerServer(const std::string& hostOrFile, int port) { "/handler/", [](const httplib::Request& req [[maybe_unused]], httplib::Response& res) { std::ostringstream body; - body << "["; + body << '['; bool first = true; for (const auto& name : getHandlerNames()) { if (!first) { - body << ","; + body << ','; } first = false; - body << "\"" << jsonStrEscape(name) << "\""; + body << '"' << jsonStrEscape(name) << '"'; } - body << "]"; + body << ']'; res.set_content(body.str(), "application/json"); }); diff --git a/torch/csrc/distributed/c10d/logger.cpp b/torch/csrc/distributed/c10d/logger.cpp index 170748a60352b..c9ef7262f8c8b 100644 --- a/torch/csrc/distributed/c10d/logger.cpp +++ b/torch/csrc/distributed/c10d/logger.cpp @@ -215,10 +215,10 @@ void Logger::set_construction_data_and_log( ddp_logging_data_->ints_map["rank"]); std::stringstream ddpLoggingDataInfo; for (const auto& intItem : ddp_logging_data_->ints_map) { - ddpLoggingDataInfo << intItem.first << ": " << intItem.second << "\n"; + ddpLoggingDataInfo << intItem.first << ": " << intItem.second << '\n'; } for (const auto& strItem : ddp_logging_data_->strs_map) { - ddpLoggingDataInfo << strItem.first << ": " << strItem.second << "\n"; + ddpLoggingDataInfo << strItem.first << ": " << strItem.second << '\n'; } LOG(INFO) << initInfo << ddpLoggingDataInfo.str(); } diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index 10a2251754cde..a1c9b4a3039d5 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -615,8 +615,8 @@ void Reducer::delay_all_reduce() { param_name != param_names_.end(), "Expected to find parameter name from unused parameters map in debug mode."); // Add the param_name - unused_params_stream << "{" << param_name->second << "," << unused_index - << "}"; + unused_params_stream << '{' << param_name->second << ',' << unused_index + << '}'; } // Each rank prints out all the unused parameters detected diff --git a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp index efec39e9eb72c..e246620df31e8 100644 --- a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp +++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp @@ -61,7 +61,7 @@ class StoreExchange { peer_keys.reserve(world_size); for (int r = 0; r < world_size; ++r) { std::ostringstream oss; - oss << store_prefix_ << "/" << seq_id_ << "/" << r; + oss << store_prefix_ << '/' << seq_id_ << '/' << 
r; peer_keys.push_back(oss.str()); } ++seq_id_; diff --git a/torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.cpp b/torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.cpp index 0d54c389ddee6..44a19e96deeab 100644 --- a/torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.cpp +++ b/torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.cpp @@ -7,7 +7,7 @@ std::string get_detector_key( c10::DeviceType device_type, const std::string& connection_type) { std::ostringstream oss; - oss << device_type << "/" << connection_type; + oss << device_type << '/' << connection_type; return oss.str(); } diff --git a/torch/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu b/torch/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu index 0eda605fad6fb..c099e2d72ecfd 100644 --- a/torch/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu +++ b/torch/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu @@ -207,7 +207,7 @@ class NCCLSymmetricMemoryAllocator : public SymmetricMemoryAllocator { auto buffer_size_map = storeExchange.all_gather(group_info.store, group_info.rank, group_info.world_size, it->second->buffer_size); - LOG(INFO) << "[rank " << group_info.rank << "]" + LOG(INFO) << "[rank " << group_info.rank << ']' << "buffer_size_map: " << buffer_size_map; // NCCL window registration api requires all ranks to have the same buffer size // we have this check to make sure all ranks have the same buffer size. diff --git a/torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu b/torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu index 69e75df453f51..510f5c4dd1b32 100644 --- a/torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu +++ b/torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu @@ -71,7 +71,7 @@ class NVSHMEMPeerAllocInfo : public c10::intrusive_ptr_target { storeExchange.all_gather(store, rank_, world_size_, global_rank); exchanged_n_times++; if (rank_ == 0) { - LOG(INFO) << "[rank " << rank_ << "]" + LOG(INFO) << "[rank " << rank_ << ']' << " rank_to_global_rank: " << group_info.rank_to_global_rank << ", group_name: " << group_name << ", exchanged_n_times: " << exchanged_n_times; diff --git a/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp b/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp index 0d53d100cee7d..f62577e701847 100644 --- a/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp +++ b/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp @@ -121,7 +121,7 @@ static std::vector storeAllGather( std::vector peerKeys; for (size_t r = 0; r < worldSize; ++r) { std::ostringstream oss; - oss << prefix << "-" << r; + oss << prefix << '-' << r; peerKeys.push_back(oss.str()); } @@ -187,7 +187,7 @@ bool IntraNodeComm::rendezvous() { if (strcmp(info.hostname, peerDevInfos.front().hostname) != 0) { LOG(WARNING) << "Aborting IntraNodeComm::rendezvous because some " "participants are not on the same host (" - << info.hostname << ", " << devInfo.hostname << ")"; + << info.hostname << ", " << devInfo.hostname << ')'; return false; } rankToDeviceIdx.emplace_back(info.deviceIdx); diff --git a/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu b/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu index cb5d40ef41837..a7a87e4bd8627 100644 --- a/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu +++ b/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu @@ -57,7 +57,7 @@ bool is_nvshmem_available() { // Open the shared library, RTLD_LAZY defers symbol resolution until needed handle = dlopen("libnvshmem_host.so.3", 
RTLD_LAZY); if (!handle) { - std::cerr << dlerror() << "\n"; + std::cerr << dlerror() << '\n'; is_available = 0; } else { is_available = 1; diff --git a/torch/csrc/distributed/rpc/rpc_agent.cpp b/torch/csrc/distributed/rpc/rpc_agent.cpp index 9eee15bdc4d88..a41969ebc1293 100644 --- a/torch/csrc/distributed/rpc/rpc_agent.cpp +++ b/torch/csrc/distributed/rpc/rpc_agent.cpp @@ -326,7 +326,7 @@ std::unordered_map RpcAgent::getDebugInfo() { std::ostream& operator<<(std::ostream& os, const WorkerInfo& workerInfo) { return os << "WorkerInfo(id=" << workerInfo.id_ - << ", name=" << workerInfo.name_ << ")"; + << ", name=" << workerInfo.name_ << ')'; } } // namespace torch::distributed::rpc diff --git a/torch/csrc/distributed/rpc/rref_impl.cpp b/torch/csrc/distributed/rpc/rref_impl.cpp index ecf3cbd999104..59087eb3e6a4e 100644 --- a/torch/csrc/distributed/rpc/rref_impl.cpp +++ b/torch/csrc/distributed/rpc/rref_impl.cpp @@ -290,12 +290,12 @@ void OwnerRRef::setError(std::exception_ptr eptr) { std::ostream& operator<<(std::ostream& os, const RRef& rref) { if (rref.isOwner()) { return os << "OwnerRRef(" - << "rref_id=" << rref.rrefId() << ")"; + << "rref_id=" << rref.rrefId() << ')'; } else { return os << "UserRRef(" << "rref_id=" << rref.rrefId() << ", fork_id=" << static_cast(&rref)->forkId() - << ")"; + << ')'; } } diff --git a/torch/csrc/distributed/rpc/types.cpp b/torch/csrc/distributed/rpc/types.cpp index 8a3a18e96a264..1a19fa4708273 100644 --- a/torch/csrc/distributed/rpc/types.cpp +++ b/torch/csrc/distributed/rpc/types.cpp @@ -83,7 +83,7 @@ GloballyUniqueId GloballyUniqueId::fromIValue(const at::IValue& ivalue) { std::ostream& operator<<(std::ostream& os, GloballyUniqueId const& globalId) { return os << "GloballyUniqueId(created_on=" << globalId.createdOn_ - << ", local_id=" << globalId.localId_ << ")"; + << ", local_id=" << globalId.localId_ << ')'; } /////////////////////////// SerializedPyObj /////////////////////////// diff --git a/torch/csrc/dynamo/python_compiled_autograd.cpp b/torch/csrc/dynamo/python_compiled_autograd.cpp index 0e70be3e9ffc4..c24f2cffdd762 100644 --- a/torch/csrc/dynamo/python_compiled_autograd.cpp +++ b/torch/csrc/dynamo/python_compiled_autograd.cpp @@ -434,10 +434,10 @@ struct VerboseLogger : public PythonLogger { } oss << it->key_size; if (std::next(it) != cached_keys.end()) { - oss << ","; + oss << ','; } } - oss << "]"; + oss << ']'; std::string compile_reason = oss.str(); log(PythonLogger::DEBUG, compile_reason); return compile_reason; @@ -454,7 +454,7 @@ struct VerboseLogger : public PythonLogger { } oss << "sizes[" << std::to_string(new_dyn_sizes_idx[new_dyn_sizes_idx.size() - 1]) - << "]"; + << ']'; std::string recompile_reason = oss.str(); log(PythonLogger::DEBUG, recompile_reason); return recompile_reason; diff --git a/torch/csrc/export/upgrader.cpp b/torch/csrc/export/upgrader.cpp index 04da1ab2a2d28..ec275593e6ff4 100644 --- a/torch/csrc/export/upgrader.cpp +++ b/torch/csrc/export/upgrader.cpp @@ -78,7 +78,7 @@ void registerUpgrader( << " and keypath: "; for (size_t i = 0; i < keypath.size(); ++i) { if (i > 0) - error_stream << "."; + error_stream << '.'; error_stream << keypath[i]; } TORCH_CHECK(false, error_stream.str()); diff --git a/torch/csrc/inductor/aoti_eager/kernel_meta_info.cpp b/torch/csrc/inductor/aoti_eager/kernel_meta_info.cpp index 1642ee4beca01..25cd32b6b52fe 100644 --- a/torch/csrc/inductor/aoti_eager/kernel_meta_info.cpp +++ b/torch/csrc/inductor/aoti_eager/kernel_meta_info.cpp @@ -100,12 +100,12 @@ std::ostream& operator<<( stream << 
"device_: " << tensor_metadata.device_ << '\n'; stream << "sizes_: "; for (const auto& size : tensor_metadata.sizes_) { - stream << size << " "; + stream << size << ' '; } stream << '\n'; stream << "strides_: "; for (const auto& stride : tensor_metadata.strides_) { - stream << stride << " "; + stream << stride << ' '; } stream << "requires_grad_: " << tensor_metadata.requires_grad_ << '\n'; diff --git a/torch/csrc/inductor/aoti_package/model_package_loader.cpp b/torch/csrc/inductor/aoti_package/model_package_loader.cpp index 188f92557761d..ba5e1865ec14d 100644 --- a/torch/csrc/inductor/aoti_package/model_package_loader.cpp +++ b/torch/csrc/inductor/aoti_package/model_package_loader.cpp @@ -593,7 +593,7 @@ AOTIModelPackageLoader::AOTIModelPackageLoader( } else { LOG(WARNING) << "You are using an outdated version of the pt2 archive which do not have a prefix in front of each filename. Example: \n" - << found_filenames[0] << "\n" + << found_filenames[0] << '\n' << found_filenames[1]; } diff --git a/torch/csrc/inductor/aoti_runtime/model_base.h b/torch/csrc/inductor/aoti_runtime/model_base.h index 19f1dca1b7e27..bf8f07edb1458 100644 --- a/torch/csrc/inductor/aoti_runtime/model_base.h +++ b/torch/csrc/inductor/aoti_runtime/model_base.h @@ -468,7 +468,7 @@ class AOTInductorModelBase { auto code = cudaEventDestroy(*run_finished_); if (code != cudaSuccess) { std::cerr << "Failed to destroy CUDA event in AOTInductor model: " - << cudaGetErrorString(code) << "\n"; + << cudaGetErrorString(code) << '\n'; } } #endif // USE_CUDA diff --git a/torch/csrc/inductor/aoti_torch/shim_common.cpp b/torch/csrc/inductor/aoti_torch/shim_common.cpp index 2df922109975a..d6db06af5f2cc 100644 --- a/torch/csrc/inductor/aoti_torch/shim_common.cpp +++ b/torch/csrc/inductor/aoti_torch/shim_common.cpp @@ -1261,7 +1261,7 @@ void aoti_torch_print_tensor_handle(AtenTensorHandle self, const char* msg) { at::Tensor* t = tensor_handle_to_tensor_pointer(self); // Display message - std::cout << "["; + std::cout << '['; if (msg) { std::cout << " " << msg; } @@ -1270,7 +1270,7 @@ void aoti_torch_print_tensor_handle(AtenTensorHandle self, const char* msg) { // Print exact tensor values for small size tensors const int64_t numel = t->numel(); if (numel <= AOTI_TORCH_MAX_NUMEL_TO_PRINT) { - std::cout << *t << "\n"; + std::cout << *t << '\n'; } // Print summary stats of the tensor @@ -1316,7 +1316,7 @@ void aoti_torch_print_tensor_handle(AtenTensorHandle self, const char* msg) { std::cout << "[INFO] Aten built-in function `min_all_cuda/max_all_cuda` not implemented for current dtype: " << t->dtype() << ". 
Printing out the whole value:\n" - << *t << "\n"; + << *t << '\n'; } } } diff --git a/torch/csrc/jit/api/module.cpp b/torch/csrc/jit/api/module.cpp index 53be7504fe2c3..61c32680c7c0b 100644 --- a/torch/csrc/jit/api/module.cpp +++ b/torch/csrc/jit/api/module.cpp @@ -615,7 +615,7 @@ std::string Module::dump_to_str( print_method_bodies, print_attr_values, print_param_values)); } ss << " }" << '\n'; - ss << "}" << '\n'; + ss << '}' << '\n'; return ss.str(); } diff --git a/torch/csrc/jit/api/module.h b/torch/csrc/jit/api/module.h index c9b7793c89b6f..739eaf478f1e2 100644 --- a/torch/csrc/jit/api/module.h +++ b/torch/csrc/jit/api/module.h @@ -652,7 +652,7 @@ struct NamedPolicy { std::ostringstream ss; for (const auto i : c10::irange(cursors.size())) { if (i > 0) { - ss << "."; + ss << '.'; } ss << nameFragment(cursors[i]); } diff --git a/torch/csrc/jit/backends/backend_detail.cpp b/torch/csrc/jit/backends/backend_detail.cpp index de352f50ab503..2edf832e04262 100644 --- a/torch/csrc/jit/backends/backend_detail.cpp +++ b/torch/csrc/jit/backends/backend_detail.cpp @@ -305,8 +305,8 @@ Module codegen_backend_module( TORCH_INTERNAL_ASSERT(default_value.has_value()); std::stringstream def_ss, fwd_ss; // Annotate type of the arg - def_ss << name << ": " << arg.type()->annotation_str(nullptr) << "="; - fwd_ss << name << "=" << name; + def_ss << name << ": " << arg.type()->annotation_str(nullptr) << '='; + fwd_ss << name << '=' << name; default_value->repr( def_ss, [](std::ostream&, const IValue&) -> bool { return false; }); def_inputs.emplace_back(def_ss.str()); @@ -337,18 +337,18 @@ Module codegen_backend_module( if (out_tuple_ty) { auto tuple_elements = out_tuple_ty->elements(); - type_check_ss << tuple_elements[0]->annotation_str() << ")"; + type_check_ss << tuple_elements[0]->annotation_str() << ')'; type_checks.emplace_back(type_check_ss.str()); for (unsigned i = 1, e = tuple_elements.size(); i < e; ++i) { type_check_ss.str(std::string()); type_check_ss.clear(); out_ss << ", _" << i; type_check_ss << "assert isinstance(_" << i << ", " - << tuple_elements[i]->annotation_str() << ")"; + << tuple_elements[i]->annotation_str() << ')'; type_checks.emplace_back(type_check_ss.str()); } } else { - type_check_ss << out_ty->annotation_str() << ")"; + type_check_ss << out_ty->annotation_str() << ')'; type_checks.emplace_back(type_check_ss.str()); } @@ -364,7 +364,7 @@ Module codegen_backend_module( // If the output type is a single element tuple then add an extra comma // to ensure the final output maintains this type. 
if (out_tuple_ty && out_tuple_ty->elements().size() == 1) { - out_ss << ","; + out_ss << ','; } method_te.s("ret", out_ss.str()); diff --git a/torch/csrc/jit/codegen/fuser/tensor_desc.h b/torch/csrc/jit/codegen/fuser/tensor_desc.h index 0c5db65d54ad1..55cd4008e1814 100644 --- a/torch/csrc/jit/codegen/fuser/tensor_desc.h +++ b/torch/csrc/jit/codegen/fuser/tensor_desc.h @@ -88,10 +88,10 @@ struct TORCH_API TensorDesc { }; inline std::ostream& operator<<(std::ostream& out, const TensorDesc& d) { - out << d.scalar_type << "["; + out << d.scalar_type << '['; for (const auto b : d.contiguity) - out << b << ";"; - out << "]"; + out << b << ';'; + out << ']'; return out; } diff --git a/torch/csrc/jit/frontend/concrete_module_type.cpp b/torch/csrc/jit/frontend/concrete_module_type.cpp index 91d41607f9df9..1cb5fb225dc92 100644 --- a/torch/csrc/jit/frontend/concrete_module_type.cpp +++ b/torch/csrc/jit/frontend/concrete_module_type.cpp @@ -305,39 +305,37 @@ void ConcreteModuleTypeBuilder::addIgnoredAttribute(std::string name) { void ConcreteModuleType::dump() const { std::cout << "ConcreteModuleType for: " - << py::getattr(data_.pyClass_, "__name__") << "\n"; + << py::getattr(data_.pyClass_, "__name__") << '\n'; std::cout << "Constants: \n"; for (const auto& pr : data_.constants_) { - std::cout << "\t" << pr.first << ": " << pr.second << "\n"; + std::cout << '\t' << pr.first << ": " << pr.second << '\n'; } std::cout << "\nAttributes: \n"; for (const auto& pr : data_.attributes_) { - std::cout << "\t" << pr.key() << ": " << pr.value().type_->annotation_str() - << "\n"; + std::cout << '\t' << pr.key() << ": " << pr.value().type_->annotation_str() + << '\n'; } std::cout << "\nSubmodules: \n"; for (const auto& info : data_.modules_) { - std::cout << "\t" << info.name_ << ": " - << info.meta_->getJitType()->annotation_str() << "\n"; + std::cout << '\t' << info.name_ << ": " + << info.meta_->getJitType()->annotation_str() << '\n'; } std::cout << "\nForward Pre-Hooks: \n"; for (const auto& pre_hook_id : data_.forwardPreHooks_) { - std::cout << "\t" - << "pre_hook id: " << pre_hook_id << "\n"; + std::cout << '\t' << "pre_hook id: " << pre_hook_id << '\n'; } std::cout << "\nForward Hooks: \n"; for (const auto& hook_id : data_.forwardHooks_) { - std::cout << "\t" - << "hook id: " << hook_id << "\n"; + std::cout << '\t' << "hook id: " << hook_id << '\n'; } std::cout << "\nOverloads: \n"; for (const auto& pr : data_.overloads_) { - std::cout << "\t" << pr.first << ": " << pr.second << "\n"; + std::cout << '\t' << pr.first << ": " << pr.second << '\n'; } std::string isPoisoned = data_.isPoisoned_ ? 
"true" : "false"; - std::cout << "isPoisoned: " << isPoisoned << "\n"; + std::cout << "isPoisoned: " << isPoisoned << '\n'; if (jitType_) { - std::cout << "jit type: " << jitType_->annotation_str() << "\n"; + std::cout << "jit type: " << jitType_->annotation_str() << '\n'; } } diff --git a/torch/csrc/jit/frontend/error_report.cpp b/torch/csrc/jit/frontend/error_report.cpp index d5a8408e971c0..47a9343c5387f 100644 --- a/torch/csrc/jit/frontend/error_report.cpp +++ b/torch/csrc/jit/frontend/error_report.cpp @@ -99,7 +99,7 @@ std::string ErrorReport::current_call_stack() { const char* ErrorReport::what() const noexcept { std::stringstream msg; - msg << "\n" << ss.str(); + msg << '\n' << ss.str(); msg << ":\n"; context.highlight(msg); diff --git a/torch/csrc/jit/frontend/ir_emitter.cpp b/torch/csrc/jit/frontend/ir_emitter.cpp index e7949b0ac4bee..fba613b5ea8f7 100644 --- a/torch/csrc/jit/frontend/ir_emitter.cpp +++ b/torch/csrc/jit/frontend/ir_emitter.cpp @@ -421,7 +421,7 @@ struct Environment { "of another type (torch.jit.annotate(List[T, []]) where T " "is the type of elements in the list for Python 2)"; } - error << "\n" << why_not.str(); + error << '\n' << why_not.str(); throw ErrorReport(error); } } @@ -842,7 +842,7 @@ struct to_ir { throw( ErrorReport(def.decl().params().range()) << "Number of type annotations for" - << " function parameters (" << schema.arguments().size() << ")" + << " function parameters (" << schema.arguments().size() << ')' << " does not match the number of parameters on the function (" << expected_annotation_size << ")!"); } @@ -3452,7 +3452,7 @@ struct to_ir { throw( ErrorReport(apply.inputs()) << "expected an expression of type " << type->repr_str() - << " but found " << expr->type()->repr_str() << "\n" + << " but found " << expr->type()->repr_str() << '\n' << why_not.str()); } @@ -3828,13 +3828,13 @@ struct to_ir { if (!is_key_subtype) { err << "Generated key type " << key_type->repr_str() << " did not match the annotated key type, which was " - << annotated_k_type->repr_str() << "\n"; + << annotated_k_type->repr_str() << '\n'; } if (!is_value_subtype) { err << "Generated value type " << value_type->repr_str() << " did not match the annotated value type, which was " - << annotated_v_type->repr_str() << "\n" + << annotated_v_type->repr_str() << '\n' << ss.str(); } diff --git a/torch/csrc/jit/frontend/parser.cpp b/torch/csrc/jit/frontend/parser.cpp index ef49c15bab24c..f56a392cc9327 100644 --- a/torch/csrc/jit/frontend/parser.cpp +++ b/torch/csrc/jit/frontend/parser.cpp @@ -23,7 +23,7 @@ Decl mergeTypesFromTypeComment( << type_annotation_decl.params().size() << ") did not match the number of " << (is_method ? 
"method" : "function") << " parameters (" - << expected_num_annotations << ")"; + << expected_num_annotations << ')'; } auto old = decl.params(); auto _new = type_annotation_decl.params(); diff --git a/torch/csrc/jit/frontend/schema_matching.cpp b/torch/csrc/jit/frontend/schema_matching.cpp index f191c7daf6e26..d866e4f434448 100644 --- a/torch/csrc/jit/frontend/schema_matching.cpp +++ b/torch/csrc/jit/frontend/schema_matching.cpp @@ -364,7 +364,7 @@ static std::optional tryMatchSchema( } auto err = [&]() -> std::ostream& { - *failure_messages << "\n" << schema << ":\n"; + *failure_messages << '\n' << schema << ":\n"; return *failure_messages; }; @@ -751,7 +751,7 @@ Value* emitBuiltinCall( } else { error << "Here are some suggestions: \n"; for (const auto& sym : close_symbols) { - error << "\t" << sym.toQualString() << "\n"; + error << '\t' << sym.toQualString() << '\n'; } error << "\nThe original call is"; } diff --git a/torch/csrc/jit/frontend/source_range.cpp b/torch/csrc/jit/frontend/source_range.cpp index 89815d386ac05..b9263ad08978f 100644 --- a/torch/csrc/jit/frontend/source_range.cpp +++ b/torch/csrc/jit/frontend/source_range.cpp @@ -310,7 +310,7 @@ void SourceRange::print_with_context( if (!funcname.empty()) { out << ", in " << funcname; } - out << "\n"; + out << '\n'; } // print out initial context out << str.substr(begin_context, start() - begin_context); @@ -327,7 +327,7 @@ void SourceRange::print_with_context( auto actual_line = str.substr(line_start, (line_end - line_start) + 1); out << actual_line; if (actual_line.back() != '\n') { - out << "\n"; + out << '\n'; } size_t empty_space = 0; @@ -377,7 +377,7 @@ void SourceRange::print_with_context( auto line_substr = str.substr(line_end, end_context - line_end); out << line_substr; if (!line_substr.empty() && line_substr.back() != '\n') { - out << "\n"; + out << '\n'; } } } diff --git a/torch/csrc/jit/frontend/tree.h b/torch/csrc/jit/frontend/tree.h index 12e75ec41c69d..a11f196c5ac0c 100644 --- a/torch/csrc/jit/frontend/tree.h +++ b/torch/csrc/jit/frontend/tree.h @@ -93,9 +93,9 @@ struct Tree : c10::intrusive_ptr_target { if (trees().size() < expected_subtrees || (!allow_more && trees().size() != expected_subtrees)) { std::stringstream ss; - ss << filename << ":" << lineno << ": expected at least " + ss << filename << ':' << lineno << ": expected at least " << expected_subtrees << " subtrees, but found only " << trees().size() - << "\n"; + << '\n'; range().highlight(ss); TORCH_CHECK(false, ss.str()); } @@ -184,11 +184,11 @@ struct pretty_tree { out << t->stringValue(); break; default: - out << "(" << kindToString(t->kind()); + out << '(' << kindToString(t->kind()); for (const auto& e : t->trees()) { - out << " " << get_flat(e); + out << ' ' << get_flat(e); } - out << ")"; + out << ')'; break; } auto it_ = flat_strings.emplace(t, out.str()); @@ -201,12 +201,12 @@ struct pretty_tree { return; } std::string k = kindToString(t->kind()); - out << "(" << k; + out << '(' << k; for (const auto& e : t->trees()) { - out << "\n" << std::string(indent + 2, ' '); + out << '\n' << std::string(indent + 2, ' '); print(out, e, indent + 2); } - out << ")"; + out << ')'; } }; diff --git a/torch/csrc/jit/ir/alias_analysis.cpp b/torch/csrc/jit/ir/alias_analysis.cpp index 16edf669da9be..ac99385401be4 100644 --- a/torch/csrc/jit/ir/alias_analysis.cpp +++ b/torch/csrc/jit/ir/alias_analysis.cpp @@ -419,14 +419,14 @@ std::string AliasDb::getElementName(const Element* e) const { } else { std::ostringstream ss; if (e->values.size() == 1) { - ss << "%" << 
(*e->values.begin())->debugName(); + ss << '%' << (*e->values.begin())->debugName(); return ss.str(); } - ss << "("; + ss << '('; for (const Value* v : e->values) { - ss << "%" << v->debugName() << ", "; + ss << '%' << v->debugName() << ", "; } - ss << ")"; + ss << ')'; return ss.str(); } } @@ -454,7 +454,7 @@ std::string AliasDb::toString() const { ++ct; ss << getElementName(memoryDAG_->fromIndex(pointedTo)); } - ss << "\n"; + ss << '\n'; } ct = 0; if (!element->containedElements.empty()) { @@ -466,7 +466,7 @@ std::string AliasDb::toString() const { } ++ct; } - ss << "\n"; + ss << '\n'; } } @@ -479,9 +479,9 @@ std::string AliasDb::toString() const { for (const auto value : values) { ss << getElementName(memoryDAG_->fromIndex(value)) << ", "; } - ss << "\n"; + ss << '\n'; } - ss << "\n"; + ss << '\n'; return ss.str(); } @@ -511,7 +511,7 @@ std::string AliasDb::toGraphviz() const { } else { std::ostringstream ss; if (e->values.size() == 1) { - ss << "\"\\%" << (*e->values.begin())->debugName() << "\""; + ss << "\"\\%" << (*e->values.begin())->debugName() << '"'; return ss.str(); } ss << "\"("; @@ -538,7 +538,7 @@ std::string AliasDb::toGraphviz() const { if (!element->pointsTo.empty()) { for (const auto pointedTo : element->pointsTo) { dot << " " << name(element) << " -> " - << name(memoryDAG_->fromIndex(pointedTo)) << "\n"; + << name(memoryDAG_->fromIndex(pointedTo)) << '\n'; } } if (!element->containedElements.empty()) { diff --git a/torch/csrc/jit/ir/ir.cpp b/torch/csrc/jit/ir/ir.cpp index 4368b3c8191d8..08bfe47382952 100644 --- a/torch/csrc/jit/ir/ir.cpp +++ b/torch/csrc/jit/ir/ir.cpp @@ -64,7 +64,7 @@ constexpr topo_position_t kMidPoint = 0; constexpr topo_position_t kAppendInterval = 1099511627776ULL /* 2^40 */; void printValueRef(std::ostream& out, const Value* n) { - out << "%" << n->debugName(); + out << '%' << n->debugName(); } bool isNumber(std::string_view str) { @@ -160,7 +160,7 @@ static void printAttribute(std::ostream& out, const at::Tensor& tensor) { // 1-elem tensors are usually boxed scalars, so print them like it if (tensor.numel() == 1) { auto scalar_tensor = tensor.view(std::vector{}).item(); - out << "{"; + out << '{'; if (scalar_tensor.isFloatingPoint()) { out << scalar_tensor.toDouble(); } else if (scalar_tensor.isComplex()) { @@ -168,7 +168,7 @@ static void printAttribute(std::ostream& out, const at::Tensor& tensor) { } else { out << scalar_tensor.toLong(); } - out << "}"; + out << '}'; } else if (tensor.numel() <= max_tensor_display_size) { // TODO: This is awful code. Also it doesn't work on Windows. 
std::ostringstream tensor_ss; @@ -191,7 +191,7 @@ static void printAttribute(std::ostream& out, const IValue& ival) { ss << "[]"; return true; } else if (input.isObject() && !input.type()->is_module()) { - ss << "object(" << &input.toObjectRef() << ")"; + ss << "object(" << &input.toObjectRef() << ')'; return true; } return false; @@ -202,14 +202,14 @@ static void printAttribute(std::ostream& out, const IValue& ival) { static void printTypeList( std::ostream& out, const std::vector& items) { - out << "["; + out << '['; int i = 0; for (auto& item : items) { if (i++ > 0) out << ", "; out << *item; } - out << "]"; + out << ']'; } void Node::printAttrValue(std::ostream& out, const Symbol& name) const { @@ -265,7 +265,7 @@ void Node::printAttrValue(std::ostream& out, const Symbol& name) const { void Node::printAttributes(std::ostream& out, bool ignore_subgraph = false) const { - out << "["; + out << '['; auto names = attributeNames(); int i = 0; for (auto name : names) { @@ -279,11 +279,11 @@ void Node::printAttributes(std::ostream& out, bool ignore_subgraph = false) // don't want to print the qualifier since it should always // be attribute, but you might be able to track down a weird // bug by printing it out. - out << name.toUnqualString() << "="; + out << name.toUnqualString() << '='; printAttrValue(out, name); } - out << "]"; + out << ']'; } SourceRange Node::sourceRange() const { @@ -313,11 +313,11 @@ std::ostream& Node::print( out << " = "; if (kind() == prim::PythonOp) { auto* pyOp = static_cast(this); - out << "^" << pyOp->name(); + out << '^' << pyOp->name(); printAttributes(out, /*ignore_subgraph=*/false); pyOp->writeScalars(out); } else if (hasAttribute(attr::Subgraph) && groups) { - out << kind().toQualString() << "_" << groups->size(); + out << kind().toQualString() << '_' << groups->size(); if (print_attributes && numAttributes() > 1 && kind() != prim::DifferentiableGraph) { printAttributes(out, /*ignore_subgraph=*/true); @@ -330,7 +330,7 @@ std::ostream& Node::print( printAttributes(out); } } - out << "(" << inputs() << ")"; + out << '(' << inputs() << ')'; if (print_scopes) { std::string scName = scopeName(); @@ -350,7 +350,7 @@ std::ostream& Node::print( } if (auto file_line_col = r.file_line_col()) { auto [filename, line, col] = *file_line_col; - out << " # " << filename << ":" << line << ":" << col; + out << " # " << filename << ':' << line << ':' << col; } } @@ -358,11 +358,11 @@ std::ostream& Node::print( return out; } - out << "\n"; + out << '\n'; for (const auto i : c10::irange(blocks().size())) { auto b = blocks()[i]; - indent(out, level + 1) << "block" << i << "(" + indent(out, level + 1) << "block" << i << '(' << const_value_list_with_types(b->inputs()) << "):\n"; for (auto nested : b->nodes()) { @@ -389,7 +389,7 @@ std::ostream& Graph::print(std::ostream& out, bool print_source_locations) out << " return (" << outputs() << ")\n"; size_t i = 0; for (auto fg : groups) { - out << "with " << fg->kind().toQualString() << "_" << i++ << " = " + out << "with " << fg->kind().toQualString() << '_' << i++ << " = " << *fg->g(attr::Subgraph); } out.flush(); @@ -397,7 +397,7 @@ std::ostream& Graph::print(std::ostream& out, bool print_source_locations) /* // Uncomment this to debug all_nodes issues { - out << "\n"; + out << '\n'; out << "all_nodes:\n"; for (auto& n : all_nodes) { printNode(out, const_cast(n), nullptr); @@ -654,7 +654,7 @@ void Graph::lint() const { } void Graph::dump() const { - std::cout << *this << "\n"; + std::cout << *this << '\n'; } void 
Graph::push_scope(const std::string& scope_name) { @@ -888,7 +888,7 @@ Value* Value::setDebugName(const std::string& name) { static std::locale c_locale("C"); ss.imbue(c_locale); #endif - ss << name_base << "." << suffix++; + ss << name_base << '.' << suffix++; replacement_name = ss.str(); } while (names.count(replacement_name) > 0); @@ -1069,7 +1069,7 @@ bool Node::mustBeNone() const { } void Node::dump() const { - std::cout << *this << "\n"; + std::cout << *this << '\n'; } const FunctionSchema& Node::schema() const { @@ -1106,7 +1106,7 @@ const Operator& Node::getOperator() const { auto er = ErrorReport(sourceRange()); er << "Schema not found for node. File a bug report.\n"; - er << "Node: " << *this << "\n"; + er << "Node: " << *this << '\n'; er << "Input types:"; for (const auto i : c10::irange(inputs().size())) { if (i > 0) @@ -1117,13 +1117,13 @@ const Operator& Node::getOperator() const { if (!candidates.empty()) { er << "\ncandidates were:\n"; for (auto& candidate : candidates) { - er << " " << candidate->schema() << "\n"; + er << " " << candidate->schema() << '\n'; } } else { er << "\nno candidates found\n"; } er << "within the graph:\n"; - er << *owningGraph() << "\n"; + er << *owningGraph() << '\n'; throw er; } diff --git a/torch/csrc/jit/jit_log.cpp b/torch/csrc/jit/jit_log.cpp index 8adf4c8aab10c..f2b237418627e 100644 --- a/torch/csrc/jit/jit_log.cpp +++ b/torch/csrc/jit/jit_log.cpp @@ -154,9 +154,9 @@ std::string jit_log_prefix( int l, const std::string& in_str) { std::stringstream prefix_ss; - prefix_ss << "["; - prefix_ss << level << " "; - prefix_ss << c10::filesystem::path(fn).filename() << ":"; + prefix_ss << '['; + prefix_ss << level << ' '; + prefix_ss << c10::filesystem::path(fn).filename() << ':'; prefix_ss << std::setfill('0') << std::setw(3) << l; prefix_ss << "] "; diff --git a/torch/csrc/jit/mobile/debug_info.cpp b/torch/csrc/jit/mobile/debug_info.cpp index 0a410a42fef04..be61d1d2ec57b 100644 --- a/torch/csrc/jit/mobile/debug_info.cpp +++ b/torch/csrc/jit/mobile/debug_info.cpp @@ -103,7 +103,7 @@ std::pair getStackTraceWithModuleHierarchy( std::get(last_entry); module_info.append(".").append(node_name); std::ostringstream ss; - ss << "Module hierarchy:" << module_info << "\n"; + ss << "Module hierarchy:" << module_info << '\n'; format_stack_trace(ss, stack_entries); return {ss.str(), std::move(module_info)}; } diff --git a/torch/csrc/jit/mobile/import_data.cpp b/torch/csrc/jit/mobile/import_data.cpp index 1bd34e4a823ae..7071a08daf6f4 100644 --- a/torch/csrc/jit/mobile/import_data.cpp +++ b/torch/csrc/jit/mobile/import_data.cpp @@ -138,7 +138,7 @@ c10::IValue IValueUnpickler::readArchive( auto read_record = [&](const std::string& name) { std::stringstream ss; - ss << archive_name << "/" << name; + ss << archive_name << '/' << name; return std::get<0>(reader_->getRecord(ss.str())); }; diff --git a/torch/csrc/jit/mobile/interpreter.cpp b/torch/csrc/jit/mobile/interpreter.cpp index b5e67cd83cbb2..41fc8d49efb16 100644 --- a/torch/csrc/jit/mobile/interpreter.cpp +++ b/torch/csrc/jit/mobile/interpreter.cpp @@ -95,11 +95,11 @@ bool InterpreterState::run(Stack& stack) { debug_handle = *handle; } - // std::cout << "RUNNING " << pc << " " << code.instructions_[pc]; + // std::cout << "RUNNING " << pc << ' ' << code.instructions_[pc]; // if (inst.op == OP) { // std::cout << ", " << code.op_names_[inst.X].name; // if (!code.op_names_[inst.X].overload_name.empty()) { - // std::cout << "." << code.op_names_[inst.X].overload_name; + // std::cout << '.' 
<< code.op_names_[inst.X].overload_name; // } // } // std::cout << std::endl; diff --git a/torch/csrc/jit/mobile/model_tracer/tracer.cpp b/torch/csrc/jit/mobile/model_tracer/tracer.cpp index b821e7dfcdcd7..c6a94dc8a1fb8 100644 --- a/torch/csrc/jit/mobile/model_tracer/tracer.cpp +++ b/torch/csrc/jit/mobile/model_tracer/tracer.cpp @@ -60,7 +60,7 @@ static void printOpYAML( bool is_used_for_training, bool is_root_operator, bool include_all_overloads) { - out << std::string(indent, ' ') << op_name << ":" << '\n'; + out << std::string(indent, ' ') << op_name << ':' << '\n'; out << std::string(indent + 2, ' ') << "is_used_for_training: " << (is_used_for_training ? "true" : "false") << '\n'; @@ -88,7 +88,7 @@ static void printDTypeYAML( const std::string& kernel_tag_name, const std::set& dtypes) { std::string indent_str = std::string(indent, ' '); - out << indent_str << kernel_tag_name << ":" << '\n'; + out << indent_str << kernel_tag_name << ':' << '\n'; for (auto& dtype : dtypes) { out << indent_str << "- " << dtype << '\n'; } diff --git a/torch/csrc/jit/passes/check_strict_fusion.cpp b/torch/csrc/jit/passes/check_strict_fusion.cpp index 41f60fa359132..731382c316398 100644 --- a/torch/csrc/jit/passes/check_strict_fusion.cpp +++ b/torch/csrc/jit/passes/check_strict_fusion.cpp @@ -73,7 +73,7 @@ static void checkForUnfusedOps(Node* enter_node) { std::stringstream ss; ss << "Found multiple fusions: \n"; for (Node* n : guarding_ifs) { - ss << *n << "\n"; + ss << *n << '\n'; } throw(ErrorReport(enter_node->input()->node()->sourceRange()) << ss.str()); } @@ -100,13 +100,13 @@ static void checkForUnfusedOps(Node* enter_node) { std::stringstream ss; ss << "Found unfused operators: \n"; for (Node* unfused : unfused_nodes_not_used_in_guard) { - ss << "\t"; + ss << '\t'; if (unfused->maybeSchema()) { ss << unfused->schema(); } else { unfused->kind().toDisplayString(); } - ss << "\n"; + ss << '\n'; } throw(ErrorReport(enter_node->input()->node()->sourceRange()) << ss.str()); } diff --git a/torch/csrc/jit/passes/liveness.cpp b/torch/csrc/jit/passes/liveness.cpp index c4a80872d61b4..138c6fc78f752 100644 --- a/torch/csrc/jit/passes/liveness.cpp +++ b/torch/csrc/jit/passes/liveness.cpp @@ -72,7 +72,7 @@ struct LivenessAnalyzer { std::cout << e.first->outputs()[0]->debugName(); } - std::cout << " " << e.first->kind().toQualString(); + std::cout << ' ' << e.first->kind().toQualString(); std::cout << " = "; dump(e.second); std::cout << '\n'; @@ -83,16 +83,16 @@ struct LivenessAnalyzer { void dump(const std::vector& set) { bool first = true; - std::cout << "["; + std::cout << '['; for (auto el : set) { if (first) { first = false; } else { std::cout << ", "; } - std::cout << el->debugName() << "(" << el->unique() << ")"; + std::cout << el->debugName() << '(' << el->unique() << ')'; } - std::cout << "]"; + std::cout << ']'; } private: diff --git a/torch/csrc/jit/passes/onnx.cpp b/torch/csrc/jit/passes/onnx.cpp index cddae77768228..d3231222cb935 100644 --- a/torch/csrc/jit/passes/onnx.cpp +++ b/torch/csrc/jit/passes/onnx.cpp @@ -292,7 +292,7 @@ void NodeToONNX( std::ostringstream ss; ss << "symbolic for " << op_name << " produced an incorrect number of outputs (expected "; - ss << num_old_outputs << ", but got " << outputs.size() << ")"; + ss << num_old_outputs << ", but got " << outputs.size() << ')'; throw std::runtime_error(ss.str()); } // For const node, it does not need params_dict info, so set it to {}. 
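[Editor's illustration] The hunks above and below are mechanical: wherever a one-character string literal is streamed (")" , "\n", "[", …), it is rewritten as a char literal (')' , '\n', '['), so the stream's operator<<(char) overload is selected instead of the const char* overload (which must scan for the terminating null). The output is byte-for-byte identical. A minimal, self-contained sketch of the before/after pattern follows; ExampleModule and its options are invented for illustration and do not appear in this patch.

    #include <iostream>
    #include <sstream>

    // Hypothetical pretty-printer mirroring the pattern applied by these hunks.
    void pretty_print_example(std::ostream& stream, int kernel_size, int stride) {
      // before: stream << ... << ")";   (const char* overload)
      // after:  stream << ... << ')';   (char overload, same output)
      stream << "ExampleModule(kernel_size=" << kernel_size
             << ", stride=" << stride << ')';
    }

    int main() {
      std::ostringstream ss;
      pretty_print_example(ss, 3, 1);
      ss << '\n';  // newline streamed as a char literal, matching the patch style
      std::cout << ss.str();  // prints: ExampleModule(kernel_size=3, stride=1)
      return 0;
    }

Because both forms print the same characters, pretty_print output and anything parsing it are unaffected; only the overload chosen at the call site changes.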
diff --git a/torch/csrc/jit/passes/onnx/constant_map.cpp b/torch/csrc/jit/passes/onnx/constant_map.cpp index e4ec14a5a0175..902dc5f8924cd 100644 --- a/torch/csrc/jit/passes/onnx/constant_map.cpp +++ b/torch/csrc/jit/passes/onnx/constant_map.cpp @@ -301,7 +301,7 @@ void ConstantValueMap::PrintMaps() { } } } - ss << " (rank = " << x.second << ")"; + ss << " (rank = " << x.second << ')'; std::cout << "node " << x.first << ": " << ss.str() << '\n'; } std::cout << '\n'; @@ -346,9 +346,9 @@ void ConstantValueMap::PrintMaps() { std::cout << "(node " << x.first << ": "; for (const auto& dim : x.second.dim()) { if (dim.has_dim_param()) { - std::cout << dim.dim_param() << " "; + std::cout << dim.dim_param() << ' '; } else { - std::cout << dim.dim_value() << " "; + std::cout << dim.dim_value() << ' '; } } std::cout << "), "; @@ -361,7 +361,7 @@ void ConstantValueMap::PrintMaps() { std::cout << "SymbolDim Map:" << '\n'; count = 0; for (const auto& x : ConstantValueMap::getInstance().symbolDimMap) { - std::cout << "(" << x.first << ": " << x.second << "), "; + std::cout << '(' << x.first << ": " << x.second << "), "; count++; if (count % 10 == 0) { std::cout << '\n'; @@ -370,7 +370,7 @@ void ConstantValueMap::PrintMaps() { std::cout << "DimSymbol Map:" << '\n'; count = 0; for (const auto& x : ConstantValueMap::getInstance().dimSymbolMap) { - std::cout << "(" << x.first << ": " << x.second << "), "; + std::cout << '(' << x.first << ": " << x.second << "), "; count++; if (count % 10 == 0) { std::cout << '\n'; diff --git a/torch/csrc/jit/passes/onnx/function_extraction.cpp b/torch/csrc/jit/passes/onnx/function_extraction.cpp index 7901b44bb85f5..fab3110954fde 100644 --- a/torch/csrc/jit/passes/onnx/function_extraction.cpp +++ b/torch/csrc/jit/passes/onnx/function_extraction.cpp @@ -250,7 +250,7 @@ void FunctionExtractor::DebugPrintScopeContexts( GRAPH_UPDATE("Children scopes: ", [&]() { std::stringstream ss; for (const auto& child_scope : it.second->children_) { - ss << child_scope->name().toDisplayString() << " "; + ss << child_scope->name().toDisplayString() << ' '; } return ss.str(); }()); diff --git a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp index a188eb0abd6b8..48f13499a5fc0 100644 --- a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp +++ b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp @@ -440,7 +440,7 @@ std::string InplaceConverter::ValueTracker::toString() const { ss << "Value[" << idx << "]: " << it.first->debugName() << '\n'; ss << " Mapping to "; for (auto v : it.second) { - ss << v->debugName() << " "; + ss << v->debugName() << ' '; } ss << '\n'; idx++; diff --git a/torch/csrc/jit/passes/symbolic_shape_analysis.cpp b/torch/csrc/jit/passes/symbolic_shape_analysis.cpp index 153408b350bf2..999f8247b7c84 100644 --- a/torch/csrc/jit/passes/symbolic_shape_analysis.cpp +++ b/torch/csrc/jit/passes/symbolic_shape_analysis.cpp @@ -152,11 +152,11 @@ static std::ostream& operator<<(std::ostream& os, const ShapeArguments& sa) { return os; } - os << "("; + os << '('; for (const auto i : c10::irange(sa.len())) { os << sa.at(i); } - os << ")"; + os << ')'; return os; } diff --git a/torch/csrc/jit/passes/utils/subgraph_utils.cpp b/torch/csrc/jit/passes/utils/subgraph_utils.cpp index f9fd65f9ce541..f54adbd7223a2 100644 --- a/torch/csrc/jit/passes/utils/subgraph_utils.cpp +++ b/torch/csrc/jit/passes/utils/subgraph_utils.cpp @@ -612,7 +612,7 @@ static std::string truncateStrWithHash(const std::string& s, size_t 
maxlen) { (maxlen > hash_str.size() + 1) ? (maxlen - hash_str.size() - 1) : maxlen; std::stringstream truncated; truncated << s.substr(0, trunc_len); - truncated << "_" << hash_str; + truncated << '_' << hash_str; return truncated.str(); } @@ -626,7 +626,7 @@ std::string generateNameForGraph( if (!node->kind().is_aten()) { continue; } - graph_name << "_" << node->kind().toUnqualString(); + graph_name << '_' << node->kind().toUnqualString(); } return truncateStrWithHash(graph_name.str(), maxlen); } diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index beb6f89519804..8dc4cb7ac9349 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -1798,7 +1798,7 @@ void initJITBindings(PyObject* module) { << "' with schema(s):\n"; for (const auto& op : sortedOps) { - docstring << " " << op->schema() << "\n"; + docstring << " " << op->schema() << '\n'; } py::list overload_names; diff --git a/torch/csrc/jit/python/python_arg_flatten.h b/torch/csrc/jit/python/python_arg_flatten.h index 232f5b6ea0812..472b257736491 100644 --- a/torch/csrc/jit/python/python_arg_flatten.h +++ b/torch/csrc/jit/python/python_arg_flatten.h @@ -79,17 +79,17 @@ static inline std::ostream& operator<<( out << ", "; out << meta.sizes[i]; } - out << "}"; + out << '}'; return out; } static inline std::ostream& operator<<( std::ostream& out, const IODescriptor& desc) { - out << desc.structure << "\n"; - out << " with grad_enabled=" << desc.grad_enabled << "\n"; + out << desc.structure << '\n'; + out << " with grad_enabled=" << desc.grad_enabled << '\n'; for (const auto i : c10::irange(desc.metadata.size())) { - out << " with v" << i << " having type " << desc.metadata[i] << "\n"; + out << " with v" << i << " having type " << desc.metadata[i] << '\n'; } return out; } diff --git a/torch/csrc/jit/python/python_ir.cpp b/torch/csrc/jit/python/python_ir.cpp index 88794ecbf3d73..6e5dcde957ddb 100644 --- a/torch/csrc/jit/python/python_ir.cpp +++ b/torch/csrc/jit/python/python_ir.cpp @@ -61,7 +61,7 @@ static std::ostream& printPyObject(std::ostream& out, const THPObjectPtr& obj) { // tuple.__str__; this doesn't work because Python doesn't allow // monkeypatching methods of built-in types. 
auto pytuple = pyobj.cast(); - out << "("; + out << '('; size_t i = 0; for (const auto& o : pytuple) { if (i > 0) { @@ -72,9 +72,9 @@ static std::ostream& printPyObject(std::ostream& out, const THPObjectPtr& obj) { i++; } if (i == 1) { - out << ","; + out << ','; } - out << ")"; + out << ')'; return out; } else { return out << THPUtils_unpackString(py::str(pyobj).ptr()); @@ -154,14 +154,14 @@ std::optional ConcretePythonOp::autogradFunction() const { } void ConcretePythonOp::writeScalars(std::ostream& out) const { - out << "("; + out << '('; int i = 0; for (auto& scalar : scalar_args) { if (i++ > 0) out << ", "; printPyObject(out, scalar); } - out << ")"; + out << ')'; } void ConcretePythonOp::lint_python() const { @@ -506,7 +506,7 @@ void initPythonIRBindings(PyObject* module_) { "__repr__", [](Value& n) { std::stringstream ss; - ss << n.debugName() << " defined in (" << *n.node() << ")"; + ss << n.debugName() << " defined in (" << *n.node() << ')'; return ss.str(); }) .VS(type) diff --git a/torch/csrc/jit/python/python_tracer.cpp b/torch/csrc/jit/python/python_tracer.cpp index 81da1605fcbe2..9210311997384 100644 --- a/torch/csrc/jit/python/python_tracer.cpp +++ b/torch/csrc/jit/python/python_tracer.cpp @@ -55,8 +55,8 @@ SourceRange getPythonInterpreterSourceRange() { if (src && src->filename()) { auto line = src->starting_line_no() + src->lineno_for_offset(range.start()); - stack_trace << *(src->filename()) << "(" << line - << "): " << entry.filename << "\n"; + stack_trace << *(src->filename()) << '(' << line + << "): " << entry.filename << '\n'; if (!source_filename) { source_filename = *(src->filename()); source_line = line; @@ -218,7 +218,7 @@ void initPythonTracerBindings(PyObject* module) { "__repr__", [](const TracingState& s) { std::ostringstream ss; - ss << ""; + ss << "'; return ss.str(); }) .def( diff --git a/torch/csrc/jit/python/script_init.cpp b/torch/csrc/jit/python/script_init.cpp index b9fbf4d1ec30f..ca75e6b986404 100644 --- a/torch/csrc/jit/python/script_init.cpp +++ b/torch/csrc/jit/python/script_init.cpp @@ -497,7 +497,7 @@ static bool ivalue_tags_match(const Module& lhs, const Module& rhs) { if (item.a.isPtrType()) { // uncomment to debug type matching errors // std::cout << "MATCHING " << /*item.a <<*/ "(" << *item.a.type() << ") " - // << item.a.internalToPointer() << " " << /*item.b <<*/ " (" + // << item.a.internalToPointer() << ' ' << /*item.b <<*/ " (" // << *item.b.type() << ") " << item.b.internalToPointer() << // "\n"; @@ -902,7 +902,7 @@ void initJitScriptBindings(PyObject* module) { std::stringstream err; err << "Tried to deepcopy object "; if (auto qualname = class_type->name()) { - err << qualname->qualifiedName() << " "; + err << qualname->qualifiedName() << ' '; } err << "which does not have a __setstate__ method defined!"; throw std::runtime_error(err.str()); @@ -912,7 +912,7 @@ void initJitScriptBindings(PyObject* module) { std::stringstream err; err << "Tried to deepcopy object "; if (auto qualname = self.type()->name()) { - err << qualname->qualifiedName() << " "; + err << qualname->qualifiedName() << ' '; } err << "which does not have a __getstate__ method defined!"; throw std::runtime_error(err.str()); @@ -929,7 +929,7 @@ void initJitScriptBindings(PyObject* module) { std::stringstream err; err << "Tried to serialize object "; if (auto qualname = self.type()->name()) { - err << qualname->qualifiedName() << " "; + err << qualname->qualifiedName() << ' '; } err << "which does not have a __getstate__ method defined!"; throw 
std::runtime_error(err.str()); @@ -966,7 +966,7 @@ void initJitScriptBindings(PyObject* module) { std::stringstream err; err << "Tried to deserialize object "; if (auto qualname = class_type->name()) { - err << qualname->qualifiedName() << " "; + err << qualname->qualifiedName() << ' '; } err << "which does not have a __setstate__ method defined!"; throw std::runtime_error(err.str()); diff --git a/torch/csrc/jit/runtime/argument_spec.cpp b/torch/csrc/jit/runtime/argument_spec.cpp index 0a50a64e5f1bf..667b94556f2b9 100644 --- a/torch/csrc/jit/runtime/argument_spec.cpp +++ b/torch/csrc/jit/runtime/argument_spec.cpp @@ -127,7 +127,7 @@ void ArgumentSpecCreator::dump() const { break; } } - std::cout << "\n"; + std::cout << '\n'; } ArgumentSpec ArgumentSpecCreator::create(bool with_grad, const Stack& input) diff --git a/torch/csrc/jit/runtime/argument_spec.h b/torch/csrc/jit/runtime/argument_spec.h index 1b4cf86a1963c..a7758f1674fab 100644 --- a/torch/csrc/jit/runtime/argument_spec.h +++ b/torch/csrc/jit/runtime/argument_spec.h @@ -402,12 +402,12 @@ inline std::ostream& operator<<(std::ostream& out, const ArgumentInfo& info) { } out << "Tensor(device=" << info.device() << ", type=" << toString(info.type()) << ", requires_grad=" << info.requires_grad() << ", dims=" << info.dim() - << ")"; + << ')'; return out; } inline std::ostream& operator<<(std::ostream& out, const ArgumentSpec& spec) { - out << "{"; + out << '{'; for (const auto i : c10::irange(spec.numTensors())) { if (i > 0) out << ", "; @@ -419,7 +419,7 @@ inline std::ostream& operator<<(std::ostream& out, const ArgumentSpec& spec) { out << ", "; out << spec.isPresent(i); } - out << "}"; + out << '}'; return out; } @@ -431,20 +431,20 @@ inline std::ostream& operator<<( } out << "Tensor(device=" << info.device() << ", type=" << toString(info.type()) << ", requires_grad=" << info.requires_grad() - << ", sizes=" << info.sizes() << ", strides=" << info.strides() << ")"; + << ", sizes=" << info.sizes() << ", strides=" << info.strides() << ')'; return out; } inline std::ostream& operator<<( std::ostream& out, const CompleteArgumentSpec& spec) { - out << "{"; + out << '{'; for (const auto i : c10::irange(spec.size())) { if (i > 0) out << ", "; out << spec.at(i); } - out << "}"; + out << '}'; return out; } diff --git a/torch/csrc/jit/runtime/instruction.cpp b/torch/csrc/jit/runtime/instruction.cpp index b591bf7318b40..7388b8eac6700 100644 --- a/torch/csrc/jit/runtime/instruction.cpp +++ b/torch/csrc/jit/runtime/instruction.cpp @@ -47,10 +47,10 @@ std::ostream& operator<<(std::ostream& out, Instruction inst) { auto nargs = std::strlen(OpInfo(inst.op)); out << inst.op; if (nargs > 0) { - out << " " << inst.X; + out << ' ' << inst.X; } if (nargs > 1) { - out << " " << inst.N; + out << ' ' << inst.N; } return out; } diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index 9d4d681f8b32f..95b74376d2eb2 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -213,7 +213,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { out << "Stack:\n"; for (const auto& val : stack) { out << val; - out << "\n"; + out << '\n'; } } @@ -929,7 +929,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { python_class_name ? 
*python_class_name : "RuntimeError"; ss << "The following operation failed in the TorchScript interpreter.\n"; formatStackTrace(ss); - ss << class_name << ": " << msg << "\n"; + ss << class_name << ": " << msg << '\n'; if (future_) { future_->setError(std::make_exception_ptr(Future::FutureError(ss.str()))); } else if (is_jit_exception) { @@ -942,7 +942,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { not_implemented_error->caller()); } else { if (get_cpp_stacktraces_enabled()) { - ss << e.what() << "\n"; + ss << e.what() << '\n'; } throw std::runtime_error(ss.str()); } @@ -1143,7 +1143,7 @@ std::vector currentModuleHierarchy() { } std::ostream& operator<<(std::ostream& out, const Code& code) { - out << *code.pImpl->graph_ << "\n"; + out << *code.pImpl->graph_ << '\n'; code.pImpl->dump(out); return out; } diff --git a/torch/csrc/jit/runtime/interpreter/code_impl.h b/torch/csrc/jit/runtime/interpreter/code_impl.h index 02e64d1961513..3eddaec0dece4 100644 --- a/torch/csrc/jit/runtime/interpreter/code_impl.h +++ b/torch/csrc/jit/runtime/interpreter/code_impl.h @@ -866,17 +866,17 @@ struct CodeImpl { } void dump(std::ostream& out, size_t i) const { - out << i << " " << instructions_[i]; + out << i << ' ' << instructions_[i]; if (instructions_[i].op == OP || instructions_[i].op == CALL || instructions_[i].op == OPN) { out << " # " << *instructions_source_[i]; } else { - out << "\n"; + out << '\n'; } } void dump(std::ostream& out) const { - out << *graph_ << "\n"; + out << *graph_ << '\n'; for (const auto i : c10::irange(instructions_.size())) { dump(out, i); } diff --git a/torch/csrc/jit/runtime/register_prim_ops.cpp b/torch/csrc/jit/runtime/register_prim_ops.cpp index 310fe35ffaacb..1f168d24e8adf 100644 --- a/torch/csrc/jit/runtime/register_prim_ops.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops.cpp @@ -145,7 +145,7 @@ bool isSortableListOfObjectsOrTuples( why_not << "Only list of Tensors, ints, floats, bools, strs, " << "a User Defined Class that defines the __lt__ compare method " << "or Tuples of aforementioned types can be sorted, got list of " - << type->repr_str() << "\n"; + << type->repr_str() << '\n'; return false; } @@ -820,7 +820,7 @@ static const std::vector opGenArgs{ bool first = true; for (const IValue& i : last(stack, num_inputs)) { if (!first) - ss << " "; + ss << ' '; first = false; ss << i; } @@ -2971,10 +2971,10 @@ static const std::vector opGenArgs2{ auto i = pop(stack).toInt(); \ std::stringstream ss; \ if (i < 0) { \ - ss << "-"; \ + ss << '-'; \ i = -i; \ } \ - ss << "0" << prefix << char_op << i; \ + ss << '0' << prefix << char_op << i; \ push(stack, ss.str()); \ }, \ aliasAnalysisFromSchema()) @@ -2991,7 +2991,7 @@ static const std::vector opGenArgs2{ push(stack, "0b0"); } else { if (i < 0) { - ss << "-"; + ss << '-'; i = -i; } std::string str = std::bitset<8 * sizeof(i)>(i).to_string(); diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index 0a6e0b3564add..8ad348bb162c1 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -145,9 +145,9 @@ std::string dumpValueSet( std::ostringstream oss; oss << set_name << ": {"; for (const auto* val : value_set) { - oss << "%" << val->debugName() << ", "; + oss << '%' << val->debugName() << ", "; } - oss << "}"; + oss << '}'; return oss.str(); } @@ -1521,7 +1521,7 @@ void BlockRunner::benchmark( } else if (results.native_nodes.count(kind)) { std::cout << ", native)" << '\n'; } else { - std::cout << ")" << '\n'; + std::cout << 
')' << '\n'; } if (generate_ai_pep_output) { @@ -1566,13 +1566,13 @@ void BlockRunner::benchmark( auto unsupported_nodes_count = results.total_nodes_count - results.out_nodes_count - results.native_nodes.size(); std::cout << "Total number of 'out' variant nodes/total number of nodes: " - << results.out_nodes_count << "/" << results.total_nodes_count + << results.out_nodes_count << '/' << results.total_nodes_count << " (" << 100.0 * static_cast(results.out_nodes_count) / static_cast(results.total_nodes_count) << "%)" << '\n'; std::cout << "Total number of nodes not covered by SR/total number of nodes: " - << unsupported_nodes_count << "/" << results.total_nodes_count + << unsupported_nodes_count << '/' << results.total_nodes_count << " (" << 100.0 * static_cast(unsupported_nodes_count) / static_cast(results.total_nodes_count) diff --git a/torch/csrc/jit/serialization/onnx.cpp b/torch/csrc/jit/serialization/onnx.cpp index 82ce2e4e360c1..499ed582a661d 100644 --- a/torch/csrc/jit/serialization/onnx.cpp +++ b/torch/csrc/jit/serialization/onnx.cpp @@ -27,7 +27,7 @@ void dump(const onnx::TensorProto& tensor, std::ostream& stream) { for (const auto i : c10::irange(tensor.dims_size())) { stream << tensor.dims(i) << (i == tensor.dims_size() - 1 ? "" : " "); } - stream << "]"; + stream << ']'; } void dump(const onnx::TensorShapeProto& shape, std::ostream& stream) { @@ -36,7 +36,7 @@ void dump(const onnx::TensorShapeProto& shape, std::ostream& stream) { if (dim.has_dim_value()) { stream << dim.dim_value(); } else { - stream << "?"; + stream << '?'; } stream << (i == shape.dim_size() - 1 ? "" : " "); } @@ -67,7 +67,7 @@ void dump(const onnx::TypeProto_Optional& optional_type, std::ostream& stream) { } else { stream << "None"; } - stream << ">"; + stream << '>'; } void dump(const onnx::TypeProto_Sequence& sequence_type, std::ostream& stream) { @@ -77,7 +77,7 @@ void dump(const onnx::TypeProto_Sequence& sequence_type, std::ostream& stream) { } else { stream << "None"; } - stream << ">"; + stream << '>'; } void dump(const onnx::TypeProto& type, std::ostream& stream) { @@ -95,7 +95,7 @@ void dump(const onnx::TypeProto& type, std::ostream& stream) { void dump(const onnx::ValueInfoProto& value_info, std::ostream& stream) { stream << "{name: \"" << value_info.name() << "\", type:"; dump(value_info.type(), stream); - stream << "}"; + stream << '}'; } void dump(const onnx::GraphProto& graph, std::ostream& stream, size_t indent); @@ -123,36 +123,36 @@ void dump( for (const auto i : c10::irange(attr.floats_size())) { stream << attr.floats(i) << (i == attr.floats_size() - 1 ? "" : " "); } - stream << "]"; + stream << ']'; } else if (attr.ints_size()) { stream << "ints, values: ["; for (const auto i : c10::irange(attr.ints_size())) { stream << attr.ints(i) << (i == attr.ints_size() - 1 ? "" : " "); } - stream << "]"; + stream << ']'; } else if (attr.strings_size()) { stream << "strings, values: ["; for (const auto i : c10::irange(attr.strings_size())) { stream << "'" << attr.strings(i) << "'" << (i == attr.strings_size() - 1 ? 
"" : " "); } - stream << "]"; + stream << ']'; } else if (attr.tensors_size()) { stream << "tensors, values: ["; for (auto& t : attr.tensors()) { dump(t, stream); } - stream << "]"; + stream << ']'; } else if (attr.graphs_size()) { stream << "graphs, values: ["; for (auto& g : attr.graphs()) { dump(g, stream, indent + 1); } - stream << "]"; + stream << ']'; } else { stream << "UNKNOWN"; } - stream << "}"; + stream << '}'; } void dump(const onnx::NodeProto& node, std::ostream& stream, size_t indent) { @@ -174,31 +174,31 @@ void dump(const onnx::NodeProto& node, std::ostream& stream, size_t indent) { void dump(const onnx::GraphProto& graph, std::ostream& stream, size_t indent) { stream << idt(indent) << "GraphProto {" << nlidt(indent + 1) << "name: \"" - << graph.name() << "\"" << nlidt(indent + 1) << "inputs: ["; + << graph.name() << '"' << nlidt(indent + 1) << "inputs: ["; for (const auto i : c10::irange(graph.input_size())) { dump(graph.input(i), stream); stream << (i == graph.input_size() - 1 ? "" : ","); } - stream << "]" << nlidt(indent + 1) << "outputs: ["; + stream << ']' << nlidt(indent + 1) << "outputs: ["; for (const auto i : c10::irange(graph.output_size())) { dump(graph.output(i), stream); stream << (i == graph.output_size() - 1 ? "" : ","); } - stream << "]" << nlidt(indent + 1) << "value_infos: ["; + stream << ']' << nlidt(indent + 1) << "value_infos: ["; for (const auto i : c10::irange(graph.value_info_size())) { dump(graph.value_info(i), stream); stream << (i == graph.value_info_size() - 1 ? "" : ","); } - stream << "]" << nlidt(indent + 1) << "initializers: ["; + stream << ']' << nlidt(indent + 1) << "initializers: ["; for (const auto i : c10::irange(graph.initializer_size())) { dump(graph.initializer(i), stream); stream << (i == graph.initializer_size() - 1 ? 
"" : ","); } - stream << "]" << nlidt(indent + 1) << "nodes: [" << nlidt(indent + 2); + stream << ']' << nlidt(indent + 1) << "nodes: [" << nlidt(indent + 2); for (const auto i : c10::irange(graph.node_size())) { dump(graph.node(i), stream, indent + 2); if (i != graph.node_size() - 1) { - stream << "," << nlidt(indent + 2); + stream << ',' << nlidt(indent + 2); } } stream << nlidt(indent + 1) << "]\n" << idt(indent) << "}\n"; @@ -208,14 +208,14 @@ void dump( const onnx::OperatorSetIdProto& operator_set_id, std::ostream& stream) { stream << "OperatorSetIdProto { domain: " << operator_set_id.domain() - << ", version: " << operator_set_id.version() << "}"; + << ", version: " << operator_set_id.version() << '}'; } void dump(const onnx::ModelProto& model, std::ostream& stream, size_t indent) { stream << idt(indent) << "ModelProto {" << nlidt(indent + 1) - << "producer_name: \"" << model.producer_name() << "\"" - << nlidt(indent + 1) << "domain: \"" << model.domain() << "\"" - << nlidt(indent + 1) << "doc_string: \"" << model.doc_string() << "\""; + << "producer_name: \"" << model.producer_name() << '"' + << nlidt(indent + 1) << "domain: \"" << model.domain() << '"' + << nlidt(indent + 1) << "doc_string: \"" << model.doc_string() << '"'; if (model.has_graph()) { stream << nlidt(indent + 1) << "graph:\n"; dump(model.graph(), stream, indent + 2); diff --git a/torch/csrc/jit/serialization/pickler.cpp b/torch/csrc/jit/serialization/pickler.cpp index 0622dbb5cd98e..1d5a2e77931c0 100644 --- a/torch/csrc/jit/serialization/pickler.cpp +++ b/torch/csrc/jit/serialization/pickler.cpp @@ -130,7 +130,7 @@ void Pickler::pushIValueImpl(const IValue& ivalue) { err << "Cannot serialize custom bound C++ class"; if (memoized_class_types_ && !memoized_class_types_->empty()) { if (auto qualname = memoized_class_types_->back()->name()) { - err << " " << qualname->qualifiedName(); + err << ' ' << qualname->qualifiedName(); } } err << ". 
Please define serialization methods via def_pickle() for " diff --git a/torch/csrc/jit/serialization/python_print.cpp b/torch/csrc/jit/serialization/python_print.cpp index 70e188816fb4c..bf7e5250487d1 100644 --- a/torch/csrc/jit/serialization/python_print.cpp +++ b/torch/csrc/jit/serialization/python_print.cpp @@ -381,7 +381,7 @@ struct PythonPrintImpl { static std::string makeValidIdentifier(const std::string& candidate) { std::stringstream ss; if (candidate.empty() || isdigit(candidate[0])) - ss << "_"; + ss << '_'; for (char c : candidate) { if (isupper(c) || islower(c) || isdigit(c) || c == '_') ss << c; @@ -487,11 +487,11 @@ struct PythonPrintImpl { if (isValidIdentifier(val_name)) { stmt << val_name; } else { - stmt << "(" << val_name << ")"; + stmt << '(' << val_name << ')'; } - stmt << "["; + stmt << '['; stmt << useOf(inputs[1]); - stmt << "]"; + stmt << ']'; } void printDict( @@ -534,7 +534,7 @@ struct PythonPrintImpl { body_ << " = "; // or if value is being assigned to something of a union type printValueList(body_, rhs); - body_ << "\n"; + body_ << '\n'; } bool requiresAnnotation(Value* lhs, Value* rhs) { @@ -555,7 +555,7 @@ struct PythonPrintImpl { if (requiresAnnotation(lhs[i], rhs[i])) { body_ << ": " << lhs[i]->type()->annotation_str(type_printer_); } - body_ << " = " << useOf(rhs[i]) << "\n"; + body_ << " = " << useOf(rhs[i]) << '\n'; } } @@ -705,7 +705,7 @@ struct PythonPrintImpl { printValueList(body_, node->outputs()); body_ << " = "; } - body_ << expr << "\n"; + body_ << expr << '\n'; } // Recursively check contained types for any class dependencies @@ -794,7 +794,7 @@ struct PythonPrintImpl { indent(); body_ << "return "; printValueList(body_, node->inputs()); - body_ << "\n"; + body_ << '\n'; } break; case prim::Loop: @@ -814,7 +814,7 @@ struct PythonPrintImpl { if (!node->outputs().empty()) { printValueList(body_, node->outputs(), "", ", = "); } - body_ << useOf(node->input()) << "\n"; + body_ << useOf(node->input()) << '\n'; break; case prim::SetAttr: { const auto obj = node->inputs().at(0); @@ -822,8 +822,8 @@ struct PythonPrintImpl { const auto type = obj->type()->expect(); const auto& attrname = node->s(attr::name); indent(); - body_ << useOf(obj) << "." << attrname << " = " << useOf(newVal) - << "\n"; + body_ << useOf(obj) << '.' 
<< attrname << " = " << useOf(newVal) + << '\n'; } break; case prim::fork: { // the subgraph gets emitted as another function @@ -836,7 +836,7 @@ struct PythonPrintImpl { } printBody(graph->block()); std::stringstream ss; - ss << "fork(" << name << ")"; + ss << "fork(" << name << ')'; printOutputDefinition(node, ss.str()); } break; case prim::awaitable: { @@ -850,7 +850,7 @@ struct PythonPrintImpl { } printBody(graph->block()); std::stringstream ss; - ss << "awaitable(" << name << ")"; + ss << "awaitable(" << name << ')'; printOutputDefinition(node, ss.str()); } break; case prim::Enter: { @@ -884,7 +884,7 @@ struct PythonPrintImpl { auto name = useOf(node->output())->str(); std::shared_ptr graph = node->g(attr::Subgraph); indent(); - body_ << "def " << name << "("; + body_ << "def " << name << '('; assignValuesToTheirUniqueNames(graph->inputs()); for (size_t i = 0; i < graph->inputs().size(); ++i) { Value* v = graph->inputs().at(i); @@ -903,7 +903,7 @@ struct PythonPrintImpl { assignValuesToTheirUniqueNames(out); indent(); body_ << useOf(out) << " : " << out->type()->annotation_str() << " = " - << useOf(container) << "[" << useOf(key) << "]\n"; + << useOf(container) << '[' << useOf(key) << "]\n"; } break; default: auto ss = std::make_shared(&source_range_stack_); @@ -992,7 +992,7 @@ struct PythonPrintImpl { // doing it here ensures we do not have fix up archives later stmt << "torch." << kind.toUnqualString(); } else { - stmt << "ops." << kind.ns().toUnqualString() << "." + stmt << "ops." << kind.ns().toUnqualString() << '.' << kind.toUnqualString(); } } @@ -1011,14 +1011,14 @@ struct PythonPrintImpl { << "If this is a nn.ModuleList, add it to __constants__"); } std::stringstream scalars_stream; - stmt << "^" << value->name(); + stmt << '^' << value->name(); value->writeScalars(scalars_stream); stmt << scalars_stream.str(); printValueList(stmt, node->inputs(), "(", ")"); } break; case prim::Uninitialized: { stmt << "uninitialized(" - << node->output()->type()->annotation_str(type_printer_) << ")"; + << node->output()->type()->annotation_str(type_printer_) << ')'; } break; case prim::Constant: { if (node->outputs().size() == 1 && @@ -1038,7 +1038,7 @@ struct PythonPrintImpl { case aten::IntImplicit: { stmt << "annotate(" << node->output()->type()->annotation_str(type_printer_) << ", " - << useOf(node->input()) << ")"; + << useOf(node->input()) << ')'; } break; case aten::Int: { printValueList(stmt, node->inputs(), "int(", ")"); @@ -1070,12 +1070,12 @@ struct PythonPrintImpl { stmt, node->inputs(), "(", node->inputs().size() == 1 ? 
",)" : ")"); } break; case prim::TupleIndex: { - stmt << "(" << useOf(node->inputs().at(0)) << ")[" - << useOf(node->inputs().at(1)) << "]"; + stmt << '(' << useOf(node->inputs().at(0)) << ")[" + << useOf(node->inputs().at(1)) << ']'; } break; case prim::TupleSlice: { - stmt << "(" << useOf(node->input()) << ")[" << node->i(attr::beg) << ":" - << node->i(attr::end) << "]"; + stmt << '(' << useOf(node->input()) << ")[" << node->i(attr::beg) << ':' + << node->i(attr::end) << ']'; } break; case prim::ListConstruct: { ListTypePtr list_type = node->output()->type()->expect(); @@ -1093,7 +1093,7 @@ struct PythonPrintImpl { stmt << "annotate(" << node->output()->type()->annotation_str(type_printer_) << ", "; printValueList(stmt, node->inputs(), "[", "]"); - stmt << ")"; + stmt << ')'; // Otherwise just print a list } else { printValueList(stmt, node->inputs(), "[", "]"); @@ -1112,7 +1112,7 @@ struct PythonPrintImpl { stmt << "annotate(" << node->output()->type()->annotation_str(type_printer_) << ", "; printDict(stmt, node->inputs()); - stmt << ")"; + stmt << ')'; // Otherwise just print a dict } else { printDict(stmt, node->inputs()); @@ -1121,37 +1121,36 @@ struct PythonPrintImpl { case prim::CreateObject: { const auto classType = node->output()->type()->expect(); stmt << classType->annotation_str(type_printer_) << ".__new__(" - << classType->annotation_str(type_printer_) << ")"; + << classType->annotation_str(type_printer_) << ')'; } break; case prim::GetAttr: { const auto obj = node->inputs().at(0); const auto classType = obj->type()->expect(); const auto& field = node->s(attr::name); if (isValidIdentifier(field)) { - stmt << useOf(obj) << "." << field; + stmt << useOf(obj) << '.' << field; } else { stmt << "getattr(" << useOf(obj) << ", "; std::stringstream field_stream; c10::printQuotedString(field_stream, field); - stmt << field_stream.str() << ")"; + stmt << field_stream.str() << ')'; } } break; case prim::CallFunction: { - stmt << useOf(node->inputs().at(0)) << "("; + stmt << useOf(node->inputs().at(0)) << '('; for (size_t i = 1; i < node->inputs().size(); i++) { stmt << useOf(node->inputs()[i]) << ", "; } - stmt << ")"; + stmt << ')'; } break; case prim::CallMethod: { const auto& self = node->inputs().at(0); const auto& methodName = node->s(attr::name); - stmt << "(" << useOf(self) << ")" - << "." << methodName << "("; + stmt << '(' << useOf(self) << ')' << '.' 
<< methodName << '('; for (size_t i = 1; i < node->inputs().size(); i++) { stmt << useOf(node->inputs()[i]) << ", "; } - stmt << ")"; + stmt << ')'; if (auto selfClass = self->type()->cast()) { deps_table_.add(selfClass); @@ -1169,7 +1168,7 @@ struct PythonPrintImpl { } break; case aten::_unwrap_optional: { printOpName(stmt, node->kind()); - stmt << "("; + stmt << '('; // we cannot recover the type of unwrap_optional(None), // using normal schema matching, so we route around this by rewriting // the call to unwrap_optional(annotated(Optional[T], None)) @@ -1177,11 +1176,11 @@ struct PythonPrintImpl { node->input()->mustBeNone()) { auto input_type = OptionalType::create(node->output()->type()); stmt << "annotate(" << input_type->annotation_str(type_printer_) - << ", " << useOf(node->input()) << ")"; + << ", " << useOf(node->input()) << ')'; } else { stmt << useOf(node->input()); } - stmt << ")"; + stmt << ')'; } break; // unchecked_unwrap_optional is no longer generated by the compiler, // but may end up here if it was first loaded from a old model and @@ -1191,7 +1190,7 @@ struct PythonPrintImpl { case prim::unchecked_cast: { stmt << "unchecked_cast(" << node->output()->type()->annotation_str(type_printer_) << ", " - << useOf(node->input()) << ")"; + << useOf(node->input()) << ')'; } break; case prim::isinstance: { stmt << "isinstance(" << useOf(node->input()) << ", "; @@ -1200,7 +1199,7 @@ struct PythonPrintImpl { stmt << types.at(0)->annotation_str(type_printer_); } else { // check multiple things, e.g. (str, list, int) - stmt << "("; + stmt << '('; bool first = true; for (const TypePtr& typ : types) { if (!first) { @@ -1209,30 +1208,29 @@ struct PythonPrintImpl { stmt << typ->annotation_str(type_printer_); first = false; } - stmt << ")"; + stmt << ')'; } - stmt << ")"; + stmt << ')'; } break; case prim::tolist: { stmt << "annotate(" << node->output()->type()->annotation_str(type_printer_) << ", "; - stmt << useOf(node->input(0)) << ".tolist()" - << ")"; + stmt << useOf(node->input(0)) << ".tolist()" << ')'; } break; case prim::EnumValue: // Note: This CAN NOT be printed as raw operator ops.prim.EnumValue // because its return type depends on type of enum and must be further // resolved, but ops.prim.EnumValue construction does not provide such // functionality. - stmt << "(" << useOf(node->input()) << ").value"; + stmt << '(' << useOf(node->input()) << ").value"; break; case prim::EnumName: - stmt << "(" << useOf(node->input()) << ").name"; + stmt << '(' << useOf(node->input()) << ").name"; break; default: { printOpName(stmt, node->kind()); const FunctionSchema& schema = node->schema(); - stmt << "("; + stmt << '('; // calculate how many args are specified. // see (https://github.com/pytorch/pytorch/pull/56079) for more // details. 
@@ -1257,7 +1255,7 @@ struct PythonPrintImpl { if (i < num_schema_args) { auto arg = schema.arguments().at(i); if (arg.kwarg_only()) { - stmt << arg.name() << "="; + stmt << arg.name() << '='; } } else { // vararg functions like format can have extra arguments @@ -1274,11 +1272,11 @@ struct PythonPrintImpl { // figure out the corresponding input at this index auto input_idx = node->inputs().size() - (num_schema_args - i); if (input_idx < node->inputs().size()) { - stmt << arg.name() << "=" << *useOf(node->inputs().at(input_idx)); + stmt << arg.name() << '=' << *useOf(node->inputs().at(input_idx)); } } } - stmt << ")"; + stmt << ')'; } break; } } @@ -1313,7 +1311,7 @@ struct PythonPrintImpl { const Argument& arg, TaggedStringStream& stmt, const IValue& value) { - stmt << "="; + stmt << '='; // handle broadcasting lists if (arg.type()->kind() == ListType::Kind && (value.isInt() || value.isDouble() || value.isBool())) { @@ -1363,7 +1361,7 @@ struct PythonPrintImpl { WithSourceRange guard(&source_range_stack_, graph.param_node()); indent(); - body_ << "def " << func.name() << "("; + body_ << "def " << func.name() << '('; auto param_it = graph.inputs().begin(); for (const Argument& arg : schema.arguments()) { registerClassDependencies(arg.type()); @@ -1448,14 +1446,14 @@ struct PythonPrintImpl { indent(); body_ << "__parameters__ = ["; for (const auto& param : params) { - body_ << "\"" << param << "\", "; + body_ << '"' << param << "\", "; } body_ << "]\n"; indent(); body_ << "__buffers__ = ["; for (const auto& buffer : buffers) { - body_ << "\"" << buffer << "\", "; + body_ << '"' << buffer << "\", "; } body_ << "]\n"; auto forwardPreHooks = classType->getForwardPreHooks(); @@ -1463,7 +1461,7 @@ struct PythonPrintImpl { indent(); body_ << "__forward_pre_hooks__ = ["; for (const auto& pre_hook : forwardPreHooks) { - body_ << "\"" << pre_hook->name() << "\", "; + body_ << '"' << pre_hook->name() << "\", "; } body_ << "]\n"; } @@ -1473,7 +1471,7 @@ struct PythonPrintImpl { indent(); body_ << "__forward_hooks__ = ["; for (const auto& hook : forwardHooks) { - body_ << "\"" << hook->name() << "\", "; + body_ << '"' << hook->name() << "\", "; } body_ << "]\n"; } @@ -1496,13 +1494,12 @@ struct PythonPrintImpl { } // Print out a direct manipulation of the annotations dict, like: // __annotations__["0"] = SomeType - body_ << "__annotations__[" - << "\"" << name - << "\"] = " << type->annotation_str(type_printer_) << "\n"; + body_ << "__annotations__[" << '"' << name + << "\"] = " << type->annotation_str(type_printer_) << '\n'; } else { // Otherwise: just emit a python 3 attribute annotation, like: // foo : SomeType - body_ << name << " : " << type->annotation_str(type_printer_) << "\n"; + body_ << name << " : " << type->annotation_str(type_printer_) << '\n'; } } @@ -1516,7 +1513,7 @@ struct PythonPrintImpl { << "Final[" << v.type()->annotation_str(type_printer_) << "] = "; auto ss = std::make_shared(&source_range_stack_); printConstant(*ss, v); - body_ << ss->str() << "\n"; + body_ << ss->str() << '\n'; } // TODO fields @@ -1554,7 +1551,7 @@ struct PythonPrintImpl { TORCH_INTERNAL_ASSERT(attr.type()); indent(); body_ << attr.name() << " : " - << attr.type()->annotation_str(type_printer_) << "\n"; + << attr.type()->annotation_str(type_printer_) << '\n'; } } } else if (auto interfaceType = type->cast()) { @@ -1600,7 +1597,7 @@ struct PythonPrintImpl { for (const auto& name_value : enumType->enumNamesValues()) { indent(); body_ << name_value.first << " = " << value_wrapper - << name_value.second << 
value_wrapper << "\n"; + << name_value.second << value_wrapper << '\n'; } } } else { diff --git a/torch/csrc/jit/tensorexpr/block_codegen.cpp b/torch/csrc/jit/tensorexpr/block_codegen.cpp index 24228cdea32dd..6ec55f998cce0 100644 --- a/torch/csrc/jit/tensorexpr/block_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/block_codegen.cpp @@ -132,7 +132,7 @@ void BlockPrinter::visit(const ForPtr& v) { os() << '\n'; emitIndent(); PrintReshapeInfo(buf_writes, true); // print reverse reshape - os() << "}"; + os() << '}'; os() << '\n'; } else if (loop_options.is_gpu_thread_index()) { PrintDMAs(buf_reads); @@ -154,12 +154,12 @@ void BlockPrinter::PrintTensorInfo(const std::unordered_set& bufs) { emitIndent(); auto num_dims = block_analysis_->getMultiDimBuf(buf)->dims().size(); os() << block_analysis_->getInputName(buf) << " = "; - os() << "{"; + os() << '{'; for (unsigned long d = 0; d < num_dims; d++) { - os() << "{" << dim_names[d] << "};"; + os() << '{' << dim_names[d] << "};"; } os() << " elem : " << blockDtypeCppString(buf->dtype()); - os() << "}"; + os() << '}'; } for (auto& buf : bufs) { @@ -168,15 +168,14 @@ void BlockPrinter::PrintTensorInfo(const std::unordered_set& bufs) { emitIndent(); auto num_dims = block_analysis_->getMultiDimBuf(buf)->dims().size(); os() << block_analysis_->getFlatInputName(buf) << " = "; - os() << "{"; - os() << "{" << flat_dim_names[num_dims - 1] << "};"; + os() << '{'; + os() << '{' << flat_dim_names[num_dims - 1] << "};"; os() << " elem : " << blockDtypeCppString(buf->dtype()); - os() << "}" - << " // flattened tensor"; + os() << '}' << " // flattened tensor"; } os() << '\n'; emitIndent(); - os() << "}" << '\n' << '\n'; + os() << '}' << '\n' << '\n'; } void BlockPrinter::PrintArguments(const std::unordered_set& bufs) { @@ -213,7 +212,7 @@ void BlockPrinter::PrintArguments(const std::unordered_set& bufs) { emitIndent(); os() << "var bs_DPE = " << blck_sz << '\n'; emitIndent(); - os() << "}" << '\n' << '\n'; + os() << '}' << '\n' << '\n'; } void BlockPrinter::PrintBufferInfo(const std::unordered_set& bufs) { @@ -230,7 +229,7 @@ void BlockPrinter::PrintBufferInfo(const std::unordered_set& bufs) { } os() << '\n'; emitIndent(); - os() << "}" << '\n' << '\n'; + os() << '}' << '\n' << '\n'; } void BlockPrinter::PrintDistribution(const std::unordered_set& bufs) { @@ -253,14 +252,14 @@ void BlockPrinter::PrintLoop( auto trip = 0; for (auto& buf : bufs) { if (trip > 0) { - os() << ","; + os() << ','; } os() << "{dim : "; os() << block_analysis_->getFlatInputName(buf) << ".dim.0, "; os() << (block_idx ? "block: bs_N}" : "block: bs_DPE}"); ++trip; } - os() << ")"; + os() << ')'; } void BlockPrinter::PrintReshapeInfo( @@ -274,7 +273,7 @@ void BlockPrinter::PrintReshapeInfo( << ", " << (reverse ? 
block_analysis_->getInputName(buf) : block_analysis_->getFlatInputName(buf)) - << ")" << '\n'; + << ')' << '\n'; } } @@ -283,7 +282,7 @@ void BlockPrinter::PrintDMAs(const std::unordered_set& bufs) { emitIndent(); os() << "dma_in("; os() << block_analysis_->getFlatInputName(read); - os() << ")" << '\n'; + os() << ')' << '\n'; } } void BlockPrinter::PrintAdjustBuffers(const std::unordered_set& bufs) { @@ -291,7 +290,7 @@ void BlockPrinter::PrintAdjustBuffers(const std::unordered_set& bufs) { emitIndent(); os() << "adjust_buffer("; os() << block_analysis_->getFlatInputName(read); - os() << ")" << '\n'; + os() << ')' << '\n'; } } @@ -305,14 +304,14 @@ void BlockPrinter::visit(const StorePtr& v) { } void BlockPrinter::visit(const BlockPtr& v) { - os() << "{" << '\n'; + os() << '{' << '\n'; indent_++; for (const StmtPtr& s : v->stmts()) { s->accept(this); } indent_--; emitIndent(); - os() << "}"; + os() << '}'; } std::string BlockCodeGen::GetUniqueFuncName(const std::string& func_prefix) { @@ -341,14 +340,14 @@ void BlockCodeGen::Initialize() { }; std::string func_name = GetUniqueFuncName("func"); - os() << "kernel " << func_name << "("; + os() << "kernel " << func_name << '('; for (auto const& arg : buf_writes) { os() << block_analysis_->getInputName(arg); } for (auto const& arg : buf_reads) { - os() << ";" << block_analysis_->getInputName(arg); + os() << ';' << block_analysis_->getInputName(arg); } - os() << ")"; + os() << ')'; stmt_v->accept(printer_.get()); diff --git a/torch/csrc/jit/tensorexpr/bounds_inference.cpp b/torch/csrc/jit/tensorexpr/bounds_inference.cpp index bbc9d845fa4f7..034f51f46b8f7 100644 --- a/torch/csrc/jit/tensorexpr/bounds_inference.cpp +++ b/torch/csrc/jit/tensorexpr/bounds_inference.cpp @@ -128,10 +128,10 @@ void printBoundsInfo(const BoundsInfo& v) { if (!first) { std::cerr << ", "; } - std::cerr << ((b.kind == kLoad) ? "LOAD" : "STORE") << "("; + std::cerr << ((b.kind == kLoad) ? 
"LOAD" : "STORE") << '('; int i = 0; if (b.start.empty()) { - std::cerr << "0"; + std::cerr << '0'; } for (auto& s : b.start) { if (i != 0) { @@ -143,7 +143,7 @@ void printBoundsInfo(const BoundsInfo& v) { std::cerr << "; "; i = 0; if (b.stop.empty()) { - std::cerr << "0"; + std::cerr << '0'; } for (auto& s : b.stop) { if (i != 0) { @@ -152,7 +152,7 @@ void printBoundsInfo(const BoundsInfo& v) { std::cerr << *s; i++; } - std::cerr << ")"; + std::cerr << ')'; first = false; } std::cerr << "]\n"; diff --git a/torch/csrc/jit/tensorexpr/bounds_overlap.cpp b/torch/csrc/jit/tensorexpr/bounds_overlap.cpp index 0c352e3b19f3b..0c785504efe85 100644 --- a/torch/csrc/jit/tensorexpr/bounds_overlap.cpp +++ b/torch/csrc/jit/tensorexpr/bounds_overlap.cpp @@ -35,7 +35,7 @@ static bool mustBeZero(const ExprPtr& e) { } void Bound::print() const { - std::cout << "(" << *start << ", " << *end << ")"; + std::cout << '(' << *start << ", " << *end << ')'; } bool Bound::equals(const Bound& other) const { diff --git a/torch/csrc/jit/tensorexpr/codegen.cpp b/torch/csrc/jit/tensorexpr/codegen.cpp index 41e54869850c8..b19a8b8964ad5 100644 --- a/torch/csrc/jit/tensorexpr/codegen.cpp +++ b/torch/csrc/jit/tensorexpr/codegen.cpp @@ -41,7 +41,7 @@ RegisterCodeGenList::StmtFactoryMethod RegisterCodeGenList:: oss << entry.first; index++; } - oss << "]"; + oss << ']'; throw std::runtime_error(oss.str()); } return iter->second; diff --git a/torch/csrc/jit/tensorexpr/cpp_codegen.cpp b/torch/csrc/jit/tensorexpr/cpp_codegen.cpp index fa42d48c75e93..6b03b939ace99 100644 --- a/torch/csrc/jit/tensorexpr/cpp_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/cpp_codegen.cpp @@ -89,28 +89,28 @@ static inline std::enable_if_t, void> visit_mod( std::ostream& os, const ExprPtr& lhs, const ExprPtr& rhs) { - os << "std::fmod(" << *lhs << ", " << *rhs << ")"; + os << "std::fmod(" << *lhs << ", " << *rhs << ')'; } template static inline std:: enable_if_t || std::is_integral_v, void> visit_max(std::ostream& os, const ExprPtr& lhs, const ExprPtr& rhs) { - os << "std::max(" << *lhs << ", " << *rhs << ")"; + os << "std::max(" << *lhs << ", " << *rhs << ')'; } template static inline std:: enable_if_t && !std::is_integral_v, void> visit_max(std::ostream& os, const ExprPtr& lhs, const ExprPtr& rhs) { - os << "(" << *lhs << " < " << *rhs << ") ? " << *rhs << " : " << *lhs; + os << '(' << *lhs << " < " << *rhs << ") ? " << *rhs << " : " << *lhs; } template static inline std:: enable_if_t || std::is_integral_v, void> visit_min(std::ostream& os, const ExprPtr& lhs, const ExprPtr& rhs) { - os << "std::min(" << *lhs << ", " << *rhs << ")"; + os << "std::min(" << *lhs << ", " << *rhs << ')'; } template @@ -176,14 +176,14 @@ void CppPrinter::visit(const MinPtr& v) { } void CppPrinter::visit(const CompareSelectPtr& v) { - os() << "((" << *v->lhs() << " " - << IRPrinter::to_string(v->compare_select_op()) << " " << *v->rhs() - << ") ? " << *v->ret_val1() << " : " << *v->ret_val2() << ")"; + os() << "((" << *v->lhs() << ' ' + << IRPrinter::to_string(v->compare_select_op()) << ' ' << *v->rhs() + << ") ? " << *v->ret_val1() << " : " << *v->ret_val2() << ')'; } void CppPrinter::visit(const IfThenElsePtr& v) { os() << "((" << *v->condition() << ") ? 
" << *v->true_value() << " : " - << *v->false_value() << ")"; + << *v->false_value() << ')'; } void CppPrinter::visit(const AllocatePtr& v) { @@ -211,7 +211,7 @@ void CppPrinter::visit(const FreePtr& v) { void CppPrinter::visit(const LoadPtr& v) { auto flat_idx = flatten_index(v->buf()->dims(), v->indices(), v->buf()->strides()); - os() << *v->base_handle() << "[" << *flat_idx << "]"; + os() << *v->base_handle() << '[' << *flat_idx << ']'; } void CppPrinter::visit(const StorePtr& v) { @@ -221,19 +221,19 @@ void CppPrinter::visit(const StorePtr& v) { for (int lane = 0; lane < lanes; lane++) { lane_ = lane; emitIndent(); - os() << *v->base_handle() << "[" << *flat_idx << "] = " << *v->value() - << ";" << '\n'; + os() << *v->base_handle() << '[' << *flat_idx << "] = " << *v->value() + << ';' << '\n'; } } void CppPrinter::visit(const CastPtr& v) { os() << "static_cast<" << v->dtype().ToCppString() << ">(" << *v->src_value() - << ")"; + << ')'; } void CppPrinter::visit(const BitCastPtr& v) { os() << "std::bitcast<" << v->src_value()->dtype().ToCppString() << ", " - << v->dtype().ToCppString() << ">(" << *v->src_value() << ")"; + << v->dtype().ToCppString() << ">(" << *v->src_value() << ')'; } void CppPrinter::visit(const IntrinsicsPtr& v) { @@ -241,14 +241,14 @@ void CppPrinter::visit(const IntrinsicsPtr& v) { throw std::runtime_error("kRand and kSigmoid are not supported"); } - os() << "std::" << v->func_name() << "("; + os() << "std::" << v->func_name() << '('; for (size_t i = 0; i < v->nparams(); i++) { if (i > 0) { os() << ", "; } os() << *v->param(i); } - os() << ")"; + os() << ')'; } void CppPrinter::visit(const ExternalCallPtr& v) { @@ -272,7 +272,7 @@ void CppPrinter::visit(const ExternalCallPtr& v) { }; emitIndent(); - os() << "{" << '\n'; + os() << '{' << '\n'; indent_++; emitIndent(); @@ -315,9 +315,9 @@ void CppPrinter::visit(const ExternalCallPtr& v) { os() << "};" << '\n'; emitIndent(); - os() << v->func_name() << "(" << '\n'; + os() << v->func_name() << '(' << '\n'; emitIndent(); - os() << " " << bufs.size() << "," << '\n'; + os() << " " << bufs.size() << ',' << '\n'; emitIndent(); os() << " buf_ptrs," << '\n'; emitIndent(); @@ -327,20 +327,20 @@ void CppPrinter::visit(const ExternalCallPtr& v) { emitIndent(); os() << " buf_dtypes," << '\n'; emitIndent(); - os() << " " << v->args().size() << "," << '\n'; + os() << " " << v->args().size() << ',' << '\n'; emitIndent(); os() << " extra_args);" << '\n'; indent_--; emitIndent(); - os() << "}" << '\n'; + os() << '}' << '\n'; } void CppPrinter::visit(const LetPtr& v) { if (v->var()->dtype().lanes() == 1) { emitIndent(); - os() << v->var()->dtype().ToCppString() << " " << *v->var() << " = " - << *v->value() << ";" << '\n'; + os() << v->var()->dtype().ToCppString() << ' ' << *v->var() << " = " + << *v->value() << ';' << '\n'; } else { vector_vars_[v->var()] = v->value(); } @@ -370,7 +370,7 @@ void CppCodeGen::init() { apply_visitor(var_name_rewriter_.get()); printer_->printPrologue(); - os() << "void " << kernel_func_name() << "("; + os() << "void " << kernel_func_name() << '('; const std::vector buffer_args = this->buffer_args(); for (size_t i = 0; i < buffer_args.size(); i++) { if (i > 0) { @@ -381,7 +381,7 @@ void CppCodeGen::init() { Dtype dtype = buffer_arg.dtype(); os() << dtype.ToCppString() << (buffer_arg.isVar() ? 
" " : "* ") << *var; } - os() << ")"; + os() << ')'; stmt()->accept(printer_.get()); os() << '\n'; } diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp index 6131b55883dfb..264e01d65db94 100644 --- a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp @@ -195,8 +195,8 @@ void CudaPrinter::print_flat_alloc(const AllocatePtr& alloc) { throw std::runtime_error("Only integer dimensions are supported for now"); } } - os() << dtypeToCppString(alloc->dtype()) << " " << (*alloc->buffer_var()) - << "[" << flat_size << "];" << '\n'; + os() << dtypeToCppString(alloc->dtype()) << ' ' << (*alloc->buffer_var()) + << '[' << flat_size << "];" << '\n'; } void CudaPrinter::visit(const AllocatePtr& v) { @@ -234,9 +234,9 @@ void CudaPrinter::visit(const CastPtr& v) { : v->src_value()->dtype().scalar_type() == ScalarType::BFloat16 ? "__bfloat162float" : ("(" + dtypeToCppString(v->dtype()) + ")"); - os() << castFn << "("; + os() << castFn << '('; v->src_value()->accept(this); - os() << ")"; + os() << ')'; } void CudaPrinter::visit(const IntrinsicsPtr& v) { @@ -265,14 +265,14 @@ void CudaPrinter::visit(const IntrinsicsPtr& v) { func_name = "isnan"; } - os() << func_name << "("; + os() << func_name << '('; for (const auto i : c10::irange(v->nparams())) { if (i > 0) { os() << ", "; } os() << *v->param(i); } - os() << ")"; + os() << ')'; } void CudaPrinter::visit(const ExternalCallPtr& v) { @@ -293,15 +293,15 @@ void CudaPrinter::visit(const LoadPtr& v) { v->dtype().scalar_type() == ScalarType::Half || v->dtype().scalar_type() == ScalarType::BFloat16) { // There's no __ldg overload for bool or half. - os() << *v->base_handle() << "[" << *v->flat_index() << "]"; + os() << *v->base_handle() << '[' << *v->flat_index() << ']'; return; } if (cuda_analysis_->is_buf_store_target(v->buf())) { // Cuda __ldg can only be applied on read-only buffers. - os() << *v->base_handle() << "[" << *v->flat_index() << "]"; + os() << *v->base_handle() << '[' << *v->flat_index() << ']'; return; } - os() << "__ldg(" << *v->base_handle() << " + " << *v->flat_index() << ")"; + os() << "__ldg(" << *v->base_handle() << " + " << *v->flat_index() << ')'; } // TODO: maybe this should be a more shared location? 
@@ -412,9 +412,9 @@ void CudaPrinter::visit(const StorePtr& v) { if (v->indices().empty()) { os() << *v->base_handle() << " = "; } else { - os() << *v->base_handle() << "[" << *v->flat_index() << "] = "; + os() << *v->base_handle() << '[' << *v->flat_index() << "] = "; } - os() << *v->value() << ";"; + os() << *v->value() << ';'; os() << '\n'; } @@ -422,10 +422,10 @@ void CudaPrinter::visit(const AtomicAddPtr& v) { emitIndent(); if (cuda_analysis_->thread_local_bufs().count(v->base_handle()) > 0) { // atomicAdd only works on global and shared memory - os() << *v->base_handle() << "[" << *v->flat_index() - << "] += " << *v->value() << ";"; + os() << *v->base_handle() << '[' << *v->flat_index() + << "] += " << *v->value() << ';'; } else { - os() << "atomicAdd(&" << *v->base_handle() << "[" << *v->flat_index() << "]" + os() << "atomicAdd(&" << *v->base_handle() << '[' << *v->flat_index() << ']' << ", " << *v->value() << ");"; } os() << '\n'; @@ -438,9 +438,9 @@ void CudaPrinter::visit(const MaxPtr& v) { os() << "maximum("; } v->lhs()->accept(this); - os() << ","; + os() << ','; v->rhs()->accept(this); - os() << ")"; + os() << ')'; } void CudaPrinter::visit(const MinPtr& v) { @@ -450,9 +450,9 @@ void CudaPrinter::visit(const MinPtr& v) { os() << "minimum("; } v->lhs()->accept(this); - os() << ","; + os() << ','; v->rhs()->accept(this); - os() << ")"; + os() << ')'; } void CudaPrinter::visit(const IfThenElsePtr& v) { @@ -462,11 +462,11 @@ void CudaPrinter::visit(const IfThenElsePtr& v) { v->true_value()->accept(this); os() << " : "; v->false_value()->accept(this); - os() << ")"; + os() << ')'; } void CudaPrinter::visit(const BlockPtr& v) { - os() << "{" << '\n'; + os() << '{' << '\n'; indent_++; for (const StmtPtr& s : v->stmts()) { @@ -475,15 +475,15 @@ void CudaPrinter::visit(const BlockPtr& v) { indent_--; emitIndent(); - os() << "}"; + os() << '}'; } void CudaPrinter::visit(const LetPtr& v) { emitIndent(); os() << dtypeToCppString(v->var()->dtype()); - os() << " " << *v->var() << " = "; + os() << ' ' << *v->var() << " = "; v->value()->accept(this); - os() << ";" << '\n'; + os() << ';' << '\n'; } class PrioritizeLoad : public IRMutator { @@ -911,7 +911,7 @@ void CudaCodeGen::Initialize() { // https://clang.llvm.org/docs/AttributeReference.html#amdgpu-flat-work-group-size os() << "__attribute__((amdgpu_flat_work_group_size(1, 1024)))" << std::endl; #endif - os() << "void " << func_name << "("; + os() << "void " << func_name << '('; const std::vector buffer_args = this->buffer_args(); for (size_t i = 0; i < buffer_args.size(); i++) { if (i > 0) { @@ -932,7 +932,7 @@ void CudaCodeGen::Initialize() { rand_seed = alloc("rand_seed", kInt); rand_offset = alloc("rand_offset", kInt); std::string uint64_str = "unsigned long long"; - os() << ", " << uint64_str << " " << *rand_seed << ", " << uint64_str << " " + os() << ", " << uint64_str << ' ' << *rand_seed << ", " << uint64_str << ' ' << *rand_offset; } os() << ") {"; @@ -942,7 +942,7 @@ void CudaCodeGen::Initialize() { VarPtr idx = alloc("idx", kInt); os() << "int " << *idx << " = blockIdx.x*blockDim.x + threadIdx.x;" << '\n'; VarPtr rand_func = printer_->rand_func(); - os() << "Philox " << *rand_func << "(" << *rand_seed << ", " << *idx << ", " + os() << "Philox " << *rand_func << '(' << *rand_seed << ", " << *idx << ", " << *rand_offset << ");" << '\n'; os() << '\n'; } @@ -969,7 +969,7 @@ void CudaCodeGen::Initialize() { stmt_v->accept(printer_.get()); os() << '\n'; - os() << "}"; + os() << '}'; // Check that all block extents had been set. 
const std::vector& gpu_block_extents = diff --git a/torch/csrc/jit/tensorexpr/ir_printer.cpp b/torch/csrc/jit/tensorexpr/ir_printer.cpp index 9b2ecd0e11515..31b7866a73d21 100644 --- a/torch/csrc/jit/tensorexpr/ir_printer.cpp +++ b/torch/csrc/jit/tensorexpr/ir_printer.cpp @@ -71,21 +71,21 @@ static void visitBinaryOp( int rhs_prec = getPrecedence(v->rhs()->expr_type()); if (lhs_prec >= self_prec) { - os << "("; + os << '('; } v->lhs()->accept(printer); if (lhs_prec >= self_prec) { - os << ")"; + os << ')'; } - os << " " << op_str << " "; + os << ' ' << op_str << ' '; if (rhs_prec >= self_prec) { - os << "("; + os << '('; } v->rhs()->accept(printer); if (rhs_prec >= self_prec) { - os << ")"; + os << ')'; } } @@ -129,7 +129,7 @@ void IRPrinter::visit(const ModPtr& v) { if (v->dtype().is_integral()) { visitBinaryOp(v, "%", this); } else if (v->dtype().is_floating_point()) { - os() << "mod(" << *v->lhs() << ", " << *v->rhs() << ")"; + os() << "mod(" << *v->lhs() << ", " << *v->rhs() << ')'; } else { throw std::runtime_error("invalid dtype: " + std::to_string(v->dtype())); } @@ -140,7 +140,7 @@ void IRPrinter::visit(const MaxPtr& v) { v->lhs()->accept(this); os() << ", "; v->rhs()->accept(this); - os() << ", " << (unsigned int)v->propagate_nans() << ")"; + os() << ", " << (unsigned int)v->propagate_nans() << ')'; } void IRPrinter::visit(const MinPtr& v) { @@ -148,7 +148,7 @@ void IRPrinter::visit(const MinPtr& v) { v->lhs()->accept(this); os() << ", "; v->rhs()->accept(this); - os() << ", " << (unsigned int)v->propagate_nans() << ")"; + os() << ", " << (unsigned int)v->propagate_nans() << ')'; } void IRPrinter::visit(const CompareSelectPtr& v) { @@ -158,32 +158,32 @@ void IRPrinter::visit(const CompareSelectPtr& v) { int rhs_prec = getPrecedence(v->rhs()->expr_type()); if (lhs_prec >= self_prec) { - os() << "("; + os() << '('; } v->lhs()->accept(this); if (lhs_prec >= self_prec) { - os() << ")"; + os() << ')'; } os() << to_string(cmp_op); if (rhs_prec >= self_prec) { - os() << "("; + os() << '('; } v->rhs()->accept(this); if (rhs_prec >= self_prec) { - os() << ")"; + os() << ')'; } os() << " ? 
"; auto withParens = [&](const ExprPtr& e) { auto prec = getPrecedence(e->expr_type()); if (prec >= self_prec) { - os() << "("; + os() << '('; } e->accept(this); if (prec >= self_prec) { - os() << ")"; + os() << ')'; } }; withParens(v->ret_val1()); @@ -237,16 +237,16 @@ AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_PRINT_VISIT) void IRPrinter::visit(const CastPtr& v) { auto dtype = v->dtype(); - os() << dtypeToCppString(dtype) << "("; + os() << dtypeToCppString(dtype) << '('; v->src_value()->accept(this); - os() << ")"; + os() << ')'; } void IRPrinter::visit(const BitCastPtr& v) { auto dtype = v->dtype(); os() << "BitCast<" << dtype.ToCppString() << ">("; v->src_value()->accept(this); - os() << ")"; + os() << ')'; } void IRPrinter::visit(const VarPtr& v) { @@ -273,7 +273,7 @@ void IRPrinter::visit(const BufPtr& v) { } s->accept(this); } - os() << "]"; + os() << ']'; os() << ", strides=["; i = 0; for (const ExprPtr& s : v->strides()) { @@ -282,14 +282,14 @@ void IRPrinter::visit(const BufPtr& v) { } s->accept(this); } - os() << "]"; + os() << ']'; - os() << ")"; + os() << ')'; } void IRPrinter::visit(const RampPtr& v) { os() << "Ramp(" << *v->base() << ", " << *v->stride() << ", " << v->lanes() - << ")"; + << ')'; } void IRPrinter::visit(const LoadPtr& v) { @@ -297,7 +297,7 @@ void IRPrinter::visit(const LoadPtr& v) { if (v->indices().empty()) { os() << *v->base_handle(); } else { - os() << *v->base_handle() << "["; + os() << *v->base_handle() << '['; size_t i = 0; for (const ExprPtr& ind : v->indices()) { if (i++) { @@ -306,40 +306,40 @@ void IRPrinter::visit(const LoadPtr& v) { ind->accept(this); } if (v->indices().empty()) { - os() << "0"; + os() << '0'; } - os() << "]"; + os() << ']'; } } void IRPrinter::visit(const BroadcastPtr& v) { - os() << "Broadcast(" << *v->value() << ", " << v->lanes() << ")"; + os() << "Broadcast(" << *v->value() << ", " << v->lanes() << ')'; } void IRPrinter::visit(const IfThenElsePtr& v) { os() << "IfThenElse(" << *v->condition() << ", " << *v->true_value() << ", " - << *v->false_value() << ")"; + << *v->false_value() << ')'; } void IRPrinter::visit(const IntrinsicsPtr& v) { - os() << v->func_name() << "("; + os() << v->func_name() << '('; for (const auto i : c10::irange(v->nparams())) { if (i > 0) { os() << ", "; } os() << *v->param(i); } - os() << ")"; + os() << ')'; } void IRPrinter::visit(const TermPtr& v) { os() << "Term("; v->scalar()->accept(this); for (const auto& t : v->variables()) { - os() << ","; + os() << ','; t->accept(this); } - os() << ")"; + os() << ')'; } void IRPrinter::visit(const PolynomialPtr& v) { @@ -357,7 +357,7 @@ void IRPrinter::visit(const PolynomialPtr& v) { os() << " + "; } v->scalar()->accept(this); - os() << ")"; + os() << ')'; } void IRPrinter::visit(const RoundOffPtr& v) { @@ -365,7 +365,7 @@ void IRPrinter::visit(const RoundOffPtr& v) { v->lhs()->accept(this); os() << ", "; v->rhs()->accept(this); - os() << ")"; + os() << ')'; } void IRPrinter::visit(const MaxTermPtr& v) { @@ -380,7 +380,7 @@ void IRPrinter::visit(const MaxTermPtr& v) { os() << ", "; } } - os() << ")"; + os() << ')'; } void IRPrinter::visit(const MinTermPtr& v) { @@ -395,7 +395,7 @@ void IRPrinter::visit(const MinTermPtr& v) { os() << ", "; } } - os() << ")"; + os() << ')'; } void IRPrinter::visit(const ReduceOpPtr& v) { @@ -423,11 +423,11 @@ void IRPrinter::visit(const ReduceOpPtr& v) { void IRPrinter::visit(const StorePtr& v) { // TODO: handle the mask if (v->indices().empty()) { - os() << *v->base_handle() << " = " << *v->value() << ";"; + os() << 
*v->base_handle() << " = " << *v->value() << ';'; return; } - os() << *v->base_handle() << "["; + os() << *v->base_handle() << '['; size_t i = 0; for (const ExprPtr& ind : v->indices()) { if (i++) { @@ -436,15 +436,15 @@ void IRPrinter::visit(const StorePtr& v) { ind->accept(this); } if (v->indices().empty()) { - os() << "0"; + os() << '0'; } - os() << "] = " << *v->value() << ";"; + os() << "] = " << *v->value() << ';'; } void IRPrinter::visit(const ForPtr& v) { VarPtr var = v->var(); VarHandle vv(var); - os() << "for (" << dtypeToCppString(var->dtype()) << " " << vv << " = " + os() << "for (" << dtypeToCppString(var->dtype()) << ' ' << vv << " = " << ExprHandle(v->start()) << "; " << vv << " < " << ExprHandle(v->stop()) << "; " << vv << "++) "; std::string loop_options_str = v->loop_options().ToString(); @@ -464,11 +464,11 @@ void IRPrinter::visit(const BlockPtr& v) { for (const StmtPtr& s : *v) { emitIndent(); - os() << *s << "\n"; + os() << *s << '\n'; } indent_--; emitIndent(); - os() << "}"; + os() << '}'; } void IRPrinter::visit(const AllocatePtr& v) { @@ -482,7 +482,7 @@ void IRPrinter::visit(const AllocatePtr& v) { } os() << *dims[i]; } - os() << "]"; + os() << ']'; } void IRPrinter::visit(const FreePtr& v) { @@ -503,13 +503,13 @@ void IRPrinter::visit(const FreeExtPtr& v) { } void IRPrinter::visit(const PlacementAllocatePtr& v) { - os() << "Alias(" << *v->buf()->base_handle() << "," + os() << "Alias(" << *v->buf()->base_handle() << ',' << *v->buf_to_reuse()->base_handle() << ");"; } void IRPrinter::visit(const LetPtr& v) { - os() << dtypeToCppString(v->var()->dtype()) << " " << *v->var(); - os() << " = " << *v->value() << ";"; + os() << dtypeToCppString(v->var()->dtype()) << ' ' << *v->var(); + os() << " = " << *v->value() << ';'; } void IRPrinter::visit(const CondPtr& v) { @@ -530,7 +530,7 @@ void IRPrinter::visit(const CondPtr& v) { } void IRPrinter::visit(const AtomicAddPtr& v) { - os() << "atomicAdd(&" << *v->base_handle() << "["; + os() << "atomicAdd(&" << *v->base_handle() << '['; size_t i = 0; for (const ExprPtr& ind : v->indices()) { if (i++) { @@ -539,7 +539,7 @@ void IRPrinter::visit(const AtomicAddPtr& v) { ind->accept(this); } if (v->indices().empty()) { - os() << "0"; + os() << '0'; } os() << "], " << *v->value() << ");"; } @@ -549,7 +549,7 @@ void IRPrinter::visit(const SyncThreadsPtr& v) { } void IRPrinter::visit(const ExternalCallPtr& v) { - os() << *v->buf() << " = " << v->func_name() << "("; + os() << *v->buf() << " = " << v->func_name() << '('; os() << "buf_args={"; int i = 0; @@ -580,7 +580,7 @@ void IRPrinter::visit(const ExternalCallWithAllocPtr& v) { os() << *buf_out_arg; } - os() << " := " << v->func_name() << "("; + os() << " := " << v->func_name() << '('; os() << "buf_args={"; i = 0; @@ -657,7 +657,7 @@ void print(const ExprPtr& expr) { } else { std::cout << "(null expr)"; } - std::cout << "\n"; + std::cout << '\n'; } void print(const StmtPtr& stmt) { @@ -691,14 +691,14 @@ std::string to_string(const StmtPtr& stmt) { std::string to_string(const Tensor& t) { std::ostringstream oss; // TODO: move this to Buf printer - oss << "Tensor " << t.buf()->name_hint() << "["; + oss << "Tensor " << t.buf()->name_hint() << '['; for (const auto i : c10::irange(t.buf()->ndim())) { if (i != 0) { oss << ", "; } oss << *t.buf()->dim(i); } - oss << "]:\n" << *t.stmt() << "\n"; + oss << "]:\n" << *t.stmt() << '\n'; return oss.str(); } } // namespace std diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index 
7f0888666d3af..cca7efcd0adaf 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -131,9 +131,9 @@ std::string sanitizeName(const std::string& input_name) { } else { if (i == 0) { // Don't start names with underscore - sanitized_name << "v"; + sanitized_name << 'v'; } - sanitized_name << "_"; + sanitized_name << '_'; } } return sanitized_name.str(); diff --git a/torch/csrc/jit/tensorexpr/loopnest_randomization.cpp b/torch/csrc/jit/tensorexpr/loopnest_randomization.cpp index 46a09314fb7bf..3dda98ff0faf6 100644 --- a/torch/csrc/jit/tensorexpr/loopnest_randomization.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest_randomization.cpp @@ -733,7 +733,7 @@ void loopnestRandomization(int64_t seed, LoopNest& l) { } } catch (...) { std::cout << "EXCEPTION THROWN!\n"; - std::cout << "SEED: " << seed << "\n"; + std::cout << "SEED: " << seed << '\n'; throw std::runtime_error("Random test failed"); } message = "End of transformations;\n"; diff --git a/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp b/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp index 73a1c6a4a2d5a..bbd43f0fa8a8c 100644 --- a/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp +++ b/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp @@ -151,7 +151,7 @@ bool AccessInfo::isWrite() const { } void AccessInfo::print() const { - std::cout << id_ << ". " << AccessToString(type_) << ": " << *var_ << "["; + std::cout << id_ << ". " << AccessToString(type_) << ": " << *var_ << '['; if (!bounds_.empty()) { for (size_t i = 0; i < bounds_.size() - 1; ++i) { bounds_[i].print(); @@ -161,30 +161,30 @@ void AccessInfo::print() const { size_t i = bounds_.size() - 1; bounds_[i].print(); } - std::cout << "]"; + std::cout << ']'; if (!dependencies_.empty()) { std::cout << " - depends on: "; for (auto& pair : dependencies_) { - std::cout << pair.second->id() << " "; + std::cout << pair.second->id() << ' '; } } if (!dependents_.empty()) { std::cout << " - dependents: "; for (auto& pair : dependents_) { - std::cout << pair.second.lock()->id() << " "; + std::cout << pair.second.lock()->id() << ' '; } } - std::cout << "\n"; + std::cout << '\n'; } void AccessInfo::dumpDOT(std::ostream& os) const { if (type_ == AccessType::Input || type_ == AccessType::Output || type_ == AccessType::Alloc) { - os << "n" << id_ << " [\n"; - os << "label = \"" << AccessToString(type_) << "\\n " << *var_ << "["; + os << 'n' << id_ << " [\n"; + os << "label = \"" << AccessToString(type_) << "\\n " << *var_ << '['; if (!bounds_.empty()) { for (size_t i = 0; i < bounds_.size() - 1; ++i) { os << *IRSimplifier::simplify( @@ -203,17 +203,17 @@ void AccessInfo::dumpDOT(std::ostream& os) const { os << "\tshape = \"house\"\n"; } } else { - os << "n" << id_ << " [\n"; + os << 'n' << id_ << " [\n"; os << "label = \"" << AccessToString(type_) << " (#" << id_ << ")\\n"; os << "buf : " << *var_ << "\\n"; os << "bounds : ["; if (!bounds_.empty()) { for (size_t i = 0; i < bounds_.size() - 1; ++i) { - os << "(" << *bounds_[i].start << ", " << *bounds_[i].end << "), "; + os << '(' << *bounds_[i].start << ", " << *bounds_[i].end << "), "; } size_t i = bounds_.size() - 1; - os << "(" << *bounds_[i].start << ", " << *bounds_[i].end << ")]"; + os << '(' << *bounds_[i].start << ", " << *bounds_[i].end << ")]"; } os << "\"\n"; os << "\tshape = \"box\"\n"; @@ -228,8 +228,8 @@ void AccessInfo::dumpDOT(std::ostream& os) const { } os << "]\n"; for (auto& pair : dependencies_) { - os << "n" << pair.second->id() << " -> " - << "n" << id_ << " [color=\"" << 
edgeColour << "\"]\n"; + os << 'n' << pair.second->id() << " -> " << 'n' << id_ << " [color=\"" + << edgeColour << "\"]\n"; } } diff --git a/torch/csrc/jit/tensorexpr/registerizer.cpp b/torch/csrc/jit/tensorexpr/registerizer.cpp index 37f79d529238d..9ad44e31a3873 100644 --- a/torch/csrc/jit/tensorexpr/registerizer.cpp +++ b/torch/csrc/jit/tensorexpr/registerizer.cpp @@ -131,17 +131,17 @@ std::shared_ptr AccessInfo::cloneWithHiddenInfo( } void AccessInfo::print() const { - std::cout << "Access: " << *buf_ << "{"; + std::cout << "Access: " << *buf_ << '{'; for (const auto& i : indices_) { - std::cout << *i << " "; + std::cout << *i << ' '; } std::cout << "} stores: " << stores_.size() << " (" << *store_cost_ << ") -"; - std::cout << " loads: " << loads_.size() << " (" << *load_cost_ << ")"; + std::cout << " loads: " << loads_.size() << " (" << *load_cost_ << ')'; if (conditionId_) { std::cout << " cond: " << conditionId_; } - std::cout << "\n"; + std::cout << '\n'; } // Scope diff --git a/torch/csrc/jit/tensorexpr/types.cpp b/torch/csrc/jit/tensorexpr/types.cpp index 0ee8fd4a956bb..f3a62fa374056 100644 --- a/torch/csrc/jit/tensorexpr/types.cpp +++ b/torch/csrc/jit/tensorexpr/types.cpp @@ -57,7 +57,7 @@ Dtype ToDtype(ScalarType type) { TORCH_API std::ostream& operator<<(std::ostream& stream, const Dtype& dtype) { stream << dtype.scalar_type_; if (dtype.lanes() > 1) { - stream << "x" << dtype.lanes(); + stream << 'x' << dtype.lanes(); ; } return stream; diff --git a/torch/csrc/jit/testing/file_check.cpp b/torch/csrc/jit/testing/file_check.cpp index aeac1233e4d23..fb1280400a89d 100644 --- a/torch/csrc/jit/testing/file_check.cpp +++ b/torch/csrc/jit/testing/file_check.cpp @@ -116,7 +116,7 @@ size_t assertFind( const std::string& sub, const Check& check) { return assertFind(search_range, sub, [&](std::ostream& out) { - out << "From " << check << "\n"; + out << "From " << check << '\n'; }); } @@ -156,7 +156,7 @@ size_t assertFindRegex( const std::string& sub, const Check& check) { return assertFindRegex(search_range, sub, [&](std::ostream& out) { - out << "From " << check << "\n"; + out << "From " << check << '\n'; }); } @@ -182,7 +182,7 @@ void assertNotFind( c10::printQuotedString(ss, sub); ss << " but found it\n"; found_range.highlight(ss); - ss << "From " << check << "\n"; + ss << "From " << check << '\n'; throw std::runtime_error(ss.str()); } } @@ -543,7 +543,7 @@ FileCheck::FileCheck() : fcImpl(new FileCheckImpl()) {} std::ostream& operator<<(std::ostream& out, const FileCheckImpl& fc) { out << "FileCheck checks:\n"; for (const Check& c : fc.checks) { - out << "\t" << c << "\n"; + out << '\t' << c << '\n'; } return out; } diff --git a/torch/csrc/lazy/core/debug_util.cpp b/torch/csrc/lazy/core/debug_util.cpp index 3cc35c6d0cf05..2eb448e75f61d 100644 --- a/torch/csrc/lazy/core/debug_util.cpp +++ b/torch/csrc/lazy/core/debug_util.cpp @@ -77,7 +77,7 @@ std::string GetFirstUserFrameInPython() { auto& loc = frames[i - 1]; if (loc.file.find("site-packages") == std::string::npos) { std::stringstream ss; - ss << loc.file << " " << loc.function << " " << loc.line; + ss << loc.file << ' ' << loc.function << ' ' << loc.line; return ss.str(); } } @@ -120,7 +120,7 @@ std::string DebugUtil::GetTensorsGraphInfo( std::vector frames = GetPythonFramesFunction()(); ss << "Python Stacktrace:\n"; for (auto& location : frames) { - ss << " " << location.function << " (" << location.file << ":" + ss << " " << location.function << " (" << location.file << ':' << location.line << ")\n"; } ss << "\nHashes: ("; @@ 
-160,7 +160,7 @@ void DebugUtil::SaveTensorsGraphInfo( std::string info = GetTensorsGraphInfo(tensors, indices, format); std::lock_guard guard(lock); std::ofstream graph_file(save_file, std::ios_base::app); - graph_file << "[" << name << "]\n" << info << "\n"; + graph_file << '[' << name << "]\n" << info << '\n'; } } diff --git a/torch/csrc/lazy/core/ir.cpp b/torch/csrc/lazy/core/ir.cpp index 709b5b028b242..3cd25d2f5e85e 100644 --- a/torch/csrc/lazy/core/ir.cpp +++ b/torch/csrc/lazy/core/ir.cpp @@ -143,7 +143,7 @@ const Output& Node::nullable_operand(size_t i) const { std::string Node::ToString() const { std::stringstream ss; - ss << shapes() << " " << op(); + ss << shapes() << ' ' << op(); if (num_outputs() > 1) { ss << ", num_outputs=" << num_outputs(); } diff --git a/torch/csrc/lazy/core/ir_dump_util.cpp b/torch/csrc/lazy/core/ir_dump_util.cpp index 3f33c4fce2246..b7f959682452c 100644 --- a/torch/csrc/lazy/core/ir_dump_util.cpp +++ b/torch/csrc/lazy/core/ir_dump_util.cpp @@ -137,7 +137,7 @@ std::string GenerateDotNodeLabel( std::stringstream ss; ss << node->op() << "\\n" << node->shape(); for (auto& tag : GetNodeTags(node)) { - ss << "\\n" << tag.name << "="; + ss << "\\n" << tag.name << '='; if (tag.value.size() < kMaxValueSize) { ss << tag.value; } else { @@ -155,27 +155,27 @@ std::string GenerateDotNodeSpec( const Node* node, const std::unordered_map& roots_ids) { std::stringstream ss; - ss << "label=\"" << GenerateDotNodeLabel(node, roots_ids) << "\""; + ss << "label=\"" << GenerateDotNodeLabel(node, roots_ids) << '"'; return ss.str(); } std::string GenerateTextNodeSpec(const Node* node, const NodeIdMap& id_map) { std::stringstream ss; - ss << node->shapes() << " " << node->op() << "("; + ss << node->shapes() << ' ' << node->op() << '('; size_t count = 0; for (auto& output : node->operands()) { if (count > 0) { ss << ", "; } - ss << "%" << id_map.at(output.node); + ss << '%' << id_map.at(output.node); if (output.node->num_outputs() > 1) { - ss << "." << output.index; + ss << '.' 
<< output.index; } ++count; } - ss << ")"; + ss << ')'; for (auto& tag : GetNodeTags(node)) { - ss << ", " << tag.name << "=" << tag.value; + ss << ", " << tag.name << '=' << tag.value; } return ss.str(); } @@ -214,7 +214,7 @@ std::string DumpUtil::PostOrderToDot( if (output.node->num_outputs() > 1) { ss << " [label=\"o=" << output.index << "\"]"; } - ss << "\n"; + ss << '\n'; } } } @@ -242,7 +242,7 @@ std::string DumpUtil::PostOrderToText( ss << ", ROOT=" << *opt_root_id; } ss << ", NodeType=" << typeid(*node).name(); - ss << "\n"; + ss << '\n'; } ss << "}\n"; return ss.str(); diff --git a/torch/csrc/lazy/core/ir_metadata.cpp b/torch/csrc/lazy/core/ir_metadata.cpp index 50aedaca0293b..5da2860ed6cea 100644 --- a/torch/csrc/lazy/core/ir_metadata.cpp +++ b/torch/csrc/lazy/core/ir_metadata.cpp @@ -16,8 +16,8 @@ void EmitShortFrameInfo( } else { ++pos; } - stream << ", location=" << frame.function << "@" << frame.file.substr(pos) - << ":" << frame.line; + stream << ", location=" << frame.function << '@' << frame.file.substr(pos) + << ':' << frame.line; } } @@ -26,7 +26,7 @@ std::ostream& operator<<( const std::vector& frames) { stream << "Frames:\n"; for (auto& location : frames) { - stream << " " << location.function << " (" << location.file << ":" + stream << " " << location.function << " (" << location.file << ':' << location.line << ")\n"; } return stream; diff --git a/torch/csrc/lazy/core/lazy_graph_executor.cpp b/torch/csrc/lazy/core/lazy_graph_executor.cpp index c440357f9e16e..413601f70afd4 100644 --- a/torch/csrc/lazy/core/lazy_graph_executor.cpp +++ b/torch/csrc/lazy/core/lazy_graph_executor.cpp @@ -404,7 +404,7 @@ void LazyGraphExecutor::SyncLiveTensorsGraph( bool wait) { auto tensors = GetLiveTensors(device); VLOG(4) << tensors.size() << " live tensors: devices=(" - << c10::Join(", ", devices) << ")"; + << c10::Join(", ", devices) << ')'; SyncTensorsGraph(&tensors, devices, wait, /*sync_ltc_data=*/true); } diff --git a/torch/csrc/lazy/core/shape_inference.cpp b/torch/csrc/lazy/core/shape_inference.cpp index e7ab494d18e32..ada3a2fed1693 100644 --- a/torch/csrc/lazy/core/shape_inference.cpp +++ b/torch/csrc/lazy/core/shape_inference.cpp @@ -85,7 +85,7 @@ static std::vector expand_param_if_needed( std::ostringstream ss; ss << "expected " << param_name << " to be a single integer value or a " << "list of " << expected_dim << " values to match the convolution " - << "dimensions, but got " << param_name << "=" << list_param; + << "dimensions, but got " << param_name << '=' << list_param; TORCH_CHECK(false, ss.str()); } else { return list_param.vec(); diff --git a/torch/csrc/lazy/core/trie.cpp b/torch/csrc/lazy/core/trie.cpp index a4a5d6f0c8b86..e0e657aae137e 100644 --- a/torch/csrc/lazy/core/trie.cpp +++ b/torch/csrc/lazy/core/trie.cpp @@ -19,7 +19,7 @@ void TraverseTrie(TrieNode* node, std::stringstream& ss) { << ", " << node->hit_counter << " hits\"]\n"; } for (auto& successor : node->successors) { - ss << node->unique_id << " -> " << successor->unique_id << "\n"; + ss << node->unique_id << " -> " << successor->unique_id << '\n'; TraverseTrie(successor.get(), ss); } } diff --git a/torch/csrc/monitor/counters.h b/torch/csrc/monitor/counters.h index 65a0f516a58d3..046c63a78eddb 100644 --- a/torch/csrc/monitor/counters.h +++ b/torch/csrc/monitor/counters.h @@ -226,7 +226,7 @@ class Stat { for (auto& kv : stats) { std::stringstream key; key << name_; - key << "."; + key << '.'; key << aggregationName(kv.first); e.data[key.str()] = kv.second; } diff --git 
a/torch/csrc/profiler/kineto_shim.cpp b/torch/csrc/profiler/kineto_shim.cpp index ec9994e15ec9c..524b84070cbf6 100644 --- a/torch/csrc/profiler/kineto_shim.cpp +++ b/torch/csrc/profiler/kineto_shim.cpp @@ -201,13 +201,13 @@ class ExperimentalConfigWrapper { for (size_t i = 0; i < num_metrics; i++) { configss << config_.profiler_metrics[i]; if (num_metrics > 1 && i < (num_metrics - 1)) { - configss << ","; + configss << ','; } } configss << "\nCUPTI_PROFILER_ENABLE_PER_KERNEL=" << (config_.profiler_measure_per_kernel ? "true" : "false") - << "\n"; - configss << "CUSTOM_CONFIG=" << config_.custom_profiler_config << "\n"; + << '\n'; + configss << "CUSTOM_CONFIG=" << config_.custom_profiler_config << '\n'; LOG(INFO) << "Generated config = " << configss.str(); libkineto::api().activityProfiler().prepareTrace( @@ -236,8 +236,8 @@ static const std::string setTraceID(const std::string& trace_id) { return ""; } std::stringstream configss; - configss << "REQUEST_TRACE_ID=" << trace_id << "\n"; - configss << "REQUEST_GROUP_TRACE_ID=" << trace_id << "\n"; + configss << "REQUEST_TRACE_ID=" << trace_id << '\n'; + configss << "REQUEST_GROUP_TRACE_ID=" << trace_id << '\n'; return configss.str(); } @@ -249,7 +249,7 @@ static const std::string appendCustomConfig( } std::stringstream configss; configss << config; - configss << "CUSTOM_CONFIG=" << custom_profiler_config << "\n"; + configss << "CUSTOM_CONFIG=" << custom_profiler_config << '\n'; return configss.str(); } #endif diff --git a/torch/csrc/profiler/standalone/execution_trace_observer.cpp b/torch/csrc/profiler/standalone/execution_trace_observer.cpp index 5edc59c893d7a..29b2b94af4472 100644 --- a/torch/csrc/profiler/standalone/execution_trace_observer.cpp +++ b/torch/csrc/profiler/standalone/execution_trace_observer.cpp @@ -279,7 +279,7 @@ static std::ofstream openOutputFile(const std::string& name) { std::ofstream stream; stream.open(name, std::ofstream::out | std::ofstream::trunc); if (!stream) { - LOG(ERROR) << "Failed to open '" << name << "'"; + LOG(ERROR) << "Failed to open '" << name << '\''; } else { VLOG(1) << "PyTorch Execution Trace: writing to " << name; } @@ -754,7 +754,7 @@ static void recordOperatorStart( RecordScope::USER_SCOPE), tid, 0); // fw_tid - ob.out << ","; + ob.out << ','; } } @@ -928,7 +928,7 @@ static void onFunctionExit(const RecordFunction& fn, ObserverContext* ctx_ptr) { fc.kernelFile, fc.get_string_for_tensor_range(), additiona_attrs); - ob->out << ","; + ob->out << ','; } } catch (const std::exception& e) { LOG(WARNING) << "Exception in execution trace observer: [" << fc.name @@ -977,7 +977,7 @@ bool addExecutionTraceObserver(const std::string& output_file_path) { // 5 is the length of ".json" ob.resourceDir.replace(ext_pos, 5, "_resources/"); VLOG(1) << "Execution trace resource directory: " << ob.resourceDir - << "\n"; + << '\n'; } else { LOG(WARNING) << "Execution trace output file does not end with \".json\"."; diff --git a/torch/csrc/profiler/stubs/cuda.cpp b/torch/csrc/profiler/stubs/cuda.cpp index 2b634b0303c26..45c288b976ae2 100644 --- a/torch/csrc/profiler/stubs/cuda.cpp +++ b/torch/csrc/profiler/stubs/cuda.cpp @@ -21,7 +21,7 @@ namespace { static void cudaCheck(cudaError_t result, const char* file, int line) { if (result != cudaSuccess) { std::stringstream ss; - ss << file << ":" << line << ": "; + ss << file << ':' << line << ": "; if (result == cudaErrorInitializationError) { // It is common for users to use DataLoader with multiple workers // and the autograd profiler. Throw a nice error message here. 
diff --git a/torch/csrc/profiler/unwind/action.h b/torch/csrc/profiler/unwind/action.h index 1a8373d9dfe14..5a982cfd046a0 100644 --- a/torch/csrc/profiler/unwind/action.h +++ b/torch/csrc/profiler/unwind/action.h @@ -40,16 +40,16 @@ struct Action { friend std::ostream& operator<<(std::ostream& out, const Action& self) { switch (self.kind) { case A_UNDEFINED: - out << "u"; + out << 'u'; break; case A_REG_PLUS_DATA: - out << "r" << (int)self.reg << " + " << self.data; + out << 'r' << (int)self.reg << " + " << self.data; break; case A_REG_PLUS_DATA_DEREF: - out << "*(r" << (int)self.reg << " + " << self.data << ")"; + out << "*(r" << (int)self.reg << " + " << self.data << ')'; break; case A_LOAD_CFA_OFFSET: - out << "*(cfa + " << self.data << ")"; + out << "*(cfa + " << self.data << ')'; break; } return out; diff --git a/torch/csrc/profiler/unwind/eh_frame_hdr.h b/torch/csrc/profiler/unwind/eh_frame_hdr.h index 740f4beb2c85c..5884685433b0d 100644 --- a/torch/csrc/profiler/unwind/eh_frame_hdr.h +++ b/torch/csrc/profiler/unwind/eh_frame_hdr.h @@ -81,7 +81,7 @@ struct EHFrameHdr { friend std::ostream& operator<<(std::ostream& out, const EHFrameHdr& self) { out << "EHFrameHeader(version=" << self.version_ << ",table_size=" << self.table_size_ - << ",fde_count=" << self.fde_count_ << ")"; + << ",fde_count=" << self.fde_count_ << ')'; return out; } diff --git a/torch/csrc/profiler/unwind/fde.h b/torch/csrc/profiler/unwind/fde.h index 083578ec391e5..ffb06b5ab1f46 100644 --- a/torch/csrc/profiler/unwind/fde.h +++ b/torch/csrc/profiler/unwind/fde.h @@ -17,7 +17,7 @@ struct TableState { out << "cfa = " << self.cfa << "; "; for (auto r : c10::irange(self.registers.size())) { if (self.registers.at(r).kind != A_UNDEFINED) { - out << "r" << r << " = " << self.registers.at(r) << "; "; + out << 'r' << r << " = " << self.registers.at(r) << "; "; } } return out; @@ -110,21 +110,21 @@ struct FDE { auto previous_pc = current_pc_; current_pc_ += amount; if (LOG) { - (*out_) << (void*)(previous_pc - load_bias_) << "-" - << (void*)(current_pc_ - load_bias_) << ": " << state() << "\n"; + (*out_) << (void*)(previous_pc - load_bias_) << '-' + << (void*)(current_pc_ - load_bias_) << ": " << state() << '\n'; } } void advance_loc(int64_t amount) { if (LOG) { - (*out_) << "advance_loc " << amount << "\n"; + (*out_) << "advance_loc " << amount << '\n'; } advance_raw(amount * code_alignment_factor_); } void offset(int64_t reg, int64_t offset) { if (LOG) { - (*out_) << "offset " << reg << " " << offset << "\n"; + (*out_) << "offset " << reg << ' ' << offset << '\n'; } if (reg > (int64_t)state().registers.size()) { if (LOG) { @@ -138,7 +138,7 @@ struct FDE { void restore(int64_t reg) { if (LOG) { - (*out_) << "restore " << reg << "\n"; + (*out_) << "restore " << reg << '\n'; } if (reg > (int64_t)state().registers.size()) { if (LOG) { @@ -151,7 +151,7 @@ struct FDE { void def_cfa(int64_t reg, int64_t off) { if (LOG) { - (*out_) << "def_cfa " << reg << " " << off << "\n"; + (*out_) << "def_cfa " << reg << ' ' << off << '\n'; } last_reg_ = reg; last_offset_ = off; @@ -179,13 +179,13 @@ struct FDE { void undefined(int64_t reg) { if (LOG) { - (*out_) << "undefined " << reg << "\n"; + (*out_) << "undefined " << reg << '\n'; } state().registers.at(reg) = Action::undefined(); } void register_(int64_t reg, int64_t rhs_reg) { if (LOG) { - (*out_) << "register " << reg << " " << rhs_reg << "\n"; + (*out_) << "register " << reg << ' ' << rhs_reg << '\n'; } state().registers.at(reg) = Action::regPlusData(static_cast(reg), 0); @@ -214,7 
+214,7 @@ struct FDE { if (LOG) { // NOLINTNEXTLINE(performance-no-int-to-ptr) (*out_) << "readUpTo " << (void*)addr << " for " << library_name_ - << " at " << (void*)load_bias_ << "\n"; + << " at " << (void*)load_bias_ << '\n'; } state_stack_.emplace_back(); current_pc_ = low_pc_; @@ -245,8 +245,8 @@ struct FDE { } void dumpAddr2Line() { - std::cout << "addr2line -f -e " << library_name_ << " " - << (void*)(low_pc_ - load_bias_) << "\n"; + std::cout << "addr2line -f -e " << library_name_ << ' ' + << (void*)(low_pc_ - load_bias_) << '\n'; } void readInstruction(Lexer& L) { diff --git a/torch/csrc/profiler/unwind/unwind.cpp b/torch/csrc/profiler/unwind/unwind.cpp index 2b30df4e2a60e..db7e8a60e4a19 100644 --- a/torch/csrc/profiler/unwind/unwind.cpp +++ b/torch/csrc/profiler/unwind/unwind.cpp @@ -354,7 +354,7 @@ struct Symbolizer { entry.queried.push_back(addr); auto libaddress = maybe_library->second - 1; // NOLINTNEXTLINE(performance-no-int-to-ptr) - entry.comm->out() << (void*)libaddress << "\n"; + entry.comm->out() << (void*)libaddress << '\n'; // we need to make sure we don't write more than 64k bytes to // a pipe before reading the results. Otherwise the buffer may // get filled and block before we read the results. diff --git a/torch/csrc/profiler/util.cpp b/torch/csrc/profiler/util.cpp index d266958e2cb63..b547bc528da55 100644 --- a/torch/csrc/profiler/util.cpp +++ b/torch/csrc/profiler/util.cpp @@ -145,7 +145,7 @@ std::vector callstackStr(const std::vector& cs) { cs_str.reserve(cs.size()); for (const auto& entry : cs) { std::stringstream loc; - loc << entry.filename << "(" << entry.line << "): " << entry.funcname; + loc << entry.filename << '(' << entry.line << "): " << entry.funcname; cs_str.push_back(loc.str()); } return cs_str; @@ -310,11 +310,11 @@ std::string ivalueToStr(const c10::IValue& val, bool isString) { } else { ss.str(""); if (isString) { - ss << "\""; + ss << '"'; } ss << val; if (isString) { - ss << "\""; + ss << '"'; } std::string mystr = ss.str(); @@ -934,7 +934,7 @@ int getTensorStartHint(const at::Tensor& t) { bool checkFunctionOutputsForLogging(const at::RecordFunction& fn) { const auto& outputs = fn.outputs(); auto num_outputs = fn.num_outputs(); - VLOG(2) << "outputs: " << num_outputs << " " << outputs.size() << '\n'; + VLOG(2) << "outputs: " << num_outputs << ' ' << outputs.size() << '\n'; // We have two cases: for unboxed kernel, we have num_outputs == // outputs.size() for boxed kernel using stack, there could be more elements // on the stack from previous ops. @@ -948,7 +948,7 @@ bool checkFunctionOutputsForLogging(const at::RecordFunction& fn) { bool checkFunctionInputsForLogging(const at::RecordFunction& fn) { auto num_inputs = fn.num_inputs(); const auto inputs = fn.inputs(); - VLOG(2) << "inputs: " << num_inputs << " " << inputs.size() << '\n'; + VLOG(2) << "inputs: " << num_inputs << ' ' << inputs.size() << '\n'; // We have two cases: for unboxed kernel, we have num_inputs == // inputs.size() for boxed kernel using stack, there could be more elements // on the stack from previous ops. diff --git a/torch/csrc/tensor/python_tensor.cpp b/torch/csrc/tensor/python_tensor.cpp index ad418955e0559..d4c810d95c608 100644 --- a/torch/csrc/tensor/python_tensor.cpp +++ b/torch/csrc/tensor/python_tensor.cpp @@ -218,7 +218,7 @@ static void py_initialize_tensor_type( static std::string get_name(Backend backend, ScalarType scalarType) { std::ostringstream ss; - ss << torch::utils::backend_to_string(backend) << "." 
<< toString(scalarType) + ss << torch::utils::backend_to_string(backend) << '.' << toString(scalarType) << "Tensor"; return ss.str(); } diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp index 79994eeb8621e..e89f7887320a0 100644 --- a/torch/csrc/utils/python_arg_parser.cpp +++ b/torch/csrc/utils/python_arg_parser.cpp @@ -663,20 +663,20 @@ auto handle_torch_function_no_python_arg_parser( std::stringstream ss; ss << "Multiple dispatch failed for '"; if (module_name && func_name) { - ss << module_name << "." << func_name; + ss << module_name << '.' << func_name; } else { py::handle fn = torch_api_function; - ss << py::str(fn.attr("__module__")) << "." + ss << py::str(fn.attr("__module__")) << '.' << py::str(fn.attr("__name__")); } ss << "'; all " << torch_function_name_str << " handlers returned NotImplemented:\n\n"; if (mode_obj) { - ss << " - mode object " << py::repr(mode_obj) << "\n"; + ss << " - mode object " << py::repr(mode_obj) << '\n'; } for (auto& arg : overloaded_args) { ss << " - tensor subclass " << py::repr(get_type_of_overloaded_arg(arg)) - << "\n"; + << '\n'; } ss << "\nFor more information, try re-running with TORCH_LOGS=not_implemented"; const std::string& tmp = ss.str(); @@ -1542,7 +1542,7 @@ std::string FunctionSignature::toString() const { // optionals, etc. std::ostringstream ss; bool keyword_already = false; - ss << "("; + ss << '('; int i = 0; for (auto& param : params) { if (i != 0) { @@ -1552,13 +1552,13 @@ std::string FunctionSignature::toString() const { ss << "*, "; keyword_already = true; } - ss << param.type_name() << " " << param.name; + ss << param.type_name() << ' ' << param.name; if (param.optional) { ss << " = " << param.default_value; } i++; } - ss << ")"; + ss << ')'; return ss.str(); } diff --git a/torch/csrc/utils/python_dispatch.cpp b/torch/csrc/utils/python_dispatch.cpp index f97b6ac0ba9b1..3380bb0a13e57 100644 --- a/torch/csrc/utils/python_dispatch.cpp +++ b/torch/csrc/utils/python_dispatch.cpp @@ -692,7 +692,7 @@ void initDispatchBindings(PyObject* module) { std::stringstream ss; ss << op.name; if (!op.overload_name.empty()) { - ss << "." << op.overload_name; + ss << '.' << op.overload_name; } names.emplace_back(std::move(ss).str()); } diff --git a/torch/csrc/utils/structseq.cpp b/torch/csrc/utils/structseq.cpp index 29d20d5a9bfe2..2e804aa44bad9 100644 --- a/torch/csrc/utils/structseq.cpp +++ b/torch/csrc/utils/structseq.cpp @@ -66,7 +66,7 @@ PyObject* returned_structseq_repr(PyStructSequence* obj) { ss << ",\n"; } } - ss << ")"; + ss << ')'; return PyUnicode_FromString(ss.str().c_str()); } diff --git a/torch/csrc/utils/tensor_types.cpp b/torch/csrc/utils/tensor_types.cpp index d696a0cdf4ddd..c46baea82a442 100644 --- a/torch/csrc/utils/tensor_types.cpp +++ b/torch/csrc/utils/tensor_types.cpp @@ -66,14 +66,14 @@ const char* backend_to_string(const at::Backend& backend) { std::string options_to_string(const at::TensorOptions& options) { std::ostringstream ss; - ss << backend_to_string(options.backend()) << "." + ss << backend_to_string(options.backend()) << '.' << toString(at::typeMetaToScalarType(options.dtype())) << "Tensor"; return ss.str(); } std::string type_to_string(const at::DeprecatedTypeProperties& type) { std::ostringstream ss; - ss << backend_to_string(type.backend()) << "." << toString(type.scalarType()) + ss << backend_to_string(type.backend()) << '.' 
<< toString(type.scalarType()) << "Tensor"; return ss.str(); } diff --git a/torch/csrc/xpu/Module.cpp b/torch/csrc/xpu/Module.cpp index b3d1dd929a216..ba5998ba3d3ce 100644 --- a/torch/csrc/xpu/Module.cpp +++ b/torch/csrc/xpu/Module.cpp @@ -367,7 +367,7 @@ static void registerXpuDeviceProperties(PyObject* module) { << ", sub_group_sizes=[" << prop.sub_group_sizes << "], has_fp16=" << prop.has_fp16 << ", has_fp64=" << prop.has_fp64 - << ", has_atomic64=" << prop.has_atomic64 << ")"; + << ", has_atomic64=" << prop.has_atomic64 << ')'; return stream.str(); }); } diff --git a/torch/nativert/executor/OpKernel.cpp b/torch/nativert/executor/OpKernel.cpp index ee4a8503d5ce2..fa628733804a4 100644 --- a/torch/nativert/executor/OpKernel.cpp +++ b/torch/nativert/executor/OpKernel.cpp @@ -65,7 +65,7 @@ std::string readableArgs( } else { ss << arg; } - ss << "\n"; + ss << '\n'; } return ss.str(); } diff --git a/torch/nativert/executor/memory/FunctionSchema.cpp b/torch/nativert/executor/memory/FunctionSchema.cpp index 264ed702cbc0d..80347dad2965a 100644 --- a/torch/nativert/executor/memory/FunctionSchema.cpp +++ b/torch/nativert/executor/memory/FunctionSchema.cpp @@ -11,8 +11,8 @@ bool FunctionSchema::alias(size_t input_idx, size_t output_idx) const { } } - VLOG(1) << "checking aliasing spec for " << c10_fn_schema_.name() << " " - << (c10_fn_schema_.is_varret() ? "varret" : "non-varret") << " " + VLOG(1) << "checking aliasing spec for " << c10_fn_schema_.name() << ' ' + << (c10_fn_schema_.is_varret() ? "varret" : "non-varret") << ' ' << (c10_fn_schema_.is_vararg() ? "vararg" : "non-vararg"); if (!aliasing_spec_.empty()) { diff --git a/torch/nativert/graph/Graph.cpp b/torch/nativert/graph/Graph.cpp index 260af58a2a492..47d082f44332f 100644 --- a/torch/nativert/graph/Graph.cpp +++ b/torch/nativert/graph/Graph.cpp @@ -1031,7 +1031,7 @@ std::ostream& operator<<(std::ostream& out, const Constant& constant) { } else if constexpr (is_same_v) { out << kLayoutPrefix << arg; } else if constexpr (is_same_v) { - out << kDevicePrefix << "{" << arg << "}"; + out << kDevicePrefix << '{' << arg << '}'; } else if constexpr (is_same_v>) { out << fmt::format("[{}]", fmt::join(arg, ",")); } else if constexpr (is_same_v>) { @@ -1054,16 +1054,16 @@ void printValue(std::ostream& out, const Value* v) { } void printNamedArgument(std::ostream& out, const NamedArgument& nv) { - out << nv.name << "=" << *nv.value; + out << nv.name << '=' << *nv.value; } void printAttribute(std::ostream& out, const Attribute& nv) { - out << nv.name << "=" << nv.value; + out << nv.name << '=' << nv.value; } } // namespace std::ostream& operator<<(std::ostream& out, const Value& v) { - out << "%" << v.name(); + out << '%' << v.name(); // If a list, distinguish it by adding a [] // Looks like %my_list[] if (v.type() == Type::Kind::TensorList) { @@ -1085,14 +1085,14 @@ std::ostream& operator<<(std::ostream& out, const Node& node) { printList(out, false, node.inputs(), [](std::ostream& out, const auto& nv) { out << *nv.value; }); - out << ")"; + out << ')'; return out; } printList(out, false, node.outputs_, printValue); out << " = "; - out << node.target_ << "("; + out << node.target_ << '('; printList(out, false, node.inputs_, printNamedArgument); if (!node.inputs_.empty() && !node.attributes_.empty()) { // Emit a connective ',' between inputs and attributes. 
@@ -1100,13 +1100,13 @@ std::ostream& operator<<(std::ostream& out, const Node& node) { } printList(out, false, node.attributes_, printAttribute); - out << ")"; + out << ')'; return out; } std::ostream& operator<<(std::ostream& out, const Graph& graph) { for (const auto& node : graph.nodes_) { - out << node << "\n"; + out << node << '\n'; } return out; } diff --git a/torch/nativert/graph/GraphSignature.cpp b/torch/nativert/graph/GraphSignature.cpp index cd07af807198f..569fff36a945c 100644 --- a/torch/nativert/graph/GraphSignature.cpp +++ b/torch/nativert/graph/GraphSignature.cpp @@ -313,7 +313,7 @@ GraphSignature::GraphSignature(const torch::_export::GraphSignature& storage) { } if (FLAGS_caffe2_log_level > 2) { - std::cout << *this << "\n"; + std::cout << *this << '\n'; } } @@ -401,14 +401,14 @@ std::ostream& operator<<(std::ostream& out, const GraphSignature& sig) { if (!sig.inputsToParameters().empty()) { out << "inputsToParameters: {\n"; for (const auto& [inputName, paramName] : sig.inputsToParameters()) { - out << "\t" << inputName << " : " << paramName << "\n"; + out << '\t' << inputName << " : " << paramName << '\n'; } out << "}\n"; } if (!sig.inputsToBuffers().empty()) { out << "inputsToBuffers: {\n"; for (const auto& [inputName, bufferName] : sig.inputsToBuffers()) { - out << "\t" << inputName << " : " << bufferName << "\n"; + out << '\t' << inputName << " : " << bufferName << '\n'; } out << "}\n"; } @@ -416,28 +416,28 @@ std::ostream& operator<<(std::ostream& out, const GraphSignature& sig) { out << "inputsToTensorConstants: {\n"; for (const auto& [inputName, tensorConstantName] : sig.inputsToTensorConstants()) { - out << "\t" << inputName << " : " << tensorConstantName << "\n"; + out << '\t' << inputName << " : " << tensorConstantName << '\n'; } out << "}\n"; } if (!sig.inputsToCustomObjs().empty()) { out << "inputsToCustomObjs: {\n"; for (const auto& [inputName, customObjName] : sig.inputsToCustomObjs()) { - out << "\t" << inputName << " : " << customObjName << "\n"; + out << '\t' << inputName << " : " << customObjName << '\n'; } out << "}\n"; } if (!sig.userOutputs().empty()) { out << "userOutputs: {\n"; for (const auto& outputName : sig.userOutputs()) { - out << "\t" << outputName.value_or("Constant") << "\n"; + out << '\t' << outputName.value_or("Constant") << '\n'; } out << "}\n"; } if (!sig.buffersToMutate().empty()) { out << "buffersToMutate: {\n"; for (const auto& [outputName, mutatedBufferName] : sig.buffersToMutate()) { - out << "\t" << outputName << " : " << mutatedBufferName << "\n"; + out << '\t' << outputName << " : " << mutatedBufferName << '\n'; } out << "}\n"; } @@ -445,7 +445,7 @@ std::ostream& operator<<(std::ostream& out, const GraphSignature& sig) { out << "userInputsToMutate: {\n"; for (const auto& [outputName, mutatedUserInputName] : sig.userInputsToMutate()) { - out << "\t" << outputName << " : " << mutatedUserInputName << "\n"; + out << '\t' << outputName << " : " << mutatedUserInputName << '\n'; } out << "}\n"; } @@ -453,7 +453,7 @@ std::ostream& operator<<(std::ostream& out, const GraphSignature& sig) { if (!sig.gradientsToParameters().empty()) { out << "gradientsToParameters: {\n"; for (const auto& [outputName, paramName] : sig.gradientsToParameters()) { - out << "\t" << outputName << " : " << paramName << "\n"; + out << '\t' << outputName << " : " << paramName << '\n'; } out << "}\n"; } @@ -461,11 +461,11 @@ std::ostream& operator<<(std::ostream& out, const GraphSignature& sig) { out << "gradientsToUserInputs: {\n"; for (const auto& [outputName, 
userInputName] : sig.gradientsToUserInputs()) { - out << "\t" << outputName << " : " << userInputName << "\n"; + out << '\t' << outputName << " : " << userInputName << '\n'; } out << "}\n"; } - out << "lossOutput: " << sig.lossOutput() << "\n"; + out << "lossOutput: " << sig.lossOutput() << '\n'; } return out; } diff --git a/torch/nativert/graph/passes/pass_manager/PassManager.cpp b/torch/nativert/graph/passes/pass_manager/PassManager.cpp index e023f223ed6f1..4dbb0012877d8 100644 --- a/torch/nativert/graph/passes/pass_manager/PassManager.cpp +++ b/torch/nativert/graph/passes/pass_manager/PassManager.cpp @@ -35,7 +35,7 @@ bool GraphPassManager::run_pass(Graph* graph, const GraphPassIdentifier& name) { bool GraphPassManager::pass_pre_run_hook(Graph* graph, const GraphPass& pass) { if (opts_.logGraphBetweenPasses()) { - LOG(INFO) << "Before pass: " << pass.name() << "\n" + LOG(INFO) << "Before pass: " << pass.name() << '\n' << graph->toString() << "-------------------------"; } return false; @@ -43,7 +43,7 @@ bool GraphPassManager::pass_pre_run_hook(Graph* graph, const GraphPass& pass) { bool GraphPassManager::pass_post_run_hook(Graph* graph, const GraphPass& pass) { if (opts_.logGraphBetweenPasses()) { - LOG(INFO) << "After pass: " << pass.name() << "\n" + LOG(INFO) << "After pass: " << pass.name() << '\n' << graph->toString() << "-------------------------"; } return false; From 5d99a795f54d6bf14e39ae12df58d760d4fd8984 Mon Sep 17 00:00:00 2001 From: Erxin Shang Date: Sun, 16 Nov 2025 14:15:28 +0000 Subject: [PATCH 02/47] [xpu][test] Migrated two test files to XPU (#166684) # Description Fixes #114850, we will port test utils and schema check to Intel GPU We could enable Intel GPU with following methods and try the best to keep the original code styles: # Changes 1. Get device type with from accelerator and get_devtype helper method 2. Replace the requires cuda statement to device_type. 3. Add HAS_XPU and HAS GPU check to replace some of the HAS_XPU etc. # Notify Pull Request resolved: https://github.com/pytorch/pytorch/pull/166684 Approved by: https://github.com/ezyang, https://github.com/guangyey Co-authored-by: Yu, Guangye <106960996+guangyey@users.noreply.github.com> --- test/test_utils.py | 94 +++++++++++++++++++++------------------ torch/utils/checkpoint.py | 5 ++- 2 files changed, 55 insertions(+), 44 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index f6bdc156c122e..ab2f133ca3f7e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -53,8 +53,10 @@ # sharding on sandcastle. 
This line silences flake warnings load_tests = load_tests # noqa: PLW0127 -HAS_CUDA = torch.cuda.is_available() - +device_type = ( + acc.type if (acc := torch.accelerator.current_accelerator(True)) else "cpu" +) +TEST_GPU = torch.xpu.is_available() or torch.cuda.is_available() from torch.testing._internal.common_utils import run_tests, TestCase @@ -302,24 +304,24 @@ def run_fn(input): self.assertEqual(grad_with_checkpointing, grad_no_checkpointing) - @unittest.skipIf(not HAS_CUDA, "No CUDA") - def test_checkpoint_rng_cuda(self): + @unittest.skipIf(not TEST_GPU, "No accelerator") + def test_checkpoint_rng_gpu(self): for _ in range(5): - inp = torch.randn(20000, device="cuda").requires_grad_() + inp = torch.randn(20000, device=device_type).requires_grad_() phase1 = torch.nn.Dropout() phase2 = torch.nn.Dropout() def run_fn(input): return phase2(input) - state = torch.cuda.get_rng_state() + state = torch.get_device_module(device_type).get_rng_state() out = phase1(inp) out = checkpoint(run_fn, out, use_reentrant=True) out.sum().backward() grad_with_checkpointing = inp.grad - torch.cuda.set_rng_state(state) + torch.get_device_module(device_type).set_rng_state(state) inp.grad = None @@ -330,9 +332,9 @@ def run_fn(input): self.assertEqual(grad_with_checkpointing, grad_no_checkpointing) - @unittest.skipIf(not HAS_CUDA, "No CUDA") + @unittest.skipIf(not TEST_GPU, "No accelerator") def test_checkpoint_not_preserve_rng_state_and_without_reentrant(self): - inp = torch.randn(2, device="cuda").requires_grad_() + inp = torch.randn(2, device=device_type).requires_grad_() layer = torch.nn.Dropout() def run_fn(input): @@ -435,10 +437,10 @@ def run_fn2(tensor1, tensor2): out = checkpoint(run_fn2, input_var, input_var2, use_reentrant=True) out.sum().backward() - @unittest.skipIf(not torch.cuda.is_available(), "Test requires CUDA") + @unittest.skipIf(not TEST_GPU, "No accelerator") def test_checkpointing_without_reentrant_early_free(self): # I don't know how to check if the temporary saved variable buffer - # get de-allocated directly. So using cuda memory usage as a proxy + # get de-allocated directly. 
So using GPU memory usage as a proxy def _do_test(fn, should_free): stats: list[int] = [] @@ -449,8 +451,8 @@ def track(x, idx): # emptied at each step) def hook(_unused): self.assertEqual(len(stats), idx) - torch.cuda.synchronize() - stats.append(torch.cuda.memory_allocated()) + torch.accelerator.synchronize() + stats.append(torch.accelerator.memory_allocated()) if idx > 0: if should_free: self.assertLess(stats[idx], stats[idx - 1]) @@ -475,7 +477,7 @@ def test_fn(x): return stats - x = torch.zeros(10, device="cuda", requires_grad=True) + x = torch.zeros(10, device=device_type, requires_grad=True) x.grad = torch.zeros_like(x) # In a regular backward, buffers get eagerly freed @@ -505,8 +507,8 @@ def test_fn(x): @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") def test_get_device_states_recursive(self): inp = { - "foo": torch.rand(10, device="cuda:0"), - "bar": [torch.rand(10, device="cuda:1")], + "foo": torch.rand(10, device=f"{device_type}:0"), + "bar": [torch.rand(10, device=f"{device_type}:1")], } device_ids, device_states = get_device_states(inp) self.assertEqual(2, len(device_ids)) @@ -522,42 +524,42 @@ def test_infer_device_state_recursive_meta(self): self.assertEqual("meta", device_type) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") - def test_infer_device_state_recursive_multi_cuda(self): - # Check that no warning is issued for either cuda:0, cuda:1 or - # cuda:0, cuda:0 cases since they are both the same device type + def test_infer_device_state_recursive_multi_gpu(self): + # Check that no warning is issued for either gpu:0, gpu:1 or + # gpu:0, gpu:0 cases since they are both the same device type inp = { - "foo": torch.rand(10, device="cuda:0"), - "bar": [torch.rand(10, device="cuda:1")], + "foo": torch.rand(10, device=f"{device_type}:0"), + "bar": [torch.rand(10, device=f"{device_type}:1")], } with warnings.catch_warnings(): warnings.simplefilter("error") - device_type = _infer_device_type(inp) - self.assertEqual("cuda", device_type) + _device_type = _infer_device_type(inp) + self.assertEqual(device_type, _device_type) inp = { - "foo": torch.rand(10, device="cuda:0"), - "bar": [torch.rand(10, device="cuda:0")], + "foo": torch.rand(10, device=f"{device_type}:0"), + "bar": [torch.rand(10, device=f"{device_type}:0")], } with warnings.catch_warnings(): warnings.simplefilter("error") - device_type = _infer_device_type(inp) - self.assertEqual("cuda", device_type) - # Check that a warning is issued for cuda:0, meta and that it includes + _device_type = _infer_device_type(inp) + self.assertEqual(device_type, _device_type) + # Check that a warning is issued for gpu:0, meta and that it includes # device type information inp = { - "foo": torch.rand(10, device="cuda:0"), + "foo": torch.rand(10, device=f"{device_type}:0"), "bar": [torch.rand(10, device="meta")], } with warnings.catch_warnings(record=True) as w: - device_type = _infer_device_type(inp) - self.assertEqual("cuda", device_type) + _device_type = _infer_device_type(inp) + self.assertEqual(device_type, _device_type) self.assertEqual(len(w), 1) warning_msg = str(w[-1].message) self.assertTrue( "Tensor arguments, excluding CPU tensors, are detected on at least two types of devices" in warning_msg ) - self.assertTrue("Device types: ['cuda', 'meta']" in warning_msg) - self.assertTrue("first device type: cuda" in warning_msg) + self.assertTrue(f"Device types: ['{device_type}', 'meta']" in warning_msg) + self.assertTrue(f"first device type: {device_type}" in warning_msg) class TestDataLoaderUtils(TestCase): 
@@ -604,7 +606,7 @@ def test_single_drop(self): self.assertEqual(len(list(dataiter)), 1) @unittest.skip( - "FIXME: Intermittent CUDA out-of-memory error on Windows and time-out under ASAN" + "FIXME: Intermittent GPU out-of-memory error on Windows and time-out under ASAN" ) def test_multi_keep(self): dataloader: DataLoader = DataLoader( @@ -861,27 +863,33 @@ def test_get_default_device(self): @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") def test_get_default_device_more(self): try: - torch.set_default_device("cuda") + torch.set_default_device(device_type) self.assertEqual(torch.get_default_device(), torch.tensor([]).device) torch.set_default_device(None) - torch.set_default_device("cuda") - torch.cuda.set_device("cuda:1") + torch.set_default_device(device_type) + torch.get_device_module(device_type).set_device(f"{device_type}:1") + self.assertEqual(torch.get_default_device(), torch.tensor([]).device) + torch.accelerator.set_device_index(1) self.assertEqual(torch.get_default_device(), torch.tensor([]).device) torch.set_default_device(None) - torch.set_default_device("cuda:1") + torch.set_default_device(f"{device_type}:1") self.assertEqual(torch.get_default_device(), torch.tensor([]).device) torch.set_default_device(None) - torch.set_default_device("cuda:1") - with torch.device("cuda:0"): - self.assertEqual(torch.get_default_device(), torch.device("cuda", 0)) + torch.set_default_device(f"{device_type}:1") + with torch.device(f"{device_type}:0"): + self.assertEqual( + torch.get_default_device(), torch.device(f"{device_type}", 0) + ) torch.set_default_device("cpu") self.assertEqual(torch.get_default_device(), torch.device("cpu")) - with torch.device("cuda:0"): - self.assertEqual(torch.get_default_device(), torch.device("cuda", 0)) + with torch.device(f"{device_type}:0"): + self.assertEqual( + torch.get_default_device(), torch.device(f"{device_type}", 0) + ) self.assertEqual(torch.get_default_device(), torch.device("cpu")) finally: diff --git a/torch/utils/checkpoint.py b/torch/utils/checkpoint.py index b74e4d01da060..e1e38e0c36959 100644 --- a/torch/utils/checkpoint.py +++ b/torch/utils/checkpoint.py @@ -106,7 +106,7 @@ class DefaultDeviceType: to save and restore for recomputation. """ - _default_device_type = "cuda" + _default_device_type: Optional[str] = None @staticmethod def set_device_type(device: str = "cuda") -> None: @@ -126,6 +126,9 @@ def get_device_type() -> str: Returns: str: The current default device type. """ + if not DefaultDeviceType._default_device_type: + DefaultDeviceType._default_device_type = acc.type if (acc := torch.accelerator.current_accelerator(True)) else "cpu" + return DefaultDeviceType._default_device_type From e2e10753d72d4257630d632627454db21e370ff5 Mon Sep 17 00:00:00 2001 From: Minjang Kim Date: Sun, 16 Nov 2025 17:51:23 +0000 Subject: [PATCH 03/47] Allow same triton kernels in export (#167862) Summary: This diff would be a follow-up diff for D85883723. Test Plan: See D86719598. We are now able to publish the model. 
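For context, the functional change below is narrow: `PyTorchStreamWriter::writeRecord` now warns and skips when a record name referring to a Triton binary artifact (`.so`, `.cubin`, `.hsaco`) is written a second time, instead of tripping the internal assert. A minimal standalone sketch of that suffix check (the helper name here is hypothetical; the real code uses `c10::ends_with`, as the diff below shows):

```cpp
#include <initializer_list>
#include <iostream>
#include <string_view>

// Hypothetical standalone illustration of the check deciding whether a
// duplicate record write is tolerated (Triton binaries) or a hard error.
static bool is_triton_binary(std::string_view name) {
  auto has_suffix = [&](std::string_view s) {
    return name.size() >= s.size() &&
        name.compare(name.size() - s.size(), s.size(), s) == 0;
  };
  return has_suffix(".so") || has_suffix(".cubin") || has_suffix(".hsaco");
}

int main() {
  for (std::string_view name : {"add_kernel.cubin", "model.json"}) {
    std::cout << name
              << (is_triton_binary(name) ? ": warn and skip duplicate write\n"
                                         : ": duplicate write is an error\n");
  }
}
```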
Unit test: ``` buck run fbcode//mode/opt -c remoteexecution.local=enabled fbcode//sigmoid/inference/test:test_passes -m ovr_config//triton:experimental -- -r test_triton_hop_cpu ``` Differential Revision: D87091238 Pull Request resolved: https://github.com/pytorch/pytorch/pull/167862 Approved by: https://github.com/XueningXu --- caffe2/serialize/inline_container.cc | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/caffe2/serialize/inline_container.cc b/caffe2/serialize/inline_container.cc index d6c2bfd39c43a..0193a6bc180f1 100644 --- a/caffe2/serialize/inline_container.cc +++ b/caffe2/serialize/inline_container.cc @@ -773,8 +773,20 @@ void PyTorchStreamWriter::writeRecord( bool compress) { AT_ASSERT(!finalized_); AT_ASSERT(!archive_name_plus_slash_.empty()); - TORCH_INTERNAL_ASSERT( - files_written_.count(name) == 0, "Tried to serialize file twice: ", name); + if (files_written_.count(name) > 0) { + // Allow multiple writes for triton binaries + bool is_triton_extension = + c10::ends_with(name, ".so") || + c10::ends_with(name, ".cubin") || + c10::ends_with(name, ".hsaco"); + + if (is_triton_extension) { + LOG(WARNING) << "File '" << name << "' is being serialized multiple times"; + return; + } + + TORCH_INTERNAL_ASSERT(false, "Tried to serialize file twice: ", name); + } if (name == kSerializationIdRecordName && serialization_id_.empty()) { // In case of copying records from another file, skip writing a different // serialization_id than the one computed in this writer. From 363385ad3e13e6bd8e0288c3dc4ea5ec69e6e7f5 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Sat, 15 Nov 2025 22:48:24 -0500 Subject: [PATCH 04/47] s/Stragety/Strategy/ (#167916) Signed-off-by: Edward Yang Pull Request resolved: https://github.com/pytorch/pytorch/pull/167916 Approved by: https://github.com/Skylion007 --- torch/distributed/tensor/_op_schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/distributed/tensor/_op_schema.py b/torch/distributed/tensor/_op_schema.py index 6fc3cc1d4e670..95e9509cdbcd6 100644 --- a/torch/distributed/tensor/_op_schema.py +++ b/torch/distributed/tensor/_op_schema.py @@ -205,7 +205,7 @@ def __init__(self, strategies: list[OpSpec]) -> None: def __str__(self) -> str: strategy_list_str = ", ".join([str(strategy) for strategy in self.strategies]) mesh_shape = self.mesh_shape - return f"OpStragety[{strategy_list_str}] @ mesh: {mesh_shape}" + return f"OpStrategy[{strategy_list_str}] @ mesh: {mesh_shape}" def max_num_shards(self) -> int: """ From 43223547705be04e5e4478463cc6c70f5d4f118d Mon Sep 17 00:00:00 2001 From: "Sun, Jiayi" Date: Fri, 14 Nov 2025 09:58:56 +0000 Subject: [PATCH 05/47] [Inductor] optimize scalar welford_reduce (#162709) **Summary:** Optimize scalar welford_reduce implementation, combining Welford algorithm with cascade sum to improve numerical stability. Specifically: 1. Use Welford algorithm to compute mean and variance. 2. Use cascade summation when computing sum over input for both mean and variance. 
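Before the example, here is a minimal self-contained sketch of the two ingredients being combined (illustrative only, not the actual `WelfordHelper` in `cpp_prefix.h`, which additionally caches weight reciprocals and keeps a stack of per-chunk accumulators): Welford's single-sample update, and the pairwise combine that cascade summation applies to fixed-size chunks instead of folding every sample into one ever-larger accumulator.

```cpp
#include <initializer_list>
#include <iostream>

// Illustrative accumulator: running mean / M2 (sum of squared deviations).
struct WelfordAcc {
  double mean = 0.0, m2 = 0.0, weight = 0.0;
};

// Welford single-sample update: numerically stable running mean/variance.
WelfordAcc update(WelfordAcc a, double x) {
  a.weight += 1.0;
  double delta = x - a.mean;
  a.mean += delta / a.weight;
  a.m2 += delta * (x - a.mean);
  return a;
}

// Pairwise (Chan et al.) combine of two partial results; cascade summation
// merges per-chunk states with this rather than accumulating elementwise.
WelfordAcc combine(const WelfordAcc& a, const WelfordAcc& b) {
  if (a.weight == 0.0) return b;
  if (b.weight == 0.0) return a;
  double w = a.weight + b.weight;
  double delta = b.mean - a.mean;
  return {a.mean + delta * b.weight / w,
          a.m2 + b.m2 + delta * delta * a.weight * b.weight / w,
          w};
}

int main() {
  WelfordAcc chunk1, chunk2;
  for (double x : {1.0, 2.0, 3.0, 4.0}) chunk1 = update(chunk1, x);
  for (double x : {5.0, 6.0, 7.0, 8.0}) chunk2 = update(chunk2, x);
  WelfordAcc total = combine(chunk1, chunk2);
  std::cout << "mean=" << total.mean
            << " var=" << total.m2 / total.weight << '\n';  // 4.5, 5.25
}
```

This chunk-and-combine structure is what the scalar path adopts here, mirroring what the vectorized path already did.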
**Example:** Take https://github.com/pytorch/pytorch/issues/141541 as an example: ``` import torch import torch.nn as nn torch.manual_seed(0) class Model(nn.Module): def __init__(self): super().__init__() self.gn = nn.GroupNorm(num_groups=32, num_channels=32) def forward(self, x): return self.gn(x) model = Model().eval() x = torch.randn(1, 32, 128, 128, 128) with torch.no_grad(): output = model(x) with torch._inductor.config.patch({"cpp.simdlen": 0}): c_model = torch.compile(model) c_output = c_model(x) print(torch.max(torch.abs(output - c_output))) print(torch.allclose(output, c_output, 1.3e-6, 1e-5)) ``` **logs** - before ``` tensor(0.0005) False ``` - After ``` tensor(1.4305e-06) True ``` **Generated code:** - before ``` cpp_fused_native_group_norm_0 = async_compile.cpp_pybinding(['float*', 'float*', 'const float*', 'const float*', 'const float*', 'float*'], ''' #include extern "C" void kernel(float* in_out_ptr0, float* in_out_ptr1, const float* in_ptr0, const float* in_ptr1, const float* in_ptr2, float* out_ptr2) { auto out_ptr1 = in_out_ptr0; auto out_ptr0 = in_out_ptr1; { #pragma GCC ivdep for(int64_t x0=static_cast(0L); x0(32L); x0+=static_cast(1L)) { { Welford tmp_acc0 = Welford(); Welford tmp_acc0_arr[4]; for (int i = 0; i < 4; i++) { tmp_acc0_arr[i] = Welford(); } #pragma omp parallel num_threads(4) { int tid = omp_get_thread_num(); Welford tmp_acc0_local = Welford(); #pragma omp for for(int64_t x1=static_cast(0L); x1(2097152L); x1+=static_cast(1L)) { { { auto tmp0 = in_ptr0[static_cast(x1 + 2097152L*x0)]; tmp_acc0_local = welford_combine(tmp_acc0_local, tmp0); } } } tmp_acc0_arr[tid] = tmp_acc0_local; } for (int tid = 0; tid < 4; tid++) { tmp_acc0 = welford_combine(tmp_acc0, tmp_acc0_arr[tid]); } in_out_ptr1[static_cast(x0)] = tmp_acc0.mean; in_out_ptr0[static_cast(x0)] = tmp_acc0.m2; } } } { #pragma GCC ivdep for(int64_t x0=static_cast(0L); x0(32L); x0+=static_cast(1L)) { { { auto tmp0 = out_ptr1[static_cast(x0)]; auto tmp6 = in_ptr1[static_cast(x0)]; auto tmp8 = out_ptr0[static_cast(x0)]; auto tmp11 = in_ptr2[static_cast(x0)]; auto tmp1 = static_cast(2097152.0); auto tmp2 = tmp0 / tmp1; auto tmp3 = static_cast(1e-05); auto tmp4 = float(tmp2 + tmp3); auto tmp5 = 1 / std::sqrt(tmp4); auto tmp7 = float(tmp5 * tmp6); auto tmp9 = decltype(tmp8)(-tmp8); auto tmp10 = float(tmp9 * tmp7); auto tmp12 = float(tmp10 + tmp11); in_out_ptr0[static_cast(x0)] = tmp7; in_out_ptr1[static_cast(x0)] = tmp12; } } } } #pragma omp parallel num_threads(4) { int tid = omp_get_thread_num(); { #pragma omp for for(int64_t x0=static_cast(0L); x0(32L); x0+=static_cast(1L)) { #pragma GCC ivdep for(int64_t x1=static_cast(0L); x1(2097152L); x1+=static_cast(1L)) { { { auto tmp0 = in_ptr0[static_cast(x1 + 2097152L*x0)]; auto tmp1 = in_out_ptr0[static_cast(x0)]; auto tmp3 = in_out_ptr1[static_cast(x0)]; auto tmp2 = float(tmp0 * tmp1); auto tmp4 = float(tmp2 + tmp3); out_ptr2[static_cast(x1 + 2097152L*x0)] = tmp4; } } } } } } } ''') async_compile.wait(globals()) del async_compile class Runner: def __init__(self, partitions): self.partitions = partitions def recursively_apply_fns(self, fns): new_callables = [] for fn, c in zip(fns, self.partitions): new_callables.append(fn(c)) self.partitions = new_callables def call(self, args): arg0_1, arg1_1, arg2_1 = args args.clear() assert_size_stride(arg0_1, (32, ), (1, )) assert_size_stride(arg1_1, (32, ), (1, )) assert_size_stride(arg2_1, (1, 32, 128, 128, 128), (67108864, 2097152, 16384, 128, 1)) buf0 = empty_strided_cpu((1, 32, 1, 1), (32, 1, 32, 32), torch.float32) buf1 
= empty_strided_cpu((1, 32, 1, 1), (32, 1, 32, 32), torch.float32) buf3 = reinterpret_tensor(buf1, (1, 32, 1, 1), (32, 1, 1, 1), 0); del buf1 # reuse buf4 = reinterpret_tensor(buf0, (1, 32, 1, 1), (32, 1, 1, 1), 0); del buf0 # reuse buf5 = empty_strided_cpu((1, 32, 128, 128, 128), (67108864, 2097152, 16384, 128, 1), torch.float32) # [Provenance debug handles] cpp_fused_native_group_norm_0:1 cpp_fused_native_group_norm_0(buf3, buf4, arg2_1, arg0_1, arg1_1, buf5) del arg0_1 del arg1_1 del arg2_1 return (buf5, ) ``` - After ``` cpp_fused_native_group_norm_0 = async_compile.cpp_pybinding(['float*', 'float*', 'const float*', 'const float*', 'const float*', 'float*'], ''' #include extern "C" void kernel(float* in_out_ptr0, float* in_out_ptr1, const float* in_ptr0, const float* in_ptr1, const float* in_ptr2, float* out_ptr2) { auto out_ptr1 = in_out_ptr0; auto out_ptr0 = in_out_ptr1; { #pragma GCC ivdep for(int64_t x0=static_cast(0L); x0(32L); x0+=static_cast(1L)) { { Welford tmp_acc0 = Welford(); Welford tmp_acc0_arr[4]; for (int i = 0; i < 4; i++) { tmp_acc0_arr[i] = Welford(); } #pragma omp parallel num_threads(4) { int tid = omp_get_thread_num(); WelfordHelper scalar_welford_helper0(static_cast(524288L)); Welford tmp_acc0_local = Welford(); #pragma omp for for(int64_t x1=static_cast(0L); x1(2097152L); x1+=static_cast(1L)) { { { auto tmp0 = in_ptr0[static_cast(x1 + 2097152L*x0)]; tmp_acc0_local = welford_combine(tmp_acc0_local, tmp0, &scalar_welford_helper0); } } } tmp_acc0_local = welford_combine(tmp_acc0_local, &scalar_welford_helper0); tmp_acc0_arr[tid] = tmp_acc0_local; } for (int tid = 0; tid < 4; tid++) { tmp_acc0 = welford_combine(tmp_acc0, tmp_acc0_arr[tid]); } in_out_ptr1[static_cast(x0)] = tmp_acc0.mean; in_out_ptr0[static_cast(x0)] = tmp_acc0.m2; } } } { #pragma GCC ivdep for(int64_t x0=static_cast(0L); x0(32L); x0+=static_cast(1L)) { { { auto tmp0 = out_ptr1[static_cast(x0)]; auto tmp6 = in_ptr1[static_cast(x0)]; auto tmp8 = out_ptr0[static_cast(x0)]; auto tmp11 = in_ptr2[static_cast(x0)]; auto tmp1 = static_cast(2097152.0); auto tmp2 = tmp0 / tmp1; auto tmp3 = static_cast(1e-05); auto tmp4 = float(tmp2 + tmp3); auto tmp5 = 1 / std::sqrt(tmp4); auto tmp7 = float(tmp5 * tmp6); auto tmp9 = decltype(tmp8)(-tmp8); auto tmp10 = float(tmp9 * tmp7); auto tmp12 = float(tmp10 + tmp11); in_out_ptr0[static_cast(x0)] = tmp7; in_out_ptr1[static_cast(x0)] = tmp12; } } } } #pragma omp parallel num_threads(4) { int tid = omp_get_thread_num(); { #pragma omp for for(int64_t x0=static_cast(0L); x0(32L); x0+=static_cast(1L)) { #pragma GCC ivdep for(int64_t x1=static_cast(0L); x1(2097152L); x1+=static_cast(1L)) { { { auto tmp0 = in_ptr0[static_cast(x1 + 2097152L*x0)]; auto tmp1 = in_out_ptr0[static_cast(x0)]; auto tmp3 = in_out_ptr1[static_cast(x0)]; auto tmp2 = float(tmp0 * tmp1); auto tmp4 = float(tmp2 + tmp3); out_ptr2[static_cast(x1 + 2097152L*x0)] = tmp4; } } } } } } } ''') async_compile.wait(globals()) del async_compile class Runner: def __init__(self, partitions): self.partitions = partitions def recursively_apply_fns(self, fns): new_callables = [] for fn, c in zip(fns, self.partitions): new_callables.append(fn(c)) self.partitions = new_callables def call(self, args): arg0_1, arg1_1, arg2_1 = args args.clear() assert_size_stride(arg0_1, (32, ), (1, )) assert_size_stride(arg1_1, (32, ), (1, )) assert_size_stride(arg2_1, (1, 32, 128, 128, 128), (67108864, 2097152, 16384, 128, 1)) buf0 = empty_strided_cpu((1, 32, 1, 1), (32, 1, 32, 32), torch.float32) buf1 = empty_strided_cpu((1, 32, 1, 1), (32, 1, 
32, 32), torch.float32) buf3 = reinterpret_tensor(buf1, (1, 32, 1, 1), (32, 1, 1, 1), 0); del buf1 # reuse buf4 = reinterpret_tensor(buf0, (1, 32, 1, 1), (32, 1, 1, 1), 0); del buf0 # reuse buf5 = empty_strided_cpu((1, 32, 128, 128, 128), (67108864, 2097152, 16384, 128, 1), torch.float32) # [Provenance debug handles] cpp_fused_native_group_norm_0:1 cpp_fused_native_group_norm_0(buf3, buf4, arg2_1, arg0_1, arg1_1, buf5) del arg0_1 del arg1_1 del arg2_1 return (buf5, ) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/162709 Approved by: https://github.com/CaoE, https://github.com/jansel --- test/inductor/test_cpu_repro.py | 21 +++++----- torch/_inductor/codegen/cpp.py | 44 +++++++++++--------- torch/csrc/inductor/cpp_prefix.h | 70 ++++++++++++++++++-------------- 3 files changed, 75 insertions(+), 60 deletions(-) diff --git a/test/inductor/test_cpu_repro.py b/test/inductor/test_cpu_repro.py index d01d57f06a762..ba9dc93c651cf 100644 --- a/test/inductor/test_cpu_repro.py +++ b/test/inductor/test_cpu_repro.py @@ -4449,16 +4449,17 @@ def __init__(self): def forward(self, x): return self.gn(x) - for dynamic in [True, False]: - torch._dynamo.reset() - metrics.reset() - mod = M().eval() - x = torch.randn(1, 32, 128, 128, 128) - with torch.no_grad(): - expected = mod(x) - compiled_m = torch.compile(mod, dynamic=dynamic) - actual = compiled_m(x) - self.assertEqual(expected, actual) + for simdlen, dynamic in itertools.product([None, 0], [True, False]): + with config.patch({"cpp.simdlen": simdlen}): + torch._dynamo.reset() + metrics.reset() + mod = M().eval() + x = torch.randn(1, 32, 128, 128, 128) + with torch.no_grad(): + expected = mod(x) + compiled_m = torch.compile(mod, dynamic=dynamic) + actual = compiled_m(x) + self.assertEqual(expected, actual) @torch._dynamo.config.patch( capture_scalar_outputs=True, capture_dynamic_output_shape_ops=True diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py index e3e5913be7d76..88f203421cc1c 100644 --- a/torch/_inductor/codegen/cpp.py +++ b/torch/_inductor/codegen/cpp.py @@ -239,7 +239,10 @@ def reduction_combine( if reduction_type in ("min", "max"): return f"{reduction_type}_propagate_nan({var}, {next_value})" if reduction_type == "welford_reduce": - return f"welford_combine({var}, {next_value})" + if helper_val: + return f"welford_combine({var}, {next_value}, &{helper_val})" + else: + return f"welford_combine({var}, {next_value})" if reduction_type == "welford_combine": if isinstance(next_value, tuple): mean, m2, weight = next_value @@ -2194,10 +2197,8 @@ def need_use_acc_helper(self, reduction_type, dtype, use_scalar): # sum and welford # Note: using helper has non-negligible impact on performance - # keep the original behavior for welford_reduce - # acc helper is not used for scalar welford_reduce if reduction_type == "welford_reduce": - return not use_scalar + return True # TODO add supports for more data types when needed if reduction_type == "sum" and dtype == torch.float: @@ -2323,9 +2324,15 @@ def reduction(self, dtype, src_dtype, reduction_type, value): reduction_size = functools.reduce( operator.mul, self.ranges[self.reduction_depth :] ) - helper_val = self.cascade_helper_cse.generate( - self.compute, f"reduction {reduction_key}", write=False - ) + # use welford_helper/cascade_helper for vec kernel + if reduction_type == "welford_reduce": + helper_val = self.welford_helper_cse.generate( + self.compute, f"reduction {reduction_key}", write=False + ) + else: + helper_val = self.cascade_helper_cse.generate( + 
self.compute, f"reduction {reduction_key}", write=False + ) # rename the helper variable to distinguish it from vectorized version scalar_helper_val = f"scalar_{helper_val}" self._use_acc_helper( @@ -3092,19 +3099,16 @@ def reduction(self, dtype, src_dtype, reduction_type, value): if self.ranges[self.tiling_idx] % self.tiling_factor else sympy.Integer(0) ) - # scalar helper for scalar sum is also needed when vec kernel is included - # Note: is it different from welford reduction as welford reduction of scalar version - # does not need helper, and the helper needs the information of reduction size to initialize - if reduction_type == "sum": - scalar_helper_val = f"scalar_{helper_val}" - self._use_acc_helper( - reduction_type, - acc, - scalar_helper_val, - reduction_size, - dtype, - use_scalar=True, - ) + # scalar helper for scalar welford_reduce/sum is also needed when vec kernel is included + scalar_helper_val = f"scalar_{helper_val}" + self._use_acc_helper( + reduction_type, + acc, + scalar_helper_val, + reduction_size, + dtype, + use_scalar=True, + ) self._use_acc_helper( reduction_type, acc, helper_val, helper_vec_range, dtype ) diff --git a/torch/csrc/inductor/cpp_prefix.h b/torch/csrc/inductor/cpp_prefix.h index decdef52a1daa..7dc161d13fd52 100644 --- a/torch/csrc/inductor/cpp_prefix.h +++ b/torch/csrc/inductor/cpp_prefix.h @@ -74,6 +74,22 @@ template struct IsVecMaskType> : std::true_type {}; #endif +template +struct GetScalarType { + using type = T; +}; + +#if INDUCTOR_USE_VECTOR_TYPES() +template +struct GetScalarType> { + using type = T; +}; +template +struct GetScalarType> { + using type = T; +}; +#endif + template struct CascadeSumHelper { // A data struct to help cascade summation: @@ -139,7 +155,7 @@ struct WelfordHelper { // 1. Save the reciprocal of weights to avoid redundant divisions. // 2. Save the welford stack, which is used to combine welford reduction // with cascade summation to improve numerical stability. - static std::vector weight_recps; + static std::vector::type> weight_recps; std::vector> welford_stk{}; uint64_t depth{0}; // depth of welford_stk. uint64_t num_chunks{0}; // number of chunks stored in welford_stk. @@ -154,9 +170,9 @@ struct WelfordHelper { }; template -std::vector WelfordHelper::weight_recps = - []() { - using scalar_t = typename T::value_type; +std::vector::type> + WelfordHelper::weight_recps = []() { + using scalar_t = typename GetScalarType::type; std::vector temp(kChunkSize); for (const auto i : c10::irange(kChunkSize)) { temp[i] = scalar_t(static_cast(1) / static_cast(i + 1)); @@ -202,21 +218,19 @@ Welford welford_combine( // stability. 
// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance // https://en.wikipedia.org/wiki/Pairwise_summation - if constexpr (IsVecType::value) { - if (w != nullptr && w->depth > 0 && acc.index == kChunkSize) { - w->welford_stk[0] = welford_combine(w->welford_stk[0], acc); - w->num_chunks += 1; - acc.mean = T(0); - acc.m2 = T(0); - acc.weight = T(0); - acc.index = 0; - uint64_t mask = w->num_chunks; - for (uint64_t j = 1; j < w->depth && (mask & 1) == 0; ++j) { - w->welford_stk[j] = - welford_combine(w->welford_stk[j], w->welford_stk[j - 1]); - w->welford_stk[j - 1] = Welford(); - mask >>= 1; - } + if (w != nullptr && w->depth > 0 && acc.index == kChunkSize) { + w->welford_stk[0] = welford_combine(w->welford_stk[0], acc); + w->num_chunks += 1; + acc.mean = T(0); + acc.m2 = T(0); + acc.weight = T(0); + acc.index = 0; + uint64_t mask = w->num_chunks; + for (uint64_t j = 1; j < w->depth && (mask & 1) == 0; ++j) { + w->welford_stk[j] = + welford_combine(w->welford_stk[j], w->welford_stk[j - 1]); + w->welford_stk[j - 1] = Welford(); + mask >>= 1; } } // Add a single data point @@ -224,22 +238,18 @@ Welford welford_combine( auto new_weight = acc.weight + T(1); auto delta = data - acc.mean; T new_mean; - if constexpr (!IsVecType::value) { - new_mean = acc.mean + delta / new_weight; - } else { - // use new_index to fecth 1 / new_weight to avoid divisions - new_mean = acc.mean + - ((w == nullptr || acc.index >= w->weight_recps.size()) - ? delta / new_weight - : delta * T(w->weight_recps[acc.index])); - } + // use new_index to fecth 1 / new_weight to avoid divisions + new_mean = acc.mean + + ((w == nullptr || acc.index >= w->weight_recps.size()) + ? delta / new_weight + : delta * T(w->weight_recps[acc.index])); auto new_delta = data - new_mean; auto result = Welford{new_mean, acc.m2 + delta * new_delta, new_weight, new_index}; return result; } -template +template Welford welford_combine(Welford& acc, WelfordHelper* w) { for (const auto i : c10::irange(w->depth)) { acc = welford_combine(acc, w->welford_stk[i]); @@ -256,7 +266,7 @@ struct IndexValue { }; #if INDUCTOR_USE_VECTOR_TYPES() -template +template Welford welford_combine( Welford& acc, T& data, From d8ce6f8df991e083ad3b8fcb9d54550df4b5f2a1 Mon Sep 17 00:00:00 2001 From: Paul Zhang Date: Mon, 17 Nov 2025 04:31:44 +0000 Subject: [PATCH 06/47] Enable PyTorch OSS numerics changes, inductor heuristics (#167799) Test Plan: CI Differential Revision: D86211542 Pull Request resolved: https://github.com/pytorch/pytorch/pull/167799 Approved by: https://github.com/njriasan, https://github.com/eellison --- torch/_inductor/runtime/triton_heuristics.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py index 2a83f7b59117d..3cc47219258f9 100644 --- a/torch/_inductor/runtime/triton_heuristics.py +++ b/torch/_inductor/runtime/triton_heuristics.py @@ -22,7 +22,6 @@ import torch from torch._dynamo.utils import counters, set_feature_use -from torch._environment import is_fbcode from torch._inductor import metrics from torch._prims_common import compute_required_storage_length from torch.utils._debug_mode import get_active_debug_mode @@ -2470,9 +2469,8 @@ def total_numel() -> int: rnumels[prefix] *= 2 if num_warps is None: - if reduction_hint == ReductionHint.INNER and not is_fbcode(): - # r is contiguous, so ensure that each thread has 8 elements for - # vectorized loads, assuming bf16/fp16 + if reduction_hint == ReductionHint.INNER: + # 
r is contiguous, ensure at least 8 elements per thread # xblock is usually 1-2, default to giving each thread more work num_warps = r // 128 else: @@ -2942,7 +2940,7 @@ def outer_config_opt(): ) contiguous_config = make_config( - 2 if rnumel <= 2048 and not is_fbcode() else 1, # 1024 or less is persistent + 2 if rnumel <= 2048 else 1, # 1024 or less is persistent min(rnumel, MAX_R0_BLOCK), register_intensive=register_intensive, ) @@ -2955,7 +2953,7 @@ def outer_config_opt(): outer_config = make_config(64, 8, register_intensive=register_intensive) # TODO (paulzhan): Test heuristic on AMD and internal testing # for correctness - if not torch.version.hip and not is_fbcode(): + if not torch.version.hip: outer_config = outer_config_opt() configs = [] From aa504d4d2a4f8ce4fc9175bca738d1321dece53e Mon Sep 17 00:00:00 2001 From: PyTorch UpdateBot Date: Mon, 17 Nov 2025 05:21:24 +0000 Subject: [PATCH 07/47] [audio hash update] update the pinned audio hash (#167914) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned audio hash. Pull Request resolved: https://github.com/pytorch/pytorch/pull/167914 Approved by: https://github.com/pytorchbot --- .github/ci_commit_pins/audio.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ci_commit_pins/audio.txt b/.github/ci_commit_pins/audio.txt index 8462dd2aa4e55..616dfd88ce812 100644 --- a/.github/ci_commit_pins/audio.txt +++ b/.github/ci_commit_pins/audio.txt @@ -1 +1 @@ -07b6cbde121417a70e4dc871adb6d27030e0ce3f +ee1a1350eb37804b94334768f328144f058f14e9 From f2e6f94081c02704adf027fe0c81bf25726828f5 Mon Sep 17 00:00:00 2001 From: Laith Sakka Date: Fri, 14 Nov 2025 19:25:04 -0800 Subject: [PATCH 08/47] deprecate check_is_size and guard_size_oblivious (#167198) Pull Request resolved: https://github.com/pytorch/pytorch/pull/167198 Approved by: https://github.com/bobrenjc93 --- test/dynamo/test_fake_distributed.py | 12 ++--- test/dynamo/test_higher_order_ops.py | 11 ++-- test/export/test_export.py | 4 -- test/inductor/test_auto_functionalize.py | 16 +++--- test/test_dynamic_shapes.py | 64 ++++++++++++------------ test/test_opaque_obj_v2.py | 2 +- torch/__init__.py | 11 +++- torch/fx/experimental/symbolic_shapes.py | 4 ++ torch/fx/passes/runtime_assert.py | 11 ---- 9 files changed, 63 insertions(+), 72 deletions(-) diff --git a/test/dynamo/test_fake_distributed.py b/test/dynamo/test_fake_distributed.py index 41e373a50d76b..6a5a189c3bea1 100644 --- a/test/dynamo/test_fake_distributed.py +++ b/test/dynamo/test_fake_distributed.py @@ -90,12 +90,12 @@ def forward(self, primals_1: "Sym(s77)", primals_2: "Sym(s27)", floordiv: "Sym(( """\ class GraphModule(torch.nn.Module): def forward(self, primals_1: "Sym(u0)", primals_2: "Sym(u1)", primals_3: "Sym(u2)", primals_4: "f32[u0, u1, u2]"): - ge_1: "Sym(u0 >= 0)" = primals_1 >= 0 - _assert_scalar = torch.ops.aten._assert_scalar.default(ge_1, "Runtime assertion failed for expression u0 >= 0 on node 'ge'"); ge_1 = _assert_scalar = None - ge_3: "Sym(u1 >= 0)" = primals_2 >= 0 - _assert_scalar_1 = torch.ops.aten._assert_scalar.default(ge_3, "Runtime assertion failed for expression u1 >= 0 on node 'ge_1'"); ge_3 = _assert_scalar_1 = None - ge_5: "Sym(u2 >= 0)" = primals_3 >= 0 - _assert_scalar_2 = torch.ops.aten._assert_scalar.default(ge_5, "Runtime assertion failed for expression u2 >= 0 on node 'ge_2'"); ge_5 = _assert_scalar_2 = None + ge: "Sym(u0 >= 0)" = primals_1 >= 0 + _assert_scalar = 
torch.ops.aten._assert_scalar.default(ge, "Runtime assertion failed for expression u0 >= 0 on node 'ge'"); ge = _assert_scalar = None + ge_1: "Sym(u1 >= 0)" = primals_2 >= 0 + _assert_scalar_1 = torch.ops.aten._assert_scalar.default(ge_1, "Runtime assertion failed for expression u1 >= 0 on node 'ge_1'"); ge_1 = _assert_scalar_1 = None + ge_2: "Sym(u2 >= 0)" = primals_3 >= 0 + _assert_scalar_2 = torch.ops.aten._assert_scalar.default(ge_2, "Runtime assertion failed for expression u2 >= 0 on node 'ge_2'"); ge_2 = _assert_scalar_2 = None floordiv: "Sym((u0//2))" = primals_1 // 2 diff --git a/test/dynamo/test_higher_order_ops.py b/test/dynamo/test_higher_order_ops.py index 68e3a39800b6e..b34cae52d4c5f 100644 --- a/test/dynamo/test_higher_order_ops.py +++ b/test/dynamo/test_higher_order_ops.py @@ -727,7 +727,7 @@ def k(x): x = torch.randn(3) arg_count = ifdynstaticdefault(4, 5) # when compiled with dynamic, we don't have upper bound runtime assertions for u0 - expected_op_count = ifdynstaticdefault(10, 8) + expected_op_count = ifdynstaticdefault(9, 7) out_graph = self._test_wrap_simple( f, default_args_generator((x,)), @@ -747,7 +747,6 @@ def forward(self, s77: "Sym(s77)", L_x_: "f32[s77]"): c: "i64[u0, 1]" = l_x_.nonzero() sym_size_int_1: "Sym(u0)" = torch.ops.aten.sym_size.int(c, 0) - _check_is_size = torch._check_is_size(sym_size_int_1); _check_is_size = None ge: "Sym(u0 >= 0)" = sym_size_int_1 >= 0 _assert_scalar_default = torch.ops.aten._assert_scalar.default(ge, "Runtime assertion failed for expression u0 >= 0 on node 'ge'"); ge = _assert_scalar_default = None @@ -784,7 +783,6 @@ def forward(self, L_x_: "f32[3]"): c: "i64[u0, 1]" = l_x_.nonzero() sym_size_int_1: "Sym(u0)" = torch.ops.aten.sym_size.int(c, 0) - _check_is_size = torch._check_is_size(sym_size_int_1); _check_is_size = None ge: "Sym(u0 >= 0)" = sym_size_int_1 >= 0 _assert_scalar_default = torch.ops.aten._assert_scalar.default(ge, "Runtime assertion failed for expression u0 >= 0 on node 'ge'"); ge = _assert_scalar_default = None @@ -883,7 +881,7 @@ def k(x): x = torch.randn(3) arg_count = ifdynstaticdefault(4, 5) # when compiled with dynamic, we don't have upper bound runtime assertions for u0 - expected_op_count = ifdynstaticdefault(10, 8) + expected_op_count = ifdynstaticdefault(9, 7) out_graph = self._test_wrap_simple( f, default_args_generator((x,)), @@ -905,7 +903,6 @@ def forward(self, L_x_: "f32[3]"): c: "i64[u0, 1]" = l_x_.nonzero() sym_size_int: "Sym(u0)" = torch.ops.aten.sym_size.int(c, 0) - _check_is_size = torch._check_is_size(sym_size_int); _check_is_size = None ge: "Sym(u0 >= 0)" = sym_size_int >= 0 _assert_scalar_default = torch.ops.aten._assert_scalar.default(ge, "Runtime assertion failed for expression u0 >= 0 on node 'ge'"); ge = _assert_scalar_default = None @@ -956,7 +953,7 @@ def k(x): y = torch.randn(3) arg_count = ifdynstaticdefault(5, 6) # when compiled with dynamic, we don't have upper bound runtime assertions for u0 and u1 - expected_op_count = ifdynstaticdefault(17, 13) + expected_op_count = ifdynstaticdefault(15, 11) out_graph = self._test_wrap_simple( f, default_args_generator((x, y)), @@ -977,7 +974,6 @@ def forward(self, L_x_: "f32[3]", L_y_: "f32[3]"): c: "i64[u0, 1]" = l_x_.nonzero() sym_size_int_2: "Sym(u0)" = torch.ops.aten.sym_size.int(c, 0) - _check_is_size = torch._check_is_size(sym_size_int_2); _check_is_size = None ge: "Sym(u0 >= 0)" = sym_size_int_2 >= 0 _assert_scalar_default = torch.ops.aten._assert_scalar.default(ge, "Runtime assertion failed for expression u0 >= 0 on node 'ge'"); ge = 
_assert_scalar_default = None @@ -987,7 +983,6 @@ def forward(self, L_x_: "f32[3]", L_y_: "f32[3]"): d: "i64[u1, 1]" = l_y_.nonzero(); l_y_ = None sym_size_int_3: "Sym(u1)" = torch.ops.aten.sym_size.int(d, 0) - _check_is_size_1 = torch._check_is_size(sym_size_int_3); _check_is_size_1 = None ge_1: "Sym(u1 >= 0)" = sym_size_int_3 >= 0 _assert_scalar_default_2 = torch.ops.aten._assert_scalar.default(ge_1, "Runtime assertion failed for expression u1 >= 0 on node 'ge_1'"); ge_1 = _assert_scalar_default_2 = None diff --git a/test/export/test_export.py b/test/export/test_export.py index 204d458e77704..c60c8e82cc011 100755 --- a/test/export/test_export.py +++ b/test/export/test_export.py @@ -3081,15 +3081,12 @@ def forward(self, x, y): foo = torch.ops.export.foo.default(x, y); x = None sym_size_int = torch.ops.aten.sym_size.int(foo, 0) sym_size_int_1 = torch.ops.aten.sym_size.int(foo, 1) - sym_constrain_range_for_size_default = torch.ops.aten.sym_constrain_range_for_size.default(sym_size_int); sym_constrain_range_for_size_default = None ge = sym_size_int >= 0; sym_size_int = None _assert_scalar_default = torch.ops.aten._assert_scalar.default(ge, "Runtime assertion failed for expression u0 >= 0 on node 'ge'"); ge = _assert_scalar_default = None - sym_constrain_range_for_size_default_1 = torch.ops.aten.sym_constrain_range_for_size.default(sym_size_int_1); sym_constrain_range_for_size_default_1 = None ge_1 = sym_size_int_1 >= 0; sym_size_int_1 = None _assert_scalar_default_1 = torch.ops.aten._assert_scalar.default(ge_1, "Runtime assertion failed for expression u1 >= 0 on node 'ge_1'"); ge_1 = _assert_scalar_default_1 = None bar = torch.ops.export.bar.default(y); y = None sym_size_int_2 = torch.ops.aten.sym_size.int(bar, 0) - sym_constrain_range_for_size_default_2 = torch.ops.aten.sym_constrain_range_for_size.default(sym_size_int_2); sym_constrain_range_for_size_default_2 = None ge_2 = sym_size_int_2 >= 0; sym_size_int_2 = None _assert_scalar_default_2 = torch.ops.aten._assert_scalar.default(ge_2, "Runtime assertion failed for expression u2 >= 0 on node 'ge_2'"); ge_2 = _assert_scalar_default_2 = None return (foo, bar)""", @@ -17743,7 +17740,6 @@ def forward(self, x, mask): def forward(self, x, mask): masked_select = torch.ops.aten.masked_select.default(x, mask); x = mask = None sym_size_int_1 = torch.ops.aten.sym_size.int(masked_select, 0) - sym_constrain_range_for_size_default = torch.ops.aten.sym_constrain_range_for_size.default(sym_size_int_1); sym_constrain_range_for_size_default = None ge = sym_size_int_1 >= 0 _assert_scalar_default = torch.ops.aten._assert_scalar.default(ge, "Runtime assertion failed for expression u0 >= 0 on node 'ge'"); ge = _assert_scalar_default = None le = sym_size_int_1 <= 1188864 diff --git a/test/inductor/test_auto_functionalize.py b/test/inductor/test_auto_functionalize.py index 6025c90cdb4a2..474d3986eb7ad 100644 --- a/test/inductor/test_auto_functionalize.py +++ b/test/inductor/test_auto_functionalize.py @@ -1492,8 +1492,8 @@ def forward(self, arg0_1: "Sym(s77)", arg1_1: "f32[s77][1]cpu"): clone: "f32[s77][1]cpu" = torch.ops.aten.clone.default(arg1_1) nonzero: "i64[u0, 1][1, u0]cpu" = torch.ops.aten.nonzero.default(clone); clone = None sym_size_int_1: "Sym(u0)" = torch.ops.aten.sym_size.int(nonzero, 0) - ge_1: "Sym(u0 >= 0)" = sym_size_int_1 >= 0; sym_size_int_1 = None - _assert_scalar = torch.ops.aten._assert_scalar.default(ge_1, "Runtime assertion failed for expression u0 >= 0 on node 'ge'"); ge_1 = _assert_scalar = None + ge: "Sym(u0 >= 0)" = sym_size_int_1 >= 0; 
sym_size_int_1 = None + _assert_scalar = torch.ops.aten._assert_scalar.default(ge, "Runtime assertion failed for expression u0 >= 0 on node 'ge'"); ge = _assert_scalar = None _to_copy: "f32[u0, 1][1, u0]cpu" = torch.ops.aten._to_copy.default(nonzero, dtype = torch.float32); nonzero = None auto_functionalized_v2 = torch.ops.higher_order.auto_functionalized_v2(torch.ops.mylib.foo.default, _x_base_index = 0, _x_alias = True, _y_base_index = 1, _y_alias = True, _all_bases = [arg1_1, _to_copy]); _to_copy = None getitem_1: "f32[s77][1]cpu" = auto_functionalized_v2[1] @@ -1513,8 +1513,8 @@ def forward(self, arg0_1: "f32[2][1]cpu"): clone: "f32[2][1]cpu" = torch.ops.aten.clone.default(arg0_1) nonzero: "i64[u0, 1][1, u0]cpu" = torch.ops.aten.nonzero.default(clone); clone = None sym_size_int: "Sym(u0)" = torch.ops.aten.sym_size.int(nonzero, 0) - ge_1: "Sym(u0 >= 0)" = sym_size_int >= 0 - _assert_scalar = torch.ops.aten._assert_scalar.default(ge_1, "Runtime assertion failed for expression u0 >= 0 on node 'ge'"); ge_1 = _assert_scalar = None + ge: "Sym(u0 >= 0)" = sym_size_int >= 0 + _assert_scalar = torch.ops.aten._assert_scalar.default(ge, "Runtime assertion failed for expression u0 >= 0 on node 'ge'"); ge = _assert_scalar = None le: "Sym(u0 <= 2)" = sym_size_int <= 2; sym_size_int = None _assert_scalar_1 = torch.ops.aten._assert_scalar.default(le, "Runtime assertion failed for expression u0 <= 2 on node 'le'"); le = _assert_scalar_1 = None _to_copy: "f32[u0, 1][1, u0]cpu" = torch.ops.aten._to_copy.default(nonzero, dtype = torch.float32); nonzero = None @@ -1538,8 +1538,8 @@ def forward(self, arg0_1: "f32[2][1]cpu"): def forward(self, arg0_1: "Sym(s77)", arg1_1: "f32[s77][1]cpu"): nonzero: "i64[u0, 1][1, u0]cpu" = torch.ops.aten.nonzero.default(arg1_1) sym_size_int_1: "Sym(u0)" = torch.ops.aten.sym_size.int(nonzero, 0) - ge_1: "Sym(u0 >= 0)" = sym_size_int_1 >= 0; sym_size_int_1 = None - _assert_scalar = torch.ops.aten._assert_scalar.default(ge_1, "Runtime assertion failed for expression u0 >= 0 on node 'ge'"); ge_1 = _assert_scalar = None + ge: "Sym(u0 >= 0)" = sym_size_int_1 >= 0; sym_size_int_1 = None + _assert_scalar = torch.ops.aten._assert_scalar.default(ge, "Runtime assertion failed for expression u0 >= 0 on node 'ge'"); ge = _assert_scalar = None convert_element_type: "f32[u0, 1][1, u0]cpu" = torch.ops.prims.convert_element_type.default(nonzero, torch.float32); nonzero = None alias_default: "f32[s77][1]cpu" = torch.ops.aten.alias.default(arg1_1) alias_default_1: "f32[u0, 1][1, u0]cpu" = torch.ops.aten.alias.default(convert_element_type) @@ -1557,8 +1557,8 @@ def forward(self, arg0_1: "Sym(s77)", arg1_1: "f32[s77][1]cpu"): def forward(self, arg0_1: "f32[2][1]cpu"): nonzero: "i64[u0, 1][1, u0]cpu" = torch.ops.aten.nonzero.default(arg0_1) sym_size_int: "Sym(u0)" = torch.ops.aten.sym_size.int(nonzero, 0) - ge_1: "Sym(u0 >= 0)" = sym_size_int >= 0 - _assert_scalar = torch.ops.aten._assert_scalar.default(ge_1, "Runtime assertion failed for expression u0 >= 0 on node 'ge'"); ge_1 = _assert_scalar = None + ge: "Sym(u0 >= 0)" = sym_size_int >= 0 + _assert_scalar = torch.ops.aten._assert_scalar.default(ge, "Runtime assertion failed for expression u0 >= 0 on node 'ge'"); ge = _assert_scalar = None le: "Sym(u0 <= 2)" = sym_size_int <= 2; sym_size_int = None _assert_scalar_1 = torch.ops.aten._assert_scalar.default(le, "Runtime assertion failed for expression u0 <= 2 on node 'le'"); le = _assert_scalar_1 = None convert_element_type: "f32[u0, 1][1, u0]cpu" = 
torch.ops.prims.convert_element_type.default(nonzero, torch.float32); nonzero = None diff --git a/test/test_dynamic_shapes.py b/test/test_dynamic_shapes.py index 41ce5af6a28be..b6d825b1664f5 100644 --- a/test/test_dynamic_shapes.py +++ b/test/test_dynamic_shapes.py @@ -3532,11 +3532,11 @@ def make_non_contiguous_tensor_and_test(cnt): aot_graphs, """\ def forward(self, arg0_1: "i64[1][1]cpu", arg1_1: "Sym(u1)", arg2_1: "Sym(s7)", arg3_1: "i64[u1][s7]cpu"): - ge_1: "Sym(u1 >= 0)" = arg1_1 >= 0 - _assert_scalar = torch.ops.aten._assert_scalar.default(ge_1, "Runtime assertion failed for expression u1 >= 0 on node 'ge'"); ge_1 = _assert_scalar = None + ge: "Sym(u1 >= 0)" = arg1_1 >= 0 + _assert_scalar = torch.ops.aten._assert_scalar.default(ge, "Runtime assertion failed for expression u1 >= 0 on node 'ge'"); ge = _assert_scalar = None _local_scalar_dense: "Sym(u0)" = torch.ops.aten._local_scalar_dense.default(arg0_1); arg0_1 = None - ge_2: "Sym(u0 >= 0)" = _local_scalar_dense >= 0 - _assert_scalar_1 = torch.ops.aten._assert_scalar.default(ge_2, "Runtime assertion failed for expression u0 >= 0 on node 'ge_1'"); ge_2 = _assert_scalar_1 = None + ge_1: "Sym(u0 >= 0)" = _local_scalar_dense >= 0 + _assert_scalar_1 = torch.ops.aten._assert_scalar.default(ge_1, "Runtime assertion failed for expression u0 >= 0 on node 'ge_1'"); ge_1 = _assert_scalar_1 = None pow_1: "Sym(u0**2)" = _local_scalar_dense ** 2 eq: "Sym(Eq(u1, u0**2))" = arg1_1 == pow_1; arg1_1 = pow_1 = None _assert_scalar_2 = torch.ops.aten._assert_scalar.default(eq, "Runtime assertion failed for expression Eq(u1, u0**2) on node 'eq'"); eq = _assert_scalar_2 = None @@ -3573,11 +3573,11 @@ def forward(self, arg0_1: "i64[1][1]cpu", arg1_1: "Sym(u1)", arg2_1: "Sym(s7)", aot_graphs, """\ def forward(self, arg0_1: "i64[1][1]cpu", arg1_1: "Sym(u1)", arg2_1: "i64[u1][1]cpu"): - ge_1: "Sym(u1 >= 0)" = arg1_1 >= 0 - _assert_scalar = torch.ops.aten._assert_scalar.default(ge_1, "Runtime assertion failed for expression u1 >= 0 on node 'ge'"); ge_1 = _assert_scalar = None + ge: "Sym(u1 >= 0)" = arg1_1 >= 0 + _assert_scalar = torch.ops.aten._assert_scalar.default(ge, "Runtime assertion failed for expression u1 >= 0 on node 'ge'"); ge = _assert_scalar = None _local_scalar_dense: "Sym(u0)" = torch.ops.aten._local_scalar_dense.default(arg0_1); arg0_1 = None - ge_2: "Sym(u0 >= 0)" = _local_scalar_dense >= 0 - _assert_scalar_1 = torch.ops.aten._assert_scalar.default(ge_2, "Runtime assertion failed for expression u0 >= 0 on node 'ge_1'"); ge_2 = _assert_scalar_1 = None + ge_1: "Sym(u0 >= 0)" = _local_scalar_dense >= 0 + _assert_scalar_1 = torch.ops.aten._assert_scalar.default(ge_1, "Runtime assertion failed for expression u0 >= 0 on node 'ge_1'"); ge_1 = _assert_scalar_1 = None pow_1: "Sym(u0**2)" = _local_scalar_dense ** 2 eq: "Sym(Eq(u1, u0**2))" = arg1_1 == pow_1; arg1_1 = pow_1 = None _assert_scalar_2 = torch.ops.aten._assert_scalar.default(eq, "Runtime assertion failed for expression Eq(u1, u0**2) on node 'eq'"); eq = _assert_scalar_2 = None @@ -3632,21 +3632,21 @@ def func(x, y): aot_graphs, """\ def forward(self, arg0_1: "i64[2][1]cpu", arg1_1: "Sym(u2)", arg2_1: "Sym(u3)", arg3_1: "f32[u2, u3][1, u2]cpu"): - ge_1: "Sym(u2 >= 0)" = arg1_1 >= 0 - _assert_scalar = torch.ops.aten._assert_scalar.default(ge_1, "Runtime assertion failed for expression u2 >= 0 on node 'ge'"); ge_1 = _assert_scalar = None - ge_3: "Sym(u3 >= 0)" = arg2_1 >= 0 - _assert_scalar_1 = torch.ops.aten._assert_scalar.default(ge_3, "Runtime assertion failed for expression u3 >= 0 on node 
'ge_1'"); ge_3 = _assert_scalar_1 = None + ge: "Sym(u2 >= 0)" = arg1_1 >= 0 + _assert_scalar = torch.ops.aten._assert_scalar.default(ge, "Runtime assertion failed for expression u2 >= 0 on node 'ge'"); ge = _assert_scalar = None + ge_1: "Sym(u3 >= 0)" = arg2_1 >= 0 + _assert_scalar_1 = torch.ops.aten._assert_scalar.default(ge_1, "Runtime assertion failed for expression u3 >= 0 on node 'ge_1'"); ge_1 = _assert_scalar_1 = None select: "i64[][]cpu" = torch.ops.aten.select.int(arg0_1, 0, 0) _local_scalar_dense: "Sym(u0)" = torch.ops.aten._local_scalar_dense.default(select); select = None - ge_4: "Sym(u0 >= 0)" = _local_scalar_dense >= 0 - _assert_scalar_2 = torch.ops.aten._assert_scalar.default(ge_4, "Runtime assertion failed for expression u0 >= 0 on node 'ge_2'"); ge_4 = _assert_scalar_2 = None + ge_2: "Sym(u0 >= 0)" = _local_scalar_dense >= 0 + _assert_scalar_2 = torch.ops.aten._assert_scalar.default(ge_2, "Runtime assertion failed for expression u0 >= 0 on node 'ge_2'"); ge_2 = _assert_scalar_2 = None sym_sum: "Sym(u0 + 1)" = torch.sym_sum((1, _local_scalar_dense)) gt: "Sym(u0 + 1 > 0)" = sym_sum > 0; sym_sum = None _assert_scalar_3 = torch.ops.aten._assert_scalar.default(gt, "Runtime assertion failed for expression 0 < u0 + 1 on node 'gt'"); gt = _assert_scalar_3 = None select_1: "i64[][]cpu" = torch.ops.aten.select.int(arg0_1, 0, 1); arg0_1 = None _local_scalar_dense_1: "Sym(u1)" = torch.ops.aten._local_scalar_dense.default(select_1); select_1 = None - ge_5: "Sym(u1 >= 0)" = _local_scalar_dense_1 >= 0 - _assert_scalar_4 = torch.ops.aten._assert_scalar.default(ge_5, "Runtime assertion failed for expression u1 >= 0 on node 'ge_3'"); ge_5 = _assert_scalar_4 = None + ge_3: "Sym(u1 >= 0)" = _local_scalar_dense_1 >= 0 + _assert_scalar_4 = torch.ops.aten._assert_scalar.default(ge_3, "Runtime assertion failed for expression u1 >= 0 on node 'ge_3'"); ge_3 = _assert_scalar_4 = None sym_sum_1: "Sym(u1 + 1)" = torch.sym_sum((1, _local_scalar_dense_1)) gt_1: "Sym(u1 + 1 > 0)" = sym_sum_1 > 0; sym_sum_1 = None _assert_scalar_5 = torch.ops.aten._assert_scalar.default(gt_1, "Runtime assertion failed for expression 0 < u1 + 1 on node 'gt_1'"); gt_1 = _assert_scalar_5 = None @@ -4068,10 +4068,10 @@ def func(x): self.assertExpectedInline( output, """\ - ge_1: "Sym(u0 >= 0)" = arg0_1 >= 0; arg0_1 = None - _assert_scalar = torch.ops.aten._assert_scalar.default(ge_1, "Runtime assertion failed for expression u0 >= 0 on node 'ge'"); ge_1 = _assert_scalar = None - ge_3: "Sym(u1 >= 0)" = arg1_1 >= 0; arg1_1 = None - _assert_scalar_1 = torch.ops.aten._assert_scalar.default(ge_3, "Runtime assertion failed for expression u1 >= 0 on node 'ge_1'"); ge_3 = _assert_scalar_1 = None + ge: "Sym(u0 >= 0)" = arg0_1 >= 0; arg0_1 = None + _assert_scalar = torch.ops.aten._assert_scalar.default(ge, "Runtime assertion failed for expression u0 >= 0 on node 'ge'"); ge = _assert_scalar = None + ge_1: "Sym(u1 >= 0)" = arg1_1 >= 0; arg1_1 = None + _assert_scalar_1 = torch.ops.aten._assert_scalar.default(ge_1, "Runtime assertion failed for expression u1 >= 0 on node 'ge_1'"); ge_1 = _assert_scalar_1 = None clone: "f32[u0, u1][Max(1, u1), 1]cpu" = torch.ops.aten.clone.default(arg2_1, memory_format = torch.contiguous_format); arg2_1 = None add_3: "f32[u0, u1][Max(1, u1), 1]cpu" = torch.ops.aten.add.Tensor(clone, 1); clone = None mul_6: "f32[u0, u1][Max(1, u1), 1]cpu" = torch.ops.aten.mul.Tensor(add_3, 100); add_3 = None @@ -4097,10 +4097,10 @@ def func(x): self.assertExpectedInline( output, """\ - ge_1: "Sym(u0 >= 0)" = arg0_1 >= 0; 
arg0_1 = None - _assert_scalar = torch.ops.aten._assert_scalar.default(ge_1, "Runtime assertion failed for expression u0 >= 0 on node 'ge'"); ge_1 = _assert_scalar = None - ge_3: "Sym(u1 >= 0)" = arg1_1 >= 0; arg1_1 = None - _assert_scalar_1 = torch.ops.aten._assert_scalar.default(ge_3, "Runtime assertion failed for expression u1 >= 0 on node 'ge_1'"); ge_3 = _assert_scalar_1 = None + ge: "Sym(u0 >= 0)" = arg0_1 >= 0; arg0_1 = None + _assert_scalar = torch.ops.aten._assert_scalar.default(ge, "Runtime assertion failed for expression u0 >= 0 on node 'ge'"); ge = _assert_scalar = None + ge_1: "Sym(u1 >= 0)" = arg1_1 >= 0; arg1_1 = None + _assert_scalar_1 = torch.ops.aten._assert_scalar.default(ge_1, "Runtime assertion failed for expression u1 >= 0 on node 'ge_1'"); ge_1 = _assert_scalar_1 = None add: "f32[u0, u1][Max(1, u1), 1]cpu" = torch.ops.aten.add.Tensor(arg2_1, 1); arg2_1 = None mul_5: "f32[u0, u1][Max(1, u1), 1]cpu" = torch.ops.aten.mul.Tensor(add, 100); add = None return (mul_5,)""", # noqa: B950 @@ -4283,11 +4283,11 @@ def make_non_contiguous_tensor_and_test(cnt): aot_graphs, """\ def forward(self, arg0_1: "i64[1][1]cpu", arg1_1: "Sym(u1)", arg2_1: "Sym(s7)", arg3_1: "i64[u1][s7]cpu"): - ge_1: "Sym(u1 >= 0)" = arg1_1 >= 0 - _assert_scalar = torch.ops.aten._assert_scalar.default(ge_1, "Runtime assertion failed for expression u1 >= 0 on node 'ge'"); ge_1 = _assert_scalar = None + ge: "Sym(u1 >= 0)" = arg1_1 >= 0 + _assert_scalar = torch.ops.aten._assert_scalar.default(ge, "Runtime assertion failed for expression u1 >= 0 on node 'ge'"); ge = _assert_scalar = None _local_scalar_dense: "Sym(u0)" = torch.ops.aten._local_scalar_dense.default(arg0_1); arg0_1 = None - ge_2: "Sym(u0 >= 0)" = _local_scalar_dense >= 0 - _assert_scalar_1 = torch.ops.aten._assert_scalar.default(ge_2, "Runtime assertion failed for expression u0 >= 0 on node 'ge_1'"); ge_2 = _assert_scalar_1 = None + ge_1: "Sym(u0 >= 0)" = _local_scalar_dense >= 0 + _assert_scalar_1 = torch.ops.aten._assert_scalar.default(ge_1, "Runtime assertion failed for expression u0 >= 0 on node 'ge_1'"); ge_1 = _assert_scalar_1 = None pow_1: "Sym(u0**2)" = _local_scalar_dense ** 2 eq: "Sym(Eq(u1, u0**2))" = arg1_1 == pow_1; arg1_1 = pow_1 = None _assert_scalar_2 = torch.ops.aten._assert_scalar.default(eq, "Runtime assertion failed for expression Eq(u1, u0**2) on node 'eq'"); eq = _assert_scalar_2 = None @@ -4319,11 +4319,11 @@ def forward(self, arg0_1: "i64[1][1]cpu", arg1_1: "Sym(u1)", arg2_1: "Sym(s7)", aot_graphs, """\ def forward(self, arg0_1: "i64[1][1]cpu", arg1_1: "Sym(u1)", arg2_1: "i64[u1][1]cpu"): - ge_1: "Sym(u1 >= 0)" = arg1_1 >= 0 - _assert_scalar = torch.ops.aten._assert_scalar.default(ge_1, "Runtime assertion failed for expression u1 >= 0 on node 'ge'"); ge_1 = _assert_scalar = None + ge: "Sym(u1 >= 0)" = arg1_1 >= 0 + _assert_scalar = torch.ops.aten._assert_scalar.default(ge, "Runtime assertion failed for expression u1 >= 0 on node 'ge'"); ge = _assert_scalar = None _local_scalar_dense: "Sym(u0)" = torch.ops.aten._local_scalar_dense.default(arg0_1); arg0_1 = None - ge_2: "Sym(u0 >= 0)" = _local_scalar_dense >= 0 - _assert_scalar_1 = torch.ops.aten._assert_scalar.default(ge_2, "Runtime assertion failed for expression u0 >= 0 on node 'ge_1'"); ge_2 = _assert_scalar_1 = None + ge_1: "Sym(u0 >= 0)" = _local_scalar_dense >= 0 + _assert_scalar_1 = torch.ops.aten._assert_scalar.default(ge_1, "Runtime assertion failed for expression u0 >= 0 on node 'ge_1'"); ge_1 = _assert_scalar_1 = None pow_1: "Sym(u0**2)" = _local_scalar_dense ** 2 
eq: "Sym(Eq(u1, u0**2))" = arg1_1 == pow_1; arg1_1 = pow_1 = None _assert_scalar_2 = torch.ops.aten._assert_scalar.default(eq, "Runtime assertion failed for expression Eq(u1, u0**2) on node 'eq'"); eq = _assert_scalar_2 = None diff --git a/test/test_opaque_obj_v2.py b/test/test_opaque_obj_v2.py index 7dcddfb0f3906..24f60fdffd520 100644 --- a/test/test_opaque_obj_v2.py +++ b/test/test_opaque_obj_v2.py @@ -121,7 +121,7 @@ def size_impl(queue: OpaqueQueue) -> int: def size_impl_fake(q: OpaqueQueue) -> int: ctx = torch._custom_op.impl.get_ctx() u0 = ctx.new_dynamic_size() - torch._check_is_size(u0) + torch._check(u0 >= 0) return u0 torch.library.define( diff --git a/torch/__init__.py b/torch/__init__.py index e39e50a1f8409..ba8f60f5fffe0 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -33,7 +33,11 @@ TypeVar as _TypeVar, Union as _Union, ) -from typing_extensions import ParamSpec as _ParamSpec, TypeIs as _TypeIs +from typing_extensions import ( + deprecated as _deprecated, + ParamSpec as _ParamSpec, + TypeIs as _TypeIs, +) # As a bunch of torch.packages internally still have this check @@ -1735,7 +1739,10 @@ def _check(cond, message=None): # noqa: F811 _check_with(RuntimeError, cond, message) # pyrefly: ignore [bad-argument-type] -# TODO add deprecation annotation +@_deprecated( + "_check_is_size will be removed in a future PyTorch release along with guard_size_oblivious. \ + Use _check(i >= 0) instead." +) def _check_is_size(i, message=None, *, max=None): """Checks that a given integer is a valid size (i.e., is non-negative). You should use this over ``_check(i >= 0)`` because it can prevent diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py index bacc95d4c9154..44d33e3f73ac4 100644 --- a/torch/fx/experimental/symbolic_shapes.py +++ b/torch/fx/experimental/symbolic_shapes.py @@ -470,6 +470,10 @@ def has_static_value(a: Union[SymBool, SymFloat, SymInt, bool, float, int]) -> b return a.node.shape_env.bound_sympy(a.node.expr).is_singleton() # type: ignore[union-attr] +@deprecated( + "guard_size_oblivious will be removed. Consider using explicit unbacked handling \ + potentially utilizing guard_or_false, guard_or_true, or statically_known_true" +) def guard_size_oblivious(expr: Union[torch.SymBool, bool]) -> bool: """ Perform a guard on a symbolic boolean expression in a size oblivious way. diff --git a/torch/fx/passes/runtime_assert.py b/torch/fx/passes/runtime_assert.py index e475a5bc9b6df..3da33923d5363 100644 --- a/torch/fx/passes/runtime_assert.py +++ b/torch/fx/passes/runtime_assert.py @@ -576,17 +576,6 @@ def go(node, keypath): if i0 in constrained_unbacked_symbols: continue # constrain symbol just once - if i0 in shape_env.size_like: - if export: - graph.call_function( - torch.ops.aten.sym_constrain_range_for_size.default, - (expr_to_proxy[i0].node,), - ) - else: - graph.call_function( - torch._check_is_size, (expr_to_proxy[i0].node,) - ) - vr = shape_env.var_to_range[i0] if vr.is_int and vr.upper == sys.maxsize - 1: # treat upper bound == sys.maxsize - 1 for int symbols as +oo From ca3aaef66ed12e845e3c25f51899ecaaee08f0fb Mon Sep 17 00:00:00 2001 From: roei shlezinger Date: Mon, 17 Nov 2025 07:40:36 +0000 Subject: [PATCH 09/47] Fix clamp broadcasting on MPS (Fixes #160734) (#165058) This PR fixes a bug where `torch.clamp` on MPS fails when min/max tensors have more dimensions than the input tensor. CPU already supports this broadcasting, but MPS raised a RuntimeError. 
Example of failing case before the fix: ```python x = torch.randn(2, 3, device="mps") min_t = torch.randn(1, 2, 3, device="mps") max_t = torch.randn(1, 2, 3, device="mps") torch.clamp(x, min=min_t, max=max_t) # RuntimeError ``` After this fix, MPS matches CPU behavior. Fixes #160734 Pull Request resolved: https://github.com/pytorch/pytorch/pull/165058 Approved by: https://github.com/malfet --- .../native/mps/operations/TensorCompare.mm | 54 ++++++++++++------- test/test_mps.py | 22 +++++++- 2 files changed, 55 insertions(+), 21 deletions(-) diff --git a/aten/src/ATen/native/mps/operations/TensorCompare.mm b/aten/src/ATen/native/mps/operations/TensorCompare.mm index 90371fd8745c8..ed659bddd65cc 100644 --- a/aten/src/ATen/native/mps/operations/TensorCompare.mm +++ b/aten/src/ATen/native/mps/operations/TensorCompare.mm @@ -5,6 +5,7 @@ #include #include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -89,13 +90,21 @@ static void check_min_max_dims(const OptionalTensorRef clamp_opt, const Tensor& auto clamp_shape = clamp_opt->sizes(); auto input_shape = input_t.sizes(); - TORCH_CHECK(num_clamp_dims <= num_input_dims, - op_name + ": clamp tensor number of dims must not be greater than that of input tensor") + if (num_clamp_dims > num_input_dims) { + auto leading_dims = num_clamp_dims - num_input_dims; + for (int64_t i = 0; i < leading_dims; ++i) { + TORCH_CHECK(clamp_shape[i] == 1, + op_name + ": clamp tensor leading shape must be 1 to broadcast with input tensor"); + } + } - for (int i = 0; i < num_clamp_dims; i++) + auto clamp_idx = num_clamp_dims - 1; + auto input_idx = num_input_dims - 1; + auto common_dims = std::min(num_clamp_dims, num_input_dims); + for (int64_t i = 0; i < common_dims; ++i) // One of the indices is allowed to be 1; will be handled by broadcast - TORCH_CHECK(clamp_shape[num_clamp_dims - 1 - i] == input_shape[num_input_dims - 1 - i] || - clamp_shape[num_clamp_dims - 1 - i] == 1 || input_shape[num_input_dims - 1 - i] == 1, + TORCH_CHECK(clamp_shape[clamp_idx - i] == input_shape[input_idx - i] || clamp_shape[clamp_idx - i] == 1 || + input_shape[input_idx - i] == 1, op_name + ": clamp tensor trailing shape must match input tensor") } } @@ -136,9 +145,6 @@ static void clamp_tensor_out_mps(const Tensor& input_t, auto result_type = output_t.scalar_type(); - IntArrayRef new_min_shape; - IntArrayRef new_max_shape; - auto num_min_dims = min_opt->dim(); auto num_max_dims = max_opt->dim(); auto num_input_dims = input_t.dim(); @@ -146,24 +152,32 @@ static void clamp_tensor_out_mps(const Tensor& input_t, std::vector new_min_arr(num_input_dims); std::vector new_max_arr(num_input_dims); - if (has_min && num_min_dims < num_input_dims) { - fill_new_shape(num_input_dims, num_min_dims, new_min_arr.data(), min_opt->sizes()); - new_min_shape = IntArrayRef(new_min_arr); - } - - if (has_max && num_max_dims < num_input_dims) { - fill_new_shape(num_input_dims, num_max_dims, new_max_arr.data(), max_opt->sizes()); - new_max_shape = IntArrayRef(new_max_arr); - } - Tensor min_opt_tensor; Tensor max_opt_tensor; + auto reshape_clamp_tensor = [&](const OptionalTensorRef clamp_tensor_ref, + int64_t num_clamp_dims, + std::vector& new_shape_storage) -> Tensor { + IntArrayRef clamp_shape = clamp_tensor_ref->sizes(); + bool requires_view = false; + + if (num_clamp_dims > num_input_dims) { + clamp_shape = clamp_shape.slice(num_clamp_dims - num_input_dims); + requires_view = true; + } else if (num_clamp_dims < num_input_dims) { + fill_new_shape(num_input_dims, num_clamp_dims, 
new_shape_storage.data(), clamp_shape); + clamp_shape = IntArrayRef(new_shape_storage); + requires_view = true; + } + + return requires_view ? (*clamp_tensor_ref).view(clamp_shape) : *clamp_tensor_ref; + }; + if (has_min) { - min_opt_tensor = (num_min_dims < num_input_dims) ? (*min_opt).view(new_min_shape) : *min_opt; + min_opt_tensor = reshape_clamp_tensor(min_opt, num_min_dims, new_min_arr); } if (has_max) { - max_opt_tensor = (num_max_dims < num_input_dims) ? (*max_opt).view(new_max_shape) : *max_opt; + max_opt_tensor = reshape_clamp_tensor(max_opt, num_max_dims, new_max_arr); } @autoreleasepool { diff --git a/test/test_mps.py b/test/test_mps.py index 107aa3e4609d8..f78af8437060b 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -5616,7 +5616,6 @@ def helper(n, c, h, w): helper(2, 8, 4, 5) # Test clamp_max - def test_clamp_max(self): def helper(n, c, h, w): cpu_x = torch.randn(n, c, h, w, device='cpu', dtype=torch.float, requires_grad=False) @@ -5708,6 +5707,27 @@ def helper(n, c, h, w): helper(2, 8, 4, 5) + def test_clamp_tensor_bounds_broadcasting(self): + def helper(input_shape, bound_shape): + cpu_x = torch.randn(input_shape, device="cpu", dtype=torch.float32, requires_grad=False) + mps_x = cpu_x.detach().clone().to("mps") + + cpu_min_t = torch.randn(bound_shape, device="cpu", dtype=cpu_x.dtype, requires_grad=False) + cpu_max_t = cpu_min_t + torch.rand_like(cpu_min_t).abs() + + mps_min_t = cpu_min_t.detach().clone().to("mps") + mps_max_t = cpu_max_t.detach().clone().to("mps") + + clamp_cpu = torch.clamp(cpu_x, min=cpu_min_t, max=cpu_max_t) + clamp_mps = torch.clamp(mps_x, min=mps_min_t, max=mps_max_t) + + self.assertEqual(clamp_mps.cpu(), clamp_cpu) + + helper((2, 3), (1, 2, 3)) + helper((4, 2, 3), (1, 2, 3)) + helper((2, 3), (2, 3)) + + def test_divmode(self): def helper(shape, rounding_mode): for dtype in [torch.float32, torch.float16, torch.int32, torch.int64]: From b9bccec3bc903de51f696214c3a79096639d9002 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Mon, 17 Nov 2025 07:56:28 +0000 Subject: [PATCH 10/47] Revert "[ATen][CUDA] Add sm_121a flag for RowwiseScaledMM (#167734)" This reverts commit 226850cc66217e591c706397dd212b457ed61e22. 
Reverted https://github.com/pytorch/pytorch/pull/167734 on behalf of https://github.com/Aidyn-A due to fails on CUDA 12.8 ([comment](https://github.com/pytorch/pytorch/pull/167734#issuecomment-3540410067)) --- cmake/Codegen.cmake | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index c52fe1d2443b6..55d03b7c46320 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -118,11 +118,6 @@ if(INTERN_BUILD_ATEN_OPS) list(APPEND _file_compile_flags "-gencode;arch=compute_120a,code=sm_120a") endif() endif() - if("${_arch}" STREQUAL "121a") - if(_existing_arch_flags MATCHES ".*compute_120.*") - list(APPEND _file_compile_flags "-gencode;arch=compute_121a,code=sm_121a") - endif() - endif() endforeach() list(JOIN _file_compile_flags " " _file_compile_flags) @@ -131,7 +126,7 @@ if(INTERN_BUILD_ATEN_OPS) _BUILD_FOR_ADDITIONAL_ARCHS( "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/cuda/RowwiseScaledMM.cu" - "89;90a;100a;103a;120a;121a") + "89;90a;100a;103a;120a") _BUILD_FOR_ADDITIONAL_ARCHS( "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/cuda/ScaledGroupMM.cu" "90a") From 99117c1238c9adcd3fb2621e36c91f9d20ed2ff7 Mon Sep 17 00:00:00 2001 From: Aidyn-A Date: Mon, 17 Nov 2025 08:07:20 +0000 Subject: [PATCH 11/47] Remove old NVTX interface (#167637) The PR #167401 reminded me that the removal of old NVTX interface is long overdue, as the header-only NVTX3 has been around for more than 5 years and is shipped with all CUDA Toolkit versions of 12+. In addition to that, `libnvToolsExt.so` was removed in CUDA Toolkit 13 and onward. Pull Request resolved: https://github.com/pytorch/pytorch/pull/167637 Approved by: https://github.com/eqy --- caffe2/CMakeLists.txt | 5 ----- cmake/Dependencies.cmake | 5 +---- cmake/TorchConfig.cmake.in | 3 --- torch/CMakeLists.txt | 4 ---- torch/csrc/cuda/shared/nvtx.cpp | 11 +---------- torch/csrc/profiler/stubs/cuda.cpp | 4 ---- 6 files changed, 2 insertions(+), 30 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index d5c585c1e1f0b..9af0305778d38 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1643,8 +1643,6 @@ if(USE_CUDA) target_link_libraries(torch_cuda PUBLIC c10_cuda) if(TARGET torch::nvtx3) target_link_libraries(torch_cuda PRIVATE torch::nvtx3) - else() - target_link_libraries(torch_cuda PUBLIC torch::nvtoolsext) endif() target_include_directories( @@ -1741,9 +1739,6 @@ if(BUILD_SHARED_LIBS) if(USE_CUDA) target_link_libraries(torch_global_deps ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS}) target_link_libraries(torch_global_deps torch::cudart) - if(TARGET torch::nvtoolsext) - target_link_libraries(torch_global_deps torch::nvtoolsext) - endif() endif() install(TARGETS torch_global_deps DESTINATION "${TORCH_INSTALL_LIB_DIR}") endif() diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 733183ef50bd5..444a7590a8a07 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -968,11 +968,8 @@ find_package_handle_standard_args(nvtx3 DEFAULT_MSG nvtx3_dir) if(nvtx3_FOUND) add_library(torch::nvtx3 INTERFACE IMPORTED) target_include_directories(torch::nvtx3 INTERFACE "${nvtx3_dir}") - target_compile_definitions(torch::nvtx3 INTERFACE TORCH_CUDA_USE_NVTX3) else() - message(WARNING "Cannot find NVTX3, find old NVTX instead") - add_library(torch::nvtoolsext INTERFACE IMPORTED) - set_property(TARGET torch::nvtoolsext PROPERTY INTERFACE_LINK_LIBRARIES CUDA::nvToolsExt) + message(FATAL_ERROR "Cannot find NVTX3!") endif() diff --git 
a/cmake/TorchConfig.cmake.in b/cmake/TorchConfig.cmake.in index 0b32ffa99ceb5..8a5587cad272a 100644 --- a/cmake/TorchConfig.cmake.in +++ b/cmake/TorchConfig.cmake.in @@ -132,9 +132,6 @@ if(@USE_CUDA@) else() set(TORCH_CUDA_LIBRARIES ${CUDA_NVRTC_LIB}) endif() - if(TARGET torch::nvtoolsext) - list(APPEND TORCH_CUDA_LIBRARIES torch::nvtoolsext) - endif() if(@BUILD_SHARED_LIBS@) find_library(C10_CUDA_LIBRARY c10_cuda PATHS "${TORCH_INSTALL_PREFIX}/lib") diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index d92b9e19a76c5..4e65720180617 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -150,10 +150,6 @@ if(USE_CUDA) if(TARGET torch::nvtx3) list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::nvtx3) - else() - if(TARGET torch::nvtoolsext) - list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::nvtoolsext) - endif() endif() endif() diff --git a/torch/csrc/cuda/shared/nvtx.cpp b/torch/csrc/cuda/shared/nvtx.cpp index f4b3c8824b85c..8faf319071c37 100644 --- a/torch/csrc/cuda/shared/nvtx.cpp +++ b/torch/csrc/cuda/shared/nvtx.cpp @@ -2,18 +2,13 @@ #include // _wgetenv for nvtx #endif -#include - #ifndef ROCM_ON_WINDOWS -#if CUDART_VERSION >= 13000 || defined(TORCH_CUDA_USE_NVTX3) #include -#else // CUDART_VERSION >= 13000 || defined(TORCH_CUDA_USE_NVTX3) -#include -#endif // CUDART_VERSION >= 13000 || defined(TORCH_CUDA_USE_NVTX3) #else // ROCM_ON_WINDOWS #include #endif // ROCM_ON_WINDOWS #include +#include #include namespace torch::cuda::shared { @@ -55,11 +50,7 @@ static void* device_nvtxRangeStart(const char* msg, std::intptr_t stream) { void initNvtxBindings(PyObject* module) { auto m = py::handle(module).cast(); -#ifdef TORCH_CUDA_USE_NVTX3 auto nvtx = m.def_submodule("_nvtx", "nvtx3 bindings"); -#else - auto nvtx = m.def_submodule("_nvtx", "libNvToolsExt.so bindings"); -#endif nvtx.def("rangePushA", nvtxRangePushA); nvtx.def("rangePop", nvtxRangePop); nvtx.def("rangeStartA", nvtxRangeStartA); diff --git a/torch/csrc/profiler/stubs/cuda.cpp b/torch/csrc/profiler/stubs/cuda.cpp index 45c288b976ae2..f62afd8d1f303 100644 --- a/torch/csrc/profiler/stubs/cuda.cpp +++ b/torch/csrc/profiler/stubs/cuda.cpp @@ -1,11 +1,7 @@ #include #ifndef ROCM_ON_WINDOWS -#if CUDART_VERSION >= 13000 || defined(TORCH_CUDA_USE_NVTX3) #include -#else -#include -#endif #else // ROCM_ON_WINDOWS #include #endif // ROCM_ON_WINDOWS From 5804408f1bad17261665a9e5cc7b0e8f745d4b03 Mon Sep 17 00:00:00 2001 From: "Ma, Jing1" Date: Mon, 17 Nov 2025 11:11:23 +0000 Subject: [PATCH 12/47] [1/3][XPU][feature] The implementation of memory private pool in XPU device allocator (#166831) The implementation plan of MemPool for XPU, which is the dependance of [XPUGraph](https://github.com/pytorch/pytorch/pull/166285), following the [RFC](https://github.com/pytorch/pytorch/issues/162143). - [ ] ->#166831 - [ ] #166833 - [ ] #166843 Pull Request resolved: https://github.com/pytorch/pytorch/pull/166831 Approved by: https://github.com/EikanWang, https://github.com/gujinghui Co-authored-by: Eikan Wang --- c10/xpu/XPUCachingAllocator.cpp | 172 ++++++++++++++++++++++++++++---- c10/xpu/XPUCachingAllocator.h | 2 +- 2 files changed, 154 insertions(+), 20 deletions(-) diff --git a/c10/xpu/XPUCachingAllocator.cpp b/c10/xpu/XPUCachingAllocator.cpp index ba748449b29e3..3bd9eff0fee63 100644 --- a/c10/xpu/XPUCachingAllocator.cpp +++ b/c10/xpu/XPUCachingAllocator.cpp @@ -15,6 +15,8 @@ using namespace c10::CachingDeviceAllocator; // newly allocated memory with 512-byte alignment. 
constexpr size_t kDeviceAlignment = 512; +class XPUAllocator; + namespace { using stream_set = ska::flat_hash_set; @@ -23,14 +25,19 @@ typedef bool (*Comparison)(const Block*, const Block*); bool BlockComparatorSize(const Block* a, const Block* b); bool BlockComparatorAddress(const Block* a, const Block* b); +struct PrivatePool; + struct BlockPool { - BlockPool(bool small) + BlockPool(bool small, PrivatePool* private_pool = nullptr) : blocks(BlockComparatorSize), unmapped(BlockComparatorAddress), - is_small(small) {} + is_small(small), + owner_PrivatePool(private_pool) {} + std::set blocks; std::set unmapped; const bool is_small; + PrivatePool* owner_PrivatePool; }; struct ExpandableSegment; @@ -349,6 +356,43 @@ struct AllocParams { StatTypes stat_types = {}; }; +// Internal implementation that manages actual memory blocks. +// high level MemPool interface wraps PrivatePool via MempoolId. +struct PrivatePool { + PrivatePool(MempoolId_t id, XPUAllocator* allocator = nullptr) + : id(std::move(id)), + allocator_(allocator), + large_blocks(/*small=*/false, this), + small_blocks(/*small=*/true, this) {} + PrivatePool(const PrivatePool&) = delete; + PrivatePool(PrivatePool&&) = delete; + PrivatePool& operator=(const PrivatePool&) = delete; + PrivatePool& operator=(PrivatePool&&) = delete; + ~PrivatePool() = default; + + // default Mempool when no Mempool is specified + MempoolId_t id{0, 0}; + // Number of live graphs using this pool + int use_count{1}; + // Number of unfreed allocations made for this pool. When use_count and + // allocation_count drop to zero, we can delete this PrivatePool from + // graph_pools. + int allocation_count{0}; + XPUAllocator* allocator_; + BlockPool large_blocks; + BlockPool small_blocks; + + public: + XPUAllocator* allocator() { + return allocator_; + } +}; +struct MempoolIdHash { + std::size_t operator()(const MempoolId_t& mempool_id) const noexcept { + return mempool_id.first != 0 ? mempool_id.first : mempool_id.second; + } +}; + } // anonymous namespace class DeviceCachingAllocator { @@ -365,6 +409,13 @@ class DeviceCachingAllocator { bool set_fraction = false; std::vector expandable_segments; std::vector devices_with_peer_access; // reserved + std::vector>> + captures_underway; + ska::flat_hash_map, MempoolIdHash> + graph_pools; + // Pools no longer referenced by any graph. 
+ ska::flat_hash_map + graph_pools_freeable; size_t try_merge_blocks(Block* dst, Block* src, BlockPool& pool) { if (!src || src->allocated || src->event_count > 0 || @@ -463,7 +514,22 @@ class DeviceCachingAllocator { } } - BlockPool& get_pool(size_t size) { + BlockPool& get_pool(size_t size, sycl::queue* queue) { + if (C10_UNLIKELY(!captures_underway.empty())) { + for (auto& entry : captures_underway) { + // lookup for mempool id matching current capture graph + if (entry.second(queue)) { + auto it1 = graph_pools.find(entry.first); + // lookup mempool + TORCH_INTERNAL_ASSERT(it1 != graph_pools.end()); + if (size <= kSmallSize) { + return it1->second->small_blocks; + } else { + return it1->second->large_blocks; + } + } + } + } if (size < kSmallSize) { return small_blocks; } else { @@ -669,6 +735,10 @@ class DeviceCachingAllocator { if (!ptr) { return false; } + + if (p.pool->owner_PrivatePool) { + p.pool->owner_PrivatePool->allocation_count++; + } p.block = new Block(device, p.queue(), size, p.pool, ptr); for_each_selected_stat_type(p.stat_types, [&](size_t stat_type) { stats.reserved_bytes[stat_type].increase(size); @@ -677,11 +747,14 @@ class DeviceCachingAllocator { return true; } - void synchronize_and_free_events() { + void synchronize_and_free_events(PrivatePool* pool = nullptr) { for (auto& xe : xpu_events) { for (auto& e : xe.second) { auto event = e.first; auto* block = e.second; + if (pool && block->pool->owner_PrivatePool != pool) { + continue; + } event.wait(); block->event_count--; if (block->event_count == 0) { @@ -785,6 +858,13 @@ class DeviceCachingAllocator { for_each_selected_stat_type(stat_types, [&](size_t stat_type) { stats.reserved_bytes[stat_type].decrease(unmapped.size); }); + + if (block->pool->owner_PrivatePool) { + // The Freed block belonged to a XPU graph's PrivatePool. 
+ TORCH_INTERNAL_ASSERT( + block->pool->owner_PrivatePool->allocation_count > 0); + block->pool->owner_PrivatePool->allocation_count--; + } } void release_blocks(BlockPool& pool) { @@ -812,13 +892,41 @@ class DeviceCachingAllocator { } } - bool release_cached_blocks() { - synchronize_and_free_events(); - // See Note [Safe to Free Blocks on BlockPool] - c10::xpu::syncStreamsOnDevice(device_index); + bool release_cached_blocks(MempoolId_t mempool_id) { + if (mempool_id.first == 0 && mempool_id.second == 0 && + captures_underway.empty()) { + synchronize_and_free_events(); + // See Note [Safe to Free Blocks on BlockPool] + c10::xpu::syncStreamsOnDevice(device_index); + + release_blocks(large_blocks); + release_blocks(small_blocks); + } - release_blocks(large_blocks); - release_blocks(small_blocks); + for (auto it = graph_pools_freeable.begin(); + it != graph_pools_freeable.end();) { + if (mempool_id.first != 0 || mempool_id.second != 0) { + if (it->first == mempool_id) { + // If there is an active mempool, we sync only the events + // associated with the pool + synchronize_and_free_events(it->second); + } else { + // otherwise we move on + ++it; + continue; + } + } + TORCH_INTERNAL_ASSERT(it->second->use_count == 0); + release_blocks(it->second->small_blocks); + release_blocks(it->second->large_blocks); + if (it->second->allocation_count == 0) { + auto erase_count = graph_pools.erase(it->first); + TORCH_INTERNAL_ASSERT(erase_count == 1); + it = graph_pools_freeable.erase(it); + } else { + ++it; + } + } return true; } @@ -903,6 +1011,30 @@ class DeviceCachingAllocator { } } + void create_or_incref_pool( + MempoolId_t mempool_id, + XPUAllocator* allocator = nullptr) { + auto it = graph_pools.find(mempool_id); + if (it == graph_pools.end()) { + // mempool_id does not reference an existing pool. + // Make a new pool for XPU graph capture or memory pool usage. + graph_pools.emplace( + mempool_id, std::make_unique(mempool_id, allocator)); + } else { + // mempool_id references an existing pool, which the current XPU graph + // capture will share. + TORCH_INTERNAL_ASSERT(it->second->use_count > 0); + TORCH_INTERNAL_ASSERT(allocator == nullptr); + it->second->use_count++; + } + } + + PrivatePool* get_private_pool(MempoolId_t mempool_id) { + auto it = graph_pools.find(mempool_id); + TORCH_INTERNAL_ASSERT(it != graph_pools.end()); + return it->second.get(); + } + public: DeviceCachingAllocator(DeviceIndex device_index) : large_blocks(/* small */ false), @@ -911,9 +1043,11 @@ class DeviceCachingAllocator { Block* malloc(DeviceIndex device, size_t orig_size, sycl::queue& queue) { std::scoped_lock lock(mutex); - process_events(); + if (C10_LIKELY(captures_underway.empty())) { + process_events(); + } size_t size = round_size(orig_size); - auto& pool = get_pool(size); + auto& pool = get_pool(size, &queue); const size_t alloc_size = get_allocation_size(size); AllocParams params(device, size, &queue, &pool, alloc_size); params.stat_types = get_stat_types_for_pool(pool); @@ -923,7 +1057,7 @@ class DeviceCachingAllocator { // Can't reuse an existing block, try to get a new one. 
if (!block_found) { block_found = alloc_block(params, false) || - (release_cached_blocks() && alloc_block(params, true)); + (release_cached_blocks({0, 0}) && alloc_block(params, true)); } if (!block_found) { const auto& raw_device = c10::xpu::get_raw_device(device); @@ -1016,9 +1150,9 @@ class DeviceCachingAllocator { block->stream_uses.insert(stream); } - void emptyCache() { + void emptyCache(MempoolId_t mempool_id) { std::scoped_lock lock(mutex); - release_cached_blocks(); + release_cached_blocks(mempool_id); } DeviceStats getStats() { @@ -1172,9 +1306,9 @@ class XPUAllocator : public DeviceAllocator { } } - void emptyCache(MempoolId_t mempool_id [[maybe_unused]] = {0, 0}) override { + void emptyCache(MempoolId_t mempool_id) override { for (auto& da : device_allocators) { - da->emptyCache(); + da->emptyCache(mempool_id); } } @@ -1290,8 +1424,8 @@ void init(DeviceIndex device_count) { return allocator.init(device_count); } -void emptyCache() { - return allocator.emptyCache(); +void emptyCache(MempoolId_t mempool_id) { + return allocator.emptyCache(mempool_id); } void resetPeakStats(DeviceIndex device) { diff --git a/c10/xpu/XPUCachingAllocator.h b/c10/xpu/XPUCachingAllocator.h index b1f41a103f8f8..bbb20a5b2ecdf 100644 --- a/c10/xpu/XPUCachingAllocator.h +++ b/c10/xpu/XPUCachingAllocator.h @@ -10,7 +10,7 @@ C10_XPU_API Allocator* get(); C10_XPU_API void init(DeviceIndex device_count); -C10_XPU_API void emptyCache(); +C10_XPU_API void emptyCache(MempoolId_t mempool_id = {0, 0}); C10_XPU_API void resetPeakStats(DeviceIndex device); From 93ddd38ecd7e1d2ce6ce6393deae6acac553c609 Mon Sep 17 00:00:00 2001 From: Ivan Grigorev Date: Mon, 17 Nov 2025 12:05:08 +0000 Subject: [PATCH 13/47] Re-land#2 "Fix thread safety in getCurrentCUDABlasHandle and getCUDABlasLtWorkspace" (#167928) Summary: getCurrentCUDABlasHandle() and getCUDABlasLtWorkspace() use static mutable maps that are not protected from concurrent read-and-write. This leads to crashes. This diff adds mutexes to synchronize access to the static maps. Re-land context: This is a re-land of https://github.com/pytorch/pytorch/pull/167248. A few issues were addressed: - fix for a bug in fast path: premature return in getCurrentCUDABlasHandle) - fix for test flakiness (https://github.com/pytorch/pytorch/pull/167884) Test Plan: 1. regression tests: buck2 test \mode/opt //caffe2/test\:test_transformers_cuda https://www.internalfb.com/intern/testinfra/testrun/6192449759713581 2. 
Use a GPU OD, run multi-threaded tests with TSAN: buck test fbcode//mode/dev-tsan fbcode//caffe2:cuda_cublas_handle_pool_test -- --stress-runs 100 https://www.internalfb.com/intern/testinfra/testrun/14355223937501118 Differential Revision: D87111985 Pull Request resolved: https://github.com/pytorch/pytorch/pull/167928 Approved by: https://github.com/Skylion007 --- aten/src/ATen/cuda/CUDAContextLight.h | 10 +- aten/src/ATen/cuda/CublasHandlePool.cpp | 95 +++++++++++++++---- aten/src/ATen/test/CMakeLists.txt | 1 + .../test/cuda_cublas_handle_pool_test.cpp | 77 +++++++++++++++ 4 files changed, 161 insertions(+), 22 deletions(-) create mode 100644 aten/src/ATen/test/cuda_cublas_handle_pool_test.cpp diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h index 86e960cc1ab4a..01d10f61da692 100644 --- a/aten/src/ATen/cuda/CUDAContextLight.h +++ b/aten/src/ATen/cuda/CUDAContextLight.h @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -88,8 +89,13 @@ TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle(); TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle(); TORCH_CUDA_CPP_API void clearCublasWorkspaces(); -TORCH_CUDA_CPP_API std::map, at::DataPtr>& cublas_handle_stream_to_workspace(); -TORCH_CUDA_CPP_API std::map, at::DataPtr>& cublaslt_handle_stream_to_workspace(); +struct WorkspaceMapWithMutex { + std::map, at::DataPtr> map; + std::shared_mutex mutex; +}; + +TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublas_handle_stream_to_workspace(); +TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace(); TORCH_CUDA_CPP_API size_t getChosenWorkspaceSize(); TORCH_CUDA_CPP_API size_t getCUDABlasLtWorkspaceSize(); TORCH_CUDA_CPP_API void* getCUDABlasLtWorkspace(); diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp index 6175e69827e2f..9ec3acf4cd29e 100644 --- a/aten/src/ATen/cuda/CublasHandlePool.cpp +++ b/aten/src/ATen/cuda/CublasHandlePool.cpp @@ -99,7 +99,7 @@ void destroyCublasHandle(cublasHandle_t handle) { // - Comments of @soumith copied from cuDNN handle pool implementation #ifdef NO_CUDNN_DESTROY_HANDLE #else - cublasDestroy(handle); + cublasDestroy(handle); #endif } @@ -107,19 +107,27 @@ using CuBlasPoolType = DeviceThreadHandlePool, at::DataPtr>& cublas_handle_stream_to_workspace() { - static auto& instance = *new std::map, at::DataPtr>; +WorkspaceMapWithMutex& cublas_handle_stream_to_workspace() { + static auto& instance = *new WorkspaceMapWithMutex; return instance; } -std::map, at::DataPtr>& cublaslt_handle_stream_to_workspace() { - static auto& instance = *new std::map, at::DataPtr>; +WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace() { + static auto& instance = *new WorkspaceMapWithMutex; return instance; } void clearCublasWorkspaces() { - cublas_handle_stream_to_workspace().clear(); - cublaslt_handle_stream_to_workspace().clear(); + { + auto& workspace = cublas_handle_stream_to_workspace(); + std::unique_lock lock(workspace.mutex); + workspace.map.clear(); + } + { + auto& workspace = cublaslt_handle_stream_to_workspace(); + std::unique_lock lock(workspace.mutex); + workspace.map.clear(); + } } size_t parseChosenWorkspaceSize() { @@ -233,6 +241,38 @@ at::DataPtr getNewCUDABlasLtWorkspace() { return c10::cuda::CUDACachingAllocator::get()->allocate(getCUDABlasLtWorkspaceSize()); } +void setWorkspaceForHandle(cublasHandle_t handle, c10::cuda::CUDAStream stream) { + cudaStream_t _stream = stream; + auto key = std::make_tuple(static_cast(handle), 
static_cast(_stream)); + + auto& workspace = cublas_handle_stream_to_workspace(); + + size_t workspace_size = getChosenWorkspaceSize(); + + // Fast path: check if workspace already exists + { + std::shared_lock lock(workspace.mutex); + auto workspace_it = workspace.map.find(key); + if (workspace_it != workspace.map.end()) { + TORCH_CUDABLAS_CHECK(cublasSetWorkspace( + handle, workspace_it->second.get(), workspace_size)); + return; + } + } + + // Slow path: allocate workspace outside the lock + auto new_workspace = getNewWorkspace(); + + // Insert with lock (double-check in case another thread inserted while we + // were allocating) + { + std::unique_lock lock(workspace.mutex); + auto workspace_it = workspace.map.try_emplace(key, std::move(new_workspace)).first; + TORCH_CUDABLAS_CHECK( + cublasSetWorkspace(handle, workspace_it->second.get(), workspace_size)); + } +} + void* getCUDABlasLtWorkspace() { #ifndef USE_ROCM static bool unified = c10::utils::check_env(TORCH_CUBLASLT_UNIFIED_WORKSPACE) == true; @@ -241,8 +281,10 @@ void* getCUDABlasLtWorkspace() { auto stream = c10::cuda::getCurrentCUDAStream(); cudaStream_t _stream = stream; auto key = std::make_tuple(static_cast(handle), static_cast(_stream)); - auto workspace_it = at::cuda::cublas_handle_stream_to_workspace().find(key); - TORCH_INTERNAL_ASSERT(workspace_it != at::cuda::cublas_handle_stream_to_workspace().end()); + auto& workspace = at::cuda::cublas_handle_stream_to_workspace(); + std::shared_lock lock(workspace.mutex); + auto workspace_it = workspace.map.find(key); + TORCH_INTERNAL_ASSERT(workspace_it != workspace.map.end()); return workspace_it->second.mutable_get(); } #endif @@ -250,11 +292,29 @@ void* getCUDABlasLtWorkspace() { auto stream = c10::cuda::getCurrentCUDAStream(); cudaStream_t _stream = stream; auto key = std::make_tuple(static_cast(handle), static_cast(_stream)); - auto workspace_it = cublaslt_handle_stream_to_workspace().find(key); - if (workspace_it == cublaslt_handle_stream_to_workspace().end()) { - workspace_it = cublaslt_handle_stream_to_workspace().insert(workspace_it, {key, getNewCUDABlasLtWorkspace()}); + + auto& workspace = cublaslt_handle_stream_to_workspace(); + + // Fast path: check if workspace already exists + { + std::shared_lock lock(workspace.mutex); + auto workspace_it = workspace.map.find(key); + if (workspace_it != workspace.map.end()) { + return workspace_it->second.mutable_get(); + } + } + + // Slow path: allocate workspace outside the lock + auto new_workspace = getNewCUDABlasLtWorkspace(); + + // Insert with lock (double-check in case another thread inserted while we + // were allocating) + { + std::unique_lock lock(workspace.mutex); + auto workspace_it = + workspace.map.try_emplace(key, std::move(new_workspace)).first; + return workspace_it->second.mutable_get(); } - return workspace_it->second.mutable_get(); } cublasHandle_t getCurrentCUDABlasHandle() { @@ -298,13 +358,8 @@ cublasHandle_t getCurrentCUDABlasHandle() { // will allocate memory dynamically (even if they're cheap) outside // PyTorch's CUDA caching allocator. 
It's possible that CCA used up // all the memory and cublas's cudaMallocAsync will return OOM - cudaStream_t _stream = stream; - auto key = std::make_tuple(static_cast(handle), static_cast(_stream)); - auto workspace_it = cublas_handle_stream_to_workspace().find(key); - if (workspace_it == cublas_handle_stream_to_workspace().end()) { - workspace_it = cublas_handle_stream_to_workspace().insert(workspace_it, {key, getNewWorkspace()}); - } - TORCH_CUDABLAS_CHECK(cublasSetWorkspace(handle, workspace_it->second.get(), getChosenWorkspaceSize())); + setWorkspaceForHandle(handle, stream); + #if !defined(USE_ROCM) // On CUDA >= 11, and architecture >= Ampere, cuBLAS can use TF32 to speedup // FP32 data type calculations based on the value of the allow_tf32 flag. diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt index 81b3ce90b36bf..a522e7ab76cf4 100644 --- a/aten/src/ATen/test/CMakeLists.txt +++ b/aten/src/ATen/test/CMakeLists.txt @@ -61,6 +61,7 @@ list(APPEND ATen_CUDA_TEST_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cuda_complex_math_test.cu ${CMAKE_CURRENT_SOURCE_DIR}/cuda_complex_test.cu ${CMAKE_CURRENT_SOURCE_DIR}/cuda_cub_test.cu + ${CMAKE_CURRENT_SOURCE_DIR}/cuda_cublas_handle_pool_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cuda_device_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/cuda_distributions_test.cu ${CMAKE_CURRENT_SOURCE_DIR}/cuda_dlconvertor_test.cpp diff --git a/aten/src/ATen/test/cuda_cublas_handle_pool_test.cpp b/aten/src/ATen/test/cuda_cublas_handle_pool_test.cpp new file mode 100644 index 0000000000000..535bb3d1cc2ea --- /dev/null +++ b/aten/src/ATen/test/cuda_cublas_handle_pool_test.cpp @@ -0,0 +1,77 @@ +#include + +#include +#include +#include + +#include +#include +#include + +// Test concurrent access to getCurrentCUDABlasHandle and getCUDABlasLtWorkspace +// to verify that the data race fix is working correctly + +TEST(CUDABlasHandlePoolTest, ConcurrentGetAndClearWorkspaces) { + if (!at::cuda::is_available()) { + return; + } + + constexpr int num_accessor_threads = 15; + constexpr int num_clear_threads = 5; + constexpr int iterations_per_thread = 50; + + std::atomic stop{false}; + std::atomic error_count{0}; + std::vector threads; + threads.reserve(num_accessor_threads + num_clear_threads); + + // Launch accessor threads + for (int i = 0; i < num_accessor_threads; ++i) { + threads.emplace_back([&stop, &error_count]() { + try { + at::cuda::CUDAGuard device_guard(0); + + while (!stop.load(std::memory_order_relaxed)) { + const auto handle = at::cuda::getCurrentCUDABlasHandle(); + const auto workspace = at::cuda::getCUDABlasLtWorkspace(); + + if (handle == nullptr || workspace == nullptr) { + error_count++; + } + } + } catch (const std::exception& e) { + error_count++; + } + }); + } + + // Launch threads that clear workspaces + for (int i = 0; i < num_clear_threads; ++i) { + threads.emplace_back([&error_count]() { + try { + for (int j = 0; j < iterations_per_thread; ++j) { + at::cuda::clearCublasWorkspaces(); + std::this_thread::yield(); + } + } catch (const std::exception& e) { + error_count++; + } + }); + } + + // Let them run for a bit + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + stop.store(true, std::memory_order_relaxed); + + for (auto& thread : threads) { + thread.join(); + } + + EXPECT_EQ(error_count.load(), 0); +} + +int main(int argc, char* argv[]) { + ::testing::InitGoogleTest(&argc, argv); + c10::cuda::CUDACachingAllocator::init(1); + return RUN_ALL_TESTS(); +} From 53809f964083a9e89182c2db7638fd44f3a6e304 Mon Sep 17 00:00:00 2001 From: 
Usamah Zaheer Date: Mon, 17 Nov 2025 12:06:30 +0000 Subject: [PATCH 14/47] [ARM] Improve LLM performance & mem usage using int4-bf16 KleidiAI kernels (#158250) Co-authored-by: Nikhil Gupta [nikhil.gupta2@arm.com](mailto:nikhil.gupta2@arm.com) This PR enables the use of KleidiAI INT4 kernels that directly produce BF16 outputs within PyTorch to boost LLM prefill & decode performance **This change improves decode throughput by ~15% & reduces memory required to inference the model by 50%** ### Benchmark Setup ``` Model: meta-llama/Llama-3.1-8B Test Platform: Neoverse V2 ``` ### Detailed Results | Metric | With `--compile` | Without `--compile` | |----------------------------------|---------------------------|---------------------------| | Quantization Scheme | INT4 symmetric channelwise | INT4 symmetric channelwise | | Input Precision | BF16 | BF16 | | Number of Layers Quantized | 32 | 32 | | Average Compression Ratio | 87.49% | 87.49% | | Total Quantization Time (s) | 9.62 | 10.32 | | Compile Time (First) (s) | 134.48 | 1.69 | | Compile Time (Second) (s) | 80.44 | 1.60 | | Compile Time (Subsequent) (s) | 0.19 | 0.22 | | Prefill Tokens | 54 | 54 | | Decoded Tokens | 33 | 33 | | Prefill Time (s) | 0.19 | 0.22 | | Decode Time (s) | 0.76 | 1.38 | | E2E Generation Time (s) | 0.95 | 1.60 | | Prefill Throughput (tokens/s) | 288.13 | 249.91 | | Decode Throughput (tokens/s) | 43.42 | 23.83 | Pull Request resolved: https://github.com/pytorch/pytorch/pull/158250 Approved by: https://github.com/malfet, https://github.com/aditew01, https://github.com/fadara01 Co-authored-by: Nikhil Gupta Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com> --- aten/src/ATen/native/LinearAlgebra.cpp | 4 +- aten/src/ATen/native/cpu/int4mm_kernel.cpp | 343 +++++++++++++----- aten/src/ATen/native/kleidiai/kai_kernels.cpp | 200 ++++++++-- aten/src/ATen/native/kleidiai/kai_kernels.h | 3 +- aten/src/ATen/native/kleidiai/kai_pack.h | 9 +- .../native/kleidiai/kai_ukernel_interface.cpp | 34 ++ .../native/kleidiai/kai_ukernel_interface.h | 89 ++++- test/inductor/test_torchinductor.py | 106 +++++- torch/_meta_registrations.py | 11 +- 9 files changed, 662 insertions(+), 137 deletions(-) diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 07bdc19ec8ff7..934ecb99d3382 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -3541,9 +3541,9 @@ Tensor _dyn_quant_matmul_4bit_cpu( const int64_t out_features) { auto M = inp.size(0); TORCH_CHECK( - inp.dtype() == kFloat, + inp.dtype() == kFloat || (inp.dtype() == kBFloat16 && block_size == in_features), __func__, - " : expect input to be 32-bit float tensor."); + " : expect input to be float32 or bfloat16 tensor."); TORCH_CHECK( block_size == in_features || (!(block_size % 32) && !(in_features % block_size)), diff --git a/aten/src/ATen/native/cpu/int4mm_kernel.cpp b/aten/src/ATen/native/cpu/int4mm_kernel.cpp index 33aae4fbf27a5..1ffaa7bcd90b7 100644 --- a/aten/src/ATen/native/cpu/int4mm_kernel.cpp +++ b/aten/src/ATen/native/cpu/int4mm_kernel.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -793,6 +794,139 @@ bool can_use_kleidiai( } #endif +static void ref_dyn_quant_matmul_4bit_channelwise_kernel_bf16( + size_t m, + size_t n, + size_t k, + const uint16_t* lhs_bf16, + const uint8_t* rhs_qs4cx, + const float* rhs_scales, + uint16_t* dst_bf16, + float scalar_min, + float scalar_max, + const float* bias) { + // Roundup lambda for internal stride calculations + 
auto roundup = [](size_t a, size_t b) { return ((a + b - 1) / b) * b; }; + + // Cast bfloat16 to float32 inline + auto cast_bf16_to_f32 = [](uint16_t bf16_val) { + uint32_t tmp = static_cast(bf16_val) << 16; + float f; + std::memcpy(&f, &tmp, sizeof(f)); + return f; + }; + + // Cast float32 to bfloat16 inline + auto cast_f32_to_bf16 = [](float f) { + uint32_t bits; + std::memcpy(&bits, &f, sizeof(bits)); + return static_cast(bits >> 16); + }; + + // Quantization pack lambda (channelwise QA8DX) + auto quant_pack_8bit_channelwise = + [&](size_t M, size_t K, const uint16_t* src_bf16, int8_t* dst_qa8dx) { + constexpr int8_t kI8Min = std::numeric_limits::lowest(); + constexpr int8_t kI8Max = std::numeric_limits::max(); + + const size_t dst_stride = + K * sizeof(int8_t) + sizeof(float) + sizeof(int32_t); + for (size_t i = 0; i < M; ++i) { + const uint16_t* row_ptr = src_bf16 + i * K; + // find min/max + float mn = FLT_MAX, mx = -FLT_MAX; + for (size_t j = 0; j < K; ++j) { + float v = cast_bf16_to_f32(row_ptr[j]); + mn = std::min(mn, v); + mx = std::max(mx, v); + } + float rmin = std::min(0.0f, mn); + float rmax = std::max(0.0f, mx); + constexpr float qmin = static_cast(kI8Min); + constexpr float qmax = static_cast(kI8Max); + float scale = (rmin == rmax) ? 1.f : (qmax - qmin) / (rmax - rmin); + float recip = scale ? 1.0f / scale : 0.0f; + int32_t zp; + float des_min = rmin * scale; + float des_max = rmax * scale; + float err_min = qmin + des_min; + float err_max = qmax + des_max; + float zp_f = + (err_min + err_max) > 0 ? qmin - des_min : qmax - des_max; + zp_f = std::clamp(zp_f, qmin, qmax); + zp = std::lrintf(zp_f); + int8_t* out_ptr = dst_qa8dx + i * dst_stride; + // store header + *reinterpret_cast(out_ptr) = recip; + *reinterpret_cast(out_ptr + sizeof(float)) = -zp; + out_ptr += sizeof(float) + sizeof(int32_t); + // quantize + for (size_t j = 0; j < K; ++j) { + float v = cast_bf16_to_f32(row_ptr[j]); + int32_t q = static_cast(std::round(v * scale)) + zp; + q = std::clamp( + q, static_cast(kI8Min), static_cast(kI8Max)); + *out_ptr++ = static_cast(q); + } + } + }; + + // MatMul lambda (MXN x MXK -> MNXK BF16) + auto matmul_kernel = [&](size_t M, + size_t N, + size_t K, + const int8_t* lhs, + const uint8_t* rhs, + const float* scales, + uint16_t* dst, + float lo, + float hi) { + const size_t lhs_stride = + K * sizeof(int8_t) + sizeof(float) + sizeof(int32_t); + const size_t rhs_stride = roundup(K, 2) / 2; + for (size_t i = 0; i < M; ++i) { + const int8_t* lhs_row = lhs + i * lhs_stride; + for (size_t j = 0; j < N; ++j) { + int32_t acc = 0; + const int8_t* lptr = lhs_row; + const uint8_t* rptr = rhs + j * rhs_stride; + float lhs_scale = *reinterpret_cast(lptr); + int32_t lhs_off = + *reinterpret_cast(lptr + sizeof(float)); + lptr += sizeof(float) + sizeof(int32_t); + for (size_t t = 0; t < K; ++t) { + int32_t lv = static_cast(lptr[t]); + uint8_t bv = rptr[t / 2]; + int32_t rv = ((t & 1) == 0) ? 
(static_cast(bv & 0xF) - 8) + : (static_cast(bv >> 4) - 8); + acc += lv * rv + lhs_off * rv; + } + float res = static_cast(acc) * scales[j] * lhs_scale; + if (bias) { + res += bias[j]; + } + res = std::clamp(res, lo, hi); + *dst++ = cast_f32_to_bf16(res); + } + } + }; + + // allocate and run + std::unique_ptr packed( + new int8_t[m * (k * sizeof(int8_t) + sizeof(float) + sizeof(int32_t))]); + quant_pack_8bit_channelwise(m, k, lhs_bf16, packed.get()); + matmul_kernel( + m, + n, + k, + packed.get(), + rhs_qs4cx, + rhs_scales, + dst_bf16, + scalar_min, + scalar_max); +} + /** * The Int4 quantized weights must be represented as a uint8 tensor * For matrix multiplication with a weight shape of (N x K) @@ -819,21 +953,21 @@ void dyn_quant_pack_4bit_weight_kernel( #if AT_KLEIDIAI_ENABLED() if (can_use_kleidiai(scales_zeros, K, block_size)) { const int64_t weight_packed_size = - kleidiai::kai_pack_rhs_int4_size(N, K, block_size); + kleidiai::kai_pack_rhs_int4_size(N, K, block_size, weights.scalar_type()); packed_weights.resize_({weight_packed_size}); kleidiai::kai_pack_int4_rhs( packed_weights, weights, scales_zeros, bias, N, K, block_size); } else #endif { - TORCH_CHECK( - bias.has_value() == 0, - __func__, - " : Bias is unsupported in reference implementation"); packed_weights = packed_weights.to(kFloat); - auto weight_reshaped = weights.view({-1}).to(kFloat); - auto scales_zeros_reshaped = scales_zeros.view({-1}).to(kFloat); - auto res = at::cat({weight_reshaped, scales_zeros_reshaped}, 0); + auto weight_reshaped = weights.reshape({-1}).to(kFloat); + auto scales_zeros_reshaped = scales_zeros.reshape({-1}).to(kFloat); + std::vector tensors_to_cat = {weight_reshaped, scales_zeros_reshaped}; + if (bias.has_value()) { + tensors_to_cat.push_back(bias.value().view({-1}).to(kFloat)); + } + auto res = at::cat(tensors_to_cat, 0); packed_weights.resize_(res.sizes()).copy_(res); } } @@ -847,7 +981,8 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel( const float* rhs_scales_f32, float* dst_f32, float scalar_min, - float scalar_max) { + float scalar_max, + const float* bias) { const size_t input_size_8bit = m * (k + sizeof(int32_t) + sizeof(float)); auto lhs_qa8dx_buffer = std::make_unique(input_size_8bit); @@ -857,6 +992,9 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel( // required format for matmul auto input_quant_pack_8bit_channelwise = [&](size_t m, size_t k, const float* lhs_f32, int8_t* lhs_qa8dx) { + constexpr int8_t kI8Min = std::numeric_limits::lowest(); + constexpr int8_t kI8Max = std::numeric_limits::max(); + const size_t dst_stride = (k * sizeof(int8_t) + sizeof(float) + sizeof(int32_t)); @@ -877,8 +1015,8 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel( } // Maximum/minimum int8 values - const float qmin = (float)INT8_MIN; - const float qmax = (float)INT8_MAX; + constexpr float qmin = static_cast(kI8Min); + constexpr float qmax = static_cast(kI8Max); const float rmin0 = std::min(0.0f, min0); const float rmax0 = std::max(0.0f, max0); @@ -904,7 +1042,7 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel( zero_point0 = std::min(zero_point0, qmax); // Round to nearest integer - const int32_t nudged_zero_point0 = lrintf(zero_point0); + const int32_t nudged_zero_point0 = std::lrintf(zero_point0); int8_t* dst_ptr = lhs_qa8dx + m_idx * dst_stride; @@ -922,8 +1060,8 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel( int32_t v0_s32 = (int32_t)(std::round(src0_0 * scale0)); v0_s32 = v0_s32 + nudged_zero_point0; - v0_s32 = std::max(v0_s32, static_cast(INT8_MIN)); - v0_s32 = 
std::min(v0_s32, static_cast(INT8_MAX)); + v0_s32 = std::max(v0_s32, static_cast(kI8Min)); + v0_s32 = std::min(v0_s32, static_cast(kI8Max)); dst_ptr[0] = (int8_t)v0_s32; dst_ptr += sizeof(int8_t); } @@ -987,6 +1125,10 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel( main_acc = main_acc * lhs_scale; + if (bias) { + main_acc += bias[n_idx]; + } + // Clamp (min-max) operation main_acc = std::max(main_acc, scalar_min); main_acc = std::min(main_acc, scalar_max); @@ -1007,12 +1149,16 @@ void ref_dyn_quant_matmul_4bit_groupwise_kernel( const float* rhs_scales_fp32, float* dst_f32, float scalar_min, - float scalar_max) { + float scalar_max, + const float* bias) { // Lambda for LHS quantization auto lhs_quant_pack = [&](size_t m, size_t k, const float* lhs_f32, int8_t* lhs_qa8dx) { + constexpr int8_t kI8Min = std::numeric_limits::lowest(); + constexpr int8_t kI8Max = std::numeric_limits::max(); + const size_t dst_stride = (k * sizeof(int8_t) + sizeof(float) + sizeof(int32_t)); @@ -1028,8 +1174,8 @@ void ref_dyn_quant_matmul_4bit_groupwise_kernel( min0 = std::min(src0_0, min0); } - const float qmin = (float)INT8_MIN; - const float qmax = (float)INT8_MAX; + constexpr float qmin = static_cast(kI8Min); + constexpr float qmax = static_cast(kI8Max); const float rmin0 = std::min(0.0f, min0); const float rmax0 = std::max(0.0f, max0); @@ -1046,7 +1192,7 @@ void ref_dyn_quant_matmul_4bit_groupwise_kernel( zero_point0 = std::max(zero_point0, qmin); zero_point0 = std::min(zero_point0, qmax); - const int32_t nudged_zero_point0 = lrintf(zero_point0); + const int32_t nudged_zero_point0 = std::lrintf(zero_point0); int8_t* dst_ptr = lhs_qa8dx + row_idx * dst_stride; @@ -1059,9 +1205,8 @@ void ref_dyn_quant_matmul_4bit_groupwise_kernel( const float src0_0 = src_ptr[k_idx]; int32_t v0_s32 = (int32_t)(std::round(src0_0 * scale0)); v0_s32 = std::max( - std::min( - v0_s32 + nudged_zero_point0, static_cast(INT8_MAX)), - static_cast(INT8_MIN)); + std::min(v0_s32 + nudged_zero_point0, static_cast(kI8Max)), + static_cast(kI8Min)); dst_ptr[0] = (int8_t)v0_s32; dst_ptr += sizeof(int8_t); } @@ -1118,6 +1263,11 @@ void ref_dyn_quant_matmul_4bit_groupwise_kernel( } main_acc = main_acc * lhs_scale; + + if (bias) { + main_acc += bias[col_idx]; + } + main_acc = std::max(main_acc, scalar_min); main_acc = std::min(main_acc, scalar_max); @@ -1128,28 +1278,27 @@ void ref_dyn_quant_matmul_4bit_groupwise_kernel( } /** - * Dynamic Input Quant 4 bit weights matmul execution flow - (INT4 Weights + FP scales + FP32 Bias) - FP32 Input Packed Buffer - | | - Quantize Cast - to INT8 to INT8 - | | - v v - INT8 Input INT8 Weights - \ / - \ / - \ / - INT8 Matrix Multiplication - | - v - FP32 Dequantized and Accumulate in FP32 - | - v - FP32 Final Output - - * The Groupwise kernel requires BFloat16 Scales and Channelwise kernel requires - * Float32 Scales. If not provided, we will use fallback implementation. + * Dynamic INT4 weight-only MatMul with per-row input quantization. 
+ * + * Execution Flow: + * + * (INT4 Weights + FP Scales [+ optional Bias]) + * + * Input (FP32 or BF16) Packed Weight Buffer + * | | + * Row-wise Quantization (INT8) | + * | | + * INT8 Input Activation INT4 Quantized Weights + Scales + * \ / + * \ / + * Quantized Matrix Multiply + * | + * Output Tensor (BF16 or FP32) + * + * Notes: + * - Groupwise kernels expect BF16 scales + * - Channelwise kernels expect FP32 scales + * - Bias is currently unsupported in fallback path */ void dyn_quant_matmul_4bit_kernel( const Tensor& output, @@ -1161,65 +1310,75 @@ void dyn_quant_matmul_4bit_kernel( const int64_t block_size) { #if AT_KLEIDIAI_ENABLED() const int64_t weight_packed_size = - kleidiai::kai_pack_rhs_int4_size(N, K, block_size); + kleidiai::kai_pack_rhs_int4_size(N, K, block_size, inp.scalar_type()); if (weight_packed_size == packed_weights.numel()) { // KleidiAI interface internally handles the Channelwise and groupwise // distinction - kleidiai::kai_quant_pack_lhs_int4_mm( - output, inp, packed_weights, M, N, K, block_size); + kleidiai::kai_quant_pack_lhs_int4_mm(output, inp, packed_weights, M, N, K, block_size); } else #endif { - float* lhs_f32 = reinterpret_cast(inp.data_ptr()); - const auto weights_size = N * K / 2; - // The weights needs to be in uint8_t data type after quantization - auto extracted_weights = - (packed_weights.narrow(0, 0, weights_size)).to(kByte); - auto float32_scales = - (packed_weights.narrow( - 0, weights_size, packed_weights.size(0) - weights_size)) - .to(kFloat); - uint8_t* rhs_4bit = - reinterpret_cast(extracted_weights.data_ptr()); - float* rhs_scales_f32 = reinterpret_cast(float32_scales.data_ptr()); - float* dst_f32 = reinterpret_cast(output.data_ptr()); - if (block_size == K) { - ref_dyn_quant_matmul_4bit_channelwise_kernel( - M, - N, - K, - lhs_f32, - rhs_4bit, - rhs_scales_f32, - dst_f32, - -FLT_MAX, - FLT_MAX); - } else if (!(block_size % 32) && !(K % block_size)) { - ref_dyn_quant_matmul_4bit_groupwise_kernel( - M, - N, - K, - block_size, - lhs_f32, - rhs_4bit, - rhs_scales_f32, - dst_f32, - -FLT_MAX, - FLT_MAX); + { + void* input = inp.data_ptr(); + void* dst = output.data_ptr(); + + // Extract weights, sclaes and biases form from packed tensor + const int weights_elements = N * K / 2; + const int scale_elements = N * (K / block_size); + TORCH_CHECK(packed_weights.numel() >= (weights_elements + scale_elements), "Invalid packed weight tensor size"); + + auto extracted_weights = packed_weights.narrow(0, 0, weights_elements).to(kByte); + auto extracted_scales_and_bias = packed_weights.narrow(0, weights_elements, packed_weights.size(0) - weights_elements).to(kFloat); + auto float32_scales = extracted_scales_and_bias.narrow(0, 0, scale_elements); + + int bias_elements = packed_weights.numel() - (weights_elements + scale_elements); + float* weight_scales = float32_scales.data_ptr(); + + void* bias_data = nullptr; + if (bias_elements) { + auto float32_bias = extracted_scales_and_bias.narrow(0, scale_elements, bias_elements); + TORCH_CHECK(float32_bias.size(0) == N, "Expected bias length to match output dimension"); + bias_data = float32_bias.data_ptr(); + + } + // 2 elements of 4 bit weights are packed into 1 uint8 packet + uint8_t* weights_4bit = reinterpret_cast(extracted_weights.data_ptr()); + + // Dispatch to reference kernels + if (inp.scalar_type() == at::kBFloat16) { + // BF16 input, BF16 output + constexpr float BF16_MAX = 3.38953139e+38f; + constexpr float BF16_MIN = -BF16_MAX; + if (block_size == K) { + 
ref_dyn_quant_matmul_4bit_channelwise_kernel_bf16( + M, N, K, + (uint16_t*)input, weights_4bit, weight_scales, + (uint16_t*)dst, BF16_MIN, BF16_MAX, (float*)bias_data); + } else { + TORCH_CHECK(false, "Unsupported block size for BF16 fallback"); + } + } else if (inp.scalar_type() == at::kFloat) { + // FP32 input, FP32 output + if (block_size == K) { + ref_dyn_quant_matmul_4bit_channelwise_kernel( + M, N, K, + (float*)input, weights_4bit, weight_scales, + (float*)dst, -FLT_MAX, FLT_MAX, (float*)bias_data); + } else if (!(block_size % 32) && !(K % block_size)) { + ref_dyn_quant_matmul_4bit_groupwise_kernel( + M, N, K, block_size, + (float*)input, weights_4bit, weight_scales, + (float*)dst, -FLT_MAX, FLT_MAX, (float*)bias_data); + } else { + TORCH_CHECK(false, "Unsupported block size for FP32 fallback"); + } } else { - TORCH_CHECK( - block_size == K || (!(block_size % 32) && !(K % block_size)), - __func__, - ": Group size should be multiple 32 or in_features [", - K, - "]. Provided ", - block_size); + TORCH_CHECK(false, "Unsupported input/output dtype combination for int4mm kernel"); } - } } - +} } // anonymous namespace - +} ALSO_REGISTER_AVX512_DISPATCH(weight_to_int4pack_stub, &weight_to_int4pack_kernel) ALSO_REGISTER_AVX512_DISPATCH(int4pack_mm_stub, &int4pack_mm_kernel) REGISTER_DISPATCH(dyn_quant_pack_4bit_weight_stub, &dyn_quant_pack_4bit_weight_kernel) diff --git a/aten/src/ATen/native/kleidiai/kai_kernels.cpp b/aten/src/ATen/native/kleidiai/kai_kernels.cpp index ce0f10bf6df1f..1313f98f90109 100644 --- a/aten/src/ATen/native/kleidiai/kai_kernels.cpp +++ b/aten/src/ATen/native/kleidiai/kai_kernels.cpp @@ -21,18 +21,27 @@ void kai_pack_int4_rhs( const int64_t n, const int64_t k, const int64_t bl) { - // Prefer Channelwise kernel over Groupwise kernel for conflicting cases if (bl == k) { // Channelwise - auto kernel_packet = kai_select_channelwise_matmul_ukernel( - kai_kernel_id:: - matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod); - auto& params = kernel_packet.rhs_pack_params; - params.lhs_zero_point = 1; - params.rhs_zero_point = 8; - - kai_pack_rhs_channelwise_int4( - kernel_packet, weight_packed, weight, scales, bias, n, k); + if (weight.scalar_type() == at::kBFloat16) { + auto kernel_packet = kai_select_bf16_channelwise_matmul_ukernel( + kai_kernel_id:: + matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod); + auto& params = kernel_packet.rhs_pack_params; + params.lhs_zero_point = 1; + params.rhs_zero_point = 8; + kai_pack_rhs_channelwise_int4( + kernel_packet, weight_packed, weight, scales, bias, n, k); + } else { + auto kernel_packet = kai_select_channelwise_matmul_ukernel( + kai_kernel_id:: + matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod); + auto& params = kernel_packet.rhs_pack_params; + params.lhs_zero_point = 1; + params.rhs_zero_point = 8; + kai_pack_rhs_channelwise_int4( + kernel_packet, weight_packed, weight, scales, bias, n, k); + } } else if (!(bl % 32) && !(k % bl)) { // Groupwise auto kernel_packet = kai_select_groupwise_matmul_ukernel( @@ -63,19 +72,29 @@ void kai_pack_int4_rhs( size_t kai_pack_rhs_int4_size( const int64_t n, const int64_t k, - const int64_t bl) { + const int64_t bl, + at::ScalarType tensor_dtype) { size_t packed_size = n * k; - // Prefer Channelwise kernel over Groupwise kernel for conflicting cases if (bl == k) { - // Channelwise - auto kernel_packet = kai_select_channelwise_matmul_ukernel( - kai_kernel_id:: - matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod); - const auto& ukernel = kernel_packet.ukernel; - 
const size_t nr = ukernel.get_nr(); - const size_t kr = ukernel.get_kr(); - const size_t sr = ukernel.get_sr(); - packed_size = kernel_packet.kai_get_rhs_packed_size(n, k, nr, kr, sr); + if (tensor_dtype == at::kBFloat16) { + auto kernel_packet = kai_select_bf16_channelwise_matmul_ukernel( + kai_kernel_id:: + matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod); + const auto& ukernel = kernel_packet.ukernel; + const size_t nr = ukernel.get_nr(); + const size_t kr = ukernel.get_kr(); + const size_t sr = ukernel.get_sr(); + packed_size = kernel_packet.kai_get_rhs_packed_size(n, k, nr, kr, sr); + } else { + auto kernel_packet = kai_select_channelwise_matmul_ukernel( + kai_kernel_id:: + matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod); + const auto& ukernel = kernel_packet.ukernel; + const size_t nr = ukernel.get_nr(); + const size_t kr = ukernel.get_kr(); + const size_t sr = ukernel.get_sr(); + packed_size = kernel_packet.kai_get_rhs_packed_size(n, k, nr, kr, sr); + } } else if (!(bl % 32) && !(k % bl)) { // Groupwise auto kernel_packet = kai_select_groupwise_matmul_ukernel( @@ -148,8 +167,7 @@ static void kai_quant_pack_lhs_int4_mm_groupwise( const auto lhs_src_ptr = lhs_native_mtx_f32 + thread_id * src_stride; const int64_t m_idx = thread_id * vec_per_thread; auto lhs_packed_ptr = lhs_packed_base + - kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32( - m_idx, k, mr, kr, sr); + kernel_packet.kai_get_lhs_quant_pack_offset(m_idx, k, mr, kr, sr); const int64_t vec_num = (thread_id == num_threads - 1) ? (m - vec_per_thread * thread_id) : vec_per_thread; @@ -259,8 +277,7 @@ static void kai_quant_pack_lhs_int4_mm_channelwise( const auto lhs_src_ptr = lhs_native_mtx_f32 + thread_id * src_stride; const int64_t m_idx = thread_id * vec_per_thread; auto lhs_packed_ptr = lhs_packed_base + - kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32( - m_idx, k, mr, kr, sr); + kernel_packet.kai_get_lhs_quant_pack_offset(m_idx, k, mr, kr, sr); const int64_t vec_num = (thread_id == num_threads - 1) ? 
(m - vec_per_thread * thread_id) : vec_per_thread; @@ -320,19 +337,144 @@ static void kai_quant_pack_lhs_int4_mm_channelwise( }); } -void kai_quant_pack_lhs_int4_mm( +static void kai_quant_pack_lhs_int4_mm_bf16_channelwise( const Tensor& output, const Tensor& input, const Tensor& weight, const int64_t m, const int64_t n, + const int64_t k) { + // Kernel IDs for GEMM and GEMV + constexpr kai_kernel_id gemm_id = + kai_kernel_id::matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm; + constexpr kai_kernel_id gemv_id = + kai_kernel_id::matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod; + + // Get total threads and select kernel + const int64_t total_threads = at::get_num_threads(); + auto kernel_packet = kai_select_bf16_channelwise_matmul_ukernel(gemv_id); + if (cpuinfo_has_arm_i8mm() && m > 1) { + kernel_packet = kai_select_bf16_channelwise_matmul_ukernel(gemm_id); + } + + // Thread blocking parameters + const int64_t n_step = kernel_packet.ukernel.get_n_step(); + const size_t mr = kernel_packet.ukernel.get_mr(); + const size_t kr = kernel_packet.ukernel.get_kr(); + const size_t sr = kernel_packet.ukernel.get_sr(); + + const size_t lhs_packed_size = + kernel_packet.kai_get_lhs_packed_size(m, k, mr, kr, sr); + auto lhs_packed = std::make_unique(lhs_packed_size); + uint8_t* dst_act_mtx_bf16 = reinterpret_cast(output.data_ptr()); + const uint8_t* lhs_native_mtx_bf16 = + reinterpret_cast(input.data_ptr()); + const uint8_t* rhs_packed_mtx_qs4cx = + reinterpret_cast(weight.data_ptr()); + uint8_t* lhs_packed_base = lhs_packed.get(); + + constexpr int32_t element_size = sizeof(uint16_t); + const size_t lhs_stride = k * element_size; + const size_t dst_stride = n * element_size; + + // LHS quantization packing + int64_t vec_per_thread = get_vec_per_thread(m, total_threads, mr); + int64_t num_threads = (m + vec_per_thread - 1) / vec_per_thread; + const size_t src_stride = vec_per_thread * lhs_stride; + + auto lhs_quant_pack = [=, &kernel_packet](int64_t thread_id) { + const auto lhs_src_ptr = lhs_native_mtx_bf16 + thread_id * src_stride; + const int64_t m_idx = thread_id * vec_per_thread; + auto lhs_packed_ptr = lhs_packed_base + + kernel_packet.kai_get_lhs_quant_pack_offset(m_idx, k, mr, kr, sr); + const int64_t vec_num = (thread_id == num_threads - 1) + ? (m - vec_per_thread * thread_id) + : vec_per_thread; + + kernel_packet.kai_run_lhs_quant_pack( + vec_num, + k, + mr, + kr, + sr, + 0, + (const uint16_t*)lhs_src_ptr, + lhs_stride, + lhs_packed_ptr); + }; + + at::parallel_for( + 0, num_threads, /*grain_size=*/1, [&](int64_t begin, int64_t end) { + for (int64_t thread_id = begin; thread_id < end; ++thread_id) { + lhs_quant_pack(thread_id); + } + }); + + // Matrix multiplication + vec_per_thread = get_vec_per_thread(n, total_threads, n_step); + num_threads = (n + vec_per_thread - 1) / vec_per_thread; + + auto mm = [=, &kernel_packet](int64_t thread_id) { + const auto rhs_packed_ptr = rhs_packed_mtx_qs4cx + + kernel_packet.ukernel.get_rhs_packed_offset( + thread_id * vec_per_thread, k); + auto dst_ptr = dst_act_mtx_bf16 + + kernel_packet.ukernel.get_dst_offset( + 0, thread_id * vec_per_thread, dst_stride); + const int64_t vec_num = (thread_id == num_threads - 1) + ? 
(n - vec_per_thread * thread_id) + : vec_per_thread; + + kernel_packet.ukernel.run_matmul( + m, + vec_num, + k, + lhs_packed_base, + rhs_packed_ptr, + (uint16_t*)dst_ptr, + dst_stride, + element_size, // dst_stride_col + -FLT_MAX, + FLT_MAX); + }; + + at::parallel_for( + 0, num_threads, /*grain_size=*/1, [&](int64_t begin, int64_t end) { + for (int64_t thread_id = begin; thread_id < end; ++thread_id) { + mm(thread_id); + } + }); +} +void kai_quant_pack_lhs_int4_mm( + const at::Tensor& output, + const at::Tensor& input, + const at::Tensor& weight, + const int64_t m, + const int64_t n, const int64_t k, const int64_t bl) { // Prefer Channelwise kernel over Groupwise kernel for conflicting cases if (bl == k) { - kleidiai::kai_quant_pack_lhs_int4_mm_channelwise( - output, input, weight, m, n, k); - } else if (!(bl % 32) && !(k % bl)) { + const auto input_dtype = input.dtype(); + + if (input_dtype == at::kBFloat16) { + if (cpuinfo_has_arm_bf16()) { + kleidiai::kai_quant_pack_lhs_int4_mm_bf16_channelwise( + output, input, weight, m, n, k); + } else { + TORCH_CHECK( + false, + "BF16 Unsupported: CPU does not support BF16. Please use a CPU with BF16 support."); + } + } else if (input_dtype == at::kFloat) { + kleidiai::kai_quant_pack_lhs_int4_mm_channelwise( + output, input, weight, m, n, k); + } else { + TORCH_CHECK( + false, + "Unsupported input data type: Only Bfloat16 and Float inputs are supported."); + } + } else if ((bl % 32 == 0) && (k % bl == 0)) { kleidiai::kai_quant_pack_lhs_int4_mm_groupwise( output, input, weight, m, n, k, bl); } diff --git a/aten/src/ATen/native/kleidiai/kai_kernels.h b/aten/src/ATen/native/kleidiai/kai_kernels.h index 9b522d7f7705a..a4179cefd06cf 100644 --- a/aten/src/ATen/native/kleidiai/kai_kernels.h +++ b/aten/src/ATen/native/kleidiai/kai_kernels.h @@ -25,7 +25,8 @@ void kai_pack_int4_rhs( size_t kai_pack_rhs_int4_size( const int64_t n, const int64_t k, - const int64_t bl); + const int64_t bl, + at::ScalarType tensor_dtype = at::kFloat); /** * @brief Run 2 operations ( Input quantize and pack -> 4 bit Matmul ) diff --git a/aten/src/ATen/native/kleidiai/kai_pack.h b/aten/src/ATen/native/kleidiai/kai_pack.h index 4ff3371ab5e2a..d9f08333591ed 100644 --- a/aten/src/ATen/native/kleidiai/kai_pack.h +++ b/aten/src/ATen/native/kleidiai/kai_pack.h @@ -36,7 +36,8 @@ void kai_pack_rhs_groupwise_int4( AT_ERROR("kai_pack_rhs_channelwise_int4: Scales data pointer is null"); } - float* bias_ptr = bias.has_value() ? bias.value().data_ptr() : NULL; + float* bias_ptr = + bias.has_value() ? bias.value().to(kFloat).data_ptr() : NULL; auto& params = kernel.rhs_pack_params; kernel.kai_run_rhs_pack( @@ -73,7 +74,8 @@ void kai_pack_rhs_channelwise_int4( auto weight_packed_data = reinterpret_cast(weight_packed.data_ptr()); const auto weight_data = weight.data_ptr(); - const auto scales_data = scales.data_ptr(); + + const auto scales_data = scales.to(kFloat).data_ptr(); if (weight_data == nullptr) { AT_ERROR("kai_pack_rhs_channelwise_int4: Weight data pointer is null"); @@ -83,7 +85,8 @@ void kai_pack_rhs_channelwise_int4( AT_ERROR("kai_pack_rhs_channelwise_int4: Scales data pointer is null"); } - float* bias_ptr = bias.has_value() ? bias.value().data_ptr() : NULL; + float* bias_ptr = + bias.has_value() ? 
bias.value().to(kFloat).data_ptr() : NULL; auto& params = kernel.rhs_pack_params; kernel.kai_run_rhs_pack( diff --git a/aten/src/ATen/native/kleidiai/kai_ukernel_interface.cpp b/aten/src/ATen/native/kleidiai/kai_ukernel_interface.cpp index 0de198d7dc012..783133b83e670 100644 --- a/aten/src/ATen/native/kleidiai/kai_ukernel_interface.cpp +++ b/aten/src/ATen/native/kleidiai/kai_ukernel_interface.cpp @@ -68,5 +68,39 @@ kai_matmul_ukernel_f32_qa8dxp_qs4cxp kai_select_channelwise_matmul_ukernel( const kai_kernel_id id) { return channelwise_8bit_4bit_kernels.at(id); } + +// Kernel Mapping - BF16 Channelwise +std::unordered_map + bf16_channelwise_8bit_4bit_kernels = { + {kai_kernel_id:: + matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod, + {{kai_get_m_step_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod, + kai_get_n_step_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod, + kai_get_mr_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod, + kai_get_nr_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod, + kai_get_kr_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod, + kai_get_sr_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod, + kai_get_lhs_packed_offset_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod, + kai_get_rhs_packed_offset_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod, + kai_get_dst_offset_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod, + kai_get_dst_size_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod, + kai_run_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod}}}, + {kai_kernel_id::matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm, + {{kai_get_m_step_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm, + kai_get_n_step_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm, + kai_get_mr_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm, + kai_get_nr_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm, + kai_get_kr_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm, + kai_get_sr_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm, + kai_get_lhs_packed_offset_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm, + kai_get_rhs_packed_offset_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm, + kai_get_dst_offset_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm, + kai_get_dst_size_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm, + kai_run_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm}}}}; + +kai_matmul_ukernel_bf16_qa8dxp_qs4cxp kai_select_bf16_channelwise_matmul_ukernel( + const kai_kernel_id id) { + return bf16_channelwise_8bit_4bit_kernels.at(id); +} } // namespace at::native::kleidiai #endif diff --git a/aten/src/ATen/native/kleidiai/kai_ukernel_interface.h b/aten/src/ATen/native/kleidiai/kai_ukernel_interface.h index 8480469cdea86..cfcf7a81ba85f 100644 --- a/aten/src/ATen/native/kleidiai/kai_ukernel_interface.h +++ b/aten/src/ATen/native/kleidiai/kai_ukernel_interface.h @@ -10,21 +10,32 @@ #include #include #include +#include +#include +#include #include +#include #include #include namespace at::native::kleidiai { enum class kai_kernel_id { + // FP32 inputs, 4-bit weights, FP32 output matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod = - 0, // Groupwise 4 bit GEMV + 0, // Groupwise 4-bit GEMV (per-group scales, NEON DOTPROD) matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_4x8x32_neon_i8mm = - 1, // Groupwise 4 bit GEMM + 1, // Groupwise 4-bit GEMM (per-group scales, NEON I8MM) matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod = - 2, // Channelwise 
4 bit GEMV + 2, // Channelwise 4-bit GEMV (per-channel scales, NEON DOTPROD) matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm = - 3 // Channelwise 4 bit GEMM + 3, // Channelwise 4-bit GEMM (per-channel scales, NEON I8MM) + + // BF16 inputs, 4-bit weights, BF16 output + matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod = + 4, // Channelwise 4-bit GEMV with BF16 input/output + matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm = + 5 // Channelwise 4-bit GEMM with BF16 input/output }; // Channelwise Kernel mapping @@ -66,6 +77,9 @@ struct kai_matmul_ukernel_f32_qa8dxp_qs4cxp { void* rhs_packed, size_t extra_bytes, const struct kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0_params* params); + size_t(*kai_get_lhs_quant_pack_offset)( + size_t m_idx, size_t k, size_t mr, size_t kr, size_t sr + ); kai_matmul_ukernel_f32_qa8dxp_qs4cxp( const kai_matmul_clamp_f32_qai8dxp_qsi4cxp_ukernel& kernel) @@ -75,12 +89,71 @@ struct kai_matmul_ukernel_f32_qa8dxp_qs4cxp { kai_get_rhs_packed_size( &kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxp_qs4cxs1s0), kai_run_lhs_quant_pack(&kai_run_lhs_quant_pack_qai8dxp_f32), - kai_run_rhs_pack(&kai_run_rhs_pack_nxk_qsi4cxp_qs4cxs1s0) {} + kai_run_rhs_pack(&kai_run_rhs_pack_nxk_qsi4cxp_qs4cxs1s0), + kai_get_lhs_quant_pack_offset(&kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32){} }; struct kai_matmul_ukernel_f32_qa8dxp_qs4cxp kai_select_channelwise_matmul_ukernel(const kai_kernel_id id); +// bf16 Channelwise Kernel mapping +struct kai_matmul_ukernel_bf16_qa8dxp_qs4cxp { + struct kai_matmul_clamp_bf16_qai8dxp_qsi4cxp_ukernel ukernel; + struct kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0_params rhs_pack_params; + size_t (*kai_get_lhs_packed_size)( + size_t m, + size_t k, + size_t mr, + size_t kr, + size_t sr); + size_t (*kai_get_rhs_packed_size)( + size_t n, + size_t k, + size_t nr, + size_t kr, + size_t sr); + void (*kai_run_lhs_quant_pack)( + size_t m, + size_t k, + size_t mr, + size_t kr, + size_t sr, + size_t m_idx_start, + const void* lhs, + size_t lhs_stride, + void* lhs_packed); + void (*kai_run_rhs_pack)( + size_t num_groups, + size_t n, + size_t k, + size_t nr, + size_t kr, + size_t sr, + const uint8_t* rhs, + const float* bias, + const float* scale, + void* rhs_packed, + size_t extra_bytes, + const struct kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0_params* params); + size_t(*kai_get_lhs_quant_pack_offset)( + size_t m_idx, size_t k, size_t mr, size_t kr, size_t sr + ); + + kai_matmul_ukernel_bf16_qa8dxp_qs4cxp( + const kai_matmul_clamp_bf16_qai8dxp_qsi4cxp_ukernel& kernel) + : ukernel(kernel), + kai_get_lhs_packed_size( + &kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_bf16_neon), + kai_get_rhs_packed_size( + &kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxp_qs4cxs1s0), + kai_run_lhs_quant_pack(&kai_run_lhs_quant_pack_qai8dxp_bf16_neon), + kai_run_rhs_pack(&kai_run_rhs_pack_nxk_qsi4cxp_qs4cxs1s0), + kai_get_lhs_quant_pack_offset(&kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_bf16_neon){} + }; + +struct kai_matmul_ukernel_bf16_qa8dxp_qs4cxp +kai_select_bf16_channelwise_matmul_ukernel(const kai_kernel_id id); + // Groupwise Kernel mapping struct kai_matmul_ukernel_f32_qa8dxp_qs4c32p { struct kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel ukernel; @@ -125,6 +198,9 @@ struct kai_matmul_ukernel_f32_qa8dxp_qs4c32p { void* rhs_packed, size_t extra_bytes, const struct kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_params* params); + size_t(*kai_get_lhs_quant_pack_offset)( + size_t m_idx, size_t k, size_t mr, size_t kr, size_t sr + ); kai_matmul_ukernel_f32_qa8dxp_qs4c32p( const 
kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& kernel) @@ -134,7 +210,8 @@ struct kai_matmul_ukernel_f32_qa8dxp_qs4c32p { kai_get_rhs_packed_size( &kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0), kai_run_lhs_quant_pack(&kai_run_lhs_quant_pack_qai8dxp_f32), - kai_run_rhs_pack(&kai_run_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0) {} + kai_run_rhs_pack(&kai_run_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0), + kai_get_lhs_quant_pack_offset(&kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32) {} }; struct kai_matmul_ukernel_f32_qa8dxp_qs4c32p kai_select_groupwise_matmul_ukernel( diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index 4132674993e1e..780cfc1716824 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -2482,7 +2482,7 @@ def fn(a, b_int8pack, b_scales, c): @skipCUDAIf(True, "No _dyn_quant_pack_4bit_weight implementation on CUDA") @skipIfRocm @skipIfXpu(msg="No _dyn_quant_pack_4bit_weight implementation on XPU") - def test__dyn_quant_pack_4bit_weight(self): + def test__dyn_quant_pack_4bit_weight_fp32(self): q_group = 32 k = 128 n = 128 @@ -2513,12 +2513,53 @@ def fn(b, in_features, out_features): self.common(fn, (b, in_features, out_features)) + @xfail_if_mps_unimplemented + @xfail_if_triton_cpu + @skipCUDAIf(True, "No _dyn_quant_pack_4bit_weight implementation on CUDA") + @skipIfRocm + @skipIfXpu(msg="No _dyn_quant_pack_4bit_weight implementation on XPU") + def test__dyn_quant_pack_4bit_weight_bf16(self): + k = 128 + n = 128 + q_group = 32 + + if not self.is_dtype_supported(torch.bfloat16): + raise unittest.SkipTest( + f"torch.bfloat16 not supported for device {self.device}" + ) + + torch.manual_seed(1) + b = torch.rand((k, n), dtype=torch.bfloat16) + in_features = b.size(0) + out_features = b.size(1) + + def dyn_quant_pack_4bit_weight(b, in_features, out_features): + b_uint8, b_scales_and_zeros = _group_quantize_tensor_symmetric( + b, n_bit=4, groupsize=q_group + ) + + if q_group == in_features: + b_scales_and_zeros = b_scales_and_zeros.to(torch.float) + else: + b_scales_and_zeros = b_scales_and_zeros.to(torch.bfloat16) + b_int4pack = torch._dyn_quant_pack_4bit_weight( + b_uint8, b_scales_and_zeros, None, q_group, in_features, out_features + ) + + return b_int4pack, b_scales_and_zeros + + def fn(b, in_features, out_features): + b_int4pack, _ = dyn_quant_pack_4bit_weight(b, in_features, out_features) + return b_int4pack + + self.common(fn, (b, in_features, out_features)) + @xfail_if_mps_unimplemented @xfail_if_triton_cpu @skipCUDAIf(True, "No _dyn_quant_matmul_4bit implementation on CUDA") @skipIfRocm @skipIfXpu(msg="No _dyn_quant_matmul_4bit implementation on XPU") - def test__dyn_quant_matmul_4bit(self): + def test__dyn_quant_matmul_4bit_fp32_input(self): q_group = 32 m = 32 k = 128 @@ -2558,6 +2599,67 @@ def fn(a, q_group, in_features, out_features): self.common(fn, (a, q_group, in_features, out_features)) + @skipCPUIf(IS_MACOS, "fails on M1, mismatch in bf16 support reporting") + @xfail_if_mps_unimplemented + @xfail_if_triton_cpu + @skipCUDAIf(True, "No _dyn_quant_matmul_4bit implementation on CUDA") + @skipIfRocm + @skipIfXpu(msg="No _dyn_quant_matmul_4bit implementation on XPU") + def test__dyn_quant_matmul_4bit_bf16_input(self): + m = 32 + k = 128 + n = 128 + q_group = k + + if not self.is_dtype_supported(torch.bfloat16): + raise unittest.SkipTest( + f"torch.bfloat16 not supported for device {self.device}" + ) + + torch.manual_seed(1) + a = torch.rand((m, k), dtype=torch.bfloat16) + b = torch.rand((k, n), 
dtype=torch.bfloat16) + + # codegen_dynamic_shape test fails without explicitly marking these dynamic + torch._dynamo.mark_dynamic(a, 0) + torch._dynamo.mark_dynamic(b, 1) + + in_features = b.size(0) + out_features = b.size(1) + + if not self.is_dtype_supported(torch.bfloat16): + raise unittest.SkipTest( + f"torch.bfloat16 not supported for device {self.device}" + ) + + def dyn_quant_pack_4bit_weight(b, in_features, out_features): + b_uint8, b_scales_and_zeros = _group_quantize_tensor_symmetric( + b, n_bit=4, groupsize=q_group + ) + + if q_group == in_features: + b_scales_and_zeros = b_scales_and_zeros.to(torch.float) + else: + b_scales_and_zeros = b_scales_and_zeros.to(torch.bfloat16) + b_int4pack = torch._dyn_quant_pack_4bit_weight( + b_uint8, b_scales_and_zeros, None, q_group, in_features, out_features + ) + + return b_int4pack, b_scales_and_zeros + + def fn(a, q_group, in_features, out_features): + b_int4pack, _ = dyn_quant_pack_4bit_weight(b, in_features, out_features) + res = torch.ops.aten._dyn_quant_matmul_4bit( + a, + b_int4pack, + q_group, + in_features, + out_features, + ) + return res + + self.common(fn, (a, q_group, in_features, out_features), atol=1, rtol=0.5) + def test_expanded_reduction(self): def fn(x, y): z = x * y diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py index 5a629b371c766..2ed88a4ec2344 100644 --- a/torch/_meta_registrations.py +++ b/torch/_meta_registrations.py @@ -3741,6 +3741,7 @@ def kai_roundup(a: int, b: int) -> int: def get_kai_packed_weight_size(n_bits, N, K, groupsize): if n_bits == 4: + # Works for both fp32 and bf16 Kernels if groupsize == K: # channelwise # dotprod params only [1x8x32_neon_dotprod] kai_nr = 8 @@ -3870,6 +3871,8 @@ def meta__dyn_quant_pack_4bit_weight( ) return weights.new_empty(int(packed_weight_size), dtype=torch.uint8) packed_weight_size = weights.numel() + scales_zeros.numel() + if bias is not None: + packed_weight_size += bias.numel() return weights.new_empty(packed_weight_size, dtype=torch.float) @@ -3883,8 +3886,12 @@ def meta__dyn_quant_matmul_4bit( ): torch._check(inp.dim() == 2, lambda: "input must be a 2D tensor") torch._check( - inp.dtype == torch.float32, - lambda: f"expected input to be f32, got {inp.dtype}", + (inp.dtype == torch.float32) + or (inp.dtype == torch.bfloat16 and block_size == in_features), + lambda: ( + f"expected input to be f32 or bf16 (bf16 requires block_size == in_features), " + f"got {inp.dtype} with block_size={block_size} and in_features={in_features}" + ), ) M = inp.size(0) return inp.new_empty(M, out_features, dtype=inp.dtype) From 661d1653aa5f8a4068eab9ee166bb1e81feb369b Mon Sep 17 00:00:00 2001 From: PyTorch UpdateBot Date: Mon, 17 Nov 2025 12:20:29 +0000 Subject: [PATCH 15/47] [xla hash update] update the pinned xla hash (#167968) This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml). Update the pinned xla hash. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167968 Approved by: https://github.com/pytorchbot --- .github/ci_commit_pins/xla.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index 191c21631f662..803ba72d9ac92 100644 --- a/.github/ci_commit_pins/xla.txt +++ b/.github/ci_commit_pins/xla.txt @@ -1 +1 @@ -e4d25697f9dc5eedaf8f0a5bf085c62c5455a53a +94631807d22c09723dd006f7be5beb649d5f88d0 From 6fdb974f4a43c5e52ca7725c87adc0c550891f90 Mon Sep 17 00:00:00 2001 From: "Cui, Yifeng" Date: Mon, 17 Nov 2025 12:58:42 +0000 Subject: [PATCH 16/47] Update torch-xpu-ops commit pin (#167698) Update the torch-xpu-ops commit to [intel/torch-xpu-ops@1e69f4](https://github.com/intel/torch-xpu-ops/commit/1e69f40b3c03492eb3dd7e03462a5566f29674d3), which includes: - Add PTL in the default AOT target list for both Windows and Linux - Use PyTorch p2p API in Copy kernel - Add event cache and event timing to XCCL - Add Float8_e8m0fnu support for copy - Add CMAKE_SYCL_COMPILER_LAUNCHER for sccache Pull Request resolved: https://github.com/pytorch/pytorch/pull/167698 Approved by: https://github.com/EikanWang --- third_party/xpu.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/xpu.txt b/third_party/xpu.txt index a5031de150288..f05ce60393d66 100644 --- a/third_party/xpu.txt +++ b/third_party/xpu.txt @@ -1 +1 @@ -9aac5a1ddf50d75f929d572df51bb368b32da14e +1e69f40b3c03492eb3dd7e03462a5566f29674d3 From 9ff95f6835e839f7a9fc35ebd5757b955740d310 Mon Sep 17 00:00:00 2001 From: IvanKobzarev Date: Wed, 12 Nov 2025 03:00:17 -0800 Subject: [PATCH 17/47] [inductor] Expose config for fx bucket all_reduces (#167634) Expose `_inductor.config.bucket_all_reduces_fx`, analogous to the existing all_gathers and reduce_scatters bucketing configs; only the "all" mode is supported for now.
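A minimal usage sketch (illustrative only; the trivial `fn` below stands in for user code that actually issues all_reduce collectives), mirroring the updated test:

```python
import torch

def fn(x):
    # Stand-in for a user function that issues all_reduce collectives
    # (e.g. via torch.distributed); kept trivial so the sketch is runnable.
    return x * 2

# Bucket fx-graph all_reduces in the post-grad passes. The companion knob
# `bucket_all_reduces_fx_bucket_size_determinator` can optionally override
# the default bucket-size heuristic.
with torch._inductor.config.patch(
    {
        "reorder_for_compute_comm_overlap": False,
        "bucket_all_reduces_fx": "all",
    }
):
    compiled = torch.compile(fn)
    out = compiled(torch.randn(8))
```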
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167634 Approved by: https://github.com/eellison --- test/distributed/test_inductor_collectives.py | 8 +------- torch/_inductor/config.py | 4 ++++ torch/_inductor/fx_passes/post_grad.py | 12 ++++++++++++ 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/test/distributed/test_inductor_collectives.py b/test/distributed/test_inductor_collectives.py index 84b468afcfa2d..dd30bf81b67be 100644 --- a/test/distributed/test_inductor_collectives.py +++ b/test/distributed/test_inductor_collectives.py @@ -1773,16 +1773,10 @@ def func(x, w, ar_0, ar_1, tag, ranks, group_size): inputs = [x, w, ar_0, ar_1] f(*inputs, **self.get_world_trs()) - def _pass(g): - from torch._inductor.fx_passes.bucketing import bucket_all_reduce - - bucket_all_reduce(g.owning_module, lambda _: 2000) - - torch._inductor.config.post_grad_custom_post_pass = _pass - with torch._inductor.config.patch( { "reorder_for_compute_comm_overlap": False, + "bucket_all_reduces_fx": bucket_mode, } ): compiled = torch.compile(f) diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py index 5152915b7d0be..2f28ce551b103 100644 --- a/torch/_inductor/config.py +++ b/torch/_inductor/config.py @@ -421,6 +421,10 @@ def prologue_fusion_enabled() -> bool: None ) +bucket_all_reduces_fx: Literal["none", "all"] = "none" +# By default torch._inductor.fx_passes.bucketing.bucket_size_determinator is used +bucket_all_reduces_fx_bucket_size_determinator: Optional[Callable[[int], int]] = None + # runtime estimation function for ops # for built-in estimation function, pass in "default"; for user-defined estimation function, pass in the function handle estimate_op_runtime = "default" diff --git a/torch/_inductor/fx_passes/post_grad.py b/torch/_inductor/fx_passes/post_grad.py index 958a52fcdf510..e0362f2aaafd4 100644 --- a/torch/_inductor/fx_passes/post_grad.py +++ b/torch/_inductor/fx_passes/post_grad.py @@ -222,6 +222,18 @@ def post_grad_passes(gm: torch.fx.GraphModule, is_inference: bool): ) collectives_bucketing = True + if config.bucket_all_reduces_fx != "none": + from torch._inductor.fx_passes.bucketing import bucket_all_reduce + + GraphTransformObserver(gm, "bucket_all_reduce").apply_graph_pass( + lambda graph: bucket_all_reduce( + graph.owning_module, + config.bucket_all_reduces_fx_bucket_size_determinator, + config.bucket_all_reduces_fx, # type: ignore[arg-type] + ) + ) + collectives_bucketing = True + # Fx all_gather bucketing introduces mutation op # Keeping it in the end to keep invariant of functional graph for previous passes. if config.bucket_all_gathers_fx != "none": From 2b5eabc74b1e1484c24cb3c8dbc62b9f4971e610 Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Mon, 17 Nov 2025 14:52:02 +0000 Subject: [PATCH 18/47] Rework PyObject preservation (v2) (#167564) Make the PyObject preservation scheme thread-safe with free threaded (nogil) Python. The general idea is: * Python Tensor and Storage objects always hold a strong reference to their underlying c10 object * c10 objects hold a strong reference to their Python objects if there's at least one other reference to the c10 object This is implemented in `intrusive_ptr`: * The top most bit (`kHasPyObject`) from the weakref count is now used to indicate if the `intrusive_ptr_target` has an associated PyObject. So `kHasPyObject` is one bit, the weakref count is now 31 bits and the strong refcount remains 32 bits. 
* When the reference count increases from one to two and `kHasPyObject` is set, we incref the associated Python object to ensure that it's kept alive. * When the reference count decreases from two to one (i.e., there are no C++ reference to the `intrusive_ptr_target` other than from the Python object), we decre the associated Python object to break the cycle. Other benefits: * We can delete a lot of the copypasta from Python internal `subtype_dealloc` * This fixes the weakref and GC bugs we had in the previous scheme. Python weakrefs on Tensors and Storages should just work as expected now. Risks: * Extra branch for reference count operations on `intrusive_ptr`, `intrusive_ptr`, and the generic `intrusive_ptr` even when we're not using Python. * It's a big change (Second attempt at https://github.com/pytorch/pytorch/pull/166342) Pull Request resolved: https://github.com/pytorch/pytorch/pull/167564 Approved by: https://github.com/albanD, https://github.com/Skylion007 --- aten/src/ATen/core/TensorBase.h | 3 + aten/tools/valgrind.sup | 7 + c10/core/SafePyObject.h | 4 +- c10/core/StorageImpl.cpp | 24 + c10/core/StorageImpl.h | 20 + c10/core/TensorImpl.cpp | 25 +- c10/core/TensorImpl.h | 19 + c10/core/impl/PyInterpreter.cpp | 11 +- c10/core/impl/PyInterpreter.h | 12 +- c10/core/impl/PyObjectSlot.cpp | 56 -- c10/core/impl/PyObjectSlot.h | 131 +-- c10/util/intrusive_ptr.h | 151 +++- test/test_autograd.py | 28 + test/test_torch.py | 56 +- torch/_inductor/cudagraph_trees.py | 19 +- torch/csrc/Module.cpp | 41 +- torch/csrc/PyInterpreter.cpp | 77 +- torch/csrc/Storage.cpp | 327 ++------ torch/csrc/Storage.h | 8 +- torch/csrc/StorageMethods.cpp | 5 +- .../csrc/autograd/functions/accumulate_grad.h | 6 +- torch/csrc/autograd/input_buffer.cpp | 4 +- torch/csrc/autograd/python_variable.cpp | 773 ++++-------------- torch/csrc/autograd/python_variable.h | 8 +- .../autograd/utils/grad_layout_contract.h | 4 +- torch/csrc/autograd/utils/wrap_outputs.h | 4 + torch/csrc/autograd/variable.h | 18 +- torch/csrc/utils/pyobject_preservation.cpp | 76 +- torch/csrc/utils/pyobject_preservation.h | 26 +- 29 files changed, 748 insertions(+), 1195 deletions(-) delete mode 100644 c10/core/impl/PyObjectSlot.cpp diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index 2b9558197bdcb..2d7ca10433d6a 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -245,6 +245,9 @@ class TORCH_API TensorBase { size_t weak_use_count() const noexcept { return impl_.weak_use_count(); } + bool is_uniquely_owned() const noexcept { + return impl_.is_uniquely_owned(); + } std::string toString() const; diff --git a/aten/tools/valgrind.sup b/aten/tools/valgrind.sup index ad5f66e0b0531..585487c4d2be2 100644 --- a/aten/tools/valgrind.sup +++ b/aten/tools/valgrind.sup @@ -10,6 +10,13 @@ ... } +{ + ignore_empty_generic_uninitialised_conditional_jump + Memcheck:Cond + fun:_ZN2at6detail13empty_genericEN3c108ArrayRefIlEEPNS1_9AllocatorENS1_14DispatchKeySetENS1_10ScalarTypeESt8optionalINS1_12MemoryFormatEE + ... 
+} + { Cond_cuda Memcheck:Cond diff --git a/c10/core/SafePyObject.h b/c10/core/SafePyObject.h index 1ec0cdb6751e9..bcace0ac358b4 100644 --- a/c10/core/SafePyObject.h +++ b/c10/core/SafePyObject.h @@ -44,7 +44,7 @@ struct C10_API SafePyObject { (*other.pyinterpreter_)->incref(other.data_); } if (data_ != nullptr) { - (*pyinterpreter_)->decref(data_, /*has_pyobj_slot*/ false); + (*pyinterpreter_)->decref(data_); } data_ = other.data_; pyinterpreter_ = other.pyinterpreter_; @@ -53,7 +53,7 @@ struct C10_API SafePyObject { ~SafePyObject() { if (data_ != nullptr) { - (*pyinterpreter_)->decref(data_, /*has_pyobj_slot*/ false); + (*pyinterpreter_)->decref(data_); } } diff --git a/c10/core/StorageImpl.cpp b/c10/core/StorageImpl.cpp index a614fc9234c94..00fc03bbd0fcf 100644 --- a/c10/core/StorageImpl.cpp +++ b/c10/core/StorageImpl.cpp @@ -48,6 +48,30 @@ void warnDeprecatedDataPtr() { TORCH_CHECK(false, "Cannot access data pointer of Storage that is invalid."); } +void StorageImpl::incref_pyobject() const { + // Because intrusive_ptr incref uses relaxed memory order, we need to + // do an acquire fence to ensure that the kHasPyObject bit was + // observed before the load of the PyObject* below. + // NB: This is a no-op on x86/x86-64 + std::atomic_thread_fence(std::memory_order_acquire); + + PyObject* obj = pyobj_slot_.load_pyobj(); + (*pyobj_slot_.pyobj_interpreter())->incref(obj); +} + +void StorageImpl::decref_pyobject() const { + PyObject* obj = pyobj_slot_.load_pyobj(); + (*pyobj_slot_.pyobj_interpreter())->decref(obj); +} + +bool StorageImpl::try_incref_pyobject() const { + c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter(); + if (C10_UNLIKELY(!interp)) { + return false; + } + return (*interp)->try_incref(pyobj_slot_); +} + void SetStorageImplCreate(DeviceType t, StorageImplCreateHelper fptr) { // Allowlist verification. // Only if the devicetype is in the allowlist, diff --git a/c10/core/StorageImpl.h b/c10/core/StorageImpl.h index f34a1baed7a48..c7dbd5c1f005b 100644 --- a/c10/core/StorageImpl.h +++ b/c10/core/StorageImpl.h @@ -105,6 +105,12 @@ struct C10_API StorageImpl : public c10::intrusive_ptr_target { data_ptr_.clear(); } + void incref_pyobject() const override final; + + void decref_pyobject() const override final; + + bool try_incref_pyobject() const override final; + size_t nbytes() const { // OK to do this instead of maybe_as_int as nbytes is guaranteed positive TORCH_CHECK(!size_bytes_is_heap_allocated_); @@ -370,4 +376,18 @@ C10_API c10::intrusive_ptr make_storage_impl( bool resizable, std::optional device_opt); +namespace detail { + +#ifndef C10_MOBILE +template +struct TargetTraits< + T, + std::enable_if_t< + std::is_base_of_v>>> { + static constexpr bool can_have_pyobject = true; +}; +#endif + +} // namespace detail + } // namespace c10 diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index c59524a0932c2..94a7375cc32fb 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -277,7 +277,6 @@ void TensorImpl::release_resources() { if (storage_) { storage_ = {}; } - pyobj_slot_.maybe_destroy_pyobj(); } #ifndef C10_DISABLE_TENSORIMPL_EXTENSIBILITY @@ -989,6 +988,30 @@ void TensorImpl::empty_tensor_restride_symint(MemoryFormat memory_format) { } } +void TensorImpl::incref_pyobject() const { + // Because intrusive_ptr incref uses relaxed memory order, we need to + // do an acquire fence to ensure that the kHasPyObject bit was + // observed before the load of the PyObject* below. 
+ // NB: This is a no-op on x86/x86-64 + std::atomic_thread_fence(std::memory_order_acquire); + + PyObject* obj = pyobj_slot_.load_pyobj(); + (*pyobj_slot_.pyobj_interpreter())->incref(obj); +} + +void TensorImpl::decref_pyobject() const { + PyObject* obj = pyobj_slot_.load_pyobj(); + (*pyobj_slot_.pyobj_interpreter())->decref(obj); +} + +bool TensorImpl::try_incref_pyobject() const { + c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter(); + if (C10_UNLIKELY(!interp)) { + return false; + } + return (*interp)->try_incref(pyobj_slot_); +} + namespace impl { namespace { diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 420ed73e48d21..71a0195dde773 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -2178,6 +2178,12 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { return &pyobj_slot_; } + void incref_pyobject() const override final; + + void decref_pyobject() const override final; + + bool try_incref_pyobject() const override final; + private: // See NOTE [std::optional operator usage in CUDA] // We probably don't want to expose this publicly until @@ -3079,6 +3085,19 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { friend class C10_TensorImpl_Size_Check_Dummy_Class; }; +namespace detail { + +#ifndef C10_MOBILE +template +struct TargetTraits< + T, + std::enable_if_t>>> { + static constexpr bool can_have_pyobject = true; +}; +#endif + +} // namespace detail + // Note [TensorImpl size constraints] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Changed the size of TensorImpl? If the size went down, good for diff --git a/c10/core/impl/PyInterpreter.cpp b/c10/core/impl/PyInterpreter.cpp index 8676f0aaf8e0e..52d263fad36c5 100644 --- a/c10/core/impl/PyInterpreter.cpp +++ b/c10/core/impl/PyInterpreter.cpp @@ -11,8 +11,11 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable { void incref(PyObject* pyobj) const override {} // do nothing - void decref(PyObject* pyobj, bool has_pyobj_slot) const override { - } // do nothing + void decref(PyObject* pyobj) const override {} // do nothing + + bool try_incref(const c10::impl::PyObjectSlot& pyobj_slot) const override { + return false; + } #define PANIC(m) \ TORCH_INTERNAL_ASSERT( \ @@ -20,6 +23,10 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable { "attempted to call " #m \ " on a Tensor with nontrivial PyObject after corresponding interpreter died") + size_t refcnt(PyObject* pyobj) const override { + PANIC(refcnt); + } + c10::intrusive_ptr detach(const TensorImpl* self) const override { PANIC(detach); } diff --git a/c10/core/impl/PyInterpreter.h b/c10/core/impl/PyInterpreter.h index def708c24b802..463b1e520b36e 100644 --- a/c10/core/impl/PyInterpreter.h +++ b/c10/core/impl/PyInterpreter.h @@ -18,6 +18,9 @@ namespace c10 { struct IValue; class OperatorHandle; struct TensorImpl; +namespace impl { +struct PyObjectSlot; +} // namespace impl } // namespace c10 namespace torch::jit { @@ -126,9 +129,12 @@ struct C10_API PyInterpreterVTable { // Run Py_INCREF on a PyObject. virtual void incref(PyObject* pyobj) const = 0; - // Run Py_DECREF on a PyObject. We DO NOT assume the GIL is held on call - // See NOTE [PyInterpreter::decref takes a `has_pyobj_slot` arg] - virtual void decref(PyObject* pyobj, bool has_pyobj_slot) const = 0; + // Run Py_DECREF on a PyObject. We DO NOT assume the GIL is held on call. + virtual void decref(PyObject* pyobj) const = 0; + // Run PyUnstable_TryIncRef on a PyObject if it's not NULL. 
+ virtual bool try_incref(const c10::impl::PyObjectSlot& pyobj_slot) const = 0; + // Run Py_REFCNT on a PyObject. + virtual size_t refcnt(PyObject* pyobj) const = 0; // Perform a detach by deferring to the __torch_dispatch__ implementation of // detach, which will also arrange for the PyObject to get copied in this diff --git a/c10/core/impl/PyObjectSlot.cpp b/c10/core/impl/PyObjectSlot.cpp deleted file mode 100644 index 0f1bfb2110747..0000000000000 --- a/c10/core/impl/PyObjectSlot.cpp +++ /dev/null @@ -1,56 +0,0 @@ -#include - -namespace c10::impl { - -PyObjectSlot::PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {} - -PyObjectSlot::~PyObjectSlot() { - maybe_destroy_pyobj(); -} - -void PyObjectSlot::maybe_destroy_pyobj() { - if (owns_pyobj()) { - TORCH_INTERNAL_ASSERT(pyobj_interpreter_ != nullptr); - TORCH_INTERNAL_ASSERT(pyobj_ != nullptr); - (*pyobj_interpreter_.load(std::memory_order_acquire)) - ->decref(_unchecked_untagged_pyobj(), /*has_pyobj_slot*/ true); - // NB: this destructor can only be entered when there are no - // references to this C++ object (obviously), NOR any references - // to the PyObject (if there are references to the PyObject, - // then the PyObject holds an owning reference to the tensor). - // So it is OK to clear pyobj_ here as it is impossible for it to - // be used again (modulo weak reference races) - pyobj_ = nullptr; // for safety - } -} - -PyInterpreter* PyObjectSlot::pyobj_interpreter() { - return pyobj_interpreter_.load(std::memory_order_acquire); -} - -PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const { - // NOLINTNEXTLINE(performance-no-int-to-ptr) - return reinterpret_cast( - reinterpret_cast(pyobj_) & ~0x1ULL); -} - -PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const { - auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire); - if (interpreter) { - return *interpreter; - } - TORCH_CHECK(false, "cannot access PyObject for Tensor - no interpreter set"); -} - -bool PyObjectSlot::owns_pyobj() { - // NOLINTNEXTLINE(performance-no-int-to-ptr) - return reinterpret_cast(pyobj_) & 1; -} - -void PyObjectSlot::set_owns_pyobj(bool b) { - // NOLINTNEXTLINE(performance-no-int-to-ptr) - pyobj_ = reinterpret_cast( - reinterpret_cast(_unchecked_untagged_pyobj()) | b); -} - -} // namespace c10::impl diff --git a/c10/core/impl/PyObjectSlot.h b/c10/core/impl/PyObjectSlot.h index 58b2490eba001..a0633401b3634 100644 --- a/c10/core/impl/PyObjectSlot.h +++ b/c10/core/impl/PyObjectSlot.h @@ -8,117 +8,58 @@ #include +namespace torch::utils { +class PyObjectPreservation; +} + namespace c10::impl { struct C10_API PyObjectSlot { public: - PyObjectSlot(); - - ~PyObjectSlot(); - - void maybe_destroy_pyobj(); - - // Associate the TensorImpl with the specified PyObject, and, if necessary, - // also tag the interpreter. - // - // NB: This lives in a header so that we can inline away the switch on status - // - // NB: THIS FUNCTION CAN RAISE AN EXCEPTION. Make sure to clean up after - // PyObject if necessary! - void init_pyobj(PyObject* pyobj) { - pyobj_interpreter_.store( - getGlobalPyInterpreter(), std::memory_order_relaxed); - pyobj_ = pyobj; - } + PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {} // Query the PyObject interpreter. This may return null if there is no - // interpreter. This is racy! - PyInterpreter* pyobj_interpreter(); - - PyObject* _unchecked_untagged_pyobj() const; - - // Test the interpreter tag. If tagged for the current interpreter, return - // a non-nullopt (but possibly null) PyObject. 
If (possibly) untagged, - // returns a nullopt. If it is definitely invalid, raises an error. - // - // If `ignore_hermetic_tls` is false and this function is called from a - // hermetic context (ie, `HermeticPyObjectTLS::get_state()` is true), then - // nullopt is returned. If `ignore_hermetic_tls` is true, then the hermetic - // context is ignored, allowing you to check the interpreter tag of a - // nonhermetic PyObject from within a hermetic context. This is necessary - // because there are some cases where the deallocator function of a - // nonhermetic PyObject is called from within a hermetic context, so it must - // be properly treated as a nonhermetic PyObject. - // - // NB: this lives in header so that we can avoid actually creating the - // std::optional + // interpreter. + PyInterpreter* pyobj_interpreter() const { + return pyobj_interpreter_.load(std::memory_order_acquire); + } - // @todo alban: I'm not too sure what's going on here, we can probably delete - // it but it's worthwhile making sure - std::optional check_pyobj(bool ignore_hermetic_tls = false) const { - impl::PyInterpreter* interpreter = - pyobj_interpreter_.load(std::memory_order_acquire); - if (interpreter == nullptr) { - return std::nullopt; - } + PyInterpreter& load_pyobj_interpreter() const { + auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire); + TORCH_INTERNAL_ASSERT( + interpreter, "cannot access PyObject for Tensor - no interpreter set"); + return *interpreter; + } - if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) { - return std::nullopt; - } else { - return _unchecked_untagged_pyobj(); - } + PyObject* load_pyobj() const { + return pyobj_.load(std::memory_order_acquire); } - PyInterpreter& load_pyobj_interpreter() const; + void store_pyobj(PyObject* obj) { + pyobj_.store(obj, std::memory_order_release); + } - bool owns_pyobj(); + bool has_unique_reference() const { + PyObject* pyobj = load_pyobj(); + return pyobj != nullptr && load_pyobj_interpreter()->refcnt(pyobj) == 1; + } - void set_owns_pyobj(bool b); + void clear() { + pyobj_.store(nullptr, std::memory_order_relaxed); + pyobj_interpreter_.store(nullptr, std::memory_order_relaxed); + } private: - // This field contains the interpreter tag for this object. See - // Note [Python interpreter tag] for general context - // - // Note [Memory ordering on Python interpreter tag] - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // What memory_order do we need when accessing this atomic? We don't - // need a single total modification order (as provided by - // memory_order_seq_cst) as pyobj_interpreter_ is monotonic: it can only - // transition from -1 to some positive integer and never changes afterwards. - // Because there is only one modification, it trivially already has a total - // modification order (e.g., we don't need fences or locked instructions on - // x86) - // - // In fact, one could make a reasonable argument that relaxed reads are OK, - // due to the presence of external locking (GIL) to ensure that interactions - // with other data structures are still correctly synchronized, so that - // we fall in the "Single-Location Data Structures" case as described in - // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf - // However, on x86, it doesn't matter if I use acquire or relaxed on the load - // as I get the same assembly in both cases. 
So I just use the more - // conservative acquire (which will impede compiler optimizations but I don't - // care) + // This is now always the global interpreter if the PyObject is set. + // Maybe we can remove this field some day... std::atomic pyobj_interpreter_; - // This field contains a reference to a PyObject representing this Tensor. - // If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new - // PyObject for it and set this field. This field does not have to be - // protected by an atomic as it is only allowed to be accessed when you hold - // the GIL, or during destruction of the tensor. - // - // When a PyObject dies, you are obligated to clear this field - // (otherwise, you will try to use-after-free the pyobj); this currently - // occurs in THPVariable_clear in torch/csrc/autograd/python_variable.cpp - // - // NB: Ordinarily, this should not be a strong reference, as if the - // PyObject owns the Tensor, this would create a reference cycle. - // However, sometimes this ownership flips. To track who owns - // who, this has a single pointer tag indicating whether or not the - // C++ object owns the PyObject (the common case, zero, means PyObject - // owns the C++ object); see _unchecked_untagged_pyobj for raw access - // or check_pyobj for checked access. See references to PyObject - // resurrection in torch/csrc/autograd/python_variable.cpp - PyObject* pyobj_; + // The PyObject representing this Tensor or nullptr. Ownership is managed + // by intrusive_ptr. By the time the PyObjectSlot is destroyed, this + // reference is already dead. + std::atomic pyobj_; + + friend class torch::utils::PyObjectPreservation; }; } // namespace c10::impl diff --git a/c10/util/intrusive_ptr.h b/c10/util/intrusive_ptr.h index 3d5478be90e60..0c8f55f5061ab 100644 --- a/c10/util/intrusive_ptr.h +++ b/c10/util/intrusive_ptr.h @@ -12,6 +12,10 @@ template class class_; } +namespace torch::utils { +class PyObjectPreservation; +} + namespace c10 { class intrusive_ptr_target; namespace raw { @@ -33,6 +37,8 @@ constexpr uint64_t kImpracticallyHugeWeakReferenceCount = constexpr uint64_t kReferenceCountOne = 1; constexpr uint64_t kWeakReferenceCountOne = (kReferenceCountOne << 32); constexpr uint64_t kUniqueRef = (kReferenceCountOne | kWeakReferenceCountOne); +// Indicates whether the object has a PyObject wrapper. 
+constexpr uint64_t kHasPyObject = (uint64_t(1) << 63); template struct intrusive_target_default_null_type final { @@ -55,7 +61,11 @@ inline uint32_t refcount(uint64_t combined_refcount) { } inline uint32_t weakcount(uint64_t combined_refcount) { - return static_cast(combined_refcount >> 32); + return static_cast((combined_refcount & ~kHasPyObject) >> 32); +} + +inline bool has_pyobject(uint64_t combined_refcount) { + return (combined_refcount & kHasPyObject) != 0; } // The only requirement for refcount increment is that it happens-before @@ -66,12 +76,6 @@ inline uint64_t atomic_combined_refcount_increment( return combined_refcount.fetch_add(inc, std::memory_order_relaxed) + inc; } -inline uint32_t atomic_refcount_increment( - std::atomic& combined_refcount) { - return detail::refcount(atomic_combined_refcount_increment( - combined_refcount, kReferenceCountOne)); -} - inline uint32_t atomic_weakcount_increment( std::atomic& combined_refcount) { return detail::weakcount(atomic_combined_refcount_increment( @@ -99,6 +103,11 @@ inline uint32_t atomic_weakcount_decrement( combined_refcount, kWeakReferenceCountOne)); } +template +struct TargetTraits { + static constexpr bool can_have_pyobject = false; +}; + } // namespace detail /** @@ -155,6 +164,23 @@ class C10_API intrusive_ptr_target { // we can atomically operate on both at the same time for performance // and defined behaviors. // + // Note [PyObject preservation for Tensor and Storages] + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // intrusive_ptr has special support for preserving PyObject wrappers + // for TensorImpl and StorageImpl. The most significant bit (kHasPyObject) of + // the combined_refcount_ is used to indicate whether the object has a + // PyObject wrapper. + // + // - The PyObject, if it exists, holds a strong reference to the + // intrusive_ptr_target. + // + // - When the refcount goes from 1 to 2, we incref the PyObject. + // + // - When the refcount goes from 2 to 1, we decref the PyObject. + // + // In other words, the intrusive_ptr keeps the PyObject alive as long as there + // are other C++ references to the intrusive_ptr_target. + mutable std::atomic combined_refcount_; static_assert(sizeof(std::atomic) == 8); static_assert(alignof(std::atomic) == 8); @@ -172,6 +198,8 @@ class C10_API intrusive_ptr_target { template friend struct ExclusivelyOwnedTensorTraits; + friend class torch::utils::PyObjectPreservation; + protected: // protected destructor. We never want to destruct intrusive_ptr_target* // directly. @@ -255,6 +283,16 @@ class C10_API intrusive_ptr_target { */ virtual void release_resources() {} + /** + * These two methods are called when the refcount transitions between one + * and two and the object has a PyObject wrapper. + */ + virtual void incref_pyobject() const {} + virtual void decref_pyobject() const {} + virtual bool try_incref_pyobject() const { + return false; + } + uint32_t refcount(std::memory_order order = std::memory_order_relaxed) const { return detail::refcount(combined_refcount_.load(order)); } @@ -265,6 +303,19 @@ class C10_API intrusive_ptr_target { } }; +namespace detail { + +#ifndef C10_MOBILE +template <> +struct TargetTraits { + // A generic intrusive_ptr may actually be a TensorImpl + // or StorageImpl, so we have to allow for PyObject support. 
+ static constexpr bool can_have_pyobject = true; +}; +#endif + +} // namespace detail + template class weak_intrusive_ptr; @@ -314,18 +365,34 @@ class intrusive_ptr final { void retain_() { if (target_ != NullType::singleton()) { - uint32_t new_refcount = - detail::atomic_refcount_increment(target_->combined_refcount_); + uint64_t combined = detail::atomic_combined_refcount_increment( + target_->combined_refcount_, detail::kReferenceCountOne); + uint32_t new_refcount = detail::refcount(combined); TORCH_INTERNAL_ASSERT_DEBUG_ONLY( new_refcount != 1, "intrusive_ptr: Cannot increase refcount after it reached zero."); + + if constexpr (detail::TargetTraits::can_have_pyobject) { + // If the refcount transitioned from 1 to 2, we need to incref the + // PyObject. In other words, we need to ensure that the PyObject stays + // alive now that we have a C++ reference to this object in addition to + // the PyObject itself. + if (C10_UNLIKELY( + detail::has_pyobject(combined) && + detail::refcount(combined) == 2)) { + target_->incref_pyobject(); + } + } else { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + !detail::has_pyobject(combined), + "TargetTraits indicates that type cannot have PyObject, but refcount has PyObject bit set."); + } } } void reset_() noexcept { if (target_ != NullType::singleton()) { - if (target_->combined_refcount_.load(std::memory_order_acquire) == - detail::kUniqueRef) { + if (is_uniquely_owned()) { // Both counts are 1, so there are no weak references and // we are releasing the last strong reference. No other // threads can observe the effects of this target_ deletion @@ -337,9 +404,10 @@ class intrusive_ptr final { auto combined_refcount = detail::atomic_combined_refcount_decrement( target_->combined_refcount_, detail::kReferenceCountOne); - if (detail::refcount(combined_refcount) == 0) { - bool should_delete = - (combined_refcount == detail::kWeakReferenceCountOne); + uint32_t new_refcount = detail::refcount(combined_refcount); + bool has_pyobject = detail::has_pyobject(combined_refcount); + if (new_refcount == 0) { + bool should_delete = detail::weakcount(combined_refcount) == 1; // See comment above about weakcount. As long as refcount>0, // weakcount is one larger than the actual number of weak references. // So we need to decrement it here. @@ -356,6 +424,18 @@ class intrusive_ptr final { if (should_delete) { delete target_; } + } else if constexpr (detail::TargetTraits::can_have_pyobject) { + // If the refcount transitioned from 2 to 1, we need to decref the + // PyObject. In other words, we don't want to keep the PyObject alive if + // there are no C++ references to this object other than the PyObject + // itself. + if (C10_UNLIKELY(has_pyobject && new_refcount == 1)) { + target_->decref_pyobject(); + } + } else { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + !has_pyobject, + "TargetTraits indicates that type cannot have PyObject, but refcount has PyObject bit set."); } } } @@ -522,6 +602,16 @@ class intrusive_ptr final { return use_count() == 1; } + /** + * Stronger than unique() in that it must not have any weakrefs as well. + */ + bool is_uniquely_owned() const noexcept { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(target_ != NullType::singleton()); + uint64_t combined = + target_->combined_refcount_.load(std::memory_order_acquire); + return (combined & ~detail::kHasPyObject) == detail::kUniqueRef; + } + /** * Returns an owning (!) pointer to the underlying object and makes the * intrusive_ptr instance invalid. That means the refcount is not decreased. 
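The two refcount transitions implemented by `retain_()` and `reset_()` above can be seen in isolation in the following simplified, standalone sketch. It is illustrative only: `Target`, `retain`, `release`, and `pyobj_refs` are made-up names, and weak-count handling, deletion at refcount zero, and the debug assertions are omitted.

```
#include <atomic>
#include <cstdint>
#include <cstdio>

// Simplified model of the strong-count transitions in this patch.
// Low 32 bits: strong refcount. Top bit: kHasPyObject. (Weak count omitted.)
constexpr uint64_t kOne = 1;
constexpr uint64_t kHasPyObject = uint64_t(1) << 63;

struct Target {
  std::atomic<uint64_t> combined{kOne};  // born with one strong reference
  mutable int pyobj_refs = 0;            // stand-in for the PyObject's refcount
  void incref_pyobject() const { ++pyobj_refs; }  // real code calls Py_INCREF
  void decref_pyobject() const { --pyobj_refs; }  // real code calls Py_DECREF
};

void retain(Target& t) {
  uint64_t combined =
      t.combined.fetch_add(kOne, std::memory_order_relaxed) + kOne;
  // Strong count went 1 -> 2: a C++ reference now exists besides the
  // PyObject's own reference, so pin the PyObject.
  if ((combined & kHasPyObject) && static_cast<uint32_t>(combined) == 2) {
    t.incref_pyobject();
  }
}

void release(Target& t) {
  uint64_t combined =
      t.combined.fetch_sub(kOne, std::memory_order_acq_rel) - kOne;
  // Strong count went 2 -> 1: only the PyObject's reference remains, so drop
  // the extra PyObject reference to break the would-be cycle.
  if ((combined & kHasPyObject) && static_cast<uint32_t>(combined) == 1) {
    t.decref_pyobject();
  }
  // Deletion when the strong count reaches zero is omitted from this sketch.
}

int main() {
  Target t;
  t.combined.fetch_or(kHasPyObject);  // pretend a Python wrapper was attached
  retain(t);   // 1 -> 2: pyobj_refs becomes 1
  release(t);  // 2 -> 1: pyobj_refs back to 0
  std::printf("pyobj_refs = %d\n", t.pyobj_refs);
  return 0;
}
```

The same rule is what `weak_intrusive_ptr::lock()` in the next hunk has to respect: before taking a strong reference that would move a refcount-1 object with a live wrapper back to 2, it first calls `try_incref_pyobject()` so the PyObject cannot be deallocated concurrently.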
@@ -932,6 +1022,7 @@ class weak_intrusive_ptr final { if (target_ == NullType::singleton()) { return intrusive_ptr(); } else { + bool increfed = false; auto combined_refcount = target_->combined_refcount_.load(std::memory_order_relaxed); do { @@ -940,12 +1031,31 @@ class weak_intrusive_ptr final { // Return nullptr. return intrusive_ptr(); } + if constexpr (detail::TargetTraits::can_have_pyobject) { + if (detail::has_pyobject(combined_refcount) && + detail::refcount(combined_refcount) == 1 && !increfed) { + // Object has a python wrapper with no other C++ references. + // We need to to incref the Python object before we acquire a + // strong reference to the C++ object to avoid a situation + // where the Python object is deallocated concurrently. + if (!target_->try_incref_pyobject()) { + return intrusive_ptr(); + } + increfed = true; + } + } } while (!target_->combined_refcount_.compare_exchange_weak( combined_refcount, combined_refcount + detail::kReferenceCountOne, std::memory_order_acquire, std::memory_order_relaxed)); + if constexpr (detail::TargetTraits::can_have_pyobject) { + if (increfed && detail::refcount(combined_refcount) != 1) { + target_->decref_pyobject(); + } + } + return intrusive_ptr( target_, raw::DontIncreaseRefcount{}); } @@ -1060,7 +1170,18 @@ namespace intrusive_ptr { // NullType::singleton to this function inline void incref(intrusive_ptr_target* self) { if (self) { - detail::atomic_refcount_increment(self->combined_refcount_); + uint64_t combined = detail::atomic_combined_refcount_increment( + self->combined_refcount_, detail::kReferenceCountOne); + +#ifndef C10_MOBILE + if (C10_UNLIKELY( + detail::has_pyobject(combined) && + detail::refcount(combined) == 2)) { + self->incref_pyobject(); + } +#else + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!detail::has_pyobject(combined)); +#endif } } diff --git a/test/test_autograd.py b/test/test_autograd.py index e025a8e6e582d..5960ac8add36d 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -10895,6 +10895,34 @@ def func(inp): self.assertTrue(gradcheck(func, x, fast_mode=True)) + def test_grad_thread_safety(self): + import threading + from concurrent.futures import ThreadPoolExecutor + + NUM_ITERS = 10 + NUM_THREADS = 4 + + # Concurrent calls to tensor.untyped_storage() + def access_grad(tensor, barrier): + barrier.wait() + return weakref.ref(tensor.grad) + + for i in range(NUM_ITERS): + tensor = torch.tensor([1.0, 2.0, 3.0], requires_grad=True) + (tensor**2).sum().backward() + + barrier = threading.Barrier(NUM_THREADS) + with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor: + futures = [ + executor.submit(access_grad, tensor, barrier) + for _ in range(NUM_THREADS) + ] + + # Check that all the grad tensors returned were the same + for future in futures: + self.assertEqual(future.result()(), tensor.grad) + self.assertIsNotNone(tensor.grad) + def index_perm_variable(shape, max_indices): if not isinstance(shape, tuple): diff --git a/test/test_torch.py b/test/test_torch.py index dce0ce53ac722..01c6fb39a5a2a 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -259,7 +259,8 @@ def test_storage_setitem(self, device, dtype): def test_storage_use_count(self, device): a = torch.randn(10, device=device) prev_cf = torch._C._storage_Use_Count(a.untyped_storage()._cdata) - self.assertEqual(prev_cf, 1) + # Two references: 'a' and the wrapper returned by untyped_storage() + self.assertEqual(prev_cf, 2) b = a.view(2, 5) self.assertEqual(torch._C._storage_Use_Count(b.untyped_storage()._cdata), prev_cf + 1) @@ -9324,7 
+9325,7 @@ class BadSubTensor: member_var = object() err_msg = "Creating a Tensor subclass from a class that does not inherit from Tensor" - with self.assertRaisesRegex(RuntimeError, err_msg): + with self.assertRaisesRegex(TypeError, err_msg): s0 = t0.as_subclass(BadSubTensor) # FIXME: Port to a test suite that better fits slicing @@ -10324,20 +10325,21 @@ def test_backward_hooks_traverse(self): @skipIfTorchDynamo("https://github.com/pytorch/torchdynamo/issues/1993") def test_tensor_dead_weak_ref(self): - x = torch.empty(2) + x = torch.ones(2) w_x = weakref.ref(x) - y = torch.empty(2) + y = torch.ones(2) y.grad = x del x x = w_x() - # Ideally, x would keep the tensor live. But CPython doesn't - # provide enough hooks to do this. So it will go dead and x - # will transmute into an undefined tensor. Not great, but the - # best we can do. + # x should keep the tensor live. This didn't happen in earlier PyTorch + # versions. del y - self.assertRaises(RuntimeError, lambda: x.sigmoid()) + self.assertEqual(2, x.sum()) + + del x + self.assertIsNone(w_x()) @skipIfTorchDynamo("https://github.com/pytorch/torchdynamo/issues/1993") def test_storage_dead_weak_ref(self): @@ -10345,16 +10347,9 @@ def test_storage_dead_weak_ref(self): w_x = weakref.ref(x) y = torch.tensor(x) del x - - x = w_x() - # Ideally, x would keep the storage live. But CPython doesn't - # provide enough hooks to do this. So it will go dead and x - # will transmute into storage with null StorageImpl. Not great, but the - # best we can do. + self.assertIsNotNone(w_x()) del y - - self.assertRaisesRegex(RuntimeError, "Got a null Storage", lambda: x[0]) - self.assertRaisesRegex(RuntimeError, "Got a null Storage", lambda: x.float()) + self.assertIsNone(w_x()) def test_tensor_resurrected_weak_ref(self): x = torch.empty(2) @@ -10415,6 +10410,31 @@ def callback(w): self.assertTrue(called) + def test_storage_thread_safety(self): + import threading + from concurrent.futures import ThreadPoolExecutor + + NUM_ITERS = 10 + NUM_THREADS = 4 + + # Concurrent calls to tensor.untyped_storage() + def access_untyped_storage(tensor, barrier): + barrier.wait() + return weakref.ref(tensor.untyped_storage()) + + for i in range(NUM_ITERS): + tensor = torch.tensor([1.0, 2.0, 3.0]) + barrier = threading.Barrier(NUM_THREADS) + with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor: + futures = [ + executor.submit(access_untyped_storage, tensor, barrier) + for _ in range(NUM_THREADS) + ] + + # Check that all the storages returned were the same + for future in futures: + self.assertEqual(future.result()(), tensor.untyped_storage()) + # FIXME: move to test_linalg @torch.inference_mode() def test_bmm_multithreaded(self): diff --git a/torch/_inductor/cudagraph_trees.py b/torch/_inductor/cudagraph_trees.py index 20cd5ca9a8888..98280b5af783c 100644 --- a/torch/_inductor/cudagraph_trees.py +++ b/torch/_inductor/cudagraph_trees.py @@ -536,9 +536,14 @@ def expired(self) -> bool: if self.extra_ref_check is not None and not self.extra_ref_check(): return False - # if extra_ref_check is not None we expect an additional reference stor_count = torch._C._storage_Use_Count(self.ref.cdata) - return (stor_count - (self.extra_ref_check is not None)) == 0 + if self.extra_ref_check is not None: + # if extra_ref_check is not None we expect two additional references: + # - one from the Python storage object + # - one from the cached Tensor + stor_count -= 2 + assert stor_count >= 0 + return stor_count == 0 def __repr__(self) -> str: if self.ref is None or self.ref.expired(): @@ 
-1439,7 +1444,15 @@ def check_refcount(i: int) -> bool: self_loc = self_ref() if self_loc is None: return False - return self_loc.get_output_refcount(i) == 2 + refcount = self_loc.get_output_refcount(i) + # pyrefly: ignore + if self_loc.cached_tensor_outputs[i]._use_count() > 1: + # c10::Tensor may also holds one reference count + assert refcount >= 3 + return refcount == 3 + else: + assert refcount >= 2 + return refcount == 2 check = functools.partial(check_refcount, i=i) diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index e2d0e17738dc2..4de6ba3976688 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -398,36 +398,27 @@ static PyObject* THPModule_swap_tensor_impl(PyObject* _unused, PyObject* args) { // weak_use_count() adds 1 if use_count is non-zero TORCH_CHECK( - a->cdata->weak_use_count() == 1, + a->cdata.weak_use_count() == 1, "Expected no weakrefs to t1's Tensor object but got ", - a->cdata->weak_use_count() - 1); + a->cdata.weak_use_count() - 1); TORCH_CHECK( - b->cdata->weak_use_count() == 1, + b->cdata.weak_use_count() == 1, "Expected no weakrefs to t2's Tensor object but got ", - b->cdata->weak_use_count() - 1); + b->cdata.weak_use_count() - 1); + + // NB: Creating local copies of *both* Tensors here ensures that they each + // hold a strong reference to their PyObject. This avoids having to fix up + // reference counts when we swap the PyObject slots below. + at::Tensor tmp_a = a->cdata; + at::Tensor tmp_b = b->cdata; // Swap the Tensor Impl - c10::MaybeOwned tmp = a->cdata; - - // The TensorImpls contain PyObjectSlots that have a reference to the PyObject - // associated with the TensorImpl. Swap this field as well. - std::optional mb_obj_a = - a->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj( - /*ignore_hermetic_tls=*/false); - std::optional mb_obj_b = - b->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj( - /*ignore_hermetic_tls=*/false); - TORCH_INTERNAL_ASSERT( - mb_obj_a.has_value() && mb_obj_b.has_value(), - "Both tensors should have PyObjects tagged by the current python interpreter"); - TORCH_CHECK(mb_obj_a.value() == a_); - TORCH_CHECK(mb_obj_b.value() == b_); - - a->cdata = b->cdata; - b->cdata = tmp; - - a->cdata->unsafeGetTensorImpl()->pyobj_slot()->init_pyobj(a_); - b->cdata->unsafeGetTensorImpl()->pyobj_slot()->init_pyobj(b_); + a->cdata = tmp_b; + b->cdata = tmp_a; + + // Fix up the PyObjects associated with each TensorImpl + a->cdata.unsafeGetTensorImpl()->pyobj_slot()->store_pyobj(a_); + b->cdata.unsafeGetTensorImpl()->pyobj_slot()->store_pyobj(b_); Py_RETURN_NONE; END_HANDLE_TH_ERRORS diff --git a/torch/csrc/PyInterpreter.cpp b/torch/csrc/PyInterpreter.cpp index 974f95999f17b..7f36d88bdaa32 100644 --- a/torch/csrc/PyInterpreter.cpp +++ b/torch/csrc/PyInterpreter.cpp @@ -45,7 +45,9 @@ struct ConcretePyInterpreterVTable final std::string name() const override; void incref(PyObject* pyobj) const override; - void decref(PyObject* pyobj, bool has_pyobj_slot) const override; + void decref(PyObject* pyobj) const override; + bool try_incref(const c10::impl::PyObjectSlot& pyobj_slot) const override; + size_t refcnt(PyObject* pyobj) const override; // TODO: Need to make this work for StorageImpl too. 
I imagine I'll want to // operate upon a PyObjectSlot rather than a TensorImpl @@ -235,53 +237,13 @@ py::object torchDispatchFromTensorImpl( TorchFunctionName::TorchDispatch)); } -// NOTE [PyInterpreter::decref takes a `has_pyobj_slot` arg] -// Before calling PyInterpreter::decref, we must statically know if the -// pyobj has a PyObjectSlot or not. -// - If it has a PyObjectSlot, we need to be careful about PyObject resurrection -// - If it does not have a PyObjectSlot, we can freely decref -// One alternative to this is using PyObject_IsInstance -// to get at this information. However, we don't want to risk an incorrect -// `__instancecheck__` changing the semantics here. -void ConcretePyInterpreterVTable::decref(PyObject* pyobj, bool has_pyobj_slot) - const { +void ConcretePyInterpreterVTable::decref(PyObject* pyobj) const { // Leak the pyobj if not initialized. This can happen if we are running // exit handlers that are destructing tensors with residual (owned) // PyObjects stored in them. if (!Py_IsInitialized()) return; - pybind11::gil_scoped_acquire gil; - // Two possibilities: - // 1. We are decref-ing an object that has a PyObjectSlot, like a Tensor or - // Storage. Then we must be careful about PyObject resurrection (see - // THPVariable_clear). - // 2. We are decref-ing some other Python object. We don't do - // PyObject resurrection on non-Tensors, so we just carry on as usual - if (has_pyobj_slot && Py_REFCNT(pyobj) > 1) { - if (THPVariable_Check(pyobj)) { - // It's still alive! This can happen if a weak ref resurrected - // the PyObject without flipping ownership. At this point it is - // too late to rescue the object, so just stub out the PyObject - // so that it fails on subsequent uses. Don't raise an error here; - // you're probably in a destructor. - TORCH_WARN( - "Deallocating Tensor that still has live PyObject references. " - "This probably happened because you took out a weak reference to " - "Tensor and didn't call _fix_weakref() after dereferencing it. " - "Subsequent accesses to this tensor via the PyObject will now fail."); - (reinterpret_cast(pyobj))->cdata = - c10::MaybeOwned(); - } else if (THPStorage_Check(pyobj)) { - TORCH_WARN( - "Deallocating UntypedStorage that still has live PyObject references. " - "This probably happened because you took out a weak reference to " - "UntypedStorage and didn't call _fix_weakref() after dereferencing it. 
" - "Subsequent accesses to this storage via the PyObject will now fail."); - (reinterpret_cast(pyobj))->cdata = - c10::MaybeOwned(); - } - } Py_DECREF(pyobj); } @@ -292,6 +254,25 @@ void ConcretePyInterpreterVTable::incref(PyObject* pyobj) const { Py_INCREF(pyobj); } +bool ConcretePyInterpreterVTable::try_incref( + const c10::impl::PyObjectSlot& pyobj_slot) const { + if (!Py_IsInitialized()) + return false; + pybind11::gil_scoped_acquire gil; + PyObject* pyobj = pyobj_slot.load_pyobj(); + if (!pyobj) { + return false; + } + return PyUnstable_TryIncRef(pyobj); +} + +size_t ConcretePyInterpreterVTable::refcnt(PyObject* pyobj) const { + if (!Py_IsInitialized() || pyobj == nullptr) + return 0; + pybind11::gil_scoped_acquire gil; + return Py_REFCNT(pyobj); +} + bool isPythonTensor(const at::Tensor& tensor) { return tensor.unsafeGetTensorImpl()->key_set().has(c10::DispatchKey::Python); } @@ -620,11 +601,7 @@ static void set_tensor_attr_with_capsule( const c10::TensorImpl* tensor, py::capsule& capsule, const char* attr_name) { - std::optional mb_obj = tensor->pyobj_slot()->check_pyobj( - /*ignore_hermetic_tls=*/false); - TORCH_CHECK( - mb_obj.has_value(), "Tensor subclass's PyInterpreter has no value"); - auto obj = mb_obj.value(); + PyObject* obj = tensor->pyobj_slot()->load_pyobj(); py::handle(obj).attr(attr_name) = capsule; } @@ -648,11 +625,7 @@ static c10::ArrayRef get_set_cached_attr( const c10::TensorImpl* tensor, const char* base_attr_name, const py::object& obj) { - std::optional mb_obj = - tensor->pyobj_slot()->check_pyobj(getPyInterpreter()); - TORCH_CHECK( - mb_obj.has_value(), "Tensor subclass's PyInterpreter has no value"); - auto tensor_obj = mb_obj.value(); + PyObject* tensor_obj = tensor->pyobj_slot()->load_pyobj(); auto buffer_len_attr_name = std::string(base_attr_name) + std::string("_len"); bool is_buffer_allocated = false; diff --git a/torch/csrc/Storage.cpp b/torch/csrc/Storage.cpp index 02558cbdf8968..671c28adef3e3 100644 --- a/torch/csrc/Storage.cpp +++ b/torch/csrc/Storage.cpp @@ -23,6 +23,8 @@ #include #include +using torch::utils::PyObjectPreservation; + template <> void THPPointer::free() { if (ptr) { @@ -32,238 +34,72 @@ void THPPointer::free() { PyTypeObject* THPStorageClass = nullptr; -PyObject* THPStorage_NewWithStorage( - PyTypeObject* type, - c10::Storage _storage, - bool allow_preexisting_pyobj) { - TORCH_CHECK( - PyType_IsSubtype(type, &THPStorageType), - "Creating a Storage subclass from a class that does not inherit from ", - "Storage is not possible. Make sure your class inherits from Storage."); - - auto maybe_pyobj = _storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj( - /*ignore_hermetic_tls=*/false); - if (maybe_pyobj.has_value() && maybe_pyobj.value()) { - TORCH_CHECK( - allow_preexisting_pyobj, - "Creating a new Storage subclass ", - type->tp_name, - " but the raw Storage object is already associated to a python object ", - "of type ", - maybe_pyobj.value()->ob_type->tp_name); - PyObject* obj = *maybe_pyobj; - PyTypeObject* obj_type = Py_TYPE(obj); - TORCH_CHECK( - obj_type == type || PyType_IsSubtype(obj_type, type), - "Creating a new Storage subclass ", - type->tp_name, - " but the raw Storage object is already associated to a python object ", - "of type ", - maybe_pyobj.value()->ob_type->tp_name, - " which is not a subclass of the " - "requested type"); - return THPStorage_Wrap(std::move(_storage)); - } - +// Create a new Python Storage object, but don't set the pyobj slot on the +// c10::Storage object. 
+static PyObject* THPStorage_New(PyTypeObject* type, c10::Storage _storage) { PyObject* obj = type->tp_alloc(type, 0); TORCH_CHECK(obj, "Failed to allocate a ", type->tp_name, " object"); - auto s = reinterpret_cast(obj); + // Ensure that PyUnstable_TryIncref calls don't fail spuriously in + // free-threaded Python. + PyUnstable_EnableTryIncRef(obj); - new (&s->cdata) c10::MaybeOwned(); - - s->cdata = c10::MaybeOwned::owned(std::move(_storage)); + auto s = (THPStorage*)obj; + new (&s->cdata) c10::Storage(std::move(_storage)); + return obj; +} - if (!c10::impl::HermeticPyObjectTLS::get_state()) { - s->is_hermetic = false; - const auto& storage = THPStorage_Unpack(s); - storage.unsafeGetStorageImpl()->pyobj_slot()->init_pyobj(obj); - } else { - s->is_hermetic = true; - } +// Create a new Python Storage object for a new c10::Storage, and set the +// pyobj slot. The c10::Storage must not already have a pyobj set. +PyObject* THPStorage_NewWithStorage(PyTypeObject* type, c10::Storage _storage) { + TORCH_CHECK( + type == THPStorageClass || PyType_IsSubtype(type, &THPStorageType), + "Creating a Storage subclass from a class that does not inherit from ", + "Storage is not possible. Make sure your class inherits from Storage."); + TORCH_INTERNAL_ASSERT(_storage.use_count() == 1); + c10::StorageImpl* storage_impl = _storage.unsafeGetStorageImpl(); + PyObject* obj = THPStorage_New(type, std::move(_storage)); + PyObjectPreservation::init_fresh_nonatomic( + storage_impl, storage_impl->pyobj_slot(), obj); return obj; } -// Wraps the c10::Storage with a storage PyObject +// Returns a PyObject wrapper for the c10::Storage object. The existing +// wrapper is returned if it already exists. PyObject* THPStorage_Wrap(c10::Storage storage) { - c10::StorageImpl* storage_impl = storage.unsafeGetStorageImpl(); if (c10::impl::HermeticPyObjectTLS::get_state()) { - return THPStorage_NewWithStorage(THPStorageClass, std::move(storage)); + return THPStorage_New(THPStorageClass, std::move(storage)); } - c10::impl::PyObjectSlot* pyobj_slot = storage_impl->pyobj_slot(); - std::optional maybe_pyobj = pyobj_slot->check_pyobj( - /*ignore_hermetic_tls=*/false); - if (maybe_pyobj.has_value()) { - auto obj = *maybe_pyobj; - if (obj) { - TORCH_CHECK( - THPStorage_Check(obj), - "Expected a storage type, but got ", - Py_TYPE(obj)->tp_name); - - if (pyobj_slot->owns_pyobj()) { - pyobj_slot->set_owns_pyobj(false); - reinterpret_cast(obj)->cdata = - c10::MaybeOwned::owned(std::move(storage)); - return obj; - } else { - Py_INCREF(obj); - return obj; - } - } - } - return THPStorage_NewWithStorage(THPStorageClass, std::move(storage)); -} - -static bool THPStorage_isPreservable(THPStorage* self) { - if (self->cdata.unsafeIsBorrowed()) { - return false; - } - auto const& storage = THPStorage_Unpack(self); - - if (self->is_hermetic) { - return false; - } + c10::StorageImpl* storage_impl = storage.unsafeGetStorageImpl(); + c10::impl::PyObjectSlot* pyobj_slot = storage_impl->pyobj_slot(); - if (storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj( - /*ignore_hermetic_tls=*/true) != reinterpret_cast(self)) { - return false; - } - if (storage.use_count() <= 1) { - return false; + PyObject* obj = pyobj_slot->load_pyobj(); + if (obj) { + return Py_NewRef(obj); } - return true; -} -static bool THPStorage_tryPreserve(THPStorage* self) { - if (!THPStorage_isPreservable(self)) { - return false; + obj = THPStorage_New(THPStorageClass, std::move(storage)); + PyObject* wrapper = + PyObjectPreservation::init_once(storage_impl, pyobj_slot, obj); + if 
(wrapper != obj) { + // Another thread beat us to it + Py_DECREF(obj); + return Py_NewRef(wrapper); } - - const auto& storage = THPStorage_Unpack(self); - c10::StorageImpl* storage_impl = storage.unsafeGetStorageImpl(); - - auto maybe_pyobj = storage_impl->pyobj_slot()->check_pyobj( - /*ignore_hermetic_tls=*/true); - // NOTE: It is possible to just set the PyObjectSlot here, but the point is - // that we should have already set PyObjectSlot when the storage PyObject - // was created. - TORCH_INTERNAL_ASSERT( - maybe_pyobj.has_value(), - "Trying to preserve a Python storage whose PyObjectSlot does not have a PyObject"); - - PyObject* pyobj = *maybe_pyobj; - - TORCH_CHECK( - THPStorage_Check(pyobj), - "Expected a storage type, but got ", - Py_TYPE(pyobj)->tp_name); - - TORCH_INTERNAL_ASSERT( - (void*)pyobj == (void*)self, - "Python storage and the PyObject in the internal PyObjectSlot are not at the same address"); - - TORCH_INTERNAL_ASSERT(!storage_impl->pyobj_slot()->owns_pyobj()); - - storage_impl->pyobj_slot()->set_owns_pyobj(true); - // When resurrecting, we MUST use _Py_NewReference and not Py_INCREF to - // ensure the PyObject is in a valid state - _Py_NewReference(reinterpret_cast(self)); - - self->cdata = c10::MaybeOwned::borrowed(storage); - return true; + return obj; } -static void THPStorage_subclass_dealloc(PyObject* self) { +static void THPStorage_dealloc(PyObject* self) { THPStorage* _self = reinterpret_cast(self); - - if (THPStorage_tryPreserve(_self)) { - return; - } - - // Some subclass of StorageBase could be GC-tracked objects even - // though the base class is not - auto* type = Py_TYPE(self); - if (PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC) != 0) { - PyObject_GC_UnTrack(self); - } - - bool has_finalizer = type->tp_finalize || type->tp_del; - - if (type->tp_finalize) { - PyObject_GC_Track(self); - if (PyObject_CallFinalizerFromDealloc(self) < 0) { - // The finalizer has resurrected the PyObject and there is a new Python - // reference to it, so we can just stop deallocating. Read about - // resurrection from `__del__` here: - // https://docs.python.org/3/reference/datamodel.html#object.__del__ - return; - } - PyObject_GC_UnTrack(self); - } - - // base test is unnecessary as THPStorae does not set this - if (type->tp_weaklistoffset) { - PyObject_ClearWeakRefs(self); + auto pyobj_slot = _self->cdata.unsafeGetStorageImpl()->pyobj_slot(); + if (pyobj_slot->load_pyobj() == self) { + TORCH_INTERNAL_ASSERT(_self->cdata.use_count() == 1); + pyobj_slot->clear(); } - - if (type->tp_del) { - PyObject_GC_Track(self); - type->tp_del(self); - if (Py_REFCNT(self) > 0) { - // Resurrected (see above comment about resurrection from `__del__`) - return; - } - PyObject_GC_UnTrack(self); - } - - if (has_finalizer) { - /* New weakrefs could be created during the finalizer call. - If this occurs, clear them out without calling their - finalizers since they might rely on part of the object - being finalized that has already been destroyed. 
*/ - if (type->tp_weaklistoffset) { - /* Modeled after GET_WEAKREFS_LISTPTR() */ - PyWeakReference** list = reinterpret_cast( - PyObject_GET_WEAKREFS_LISTPTR(self)); - while (*list) - _PyWeakref_ClearRef(*list); - } - } - - // Clear slots - { - PyTypeObject* base = type; - while (base != &THPStorageType) { - if (Py_SIZE(base)) { - clear_slots(base, self); - } - base = base->tp_base; - TORCH_INTERNAL_ASSERT(base); - } - } - - // Clear __dict__ - if (C10_LIKELY(type->tp_dictoffset)) { - PyObject** dictptr = _PyObject_GetDictPtr(self); - if (dictptr != nullptr) { - PyObject* dict = *dictptr; - if (dict != nullptr) { - Py_DECREF(dict); - *dictptr = nullptr; - } - } - } - - TORCH_INTERNAL_ASSERT(Py_TYPE(self) == type); - - _self->cdata.~MaybeOwned(); + _self->cdata.~Storage(); Py_TYPE(_self)->tp_free(self); - - TORCH_INTERNAL_ASSERT(type->tp_flags & Py_TPFLAGS_HEAPTYPE); - Py_DECREF(type); } static PyObject* THPStorage_pynew( @@ -553,64 +389,13 @@ static PyMappingMethods THPStorage_mappingmethods = { reinterpret_cast(THPStorage_get), reinterpret_cast(THPStorage_set)}; -struct THPStorageMeta { - PyHeapTypeObject base; -}; - -static int THPStorageMetaType_init( - PyObject* cls, - PyObject* args, - PyObject* kwargs); - -static PyTypeObject THPStorageMetaType = { - PyVarObject_HEAD_INIT(DEFERRED_ADDRESS(&PyType_Type), 0) - "torch._C._StorageMeta", /* tp_name */ - sizeof(THPStorageMeta), /* tp_basicsize */ - 0, /* tp_itemsize */ - nullptr, /* tp_dealloc */ - 0, /* tp_vectorcall_offset */ - nullptr, /* tp_getattr */ - nullptr, /* tp_setattr */ - nullptr, /* tp_reserved */ - nullptr, /* tp_repr */ - nullptr, /* tp_as_number */ - nullptr, /* tp_as_sequence */ - nullptr, /* tp_as_mapping */ - nullptr, /* tp_hash */ - nullptr, /* tp_call */ - nullptr, /* tp_str */ - nullptr, /* tp_getattro */ - nullptr, /* tp_setattro */ - nullptr, /* tp_as_buffer */ - // NOLINTNEXTLINE(misc-redundant-expression) - Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ - nullptr, /* tp_doc */ - nullptr, /* tp_traverse */ - nullptr, /* tp_clear */ - nullptr, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - nullptr, /* tp_iter */ - nullptr, /* tp_iternext */ - nullptr, /* tp_methods */ - nullptr, /* tp_members */ - nullptr, /* tp_getset */ - DEFERRED_ADDRESS(&PyType_Type), /* tp_base */ - nullptr, /* tp_dict */ - nullptr, /* tp_descr_get */ - nullptr, /* tp_descr_set */ - 0, /* tp_dictoffset */ - THPStorageMetaType_init, /* tp_init */ - nullptr, /* tp_alloc */ - nullptr, /* tp_new */ -}; - // TODO: implement equality PyTypeObject THPStorageType = { - PyVarObject_HEAD_INIT(&THPStorageMetaType, 0) + PyVarObject_HEAD_INIT(DEFERRED_ADDRESS(&PyType_Type), 0) "torch._C.StorageBase", /* tp_name */ sizeof(THPStorage), /* tp_basicsize */ 0, /* tp_itemsize */ - nullptr, /* tp_dealloc */ + THPStorage_dealloc, /* tp_dealloc */ 0, /* tp_vectorcall_offset */ nullptr, /* tp_getattr */ nullptr, /* tp_setattr */ @@ -649,15 +434,6 @@ PyTypeObject THPStorageType = { THPStorage_pynew, /* tp_new */ }; -int THPStorageMetaType_init(PyObject* cls, PyObject* args, PyObject* kwargs) { - if (PyType_Type.tp_init(cls, args, kwargs) < 0) { - return -1; - } - (reinterpret_cast(cls))->tp_dealloc = - static_cast(THPStorage_subclass_dealloc); - return 0; -} - static PyObject* THPStorage_device(THPStorage* self, void* unused) { HANDLE_TH_ERRORS THPStorage_assertNotNull(self); @@ -692,13 +468,6 @@ bool THPStorage_init(PyObject* module) { THPUtils_addPyMethodDefs(methods, THPStorage_getMethods()); THPUtils_addPyMethodDefs(methods, 
THPStorage_getSharingMethods()); - THPStorageMetaType.tp_base = &PyType_Type; - if (PyType_Ready(&THPStorageMetaType) < 0) - return false; - Py_INCREF(&THPStorageMetaType); - PyModule_AddObject( - module, "_StorageMeta", reinterpret_cast(&THPStorageMetaType)); - THPStorageType.tp_methods = methods.data(); THPStorageType.tp_getset = THPStorage_properties; if (PyType_Ready(&THPStorageType) < 0) diff --git a/torch/csrc/Storage.h b/torch/csrc/Storage.h index 698cd80548efa..89e853181f3da 100644 --- a/torch/csrc/Storage.h +++ b/torch/csrc/Storage.h @@ -11,15 +11,13 @@ struct THPStorage { PyObject_HEAD - c10::MaybeOwned cdata; - bool is_hermetic; + c10::Storage cdata; }; TORCH_PYTHON_API PyObject* THPStorage_Wrap(c10::Storage storage); TORCH_PYTHON_API PyObject* THPStorage_NewWithStorage( PyTypeObject* type, - c10::Storage _storage, - bool allow_preexisting_pyobj = false); + c10::Storage _storage); TORCH_PYTHON_API extern PyTypeObject* THPStorageClass; inline bool THPStorage_CheckTypeExact(PyTypeObject* tp) { @@ -49,7 +47,7 @@ TORCH_PYTHON_API void THPStorage_assertNotNull(PyObject* obj); TORCH_PYTHON_API extern PyTypeObject THPStorageType; inline const c10::Storage& THPStorage_Unpack(THPStorage* storage) { - return *storage->cdata; + return storage->cdata; } inline const c10::Storage& THPStorage_Unpack(PyObject* obj) { diff --git a/torch/csrc/StorageMethods.cpp b/torch/csrc/StorageMethods.cpp index 68c06f7c88c1c..178f735802fb7 100644 --- a/torch/csrc/StorageMethods.cpp +++ b/torch/csrc/StorageMethods.cpp @@ -529,9 +529,8 @@ static PyObject* THPStorage__setCdata(PyObject* _self, PyObject* new_cdata) { THPUtils_typename(new_cdata)); c10::StorageImpl* ptr = static_cast(PyLong_AsVoidPtr(new_cdata)); - self->cdata.~MaybeOwned(); - self->cdata = c10::MaybeOwned::owned( - c10::Storage(c10::intrusive_ptr::reclaim_copy(ptr))); + self->cdata = + c10::Storage(c10::intrusive_ptr::reclaim_copy(ptr)); Py_INCREF(self); return reinterpret_cast(self); END_HANDLE_TH_ERRORS diff --git a/torch/csrc/autograd/functions/accumulate_grad.h b/torch/csrc/autograd/functions/accumulate_grad.h index 97e689d36050c..8f55f22ae4ad4 100644 --- a/torch/csrc/autograd/functions/accumulate_grad.h +++ b/torch/csrc/autograd/functions/accumulate_grad.h @@ -180,7 +180,9 @@ struct TORCH_API AccumulateGrad : public Node { if (!GradMode::is_enabled() && !new_grad.is_sparse() && !new_grad.is_sparse_csr() && !(variable.is_sparse_csr() && new_grad.layout() == at::kStrided) && - at::caching::adjusted_use_count(new_grad) <= num_expected_refs && + impl::is_tensor_stealable( + new_grad, + num_expected_refs + at::caching::is_cached_tensor(new_grad)) && (new_grad.is_mkldnn() || utils::obeys_layout_contract(new_grad, variable))) { // See Case 1.1: Stealable dense new_grad @@ -193,7 +195,7 @@ struct TORCH_API AccumulateGrad : public Node { // SparseTensor should be the only one holding a reference to these. 
new_grad._indices().use_count() <= 1 && new_grad._values().use_count() <= 1 && - new_grad.use_count() <= num_expected_refs) { + impl::is_tensor_stealable(new_grad, num_expected_refs)) { // Case 1.2: Stealable sparse new_grad // No scenario where we expect this to be true currently TORCH_INTERNAL_ASSERT_DEBUG_ONLY( diff --git a/torch/csrc/autograd/input_buffer.cpp b/torch/csrc/autograd/input_buffer.cpp index 62770ef946592..a477bf4c3e507 100644 --- a/torch/csrc/autograd/input_buffer.cpp +++ b/torch/csrc/autograd/input_buffer.cpp @@ -86,8 +86,8 @@ bool can_accumulate_inplace(const Variable& v) { v.is_non_overlapping_and_dense() && // and we hold the last reference - at::caching::adjusted_use_count(v) == 1 && v.has_storage() && - v.storage().use_count() == 1); + impl::is_tensor_stealable(v, 1 + at::caching::is_cached_tensor(v)) && + v.has_storage() && v.storage().use_count() == 1); } } // anonymous namespace diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index 74c7a751fe960..6d0bf5d0a8579 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -54,6 +54,7 @@ using namespace at; using namespace torch; using namespace torch::autograd; +using torch::utils::PyObjectPreservation; namespace { class OperatorArgsKwargsView { @@ -321,20 +322,15 @@ PyObject* THPVariableClass = nullptr; PyObject* ParameterClass = nullptr; -static PyObject* THPVariable_NewWithVar( - PyTypeObject* type, - const at::TensorBase& _var, - bool allow_preexisting_pyobj = false, - std::optional has_torch_dispatch_if_known = std::nullopt); - // clang-tidy gets confused by static const static constexpr const char* VOLATILE_WARNING = "volatile was removed and now has no effect. Use " "`with torch.no_grad():` instead."; +static void TORCH_CHECK_TENSOR_SUBTYPE(PyObject* cls); + static bool check_has_torch_dispatch(PyObject* obj) { - PyTypeObject* tp = Py_TYPE(obj); - if (THPVariable_CheckTypeExact(tp)) { + if (THPVariable_CheckExact(obj)) { return false; } py::object attr = PyObject_FastGetAttrString(obj, "__torch_dispatch__"); @@ -370,152 +366,86 @@ void activateGPUTrace() { c10::impl::GPUTrace::set_trace(getPyInterpreter()); } -PyObject* THPVariable_Wrap(const at::TensorBase& var) { +static void check_tensor_subclass(PyObject* obj, PyTypeObject* type) { + TORCH_CHECK( + PyObject_TypeCheck(obj, type), + "Creating a new Tensor subclass ", + type->tp_name, + " but the raw Tensor object is already associated to a python object ", + "of type ", + Py_TYPE(obj)->tp_name, + " which is not a subclass of the requested type"); +} + +// Generic for const Tensor& or Tensor&& +template +static PyObject* THPVariable_WrapWithType( + T&& var, + std::optional desired_type) { if (!var.defined()) { Py_RETURN_NONE; } - if (c10::impl::HermeticPyObjectTLS::get_state()) { - return THPVariable_NewWithVar((PyTypeObject*)THPVariableClass, var); - } - - std::optional mb_obj = - var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj( - /*ignore_hermetic_tls=*/false); - if (mb_obj.has_value()) { - auto obj = *mb_obj; - if (obj) { - if (var.unsafeGetTensorImpl()->pyobj_slot()->owns_pyobj()) { - // C++ owns the Python object; this implies there weren't any other - // owning references to the Python object. 
Since we're making the - // object "live" again on Python side, let's flip back the ownership - // (Python owns C++) as it would now be unsound to deallocate the C++ - // object if all C++ references go to zero - var.unsafeGetTensorImpl()->pyobj_slot()->set_owns_pyobj(false); - reinterpret_cast(obj)->cdata = - MaybeOwned::owned(Variable(var)); - // NB: incref is not necessary, because we are "stealing" the previous - // ownership from the Variable to return it here for the wrap - return obj; - } - Py_INCREF(obj); - return obj; + c10::TensorImpl* tensor_impl = var.unsafeGetTensorImpl(); + c10::impl::PyObjectSlot* pyobj_slot = tensor_impl->pyobj_slot(); + + PyObject* obj = pyobj_slot->load_pyobj(); + if (obj) { + if (desired_type) { + check_tensor_subclass(obj, *desired_type); } - // TODO: a better invariant is that if we tagged, we MUST have a valid - // PyObject. That's PyObject preservation - // (https://github.com/pytorch/pytorch/pull/56017). Prior to this PR - // being a thing, the PyObject field will get cleared when all references - // to the Python object are removed. + return Py_NewRef(obj); } - if (C10_LIKELY(var.device().type() != c10::kXLA)) { - return THPVariable_NewWithVar((PyTypeObject*)THPVariableClass, var); + PyTypeObject* type = reinterpret_cast(THPVariableClass); + if (desired_type) { + type = *desired_type; + } else if (C10_UNLIKELY(var.device().type() == c10::kXLA)) { + if (auto clazz = getPythonTensorClass(var.device())) { + type = reinterpret_cast(clazz); + } } - if (auto clazz = getPythonTensorClass(var.device())) { - return THPVariable_NewWithVar((PyTypeObject*)clazz, var); - } + obj = type->tp_alloc(type, 0); + TORCH_CHECK(obj, "Failed to allocate a ", type->tp_name, " object"); - return THPVariable_NewWithVar((PyTypeObject*)THPVariableClass, var); -} + // Ensure that PyUnstable_TryIncref calls don't fail spuriously in + // free-threaded Python. + PyUnstable_EnableTryIncRef(obj); -static bool isResurrectable(THPVariable* self) { - // We want to divide this check into 2 cases. - - // 1. C++ owns PyObject (in this case, self->cdata.unsafeIsBorrowed() is - // true). You might think that in this case, it is impossible for tp_clear to - // be called: surely the C++ reference to the PyObject is keeping it live? And - // you'd be right! In fact, when C++ owns the PyObject, we have an invariant - // that the refcount on the PyObject should be precisely one (because if you - // take out another reference to the PyObject, we're supposed to flip the - // ownership pointer back). In reality, you can violate this invariant - // temporarily with weak references, so we don't test for it in asserts. - - // 2. PyObject owns C++ (in this case, self->cdata.unsafeIsBorrowed() is - // false). In this case, tp_clear can get called if the PyObject is referenced - // from a dead cycle, and nowhere else. But if resurrection did not occur, - // then the reference to C++ from the PyObject must be the ONLY reference to - // the C++ object. - if (self->cdata.unsafeIsBorrowed()) { - return false; - } - auto const& tensor = THPVariable_Unpack(self); - if (!tensor.defined() || tensor.use_count() <= 1) { - return false; - } - // Check if this is hermetic. If it is, no resurrection. 
- if (tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj( - /*ignore_hermetic_tls=*/false) != (PyObject*)self) { - return false; - } - return true; -} + auto v = reinterpret_cast(obj); + new (&v->cdata) Tensor(std::forward(var)); -// returns true if successfully rezzed; if so, cancel the -// rest of deallocation -static bool THPVariable_tryResurrect(THPVariable* self) { - const auto& tensor = THPVariable_Unpack(self); - - if (!isResurrectable(self)) { - return false; + if (THPVariable_Unpack(obj).is_uniquely_owned()) { + // We can use a faster non-atomic code path if we have the only reference to + // a fresh Tensor. + PyObjectPreservation::init_fresh_nonatomic(tensor_impl, pyobj_slot, obj); + return obj; } - // At this point, we are definitely going to resurrect the tensor. So, the - // tensor better be defined :) - TORCH_INTERNAL_ASSERT(tensor.defined()); - - // There are other C++ owners of the tensor. Flip ownership - // so that C++ owns this Python object, and cancel deallocation. - TORCH_INTERNAL_ASSERT( - !tensor.unsafeGetTensorImpl()->pyobj_slot()->owns_pyobj()); - - c10::TensorImpl* tensor_impl = tensor.unsafeGetTensorImpl(); - auto maybe_pyobj = tensor_impl->pyobj_slot()->check_pyobj( - /*ignore_hermetic_tls=*/false); - - TORCH_INTERNAL_ASSERT( - maybe_pyobj.has_value(), - "Trying to preserve a Python tensor whose PyObjectSlot does not have a PyObject"); - - tensor_impl->pyobj_slot()->set_owns_pyobj(true); - - // Resurrect the Python object. This is something CPython does - // internally occasionally, see - // https://github.com/python/cpython/blob/b98eba5bc2ffbe7a0ed49d540ebc4f756ae61985/Objects/object.c#L248-L259 - // so we just copy the pattern here. Note that we don't have to worry - // about saving and restoring the refcount (as the quoted code does) - // because we actually DO need to reset the refcount to one here, we - // can't assume that some other code has taken care of it. - // NB: this will overreport _Py_RefTotal but based on inspection of object.c - // there is no way to avoid this - - // When resurrecting, we MUST use _Py_NewReference and not Py_INCREF to - // ensure the PyObject is in a valid state - _Py_NewReference((PyObject*)self); - - // Flip THPVariable to be non-owning - // (near use-after-free miss here: fresh MaybeOwned is created breaking - // reference on Tensor in struct BEFORE we overwrite the old one) - TORCH_INTERNAL_ASSERT(!c10::impl::HermeticPyObjectTLS::get_state()); - self->cdata = MaybeOwned::borrowed(tensor); - - // NB: At this point, tensor *could* be dead (e.g., some other C++ thread - // decrefed it.) At this point, it is probably waiting on the GIL to - // deallocate the Python object and will kill self, BUT NOT YET. 
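The wrap path added in this patch publishes the wrapper into the TensorImpl's pyobj_slot exactly once instead of flipping ownership back and forth: one thread wins the publication race, and every other thread discards its candidate object and adopts the winner's wrapper. A minimal standalone sketch of that publish-once pattern follows; the names (`Wrapper`, `get_or_create`) are illustrative only and are not the `PyObjectPreservation` API used below, which additionally has to record the owning interpreter and fold a kHasPyObject bit into the intrusive refcount.

```
// Publish-once sketch: many threads race to attach a wrapper to one slot;
// exactly one compare-exchange succeeds and the losers adopt its result.
#include <atomic>
#include <cassert>
#include <thread>
#include <vector>

struct Wrapper {
  int payload;
};

std::atomic<Wrapper*> slot{nullptr};

Wrapper* get_or_create(int payload) {
  Wrapper* existing = slot.load(std::memory_order_acquire);
  if (existing) {
    return existing;  // fast path: a wrapper was already published
  }
  Wrapper* candidate = new Wrapper{payload};
  Wrapper* expected = nullptr;
  if (slot.compare_exchange_strong(
          expected, candidate, std::memory_order_acq_rel)) {
    return candidate;  // this thread won the race
  }
  delete candidate;  // another thread beat us to it
  return expected;   // adopt the wrapper that was published first
}

int main() {
  std::vector<std::thread> threads;
  std::vector<Wrapper*> results(8, nullptr);
  for (int i = 0; i < 8; ++i) {
    threads.emplace_back([i, &results] { results[i] = get_or_create(i); });
  }
  for (auto& t : threads) {
    t.join();
  }
  for (Wrapper* r : results) {
    assert(r == results[0]);  // every thread sees the same wrapper
  }
  delete results[0];
  return 0;
}
```

Here the only published state is the pointer itself; the real helper in this patch also stores the Python interpreter and does the incref/decref bookkeeping shown further down in init_once.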
+ PyObject* wrapper = + PyObjectPreservation::init_once(tensor_impl, pyobj_slot, obj); + if (wrapper != obj) { + // Another thread beat us to it + Py_DECREF(obj); + if (desired_type) { + check_tensor_subclass(wrapper, *desired_type); + } + return Py_NewRef(wrapper); + } + return obj; +} - return true; +PyObject* THPVariable_Wrap(at::TensorBase&& var) { + return THPVariable_WrapWithType(std::move(var), std::nullopt); } -static int THPFake_traverse(THPVariable* self, visitproc visit, void* arg) { - TORCH_INTERNAL_ASSERT( - false, "TensorBase tp_traverse function was not overridden properly"); - return 0; +PyObject* THPVariable_Wrap(const at::TensorBase& var) { + return THPVariable_WrapWithType(var, std::nullopt); } -static int THPFake_clear(THPVariable* self) { - TORCH_INTERNAL_ASSERT( - false, "TensorBase tp_clear function was not overridden properly"); - return 0; +PyObject* THPVariable_Wrap(const at::TensorBase& var, PyTypeObject* type) { + return THPVariable_WrapWithType(var, type); } static PyObject* THPVariable_pynew( @@ -677,16 +607,16 @@ static PyObject* THPVariable_as_subclass( ParsedArgs<1> parsed_args{}; auto r = parser.parse(_self, args, kwargs, parsed_args); PyObject* cls = r.pyobject(0); - TORCH_CHECK_TYPE( - PyType_Check(cls), - "cls must be a type (got ", - Py_TYPE(cls)->tp_name, - ")"); + TORCH_CHECK_TENSOR_SUBTYPE(cls); // guard completely turns off torch dispatch modes, doesn't just pop off the // stack torch_dispatch_mode::StashTorchDispatchStackGuard td_g; c10::impl::DisablePythonDispatcher dpd_g; - return THPVariable_NewWithVar((PyTypeObject*)cls, self.alias()); + PyObject* obj = THPVariable_WrapWithType(self.alias(), (PyTypeObject*)cls); + if (check_has_torch_dispatch(obj)) { + THPVariable_Unpack(obj).unsafeGetTensorImpl()->set_python_dispatch(true); + } + return obj; END_HANDLE_TH_ERRORS } @@ -701,11 +631,7 @@ static PyObject* THPVariable_make_subclass( ParsedArgs<7> parsed_args{}; auto r = parser.parse(args, kwargs, parsed_args); PyObject* cls = r.pyobject(0); - TORCH_CHECK_TYPE( - PyType_Check(cls), - "cls must be a type (got ", - Py_TYPE(cls)->tp_name, - ")"); + TORCH_CHECK_TENSOR_SUBTYPE(cls); // guard completely turns off torch dispatch modes, doesn't just pop off the // stack torch_dispatch_mode::StashTorchDispatchStackGuard td_g; @@ -738,7 +664,11 @@ static PyObject* THPVariable_make_subclass( data.unsafeGetTensorImpl()->_change_backend_component_keys(r.device(6)); } - return THPVariable_NewWithVar((PyTypeObject*)cls, data); + PyObject* obj = THPVariable_WrapWithType(data, (PyTypeObject*)cls); + if (check_has_torch_dispatch(obj)) { + THPVariable_Unpack(obj).unsafeGetTensorImpl()->set_python_dispatch(true); + } + return obj; END_HANDLE_TH_ERRORS } @@ -835,11 +765,7 @@ static PyObject* THPVariable_make_wrapper_subclass( auto r = parser.parse(args, kwargs, parsed_args); PyObject* cls = r.pyobject(0); - TORCH_CHECK_TYPE( - PyType_Check(cls), - "cls must be a type (got ", - Py_TYPE(cls)->tp_name, - ")"); + TORCH_CHECK_TENSOR_SUBTYPE(cls); // This is an important safety check; without it, the default behavior will be // to continue on to the underlying CPU/CUDA kernel advertised by the dispatch @@ -877,6 +803,8 @@ static PyObject* THPVariable_make_wrapper_subclass( /*storage_size=*/r.toSymIntOptional(14), r.toDispatchKeySetOptional(13)); + tensor.unsafeGetTensorImpl()->set_python_dispatch(true); + const auto sizes_strides_policy = r.stringViewOptional(10); if (sizes_strides_policy.has_value()) { tensor.unsafeGetTensorImpl()->set_python_custom_sizes_strides( @@ 
-892,13 +820,7 @@ static PyObject* THPVariable_make_wrapper_subclass( tensor.unsafeGetTensorImpl()->set_python_custom_layout(true); } - return THPVariable_NewWithVar( - (PyTypeObject*)cls, - tensor, - // false is the default - /*allow_preexisting_pyobj=*/false, - // we checked __torch_dispatch__ above; avoid checking again. - /*has_torch_dispatch_if_known=*/true); + return THPVariable_WrapWithType(std::move(tensor), (PyTypeObject*)cls); END_HANDLE_TH_ERRORS } @@ -1699,11 +1621,7 @@ static PyObject* THPVariable_dtensor_new( auto r = parser.parse(args, kwargs, parsed_args); PyObject* cls = r.pyobject(0); - TORCH_CHECK_TYPE( - PyType_Check(cls), - "cls must be a type (got ", - Py_TYPE(cls)->tp_name, - ")"); + TORCH_CHECK_TENSOR_SUBTYPE(cls); #ifndef NDEBUG // This is specifically for making a DTensor, which we know defines @@ -1756,14 +1674,9 @@ static PyObject* THPVariable_dtensor_new( /*storage_size=*/std::nullopt, extra_dispatch_keys); tensor.set_requires_grad(requires_grad); - py::object py_tensor = - py::reinterpret_steal(THPVariable_NewWithVar( - (PyTypeObject*)cls, - tensor, - // false is the default - /*allow_preexisting_pyobj=*/false, - // we know DTensor has __torch_dispatch__; avoid checking again. - /*has_torch_dispatch_if_known=*/true)); + tensor.unsafeGetTensorImpl()->set_python_dispatch(true); + py::object py_tensor = py::reinterpret_steal( + THPVariable_WrapWithType(std::move(tensor), (PyTypeObject*)cls)); py_tensor.attr(dtensor_interned_strings._spec) = spec; py_tensor.attr(dtensor_interned_strings._local_tensor) = local_tensor; return py_tensor.release().ptr(); @@ -3440,15 +3353,16 @@ static PyTypeObject THPVariableMetaType = { nullptr, /* tp_new */ }; +static void THPVariable_dealloc(PyObject* self); +static int THPVariable_clear(THPVariable* self); +static int THPVariable_traverse(PyObject* self, visitproc visit, void* arg); + static PyTypeObject THPVariableType = { PyVarObject_HEAD_INIT(&THPVariableMetaType, 0) "torch._C.TensorBase", /* tp_name */ sizeof(THPVariable), /* tp_basicsize */ 0, /* tp_itemsize */ - // This is unspecified, because it is illegal to create a THPVariableType - // directly. Subclasses will have their tp_dealloc set appropriately - // by the metaclass - nullptr, /* tp_dealloc */ + THPVariable_dealloc, /* tp_dealloc */ 0, /* tp_vectorcall_offset */ nullptr, /* tp_getattr */ nullptr, /* tp_setattr */ @@ -3467,9 +3381,8 @@ static PyTypeObject THPVariableType = { Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC, /* tp_flags */ nullptr, /* tp_doc */ - // Also set by metaclass - (traverseproc)THPFake_traverse, /* tp_traverse */ - (inquiry)THPFake_clear, /* tp_clear */ + (traverseproc)THPVariable_traverse, /* tp_traverse */ + (inquiry)THPVariable_clear, /* tp_clear */ nullptr, /* tp_richcompare */ 0, /* tp_weaklistoffset */ nullptr, /* tp_iter */ @@ -3498,345 +3411,68 @@ PyObject* THPVariable_pynew( type != &THPVariableType, "Cannot directly construct TensorBase; subclass it and then construct that"); jit::tracer::warn("torch.Tensor", jit::tracer::WARN_CONSTRUCTOR); - auto tensor = torch::utils::base_tensor_ctor(args, kwargs); // WARNING: tensor is NOT guaranteed to be a fresh tensor; e.g., if it was // given a raw pointer that will refcount bump // NB: base_tensor_ctor can call into dispatched ATen functions (e.g., // alias(), lift_fresh()) which can return Tensor subclasses. We allow // these to be passed on directly. 
- return THPVariable_NewWithVar( - type, - tensor, - /*allow_preexisting_pyobj=*/true); + PyObject* obj = THPVariable_WrapWithType( + torch::utils::base_tensor_ctor(args, kwargs), type); + if (check_has_torch_dispatch(obj)) { + THPVariable_Unpack(obj).unsafeGetTensorImpl()->set_python_dispatch(true); + } + return obj; END_HANDLE_TH_ERRORS } -static int THPVariable_subclass_clear(THPVariable* self) { - // Is it OK for an object to still be live after running - // tp_clear? Yes. When Python is breaking reference cycles, it can't assume - // that an object will dealloc after it's cleared. The source code explicitly - // handles this case: - // https://github.com/python/cpython/blob/4e661cd69164318c1f871faa476c68a04092ddc4/Modules/gcmodule.c#L1010-L1025 - - // Note that we don't need to actually resurrect here. There are 2 cases: - // 1. The PyObject is not part of a reference cycle. In this case, we don't - // need to do anything. The GC will move on to try and break the reference - // cycle on another object, which will eventually trigger tp_dealloc (and thus - // resurrection). - - // 2. The PyObject is part of a reference cycle. This case should not actually - // be possible, due to the logic in our tp_traverse - // (THPVariable_subclass_traverse). - - // In fact, resurrecting here breaks the invariant that "C++ owns Python only - // when PyObject's refcount would otherwise be 0". Most immediately, as we're - // merely breaking reference cycles here, there can be other references to the - // PyObject. *However*, if other objects in the refcycle resurrect, then we - // will be in a state where the PyObject has multiple Python references, yet - // C++ owns the PyObject. - - // See https://github.com/pytorch/pytorch/pull/75933 for more discussion. - if (isResurrectable(self)) { - return 0; - } - +static int THPVariable_clear(THPVariable* self) { // First clear Tensor specific things - Py_CLEAR(self->backward_hooks); Py_CLEAR(self->post_accumulate_grad_hooks); - const auto& tensor = THPVariable_Unpack(self); - if (tensor.defined()) { - // Two situations to consider: - // PyObject -owns-> Tensor - // unsafeIsBorrowed() is FALSE. We're obligated to look through - // Tensor to break references. Clearing cdata must induce the - // destruction of the C++ Tensor. If there were other references - // to C++ tensor, the Python object would have been resurrected - // by flipping the ownership. - // Tensor -owns-> PyObject - // unsafeIsBorrowed() is TRUE. We're deallocating the PyObject - // because Tensor asked us to (it's already destructing). - - if (!self->cdata.unsafeIsBorrowed() && - tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj( - /*ignore_hermetic_tls=*/false) == (PyObject*)self) { - // TODO: empirically, on OS X this assert appears to be untrue - // In test_py_tensors_multi_async_call - ProcessGroupRpcTestWithSpawn - // distributed/rpc/test_process_group_agent.py - // - // libc++abi.dylib: terminating with uncaught exception of type - // c10::Error: - // !tensor.unsafeGetTensorImpl()->pyobj_slot()->owns_pyobj()INTERNAL - // ASSERT FAILED at "../torch/csrc/autograd/python_variable.cpp":171, - // please report a bug to PyTorch. 
Exception raised from - // THPVariable_subclass_clear at - // ../torch/csrc/autograd/python_variable.cpp:171 (most recent call - // first): frame #0: c10::Error::Error(c10::SourceLocation, - // std::__1::basic_string, - // std::__1::allocator >) + 98 (0x1158a0442 in libc10.dylib) frame - // #1: c10::detail::torchCheckFail(char const*, char const*, unsigned - // int, char const*) + 205 (0x11589ed3d in libc10.dylib) frame #2: - // c10::detail::torchInternalAssertFail(char const*, char const*, - // unsigned int, char const*, c10::detail::CompileTimeEmptyString) + 9 - // (0x1141e3f89 in libtorch_python.dylib) frame #3: - // THPVariable_subclass_clear(THPVariable*) + 412 (0x1148a547c in - // libtorch_python.dylib) frame #4: - // THPVariable_subclass_dealloc(_object*) + 453 (0x1148a5035 in - // libtorch_python.dylib) frame #5: (anonymous - // namespace)::concrete_decref_fn(c10::impl::PyInterpreter const*, - // _object*) + 53 (0x1148a5ea5 in libtorch_python.dylib) frame #6: - // c10::TensorImpl::release_resources() + 182 (0x11588c4a6 in - // libc10.dylib) frame #7: - // c10::MaybeOwned::operator=(c10::MaybeOwned&&) - // + 91 (0x11488c11b in libtorch_python.dylib) frame #8: - // THPVariable_subclass_dealloc(_object*) + 607 (0x1148a50cf in - // libtorch_python.dylib) frame #47: start + 1 - // (0x7fff6ffc7cc9 in libdyld.dylib) frame #48: 0x0 + 4 (0x4 in ???) - // TORCH_INTERNAL_ASSERT(!tensor.unsafeGetTensorImpl()->pyobj_slot()->owns_pyobj()); - if (auto grad_acc = - torch::autograd::impl::try_get_grad_accumulator(tensor)) { - grad_acc->pre_hooks().clear(); - grad_acc->tensor_pre_hooks().clear(); - grad_acc->retains_grad_hooks().clear(); - } + if (self->cdata.defined()) { + auto pyobj_slot = self->cdata.unsafeGetTensorImpl()->pyobj_slot(); + // Typically the Tensor's pyobj_slot points back to this object. The only + // time that's not the case is if we had a race in THPVariable_Wrap and we + // need to discard the Python object because some other thread beat us to + // setting the pyobj_slot. + if (pyobj_slot->load_pyobj() == (PyObject*)self) { + // A Tensor's Python object should only be destroyed when the Tensor has + // no other references too. + TORCH_INTERNAL_ASSERT(self->cdata.use_count() == 1); + + // Clear the pyobj_slot so that a try_incref() call from + // weak_intrusive_ptr::lock() won't see a freed pointer. + pyobj_slot->clear(); } } - TORCH_INTERNAL_ASSERT(!isResurrectable(self)); { // MapAllocator can take significant time to release large tensors; // release the GIL here to avoid impacting main thread perf. pybind11::gil_scoped_release no_gil; - self->cdata = MaybeOwned(); + self->cdata = Variable(); } - // Since we override the basic subtype_clear from CPython, we need a crappy - // version here just like for traverse and dealloc - - // Clear all slots until we get to the base Tensor class - PyTypeObject* type = Py_TYPE((PyObject*)self); - PyTypeObject* base = type; - while (base != &THPVariableType) { - if (Py_SIZE(base)) - clear_slots(base, (PyObject*)self); - base = base->tp_base; - TORCH_INTERNAL_ASSERT(base); - } - - // Assume we never have managed dict for Tensors as we don't set the flag on - // the base class - if (C10_LIKELY(type->tp_dictoffset)) { - PyObject** dictptr = _PyObject_GetDictPtr((PyObject*)self); - if (dictptr && *dictptr) - Py_CLEAR(*dictptr); - } - return 0; } -// NB: this is not the tp_dealloc on THPVariable; instead, its the dealloc -// on subclasses. 
It's never valid to construct a THPVariable so it's not -// necessary to implement the dealloc for that case -static void THPVariable_subclass_dealloc(PyObject* self) { - if (THPVariable_tryResurrect((THPVariable*)self)) - return; - - // This is like a crappy version of subtype_dealloc. - // Unfortunately, we cannot directly delegate to - // subtype_dealloc as it will start walking the parent - // chain *starting with* the type of self, which will cause - // us to go back to our custom dealloc. - // - // We have to replicate the subtype_dealloc logic to ensure - // that finalizers are handled correctly - PyTypeObject* type = Py_TYPE(self); - TORCH_INTERNAL_ASSERT(type->tp_flags & Py_TPFLAGS_HEAPTYPE); - TORCH_INTERNAL_ASSERT(PyType_IS_GC(type), "GC types not implemented"); - +static void THPVariable_dealloc(PyObject* self) { PyObject_GC_UnTrack(self); - // TODO: consider using trash can - - bool has_finalizer = type->tp_finalize || type->tp_del; - - if (type->tp_finalize) { - PyObject_GC_Track(self); - if (PyObject_CallFinalizerFromDealloc(self) < 0) { - /* Resurrected */ - return; - } - PyObject_GC_UnTrack(self); - } - - // base test is unnecessary as THPVariable does not set this - if (type->tp_weaklistoffset) { - PyObject_ClearWeakRefs(self); - } - - if (type->tp_del) { - PyObject_GC_Track(self); - type->tp_del(self); - if (Py_REFCNT(self) > 0) { - /* Resurrected */ - return; - } - PyObject_GC_UnTrack(self); - } - - if (has_finalizer) { - /* New weakrefs could be created during the finalizer call. - If this occurs, clear them out without calling their - finalizers since they might rely on part of the object - being finalized that has already been destroyed. */ - if (type->tp_weaklistoffset) { - /* Modeled after GET_WEAKREFS_LISTPTR() */ - PyWeakReference** list = - (PyWeakReference**)PyObject_GET_WEAKREFS_LISTPTR(self); - while (*list) - _PyWeakref_ClearRef(*list); - } - } - - // Clear all slots until we get to base class THPVariableType - { - PyTypeObject* base = type; - while (base != &THPVariableType) { - if (Py_SIZE(base)) { - clear_slots(base, self); - } - base = base->tp_base; - TORCH_INTERNAL_ASSERT(base); - } - } - - // All Python defined classes have __dict__ - if (C10_LIKELY(type->tp_dictoffset)) { - PyObject** dictptr = _PyObject_GetDictPtr(self); - if (dictptr != nullptr) { - PyObject* dict = *dictptr; - if (dict != nullptr) { - Py_DECREF(dict); - *dictptr = nullptr; - } - } - } - - // subtype_dealloc allows for this but we don't - TORCH_INTERNAL_ASSERT(Py_TYPE(self) == type); - - // Finally clear out the base THPVariable - THPVariable_subclass_clear((THPVariable*)self); - ((THPVariable*)self)->cdata.~MaybeOwned(); + THPVariable_clear((THPVariable*)self); + ((THPVariable*)self)->cdata.~Variable(); Py_TYPE(self)->tp_free(self); - - // Python defined subclasses should always be on the heap - TORCH_INTERNAL_ASSERT(type->tp_flags & Py_TPFLAGS_HEAPTYPE); - Py_DECREF(type); } -// Creates a new Python object for a Variable. -static PyObject* THPVariable_NewWithVar( - PyTypeObject* type, - const at::TensorBase& _var, - bool allow_preexisting_pyobj, - std::optional has_torch_dispatch_if_known) { - // Make sure that the reinterpret into a THPVariable* will be valid - TORCH_CHECK( - type == &THPVariableType || PyType_IsSubtype(type, &THPVariableType), - "Creating a Tensor subclass from a class ", - "that does not inherit from Tensor is not possible. 
Make sure your class inherits from Tensor."); - - // This function overwrite the Tensor's pyobj field without extra checks - // Make sure it is not set otherwise we would leak memory - auto mb_obj = _var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj( - /*ignore_hermetic_tls=*/false); - - // Under some circumstances, we may attempt to create a new Python - // object for a variable that already has a Python object. The most common - // situation this can occur is if you have a TorchDispatchMode active that - // is returning a subclass from lift_fresh (which is invoked to - // appropriately "wrap" a constant tensor into whatever ambient modes are - // active.) - // - // In general, it is impossible to handle this case compositionally. - // Suppose you have a user call ATensor([1, 2, 3]) when a mode is active - // that is transforming all ops (including the internal lift_fresh call that - // transforms [1, 2, 3] into a torch.tensor([1., 2., 3.])) to output - // BTensor, where ATensor and BTensor are completely unrelated subclasses - // and there is no way to compose them. There is no way to satisfy the user - // request here: in particular, you can't just try to re-invoke the ATensor - // constructor on the returned BTensor, because (1) this could cause an - // infinite loop--we are already in ATensor.__new__ and (2) there isn't any - // guarantee that ATensor.__new__ supports a single element constructor - // anyway. - // - // However, a more common case is a user just called torch.Tensor([1, 2, 3]), - // and a fake tensor mode is active. Really, all you want is to get back - // a FakeTensor, in the same way torch.tensor([1, 2, 3]) or torch.arange(3) - // would have returned a fake tensor (concretely, the way this happens - // is we create a *real* tensor torch.tensor([1., 2., 3.]), and then it - // turns into a FakeTensor when we call lift_fresh on this real tensor). - // This case is compositional because FakeTensor is a subclass of Tensor, so - // it's valid for us to return it in place of a Tensor. So this is what we - // do. - - if (mb_obj.has_value() && mb_obj.value()) { - TORCH_CHECK( - allow_preexisting_pyobj, - "Creating a new Tensor subclass ", - type->tp_name, - " but the raw Tensor object is already associated to a python object ", - "of type ", - mb_obj.value()->ob_type->tp_name); - // Even if we allow pre-existing PyObject, we don't allow completely - // ignoring the requested type. Check that we fulfilled a subtype - // relation here. In the common case the requested type is Tensor and - // this always succeeds. - PyObject* obj = *mb_obj; - // Check if it's OK to just directly return the Python object without - // allocating a new variable. We just check that the existing Python - // object is a subclass of the requested type. 
- PyTypeObject* obj_type = Py_TYPE(obj); - TORCH_CHECK( - obj_type == type || PyType_IsSubtype(obj_type, type), - "Creating a new Tensor subclass ", - type->tp_name, - " but the raw Tensor object is already associated to a python object ", - "of type ", - mb_obj.value()->ob_type->tp_name, - " which is not a subclass of the " - "requested type"); - // We may (in fact, we typically will) need to resurrect this - return THPVariable_Wrap(_var); - } - - PyObject* obj = type->tp_alloc(type, 0); - if (obj) { - auto v = (THPVariable*)obj; - // TODO: named constructor to avoid default initialization - new (&v->cdata) MaybeOwned(); - if (c10::impl::HermeticPyObjectTLS::get_state()) { - // Do NOT initialize pyobj field on the tensor, you own the C++ - v->cdata = MaybeOwned::owned(Variable(_var)); - TORCH_INTERNAL_ASSERT( - !check_has_torch_dispatch(obj), - "While HermeticPyObject was enabled, we attempted to create a tensor " - "subclass with __torch_dispatch__. This violates the invariant that " - "operations in HermeticPyObject have equivalent C++ implementations. " - "If your operator registered from Python operator registration isn't " - "doing anything strange, there may be an internal PyTorch bug involving " - "not appropriately disabling TorchDispatchMode before executing " - "Python op registration."); - } else { - // Normal codepath - v->cdata = MaybeOwned::owned(Variable(_var)); - const auto& var = THPVariable_Unpack(v); - var.unsafeGetTensorImpl()->pyobj_slot()->init_pyobj(obj); - if (has_torch_dispatch_if_known.has_value() - ? *has_torch_dispatch_if_known - : check_has_torch_dispatch(obj)) { - var.unsafeGetTensorImpl()->set_python_dispatch(true); - } - } - } - return obj; +static void TORCH_CHECK_TENSOR_SUBTYPE(PyObject* cls) { + TORCH_CHECK_TYPE( + PyType_Check(cls), + "cls must be a type (got ", + Py_TYPE(cls)->tp_name, + ")"); + PyTypeObject* type = reinterpret_cast(cls); + TORCH_CHECK_TYPE( + type == &THPVariableType || cls == THPVariableClass || + PyType_IsSubtype(type, &THPVariableType), + "Creating a Tensor subclass from a class that does not inherit from " + "Tensor is not possible. Make sure your class inherits from Tensor."); } /// NOTE [ PyObject Traversal ] @@ -3855,7 +3491,7 @@ static PyObject* THPVariable_NewWithVar( /// into account these C++ ownership links. /// /// The main danger here comes from the fact that, while all python-related code -/// is thread safe wrt the GC execution (thanks to the GIL), other threads might +/// is thread safe wrt the GC execution, other threads might /// be using our C++ objects arbitrarily which can lead to shared_ptr ref count /// going up or down in between the different traverse/clear invocations. 
The /// one constraint we add here that is not explicitly mentioned in the GC @@ -3885,124 +3521,46 @@ static PyObject* THPVariable_NewWithVar( /// https://github.com/pytorch/pytorch/issues/7343 /// -static int traverse_slots( - PyTypeObject* type, - PyObject* self, - visitproc visit, - void* arg) { - auto n = Py_SIZE(type); - auto mp = type->tp_members; - for (Py_ssize_t i = 0; i < n; i++, mp++) { - if (mp->type == T_OBJECT_EX) { - char* addr = (char*)self + mp->offset; - PyObject* obj = *(PyObject**)addr; - if (obj != nullptr) { - int err = visit(obj, arg); - if (err) - return err; - } - } - } - return 0; -} - -static int THPVariable_subclass_traverse( - PyObject* self, - visitproc visit, - void* arg) { - // If the tensor is eligible to be resurrected, don't traverse it; instead - // treat all of its references as a root (as they WOULD be a root since we - // can treat the inbound C++ references as root owners). - // - // This works because unlike conventional GCs, Python's GC operates in two - // phases: first it uses traverse to discover roots, and then it uses traverse - // to do reachability. Bypassing traverse during root discovery forces Python - // to treat self as a root for everything it refers to. For a full - // explanation of the algorithm see - // https://devguide.python.org/garbage_collector/ - // - // NB: if we don't hold an owning reference to the underlying Tensor, it is - // possible that the underlying Tensor has already gone dead. In that case, - // it's not safe to access it. But it's also safe to traverse, because if - // the underlying Tensor *is* live, then root discovery will determine that - // self is live, and nothing will get GC'ed anyway (resurrection cannot happen - // if the C++ objects owns the PyObject) +static int THPVariable_traverse(PyObject* self, visitproc visit, void* arg) { THPVariable* var = reinterpret_cast(self); - if (isResurrectable(var)) { - return 0; - } - - // Crappy version of subtype_traverse; same deal as - // THPVariable_subclass_dealloc - - PyTypeObject* type = Py_TYPE(self); - // Traverse slots until we get to base class THPVariableType - { - PyTypeObject* base = type; - while (base != &THPVariableType) { - if (Py_SIZE(base)) { - int err = traverse_slots(base, self, visit, arg); - if (err) - return err; - } - base = base->tp_base; - TORCH_INTERNAL_ASSERT(base); - } - } - - // All Python defined classes have __dict__ - if (C10_LIKELY(type->tp_dictoffset)) { - PyObject** dictptr = _PyObject_GetDictPtr(self); - if (dictptr && *dictptr) - Py_VISIT(*dictptr); - } - - TORCH_INTERNAL_ASSERT(type->tp_flags & Py_TPFLAGS_HEAPTYPE); - Py_VISIT(type); - - // Finally traverse THPVariable special stuff Py_VISIT(var->backward_hooks); Py_VISIT(var->post_accumulate_grad_hooks); - if (!var->cdata.unsafeIsBorrowed()) { - const auto& tensor = THPVariable_Unpack(var); - if (tensor.defined()) { - // WARNING: The grad_fn traversal logic is very subtle, if you change - // this, be very careful not to re-introduce this bug: - // https://gist.github.com/zou3519/7ac92b84dd7d206dcc6eae55fee8372c - - // We ensure that we follow NOTE [ PyObject Traversal ] he by checking - // that this python object is the sole owner of the underlying Tensor and - // that this Tensor is the sole owner of its grad_fn. In this case, the - // only way to get a new reference to the grad_fn is by using this python - // object, which requires the GIL to be accessed. 
Note that this is only - // valid as long as user don't share non-owning references across - // different threads (which is crazy and should never be done). - auto autograd_meta = torch::autograd::impl::get_autograd_meta(tensor); - if (tensor.use_count() == 1) { - if (autograd_meta) { - // Do NOT call grad_fn() here as that might trigger a recompute - const auto& grad_fn = autograd_meta->grad_fn_; - if (grad_fn && grad_fn.use_count() == 1) { - // All Node can have a pyobj (stored in "pyobj_") - Py_VISIT(grad_fn->pyobj()); - // PyNode are special as they also have an "obj" field - if (auto py_node_fn = dynamic_cast(grad_fn.get())) { - Py_VISIT(py_node_fn->obj); - } + const auto& tensor = THPVariable_Unpack(var); + if (tensor.defined()) { + // WARNING: The grad_fn traversal logic is very subtle, if you change + // this, be very careful not to re-introduce this bug: + // https://gist.github.com/zou3519/7ac92b84dd7d206dcc6eae55fee8372c + + // We ensure that we follow NOTE [ PyObject Traversal ] he by checking + // that this python object is the sole owner of the underlying Tensor and + // that this Tensor is the sole owner of its grad_fn. In this case, the + // only way to get a new reference to the grad_fn is by using this python + // object, which requires the GIL to be accessed. Note that this is only + // valid as long as user don't share non-owning references across + // different threads (which is crazy and should never be done). + auto autograd_meta = torch::autograd::impl::get_autograd_meta(tensor); + if (tensor.use_count() == 1) { + if (autograd_meta) { + // Do NOT call grad_fn() here as that might trigger a recompute + const auto& grad_fn = autograd_meta->grad_fn_; + if (grad_fn && grad_fn.use_count() == 1) { + // All Node can have a pyobj (stored in "pyobj_") + Py_VISIT(grad_fn->pyobj()); + // PyNode are special as they also have an "obj" field + if (auto py_node_fn = dynamic_cast(grad_fn.get())) { + Py_VISIT(py_node_fn->obj); } } } - if (autograd_meta) { - for (const auto& hook : torch::autograd::impl::hooks(tensor)) { - if (auto pyhook = - dynamic_cast(hook.get())) { - Py_VISIT(pyhook->dict); - } + } + if (autograd_meta) { + for (const auto& hook : torch::autograd::impl::hooks(tensor)) { + if (auto pyhook = dynamic_cast(hook.get())) { + Py_VISIT(pyhook->dict); } } } } - return 0; } @@ -4010,17 +3568,6 @@ int THPVariableMetaType_init(PyObject* cls, PyObject* args, PyObject* kwargs) { if (PyType_Type.tp_init(cls, args, kwargs) < 0) { return -1; } - // It is important for all three of these to be overridden correctly for the - // resurrection checks to properly happen. In particular, an older version - // was not overriding tp_clear here. This lead to the default subtype_clear - // running on the Tensor object (as only TensorBase tp_clear was custom), - // clearing the __dict__ field, before the TensorBase custom clear was called - // and would properly detect the resurrect. 
- // See https://github.com/pytorch/pytorch/issues/136358 for the exact behavior - ((PyTypeObject*)cls)->tp_dealloc = (destructor)THPVariable_subclass_dealloc; - ((PyTypeObject*)cls)->tp_traverse = - (traverseproc)THPVariable_subclass_traverse; - ((PyTypeObject*)cls)->tp_clear = (inquiry)THPVariable_subclass_clear; // Don't do anything for the base Tensor class if (!THPVariableClass) { diff --git a/torch/csrc/autograd/python_variable.h b/torch/csrc/autograd/python_variable.h index 1b2116ec1ee6b..5b6f089990693 100644 --- a/torch/csrc/autograd/python_variable.h +++ b/torch/csrc/autograd/python_variable.h @@ -17,7 +17,7 @@ namespace py = pybind11; struct THPVariable { PyObject_HEAD // Payload - c10::MaybeOwned cdata; + at::Tensor cdata; // Hooks to be run on backwards pass (corresponds to Python attr // '_backwards_hooks', set by 'register_hook') PyObject* backward_hooks = nullptr; @@ -37,7 +37,11 @@ TORCH_PYTHON_API extern PyObject* THPVariableClass; TORCH_PYTHON_API extern PyObject* ParameterClass; bool THPVariable_initModule(PyObject* module); +TORCH_PYTHON_API PyObject* THPVariable_Wrap(at::TensorBase&& var); TORCH_PYTHON_API PyObject* THPVariable_Wrap(const at::TensorBase& var); +TORCH_PYTHON_API PyObject* THPVariable_Wrap( + const at::TensorBase& var, + PyTypeObject* type); inline bool THPVariable_CheckTypeExact(PyTypeObject* tp) { // Check that a python object is a `Tensor`, but not a `Tensor` subclass. @@ -69,7 +73,7 @@ inline bool THPVariable_Check(PyObject* obj) { } inline const at::Tensor& THPVariable_Unpack(THPVariable* var) { - return *var->cdata; + return var->cdata; } inline const at::Tensor& THPVariable_Unpack(PyObject* obj) { diff --git a/torch/csrc/autograd/utils/grad_layout_contract.h b/torch/csrc/autograd/utils/grad_layout_contract.h index ed97dc4530eb4..00bdb91c36867 100644 --- a/torch/csrc/autograd/utils/grad_layout_contract.h +++ b/torch/csrc/autograd/utils/grad_layout_contract.h @@ -65,7 +65,9 @@ inline at::Tensor clone_obey_contract( .new_empty_strided_symint( variable.sym_sizes(), variable.sym_strides(), - variable.options().memory_format(std::nullopt)) + variable.options() + .memory_format(std::nullopt) + .dtype(new_grad.dtype())) .copy_(new_grad)); } else { // (2) diff --git a/torch/csrc/autograd/utils/wrap_outputs.h b/torch/csrc/autograd/utils/wrap_outputs.h index 6e0494df5cf47..616b0fa0331bc 100644 --- a/torch/csrc/autograd/utils/wrap_outputs.h +++ b/torch/csrc/autograd/utils/wrap_outputs.h @@ -70,6 +70,10 @@ inline PyObject* wrap(const at::Tensor& tensor) { return THPVariable_Wrap(tensor); } +inline PyObject* wrap(at::Tensor&& tensor) { + return THPVariable_Wrap(std::move(tensor)); +} + inline PyObject* wrap(const at::Scalar& scalar) { return wrap(scalar_to_tensor(scalar)); } diff --git a/torch/csrc/autograd/variable.h b/torch/csrc/autograd/variable.h index a297a9f5ef425..05dbfdaa44325 100644 --- a/torch/csrc/autograd/variable.h +++ b/torch/csrc/autograd/variable.h @@ -197,6 +197,22 @@ TORCH_API std::unique_ptr& post_acc_grad_hooks( TORCH_API void create_cpp_hook( const at::TensorBase& /*self*/, bool is_retains_grad_hooks = false); + +inline bool is_tensor_stealable( + const at::Tensor& new_grad, + size_t num_expected_refs = 1) { + size_t use_count = new_grad.use_count(); + if (use_count <= num_expected_refs) { + return true; + } + if (use_count >= 2 && + new_grad.unsafeGetTensorImpl()->pyobj_slot()->has_unique_reference()) { + // The Python wrapper, if it exists, also has a reference to the Tensor. 
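+    // e.g. with num_expected_refs == 1 and use_count() == 2, where the extra
+    // reference is the one held by the Python wrapper, the increment below
+    // raises the expected count to 2 and the Tensor is treated as stealable.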
+ num_expected_refs++; + } + return use_count <= num_expected_refs; +} + } // namespace impl //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -894,7 +910,7 @@ inline Variable make_variable( bool requires_grad = false, bool allow_tensor_metadata_change = true) { if (data.defined()) { - if (data.getIntrusivePtr().use_count() == 1 && + if (impl::is_tensor_stealable(data) && data.getIntrusivePtr()->unique_version()) { auto data_impl = data.unsafeReleaseIntrusivePtr(); data_impl->set_allow_tensor_metadata_change(allow_tensor_metadata_change); diff --git a/torch/csrc/utils/pyobject_preservation.cpp b/torch/csrc/utils/pyobject_preservation.cpp index 4f2d0a2507011..a652cbdb7aefd 100644 --- a/torch/csrc/utils/pyobject_preservation.cpp +++ b/torch/csrc/utils/pyobject_preservation.cpp @@ -1,19 +1,67 @@ #include -#include - -void clear_slots(PyTypeObject* type, PyObject* self) { - Py_ssize_t n = Py_SIZE(type); - PyMemberDef* mp = type->tp_members; - - for (Py_ssize_t i = 0; i < n; i++, mp++) { - if (mp->type == T_OBJECT_EX && !(mp->flags & READONLY)) { - char* addr = (char*)self + mp->offset; - PyObject* obj = *(PyObject**)addr; - if (obj != nullptr) { - *(PyObject**)addr = nullptr; - Py_DECREF(obj); - } +#include +#include + +namespace torch::utils { + +using c10::intrusive_ptr_target; +using c10::impl::PyObjectSlot; + +void PyObjectPreservation::init_fresh_nonatomic( + intrusive_ptr_target* target, + PyObjectSlot* slot, + PyObject* pyobj) { + TORCH_INTERNAL_ASSERT(slot->load_pyobj() == nullptr); + TORCH_INTERNAL_ASSERT( + target->combined_refcount_.load(std::memory_order_relaxed) == + c10::detail::kUniqueRef); + + slot->pyobj_.store(pyobj, std::memory_order_relaxed); + slot->pyobj_interpreter_.store( + c10::impl::getGlobalPyInterpreter(), std::memory_order_relaxed); + target->combined_refcount_.store( + c10::detail::kHasPyObject | c10::detail::kUniqueRef, + std::memory_order_relaxed); +} + +PyObject* PyObjectPreservation::init_once( + intrusive_ptr_target* target, + PyObjectSlot* slot, + PyObject* pyobj) { + PyObject* expected = nullptr; + if (!slot->pyobj_.compare_exchange_strong( + expected, pyobj, std::memory_order_acq_rel)) { + TORCH_INTERNAL_ASSERT(expected != nullptr); + return expected; + } + + slot->pyobj_interpreter_.store( + c10::impl::getGlobalPyInterpreter(), std::memory_order_release); + + bool increfed = false; + auto combined = target->combined_refcount_.load(std::memory_order_relaxed); + do { + TORCH_INTERNAL_ASSERT(!c10::detail::has_pyobject(combined)); + if (c10::detail::refcount(combined) > 1 && !increfed) { + // We need to incref the object to preserve the invariant that + // if refcount > 1, the c10 object holds a reference to the PyObject. + // This must happen before we set the kHasPyObject bit. 
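+      // `increfed` keeps this to a single INCREF across retries of the weak
+      // compare-exchange (which reloads `combined` on failure); the fixup
+      // after the loop undoes it if the successful exchange saw refcount == 1.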
+ Py_INCREF(pyobj); + increfed = true; } + } while (!target->combined_refcount_.compare_exchange_weak( + combined, + combined | c10::detail::kHasPyObject, + std::memory_order_acq_rel, + std::memory_order_relaxed)); + + if (increfed && c10::detail::refcount(combined) == 1) { + // Fix up if refcount if we did the incref in a failed compare-exchange + Py_DECREF(pyobj); } + + return pyobj; } + +} // namespace torch::utils diff --git a/torch/csrc/utils/pyobject_preservation.h b/torch/csrc/utils/pyobject_preservation.h index 456095d7b7037..b060bc034b2c3 100644 --- a/torch/csrc/utils/pyobject_preservation.h +++ b/torch/csrc/utils/pyobject_preservation.h @@ -4,4 +4,28 @@ // This file contains utilities used for handling PyObject preservation -void clear_slots(PyTypeObject* type, PyObject* self); +namespace c10 { +class intrusive_ptr_target; +namespace impl { +struct PyObjectSlot; +} // namespace impl +} // namespace c10 + +namespace torch::utils { + +class PyObjectPreservation { + public: + // Store a PyObject wrapper on a fresh c10 wrapper. The caller must hold + // a unique reference to `target`. + static void init_fresh_nonatomic( + c10::intrusive_ptr_target* target, + c10::impl::PyObjectSlot* slot, + PyObject* pyobj); + + static PyObject* init_once( + c10::intrusive_ptr_target* target, + c10::impl::PyObjectSlot* slot, + PyObject* pyobj); +}; + +} // namespace torch::utils From 2f74916e36cfc7b78cfb6fc298ea31624ab26c27 Mon Sep 17 00:00:00 2001 From: IvanKobzarev Date: Mon, 17 Nov 2025 03:03:17 -0800 Subject: [PATCH 19/47] Do not hardfail on use nccl estimations for non-nccl (#167827) Previously we hard failed if pg was "gloo". Fallback on hardcoded formulas. Pull Request resolved: https://github.com/pytorch/pytorch/pull/167827 Approved by: https://github.com/eellison --- test/distributed/test_inductor_collectives.py | 45 +++++++++++++++++++ torch/_inductor/comm_analysis.py | 23 ++++++---- 2 files changed, 60 insertions(+), 8 deletions(-) diff --git a/test/distributed/test_inductor_collectives.py b/test/distributed/test_inductor_collectives.py index dd30bf81b67be..0117f67c38c11 100644 --- a/test/distributed/test_inductor_collectives.py +++ b/test/distributed/test_inductor_collectives.py @@ -40,6 +40,7 @@ DynamoDistributedSingleProcTestCase, MultiProcessTestCase, requires_accelerator_dist_backend, + requires_gloo, skip_if_lt_x_gpu, ) from torch.testing._internal.common_utils import ( @@ -2228,6 +2229,50 @@ def func(inp, group_size, group_name): ) assert est_ms_nccl > 0 + @skip_if_lt_x_gpu(2) + @requires_gloo() + def test_regression_use_nccl_estimate_with_gloo(self): + # Test checks that using nccl estimator option does not hard fail + # with backends that does not support runtime estimations, e.g. 
gloo + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="gloo", store=store, rank=self.rank, world_size=self.world_size + ) + group = c10d.distributed_c10d._get_default_group() + group_name = "default" + torch._C._distributed_c10d._register_process_group( + group_name, torch.distributed.group.WORLD + ) + group_size = group.size() + + def func(inp, group_size, group_name): + ag_0_out = torch.ops._c10d_functional.all_gather_into_tensor( + inp, group_size, group_name + ) + ag_0_wait = torch.ops.c10d_functional.wait_tensor(ag_0_out) + ag_1_out = torch.ops._c10d_functional.all_gather_into_tensor( + ag_0_wait, group_size, group_name + ) + ag_1_wait = torch.ops.c10d_functional.wait_tensor(ag_1_out) + return ag_1_wait + + gm = make_fx(func)(torch.ones(4, 4), group_size, group_name) + g = gm.graph + for n in g.nodes: + if is_all_gather_into_tensor(n): + from torch._inductor.comm_analysis import ( + estimate_nccl_collective_runtime_from_fx_node, + ) + + est_ms = estimate_nccl_collective_runtime_from_fx_node( + n, use_nccl_estimator=False + ) + assert est_ms > 0 + est_ms_nccl = estimate_nccl_collective_runtime_from_fx_node( + n, use_nccl_estimator=True + ) + assert est_ms_nccl > 0 + if __name__ == "__main__": from torch._dynamo.test_case import run_tests diff --git a/torch/_inductor/comm_analysis.py b/torch/_inductor/comm_analysis.py index 74a58acb84ff3..e95db64b03a39 100644 --- a/torch/_inductor/comm_analysis.py +++ b/torch/_inductor/comm_analysis.py @@ -398,6 +398,20 @@ def estimate_nccl_collective_runtime_from_fx_node( def _nccl_estimate() -> Optional[float]: # TODO: Refactor with estimate_nccl_collective_runtime_nccl_estimator + from torch.distributed.distributed_c10d import ( + _get_pg_default_device, + _resolve_process_group, + ) + + pg = _resolve_process_group(group_name) + if torch.distributed.distributed_c10d.get_backend(pg) == "fake": + # nccl estimator requires real process group + return None + + device = _get_pg_default_device(pg) + backend = pg._get_backend(device) + if not backend.supports_time_estimate: + return None flat_args, flat_args_pytree_spec = pytree.tree_flatten((args, kwargs)) @@ -421,13 +435,6 @@ def to_real_tensor(e: Any) -> Any: flat_args = [to_real_tensor(a) for a in flat_args] real_args, real_kwargs = pytree.tree_unflatten(flat_args, flat_args_pytree_spec) - from torch.distributed.distributed_c10d import _resolve_process_group - - pg = _resolve_process_group(group_name) - if torch.distributed.distributed_c10d.get_backend(pg) == "fake": - # nccl estimator requires real process group - return None - fn = fx_node.target assert isinstance(fn, torch._ops.OpOverload) with torch.distributed._time_estimator(group=pg) as time_estimator: @@ -441,7 +448,7 @@ def to_real_tensor(e: Any) -> Any: est_time_ms = est_time_us / 1e3 return est_time_ms - if torch.distributed.is_nccl_available() and use_nccl_estimator: + if use_nccl_estimator: est_time_ms = _nccl_estimate() if est_time_ms is not None: return est_time_ms From 2b69673bbfdadad6a963d37a6d4f1339c1b14048 Mon Sep 17 00:00:00 2001 From: Robert Hardwick Date: Mon, 17 Nov 2025 08:56:00 +0000 Subject: [PATCH 20/47] [CD] Add libopenblas to dep list for AArch64+CPU whl (#167841) #166044 removes openblas from whl dependency list for AArch64+CPU build so this PR adds it back. Only affects CPU build since AArch64+CUDA uses NVPL. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167841 Approved by: https://github.com/tinglvv, https://github.com/malfet --- .ci/manywheel/build_cpu.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.ci/manywheel/build_cpu.sh b/.ci/manywheel/build_cpu.sh index c3ddba33cd946..ad51810e06a2b 100755 --- a/.ci/manywheel/build_cpu.sh +++ b/.ci/manywheel/build_cpu.sh @@ -75,9 +75,11 @@ if [[ "$ARCH" == "aarch64" ]]; then # ARM system libraries DEPS_LIST+=( "/usr/lib64/libgfortran.so.5" + "/opt/OpenBLAS/lib/libopenblas.so.0" ) DEPS_SONAME+=( "libgfortran.so.5" + "libopenblas.so.0" ) fi From 1b43d6cd4e01b63f6bcf5238fdca5dc41e9121ae Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Mon, 17 Nov 2025 16:18:49 +0000 Subject: [PATCH 21/47] [ROCm] enable fastSpecializedAtomicAdd for gfx950 (#167661) Use standard HIP headers for unsafeAtomicAdd. Removes copy/paste of unsafeAtomicAdd as "preview" implementation for gfx942. Pull Request resolved: https://github.com/pytorch/pytorch/pull/167661 Approved by: https://github.com/jeffdaily Co-authored-by: Jeff Daily --- aten/src/ATen/native/cuda/KernelUtils.cuh | 60 +---------------------- 1 file changed, 1 insertion(+), 59 deletions(-) diff --git a/aten/src/ATen/native/cuda/KernelUtils.cuh b/aten/src/ATen/native/cuda/KernelUtils.cuh index 5c8b98105bb26..fd406829707a1 100644 --- a/aten/src/ATen/native/cuda/KernelUtils.cuh +++ b/aten/src/ATen/native/cuda/KernelUtils.cuh @@ -5,69 +5,11 @@ #include #endif -// ROCm 6.3 is planned to have these functions, but until then here they are. #if defined(USE_ROCM) #include #include #include - -__device__ inline __hip_bfloat162 preview_unsafeAtomicAdd(__hip_bfloat162* address, __hip_bfloat162 value) { -#if (defined(__gfx942__)) && \ - __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2bf16) - typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2; - static_assert(sizeof(vec_short2) == sizeof(__hip_bfloat162_raw)); - union { - __hip_bfloat162_raw bf162_raw; - vec_short2 vs2; - } u{static_cast<__hip_bfloat162_raw>(value)}; - u.vs2 = __builtin_amdgcn_flat_atomic_fadd_v2bf16((vec_short2*)address, u.vs2); - return static_cast<__hip_bfloat162>(u.bf162_raw); -#else - static_assert(sizeof(unsigned int) == sizeof(__hip_bfloat162_raw)); - union u_hold { - __hip_bfloat162_raw h2r; - unsigned int u32; - }; - u_hold old_val, new_val; - old_val.u32 = __hip_atomic_load((unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - do { - new_val.h2r = __hadd2(old_val.h2r, value); - } while (!__hip_atomic_compare_exchange_strong( - (unsigned int*)address, &old_val.u32, new_val.u32, - __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)); - return old_val.h2r; -#endif -} - -__device__ inline __half2 preview_unsafeAtomicAdd(__half2* address, __half2 value) { -#if (defined(__gfx942__)) && \ - __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2f16) - // The api expects an ext_vector_type of half - typedef _Float16 __attribute__((ext_vector_type(2))) vec_fp162; - static_assert(sizeof(vec_fp162) == sizeof(__half2_raw)); - union { - __half2_raw h2r; - vec_fp162 fp16; - } u {static_cast<__half2_raw>(value)}; - u.fp16 = __builtin_amdgcn_flat_atomic_fadd_v2f16((vec_fp162*)address, u.fp16); - return static_cast<__half2>(u.h2r); -#else - static_assert(sizeof(__half2_raw) == sizeof(unsigned int)); - union u_hold { - __half2_raw h2r; - unsigned int u32; - }; - u_hold old_val, new_val; - old_val.u32 = __hip_atomic_load((unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); - do { - new_val.h2r = 
__hadd2(old_val.h2r, value); - } while (!__hip_atomic_compare_exchange_strong( - (unsigned int*)address, &old_val.u32, new_val.u32, - __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)); - return old_val.h2r; -#endif -} -#define ATOMICADD preview_unsafeAtomicAdd +#define ATOMICADD unsafeAtomicAdd #define NATIVE_ZERO_BF16 __float2bfloat16(0.0f) #else #define ATOMICADD atomicAdd From 4c152a71add2bd6a5f35dd9bb78ebb22e0748357 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Mon, 17 Nov 2025 16:37:07 +0000 Subject: [PATCH 22/47] Revert "add device generalization support for distributed tests (#165067)" This reverts commit 96a4c4b3d1c533b36cfa7259524b91a0eaf4254f. Reverted https://github.com/pytorch/pytorch/pull/165067 on behalf of https://github.com/jeanschmidt due to breaks internal tests see D87036515, @albanD please help the author get this PR merged ([comment](https://github.com/pytorch/pytorch/pull/165067#issuecomment-3542820651)) --- .../test_2d_composability.py | 27 +++++---- .../test_pp_composability.py | 27 ++++++--- .../ddp_comm_hooks/test_ddp_hooks.py | 58 ++++++++++++------- .../checkpoint/test_state_dict_utils.py | 36 ++++++------ .../optim/test_zero_redundancy_optimizer.py | 17 ++---- .../test_c10d_functional_native.py | 21 +++++-- .../test_c10d_object_collectives.py | 44 +++++++++----- test/distributed/test_device_mesh.py | 4 +- .../distributed/_tensor/common_dtensor.py | 12 ++-- 9 files changed, 144 insertions(+), 102 deletions(-) diff --git a/test/distributed/_composable/test_composability/test_2d_composability.py b/test/distributed/_composable/test_composability/test_2d_composability.py index 0a108590bc5ed..9375c86d35584 100644 --- a/test/distributed/_composable/test_composability/test_2d_composability.py +++ b/test/distributed/_composable/test_composability/test_2d_composability.py @@ -65,7 +65,6 @@ device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu" -curr_backend = dist.get_default_backend_for_device(device_type) class SimpleModel(nn.Module): @@ -423,10 +422,10 @@ class TestFullyShard2DStateDict(DTensorTestBase): @property def backend(self): # need to specify gloo backend for testing cpu offload - return f"cpu:gloo,{device_type}:{curr_backend}" + return "cpu:gloo,xpu:xccl" if TEST_XPU else "cpu:gloo,cuda:nccl" - @skip_if_lt_x_gpu(4) @with_comms + @skip_if_lt_x_gpu(4) def test_fully_shard_tp_2d_set_full_state_dict(self): dummy_model = SimpleModel().to(device_type) mesh_2d = init_device_mesh( @@ -515,8 +514,8 @@ def _check_module(self, m1, m2, check_grad=False): ).to_local() self.assertEqual(param_m2, param_m1) - @skip_if_lt_x_gpu(4) @with_comms + @skip_if_lt_x_gpu(4) def test_2d_ddp_integration_functionality(self) -> None: model, twod_model, dp_pg = self.init_model(self.device_type) optim = torch.optim.Adam(model.parameters(), lr=3e-5) @@ -567,8 +566,8 @@ def _compare_params(self, m1, m2): p2 = p2.redistribute(p2.device_mesh, [Replicate()]).to_local() self.assertTrue(torch.allclose(p1, p2), f"{p1} vs {p2}") - @skip_if_lt_x_gpu(4) @with_comms + @skip_if_lt_x_gpu(4) def test_2d_fsdp_state_enable_extension(self): mesh_2d = init_device_mesh( self.device_type, (2, self.world_size // 2), mesh_dim_names=("dp", "tp") @@ -643,18 +642,18 @@ def _test_2d_e2e_training( # Ensure all params are still the same after optimizer update. 
self._compare_params(model, model_2d) - @skip_if_lt_x_gpu(4) @with_comms + @skip_if_lt_x_gpu(4) def test_2d_e2e_training_default(self): self._test_2d_e2e_training() - @skip_if_lt_x_gpu(4) @with_comms + @skip_if_lt_x_gpu(4) def test_2d_e2e_training_use_orig_params(self): self._test_2d_e2e_training(use_orig_params=True) - @skip_if_lt_x_gpu(4) @with_comms + @skip_if_lt_x_gpu(4) def test_2d_e2e_training_not_use_orig_params(self): # TODO: need to revisit input_reshard API about why it failed multi-gpu tests. # self._test_2d_e2e_training(recompute_activation=True) @@ -667,10 +666,10 @@ class TestNew2dParallelStateDict(DTensorTestBase): @property def backend(self): # need to specify gloo backend for testing cpu offload - return f"cpu:gloo,{device_type}:{curr_backend}" + return "cpu:gloo,xpu:xccl" if TEST_XPU else "cpu:gloo,cuda:nccl" - @skip_if_lt_x_gpu(4) @with_comms + @skip_if_lt_x_gpu(4) def test_fsdp_2d_extension(self): """ Test whether _fsdp_extension from FSDPstate has been set correctly. @@ -701,8 +700,8 @@ def test_fsdp_2d_extension(self): model_1d_fsdp_state = _get_module_fsdp_state(model_1d) self.assertEqual(model_1d_fsdp_state._fsdp_extension, None) - @skip_if_lt_x_gpu(4) @with_comms + @skip_if_lt_x_gpu(4) @parametrize("is_even_sharded_model", [True, False]) def test_2d_state_dict(self, is_even_sharded_model): simple_model = SimpleModel if is_even_sharded_model else SimpleModelUneven @@ -757,8 +756,8 @@ def test_2d_state_dict(self, is_even_sharded_model): torch.allclose(no_wrap_v, all_gather_two_d_v.to_local()), True ) - @skip_if_lt_x_gpu(4) @with_comms + @skip_if_lt_x_gpu(4) @parametrize("is_even_sharded_model", [True, False]) def test_2d_load_state_dict(self, is_even_sharded_model): simple_model = SimpleModel if is_even_sharded_model else SimpleModelUneven @@ -812,8 +811,8 @@ def test_2d_load_state_dict(self, is_even_sharded_model): self.assertEqual(v1.device_mesh, v2.device_mesh) self.assertEqual(v1.placements, v2.placements) - @skip_if_lt_x_gpu(4) @with_comms + @skip_if_lt_x_gpu(4) @parametrize("is_even_sharded_model", [True, False]) def test_2d_optim_state_dict(self, is_even_sharded_model): simple_model = SimpleModel if is_even_sharded_model else SimpleModelUneven @@ -900,9 +899,9 @@ def test_2d_optim_state_dict(self, is_even_sharded_model): else: self.assertEqual(new_state, state) - @skip_if_lt_x_gpu(4) @with_comms @with_temp_dir + @skip_if_lt_x_gpu(4) def test_fsdp1_tp_2d_set_full_state_dict(self): """ This is a workaround for loading full state dict into a FSDP1+TP 2D model. 
diff --git a/test/distributed/_composable/test_composability/test_pp_composability.py b/test/distributed/_composable/test_composability/test_pp_composability.py index 9ddbe867fa879..a66518fc0ef0f 100644 --- a/test/distributed/_composable/test_composability/test_pp_composability.py +++ b/test/distributed/_composable/test_composability/test_pp_composability.py @@ -29,8 +29,8 @@ parallelize_module, RowwiseParallel, ) +from torch.testing._internal.common_cuda import TEST_MULTIGPU from torch.testing._internal.common_distributed import ( - at_least_x_gpu, MultiProcessTestCase, requires_accelerator_dist_backend, skip_if_lt_x_gpu, @@ -40,6 +40,7 @@ parametrize, run_tests, skip_but_pass_in_sandcastle_if, + TEST_XPU, ) from torch.testing._internal.distributed.checkpoint_utils import with_temp_dir @@ -106,9 +107,11 @@ def world_size(self): def device(self): return self.rank - @requires_accelerator_dist_backend() + @requires_accelerator_dist_backend(["nccl", "xccl"]) @skip_if_lt_x_gpu(8) - @skip_but_pass_in_sandcastle_if(not at_least_x_gpu(8), "Test requires 8+ GPUs") + @skip_but_pass_in_sandcastle_if( + not TEST_MULTIGPU and not TEST_XPU, "Test requires 4+ GPUs" + ) def test_pp_and_dcp(self): """ Test that pipeline parallelism and distributed checkpointing can be used together and @@ -198,9 +201,11 @@ def _dcp_test(self): _dcp_test(self) - @requires_accelerator_dist_backend() + @requires_accelerator_dist_backend(["nccl", "xccl"]) @skip_if_lt_x_gpu(8) - @skip_but_pass_in_sandcastle_if(not at_least_x_gpu(8), "Test requires 8+ GPUs") + @skip_but_pass_in_sandcastle_if( + not TEST_MULTIGPU and not TEST_XPU, "Test requires 8+ GPUs" + ) @parametrize( "ScheduleClass", [ @@ -350,9 +355,11 @@ def apply_tp( torch.distributed.destroy_process_group() - @requires_accelerator_dist_backend() + @requires_accelerator_dist_backend(["nccl", "xccl"]) @skip_if_lt_x_gpu(8) - @skip_but_pass_in_sandcastle_if(not at_least_x_gpu(8), "Test requires 8+ GPUs") + @skip_but_pass_in_sandcastle_if( + not TEST_MULTIGPU and not TEST_XPU, "Test requires 8+ GPUs" + ) @parametrize( "ScheduleClass", [ @@ -543,9 +550,11 @@ def apply_same_precision(partial_model): torch.distributed.destroy_process_group() - @requires_accelerator_dist_backend() + @requires_accelerator_dist_backend(["nccl", "xccl"]) @skip_if_lt_x_gpu(8) - @skip_but_pass_in_sandcastle_if(not at_least_x_gpu(8), "Test requires 8+ GPUs") + @skip_but_pass_in_sandcastle_if( + not TEST_MULTIGPU and not TEST_XPU, "Test requires 8+ GPUs" + ) @parametrize( "ScheduleClass", [ diff --git a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py index 2099a2a2d44d9..89a893037c3b5 100644 --- a/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py +++ b/test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py @@ -1,5 +1,6 @@ # Owner(s): ["oncall: distributed"] +import os import sys import torch @@ -17,8 +18,8 @@ ) from torch.nn.parallel import DistributedDataParallel from torch.testing._internal.common_distributed import ( - DistributedTestBase, - requires_accelerator_dist_backend, + MultiProcessTestCase, + requires_nccl, skip_if_lt_x_gpu, ) from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN @@ -29,12 +30,9 @@ sys.exit(0) -device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu" - - def gpus_for_rank(world_size): - visible_devices = list(range(torch.accelerator.device_count())) - gpus_per_process = torch.accelerator.device_count() // world_size + 
visible_devices = list(range(torch.cuda.device_count())) + gpus_per_process = torch.cuda.device_count() // world_size gpus_for_rank = [] for rank in range(world_size): gpus_for_rank.append( @@ -62,7 +60,27 @@ def forward(self, x, rank): return self.t0(x ** (1 + rank)) -class DistributedDataParallelCommHookTest(DistributedTestBase): +class DistributedDataParallelCommHookTest(MultiProcessTestCase): + def setUp(self): + super().setUp() + self._spawn_processes() + + def tearDown(self): + try: + os.remove(self.file_name) + except OSError: + pass + + def _get_process_group_nccl(self): + store = dist.FileStore(self.file_name, self.world_size) + dist.init_process_group( + backend="nccl", + world_size=self.world_size, + rank=self.rank, + store=store, + ) + return dist.distributed_c10d._get_default_group() + @property def world_size(self): return 2 @@ -101,14 +119,14 @@ def _run_and_get_grads(self, model): param = next(model.parameters()) return param.grad - @requires_accelerator_dist_backend() + @requires_nccl() @skip_if_lt_x_gpu(2) def test_ddp_comm_hook_allreduce_hook(self): """ This unit test verifies the ``allreduce`` hook registered case gives same result with no hook registered case. """ - process_group = self.create_pg(device_type) + process_group = self._get_process_group_nccl() # No hook registered case, get the reference grads. reference_grads = self._get_grads(process_group, None) @@ -117,14 +135,14 @@ def test_ddp_comm_hook_allreduce_hook(self): torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-5, atol=0) - @requires_accelerator_dist_backend() + @requires_nccl() @skip_if_lt_x_gpu(2) def test_ddp_comm_hook_fp16compress_hook(self): """ This unit test verifies the ``fp16 compress`` hook registered case gives close result with no hook registered case. """ - process_group = self.create_pg(device_type) + process_group = self._get_process_group_nccl() # No hook registered case, get the reference grads. reference_grads = self._get_grads(process_group, None) @@ -133,14 +151,14 @@ def test_ddp_comm_hook_fp16compress_hook(self): torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-5, atol=1e-4) - @requires_accelerator_dist_backend() + @requires_nccl() @skip_if_lt_x_gpu(2) def test_ddp_comm_hook_quantize_per_tensor_hook(self): """ This unit test verifies the ``quantize per tensor`` hook registered case gives close result with no hook registered case. """ - process_group = self.create_pg(device_type) + process_group = self._get_process_group_nccl() # No hook registered case, get the reference grads. reference_grads = self._get_grads(process_group, None) @@ -149,14 +167,14 @@ def test_ddp_comm_hook_quantize_per_tensor_hook(self): torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-5, atol=1e-4) - @requires_accelerator_dist_backend() + @requires_nccl() @skip_if_lt_x_gpu(2) def test_ddp_comm_hook_quantize_per_channel_hook(self): """ This unit test verifies the ``quantize per channel`` hook registered case gives close result with no hook registered case. """ - process_group = self.create_pg(device_type) + process_group = self._get_process_group_nccl() # No hook registered case, get the reference grads. 
reference_grads = self._get_grads(process_group, None) @@ -167,14 +185,14 @@ def test_ddp_comm_hook_quantize_per_channel_hook(self): torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-5, atol=1e-4) - @requires_accelerator_dist_backend() + @requires_nccl() @skip_if_lt_x_gpu(2) def test_ddp_comm_hook_noop_hook(self): """ This unit test verifies the ``noop`` hook registered case and a subsequent allreduce gives same result with no hook registered case. """ - process_group = self.create_pg(device_type) + process_group = self._get_process_group_nccl() # No hook registered case, get the reference grads. reference_grads = self._get_grads(process_group, None) @@ -186,10 +204,10 @@ def test_ddp_comm_hook_noop_hook(self): torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-5, atol=0) - @requires_accelerator_dist_backend() + @requires_nccl() @skip_if_lt_x_gpu(2) def test_is_last_hook(self): - process_group = self.create_pg(device_type) + process_group = self._get_process_group_nccl() def hook(flags, bucket): flags.append(bucket.is_last()) diff --git a/test/distributed/checkpoint/test_state_dict_utils.py b/test/distributed/checkpoint/test_state_dict_utils.py index c0f850cf95c9c..76e9aeb9e3302 100644 --- a/test/distributed/checkpoint/test_state_dict_utils.py +++ b/test/distributed/checkpoint/test_state_dict_utils.py @@ -32,7 +32,7 @@ class TestStateDictUtils(DTensorTestBase): @property def world_size(self): - return min(4, torch.accelerator.device_count()) + return min(4, torch.cuda.device_count()) @with_comms @skip_if_lt_x_gpu(2) @@ -49,7 +49,7 @@ def test_gather_state_dict_dtensor(self): dist_tensor.to_local(), gather_dim=0, group=(device_mesh, 0) ) self.assertEqual(expected_gathered_dtensor, gathered_state_dict["dtensor"]) - self.assertEqual(gathered_state_dict["dtensor"].device.type, self.device_type) + self.assertTrue(gathered_state_dict["dtensor"].is_cuda) @with_comms @skip_if_lt_x_gpu(4) @@ -69,16 +69,14 @@ def test_gather_with_cpu_and_ranks_only(self): ) if dist.get_rank() in (0, 2): self.assertEqual(expected_gathered_dtensor, gathered_state_dict["dtensor"]) - self.assertNotEqual( - gathered_state_dict["dtensor"].device.type, self.device_type - ) + self.assertFalse(gathered_state_dict["dtensor"].is_cuda) else: self.assertEqual(gathered_state_dict, {}) @with_comms @skip_if_lt_x_gpu(4) def test_cpu_and_ranks_only(self): - device = torch.device(self.device_type) + device = torch.device("cuda") state_dict = { "tensor1": torch.arange(10, device=device), "tensor2": torch.ones(10, device=device), @@ -87,7 +85,7 @@ def test_cpu_and_ranks_only(self): cpu_state_dict = _offload_state_dict_to_cpu(state_dict, ranks_only=(0, 2)) if dist.get_rank() in (0, 2): for v in cpu_state_dict.values(): - self.assertNotEqual(v.device.type, self.device_type) + self.assertFalse(v.is_cuda) self.assertEqual(cpu_state_dict["tensor1"], torch.arange(10)) self.assertEqual(cpu_state_dict["tensor2"], torch.ones(10)) else: @@ -111,27 +109,27 @@ def create_dtensor(): for _ in range(10): tensor, dtensor = create_dtensor() ltensor.append(tensor) - ltensor.append(torch.ones(10, device=torch.device(self.device_type))) + ltensor.append(torch.ones(10, device=torch.device("cuda"))) ldtensor.append(dtensor) - ldtensor.append(torch.ones(10, device=torch.device(self.device_type))) + ldtensor.append(torch.ones(10, device=torch.device("cuda"))) tensor, dtensor = create_dtensor() dist_state_dict = { "local": dtensor, "list": ldtensor, - "arange": torch.arange(10, device=torch.device(self.device_type)), + "arange": 
torch.arange(10, device=torch.device("cuda")), } state_dict = { "local": tensor, "list": ltensor, - "arange": torch.arange(10, device=torch.device(self.device_type)), + "arange": torch.arange(10, device=torch.device("cuda")), } self.assertEqual(state_dict, _gather_state_dict(dist_state_dict)) @with_comms @skip_if_lt_x_gpu(2) def test_create_cpu_state_dict(self): - device = torch.device(self.device_type) + device = torch.device("cuda") rank = dist.get_rank() # Scale tensors based on world size # to fit in the tensor shards accurately. @@ -151,7 +149,7 @@ def test_create_cpu_state_dict(self): metadata=ShardMetadata( shard_offsets=[5 * rank, 0], shard_sizes=[5, 10], - placement=f"rank:{rank}/{self.device_type}:{rank}", + placement=f"rank:{rank}/cuda:{rank}", ), ) ], @@ -161,7 +159,7 @@ def test_create_cpu_state_dict(self): torch.arange(50 * scale_factor, device=device).reshape( 5 * scale_factor, 10 ), - init_device_mesh(self.device_type, mesh_shape=(self.world_size,)), + init_device_mesh("cuda", mesh_shape=(self.world_size,)), [Shard(0)], ), "non_tensor_bytes_io": copy.deepcopy(buffer), @@ -247,7 +245,7 @@ def test_state_dict_util_distribute_tensors(self): even_tensor = torch.randn(self.world_size, 2) uneven_tensor = torch.randn(1, 2) - mesh = init_device_mesh(self.device_type, mesh_shape=(self.world_size,)) + mesh = init_device_mesh("cuda", mesh_shape=(self.world_size,)) even_dtensor = distribute_tensor( torch.randn(self.world_size, 2), mesh, [Shard(0)] ) @@ -275,10 +273,10 @@ def test_state_dict_util_distribute_tensors(self): @with_comms @skip_if_lt_x_gpu(2) def test_cpu_offload_for_dtensor(self): - device_mesh = init_device_mesh(self.device_type, mesh_shape=(self.world_size,)) + device_mesh = init_device_mesh("cuda", mesh_shape=(self.world_size,)) sd = { "k": DTensor.from_local( - torch.ones(8, 8, device=self.device_type), device_mesh, [Shard(0)] + torch.ones(8, 8, device="cuda"), device_mesh, [Shard(0)] ) } cpu_sd = _create_cpu_state_dict(sd) @@ -292,12 +290,12 @@ def test_cpu_offload_for_dtensor(self): self.assertFalse(torch.equal(sd["k"].cpu(), cpu_sd["k"])) _copy_state_dict(sd, cpu_sd, non_blocking=True) - torch.accelerator.synchronize() + torch.cuda.synchronize() self.assertTrue(torch.equal(sd["k"].cpu(), cpu_sd["k"])) sd["k"] += 1 self.assertFalse(torch.equal(sd["k"].cpu(), cpu_sd["k"])) _copy_state_dict(sd, cpu_sd, non_blocking=True) - torch.accelerator.synchronize() + torch.cuda.synchronize() self.assertTrue(torch.equal(sd["k"].cpu(), cpu_sd["k"])) diff --git a/test/distributed/optim/test_zero_redundancy_optimizer.py b/test/distributed/optim/test_zero_redundancy_optimizer.py index 6f527dbb0257f..35eefdad512e6 100644 --- a/test/distributed/optim/test_zero_redundancy_optimizer.py +++ b/test/distributed/optim/test_zero_redundancy_optimizer.py @@ -7,7 +7,7 @@ import copy import sys -from contextlib import contextmanager, nullcontext +from contextlib import nullcontext from typing import Any, cast import numpy as np @@ -40,6 +40,7 @@ skip_if_rocm_multiprocess, skip_if_win32, ) +from torch.testing._internal.common_fsdp import get_devtype from torch.testing._internal.common_utils import ( instantiate_parametrized_tests, parametrize, @@ -56,17 +57,7 @@ HAS_TORCHVISION = False -device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu" - - -@contextmanager -def deterministic_algorithms(enabled=True): - prev_state = torch.are_deterministic_algorithms_enabled() - torch.use_deterministic_algorithms(enabled) - try: - yield - finally: - 
torch.use_deterministic_algorithms(prev_state) +device_type = str(get_devtype()) class TestZeroRedundancyOptimizer(DistributedTestBase): @@ -1250,7 +1241,7 @@ def _test_ddp_zero_overlap( enabled=True, deterministic=True, benchmark=False ) if "cuda" in device - else deterministic_algorithms(True) + else torch.use_deterministic_algorithms(True) ) with det_ctx: device_ids = [rank] if requires_ddp_rank(device) else None diff --git a/test/distributed/test_c10d_functional_native.py b/test/distributed/test_c10d_functional_native.py index 473198e5421c5..0877eb53cd6f5 100644 --- a/test/distributed/test_c10d_functional_native.py +++ b/test/distributed/test_c10d_functional_native.py @@ -24,7 +24,7 @@ from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FP8 from torch.testing._internal.common_device_type import e4m3_type from torch.testing._internal.common_distributed import ( - DistributedTestBase, + MultiProcessTestCase, requires_accelerator_dist_backend, skip_if_lt_x_gpu, ) @@ -59,8 +59,12 @@ def load_test_module(name): sys.exit(0) -@requires_accelerator_dist_backend() -class TestWithNCCL(DistributedTestBase): +@requires_accelerator_dist_backend(["nccl", "xccl"]) +class TestWithNCCL(MultiProcessTestCase): + def setUp(self) -> None: + super().setUp() + self._spawn_processes() + @property def world_size(self) -> int: return 2 @@ -74,7 +78,16 @@ def device(self) -> torch.device: return torch.device(self.rank) def _init_process_group(self) -> None: - self.create_pg(self.device.type) + torch.accelerator.set_device_index(self.rank) + store = dist.FileStore(self.file_name, self.world_size) + backend = dist.get_default_backend_for_device(self.device.type) + + dist.init_process_group( + backend=backend, + world_size=self.world_size, + rank=self.rank, + store=store, + ) torch._C._distributed_c10d._register_process_group("default", dist.group.WORLD) @skip_if_lt_x_gpu(2) diff --git a/test/distributed/test_c10d_object_collectives.py b/test/distributed/test_c10d_object_collectives.py index 9ef04b61ab23b..594564c456068 100644 --- a/test/distributed/test_c10d_object_collectives.py +++ b/test/distributed/test_c10d_object_collectives.py @@ -11,10 +11,13 @@ print("Distributed not available, skipping tests", file=sys.stderr) sys.exit(0) +from torch.testing._internal.common_device_type import instantiate_device_type_tests from torch.testing._internal.common_distributed import DistributedTestBase, TEST_SKIPS from torch.testing._internal.common_utils import ( run_tests, skipIfHpu, + TEST_CUDA, + TEST_HPU, TEST_WITH_DEV_DBG_ASAN, ) @@ -26,8 +29,16 @@ ) sys.exit(0) -device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu" -device_count = torch.accelerator.device_count() +if TEST_HPU: + DEVICE = "hpu" +elif TEST_CUDA: + DEVICE = "cuda" +else: + DEVICE = "cpu" + +device_module = torch.get_device_module(DEVICE) +device_count = device_module.device_count() +BACKEND = dist.get_default_backend_for_device(DEVICE) def with_comms(func=None): @@ -38,10 +49,11 @@ def with_comms(func=None): @wraps(func) def wrapper(self, *args, **kwargs): - if device_type != "cpu" and device_count < self.world_size: + if DEVICE != "cpu" and device_count < self.world_size: sys.exit(TEST_SKIPS[f"multi-gpu-{self.world_size}"].exit_code) - self.pg = self.create_pg(device=device_type) + kwargs["device"] = DEVICE + self.pg = self.create_pg(device=DEVICE) try: return func(self, *args, **kwargs) finally: @@ -52,7 +64,7 @@ def wrapper(self, *args, **kwargs): class TestObjectCollectives(DistributedTestBase): 
@with_comms() - def test_all_gather_object(self): + def test_all_gather_object(self, device): output = [None] * dist.get_world_size() dist.all_gather_object(object_list=output, obj=self.rank) @@ -60,7 +72,7 @@ def test_all_gather_object(self): self.assertEqual(i, v, f"rank: {self.rank}") @with_comms() - def test_gather_object(self): + def test_gather_object(self, device): output = [None] * dist.get_world_size() if self.rank == 0 else None dist.gather_object(obj=self.rank, object_gather_list=output) @@ -70,7 +82,7 @@ def test_gather_object(self): @skipIfHpu @with_comms() - def test_send_recv_object_list(self): + def test_send_recv_object_list(self, device): val = 99 if self.rank == 0 else None object_list = [val] * dist.get_world_size() if self.rank == 0: @@ -84,7 +96,7 @@ def test_send_recv_object_list(self): self.assertEqual(None, object_list[0]) @with_comms() - def test_broadcast_object_list(self): + def test_broadcast_object_list(self, device): val = 99 if self.rank == 0 else None object_list = [val] * dist.get_world_size() # TODO test with broadcast_object_list's device argument @@ -93,7 +105,7 @@ def test_broadcast_object_list(self): self.assertEqual(99, object_list[0]) @with_comms() - def test_scatter_object_list(self): + def test_scatter_object_list(self, device): input_list = list(range(dist.get_world_size())) if self.rank == 0 else None output_list = [None] dist.scatter_object_list( @@ -111,30 +123,34 @@ def setup_sub_pg(self): my_pg = dist.new_group(ranks, use_local_synchronization=True) return rank, ranks, my_pg + @skipIfHpu @with_comms() - def test_subpg_scatter_object(self): + def test_subpg_scatter_object(self, device): rank, ranks, my_pg = self.setup_sub_pg() out_list = [None] dist.scatter_object_list(out_list, ranks, src=ranks[0], group=my_pg) self.assertEqual(rank, out_list[0]) + @skipIfHpu @with_comms() - def test_subpg_all_gather_object(self): + def test_subpg_all_gather_object(self, device): rank, ranks, my_pg = self.setup_sub_pg() out_list = [None] * len(ranks) dist.all_gather_object(out_list, rank, group=my_pg) self.assertEqual(ranks, out_list) + @skipIfHpu @with_comms() - def test_subpg_gather_object(self): + def test_subpg_gather_object(self, device): rank, ranks, my_pg = self.setup_sub_pg() out_list = [None] * len(ranks) if rank == ranks[0] else None dist.gather_object(rank, out_list, dst=ranks[0], group=my_pg) if rank == ranks[0]: self.assertEqual(ranks, out_list) + @skipIfHpu @with_comms() - def test_subpg_broadcast_object(self): + def test_subpg_broadcast_object(self, device): rank, ranks, my_pg = self.setup_sub_pg() out_list = [None] if rank == ranks[0]: @@ -143,5 +159,7 @@ def test_subpg_broadcast_object(self): self.assertEqual(ranks[0], out_list[0]) +devices = ("cpu", "cuda", "hpu") +instantiate_device_type_tests(TestObjectCollectives, globals(), only_for=devices) if __name__ == "__main__": run_tests() diff --git a/test/distributed/test_device_mesh.py b/test/distributed/test_device_mesh.py index 396e49949deb5..a0de1b13c6161 100644 --- a/test/distributed/test_device_mesh.py +++ b/test/distributed/test_device_mesh.py @@ -29,7 +29,7 @@ ) from torch.distributed.tensor.placement_types import _Partial, Shard from torch.testing._internal.common_distributed import skip_if_lt_x_gpu -from torch.testing._internal.common_utils import run_tests, TEST_HPU, TEST_XPU, TestCase +from torch.testing._internal.common_utils import run_tests, TEST_XPU, TestCase from torch.testing._internal.distributed._tensor.common_dtensor import ( DTensorTestBase, with_comms, @@ -58,7 +58,7 @@ 
def _set_env_var(addr="localhost", port="25364", world_size=1, rank=0, local_ran os.environ["LOCAL_RANK"] = f"{local_rank}" -@unittest.skipIf(TEST_XPU or TEST_HPU, "XPU/HPU does not support gloo backend.") +@unittest.skipIf(TEST_XPU, "XPU does not support gloo backend.") class DeviceMeshTestGlooBackend(DTensorTestBase): @property def backend(self): diff --git a/torch/testing/_internal/distributed/_tensor/common_dtensor.py b/torch/testing/_internal/distributed/_tensor/common_dtensor.py index fb53ce4439afd..6ce7d4b2ca507 100644 --- a/torch/testing/_internal/distributed/_tensor/common_dtensor.py +++ b/torch/testing/_internal/distributed/_tensor/common_dtensor.py @@ -43,7 +43,6 @@ SequenceParallel, ) from torch.testing._internal.common_distributed import ( - ACCELERATOR_DIST_BACKENDS, MultiProcContinuousTest, MultiProcessTestCase, MultiThreadedTestCase, @@ -397,17 +396,14 @@ def build_device_mesh(self) -> DeviceMesh: return init_device_mesh(self.device_type, (self.world_size,)) def init_pg(self, eager_init, backend: Optional[str] = None) -> None: - if backend is None: - backend = self.backend - - requires_gpu = any( - gpu_backend in backend for gpu_backend in ACCELERATOR_DIST_BACKENDS - ) - if requires_gpu and torch.accelerator.device_count() < self.world_size: + if "nccl" in self.backend and torch.cuda.device_count() < self.world_size: sys.exit(TEST_SKIPS[f"multi-gpu-{self.world_size}"].exit_code) curr_backend = dist.get_default_backend_for_device(self.device_type) + if backend is None: + backend = self.backend + if backend not in [ "nccl", "gloo", From 39ebab1dd9e52f363aa076a48aea3a63253f70c7 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Mon, 17 Nov 2025 16:41:26 +0000 Subject: [PATCH 23/47] Revert "Remove python workaround for ContextDecorator (#167049)" This reverts commit e20ca3bc2e6ef9935c782fe548348f81fabc5bd7. Reverted https://github.com/pytorch/pytorch/pull/167049 on behalf of https://github.com/jeanschmidt due to breaks internal tests see D87120562, @Skylion007 please thelp the author get this PR merged ([comment](https://github.com/pytorch/pytorch/pull/167049#issuecomment-3542847796)) --- torch/_C/__init__.pyi.in | 5 +-- torch/autograd/profiler.py | 41 +++++++++++++++---- .../csrc/jit/frontend/script_type_parser.cpp | 6 --- 3 files changed, 34 insertions(+), 18 deletions(-) diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 7194ef034bb5a..e9b58b9ce71eb 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -4,7 +4,6 @@ # ruff: noqa: F401 from collections.abc import Callable, Iterable, Iterator, Sequence -from contextlib import AbstractContextManager from enum import Enum, IntEnum from pathlib import Path from types import EllipsisType @@ -231,8 +230,8 @@ ${dtype_class_hints} class layout: ... # Defined in torch/csrc/utils/disable_torch_function.cpp -def DisableTorchFunction() -> AbstractContextManager: ... -def DisableTorchFunctionSubclass() -> AbstractContextManager: ... +def DisableTorchFunction(): ... +def DisableTorchFunctionSubclass(): ... # Defined in torch/csrc/utils/tensor_layouts.cpp strided: layout = ... 
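For context on the workaround this revert restores: the hunks below re-add a fallback shim for contextlib.ContextDecorator in torch/autograd/profiler.py, which is what lets record_function be used both as a context manager and as a function decorator. The following is a minimal, self-contained sketch of that pattern, not the patch itself; the _example_label subclass and its print statements are illustrative assumptions only.

    import functools

    try:
        # Preferred path: ContextDecorator exists in contextlib on Python >= 3.2.
        from contextlib import ContextDecorator as _ContextDecorator
    except ImportError:
        # Fallback shim: subclasses provide __enter__/__exit__, and __call__
        # turns the instance into a decorator that runs the wrapped function
        # inside the context.
        class _ContextDecorator:  # type: ignore[no-redef]
            def __enter__(self):
                raise NotImplementedError

            def __exit__(self, exc_type, exc_val, exc_tb):
                raise NotImplementedError

            def __call__(self, func):
                @functools.wraps(func)
                def wrapped(*args, **kwargs):
                    with self:
                        return func(*args, **kwargs)

                return wrapped


    class _example_label(_ContextDecorator):
        """Hypothetical subclass, standing in for record_function: usable as
        `with _example_label(): ...` or as `@_example_label()` on a function."""

        def __enter__(self):
            print("enter label")
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            print("exit label")
            return False

With this shim in place, decorating a function with `@_example_label()` makes every call execute inside the labeled context, mirroring how record_function attaches a profiler label whether it is used as a `with` block or as a decorator.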
diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index ec28bfbd825e2..fa43af2701171 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -52,7 +52,26 @@ "MemRecordsAcc", ] -from contextlib import ContextDecorator +try: + # Available in Python >= 3.2 + from contextlib import ContextDecorator as _ContextDecorator +except ImportError: + import functools + + class _ContextDecorator: # type: ignore[no-redef] + def __enter__(self): + raise NotImplementedError + + def __exit__(self, exc_type, exc_val, exc_tb): + raise NotImplementedError + + def __call__(self, func): + @functools.wraps(func) + def wrapped(*args, **kwargs): + with self: + return func(*args, **kwargs) + + return wrapped # global python state - whether profiler is currently enabled @@ -209,12 +228,12 @@ def __init__( FutureWarning, stacklevel=2, ) - self.use_device: str | None = "cuda" + self.use_device: Optional[str] = "cuda" else: self.use_device = use_device # TODO Consider changing _function_events into data structure with size cap - self._function_events: EventList | None = None - self._old_function_events: EventList | None = None + self._function_events: Optional[EventList] = None + self._old_function_events: Optional[EventList] = None # Function event processing is done lazily self._needs_processing = False self.entered = False @@ -229,7 +248,7 @@ def __init__( if experimental_config is None: experimental_config = _ExperimentalConfig() self.experimental_config = experimental_config - self.kineto_results: _ProfilerResult | None = None + self.kineto_results: Optional[_ProfilerResult] = None self.profiling_start_time_ns = 0 self.profiling_end_time_ns = 0 self._stats = _ProfilerStats() @@ -725,7 +744,8 @@ def createFunctionEventForMemoryEvents(evt): return all_function_events -class record_function(ContextDecorator): +# pyrefly: ignore [invalid-inheritance] +class record_function(_ContextDecorator): """Context manager/function decorator that adds a label to a code block/function when running autograd profiler. Label will only appear if CPU activity tracing is enabled. @@ -764,13 +784,16 @@ class record_function(ContextDecorator): """ - def __init__(self, name: str, args: str | None = None): + def __init__(self, name: str, args: Optional[str] = None): self.name: str = name - self.args: str | None = args + self.args: Optional[str] = args # Whether or not we should run record function's end callbacks when exiting. self.run_callbacks_on_exit: bool = True + # TODO: TorchScript ignores standard type annotation here + # self.record: Optional["torch.classes.profiler._RecordFunction"] = None self.record = torch.jit.annotate( - Optional[torch.classes.profiler._RecordFunction], + # pyrefly: ignore [not-a-type] + Optional["torch.classes.profiler._RecordFunction"], None, ) diff --git a/torch/csrc/jit/frontend/script_type_parser.cpp b/torch/csrc/jit/frontend/script_type_parser.cpp index 29964e0918534..31fc483812ab0 100644 --- a/torch/csrc/jit/frontend/script_type_parser.cpp +++ b/torch/csrc/jit/frontend/script_type_parser.cpp @@ -308,12 +308,6 @@ TypePtr ScriptTypeParser::parseTypeFromExprImpl(const Expr& expr) const { if (auto custom_class_type = getCustomClass(*name)) { return custom_class_type; } - // Check if the type is a custom class. This is done by checking - // if type_name starts with "torch.classes." - if (name->find("torch.classes.") == 0) { - auto custom_class_type = getCustomClass("__torch__." 
+ *name); - return custom_class_type; - } throw ErrorReport(expr) << "Unknown type name '" << *name << "'"; } From 22ccd44d732100a301a4f9c9119850682aff48bb Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Mon, 17 Nov 2025 16:46:44 +0000 Subject: [PATCH 24/47] Revert "Improve char printing (#167899)" This reverts commit 2245d7d3b90162ae2958929a22c140537cfc4b42. Reverted https://github.com/pytorch/pytorch/pull/167899 on behalf of https://github.com/jeanschmidt due to need to revert in order to revert https://github.com/pytorch/pytorch/pull/167899 ([comment](https://github.com/pytorch/pytorch/pull/167899#issuecomment-3542869096)) --- aten/src/ATen/LegacyBatchedTensorImpl.h | 2 +- aten/src/ATen/TensorIndexing.cpp | 6 +- aten/src/ATen/TensorNames.cpp | 2 +- aten/src/ATen/TensorUtils.cpp | 8 +- aten/src/ATen/Version.cpp | 26 ++-- aten/src/ATen/code_template.h | 2 +- aten/src/ATen/core/Dimname.cpp | 2 +- aten/src/ATen/core/Range.cpp | 2 +- aten/src/ATen/core/Tensor.cpp | 2 +- aten/src/ATen/core/Vitals.cpp | 4 +- aten/src/ATen/core/alias_info.h | 10 +- aten/src/ATen/core/blob.h | 2 +- aten/src/ATen/core/class_type.cpp | 4 +- aten/src/ATen/core/class_type.h | 2 +- .../core/dispatch/DispatchKeyExtractor.cpp | 6 +- aten/src/ATen/core/dispatch/Dispatcher.cpp | 4 +- aten/src/ATen/core/dispatch/OperatorEntry.cpp | 12 +- aten/src/ATen/core/function_schema.cpp | 14 +-- aten/src/ATen/core/function_schema.h | 8 +- aten/src/ATen/core/ivalue.cpp | 40 +++--- aten/src/ATen/core/jit_type.h | 20 +-- aten/src/ATen/core/operator_name.cpp | 2 +- aten/src/ATen/core/tensor_type.cpp | 20 +-- aten/src/ATen/core/type.cpp | 34 ++--- aten/src/ATen/core/union_type.cpp | 4 +- aten/src/ATen/cpu/vec/vec256/vec256.h | 2 +- aten/src/ATen/cpu/vec/vec512/vec512.h | 2 +- aten/src/ATen/cuda/detail/CUDAHooks.cpp | 28 ++--- aten/src/ATen/cuda/jiterator.cu | 4 +- aten/src/ATen/cuda/tunable/Tunable.cpp | 12 +- aten/src/ATen/cudnn/Descriptors.cpp | 24 ++-- aten/src/ATen/functorch/DynamicLayer.cpp | 6 +- aten/src/ATen/functorch/TensorWrapper.cpp | 4 +- aten/src/ATen/miopen/Descriptors.cpp | 12 +- aten/src/ATen/mps/MPSProfiler.h | 2 +- aten/src/ATen/mps/MPSProfiler.mm | 4 +- aten/src/ATen/native/ConvUtils.h | 2 +- aten/src/ATen/native/Convolution.cpp | 2 +- aten/src/ATen/native/SpectralOps.cpp | 4 +- aten/src/ATen/native/TensorCompare.cpp | 2 +- aten/src/ATen/native/cuda/Reduce.cu | 8 +- aten/src/ATen/native/cuda/ScaledGroupMM.cu | 4 +- aten/src/ATen/native/cuda/jit_utils.cpp | 16 +-- aten/src/ATen/native/cudnn/ConvShared.cpp | 28 ++--- aten/src/ATen/native/cudnn/Conv_v7.cpp | 9 +- .../native/metal/MetalTensorImplStorage.mm | 2 +- aten/src/ATen/native/mkldnn/xpu/Conv.cpp | 2 +- .../qnnpack/test/avgpool-microkernel-tester.h | 8 +- .../qnnpack/test/maxpool-microkernel-tester.h | 2 +- aten/src/ATen/native/utils/ParamUtils.h | 2 +- aten/src/ATen/native/vulkan/api/Adapter.cpp | 8 +- aten/src/ATen/native/vulkan/api/Exception.cpp | 6 +- aten/src/ATen/native/vulkan/api/QueryPool.cpp | 4 +- aten/src/ATen/native/vulkan/api/Runtime.cpp | 2 +- aten/src/ATen/native/vulkan/api/Utils.h | 2 +- aten/src/ATen/test/basic.cpp | 2 +- aten/src/ATen/test/scalar_test.cpp | 10 +- aten/src/ATen/test/test_install/main.cpp | 2 +- aten/src/ATen/test/vec_test_all_types.cpp | 12 +- aten/src/ATen/test/vitals.cpp | 2 +- aten/src/ATen/test/vulkan_api_test.cpp | 12 +- .../ATen/test/vulkan_quantized_api_test.cpp | 46 +++---- c10/core/DispatchKeySet.cpp | 2 +- c10/core/TensorOptions.cpp | 2 +- c10/cuda/CUDADeviceAssertionHost.cpp | 16 +-- 
c10/test/core/DispatchKeySet_test.cpp | 2 +- c10/test/util/Half_test.cpp | 4 +- c10/test/util/logging_test.cpp | 2 +- c10/util/ArrayRef.h | 4 +- c10/util/Backtrace.cpp | 12 +- c10/util/Exception.cpp | 6 +- c10/util/Logging.cpp | 12 +- c10/util/SmallVector.h | 4 +- c10/util/StringUtil.cpp | 2 +- c10/util/StringUtil.h | 4 +- c10/util/signal_handler.cpp | 2 +- c10/util/sparse_bitset.h | 4 +- torch/csrc/DataLoader.cpp | 2 +- torch/csrc/Device.cpp | 4 +- torch/csrc/Module.cpp | 8 +- torch/csrc/TypeInfo.cpp | 4 +- .../torch/detail/TensorDataContainer.h | 8 +- .../api/include/torch/nn/modules/batchnorm.h | 2 +- .../nn/modules/container/parameterdict.h | 8 +- .../nn/modules/container/parameterlist.h | 8 +- .../csrc/api/include/torch/nn/modules/conv.h | 12 +- .../include/torch/nn/modules/instancenorm.h | 2 +- .../api/include/torch/nn/modules/pooling.h | 8 +- torch/csrc/api/src/nn/module.cpp | 4 +- torch/csrc/api/src/nn/modules/activation.cpp | 34 ++--- torch/csrc/api/src/nn/modules/distance.cpp | 4 +- torch/csrc/api/src/nn/modules/dropout.cpp | 10 +- torch/csrc/api/src/nn/modules/embedding.cpp | 4 +- torch/csrc/api/src/nn/modules/fold.cpp | 4 +- torch/csrc/api/src/nn/modules/linear.cpp | 6 +- torch/csrc/api/src/nn/modules/loss.cpp | 10 +- .../csrc/api/src/nn/modules/normalization.cpp | 8 +- torch/csrc/api/src/nn/modules/padding.cpp | 16 +-- .../csrc/api/src/nn/modules/pixelshuffle.cpp | 4 +- torch/csrc/api/src/nn/modules/pooling.cpp | 14 +-- torch/csrc/api/src/nn/modules/rnn.cpp | 6 +- torch/csrc/api/src/nn/modules/upsampling.cpp | 2 +- torch/csrc/autograd/saved_variable.cpp | 8 +- torch/csrc/cuda/Module.cpp | 2 +- .../distributed/c10d/FlightRecorderDetail.hpp | 4 +- .../distributed/c10d/ProcessGroupNCCL.cpp | 2 +- .../distributed/c10d/ProcessGroupWrapper.cpp | 2 +- torch/csrc/distributed/c10d/UCCTracing.cpp | 2 +- torch/csrc/distributed/c10d/UCCUtils.cpp | 6 +- torch/csrc/distributed/c10d/Utils.hpp | 4 +- .../c10d/control_plane/WorkerServer.cpp | 8 +- torch/csrc/distributed/c10d/logger.cpp | 4 +- torch/csrc/distributed/c10d/reducer.cpp | 4 +- .../symm_mem/CUDASymmetricMemoryUtils.hpp | 2 +- .../c10d/symm_mem/DMAConnectivity.cpp | 2 +- .../c10d/symm_mem/NCCLSymmetricMemory.cu | 2 +- .../c10d/symm_mem/NVSHMEMSymmetricMemory.cu | 2 +- .../c10d/symm_mem/intra_node_comm.cpp | 4 +- .../c10d/symm_mem/nvshmem_extension.cu | 2 +- torch/csrc/distributed/rpc/rpc_agent.cpp | 2 +- torch/csrc/distributed/rpc/rref_impl.cpp | 4 +- torch/csrc/distributed/rpc/types.cpp | 2 +- .../csrc/dynamo/python_compiled_autograd.cpp | 6 +- torch/csrc/export/upgrader.cpp | 2 +- .../inductor/aoti_eager/kernel_meta_info.cpp | 4 +- .../aoti_package/model_package_loader.cpp | 2 +- torch/csrc/inductor/aoti_runtime/model_base.h | 2 +- .../csrc/inductor/aoti_torch/shim_common.cpp | 6 +- torch/csrc/jit/api/module.cpp | 2 +- torch/csrc/jit/api/module.h | 2 +- torch/csrc/jit/backends/backend_detail.cpp | 12 +- torch/csrc/jit/codegen/fuser/tensor_desc.h | 6 +- .../jit/frontend/concrete_module_type.cpp | 24 ++-- torch/csrc/jit/frontend/error_report.cpp | 2 +- torch/csrc/jit/frontend/ir_emitter.cpp | 10 +- torch/csrc/jit/frontend/parser.cpp | 2 +- torch/csrc/jit/frontend/schema_matching.cpp | 4 +- torch/csrc/jit/frontend/source_range.cpp | 6 +- torch/csrc/jit/frontend/tree.h | 16 +-- torch/csrc/jit/ir/alias_analysis.cpp | 20 +-- torch/csrc/jit/ir/ir.cpp | 46 +++---- torch/csrc/jit/jit_log.cpp | 6 +- torch/csrc/jit/mobile/debug_info.cpp | 2 +- torch/csrc/jit/mobile/import_data.cpp | 2 +- torch/csrc/jit/mobile/interpreter.cpp | 4 +- 
torch/csrc/jit/mobile/model_tracer/tracer.cpp | 4 +- torch/csrc/jit/passes/check_strict_fusion.cpp | 6 +- torch/csrc/jit/passes/liveness.cpp | 8 +- torch/csrc/jit/passes/onnx.cpp | 2 +- torch/csrc/jit/passes/onnx/constant_map.cpp | 10 +- .../jit/passes/onnx/function_extraction.cpp | 2 +- .../onnx/remove_inplace_ops_for_onnx.cpp | 2 +- .../jit/passes/symbolic_shape_analysis.cpp | 4 +- .../csrc/jit/passes/utils/subgraph_utils.cpp | 4 +- torch/csrc/jit/python/init.cpp | 2 +- torch/csrc/jit/python/python_arg_flatten.h | 8 +- torch/csrc/jit/python/python_ir.cpp | 12 +- torch/csrc/jit/python/python_tracer.cpp | 6 +- torch/csrc/jit/python/script_init.cpp | 10 +- torch/csrc/jit/runtime/argument_spec.cpp | 2 +- torch/csrc/jit/runtime/argument_spec.h | 12 +- torch/csrc/jit/runtime/instruction.cpp | 4 +- torch/csrc/jit/runtime/interpreter.cpp | 8 +- .../csrc/jit/runtime/interpreter/code_impl.h | 6 +- torch/csrc/jit/runtime/register_prim_ops.cpp | 10 +- torch/csrc/jit/runtime/static/impl.cpp | 10 +- torch/csrc/jit/serialization/onnx.cpp | 42 +++---- torch/csrc/jit/serialization/pickler.cpp | 2 +- torch/csrc/jit/serialization/python_print.cpp | 119 +++++++++--------- torch/csrc/jit/tensorexpr/block_codegen.cpp | 41 +++--- .../csrc/jit/tensorexpr/bounds_inference.cpp | 8 +- torch/csrc/jit/tensorexpr/bounds_overlap.cpp | 2 +- torch/csrc/jit/tensorexpr/codegen.cpp | 2 +- torch/csrc/jit/tensorexpr/cpp_codegen.cpp | 48 +++---- torch/csrc/jit/tensorexpr/cuda_codegen.cpp | 54 ++++---- torch/csrc/jit/tensorexpr/ir_printer.cpp | 104 +++++++-------- torch/csrc/jit/tensorexpr/loopnest.cpp | 4 +- .../jit/tensorexpr/loopnest_randomization.cpp | 2 +- .../jit/tensorexpr/mem_dependency_checker.cpp | 24 ++-- torch/csrc/jit/tensorexpr/registerizer.cpp | 8 +- torch/csrc/jit/tensorexpr/types.cpp | 2 +- torch/csrc/jit/testing/file_check.cpp | 8 +- torch/csrc/lazy/core/debug_util.cpp | 6 +- torch/csrc/lazy/core/ir.cpp | 2 +- torch/csrc/lazy/core/ir_dump_util.cpp | 18 +-- torch/csrc/lazy/core/ir_metadata.cpp | 6 +- torch/csrc/lazy/core/lazy_graph_executor.cpp | 2 +- torch/csrc/lazy/core/shape_inference.cpp | 2 +- torch/csrc/lazy/core/trie.cpp | 2 +- torch/csrc/monitor/counters.h | 2 +- torch/csrc/profiler/kineto_shim.cpp | 12 +- .../standalone/execution_trace_observer.cpp | 8 +- torch/csrc/profiler/stubs/cuda.cpp | 2 +- torch/csrc/profiler/unwind/action.h | 8 +- torch/csrc/profiler/unwind/eh_frame_hdr.h | 2 +- torch/csrc/profiler/unwind/fde.h | 24 ++-- torch/csrc/profiler/unwind/unwind.cpp | 2 +- torch/csrc/profiler/util.cpp | 10 +- torch/csrc/tensor/python_tensor.cpp | 2 +- torch/csrc/utils/python_arg_parser.cpp | 14 +-- torch/csrc/utils/python_dispatch.cpp | 2 +- torch/csrc/utils/structseq.cpp | 2 +- torch/csrc/utils/tensor_types.cpp | 4 +- torch/csrc/xpu/Module.cpp | 2 +- torch/nativert/executor/OpKernel.cpp | 2 +- .../executor/memory/FunctionSchema.cpp | 4 +- torch/nativert/graph/Graph.cpp | 16 +-- torch/nativert/graph/GraphSignature.cpp | 22 ++-- .../graph/passes/pass_manager/PassManager.cpp | 4 +- 209 files changed, 927 insertions(+), 920 deletions(-) diff --git a/aten/src/ATen/LegacyBatchedTensorImpl.h b/aten/src/ATen/LegacyBatchedTensorImpl.h index f051e7b1f6531..798e3535af3fb 100644 --- a/aten/src/ATen/LegacyBatchedTensorImpl.h +++ b/aten/src/ATen/LegacyBatchedTensorImpl.h @@ -144,7 +144,7 @@ inline std::bitset createVmapLevelsBitset(BatchDimsRef bdims) { } inline std::ostream& operator<<(std::ostream& out, const BatchDim& bdim) { - out << "(lvl=" << bdim.level() << ", dim=" << bdim.dim() << ')'; + out << "(lvl=" << 
bdim.level() << ", dim=" << bdim.dim() << ")"; return out; } diff --git a/aten/src/ATen/TensorIndexing.cpp b/aten/src/ATen/TensorIndexing.cpp index 8618a67259c9c..1fa852686656f 100644 --- a/aten/src/ATen/TensorIndexing.cpp +++ b/aten/src/ATen/TensorIndexing.cpp @@ -9,7 +9,7 @@ namespace indexing { const EllipsisIndexType Ellipsis = EllipsisIndexType(); std::ostream& operator<<(std::ostream& stream, const Slice& slice) { - stream << slice.start() << ':' << slice.stop() << ':' << slice.step(); + stream << slice.start() << ":" << slice.stop() << ":" << slice.step(); return stream; } @@ -31,12 +31,12 @@ std::ostream& operator<<(std::ostream& stream, const TensorIndex& tensor_index) } std::ostream& operator<<(std::ostream& stream, const std::vector& tensor_indices) { - stream << '('; + stream << "("; for (const auto i : c10::irange(tensor_indices.size())) { stream << tensor_indices[i]; if (i < tensor_indices.size() - 1) stream << ", "; } - stream << ')'; + stream << ")"; return stream; } diff --git a/aten/src/ATen/TensorNames.cpp b/aten/src/ATen/TensorNames.cpp index ac6857b95c1d6..bff12aa8de65f 100644 --- a/aten/src/ATen/TensorNames.cpp +++ b/aten/src/ATen/TensorNames.cpp @@ -113,7 +113,7 @@ void TensorNames::checkUnique(const char* op_name) const { std::ostream& operator<<(std::ostream& out, const TensorName& tensorname) { out << tensorname.name_ << " (index "; out << tensorname.origin_idx_ << " of "; - out << tensorname.origin_ << ')'; + out << tensorname.origin_ << ")"; return out; } diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index 2752ff792e485..8236751679f06 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -13,9 +13,9 @@ std::ostream& operator<<(std::ostream & out, const TensorGeometryArg& t) { if (t.pos == 0) { // 0 is distinguished; it usually indicates 'self' or the return // tensor - out << '\'' << t.name << '\''; + out << "'" << t.name << "'"; } else { - out << "argument #" << t.pos << " '" << t.name << '\''; + out << "argument #" << t.pos << " '" << t.name << "'"; } return out; } @@ -154,7 +154,7 @@ void checkSameGPU(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) { oss << "Tensor for " << t2 << " is on CPU, "; } oss << "but expected " << ((!t1->is_cpu() && !t2->is_cpu()) ? "them" : "it") - << " to be on GPU (while checking arguments for " << c << ')'; + << " to be on GPU (while checking arguments for " << c << ")"; TORCH_CHECK(false, oss.str()); } TORCH_CHECK( @@ -199,7 +199,7 @@ void checkScalarTypes(CheckedFrom c, const TensorArg& t, i++; } oss << "; but got " << t->toString() - << " instead (while checking arguments for " << c << ')'; + << " instead (while checking arguments for " << c << ")"; TORCH_CHECK(false, oss.str()); } } diff --git a/aten/src/ATen/Version.cpp b/aten/src/ATen/Version.cpp index a6335d9e11304..7239f357fdd64 100644 --- a/aten/src/ATen/Version.cpp +++ b/aten/src/ATen/Version.cpp @@ -43,8 +43,8 @@ std::string get_mkldnn_version() { // https://github.com/intel/ideep/issues/29 { const dnnl_version_t* ver = dnnl_version(); - ss << "Intel(R) MKL-DNN v" << ver->major << '.' << ver->minor << '.' << ver->patch - << " (Git Hash " << ver->hash << ')'; + ss << "Intel(R) MKL-DNN v" << ver->major << "." << ver->minor << "." << ver->patch + << " (Git Hash " << ver->hash << ")"; } #else ss << "MKLDNN not found"; @@ -81,7 +81,7 @@ std::string get_openmp_version() { break; } if (ver_str) { - ss << " (a.k.a. OpenMP " << ver_str << ')'; + ss << " (a.k.a. 
OpenMP " << ver_str << ")"; } } #else @@ -135,38 +135,38 @@ std::string show_config() { #if defined(__GNUC__) { - ss << " - GCC " << __GNUC__ << '.' << __GNUC_MINOR__ << '\n'; + ss << " - GCC " << __GNUC__ << "." << __GNUC_MINOR__ << "\n"; } #endif #if defined(__cplusplus) { - ss << " - C++ Version: " << __cplusplus << '\n'; + ss << " - C++ Version: " << __cplusplus << "\n"; } #endif #if defined(__clang_major__) { - ss << " - clang " << __clang_major__ << '.' << __clang_minor__ << '.' << __clang_patchlevel__ << '\n'; + ss << " - clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__ << "\n"; } #endif #if defined(_MSC_VER) { - ss << " - MSVC " << _MSC_FULL_VER << '\n'; + ss << " - MSVC " << _MSC_FULL_VER << "\n"; } #endif #if AT_MKL_ENABLED() - ss << " - " << get_mkl_version() << '\n'; + ss << " - " << get_mkl_version() << "\n"; #endif #if AT_MKLDNN_ENABLED() - ss << " - " << get_mkldnn_version() << '\n'; + ss << " - " << get_mkldnn_version() << "\n"; #endif #ifdef _OPENMP - ss << " - " << get_openmp_version() << '\n'; + ss << " - " << get_openmp_version() << "\n"; #endif #if AT_BUILD_WITH_LAPACK() @@ -183,7 +183,7 @@ std::string show_config() { ss << " - Cross compiling on MacOSX\n"; #endif - ss << " - "<< used_cpu_capability() << '\n'; + ss << " - "<< used_cpu_capability() << "\n"; if (hasCUDA()) { ss << detail::getCUDAHooks().showConfig(); @@ -200,10 +200,10 @@ std::string show_config() { ss << " - Build settings: "; for (const auto& pair : caffe2::GetBuildOptions()) { if (!pair.second.empty()) { - ss << pair.first << '=' << pair.second << ", "; + ss << pair.first << "=" << pair.second << ", "; } } - ss << '\n'; + ss << "\n"; // TODO: do HIP // TODO: do XLA diff --git a/aten/src/ATen/code_template.h b/aten/src/ATen/code_template.h index 2cde802dac172..2026795fc0a3d 100644 --- a/aten/src/ATen/code_template.h +++ b/aten/src/ATen/code_template.h @@ -209,7 +209,7 @@ struct CodeTemplate { // to indent correctly in the context. 
void emitIndent(std::ostream& out, size_t indent) const { for ([[maybe_unused]] const auto i : c10::irange(indent)) { - out << ' '; + out << " "; } } void emitStringWithIndents( diff --git a/aten/src/ATen/core/Dimname.cpp b/aten/src/ATen/core/Dimname.cpp index 66aa8cb69e1ed..c78d554732b9e 100644 --- a/aten/src/ATen/core/Dimname.cpp +++ b/aten/src/ATen/core/Dimname.cpp @@ -10,7 +10,7 @@ std::ostream& operator<<(std::ostream& out, const Dimname& dimname) { if (dimname.type() == NameType::WILDCARD) { out << "None"; } else { - out << '\'' << dimname.symbol().toUnqualString() << '\''; + out << "'" << dimname.symbol().toUnqualString() << "'"; } return out; } diff --git a/aten/src/ATen/core/Range.cpp b/aten/src/ATen/core/Range.cpp index b5f4c7b6f85bc..06a79a9c7d063 100644 --- a/aten/src/ATen/core/Range.cpp +++ b/aten/src/ATen/core/Range.cpp @@ -5,7 +5,7 @@ namespace at { std::ostream& operator<<(std::ostream& out, const Range& range) { - out << "Range[" << range.begin << ", " << range.end << ']'; + out << "Range[" << range.begin << ", " << range.end << "]"; return out; } diff --git a/aten/src/ATen/core/Tensor.cpp b/aten/src/ATen/core/Tensor.cpp index 090e77e703736..c5f887f096cd1 100644 --- a/aten/src/ATen/core/Tensor.cpp +++ b/aten/src/ATen/core/Tensor.cpp @@ -71,7 +71,7 @@ void TensorBase::enforce_invariants() { void TensorBase::print() const { if (defined()) { - std::cerr << '[' << toString() << ' ' << sizes() << ']' << '\n'; + std::cerr << "[" << toString() << " " << sizes() << "]" << '\n'; } else { std::cerr << "[UndefinedTensor]" << '\n'; } diff --git a/aten/src/ATen/core/Vitals.cpp b/aten/src/ATen/core/Vitals.cpp index ac1ee45d58345..1cfc720aca52b 100644 --- a/aten/src/ATen/core/Vitals.cpp +++ b/aten/src/ATen/core/Vitals.cpp @@ -9,8 +9,8 @@ APIVitals VitalsAPI; std::ostream& operator<<(std::ostream& os, TorchVital const& tv) { for (const auto& m : tv.attrs) { - os << "[TORCH_VITAL] " << tv.name << '.' << m.first << "\t\t " - << m.second.value << '\n'; + os << "[TORCH_VITAL] " << tv.name << "." 
<< m.first << "\t\t " + << m.second.value << "\n"; } return os; } diff --git a/aten/src/ATen/core/alias_info.h b/aten/src/ATen/core/alias_info.h index 6a3335c328be2..bf0ff6ee72d3b 100644 --- a/aten/src/ATen/core/alias_info.h +++ b/aten/src/ATen/core/alias_info.h @@ -100,18 +100,18 @@ inline bool operator==(const AliasInfo& lhs, const AliasInfo& rhs) { // this does match the way things are represented in the schema inline std::ostream& operator<<(std::ostream& out, const AliasInfo& aliasInfo) { - out << '('; + out << "("; bool first = true; for (const auto& set : aliasInfo.beforeSets()) { if (first) { first = false; } else { - out << '|'; + out << "|"; } out << set.toUnqualString(); } if (aliasInfo.isWrite()) { - out << '!'; + out << "!"; } if (aliasInfo.beforeSets() != aliasInfo.afterSets()) { out << " -> "; @@ -120,12 +120,12 @@ inline std::ostream& operator<<(std::ostream& out, const AliasInfo& aliasInfo) { if (first) { first = false; } else { - out << '|'; + out << "|"; } out << set.toUnqualString(); } } - out << ')'; + out << ")"; return out; } } // namespace c10 diff --git a/aten/src/ATen/core/blob.h b/aten/src/ATen/core/blob.h index 617d6a982ab4e..251da65e0896f 100644 --- a/aten/src/ATen/core/blob.h +++ b/aten/src/ATen/core/blob.h @@ -198,7 +198,7 @@ inline void swap(Blob& lhs, Blob& rhs) noexcept { } inline std::ostream& operator<<(std::ostream& out, const Blob& v) { - return out << "Blob[" << v.TypeName() << ']'; + return out << "Blob[" << v.TypeName() << "]"; } } // namespace caffe2 diff --git a/aten/src/ATen/core/class_type.cpp b/aten/src/ATen/core/class_type.cpp index a65124e80979e..800d9ea0ef9f6 100644 --- a/aten/src/ATen/core/class_type.cpp +++ b/aten/src/ATen/core/class_type.cpp @@ -456,8 +456,8 @@ bool ClassType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const { *why_not << "Method on class '" << repr_str() << "' (1) is not compatible with interface '" << rhs.repr_str() << "' (2)\n" - << " (1) " << self_method->getSchema() << '\n' - << " (2) " << schema << '\n'; + << " (1) " << self_method->getSchema() << "\n" + << " (2) " << schema << "\n"; } return false; } diff --git a/aten/src/ATen/core/class_type.h b/aten/src/ATen/core/class_type.h index f6f6bade9c90d..ea537400ef73d 100644 --- a/aten/src/ATen/core/class_type.h +++ b/aten/src/ATen/core/class_type.h @@ -100,7 +100,7 @@ struct TORCH_API ClassType : public NamedType { std::string repr_str() const override { std::stringstream ss; ss << str() - << " (of Python compilation unit at: " << compilation_unit().get() << ')'; + << " (of Python compilation unit at: " << compilation_unit().get() << ")"; return ss.str(); } diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp index 369bd374747ad..9180d0d19e644 100644 --- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp +++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp @@ -58,12 +58,12 @@ std::string DispatchKeyExtractor::dumpState() const { std::ostringstream oss; for (const auto i : c10::irange(c10::utils::bitset::NUM_BITS())) { if (dispatch_arg_indices_reverse_.get(i)) { - oss << '1'; + oss << "1"; } else { - oss << '0'; + oss << "0"; } } - oss << ' ' << nonFallthroughKeys_ << '\n'; + oss << " " << nonFallthroughKeys_ << "\n"; return oss.str(); } diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp index 5facca30a54f3..afcaf51f231ae 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.cpp +++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp @@ -69,8 
+69,8 @@ class RegistrationListenerList final { void _print_dispatch_trace(const std::string& label, const std::string& op_name, const DispatchKeySet& dispatchKeySet) { auto nesting_value = dispatch_trace_nesting_value(); - for (int64_t i = 0; i < nesting_value; ++i) std::cerr << ' '; - std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << ']' << std::endl; + for (int64_t i = 0; i < nesting_value; ++i) std::cerr << " "; + std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << "]" << std::endl; } } // namespace detail diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index e2627354971a0..928474ec3336d 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -570,7 +570,7 @@ void OperatorEntry::checkInvariants() const { std::string OperatorEntry::listAllDispatchKeys() const { std::ostringstream str; - str << '['; + str << "["; bool has_kernels = false; for (auto k : allDispatchKeysInFullSet()) { @@ -584,7 +584,7 @@ std::string OperatorEntry::listAllDispatchKeys() const { str << k; has_kernels = true; } - str << ']'; + str << "]"; return str.str(); } @@ -683,12 +683,12 @@ void OperatorEntry::setReportErrorCallback_(std::unique_ptr c // This WON'T report backend fallbacks. std::string OperatorEntry::dumpState() const { std::ostringstream oss; - oss << "name: " << name_ << '\n'; + oss << "name: " << name_ << "\n"; if (schema_) { - oss << "schema: " << schema_->schema << '\n'; - oss << "debug: " << schema_->debug << '\n'; + oss << "schema: " << schema_->schema << "\n"; + oss << "debug: " << schema_->debug << "\n"; oss << "alias analysis kind: " << toString(schema_->schema.aliasAnalysis()) - << (schema_->schema.isDefaultAliasAnalysisKind() ? " (default)" : "") << '\n'; + << (schema_->schema.isDefaultAliasAnalysisKind() ? " (default)" : "") << "\n"; } else { oss << "schema: (none)\n"; } diff --git a/aten/src/ATen/core/function_schema.cpp b/aten/src/ATen/core/function_schema.cpp index ffccbe282ddd2..6587af0f9ccc0 100644 --- a/aten/src/ATen/core/function_schema.cpp +++ b/aten/src/ATen/core/function_schema.cpp @@ -7,7 +7,7 @@ namespace c10 { void FunctionSchema::dump() const { - std::cout << *this << '\n'; + std::cout << *this << "\n"; } const std::vector& FunctionSchema::getCorrectList(SchemaArgType type) const { @@ -210,9 +210,9 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) { out << schema.name(); if (!schema.overload_name().empty()) { - out << '.' << schema.overload_name(); + out << "." 
<< schema.overload_name(); } - out << '('; + out << "("; bool seen_kwarg_only = false; for (const auto i : c10::irange(schema.arguments().size())) { @@ -273,7 +273,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) { } if (need_paren) { - out << '('; + out << "("; } for (const auto i : c10::irange(returns.size())) { if (i > 0) { @@ -288,7 +288,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) { out << "..."; } if (need_paren) { - out << ')'; + out << ")"; } return out; } @@ -471,7 +471,7 @@ bool FunctionSchema::isForwardCompatibleWith( if (!arguments().at(i).isForwardCompatibleWith(old.arguments().at(i))) { if (why_not) { why_not - << '\'' << arguments().at(i).name() << '\'' + << "'" << arguments().at(i).name() << "'" << " is not forward compatible with the older version of the schema"; } return false; @@ -511,7 +511,7 @@ bool FunctionSchema::isForwardCompatibleWith( .isForwardCompatibleWith(old.arguments().at(i))) { if (why_not) { why_not << "Out argument '" - << '\'' << arguments().at(i).name() + << "'" << arguments().at(i).name() << " is not FC with the older version of the schema"; } return false; diff --git a/aten/src/ATen/core/function_schema.h b/aten/src/ATen/core/function_schema.h index f349567c26478..c3e1520dc9868 100644 --- a/aten/src/ATen/core/function_schema.h +++ b/aten/src/ATen/core/function_schema.h @@ -571,7 +571,7 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) { if (arg.N()) { N = std::to_string(*arg.N()); } - out << '[' << N << ']'; + out << "[" << N << "]"; } else { out << unopt_type->str(); } @@ -582,15 +582,15 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) { } if (is_opt) { - out << '?'; + out << "?"; } if (!arg.name().empty()) { - out << ' ' << arg.name(); + out << " " << arg.name(); } if (arg.default_value()) { - out << '='; + out << "="; if ((type->kind() == c10::TypeKind::StringType || unopt_type->kind() == c10::TypeKind::StringType) && arg.default_value().value().isString()) { diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 6e4ee82ab1137..1ff8dd0410949 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -66,7 +66,7 @@ bool operator==(const ivalue::Tuple& lhs, const ivalue::Tuple& rhs) { } std::ostream& operator<<(std::ostream& out, const ivalue::EnumHolder& v) { - out << v.qualifiedClassName() << '.' << v.name(); + out << v.qualifiedClassName() << "." 
<< v.name(); return out; } @@ -526,7 +526,7 @@ std::ostream& printMaybeAnnotatedList( !elementTypeCanBeInferredFromMembers(list_elem_type)) { out << "annotate(" << the_list.type()->annotation_str() << ", "; printList(out, the_list.toListRef(), "[", "]", formatter); - out << ')'; + out << ")"; return out; } else { return printList(out, the_list.toListRef(), "[", "]", formatter); @@ -538,7 +538,7 @@ std::ostream& printDict( std::ostream& out, const Dict& v, const IValueFormatter& formatter) { - out << '{'; + out << "{"; bool first = true; for (const auto& pair : v) { @@ -552,7 +552,7 @@ std::ostream& printDict( first = false; } - out << '}'; + out << "}"; return out; } } @@ -565,8 +565,8 @@ static std::ostream& printMaybeAnnotatedDict( auto value_type = the_dict.type()->castRaw()->getValueType(); if (the_dict.toGenericDict().empty() || !elementTypeCanBeInferredFromMembers(value_type)) { - out << "annotate(" << the_dict.type()->annotation_str() << ','; - printDict(out, the_dict.toGenericDict(), formatter) << ')'; + out << "annotate(" << the_dict.type()->annotation_str() << ","; + printDict(out, the_dict.toGenericDict(), formatter) << ")"; } else { return printDict(out, the_dict.toGenericDict(), formatter); } @@ -577,7 +577,7 @@ static std::ostream& printComplex(std::ostream & out, const IValue & v) { c10::complex d = v.toComplexDouble(); IValue real(d.real()), imag(std::abs(d.imag())); auto sign = d.imag() >= 0 ? '+' : '-'; - return out << real << sign << imag << 'j'; + return out << real << sign << imag << "j"; } std::ostream& IValue::repr( @@ -605,9 +605,9 @@ std::ostream& IValue::repr( if (static_cast(i) == d) { // -0.0 (signed zero) needs to be parsed as -0. if (i == 0 && std::signbit(d)) { - return out << '-' << i << '.'; + return out << "-" << i << "."; } - return out << i << '.'; + return out << i << "."; } } auto orig_prec = out.precision(); @@ -643,20 +643,20 @@ std::ostream& IValue::repr( device_stream << v.toDevice(); out << "torch.device("; c10::printQuotedString(out, device_stream.str()); - return out << ')'; + return out << ")"; } case IValue::Tag::Generator: { auto generator = v.toGenerator(); out << "torch.Generator(device="; c10::printQuotedString(out, generator.device().str()); - out << ", seed=" << generator.current_seed() << ')'; + out << ", seed=" << generator.current_seed() << ")"; return out; } case IValue::Tag::GenericDict: return printMaybeAnnotatedDict(out, v, formatter); case IValue::Tag::Enum: { auto enum_holder = v.toEnumHolder(); - return out << enum_holder->qualifiedClassName() << '.' << + return out << enum_holder->qualifiedClassName() << "." << enum_holder->name(); } case IValue::Tag::Object: { @@ -801,7 +801,7 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) { if (c == FP_NORMAL || c == FP_ZERO) { int64_t i = static_cast(d); if (static_cast(i) == d) { - return out << i << '.'; + return out << i << "."; } } auto orig_prec = out.precision(); @@ -852,7 +852,7 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) { return printDict(out, v.toGenericDict(), formatter); case IValue::Tag::PyObject: { auto py_obj = v.toPyObject(); - return out << "'; + return out << ""; } case IValue::Tag::Generator: return out << "Generator"; @@ -862,22 +862,22 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) { // TODO we should attempt to call __str__ if the object defines it. 
auto obj = v.toObject(); // print this out the way python would do it - return out << '<' << obj->name() << " object at " << obj.get() << '>'; + return out << "<" << obj->name() << " object at " << obj.get() << ">"; } case IValue::Tag::Enum: { auto enum_holder = v.toEnumHolder(); - return out << "Enum<" << enum_holder->unqualifiedClassName() << '.' << - enum_holder->name() << '>'; + return out << "Enum<" << enum_holder->unqualifiedClassName() << "." << + enum_holder->name() << ">"; } } - return out << ""; } #undef TORCH_FORALL_TAGS void IValue::dump() const { - std::cout << *this << '\n'; + std::cout << *this << "\n"; } std::shared_ptr ivalue::Object::type() const { @@ -1050,7 +1050,7 @@ c10::intrusive_ptr ivalue::Object::deepcopy( std::stringstream err; err << "Cannot serialize custom bound C++ class"; if (auto qualname = type()->name()) { - err << ' ' << qualname->qualifiedName(); + err << " " << qualname->qualifiedName(); } err << ". Please define serialization methods via def_pickle() for " "this class."; diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index 535831ea11d6e..666d1ade5789c 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -211,7 +211,7 @@ struct TORCH_API OptionalType : public UnionType { std::string str() const override { std::stringstream ss; - ss << getElementType()->str() << '?'; + ss << getElementType()->str() << "?"; return ss.str(); } @@ -240,7 +240,7 @@ struct TORCH_API OptionalType : public UnionType { std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "Optional[" << getElementType()->annotation_str(printer) << ']'; + ss << "Optional[" << getElementType()->annotation_str(printer) << "]"; return ss.str(); } }; @@ -906,7 +906,7 @@ struct TORCH_API ListType std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "List[" << getElementType()->annotation_str(printer) << ']'; + ss << "List[" << getElementType()->annotation_str(printer) << "]"; return ss.str(); } }; @@ -946,7 +946,7 @@ struct TORCH_API DictType : public SharedType { std::string str() const override { std::stringstream ss; ss << "Dict(" << getKeyType()->str() << ", " << getValueType()->str() - << ')'; + << ")"; return ss.str(); } @@ -1018,7 +1018,7 @@ struct TORCH_API FutureType std::string str() const override { std::stringstream ss; - ss << "Future(" << getElementType()->str() << ')'; + ss << "Future(" << getElementType()->str() << ")"; return ss.str(); } TypePtr createWithContained( @@ -1041,7 +1041,7 @@ struct TORCH_API FutureType std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "Future[" << getElementType()->annotation_str(printer) << ']'; + ss << "Future[" << getElementType()->annotation_str(printer) << "]"; return ss.str(); } }; @@ -1060,7 +1060,7 @@ struct TORCH_API AwaitType std::string str() const override { std::stringstream ss; - ss << "Await(" << getElementType()->str() << ')'; + ss << "Await(" << getElementType()->str() << ")"; return ss.str(); } TypePtr createWithContained( @@ -1083,7 +1083,7 @@ struct TORCH_API AwaitType std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "Await[" << getElementType()->annotation_str(printer) << ']'; + ss << "Await[" << getElementType()->annotation_str(printer) << "]"; return ss.str(); } }; @@ -1102,7 +1102,7 @@ struct TORCH_API RRefType 
std::string str() const override { std::stringstream ss; - ss << "RRef(" << getElementType()->str() << ')'; + ss << "RRef(" << getElementType()->str() << ")"; return ss.str(); } TypePtr createWithContained( @@ -1115,7 +1115,7 @@ struct TORCH_API RRefType std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "RRef[" << getElementType()->annotation_str(printer) << ']'; + ss << "RRef[" << getElementType()->annotation_str(printer) << "]"; return ss.str(); } }; diff --git a/aten/src/ATen/core/operator_name.cpp b/aten/src/ATen/core/operator_name.cpp index e55a84a4d305a..43a1fd24749a7 100644 --- a/aten/src/ATen/core/operator_name.cpp +++ b/aten/src/ATen/core/operator_name.cpp @@ -11,7 +11,7 @@ std::string toString(const OperatorName& opName) { std::ostream& operator<<(std::ostream& os, const OperatorName& opName) { os << opName.name; if (!opName.overload_name.empty()) { - os << '.' << opName.overload_name; + os << "." << opName.overload_name; } return os; } diff --git a/aten/src/ATen/core/tensor_type.cpp b/aten/src/ATen/core/tensor_type.cpp index d428aceb3d04c..9d8080cb8f317 100644 --- a/aten/src/ATen/core/tensor_type.cpp +++ b/aten/src/ATen/core/tensor_type.cpp @@ -65,7 +65,7 @@ VaryingShape VaryingShape::merge(const VaryingShape& other) const { template std::ostream& operator<<(std::ostream& out, const VaryingShape& vs) { - out << '('; + out << "("; if (!vs.size()) { out << "*)"; return out; @@ -79,10 +79,10 @@ std::ostream& operator<<(std::ostream& out, const VaryingShape& vs) { if (v.has_value()) { out << v.value(); } else { - out << '*'; + out << "*"; } } - out << ')'; + out << ")"; return out; } @@ -105,7 +105,7 @@ std::ostream& operator<<( } auto sizes_opt = ss.sizes(); - os << '('; + os << "("; for (size_t i = 0; i < rank_opt.value(); i++) { if (i > 0) { os << ", "; @@ -113,10 +113,10 @@ std::ostream& operator<<( if(sizes_opt.has_value() && sizes_opt.value()[i].is_static()) { os << sizes_opt.value()[i]; } else { - os << '*'; + os << "*"; } } - os << ')'; + os << ")"; return os; } @@ -131,17 +131,17 @@ std::ostream& operator<<(std::ostream& os, const ShapeSymbol& s) { } std::ostream& operator<<(std::ostream& os, const Stride& s) { - os << '{'; + os << "{"; if (s.stride_index_.has_value()) { os << *s.stride_index_; } else { - os << '*'; + os << "*"; } - os << ':'; + os << ":"; if (s.stride_.has_value()) { os << *s.stride_; } else { - os << '*'; + os << "*"; } os << '}'; return os; diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index 46dc550b1f37b..abba4e14583a3 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -67,7 +67,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { bool has_valid_strides_info = ndim > 0 && value->strides().isComplete() && value->strides().size() == ndim; - out << '('; + out << "("; size_t i = 0; bool symbolic = type_verbosity() == TypeVerbosity::Symbolic; for (i = 0; i < *ndim; ++i) { @@ -79,7 +79,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { } else if (symbolic) { out << value->symbolic_sizes().at(i); } else { - out << '*'; + out << "*"; } } if (has_valid_strides_info && @@ -91,7 +91,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { } out << value->strides()[i].value(); } - out << ']'; + out << "]"; } if (type_verbosity() >= TypeVerbosity::Full) { if (value->requiresGrad()) { @@ -107,12 +107,12 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { out << "device=" << *value->device(); } 
} - out << ')'; + out << ")"; } else { if (type_verbosity() >= TypeVerbosity::Full) { size_t i = 0; if (value->requiresGrad()) { - out << '(' + out << "(" << "requires_grad=" << *value->requiresGrad(); i++; } @@ -120,7 +120,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { out << ((i++ > 0) ? ", " : "(") << "device=" << *value->device(); } if (i > 0) { - out << ')'; + out << ")"; } } } @@ -133,18 +133,18 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { out << *prim << "[]"; } else if (t.kind() == TypeKind::OptionalType) { auto prim = t.castRaw()->getElementType(); - out << *prim << '?'; + out << *prim << "?"; } else if(t.kind() == TypeKind::FutureType) { auto elem = t.castRaw()->getElementType(); - out << "Future[" << *elem << ']'; + out << "Future[" << *elem << "]"; } else if(t.kind() == TypeKind::RRefType) { auto elem = t.castRaw()->getElementType(); - out << "RRef[" << *elem << ']'; + out << "RRef[" << *elem << "]"; } else if(auto tup = t.cast()) { if (tup->schema()) { out << "NamedTuple"; } - out << '('; + out << "("; for(size_t i = 0; i < tup->elements().size(); ++i) { if(i > 0) out << ", "; @@ -160,7 +160,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) { out << *(tup->elements()[i]); } } - out << ')'; + out << ")"; } else if (t.kind() == TypeKind::FunctionType) { out << "Function"; } else { @@ -475,7 +475,7 @@ std::optional unifyTypeList( why_not << "Could not unify type list since element " << i << " of type " << elements.at(i)->repr_str() << " did not match the types before it (" - << ret_type->repr_str() << ')'; + << ret_type->repr_str() << ")"; return std::nullopt; } ret_type = *maybe_unified; @@ -907,13 +907,13 @@ std::string TupleType::str() const { // NOLINTNEXTLINE(bugprone-unchecked-optional-access) ss << name()->qualifiedName(); } else { - ss << '('; + ss << "("; for(size_t i = 0; i < elements().size(); ++i) { if(i > 0) ss << ", "; ss << elements()[i]->str(); } - ss << ')'; + ss << ")"; } return ss.str(); } @@ -1003,8 +1003,8 @@ bool InterfaceType::isSubTypeImpl( *why_not << "Method on interface '" << lhs.repr_str() << "' (1) is not compatible with interface '" << rhs.repr_str() << "' (2)\n" - << " (1) " << *self_schema << '\n' - << " (2) " << schema << '\n'; + << " (1) " << *self_schema << "\n" + << " (2) " << schema << "\n"; return false; } return false; @@ -1078,7 +1078,7 @@ SymbolicShape SymbolicShape::merge(const SymbolicShape& other) const { } void SymbolicShape::dump() const { - std::cout << *this << '\n'; + std::cout << *this << "\n"; } bool EnumType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const { diff --git a/aten/src/ATen/core/union_type.cpp b/aten/src/ATen/core/union_type.cpp index 8731c2cbc4952..dc4cb78872182 100644 --- a/aten/src/ATen/core/union_type.cpp +++ b/aten/src/ATen/core/union_type.cpp @@ -205,9 +205,9 @@ UnionType::UnionType(std::vector reference, TypeKind kind) : SharedType for (const auto i : c10::irange(reference.size())) { msg << reference[i]->repr_str(); if (i > 0) { - msg << ','; + msg << ","; } - msg << ' '; + msg << " "; } msg << "} has the single type " << types_[0]->repr_str() << ". 
Use the common supertype instead of creating a Union" diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h index a2eb9e5f45104..50c3cc31a6c48 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256.h @@ -80,7 +80,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { } stream << buf[i]; } - stream << ']'; + stream << "]"; return stream; } diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h index 623971454df8b..975b71ce9a867 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512.h @@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { } stream << buf[i]; } - stream << ']'; + stream << "]"; return stream; } diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index b2b9be4498e5b..594045a1b41d2 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -411,16 +411,16 @@ std::string CUDAHooks::showConfig() const { // HIP_VERSION value format was changed after ROCm v4.2 to include the patch number if(v < 500) { // If major=xx, minor=yy then format -> xxyy - oss << (v / 100) << '.' << (v % 10); + oss << (v / 100) << "." << (v % 10); } else { // If major=xx, minor=yy & patch=zzzzz then format -> xxyyzzzzz - oss << (v / 10000000) << '.' << (v / 100000 % 100) << '.' << (v % 100000); + oss << (v / 10000000) << "." << (v / 100000 % 100) << "." << (v % 100000); } #else - oss << (v / 1000) << '.' << (v / 10 % 100); + oss << (v / 1000) << "." << (v / 10 % 100); if (v % 10 != 0) { - oss << '.' << (v % 10); + oss << "." << (v % 10); } #endif }; @@ -431,16 +431,16 @@ std::string CUDAHooks::showConfig() const { oss << " - HIP Runtime "; #endif printCudaStyleVersion(runtimeVersion); - oss << '\n'; + oss << "\n"; // TODO: Make HIPIFY understand CUDART_VERSION macro #if !defined(USE_ROCM) if (runtimeVersion != CUDART_VERSION) { oss << " - Built with CUDA Runtime "; printCudaStyleVersion(CUDART_VERSION); - oss << '\n'; + oss << "\n"; } - oss << " - NVCC architecture flags: " << NVCC_FLAGS_EXTRA << '\n'; + oss << " - NVCC architecture flags: " << NVCC_FLAGS_EXTRA << "\n"; #endif #if !defined(USE_ROCM) @@ -448,9 +448,9 @@ std::string CUDAHooks::showConfig() const { auto printCudnnStyleVersion = [&](size_t v) { - oss << (v / 1000) << '.' << (v / 100 % 10); + oss << (v / 1000) << "." << (v / 100 % 10); if (v % 100 != 0) { - oss << '.' << (v % 100); + oss << "." << (v % 100); } }; @@ -461,22 +461,22 @@ std::string CUDAHooks::showConfig() const { if (cudnnCudartVersion != CUDART_VERSION) { oss << " (built against CUDA "; printCudaStyleVersion(cudnnCudartVersion); - oss << ')'; + oss << ")"; } - oss << '\n'; + oss << "\n"; if (cudnnVersion != CUDNN_VERSION) { oss << " - Built with CuDNN "; printCudnnStyleVersion(CUDNN_VERSION); - oss << '\n'; + oss << "\n"; } #endif #else // TODO: Check if miopen has the functions above and unify - oss << " - MIOpen " << MIOPEN_VERSION_MAJOR << '.' << MIOPEN_VERSION_MINOR << '.' << MIOPEN_VERSION_PATCH << '\n'; + oss << " - MIOpen " << MIOPEN_VERSION_MAJOR << "." << MIOPEN_VERSION_MINOR << "." << MIOPEN_VERSION_PATCH << "\n"; #endif #if AT_MAGMA_ENABLED() - oss << " - Magma " << MAGMA_VERSION_MAJOR << '.' << MAGMA_VERSION_MINOR << '.' << MAGMA_VERSION_MICRO << '\n'; + oss << " - Magma " << MAGMA_VERSION_MAJOR << "." << MAGMA_VERSION_MINOR << "." 
<< MAGMA_VERSION_MICRO << "\n"; #endif return oss.str(); diff --git a/aten/src/ATen/cuda/jiterator.cu b/aten/src/ATen/cuda/jiterator.cu index d664c828bdad6..3af5104288d21 100644 --- a/aten/src/ATen/cuda/jiterator.cu +++ b/aten/src/ATen/cuda/jiterator.cu @@ -42,7 +42,7 @@ static inline void launch_jitted_vectorized_kernel_dynamic( // The cache key includes all the parameters to generate_code + vec_size + dev_idx std::stringstream ss; - ss << nInputs << '_' << nOutputs << f; + ss << nInputs << "_" << nOutputs << f; ss << f_inputs_type_str << compute_type_str << result_type_str; ss << static_cast(at::cuda::jit::BinaryFuncVariant::NoScalar); ss << extra_args_types; @@ -144,7 +144,7 @@ static inline void launch_jitted_unrolled_kernel_dynamic( // The cache key includes all the parameters to generate_code + dev_idx std::stringstream ss; - ss << nInputs << '_' << nOutputs << f; + ss << nInputs << "_" << nOutputs << f; ss << f_inputs_type_str << compute_type_str << result_type_str; ss << contiguous << dynamic_casting; ss << static_cast(at::cuda::jit::BinaryFuncVariant::NoScalar); diff --git a/aten/src/ATen/cuda/tunable/Tunable.cpp b/aten/src/ATen/cuda/tunable/Tunable.cpp index eb7e381d27766..9fb04b40d30f6 100644 --- a/aten/src/ATen/cuda/tunable/Tunable.cpp +++ b/aten/src/ATen/cuda/tunable/Tunable.cpp @@ -52,10 +52,10 @@ TuningContext* getTuningContext() { std::ostream& operator<<(std::ostream& stream, const ResultEntry& entry) { static const bool blaslog = c10::utils::get_env("PYTORCH_TUNABLEOP_BLAS_LOG") == "1"; if (!blaslog) { - return stream << entry.key_ << ',' << entry.time_; + return stream << entry.key_ << "," << entry.time_; } else { - return stream << entry.key_ << ',' << entry.time_ << ",BLAS_PARAMS: " << entry.blas_sig_; + return stream << entry.key_ << "," << entry.time_ << ",BLAS_PARAMS: " << entry.blas_sig_; } } @@ -156,10 +156,10 @@ void TuningResultsManager::RecordUntuned( std::ofstream& untuned_file, const std if (isNew) { static const bool blaslog = c10::utils::get_env("PYTORCH_TUNABLEOP_BLAS_LOG") == "1"; if (!blaslog) { - untuned_file << op_signature << ',' << params_signature << std::endl; + untuned_file << op_signature << "," << params_signature << std::endl; } else { - untuned_file << op_signature << ',' << params_signature << ",BLAS_PARAMS: " << blas_signature << std::endl; + untuned_file << op_signature << "," << params_signature << ",BLAS_PARAMS: " << blas_signature << std::endl; } TUNABLE_LOG3("Untuned,", op_signature, ",", params_signature); } @@ -201,7 +201,7 @@ void TuningResultsManager::InitRealtimeAppend(const std::string& filename, const if(!file_exists || file_empty) { for(const auto& [key, val] : validators) { - (*realtime_out_) << "Validator," << key << ',' << val << std::endl; + (*realtime_out_) << "Validator," << key << "," << val << std::endl; realtime_out_->flush(); } validators_written_ = true; @@ -219,7 +219,7 @@ void TuningResultsManager::AppendResultLine(const std::string& op_sig, const std return; } - (*realtime_out_) << op_sig << ',' << param_sig << ',' << result << std::endl; + (*realtime_out_) << op_sig << "," << param_sig << "," << result << std::endl; realtime_out_->flush(); //ensure immediate write to disk TUNABLE_LOG3("Realtime append: ", op_sig, "(", param_sig, ") -> ", result); diff --git a/aten/src/ATen/cudnn/Descriptors.cpp b/aten/src/ATen/cudnn/Descriptors.cpp index a2cb0cb0a1025..8636d267209e9 100644 --- a/aten/src/ATen/cudnn/Descriptors.cpp +++ b/aten/src/ATen/cudnn/Descriptors.cpp @@ -93,31 +93,31 @@ std::string 
cudnnTypeToString(cudnnDataType_t dtype) { return "CUDNN_DATA_UINT8x4"; default: std::ostringstream oss; - oss << "(unknown data-type " << static_cast(dtype) << ')'; + oss << "(unknown data-type " << static_cast(dtype) << ")"; return oss.str(); } } std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) { - out << "TensorDescriptor " << static_cast(d.desc()) << '\n'; + out << "TensorDescriptor " << static_cast(d.desc()) << "\n"; int nbDims = 0; int dimA[CUDNN_DIM_MAX]; int strideA[CUDNN_DIM_MAX]; cudnnDataType_t dtype{}; cudnnGetTensorNdDescriptor(d.desc(), CUDNN_DIM_MAX, &dtype, &nbDims, dimA, strideA); - out << " type = " << cudnnTypeToString(dtype) << '\n'; - out << " nbDims = " << nbDims << '\n'; + out << " type = " << cudnnTypeToString(dtype) << "\n"; + out << " nbDims = " << nbDims << "\n"; // Read out only nbDims of the arrays! out << " dimA = "; for (auto i : ArrayRef{dimA, static_cast(nbDims)}) { out << i << ", "; } - out << '\n'; + out << "\n"; out << " strideA = "; for (auto i : ArrayRef{strideA, static_cast(nbDims)}) { out << i << ", "; } - out << '\n'; + out << "\n"; return out; } @@ -168,27 +168,27 @@ std::string cudnnMemoryFormatToString(cudnnTensorFormat_t tformat) { return "CUDNN_TENSOR_NHWC"; default: std::ostringstream oss; - oss << "(unknown cudnn tensor format " << static_cast(tformat) << ')'; + oss << "(unknown cudnn tensor format " << static_cast(tformat) << ")"; return oss.str(); } } std::ostream& operator<<(std::ostream & out, const FilterDescriptor& d) { - out << "FilterDescriptor " << static_cast(d.desc()) << '\n'; + out << "FilterDescriptor " << static_cast(d.desc()) << "\n"; int nbDims = 0; int dimA[CUDNN_DIM_MAX]; cudnnDataType_t dtype{}; cudnnTensorFormat_t tformat{}; cudnnGetFilterNdDescriptor(d.desc(), CUDNN_DIM_MAX, &dtype, &tformat, &nbDims, dimA); - out << " type = " << cudnnTypeToString(dtype) << '\n'; - out << " tensor_format = " << cudnnMemoryFormatToString(tformat) << '\n'; - out << " nbDims = " << nbDims << '\n'; + out << " type = " << cudnnTypeToString(dtype) << "\n"; + out << " tensor_format = " << cudnnMemoryFormatToString(tformat) << "\n"; + out << " nbDims = " << nbDims << "\n"; // Read out only nbDims of the arrays! 
out << " dimA = "; for (auto i : ArrayRef{dimA, static_cast(nbDims)}) { out << i << ", "; } - out << '\n'; + out << "\n"; return out; } diff --git a/aten/src/ATen/functorch/DynamicLayer.cpp b/aten/src/ATen/functorch/DynamicLayer.cpp index 518098a8b4a80..69af08a7bd7ce 100644 --- a/aten/src/ATen/functorch/DynamicLayer.cpp +++ b/aten/src/ATen/functorch/DynamicLayer.cpp @@ -346,15 +346,15 @@ void foreachTensorInplaceWithFlag(std::vector& args, int64_t begin, int6 } std::ostream& operator<< (std::ostream& os, const DynamicLayer& layer) { - os << layer.layerId() << ':' << layer.key(); + os << layer.layerId() << ":" << layer.key(); return os; } std::ostream& operator<< (std::ostream& os, const std::vector& dls) { os << "DynamicLayerStack[ "; for (const auto& layer : dls) { - os << layer << ' '; + os << layer << " "; } - os << ']'; + os << "]"; return os; } diff --git a/aten/src/ATen/functorch/TensorWrapper.cpp b/aten/src/ATen/functorch/TensorWrapper.cpp index ba5dcfc923878..65de9268927f0 100644 --- a/aten/src/ATen/functorch/TensorWrapper.cpp +++ b/aten/src/ATen/functorch/TensorWrapper.cpp @@ -22,7 +22,7 @@ void dumpTensor(std::ostream& ss, const Tensor& tensor) { if (batched) { ss << "Batched[lvl=" << batched->level() << " dim=" << batched->bdim() << ", "; dumpTensor(ss, batched->value()); - ss << ']'; + ss << "]"; return; } ss << "Tensor" << tensor.sizes(); @@ -36,7 +36,7 @@ void dumpTensor(std::ostream& ss, const Tensor& tensor) { ss << "dead, "; } dumpTensor(ss, wrapped->value()); - ss << ']'; + ss << "]"; } void TensorWrapper::refreshMetadata() { diff --git a/aten/src/ATen/miopen/Descriptors.cpp b/aten/src/ATen/miopen/Descriptors.cpp index 3fe27c7a0825b..86e42ee3b66dc 100644 --- a/aten/src/ATen/miopen/Descriptors.cpp +++ b/aten/src/ATen/miopen/Descriptors.cpp @@ -73,32 +73,32 @@ std::string miopenTypeToString(miopenDataType_t dtype) { return "miopenBFloat16"; default: std::ostringstream oss; - oss << "(unknown data-type " << static_cast(dtype) << ')'; + oss << "(unknown data-type " << static_cast(dtype) << ")"; return oss.str(); } } std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) { - out << "TensorDescriptor " << static_cast(d.desc()) << '\n'; + out << "TensorDescriptor " << static_cast(d.desc()) << "\n"; int nbDims = 0; int dimA[MIOPEN_DIM_MAX]; int strideA[MIOPEN_DIM_MAX]; miopenDataType_t dtype; miopenGetTensorDescriptorSize(d.desc(), &nbDims); miopenGetTensorDescriptor(d.desc(), &dtype, dimA, strideA); - out << " type = " << miopenTypeToString(dtype) << '\n'; - out << " nbDims = " << nbDims << '\n'; + out << " type = " << miopenTypeToString(dtype) << "\n"; + out << " nbDims = " << nbDims << "\n"; // Read out only nbDims of the arrays! 
out << " dimA = "; for (auto i : ArrayRef{dimA, static_cast(nbDims)}) { out << i << ", "; } - out << '\n'; + out << "\n"; out << " strideA = "; for (auto i : ArrayRef{strideA, static_cast(nbDims)}) { out << i << ", "; } - out << '\n'; + out << "\n"; return out; } diff --git a/aten/src/ATen/mps/MPSProfiler.h b/aten/src/ATen/mps/MPSProfiler.h index 187e86d92e1bf..c1cb9090fc4af 100644 --- a/aten/src/ATen/mps/MPSProfiler.h +++ b/aten/src/ATen/mps/MPSProfiler.h @@ -91,7 +91,7 @@ struct OperationInfo : BaseInfo { std::stringstream kernelStr; kernelStr << kernelName; for (const Tensor& tensor : tensors) { - kernelStr << ':' << BaseInfo::buildTensorString(tensor, includeBufferId); + kernelStr << ":" << BaseInfo::buildTensorString(tensor, includeBufferId); } return kernelStr.str(); } diff --git a/aten/src/ATen/mps/MPSProfiler.mm b/aten/src/ATen/mps/MPSProfiler.mm index 1d0408b8089c9..a91574c56c52d 100644 --- a/aten/src/ATen/mps/MPSProfiler.mm +++ b/aten/src/ATen/mps/MPSProfiler.mm @@ -39,9 +39,9 @@ // see comments for INCLUDE_BUFFER_ID if (includeBufferId && deviceType == at::kMPS) { id buffer = __builtin_bit_cast(id, tensor.storage().data()); - tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer)) << ':' << buffer.retainCount << ')'; + tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer)) << ":" << buffer.retainCount << ")"; } - tensorStr << ':' << tensor.scalar_type() << tensor.sizes(); + tensorStr << ":" << tensor.scalar_type() << tensor.sizes(); return tensorStr.str(); } else { return "undefined"; diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 2a3388a052685..892144ac663a6 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -167,7 +167,7 @@ static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, co std::stringstream ss; ss << arg_name << " should be greater than zero but got ("; std::copy(args.begin(), args.end() - 1, std::ostream_iterator(ss,", ")); - ss << args.back() << ")" << " (while checking arguments for " << c << ')'; + ss << args.back() << ")" << " (while checking arguments for " << c << ")"; TORCH_CHECK(false, ss.str()); } } diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index cb37f6f1030d3..ca3a4f5f3faba 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -639,7 +639,7 @@ static std::ostream& operator<<(std::ostream & out, const ConvParams& params) << " deterministic = " << params.deterministic << " cudnn_enabled = " << params.cudnn_enabled << " allow_tf32 = " << params.allow_tf32 - << '}'; + << "}"; return out; } diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index 975e237c468d6..79aaac48034ac 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -847,7 +847,7 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const std::optional( // stride_output_h + group_count); - // std::cout << "PTRS " << mat_a.data_ptr() << ' ' << mat_b.data_ptr() << " + // std::cout << "PTRS " << mat_a.data_ptr() << " " << mat_b.data_ptr() << " // " - // << out.data_ptr() << ' ' << scale_a.data_ptr() << ' ' + // << out.data_ptr() << " " << scale_a.data_ptr() << " " // << scale_b.data_ptr() << "\n"; // for (int i = 0; i < group_count; i++) { // std::cout << "A " << (void*)inputA_ptrs_h[i] << "\n"; diff --git a/aten/src/ATen/native/cuda/jit_utils.cpp b/aten/src/ATen/native/cuda/jit_utils.cpp index 
e65fa4ceb38e9..09c8e74d4b2cf 100644 --- a/aten/src/ATen/native/cuda/jit_utils.cpp +++ b/aten/src/ATen/native/cuda/jit_utils.cpp @@ -1057,14 +1057,14 @@ std::string generate_code( // TODO these arrays are potentially of the different types, use function // traits to determine the types declare_load_arrays << f_inputs_type << " arg" << std::to_string(i) - << '[' << std::to_string(thread_work_size) << "];\n"; + << "[" << std::to_string(thread_work_size) << "];\n"; } env.s("declare_load_arrays", declare_load_arrays.str()); std::stringstream declare_store_arrays; for (int i = 0; i < nOutputs; i++) { declare_store_arrays << result_type << " out" << std::to_string(i) - << '[' << std::to_string(thread_work_size) << "];\n"; + << "[" << std::to_string(thread_work_size) << "];\n"; } env.s("declare_store_arrays", declare_store_arrays.str()); @@ -1217,7 +1217,7 @@ std::string generate_code( for (const auto i : c10::irange(nInputs)){ auto i_string = std::to_string(i); vector_inputs << "auto * input" << i_string << - " = reinterpret_cast(data[" << i_string << '+' << nOutputs << "])" << + " = reinterpret_cast(data[" << i_string << "+" << nOutputs << "])" << " + block_work_size * idx;\n"; } env.s("vector_inputs", vector_inputs.str()); @@ -1543,17 +1543,17 @@ NvrtcFunction jit_pwise_function( // Constructs file path by appending constructed cubin name to cache path std::stringstream ss; - ss << *cache_dir << '/'; + ss << *cache_dir << "/"; ss << kernel_name; #ifdef USE_ROCM ss << "_arch" << prop->gcnArchName; #else - ss << "_arch" << cuda_major << '.' << cuda_minor; + ss << "_arch" << cuda_major << "." << cuda_minor; #endif - ss << "_nvrtc" << nvrtc_major << '.' << nvrtc_minor; + ss << "_nvrtc" << nvrtc_major << "." << nvrtc_minor; ss << (compile_to_sass ? "_sass" : "_ptx"); - ss << '_' << code.length(); - ss << '_' << hash_code; + ss << "_" << code.length(); + ss << "_" << hash_code; file_path = ss.str(); std::ifstream readin{file_path, std::ios::in | std::ifstream::binary}; diff --git a/aten/src/ATen/native/cudnn/ConvShared.cpp b/aten/src/ATen/native/cudnn/ConvShared.cpp index 1584d5e9acd38..325b082f314d9 100644 --- a/aten/src/ATen/native/cudnn/ConvShared.cpp +++ b/aten/src/ATen/native/cudnn/ConvShared.cpp @@ -82,15 +82,15 @@ namespace native { std::ostream& operator<<(std::ostream& out, const ConvolutionParams& params) { out << "ConvolutionParams \n" - << " memory_format = " << params.memory_format << '\n' - << " data_type = " << cudnnTypeToString(params.dataType) << '\n' - << " padding = " << ArrayRef{params.padding} << '\n' - << " stride = " << ArrayRef{params.stride} << '\n' - << " dilation = " << ArrayRef{params.dilation} << '\n' - << " groups = " << params.groups << '\n' + << " memory_format = " << params.memory_format << "\n" + << " data_type = " << cudnnTypeToString(params.dataType) << "\n" + << " padding = " << ArrayRef{params.padding} << "\n" + << " stride = " << ArrayRef{params.stride} << "\n" + << " dilation = " << ArrayRef{params.dilation} << "\n" + << " groups = " << params.groups << "\n" << " deterministic = " << (params.deterministic ? "true" : "false") - << '\n' - << " allow_tf32 = " << (params.allow_tf32 ? "true" : "false") << '\n'; + << "\n" + << " allow_tf32 = " << (params.allow_tf32 ? 
"true" : "false") << "\n"; return out; } @@ -173,16 +173,16 @@ std::string repro_from_args(const ConvolutionParams& params) { at::globalContext().float32Precision( at::Float32Backend::CUDA, at::Float32Op::MATMUL) == at::Float32Precision::TF32) - << '\n'; + << "\n"; ss << "torch.backends.cudnn.benchmark = " - << pybool(at::globalContext().benchmarkCuDNN()) << '\n'; + << pybool(at::globalContext().benchmarkCuDNN()) << "\n"; ss << "torch.backends.cudnn.deterministic = " << pybool(params.deterministic) - << '\n'; + << "\n"; ss << "torch.backends.cudnn.allow_tf32 = " << pybool(params.allow_tf32) - << '\n'; + << "\n"; ss << "data = torch.randn(" << ArrayRef(params.input_size, dim) << ", dtype=" << full_dtype << ", "; - ss << "device='cuda', requires_grad=True)" << to_channels_last << '\n'; + ss << "device='cuda', requires_grad=True)" << to_channels_last << "\n"; ss << "net = torch.nn.Conv" << dim - 2 << "d(" << in_channels << ", " << out_channels << ", "; ss << "kernel_size=" << ArrayRef(¶ms.weight_size[2], dim - 2) @@ -192,7 +192,7 @@ std::string repro_from_args(const ConvolutionParams& params) { ss << "dilation=" << ArrayRef(params.dilation, dim - 2) << ", "; ss << "groups=" << params.groups << ")\n"; ss << "net = net.cuda()." << partial_dtype << "()" << to_channels_last - << '\n'; + << "\n"; ss << "out = net(data)\n"; ss << "out.backward(torch.randn_like(out))\n"; ss << "torch.cuda.synchronize()\n\n"; diff --git a/aten/src/ATen/native/cudnn/Conv_v7.cpp b/aten/src/ATen/native/cudnn/Conv_v7.cpp index d5102910c6471..bc064e3ad3167 100644 --- a/aten/src/ATen/native/cudnn/Conv_v7.cpp +++ b/aten/src/ATen/native/cudnn/Conv_v7.cpp @@ -93,10 +93,11 @@ std::ostream& operator<<(std::ostream& out, const ConvolutionArgs& args) { << "input: " << args.idesc // already has a trailing newline << "output: " << args.odesc // already has a trailing newline << "weight: " << args.wdesc // already has a trailing newline - << "Pointer addresses: " << '\n' - << " input: " << args.input.const_data_ptr() << '\n' - << " output: " << args.output.const_data_ptr() << '\n' - << " weight: " << args.weight.const_data_ptr() << '\n'; + << "Pointer addresses: " + << "\n" + << " input: " << args.input.const_data_ptr() << "\n" + << " output: " << args.output.const_data_ptr() << "\n" + << " weight: " << args.weight.const_data_ptr() << "\n"; return out; } diff --git a/aten/src/ATen/native/metal/MetalTensorImplStorage.mm b/aten/src/ATen/native/metal/MetalTensorImplStorage.mm index 20a942a9e2573..f614429eefddf 100644 --- a/aten/src/ATen/native/metal/MetalTensorImplStorage.mm +++ b/aten/src/ATen/native/metal/MetalTensorImplStorage.mm @@ -115,7 +115,7 @@ void copy_data_to_host(float* host) { std::copy( strides.begin(), strides.end() - 1, std::ostream_iterator(oss, ",")); oss << sizes.back(); - output << oss.str() << '}'; + output << oss.str() << "}"; return output; } diff --git a/aten/src/ATen/native/mkldnn/xpu/Conv.cpp b/aten/src/ATen/native/mkldnn/xpu/Conv.cpp index 6827e02cc3f42..1555eed558e29 100644 --- a/aten/src/ATen/native/mkldnn/xpu/Conv.cpp +++ b/aten/src/ATen/native/mkldnn/xpu/Conv.cpp @@ -53,7 +53,7 @@ std::ostream& operator<<(std::ostream& out, const ConvParams& params) { << " transposed = " << params.transposed << " output_padding = " << IntArrayRef{params.output_padding} << " groups = " << params.groups << " benchmark = " << params.benchmark - << " deterministic = " << params.deterministic << '}'; + << " deterministic = " << params.deterministic << "}"; return out; } diff --git 
a/aten/src/ATen/native/quantized/cpu/qnnpack/test/avgpool-microkernel-tester.h b/aten/src/ATen/native/quantized/cpu/qnnpack/test/avgpool-microkernel-tester.h index ac6370f8df29f..1a425146ad6c2 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/test/avgpool-microkernel-tester.h +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/test/avgpool-microkernel-tester.h @@ -301,12 +301,12 @@ class AvgPoolMicrokernelTester { ASSERT_NEAR( float(int32_t(y[i * yStride() + k])), yFP[i * kc() + k], 0.5001f) << "at pixel " << i << ", channel " << k << ", n = " << n() - << ", ks = " << kh() << 'x' << kw() << " (" << ks() + << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k]; ASSERT_EQ( uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k])) << "at pixel " << i << ", channel " << k << ", n = " << n() - << ", ks = " << kh() << 'x' << kw() << " (" << ks() + << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k]; } } @@ -396,12 +396,12 @@ class AvgPoolMicrokernelTester { ASSERT_NEAR( float(int32_t(y[i * yStride() + k])), yFP[i * kc() + k], 0.5001f) << "at pixel " << i << ", channel " << k << ", n = " << n() - << ", ks = " << kh() << 'x' << kw() << " (" << ks() + << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k]; ASSERT_EQ( uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k])) << "at pixel " << i << ", channel " << k << ", n = " << n() - << ", ks = " << kh() << 'x' << kw() << " (" << ks() + << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k]; } } diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/test/maxpool-microkernel-tester.h b/aten/src/ATen/native/quantized/cpu/qnnpack/test/maxpool-microkernel-tester.h index fc94f9666d9d0..e1583a2c058ef 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/test/maxpool-microkernel-tester.h +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/test/maxpool-microkernel-tester.h @@ -232,7 +232,7 @@ class MaxPoolMicrokernelTester { ASSERT_EQ( uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k])) << "at pixel " << i << ", channel " << k << ", n = " << n() - << ", ks = " << kh() << 'x' << kw() << " (" << ks() + << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc(); } } diff --git a/aten/src/ATen/native/utils/ParamUtils.h b/aten/src/ATen/native/utils/ParamUtils.h index 8887664df1ce3..c9088c03d81c1 100644 --- a/aten/src/ATen/native/utils/ParamUtils.h +++ b/aten/src/ATen/native/utils/ParamUtils.h @@ -17,7 +17,7 @@ inline std::vector _expand_param_if_needed( std::ostringstream ss; ss << "expected " << param_name << " to be a single integer value or a " << "list of " << expected_dim << " values to match the convolution " - << "dimensions, but got " << param_name << '=' << list_param; + << "dimensions, but got " << param_name << "=" << list_param; TORCH_CHECK(false, ss.str()); } else { return list_param.vec(); diff --git a/aten/src/ATen/native/vulkan/api/Adapter.cpp b/aten/src/ATen/native/vulkan/api/Adapter.cpp index 350df39ea3684..173479a0c2de0 100644 --- a/aten/src/ATen/native/vulkan/api/Adapter.cpp +++ b/aten/src/ATen/native/vulkan/api/Adapter.cpp @@ -358,9 +358,9 @@ std::string Adapter::stringize() const { std::string device_type = get_device_type_str(properties.deviceType); VkPhysicalDeviceLimits limits = properties.limits; - ss << '{' << std::endl; + ss << "{" << std::endl; ss << " Physical Device Info {" 
<< std::endl; - ss << " apiVersion: " << v_major << '.' << v_minor << std::endl; + ss << " apiVersion: " << v_major << "." << v_minor << std::endl; ss << " driverversion: " << properties.driverVersion << std::endl; ss << " deviceType: " << device_type << std::endl; ss << " deviceName: " << properties.deviceName << std::endl; @@ -371,7 +371,7 @@ std::string Adapter::stringize() const { #define PRINT_LIMIT_PROP_VEC3(name) \ ss << " " << std::left << std::setw(36) << #name << limits.name[0] \ - << ',' << limits.name[1] << ',' << limits.name[2] << std::endl; + << "," << limits.name[1] << "," << limits.name[2] << std::endl; ss << " Physical Device Limits {" << std::endl; PRINT_LIMIT_PROP(maxImageDimension1D); @@ -425,7 +425,7 @@ std::string Adapter::stringize() const { ; } ss << " ]" << std::endl; - ss << '}'; + ss << "}"; return ss.str(); } diff --git a/aten/src/ATen/native/vulkan/api/Exception.cpp b/aten/src/ATen/native/vulkan/api/Exception.cpp index 436b38cbba6c6..9b8b653e0619e 100644 --- a/aten/src/ATen/native/vulkan/api/Exception.cpp +++ b/aten/src/ATen/native/vulkan/api/Exception.cpp @@ -33,7 +33,7 @@ std::ostream& operator<<(std::ostream& out, const VkResult result) { VK_RESULT_CASE(VK_ERROR_FORMAT_NOT_SUPPORTED) VK_RESULT_CASE(VK_ERROR_FRAGMENTED_POOL) default: - out << "VK_ERROR_UNKNOWN (VkResult " << result << ')'; + out << "VK_ERROR_UNKNOWN (VkResult " << result << ")"; break; } return out; @@ -46,7 +46,7 @@ std::ostream& operator<<(std::ostream& out, const VkResult result) { // std::ostream& operator<<(std::ostream& out, const SourceLocation& loc) { - out << loc.function << " at " << loc.file << ':' << loc.line; + out << loc.function << " at " << loc.file << ":" << loc.line; return out; } @@ -66,7 +66,7 @@ Error::Error(SourceLocation source_location, const char* cond, std::string msg) : msg_(std::move(msg)), source_location_{source_location} { std::ostringstream oss; oss << "Exception raised from " << source_location_ << ": "; - oss << '(' << cond << ") is false! "; + oss << "(" << cond << ") is false! 
"; oss << msg_; what_ = oss.str(); } diff --git a/aten/src/ATen/native/vulkan/api/QueryPool.cpp b/aten/src/ATen/native/vulkan/api/QueryPool.cpp index 63c163aa44aa9..bfa92357daeed 100644 --- a/aten/src/ATen/native/vulkan/api/QueryPool.cpp +++ b/aten/src/ATen/native/vulkan/api/QueryPool.cpp @@ -173,8 +173,8 @@ void QueryPool::extract_results() { static std::string stringize(const VkExtent3D& extents) { std::stringstream ss; - ss << '{' << extents.width << ", " << extents.height << ", " << extents.depth - << '}'; + ss << "{" << extents.width << ", " << extents.height << ", " << extents.depth + << "}"; return ss.str(); } diff --git a/aten/src/ATen/native/vulkan/api/Runtime.cpp b/aten/src/ATen/native/vulkan/api/Runtime.cpp index a7485b706c54e..cf8402e40a0b8 100644 --- a/aten/src/ATen/native/vulkan/api/Runtime.cpp +++ b/aten/src/ATen/native/vulkan/api/Runtime.cpp @@ -149,7 +149,7 @@ VKAPI_ATTR VkBool32 VKAPI_CALL debug_report_callback_fn( (void)flags; std::stringstream stream; - stream << layer_prefix << ' ' << message_code << ' ' << message << std::endl; + stream << layer_prefix << " " << message_code << " " << message << std::endl; const std::string log = stream.str(); std::cout << log; diff --git a/aten/src/ATen/native/vulkan/api/Utils.h b/aten/src/ATen/native/vulkan/api/Utils.h index 8cd6a74c1c467..3172c9c461079 100644 --- a/aten/src/ATen/native/vulkan/api/Utils.h +++ b/aten/src/ATen/native/vulkan/api/Utils.h @@ -253,7 +253,7 @@ using vec4 = vec<4u>; // uvec3 is the type representing tensor extents. Useful for debugging. inline std::ostream& operator<<(std::ostream& os, const uvec3& v) { - os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ')'; + os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ")"; return os; } diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index 33fe4121a040e..0937de4552821 100644 --- a/aten/src/ATen/test/basic.cpp +++ b/aten/src/ATen/test/basic.cpp @@ -246,7 +246,7 @@ void TestToCFloat() { void TestToString() { Tensor b = ones({3, 7}) * .0000001f; std::stringstream s; - s << b << '\n'; + s << b << "\n"; std::string expect = "1e-07 *"; ASSERT_EQ_RESOLVED(s.str().substr(0, expect.size()), expect); } diff --git a/aten/src/ATen/test/scalar_test.cpp b/aten/src/ATen/test/scalar_test.cpp index a22fb0d16adf8..0d7b62b44d214 100644 --- a/aten/src/ATen/test/scalar_test.cpp +++ b/aten/src/ATen/test/scalar_test.cpp @@ -33,7 +33,7 @@ struct Foo { static void apply(Tensor a, Tensor b) { scalar_type s = 1; std::stringstream ss; - ss << "hello, dispatch: " << a.toString() << s << '\n'; + ss << "hello, dispatch: " << a.toString() << s << "\n"; auto data = (scalar_type*)a.data_ptr(); (void)data; } @@ -73,8 +73,8 @@ TEST(TestScalar, TestScalar) { Scalar bar = 3.0; Half h = bar.toHalf(); Scalar h2 = h; - cout << "H2: " << h2.toDouble() << ' ' << what.toFloat() << ' ' - << bar.toDouble() << ' ' << what.isIntegral(false) << '\n'; + cout << "H2: " << h2.toDouble() << " " << what.toFloat() << " " + << bar.toDouble() << " " << what.isIntegral(false) << "\n"; auto gen = at::detail::getDefaultCPUGenerator(); { // See Note [Acquire lock when using random generators] @@ -84,7 +84,7 @@ TEST(TestScalar, TestScalar) { } if (at::hasCUDA()) { auto t2 = zeros({4, 4}, at::kCUDA); - cout << &t2 << '\n'; + cout << &t2 << "\n"; } auto t = ones({4, 4}); @@ -129,7 +129,7 @@ TEST(TestScalar, TestScalar) { std::stringstream ss; // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) ASSERT_NO_THROW( - ss << "hello, dispatch" << 
x.toString() << s << '\n'); + ss << "hello, dispatch" << x.toString() << s << "\n"); auto data = (scalar_t*)x.data_ptr(); (void)data; }); diff --git a/aten/src/ATen/test/test_install/main.cpp b/aten/src/ATen/test/test_install/main.cpp index 3a57e0c6212bf..e9a03d2303a39 100644 --- a/aten/src/ATen/test/test_install/main.cpp +++ b/aten/src/ATen/test/test_install/main.cpp @@ -1,5 +1,5 @@ #include int main() { - std::cout << at::ones({3,4}, at::CPU(at::kFloat)) << '\n'; + std::cout << at::ones({3,4}, at::CPU(at::kFloat)) << "\n"; } diff --git a/aten/src/ATen/test/vec_test_all_types.cpp b/aten/src/ATen/test/vec_test_all_types.cpp index c0c05c1484175..da0da76109569 100644 --- a/aten/src/ATen/test/vec_test_all_types.cpp +++ b/aten/src/ATen/test/vec_test_all_types.cpp @@ -1828,9 +1828,9 @@ namespace { #endif EXPECT_EQ(u16, c10::detail::fp16_ieee_from_fp32_value(f32s[i])) - << "Test failed for float to uint16 " << f32s[i] << '\n'; + << "Test failed for float to uint16 " << f32s[i] << "\n"; EXPECT_EQ(x, c10::detail::fp16_ieee_to_fp32_value(u16)) - << "Test failed for uint16 to float " << u16 << '\n'; + << "Test failed for uint16 to float " << u16 << "\n"; } } TEST(FP8E4M3Test, FP8E4M3ConversionFloat) { @@ -1848,10 +1848,10 @@ namespace { EXPECT_TRUE(std::isnan(f32)); } else { EXPECT_EQ(f32, c10::detail::fp8e4m3fn_to_fp32_value(input)) - << "Test failed for u8 to float " << input << '\n'; + << "Test failed for u8 to float " << input << "\n"; } EXPECT_EQ(u8, c10::detail::fp8e4m3fn_from_fp32_value(f32)) - << "Test failed for float to u8 " << f32 << '\n'; + << "Test failed for float to u8 " << f32 << "\n"; } } TEST(FP8E4M3Test, FP8E4M3BinaryAdd) { @@ -2015,10 +2015,10 @@ namespace { EXPECT_TRUE(std::isnan(f32)); } else { EXPECT_EQ(f32, c10::detail::fp8e5m2_to_fp32_value(input)) - << "Test failed for u8 to float " << input << '\n'; + << "Test failed for u8 to float " << input << "\n"; } EXPECT_EQ(u8, c10::detail::fp8e5m2_from_fp32_value(f32)) - << "Test failed for float to u8 " << f32 << '\n'; + << "Test failed for float to u8 " << f32 << "\n"; } } TEST(FP8E5M2Test, FP8E5M2BinaryAdd) { diff --git a/aten/src/ATen/test/vitals.cpp b/aten/src/ATen/test/vitals.cpp index eaf1cc152bc37..cc93775bb5383 100644 --- a/aten/src/ATen/test/vitals.cpp +++ b/aten/src/ATen/test/vitals.cpp @@ -19,7 +19,7 @@ TEST(Vitals, Basic) { c10::utils::set_env("TORCH_VITAL", "1"); TORCH_VITAL_DEFINE(Testing); TORCH_VITAL(Testing, Attribute0) << 1; - TORCH_VITAL(Testing, Attribute1) << '1'; + TORCH_VITAL(Testing, Attribute1) << "1"; TORCH_VITAL(Testing, Attribute2) << 1.0f; TORCH_VITAL(Testing, Attribute3) << 1.0; auto t = at::ones({1, 1}); diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp index 29f01fbd78c51..396ea59d2f008 100644 --- a/aten/src/ATen/test/vulkan_api_test.cpp +++ b/aten/src/ATen/test/vulkan_api_test.cpp @@ -129,14 +129,14 @@ void showRtol(const at::Tensor& a, const at::Tensor& b) { std::cout << "Max Diff allowed: " << maxDiff << std::endl; if (diff.sizes().size() == 2) { for (const auto y : c10::irange(diff.sizes()[0])) { - std::cout << y << ':'; + std::cout << y << ":"; for (const auto x : c10::irange(diff.sizes()[1])) { float diff_xy = diff[y][x].item(); if (diff_xy > maxDiff) { std::cout << std::setw(5) << x; } else { - std::cout << std::setw(5) << ' '; + std::cout << std::setw(5) << " "; } } std::cout << std::endl; @@ -3276,7 +3276,7 @@ TEST_F(VulkanAPITest, masked_fill_invalidinputs_exceptions) { void print_shape(const std::vector& shape) { for (const auto& num : shape) { - 
std::cout << num << ' '; + std::cout << num << " "; } } @@ -3367,7 +3367,7 @@ void test_masked_fill_scalar( print_shape(tmp_curr_input_shape); std::cout << "], and mask of shape ["; print_shape(tmp_curr_mask_shape); - std::cout << ']' << std::endl; + std::cout << "]" << std::endl; } ASSERT_TRUE(check); @@ -4542,9 +4542,9 @@ void test_softmax(const at::IntArrayRef shape, bool log_softmax = false) { if (!check) { std::cout << "Softmax test failed on axis " << dim << "for tensor dims {"; for (uint32_t place = 0; place < shape.size() - 1; place++) { - std::cout << shape[place] << ' '; + std::cout << shape[place] << " "; } - std::cout << shape.back() << '}' << std::endl; + std::cout << shape.back() << "}" << std::endl; showRtol(out_cpu, out_vulkan.cpu()); } ASSERT_TRUE(check); diff --git a/aten/src/ATen/test/vulkan_quantized_api_test.cpp b/aten/src/ATen/test/vulkan_quantized_api_test.cpp index 2eff421a64ced..2829aed94def9 100644 --- a/aten/src/ATen/test/vulkan_quantized_api_test.cpp +++ b/aten/src/ATen/test/vulkan_quantized_api_test.cpp @@ -95,7 +95,7 @@ void showRtol( std::cout << "Max Diff found is: " << diff.max().item() << std::endl; if (diff.sizes().size() == 2) { for (const auto y : c10::irange(diff.sizes()[0])) { - std::cout << y << ':'; + std::cout << y << ":"; for (const auto x : c10::irange(diff.sizes()[1])) { double diff_xy = diff[y][x].item(); if (diff_xy > maxDiff) { @@ -109,7 +109,7 @@ void showRtol( } } } else { - std::cout << std::setw(5) << ' '; + std::cout << std::setw(5) << " "; } } std::cout << std::endl; @@ -148,19 +148,19 @@ using at::native::vulkan::api::utils::ivec4; using at::native::vulkan::api::utils::vec4; std::ostream& operator<<(std::ostream& os, const vec4& v) { - os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", " - << v.data[3u] << ')'; + os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", " + << v.data[3u] << ")"; return os; } std::ostream& operator<<(std::ostream& os, const ivec3& v) { - os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ')'; + os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ")"; return os; } std::ostream& operator<<(std::ostream& os, const ivec4& v) { - os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", " - << v.data[3u] << ')'; + os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", " + << v.data[3u] << ")"; return os; } @@ -3379,51 +3379,51 @@ bool _test_quantized_linear( showRtol(out_cpu_dequant, out_vk_to_cpu_dequant); } if (xpos != -1 && ypos != -1) { - std::cout << "\nFailure caused on row/col: " << ypos << '/' << xpos - << '\n'; + std::cout << "\nFailure caused on row/col: " << ypos << "/" << xpos + << "\n"; std::cout << "Input tensor scale: " << scale << " zerop: " << zero_point - << '\n'; - std::cout << "Input tensor row " << ypos << '\n'; + << "\n"; + std::cout << "Input tensor row " << ypos << "\n"; for (int i = 0; i < input_cpu.sizes()[1]; i++) { std::cout << input_cpu[ypos][i].item() << ", "; } - std::cout << '\n'; + std::cout << "\n"; std::cout << "Weight tensor scale: " << w_scale - << " zerop: " << w_zero_point << '\n'; - std::cout << "Weight tensor col " << xpos << '\n'; + << " zerop: " << w_zero_point << "\n"; + std::cout << "Weight tensor col " << xpos << "\n"; for (int i = 0; i < weight.sizes()[1]; i++) { std::cout << weight[xpos][i].item() << ", "; } - std::cout << '\n'; + std::cout << "\n"; std::cout << "Input tensor quantized row " << ypos << " with dtype " - << (input_quant_dtype_int8 ? 
"QInt8" : "QUInt8") << '\n'; + << (input_quant_dtype_int8 ? "QInt8" : "QUInt8") << "\n"; for (int i = 0; i < input_cpu.sizes()[1]; i++) { std::cout << input_cpu_quantized[ypos][i].item() << ", "; } - std::cout << '\n'; + std::cout << "\n"; std::cout << "Weight tensor quantized col " << xpos << " with dtype " - << (weight_quant_dtype_int8 ? "QInt8" : "QUInt8") << '\n'; + << (weight_quant_dtype_int8 ? "QInt8" : "QUInt8") << "\n"; for (int i = 0; i < weight.sizes()[1]; i++) { std::cout << weight_cpu_quantized[xpos][i].item() << ", "; } - std::cout << '\n'; + std::cout << "\n"; std::cout << "bias tensor\n"; for (int i = 0; i < bias.sizes()[0]; i++) { std::cout << bias[i].item() << ", "; } - std::cout << '\n'; + std::cout << "\n"; std::cout << "out_scale: " << out_scale - << " out_zero_point: " << out_zero_point << '\n'; + << " out_zero_point: " << out_zero_point << "\n"; std::cout << "cpu unmatched output: " - << out_cpu_dequant[ypos][xpos].item() << '\n'; + << out_cpu_dequant[ypos][xpos].item() << "\n"; std::cout << "vk unmatched output: " - << out_vk_to_cpu_dequant[ypos][xpos].item() << '\n'; + << out_vk_to_cpu_dequant[ypos][xpos].item() << "\n"; } } return check; diff --git a/c10/core/DispatchKeySet.cpp b/c10/core/DispatchKeySet.cpp index d1ec51b6a47d6..107530e9e28a2 100644 --- a/c10/core/DispatchKeySet.cpp +++ b/c10/core/DispatchKeySet.cpp @@ -176,7 +176,7 @@ std::ostream& operator<<(std::ostream& os, DispatchKeySet ts) { os << k; first = false; } - os << ')'; + os << ")"; return os; } diff --git a/c10/core/TensorOptions.cpp b/c10/core/TensorOptions.cpp index b1a90cce30edc..d3282ae7114e5 100644 --- a/c10/core/TensorOptions.cpp +++ b/c10/core/TensorOptions.cpp @@ -33,7 +33,7 @@ std::ostream& operator<<(std::ostream& stream, const TensorOptions& options) { } else { stream << "(nullopt)"; } - stream << ')'; + stream << ")"; return stream; } diff --git a/c10/cuda/CUDADeviceAssertionHost.cpp b/c10/cuda/CUDADeviceAssertionHost.cpp index 08e657a411614..9b7c3568a9833 100644 --- a/c10/cuda/CUDADeviceAssertionHost.cpp +++ b/c10/cuda/CUDADeviceAssertionHost.cpp @@ -136,7 +136,7 @@ std::string c10_retrieve_device_side_assertion_info() { // Something failed, let's talk about that oss << failures_found << " CUDA device-side assertion failures were found on GPU #" - << device_num << '!' << std::endl; + << device_num << "!" 
<< std::endl; if (assertion_data_for_device.assertion_count > C10_CUDA_DSA_ASSERTION_COUNT) { oss << "But at least " << assertion_data_for_device.assertion_count @@ -151,17 +151,17 @@ std::string c10_retrieve_device_side_assertion_info() { oss << "Assertion failure " << i << std::endl; oss << " GPU assertion failure message = " << self.assertion_msg << std::endl; - oss << " File containing assertion = " << self.filename << ':' + oss << " File containing assertion = " << self.filename << ":" << self.line_number << std::endl; oss << " Device function containing assertion = " << self.function_name << std::endl; - oss << " Thread ID that failed assertion = [" << self.thread_id[0] << ',' - << self.thread_id[1] << ',' << self.thread_id[2] << ']' << std::endl; - oss << " Block ID that failed assertion = [" << self.block_id[0] << ',' - << self.block_id[1] << ',' << self.block_id[2] << ']' << std::endl; + oss << " Thread ID that failed assertion = [" << self.thread_id[0] << "," + << self.thread_id[1] << "," << self.thread_id[2] << "]" << std::endl; + oss << " Block ID that failed assertion = [" << self.block_id[0] << "," + << self.block_id[1] << "," << self.block_id[2] << "]" << std::endl; if (launch_info.generation_number == self.caller) { oss << " File containing kernel launch = " - << launch_info.launch_filename << ':' << launch_info.launch_linenum + << launch_info.launch_filename << ":" << launch_info.launch_linenum << std::endl; oss << " Function containing kernel launch = " << launch_info.launch_function << std::endl; @@ -175,7 +175,7 @@ std::string c10_retrieve_device_side_assertion_info() { if (launch_registry.gather_launch_stacktrace) { oss << "Launch stacktracing disabled." << std::endl; } else { - oss << '\n' << launch_info.launch_stacktrace << std::endl; + oss << "\n" << launch_info.launch_stacktrace << std::endl; } } else { oss << " CPU launch site info: Unavailable, the circular queue wrapped around. Increase `CUDAKernelLaunchRegistry::max_size`." 
diff --git a/c10/test/core/DispatchKeySet_test.cpp b/c10/test/core/DispatchKeySet_test.cpp index cdbdc150167e0..a93461a041c39 100644 --- a/c10/test/core/DispatchKeySet_test.cpp +++ b/c10/test/core/DispatchKeySet_test.cpp @@ -435,7 +435,7 @@ TEST(DispatchKeySet, TestFunctionalityDispatchKeyToString) { if (i > 0) { ASSERT_TRUE(res.find("Unknown") == std::string::npos) << i << " (before is " << toString(static_cast(i - 1)) - << ')'; + << ")"; } else { ASSERT_TRUE(res.find("Unknown") == std::string::npos) << i; } diff --git a/c10/test/util/Half_test.cpp b/c10/test/util/Half_test.cpp index 33c77ead61fc8..a76814615101b 100644 --- a/c10/test/util/Half_test.cpp +++ b/c10/test/util/Half_test.cpp @@ -96,10 +96,10 @@ TEST(HalfConversionTest, TestPorableConversion) { for (auto x : inputs) { auto target = c10::detail::fp16_ieee_to_fp32_value(x); EXPECT_EQ(halfbits2float(x), target) - << "Test failed for uint16 to float " << x << '\n'; + << "Test failed for uint16 to float " << x << "\n"; EXPECT_EQ( float2halfbits(target), c10::detail::fp16_ieee_from_fp32_value(target)) - << "Test failed for float to uint16" << target << '\n'; + << "Test failed for float to uint16" << target << "\n"; } } diff --git a/c10/test/util/logging_test.cpp b/c10/test/util/logging_test.cpp index 4587130564dfc..b8fc81ddc6bbe 100644 --- a/c10/test/util/logging_test.cpp +++ b/c10/test/util/logging_test.cpp @@ -98,7 +98,7 @@ struct Noncopyable { }; std::ostream& operator<<(std::ostream& out, const Noncopyable& nc) { - out << "Noncopyable(" << nc.x << ')'; + out << "Noncopyable(" << nc.x << ")"; return out; } } // namespace diff --git a/c10/util/ArrayRef.h b/c10/util/ArrayRef.h index 55900b6ee43c6..bbbb1d7288fdd 100644 --- a/c10/util/ArrayRef.h +++ b/c10/util/ArrayRef.h @@ -204,13 +204,13 @@ ArrayRef(const std::initializer_list&) -> ArrayRef; template std::ostream& operator<<(std::ostream& out, ArrayRef list) { int i = 0; - out << '['; + out << "["; for (const auto& e : list) { if (i++ > 0) out << ", "; out << e; } - out << ']'; + out << "]"; return out; } diff --git a/c10/util/Backtrace.cpp b/c10/util/Backtrace.cpp index 29dbfe427ae01..8838cafb029e4 100644 --- a/c10/util/Backtrace.cpp +++ b/c10/util/Backtrace.cpp @@ -106,8 +106,8 @@ class GetBacktraceImpl { /*length*/ &length, /*status*/ &status); - os << " frame #" << idx++ << '\t' - << ((demangled != NULL && status == 0) ? demangled : symbol) << '[' + os << " frame #" << idx++ << "\t" + << ((demangled != NULL && status == 0) ? demangled : symbol) << "[" << addr << "]\t" << std::endl; } free(demangled); @@ -274,7 +274,7 @@ class GetBacktraceImpl { } else { // In the edge-case where we couldn't parse the frame string, we can // just use it directly (it may have a different format). - stream << symbols[frame_number] << '\n'; + stream << symbols[frame_number] << "\n"; } } @@ -413,8 +413,8 @@ class GetBacktraceImpl { << back_trace_[i_frame] << std::dec; if (with_symbol) { stream << std::setfill('0') << std::setw(16) << std::uppercase - << std::hex << p_symbol->Address << std::dec << ' ' << module - << '!' << p_symbol->Name; + << std::hex << p_symbol->Address << std::dec << " " << module + << "!" 
<< p_symbol->Name; } else { stream << " " << module << "!"; } @@ -424,7 +424,7 @@ class GetBacktraceImpl { } else { stream << " @ "; } - stream << ']' << std::endl; + stream << "]" << std::endl; } return stream.str(); diff --git a/c10/util/Exception.cpp b/c10/util/Exception.cpp index c8470893d9f57..cccdb28607141 100644 --- a/c10/util/Exception.cpp +++ b/c10/util/Exception.cpp @@ -45,7 +45,7 @@ std::string Error::compute_what(bool include_backtrace) const { if (context_.size() == 1) { // Fold error and context in one line - oss << " (" << context_[0] << ')'; + oss << " (" << context_[0] << ")"; } else { for (const auto& c : context_) { oss << "\n " << c; @@ -53,7 +53,7 @@ std::string Error::compute_what(bool include_backtrace) const { } if (include_backtrace && backtrace_) { - oss << '\n' << backtrace_->get(); + oss << "\n" << backtrace_->get(); } return oss.str(); @@ -248,7 +248,7 @@ void WarningHandler::process(const Warning& warning) { LOG_AT_FILE_LINE( WARNING, warning.source_location().file, warning.source_location().line) << "Warning: " << warning.msg() << " (function " - << warning.source_location().function << ')'; + << warning.source_location().function << ")"; } std::string GetExceptionString(const std::exception& e) { diff --git a/c10/util/Logging.cpp b/c10/util/Logging.cpp index 0ae1e78637588..b95eaec9d3ebb 100644 --- a/c10/util/Logging.cpp +++ b/c10/util/Logging.cpp @@ -474,12 +474,12 @@ MessageLogger::MessageLogger( if (GLOBAL_RANK != -1) { stream_ << "[rank" << GLOBAL_RANK << "]:"; } - stream_ << '[' << CAFFE2_SEVERITY_PREFIX[std::min(4, GLOG_FATAL - severity_)] + stream_ << "[" << CAFFE2_SEVERITY_PREFIX[std::min(4, GLOG_FATAL - severity_)] << (timeinfo->tm_mon + 1) * 100 + timeinfo->tm_mday - << std::setfill('0') << ' ' << std::setw(2) << timeinfo->tm_hour - << ':' << std::setw(2) << timeinfo->tm_min << ':' << std::setw(2) - << timeinfo->tm_sec << '.' << std::setw(9) << ns << ' ' - << c10::filesystem::path(file).filename() << ':' << line << "] "; + << std::setfill('0') << " " << std::setw(2) << timeinfo->tm_hour + << ":" << std::setw(2) << timeinfo->tm_min << ":" << std::setw(2) + << timeinfo->tm_sec << "." << std::setw(9) << ns << " " + << c10::filesystem::path(file).filename() << ":" << line << "] "; } // Output the contents of the stream to the proper channel on destruction. @@ -488,7 +488,7 @@ MessageLogger::~MessageLogger() noexcept(false) { // Nothing needs to be logged. 
return; } - stream_ << '\n'; + stream_ << "\n"; #ifdef ANDROID static const int android_log_levels[] = { ANDROID_LOG_FATAL, // LOG_FATAL diff --git a/c10/util/SmallVector.h b/c10/util/SmallVector.h index d47f37cdf7eca..d02c9380a563d 100644 --- a/c10/util/SmallVector.h +++ b/c10/util/SmallVector.h @@ -1412,13 +1412,13 @@ inline size_t capacity_in_bytes(const SmallVector& X) { template std::ostream& operator<<(std::ostream& out, const SmallVector& list) { int i = 0; - out << '['; + out << "["; for (auto e : list) { if (i++ > 0) out << ", "; out << e; } - out << ']'; + out << "]"; return out; } diff --git a/c10/util/StringUtil.cpp b/c10/util/StringUtil.cpp index 6fae2f004cc93..063a8fc93ea7a 100644 --- a/c10/util/StringUtil.cpp +++ b/c10/util/StringUtil.cpp @@ -79,7 +79,7 @@ std::ostream& _str(std::ostream& ss, const std::wstring& wString) { } // namespace detail std::ostream& operator<<(std::ostream& out, const SourceLocation& loc) { - out << loc.function << " at " << loc.file << ':' << loc.line; + out << loc.function << " at " << loc.file << ":" << loc.line; return out; } diff --git a/c10/util/StringUtil.h b/c10/util/StringUtil.h index de241bc9f7c45..cbc6f4ec336bb 100644 --- a/c10/util/StringUtil.h +++ b/c10/util/StringUtil.h @@ -170,7 +170,7 @@ inline bool isPrint(char s) { } inline void printQuotedString(std::ostream& stmt, const std::string_view str) { - stmt << '"'; + stmt << "\""; for (auto s : str) { switch (s) { case '\\': @@ -224,7 +224,7 @@ inline void printQuotedString(std::ostream& stmt, const std::string_view str) { break; } } - stmt << '"'; + stmt << "\""; } template diff --git a/c10/util/signal_handler.cpp b/c10/util/signal_handler.cpp index bfb04e1ccbc36..831c0d0245245 100644 --- a/c10/util/signal_handler.cpp +++ b/c10/util/signal_handler.cpp @@ -223,7 +223,7 @@ void FatalSignalHandler::fatalSignalHandler(int signum) { // a single thread that wouldn't receive the SIGUSR2 if (std::cv_status::timeout == writingCond.wait_for(ul, 2s)) { if (!signalReceived) { - std::cerr << "signal lost waiting for stacktrace " << pid << ':' + std::cerr << "signal lost waiting for stacktrace " << pid << ":" << tid << '\n'; break; } diff --git a/c10/util/sparse_bitset.h b/c10/util/sparse_bitset.h index e7ad1db06d6f7..c8eb0df47f6ae 100644 --- a/c10/util/sparse_bitset.h +++ b/c10/util/sparse_bitset.h @@ -877,7 +877,7 @@ std::ostream& operator<<( std::ostream& stream, const SparseBitVector& vec) { bool first = true; - stream << '{'; + stream << "{"; for (auto el : vec) { if (first) { first = false; @@ -886,7 +886,7 @@ std::ostream& operator<<( } stream << el; } - stream << '}'; + stream << "}"; return stream; } diff --git a/torch/csrc/DataLoader.cpp b/torch/csrc/DataLoader.cpp index 31cec72d8a1c3..a6ad3f00b2782 100644 --- a/torch/csrc/DataLoader.cpp +++ b/torch/csrc/DataLoader.cpp @@ -61,7 +61,7 @@ static void setSignalHandler( sigaction(signal, &sa, old_sa_ptr) != 0) { std::ostringstream oss; oss << "An error occurred while setting handler for " << strsignal(signal) - << '.'; + << "."; TORCH_CHECK(false, oss.str()); } } diff --git a/torch/csrc/Device.cpp b/torch/csrc/Device.cpp index da7b287369dab..f3babe4cd72bb 100644 --- a/torch/csrc/Device.cpp +++ b/torch/csrc/Device.cpp @@ -29,14 +29,14 @@ PyObject* THPDevice_New(const at::Device& device) { static PyObject* THPDevice_repr(THPDevice* self) { std::ostringstream oss; - oss << "device(type=\'" << self->device.type() << '\''; + oss << "device(type=\'" << self->device.type() << "\'"; if (self->device.has_index()) { // `self->device.index()` returns 
uint8_t which is treated as ascii while
    // printing, hence casting it to uint16_t.
    // https://stackoverflow.com/questions/19562103/uint8-t-cant-be-printed-with-cout
    oss << ", index=" << static_cast<uint16_t>(self->device.index());
  }
-  oss << ')';
+  oss << ")";
   return THPUtils_packString(oss.str().c_str());
 }
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp
index 4de6ba3976688..42a2e5c526742 100644
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@@ -212,8 +212,8 @@ static PyObject* THPModule_initExtension(
       }
       auto frame_id = s_tb[idx];
       const auto& frame = s_tbs.all_frames.at(frame_id);
-      oss << '#' << idx << ' ' << frame.funcname << " from " << frame.filename
-          << ':' << frame.lineno << '\n';
+      oss << "#" << idx << " " << frame.funcname << " from " << frame.filename
+          << ":" << frame.lineno << '\n';
     }
     return oss.str();
   });
@@ -2772,8 +2772,8 @@ Call this whenever a new thread is created in order to propagate values from
   py_module.def("_dump_local_tls_set", []() {
     auto local_keyset = c10::impl::tls_local_dispatch_key_set();
-    std::cout << "Included: " << toString(local_keyset.included_) << '\n';
-    std::cout << "Excluded: " << toString(local_keyset.excluded_) << '\n';
+    std::cout << "Included: " << toString(local_keyset.included_) << "\n";
+    std::cout << "Excluded: " << toString(local_keyset.excluded_) << "\n";
   });
   py_module.def(
diff --git a/torch/csrc/TypeInfo.cpp b/torch/csrc/TypeInfo.cpp
index de23b79536033..6874374eff768 100644
--- a/torch/csrc/TypeInfo.cpp
+++ b/torch/csrc/TypeInfo.cpp
@@ -254,7 +254,7 @@ static PyObject* THPFInfo_str(THPFInfo* self) {
      << PyFloat_AsDouble(THPFInfo_smallest_normal(self, nullptr));
   oss << ", tiny=" << PyFloat_AsDouble(THPFInfo_tiny(self, nullptr));
   if (dtypeStr != nullptr) {
-    oss << ", dtype=" << PyUnicode_AsUTF8(dtypeStr) << ')';
+    oss << ", dtype=" << PyUnicode_AsUTF8(dtypeStr) << ")";
   }
   return !PyErr_Occurred() ? THPUtils_packString(oss.str().c_str()) : nullptr;
 }
@@ -266,7 +266,7 @@ static PyObject* THPIInfo_str(THPIInfo* self) {
   oss << "iinfo(min=" << PyLong_AsDouble(THPIInfo_min(self, nullptr));
   oss << ", max=" << PyLong_AsDouble(THPIInfo_max(self, nullptr));
   if (dtypeStr) {
-    oss << ", dtype=" << PyUnicode_AsUTF8(dtypeStr) << ')';
+    oss << ", dtype=" << PyUnicode_AsUTF8(dtypeStr) << ")";
   }
   return !PyErr_Occurred() ?
THPUtils_packString(oss.str().c_str()) : nullptr; diff --git a/torch/csrc/api/include/torch/detail/TensorDataContainer.h b/torch/csrc/api/include/torch/detail/TensorDataContainer.h index 152672c7f3f21..9485af1d297d2 100644 --- a/torch/csrc/api/include/torch/detail/TensorDataContainer.h +++ b/torch/csrc/api/include/torch/detail/TensorDataContainer.h @@ -271,7 +271,7 @@ struct TensorDataContainer { "TensorDataContainer_pretty_print_scalar", [&] { stream << scalar_.to(); }); } else if (is_init_list()) { - stream << '{'; + stream << "{"; for (const TensorDataContainer* it = init_list_.begin(); it != init_list_.end(); it++) { @@ -279,9 +279,9 @@ struct TensorDataContainer { if (std::next(it) != init_list_.end()) stream << ", "; } - stream << '}'; + stream << "}"; } else if (is_tensor()) { - stream << '{'; + stream << "{"; for (const auto i : c10::irange(tensor_.sizes()[0])) { AT_DISPATCH_ALL_TYPES_AND3( at::kBool, @@ -293,7 +293,7 @@ struct TensorDataContainer { if (i != tensor_.sizes()[0] - 1) stream << ", "; } - stream << '}'; + stream << "}"; } else { TORCH_INTERNAL_ASSERT(false, "Invalid TensorDataContainer type"); } diff --git a/torch/csrc/api/include/torch/nn/modules/batchnorm.h b/torch/csrc/api/include/torch/nn/modules/batchnorm.h index a0456578da0e7..8437ffd7afb8e 100644 --- a/torch/csrc/api/include/torch/nn/modules/batchnorm.h +++ b/torch/csrc/api/include/torch/nn/modules/batchnorm.h @@ -145,7 +145,7 @@ class BatchNormImplBase : public NormImplBase { stream << ", " << "affine=" << this->options.affine() << ", " << "track_running_stats=" << this->options.track_running_stats() - << ')'; + << ")"; } }; diff --git a/torch/csrc/api/include/torch/nn/modules/container/parameterdict.h b/torch/csrc/api/include/torch/nn/modules/container/parameterdict.h index 72cc777cd5c0e..008d790fdece1 100644 --- a/torch/csrc/api/include/torch/nn/modules/container/parameterdict.h +++ b/torch/csrc/api/include/torch/nn/modules/container/parameterdict.h @@ -28,13 +28,13 @@ class ParameterDictImpl : public Cloneable { void pretty_print(std::ostream& stream) const override { stream << "torch::nn::ParameterDict(" << '\n'; for (const auto& pair : parameters_) { - stream << '(' << pair.key() << ')' << ": Parameter containing: [" - << pair.value().scalar_type() << " of size " - << pair.value().sizes() << ']'; + stream << "(" << pair.key() << ")" + << ": Parameter containing: [" << pair.value().scalar_type() + << " of size " << pair.value().sizes() << "]"; ; stream << '\n'; } - stream << ')'; + stream << ")"; } /// Insert the parameter along with the key into ParameterDict diff --git a/torch/csrc/api/include/torch/nn/modules/container/parameterlist.h b/torch/csrc/api/include/torch/nn/modules/container/parameterlist.h index c42215715406d..198172ab56489 100644 --- a/torch/csrc/api/include/torch/nn/modules/container/parameterlist.h +++ b/torch/csrc/api/include/torch/nn/modules/container/parameterlist.h @@ -36,13 +36,13 @@ class ParameterListImpl : public Cloneable { void pretty_print(std::ostream& stream) const override { stream << "torch::nn::ParameterList(" << '\n'; for (const auto& pair : parameters_) { - stream << '(' << pair.key() << ')' << ": Parameter containing: [" - << pair.value().scalar_type() << " of size " - << pair.value().sizes() << ']'; + stream << "(" << pair.key() << ")" + << ": Parameter containing: [" << pair.value().scalar_type() + << " of size " << pair.value().sizes() << "]"; ; stream << '\n'; } - stream << ')'; + stream << ")"; } /// push the a given parameter at the end of the list diff --git 
a/torch/csrc/api/include/torch/nn/modules/conv.h b/torch/csrc/api/include/torch/nn/modules/conv.h index 56fb6023ed4b5..8c5f1f3e39182 100644 --- a/torch/csrc/api/include/torch/nn/modules/conv.h +++ b/torch/csrc/api/include/torch/nn/modules/conv.h @@ -113,8 +113,8 @@ class ConvNdImpl : public torch::nn::Cloneable { /// Pretty prints the `Conv{1,2,3}d` module into the given `stream`. void pretty_print(std::ostream& stream) const override { - stream << "torch::nn::Conv" << D << 'd' << '(' << options.in_channels() - << ", " << options.out_channels() + stream << "torch::nn::Conv" << D << "d" + << "(" << options.in_channels() << ", " << options.out_channels() << ", kernel_size=" << options.kernel_size() << ", stride=" << options.stride(); std::visit( @@ -143,7 +143,7 @@ class ConvNdImpl : public torch::nn::Cloneable { stream << ", padding_mode=" << enumtype::get_enum_name(options.padding_mode()); } - stream << ')'; + stream << ")"; } /// The options with which this `Module` was constructed. @@ -278,8 +278,8 @@ class ConvTransposeNdImpl : public ConvNdImpl { /// Pretty prints the `ConvTranspose{1,2,3}d` module into the given `stream`. void pretty_print(std::ostream& stream) const override { - stream << "torch::nn::ConvTranspose" << D << 'd' << '(' - << this->options.in_channels() << ", " + stream << "torch::nn::ConvTranspose" << D << "d" + << "(" << this->options.in_channels() << ", " << this->options.out_channels() << ", kernel_size=" << this->options.kernel_size() << ", stride=" << this->options.stride(); @@ -303,7 +303,7 @@ class ConvTransposeNdImpl : public ConvNdImpl { stream << ", padding_mode=" << enumtype::get_enum_name(this->options.padding_mode()); } - stream << ')'; + stream << ")"; } protected: diff --git a/torch/csrc/api/include/torch/nn/modules/instancenorm.h b/torch/csrc/api/include/torch/nn/modules/instancenorm.h index 492aba8e4e234..228f181715fc7 100644 --- a/torch/csrc/api/include/torch/nn/modules/instancenorm.h +++ b/torch/csrc/api/include/torch/nn/modules/instancenorm.h @@ -53,7 +53,7 @@ class InstanceNormImpl << "momentum=" << this->options.momentum() << ", " << "affine=" << this->options.affine() << ", " << "track_running_stats=" << this->options.track_running_stats() - << ')'; + << ")"; } }; diff --git a/torch/csrc/api/include/torch/nn/modules/pooling.h b/torch/csrc/api/include/torch/nn/modules/pooling.h index 4f08bf31031e6..17ed12f4cc037 100644 --- a/torch/csrc/api/include/torch/nn/modules/pooling.h +++ b/torch/csrc/api/include/torch/nn/modules/pooling.h @@ -232,8 +232,8 @@ class TORCH_API AdaptiveMaxPoolImpl : public torch::nn::Cloneable { /// Pretty prints the `AdaptiveMaxPool{1,2,3}d` module into the given /// `stream`. void pretty_print(std::ostream& stream) const override { - stream << "torch::nn::AdaptiveMaxPool" << D << 'd' - << "(output_size=" << options.output_size() << ')'; + stream << "torch::nn::AdaptiveMaxPool" << D << "d" + << "(output_size=" << options.output_size() << ")"; } /// The options with which this `Module` was constructed. @@ -365,8 +365,8 @@ class TORCH_API AdaptiveAvgPoolImpl : public torch::nn::Cloneable { /// Pretty prints the `AdaptiveAvgPool{1,2,3}d` module into the given /// `stream`. void pretty_print(std::ostream& stream) const override { - stream << "torch::nn::AdaptiveAvgPool" << D << 'd' - << "(output_size=" << options.output_size() << ')'; + stream << "torch::nn::AdaptiveAvgPool" << D << "d" + << "(output_size=" << options.output_size() << ")"; } /// The options with which this `Module` was constructed. 
diff --git a/torch/csrc/api/src/nn/module.cpp b/torch/csrc/api/src/nn/module.cpp index 5dbc36b7dd5f2..563ed4789cb12 100644 --- a/torch/csrc/api/src/nn/module.cpp +++ b/torch/csrc/api/src/nn/module.cpp @@ -355,11 +355,11 @@ void Module::pretty_print_recursive( stream << "(\n"; const std::string next_indentation = indentation + " "; for (const auto& child : children_) { - stream << next_indentation << '(' << child.key() << "): "; + stream << next_indentation << "(" << child.key() << "): "; child.value()->pretty_print_recursive(stream, next_indentation); stream << '\n'; } - stream << indentation << ')'; + stream << indentation << ")"; } } diff --git a/torch/csrc/api/src/nn/modules/activation.cpp b/torch/csrc/api/src/nn/modules/activation.cpp index 5144ea51ecee7..68949f3fb496e 100644 --- a/torch/csrc/api/src/nn/modules/activation.cpp +++ b/torch/csrc/api/src/nn/modules/activation.cpp @@ -21,7 +21,7 @@ void ELUImpl::pretty_print(std::ostream& stream) const { if (options.inplace()) { stream << std::boolalpha << ", inplace=" << options.inplace(); } - stream << ')'; + stream << ")"; } // ============================================================================ @@ -39,7 +39,7 @@ void SELUImpl::pretty_print(std::ostream& stream) const { if (options.inplace()) { stream << std::boolalpha << "inplace=" << options.inplace(); } - stream << ')'; + stream << ")"; } // ============================================================================ @@ -55,7 +55,7 @@ void HardshrinkImpl::reset() {} void HardshrinkImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::Hardshrink(" << options.lambda() - << ')'; + << ")"; } // ============================================================================ @@ -86,7 +86,7 @@ void HardtanhImpl::pretty_print(std::ostream& stream) const { if (options.inplace()) { stream << std::boolalpha << ", inplace=" << options.inplace(); } - stream << ')'; + stream << ")"; } // ============================================================================ @@ -107,7 +107,7 @@ void LeakyReLUImpl::pretty_print(std::ostream& stream) const { if (options.inplace()) { stream << std::boolalpha << ", inplace=" << options.inplace(); } - stream << ')'; + stream << ")"; } // ============================================================================ @@ -129,7 +129,7 @@ SoftmaxImpl::SoftmaxImpl(const SoftmaxOptions& options_) : options(options_) {} void SoftmaxImpl::reset() {} void SoftmaxImpl::pretty_print(std::ostream& stream) const { - stream << "torch::nn::Softmax(dim=" << options.dim() << ')'; + stream << "torch::nn::Softmax(dim=" << options.dim() << ")"; } Tensor SoftmaxImpl::forward(const Tensor& input) { @@ -143,7 +143,7 @@ SoftminImpl::SoftminImpl(const SoftminOptions& options_) : options(options_) {} void SoftminImpl::reset() {} void SoftminImpl::pretty_print(std::ostream& stream) const { - stream << "torch::nn::Softmin(dim=" << options.dim() << ')'; + stream << "torch::nn::Softmin(dim=" << options.dim() << ")"; } Tensor SoftminImpl::forward(const Tensor& input) { @@ -158,7 +158,7 @@ LogSoftmaxImpl::LogSoftmaxImpl(const LogSoftmaxOptions& options_) void LogSoftmaxImpl::reset() {} void LogSoftmaxImpl::pretty_print(std::ostream& stream) const { - stream << "torch::nn::LogSoftmax(dim=" << options.dim() << ')'; + stream << "torch::nn::LogSoftmax(dim=" << options.dim() << ")"; } Tensor LogSoftmaxImpl::forward(const Tensor& input) { @@ -197,7 +197,7 @@ void PReLUImpl::reset() { void PReLUImpl::pretty_print(std::ostream& stream) const { stream << 
"torch::nn::PReLU(num_parameters=" << options.num_parameters() - << ')'; + << ")"; } // ============================================================================ @@ -215,7 +215,7 @@ void ReLUImpl::pretty_print(std::ostream& stream) const { if (options.inplace()) { stream << std::boolalpha << "inplace=" << options.inplace(); } - stream << ')'; + stream << ")"; } // ============================================================================ @@ -233,7 +233,7 @@ void ReLU6Impl::pretty_print(std::ostream& stream) const { if (options.inplace()) { stream << std::boolalpha << "inplace=" << options.inplace(); } - stream << ')'; + stream << ")"; } // ============================================================================ @@ -257,7 +257,7 @@ void RReLUImpl::pretty_print(std::ostream& stream) const { if (options.inplace()) { stream << std::boolalpha << ", inplace=" << options.inplace(); } - stream << ')'; + stream << ")"; } // ============================================================================ @@ -275,7 +275,7 @@ void CELUImpl::pretty_print(std::ostream& stream) const { if (options.inplace()) { stream << std::boolalpha << ", inplace=" << options.inplace(); } - stream << ')'; + stream << ")"; } // ============================================================================ @@ -289,7 +289,7 @@ Tensor GLUImpl::forward(const Tensor& input) { void GLUImpl::reset() {} void GLUImpl::pretty_print(std::ostream& stream) const { - stream << "torch::nn::GLU(dim=" << options.dim() << ')'; + stream << "torch::nn::GLU(dim=" << options.dim() << ")"; } // ============================================================================ @@ -355,7 +355,7 @@ void SoftplusImpl::reset() {} void SoftplusImpl::pretty_print(std::ostream& stream) const { stream << "torch::nn::Softplus(beta=" << options.beta() - << ", threshold=" << options.threshold() << ')'; + << ", threshold=" << options.threshold() << ")"; } // ============================================================================ @@ -370,7 +370,7 @@ Tensor SoftshrinkImpl::forward(const Tensor& input) { void SoftshrinkImpl::reset() {} void SoftshrinkImpl::pretty_print(std::ostream& stream) const { - stream << "torch::nn::Softshrink(" << options.lambda() << ')'; + stream << "torch::nn::Softshrink(" << options.lambda() << ")"; } // ============================================================================ @@ -430,7 +430,7 @@ void ThresholdImpl::pretty_print(std::ostream& stream) const { if (options.inplace()) { stream << std::boolalpha << ", inplace=" << options.inplace(); } - stream << ')'; + stream << ")"; } // ============================================================================ diff --git a/torch/csrc/api/src/nn/modules/distance.cpp b/torch/csrc/api/src/nn/modules/distance.cpp index 7b45deadac947..d8e7fa8ac4003 100644 --- a/torch/csrc/api/src/nn/modules/distance.cpp +++ b/torch/csrc/api/src/nn/modules/distance.cpp @@ -12,7 +12,7 @@ void CosineSimilarityImpl::reset() {} void CosineSimilarityImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::CosineSimilarity" - << "(dim=" << options.dim() << ", eps=" << options.eps() << ')'; + << "(dim=" << options.dim() << ", eps=" << options.eps() << ")"; } Tensor CosineSimilarityImpl::forward(const Tensor& x1, const Tensor& x2) { @@ -30,7 +30,7 @@ void PairwiseDistanceImpl::reset() {} void PairwiseDistanceImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::PairwiseDistance" << "(p=" << options.p() << ", eps=" << options.eps() - << ", 
keepdim=" << options.keepdim() << ')'; + << ", keepdim=" << options.keepdim() << ")"; } Tensor PairwiseDistanceImpl::forward(const Tensor& x1, const Tensor& x2) { diff --git a/torch/csrc/api/src/nn/modules/dropout.cpp b/torch/csrc/api/src/nn/modules/dropout.cpp index 08433bf363128..2b7c5aa3a289e 100644 --- a/torch/csrc/api/src/nn/modules/dropout.cpp +++ b/torch/csrc/api/src/nn/modules/dropout.cpp @@ -19,7 +19,7 @@ Tensor DropoutImpl::forward(Tensor input) { void DropoutImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::Dropout(p=" << options.p() - << ", inplace=" << options.inplace() << ')'; + << ", inplace=" << options.inplace() << ")"; } // ============================================================================ @@ -31,7 +31,7 @@ Tensor Dropout2dImpl::forward(Tensor input) { void Dropout2dImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::Dropout2d(p=" << options.p() - << ", inplace=" << options.inplace() << ')'; + << ", inplace=" << options.inplace() << ")"; } // ============================================================================ @@ -43,7 +43,7 @@ Tensor Dropout3dImpl::forward(Tensor input) { void Dropout3dImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::Dropout3d(p=" << options.p() - << ", inplace=" << options.inplace() << ')'; + << ", inplace=" << options.inplace() << ")"; } // ============================================================================ @@ -55,7 +55,7 @@ Tensor AlphaDropoutImpl::forward(const Tensor& input) { void AlphaDropoutImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::AlphaDropout(p=" << options.p() - << ", inplace=" << options.inplace() << ')'; + << ", inplace=" << options.inplace() << ")"; } // ============================================================================ @@ -67,7 +67,7 @@ Tensor FeatureAlphaDropoutImpl::forward(const Tensor& input) { void FeatureAlphaDropoutImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::FeatureAlphaDropout(p=" << options.p() - << ", inplace=" << options.inplace() << ')'; + << ", inplace=" << options.inplace() << ")"; } } // namespace torch::nn diff --git a/torch/csrc/api/src/nn/modules/embedding.cpp b/torch/csrc/api/src/nn/modules/embedding.cpp index e704e71c97e65..b9fededfd7372 100644 --- a/torch/csrc/api/src/nn/modules/embedding.cpp +++ b/torch/csrc/api/src/nn/modules/embedding.cpp @@ -76,7 +76,7 @@ void EmbeddingImpl::pretty_print(std::ostream& stream) const { if (options.sparse()) { stream << ", sparse=" << std::boolalpha << options.sparse(); } - stream << ')'; + stream << ")"; } torch::Tensor EmbeddingImpl::forward(const Tensor& input) { @@ -181,6 +181,6 @@ void EmbeddingBagImpl::pretty_print(std::ostream& stream) const { if (padding_idx_opt.has_value()) { stream << ", padding_idx=" << padding_idx_opt.value(); } - stream << ')'; + stream << ")"; } } // namespace torch::nn diff --git a/torch/csrc/api/src/nn/modules/fold.cpp b/torch/csrc/api/src/nn/modules/fold.cpp index 43b07b84fcf27..32c83ca6e1b7f 100644 --- a/torch/csrc/api/src/nn/modules/fold.cpp +++ b/torch/csrc/api/src/nn/modules/fold.cpp @@ -17,7 +17,7 @@ void FoldImpl::pretty_print(std::ostream& stream) const { << ", kernel_size=" << options.kernel_size() << ", dilation=" << options.dilation() << ", padding=" << options.padding() << ", stride=" << options.stride() - << ')'; + << ")"; } Tensor FoldImpl::forward(const Tensor& input) { @@ -40,7 +40,7 @@ void 
UnfoldImpl::pretty_print(std::ostream& stream) const { stream << "torch::nn::Unfold(kernel_size=" << options.kernel_size() << ", dilation=" << options.dilation() << ", padding=" << options.padding() << ", stride=" << options.stride() - << ')'; + << ")"; } Tensor UnfoldImpl::forward(const Tensor& input) { diff --git a/torch/csrc/api/src/nn/modules/linear.cpp b/torch/csrc/api/src/nn/modules/linear.cpp index 6ed92d2998c24..0b31e3aa03730 100644 --- a/torch/csrc/api/src/nn/modules/linear.cpp +++ b/torch/csrc/api/src/nn/modules/linear.cpp @@ -55,7 +55,7 @@ void LinearImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::Linear(in_features=" << options.in_features() << ", out_features=" << options.out_features() - << ", bias=" << options.bias() << ')'; + << ", bias=" << options.bias() << ")"; } Tensor LinearImpl::forward(const Tensor& input) { @@ -70,7 +70,7 @@ void FlattenImpl::reset() {} void FlattenImpl::pretty_print(std::ostream& stream) const { stream << "torch::nn::Flatten(start_dim=" << options.start_dim() - << ", end_dim=" << options.end_dim() << ')'; + << ", end_dim=" << options.end_dim() << ")"; } Tensor FlattenImpl::forward(const Tensor& input) { @@ -161,7 +161,7 @@ void BilinearImpl::pretty_print(std::ostream& stream) const { << "torch::nn::Bilinear(in1_features=" << options.in1_features() << ", in2_features=" << options.in2_features() << ", out_features=" << options.out_features() - << ", bias=" << options.bias() << ')'; + << ", bias=" << options.bias() << ")"; } Tensor BilinearImpl::forward(const Tensor& input1, const Tensor& input2) { diff --git a/torch/csrc/api/src/nn/modules/loss.cpp b/torch/csrc/api/src/nn/modules/loss.cpp index 6ea9d76af8128..7cae60ac99251 100644 --- a/torch/csrc/api/src/nn/modules/loss.cpp +++ b/torch/csrc/api/src/nn/modules/loss.cpp @@ -74,7 +74,7 @@ HingeEmbeddingLossImpl::HingeEmbeddingLossImpl( void HingeEmbeddingLossImpl::reset() {} void HingeEmbeddingLossImpl::pretty_print(std::ostream& stream) const { - stream << "torch::nn::HingeEmbeddingLoss(margin=" << options.margin() << ')'; + stream << "torch::nn::HingeEmbeddingLoss(margin=" << options.margin() << ")"; } Tensor HingeEmbeddingLossImpl::forward( @@ -104,7 +104,7 @@ void MultiMarginLossImpl::pretty_print(std::ostream& stream) const { stream << "torch::nn::MultiMarginLoss(p=" << options.p() << ", margin=" << options.margin() << ", weight=" << options.weight() << ", reduction=" << enumtype::get_enum_name(options.reduction()) - << ')'; + << ")"; } Tensor MultiMarginLossImpl::forward(const Tensor& input, const Tensor& target) { @@ -126,7 +126,7 @@ CosineEmbeddingLossImpl::CosineEmbeddingLossImpl( void CosineEmbeddingLossImpl::reset() {} void CosineEmbeddingLossImpl::pretty_print(std::ostream& stream) const { - stream << "torch::nn::CosineEmbeddingLoss(margin=" << options.margin() << ')'; + stream << "torch::nn::CosineEmbeddingLoss(margin=" << options.margin() << ")"; } Tensor CosineEmbeddingLossImpl::forward( @@ -169,7 +169,7 @@ void TripletMarginLossImpl::reset() {} void TripletMarginLossImpl::pretty_print(std::ostream& stream) const { stream << "torch::nn::TripletMarginLoss(margin=" << options.margin() << ", p=" << options.p() << ", eps=" << options.eps() << std::boolalpha - << ", swap=" << options.swap() << ')'; + << ", swap=" << options.swap() << ")"; } Tensor TripletMarginLossImpl::forward( @@ -199,7 +199,7 @@ void TripletMarginWithDistanceLossImpl::pretty_print( std::ostream& stream) const { stream << "torch::nn::TripletMarginWithDistanceLoss(margin=" << 
options.margin() << std::boolalpha << ", swap=" << options.swap() - << ')'; + << ")"; } Tensor TripletMarginWithDistanceLossImpl::forward( diff --git a/torch/csrc/api/src/nn/modules/normalization.cpp b/torch/csrc/api/src/nn/modules/normalization.cpp index 72957356a3da9..41129c8990923 100644 --- a/torch/csrc/api/src/nn/modules/normalization.cpp +++ b/torch/csrc/api/src/nn/modules/normalization.cpp @@ -40,7 +40,7 @@ void LayerNormImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::LayerNorm(" << torch::IntArrayRef(options.normalized_shape()) << ", eps=" << options.eps() - << ", elementwise_affine=" << options.elementwise_affine() << ')'; + << ", elementwise_affine=" << options.elementwise_affine() << ")"; } torch::Tensor LayerNormImpl::forward(const Tensor& input) { @@ -64,7 +64,7 @@ void LocalResponseNormImpl::reset() {} void LocalResponseNormImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::LocalResponseNorm(" << options.size() << ", alpha=" << options.alpha() << ", beta=" << options.beta() - << ", k=" << options.k() << ')'; + << ", k=" << options.k() << ")"; } // ============================================================================ @@ -74,7 +74,7 @@ void CrossMapLRN2dImpl::reset() {} void CrossMapLRN2dImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::CrossMapLRN2d(" << options.size() << ", alpha=" << options.alpha() << ", beta=" << options.beta() - << ", k=" << options.k() << ')'; + << ", k=" << options.k() << ")"; } torch::Tensor CrossMapLRN2dImpl::forward(const torch::Tensor& input) { @@ -115,7 +115,7 @@ torch::Tensor GroupNormImpl::forward(const Tensor& input) { void GroupNormImpl::pretty_print(std::ostream& stream) const { stream << std::boolalpha << "torch::nn::GroupNorm(" << options.num_groups() << ", " << options.num_channels() << ", eps=" << options.eps() - << ", affine=" << options.affine() << ')'; + << ", affine=" << options.affine() << ")"; } } // namespace torch::nn diff --git a/torch/csrc/api/src/nn/modules/padding.cpp b/torch/csrc/api/src/nn/modules/padding.cpp index 2e3212f7c94fe..d992bf696d0ca 100644 --- a/torch/csrc/api/src/nn/modules/padding.cpp +++ b/torch/csrc/api/src/nn/modules/padding.cpp @@ -21,8 +21,8 @@ Tensor ReflectionPadImpl::forward(const Tensor& input) { template void ReflectionPadImpl::pretty_print(std::ostream& stream) const { - stream << "torch::nn::ReflectionPad" << D << 'd' - << "(padding=" << options.padding() << ')'; + stream << "torch::nn::ReflectionPad" << D << "d" + << "(padding=" << options.padding() << ")"; } template class ReflectionPadImpl<1, ReflectionPad1dImpl>; @@ -46,8 +46,8 @@ Tensor ReplicationPadImpl::forward(const Tensor& input) { template void ReplicationPadImpl::pretty_print(std::ostream& stream) const { - stream << "torch::nn::ReplicationPad" << D << 'd' - << "(padding=" << options.padding() << ')'; + stream << "torch::nn::ReplicationPad" << D << "d" + << "(padding=" << options.padding() << ")"; } template class ReplicationPadImpl<1, ReplicationPad1dImpl>; @@ -70,8 +70,8 @@ Tensor ZeroPadImpl::forward(const Tensor& input) { template void ZeroPadImpl::pretty_print(std::ostream& stream) const { - stream << "torch::nn::ZeroPad" << D << 'd' << "(padding=" << options.padding() - << ')'; + stream << "torch::nn::ZeroPad" << D << "d" + << "(padding=" << options.padding() << ")"; } template class ZeroPadImpl<1, ZeroPad1dImpl>; @@ -96,9 +96,9 @@ Tensor ConstantPadImpl::forward(const Tensor& input) { template void 
ConstantPadImpl::pretty_print(std::ostream& stream) const { - stream << "torch::nn::ConstantPad" << D << 'd' + stream << "torch::nn::ConstantPad" << D << "d" << "(padding=" << options.padding() << ", value=" << options.value() - << ')'; + << ")"; } template class ConstantPadImpl<1, ConstantPad1dImpl>; diff --git a/torch/csrc/api/src/nn/modules/pixelshuffle.cpp b/torch/csrc/api/src/nn/modules/pixelshuffle.cpp index bae89d1964961..b11a99eea4e47 100644 --- a/torch/csrc/api/src/nn/modules/pixelshuffle.cpp +++ b/torch/csrc/api/src/nn/modules/pixelshuffle.cpp @@ -9,7 +9,7 @@ PixelShuffleImpl::PixelShuffleImpl(const PixelShuffleOptions& options_) void PixelShuffleImpl::pretty_print(std::ostream& stream) const { stream << "torch::nn::PixelShuffle(upscale_factor=" - << options.upscale_factor() << ')'; + << options.upscale_factor() << ")"; } void PixelShuffleImpl::reset() {} @@ -23,7 +23,7 @@ PixelUnshuffleImpl::PixelUnshuffleImpl(const PixelUnshuffleOptions& options_) void PixelUnshuffleImpl::pretty_print(std::ostream& stream) const { stream << "torch::nn::PixelUnshuffle(downscale_factor=" - << options.downscale_factor() << ')'; + << options.downscale_factor() << ")"; } void PixelUnshuffleImpl::reset() {} diff --git a/torch/csrc/api/src/nn/modules/pooling.cpp b/torch/csrc/api/src/nn/modules/pooling.cpp index 3d6aeb6dffb6c..f42cfe6b20294 100644 --- a/torch/csrc/api/src/nn/modules/pooling.cpp +++ b/torch/csrc/api/src/nn/modules/pooling.cpp @@ -15,10 +15,10 @@ void AvgPoolImpl::reset() {} template void AvgPoolImpl::pretty_print(std::ostream& stream) const { - stream << "torch::nn::AvgPool" << D << 'd' + stream << "torch::nn::AvgPool" << D << "d" << "(kernel_size=" << options.kernel_size() << ", stride=" << options.stride() << ", padding=" << options.padding() - << ')'; + << ")"; } Tensor AvgPool1dImpl::forward(const Tensor& input) { @@ -68,11 +68,11 @@ void MaxPoolImpl::reset() {} template void MaxPoolImpl::pretty_print(std::ostream& stream) const { - stream << std::boolalpha << "torch::nn::MaxPool" << D << 'd' + stream << std::boolalpha << "torch::nn::MaxPool" << D << "d" << "(kernel_size=" << options.kernel_size() << ", stride=" << options.stride() << ", padding=" << options.padding() << ", dilation=" << options.dilation() - << ", ceil_mode=" << options.ceil_mode() << ')'; + << ", ceil_mode=" << options.ceil_mode() << ")"; } Tensor MaxPool1dImpl::forward(const Tensor& input) { @@ -219,10 +219,10 @@ void MaxUnpoolImpl::reset() {} template void MaxUnpoolImpl::pretty_print(std::ostream& stream) const { - stream << std::boolalpha << "torch::nn::MaxUnpool" << D << 'd' + stream << std::boolalpha << "torch::nn::MaxUnpool" << D << "d" << "(kernel_size=" << options.kernel_size() << ", stride=" << options.stride() << ", padding=" << options.padding() - << ')'; + << ")"; } Tensor MaxUnpool1dImpl::forward( @@ -401,7 +401,7 @@ void LPPoolImpl::pretty_print(std::ostream& stream) const { << "norm_type=" << options.norm_type() << ", " << "kernel_size=" << options.kernel_size() << ", " << "stride=" << options.stride() << ", " - << "ceil_mode=" << options.ceil_mode() << ')'; + << "ceil_mode=" << options.ceil_mode() << ")"; } Tensor LPPool1dImpl::forward(const Tensor& input) { diff --git a/torch/csrc/api/src/nn/modules/rnn.cpp b/torch/csrc/api/src/nn/modules/rnn.cpp index 7ee864bc8ea94..be7c5ded2fc52 100644 --- a/torch/csrc/api/src/nn/modules/rnn.cpp +++ b/torch/csrc/api/src/nn/modules/rnn.cpp @@ -374,7 +374,7 @@ void RNNImplBase::pretty_print(std::ostream& stream) const { if (options_base.proj_size() > 0) { stream << 
", proj_size=" << options_base.proj_size(); } - stream << ')'; + stream << ")"; } template @@ -837,7 +837,7 @@ template void RNNCellImplBase::pretty_print(std::ostream& stream) const { const std::string name = this->name(); const std::string name_without_impl = name.substr(0, name.size() - 4); - stream << name_without_impl << '(' << options_base.input_size() << ", " + stream << name_without_impl << "(" << options_base.input_size() << ", " << options_base.hidden_size(); if (!options_base.bias()) { stream << ", bias=" << std::boolalpha << false; @@ -846,7 +846,7 @@ void RNNCellImplBase::pretty_print(std::ostream& stream) const { if (!nonlinearity_str.empty() && nonlinearity_str != "kTanh") { stream << ", nonlinearity=" << nonlinearity_str; } - stream << ')'; + stream << ")"; } template diff --git a/torch/csrc/api/src/nn/modules/upsampling.cpp b/torch/csrc/api/src/nn/modules/upsampling.cpp index e29f1034fa51c..420ffe5a8813d 100644 --- a/torch/csrc/api/src/nn/modules/upsampling.cpp +++ b/torch/csrc/api/src/nn/modules/upsampling.cpp @@ -18,7 +18,7 @@ void UpsampleImpl::pretty_print(std::ostream& stream) const { // NOLINTNEXTLINE(bugprone-unchecked-optional-access) stream << "size=" << at::ArrayRef(options.size().value()); } - stream << ", mode=" << enumtype::get_enum_name(options.mode()) << ')'; + stream << ", mode=" << enumtype::get_enum_name(options.mode()) << ")"; } Tensor UpsampleImpl::forward(const Tensor& input) { diff --git a/torch/csrc/autograd/saved_variable.cpp b/torch/csrc/autograd/saved_variable.cpp index 55def20af786f..0124a0212bc61 100644 --- a/torch/csrc/autograd/saved_variable.cpp +++ b/torch/csrc/autograd/saved_variable.cpp @@ -172,15 +172,15 @@ Variable SavedVariable::unpack(std::shared_ptr saved_for) const { message << "one of the variables needed for gradient computation has been " "modified by an inplace operation: [" - << data_.toString() << ' '; + << data_.toString() << " "; if (data_.is_nested()) { - message << data_._nested_tensor_size() << ']'; + message << data_._nested_tensor_size() << "]"; } else { - message << data_.sizes() << ']'; + message << data_.sizes() << "]"; } if (grad_fn) { message << ", which is output " << output_nr_ << " of " - << grad_fn->name() << ','; + << grad_fn->name() << ","; } message << " is at version " << current_version << "; expected version " << saved_version_ << " instead."; diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp index a8ae82b1b66ea..b14323a47bf35 100644 --- a/torch/csrc/cuda/Module.cpp +++ b/torch/csrc/cuda/Module.cpp @@ -1114,7 +1114,7 @@ static void registerCudaDeviceProperties(PyObject* module) { stream << "_CudaDeviceProperties(name='" << prop.name << "', major=" << prop.major << ", minor=" << prop.minor #if USE_ROCM - << ", gcnArchName='" << prop.gcnArchName << '\'' + << ", gcnArchName='" << prop.gcnArchName << "'" #endif // USE_ROCM << ", total_memory=" << prop.totalGlobalMem / (1024ull * 1024) << "MB, multi_processor_count=" << prop.multiProcessorCount diff --git a/torch/csrc/distributed/c10d/FlightRecorderDetail.hpp b/torch/csrc/distributed/c10d/FlightRecorderDetail.hpp index 28647b8c50f5a..88205c171941c 100644 --- a/torch/csrc/distributed/c10d/FlightRecorderDetail.hpp +++ b/torch/csrc/distributed/c10d/FlightRecorderDetail.hpp @@ -24,8 +24,8 @@ std::string FlightRecorder::Entry::getTraceback() { for (auto idx : c10::irange(s_tb.size())) { auto frame_id = s_tb[idx]; const auto& frame = s_tbs.all_frames.at(frame_id); - oss << '#' << idx << ' ' << frame.funcname << " from " << frame.filename - << ':' << 
frame.lineno << '\n'; + oss << "#" << idx << " " << frame.funcname << " from " << frame.filename + << ":" << frame.lineno << '\n'; } /* Resulted format is like: #0 all_reduce from pytorch/torch/distributed/distributed_c10d.py:2696 diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index 8ae3bf3b314f3..e99d9b0cf8558 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -2016,7 +2016,7 @@ void ProcessGroupNCCL::HeartbeatMonitor::runLoop() { << pg_->logPrefix() << "ProcessGroupNCCL monitor thread is disabled, but would have terminated the process" << "after attempting to dump debug info, due to " << exitReason - << '.'; + << "."; } } } diff --git a/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp b/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp index fa40ff15ec74f..624a8fc11b615 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp @@ -174,7 +174,7 @@ struct CollectiveFingerPrint { ss << "Detected mismatch between collectives on ranks. Rank " << backend->getRank() << " is running collective: " << *this << ", but Rank " << rank - << " is running collective: " << rank_fingerprint << '.'; + << " is running collective: " << rank_fingerprint << "."; auto diff_result = compute_collective_diff(rank_fingerprint); if (std::get<0>(diff_result)) { ss << std::get<1>(diff_result); diff --git a/torch/csrc/distributed/c10d/UCCTracing.cpp b/torch/csrc/distributed/c10d/UCCTracing.cpp index 78fac30d2ab9f..66d62d662c259 100644 --- a/torch/csrc/distributed/c10d/UCCTracing.cpp +++ b/torch/csrc/distributed/c10d/UCCTracing.cpp @@ -51,7 +51,7 @@ void ProcessGroupUCCLogger::flushComms(int rank, int world_size) { _outfile.open(trace_filename, std::ofstream::out | std::ofstream::trunc); // flush the traced comms if (_outfile.is_open()) { - _outfile << '[' << c10::Join(",", trace_generator->getCommsTrace()) + _outfile << "[" << c10::Join(",", trace_generator->getCommsTrace()) << "\n]"; _outfile.flush(); _outfile.close(); diff --git a/torch/csrc/distributed/c10d/UCCUtils.cpp b/torch/csrc/distributed/c10d/UCCUtils.cpp index 9e297ad339fa6..6794c4eaa594f 100644 --- a/torch/csrc/distributed/c10d/UCCUtils.cpp +++ b/torch/csrc/distributed/c10d/UCCUtils.cpp @@ -35,7 +35,7 @@ ucc_status_t oob_allgather( *req = coll_info; } catch (std::exception& ex) { LOG(ERROR) << "(oob_allgather) Caught exception in Store Operation .. " - << '[' << ex.what() << ']'; + << "[" << ex.what() << "]"; return UCC_ERR_NO_MESSAGE; } return UCC_OK; @@ -61,7 +61,7 @@ ucc_status_t oob_allgather_test(void* req) { } } catch (std::exception& ex) { LOG(ERROR) << "(oob_allgather) Caught exception in Store Operation .. " - << '[' << ex.what() << ']'; + << "[" << ex.what() << "]"; return UCC_ERR_NO_MESSAGE; } return UCC_OK; @@ -91,7 +91,7 @@ ucc_status_t oob_allgather_free(void* req) { info->getKey(kAllGatherFree + std::to_string(info->rank))); } catch (std::exception& ex) { LOG(ERROR) << "(oob_allgather) Caught exception in Store Operation .. 
" - << '[' << ex.what() << ']'; + << "[" << ex.what() << "]"; return UCC_ERR_NO_MESSAGE; } return UCC_OK; diff --git a/torch/csrc/distributed/c10d/Utils.hpp b/torch/csrc/distributed/c10d/Utils.hpp index 25193b54af9fd..fc9d735401c73 100644 --- a/torch/csrc/distributed/c10d/Utils.hpp +++ b/torch/csrc/distributed/c10d/Utils.hpp @@ -48,14 +48,14 @@ TORCH_API std::vector getTensorShapes( // Turns at::IntArrayRef into "(1, 2, 3, 4)". inline std::string toString(at::IntArrayRef l) { std::stringstream ss; - ss << '('; + ss << "("; for (const auto i : c10::irange(l.size())) { if (i > 0) { ss << ", "; } ss << l[i]; } - ss << ')'; + ss << ")"; return ss.str(); } diff --git a/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp b/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp index 8bbe857620790..2f77bb119a956 100644 --- a/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp +++ b/torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp @@ -87,17 +87,17 @@ WorkerServer::WorkerServer(const std::string& hostOrFile, int port) { "/handler/", [](const httplib::Request& req [[maybe_unused]], httplib::Response& res) { std::ostringstream body; - body << '['; + body << "["; bool first = true; for (const auto& name : getHandlerNames()) { if (!first) { - body << ','; + body << ","; } first = false; - body << '"' << jsonStrEscape(name) << '"'; + body << "\"" << jsonStrEscape(name) << "\""; } - body << ']'; + body << "]"; res.set_content(body.str(), "application/json"); }); diff --git a/torch/csrc/distributed/c10d/logger.cpp b/torch/csrc/distributed/c10d/logger.cpp index c9ef7262f8c8b..170748a60352b 100644 --- a/torch/csrc/distributed/c10d/logger.cpp +++ b/torch/csrc/distributed/c10d/logger.cpp @@ -215,10 +215,10 @@ void Logger::set_construction_data_and_log( ddp_logging_data_->ints_map["rank"]); std::stringstream ddpLoggingDataInfo; for (const auto& intItem : ddp_logging_data_->ints_map) { - ddpLoggingDataInfo << intItem.first << ": " << intItem.second << '\n'; + ddpLoggingDataInfo << intItem.first << ": " << intItem.second << "\n"; } for (const auto& strItem : ddp_logging_data_->strs_map) { - ddpLoggingDataInfo << strItem.first << ": " << strItem.second << '\n'; + ddpLoggingDataInfo << strItem.first << ": " << strItem.second << "\n"; } LOG(INFO) << initInfo << ddpLoggingDataInfo.str(); } diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp index a1c9b4a3039d5..10a2251754cde 100644 --- a/torch/csrc/distributed/c10d/reducer.cpp +++ b/torch/csrc/distributed/c10d/reducer.cpp @@ -615,8 +615,8 @@ void Reducer::delay_all_reduce() { param_name != param_names_.end(), "Expected to find parameter name from unused parameters map in debug mode."); // Add the param_name - unused_params_stream << '{' << param_name->second << ',' << unused_index - << '}'; + unused_params_stream << "{" << param_name->second << "," << unused_index + << "}"; } // Each rank prints out all the unused parameters detected diff --git a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp index e246620df31e8..efec39e9eb72c 100644 --- a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp +++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp @@ -61,7 +61,7 @@ class StoreExchange { peer_keys.reserve(world_size); for (int r = 0; r < world_size; ++r) { std::ostringstream oss; - oss << store_prefix_ << '/' << seq_id_ << '/' << r; + oss << store_prefix_ << "/" << seq_id_ << "/" << 
r; peer_keys.push_back(oss.str()); } ++seq_id_; diff --git a/torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.cpp b/torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.cpp index 44a19e96deeab..0d54c389ddee6 100644 --- a/torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.cpp +++ b/torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.cpp @@ -7,7 +7,7 @@ std::string get_detector_key( c10::DeviceType device_type, const std::string& connection_type) { std::ostringstream oss; - oss << device_type << '/' << connection_type; + oss << device_type << "/" << connection_type; return oss.str(); } diff --git a/torch/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu b/torch/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu index c099e2d72ecfd..0eda605fad6fb 100644 --- a/torch/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu +++ b/torch/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu @@ -207,7 +207,7 @@ class NCCLSymmetricMemoryAllocator : public SymmetricMemoryAllocator { auto buffer_size_map = storeExchange.all_gather(group_info.store, group_info.rank, group_info.world_size, it->second->buffer_size); - LOG(INFO) << "[rank " << group_info.rank << ']' + LOG(INFO) << "[rank " << group_info.rank << "]" << "buffer_size_map: " << buffer_size_map; // NCCL window registration api requires all ranks to have the same buffer size // we have this check to make sure all ranks have the same buffer size. diff --git a/torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu b/torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu index 510f5c4dd1b32..69e75df453f51 100644 --- a/torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu +++ b/torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu @@ -71,7 +71,7 @@ class NVSHMEMPeerAllocInfo : public c10::intrusive_ptr_target { storeExchange.all_gather(store, rank_, world_size_, global_rank); exchanged_n_times++; if (rank_ == 0) { - LOG(INFO) << "[rank " << rank_ << ']' + LOG(INFO) << "[rank " << rank_ << "]" << " rank_to_global_rank: " << group_info.rank_to_global_rank << ", group_name: " << group_name << ", exchanged_n_times: " << exchanged_n_times; diff --git a/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp b/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp index f62577e701847..0d53d100cee7d 100644 --- a/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp +++ b/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp @@ -121,7 +121,7 @@ static std::vector storeAllGather( std::vector peerKeys; for (size_t r = 0; r < worldSize; ++r) { std::ostringstream oss; - oss << prefix << '-' << r; + oss << prefix << "-" << r; peerKeys.push_back(oss.str()); } @@ -187,7 +187,7 @@ bool IntraNodeComm::rendezvous() { if (strcmp(info.hostname, peerDevInfos.front().hostname) != 0) { LOG(WARNING) << "Aborting IntraNodeComm::rendezvous because some " "participants are not on the same host (" - << info.hostname << ", " << devInfo.hostname << ')'; + << info.hostname << ", " << devInfo.hostname << ")"; return false; } rankToDeviceIdx.emplace_back(info.deviceIdx); diff --git a/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu b/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu index a7a87e4bd8627..cb5d40ef41837 100644 --- a/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu +++ b/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu @@ -57,7 +57,7 @@ bool is_nvshmem_available() { // Open the shared library, RTLD_LAZY defers symbol resolution until needed handle = dlopen("libnvshmem_host.so.3", 
RTLD_LAZY); if (!handle) { - std::cerr << dlerror() << '\n'; + std::cerr << dlerror() << "\n"; is_available = 0; } else { is_available = 1; diff --git a/torch/csrc/distributed/rpc/rpc_agent.cpp b/torch/csrc/distributed/rpc/rpc_agent.cpp index a41969ebc1293..9eee15bdc4d88 100644 --- a/torch/csrc/distributed/rpc/rpc_agent.cpp +++ b/torch/csrc/distributed/rpc/rpc_agent.cpp @@ -326,7 +326,7 @@ std::unordered_map RpcAgent::getDebugInfo() { std::ostream& operator<<(std::ostream& os, const WorkerInfo& workerInfo) { return os << "WorkerInfo(id=" << workerInfo.id_ - << ", name=" << workerInfo.name_ << ')'; + << ", name=" << workerInfo.name_ << ")"; } } // namespace torch::distributed::rpc diff --git a/torch/csrc/distributed/rpc/rref_impl.cpp b/torch/csrc/distributed/rpc/rref_impl.cpp index 59087eb3e6a4e..ecf3cbd999104 100644 --- a/torch/csrc/distributed/rpc/rref_impl.cpp +++ b/torch/csrc/distributed/rpc/rref_impl.cpp @@ -290,12 +290,12 @@ void OwnerRRef::setError(std::exception_ptr eptr) { std::ostream& operator<<(std::ostream& os, const RRef& rref) { if (rref.isOwner()) { return os << "OwnerRRef(" - << "rref_id=" << rref.rrefId() << ')'; + << "rref_id=" << rref.rrefId() << ")"; } else { return os << "UserRRef(" << "rref_id=" << rref.rrefId() << ", fork_id=" << static_cast(&rref)->forkId() - << ')'; + << ")"; } } diff --git a/torch/csrc/distributed/rpc/types.cpp b/torch/csrc/distributed/rpc/types.cpp index 1a19fa4708273..8a3a18e96a264 100644 --- a/torch/csrc/distributed/rpc/types.cpp +++ b/torch/csrc/distributed/rpc/types.cpp @@ -83,7 +83,7 @@ GloballyUniqueId GloballyUniqueId::fromIValue(const at::IValue& ivalue) { std::ostream& operator<<(std::ostream& os, GloballyUniqueId const& globalId) { return os << "GloballyUniqueId(created_on=" << globalId.createdOn_ - << ", local_id=" << globalId.localId_ << ')'; + << ", local_id=" << globalId.localId_ << ")"; } /////////////////////////// SerializedPyObj /////////////////////////// diff --git a/torch/csrc/dynamo/python_compiled_autograd.cpp b/torch/csrc/dynamo/python_compiled_autograd.cpp index c24f2cffdd762..0e70be3e9ffc4 100644 --- a/torch/csrc/dynamo/python_compiled_autograd.cpp +++ b/torch/csrc/dynamo/python_compiled_autograd.cpp @@ -434,10 +434,10 @@ struct VerboseLogger : public PythonLogger { } oss << it->key_size; if (std::next(it) != cached_keys.end()) { - oss << ','; + oss << ","; } } - oss << ']'; + oss << "]"; std::string compile_reason = oss.str(); log(PythonLogger::DEBUG, compile_reason); return compile_reason; @@ -454,7 +454,7 @@ struct VerboseLogger : public PythonLogger { } oss << "sizes[" << std::to_string(new_dyn_sizes_idx[new_dyn_sizes_idx.size() - 1]) - << ']'; + << "]"; std::string recompile_reason = oss.str(); log(PythonLogger::DEBUG, recompile_reason); return recompile_reason; diff --git a/torch/csrc/export/upgrader.cpp b/torch/csrc/export/upgrader.cpp index ec275593e6ff4..04da1ab2a2d28 100644 --- a/torch/csrc/export/upgrader.cpp +++ b/torch/csrc/export/upgrader.cpp @@ -78,7 +78,7 @@ void registerUpgrader( << " and keypath: "; for (size_t i = 0; i < keypath.size(); ++i) { if (i > 0) - error_stream << '.'; + error_stream << "."; error_stream << keypath[i]; } TORCH_CHECK(false, error_stream.str()); diff --git a/torch/csrc/inductor/aoti_eager/kernel_meta_info.cpp b/torch/csrc/inductor/aoti_eager/kernel_meta_info.cpp index 25cd32b6b52fe..1642ee4beca01 100644 --- a/torch/csrc/inductor/aoti_eager/kernel_meta_info.cpp +++ b/torch/csrc/inductor/aoti_eager/kernel_meta_info.cpp @@ -100,12 +100,12 @@ std::ostream& operator<<( stream << 
"device_: " << tensor_metadata.device_ << '\n'; stream << "sizes_: "; for (const auto& size : tensor_metadata.sizes_) { - stream << size << ' '; + stream << size << " "; } stream << '\n'; stream << "strides_: "; for (const auto& stride : tensor_metadata.strides_) { - stream << stride << ' '; + stream << stride << " "; } stream << "requires_grad_: " << tensor_metadata.requires_grad_ << '\n'; diff --git a/torch/csrc/inductor/aoti_package/model_package_loader.cpp b/torch/csrc/inductor/aoti_package/model_package_loader.cpp index ba5e1865ec14d..188f92557761d 100644 --- a/torch/csrc/inductor/aoti_package/model_package_loader.cpp +++ b/torch/csrc/inductor/aoti_package/model_package_loader.cpp @@ -593,7 +593,7 @@ AOTIModelPackageLoader::AOTIModelPackageLoader( } else { LOG(WARNING) << "You are using an outdated version of the pt2 archive which do not have a prefix in front of each filename. Example: \n" - << found_filenames[0] << '\n' + << found_filenames[0] << "\n" << found_filenames[1]; } diff --git a/torch/csrc/inductor/aoti_runtime/model_base.h b/torch/csrc/inductor/aoti_runtime/model_base.h index bf8f07edb1458..19f1dca1b7e27 100644 --- a/torch/csrc/inductor/aoti_runtime/model_base.h +++ b/torch/csrc/inductor/aoti_runtime/model_base.h @@ -468,7 +468,7 @@ class AOTInductorModelBase { auto code = cudaEventDestroy(*run_finished_); if (code != cudaSuccess) { std::cerr << "Failed to destroy CUDA event in AOTInductor model: " - << cudaGetErrorString(code) << '\n'; + << cudaGetErrorString(code) << "\n"; } } #endif // USE_CUDA diff --git a/torch/csrc/inductor/aoti_torch/shim_common.cpp b/torch/csrc/inductor/aoti_torch/shim_common.cpp index d6db06af5f2cc..2df922109975a 100644 --- a/torch/csrc/inductor/aoti_torch/shim_common.cpp +++ b/torch/csrc/inductor/aoti_torch/shim_common.cpp @@ -1261,7 +1261,7 @@ void aoti_torch_print_tensor_handle(AtenTensorHandle self, const char* msg) { at::Tensor* t = tensor_handle_to_tensor_pointer(self); // Display message - std::cout << '['; + std::cout << "["; if (msg) { std::cout << " " << msg; } @@ -1270,7 +1270,7 @@ void aoti_torch_print_tensor_handle(AtenTensorHandle self, const char* msg) { // Print exact tensor values for small size tensors const int64_t numel = t->numel(); if (numel <= AOTI_TORCH_MAX_NUMEL_TO_PRINT) { - std::cout << *t << '\n'; + std::cout << *t << "\n"; } // Print summary stats of the tensor @@ -1316,7 +1316,7 @@ void aoti_torch_print_tensor_handle(AtenTensorHandle self, const char* msg) { std::cout << "[INFO] Aten built-in function `min_all_cuda/max_all_cuda` not implemented for current dtype: " << t->dtype() << ". 
Printing out the whole value:\n" - << *t << '\n'; + << *t << "\n"; } } } diff --git a/torch/csrc/jit/api/module.cpp b/torch/csrc/jit/api/module.cpp index 61c32680c7c0b..53be7504fe2c3 100644 --- a/torch/csrc/jit/api/module.cpp +++ b/torch/csrc/jit/api/module.cpp @@ -615,7 +615,7 @@ std::string Module::dump_to_str( print_method_bodies, print_attr_values, print_param_values)); } ss << " }" << '\n'; - ss << '}' << '\n'; + ss << "}" << '\n'; return ss.str(); } diff --git a/torch/csrc/jit/api/module.h b/torch/csrc/jit/api/module.h index 739eaf478f1e2..c9b7793c89b6f 100644 --- a/torch/csrc/jit/api/module.h +++ b/torch/csrc/jit/api/module.h @@ -652,7 +652,7 @@ struct NamedPolicy { std::ostringstream ss; for (const auto i : c10::irange(cursors.size())) { if (i > 0) { - ss << '.'; + ss << "."; } ss << nameFragment(cursors[i]); } diff --git a/torch/csrc/jit/backends/backend_detail.cpp b/torch/csrc/jit/backends/backend_detail.cpp index 2edf832e04262..de352f50ab503 100644 --- a/torch/csrc/jit/backends/backend_detail.cpp +++ b/torch/csrc/jit/backends/backend_detail.cpp @@ -305,8 +305,8 @@ Module codegen_backend_module( TORCH_INTERNAL_ASSERT(default_value.has_value()); std::stringstream def_ss, fwd_ss; // Annotate type of the arg - def_ss << name << ": " << arg.type()->annotation_str(nullptr) << '='; - fwd_ss << name << '=' << name; + def_ss << name << ": " << arg.type()->annotation_str(nullptr) << "="; + fwd_ss << name << "=" << name; default_value->repr( def_ss, [](std::ostream&, const IValue&) -> bool { return false; }); def_inputs.emplace_back(def_ss.str()); @@ -337,18 +337,18 @@ Module codegen_backend_module( if (out_tuple_ty) { auto tuple_elements = out_tuple_ty->elements(); - type_check_ss << tuple_elements[0]->annotation_str() << ')'; + type_check_ss << tuple_elements[0]->annotation_str() << ")"; type_checks.emplace_back(type_check_ss.str()); for (unsigned i = 1, e = tuple_elements.size(); i < e; ++i) { type_check_ss.str(std::string()); type_check_ss.clear(); out_ss << ", _" << i; type_check_ss << "assert isinstance(_" << i << ", " - << tuple_elements[i]->annotation_str() << ')'; + << tuple_elements[i]->annotation_str() << ")"; type_checks.emplace_back(type_check_ss.str()); } } else { - type_check_ss << out_ty->annotation_str() << ')'; + type_check_ss << out_ty->annotation_str() << ")"; type_checks.emplace_back(type_check_ss.str()); } @@ -364,7 +364,7 @@ Module codegen_backend_module( // If the output type is a single element tuple then add an extra comma // to ensure the final output maintains this type. 
if (out_tuple_ty && out_tuple_ty->elements().size() == 1) { - out_ss << ','; + out_ss << ","; } method_te.s("ret", out_ss.str()); diff --git a/torch/csrc/jit/codegen/fuser/tensor_desc.h b/torch/csrc/jit/codegen/fuser/tensor_desc.h index 55cd4008e1814..0c5db65d54ad1 100644 --- a/torch/csrc/jit/codegen/fuser/tensor_desc.h +++ b/torch/csrc/jit/codegen/fuser/tensor_desc.h @@ -88,10 +88,10 @@ struct TORCH_API TensorDesc { }; inline std::ostream& operator<<(std::ostream& out, const TensorDesc& d) { - out << d.scalar_type << '['; + out << d.scalar_type << "["; for (const auto b : d.contiguity) - out << b << ';'; - out << ']'; + out << b << ";"; + out << "]"; return out; } diff --git a/torch/csrc/jit/frontend/concrete_module_type.cpp b/torch/csrc/jit/frontend/concrete_module_type.cpp index 1cb5fb225dc92..91d41607f9df9 100644 --- a/torch/csrc/jit/frontend/concrete_module_type.cpp +++ b/torch/csrc/jit/frontend/concrete_module_type.cpp @@ -305,37 +305,39 @@ void ConcreteModuleTypeBuilder::addIgnoredAttribute(std::string name) { void ConcreteModuleType::dump() const { std::cout << "ConcreteModuleType for: " - << py::getattr(data_.pyClass_, "__name__") << '\n'; + << py::getattr(data_.pyClass_, "__name__") << "\n"; std::cout << "Constants: \n"; for (const auto& pr : data_.constants_) { - std::cout << '\t' << pr.first << ": " << pr.second << '\n'; + std::cout << "\t" << pr.first << ": " << pr.second << "\n"; } std::cout << "\nAttributes: \n"; for (const auto& pr : data_.attributes_) { - std::cout << '\t' << pr.key() << ": " << pr.value().type_->annotation_str() - << '\n'; + std::cout << "\t" << pr.key() << ": " << pr.value().type_->annotation_str() + << "\n"; } std::cout << "\nSubmodules: \n"; for (const auto& info : data_.modules_) { - std::cout << '\t' << info.name_ << ": " - << info.meta_->getJitType()->annotation_str() << '\n'; + std::cout << "\t" << info.name_ << ": " + << info.meta_->getJitType()->annotation_str() << "\n"; } std::cout << "\nForward Pre-Hooks: \n"; for (const auto& pre_hook_id : data_.forwardPreHooks_) { - std::cout << '\t' << "pre_hook id: " << pre_hook_id << '\n'; + std::cout << "\t" + << "pre_hook id: " << pre_hook_id << "\n"; } std::cout << "\nForward Hooks: \n"; for (const auto& hook_id : data_.forwardHooks_) { - std::cout << '\t' << "hook id: " << hook_id << '\n'; + std::cout << "\t" + << "hook id: " << hook_id << "\n"; } std::cout << "\nOverloads: \n"; for (const auto& pr : data_.overloads_) { - std::cout << '\t' << pr.first << ": " << pr.second << '\n'; + std::cout << "\t" << pr.first << ": " << pr.second << "\n"; } std::string isPoisoned = data_.isPoisoned_ ? 
"true" : "false"; - std::cout << "isPoisoned: " << isPoisoned << '\n'; + std::cout << "isPoisoned: " << isPoisoned << "\n"; if (jitType_) { - std::cout << "jit type: " << jitType_->annotation_str() << '\n'; + std::cout << "jit type: " << jitType_->annotation_str() << "\n"; } } diff --git a/torch/csrc/jit/frontend/error_report.cpp b/torch/csrc/jit/frontend/error_report.cpp index 47a9343c5387f..d5a8408e971c0 100644 --- a/torch/csrc/jit/frontend/error_report.cpp +++ b/torch/csrc/jit/frontend/error_report.cpp @@ -99,7 +99,7 @@ std::string ErrorReport::current_call_stack() { const char* ErrorReport::what() const noexcept { std::stringstream msg; - msg << '\n' << ss.str(); + msg << "\n" << ss.str(); msg << ":\n"; context.highlight(msg); diff --git a/torch/csrc/jit/frontend/ir_emitter.cpp b/torch/csrc/jit/frontend/ir_emitter.cpp index fba613b5ea8f7..e7949b0ac4bee 100644 --- a/torch/csrc/jit/frontend/ir_emitter.cpp +++ b/torch/csrc/jit/frontend/ir_emitter.cpp @@ -421,7 +421,7 @@ struct Environment { "of another type (torch.jit.annotate(List[T, []]) where T " "is the type of elements in the list for Python 2)"; } - error << '\n' << why_not.str(); + error << "\n" << why_not.str(); throw ErrorReport(error); } } @@ -842,7 +842,7 @@ struct to_ir { throw( ErrorReport(def.decl().params().range()) << "Number of type annotations for" - << " function parameters (" << schema.arguments().size() << ')' + << " function parameters (" << schema.arguments().size() << ")" << " does not match the number of parameters on the function (" << expected_annotation_size << ")!"); } @@ -3452,7 +3452,7 @@ struct to_ir { throw( ErrorReport(apply.inputs()) << "expected an expression of type " << type->repr_str() - << " but found " << expr->type()->repr_str() << '\n' + << " but found " << expr->type()->repr_str() << "\n" << why_not.str()); } @@ -3828,13 +3828,13 @@ struct to_ir { if (!is_key_subtype) { err << "Generated key type " << key_type->repr_str() << " did not match the annotated key type, which was " - << annotated_k_type->repr_str() << '\n'; + << annotated_k_type->repr_str() << "\n"; } if (!is_value_subtype) { err << "Generated value type " << value_type->repr_str() << " did not match the annotated value type, which was " - << annotated_v_type->repr_str() << '\n' + << annotated_v_type->repr_str() << "\n" << ss.str(); } diff --git a/torch/csrc/jit/frontend/parser.cpp b/torch/csrc/jit/frontend/parser.cpp index f56a392cc9327..ef49c15bab24c 100644 --- a/torch/csrc/jit/frontend/parser.cpp +++ b/torch/csrc/jit/frontend/parser.cpp @@ -23,7 +23,7 @@ Decl mergeTypesFromTypeComment( << type_annotation_decl.params().size() << ") did not match the number of " << (is_method ? 
"method" : "function") << " parameters (" - << expected_num_annotations << ')'; + << expected_num_annotations << ")"; } auto old = decl.params(); auto _new = type_annotation_decl.params(); diff --git a/torch/csrc/jit/frontend/schema_matching.cpp b/torch/csrc/jit/frontend/schema_matching.cpp index d866e4f434448..f191c7daf6e26 100644 --- a/torch/csrc/jit/frontend/schema_matching.cpp +++ b/torch/csrc/jit/frontend/schema_matching.cpp @@ -364,7 +364,7 @@ static std::optional tryMatchSchema( } auto err = [&]() -> std::ostream& { - *failure_messages << '\n' << schema << ":\n"; + *failure_messages << "\n" << schema << ":\n"; return *failure_messages; }; @@ -751,7 +751,7 @@ Value* emitBuiltinCall( } else { error << "Here are some suggestions: \n"; for (const auto& sym : close_symbols) { - error << '\t' << sym.toQualString() << '\n'; + error << "\t" << sym.toQualString() << "\n"; } error << "\nThe original call is"; } diff --git a/torch/csrc/jit/frontend/source_range.cpp b/torch/csrc/jit/frontend/source_range.cpp index b9263ad08978f..89815d386ac05 100644 --- a/torch/csrc/jit/frontend/source_range.cpp +++ b/torch/csrc/jit/frontend/source_range.cpp @@ -310,7 +310,7 @@ void SourceRange::print_with_context( if (!funcname.empty()) { out << ", in " << funcname; } - out << '\n'; + out << "\n"; } // print out initial context out << str.substr(begin_context, start() - begin_context); @@ -327,7 +327,7 @@ void SourceRange::print_with_context( auto actual_line = str.substr(line_start, (line_end - line_start) + 1); out << actual_line; if (actual_line.back() != '\n') { - out << '\n'; + out << "\n"; } size_t empty_space = 0; @@ -377,7 +377,7 @@ void SourceRange::print_with_context( auto line_substr = str.substr(line_end, end_context - line_end); out << line_substr; if (!line_substr.empty() && line_substr.back() != '\n') { - out << '\n'; + out << "\n"; } } } diff --git a/torch/csrc/jit/frontend/tree.h b/torch/csrc/jit/frontend/tree.h index a11f196c5ac0c..12e75ec41c69d 100644 --- a/torch/csrc/jit/frontend/tree.h +++ b/torch/csrc/jit/frontend/tree.h @@ -93,9 +93,9 @@ struct Tree : c10::intrusive_ptr_target { if (trees().size() < expected_subtrees || (!allow_more && trees().size() != expected_subtrees)) { std::stringstream ss; - ss << filename << ':' << lineno << ": expected at least " + ss << filename << ":" << lineno << ": expected at least " << expected_subtrees << " subtrees, but found only " << trees().size() - << '\n'; + << "\n"; range().highlight(ss); TORCH_CHECK(false, ss.str()); } @@ -184,11 +184,11 @@ struct pretty_tree { out << t->stringValue(); break; default: - out << '(' << kindToString(t->kind()); + out << "(" << kindToString(t->kind()); for (const auto& e : t->trees()) { - out << ' ' << get_flat(e); + out << " " << get_flat(e); } - out << ')'; + out << ")"; break; } auto it_ = flat_strings.emplace(t, out.str()); @@ -201,12 +201,12 @@ struct pretty_tree { return; } std::string k = kindToString(t->kind()); - out << '(' << k; + out << "(" << k; for (const auto& e : t->trees()) { - out << '\n' << std::string(indent + 2, ' '); + out << "\n" << std::string(indent + 2, ' '); print(out, e, indent + 2); } - out << ')'; + out << ")"; } }; diff --git a/torch/csrc/jit/ir/alias_analysis.cpp b/torch/csrc/jit/ir/alias_analysis.cpp index ac99385401be4..16edf669da9be 100644 --- a/torch/csrc/jit/ir/alias_analysis.cpp +++ b/torch/csrc/jit/ir/alias_analysis.cpp @@ -419,14 +419,14 @@ std::string AliasDb::getElementName(const Element* e) const { } else { std::ostringstream ss; if (e->values.size() == 1) { - ss << '%' << 
(*e->values.begin())->debugName(); + ss << "%" << (*e->values.begin())->debugName(); return ss.str(); } - ss << '('; + ss << "("; for (const Value* v : e->values) { - ss << '%' << v->debugName() << ", "; + ss << "%" << v->debugName() << ", "; } - ss << ')'; + ss << ")"; return ss.str(); } } @@ -454,7 +454,7 @@ std::string AliasDb::toString() const { ++ct; ss << getElementName(memoryDAG_->fromIndex(pointedTo)); } - ss << '\n'; + ss << "\n"; } ct = 0; if (!element->containedElements.empty()) { @@ -466,7 +466,7 @@ std::string AliasDb::toString() const { } ++ct; } - ss << '\n'; + ss << "\n"; } } @@ -479,9 +479,9 @@ std::string AliasDb::toString() const { for (const auto value : values) { ss << getElementName(memoryDAG_->fromIndex(value)) << ", "; } - ss << '\n'; + ss << "\n"; } - ss << '\n'; + ss << "\n"; return ss.str(); } @@ -511,7 +511,7 @@ std::string AliasDb::toGraphviz() const { } else { std::ostringstream ss; if (e->values.size() == 1) { - ss << "\"\\%" << (*e->values.begin())->debugName() << '"'; + ss << "\"\\%" << (*e->values.begin())->debugName() << "\""; return ss.str(); } ss << "\"("; @@ -538,7 +538,7 @@ std::string AliasDb::toGraphviz() const { if (!element->pointsTo.empty()) { for (const auto pointedTo : element->pointsTo) { dot << " " << name(element) << " -> " - << name(memoryDAG_->fromIndex(pointedTo)) << '\n'; + << name(memoryDAG_->fromIndex(pointedTo)) << "\n"; } } if (!element->containedElements.empty()) { diff --git a/torch/csrc/jit/ir/ir.cpp b/torch/csrc/jit/ir/ir.cpp index 08bfe47382952..4368b3c8191d8 100644 --- a/torch/csrc/jit/ir/ir.cpp +++ b/torch/csrc/jit/ir/ir.cpp @@ -64,7 +64,7 @@ constexpr topo_position_t kMidPoint = 0; constexpr topo_position_t kAppendInterval = 1099511627776ULL /* 2^40 */; void printValueRef(std::ostream& out, const Value* n) { - out << '%' << n->debugName(); + out << "%" << n->debugName(); } bool isNumber(std::string_view str) { @@ -160,7 +160,7 @@ static void printAttribute(std::ostream& out, const at::Tensor& tensor) { // 1-elem tensors are usually boxed scalars, so print them like it if (tensor.numel() == 1) { auto scalar_tensor = tensor.view(std::vector{}).item(); - out << '{'; + out << "{"; if (scalar_tensor.isFloatingPoint()) { out << scalar_tensor.toDouble(); } else if (scalar_tensor.isComplex()) { @@ -168,7 +168,7 @@ static void printAttribute(std::ostream& out, const at::Tensor& tensor) { } else { out << scalar_tensor.toLong(); } - out << '}'; + out << "}"; } else if (tensor.numel() <= max_tensor_display_size) { // TODO: This is awful code. Also it doesn't work on Windows. 
std::ostringstream tensor_ss; @@ -191,7 +191,7 @@ static void printAttribute(std::ostream& out, const IValue& ival) { ss << "[]"; return true; } else if (input.isObject() && !input.type()->is_module()) { - ss << "object(" << &input.toObjectRef() << ')'; + ss << "object(" << &input.toObjectRef() << ")"; return true; } return false; @@ -202,14 +202,14 @@ static void printAttribute(std::ostream& out, const IValue& ival) { static void printTypeList( std::ostream& out, const std::vector<TypePtr>& items) { - out << '['; + out << "["; int i = 0; for (auto& item : items) { if (i++ > 0) out << ", "; out << *item; } - out << ']'; + out << "]"; } void Node::printAttrValue(std::ostream& out, const Symbol& name) const { @@ -265,7 +265,7 @@ void Node::printAttrValue(std::ostream& out, const Symbol& name) const { void Node::printAttributes(std::ostream& out, bool ignore_subgraph = false) const { - out << '['; + out << "["; auto names = attributeNames(); int i = 0; for (auto name : names) { @@ -279,11 +279,11 @@ void Node::printAttributes(std::ostream& out, bool ignore_subgraph = false) // don't want to print the qualifier since it should always // be attribute, but you might be able to track down a weird // bug by printing it out. - out << name.toUnqualString() << '='; + out << name.toUnqualString() << "="; printAttrValue(out, name); } - out << ']'; + out << "]"; } SourceRange Node::sourceRange() const { @@ -313,11 +313,11 @@ std::ostream& Node::print( out << " = "; if (kind() == prim::PythonOp) { auto* pyOp = static_cast<const PythonOp*>(this); - out << '^' << pyOp->name(); + out << "^" << pyOp->name(); printAttributes(out, /*ignore_subgraph=*/false); pyOp->writeScalars(out); } else if (hasAttribute(attr::Subgraph) && groups) { - out << kind().toQualString() << '_' << groups->size(); + out << kind().toQualString() << "_" << groups->size(); if (print_attributes && numAttributes() > 1 && kind() != prim::DifferentiableGraph) { printAttributes(out, /*ignore_subgraph=*/true); @@ -330,7 +330,7 @@ std::ostream& Node::print( printAttributes(out); } } - out << '(' << inputs() << ')'; + out << "(" << inputs() << ")"; if (print_scopes) { std::string scName = scopeName(); @@ -350,7 +350,7 @@ std::ostream& Node::print( } if (auto file_line_col = r.file_line_col()) { auto [filename, line, col] = *file_line_col; - out << " # " << filename << ':' << line << ':' << col; + out << " # " << filename << ":" << line << ":" << col; } } @@ -358,11 +358,11 @@ std::ostream& Node::print( return out; } - out << '\n'; + out << "\n"; for (const auto i : c10::irange(blocks().size())) { auto b = blocks()[i]; - indent(out, level + 1) << "block" << i << '(' + indent(out, level + 1) << "block" << i << "(" << const_value_list_with_types(b->inputs()) << "):\n"; for (auto nested : b->nodes()) { @@ -389,7 +389,7 @@ std::ostream& Graph::print(std::ostream& out, bool print_source_locations) out << " return (" << outputs() << ")\n"; size_t i = 0; for (auto fg : groups) { - out << "with " << fg->kind().toQualString() << '_' << i++ << " = " + out << "with " << fg->kind().toQualString() << "_" << i++ << " = " << *fg->g(attr::Subgraph); } out.flush(); @@ -397,7 +397,7 @@ std::ostream& Graph::print(std::ostream& out, bool print_source_locations) /* // Uncomment this to debug all_nodes issues { - out << '\n'; + out << "\n"; out << "all_nodes:\n"; for (auto& n : all_nodes) { printNode(out, const_cast<Node*>(n), nullptr); @@ -654,7 +654,7 @@ void Graph::lint() const { } void Graph::dump() const { - std::cout << *this << '\n'; + std::cout << *this << "\n"; } void
Graph::push_scope(const std::string& scope_name) { @@ -888,7 +888,7 @@ Value* Value::setDebugName(const std::string& name) { static std::locale c_locale("C"); ss.imbue(c_locale); #endif - ss << name_base << '.' << suffix++; + ss << name_base << "." << suffix++; replacement_name = ss.str(); } while (names.count(replacement_name) > 0); @@ -1069,7 +1069,7 @@ bool Node::mustBeNone() const { } void Node::dump() const { - std::cout << *this << '\n'; + std::cout << *this << "\n"; } const FunctionSchema& Node::schema() const { @@ -1106,7 +1106,7 @@ const Operator& Node::getOperator() const { auto er = ErrorReport(sourceRange()); er << "Schema not found for node. File a bug report.\n"; - er << "Node: " << *this << '\n'; + er << "Node: " << *this << "\n"; er << "Input types:"; for (const auto i : c10::irange(inputs().size())) { if (i > 0) @@ -1117,13 +1117,13 @@ const Operator& Node::getOperator() const { if (!candidates.empty()) { er << "\ncandidates were:\n"; for (auto& candidate : candidates) { - er << " " << candidate->schema() << '\n'; + er << " " << candidate->schema() << "\n"; } } else { er << "\nno candidates found\n"; } er << "within the graph:\n"; - er << *owningGraph() << '\n'; + er << *owningGraph() << "\n"; throw er; } diff --git a/torch/csrc/jit/jit_log.cpp b/torch/csrc/jit/jit_log.cpp index f2b237418627e..8adf4c8aab10c 100644 --- a/torch/csrc/jit/jit_log.cpp +++ b/torch/csrc/jit/jit_log.cpp @@ -154,9 +154,9 @@ std::string jit_log_prefix( int l, const std::string& in_str) { std::stringstream prefix_ss; - prefix_ss << '['; - prefix_ss << level << ' '; - prefix_ss << c10::filesystem::path(fn).filename() << ':'; + prefix_ss << "["; + prefix_ss << level << " "; + prefix_ss << c10::filesystem::path(fn).filename() << ":"; prefix_ss << std::setfill('0') << std::setw(3) << l; prefix_ss << "] "; diff --git a/torch/csrc/jit/mobile/debug_info.cpp b/torch/csrc/jit/mobile/debug_info.cpp index be61d1d2ec57b..0a410a42fef04 100644 --- a/torch/csrc/jit/mobile/debug_info.cpp +++ b/torch/csrc/jit/mobile/debug_info.cpp @@ -103,7 +103,7 @@ std::pair getStackTraceWithModuleHierarchy( std::get(last_entry); module_info.append(".").append(node_name); std::ostringstream ss; - ss << "Module hierarchy:" << module_info << '\n'; + ss << "Module hierarchy:" << module_info << "\n"; format_stack_trace(ss, stack_entries); return {ss.str(), std::move(module_info)}; } diff --git a/torch/csrc/jit/mobile/import_data.cpp b/torch/csrc/jit/mobile/import_data.cpp index 7071a08daf6f4..1bd34e4a823ae 100644 --- a/torch/csrc/jit/mobile/import_data.cpp +++ b/torch/csrc/jit/mobile/import_data.cpp @@ -138,7 +138,7 @@ c10::IValue IValueUnpickler::readArchive( auto read_record = [&](const std::string& name) { std::stringstream ss; - ss << archive_name << '/' << name; + ss << archive_name << "/" << name; return std::get<0>(reader_->getRecord(ss.str())); }; diff --git a/torch/csrc/jit/mobile/interpreter.cpp b/torch/csrc/jit/mobile/interpreter.cpp index 41fc8d49efb16..b5e67cd83cbb2 100644 --- a/torch/csrc/jit/mobile/interpreter.cpp +++ b/torch/csrc/jit/mobile/interpreter.cpp @@ -95,11 +95,11 @@ bool InterpreterState::run(Stack& stack) { debug_handle = *handle; } - // std::cout << "RUNNING " << pc << ' ' << code.instructions_[pc]; + // std::cout << "RUNNING " << pc << " " << code.instructions_[pc]; // if (inst.op == OP) { // std::cout << ", " << code.op_names_[inst.X].name; // if (!code.op_names_[inst.X].overload_name.empty()) { - // std::cout << '.' << code.op_names_[inst.X].overload_name; + // std::cout << "." 
<< code.op_names_[inst.X].overload_name; // } // } // std::cout << std::endl; diff --git a/torch/csrc/jit/mobile/model_tracer/tracer.cpp b/torch/csrc/jit/mobile/model_tracer/tracer.cpp index c6a94dc8a1fb8..b821e7dfcdcd7 100644 --- a/torch/csrc/jit/mobile/model_tracer/tracer.cpp +++ b/torch/csrc/jit/mobile/model_tracer/tracer.cpp @@ -60,7 +60,7 @@ static void printOpYAML( bool is_used_for_training, bool is_root_operator, bool include_all_overloads) { - out << std::string(indent, ' ') << op_name << ':' << '\n'; + out << std::string(indent, ' ') << op_name << ":" << '\n'; out << std::string(indent + 2, ' ') << "is_used_for_training: " << (is_used_for_training ? "true" : "false") << '\n'; @@ -88,7 +88,7 @@ static void printDTypeYAML( const std::string& kernel_tag_name, const std::set& dtypes) { std::string indent_str = std::string(indent, ' '); - out << indent_str << kernel_tag_name << ':' << '\n'; + out << indent_str << kernel_tag_name << ":" << '\n'; for (auto& dtype : dtypes) { out << indent_str << "- " << dtype << '\n'; } diff --git a/torch/csrc/jit/passes/check_strict_fusion.cpp b/torch/csrc/jit/passes/check_strict_fusion.cpp index 731382c316398..41f60fa359132 100644 --- a/torch/csrc/jit/passes/check_strict_fusion.cpp +++ b/torch/csrc/jit/passes/check_strict_fusion.cpp @@ -73,7 +73,7 @@ static void checkForUnfusedOps(Node* enter_node) { std::stringstream ss; ss << "Found multiple fusions: \n"; for (Node* n : guarding_ifs) { - ss << *n << '\n'; + ss << *n << "\n"; } throw(ErrorReport(enter_node->input()->node()->sourceRange()) << ss.str()); } @@ -100,13 +100,13 @@ static void checkForUnfusedOps(Node* enter_node) { std::stringstream ss; ss << "Found unfused operators: \n"; for (Node* unfused : unfused_nodes_not_used_in_guard) { - ss << '\t'; + ss << "\t"; if (unfused->maybeSchema()) { ss << unfused->schema(); } else { unfused->kind().toDisplayString(); } - ss << '\n'; + ss << "\n"; } throw(ErrorReport(enter_node->input()->node()->sourceRange()) << ss.str()); } diff --git a/torch/csrc/jit/passes/liveness.cpp b/torch/csrc/jit/passes/liveness.cpp index 138c6fc78f752..c4a80872d61b4 100644 --- a/torch/csrc/jit/passes/liveness.cpp +++ b/torch/csrc/jit/passes/liveness.cpp @@ -72,7 +72,7 @@ struct LivenessAnalyzer { std::cout << e.first->outputs()[0]->debugName(); } - std::cout << ' ' << e.first->kind().toQualString(); + std::cout << " " << e.first->kind().toQualString(); std::cout << " = "; dump(e.second); std::cout << '\n'; @@ -83,16 +83,16 @@ struct LivenessAnalyzer { void dump(const std::vector& set) { bool first = true; - std::cout << '['; + std::cout << "["; for (auto el : set) { if (first) { first = false; } else { std::cout << ", "; } - std::cout << el->debugName() << '(' << el->unique() << ')'; + std::cout << el->debugName() << "(" << el->unique() << ")"; } - std::cout << ']'; + std::cout << "]"; } private: diff --git a/torch/csrc/jit/passes/onnx.cpp b/torch/csrc/jit/passes/onnx.cpp index d3231222cb935..cddae77768228 100644 --- a/torch/csrc/jit/passes/onnx.cpp +++ b/torch/csrc/jit/passes/onnx.cpp @@ -292,7 +292,7 @@ void NodeToONNX( std::ostringstream ss; ss << "symbolic for " << op_name << " produced an incorrect number of outputs (expected "; - ss << num_old_outputs << ", but got " << outputs.size() << ')'; + ss << num_old_outputs << ", but got " << outputs.size() << ")"; throw std::runtime_error(ss.str()); } // For const node, it does not need params_dict info, so set it to {}. 
diff --git a/torch/csrc/jit/passes/onnx/constant_map.cpp b/torch/csrc/jit/passes/onnx/constant_map.cpp index 902dc5f8924cd..e4ec14a5a0175 100644 --- a/torch/csrc/jit/passes/onnx/constant_map.cpp +++ b/torch/csrc/jit/passes/onnx/constant_map.cpp @@ -301,7 +301,7 @@ void ConstantValueMap::PrintMaps() { } } } - ss << " (rank = " << x.second << ')'; + ss << " (rank = " << x.second << ")"; std::cout << "node " << x.first << ": " << ss.str() << '\n'; } std::cout << '\n'; @@ -346,9 +346,9 @@ void ConstantValueMap::PrintMaps() { std::cout << "(node " << x.first << ": "; for (const auto& dim : x.second.dim()) { if (dim.has_dim_param()) { - std::cout << dim.dim_param() << ' '; + std::cout << dim.dim_param() << " "; } else { - std::cout << dim.dim_value() << ' '; + std::cout << dim.dim_value() << " "; } } std::cout << "), "; @@ -361,7 +361,7 @@ void ConstantValueMap::PrintMaps() { std::cout << "SymbolDim Map:" << '\n'; count = 0; for (const auto& x : ConstantValueMap::getInstance().symbolDimMap) { - std::cout << '(' << x.first << ": " << x.second << "), "; + std::cout << "(" << x.first << ": " << x.second << "), "; count++; if (count % 10 == 0) { std::cout << '\n'; @@ -370,7 +370,7 @@ void ConstantValueMap::PrintMaps() { std::cout << "DimSymbol Map:" << '\n'; count = 0; for (const auto& x : ConstantValueMap::getInstance().dimSymbolMap) { - std::cout << '(' << x.first << ": " << x.second << "), "; + std::cout << "(" << x.first << ": " << x.second << "), "; count++; if (count % 10 == 0) { std::cout << '\n'; diff --git a/torch/csrc/jit/passes/onnx/function_extraction.cpp b/torch/csrc/jit/passes/onnx/function_extraction.cpp index fab3110954fde..7901b44bb85f5 100644 --- a/torch/csrc/jit/passes/onnx/function_extraction.cpp +++ b/torch/csrc/jit/passes/onnx/function_extraction.cpp @@ -250,7 +250,7 @@ void FunctionExtractor::DebugPrintScopeContexts( GRAPH_UPDATE("Children scopes: ", [&]() { std::stringstream ss; for (const auto& child_scope : it.second->children_) { - ss << child_scope->name().toDisplayString() << ' '; + ss << child_scope->name().toDisplayString() << " "; } return ss.str(); }()); diff --git a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp index 48f13499a5fc0..a188eb0abd6b8 100644 --- a/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp +++ b/torch/csrc/jit/passes/onnx/remove_inplace_ops_for_onnx.cpp @@ -440,7 +440,7 @@ std::string InplaceConverter::ValueTracker::toString() const { ss << "Value[" << idx << "]: " << it.first->debugName() << '\n'; ss << " Mapping to "; for (auto v : it.second) { - ss << v->debugName() << ' '; + ss << v->debugName() << " "; } ss << '\n'; idx++; diff --git a/torch/csrc/jit/passes/symbolic_shape_analysis.cpp b/torch/csrc/jit/passes/symbolic_shape_analysis.cpp index 999f8247b7c84..153408b350bf2 100644 --- a/torch/csrc/jit/passes/symbolic_shape_analysis.cpp +++ b/torch/csrc/jit/passes/symbolic_shape_analysis.cpp @@ -152,11 +152,11 @@ static std::ostream& operator<<(std::ostream& os, const ShapeArguments& sa) { return os; } - os << '('; + os << "("; for (const auto i : c10::irange(sa.len())) { os << sa.at(i); } - os << ')'; + os << ")"; return os; } diff --git a/torch/csrc/jit/passes/utils/subgraph_utils.cpp b/torch/csrc/jit/passes/utils/subgraph_utils.cpp index f54adbd7223a2..f9fd65f9ce541 100644 --- a/torch/csrc/jit/passes/utils/subgraph_utils.cpp +++ b/torch/csrc/jit/passes/utils/subgraph_utils.cpp @@ -612,7 +612,7 @@ static std::string truncateStrWithHash(const std::string& s, size_t 
maxlen) { (maxlen > hash_str.size() + 1) ? (maxlen - hash_str.size() - 1) : maxlen; std::stringstream truncated; truncated << s.substr(0, trunc_len); - truncated << '_' << hash_str; + truncated << "_" << hash_str; return truncated.str(); } @@ -626,7 +626,7 @@ std::string generateNameForGraph( if (!node->kind().is_aten()) { continue; } - graph_name << '_' << node->kind().toUnqualString(); + graph_name << "_" << node->kind().toUnqualString(); } return truncateStrWithHash(graph_name.str(), maxlen); } diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index 8dc4cb7ac9349..beb6f89519804 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -1798,7 +1798,7 @@ void initJITBindings(PyObject* module) { << "' with schema(s):\n"; for (const auto& op : sortedOps) { - docstring << " " << op->schema() << '\n'; + docstring << " " << op->schema() << "\n"; } py::list overload_names; diff --git a/torch/csrc/jit/python/python_arg_flatten.h b/torch/csrc/jit/python/python_arg_flatten.h index 472b257736491..232f5b6ea0812 100644 --- a/torch/csrc/jit/python/python_arg_flatten.h +++ b/torch/csrc/jit/python/python_arg_flatten.h @@ -79,17 +79,17 @@ static inline std::ostream& operator<<( out << ", "; out << meta.sizes[i]; } - out << '}'; + out << "}"; return out; } static inline std::ostream& operator<<( std::ostream& out, const IODescriptor& desc) { - out << desc.structure << '\n'; - out << " with grad_enabled=" << desc.grad_enabled << '\n'; + out << desc.structure << "\n"; + out << " with grad_enabled=" << desc.grad_enabled << "\n"; for (const auto i : c10::irange(desc.metadata.size())) { - out << " with v" << i << " having type " << desc.metadata[i] << '\n'; + out << " with v" << i << " having type " << desc.metadata[i] << "\n"; } return out; } diff --git a/torch/csrc/jit/python/python_ir.cpp b/torch/csrc/jit/python/python_ir.cpp index 6e5dcde957ddb..88794ecbf3d73 100644 --- a/torch/csrc/jit/python/python_ir.cpp +++ b/torch/csrc/jit/python/python_ir.cpp @@ -61,7 +61,7 @@ static std::ostream& printPyObject(std::ostream& out, const THPObjectPtr& obj) { // tuple.__str__; this doesn't work because Python doesn't allow // monkeypatching methods of built-in types. 
auto pytuple = pyobj.cast<py::tuple>(); - out << '('; + out << "("; size_t i = 0; for (const auto& o : pytuple) { if (i > 0) { @@ -72,9 +72,9 @@ static std::ostream& printPyObject(std::ostream& out, const THPObjectPtr& obj) { i++; } if (i == 1) { - out << ','; + out << ","; } - out << ')'; + out << ")"; return out; } else { return out << THPUtils_unpackString(py::str(pyobj).ptr()); @@ -154,14 +154,14 @@ std::optional<THPObjectPtr> ConcretePythonOp::autogradFunction() const { } void ConcretePythonOp::writeScalars(std::ostream& out) const { - out << '('; + out << "("; int i = 0; for (auto& scalar : scalar_args) { if (i++ > 0) out << ", "; printPyObject(out, scalar); } - out << ')'; + out << ")"; } void ConcretePythonOp::lint_python() const { @@ -506,7 +506,7 @@ void initPythonIRBindings(PyObject* module_) { "__repr__", [](Value& n) { std::stringstream ss; - ss << n.debugName() << " defined in (" << *n.node() << ')'; + ss << n.debugName() << " defined in (" << *n.node() << ")"; return ss.str(); }) .VS(type) diff --git a/torch/csrc/jit/python/python_tracer.cpp b/torch/csrc/jit/python/python_tracer.cpp index 9210311997384..81da1605fcbe2 100644 --- a/torch/csrc/jit/python/python_tracer.cpp +++ b/torch/csrc/jit/python/python_tracer.cpp @@ -55,8 +55,8 @@ SourceRange getPythonInterpreterSourceRange() { if (src && src->filename()) { auto line = src->starting_line_no() + src->lineno_for_offset(range.start()); - stack_trace << *(src->filename()) << '(' << line - << "): " << entry.filename << '\n'; + stack_trace << *(src->filename()) << "(" << line + << "): " << entry.filename << "\n"; if (!source_filename) { source_filename = *(src->filename()); source_line = line; @@ -218,7 +218,7 @@ void initPythonTracerBindings(PyObject* module) { "__repr__", [](const TracingState& s) { std::ostringstream ss; - ss << "<TracingState " << (const void*)&s << '>'; + ss << "<TracingState " << (const void*)&s << ">"; return ss.str(); }) .def( diff --git a/torch/csrc/jit/python/script_init.cpp b/torch/csrc/jit/python/script_init.cpp index ca75e6b986404..b9fbf4d1ec30f 100644 --- a/torch/csrc/jit/python/script_init.cpp +++ b/torch/csrc/jit/python/script_init.cpp @@ -497,7 +497,7 @@ static bool ivalue_tags_match(const Module& lhs, const Module& rhs) { if (item.a.isPtrType()) { // uncomment to debug type matching errors // std::cout << "MATCHING " << /*item.a <<*/ "(" << *item.a.type() << ") " - // << item.a.internalToPointer() << ' ' << /*item.b <<*/ " (" + // << item.a.internalToPointer() << " " << /*item.b <<*/ " (" // << *item.b.type() << ") " << item.b.internalToPointer() << // "\n"; @@ -902,7 +902,7 @@ void initJitScriptBindings(PyObject* module) { std::stringstream err; err << "Tried to deepcopy object "; if (auto qualname = class_type->name()) { - err << qualname->qualifiedName() << ' '; + err << qualname->qualifiedName() << " "; } err << "which does not have a __setstate__ method defined!"; throw std::runtime_error(err.str()); @@ -912,7 +912,7 @@ void initJitScriptBindings(PyObject* module) { std::stringstream err; err << "Tried to deepcopy object "; if (auto qualname = self.type()->name()) { - err << qualname->qualifiedName() << ' '; + err << qualname->qualifiedName() << " "; } err << "which does not have a __getstate__ method defined!"; throw std::runtime_error(err.str()); @@ -929,7 +929,7 @@ void initJitScriptBindings(PyObject* module) { std::stringstream err; err << "Tried to serialize object "; if (auto qualname = self.type()->name()) { - err << qualname->qualifiedName() << ' '; + err << qualname->qualifiedName() << " "; } err << "which does not have a __getstate__ method defined!"; throw
std::runtime_error(err.str()); @@ -966,7 +966,7 @@ void initJitScriptBindings(PyObject* module) { std::stringstream err; err << "Tried to deserialize object "; if (auto qualname = class_type->name()) { - err << qualname->qualifiedName() << ' '; + err << qualname->qualifiedName() << " "; } err << "which does not have a __setstate__ method defined!"; throw std::runtime_error(err.str()); diff --git a/torch/csrc/jit/runtime/argument_spec.cpp b/torch/csrc/jit/runtime/argument_spec.cpp index 667b94556f2b9..0a50a64e5f1bf 100644 --- a/torch/csrc/jit/runtime/argument_spec.cpp +++ b/torch/csrc/jit/runtime/argument_spec.cpp @@ -127,7 +127,7 @@ void ArgumentSpecCreator::dump() const { break; } } - std::cout << '\n'; + std::cout << "\n"; } ArgumentSpec ArgumentSpecCreator::create(bool with_grad, const Stack& input) diff --git a/torch/csrc/jit/runtime/argument_spec.h b/torch/csrc/jit/runtime/argument_spec.h index a7758f1674fab..1b4cf86a1963c 100644 --- a/torch/csrc/jit/runtime/argument_spec.h +++ b/torch/csrc/jit/runtime/argument_spec.h @@ -402,12 +402,12 @@ inline std::ostream& operator<<(std::ostream& out, const ArgumentInfo& info) { } out << "Tensor(device=" << info.device() << ", type=" << toString(info.type()) << ", requires_grad=" << info.requires_grad() << ", dims=" << info.dim() - << ')'; + << ")"; return out; } inline std::ostream& operator<<(std::ostream& out, const ArgumentSpec& spec) { - out << '{'; + out << "{"; for (const auto i : c10::irange(spec.numTensors())) { if (i > 0) out << ", "; @@ -419,7 +419,7 @@ inline std::ostream& operator<<(std::ostream& out, const ArgumentSpec& spec) { out << ", "; out << spec.isPresent(i); } - out << '}'; + out << "}"; return out; } @@ -431,20 +431,20 @@ inline std::ostream& operator<<( } out << "Tensor(device=" << info.device() << ", type=" << toString(info.type()) << ", requires_grad=" << info.requires_grad() - << ", sizes=" << info.sizes() << ", strides=" << info.strides() << ')'; + << ", sizes=" << info.sizes() << ", strides=" << info.strides() << ")"; return out; } inline std::ostream& operator<<( std::ostream& out, const CompleteArgumentSpec& spec) { - out << '{'; + out << "{"; for (const auto i : c10::irange(spec.size())) { if (i > 0) out << ", "; out << spec.at(i); } - out << '}'; + out << "}"; return out; } diff --git a/torch/csrc/jit/runtime/instruction.cpp b/torch/csrc/jit/runtime/instruction.cpp index 7388b8eac6700..b591bf7318b40 100644 --- a/torch/csrc/jit/runtime/instruction.cpp +++ b/torch/csrc/jit/runtime/instruction.cpp @@ -47,10 +47,10 @@ std::ostream& operator<<(std::ostream& out, Instruction inst) { auto nargs = std::strlen(OpInfo(inst.op)); out << inst.op; if (nargs > 0) { - out << ' ' << inst.X; + out << " " << inst.X; } if (nargs > 1) { - out << ' ' << inst.N; + out << " " << inst.N; } return out; } diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index 95b74376d2eb2..9d4d681f8b32f 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -213,7 +213,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { out << "Stack:\n"; for (const auto& val : stack) { out << val; - out << '\n'; + out << "\n"; } } @@ -929,7 +929,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { python_class_name ? 
*python_class_name : "RuntimeError"; ss << "The following operation failed in the TorchScript interpreter.\n"; formatStackTrace(ss); - ss << class_name << ": " << msg << '\n'; + ss << class_name << ": " << msg << "\n"; if (future_) { future_->setError(std::make_exception_ptr(Future::FutureError(ss.str()))); } else if (is_jit_exception) { @@ -942,7 +942,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target { not_implemented_error->caller()); } else { if (get_cpp_stacktraces_enabled()) { - ss << e.what() << '\n'; + ss << e.what() << "\n"; } throw std::runtime_error(ss.str()); } @@ -1143,7 +1143,7 @@ std::vector currentModuleHierarchy() { } std::ostream& operator<<(std::ostream& out, const Code& code) { - out << *code.pImpl->graph_ << '\n'; + out << *code.pImpl->graph_ << "\n"; code.pImpl->dump(out); return out; } diff --git a/torch/csrc/jit/runtime/interpreter/code_impl.h b/torch/csrc/jit/runtime/interpreter/code_impl.h index 3eddaec0dece4..02e64d1961513 100644 --- a/torch/csrc/jit/runtime/interpreter/code_impl.h +++ b/torch/csrc/jit/runtime/interpreter/code_impl.h @@ -866,17 +866,17 @@ struct CodeImpl { } void dump(std::ostream& out, size_t i) const { - out << i << ' ' << instructions_[i]; + out << i << " " << instructions_[i]; if (instructions_[i].op == OP || instructions_[i].op == CALL || instructions_[i].op == OPN) { out << " # " << *instructions_source_[i]; } else { - out << '\n'; + out << "\n"; } } void dump(std::ostream& out) const { - out << *graph_ << '\n'; + out << *graph_ << "\n"; for (const auto i : c10::irange(instructions_.size())) { dump(out, i); } diff --git a/torch/csrc/jit/runtime/register_prim_ops.cpp b/torch/csrc/jit/runtime/register_prim_ops.cpp index 1f168d24e8adf..310fe35ffaacb 100644 --- a/torch/csrc/jit/runtime/register_prim_ops.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops.cpp @@ -145,7 +145,7 @@ bool isSortableListOfObjectsOrTuples( why_not << "Only list of Tensors, ints, floats, bools, strs, " << "a User Defined Class that defines the __lt__ compare method " << "or Tuples of aforementioned types can be sorted, got list of " - << type->repr_str() << '\n'; + << type->repr_str() << "\n"; return false; } @@ -820,7 +820,7 @@ static const std::vector opGenArgs{ bool first = true; for (const IValue& i : last(stack, num_inputs)) { if (!first) - ss << ' '; + ss << " "; first = false; ss << i; } @@ -2971,10 +2971,10 @@ static const std::vector opGenArgs2{ auto i = pop(stack).toInt(); \ std::stringstream ss; \ if (i < 0) { \ - ss << '-'; \ + ss << "-"; \ i = -i; \ } \ - ss << '0' << prefix << char_op << i; \ + ss << "0" << prefix << char_op << i; \ push(stack, ss.str()); \ }, \ aliasAnalysisFromSchema()) @@ -2991,7 +2991,7 @@ static const std::vector opGenArgs2{ push(stack, "0b0"); } else { if (i < 0) { - ss << '-'; + ss << "-"; i = -i; } std::string str = std::bitset<8 * sizeof(i)>(i).to_string(); diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index 8ad348bb162c1..0a6e0b3564add 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -145,9 +145,9 @@ std::string dumpValueSet( std::ostringstream oss; oss << set_name << ": {"; for (const auto* val : value_set) { - oss << '%' << val->debugName() << ", "; + oss << "%" << val->debugName() << ", "; } - oss << '}'; + oss << "}"; return oss.str(); } @@ -1521,7 +1521,7 @@ void BlockRunner::benchmark( } else if (results.native_nodes.count(kind)) { std::cout << ", native)" << '\n'; } else { - std::cout << ')' << '\n'; + std::cout << 
")" << '\n'; } if (generate_ai_pep_output) { @@ -1566,13 +1566,13 @@ void BlockRunner::benchmark( auto unsupported_nodes_count = results.total_nodes_count - results.out_nodes_count - results.native_nodes.size(); std::cout << "Total number of 'out' variant nodes/total number of nodes: " - << results.out_nodes_count << '/' << results.total_nodes_count + << results.out_nodes_count << "/" << results.total_nodes_count << " (" << 100.0 * static_cast(results.out_nodes_count) / static_cast(results.total_nodes_count) << "%)" << '\n'; std::cout << "Total number of nodes not covered by SR/total number of nodes: " - << unsupported_nodes_count << '/' << results.total_nodes_count + << unsupported_nodes_count << "/" << results.total_nodes_count << " (" << 100.0 * static_cast(unsupported_nodes_count) / static_cast(results.total_nodes_count) diff --git a/torch/csrc/jit/serialization/onnx.cpp b/torch/csrc/jit/serialization/onnx.cpp index 499ed582a661d..82ce2e4e360c1 100644 --- a/torch/csrc/jit/serialization/onnx.cpp +++ b/torch/csrc/jit/serialization/onnx.cpp @@ -27,7 +27,7 @@ void dump(const onnx::TensorProto& tensor, std::ostream& stream) { for (const auto i : c10::irange(tensor.dims_size())) { stream << tensor.dims(i) << (i == tensor.dims_size() - 1 ? "" : " "); } - stream << ']'; + stream << "]"; } void dump(const onnx::TensorShapeProto& shape, std::ostream& stream) { @@ -36,7 +36,7 @@ void dump(const onnx::TensorShapeProto& shape, std::ostream& stream) { if (dim.has_dim_value()) { stream << dim.dim_value(); } else { - stream << '?'; + stream << "?"; } stream << (i == shape.dim_size() - 1 ? "" : " "); } @@ -67,7 +67,7 @@ void dump(const onnx::TypeProto_Optional& optional_type, std::ostream& stream) { } else { stream << "None"; } - stream << '>'; + stream << ">"; } void dump(const onnx::TypeProto_Sequence& sequence_type, std::ostream& stream) { @@ -77,7 +77,7 @@ void dump(const onnx::TypeProto_Sequence& sequence_type, std::ostream& stream) { } else { stream << "None"; } - stream << '>'; + stream << ">"; } void dump(const onnx::TypeProto& type, std::ostream& stream) { @@ -95,7 +95,7 @@ void dump(const onnx::TypeProto& type, std::ostream& stream) { void dump(const onnx::ValueInfoProto& value_info, std::ostream& stream) { stream << "{name: \"" << value_info.name() << "\", type:"; dump(value_info.type(), stream); - stream << '}'; + stream << "}"; } void dump(const onnx::GraphProto& graph, std::ostream& stream, size_t indent); @@ -123,36 +123,36 @@ void dump( for (const auto i : c10::irange(attr.floats_size())) { stream << attr.floats(i) << (i == attr.floats_size() - 1 ? "" : " "); } - stream << ']'; + stream << "]"; } else if (attr.ints_size()) { stream << "ints, values: ["; for (const auto i : c10::irange(attr.ints_size())) { stream << attr.ints(i) << (i == attr.ints_size() - 1 ? "" : " "); } - stream << ']'; + stream << "]"; } else if (attr.strings_size()) { stream << "strings, values: ["; for (const auto i : c10::irange(attr.strings_size())) { stream << "'" << attr.strings(i) << "'" << (i == attr.strings_size() - 1 ? 
"" : " "); } - stream << ']'; + stream << "]"; } else if (attr.tensors_size()) { stream << "tensors, values: ["; for (auto& t : attr.tensors()) { dump(t, stream); } - stream << ']'; + stream << "]"; } else if (attr.graphs_size()) { stream << "graphs, values: ["; for (auto& g : attr.graphs()) { dump(g, stream, indent + 1); } - stream << ']'; + stream << "]"; } else { stream << "UNKNOWN"; } - stream << '}'; + stream << "}"; } void dump(const onnx::NodeProto& node, std::ostream& stream, size_t indent) { @@ -174,31 +174,31 @@ void dump(const onnx::NodeProto& node, std::ostream& stream, size_t indent) { void dump(const onnx::GraphProto& graph, std::ostream& stream, size_t indent) { stream << idt(indent) << "GraphProto {" << nlidt(indent + 1) << "name: \"" - << graph.name() << '"' << nlidt(indent + 1) << "inputs: ["; + << graph.name() << "\"" << nlidt(indent + 1) << "inputs: ["; for (const auto i : c10::irange(graph.input_size())) { dump(graph.input(i), stream); stream << (i == graph.input_size() - 1 ? "" : ","); } - stream << ']' << nlidt(indent + 1) << "outputs: ["; + stream << "]" << nlidt(indent + 1) << "outputs: ["; for (const auto i : c10::irange(graph.output_size())) { dump(graph.output(i), stream); stream << (i == graph.output_size() - 1 ? "" : ","); } - stream << ']' << nlidt(indent + 1) << "value_infos: ["; + stream << "]" << nlidt(indent + 1) << "value_infos: ["; for (const auto i : c10::irange(graph.value_info_size())) { dump(graph.value_info(i), stream); stream << (i == graph.value_info_size() - 1 ? "" : ","); } - stream << ']' << nlidt(indent + 1) << "initializers: ["; + stream << "]" << nlidt(indent + 1) << "initializers: ["; for (const auto i : c10::irange(graph.initializer_size())) { dump(graph.initializer(i), stream); stream << (i == graph.initializer_size() - 1 ? 
"" : ","); } - stream << ']' << nlidt(indent + 1) << "nodes: [" << nlidt(indent + 2); + stream << "]" << nlidt(indent + 1) << "nodes: [" << nlidt(indent + 2); for (const auto i : c10::irange(graph.node_size())) { dump(graph.node(i), stream, indent + 2); if (i != graph.node_size() - 1) { - stream << ',' << nlidt(indent + 2); + stream << "," << nlidt(indent + 2); } } stream << nlidt(indent + 1) << "]\n" << idt(indent) << "}\n"; @@ -208,14 +208,14 @@ void dump( const onnx::OperatorSetIdProto& operator_set_id, std::ostream& stream) { stream << "OperatorSetIdProto { domain: " << operator_set_id.domain() - << ", version: " << operator_set_id.version() << '}'; + << ", version: " << operator_set_id.version() << "}"; } void dump(const onnx::ModelProto& model, std::ostream& stream, size_t indent) { stream << idt(indent) << "ModelProto {" << nlidt(indent + 1) - << "producer_name: \"" << model.producer_name() << '"' - << nlidt(indent + 1) << "domain: \"" << model.domain() << '"' - << nlidt(indent + 1) << "doc_string: \"" << model.doc_string() << '"'; + << "producer_name: \"" << model.producer_name() << "\"" + << nlidt(indent + 1) << "domain: \"" << model.domain() << "\"" + << nlidt(indent + 1) << "doc_string: \"" << model.doc_string() << "\""; if (model.has_graph()) { stream << nlidt(indent + 1) << "graph:\n"; dump(model.graph(), stream, indent + 2); diff --git a/torch/csrc/jit/serialization/pickler.cpp b/torch/csrc/jit/serialization/pickler.cpp index 1d5a2e77931c0..0622dbb5cd98e 100644 --- a/torch/csrc/jit/serialization/pickler.cpp +++ b/torch/csrc/jit/serialization/pickler.cpp @@ -130,7 +130,7 @@ void Pickler::pushIValueImpl(const IValue& ivalue) { err << "Cannot serialize custom bound C++ class"; if (memoized_class_types_ && !memoized_class_types_->empty()) { if (auto qualname = memoized_class_types_->back()->name()) { - err << ' ' << qualname->qualifiedName(); + err << " " << qualname->qualifiedName(); } } err << ". 
Please define serialization methods via def_pickle() for " diff --git a/torch/csrc/jit/serialization/python_print.cpp b/torch/csrc/jit/serialization/python_print.cpp index bf7e5250487d1..70e188816fb4c 100644 --- a/torch/csrc/jit/serialization/python_print.cpp +++ b/torch/csrc/jit/serialization/python_print.cpp @@ -381,7 +381,7 @@ struct PythonPrintImpl { static std::string makeValidIdentifier(const std::string& candidate) { std::stringstream ss; if (candidate.empty() || isdigit(candidate[0])) - ss << '_'; + ss << "_"; for (char c : candidate) { if (isupper(c) || islower(c) || isdigit(c) || c == '_') ss << c; @@ -487,11 +487,11 @@ struct PythonPrintImpl { if (isValidIdentifier(val_name)) { stmt << val_name; } else { - stmt << '(' << val_name << ')'; + stmt << "(" << val_name << ")"; } - stmt << '['; + stmt << "["; stmt << useOf(inputs[1]); - stmt << ']'; + stmt << "]"; } void printDict( @@ -534,7 +534,7 @@ struct PythonPrintImpl { body_ << " = "; // or if value is being assigned to something of a union type printValueList(body_, rhs); - body_ << '\n'; + body_ << "\n"; } bool requiresAnnotation(Value* lhs, Value* rhs) { @@ -555,7 +555,7 @@ struct PythonPrintImpl { if (requiresAnnotation(lhs[i], rhs[i])) { body_ << ": " << lhs[i]->type()->annotation_str(type_printer_); } - body_ << " = " << useOf(rhs[i]) << '\n'; + body_ << " = " << useOf(rhs[i]) << "\n"; } } @@ -705,7 +705,7 @@ struct PythonPrintImpl { printValueList(body_, node->outputs()); body_ << " = "; } - body_ << expr << '\n'; + body_ << expr << "\n"; } // Recursively check contained types for any class dependencies @@ -794,7 +794,7 @@ struct PythonPrintImpl { indent(); body_ << "return "; printValueList(body_, node->inputs()); - body_ << '\n'; + body_ << "\n"; } break; case prim::Loop: @@ -814,7 +814,7 @@ struct PythonPrintImpl { if (!node->outputs().empty()) { printValueList(body_, node->outputs(), "", ", = "); } - body_ << useOf(node->input()) << '\n'; + body_ << useOf(node->input()) << "\n"; break; case prim::SetAttr: { const auto obj = node->inputs().at(0); @@ -822,8 +822,8 @@ struct PythonPrintImpl { const auto type = obj->type()->expect(); const auto& attrname = node->s(attr::name); indent(); - body_ << useOf(obj) << '.' << attrname << " = " << useOf(newVal) - << '\n'; + body_ << useOf(obj) << "." 
<< attrname << " = " << useOf(newVal) + << "\n"; } break; case prim::fork: { // the subgraph gets emitted as another function @@ -836,7 +836,7 @@ struct PythonPrintImpl { } printBody(graph->block()); std::stringstream ss; - ss << "fork(" << name << ')'; + ss << "fork(" << name << ")"; printOutputDefinition(node, ss.str()); } break; case prim::awaitable: { @@ -850,7 +850,7 @@ struct PythonPrintImpl { } printBody(graph->block()); std::stringstream ss; - ss << "awaitable(" << name << ')'; + ss << "awaitable(" << name << ")"; printOutputDefinition(node, ss.str()); } break; case prim::Enter: { @@ -884,7 +884,7 @@ struct PythonPrintImpl { auto name = useOf(node->output())->str(); std::shared_ptr graph = node->g(attr::Subgraph); indent(); - body_ << "def " << name << '('; + body_ << "def " << name << "("; assignValuesToTheirUniqueNames(graph->inputs()); for (size_t i = 0; i < graph->inputs().size(); ++i) { Value* v = graph->inputs().at(i); @@ -903,7 +903,7 @@ struct PythonPrintImpl { assignValuesToTheirUniqueNames(out); indent(); body_ << useOf(out) << " : " << out->type()->annotation_str() << " = " - << useOf(container) << '[' << useOf(key) << "]\n"; + << useOf(container) << "[" << useOf(key) << "]\n"; } break; default: auto ss = std::make_shared(&source_range_stack_); @@ -992,7 +992,7 @@ struct PythonPrintImpl { // doing it here ensures we do not have fix up archives later stmt << "torch." << kind.toUnqualString(); } else { - stmt << "ops." << kind.ns().toUnqualString() << '.' + stmt << "ops." << kind.ns().toUnqualString() << "." << kind.toUnqualString(); } } @@ -1011,14 +1011,14 @@ struct PythonPrintImpl { << "If this is a nn.ModuleList, add it to __constants__"); } std::stringstream scalars_stream; - stmt << '^' << value->name(); + stmt << "^" << value->name(); value->writeScalars(scalars_stream); stmt << scalars_stream.str(); printValueList(stmt, node->inputs(), "(", ")"); } break; case prim::Uninitialized: { stmt << "uninitialized(" - << node->output()->type()->annotation_str(type_printer_) << ')'; + << node->output()->type()->annotation_str(type_printer_) << ")"; } break; case prim::Constant: { if (node->outputs().size() == 1 && @@ -1038,7 +1038,7 @@ struct PythonPrintImpl { case aten::IntImplicit: { stmt << "annotate(" << node->output()->type()->annotation_str(type_printer_) << ", " - << useOf(node->input()) << ')'; + << useOf(node->input()) << ")"; } break; case aten::Int: { printValueList(stmt, node->inputs(), "int(", ")"); @@ -1070,12 +1070,12 @@ struct PythonPrintImpl { stmt, node->inputs(), "(", node->inputs().size() == 1 ? 
",)" : ")"); } break; case prim::TupleIndex: { - stmt << '(' << useOf(node->inputs().at(0)) << ")[" - << useOf(node->inputs().at(1)) << ']'; + stmt << "(" << useOf(node->inputs().at(0)) << ")[" + << useOf(node->inputs().at(1)) << "]"; } break; case prim::TupleSlice: { - stmt << '(' << useOf(node->input()) << ")[" << node->i(attr::beg) << ':' - << node->i(attr::end) << ']'; + stmt << "(" << useOf(node->input()) << ")[" << node->i(attr::beg) << ":" + << node->i(attr::end) << "]"; } break; case prim::ListConstruct: { ListTypePtr list_type = node->output()->type()->expect(); @@ -1093,7 +1093,7 @@ struct PythonPrintImpl { stmt << "annotate(" << node->output()->type()->annotation_str(type_printer_) << ", "; printValueList(stmt, node->inputs(), "[", "]"); - stmt << ')'; + stmt << ")"; // Otherwise just print a list } else { printValueList(stmt, node->inputs(), "[", "]"); @@ -1112,7 +1112,7 @@ struct PythonPrintImpl { stmt << "annotate(" << node->output()->type()->annotation_str(type_printer_) << ", "; printDict(stmt, node->inputs()); - stmt << ')'; + stmt << ")"; // Otherwise just print a dict } else { printDict(stmt, node->inputs()); @@ -1121,36 +1121,37 @@ struct PythonPrintImpl { case prim::CreateObject: { const auto classType = node->output()->type()->expect(); stmt << classType->annotation_str(type_printer_) << ".__new__(" - << classType->annotation_str(type_printer_) << ')'; + << classType->annotation_str(type_printer_) << ")"; } break; case prim::GetAttr: { const auto obj = node->inputs().at(0); const auto classType = obj->type()->expect(); const auto& field = node->s(attr::name); if (isValidIdentifier(field)) { - stmt << useOf(obj) << '.' << field; + stmt << useOf(obj) << "." << field; } else { stmt << "getattr(" << useOf(obj) << ", "; std::stringstream field_stream; c10::printQuotedString(field_stream, field); - stmt << field_stream.str() << ')'; + stmt << field_stream.str() << ")"; } } break; case prim::CallFunction: { - stmt << useOf(node->inputs().at(0)) << '('; + stmt << useOf(node->inputs().at(0)) << "("; for (size_t i = 1; i < node->inputs().size(); i++) { stmt << useOf(node->inputs()[i]) << ", "; } - stmt << ')'; + stmt << ")"; } break; case prim::CallMethod: { const auto& self = node->inputs().at(0); const auto& methodName = node->s(attr::name); - stmt << '(' << useOf(self) << ')' << '.' << methodName << '('; + stmt << "(" << useOf(self) << ")" + << "." 
<< methodName << "("; for (size_t i = 1; i < node->inputs().size(); i++) { stmt << useOf(node->inputs()[i]) << ", "; } - stmt << ')'; + stmt << ")"; if (auto selfClass = self->type()->cast()) { deps_table_.add(selfClass); @@ -1168,7 +1169,7 @@ struct PythonPrintImpl { } break; case aten::_unwrap_optional: { printOpName(stmt, node->kind()); - stmt << '('; + stmt << "("; // we cannot recover the type of unwrap_optional(None), // using normal schema matching, so we route around this by rewriting // the call to unwrap_optional(annotated(Optional[T], None)) @@ -1176,11 +1177,11 @@ struct PythonPrintImpl { node->input()->mustBeNone()) { auto input_type = OptionalType::create(node->output()->type()); stmt << "annotate(" << input_type->annotation_str(type_printer_) - << ", " << useOf(node->input()) << ')'; + << ", " << useOf(node->input()) << ")"; } else { stmt << useOf(node->input()); } - stmt << ')'; + stmt << ")"; } break; // unchecked_unwrap_optional is no longer generated by the compiler, // but may end up here if it was first loaded from a old model and @@ -1190,7 +1191,7 @@ struct PythonPrintImpl { case prim::unchecked_cast: { stmt << "unchecked_cast(" << node->output()->type()->annotation_str(type_printer_) << ", " - << useOf(node->input()) << ')'; + << useOf(node->input()) << ")"; } break; case prim::isinstance: { stmt << "isinstance(" << useOf(node->input()) << ", "; @@ -1199,7 +1200,7 @@ struct PythonPrintImpl { stmt << types.at(0)->annotation_str(type_printer_); } else { // check multiple things, e.g. (str, list, int) - stmt << '('; + stmt << "("; bool first = true; for (const TypePtr& typ : types) { if (!first) { @@ -1208,29 +1209,30 @@ struct PythonPrintImpl { stmt << typ->annotation_str(type_printer_); first = false; } - stmt << ')'; + stmt << ")"; } - stmt << ')'; + stmt << ")"; } break; case prim::tolist: { stmt << "annotate(" << node->output()->type()->annotation_str(type_printer_) << ", "; - stmt << useOf(node->input(0)) << ".tolist()" << ')'; + stmt << useOf(node->input(0)) << ".tolist()" + << ")"; } break; case prim::EnumValue: // Note: This CAN NOT be printed as raw operator ops.prim.EnumValue // because its return type depends on type of enum and must be further // resolved, but ops.prim.EnumValue construction does not provide such // functionality. - stmt << '(' << useOf(node->input()) << ").value"; + stmt << "(" << useOf(node->input()) << ").value"; break; case prim::EnumName: - stmt << '(' << useOf(node->input()) << ").name"; + stmt << "(" << useOf(node->input()) << ").name"; break; default: { printOpName(stmt, node->kind()); const FunctionSchema& schema = node->schema(); - stmt << '('; + stmt << "("; // calculate how many args are specified. // see (https://github.com/pytorch/pytorch/pull/56079) for more // details. 
@@ -1255,7 +1257,7 @@ struct PythonPrintImpl { if (i < num_schema_args) { auto arg = schema.arguments().at(i); if (arg.kwarg_only()) { - stmt << arg.name() << '='; + stmt << arg.name() << "="; } } else { // vararg functions like format can have extra arguments @@ -1272,11 +1274,11 @@ struct PythonPrintImpl { // figure out the corresponding input at this index auto input_idx = node->inputs().size() - (num_schema_args - i); if (input_idx < node->inputs().size()) { - stmt << arg.name() << '=' << *useOf(node->inputs().at(input_idx)); + stmt << arg.name() << "=" << *useOf(node->inputs().at(input_idx)); } } } - stmt << ')'; + stmt << ")"; } break; } } @@ -1311,7 +1313,7 @@ struct PythonPrintImpl { const Argument& arg, TaggedStringStream& stmt, const IValue& value) { - stmt << '='; + stmt << "="; // handle broadcasting lists if (arg.type()->kind() == ListType::Kind && (value.isInt() || value.isDouble() || value.isBool())) { @@ -1361,7 +1363,7 @@ struct PythonPrintImpl { WithSourceRange guard(&source_range_stack_, graph.param_node()); indent(); - body_ << "def " << func.name() << '('; + body_ << "def " << func.name() << "("; auto param_it = graph.inputs().begin(); for (const Argument& arg : schema.arguments()) { registerClassDependencies(arg.type()); @@ -1446,14 +1448,14 @@ struct PythonPrintImpl { indent(); body_ << "__parameters__ = ["; for (const auto& param : params) { - body_ << '"' << param << "\", "; + body_ << "\"" << param << "\", "; } body_ << "]\n"; indent(); body_ << "__buffers__ = ["; for (const auto& buffer : buffers) { - body_ << '"' << buffer << "\", "; + body_ << "\"" << buffer << "\", "; } body_ << "]\n"; auto forwardPreHooks = classType->getForwardPreHooks(); @@ -1461,7 +1463,7 @@ struct PythonPrintImpl { indent(); body_ << "__forward_pre_hooks__ = ["; for (const auto& pre_hook : forwardPreHooks) { - body_ << '"' << pre_hook->name() << "\", "; + body_ << "\"" << pre_hook->name() << "\", "; } body_ << "]\n"; } @@ -1471,7 +1473,7 @@ struct PythonPrintImpl { indent(); body_ << "__forward_hooks__ = ["; for (const auto& hook : forwardHooks) { - body_ << '"' << hook->name() << "\", "; + body_ << "\"" << hook->name() << "\", "; } body_ << "]\n"; } @@ -1494,12 +1496,13 @@ struct PythonPrintImpl { } // Print out a direct manipulation of the annotations dict, like: // __annotations__["0"] = SomeType - body_ << "__annotations__[" << '"' << name - << "\"] = " << type->annotation_str(type_printer_) << '\n'; + body_ << "__annotations__[" + << "\"" << name + << "\"] = " << type->annotation_str(type_printer_) << "\n"; } else { // Otherwise: just emit a python 3 attribute annotation, like: // foo : SomeType - body_ << name << " : " << type->annotation_str(type_printer_) << '\n'; + body_ << name << " : " << type->annotation_str(type_printer_) << "\n"; } } @@ -1513,7 +1516,7 @@ struct PythonPrintImpl { << "Final[" << v.type()->annotation_str(type_printer_) << "] = "; auto ss = std::make_shared(&source_range_stack_); printConstant(*ss, v); - body_ << ss->str() << '\n'; + body_ << ss->str() << "\n"; } // TODO fields @@ -1551,7 +1554,7 @@ struct PythonPrintImpl { TORCH_INTERNAL_ASSERT(attr.type()); indent(); body_ << attr.name() << " : " - << attr.type()->annotation_str(type_printer_) << '\n'; + << attr.type()->annotation_str(type_printer_) << "\n"; } } } else if (auto interfaceType = type->cast()) { @@ -1597,7 +1600,7 @@ struct PythonPrintImpl { for (const auto& name_value : enumType->enumNamesValues()) { indent(); body_ << name_value.first << " = " << value_wrapper - << name_value.second << 
value_wrapper << '\n'; + << name_value.second << value_wrapper << "\n"; } } } else { diff --git a/torch/csrc/jit/tensorexpr/block_codegen.cpp b/torch/csrc/jit/tensorexpr/block_codegen.cpp index 6ec55f998cce0..24228cdea32dd 100644 --- a/torch/csrc/jit/tensorexpr/block_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/block_codegen.cpp @@ -132,7 +132,7 @@ void BlockPrinter::visit(const ForPtr& v) { os() << '\n'; emitIndent(); PrintReshapeInfo(buf_writes, true); // print reverse reshape - os() << '}'; + os() << "}"; os() << '\n'; } else if (loop_options.is_gpu_thread_index()) { PrintDMAs(buf_reads); @@ -154,12 +154,12 @@ void BlockPrinter::PrintTensorInfo(const std::unordered_set& bufs) { emitIndent(); auto num_dims = block_analysis_->getMultiDimBuf(buf)->dims().size(); os() << block_analysis_->getInputName(buf) << " = "; - os() << '{'; + os() << "{"; for (unsigned long d = 0; d < num_dims; d++) { - os() << '{' << dim_names[d] << "};"; + os() << "{" << dim_names[d] << "};"; } os() << " elem : " << blockDtypeCppString(buf->dtype()); - os() << '}'; + os() << "}"; } for (auto& buf : bufs) { @@ -168,14 +168,15 @@ void BlockPrinter::PrintTensorInfo(const std::unordered_set& bufs) { emitIndent(); auto num_dims = block_analysis_->getMultiDimBuf(buf)->dims().size(); os() << block_analysis_->getFlatInputName(buf) << " = "; - os() << '{'; - os() << '{' << flat_dim_names[num_dims - 1] << "};"; + os() << "{"; + os() << "{" << flat_dim_names[num_dims - 1] << "};"; os() << " elem : " << blockDtypeCppString(buf->dtype()); - os() << '}' << " // flattened tensor"; + os() << "}" + << " // flattened tensor"; } os() << '\n'; emitIndent(); - os() << '}' << '\n' << '\n'; + os() << "}" << '\n' << '\n'; } void BlockPrinter::PrintArguments(const std::unordered_set& bufs) { @@ -212,7 +213,7 @@ void BlockPrinter::PrintArguments(const std::unordered_set& bufs) { emitIndent(); os() << "var bs_DPE = " << blck_sz << '\n'; emitIndent(); - os() << '}' << '\n' << '\n'; + os() << "}" << '\n' << '\n'; } void BlockPrinter::PrintBufferInfo(const std::unordered_set& bufs) { @@ -229,7 +230,7 @@ void BlockPrinter::PrintBufferInfo(const std::unordered_set& bufs) { } os() << '\n'; emitIndent(); - os() << '}' << '\n' << '\n'; + os() << "}" << '\n' << '\n'; } void BlockPrinter::PrintDistribution(const std::unordered_set& bufs) { @@ -252,14 +253,14 @@ void BlockPrinter::PrintLoop( auto trip = 0; for (auto& buf : bufs) { if (trip > 0) { - os() << ','; + os() << ","; } os() << "{dim : "; os() << block_analysis_->getFlatInputName(buf) << ".dim.0, "; os() << (block_idx ? "block: bs_N}" : "block: bs_DPE}"); ++trip; } - os() << ')'; + os() << ")"; } void BlockPrinter::PrintReshapeInfo( @@ -273,7 +274,7 @@ void BlockPrinter::PrintReshapeInfo( << ", " << (reverse ? 
block_analysis_->getInputName(buf) : block_analysis_->getFlatInputName(buf)) - << ')' << '\n'; + << ")" << '\n'; } } @@ -282,7 +283,7 @@ void BlockPrinter::PrintDMAs(const std::unordered_set& bufs) { emitIndent(); os() << "dma_in("; os() << block_analysis_->getFlatInputName(read); - os() << ')' << '\n'; + os() << ")" << '\n'; } } void BlockPrinter::PrintAdjustBuffers(const std::unordered_set& bufs) { @@ -290,7 +291,7 @@ void BlockPrinter::PrintAdjustBuffers(const std::unordered_set& bufs) { emitIndent(); os() << "adjust_buffer("; os() << block_analysis_->getFlatInputName(read); - os() << ')' << '\n'; + os() << ")" << '\n'; } } @@ -304,14 +305,14 @@ void BlockPrinter::visit(const StorePtr& v) { } void BlockPrinter::visit(const BlockPtr& v) { - os() << '{' << '\n'; + os() << "{" << '\n'; indent_++; for (const StmtPtr& s : v->stmts()) { s->accept(this); } indent_--; emitIndent(); - os() << '}'; + os() << "}"; } std::string BlockCodeGen::GetUniqueFuncName(const std::string& func_prefix) { @@ -340,14 +341,14 @@ void BlockCodeGen::Initialize() { }; std::string func_name = GetUniqueFuncName("func"); - os() << "kernel " << func_name << '('; + os() << "kernel " << func_name << "("; for (auto const& arg : buf_writes) { os() << block_analysis_->getInputName(arg); } for (auto const& arg : buf_reads) { - os() << ';' << block_analysis_->getInputName(arg); + os() << ";" << block_analysis_->getInputName(arg); } - os() << ')'; + os() << ")"; stmt_v->accept(printer_.get()); diff --git a/torch/csrc/jit/tensorexpr/bounds_inference.cpp b/torch/csrc/jit/tensorexpr/bounds_inference.cpp index 034f51f46b8f7..bbc9d845fa4f7 100644 --- a/torch/csrc/jit/tensorexpr/bounds_inference.cpp +++ b/torch/csrc/jit/tensorexpr/bounds_inference.cpp @@ -128,10 +128,10 @@ void printBoundsInfo(const BoundsInfo& v) { if (!first) { std::cerr << ", "; } - std::cerr << ((b.kind == kLoad) ? "LOAD" : "STORE") << '('; + std::cerr << ((b.kind == kLoad) ? 
"LOAD" : "STORE") << "("; int i = 0; if (b.start.empty()) { - std::cerr << '0'; + std::cerr << "0"; } for (auto& s : b.start) { if (i != 0) { @@ -143,7 +143,7 @@ void printBoundsInfo(const BoundsInfo& v) { std::cerr << "; "; i = 0; if (b.stop.empty()) { - std::cerr << '0'; + std::cerr << "0"; } for (auto& s : b.stop) { if (i != 0) { @@ -152,7 +152,7 @@ void printBoundsInfo(const BoundsInfo& v) { std::cerr << *s; i++; } - std::cerr << ')'; + std::cerr << ")"; first = false; } std::cerr << "]\n"; diff --git a/torch/csrc/jit/tensorexpr/bounds_overlap.cpp b/torch/csrc/jit/tensorexpr/bounds_overlap.cpp index 0c785504efe85..0c352e3b19f3b 100644 --- a/torch/csrc/jit/tensorexpr/bounds_overlap.cpp +++ b/torch/csrc/jit/tensorexpr/bounds_overlap.cpp @@ -35,7 +35,7 @@ static bool mustBeZero(const ExprPtr& e) { } void Bound::print() const { - std::cout << '(' << *start << ", " << *end << ')'; + std::cout << "(" << *start << ", " << *end << ")"; } bool Bound::equals(const Bound& other) const { diff --git a/torch/csrc/jit/tensorexpr/codegen.cpp b/torch/csrc/jit/tensorexpr/codegen.cpp index b19a8b8964ad5..41e54869850c8 100644 --- a/torch/csrc/jit/tensorexpr/codegen.cpp +++ b/torch/csrc/jit/tensorexpr/codegen.cpp @@ -41,7 +41,7 @@ RegisterCodeGenList::StmtFactoryMethod RegisterCodeGenList:: oss << entry.first; index++; } - oss << ']'; + oss << "]"; throw std::runtime_error(oss.str()); } return iter->second; diff --git a/torch/csrc/jit/tensorexpr/cpp_codegen.cpp b/torch/csrc/jit/tensorexpr/cpp_codegen.cpp index 6b03b939ace99..fa42d48c75e93 100644 --- a/torch/csrc/jit/tensorexpr/cpp_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/cpp_codegen.cpp @@ -89,28 +89,28 @@ static inline std::enable_if_t, void> visit_mod( std::ostream& os, const ExprPtr& lhs, const ExprPtr& rhs) { - os << "std::fmod(" << *lhs << ", " << *rhs << ')'; + os << "std::fmod(" << *lhs << ", " << *rhs << ")"; } template static inline std:: enable_if_t || std::is_integral_v, void> visit_max(std::ostream& os, const ExprPtr& lhs, const ExprPtr& rhs) { - os << "std::max(" << *lhs << ", " << *rhs << ')'; + os << "std::max(" << *lhs << ", " << *rhs << ")"; } template static inline std:: enable_if_t && !std::is_integral_v, void> visit_max(std::ostream& os, const ExprPtr& lhs, const ExprPtr& rhs) { - os << '(' << *lhs << " < " << *rhs << ") ? " << *rhs << " : " << *lhs; + os << "(" << *lhs << " < " << *rhs << ") ? " << *rhs << " : " << *lhs; } template static inline std:: enable_if_t || std::is_integral_v, void> visit_min(std::ostream& os, const ExprPtr& lhs, const ExprPtr& rhs) { - os << "std::min(" << *lhs << ", " << *rhs << ')'; + os << "std::min(" << *lhs << ", " << *rhs << ")"; } template @@ -176,14 +176,14 @@ void CppPrinter::visit(const MinPtr& v) { } void CppPrinter::visit(const CompareSelectPtr& v) { - os() << "((" << *v->lhs() << ' ' - << IRPrinter::to_string(v->compare_select_op()) << ' ' << *v->rhs() - << ") ? " << *v->ret_val1() << " : " << *v->ret_val2() << ')'; + os() << "((" << *v->lhs() << " " + << IRPrinter::to_string(v->compare_select_op()) << " " << *v->rhs() + << ") ? " << *v->ret_val1() << " : " << *v->ret_val2() << ")"; } void CppPrinter::visit(const IfThenElsePtr& v) { os() << "((" << *v->condition() << ") ? 
" << *v->true_value() << " : " - << *v->false_value() << ')'; + << *v->false_value() << ")"; } void CppPrinter::visit(const AllocatePtr& v) { @@ -211,7 +211,7 @@ void CppPrinter::visit(const FreePtr& v) { void CppPrinter::visit(const LoadPtr& v) { auto flat_idx = flatten_index(v->buf()->dims(), v->indices(), v->buf()->strides()); - os() << *v->base_handle() << '[' << *flat_idx << ']'; + os() << *v->base_handle() << "[" << *flat_idx << "]"; } void CppPrinter::visit(const StorePtr& v) { @@ -221,19 +221,19 @@ void CppPrinter::visit(const StorePtr& v) { for (int lane = 0; lane < lanes; lane++) { lane_ = lane; emitIndent(); - os() << *v->base_handle() << '[' << *flat_idx << "] = " << *v->value() - << ';' << '\n'; + os() << *v->base_handle() << "[" << *flat_idx << "] = " << *v->value() + << ";" << '\n'; } } void CppPrinter::visit(const CastPtr& v) { os() << "static_cast<" << v->dtype().ToCppString() << ">(" << *v->src_value() - << ')'; + << ")"; } void CppPrinter::visit(const BitCastPtr& v) { os() << "std::bitcast<" << v->src_value()->dtype().ToCppString() << ", " - << v->dtype().ToCppString() << ">(" << *v->src_value() << ')'; + << v->dtype().ToCppString() << ">(" << *v->src_value() << ")"; } void CppPrinter::visit(const IntrinsicsPtr& v) { @@ -241,14 +241,14 @@ void CppPrinter::visit(const IntrinsicsPtr& v) { throw std::runtime_error("kRand and kSigmoid are not supported"); } - os() << "std::" << v->func_name() << '('; + os() << "std::" << v->func_name() << "("; for (size_t i = 0; i < v->nparams(); i++) { if (i > 0) { os() << ", "; } os() << *v->param(i); } - os() << ')'; + os() << ")"; } void CppPrinter::visit(const ExternalCallPtr& v) { @@ -272,7 +272,7 @@ void CppPrinter::visit(const ExternalCallPtr& v) { }; emitIndent(); - os() << '{' << '\n'; + os() << "{" << '\n'; indent_++; emitIndent(); @@ -315,9 +315,9 @@ void CppPrinter::visit(const ExternalCallPtr& v) { os() << "};" << '\n'; emitIndent(); - os() << v->func_name() << '(' << '\n'; + os() << v->func_name() << "(" << '\n'; emitIndent(); - os() << " " << bufs.size() << ',' << '\n'; + os() << " " << bufs.size() << "," << '\n'; emitIndent(); os() << " buf_ptrs," << '\n'; emitIndent(); @@ -327,20 +327,20 @@ void CppPrinter::visit(const ExternalCallPtr& v) { emitIndent(); os() << " buf_dtypes," << '\n'; emitIndent(); - os() << " " << v->args().size() << ',' << '\n'; + os() << " " << v->args().size() << "," << '\n'; emitIndent(); os() << " extra_args);" << '\n'; indent_--; emitIndent(); - os() << '}' << '\n'; + os() << "}" << '\n'; } void CppPrinter::visit(const LetPtr& v) { if (v->var()->dtype().lanes() == 1) { emitIndent(); - os() << v->var()->dtype().ToCppString() << ' ' << *v->var() << " = " - << *v->value() << ';' << '\n'; + os() << v->var()->dtype().ToCppString() << " " << *v->var() << " = " + << *v->value() << ";" << '\n'; } else { vector_vars_[v->var()] = v->value(); } @@ -370,7 +370,7 @@ void CppCodeGen::init() { apply_visitor(var_name_rewriter_.get()); printer_->printPrologue(); - os() << "void " << kernel_func_name() << '('; + os() << "void " << kernel_func_name() << "("; const std::vector buffer_args = this->buffer_args(); for (size_t i = 0; i < buffer_args.size(); i++) { if (i > 0) { @@ -381,7 +381,7 @@ void CppCodeGen::init() { Dtype dtype = buffer_arg.dtype(); os() << dtype.ToCppString() << (buffer_arg.isVar() ? 
" " : "* ") << *var; } - os() << ')'; + os() << ")"; stmt()->accept(printer_.get()); os() << '\n'; } diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp index 264e01d65db94..6131b55883dfb 100644 --- a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp +++ b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp @@ -195,8 +195,8 @@ void CudaPrinter::print_flat_alloc(const AllocatePtr& alloc) { throw std::runtime_error("Only integer dimensions are supported for now"); } } - os() << dtypeToCppString(alloc->dtype()) << ' ' << (*alloc->buffer_var()) - << '[' << flat_size << "];" << '\n'; + os() << dtypeToCppString(alloc->dtype()) << " " << (*alloc->buffer_var()) + << "[" << flat_size << "];" << '\n'; } void CudaPrinter::visit(const AllocatePtr& v) { @@ -234,9 +234,9 @@ void CudaPrinter::visit(const CastPtr& v) { : v->src_value()->dtype().scalar_type() == ScalarType::BFloat16 ? "__bfloat162float" : ("(" + dtypeToCppString(v->dtype()) + ")"); - os() << castFn << '('; + os() << castFn << "("; v->src_value()->accept(this); - os() << ')'; + os() << ")"; } void CudaPrinter::visit(const IntrinsicsPtr& v) { @@ -265,14 +265,14 @@ void CudaPrinter::visit(const IntrinsicsPtr& v) { func_name = "isnan"; } - os() << func_name << '('; + os() << func_name << "("; for (const auto i : c10::irange(v->nparams())) { if (i > 0) { os() << ", "; } os() << *v->param(i); } - os() << ')'; + os() << ")"; } void CudaPrinter::visit(const ExternalCallPtr& v) { @@ -293,15 +293,15 @@ void CudaPrinter::visit(const LoadPtr& v) { v->dtype().scalar_type() == ScalarType::Half || v->dtype().scalar_type() == ScalarType::BFloat16) { // There's no __ldg overload for bool or half. - os() << *v->base_handle() << '[' << *v->flat_index() << ']'; + os() << *v->base_handle() << "[" << *v->flat_index() << "]"; return; } if (cuda_analysis_->is_buf_store_target(v->buf())) { // Cuda __ldg can only be applied on read-only buffers. - os() << *v->base_handle() << '[' << *v->flat_index() << ']'; + os() << *v->base_handle() << "[" << *v->flat_index() << "]"; return; } - os() << "__ldg(" << *v->base_handle() << " + " << *v->flat_index() << ')'; + os() << "__ldg(" << *v->base_handle() << " + " << *v->flat_index() << ")"; } // TODO: maybe this should be a more shared location? 
@@ -412,9 +412,9 @@ void CudaPrinter::visit(const StorePtr& v) { if (v->indices().empty()) { os() << *v->base_handle() << " = "; } else { - os() << *v->base_handle() << '[' << *v->flat_index() << "] = "; + os() << *v->base_handle() << "[" << *v->flat_index() << "] = "; } - os() << *v->value() << ';'; + os() << *v->value() << ";"; os() << '\n'; } @@ -422,10 +422,10 @@ void CudaPrinter::visit(const AtomicAddPtr& v) { emitIndent(); if (cuda_analysis_->thread_local_bufs().count(v->base_handle()) > 0) { // atomicAdd only works on global and shared memory - os() << *v->base_handle() << '[' << *v->flat_index() - << "] += " << *v->value() << ';'; + os() << *v->base_handle() << "[" << *v->flat_index() + << "] += " << *v->value() << ";"; } else { - os() << "atomicAdd(&" << *v->base_handle() << '[' << *v->flat_index() << ']' + os() << "atomicAdd(&" << *v->base_handle() << "[" << *v->flat_index() << "]" << ", " << *v->value() << ");"; } os() << '\n'; @@ -438,9 +438,9 @@ void CudaPrinter::visit(const MaxPtr& v) { os() << "maximum("; } v->lhs()->accept(this); - os() << ','; + os() << ","; v->rhs()->accept(this); - os() << ')'; + os() << ")"; } void CudaPrinter::visit(const MinPtr& v) { @@ -450,9 +450,9 @@ void CudaPrinter::visit(const MinPtr& v) { os() << "minimum("; } v->lhs()->accept(this); - os() << ','; + os() << ","; v->rhs()->accept(this); - os() << ')'; + os() << ")"; } void CudaPrinter::visit(const IfThenElsePtr& v) { @@ -462,11 +462,11 @@ void CudaPrinter::visit(const IfThenElsePtr& v) { v->true_value()->accept(this); os() << " : "; v->false_value()->accept(this); - os() << ')'; + os() << ")"; } void CudaPrinter::visit(const BlockPtr& v) { - os() << '{' << '\n'; + os() << "{" << '\n'; indent_++; for (const StmtPtr& s : v->stmts()) { @@ -475,15 +475,15 @@ void CudaPrinter::visit(const BlockPtr& v) { indent_--; emitIndent(); - os() << '}'; + os() << "}"; } void CudaPrinter::visit(const LetPtr& v) { emitIndent(); os() << dtypeToCppString(v->var()->dtype()); - os() << ' ' << *v->var() << " = "; + os() << " " << *v->var() << " = "; v->value()->accept(this); - os() << ';' << '\n'; + os() << ";" << '\n'; } class PrioritizeLoad : public IRMutator { @@ -911,7 +911,7 @@ void CudaCodeGen::Initialize() { // https://clang.llvm.org/docs/AttributeReference.html#amdgpu-flat-work-group-size os() << "__attribute__((amdgpu_flat_work_group_size(1, 1024)))" << std::endl; #endif - os() << "void " << func_name << '('; + os() << "void " << func_name << "("; const std::vector buffer_args = this->buffer_args(); for (size_t i = 0; i < buffer_args.size(); i++) { if (i > 0) { @@ -932,7 +932,7 @@ void CudaCodeGen::Initialize() { rand_seed = alloc("rand_seed", kInt); rand_offset = alloc("rand_offset", kInt); std::string uint64_str = "unsigned long long"; - os() << ", " << uint64_str << ' ' << *rand_seed << ", " << uint64_str << ' ' + os() << ", " << uint64_str << " " << *rand_seed << ", " << uint64_str << " " << *rand_offset; } os() << ") {"; @@ -942,7 +942,7 @@ void CudaCodeGen::Initialize() { VarPtr idx = alloc("idx", kInt); os() << "int " << *idx << " = blockIdx.x*blockDim.x + threadIdx.x;" << '\n'; VarPtr rand_func = printer_->rand_func(); - os() << "Philox " << *rand_func << '(' << *rand_seed << ", " << *idx << ", " + os() << "Philox " << *rand_func << "(" << *rand_seed << ", " << *idx << ", " << *rand_offset << ");" << '\n'; os() << '\n'; } @@ -969,7 +969,7 @@ void CudaCodeGen::Initialize() { stmt_v->accept(printer_.get()); os() << '\n'; - os() << '}'; + os() << "}"; // Check that all block extents had been set. 
const std::vector& gpu_block_extents = diff --git a/torch/csrc/jit/tensorexpr/ir_printer.cpp b/torch/csrc/jit/tensorexpr/ir_printer.cpp index 31b7866a73d21..9b2ecd0e11515 100644 --- a/torch/csrc/jit/tensorexpr/ir_printer.cpp +++ b/torch/csrc/jit/tensorexpr/ir_printer.cpp @@ -71,21 +71,21 @@ static void visitBinaryOp( int rhs_prec = getPrecedence(v->rhs()->expr_type()); if (lhs_prec >= self_prec) { - os << '('; + os << "("; } v->lhs()->accept(printer); if (lhs_prec >= self_prec) { - os << ')'; + os << ")"; } - os << ' ' << op_str << ' '; + os << " " << op_str << " "; if (rhs_prec >= self_prec) { - os << '('; + os << "("; } v->rhs()->accept(printer); if (rhs_prec >= self_prec) { - os << ')'; + os << ")"; } } @@ -129,7 +129,7 @@ void IRPrinter::visit(const ModPtr& v) { if (v->dtype().is_integral()) { visitBinaryOp(v, "%", this); } else if (v->dtype().is_floating_point()) { - os() << "mod(" << *v->lhs() << ", " << *v->rhs() << ')'; + os() << "mod(" << *v->lhs() << ", " << *v->rhs() << ")"; } else { throw std::runtime_error("invalid dtype: " + std::to_string(v->dtype())); } @@ -140,7 +140,7 @@ void IRPrinter::visit(const MaxPtr& v) { v->lhs()->accept(this); os() << ", "; v->rhs()->accept(this); - os() << ", " << (unsigned int)v->propagate_nans() << ')'; + os() << ", " << (unsigned int)v->propagate_nans() << ")"; } void IRPrinter::visit(const MinPtr& v) { @@ -148,7 +148,7 @@ void IRPrinter::visit(const MinPtr& v) { v->lhs()->accept(this); os() << ", "; v->rhs()->accept(this); - os() << ", " << (unsigned int)v->propagate_nans() << ')'; + os() << ", " << (unsigned int)v->propagate_nans() << ")"; } void IRPrinter::visit(const CompareSelectPtr& v) { @@ -158,32 +158,32 @@ void IRPrinter::visit(const CompareSelectPtr& v) { int rhs_prec = getPrecedence(v->rhs()->expr_type()); if (lhs_prec >= self_prec) { - os() << '('; + os() << "("; } v->lhs()->accept(this); if (lhs_prec >= self_prec) { - os() << ')'; + os() << ")"; } os() << to_string(cmp_op); if (rhs_prec >= self_prec) { - os() << '('; + os() << "("; } v->rhs()->accept(this); if (rhs_prec >= self_prec) { - os() << ')'; + os() << ")"; } os() << " ? 
"; auto withParens = [&](const ExprPtr& e) { auto prec = getPrecedence(e->expr_type()); if (prec >= self_prec) { - os() << '('; + os() << "("; } e->accept(this); if (prec >= self_prec) { - os() << ')'; + os() << ")"; } }; withParens(v->ret_val1()); @@ -237,16 +237,16 @@ AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, IMM_PRINT_VISIT) void IRPrinter::visit(const CastPtr& v) { auto dtype = v->dtype(); - os() << dtypeToCppString(dtype) << '('; + os() << dtypeToCppString(dtype) << "("; v->src_value()->accept(this); - os() << ')'; + os() << ")"; } void IRPrinter::visit(const BitCastPtr& v) { auto dtype = v->dtype(); os() << "BitCast<" << dtype.ToCppString() << ">("; v->src_value()->accept(this); - os() << ')'; + os() << ")"; } void IRPrinter::visit(const VarPtr& v) { @@ -273,7 +273,7 @@ void IRPrinter::visit(const BufPtr& v) { } s->accept(this); } - os() << ']'; + os() << "]"; os() << ", strides=["; i = 0; for (const ExprPtr& s : v->strides()) { @@ -282,14 +282,14 @@ void IRPrinter::visit(const BufPtr& v) { } s->accept(this); } - os() << ']'; + os() << "]"; - os() << ')'; + os() << ")"; } void IRPrinter::visit(const RampPtr& v) { os() << "Ramp(" << *v->base() << ", " << *v->stride() << ", " << v->lanes() - << ')'; + << ")"; } void IRPrinter::visit(const LoadPtr& v) { @@ -297,7 +297,7 @@ void IRPrinter::visit(const LoadPtr& v) { if (v->indices().empty()) { os() << *v->base_handle(); } else { - os() << *v->base_handle() << '['; + os() << *v->base_handle() << "["; size_t i = 0; for (const ExprPtr& ind : v->indices()) { if (i++) { @@ -306,40 +306,40 @@ void IRPrinter::visit(const LoadPtr& v) { ind->accept(this); } if (v->indices().empty()) { - os() << '0'; + os() << "0"; } - os() << ']'; + os() << "]"; } } void IRPrinter::visit(const BroadcastPtr& v) { - os() << "Broadcast(" << *v->value() << ", " << v->lanes() << ')'; + os() << "Broadcast(" << *v->value() << ", " << v->lanes() << ")"; } void IRPrinter::visit(const IfThenElsePtr& v) { os() << "IfThenElse(" << *v->condition() << ", " << *v->true_value() << ", " - << *v->false_value() << ')'; + << *v->false_value() << ")"; } void IRPrinter::visit(const IntrinsicsPtr& v) { - os() << v->func_name() << '('; + os() << v->func_name() << "("; for (const auto i : c10::irange(v->nparams())) { if (i > 0) { os() << ", "; } os() << *v->param(i); } - os() << ')'; + os() << ")"; } void IRPrinter::visit(const TermPtr& v) { os() << "Term("; v->scalar()->accept(this); for (const auto& t : v->variables()) { - os() << ','; + os() << ","; t->accept(this); } - os() << ')'; + os() << ")"; } void IRPrinter::visit(const PolynomialPtr& v) { @@ -357,7 +357,7 @@ void IRPrinter::visit(const PolynomialPtr& v) { os() << " + "; } v->scalar()->accept(this); - os() << ')'; + os() << ")"; } void IRPrinter::visit(const RoundOffPtr& v) { @@ -365,7 +365,7 @@ void IRPrinter::visit(const RoundOffPtr& v) { v->lhs()->accept(this); os() << ", "; v->rhs()->accept(this); - os() << ')'; + os() << ")"; } void IRPrinter::visit(const MaxTermPtr& v) { @@ -380,7 +380,7 @@ void IRPrinter::visit(const MaxTermPtr& v) { os() << ", "; } } - os() << ')'; + os() << ")"; } void IRPrinter::visit(const MinTermPtr& v) { @@ -395,7 +395,7 @@ void IRPrinter::visit(const MinTermPtr& v) { os() << ", "; } } - os() << ')'; + os() << ")"; } void IRPrinter::visit(const ReduceOpPtr& v) { @@ -423,11 +423,11 @@ void IRPrinter::visit(const ReduceOpPtr& v) { void IRPrinter::visit(const StorePtr& v) { // TODO: handle the mask if (v->indices().empty()) { - os() << *v->base_handle() << " = " << *v->value() << ';'; + os() << 
*v->base_handle() << " = " << *v->value() << ";"; return; } - os() << *v->base_handle() << '['; + os() << *v->base_handle() << "["; size_t i = 0; for (const ExprPtr& ind : v->indices()) { if (i++) { @@ -436,15 +436,15 @@ void IRPrinter::visit(const StorePtr& v) { ind->accept(this); } if (v->indices().empty()) { - os() << '0'; + os() << "0"; } - os() << "] = " << *v->value() << ';'; + os() << "] = " << *v->value() << ";"; } void IRPrinter::visit(const ForPtr& v) { VarPtr var = v->var(); VarHandle vv(var); - os() << "for (" << dtypeToCppString(var->dtype()) << ' ' << vv << " = " + os() << "for (" << dtypeToCppString(var->dtype()) << " " << vv << " = " << ExprHandle(v->start()) << "; " << vv << " < " << ExprHandle(v->stop()) << "; " << vv << "++) "; std::string loop_options_str = v->loop_options().ToString(); @@ -464,11 +464,11 @@ void IRPrinter::visit(const BlockPtr& v) { for (const StmtPtr& s : *v) { emitIndent(); - os() << *s << '\n'; + os() << *s << "\n"; } indent_--; emitIndent(); - os() << '}'; + os() << "}"; } void IRPrinter::visit(const AllocatePtr& v) { @@ -482,7 +482,7 @@ void IRPrinter::visit(const AllocatePtr& v) { } os() << *dims[i]; } - os() << ']'; + os() << "]"; } void IRPrinter::visit(const FreePtr& v) { @@ -503,13 +503,13 @@ void IRPrinter::visit(const FreeExtPtr& v) { } void IRPrinter::visit(const PlacementAllocatePtr& v) { - os() << "Alias(" << *v->buf()->base_handle() << ',' + os() << "Alias(" << *v->buf()->base_handle() << "," << *v->buf_to_reuse()->base_handle() << ");"; } void IRPrinter::visit(const LetPtr& v) { - os() << dtypeToCppString(v->var()->dtype()) << ' ' << *v->var(); - os() << " = " << *v->value() << ';'; + os() << dtypeToCppString(v->var()->dtype()) << " " << *v->var(); + os() << " = " << *v->value() << ";"; } void IRPrinter::visit(const CondPtr& v) { @@ -530,7 +530,7 @@ void IRPrinter::visit(const CondPtr& v) { } void IRPrinter::visit(const AtomicAddPtr& v) { - os() << "atomicAdd(&" << *v->base_handle() << '['; + os() << "atomicAdd(&" << *v->base_handle() << "["; size_t i = 0; for (const ExprPtr& ind : v->indices()) { if (i++) { @@ -539,7 +539,7 @@ void IRPrinter::visit(const AtomicAddPtr& v) { ind->accept(this); } if (v->indices().empty()) { - os() << '0'; + os() << "0"; } os() << "], " << *v->value() << ");"; } @@ -549,7 +549,7 @@ void IRPrinter::visit(const SyncThreadsPtr& v) { } void IRPrinter::visit(const ExternalCallPtr& v) { - os() << *v->buf() << " = " << v->func_name() << '('; + os() << *v->buf() << " = " << v->func_name() << "("; os() << "buf_args={"; int i = 0; @@ -580,7 +580,7 @@ void IRPrinter::visit(const ExternalCallWithAllocPtr& v) { os() << *buf_out_arg; } - os() << " := " << v->func_name() << '('; + os() << " := " << v->func_name() << "("; os() << "buf_args={"; i = 0; @@ -657,7 +657,7 @@ void print(const ExprPtr& expr) { } else { std::cout << "(null expr)"; } - std::cout << '\n'; + std::cout << "\n"; } void print(const StmtPtr& stmt) { @@ -691,14 +691,14 @@ std::string to_string(const StmtPtr& stmt) { std::string to_string(const Tensor& t) { std::ostringstream oss; // TODO: move this to Buf printer - oss << "Tensor " << t.buf()->name_hint() << '['; + oss << "Tensor " << t.buf()->name_hint() << "["; for (const auto i : c10::irange(t.buf()->ndim())) { if (i != 0) { oss << ", "; } oss << *t.buf()->dim(i); } - oss << "]:\n" << *t.stmt() << '\n'; + oss << "]:\n" << *t.stmt() << "\n"; return oss.str(); } } // namespace std diff --git a/torch/csrc/jit/tensorexpr/loopnest.cpp b/torch/csrc/jit/tensorexpr/loopnest.cpp index 
cca7efcd0adaf..7f0888666d3af 100644 --- a/torch/csrc/jit/tensorexpr/loopnest.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest.cpp @@ -131,9 +131,9 @@ std::string sanitizeName(const std::string& input_name) { } else { if (i == 0) { // Don't start names with underscore - sanitized_name << 'v'; + sanitized_name << "v"; } - sanitized_name << '_'; + sanitized_name << "_"; } } return sanitized_name.str(); diff --git a/torch/csrc/jit/tensorexpr/loopnest_randomization.cpp b/torch/csrc/jit/tensorexpr/loopnest_randomization.cpp index 3dda98ff0faf6..46a09314fb7bf 100644 --- a/torch/csrc/jit/tensorexpr/loopnest_randomization.cpp +++ b/torch/csrc/jit/tensorexpr/loopnest_randomization.cpp @@ -733,7 +733,7 @@ void loopnestRandomization(int64_t seed, LoopNest& l) { } } catch (...) { std::cout << "EXCEPTION THROWN!\n"; - std::cout << "SEED: " << seed << '\n'; + std::cout << "SEED: " << seed << "\n"; throw std::runtime_error("Random test failed"); } message = "End of transformations;\n"; diff --git a/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp b/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp index bbd43f0fa8a8c..73a1c6a4a2d5a 100644 --- a/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp +++ b/torch/csrc/jit/tensorexpr/mem_dependency_checker.cpp @@ -151,7 +151,7 @@ bool AccessInfo::isWrite() const { } void AccessInfo::print() const { - std::cout << id_ << ". " << AccessToString(type_) << ": " << *var_ << '['; + std::cout << id_ << ". " << AccessToString(type_) << ": " << *var_ << "["; if (!bounds_.empty()) { for (size_t i = 0; i < bounds_.size() - 1; ++i) { bounds_[i].print(); @@ -161,30 +161,30 @@ void AccessInfo::print() const { size_t i = bounds_.size() - 1; bounds_[i].print(); } - std::cout << ']'; + std::cout << "]"; if (!dependencies_.empty()) { std::cout << " - depends on: "; for (auto& pair : dependencies_) { - std::cout << pair.second->id() << ' '; + std::cout << pair.second->id() << " "; } } if (!dependents_.empty()) { std::cout << " - dependents: "; for (auto& pair : dependents_) { - std::cout << pair.second.lock()->id() << ' '; + std::cout << pair.second.lock()->id() << " "; } } - std::cout << '\n'; + std::cout << "\n"; } void AccessInfo::dumpDOT(std::ostream& os) const { if (type_ == AccessType::Input || type_ == AccessType::Output || type_ == AccessType::Alloc) { - os << 'n' << id_ << " [\n"; - os << "label = \"" << AccessToString(type_) << "\\n " << *var_ << '['; + os << "n" << id_ << " [\n"; + os << "label = \"" << AccessToString(type_) << "\\n " << *var_ << "["; if (!bounds_.empty()) { for (size_t i = 0; i < bounds_.size() - 1; ++i) { os << *IRSimplifier::simplify( @@ -203,17 +203,17 @@ void AccessInfo::dumpDOT(std::ostream& os) const { os << "\tshape = \"house\"\n"; } } else { - os << 'n' << id_ << " [\n"; + os << "n" << id_ << " [\n"; os << "label = \"" << AccessToString(type_) << " (#" << id_ << ")\\n"; os << "buf : " << *var_ << "\\n"; os << "bounds : ["; if (!bounds_.empty()) { for (size_t i = 0; i < bounds_.size() - 1; ++i) { - os << '(' << *bounds_[i].start << ", " << *bounds_[i].end << "), "; + os << "(" << *bounds_[i].start << ", " << *bounds_[i].end << "), "; } size_t i = bounds_.size() - 1; - os << '(' << *bounds_[i].start << ", " << *bounds_[i].end << ")]"; + os << "(" << *bounds_[i].start << ", " << *bounds_[i].end << ")]"; } os << "\"\n"; os << "\tshape = \"box\"\n"; @@ -228,8 +228,8 @@ void AccessInfo::dumpDOT(std::ostream& os) const { } os << "]\n"; for (auto& pair : dependencies_) { - os << 'n' << pair.second->id() << " -> " << 'n' << id_ << " [color=\"" - << 
edgeColour << "\"]\n"; + os << "n" << pair.second->id() << " -> " + << "n" << id_ << " [color=\"" << edgeColour << "\"]\n"; } } diff --git a/torch/csrc/jit/tensorexpr/registerizer.cpp b/torch/csrc/jit/tensorexpr/registerizer.cpp index 9ad44e31a3873..37f79d529238d 100644 --- a/torch/csrc/jit/tensorexpr/registerizer.cpp +++ b/torch/csrc/jit/tensorexpr/registerizer.cpp @@ -131,17 +131,17 @@ std::shared_ptr AccessInfo::cloneWithHiddenInfo( } void AccessInfo::print() const { - std::cout << "Access: " << *buf_ << '{'; + std::cout << "Access: " << *buf_ << "{"; for (const auto& i : indices_) { - std::cout << *i << ' '; + std::cout << *i << " "; } std::cout << "} stores: " << stores_.size() << " (" << *store_cost_ << ") -"; - std::cout << " loads: " << loads_.size() << " (" << *load_cost_ << ')'; + std::cout << " loads: " << loads_.size() << " (" << *load_cost_ << ")"; if (conditionId_) { std::cout << " cond: " << conditionId_; } - std::cout << '\n'; + std::cout << "\n"; } // Scope diff --git a/torch/csrc/jit/tensorexpr/types.cpp b/torch/csrc/jit/tensorexpr/types.cpp index f3a62fa374056..0ee8fd4a956bb 100644 --- a/torch/csrc/jit/tensorexpr/types.cpp +++ b/torch/csrc/jit/tensorexpr/types.cpp @@ -57,7 +57,7 @@ Dtype ToDtype(ScalarType type) { TORCH_API std::ostream& operator<<(std::ostream& stream, const Dtype& dtype) { stream << dtype.scalar_type_; if (dtype.lanes() > 1) { - stream << 'x' << dtype.lanes(); + stream << "x" << dtype.lanes(); ; } return stream; diff --git a/torch/csrc/jit/testing/file_check.cpp b/torch/csrc/jit/testing/file_check.cpp index fb1280400a89d..aeac1233e4d23 100644 --- a/torch/csrc/jit/testing/file_check.cpp +++ b/torch/csrc/jit/testing/file_check.cpp @@ -116,7 +116,7 @@ size_t assertFind( const std::string& sub, const Check& check) { return assertFind(search_range, sub, [&](std::ostream& out) { - out << "From " << check << '\n'; + out << "From " << check << "\n"; }); } @@ -156,7 +156,7 @@ size_t assertFindRegex( const std::string& sub, const Check& check) { return assertFindRegex(search_range, sub, [&](std::ostream& out) { - out << "From " << check << '\n'; + out << "From " << check << "\n"; }); } @@ -182,7 +182,7 @@ void assertNotFind( c10::printQuotedString(ss, sub); ss << " but found it\n"; found_range.highlight(ss); - ss << "From " << check << '\n'; + ss << "From " << check << "\n"; throw std::runtime_error(ss.str()); } } @@ -543,7 +543,7 @@ FileCheck::FileCheck() : fcImpl(new FileCheckImpl()) {} std::ostream& operator<<(std::ostream& out, const FileCheckImpl& fc) { out << "FileCheck checks:\n"; for (const Check& c : fc.checks) { - out << '\t' << c << '\n'; + out << "\t" << c << "\n"; } return out; } diff --git a/torch/csrc/lazy/core/debug_util.cpp b/torch/csrc/lazy/core/debug_util.cpp index 2eb448e75f61d..3cc35c6d0cf05 100644 --- a/torch/csrc/lazy/core/debug_util.cpp +++ b/torch/csrc/lazy/core/debug_util.cpp @@ -77,7 +77,7 @@ std::string GetFirstUserFrameInPython() { auto& loc = frames[i - 1]; if (loc.file.find("site-packages") == std::string::npos) { std::stringstream ss; - ss << loc.file << ' ' << loc.function << ' ' << loc.line; + ss << loc.file << " " << loc.function << " " << loc.line; return ss.str(); } } @@ -120,7 +120,7 @@ std::string DebugUtil::GetTensorsGraphInfo( std::vector frames = GetPythonFramesFunction()(); ss << "Python Stacktrace:\n"; for (auto& location : frames) { - ss << " " << location.function << " (" << location.file << ':' + ss << " " << location.function << " (" << location.file << ":" << location.line << ")\n"; } ss << "\nHashes: ("; @@ 
-160,7 +160,7 @@ void DebugUtil::SaveTensorsGraphInfo( std::string info = GetTensorsGraphInfo(tensors, indices, format); std::lock_guard guard(lock); std::ofstream graph_file(save_file, std::ios_base::app); - graph_file << '[' << name << "]\n" << info << '\n'; + graph_file << "[" << name << "]\n" << info << "\n"; } } diff --git a/torch/csrc/lazy/core/ir.cpp b/torch/csrc/lazy/core/ir.cpp index 3cd25d2f5e85e..709b5b028b242 100644 --- a/torch/csrc/lazy/core/ir.cpp +++ b/torch/csrc/lazy/core/ir.cpp @@ -143,7 +143,7 @@ const Output& Node::nullable_operand(size_t i) const { std::string Node::ToString() const { std::stringstream ss; - ss << shapes() << ' ' << op(); + ss << shapes() << " " << op(); if (num_outputs() > 1) { ss << ", num_outputs=" << num_outputs(); } diff --git a/torch/csrc/lazy/core/ir_dump_util.cpp b/torch/csrc/lazy/core/ir_dump_util.cpp index b7f959682452c..3f33c4fce2246 100644 --- a/torch/csrc/lazy/core/ir_dump_util.cpp +++ b/torch/csrc/lazy/core/ir_dump_util.cpp @@ -137,7 +137,7 @@ std::string GenerateDotNodeLabel( std::stringstream ss; ss << node->op() << "\\n" << node->shape(); for (auto& tag : GetNodeTags(node)) { - ss << "\\n" << tag.name << '='; + ss << "\\n" << tag.name << "="; if (tag.value.size() < kMaxValueSize) { ss << tag.value; } else { @@ -155,27 +155,27 @@ std::string GenerateDotNodeSpec( const Node* node, const std::unordered_map& roots_ids) { std::stringstream ss; - ss << "label=\"" << GenerateDotNodeLabel(node, roots_ids) << '"'; + ss << "label=\"" << GenerateDotNodeLabel(node, roots_ids) << "\""; return ss.str(); } std::string GenerateTextNodeSpec(const Node* node, const NodeIdMap& id_map) { std::stringstream ss; - ss << node->shapes() << ' ' << node->op() << '('; + ss << node->shapes() << " " << node->op() << "("; size_t count = 0; for (auto& output : node->operands()) { if (count > 0) { ss << ", "; } - ss << '%' << id_map.at(output.node); + ss << "%" << id_map.at(output.node); if (output.node->num_outputs() > 1) { - ss << '.' << output.index; + ss << "." 
<< output.index; } ++count; } - ss << ')'; + ss << ")"; for (auto& tag : GetNodeTags(node)) { - ss << ", " << tag.name << '=' << tag.value; + ss << ", " << tag.name << "=" << tag.value; } return ss.str(); } @@ -214,7 +214,7 @@ std::string DumpUtil::PostOrderToDot( if (output.node->num_outputs() > 1) { ss << " [label=\"o=" << output.index << "\"]"; } - ss << '\n'; + ss << "\n"; } } } @@ -242,7 +242,7 @@ std::string DumpUtil::PostOrderToText( ss << ", ROOT=" << *opt_root_id; } ss << ", NodeType=" << typeid(*node).name(); - ss << '\n'; + ss << "\n"; } ss << "}\n"; return ss.str(); diff --git a/torch/csrc/lazy/core/ir_metadata.cpp b/torch/csrc/lazy/core/ir_metadata.cpp index 5da2860ed6cea..50aedaca0293b 100644 --- a/torch/csrc/lazy/core/ir_metadata.cpp +++ b/torch/csrc/lazy/core/ir_metadata.cpp @@ -16,8 +16,8 @@ void EmitShortFrameInfo( } else { ++pos; } - stream << ", location=" << frame.function << '@' << frame.file.substr(pos) - << ':' << frame.line; + stream << ", location=" << frame.function << "@" << frame.file.substr(pos) + << ":" << frame.line; } } @@ -26,7 +26,7 @@ std::ostream& operator<<( const std::vector& frames) { stream << "Frames:\n"; for (auto& location : frames) { - stream << " " << location.function << " (" << location.file << ':' + stream << " " << location.function << " (" << location.file << ":" << location.line << ")\n"; } return stream; diff --git a/torch/csrc/lazy/core/lazy_graph_executor.cpp b/torch/csrc/lazy/core/lazy_graph_executor.cpp index 413601f70afd4..c440357f9e16e 100644 --- a/torch/csrc/lazy/core/lazy_graph_executor.cpp +++ b/torch/csrc/lazy/core/lazy_graph_executor.cpp @@ -404,7 +404,7 @@ void LazyGraphExecutor::SyncLiveTensorsGraph( bool wait) { auto tensors = GetLiveTensors(device); VLOG(4) << tensors.size() << " live tensors: devices=(" - << c10::Join(", ", devices) << ')'; + << c10::Join(", ", devices) << ")"; SyncTensorsGraph(&tensors, devices, wait, /*sync_ltc_data=*/true); } diff --git a/torch/csrc/lazy/core/shape_inference.cpp b/torch/csrc/lazy/core/shape_inference.cpp index ada3a2fed1693..e7ab494d18e32 100644 --- a/torch/csrc/lazy/core/shape_inference.cpp +++ b/torch/csrc/lazy/core/shape_inference.cpp @@ -85,7 +85,7 @@ static std::vector expand_param_if_needed( std::ostringstream ss; ss << "expected " << param_name << " to be a single integer value or a " << "list of " << expected_dim << " values to match the convolution " - << "dimensions, but got " << param_name << '=' << list_param; + << "dimensions, but got " << param_name << "=" << list_param; TORCH_CHECK(false, ss.str()); } else { return list_param.vec(); diff --git a/torch/csrc/lazy/core/trie.cpp b/torch/csrc/lazy/core/trie.cpp index e0e657aae137e..a4a5d6f0c8b86 100644 --- a/torch/csrc/lazy/core/trie.cpp +++ b/torch/csrc/lazy/core/trie.cpp @@ -19,7 +19,7 @@ void TraverseTrie(TrieNode* node, std::stringstream& ss) { << ", " << node->hit_counter << " hits\"]\n"; } for (auto& successor : node->successors) { - ss << node->unique_id << " -> " << successor->unique_id << '\n'; + ss << node->unique_id << " -> " << successor->unique_id << "\n"; TraverseTrie(successor.get(), ss); } } diff --git a/torch/csrc/monitor/counters.h b/torch/csrc/monitor/counters.h index 046c63a78eddb..65a0f516a58d3 100644 --- a/torch/csrc/monitor/counters.h +++ b/torch/csrc/monitor/counters.h @@ -226,7 +226,7 @@ class Stat { for (auto& kv : stats) { std::stringstream key; key << name_; - key << '.'; + key << "."; key << aggregationName(kv.first); e.data[key.str()] = kv.second; } diff --git 
a/torch/csrc/profiler/kineto_shim.cpp b/torch/csrc/profiler/kineto_shim.cpp index 524b84070cbf6..ec9994e15ec9c 100644 --- a/torch/csrc/profiler/kineto_shim.cpp +++ b/torch/csrc/profiler/kineto_shim.cpp @@ -201,13 +201,13 @@ class ExperimentalConfigWrapper { for (size_t i = 0; i < num_metrics; i++) { configss << config_.profiler_metrics[i]; if (num_metrics > 1 && i < (num_metrics - 1)) { - configss << ','; + configss << ","; } } configss << "\nCUPTI_PROFILER_ENABLE_PER_KERNEL=" << (config_.profiler_measure_per_kernel ? "true" : "false") - << '\n'; - configss << "CUSTOM_CONFIG=" << config_.custom_profiler_config << '\n'; + << "\n"; + configss << "CUSTOM_CONFIG=" << config_.custom_profiler_config << "\n"; LOG(INFO) << "Generated config = " << configss.str(); libkineto::api().activityProfiler().prepareTrace( @@ -236,8 +236,8 @@ static const std::string setTraceID(const std::string& trace_id) { return ""; } std::stringstream configss; - configss << "REQUEST_TRACE_ID=" << trace_id << '\n'; - configss << "REQUEST_GROUP_TRACE_ID=" << trace_id << '\n'; + configss << "REQUEST_TRACE_ID=" << trace_id << "\n"; + configss << "REQUEST_GROUP_TRACE_ID=" << trace_id << "\n"; return configss.str(); } @@ -249,7 +249,7 @@ static const std::string appendCustomConfig( } std::stringstream configss; configss << config; - configss << "CUSTOM_CONFIG=" << custom_profiler_config << '\n'; + configss << "CUSTOM_CONFIG=" << custom_profiler_config << "\n"; return configss.str(); } #endif diff --git a/torch/csrc/profiler/standalone/execution_trace_observer.cpp b/torch/csrc/profiler/standalone/execution_trace_observer.cpp index 29b2b94af4472..5edc59c893d7a 100644 --- a/torch/csrc/profiler/standalone/execution_trace_observer.cpp +++ b/torch/csrc/profiler/standalone/execution_trace_observer.cpp @@ -279,7 +279,7 @@ static std::ofstream openOutputFile(const std::string& name) { std::ofstream stream; stream.open(name, std::ofstream::out | std::ofstream::trunc); if (!stream) { - LOG(ERROR) << "Failed to open '" << name << '\''; + LOG(ERROR) << "Failed to open '" << name << "'"; } else { VLOG(1) << "PyTorch Execution Trace: writing to " << name; } @@ -754,7 +754,7 @@ static void recordOperatorStart( RecordScope::USER_SCOPE), tid, 0); // fw_tid - ob.out << ','; + ob.out << ","; } } @@ -928,7 +928,7 @@ static void onFunctionExit(const RecordFunction& fn, ObserverContext* ctx_ptr) { fc.kernelFile, fc.get_string_for_tensor_range(), additiona_attrs); - ob->out << ','; + ob->out << ","; } } catch (const std::exception& e) { LOG(WARNING) << "Exception in execution trace observer: [" << fc.name @@ -977,7 +977,7 @@ bool addExecutionTraceObserver(const std::string& output_file_path) { // 5 is the length of ".json" ob.resourceDir.replace(ext_pos, 5, "_resources/"); VLOG(1) << "Execution trace resource directory: " << ob.resourceDir - << '\n'; + << "\n"; } else { LOG(WARNING) << "Execution trace output file does not end with \".json\"."; diff --git a/torch/csrc/profiler/stubs/cuda.cpp b/torch/csrc/profiler/stubs/cuda.cpp index f62afd8d1f303..b590b2d985d02 100644 --- a/torch/csrc/profiler/stubs/cuda.cpp +++ b/torch/csrc/profiler/stubs/cuda.cpp @@ -17,7 +17,7 @@ namespace { static void cudaCheck(cudaError_t result, const char* file, int line) { if (result != cudaSuccess) { std::stringstream ss; - ss << file << ':' << line << ": "; + ss << file << ":" << line << ": "; if (result == cudaErrorInitializationError) { // It is common for users to use DataLoader with multiple workers // and the autograd profiler. Throw a nice error message here. 
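[Editorial example, not part of the patch] The `cudaCheck` hunk above only changes how the `file:line:` prefix is streamed; the error-reporting pattern itself is the usual one where the call site forwards `__FILE__` and `__LINE__`. A minimal standalone sketch of that pattern — `checkStatus` and `CHECK_STATUS` are hypothetical names, the status is a plain `int` so the snippet compiles without CUDA headers:

```
#include <sstream>
#include <stdexcept>

// Hypothetical helper: build the same "file:line: message" prefix that the
// hunk above streams, then throw if the status code is non-zero.
static void checkStatus(int status, const char* file, int line, const char* what) {
  if (status != 0) {
    std::stringstream ss;
    ss << file << ":" << line << ": " << what << " failed with code " << status;
    throw std::runtime_error(ss.str());
  }
}

// Call sites pass __FILE__/__LINE__ through a macro so the prefix points at them.
#define CHECK_STATUS(expr) checkStatus((expr), __FILE__, __LINE__, #expr)
```

Note that streaming `':'` as a char or `":"` as a one-character string produces the same text; the hunks in this patch only switch between those two `operator<<` overloads.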
diff --git a/torch/csrc/profiler/unwind/action.h b/torch/csrc/profiler/unwind/action.h index 5a982cfd046a0..1a8373d9dfe14 100644 --- a/torch/csrc/profiler/unwind/action.h +++ b/torch/csrc/profiler/unwind/action.h @@ -40,16 +40,16 @@ struct Action { friend std::ostream& operator<<(std::ostream& out, const Action& self) { switch (self.kind) { case A_UNDEFINED: - out << 'u'; + out << "u"; break; case A_REG_PLUS_DATA: - out << 'r' << (int)self.reg << " + " << self.data; + out << "r" << (int)self.reg << " + " << self.data; break; case A_REG_PLUS_DATA_DEREF: - out << "*(r" << (int)self.reg << " + " << self.data << ')'; + out << "*(r" << (int)self.reg << " + " << self.data << ")"; break; case A_LOAD_CFA_OFFSET: - out << "*(cfa + " << self.data << ')'; + out << "*(cfa + " << self.data << ")"; break; } return out; diff --git a/torch/csrc/profiler/unwind/eh_frame_hdr.h b/torch/csrc/profiler/unwind/eh_frame_hdr.h index 5884685433b0d..740f4beb2c85c 100644 --- a/torch/csrc/profiler/unwind/eh_frame_hdr.h +++ b/torch/csrc/profiler/unwind/eh_frame_hdr.h @@ -81,7 +81,7 @@ struct EHFrameHdr { friend std::ostream& operator<<(std::ostream& out, const EHFrameHdr& self) { out << "EHFrameHeader(version=" << self.version_ << ",table_size=" << self.table_size_ - << ",fde_count=" << self.fde_count_ << ')'; + << ",fde_count=" << self.fde_count_ << ")"; return out; } diff --git a/torch/csrc/profiler/unwind/fde.h b/torch/csrc/profiler/unwind/fde.h index ffb06b5ab1f46..083578ec391e5 100644 --- a/torch/csrc/profiler/unwind/fde.h +++ b/torch/csrc/profiler/unwind/fde.h @@ -17,7 +17,7 @@ struct TableState { out << "cfa = " << self.cfa << "; "; for (auto r : c10::irange(self.registers.size())) { if (self.registers.at(r).kind != A_UNDEFINED) { - out << 'r' << r << " = " << self.registers.at(r) << "; "; + out << "r" << r << " = " << self.registers.at(r) << "; "; } } return out; @@ -110,21 +110,21 @@ struct FDE { auto previous_pc = current_pc_; current_pc_ += amount; if (LOG) { - (*out_) << (void*)(previous_pc - load_bias_) << '-' - << (void*)(current_pc_ - load_bias_) << ": " << state() << '\n'; + (*out_) << (void*)(previous_pc - load_bias_) << "-" + << (void*)(current_pc_ - load_bias_) << ": " << state() << "\n"; } } void advance_loc(int64_t amount) { if (LOG) { - (*out_) << "advance_loc " << amount << '\n'; + (*out_) << "advance_loc " << amount << "\n"; } advance_raw(amount * code_alignment_factor_); } void offset(int64_t reg, int64_t offset) { if (LOG) { - (*out_) << "offset " << reg << ' ' << offset << '\n'; + (*out_) << "offset " << reg << " " << offset << "\n"; } if (reg > (int64_t)state().registers.size()) { if (LOG) { @@ -138,7 +138,7 @@ struct FDE { void restore(int64_t reg) { if (LOG) { - (*out_) << "restore " << reg << '\n'; + (*out_) << "restore " << reg << "\n"; } if (reg > (int64_t)state().registers.size()) { if (LOG) { @@ -151,7 +151,7 @@ struct FDE { void def_cfa(int64_t reg, int64_t off) { if (LOG) { - (*out_) << "def_cfa " << reg << ' ' << off << '\n'; + (*out_) << "def_cfa " << reg << " " << off << "\n"; } last_reg_ = reg; last_offset_ = off; @@ -179,13 +179,13 @@ struct FDE { void undefined(int64_t reg) { if (LOG) { - (*out_) << "undefined " << reg << '\n'; + (*out_) << "undefined " << reg << "\n"; } state().registers.at(reg) = Action::undefined(); } void register_(int64_t reg, int64_t rhs_reg) { if (LOG) { - (*out_) << "register " << reg << ' ' << rhs_reg << '\n'; + (*out_) << "register " << reg << " " << rhs_reg << "\n"; } state().registers.at(reg) = Action::regPlusData(static_cast(reg), 0); @@ -214,7 
+214,7 @@ struct FDE { if (LOG) { // NOLINTNEXTLINE(performance-no-int-to-ptr) (*out_) << "readUpTo " << (void*)addr << " for " << library_name_ - << " at " << (void*)load_bias_ << '\n'; + << " at " << (void*)load_bias_ << "\n"; } state_stack_.emplace_back(); current_pc_ = low_pc_; @@ -245,8 +245,8 @@ struct FDE { } void dumpAddr2Line() { - std::cout << "addr2line -f -e " << library_name_ << ' ' - << (void*)(low_pc_ - load_bias_) << '\n'; + std::cout << "addr2line -f -e " << library_name_ << " " + << (void*)(low_pc_ - load_bias_) << "\n"; } void readInstruction(Lexer& L) { diff --git a/torch/csrc/profiler/unwind/unwind.cpp b/torch/csrc/profiler/unwind/unwind.cpp index db7e8a60e4a19..2b30df4e2a60e 100644 --- a/torch/csrc/profiler/unwind/unwind.cpp +++ b/torch/csrc/profiler/unwind/unwind.cpp @@ -354,7 +354,7 @@ struct Symbolizer { entry.queried.push_back(addr); auto libaddress = maybe_library->second - 1; // NOLINTNEXTLINE(performance-no-int-to-ptr) - entry.comm->out() << (void*)libaddress << '\n'; + entry.comm->out() << (void*)libaddress << "\n"; // we need to make sure we don't write more than 64k bytes to // a pipe before reading the results. Otherwise the buffer may // get filled and block before we read the results. diff --git a/torch/csrc/profiler/util.cpp b/torch/csrc/profiler/util.cpp index b547bc528da55..d266958e2cb63 100644 --- a/torch/csrc/profiler/util.cpp +++ b/torch/csrc/profiler/util.cpp @@ -145,7 +145,7 @@ std::vector callstackStr(const std::vector& cs) { cs_str.reserve(cs.size()); for (const auto& entry : cs) { std::stringstream loc; - loc << entry.filename << '(' << entry.line << "): " << entry.funcname; + loc << entry.filename << "(" << entry.line << "): " << entry.funcname; cs_str.push_back(loc.str()); } return cs_str; @@ -310,11 +310,11 @@ std::string ivalueToStr(const c10::IValue& val, bool isString) { } else { ss.str(""); if (isString) { - ss << '"'; + ss << "\""; } ss << val; if (isString) { - ss << '"'; + ss << "\""; } std::string mystr = ss.str(); @@ -934,7 +934,7 @@ int getTensorStartHint(const at::Tensor& t) { bool checkFunctionOutputsForLogging(const at::RecordFunction& fn) { const auto& outputs = fn.outputs(); auto num_outputs = fn.num_outputs(); - VLOG(2) << "outputs: " << num_outputs << ' ' << outputs.size() << '\n'; + VLOG(2) << "outputs: " << num_outputs << " " << outputs.size() << '\n'; // We have two cases: for unboxed kernel, we have num_outputs == // outputs.size() for boxed kernel using stack, there could be more elements // on the stack from previous ops. @@ -948,7 +948,7 @@ bool checkFunctionOutputsForLogging(const at::RecordFunction& fn) { bool checkFunctionInputsForLogging(const at::RecordFunction& fn) { auto num_inputs = fn.num_inputs(); const auto inputs = fn.inputs(); - VLOG(2) << "inputs: " << num_inputs << ' ' << inputs.size() << '\n'; + VLOG(2) << "inputs: " << num_inputs << " " << inputs.size() << '\n'; // We have two cases: for unboxed kernel, we have num_inputs == // inputs.size() for boxed kernel using stack, there could be more elements // on the stack from previous ops. diff --git a/torch/csrc/tensor/python_tensor.cpp b/torch/csrc/tensor/python_tensor.cpp index d4c810d95c608..ad418955e0559 100644 --- a/torch/csrc/tensor/python_tensor.cpp +++ b/torch/csrc/tensor/python_tensor.cpp @@ -218,7 +218,7 @@ static void py_initialize_tensor_type( static std::string get_name(Backend backend, ScalarType scalarType) { std::ostringstream ss; - ss << torch::utils::backend_to_string(backend) << '.' 
<< toString(scalarType) + ss << torch::utils::backend_to_string(backend) << "." << toString(scalarType) << "Tensor"; return ss.str(); } diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp index e89f7887320a0..79994eeb8621e 100644 --- a/torch/csrc/utils/python_arg_parser.cpp +++ b/torch/csrc/utils/python_arg_parser.cpp @@ -663,20 +663,20 @@ auto handle_torch_function_no_python_arg_parser( std::stringstream ss; ss << "Multiple dispatch failed for '"; if (module_name && func_name) { - ss << module_name << '.' << func_name; + ss << module_name << "." << func_name; } else { py::handle fn = torch_api_function; - ss << py::str(fn.attr("__module__")) << '.' + ss << py::str(fn.attr("__module__")) << "." << py::str(fn.attr("__name__")); } ss << "'; all " << torch_function_name_str << " handlers returned NotImplemented:\n\n"; if (mode_obj) { - ss << " - mode object " << py::repr(mode_obj) << '\n'; + ss << " - mode object " << py::repr(mode_obj) << "\n"; } for (auto& arg : overloaded_args) { ss << " - tensor subclass " << py::repr(get_type_of_overloaded_arg(arg)) - << '\n'; + << "\n"; } ss << "\nFor more information, try re-running with TORCH_LOGS=not_implemented"; const std::string& tmp = ss.str(); @@ -1542,7 +1542,7 @@ std::string FunctionSignature::toString() const { // optionals, etc. std::ostringstream ss; bool keyword_already = false; - ss << '('; + ss << "("; int i = 0; for (auto& param : params) { if (i != 0) { @@ -1552,13 +1552,13 @@ std::string FunctionSignature::toString() const { ss << "*, "; keyword_already = true; } - ss << param.type_name() << ' ' << param.name; + ss << param.type_name() << " " << param.name; if (param.optional) { ss << " = " << param.default_value; } i++; } - ss << ')'; + ss << ")"; return ss.str(); } diff --git a/torch/csrc/utils/python_dispatch.cpp b/torch/csrc/utils/python_dispatch.cpp index 3380bb0a13e57..f97b6ac0ba9b1 100644 --- a/torch/csrc/utils/python_dispatch.cpp +++ b/torch/csrc/utils/python_dispatch.cpp @@ -692,7 +692,7 @@ void initDispatchBindings(PyObject* module) { std::stringstream ss; ss << op.name; if (!op.overload_name.empty()) { - ss << '.' << op.overload_name; + ss << "." << op.overload_name; } names.emplace_back(std::move(ss).str()); } diff --git a/torch/csrc/utils/structseq.cpp b/torch/csrc/utils/structseq.cpp index 2e804aa44bad9..29d20d5a9bfe2 100644 --- a/torch/csrc/utils/structseq.cpp +++ b/torch/csrc/utils/structseq.cpp @@ -66,7 +66,7 @@ PyObject* returned_structseq_repr(PyStructSequence* obj) { ss << ",\n"; } } - ss << ')'; + ss << ")"; return PyUnicode_FromString(ss.str().c_str()); } diff --git a/torch/csrc/utils/tensor_types.cpp b/torch/csrc/utils/tensor_types.cpp index c46baea82a442..d696a0cdf4ddd 100644 --- a/torch/csrc/utils/tensor_types.cpp +++ b/torch/csrc/utils/tensor_types.cpp @@ -66,14 +66,14 @@ const char* backend_to_string(const at::Backend& backend) { std::string options_to_string(const at::TensorOptions& options) { std::ostringstream ss; - ss << backend_to_string(options.backend()) << '.' + ss << backend_to_string(options.backend()) << "." << toString(at::typeMetaToScalarType(options.dtype())) << "Tensor"; return ss.str(); } std::string type_to_string(const at::DeprecatedTypeProperties& type) { std::ostringstream ss; - ss << backend_to_string(type.backend()) << '.' << toString(type.scalarType()) + ss << backend_to_string(type.backend()) << "." 
<< toString(type.scalarType()) << "Tensor"; return ss.str(); } diff --git a/torch/csrc/xpu/Module.cpp b/torch/csrc/xpu/Module.cpp index ba5998ba3d3ce..b3d1dd929a216 100644 --- a/torch/csrc/xpu/Module.cpp +++ b/torch/csrc/xpu/Module.cpp @@ -367,7 +367,7 @@ static void registerXpuDeviceProperties(PyObject* module) { << ", sub_group_sizes=[" << prop.sub_group_sizes << "], has_fp16=" << prop.has_fp16 << ", has_fp64=" << prop.has_fp64 - << ", has_atomic64=" << prop.has_atomic64 << ')'; + << ", has_atomic64=" << prop.has_atomic64 << ")"; return stream.str(); }); } diff --git a/torch/nativert/executor/OpKernel.cpp b/torch/nativert/executor/OpKernel.cpp index fa628733804a4..ee4a8503d5ce2 100644 --- a/torch/nativert/executor/OpKernel.cpp +++ b/torch/nativert/executor/OpKernel.cpp @@ -65,7 +65,7 @@ std::string readableArgs( } else { ss << arg; } - ss << '\n'; + ss << "\n"; } return ss.str(); } diff --git a/torch/nativert/executor/memory/FunctionSchema.cpp b/torch/nativert/executor/memory/FunctionSchema.cpp index 80347dad2965a..264ed702cbc0d 100644 --- a/torch/nativert/executor/memory/FunctionSchema.cpp +++ b/torch/nativert/executor/memory/FunctionSchema.cpp @@ -11,8 +11,8 @@ bool FunctionSchema::alias(size_t input_idx, size_t output_idx) const { } } - VLOG(1) << "checking aliasing spec for " << c10_fn_schema_.name() << ' ' - << (c10_fn_schema_.is_varret() ? "varret" : "non-varret") << ' ' + VLOG(1) << "checking aliasing spec for " << c10_fn_schema_.name() << " " + << (c10_fn_schema_.is_varret() ? "varret" : "non-varret") << " " << (c10_fn_schema_.is_vararg() ? "vararg" : "non-vararg"); if (!aliasing_spec_.empty()) { diff --git a/torch/nativert/graph/Graph.cpp b/torch/nativert/graph/Graph.cpp index 47d082f44332f..260af58a2a492 100644 --- a/torch/nativert/graph/Graph.cpp +++ b/torch/nativert/graph/Graph.cpp @@ -1031,7 +1031,7 @@ std::ostream& operator<<(std::ostream& out, const Constant& constant) { } else if constexpr (is_same_v) { out << kLayoutPrefix << arg; } else if constexpr (is_same_v) { - out << kDevicePrefix << '{' << arg << '}'; + out << kDevicePrefix << "{" << arg << "}"; } else if constexpr (is_same_v>) { out << fmt::format("[{}]", fmt::join(arg, ",")); } else if constexpr (is_same_v>) { @@ -1054,16 +1054,16 @@ void printValue(std::ostream& out, const Value* v) { } void printNamedArgument(std::ostream& out, const NamedArgument& nv) { - out << nv.name << '=' << *nv.value; + out << nv.name << "=" << *nv.value; } void printAttribute(std::ostream& out, const Attribute& nv) { - out << nv.name << '=' << nv.value; + out << nv.name << "=" << nv.value; } } // namespace std::ostream& operator<<(std::ostream& out, const Value& v) { - out << '%' << v.name(); + out << "%" << v.name(); // If a list, distinguish it by adding a [] // Looks like %my_list[] if (v.type() == Type::Kind::TensorList) { @@ -1085,14 +1085,14 @@ std::ostream& operator<<(std::ostream& out, const Node& node) { printList(out, false, node.inputs(), [](std::ostream& out, const auto& nv) { out << *nv.value; }); - out << ')'; + out << ")"; return out; } printList(out, false, node.outputs_, printValue); out << " = "; - out << node.target_ << '('; + out << node.target_ << "("; printList(out, false, node.inputs_, printNamedArgument); if (!node.inputs_.empty() && !node.attributes_.empty()) { // Emit a connective ',' between inputs and attributes. 
@@ -1100,13 +1100,13 @@ std::ostream& operator<<(std::ostream& out, const Node& node) { } printList(out, false, node.attributes_, printAttribute); - out << ')'; + out << ")"; return out; } std::ostream& operator<<(std::ostream& out, const Graph& graph) { for (const auto& node : graph.nodes_) { - out << node << '\n'; + out << node << "\n"; } return out; } diff --git a/torch/nativert/graph/GraphSignature.cpp b/torch/nativert/graph/GraphSignature.cpp index 569fff36a945c..cd07af807198f 100644 --- a/torch/nativert/graph/GraphSignature.cpp +++ b/torch/nativert/graph/GraphSignature.cpp @@ -313,7 +313,7 @@ GraphSignature::GraphSignature(const torch::_export::GraphSignature& storage) { } if (FLAGS_caffe2_log_level > 2) { - std::cout << *this << '\n'; + std::cout << *this << "\n"; } } @@ -401,14 +401,14 @@ std::ostream& operator<<(std::ostream& out, const GraphSignature& sig) { if (!sig.inputsToParameters().empty()) { out << "inputsToParameters: {\n"; for (const auto& [inputName, paramName] : sig.inputsToParameters()) { - out << '\t' << inputName << " : " << paramName << '\n'; + out << "\t" << inputName << " : " << paramName << "\n"; } out << "}\n"; } if (!sig.inputsToBuffers().empty()) { out << "inputsToBuffers: {\n"; for (const auto& [inputName, bufferName] : sig.inputsToBuffers()) { - out << '\t' << inputName << " : " << bufferName << '\n'; + out << "\t" << inputName << " : " << bufferName << "\n"; } out << "}\n"; } @@ -416,28 +416,28 @@ std::ostream& operator<<(std::ostream& out, const GraphSignature& sig) { out << "inputsToTensorConstants: {\n"; for (const auto& [inputName, tensorConstantName] : sig.inputsToTensorConstants()) { - out << '\t' << inputName << " : " << tensorConstantName << '\n'; + out << "\t" << inputName << " : " << tensorConstantName << "\n"; } out << "}\n"; } if (!sig.inputsToCustomObjs().empty()) { out << "inputsToCustomObjs: {\n"; for (const auto& [inputName, customObjName] : sig.inputsToCustomObjs()) { - out << '\t' << inputName << " : " << customObjName << '\n'; + out << "\t" << inputName << " : " << customObjName << "\n"; } out << "}\n"; } if (!sig.userOutputs().empty()) { out << "userOutputs: {\n"; for (const auto& outputName : sig.userOutputs()) { - out << '\t' << outputName.value_or("Constant") << '\n'; + out << "\t" << outputName.value_or("Constant") << "\n"; } out << "}\n"; } if (!sig.buffersToMutate().empty()) { out << "buffersToMutate: {\n"; for (const auto& [outputName, mutatedBufferName] : sig.buffersToMutate()) { - out << '\t' << outputName << " : " << mutatedBufferName << '\n'; + out << "\t" << outputName << " : " << mutatedBufferName << "\n"; } out << "}\n"; } @@ -445,7 +445,7 @@ std::ostream& operator<<(std::ostream& out, const GraphSignature& sig) { out << "userInputsToMutate: {\n"; for (const auto& [outputName, mutatedUserInputName] : sig.userInputsToMutate()) { - out << '\t' << outputName << " : " << mutatedUserInputName << '\n'; + out << "\t" << outputName << " : " << mutatedUserInputName << "\n"; } out << "}\n"; } @@ -453,7 +453,7 @@ std::ostream& operator<<(std::ostream& out, const GraphSignature& sig) { if (!sig.gradientsToParameters().empty()) { out << "gradientsToParameters: {\n"; for (const auto& [outputName, paramName] : sig.gradientsToParameters()) { - out << '\t' << outputName << " : " << paramName << '\n'; + out << "\t" << outputName << " : " << paramName << "\n"; } out << "}\n"; } @@ -461,11 +461,11 @@ std::ostream& operator<<(std::ostream& out, const GraphSignature& sig) { out << "gradientsToUserInputs: {\n"; for (const auto& [outputName, 
userInputName] : sig.gradientsToUserInputs()) { - out << '\t' << outputName << " : " << userInputName << '\n'; + out << "\t" << outputName << " : " << userInputName << "\n"; } out << "}\n"; } - out << "lossOutput: " << sig.lossOutput() << '\n'; + out << "lossOutput: " << sig.lossOutput() << "\n"; } return out; } diff --git a/torch/nativert/graph/passes/pass_manager/PassManager.cpp b/torch/nativert/graph/passes/pass_manager/PassManager.cpp index 4dbb0012877d8..e023f223ed6f1 100644 --- a/torch/nativert/graph/passes/pass_manager/PassManager.cpp +++ b/torch/nativert/graph/passes/pass_manager/PassManager.cpp @@ -35,7 +35,7 @@ bool GraphPassManager::run_pass(Graph* graph, const GraphPassIdentifier& name) { bool GraphPassManager::pass_pre_run_hook(Graph* graph, const GraphPass& pass) { if (opts_.logGraphBetweenPasses()) { - LOG(INFO) << "Before pass: " << pass.name() << '\n' + LOG(INFO) << "Before pass: " << pass.name() << "\n" << graph->toString() << "-------------------------"; } return false; @@ -43,7 +43,7 @@ bool GraphPassManager::pass_pre_run_hook(Graph* graph, const GraphPass& pass) { bool GraphPassManager::pass_post_run_hook(Graph* graph, const GraphPass& pass) { if (opts_.logGraphBetweenPasses()) { - LOG(INFO) << "After pass: " << pass.name() << '\n' + LOG(INFO) << "After pass: " << pass.name() << "\n" << graph->toString() << "-------------------------"; } return false; From a4c7bf7e8ddde51b2c4e53bb6e7c8985b59cfec0 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Mon, 17 Nov 2025 16:48:57 +0000 Subject: [PATCH 25/47] Revert "Use c10::filesystem (#167821)" This reverts commit deabb3e36de207aa497b035a8bdf6ec1b37d17fe. Reverted https://github.com/pytorch/pytorch/pull/167821 on behalf of https://github.com/jeanschmidt due to Breaks internal tests, see D87148810. @Skylion007 may you help the author to get this PR merged? ([comment](https://github.com/pytorch/pytorch/pull/167821#issuecomment-3542877623)) --- c10/util/Exception.cpp | 3 +- c10/util/Logging.cpp | 4 +- .../aoti_package/model_package_loader.cpp | 130 ++++++++++++++++-- torch/csrc/jit/jit_log.cpp | 11 +- torch/csrc/jit/jit_opt_limit.cpp | 7 +- 5 files changed, 131 insertions(+), 24 deletions(-) diff --git a/c10/util/Exception.cpp b/c10/util/Exception.cpp index cccdb28607141..1928c2c175c7b 100644 --- a/c10/util/Exception.cpp +++ b/c10/util/Exception.cpp @@ -1,5 +1,4 @@ #include -#include #include #include @@ -28,7 +27,7 @@ Error::Error( const void* caller) : Error( str("[enforce fail at ", - c10::filesystem::path(file).filename(), + detail::StripBasename(file), ":", line, "] ", diff --git a/c10/util/Logging.cpp b/c10/util/Logging.cpp index b95eaec9d3ebb..4bf96b1b6808a 100644 --- a/c10/util/Logging.cpp +++ b/c10/util/Logging.cpp @@ -1,5 +1,4 @@ #include -#include #include #include #include @@ -479,7 +478,8 @@ MessageLogger::MessageLogger( << std::setfill('0') << " " << std::setw(2) << timeinfo->tm_hour << ":" << std::setw(2) << timeinfo->tm_min << ":" << std::setw(2) << timeinfo->tm_sec << "." << std::setw(9) << ns << " " - << c10::filesystem::path(file).filename() << ":" << line << "] "; + << c10::detail::StripBasename(std::string(file)) << ":" << line + << "] "; } // Output the contents of the stream to the proper channel on destruction. 
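[Editorial example, not part of the patch] The two c10 hunks above restore `detail::StripBasename` in place of `c10::filesystem::path(file).filename()` for the `file:line` prefix used in errors and log lines. As a rough sketch of what basename-stripping means here — illustrative only, not the actual `c10::detail::StripBasename` implementation:

```
#include <string>

// Illustrative only: keep the part of the path after the last '/' or '\\',
// so a log line reads "Logging.cpp:123" instead of carrying the full build path.
std::string stripBasename(const std::string& full_path) {
  const size_t pos = full_path.find_last_of("/\\");
  return pos == std::string::npos ? full_path : full_path.substr(pos + 1);
}
```

Either spelling should yield the same prefix in practice; the revert swaps the implementation back without intending to change the logged format.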
diff --git a/torch/csrc/inductor/aoti_package/model_package_loader.cpp b/torch/csrc/inductor/aoti_package/model_package_loader.cpp index 188f92557761d..05d7aa04425f5 100644 --- a/torch/csrc/inductor/aoti_package/model_package_loader.cpp +++ b/torch/csrc/inductor/aoti_package/model_package_loader.cpp @@ -1,6 +1,5 @@ #if !defined(C10_MOBILE) && !defined(ANDROID) -#include #include #include #include @@ -31,6 +30,8 @@ namespace fs = std::filesystem; #include #include #include +#define access _access +#define F_OK 0 #else #include #include @@ -78,6 +79,15 @@ std::string normalize_path_separator(const std::string& orig_path) { return normalized_path; } +bool file_exists(const std::string& path) { +#ifdef _WIN32 + return fs::exists(path); +#else + struct stat rc{}; + return lstat(path.c_str(), &rc) == 0; +#endif +} + std::string create_temp_dir() { #ifdef _WIN32 try { @@ -145,8 +155,7 @@ namespace torch::inductor { namespace { const nlohmann::json& load_json_file(const std::string& json_path) { - TORCH_CHECK( - c10::filesystem::exists(json_path), "File not found: ", json_path); + TORCH_CHECK(file_exists(json_path), "File not found: ", json_path); std::ifstream json_file(json_path); TORCH_CHECK(json_file.is_open()); @@ -283,6 +292,102 @@ std::tuple get_cpp_compile_command( return std::make_tuple(cmd, target_file); } +bool recursive_mkdir(const std::string& dir) { + // Creates directories recursively, copied from jit_utils.cpp + // Check if current dir exists + const char* p_dir = dir.c_str(); + const bool dir_exists = (access(p_dir, F_OK) == 0); + if (dir_exists) { + return true; + } + + // Try to create current directory +#ifdef _WIN32 + int ret = _mkdir(dir.c_str()); +#else + int ret = mkdir(dir.c_str(), S_IRWXU | S_IRWXG | S_IRWXO); +#endif + // Success + if (ret == 0) { + return true; + } + + // Find folder separator and check if we are at the top + auto pos = dir.find_last_of(k_separator); + if (pos == std::string::npos) { + return false; + } + + // Try to create parent directory + if (!(recursive_mkdir(dir.substr(0, pos)))) { + return false; + } + + // Try to create complete path again +#ifdef _WIN32 + ret = _mkdir(dir.c_str()); +#else + ret = mkdir(dir.c_str(), S_IRWXU | S_IRWXG | S_IRWXO); +#endif + return ret == 0; +} + +bool recursive_rmdir(const std::string& path) { +#ifdef _WIN32 + std::error_code ec; + return fs::remove_all(path, ec) != static_cast(-1); +#else + DIR* dir = opendir(path.c_str()); + if (!dir) { + return false; + } + + struct dirent* entry = nullptr; + struct stat statbuf{}; + bool success = true; + + // Iterate through directory entries + while ((entry = readdir(dir)) != nullptr) { + std::string name = entry->d_name; + + // Skip "." and ".." + if (name == "." 
|| name == "..") { + continue; + } + + std::string full_path = path; + full_path.append("/").append(name); + + // Get file status + if (stat(full_path.c_str(), &statbuf) != 0) { + success = false; + continue; + } + + if (S_ISDIR(statbuf.st_mode)) { + // Recursively delete subdirectory + if (!recursive_rmdir(full_path)) { + success = false; + } + } else { + // Delete file + if (unlink(full_path.c_str()) != 0) { + success = false; + } + } + } + + closedir(dir); + + // Remove the directory itself + if (rmdir(path.c_str()) != 0) { + success = false; + } + + return success; +#endif +} + std::string compile_so( const std::string& cpp_filename, std::vector& obj_filenames) { @@ -312,7 +417,7 @@ std::string compile_so( // Move the mmapped weights onto the .so std::string serialized_weights_path = filename + "_serialized_weights.bin"; - if (c10::filesystem::exists(serialized_weights_path)) { + if (file_exists(serialized_weights_path)) { std::ifstream serialized_weights_file( serialized_weights_path, std::ios::binary); TORCH_CHECK( @@ -534,13 +639,11 @@ std::unordered_map AOTIModelPackageLoader:: parent_path_idx != std::string::npos, "Failed to find parent path in " + output_path_str); std::string parent_path = output_path_str.substr(0, parent_path_idx); - std::error_code ec{}; - c10::filesystem::create_directories(parent_path, ec); TORCH_CHECK( - ec.value() == 0, + recursive_mkdir(parent_path), "Failed to create directory " + parent_path, ": ", - ec.message()); + c10::utils::str_error(errno)); LOG(INFO) << "Extract file: " << metadata_filename << " to " << output_path_str; @@ -554,7 +657,7 @@ std::unordered_map AOTIModelPackageLoader:: metadata[item.key()] = item.value().get(); } // Clean up temporary directory - c10::filesystem::remove_all(temp_dir, ec); + recursive_rmdir(temp_dir); return metadata; } @@ -646,13 +749,11 @@ AOTIModelPackageLoader::AOTIModelPackageLoader( "Failed to find parent path in " + output_file_path); std::string parent_path = output_file_path.substr(0, parent_path_idx); - std::error_code ec{}; - c10::filesystem::create_directories(parent_path, ec); TORCH_CHECK( - ec.value() == 0, + recursive_mkdir(parent_path), "Failed to create directory " + parent_path, ": ", - ec.message()); + c10::utils::str_error(errno)); // Extracts file to the temp directory zip_archive.extract_file(zip_filename_str, output_path_str); @@ -731,8 +832,7 @@ AOTIModelPackageLoader::AOTIModelPackageLoader( AOTIModelPackageLoader::~AOTIModelPackageLoader() { // Clean up the temporary directory if (!temp_dir_.empty()) { - std::error_code ec; - c10::filesystem::remove_all(temp_dir_, ec); + recursive_rmdir(temp_dir_); } } diff --git a/torch/csrc/jit/jit_log.cpp b/torch/csrc/jit/jit_log.cpp index 8adf4c8aab10c..745d397f593c0 100644 --- a/torch/csrc/jit/jit_log.cpp +++ b/torch/csrc/jit/jit_log.cpp @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include #include @@ -113,7 +113,12 @@ void JitLoggingConfig::parse() { bool is_enabled(const char* cfname, JitLoggingLevels level) { const auto& files_to_levels = JitLoggingConfig::getInstance().getFilesToLevels(); - const auto fname_no_ext = c10::filesystem::path(cfname).stem().string(); + std::string fname{cfname}; + fname = c10::detail::StripBasename(fname); + const auto end_index = fname.find_last_of('.') == std::string::npos + ? 
fname.size() + : fname.find_last_of('.'); + const auto fname_no_ext = fname.substr(0, end_index); const auto it = files_to_levels.find(fname_no_ext); if (it == files_to_levels.end()) { @@ -156,7 +161,7 @@ std::string jit_log_prefix( std::stringstream prefix_ss; prefix_ss << "["; prefix_ss << level << " "; - prefix_ss << c10::filesystem::path(fn).filename() << ":"; + prefix_ss << c10::detail::StripBasename(std::string(fn)) << ":"; prefix_ss << std::setfill('0') << std::setw(3) << l; prefix_ss << "] "; diff --git a/torch/csrc/jit/jit_opt_limit.cpp b/torch/csrc/jit/jit_opt_limit.cpp index 385cbe4acdc95..c4c1a2307659f 100644 --- a/torch/csrc/jit/jit_opt_limit.cpp +++ b/torch/csrc/jit/jit_opt_limit.cpp @@ -1,10 +1,11 @@ +#include #include #include #include #include #include -#include +#include #include #include #include @@ -56,7 +57,9 @@ bool opt_limit(const char* pass_name) { static const std::unordered_map passes_to_opt_limits = parseJITOptLimitOption(opt_limit.value()); - auto pass = c10::filesystem::path(pass_name).stem().string(); + std::string pass{pass_name}; + pass = c10::detail::StripBasename(pass); + pass = c10::detail::ExcludeFileExtension(pass); auto opt_limit_it = passes_to_opt_limits.find(pass); if (opt_limit_it == passes_to_opt_limits.end()) { From 094e529c64bdf80671bcd7c214e32a9379ef4a3a Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 17 Nov 2025 06:36:15 -0800 Subject: [PATCH 26/47] [MPS] Fix repeat_interleave with slices (#167961) Alas, one can not use `repeat_interleave_common` for MPS tensors, as `data_offset` is not a valid pointer to `id` On the other hand, one does not need to use `AT_DISPATCH_INDEX_TYPES` as dispatching is happening on the shader side Fixes https://github.com/pytorch/pytorch/issues/167924 Pull Request resolved: https://github.com/pytorch/pytorch/pull/167961 Approved by: https://github.com/manuelcandales --- aten/src/ATen/native/mps/operations/Repeat.mm | 46 +++++++++---------- test/test_mps.py | 8 ++++ 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/aten/src/ATen/native/mps/operations/Repeat.mm b/aten/src/ATen/native/mps/operations/Repeat.mm index 40afa15b4f700..f350b0137b05e 100644 --- a/aten/src/ATen/native/mps/operations/Repeat.mm +++ b/aten/src/ATen/native/mps/operations/Repeat.mm @@ -91,26 +91,31 @@ Tensor repeat_mps(const Tensor& self, IntArrayRef repeats) { #include #endif -template -void computeRepeatIndices(const index_t* repeat_ptr, - const int64_t* cumsum_ptr, - index_t* result_ptr, - int64_t size, - int64_t result_size) { - id repeatBuffer = reinterpret_cast>(repeat_ptr); - id cumsumBuffer = reinterpret_cast>(cumsum_ptr); - id resultBuffer = reinterpret_cast>(result_ptr); - TORCH_CHECK(repeatBuffer && cumsumBuffer && resultBuffer); - +Tensor repeat_interleave_mps(const Tensor& repeat, std::optional output_size) { + TORCH_CHECK(repeat.dim() == 1, "repeat_interleave only accept 1D vector as repeat"); std::string scalar_type; - if constexpr (std::is_same_v) { + if (repeat.scalar_type() == kInt) { scalar_type = "int32_t"; - } else if constexpr (std::is_same_v) { + } else if (repeat.scalar_type() == kLong) { scalar_type = "int64_t"; } else { - TORCH_CHECK(false, "repeat_interleave: unsupported indexing data type"); + TORCH_CHECK(false, "repeats has to be Long or Int tensor"); + } + if (repeat.size(0) == 0) { + return at::empty_like(repeat, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + Tensor repeat_ = repeat.contiguous(); + Tensor cumsum = repeat.cumsum(0); + int64_t total = 0; + if (output_size.has_value()) { + total = 
output_size.value(); + } else { + total = cumsum[-1].item(); + TORCH_CHECK((repeat >= 0).all().item(), "repeats can not be negative"); } + auto result = at::empty({total}, repeat.options()); + MPSStream* mpsStream = getCurrentMPSStream(); dispatch_sync(mpsStream->queue(), ^() { @autoreleasepool { @@ -121,20 +126,13 @@ void computeRepeatIndices(const index_t* repeat_ptr, getMPSProfiler().beginProfileKernel(pipelineState, "repeat_interleave:" + scalar_type, false); [computeEncoder setComputePipelineState:pipelineState]; - mps::mtl_setArgs(computeEncoder, repeatBuffer, cumsumBuffer, resultBuffer, size); - mps::mtl_dispatch1DJob(computeEncoder, pipelineState, size); + mps::mtl_setArgs(computeEncoder, repeat_, cumsum, result, repeat.size(0)); + mps::mtl_dispatch1DJob(computeEncoder, pipelineState, repeat.size(0)); getMPSProfiler().endProfileKernel(pipelineState); } }); -} - -Tensor repeat_interleave_mps(const Tensor& repeat, std::optional output_size) { - Tensor output; - AT_DISPATCH_INDEX_TYPES(repeat.scalar_type(), "repeat_interleave_mps", [&]() { - output = repeat_interleave_common>(repeat, output_size); - }); - return output; + return result; } } // namespace at::native diff --git a/test/test_mps.py b/test/test_mps.py index f78af8437060b..a84ac7d355169 100644 --- a/test/test_mps.py +++ b/test/test_mps.py @@ -3332,6 +3332,14 @@ def helper(shape, dtype=torch.float32, num_repeats=torch.Tensor(), dim=None): helper(shape=(10, 15, 8), num_repeats=torch.randint(0, 100, (15, ), device="mps"), dim=1) helper(shape=(10, 15, 30), num_repeats=torch.randint(0, 100, (30, ), device="mps"), dim=2) + def test_repeat_interleave_offset(self): + # Regression test for https://github.com/pytorch/pytorch/issues/167924 + counts = torch.tensor([0, 1, 0], device="mps") + data = torch.arange(2, device="mps") + out_mps = data.repeat_interleave(counts[1:], dim=0) + out_cpu = data.cpu().repeat_interleave(counts.cpu()[1:], dim=0) + self.assertEqual(out_mps.cpu(), out_cpu) + def test_count_nonzero(self): def helper(dtype): n = [ From 95d1df7d4e604fecea1f6977345217781c2cd990 Mon Sep 17 00:00:00 2001 From: Simon Layton Date: Mon, 17 Nov 2025 05:11:13 -0800 Subject: [PATCH 27/47] Disable CUDA MXFP4 on non-B200 GPUs (#167857) Summary: MXFP4 unit tests pass on B200, fail on RTX 5090 - disable non-B200 cases. Also add a fail w/a not implemented error for non-B200 to avoid unhelpful failure messages. 
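
For reviewers skimming the diff, the CUDA-side change boils down to a compute-capability guard. A minimal sketch, condensed from the `_check_mxfp4_support()` helper added in `ScaledBlas.cpp` below (exact wrapping and message text differ; a CUDA build is assumed, the ROCm path keeps its own handling):

```cpp
#include <ATen/cuda/CUDAContext.h>
#include <c10/util/Exception.h>

// Reject MXFP4 scaled-mm on anything that is not a compute-capability 10.x
// device (B200 = 10.0, B300 = 10.3), so users get a clear "not implemented"
// error instead of a cryptic kernel failure or numerical mismatch.
static void check_mxfp4_support() {
  const auto* props = at::cuda::getCurrentDeviceProperties();
  TORCH_CHECK_NOT_IMPLEMENTED(
      props->major == 10,
      "MXFP4 scaling only supported in CUDA for B200/B300");
}
```

The Python tests are gated the same way, skipping MXFP4 cases when `SM120OrLater` (consumer Blackwell, e.g. RTX 5090) is detected.
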
Test Plan: ``` pytest -sv -k "mxfp4" test/test_scaled_matmul_cuda.py ``` Reviewers: @nWEIdia Subscribers: Tasks: Fixes https://github.com/pytorch/pytorch/issues/167850 Tags: Signed-off-by: Simon Layton Pull Request resolved: https://github.com/pytorch/pytorch/pull/167857 Approved by: https://github.com/nWEIdia, https://github.com/malfet --- aten/src/ATen/native/cuda/ScaledBlas.cpp | 14 ++++++++++++++ test/test_scaled_matmul_cuda.py | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/aten/src/ATen/native/cuda/ScaledBlas.cpp b/aten/src/ATen/native/cuda/ScaledBlas.cpp index ac12b812c0670..4ff61f71f2b61 100644 --- a/aten/src/ATen/native/cuda/ScaledBlas.cpp +++ b/aten/src/ATen/native/cuda/ScaledBlas.cpp @@ -1101,6 +1101,19 @@ _scaled_mxfp8_mxfp8( return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out); } +void +_check_mxfp4_support() { +#ifndef USE_ROCM + auto dprops = at::cuda::getCurrentDeviceProperties(); + // Only on B200 GPUs + TORCH_CHECK_NOT_IMPLEMENTED( + // B200 = 10.0, B300 = 10.3 + dprops->major == 10, + "MXFP4 scaling only supported in CUDA for B200/B300" + ); +#endif +} + Tensor& _scaled_mxfp4_mxfp4( @@ -1113,6 +1126,7 @@ _scaled_mxfp4_mxfp4( #if defined(_WIN32) || (!defined(USE_ROCM) && !defined(USE_FBGEMM_GENAI)) TORCH_CHECK_NOT_IMPLEMENTED(false, "MXFP4 scaling supported on ROCM and CUDA+FBGEMM_GENAI only"); #else + _check_mxfp4_support(); // Restrictions: // A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32 TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2 && mat_b.scalar_type() == at::kFloat4_e2m1fn_x2, "mat_a and mat_b must be fp4 types, got: ", diff --git a/test/test_scaled_matmul_cuda.py b/test/test_scaled_matmul_cuda.py index 531629b082d92..94d6ece0f6369 100644 --- a/test/test_scaled_matmul_cuda.py +++ b/test/test_scaled_matmul_cuda.py @@ -20,6 +20,7 @@ PLATFORM_SUPPORTS_MX_GEMM, PLATFORM_SUPPORTS_MXFP8_GROUPED_GEMM, SM100OrLater, + SM120OrLater, SM89OrLater, SM90OrLater, with_tf32_off, @@ -53,6 +54,7 @@ _IS_SM8X = False + if TEST_CUDA: _IS_SM8X = torch.cuda.get_device_capability(0)[0] == 8 @@ -736,6 +738,10 @@ def test_float8_scale(self, device) -> None: @parametrize("format", ["mxfp8"] + (["nvfp4", "mxfp4"] if torch.version.cuda else [])) def test_mxfp8_nvfp4_scaled_grouped_mm_2d_2d(self, G, M, N, K, format): torch.manual_seed(42) + + if format == "mxfp4" and SM120OrLater: + raise unittest.SkipTest("MXFP4 on CUDA only supported on B200/B300") + total_K = K # Alias for clarity, communicating this consists of several groups along this dim input_group_end_offsets = generate_jagged_offs( G, total_K, multiple_of=32, device="cuda" @@ -799,6 +805,10 @@ def test_mxfp8_nvfp4_scaled_grouped_mm_2d_2d(self, G, M, N, K, format): @parametrize("format", ["mxfp8"] + (["nvfp4", "mxfp4"] if torch.version.cuda else [])) def test_mxfp8_scaled_grouped_mm_2d_3d(self, G, M, N, K, format): torch.manual_seed(42) + + if format == "mxfp4" and SM120OrLater: + raise unittest.SkipTest("MXFP4 on CUDA only supported on B200/B300") + # Simulate 2d-3d grouped gemm `out = input @ weight.t()` # 2D inputs with groups along M, 3D weights. 
block_size = 32 @@ -1879,6 +1889,8 @@ def test_blockwise_mxfp8_nvfp4_mxfp4_numerics(self, test_case_name, fast_accum, raise unittest.SkipTest("nvfp4 not supported on ROCm, skipping") if (recipe == "nvfp4" or recipe == "mxfp4") and fast_accum: raise unittest.SkipTest("fast_accum not supported in nvfp4/mxfp4 cublas gemm, skipping") + if recipe == "mxfp4" and SM120OrLater: + raise unittest.SkipTest("MXFP4 on CUDA only supported on B200/B300") device = "cuda" M, K, N = mkn @@ -2099,6 +2111,8 @@ def test_blockwise_mxfp8_nvfp4_mxfp4_numerics(self, test_case_name, fast_accum, @unittest.skipIf(not PLATFORM_SUPPORTS_MX_GEMM or IS_WINDOWS, mx_skip_msg) @parametrize("recipe", ["mxfp8", "mxfp4" if torch.version.hip else "nvfp4"]) def test_blockwise_mxfp8_nvfp4_error_messages(self, device, recipe) -> None: + if recipe == "mxfp4" and SM120OrLater: + raise unittest.SkipTest("MXFP4 on CUDA only supported on B200/B300") M, K, N = (1024, 512, 2048) BLOCK_SIZE_K = 16 if recipe == "nvfp4" else 32 BLOCK_SIZE_MN = 128 From 77acc66df917a2b9f6305d089ac88b8975786552 Mon Sep 17 00:00:00 2001 From: Xinya Zhang Date: Mon, 17 Nov 2025 17:17:25 +0000 Subject: [PATCH 28/47] [ROCm][CI] Upgrade ROCm CI to 7.1 (#166743) Upgrade all the ROCm docker images to ROCm 7.1 release version. Pull Request resolved: https://github.com/pytorch/pytorch/pull/166743 Approved by: https://github.com/atalman, https://github.com/jeffdaily Co-authored-by: Jeff Daily Co-authored-by: Prachi Gupta --- .ci/docker/build.sh | 2 +- .ci/docker/common/install_rocm.sh | 18 ++++++++------- .ci/docker/common/install_rocm_magma.sh | 4 ++-- cmake/External/aotriton.cmake | 3 +++ .../_composable/fsdp/test_fully_shard_comm.py | 22 ++++++++++++++----- .../c10d/symm_mem/CUDASymmetricMemory.cu | 4 ++++ 6 files changed, 37 insertions(+), 16 deletions(-) diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 203ab597a75bc..b7e61115e37d6 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -188,7 +188,7 @@ case "$tag" in fi GCC_VERSION=11 VISION=yes - ROCM_VERSION=7.0 + ROCM_VERSION=7.1 NINJA_VERSION=1.9.0 TRITON=yes KATEX=yes diff --git a/.ci/docker/common/install_rocm.sh b/.ci/docker/common/install_rocm.sh index 9376d259d9cca..988347e28e9d8 100644 --- a/.ci/docker/common/install_rocm.sh +++ b/.ci/docker/common/install_rocm.sh @@ -60,14 +60,16 @@ EOF DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev fi - # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5 - # search for all unversioned packages - # if search fails it will abort this script; use true to avoid case where search fails - MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true) - if [[ "x${MIOPENHIPGFX}" = x ]]; then - echo "miopen-hip-gfx package not available" && exit 1 - else - DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX} + if [[ $(ver $ROCM_VERSION) -lt $(ver 7.1) ]]; then + # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5, removed in ROCm 7.1 + # search for all unversioned packages + # if search fails it will abort this script; use true to avoid case where search fails + MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . 
|| true) + if [[ "x${MIOPENHIPGFX}" = x ]]; then + echo "miopen-hip-gfx package not available" && exit 1 + else + DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX} + fi fi # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime diff --git a/.ci/docker/common/install_rocm_magma.sh b/.ci/docker/common/install_rocm_magma.sh index 2d03c6186b8e5..9bf45e6f1b0a9 100644 --- a/.ci/docker/common/install_rocm_magma.sh +++ b/.ci/docker/common/install_rocm_magma.sh @@ -12,8 +12,8 @@ function do_install() { rocm_version_nodot=${rocm_version//./} - # post merge of https://github.com/icl-utk-edu/magma/pull/65 - MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f + # https://github.com/icl-utk-edu/magma/pull/65 + MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2" rocm_dir="/opt/rocm" diff --git a/cmake/External/aotriton.cmake b/cmake/External/aotriton.cmake index b19f25609cad3..21369c2981c31 100644 --- a/cmake/External/aotriton.cmake +++ b/cmake/External/aotriton.cmake @@ -15,12 +15,14 @@ if(NOT __AOTRITON_INCLUDED) "manylinux_2_28" # rocm6.3 "manylinux_2_28" # rocm6.4 "manylinux_2_28" # rocm7.0 + "manylinux_2_28" # rocm7.1 ) set(__AOTRITON_ROCM_LIST "rocm6.2" "rocm6.3" "rocm6.4" "rocm7.0" + "rocm7.1" ) set(__AOTRITON_CI_COMMIT "972223c501ffc22068bb035ac5d64cf54318d895") set(__AOTRITON_SHA256_LIST @@ -28,6 +30,7 @@ if(NOT __AOTRITON_INCLUDED) "72a153549ea20707331e8a1f1e3d1b8de2913f9d5af2b900c56235d578b57efe" # rocm6.3 "c7f319dd7448cbbbab81889dd8a37d47dbc25ebcbd89760f09e6a0904e556393" # rocm6.4 "a2a974e0ad929a5e5827c0f896c59bda4872459cbaf8dd8e0a00407f404491cf" # rocm7.0 + "d4eb24c9f1a0cfedb35f9292efb41d16589cf5a4b98c3c0940181bbefc49d722" # rocm7.1 ) set(__AOTRITON_IMAGE_LIST "amd-gfx90a" diff --git a/test/distributed/_composable/fsdp/test_fully_shard_comm.py b/test/distributed/_composable/fsdp/test_fully_shard_comm.py index 44000e761d8a0..ad3064608960d 100644 --- a/test/distributed/_composable/fsdp/test_fully_shard_comm.py +++ b/test/distributed/_composable/fsdp/test_fully_shard_comm.py @@ -59,7 +59,12 @@ patch_reshard, patch_unshard, ) -from torch.testing._internal.common_utils import run_tests, TEST_XPU, xfailIf +from torch.testing._internal.common_utils import ( + run_tests, + TEST_WITH_ROCM, + TEST_XPU, + xfailIf, +) from torch.testing._internal.distributed._tensor.common_dtensor import ( FeedForward, ModelArgs, @@ -1658,10 +1663,17 @@ def test_exception_when_used_together_with_comm_hooks(self): class TestFullyShardForceSumReduction(FSDPTest): # The messages might change when we move to a different NCCL version. # Please update this test if it starts failing. 
- COLLECTIVE_RE = ( - "NCCL INFO {coll}: opCount [0-9a-f]+ sendbuff 0x[0-9a-f]+ recvbuff 0x[0-9a-f]+ " - "count {count} datatype [0-9]+ op {reduce_op} root [0-9]+ comm 0x[0-9a-f]+" - ) + + if TEST_WITH_ROCM and torch.cuda.nccl.version()[:2] >= (2, 27): + COLLECTIVE_RE = ( + r"NCCL INFO {coll}: opCount [0-9a-f]+ sendbuff 0x[0-9a-f]+ recvbuff 0x[0-9a-f]+ acc \(nil\) " + "count {count} datatype [0-9]+ op {reduce_op} root [0-9]+ comm 0x[0-9a-f]+" + ) + else: + COLLECTIVE_RE = ( + "NCCL INFO {coll}: opCount [0-9a-f]+ sendbuff 0x[0-9a-f]+ recvbuff 0x[0-9a-f]+ " + "count {count} datatype [0-9]+ op {reduce_op} root [0-9]+ comm 0x[0-9a-f]+" + ) # See here for the numerical values for each reduction op: # https://github.com/NVIDIA/nccl/blob/72d2432094d6ae36abd6e511c3a16a2d052dbf94/src/nccl.h.in#L260-L275 SUM_REDUCTION = 0 diff --git a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu index 4523333c7fad4..6352330c3872c 100644 --- a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu +++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu @@ -699,7 +699,11 @@ c10::intrusive_ptr make_symm_mem( #elif defined(USE_ROCM) C10_HIP_CHECK(hipMemImportFromShareableHandle( &handles[r], +#if ROCM_VERSION >= 70100 + reinterpret_cast(static_cast(imported_handles[r])), +#else (void*)(uintptr_t) & (imported_handles[r]), +#endif hipMemHandleTypePosixFileDescriptor)); #else TORCH_CHECK( From 567dcdba757aebd92b8d2b4b1604d66f55eb5e02 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 14 Nov 2025 11:23:08 -0800 Subject: [PATCH 29/47] Fix longstanding race condition around getAllOperatorsFor (#167860) getAllOperatorsFor returns a const reference to internal state that is protected by a lock. Presuming that the lock is necessary in the first place (about which I offer no opinion because it's unclear to what extent the GIL should help here), this is a straightforward way to cause callers to create race conditions. This should fix those race conditions by copying the state instead. I modified calling code to stop binding a const reference to the result for clarity. Differential Revision: [D87088731](https://our.internmc.facebook.com/intern/diff/D87088731/) **NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D87088731/)! 
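
For context, the hazard being fixed is the classic "reference escapes the critical section" pattern. A minimal sketch of the before/after shape (names simplified, not the actual registry code):

```cpp
#include <memory>
#include <mutex>
#include <vector>

struct Operator {};  // stand-in for torch::jit::Operator

struct Registry {
  std::mutex lock;
  std::vector<std::shared_ptr<Operator>> operators;

  // Before: the lock only protects the lookup itself. The returned reference
  // is read by the caller after the guard is destroyed, so iteration can race
  // with a concurrent registration that mutates `operators`.
  const std::vector<std::shared_ptr<Operator>>& getUnsafe() {
    std::lock_guard<std::mutex> guard(lock);
    return operators;
  }

  // After: the copy is made while the lock is still held, so the caller owns
  // an immutable snapshot and cannot observe later mutations.
  std::vector<std::shared_ptr<Operator>> getSafe() {
    std::lock_guard<std::mutex> guard(lock);
    return operators;
  }
};
```

Copying a vector of `shared_ptr`s is cheap relative to dispatch, and call sites that previously bound `const auto&` to the result now just take the value.
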
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167860 Approved by: https://github.com/zou3519 --- test/cpp/jit/test_custom_operators.cpp | 13 ++-- test/custom_operator/test_custom_ops.cpp | 2 +- torch/csrc/jit/frontend/schema_matching.cpp | 2 +- torch/csrc/jit/ir/alias_analysis.cpp | 2 +- torch/csrc/jit/ir/ir.cpp | 2 +- torch/csrc/jit/python/init.cpp | 6 +- torch/csrc/jit/runtime/operator.cpp | 70 +++++++++++-------- torch/csrc/jit/runtime/operator.h | 5 +- .../jit/runtime/symbolic_shape_registry.cpp | 2 +- 9 files changed, 58 insertions(+), 46 deletions(-) diff --git a/test/cpp/jit/test_custom_operators.cpp b/test/cpp/jit/test_custom_operators.cpp index 58f87717844de..66295d0380629 100644 --- a/test/cpp/jit/test_custom_operators.cpp +++ b/test/cpp/jit/test_custom_operators.cpp @@ -15,7 +15,7 @@ namespace jit { TEST(CustomOperatorTest, InferredSchema) { torch::RegisterOperators reg( "foo::bar", [](double a, at::Tensor b) { return a + b; }); - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar")); + auto ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar")); ASSERT_EQ(ops.size(), 1); auto& op = ops.front(); @@ -43,8 +43,7 @@ TEST(CustomOperatorTest, ExplicitSchema) { "foo::bar_with_schema(float a, Tensor b) -> Tensor", [](double a, at::Tensor b) { return a + b; }); - auto& ops = - getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema")); + auto ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema")); ASSERT_EQ(ops.size(), 1); auto& op = ops.front(); @@ -77,7 +76,7 @@ TEST(CustomOperatorTest, ListParameters) { torch::List> complexdoubles, torch::List tensors) { return floats; }); - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists")); + auto ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists")); ASSERT_EQ(ops.size(), 1); auto& op = ops.front(); @@ -123,7 +122,7 @@ TEST(CustomOperatorTest, ListParameters2) { "foo::lists2(Tensor[] tensors) -> Tensor[]", [](torch::List tensors) { return tensors; }); - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2")); + auto ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2")); ASSERT_EQ(ops.size(), 1); auto& op = ops.front(); @@ -213,7 +212,7 @@ TEST(TestCustomOperator, OperatorGeneratorUndeclared) { }, aliasAnalysisFromSchema())}); - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist")); + auto ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist")); ASSERT_EQ(ops.size(), 0); } @@ -232,7 +231,7 @@ TEST(TestCustomOperator, OperatorGeneratorBasic) { }, aliasAnalysisFromSchema())}); - auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar")); + auto ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar")); ASSERT_EQ(ops.size(), 1); auto& op = ops.front(); diff --git a/test/custom_operator/test_custom_ops.cpp b/test/custom_operator/test_custom_ops.cpp index a526bebd26144..9791006d1498f 100644 --- a/test/custom_operator/test_custom_ops.cpp +++ b/test/custom_operator/test_custom_ops.cpp @@ -22,7 +22,7 @@ void check_all_parameters( template Result get_operator_from_registry_and_execute(const char* op_name, Args&&... 
args) { - auto& ops = torch::jit::getAllOperatorsFor( + auto ops = torch::jit::getAllOperatorsFor( torch::jit::Symbol::fromQualString(op_name)); TORCH_INTERNAL_ASSERT(ops.size() == 1); diff --git a/torch/csrc/jit/frontend/schema_matching.cpp b/torch/csrc/jit/frontend/schema_matching.cpp index f191c7daf6e26..8e0d94b59acab 100644 --- a/torch/csrc/jit/frontend/schema_matching.cpp +++ b/torch/csrc/jit/frontend/schema_matching.cpp @@ -679,7 +679,7 @@ Value* emitBuiltinCall( at::ArrayRef args, at::ArrayRef kwargs, const std::optional& self) { - const auto& variants = getAllOperatorsFor(name); + auto variants = getAllOperatorsFor(name); const auto& builtin_functions = getAllBuiltinFunctionsFor(name); // first let's set the graph's version diff --git a/torch/csrc/jit/ir/alias_analysis.cpp b/torch/csrc/jit/ir/alias_analysis.cpp index 16edf669da9be..f1353bd3103cc 100644 --- a/torch/csrc/jit/ir/alias_analysis.cpp +++ b/torch/csrc/jit/ir/alias_analysis.cpp @@ -617,7 +617,7 @@ void AliasDb::analyzeImpl(Node* node) { oss << input->type()->str() << ", "; } oss << "\n\nCandidates:"; - const auto& candidates = getAllOperatorsFor(node->kind()); + auto candidates = getAllOperatorsFor(node->kind()); for (const auto& candidate : candidates) { oss << "\n\t" << candidate->schema(); } diff --git a/torch/csrc/jit/ir/ir.cpp b/torch/csrc/jit/ir/ir.cpp index 4368b3c8191d8..6febed3540526 100644 --- a/torch/csrc/jit/ir/ir.cpp +++ b/torch/csrc/jit/ir/ir.cpp @@ -1088,7 +1088,7 @@ const FunctionSchema* Node::maybeSchema() const { const Operator* Node::maybeOperator() const { if (!op_) { - const auto& candidates = getAllOperatorsFor(kind()); + auto candidates = getAllOperatorsFor(kind()); for (const auto& candidate : candidates) { if (matches(candidate->schema())) { op_ = candidate.get(); diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index beb6f89519804..f7d855a515789 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -1693,7 +1693,7 @@ void initJITBindings(PyObject* module) { [](const std::string& op_name, const std::string& overload_name) { try { auto symbol = Symbol::fromQualString(op_name); - const auto& operations = getAllOperatorsFor(symbol); + auto operations = getAllOperatorsFor(symbol); for (const auto& op : operations) { if (op->schema().overload_name() == overload_name) { return op->schema(); @@ -1714,7 +1714,7 @@ void initJITBindings(PyObject* module) { const std::string& overload_name) -> std::optional { try { auto symbol = Symbol::fromQualString(op_name); - const auto& operations = getAllOperatorsFor(symbol); + auto operations = getAllOperatorsFor(symbol); bool allow_numbers_as_tensors = opAllowsNumbersAsTensors(symbol); for (const auto& op : operations) { if (op->schema().overload_name() == overload_name) { @@ -2138,7 +2138,7 @@ void initJITBindings(PyObject* module) { m.def("_jit_get_custom_class_schemas", customClassSchemasForBCCheck); m.def("_jit_get_schemas_for_operator", [](const std::string& qualified_name) { auto symbol = Symbol::fromQualString(qualified_name); - const auto& operations = getAllOperatorsFor(symbol); + auto operations = getAllOperatorsFor(symbol); return fmap(operations, [](const std::shared_ptr& op) { return op->schema(); }); diff --git a/torch/csrc/jit/runtime/operator.cpp b/torch/csrc/jit/runtime/operator.cpp index 35dead2a395c9..6f9dec70cddc9 100644 --- a/torch/csrc/jit/runtime/operator.cpp +++ b/torch/csrc/jit/runtime/operator.cpp @@ -53,6 +53,16 @@ struct OperatorRegistry { to_register.clear(); } + const std::vector>& 
getOperatorsWithLockHeld( + Symbol name) { + registerPendingOperators(); + static std::vector> empty; + auto it = operators.find(name); + if (it != operators.end()) + return it->second; + return empty; + } + public: void registerOperator(Operator&& op) { std::lock_guard guard(lock); @@ -143,14 +153,35 @@ struct OperatorRegistry { return it->second; } - const std::vector>& getOperators(Symbol name) { + // This function returns internal lock-protected state. We need to + // copy it to avoid race conditions. + std::vector> getOperators(Symbol name) { std::lock_guard guard(lock); - registerPendingOperators(); - static std::vector> empty; - auto it = operators.find(name); - if (it != operators.end()) - return it->second; - return empty; + return getOperatorsWithLockHeld(name); + } + + std::vector> getSortedOperators(Symbol name) { + std::lock_guard guard(lock); + const auto& unsortedOps = getOperatorsWithLockHeld(name); + // Depending on the order of registration, aten or jit ops may be + // registered first. This sorting is helpful in cases where + // deterministic (i.e. not dependent on build config) behavior is + // desired; e.g. torch.ops.aten.* uses this function, and tries to + // find the "first" op that matches input args. Without the sorting, + // the "first" op may change depending on registration order. + std::vector> sortedOps; + sortedOps.reserve(unsortedOps.size()); + std::copy_if( + unsortedOps.begin(), + unsortedOps.end(), + std::back_inserter(sortedOps), + [](const std::shared_ptr& op) { return op->isC10Op(); }); + std::copy_if( + unsortedOps.begin(), + unsortedOps.end(), + std::back_inserter(sortedOps), + [](const std::shared_ptr& op) { return !op->isC10Op(); }); + return sortedOps; } std::vector findSimilarOperators(Symbol input_op) { @@ -387,35 +418,16 @@ void deregisterOperator(const FunctionSchema& schema) { getRegistry().deregisterOperator(schema); } -const std::vector> getAllOperators() { +std::vector> getAllOperators() { return getRegistry().getAllOperators(); } -const std::vector>& getAllOperatorsFor(Symbol name) { +std::vector> getAllOperatorsFor(Symbol name) { return getRegistry().getOperators(name); } std::vector> getAllSortedOperatorsFor(Symbol name) { - const auto& unsortedOps = getAllOperatorsFor(name); - // Depending on the order of registration, aten or jit ops may be - // registered first. This sorting is helpful in cases where - // deterministic (i.e. not dependent on build config) behavior is - // desired; e.g. torch.ops.aten.* uses this function, and tries to - // find the "first" op that matches input args. Without the sorting, - // the "first" op may change depending on registration order. 
- std::vector> sortedOps; - sortedOps.reserve(unsortedOps.size()); - std::copy_if( - unsortedOps.begin(), - unsortedOps.end(), - std::back_inserter(sortedOps), - [](const std::shared_ptr& op) { return op->isC10Op(); }); - std::copy_if( - unsortedOps.begin(), - unsortedOps.end(), - std::back_inserter(sortedOps), - [](const std::shared_ptr& op) { return !op->isC10Op(); }); - return sortedOps; + return getRegistry().getSortedOperators(name); } std::shared_ptr findOperatorFor(const c10::OperatorName& full_name) { diff --git a/torch/csrc/jit/runtime/operator.h b/torch/csrc/jit/runtime/operator.h index bde3825f5ea38..6b6972deeebf0 100644 --- a/torch/csrc/jit/runtime/operator.h +++ b/torch/csrc/jit/runtime/operator.h @@ -260,8 +260,9 @@ struct TORCH_API Operator { TORCH_API std::string canonicalSchemaString(const FunctionSchema& schema); -TORCH_API const std::vector> getAllOperators(); -TORCH_API const std::vector>& getAllOperatorsFor( +TORCH_API std::vector> getAllOperators(); +// This function returns a copy for thread safety. +TORCH_API std::vector> getAllOperatorsFor( Symbol name); // Returns operators in the order which OpOverloadPacket resolves them. TORCH_API std::vector> getAllSortedOperatorsFor( diff --git a/torch/csrc/jit/runtime/symbolic_shape_registry.cpp b/torch/csrc/jit/runtime/symbolic_shape_registry.cpp index 74f87e46757ea..b1f0f410f14fe 100644 --- a/torch/csrc/jit/runtime/symbolic_shape_registry.cpp +++ b/torch/csrc/jit/runtime/symbolic_shape_registry.cpp @@ -79,7 +79,7 @@ auto compilation_unit = std::make_shared(); const std::optional getInplaceVariant( const FunctionSchema& base_schema) { - auto& inplace_variants = + auto inplace_variants = getAllOperatorsFor(c10::Symbol::fromQualString(base_schema.name() + "_")); for (const auto& variant : inplace_variants) { From 2f3bb7482cef6eb462abedd2fc72330e0eb4d129 Mon Sep 17 00:00:00 2001 From: Adrian Abeyta Date: Mon, 17 Nov 2025 17:54:24 +0000 Subject: [PATCH 30/47] Improve benchmarks/dynamo:check_perf_csv output and failure summary (#161728) Resolves https://github.com/pytorch/pytorch/issues/161290 ## Summary Expands `dynamo/check_perf_csv.py` output capabilities with latency, compile time and memory information: - Display's measured speedup and display % from target - Added clear messaging for all passing model tests when no regression is found - Added error handling if csv file is missing ### Example (Failing Check) ```bash python benchmarks/dynamo/check_perf_csv.py -f reports-dir/inductor_training_smoketest.csv -t 1.40 ``` **Example Output:** ``` Checking inductor_training_smoketest.csv (speedup threshold >= 1.40x) hf_Bert speedup=1.005x, latency=390.8 ms/iter, compile=1.526s, mem_ratio=1.02x (eager=360.6 GB, dynamo=369.3 GB) Error 1 model(s) performance regressed hf_Bert - hf_Bert: 1.005x (< 1.40x; -28.2% from target) ``` ### Example (Passing Check) ```bash python benchmarks/dynamo/check_perf_csv.py -f reports-dir/inductor_training_smoketest.csv -t 1.40 ``` **Example Output:** ``` Checking inductor_training_smoketest.csv (speedup threshold >= 1.00x) hf_Bert speedup=1.005x, latency=390.8 ms/iter, compile=1.526s, mem_ratio=1.02x (eager=360.6 GB, dynamo=369.3 GB) All 1 model(s) passed threshold check (>= 1.00x) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/161728 Approved by: https://github.com/isuruf --- benchmarks/dynamo/check_perf_csv.py | 49 ++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/benchmarks/dynamo/check_perf_csv.py b/benchmarks/dynamo/check_perf_csv.py 
index 320a4544f829b..08070dda4444c 100644 --- a/benchmarks/dynamo/check_perf_csv.py +++ b/benchmarks/dynamo/check_perf_csv.py @@ -9,28 +9,61 @@ def check_perf_csv(filename, threshold, threshold_scale): """ Basic performance checking. """ + try: + df = pd.read_csv(filename) + except FileNotFoundError: + print(f"Error: File {filename} not found") + sys.exit(1) - df = pd.read_csv(filename) + effective_threshold = threshold * threshold_scale + print(f"Checking {filename} (speedup threshold >= {effective_threshold:.2f}x)\n") failed = [] for _, row in df.iterrows(): model_name = row["name"] - speedup = row["speedup"] - if speedup < threshold * threshold_scale: - failed.append(model_name) + speedup = float(row["speedup"]) + abs_latency = float(row["abs_latency"]) + compilation_latency = float(row["compilation_latency"]) + compression_ratio = float(row["compression_ratio"]) + eager_peak_mem = float(row["eager_peak_mem"]) + dynamo_peak_mem = float(row["dynamo_peak_mem"]) + + perf_summary = f"{model_name:34} speedup={speedup:.3f}x" + if pd.notna(abs_latency): + perf_summary += f", latency={abs_latency:.1f} ms/iter" + if pd.notna(compilation_latency): + perf_summary += f", compile={compilation_latency:.3f}s" + if pd.notna(compression_ratio): + perf_summary += f", mem_ratio={1 / compression_ratio:.2f}x" + if pd.notna(eager_peak_mem) and pd.notna(dynamo_peak_mem): + perf_summary += ( + f" (eager={eager_peak_mem:.1f} GB, dynamo={dynamo_peak_mem:.1f} GB)" + ) + + if speedup < effective_threshold: + failed.append((model_name, speedup)) - print(f"{model_name:34} {speedup}") + print(perf_summary) if failed: print( textwrap.dedent( f""" - Error {len(failed)} models performance regressed - {" ".join(failed)} + Error {len(failed)} model(s) performance regressed + {" ".join([name for name, _ in failed])} """ ) ) + for name, sp in sorted(failed, key=lambda x: x[1]): + pct_from_target = (sp / effective_threshold - 1.0) * 100.0 + print( + f" - {name}: {sp:.3f}x (< {effective_threshold:.2f}x; {pct_from_target:.1f}% from target)" + ) sys.exit(1) + else: + print( + f"\nAll {len(df)} model(s) passed threshold check (>= {effective_threshold:.2f}x)" + ) if __name__ == "__main__": @@ -44,7 +77,7 @@ def check_perf_csv(filename, threshold, threshold_scale): "-s", type=float, default=1.0, - help="multiple threshold by this value to relax the check", + help="multiply threshold by this value to relax the check", ) args = parser.parse_args() check_perf_csv(args.file, args.threshold, args.threshold_scale) From ae3ce54f27adfc5b580cc08be0690f4a595f2a61 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Mon, 17 Nov 2025 17:59:42 +0000 Subject: [PATCH 31/47] Revert "[ROCm] Enable StaticCudaLauncher for ROCm (#166492)" This reverts commit 99fdca8f4d856cc52eb39d5e70be73dbd48228f8. 
Reverted https://github.com/pytorch/pytorch/pull/166492 on behalf of https://github.com/jeanschmidt due to Internally we still depends on the old logic, so we need to find a way to maintain backwards compatibility, for now ([comment](https://github.com/pytorch/pytorch/pull/166492#issuecomment-3543198811)) --- test/inductor/test_ck_backend.py | 1 - test/inductor/test_codecache.py | 9 +- test/inductor/test_static_cuda_launcher.py | 21 +++- .../_inductor/runtime/static_cuda_launcher.py | 55 ++-------- torch/_inductor/runtime/triton_heuristics.py | 11 +- torch/csrc/Module.cpp | 2 +- torch/csrc/inductor/static_cuda_launcher.cpp | 102 ++---------------- torch/csrc/inductor/static_cuda_launcher.h | 2 +- 8 files changed, 41 insertions(+), 162 deletions(-) diff --git a/test/inductor/test_ck_backend.py b/test/inductor/test_ck_backend.py index 405e46d8ded52..079be79fcc9d8 100644 --- a/test/inductor/test_ck_backend.py +++ b/test/inductor/test_ck_backend.py @@ -235,7 +235,6 @@ def mm(a, b): Y_eager = a @ b torch.testing.assert_close(Y_compiled, Y_eager, equal_nan=True) - @unittest.skip("Autotune Mismatch being investigated") @unittest.skipIf(not torch.version.hip, "ROCM only") @unittest.mock.patch.dict(os.environ, _test_env) @parametrize("max_autotune_gemm_backends", ("CK", "ATen,Triton,CK")) diff --git a/test/inductor/test_codecache.py b/test/inductor/test_codecache.py index c1b41fd8ec5c3..4b9030b5cae4b 100644 --- a/test/inductor/test_codecache.py +++ b/test/inductor/test_codecache.py @@ -479,17 +479,14 @@ def test_remote_cache_load_function( if device == GPU_TYPE and not HAS_GPU: raise unittest.SkipTest(f"requires {GPU_TYPE}") - if ( - device == "cuda" - and torch.version.hip is None - and dtype == torch.bfloat16 - and not SM80OrLater - ): + if device == "cuda" and dtype == torch.bfloat16 and not SM80OrLater: raise unittest.SkipTest("requires SM80 or later") if use_static_cuda_launcher and not (device == "cuda" and bundle_triton): raise unittest.SkipTest( "Static cuda launcher requires cuda and triton bundling" ) + if use_static_cuda_launcher and TEST_WITH_ROCM: + raise unittest.SkipTest("Static cuda launcher doesn't work with ROCM") def fn(x, y): return (x * 2, y @ y) diff --git a/test/inductor/test_static_cuda_launcher.py b/test/inductor/test_static_cuda_launcher.py index ec9586197d085..654bfd269f761 100644 --- a/test/inductor/test_static_cuda_launcher.py +++ b/test/inductor/test_static_cuda_launcher.py @@ -12,6 +12,7 @@ from torch._inductor.runtime.triton_compat import CompiledKernel, tl, triton from torch._inductor.runtime.triton_helpers import libdevice from torch._inductor.test_case import TestCase +from torch.testing._internal.common_utils import skipIfRocm from torch.testing._internal.triton_utils import requires_cuda_and_triton @@ -38,9 +39,8 @@ def write_cubin_to_tmp(self, kernel: CompiledKernel) -> str: # Just used by tests for now. # TODO: derive cubin_path from wherever triton stores the cubin file on disk. tmp_file = tempfile.NamedTemporaryFile(mode="wb", delete=False) - binary_key = "hsaco" if torch.version.hip else "cubin" with tmp_file: - tmp_file.write(kernel.asm[binary_key]) + tmp_file.write(kernel.asm["cubin"]) self.tmp_files.append(tmp_file) return tmp_file.name @@ -64,6 +64,7 @@ def _make_launcher( result.load_kernel(device_interface.current_device()) return result + @skipIfRocm def test_basic(self): @triton.jit def simple_kernel(arg0, arg1): @@ -90,6 +91,7 @@ def simple_kernel(arg0, arg1): # 2. 
triton relies on inspect.get_source to get the type annotations # so I can't even use exec() to generate the test cases. # So we'll just make a few kernels by hand + @skipIfRocm def test_unsigned_integers(self): @triton.jit def unsigned_integers( @@ -113,6 +115,7 @@ def unsigned_integers( launcher.run(1, 1, 1, stream, new_arg0, 50, 50, 50, 50) self.assertEqual(new_arg0, arg0) + @skipIfRocm def test_signed_integers(self): @triton.jit def signed_integers( @@ -136,6 +139,7 @@ def signed_integers( launcher.run(1, 1, 1, stream, new_arg0, 50, 50, 50, 50) self.assertEqual(new_arg0, arg0) + @skipIfRocm def test_basic_1arg(self): @triton.jit def simple_kernel_1_arg(arg0): @@ -160,6 +164,7 @@ def simple_kernel_1_arg(arg0): ) self.assertEqual(new_arg0, arg0) + @skipIfRocm def test_constexpr(self): # Constexprs are compiled directly into the cubin file, # so we never need to pass it to StaticCudaLauncher. @@ -188,6 +193,7 @@ def kernel_constexpr(arg0, CONSTANT: tl.constexpr): ) self.assertEqual(new_arg0, arg0) + @skipIfRocm def test_implied_constant(self): """xnumel is unused in this kernel, but isn't explicitly marked as a constexpr""" @@ -240,6 +246,7 @@ def triton_red_fused_any_isinf_0( launcher.run(1, 1, 1, stream, arg0, arg2, 128) self.assertEqual(arg1, arg2) + @skipIfRocm def test_kernel_no_args(self): # Just an easy way to test incompatible number of arguments @triton.jit @@ -252,6 +259,7 @@ def kernel_no_op(): stream = device_interface.get_raw_stream(device_interface.current_device()) launcher.run(1, 1, 1, stream) + @skipIfRocm def test_high_shared_mem(self): @triton.jit def simple_kernel(arg0, arg1): @@ -275,6 +283,7 @@ def simple_kernel(arg0, arg1): launcher.run(1, 1, 1, stream, new_arg0, arg1) self.assertEqual(new_arg0, arg0) + @skipIfRocm def test_too_high_shared_mem(self): @triton.jit def simple_kernel(arg0, arg1): @@ -294,6 +303,7 @@ def simple_kernel(arg0, arg1): lambda: self._make_launcher(compiled_kernel), ) + @skipIfRocm def test_kernel_empty_tensor(self): # Triton kernel generated by torch.compile of the following: # @torch.compile() @@ -354,6 +364,7 @@ def triton_poi_fused_cat_0( launcher.run(1, 1, 1, stream, arg1, arg2, buf1, arg0, xnumel) self.assertEqual(buf0, buf1) + @skipIfRocm def test_kernel_many_args(self): N = 200 # Make 200 arguments @@ -394,6 +405,7 @@ class TestStaticTritonCompileResult(TestCase): Tests static cuda launcher with torch.compile() """ + @skipIfRocm def test_basic_compile(self): @torch.compile def foo(x, y): @@ -403,6 +415,7 @@ def foo(x, y): y = torch.randn(10, device="cuda") self.assertEqual(foo(x, y), x + y) + @skipIfRocm # The error gets raised on a worker, so we want to not use a separate process @torch._inductor.config.patch("compile_threads", 1) def test_incompatible_code(self): @@ -425,6 +438,7 @@ def foo(x): lambda: foo(x), ) + @skipIfRocm # The error gets raised on a worker, so we want to not use a separate process @torch._inductor.config.patch( {"compile_threads": 1, "static_launch_user_defined_triton_kernels": True} @@ -446,6 +460,7 @@ def foo(x): x2 = x.clone().detach_() self.assertEqual(foo(x), x2 + 5) + @skipIfRocm def test_empty_tensor(self): @torch.compile() def foo(x, y): @@ -457,6 +472,7 @@ def foo(x, y): result = foo(x, y) self.assertEqual(result, torch.cat(((x * 4), y + 10))) + @skipIfRocm def test_any(self): def fn(x): return ( @@ -476,6 +492,7 @@ def fn(x): compiled_result = compiled_fn(arg) self.assertEqual(eager_result, compiled_result) + @skipIfRocm def test_disable_static_cuda_launcher(self): @torch.compile def fn(x, y): diff 
--git a/torch/_inductor/runtime/static_cuda_launcher.py b/torch/_inductor/runtime/static_cuda_launcher.py index 4eede8631e9ce..f48f351ce823a 100644 --- a/torch/_inductor/runtime/static_cuda_launcher.py +++ b/torch/_inductor/runtime/static_cuda_launcher.py @@ -38,20 +38,7 @@ def __init__(self, kernel: CompiledKernel) -> None: # pyrefly: ignore [missing-attribute] self.name = kernel.src.fn.__name__ # pyrefly: ignore [missing-attribute] - if "hsaco" in kernel.asm: - # pyrefly: ignore [missing-attribute] - self.cubin_raw = kernel.asm["hsaco"] - self.is_rocm = True - # pyrefly: ignore [missing-attribute] - elif "cubin" in kernel.asm: - # pyrefly: ignore [missing-attribute] - self.cubin_raw = kernel.asm["cubin"] - self.is_rocm = False - else: - raise RuntimeError( - "Expected either 'hsaco' (ROCm) or 'cubin' (CUDA) in kernel.asm" - ) - + self.cubin_raw = kernel.asm.get("cubin", None) # pyrefly: ignore [missing-attribute] self.cubin_path = kernel._cubin_path @@ -258,42 +245,12 @@ def run( # thing, it should always match. # Get rid of constants before passing to cubin launcher + # Add a None if triton wants extra parameters for scratch spaces arg_tys = self.arg_tys - - if self.is_rocm: - # ROCm/HIP kernel ABI: The Triton HIP backend ALWAYS includes both - # global_scratch and profile_scratch parameters in the kernel signature, - # even when the kernel doesn't use them (i.e., when has_*_scratch is False). - # - # This differs fundamentally from CUDA, where these parameters are only - # present in the signature if the corresponding has_*_scratch flag is True. - # - # The flags indicate whether memory will be allocated/used: - # - has_global_scratch: Whether global scratch workspace is needed - # - has_profile_scratch: Whether profiling instrumentation is enabled - # - # However, regardless of flag values, we MUST always pass both parameters - # to match the HIP kernel ABI. Passing None is safe: - # - # - If scratch is not needed (has_*_scratch=False or scratch_size=0): - # The None becomes nullptr, which the kernel never dereferences - # - # - If scratch is needed (has_*_scratch=True and scratch_size>0): - # The None becomes nullptr initially, but the HIP runtime intercepts - # the kernel launch, allocates the required scratch memory based on - # kernel metadata, and replaces the nullptr with a valid pointer before - # the kernel actually executes - # - # Not passing both parameters causes segmentation faults because the kernel - # expects them at specific positions in the argument array. 
- arg_tys = arg_tys + "OO" - args = (*args, None, None) - - else: - for has_scratch in [self.has_global_scratch, self.has_profile_scratch]: - if has_scratch: - arg_tys = arg_tys + "O" - args = (*args, None) + for has_scratch in [self.has_global_scratch, self.has_profile_scratch]: + if has_scratch: + arg_tys = arg_tys + "O" + args = (*args, None) # pyrefly: ignore [bad-argument-type] assert len(args) == len(arg_tys) diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py index 3cc47219258f9..d5851eeceeb24 100644 --- a/torch/_inductor/runtime/triton_heuristics.py +++ b/torch/_inductor/runtime/triton_heuristics.py @@ -1608,8 +1608,9 @@ def can_statically_launch( return None def check_can_launch() -> StaticallyLaunchedCudaKernel: - if triton_meta.get("device_type") not in ("cuda", "hip"): - raise CannotStaticallyLaunchKernel("Non-cuda/ROCm device") + if triton_meta.get("device_type") != "cuda": + # Only cuda kernels + raise CannotStaticallyLaunchKernel("Non-cuda device") if torch._inductor.config.cpp_wrapper: # If we're running with cpp wrapper, it doesn't @@ -1635,11 +1636,10 @@ def check_can_launch() -> StaticallyLaunchedCudaKernel: "static launch does not support launch attributes" ) - binary_ext = "hsaco" if triton_meta.get("device_type") == "hip" else "cubin" cubin_location = os.path.join( triton_cache_dir(triton_meta.get("device", 0)), triton_hash_to_path_key(kernel.hash), - f"{kernel.src.fn.__name__}.{binary_ext}", + f"{kernel.src.fn.__name__}.cubin", ) if not os.path.exists(cubin_location): @@ -1671,11 +1671,10 @@ def reload_cubin_path(self): When loading from cache on disk, we want to reload cubin files from their appropriate location on disc. """ - binary_ext = "hsaco" if torch.version.hip else "cubin" cubin_location = os.path.join( triton_cache_dir(self.compile_meta.get("device", 0)), triton_hash_to_path_key(self.kernel.hash), - f"{self.kernel.name}.{binary_ext}", + f"{self.kernel.name}.cubin", ) if not os.path.exists(cubin_location): if self.kernel.cubin_raw is not None: diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index 42a2e5c526742..adf1c8c4c4d20 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -2150,7 +2150,7 @@ PyObject* initModule() { #ifdef USE_CUDA torch::cuda::initModule(module); #endif -#if defined(USE_CUDA) +#if defined(USE_CUDA) && !defined(USE_ROCM) ASSERT_TRUE(StaticCudaLauncher_init(module)); #endif #ifdef USE_MPS diff --git a/torch/csrc/inductor/static_cuda_launcher.cpp b/torch/csrc/inductor/static_cuda_launcher.cpp index da61cd28c1b6f..59916b6763bfa 100644 --- a/torch/csrc/inductor/static_cuda_launcher.cpp +++ b/torch/csrc/inductor/static_cuda_launcher.cpp @@ -1,4 +1,7 @@ -#if defined(USE_CUDA) || defined(USE_ROCM) +#if defined(USE_CUDA) && !defined(USE_ROCM) +// We disable this file from being hipified because there are CUDA drivers hip +// has not implemented yet. Also, we're passing in a cubin file directly, so it +// would take more work to support ROCM anyway. #include #include @@ -13,11 +16,6 @@ #include #include #include - -#if defined(USE_ROCM) -#include -#endif - /** Implements a static launcher for triton compiled CUDA kernels. 
Given a path to a cubin file, a function name, and some metadata, @@ -58,14 +56,8 @@ const at::cuda::NVRTC& nvrtc() { CUdeviceptr getPointer(PyObject* obj) { CUdeviceptr data_ptr = 0; - if (THPUtils_checkLong(obj)) { -#if defined(USE_ROCM) - data_ptr = reinterpret_cast(THPUtils_unpackUInt64(obj)); -#else data_ptr = THPUtils_unpackUInt64(obj); -#endif - return data_ptr; } if (obj == Py_None) { @@ -81,25 +73,13 @@ CUdeviceptr getPointer(PyObject* obj) { TORCH_CHECK( THPUtils_checkLong(ret), "data_ptr method of Pointer object must return 64-bit int"); - -#if defined(USE_ROCM) - data_ptr = reinterpret_cast(THPUtils_unpackUInt64(ret)); -#else data_ptr = THPUtils_unpackUInt64(ret); -#endif - if (!data_ptr) return data_ptr; CUdeviceptr dev_ptr = 0; -#if defined(USE_ROCM) - AT_CUDA_DRIVER_CHECK(hipPointerGetAttribute( - &dev_ptr, HIP_POINTER_ATTRIBUTE_DEVICE_POINTER, data_ptr)); -#else AT_CUDA_DRIVER_CHECK(nvrtc().cuPointerGetAttribute( &dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, data_ptr)); -#endif - return dev_ptr; } @@ -118,15 +98,6 @@ CUfunction loadKernel( } CUmodule mod = nullptr; CUfunction func = nullptr; - -#if defined(USE_ROCM) - AT_CUDA_DRIVER_CHECK(hipModuleLoad(&mod, filePath.c_str())); - AT_CUDA_DRIVER_CHECK(hipModuleGetFunction(&func, mod, funcName.c_str())); - int shared_optin = 0; - AT_CUDA_DRIVER_CHECK(hipDeviceGetAttribute( - &shared_optin, hipDeviceAttributeSharedMemPerBlockOptin, device)); - -#else AT_CUDA_DRIVER_CHECK(nvrtc().cuModuleLoad(&mod, filePath.c_str())); AT_CUDA_DRIVER_CHECK( nvrtc().cuModuleGetFunction(&func, mod, funcName.c_str())); @@ -135,9 +106,6 @@ CUfunction loadKernel( &shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, device)); - -#endif - // Shared memory logic from triton/third-party/nvidia/backend/driver.c // If we're using more than 48 KB of shared memory, and we have // access to more than 48 KB of shared memory on the device, @@ -156,21 +124,6 @@ CUfunction loadKernel( " Reducing block sizes or `num_stages` may help."); if (sharedMemBytes > SHARED_MEM_STATIC_MAX && shared_optin > SHARED_MEM_STATIC_MAX) { -#if defined(USE_ROCM) - AT_CUDA_DRIVER_CHECK(hipFuncSetCacheConfig(func, hipFuncCachePreferShared)); - int shared_total = 0, shared_static = 0; - AT_CUDA_DRIVER_CHECK(hipDeviceGetAttribute( - &shared_total, - hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, - device)); - AT_CUDA_DRIVER_CHECK(hipFuncGetAttribute( - &shared_static, HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, func)); - AT_CUDA_DRIVER_CHECK(hipFuncSetAttribute( - func, - hipFuncAttributeMaxDynamicSharedMemorySize, - shared_optin - shared_static)); - -#else AT_CUDA_DRIVER_CHECK( nvrtc().cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_SHARED)); int shared_total = 0, shared_static = 0; @@ -184,7 +137,6 @@ CUfunction loadKernel( func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_optin - shared_static)); -#endif } return func; } @@ -200,27 +152,6 @@ inline void launchKernel( cudaStream_t stream) { // cta_args is always 1 for inductor generated triton kernels, // so we don't need to figure out grid dimension here -#if defined(USE_ROCM) - int device = 0; - AT_CUDA_DRIVER_CHECK(hipGetDevice(&device)); - int warp_size = 0; - AT_CUDA_DRIVER_CHECK( - hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, device)); - - AT_CUDA_DRIVER_CHECK(hipModuleLaunchKernel( - func, - gridX, - gridY, - gridZ, - warp_size * numWarps, // blockDim.x - 1, // blockDim.y - 1, // blockDim.z - sharedMemBytes, - stream, - args, - nullptr)); - -#else 
AT_CUDA_DRIVER_CHECK(nvrtc().cuLaunchKernel( func, gridX, @@ -233,7 +164,6 @@ inline void launchKernel( stream, args, nullptr)); -#endif } template @@ -339,20 +269,11 @@ PyObject* load_kernel(PyObject* self, PyObject* args) { CUdevice device = static_cast(device_ptr); // NOLINT CUfunction func = nullptr; func = loadKernel(filePath, funcName, sharedMemBytes, device); - -#if defined(USE_ROCM) - AT_CUDA_DRIVER_CHECK( - hipFuncGetAttribute(&n_regs, HIP_FUNC_ATTRIBUTE_NUM_REGS, func)); - AT_CUDA_DRIVER_CHECK(hipFuncGetAttribute( - &n_spills, HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, func)); - -#else + // Taken from triton/nvidia/backend/driver.c AT_CUDA_DRIVER_CHECK( nvrtc().cuFuncGetAttribute(&n_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, func)); AT_CUDA_DRIVER_CHECK(nvrtc().cuFuncGetAttribute( &n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, func)); - -#endif n_spills /= 4; // Return a tuple of CUFunction, n_regs, n_spills return Py_BuildValue( @@ -378,6 +299,7 @@ PyObject* launch_kernel_inner( std::array argStorage = {}; std::array kernelArgs = {}; parseKernelArgs(varArgs, argTypes, argStorage.data(), kernelArgs.data()); + launchKernel( func, gridX, @@ -464,25 +386,13 @@ PyObject* launch_kernel(PyObject* self, PyObject* args) { Py_RETURN_NONE; } CUcontext pctx = nullptr; -#if defined(USE_ROCM) - AT_CUDA_DRIVER_CHECK(hipCtxGetCurrent(&pctx)); -#else AT_CUDA_DRIVER_CHECK(nvrtc().cuCtxGetCurrent(&pctx)); -#endif - if (!pctx) { // Ensure device context exists CUdevice device = 0; -#if defined(USE_ROCM) - AT_CUDA_DRIVER_CHECK(hipDeviceGet(&device, 0)); - AT_CUDA_DRIVER_CHECK(hipDevicePrimaryCtxRetain(&pctx, device)); - AT_CUDA_DRIVER_CHECK(hipCtxSetCurrent(pctx)); -#else AT_CUDA_DRIVER_CHECK(nvrtc().cuDeviceGet(&device, 0)); AT_CUDA_DRIVER_CHECK(nvrtc().cuDevicePrimaryCtxRetain(&pctx, device)); AT_CUDA_DRIVER_CHECK(nvrtc().cuCtxSetCurrent(pctx)); - -#endif } CUfunction func = reinterpret_cast(func_ptr); // NOLINT cudaStream_t cudaStream = reinterpret_cast(stream); // NOLINT diff --git a/torch/csrc/inductor/static_cuda_launcher.h b/torch/csrc/inductor/static_cuda_launcher.h index 6f3980172275b..517036b9975e6 100644 --- a/torch/csrc/inductor/static_cuda_launcher.h +++ b/torch/csrc/inductor/static_cuda_launcher.h @@ -1,5 +1,5 @@ #pragma once -#if defined(USE_CUDA) +#if defined(USE_CUDA) && !defined(USE_ROCM) #include #include From 02b55c3f4a065e24ce0f25b7e93089436b7a7b03 Mon Sep 17 00:00:00 2001 From: Pearu Peterson Date: Sun, 16 Nov 2025 14:59:15 +0200 Subject: [PATCH 32/47] Move isQIntType to headeronly (#167772) Pull Request resolved: https://github.com/pytorch/pytorch/pull/167772 Approved by: https://github.com/janeyx99 --- c10/core/ScalarType.h | 7 ------- test/cpp/aoti_abi_check/test_scalartype.cpp | 11 +++++++++++ torch/headeronly/core/ScalarType.h | 8 ++++++++ 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/c10/core/ScalarType.h b/c10/core/ScalarType.h index 5bc537dbd83c8..040c6abb7d8e2 100644 --- a/c10/core/ScalarType.h +++ b/c10/core/ScalarType.h @@ -92,13 +92,6 @@ inline bool isComplexType(ScalarType t) { t == ScalarType::ComplexDouble); } -inline bool isQIntType(ScalarType t) { - // Don't forget to extend this when adding new QInt types - return t == ScalarType::QInt8 || t == ScalarType::QUInt8 || - t == ScalarType::QInt32 || t == ScalarType::QUInt4x2 || - t == ScalarType::QUInt2x4; -} - inline bool isBitsType(ScalarType t) { return t == ScalarType::Bits1x8 || t == ScalarType::Bits2x4 || t == ScalarType::Bits4x2 || t == ScalarType::Bits8 || diff --git 
a/test/cpp/aoti_abi_check/test_scalartype.cpp b/test/cpp/aoti_abi_check/test_scalartype.cpp index c299d58664c8e..6df242b5a4cec 100644 --- a/test/cpp/aoti_abi_check/test_scalartype.cpp +++ b/test/cpp/aoti_abi_check/test_scalartype.cpp @@ -101,3 +101,14 @@ TEST(TestScalarType, toUnderlying) { AT_FORALL_FLOAT8_TYPES(DEFINE_CHECK); #undef DEFINE_CHECK } + +TEST(TestScalarType, isQIntType) { + using torch::headeronly::isQIntType; + using torch::headeronly::ScalarType; +#define DEFINE_CHECK(_, name) EXPECT_TRUE(isQIntType(ScalarType::name)); + AT_FORALL_QINT_TYPES(DEFINE_CHECK); +#undef DEFINE_CHECK +#define DEFINE_CHECK(_, name) EXPECT_FALSE(isQIntType(ScalarType::name)); + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CHECK); +#undef DEFINE_CHECK +} diff --git a/torch/headeronly/core/ScalarType.h b/torch/headeronly/core/ScalarType.h index 5c395e5d5aa29..ce43ce6866cd9 100644 --- a/torch/headeronly/core/ScalarType.h +++ b/torch/headeronly/core/ScalarType.h @@ -336,6 +336,13 @@ inline std::ostream& operator<<( return stream << toString(scalar_type); } +inline bool isQIntType(ScalarType t) { + // Don't forget to extend this when adding new QInt types + return t == ScalarType::QInt8 || t == ScalarType::QUInt8 || + t == ScalarType::QInt32 || t == ScalarType::QUInt4x2 || + t == ScalarType::QUInt2x4; +} + inline ScalarType toUnderlying(ScalarType t) { switch (t) { case ScalarType::QUInt8: @@ -362,6 +369,7 @@ using c10::NumScalarTypes; using c10::ScalarType; using c10::toString; using c10::operator<<; +using c10::isQIntType; using c10::toUnderlying; namespace impl { From 1233be09233aa307f326e1076c370c8dbddf38b6 Mon Sep 17 00:00:00 2001 From: Pearu Peterson Date: Sun, 16 Nov 2025 14:59:15 +0200 Subject: [PATCH 33/47] [STABLE ABI] Add mutable_data_ptr() and const_data_ptr() methods to torch::stable::Tensor. 
(#161891) This ghstack is a prerequisite for porting torchaudio C++ extensions to use torch stable ABI, see https://github.com/pytorch/audio/issues/4074, https://github.com/pytorch/audio/issues/4075, https://github.com/pytorch/audio/issues/4076, https://github.com/pytorch/audio/issues/4077, https://github.com/pytorch/audio/issues/4078 Pull Request resolved: https://github.com/pytorch/pytorch/pull/161891 Approved by: https://github.com/mikaylagawarecki ghstack dependencies: #167772 --- .../libtorch_agnostic/csrc/kernel.cpp | 37 ++++++++++- .../libtorch_agnostic/ops.py | 31 +++++++++ .../test/test_libtorch_agnostic.py | 64 +++++++++++++++++++ torch/csrc/inductor/aoti_torch/c/shim.h | 10 +++ .../csrc/inductor/aoti_torch/shim_common.cpp | 18 ++++++ torch/csrc/stable/tensor_inl.h | 31 +++++++++ torch/csrc/stable/tensor_struct.h | 22 +++++++ 7 files changed, 212 insertions(+), 1 deletion(-) diff --git a/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/csrc/kernel.cpp b/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/csrc/kernel.cpp index 46bcb9633a312..cb61946e3c702 100644 --- a/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/csrc/kernel.cpp +++ b/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/csrc/kernel.cpp @@ -310,7 +310,7 @@ STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) { m.def("my_amax(Tensor a) -> Tensor"); m.def("my_amax_vec(Tensor a) -> Tensor"); m.def("my_is_cpu(Tensor t) -> bool"); - m.def("test_default_constructor(bool undefined) -> bool"); + m.def("test_default_constructor(bool undefined) -> bool"); } bool test_default_constructor(bool defined) { @@ -332,12 +332,47 @@ bool test_default_constructor(bool defined) { return out.defined(); } +uint64_t get_any_data_ptr(Tensor t, bool mutable_) { + if (mutable_) { + return reinterpret_cast(t.mutable_data_ptr()); + } else { + return reinterpret_cast(t.const_data_ptr()); + } +} + +uint64_t get_template_any_data_ptr(Tensor t, c10::ScalarType dtype, bool mutable_) { +#define DEFINE_CASE(T, name) \ + case torch::headeronly::ScalarType::name: { \ + if (mutable_) { \ + return reinterpret_cast(t.mutable_data_ptr()); \ + } else { \ + return reinterpret_cast(t.const_data_ptr()); \ + } \ + } + switch (dtype) { + // per aten/src/ATen/templates/TensorMethods.cpp: + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CASE) + DEFINE_CASE(uint16_t, UInt16) + DEFINE_CASE(uint32_t, UInt32) + DEFINE_CASE(uint64_t, UInt64) + default: + return 0; + } +#undef DEFINE_CASE +} + +STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) { + m.def("get_any_data_ptr(Tensor t, bool mutable_) -> int"); + m.def("get_template_any_data_ptr(Tensor t, ScalarType dtype, bool mutable_) -> int"); +} STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) { m.impl("my_zero_", TORCH_BOX(&my_zero_)); m.impl("my_amax", TORCH_BOX(&my_amax)); m.impl("my_amax_vec", TORCH_BOX(&my_amax_vec)); m.impl("test_default_constructor", TORCH_BOX(&test_default_constructor)); + m.impl("get_any_data_ptr", TORCH_BOX(&get_any_data_ptr)); + m.impl("get_template_any_data_ptr", TORCH_BOX(&get_template_any_data_ptr)); } std::vector my__foreach_mul(torch::headeronly::HeaderOnlyArrayRef self, torch::headeronly::HeaderOnlyArrayRef other) { diff --git a/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/ops.py b/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/ops.py index 43e24f0f20466..e12c167fc7296 100644 --- a/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/ops.py +++ 
b/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/ops.py @@ -227,6 +227,37 @@ def test_tensor_device(t): return torch.ops.libtorch_agnostic.test_tensor_device.default(t) +def get_any_data_ptr(t, mutable) -> int: + """ + Return data pointer value of the tensor. + + Args: + t: Input tensor + mutable: whether data pointer qualifier is mutable or const + + Returns: int - pointer value + """ + return torch.ops.libtorch_agnostic.get_any_data_ptr.default(t, mutable) + + +def get_template_any_data_ptr(t, dtype, mutable) -> int: + """ + Return data pointer value of the tensor iff it has dtype. + + Args: + t: Input tensor + dtype: Input dtype + mutable: whether data pointer qualifier is mutable or const + + Returns: int - pointer value + + Raises RuntimeError when t.dtype() != dtype. + """ + return torch.ops.libtorch_agnostic.get_template_any_data_ptr.default( + t, dtype, mutable + ) + + def my_pad(t) -> Tensor: """ Pads the input tensor with hardcoded padding parameters. diff --git a/test/cpp_extensions/libtorch_agnostic_extension/test/test_libtorch_agnostic.py b/test/cpp_extensions/libtorch_agnostic_extension/test/test_libtorch_agnostic.py index 864b9fc89a1c1..21e0573f33ad6 100644 --- a/test/cpp_extensions/libtorch_agnostic_extension/test/test_libtorch_agnostic.py +++ b/test/cpp_extensions/libtorch_agnostic_extension/test/test_libtorch_agnostic.py @@ -14,11 +14,38 @@ install_cpp_extension, IS_WINDOWS, run_tests, + skipIfTorchDynamo, TestCase, xfailIfTorchDynamo, ) +def get_supported_dtypes(): + """Return a list of dtypes that are supported by torch stable ABI.""" + return [ + torch.int8, + torch.int16, + torch.int32, + torch.int64, + torch.uint8, + torch.uint16, + torch.uint32, + torch.uint64, + torch.bfloat16, + torch.float16, + torch.float32, + torch.float64, + torch.float8_e5m2, + torch.float8_e4m3fn, + torch.float8_e5m2fnuz, + torch.float8_e4m3fnuz, + torch.complex32, + torch.complex64, + torch.complex128, + torch.bool, + ] + + # TODO: Fix this error in Windows: # LINK : error LNK2001: unresolved external symbol PyInit__C if not IS_WINDOWS: @@ -274,6 +301,43 @@ def test_my_narrow(self, device): expected0 = torch.narrow(t, dim0, start0, length0) self.assertEqual(out0, expected0) + @skipIfTorchDynamo("no data pointer defined for FakeTensor, FunctionalTensor") + def test_get_any_data_ptr(self, device): + import libtorch_agnostic + + t = torch.empty(2, 5, device=device, dtype=torch.float32) + expected_p = t.data_ptr() + + for mutable in [True, False]: + p = libtorch_agnostic.ops.get_any_data_ptr(t, mutable) + self.assertEqual(p, expected_p) + + @skipIfTorchDynamo("no data pointer defined for FakeTensor, FunctionalTensor") + def test_get_template_any_data_ptr(self, device): + import libtorch_agnostic + + supported_dtypes = get_supported_dtypes() + + for dtype in supported_dtypes: + t = torch.empty(2, 5, device=device, dtype=dtype) + expected_p = t.data_ptr() + + for rdtype in supported_dtypes: + if dtype == rdtype: + for mutable in [True, False]: + p = libtorch_agnostic.ops.get_template_any_data_ptr( + t, rdtype, mutable + ) + self.assertEqual(p, expected_p) + else: + for mutable in [True, False]: + with self.assertRaisesRegex( + RuntimeError, "expected scalar type.* but found" + ): + libtorch_agnostic.ops.get_template_any_data_ptr( + t, rdtype, mutable + ) + @onlyCUDA @deviceCountAtLeast(2) def test_device_guard(self, device): diff --git a/torch/csrc/inductor/aoti_torch/c/shim.h b/torch/csrc/inductor/aoti_torch/c/shim.h index 4fb746ea15271..dfd11109050a3 100644 --- 
a/torch/csrc/inductor/aoti_torch/c/shim.h +++ b/torch/csrc/inductor/aoti_torch/c/shim.h @@ -204,6 +204,16 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_data_ptr( void** ret_data_ptr // returns borrowed reference ); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_mutable_data_ptr( + AtenTensorHandle tensor, + void** ret_data_ptr // returns borrowed reference +); + +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_const_data_ptr( + AtenTensorHandle tensor, + const void** ret_data_ptr // returns borrowed reference +); + // Get the nbytes of the underlying storage AOTI_TORCH_EXPORT AOTITorchError aoti_torch_get_storage_size(AtenTensorHandle tensor, int64_t* ret_size); diff --git a/torch/csrc/inductor/aoti_torch/shim_common.cpp b/torch/csrc/inductor/aoti_torch/shim_common.cpp index 2df922109975a..4a0975fe5606c 100644 --- a/torch/csrc/inductor/aoti_torch/shim_common.cpp +++ b/torch/csrc/inductor/aoti_torch/shim_common.cpp @@ -281,6 +281,24 @@ AOTITorchError aoti_torch_get_data_ptr( }); } +AOTITorchError aoti_torch_get_const_data_ptr( + AtenTensorHandle tensor, + const void** ret_data_ptr) { + AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({ + at::Tensor* t = tensor_handle_to_tensor_pointer(tensor); + *ret_data_ptr = t->const_data_ptr(); + }); +} + +AOTITorchError aoti_torch_get_mutable_data_ptr( + AtenTensorHandle tensor, + void** ret_data_ptr) { + AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({ + at::Tensor* t = tensor_handle_to_tensor_pointer(tensor); + *ret_data_ptr = t->mutable_data_ptr(); + }); +} + AOTITorchError aoti_torch_get_storage_size( AtenTensorHandle tensor, int64_t* ret_size) { diff --git a/torch/csrc/stable/tensor_inl.h b/torch/csrc/stable/tensor_inl.h index 8eb69f1a63b74..cfbdc04ca922e 100644 --- a/torch/csrc/stable/tensor_inl.h +++ b/torch/csrc/stable/tensor_inl.h @@ -33,4 +33,35 @@ inline Device Tensor::device() const { return Device(extension_device_type, static_cast(device_index)); } +// The following data ptr cast methods mirror the methods defined in +// aten/src/ATen/templates/TensorMethods.cpp +#define DEFINE_DATA_PTR_CAST(T, name, PRED) \ + template <> \ + inline T* Tensor::mutable_data_ptr() const { \ + auto stype = scalar_type(); \ + STD_TORCH_CHECK( \ + PRED(stype, torch::headeronly::ScalarType::name), \ + "expected scalar type " #name " but found ", \ + torch::headeronly::toString(stype)); \ + return static_cast(mutable_data_ptr()); \ + } \ + template <> \ + inline const T* Tensor::const_data_ptr() const { \ + auto stype = scalar_type(); \ + STD_TORCH_CHECK( \ + PRED(stype, torch::headeronly::ScalarType::name), \ + "expected scalar type " #name " but found ", \ + torch::headeronly::toString(stype)); \ + return static_cast(const_data_ptr()); \ + } + +#define _PRED(S1, S2) S1 == S2 +#define DEFINE_CAST(T, name) DEFINE_DATA_PTR_CAST(T, name, _PRED) +AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CAST) +DEFINE_CAST(uint16_t, UInt16) +DEFINE_CAST(uint32_t, UInt32) +DEFINE_CAST(uint64_t, UInt64) +#undef DEFINE_CAST +#undef _PRED + HIDDEN_NAMESPACE_END(torch, stable) diff --git a/torch/csrc/stable/tensor_struct.h b/torch/csrc/stable/tensor_struct.h index e3f50ad26781c..e7f1541b1d513 100644 --- a/torch/csrc/stable/tensor_struct.h +++ b/torch/csrc/stable/tensor_struct.h @@ -78,12 +78,34 @@ class Tensor { // semantics as their counterparts in TensorBase.h. // ============================================================================= + // Do not add new uses of data_ptr(), use const_data_ptr() if + // possible, mutable_data_ptr() otherwise. 
void* data_ptr() const { void* data_ptr; TORCH_ERROR_CODE_CHECK(aoti_torch_get_data_ptr(ath_.get(), &data_ptr)); return data_ptr; } + void* mutable_data_ptr() const { + void* data_ptr{}; + TORCH_ERROR_CODE_CHECK( + aoti_torch_get_mutable_data_ptr(ath_.get(), &data_ptr)); + return data_ptr; + } + + const void* const_data_ptr() const { + const void* data_ptr{}; + TORCH_ERROR_CODE_CHECK( + aoti_torch_get_const_data_ptr(ath_.get(), &data_ptr)); + return data_ptr; + } + + template + T* mutable_data_ptr() const; + + template , int> = 0> + const T* const_data_ptr() const; + int64_t dim() const { int64_t dim; TORCH_ERROR_CODE_CHECK(aoti_torch_get_dim(ath_.get(), &dim)); From 01deee228ad4fde348a5d1d721fba9b7339a8e73 Mon Sep 17 00:00:00 2001 From: Divyansh Khanna Date: Mon, 17 Nov 2025 18:10:22 +0000 Subject: [PATCH 34/47] Fix dataloader tests failing on python 3.14 (#167429) The following tests are failing on python 3.14 on linux machine * TestSetAffinity::test_set_affinity_in_worker_init * Why? 3.14 makes `forkserver` the default start method for multiprocessing. With it, local functions are not pickle-able and unit test fail. * TestIndividualWorkerQueue::test_ind_worker_queue * Why? The test was hitting timeout. This is also related to the start method. I am increasing timeout and reducing batch size iterations to reduce total unit test time. * Fixes https://github.com/pytorch/pytorch/issues/68643 Pull Request resolved: https://github.com/pytorch/pytorch/pull/167429 Approved by: https://github.com/aelavender, https://github.com/ramanishsingh --- test/test_dataloader.py | 59 ++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 33 deletions(-) diff --git a/test/test_dataloader.py b/test/test_dataloader.py index 8568920e8b196..d42d1cd56600a 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -3480,7 +3480,7 @@ def _run_ind_worker_queue_test(self, batch_size, num_workers): batch_size=batch_size, shuffle=False, num_workers=num_workers, - timeout=5, + timeout=JOIN_TIMEOUT, worker_init_fn=self.dataset.worker_init_fn, ) current_worker_idx = 0 @@ -3498,33 +3498,31 @@ def _run_ind_worker_queue_test(self, batch_size, num_workers): "Flaky on Windows and MacOS https://github.com/pytorch/pytorch/issues/68643", ) def test_ind_worker_queue(self): - max_num_workers = None - if hasattr(os, "sched_getaffinity"): - try: - max_num_workers = len(os.sched_getaffinity(0)) - except Exception: - pass - if max_num_workers is None: - cpu_count = os.cpu_count() - if cpu_count is not None: - # Use half number of CPUs - max_num_workers = cpu_count // 2 - - if max_num_workers is None: - max_num_workers = 1 - - for batch_size in (8, 16, 32, 64): - for num_workers in range(min(6, max_num_workers)): + for batch_size in (8, 32, 64): + for num_workers in range(1, 6): self._run_ind_worker_queue_test( - batch_size=batch_size, num_workers=num_workers + 1 + batch_size=batch_size, num_workers=num_workers ) class SetAffinityDataset(IterableDataset): + def __init__(self, expected_affinity=None): + self.expected_affinity = expected_affinity + def __iter__(self): - torch.randperm(1) - after = os.sched_getaffinity(0) - return iter(after) + affinity_mask = os.sched_getaffinity(0) + return iter(affinity_mask) + + +def _worker_set_affinity_init(worker_id): + worker_info = torch.utils.data.get_worker_info() + if worker_info is not None: + dataset = worker_info.dataset + if ( + isinstance(dataset, SetAffinityDataset) + and dataset.expected_affinity is not None + ): + os.sched_setaffinity(0, 
[dataset.expected_affinity]) @unittest.skipIf( @@ -3539,19 +3537,14 @@ def test_set_affinity_in_worker_init(self): # Choose any expected_affinity = list(old_affinity)[-1] - def worker_set_affinity(_): - os.sched_setaffinity(0, [expected_affinity]) - - dataset = SetAffinityDataset() - - if not IS_WINDOWS and not IS_MACOS: - import multiprocessing as py_mp - - py_mp.set_start_method("fork", force=True) - + # Pass expected affinity through the dataset + dataset = SetAffinityDataset(expected_affinity=expected_affinity) dataloader = torch.utils.data.DataLoader( - dataset, num_workers=2, worker_init_fn=worker_set_affinity + dataset, + num_workers=2, + worker_init_fn=_worker_set_affinity_init, ) + for sample in dataloader: self.assertEqual(sample, [expected_affinity]) From 694f9b943c12c02c6907c5aa19177865895b81ab Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Mon, 17 Nov 2025 18:25:59 +0000 Subject: [PATCH 35/47] Revert "[ROCm][CI] Upgrade ROCm CI to 7.1 (#166743)" This reverts commit 77acc66df917a2b9f6305d089ac88b8975786552. Reverted https://github.com/pytorch/pytorch/pull/166743 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/166743#issuecomment-3543307333)) --- .ci/docker/build.sh | 2 +- .ci/docker/common/install_rocm.sh | 18 +++++++-------- .ci/docker/common/install_rocm_magma.sh | 4 ++-- cmake/External/aotriton.cmake | 3 --- .../_composable/fsdp/test_fully_shard_comm.py | 22 +++++-------------- .../c10d/symm_mem/CUDASymmetricMemory.cu | 4 ---- 6 files changed, 16 insertions(+), 37 deletions(-) diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index b7e61115e37d6..203ab597a75bc 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -188,7 +188,7 @@ case "$tag" in fi GCC_VERSION=11 VISION=yes - ROCM_VERSION=7.1 + ROCM_VERSION=7.0 NINJA_VERSION=1.9.0 TRITON=yes KATEX=yes diff --git a/.ci/docker/common/install_rocm.sh b/.ci/docker/common/install_rocm.sh index 988347e28e9d8..9376d259d9cca 100644 --- a/.ci/docker/common/install_rocm.sh +++ b/.ci/docker/common/install_rocm.sh @@ -60,16 +60,14 @@ EOF DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev fi - if [[ $(ver $ROCM_VERSION) -lt $(ver 7.1) ]]; then - # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5, removed in ROCm 7.1 - # search for all unversioned packages - # if search fails it will abort this script; use true to avoid case where search fails - MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true) - if [[ "x${MIOPENHIPGFX}" = x ]]; then - echo "miopen-hip-gfx package not available" && exit 1 - else - DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX} - fi + # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5 + # search for all unversioned packages + # if search fails it will abort this script; use true to avoid case where search fails + MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . 
|| true) + if [[ "x${MIOPENHIPGFX}" = x ]]; then + echo "miopen-hip-gfx package not available" && exit 1 + else + DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX} fi # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime diff --git a/.ci/docker/common/install_rocm_magma.sh b/.ci/docker/common/install_rocm_magma.sh index 9bf45e6f1b0a9..2d03c6186b8e5 100644 --- a/.ci/docker/common/install_rocm_magma.sh +++ b/.ci/docker/common/install_rocm_magma.sh @@ -12,8 +12,8 @@ function do_install() { rocm_version_nodot=${rocm_version//./} - # https://github.com/icl-utk-edu/magma/pull/65 - MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec + # post merge of https://github.com/icl-utk-edu/magma/pull/65 + MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2" rocm_dir="/opt/rocm" diff --git a/cmake/External/aotriton.cmake b/cmake/External/aotriton.cmake index 21369c2981c31..b19f25609cad3 100644 --- a/cmake/External/aotriton.cmake +++ b/cmake/External/aotriton.cmake @@ -15,14 +15,12 @@ if(NOT __AOTRITON_INCLUDED) "manylinux_2_28" # rocm6.3 "manylinux_2_28" # rocm6.4 "manylinux_2_28" # rocm7.0 - "manylinux_2_28" # rocm7.1 ) set(__AOTRITON_ROCM_LIST "rocm6.2" "rocm6.3" "rocm6.4" "rocm7.0" - "rocm7.1" ) set(__AOTRITON_CI_COMMIT "972223c501ffc22068bb035ac5d64cf54318d895") set(__AOTRITON_SHA256_LIST @@ -30,7 +28,6 @@ if(NOT __AOTRITON_INCLUDED) "72a153549ea20707331e8a1f1e3d1b8de2913f9d5af2b900c56235d578b57efe" # rocm6.3 "c7f319dd7448cbbbab81889dd8a37d47dbc25ebcbd89760f09e6a0904e556393" # rocm6.4 "a2a974e0ad929a5e5827c0f896c59bda4872459cbaf8dd8e0a00407f404491cf" # rocm7.0 - "d4eb24c9f1a0cfedb35f9292efb41d16589cf5a4b98c3c0940181bbefc49d722" # rocm7.1 ) set(__AOTRITON_IMAGE_LIST "amd-gfx90a" diff --git a/test/distributed/_composable/fsdp/test_fully_shard_comm.py b/test/distributed/_composable/fsdp/test_fully_shard_comm.py index ad3064608960d..44000e761d8a0 100644 --- a/test/distributed/_composable/fsdp/test_fully_shard_comm.py +++ b/test/distributed/_composable/fsdp/test_fully_shard_comm.py @@ -59,12 +59,7 @@ patch_reshard, patch_unshard, ) -from torch.testing._internal.common_utils import ( - run_tests, - TEST_WITH_ROCM, - TEST_XPU, - xfailIf, -) +from torch.testing._internal.common_utils import run_tests, TEST_XPU, xfailIf from torch.testing._internal.distributed._tensor.common_dtensor import ( FeedForward, ModelArgs, @@ -1663,17 +1658,10 @@ def test_exception_when_used_together_with_comm_hooks(self): class TestFullyShardForceSumReduction(FSDPTest): # The messages might change when we move to a different NCCL version. # Please update this test if it starts failing. 
- - if TEST_WITH_ROCM and torch.cuda.nccl.version()[:2] >= (2, 27): - COLLECTIVE_RE = ( - r"NCCL INFO {coll}: opCount [0-9a-f]+ sendbuff 0x[0-9a-f]+ recvbuff 0x[0-9a-f]+ acc \(nil\) " - "count {count} datatype [0-9]+ op {reduce_op} root [0-9]+ comm 0x[0-9a-f]+" - ) - else: - COLLECTIVE_RE = ( - "NCCL INFO {coll}: opCount [0-9a-f]+ sendbuff 0x[0-9a-f]+ recvbuff 0x[0-9a-f]+ " - "count {count} datatype [0-9]+ op {reduce_op} root [0-9]+ comm 0x[0-9a-f]+" - ) + COLLECTIVE_RE = ( + "NCCL INFO {coll}: opCount [0-9a-f]+ sendbuff 0x[0-9a-f]+ recvbuff 0x[0-9a-f]+ " + "count {count} datatype [0-9]+ op {reduce_op} root [0-9]+ comm 0x[0-9a-f]+" + ) # See here for the numerical values for each reduction op: # https://github.com/NVIDIA/nccl/blob/72d2432094d6ae36abd6e511c3a16a2d052dbf94/src/nccl.h.in#L260-L275 SUM_REDUCTION = 0 diff --git a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu index 6352330c3872c..4523333c7fad4 100644 --- a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu +++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu @@ -699,11 +699,7 @@ c10::intrusive_ptr make_symm_mem( #elif defined(USE_ROCM) C10_HIP_CHECK(hipMemImportFromShareableHandle( &handles[r], -#if ROCM_VERSION >= 70100 - reinterpret_cast(static_cast(imported_handles[r])), -#else (void*)(uintptr_t) & (imported_handles[r]), -#endif hipMemHandleTypePosixFileDescriptor)); #else TORCH_CHECK( From 4414e1bff06487f85b1e2ebd1919625298f1444f Mon Sep 17 00:00:00 2001 From: IvanKobzarev Date: Mon, 17 Nov 2025 03:05:46 -0800 Subject: [PATCH 36/47] Cleanup in inductor usage of nccl estimator after its fix (#167633) Pull Request resolved: https://github.com/pytorch/pytorch/pull/167633 Approved by: https://github.com/eellison ghstack dependencies: #167827 --- torch/_inductor/comm_analysis.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/torch/_inductor/comm_analysis.py b/torch/_inductor/comm_analysis.py index e95db64b03a39..e6f85204a2c14 100644 --- a/torch/_inductor/comm_analysis.py +++ b/torch/_inductor/comm_analysis.py @@ -196,16 +196,9 @@ def estimate_nccl_collective_runtime_nccl_estimator(snode) -> Optional[float]: if "all_gather_into_tensor_out" in py_kernel_name: args = args[1:] + args[0] - try: - with torch.distributed._time_estimator( - group=pg, device=device - ) as time_estimator: - w = fn(*args, **kwargs) - torch.ops._c10d_functional.wait_tensor.default(w) - except Exception as e: - # NCCL estimator can fail - log.info(e) # noqa: G200 - return None + with torch.distributed._time_estimator(group=pg, device=device) as time_estimator: + w = fn(*args, **kwargs) + torch.ops._c10d_functional.wait_tensor.default(w) est_time_us = time_estimator.estimated_time # -1000 constant is NCCL return in case of error during estimations. @@ -359,7 +352,6 @@ def estimate_fx_collective_size(fx_node: torch.fx.Node) -> int: def estimate_nccl_collective_runtime_from_fx_node( fx_node: torch.fx.Node, override_size: Optional[int] = None, - # TODO(ivankobzarev): NCCL estimator sometimes fail unexpectedly, enable back after fix. use_nccl_estimator: bool = True, ) -> float: """ From b288d0020b6cdd9532be2456083a861434faf8b9 Mon Sep 17 00:00:00 2001 From: shunting314 Date: Fri, 14 Nov 2025 18:01:14 -0800 Subject: [PATCH 37/47] [inductor] unittest for run2run determinism (#167482) Not sure if the path are already properly setup so I can call 'benchmarks/dynamo/huggingface.py' in unit test directly. Let's tell from CI. 
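For context, the run-to-run determinism exercised by this test means that two executions of the same compiled model must produce bitwise-identical outputs, even when benchmarking results are deliberately perturbed between runs. A minimal sketch of such a bitwise comparison is below; the helper name and the byte-view approach are illustrative assumptions for this note, not the actual bitwise_same helper used by the benchmark harness.

    import torch

    def outputs_bitwise_equal(a: torch.Tensor, b: torch.Tensor) -> bool:
        # Compare raw bytes so NaN payloads and signed zeros must match exactly;
        # a plain floating-point comparison would treat NaN != NaN.
        if a.dtype != b.dtype or a.shape != b.shape:
            return False
        return torch.equal(a.contiguous().flatten().view(torch.uint8),
                           b.contiguous().flatten().view(torch.uint8))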
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167482 Approved by: https://github.com/v0i0, https://github.com/mlazos --- benchmarks/dynamo/common.py | 4 +- test/inductor/test_deterministic.py | 62 +++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py index a3bd58c4de747..b3484e7196a83 100644 --- a/benchmarks/dynamo/common.py +++ b/benchmarks/dynamo/common.py @@ -2379,7 +2379,9 @@ def record_status(accuracy_status, dynamo_start_stats): print( f"Load model outputs from {self.args.compare_model_outputs_with} to compare" ) - saved_result = torch.load(self.args.compare_model_outputs_with) + saved_result = torch.load( + self.args.compare_model_outputs_with, weights_only=False + ) is_bitwise_same = bitwise_same(saved_result, new_result) if not is_bitwise_same: print( diff --git a/test/inductor/test_deterministic.py b/test/inductor/test_deterministic.py index 0de777dd81b5c..382838c31bed4 100644 --- a/test/inductor/test_deterministic.py +++ b/test/inductor/test_deterministic.py @@ -1,5 +1,9 @@ # Owner(s): ["module: inductor"] import contextlib +import os +import subprocess +import sys +import tempfile import unittest import torch @@ -104,6 +108,64 @@ def foo(x): else: self.assertTrue(counters["inductor"]["coordesc_tuning_bench"] > 0) + @parametrize("model_name", ["GoogleFnet", "BertForMaskedLM", "DistillGPT2"]) + @parametrize("training_or_inference", ["training", "inference"]) + @parametrize("precision", ["float32", "bfloat16", "float16", "amp"]) + def test_run2run_determinism(self, model_name, training_or_inference, precision): + """ + Test run2run determinism for a few huggingface models. + + The test assumes benchmarks/dynamo/huggingface.py can be found from + the current working directory. + """ + + if not os.path.exists("benchmarks/dynamo/huggingface.py"): + self.skipTest("Skip due to benchmarks/dynamo/huggingface.py not found.") + + def _setup_env(env): + env["TORCHINDUCTOR_FORCE_DISABLE_CACHES"] = "1" # disable autotune cache + env["TORCHINDUCTOR_FX_GRAPH_REMOTE_CACHE"] = "0" + env["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "0" + if enable_determinism: + env["TORCHINDUCTOR_DETERMINISTIC"] = "1" + + # set to false if you want to check how the test fails without + # the deterministic mode + enable_determinism = True + with tempfile.TemporaryDirectory() as tmpdir: + saved_pkl = os.path.join(tmpdir, "saved.pkl") + cmd = ( + f"{sys.executable} benchmarks/dynamo/huggingface.py --backend inductor" + + f" --{precision} --accuracy --only {model_name} --{training_or_inference}" + + f" --disable-cudagraphs --save-model-outputs-to={saved_pkl}" + ) + print("Command", cmd) + env = os.environ.copy() + _setup_env(env) + out = subprocess.run(cmd.split(), capture_output=True, env=env) + + # We don't check the accuracy against eager here because some + # of the combination between model and precision can not + # pass that accuracy test. But it's still valuable to make + # sure we generate bitwise equivalent result from run to run. 
+ # self.assertTrue("pass" in out.stdout.decode()) + + cmd = ( + f"{sys.executable} benchmarks/dynamo/huggingface.py --backend inductor" + + f" --{precision} --accuracy --only {model_name} --{training_or_inference}" + + f" --disable-cudagraphs --compare-model-outputs-with={saved_pkl}" + ) + print("Command", cmd) + + # distort benchmarking results + env["TORCHINDUCTOR_DISTORT_BENCHMARKING_RESULT"] = "inverse" + out = subprocess.run(cmd.split(), capture_output=True, env=env) + self.assertTrue( + "The result is bitwise equivalent to the previously saved result" + in out.stdout.decode(), + f"stdout: {out.stdout.decode()}, stderr: {out.stderr.decode()}", + ) + if __name__ == "__main__": if HAS_CUDA_AND_TRITON: From 689d731ece80ceed232b59040afecabc1df520ec Mon Sep 17 00:00:00 2001 From: shunting314 Date: Fri, 14 Nov 2025 17:41:48 -0800 Subject: [PATCH 38/47] [inductor] fix the decision of inner reduction (#167697) Inductor may treat an outer reduction as inner reduction when the reduction ranges contains a 1. This cause some weird issue that we skip fusing with mix order reduction. While I'm still debugging why that happens, I think we should fix the decision here anyways Pull Request resolved: https://github.com/pytorch/pytorch/pull/167697 Approved by: https://github.com/jansel, https://github.com/v0i0 --- test/inductor/test_mix_order_reduction.py | 19 +++++++++++++++++-- test/inductor/test_torchinductor.py | 15 +++++++++++++++ torch/_inductor/ir.py | 4 +++- 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/test/inductor/test_mix_order_reduction.py b/test/inductor/test_mix_order_reduction.py index 592e42ce41735..1114810ceccdf 100644 --- a/test/inductor/test_mix_order_reduction.py +++ b/test/inductor/test_mix_order_reduction.py @@ -270,11 +270,20 @@ def f(x, y): ], ) @parametrize("split_reductions", (False, True)) - @parametrize("shape", ((32768, 2048), (32768, 768), (32768 + 1023, 768))) + @parametrize( + "shape", ((1000000, 256), (32768, 2048), (32768, 768), (32768 + 1023, 768)) + ) @parametrize("max_autotune", (False, True)) @parametrize("initial_xblock", (1, 2)) + @parametrize("add_1dim", (False, True)) def test_rms_norm_bwd( - self, wdtype, split_reductions, shape, max_autotune, initial_xblock + self, + wdtype, + split_reductions, + shape, + max_autotune, + initial_xblock, + add_1dim, ): # max_autotune can be slow and cost resource, trim down the tests # for max autotune @@ -287,6 +296,9 @@ def test_rms_norm_bwd( ): self.skipTest("Skip non-critical tests to save resources.") + if shape != (1000000, 256) and add_1dim: + self.skipTest("Skip non-critical tests to save resources.") + def f(x, w, eps): orig_dtype = x.dtype @@ -307,6 +319,9 @@ def fwd_bwd(f): # M, N = 1152 * 500, 384 M, N = shape x = torch.randn(M, N, dtype=torch.bfloat16, device=GPU_TYPE, requires_grad=True) + if add_1dim: + x = x[:, None, :] + w = torch.randn(N, dtype=wdtype, device=GPU_TYPE, requires_grad=True) dy = torch.randn_like(x) eps = 1e-5 diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index 780cfc1716824..9d687a8828a8c 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -14728,6 +14728,21 @@ def test_weight_norm_conv2d(self): self.assertTrue(same((ref, ref_grad), (act, act_grad), tol=1e-3)) + @skipIfMPS + def test_inner_reduction_detection(self): + if self.device == "cpu": + self.skipTest("Skip for CPU device") + + x = torch.randn(100000, 1, 256, device=self.device) + + @torch.compile + def f(x): + return x.sum(dim=(0, 1)) + 
+ code = run_and_get_triton_code(f, x) + self.assertTrue("ReductionHint.OUTER" in code) + self.assertFalse("ReductionHint.INNER" in code) + @skip_if_halide @requires_cuda_and_triton @skip_if_cpp_wrapper("skip cpp wrapper") diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py index 67e0174443882..72d8383d2b812 100644 --- a/torch/_inductor/ir.py +++ b/torch/_inductor/ir.py @@ -1435,7 +1435,9 @@ def get_read_indices(r: Reduction) -> tuple[Sequence[Expr], bool]: strides = V.graph.sizevars.stride_hints( j, reduction_vars, list(ranges1.keys()) ) - outer = all(s > 1 for s in strides) + # A 0 stride does not make a reduction contiguous. + # This can happen when the reduction ranges contains a 1. + outer = all(s == 0 or s > 1 for s in strides) if outer: num_outer += 1 else: From 2ddcf53e1a98d4453a2d2ff2422af19bc04bd26e Mon Sep 17 00:00:00 2001 From: Chris Leonard Date: Mon, 17 Nov 2025 20:30:51 +0000 Subject: [PATCH 39/47] Logaddexp complex inconsistent bw cpu and cuda (#163509) Fixes #158429 Updated LogAddExpKernel.cu to allow for complex numbers. Also, updated unittest to run test_logaddexp on CUDA with complex data types and added a unit test in test_linalg.py to compare results between CUDA and cpu. @drisspg Pull Request resolved: https://github.com/pytorch/pytorch/pull/163509 Approved by: https://github.com/isuruf --- aten/src/ATen/native/cuda/LogAddExpKernel.cu | 235 +++++++++++++++++- test/test_binary_ufuncs.py | 24 +- test/test_linalg.py | 59 +++++ .../_internal/common_methods_invocations.py | 18 +- 4 files changed, 322 insertions(+), 14 deletions(-) diff --git a/aten/src/ATen/native/cuda/LogAddExpKernel.cu b/aten/src/ATen/native/cuda/LogAddExpKernel.cu index 7b8b5b5bb2032..910d3c1cddc93 100644 --- a/aten/src/ATen/native/cuda/LogAddExpKernel.cu +++ b/aten/src/ATen/native/cuda/LogAddExpKernel.cu @@ -2,18 +2,250 @@ #include #include #include +#include +#include +#include #include #include #include #include +#include + +#include +#include // NOTE: CUDA on Windows requires that the enclosing function // of a __device__ lambda not have internal linkage. namespace at::native { +// custom min and max to be used in logaddexp for complex arguments +template +__host__ __device__ c10::complex _logaddexp_minmax(const c10::complex& x, const c10::complex& y) { + scalar_t xr = std::real(x); + scalar_t yr = std::real(y); + if (::isnan(yr) || (::isnan(std::imag(y)))) { + return y; + } else if (::isnan(xr) || (::isnan(std::imag(x)))) { + return x; + } else if (min) { // min + return (xr < yr) ? x : y; + } else { // max + return (xr >= yr) ? x : y; + } +} + +template +__host__ __device__ scalar_t _log_add_exp_helper(const scalar_t& x, const scalar_t& y) { + // Reference : https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp + // Using the original expression: `at::_isnan(y) ? y : std::min(x, y)` causes an error in ROCM + const auto isnan_x = at::_isnan(x); + const auto isnan_y = at::_isnan(y); + scalar_t min = isnan_y ? y : (isnan_x ? x : std::min(x, y)); + scalar_t max = isnan_y ? y : (isnan_x ? 
x : std::max(x, y)); + if (min != max || ::isfinite(min)) { + // nan will be propagated here + return ::log1p(std::exp(min - max)) + max; + } else { + // special case to correctly handle infinite cases + return x; + } +} + +template +__host__ __device__ c10::complex _fast_build_exp(const c10::complex& x) { + // complex exponential function, but implemented manually to get fast compilation time + // this function only handles the case where the x is finite (not inf nor nan) + const auto xreal = std::real(x); + const auto ximag = std::imag(x); + const auto exp_x_abs = std::exp(xreal); + auto exp_x_real = exp_x_abs * std::cos(ximag); + auto exp_x_imag = exp_x_abs * std::sin(ximag); + return {exp_x_real, exp_x_imag}; +} + +template +__host__ __device__ c10::complex _fast_build_exp_inf(const c10::complex& x) { + // complex exponential function, but implemented manually to get fast compilation time + // this function only handles the case where the real part of x is infinite + const auto ximag = std::imag(x); + constexpr auto exp_x_abs = std::numeric_limits::infinity(); + if (!::isfinite(ximag)) { // add this to make consitent with std::exp(x+yi) + return {exp_x_abs, std::numeric_limits::quiet_NaN()}; + } + const auto sin = std::sin(ximag); + const auto cos = std::cos(ximag); + // special case if the angle is exactly the multiple of pi/2 + auto exp_x_real = (cos == 0) ? (scalar_t)0.0 : exp_x_abs * cos; + auto exp_x_imag = (sin == 0) ? (scalar_t)0.0 : exp_x_abs * sin; + return {exp_x_real, exp_x_imag}; +} + +template +__host__ __device__ c10::complex _log_add_exp_helper(const c10::complex& x, const c10::complex& y) { + c10::complex min = _logaddexp_minmax(x, y); + c10::complex max = _logaddexp_minmax(x, y); + scalar_t min_real = std::real(min); + scalar_t max_real = std::real(max); + + if (::isnan(min_real) || ::isnan(std::imag(min))) { + // handling the "infectious" NaNs + return {std::numeric_limits::quiet_NaN(), std::numeric_limits::quiet_NaN()}; + } + else if ((!::isfinite(min_real)) && (min_real == max_real)) { + if (min_real < 0) { + // handle the -inf case, the imaginary part here does not really matter as the exp(value) + // will be around 0.0 and the angle (i.e. the imaginary part) cannot be determined. + // It does not matter if we're taking the exp of this value + return min; + } else { + // handle the +inf case, we don't need the special precision for log1p for small values + // and to avoid producing nan in case of real(max) == real(min) == +inf + const auto exp_min = _fast_build_exp_inf(min); + const auto exp_max = _fast_build_exp_inf(max); + return ::log1p(exp_min + exp_max - 1); // log1p(x - 1) builds faster than log + } + } else { + const auto minmax = min - max; + c10::complex exp_minmax; + if (!::isfinite(minmax.real())) { + exp_minmax = minmax.real() < 0 ? 
c10::complex{0.0, 0.0} : _fast_build_exp_inf(minmax); + } else { + exp_minmax = _fast_build_exp(minmax); + } + return ::log1p(exp_minmax) + max; + } +} + +// Complex logaddexp jiterator string +const auto logaddexp_complex_string = jiterator_stringify( + template + std::complex log1p(const std::complex& z) + { + using complex_t = std::complex; + T x = z.real(); + T y = z.imag(); + T zabs = abs(z); + T theta = atan2(y, x + T(1)); + if (zabs < 0.5) { + T r = x * (T(2) + x) + y * y; + if (r == 0) { // handle underflow + return complex_t(x, theta); + } + return complex_t(T(0.5) * std::log1p(r), theta); + } else { + T z0 = std::hypot(x + 1, y); + return complex_t(log(z0), theta); + } + } + + // separated _logaddexp_minmax into 2 different functions for jiterator_string + template + std::complex logaddexp_min(const std::complex& x, const std::complex& y) { + T xr = x.real(); + T yr = y.real(); + if (isnan(yr) || isnan(y.imag())) { + return y; + } else if (isnan(xr) || isnan(x.imag())) { + return x; + } else { + return (xr < yr) ? x : y; + } + } + + template + std::complex logaddexp_max(const std::complex& x, const std::complex& y) { + T xr = x.real(); + T yr = y.real(); + if (isnan(yr) || isnan(y.imag())) { + return y; + } else if (isnan(xr) || isnan(x.imag())) { + return x; + } else { + return (xr >= yr) ? x : y; + } + } + + template + std::complex fast_build_exp(const std::complex& x) { + const auto xreal = x.real(); + const auto ximag = x.imag(); + const auto exp_x_abs = exp(xreal); + auto exp_x_real = exp_x_abs * cos(ximag); + auto exp_x_imag = exp_x_abs * sin(ximag); + return std::complex(exp_x_real, exp_x_imag); + } + + template + std::complex fast_build_exp_inf(const std::complex& x) { + using complex_t = std::complex; + const auto ximag = x.imag(); + const T exp_x_abs = INFINITY; + if (!isfinite(ximag)) { + return complex_t(exp_x_abs, NAN); + } + const auto sin_val = sin(ximag); + const auto cos_val = cos(ximag); + auto exp_x_real = (cos_val == T(0)) ? T(0) : exp_x_abs * cos_val; + auto exp_x_imag = (sin_val == T(0)) ? T(0) : exp_x_abs * sin_val; + return complex_t(exp_x_real, exp_x_imag); + } + + template + complex_t logaddexp_complex(complex_t x, complex_t y) { + using T = typename complex_t::value_type; + complex_t min_val = logaddexp_min(x, y); + complex_t max_val = logaddexp_max(x, y); + T min_real = min_val.real(); + T max_real = max_val.real(); + + if (isnan(min_real) || isnan(min_val.imag())) { + return complex_t(NAN, NAN); + } + else if ((!isfinite(min_real)) && (min_real == max_real)) { + if (min_real < T(0)) { + return min_val; + } else { + const auto exp_min = fast_build_exp_inf(min_val); + const auto exp_max = fast_build_exp_inf(max_val); + return log1p(exp_min + exp_max - complex_t(1, 0)); + } + } else { + const auto minmax = min_val - max_val; + complex_t exp_minmax; + if (!isfinite(minmax.real())) { + exp_minmax = (minmax.real() < T(0)) ? 
complex_t(0, 0) : fast_build_exp_inf(minmax); + } else { + exp_minmax = fast_build_exp(minmax); + } + return log1p(exp_minmax) + max_val; + } + } +); + +constexpr char logaddexp_complex_name[] = "logaddexp_complex"; void logaddexp_kernel_cuda(TensorIteratorBase& iter) { - AT_DISPATCH_FLOATING_TYPES_AND2( + if (at::isComplexType(iter.dtype())) { +#if AT_USE_JITERATOR() + AT_DISPATCH_COMPLEX_TYPES_AND(at::ScalarType::ComplexHalf, iter.dtype(), "logaddexp_cuda", [&]() { + jitted_gpu_kernel< + /*name=*/logaddexp_complex_name, + /*return_dtype=*/scalar_t, + /*common_dtype=*/scalar_t, + /*arity=*/2>(iter, logaddexp_complex_string); + }); +#else + AT_DISPATCH_COMPLEX_TYPES_AND(at::ScalarType::ComplexHalf, iter.dtype(), "logaddexp_cuda", [&]() { + using opmath_t = at::opmath_type; + gpu_kernel(iter, [] GPU_LAMBDA (scalar_t a_, scalar_t b_) -> scalar_t { + const auto a = static_cast(a_); + const auto b = static_cast(b_); + return static_cast(_log_add_exp_helper(a, b)); + }); + }); +#endif + } else { + AT_DISPATCH_FLOATING_TYPES_AND2( ScalarType::BFloat16, ScalarType::Half, iter.dtype(), "logaddexp_cuda", [&]() { @@ -29,6 +261,7 @@ void logaddexp_kernel_cuda(TensorIteratorBase& iter) { } }); }); + } } void logaddexp2_kernel_cuda(TensorIteratorBase& iter) { diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py index 56a4202cded3f..2b5606aec98d6 100644 --- a/test/test_binary_ufuncs.py +++ b/test/test_binary_ufuncs.py @@ -3539,7 +3539,7 @@ def _test_logaddexp(self, device, dtype, base2): if base2: ref_func = np.logaddexp2 our_func = torch.logaddexp2 - elif dtype in (torch.complex64, torch.complex128): + elif dtype in (torch.complex32, torch.complex64, torch.complex128): # numpy has not implemented logaddexp for complex def complex_logaddexp(x1, x2): x = np.stack((x1, x2)) @@ -3558,6 +3558,13 @@ def _test_helper(a, b): ref = ref_func(a.cpu().float().numpy(), b.cpu().float().numpy()) v = our_func(a, b) self.assertEqual(ref, v.float(), atol=0.01, rtol=0.01) + elif dtype == torch.complex32: + ref = ref_func( + a.cpu().to(torch.complex64).numpy(), + b.cpu().to(torch.complex64).numpy(), + ) + v = our_func(a, b) + self.assertEqual(ref, v.to(torch.complex64), atol=0.01, rtol=0.01) else: ref = ref_func(a.cpu().numpy(), b.cpu().numpy()) v = our_func(a, b) @@ -3588,12 +3595,23 @@ def _test_helper(a, b): _test_helper(a, b) @skipIfTorchDynamo() # complex infs/nans differ under Dynamo/Inductor - @dtypesIfCUDA(torch.float32, torch.float64, torch.bfloat16) + @dtypesIfCUDA( + torch.float32, + torch.float64, + torch.bfloat16, + torch.complex32, + torch.complex64, + torch.complex128, + ) @dtypes( torch.float32, torch.float64, torch.bfloat16, torch.complex64, torch.complex128 ) def test_logaddexp(self, device, dtype): - if sys.version_info >= (3, 12) and dtype in (torch.complex64, torch.complex128): + if sys.version_info >= (3, 12) and dtype in ( + torch.complex32, + torch.complex64, + torch.complex128, + ): return self.skipTest("complex flaky in 3.12") self._test_logaddexp(device, dtype, base2=False) diff --git a/test/test_linalg.py b/test/test_linalg.py index 9168964369920..7e3a1ebaa6f3a 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -10071,6 +10071,65 @@ def test_1_sized_with_0_strided(self, device, dtype): a_strided.cpu().numpy() @ b_strided.cpu().numpy()).to(device=device, dtype=dtype) self.assertEqual(expect, res) + @onlyCUDA + def test_logaddexp_cpu_vs_cuda_complex(self, device): + # test logaddexp with complex values produce the same values (up to machine precision) on cpu and CUDA. 
+ input_real = torch.tensor([0.052, -0.2115, 0.6913], dtype=torch.float64) + input_img = torch.tensor([-0.3229, -0.8374, 0.8391], dtype=torch.float64) + input_complex = torch.complex(input_real, input_img).cuda() + + other_real = torch.tensor([0.2550, 0.8769, -0.4884], dtype=torch.float64) + other_img = torch.tensor([0.6063, 0.4343, -1.4166], dtype=torch.float64) + other_complex = torch.complex(other_real, other_img).cuda() + + out_gpu = torch.logaddexp(input=input_complex, other=other_complex) + out_cpu = torch.logaddexp(input=input_complex.cpu(), other=other_complex.cpu()) + + torch.testing.assert_close(out_gpu.cpu(), out_cpu, rtol=1e-12, atol=1e-14) + + # test extreme cases (infty, -infty, and nan) are handled the same between cuda and cpu + input_complex = torch.complex(torch.tensor(float('inf')), torch.tensor(float('inf'))) + other_complex = torch.complex(torch.tensor(float('inf')), torch.tensor(float('inf'))) + out_gpu = torch.logaddexp(input=input_complex, other=other_complex) + out_cpu = torch.logaddexp(input=input_complex.cpu(), other=other_complex.cpu()) + self.assertEqual(out_gpu.cpu(), out_cpu) + + input_complex = torch.complex(torch.tensor(float('inf')), torch.tensor(float('inf'))) + other_complex = torch.complex(torch.tensor(float('inf')), torch.tensor(-float('inf'))) + out_gpu = torch.logaddexp(input=input_complex, other=other_complex) + out_cpu = torch.logaddexp(input=input_complex.cpu(), other=other_complex.cpu()) + self.assertEqual(out_gpu.cpu(), out_cpu) + + input_complex = torch.complex(torch.tensor(-float('inf')), torch.tensor(float('inf'))) + other_complex = torch.complex(torch.tensor(float('inf')), torch.tensor(float('inf'))) + out_gpu = torch.logaddexp(input=input_complex, other=other_complex) + out_cpu = torch.logaddexp(input=input_complex.cpu(), other=other_complex.cpu()) + self.assertEqual(out_gpu.cpu(), out_cpu) + + input_complex = torch.complex(torch.tensor(-float('inf')), torch.tensor(float('inf'))) + other_complex = torch.complex(torch.tensor(-float('inf')), torch.tensor(float('inf'))) + out_gpu = torch.logaddexp(input=input_complex, other=other_complex) + out_cpu = torch.logaddexp(input=input_complex.cpu(), other=other_complex.cpu()) + self.assertEqual(out_gpu.cpu(), out_cpu) + + input_complex = torch.complex(torch.tensor(-float('inf')), torch.tensor(float('inf'))) + other_complex = torch.complex(torch.tensor(-float('inf')), torch.tensor(2.)) + out_gpu = torch.logaddexp(input=input_complex, other=other_complex) + out_cpu = torch.logaddexp(input=input_complex.cpu(), other=other_complex.cpu()) + self.assertEqual(out_gpu.cpu(), out_cpu) + + input_complex = torch.complex(torch.tensor(2.), torch.tensor(float('inf'))) + other_complex = torch.complex(torch.tensor(float('inf')), torch.tensor(float('inf'))) + out_gpu = torch.logaddexp(input=input_complex, other=other_complex) + out_cpu = torch.logaddexp(input=input_complex.cpu(), other=other_complex.cpu()) + self.assertEqual(out_gpu.cpu(), out_cpu) + + input_complex = torch.complex(torch.tensor(float('nan')), torch.tensor(float('inf'))) + other_complex = torch.complex(torch.tensor(float('inf')), torch.tensor(float('inf'))) + out_gpu = torch.logaddexp(input=input_complex, other=other_complex) + out_cpu = torch.logaddexp(input=input_complex.cpu(), other=other_complex.cpu()) + self.assertEqual(out_gpu.cpu(), out_cpu) + instantiate_device_type_tests(TestLinalg, globals()) if __name__ == '__main__': diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py 
index 2f59b520a8b43..6724ab2ae739a 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -14264,15 +14264,11 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): ], ), BinaryUfuncInfo('logaddexp', dtypes=floating_and_complex_types_and(torch.bfloat16, torch.float16), - dtypesIfCUDA=floating_types_and(torch.bfloat16, torch.float16), + dtypesIfCUDA=floating_and_complex_types_and(torch.bfloat16, torch.float16, torch.complex32), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), supports_forward_ad=True, supports_fwgrad_bwgrad=True, - supports_rhs_python_scalar=False, - skips=( - # TODO: FIXME: RuntimeError: not implemented for 'ComplexFloat' - DecorateInfo(unittest.expectedFailure, 'TestBinaryUfuncs', 'test_type_promotion', device_type='cuda'), - )), + supports_rhs_python_scalar=False), OpInfo('logaddexp2', dtypes=floating_types_and(torch.bfloat16, torch.half), dtypesIfHpu=custom_types(torch.float32, torch.bfloat16), @@ -23643,10 +23639,12 @@ def sample_inputs_alias_copy(op_info, device, dtype, requires_grad, **kwargs): torch_opinfo_name="logaddexp", skips=( # failure due to mismatch in edge cases, which boils down to what torch.exp(inf + infj) should be - DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref', device_type='cpu', - dtypes=(torch.complex64, torch.complex128)), - DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_torch_fallback', device_type='cpu', - dtypes=(torch.complex64, torch.complex128)), + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref', + dtypes=(torch.complex32, torch.complex64, torch.complex128)), + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_torch_fallback', + dtypes=(torch.complex32, torch.complex64, torch.complex128)), + DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_python_ref_executor', + dtypes=(torch.complex32, torch.complex64, torch.complex128)), ), ), PythonRefInfo( From a892f76d062ac424f9e1a7506543edf3fcc74d50 Mon Sep 17 00:00:00 2001 From: Isalia20 Date: Mon, 17 Nov 2025 20:44:58 +0000 Subject: [PATCH 40/47] [MPS] mm out sparse (#167908) Enables mm out for sparse tensors Pull Request resolved: https://github.com/pytorch/pytorch/pull/167908 Approved by: https://github.com/malfet --- aten/src/ATen/native/native_functions.yaml | 2 +- test/test_sparse.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 98873abe0c499..9a1c7c790afaa 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -4225,7 +4225,7 @@ MTIA: mm_out_mtia MPS: mm_out_mps XPU: mm_out_xpu - SparseCPU, SparseCUDA: _sparse_mm_out + SparseCPU, SparseCUDA, SparseMPS: _sparse_mm_out SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _sparse_csr_mm_out - func: mm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor diff --git a/test/test_sparse.py b/test/test_sparse.py index e44e0d873553a..779ce21484d20 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -1391,9 +1391,9 @@ def run_test(nnz, size): # case nnz > size[d] run_test(tlen, tlen // 2) - @onlyCPU @coalescedonoff @dtypes(torch.double, torch.cdouble) + @dtypesIfMPS(torch.float32, torch.complex64) def test_mm(self, device, dtype, coalesced): def test_shape(di, dj, dk, nnz): x, _, _ = self._gen_sparse(2, nnz, [di, dj], dtype, device, coalesced) From 
927899dc05104a4c451f83d6ffaee735b0efeffd Mon Sep 17 00:00:00 2001 From: Natalia Gimelshein Date: Mon, 17 Nov 2025 20:50:27 +0000 Subject: [PATCH 41/47] fixes a few issues with out_dtype overload for addmm/baddbmm (#167931) Per title 1) allows `self` argument to have the same precision as output 2) fixes broadcasting of `self` argument - it used to allocate incorrectly sized output and resize it later, causing a warning, in addmm, and error out in baddbmm 3) fixes `out` handling for `out` baddbmm overload, where the implementation used uninitialized memory in `out` instead of copying `self` to out. 4) removes couple unneeded iife patterns Pull Request resolved: https://github.com/pytorch/pytorch/pull/167931 Approved by: https://github.com/PaulZhang12, https://github.com/drisspg, https://github.com/malfet --- aten/src/ATen/native/cuda/Blas.cpp | 83 ++++++++++++++---------------- test/test_matmul_cuda.py | 28 +++++++--- 2 files changed, 59 insertions(+), 52 deletions(-) diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index 2754d70cac013..75a4d357a1c0b 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -296,7 +296,7 @@ template bool launchGemmAndBiasCublasLt( // args contains result which is modified cublasCommonArgs& args, - const Tensor& self, + const std::optional& self, const Scalar& alpha, Activation activation = Activation::None ) { @@ -304,12 +304,8 @@ bool launchGemmAndBiasCublasLt( // or when it can be squeezed to 1D. // self_ptr == nullptr implies ignore bias epilogue // and use standard gemm-like API. - const auto* self_ptr = [&]() -> auto { - if (self.dim() == 1 || self.squeeze().dim() == 1) { - return self.const_data_ptr(); - } - return static_cast(nullptr); - }(); + const auto* self_ptr = self.has_value() ? 
self.value().const_data_ptr() : static_cast(nullptr); + const auto tuning_ctx = at::cuda::tunable::getTuningContext(); if (tuning_ctx->IsTunableOpEnabled()) { @@ -392,35 +388,30 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma bool disable_addmm_cuda_lt = persistent_disable_addmm_cuda_lt || disable_addmm_cuda_lt_override; #ifdef USE_ROCM // Conditioned on the device index, which is not persistent - disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device()) || disable_addmm_cuda_lt; + disable_addmm_cuda_lt = disable_addmm_cuda_lt || isGloballyDisabledAddmmCudaLt(self.device()); #endif // Condition on the input - disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation) || disable_addmm_cuda_lt; - // } + disable_addmm_cuda_lt = disable_addmm_cuda_lt || !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation); at::ScalarType scalar_type = mat1.scalar_type(); bool is_float_output_with_half_input = (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::BFloat16) && result.scalar_type() == at::ScalarType::Float; + #ifdef USE_ROCM + disable_addmm_cuda_lt = disable_addmm_cuda_lt || is_float_output_with_half_input; + #endif + + bool use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt; + // for float output with half input cublasLT with bias produces wrong results + use_bias_ptr_lt &= !is_float_output_with_half_input; + // Handle result/self shapes if (!result.is_same(self)) { at::native::resize_output(result, {mat1.sizes()[0], mat2.sizes()[1]}); - // We use bias ptr in the Lt path only when bias is 1D - const auto use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt; - const auto self_maybe_expanded = [&]() -> c10::MaybeOwned { - if (!use_bias_ptr_lt) { - // We do expand self even before - // check for beta != 0.0 to make sure that - // test_sparse_csr.py::TestSparseCSRCUDA::test_addmm_errors_* - // runs green. - return expand_size(self, result.sizes(), "addmm"); - } - return c10::MaybeOwned::borrowed(self); - }(); - // We do not copy bias only when we need the bias ptr + // We do not copy bias only when we need the bias ptr if (beta.toComplexDouble() != 0.0 && !use_bias_ptr_lt) { // NOTE: self should broadcast over result - at::native::copy_(result, *self_maybe_expanded); + at::native::copy_(result, *expand_size(self, result.sizes(), "addmm")); } } @@ -468,7 +459,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma scalar_type, "addmm_cuda_lt", [&] { - lt_success = launchGemmAndBiasCublasLt(args, self, alpha, activation); + lt_success = launchGemmAndBiasCublasLt(args, use_bias_ptr_lt ? std::make_optional(self) : std::nullopt, alpha, activation); } ); #endif @@ -480,7 +471,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma scalar_type, "addmm_cuda_lt", [&] { - lt_success = launchGemmAndBiasCublasLt(args, self, alpha, activation); + lt_success = launchGemmAndBiasCublasLt(args, use_bias_ptr_lt ? 
std::make_optional(self) : std::nullopt, alpha, activation); } ); } // end is_float_output_with_half_input @@ -936,7 +927,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) { return _int_mm_out_cuda(self, mat2, result); } -static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, bool is_bmm, const std::optional& self_baddbmm = std::nullopt) { +static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, const std::optional& self_baddbmm = std::nullopt) { // ref ATen/native/LinearAlgebra.cpp common_checks_baddbmm_bmm TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor"); TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor"); @@ -960,7 +951,7 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat (out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)), "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs"); - if (!is_bmm && self_baddbmm.has_value()) { + if (self_baddbmm.has_value()) { const auto& self = self_baddbmm.value(); TORCH_CHECK(self.dim() == 3, "self must be a 3D tensor"); TORCH_CHECK(self.sizes() == output_size, "self must have the same shape as the output"); @@ -968,15 +959,12 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat } Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) { - IntArrayRef batch1_sizes = batch1.sizes(); - IntArrayRef batch2_sizes = batch2.sizes(); - - Tensor out = at::empty({batch1_sizes[0], batch1_sizes[1], batch2_sizes[2]}, batch1.options().dtype(out_dtype)); + Tensor out = at::empty({batch1.size(0), batch1.size(1), batch2.size(2)}, batch1.options().dtype(out_dtype)); return _bmm_out_dtype_cuda(batch1, batch2, out_dtype, out); } Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, Tensor &out) { - baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype, true); + baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype); Scalar beta(0.0); Scalar alpha(1.0); { @@ -988,14 +976,16 @@ Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at } Tensor _baddbmm_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) { - // We need to copy the tensor - Tensor out = self.clone().to(self.options().dtype(out_dtype)); - - return _baddbmm_out_dtype_cuda(out, batch1, batch2, out_dtype, beta, alpha, out); + TORCH_CHECK(self.scalar_type() == out_dtype || self.scalar_type() == batch1.dtype(), + "self dtype must match either out_dtype or batch1 dtype"); + Tensor out = at::empty({batch1.size(0), batch1.size(1), batch2.size(2)}, batch1.options().dtype(out_dtype)); + return _baddbmm_out_dtype_cuda(self, batch1, batch2, out_dtype, beta, alpha, out); } Tensor& _baddbmm_out_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) { - baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, false, self); + baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, out); + // We need to copy the tensor + out.copy_(self); { NoNamesGuard guard; 
baddbmm_out_cuda_impl(out, out, batch1, batch2, beta, alpha); @@ -1030,24 +1020,27 @@ Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::Sca } Tensor _addmm_dtype_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) { - Tensor result = at::empty(self.sizes(), self.options().dtype(out_dtype)); + TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor"); + TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor"); + Tensor result = at::empty({mat1.size(0), mat2.size(1)}, self.options().dtype(out_dtype)); return _addmm_dtype_out_cuda(self, mat1, mat2, out_dtype, beta, alpha, result); } Tensor& _addmm_dtype_out_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) { - TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "self and mat2 must have the same dtype, but got ", self.scalar_type(), " and ", mat2.scalar_type()); - TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type()); +// repeat dimensionality checks for direct calls to `out` overload TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor"); TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor"); TORCH_CHECK( mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (", mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")"); + TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type()); + TORCH_CHECK(out_dtype == mat1.scalar_type() || + (out_dtype == at::ScalarType::Float && (mat1.scalar_type() == at::ScalarType::Half || mat1.scalar_type() == at::ScalarType::BFloat16)), + "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs"); TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); - TORCH_CHECK(out_dtype == self.scalar_type() || - (out_dtype == at::ScalarType::Float && (self.scalar_type() == at::ScalarType::Half || self.scalar_type() == at::ScalarType::BFloat16)), - "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs"); - TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor"); + TORCH_CHECK(out_dtype == self.scalar_type() || self.scalar_type() == mat1.scalar_type(), + "self dtype must match either out_dtype or mat1 dtype"); addmm_out_cuda_impl(out, self, mat1, mat2, beta, alpha); diff --git a/test/test_matmul_cuda.py b/test/test_matmul_cuda.py index a8e9be4c972a1..7a6585f3b63a8 100644 --- a/test/test_matmul_cuda.py +++ b/test/test_matmul_cuda.py @@ -747,11 +747,13 @@ def create_inputs(B=None): @onlyCUDA @parametrize("input_dtype", [torch.float32, torch.float16, torch.bfloat16]) @parametrize("M", [1, 32, 64]) - @parametrize("N", [1, 32, 64]) + @parametrize("N", [1, 64]) @parametrize("K", [1, 32, 64]) - @parametrize("batch_size", [None, 1, 32]) + @parametrize("batch_size", [None, 1]) + @parametrize("broadcast_self", [False, True]) + @parametrize("high_precision_self", [False, True]) @parametrize("backend", ["cublas", "cublaslt"]) - def test_addmm_baddmm_dtype_overload(self, input_dtype, M, N, K, batch_size, backend): + 
def test_addmm_baddmm_dtype_overload(self, input_dtype, M, N, K, batch_size, broadcast_self, high_precision_self, backend): if torch.version.hip: msg = "accuracy regression in hipblas and hipblaslt in ROCm 7.0 for certain shapes" if input_dtype == torch.bfloat16 and N == 1 and K == 32 and batch_size: @@ -766,19 +768,21 @@ def test_addmm_baddmm_dtype_overload(self, input_dtype, M, N, K, batch_size, bac device = "cuda" dtype = input_dtype with blas_library_context(backend): - def create_inputs(B=None): + def create_inputs(B, broadcast_self): if B is None: a = torch.randn(M, K, device=device, dtype=dtype) b = torch.randn(K, N, device=device, dtype=dtype) - c = torch.randn(M, N, device=device, dtype=dtype) + c_shape = (M, N) if not broadcast_self else (N) + c = torch.randn(c_shape, device=device, dtype=dtype) else: a = torch.randn(B, M, K, device=device, dtype=dtype) b = torch.randn(B, K, N, device=device, dtype=dtype) - c = torch.randn(B, M, N, device=device, dtype=dtype) + c_shape = (B, M, N) if not broadcast_self else (N) + c = torch.randn(c_shape, device=device, dtype=dtype) return a, b, c - a, b, c = create_inputs(batch_size) + a, b, c = create_inputs(batch_size, broadcast_self) a_fp32, b_fp32, c_fp32 = a.to(torch.float32), b.to(torch.float32), c.to(torch.float32) @@ -800,21 +804,31 @@ def create_inputs(B=None): with self.assertRaises(RuntimeError): torch.addmm(c, a, b, out_dtype=output_dtype) else: + if c.dtype != output_dtype and high_precision_self: + c = c.to(output_dtype) if batch_size: out = torch.baddbmm(c, a, b, out_dtype=output_dtype) if output_dtype == torch.float32: baseline = torch.baddbmm(c_fp32, a_fp32, b_fp32) else: baseline = torch.baddbmm(c, a, b) + # test out variant + out_ten = torch.full_like(out, float("nan")) + torch.baddbmm(c, a, b, out_dtype=output_dtype, out=out_ten) else: out = torch.addmm(c, a, b, out_dtype=output_dtype) if output_dtype == torch.float32: baseline = torch.addmm(c_fp32, a_fp32, b_fp32) else: baseline = torch.addmm(c, a, b) + # test out variant + out_ten = torch.full_like(out, float("nan")) + torch.addmm(c, a, b, out_dtype=output_dtype, out=out_ten) self.assertEqual(out.dtype, output_dtype) + self.assertEqual(out_ten.dtype, output_dtype) torch.testing.assert_close(out, baseline, atol=1e-3, rtol=1e-3) + torch.testing.assert_close(out_ten, out, atol=0, rtol=0) @onlyCUDA From 9d8ceaa36f085410f3712a6101efbb5b423f7da0 Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Mon, 17 Nov 2025 21:06:26 +0000 Subject: [PATCH 42/47] Revert "[ARM] Improve LLM performance & mem usage using int4-bf16 KleidiAI kernels (#158250)" This reverts commit 53809f964083a9e89182c2db7638fd44f3a6e304. 
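For context, the dynamic int4 path that this revert narrows back to fp32 inputs is exercised roughly as follows; this is a minimal sketch mirroring the remaining fp32 inductor tests below, not part of the patch, and the quantizer helper's import path is an assumption:

```
import torch
# Assumed helper location; the inductor tests below use this symmetric 4-bit
# group quantizer to produce packed uint8 weights plus per-group scales.
from torch.testing._internal.common_quantization import _group_quantize_tensor_symmetric

m, k, n = 32, 128, 128
q_group = k  # channelwise case; per the kernel notes, channelwise expects fp32 scales
a = torch.rand(m, k, dtype=torch.float32)  # fp32 activations (the path kept by this revert)
b = torch.rand(k, n, dtype=torch.float32)  # weights to be quantized to int4

b_uint8, b_scales_and_zeros = _group_quantize_tensor_symmetric(b, n_bit=4, groupsize=q_group)
b_int4pack = torch._dyn_quant_pack_4bit_weight(
    b_uint8, b_scales_and_zeros.to(torch.float), None, q_group, k, n
)
# Dynamic per-row input quantization followed by the int4 weight-only matmul.
out = torch.ops.aten._dyn_quant_matmul_4bit(a, b_int4pack, q_group, k, n)
```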
Reverted https://github.com/pytorch/pytorch/pull/158250 on behalf of https://github.com/zou3519 due to reverting to see if it fixes inductor halide test failure ([comment](https://github.com/pytorch/pytorch/pull/158250#issuecomment-3543840277)) --- aten/src/ATen/native/LinearAlgebra.cpp | 4 +- aten/src/ATen/native/cpu/int4mm_kernel.cpp | 343 +++++------------- aten/src/ATen/native/kleidiai/kai_kernels.cpp | 200 ++-------- aten/src/ATen/native/kleidiai/kai_kernels.h | 3 +- aten/src/ATen/native/kleidiai/kai_pack.h | 9 +- .../native/kleidiai/kai_ukernel_interface.cpp | 34 -- .../native/kleidiai/kai_ukernel_interface.h | 89 +---- test/inductor/test_torchinductor.py | 106 +----- torch/_meta_registrations.py | 11 +- 9 files changed, 137 insertions(+), 662 deletions(-) diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 934ecb99d3382..07bdc19ec8ff7 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -3541,9 +3541,9 @@ Tensor _dyn_quant_matmul_4bit_cpu( const int64_t out_features) { auto M = inp.size(0); TORCH_CHECK( - inp.dtype() == kFloat || (inp.dtype() == kBFloat16 && block_size == in_features), + inp.dtype() == kFloat, __func__, - " : expect input to be float32 or bfloat16 tensor."); + " : expect input to be 32-bit float tensor."); TORCH_CHECK( block_size == in_features || (!(block_size % 32) && !(in_features % block_size)), diff --git a/aten/src/ATen/native/cpu/int4mm_kernel.cpp b/aten/src/ATen/native/cpu/int4mm_kernel.cpp index 1ffaa7bcd90b7..33aae4fbf27a5 100644 --- a/aten/src/ATen/native/cpu/int4mm_kernel.cpp +++ b/aten/src/ATen/native/cpu/int4mm_kernel.cpp @@ -8,7 +8,6 @@ #include #include #include -#include #include #include @@ -794,139 +793,6 @@ bool can_use_kleidiai( } #endif -static void ref_dyn_quant_matmul_4bit_channelwise_kernel_bf16( - size_t m, - size_t n, - size_t k, - const uint16_t* lhs_bf16, - const uint8_t* rhs_qs4cx, - const float* rhs_scales, - uint16_t* dst_bf16, - float scalar_min, - float scalar_max, - const float* bias) { - // Roundup lambda for internal stride calculations - auto roundup = [](size_t a, size_t b) { return ((a + b - 1) / b) * b; }; - - // Cast bfloat16 to float32 inline - auto cast_bf16_to_f32 = [](uint16_t bf16_val) { - uint32_t tmp = static_cast(bf16_val) << 16; - float f; - std::memcpy(&f, &tmp, sizeof(f)); - return f; - }; - - // Cast float32 to bfloat16 inline - auto cast_f32_to_bf16 = [](float f) { - uint32_t bits; - std::memcpy(&bits, &f, sizeof(bits)); - return static_cast(bits >> 16); - }; - - // Quantization pack lambda (channelwise QA8DX) - auto quant_pack_8bit_channelwise = - [&](size_t M, size_t K, const uint16_t* src_bf16, int8_t* dst_qa8dx) { - constexpr int8_t kI8Min = std::numeric_limits::lowest(); - constexpr int8_t kI8Max = std::numeric_limits::max(); - - const size_t dst_stride = - K * sizeof(int8_t) + sizeof(float) + sizeof(int32_t); - for (size_t i = 0; i < M; ++i) { - const uint16_t* row_ptr = src_bf16 + i * K; - // find min/max - float mn = FLT_MAX, mx = -FLT_MAX; - for (size_t j = 0; j < K; ++j) { - float v = cast_bf16_to_f32(row_ptr[j]); - mn = std::min(mn, v); - mx = std::max(mx, v); - } - float rmin = std::min(0.0f, mn); - float rmax = std::max(0.0f, mx); - constexpr float qmin = static_cast(kI8Min); - constexpr float qmax = static_cast(kI8Max); - float scale = (rmin == rmax) ? 1.f : (qmax - qmin) / (rmax - rmin); - float recip = scale ? 
1.0f / scale : 0.0f; - int32_t zp; - float des_min = rmin * scale; - float des_max = rmax * scale; - float err_min = qmin + des_min; - float err_max = qmax + des_max; - float zp_f = - (err_min + err_max) > 0 ? qmin - des_min : qmax - des_max; - zp_f = std::clamp(zp_f, qmin, qmax); - zp = std::lrintf(zp_f); - int8_t* out_ptr = dst_qa8dx + i * dst_stride; - // store header - *reinterpret_cast(out_ptr) = recip; - *reinterpret_cast(out_ptr + sizeof(float)) = -zp; - out_ptr += sizeof(float) + sizeof(int32_t); - // quantize - for (size_t j = 0; j < K; ++j) { - float v = cast_bf16_to_f32(row_ptr[j]); - int32_t q = static_cast(std::round(v * scale)) + zp; - q = std::clamp( - q, static_cast(kI8Min), static_cast(kI8Max)); - *out_ptr++ = static_cast(q); - } - } - }; - - // MatMul lambda (MXN x MXK -> MNXK BF16) - auto matmul_kernel = [&](size_t M, - size_t N, - size_t K, - const int8_t* lhs, - const uint8_t* rhs, - const float* scales, - uint16_t* dst, - float lo, - float hi) { - const size_t lhs_stride = - K * sizeof(int8_t) + sizeof(float) + sizeof(int32_t); - const size_t rhs_stride = roundup(K, 2) / 2; - for (size_t i = 0; i < M; ++i) { - const int8_t* lhs_row = lhs + i * lhs_stride; - for (size_t j = 0; j < N; ++j) { - int32_t acc = 0; - const int8_t* lptr = lhs_row; - const uint8_t* rptr = rhs + j * rhs_stride; - float lhs_scale = *reinterpret_cast(lptr); - int32_t lhs_off = - *reinterpret_cast(lptr + sizeof(float)); - lptr += sizeof(float) + sizeof(int32_t); - for (size_t t = 0; t < K; ++t) { - int32_t lv = static_cast(lptr[t]); - uint8_t bv = rptr[t / 2]; - int32_t rv = ((t & 1) == 0) ? (static_cast(bv & 0xF) - 8) - : (static_cast(bv >> 4) - 8); - acc += lv * rv + lhs_off * rv; - } - float res = static_cast(acc) * scales[j] * lhs_scale; - if (bias) { - res += bias[j]; - } - res = std::clamp(res, lo, hi); - *dst++ = cast_f32_to_bf16(res); - } - } - }; - - // allocate and run - std::unique_ptr packed( - new int8_t[m * (k * sizeof(int8_t) + sizeof(float) + sizeof(int32_t))]); - quant_pack_8bit_channelwise(m, k, lhs_bf16, packed.get()); - matmul_kernel( - m, - n, - k, - packed.get(), - rhs_qs4cx, - rhs_scales, - dst_bf16, - scalar_min, - scalar_max); -} - /** * The Int4 quantized weights must be represented as a uint8 tensor * For matrix multiplication with a weight shape of (N x K) @@ -953,21 +819,21 @@ void dyn_quant_pack_4bit_weight_kernel( #if AT_KLEIDIAI_ENABLED() if (can_use_kleidiai(scales_zeros, K, block_size)) { const int64_t weight_packed_size = - kleidiai::kai_pack_rhs_int4_size(N, K, block_size, weights.scalar_type()); + kleidiai::kai_pack_rhs_int4_size(N, K, block_size); packed_weights.resize_({weight_packed_size}); kleidiai::kai_pack_int4_rhs( packed_weights, weights, scales_zeros, bias, N, K, block_size); } else #endif { + TORCH_CHECK( + bias.has_value() == 0, + __func__, + " : Bias is unsupported in reference implementation"); packed_weights = packed_weights.to(kFloat); - auto weight_reshaped = weights.reshape({-1}).to(kFloat); - auto scales_zeros_reshaped = scales_zeros.reshape({-1}).to(kFloat); - std::vector tensors_to_cat = {weight_reshaped, scales_zeros_reshaped}; - if (bias.has_value()) { - tensors_to_cat.push_back(bias.value().view({-1}).to(kFloat)); - } - auto res = at::cat(tensors_to_cat, 0); + auto weight_reshaped = weights.view({-1}).to(kFloat); + auto scales_zeros_reshaped = scales_zeros.view({-1}).to(kFloat); + auto res = at::cat({weight_reshaped, scales_zeros_reshaped}, 0); packed_weights.resize_(res.sizes()).copy_(res); } } @@ -981,8 +847,7 @@ void 
ref_dyn_quant_matmul_4bit_channelwise_kernel( const float* rhs_scales_f32, float* dst_f32, float scalar_min, - float scalar_max, - const float* bias) { + float scalar_max) { const size_t input_size_8bit = m * (k + sizeof(int32_t) + sizeof(float)); auto lhs_qa8dx_buffer = std::make_unique(input_size_8bit); @@ -992,9 +857,6 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel( // required format for matmul auto input_quant_pack_8bit_channelwise = [&](size_t m, size_t k, const float* lhs_f32, int8_t* lhs_qa8dx) { - constexpr int8_t kI8Min = std::numeric_limits::lowest(); - constexpr int8_t kI8Max = std::numeric_limits::max(); - const size_t dst_stride = (k * sizeof(int8_t) + sizeof(float) + sizeof(int32_t)); @@ -1015,8 +877,8 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel( } // Maximum/minimum int8 values - constexpr float qmin = static_cast(kI8Min); - constexpr float qmax = static_cast(kI8Max); + const float qmin = (float)INT8_MIN; + const float qmax = (float)INT8_MAX; const float rmin0 = std::min(0.0f, min0); const float rmax0 = std::max(0.0f, max0); @@ -1042,7 +904,7 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel( zero_point0 = std::min(zero_point0, qmax); // Round to nearest integer - const int32_t nudged_zero_point0 = std::lrintf(zero_point0); + const int32_t nudged_zero_point0 = lrintf(zero_point0); int8_t* dst_ptr = lhs_qa8dx + m_idx * dst_stride; @@ -1060,8 +922,8 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel( int32_t v0_s32 = (int32_t)(std::round(src0_0 * scale0)); v0_s32 = v0_s32 + nudged_zero_point0; - v0_s32 = std::max(v0_s32, static_cast(kI8Min)); - v0_s32 = std::min(v0_s32, static_cast(kI8Max)); + v0_s32 = std::max(v0_s32, static_cast(INT8_MIN)); + v0_s32 = std::min(v0_s32, static_cast(INT8_MAX)); dst_ptr[0] = (int8_t)v0_s32; dst_ptr += sizeof(int8_t); } @@ -1125,10 +987,6 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel( main_acc = main_acc * lhs_scale; - if (bias) { - main_acc += bias[n_idx]; - } - // Clamp (min-max) operation main_acc = std::max(main_acc, scalar_min); main_acc = std::min(main_acc, scalar_max); @@ -1149,16 +1007,12 @@ void ref_dyn_quant_matmul_4bit_groupwise_kernel( const float* rhs_scales_fp32, float* dst_f32, float scalar_min, - float scalar_max, - const float* bias) { + float scalar_max) { // Lambda for LHS quantization auto lhs_quant_pack = [&](size_t m, size_t k, const float* lhs_f32, int8_t* lhs_qa8dx) { - constexpr int8_t kI8Min = std::numeric_limits::lowest(); - constexpr int8_t kI8Max = std::numeric_limits::max(); - const size_t dst_stride = (k * sizeof(int8_t) + sizeof(float) + sizeof(int32_t)); @@ -1174,8 +1028,8 @@ void ref_dyn_quant_matmul_4bit_groupwise_kernel( min0 = std::min(src0_0, min0); } - constexpr float qmin = static_cast(kI8Min); - constexpr float qmax = static_cast(kI8Max); + const float qmin = (float)INT8_MIN; + const float qmax = (float)INT8_MAX; const float rmin0 = std::min(0.0f, min0); const float rmax0 = std::max(0.0f, max0); @@ -1192,7 +1046,7 @@ void ref_dyn_quant_matmul_4bit_groupwise_kernel( zero_point0 = std::max(zero_point0, qmin); zero_point0 = std::min(zero_point0, qmax); - const int32_t nudged_zero_point0 = std::lrintf(zero_point0); + const int32_t nudged_zero_point0 = lrintf(zero_point0); int8_t* dst_ptr = lhs_qa8dx + row_idx * dst_stride; @@ -1205,8 +1059,9 @@ void ref_dyn_quant_matmul_4bit_groupwise_kernel( const float src0_0 = src_ptr[k_idx]; int32_t v0_s32 = (int32_t)(std::round(src0_0 * scale0)); v0_s32 = std::max( - std::min(v0_s32 + nudged_zero_point0, static_cast(kI8Max)), - 
static_cast(kI8Min)); + std::min( + v0_s32 + nudged_zero_point0, static_cast(INT8_MAX)), + static_cast(INT8_MIN)); dst_ptr[0] = (int8_t)v0_s32; dst_ptr += sizeof(int8_t); } @@ -1263,11 +1118,6 @@ void ref_dyn_quant_matmul_4bit_groupwise_kernel( } main_acc = main_acc * lhs_scale; - - if (bias) { - main_acc += bias[col_idx]; - } - main_acc = std::max(main_acc, scalar_min); main_acc = std::min(main_acc, scalar_max); @@ -1278,27 +1128,28 @@ void ref_dyn_quant_matmul_4bit_groupwise_kernel( } /** - * Dynamic INT4 weight-only MatMul with per-row input quantization. - * - * Execution Flow: - * - * (INT4 Weights + FP Scales [+ optional Bias]) - * - * Input (FP32 or BF16) Packed Weight Buffer - * | | - * Row-wise Quantization (INT8) | - * | | - * INT8 Input Activation INT4 Quantized Weights + Scales - * \ / - * \ / - * Quantized Matrix Multiply - * | - * Output Tensor (BF16 or FP32) - * - * Notes: - * - Groupwise kernels expect BF16 scales - * - Channelwise kernels expect FP32 scales - * - Bias is currently unsupported in fallback path + * Dynamic Input Quant 4 bit weights matmul execution flow + (INT4 Weights + FP scales + FP32 Bias) + FP32 Input Packed Buffer + | | + Quantize Cast + to INT8 to INT8 + | | + v v + INT8 Input INT8 Weights + \ / + \ / + \ / + INT8 Matrix Multiplication + | + v + FP32 Dequantized and Accumulate in FP32 + | + v + FP32 Final Output + + * The Groupwise kernel requires BFloat16 Scales and Channelwise kernel requires + * Float32 Scales. If not provided, we will use fallback implementation. */ void dyn_quant_matmul_4bit_kernel( const Tensor& output, @@ -1310,75 +1161,65 @@ void dyn_quant_matmul_4bit_kernel( const int64_t block_size) { #if AT_KLEIDIAI_ENABLED() const int64_t weight_packed_size = - kleidiai::kai_pack_rhs_int4_size(N, K, block_size, inp.scalar_type()); + kleidiai::kai_pack_rhs_int4_size(N, K, block_size); if (weight_packed_size == packed_weights.numel()) { // KleidiAI interface internally handles the Channelwise and groupwise // distinction - kleidiai::kai_quant_pack_lhs_int4_mm(output, inp, packed_weights, M, N, K, block_size); + kleidiai::kai_quant_pack_lhs_int4_mm( + output, inp, packed_weights, M, N, K, block_size); } else #endif { - { - void* input = inp.data_ptr(); - void* dst = output.data_ptr(); - - // Extract weights, sclaes and biases form from packed tensor - const int weights_elements = N * K / 2; - const int scale_elements = N * (K / block_size); - TORCH_CHECK(packed_weights.numel() >= (weights_elements + scale_elements), "Invalid packed weight tensor size"); - - auto extracted_weights = packed_weights.narrow(0, 0, weights_elements).to(kByte); - auto extracted_scales_and_bias = packed_weights.narrow(0, weights_elements, packed_weights.size(0) - weights_elements).to(kFloat); - auto float32_scales = extracted_scales_and_bias.narrow(0, 0, scale_elements); - - int bias_elements = packed_weights.numel() - (weights_elements + scale_elements); - float* weight_scales = float32_scales.data_ptr(); - - void* bias_data = nullptr; - if (bias_elements) { - auto float32_bias = extracted_scales_and_bias.narrow(0, scale_elements, bias_elements); - TORCH_CHECK(float32_bias.size(0) == N, "Expected bias length to match output dimension"); - bias_data = float32_bias.data_ptr(); - - } - // 2 elements of 4 bit weights are packed into 1 uint8 packet - uint8_t* weights_4bit = reinterpret_cast(extracted_weights.data_ptr()); - - // Dispatch to reference kernels - if (inp.scalar_type() == at::kBFloat16) { - // BF16 input, BF16 output - constexpr float BF16_MAX = 
3.38953139e+38f; - constexpr float BF16_MIN = -BF16_MAX; - if (block_size == K) { - ref_dyn_quant_matmul_4bit_channelwise_kernel_bf16( - M, N, K, - (uint16_t*)input, weights_4bit, weight_scales, - (uint16_t*)dst, BF16_MIN, BF16_MAX, (float*)bias_data); - } else { - TORCH_CHECK(false, "Unsupported block size for BF16 fallback"); - } - } else if (inp.scalar_type() == at::kFloat) { - // FP32 input, FP32 output - if (block_size == K) { - ref_dyn_quant_matmul_4bit_channelwise_kernel( - M, N, K, - (float*)input, weights_4bit, weight_scales, - (float*)dst, -FLT_MAX, FLT_MAX, (float*)bias_data); - } else if (!(block_size % 32) && !(K % block_size)) { - ref_dyn_quant_matmul_4bit_groupwise_kernel( - M, N, K, block_size, - (float*)input, weights_4bit, weight_scales, - (float*)dst, -FLT_MAX, FLT_MAX, (float*)bias_data); - } else { - TORCH_CHECK(false, "Unsupported block size for FP32 fallback"); - } + float* lhs_f32 = reinterpret_cast(inp.data_ptr()); + const auto weights_size = N * K / 2; + // The weights needs to be in uint8_t data type after quantization + auto extracted_weights = + (packed_weights.narrow(0, 0, weights_size)).to(kByte); + auto float32_scales = + (packed_weights.narrow( + 0, weights_size, packed_weights.size(0) - weights_size)) + .to(kFloat); + uint8_t* rhs_4bit = + reinterpret_cast(extracted_weights.data_ptr()); + float* rhs_scales_f32 = reinterpret_cast(float32_scales.data_ptr()); + float* dst_f32 = reinterpret_cast(output.data_ptr()); + if (block_size == K) { + ref_dyn_quant_matmul_4bit_channelwise_kernel( + M, + N, + K, + lhs_f32, + rhs_4bit, + rhs_scales_f32, + dst_f32, + -FLT_MAX, + FLT_MAX); + } else if (!(block_size % 32) && !(K % block_size)) { + ref_dyn_quant_matmul_4bit_groupwise_kernel( + M, + N, + K, + block_size, + lhs_f32, + rhs_4bit, + rhs_scales_f32, + dst_f32, + -FLT_MAX, + FLT_MAX); } else { - TORCH_CHECK(false, "Unsupported input/output dtype combination for int4mm kernel"); + TORCH_CHECK( + block_size == K || (!(block_size % 32) && !(K % block_size)), + __func__, + ": Group size should be multiple 32 or in_features [", + K, + "]. 
Provided ", + block_size); } + } } -} + } // anonymous namespace -} + ALSO_REGISTER_AVX512_DISPATCH(weight_to_int4pack_stub, &weight_to_int4pack_kernel) ALSO_REGISTER_AVX512_DISPATCH(int4pack_mm_stub, &int4pack_mm_kernel) REGISTER_DISPATCH(dyn_quant_pack_4bit_weight_stub, &dyn_quant_pack_4bit_weight_kernel) diff --git a/aten/src/ATen/native/kleidiai/kai_kernels.cpp b/aten/src/ATen/native/kleidiai/kai_kernels.cpp index 1313f98f90109..ce0f10bf6df1f 100644 --- a/aten/src/ATen/native/kleidiai/kai_kernels.cpp +++ b/aten/src/ATen/native/kleidiai/kai_kernels.cpp @@ -21,27 +21,18 @@ void kai_pack_int4_rhs( const int64_t n, const int64_t k, const int64_t bl) { + // Prefer Channelwise kernel over Groupwise kernel for conflicting cases if (bl == k) { // Channelwise - if (weight.scalar_type() == at::kBFloat16) { - auto kernel_packet = kai_select_bf16_channelwise_matmul_ukernel( - kai_kernel_id:: - matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod); - auto& params = kernel_packet.rhs_pack_params; - params.lhs_zero_point = 1; - params.rhs_zero_point = 8; - kai_pack_rhs_channelwise_int4( - kernel_packet, weight_packed, weight, scales, bias, n, k); - } else { - auto kernel_packet = kai_select_channelwise_matmul_ukernel( - kai_kernel_id:: - matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod); - auto& params = kernel_packet.rhs_pack_params; - params.lhs_zero_point = 1; - params.rhs_zero_point = 8; - kai_pack_rhs_channelwise_int4( - kernel_packet, weight_packed, weight, scales, bias, n, k); - } + auto kernel_packet = kai_select_channelwise_matmul_ukernel( + kai_kernel_id:: + matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod); + auto& params = kernel_packet.rhs_pack_params; + params.lhs_zero_point = 1; + params.rhs_zero_point = 8; + + kai_pack_rhs_channelwise_int4( + kernel_packet, weight_packed, weight, scales, bias, n, k); } else if (!(bl % 32) && !(k % bl)) { // Groupwise auto kernel_packet = kai_select_groupwise_matmul_ukernel( @@ -72,29 +63,19 @@ void kai_pack_int4_rhs( size_t kai_pack_rhs_int4_size( const int64_t n, const int64_t k, - const int64_t bl, - at::ScalarType tensor_dtype) { + const int64_t bl) { size_t packed_size = n * k; + // Prefer Channelwise kernel over Groupwise kernel for conflicting cases if (bl == k) { - if (tensor_dtype == at::kBFloat16) { - auto kernel_packet = kai_select_bf16_channelwise_matmul_ukernel( - kai_kernel_id:: - matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod); - const auto& ukernel = kernel_packet.ukernel; - const size_t nr = ukernel.get_nr(); - const size_t kr = ukernel.get_kr(); - const size_t sr = ukernel.get_sr(); - packed_size = kernel_packet.kai_get_rhs_packed_size(n, k, nr, kr, sr); - } else { - auto kernel_packet = kai_select_channelwise_matmul_ukernel( - kai_kernel_id:: - matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod); - const auto& ukernel = kernel_packet.ukernel; - const size_t nr = ukernel.get_nr(); - const size_t kr = ukernel.get_kr(); - const size_t sr = ukernel.get_sr(); - packed_size = kernel_packet.kai_get_rhs_packed_size(n, k, nr, kr, sr); - } + // Channelwise + auto kernel_packet = kai_select_channelwise_matmul_ukernel( + kai_kernel_id:: + matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod); + const auto& ukernel = kernel_packet.ukernel; + const size_t nr = ukernel.get_nr(); + const size_t kr = ukernel.get_kr(); + const size_t sr = ukernel.get_sr(); + packed_size = kernel_packet.kai_get_rhs_packed_size(n, k, nr, kr, sr); } else if (!(bl % 32) && !(k % bl)) { // Groupwise auto kernel_packet = 
kai_select_groupwise_matmul_ukernel( @@ -167,7 +148,8 @@ static void kai_quant_pack_lhs_int4_mm_groupwise( const auto lhs_src_ptr = lhs_native_mtx_f32 + thread_id * src_stride; const int64_t m_idx = thread_id * vec_per_thread; auto lhs_packed_ptr = lhs_packed_base + - kernel_packet.kai_get_lhs_quant_pack_offset(m_idx, k, mr, kr, sr); + kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32( + m_idx, k, mr, kr, sr); const int64_t vec_num = (thread_id == num_threads - 1) ? (m - vec_per_thread * thread_id) : vec_per_thread; @@ -277,7 +259,8 @@ static void kai_quant_pack_lhs_int4_mm_channelwise( const auto lhs_src_ptr = lhs_native_mtx_f32 + thread_id * src_stride; const int64_t m_idx = thread_id * vec_per_thread; auto lhs_packed_ptr = lhs_packed_base + - kernel_packet.kai_get_lhs_quant_pack_offset(m_idx, k, mr, kr, sr); + kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32( + m_idx, k, mr, kr, sr); const int64_t vec_num = (thread_id == num_threads - 1) ? (m - vec_per_thread * thread_id) : vec_per_thread; @@ -337,144 +320,19 @@ static void kai_quant_pack_lhs_int4_mm_channelwise( }); } -static void kai_quant_pack_lhs_int4_mm_bf16_channelwise( +void kai_quant_pack_lhs_int4_mm( const Tensor& output, const Tensor& input, const Tensor& weight, const int64_t m, const int64_t n, - const int64_t k) { - // Kernel IDs for GEMM and GEMV - constexpr kai_kernel_id gemm_id = - kai_kernel_id::matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm; - constexpr kai_kernel_id gemv_id = - kai_kernel_id::matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod; - - // Get total threads and select kernel - const int64_t total_threads = at::get_num_threads(); - auto kernel_packet = kai_select_bf16_channelwise_matmul_ukernel(gemv_id); - if (cpuinfo_has_arm_i8mm() && m > 1) { - kernel_packet = kai_select_bf16_channelwise_matmul_ukernel(gemm_id); - } - - // Thread blocking parameters - const int64_t n_step = kernel_packet.ukernel.get_n_step(); - const size_t mr = kernel_packet.ukernel.get_mr(); - const size_t kr = kernel_packet.ukernel.get_kr(); - const size_t sr = kernel_packet.ukernel.get_sr(); - - const size_t lhs_packed_size = - kernel_packet.kai_get_lhs_packed_size(m, k, mr, kr, sr); - auto lhs_packed = std::make_unique(lhs_packed_size); - uint8_t* dst_act_mtx_bf16 = reinterpret_cast(output.data_ptr()); - const uint8_t* lhs_native_mtx_bf16 = - reinterpret_cast(input.data_ptr()); - const uint8_t* rhs_packed_mtx_qs4cx = - reinterpret_cast(weight.data_ptr()); - uint8_t* lhs_packed_base = lhs_packed.get(); - - constexpr int32_t element_size = sizeof(uint16_t); - const size_t lhs_stride = k * element_size; - const size_t dst_stride = n * element_size; - - // LHS quantization packing - int64_t vec_per_thread = get_vec_per_thread(m, total_threads, mr); - int64_t num_threads = (m + vec_per_thread - 1) / vec_per_thread; - const size_t src_stride = vec_per_thread * lhs_stride; - - auto lhs_quant_pack = [=, &kernel_packet](int64_t thread_id) { - const auto lhs_src_ptr = lhs_native_mtx_bf16 + thread_id * src_stride; - const int64_t m_idx = thread_id * vec_per_thread; - auto lhs_packed_ptr = lhs_packed_base + - kernel_packet.kai_get_lhs_quant_pack_offset(m_idx, k, mr, kr, sr); - const int64_t vec_num = (thread_id == num_threads - 1) - ? 
(m - vec_per_thread * thread_id) - : vec_per_thread; - - kernel_packet.kai_run_lhs_quant_pack( - vec_num, - k, - mr, - kr, - sr, - 0, - (const uint16_t*)lhs_src_ptr, - lhs_stride, - lhs_packed_ptr); - }; - - at::parallel_for( - 0, num_threads, /*grain_size=*/1, [&](int64_t begin, int64_t end) { - for (int64_t thread_id = begin; thread_id < end; ++thread_id) { - lhs_quant_pack(thread_id); - } - }); - - // Matrix multiplication - vec_per_thread = get_vec_per_thread(n, total_threads, n_step); - num_threads = (n + vec_per_thread - 1) / vec_per_thread; - - auto mm = [=, &kernel_packet](int64_t thread_id) { - const auto rhs_packed_ptr = rhs_packed_mtx_qs4cx + - kernel_packet.ukernel.get_rhs_packed_offset( - thread_id * vec_per_thread, k); - auto dst_ptr = dst_act_mtx_bf16 + - kernel_packet.ukernel.get_dst_offset( - 0, thread_id * vec_per_thread, dst_stride); - const int64_t vec_num = (thread_id == num_threads - 1) - ? (n - vec_per_thread * thread_id) - : vec_per_thread; - - kernel_packet.ukernel.run_matmul( - m, - vec_num, - k, - lhs_packed_base, - rhs_packed_ptr, - (uint16_t*)dst_ptr, - dst_stride, - element_size, // dst_stride_col - -FLT_MAX, - FLT_MAX); - }; - - at::parallel_for( - 0, num_threads, /*grain_size=*/1, [&](int64_t begin, int64_t end) { - for (int64_t thread_id = begin; thread_id < end; ++thread_id) { - mm(thread_id); - } - }); -} -void kai_quant_pack_lhs_int4_mm( - const at::Tensor& output, - const at::Tensor& input, - const at::Tensor& weight, - const int64_t m, - const int64_t n, const int64_t k, const int64_t bl) { // Prefer Channelwise kernel over Groupwise kernel for conflicting cases if (bl == k) { - const auto input_dtype = input.dtype(); - - if (input_dtype == at::kBFloat16) { - if (cpuinfo_has_arm_bf16()) { - kleidiai::kai_quant_pack_lhs_int4_mm_bf16_channelwise( - output, input, weight, m, n, k); - } else { - TORCH_CHECK( - false, - "BF16 Unsupported: CPU does not support BF16. Please use a CPU with BF16 support."); - } - } else if (input_dtype == at::kFloat) { - kleidiai::kai_quant_pack_lhs_int4_mm_channelwise( - output, input, weight, m, n, k); - } else { - TORCH_CHECK( - false, - "Unsupported input data type: Only Bfloat16 and Float inputs are supported."); - } - } else if ((bl % 32 == 0) && (k % bl == 0)) { + kleidiai::kai_quant_pack_lhs_int4_mm_channelwise( + output, input, weight, m, n, k); + } else if (!(bl % 32) && !(k % bl)) { kleidiai::kai_quant_pack_lhs_int4_mm_groupwise( output, input, weight, m, n, k, bl); } diff --git a/aten/src/ATen/native/kleidiai/kai_kernels.h b/aten/src/ATen/native/kleidiai/kai_kernels.h index a4179cefd06cf..9b522d7f7705a 100644 --- a/aten/src/ATen/native/kleidiai/kai_kernels.h +++ b/aten/src/ATen/native/kleidiai/kai_kernels.h @@ -25,8 +25,7 @@ void kai_pack_int4_rhs( size_t kai_pack_rhs_int4_size( const int64_t n, const int64_t k, - const int64_t bl, - at::ScalarType tensor_dtype = at::kFloat); + const int64_t bl); /** * @brief Run 2 operations ( Input quantize and pack -> 4 bit Matmul ) diff --git a/aten/src/ATen/native/kleidiai/kai_pack.h b/aten/src/ATen/native/kleidiai/kai_pack.h index d9f08333591ed..4ff3371ab5e2a 100644 --- a/aten/src/ATen/native/kleidiai/kai_pack.h +++ b/aten/src/ATen/native/kleidiai/kai_pack.h @@ -36,8 +36,7 @@ void kai_pack_rhs_groupwise_int4( AT_ERROR("kai_pack_rhs_channelwise_int4: Scales data pointer is null"); } - float* bias_ptr = - bias.has_value() ? bias.value().to(kFloat).data_ptr() : NULL; + float* bias_ptr = bias.has_value() ? 
bias.value().data_ptr() : NULL; auto& params = kernel.rhs_pack_params; kernel.kai_run_rhs_pack( @@ -74,8 +73,7 @@ void kai_pack_rhs_channelwise_int4( auto weight_packed_data = reinterpret_cast(weight_packed.data_ptr()); const auto weight_data = weight.data_ptr(); - - const auto scales_data = scales.to(kFloat).data_ptr(); + const auto scales_data = scales.data_ptr(); if (weight_data == nullptr) { AT_ERROR("kai_pack_rhs_channelwise_int4: Weight data pointer is null"); @@ -85,8 +83,7 @@ void kai_pack_rhs_channelwise_int4( AT_ERROR("kai_pack_rhs_channelwise_int4: Scales data pointer is null"); } - float* bias_ptr = - bias.has_value() ? bias.value().to(kFloat).data_ptr() : NULL; + float* bias_ptr = bias.has_value() ? bias.value().data_ptr() : NULL; auto& params = kernel.rhs_pack_params; kernel.kai_run_rhs_pack( diff --git a/aten/src/ATen/native/kleidiai/kai_ukernel_interface.cpp b/aten/src/ATen/native/kleidiai/kai_ukernel_interface.cpp index 783133b83e670..0de198d7dc012 100644 --- a/aten/src/ATen/native/kleidiai/kai_ukernel_interface.cpp +++ b/aten/src/ATen/native/kleidiai/kai_ukernel_interface.cpp @@ -68,39 +68,5 @@ kai_matmul_ukernel_f32_qa8dxp_qs4cxp kai_select_channelwise_matmul_ukernel( const kai_kernel_id id) { return channelwise_8bit_4bit_kernels.at(id); } - -// Kernel Mapping - BF16 Channelwise -std::unordered_map - bf16_channelwise_8bit_4bit_kernels = { - {kai_kernel_id:: - matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod, - {{kai_get_m_step_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod, - kai_get_n_step_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod, - kai_get_mr_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod, - kai_get_nr_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod, - kai_get_kr_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod, - kai_get_sr_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod, - kai_get_lhs_packed_offset_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod, - kai_get_rhs_packed_offset_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod, - kai_get_dst_offset_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod, - kai_get_dst_size_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod, - kai_run_matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod}}}, - {kai_kernel_id::matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm, - {{kai_get_m_step_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm, - kai_get_n_step_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm, - kai_get_mr_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm, - kai_get_nr_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm, - kai_get_kr_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm, - kai_get_sr_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm, - kai_get_lhs_packed_offset_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm, - kai_get_rhs_packed_offset_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm, - kai_get_dst_offset_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm, - kai_get_dst_size_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm, - kai_run_matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm}}}}; - -kai_matmul_ukernel_bf16_qa8dxp_qs4cxp kai_select_bf16_channelwise_matmul_ukernel( - const kai_kernel_id id) { - return bf16_channelwise_8bit_4bit_kernels.at(id); -} } // namespace at::native::kleidiai #endif diff --git a/aten/src/ATen/native/kleidiai/kai_ukernel_interface.h b/aten/src/ATen/native/kleidiai/kai_ukernel_interface.h index cfcf7a81ba85f..8480469cdea86 100644 --- 
a/aten/src/ATen/native/kleidiai/kai_ukernel_interface.h +++ b/aten/src/ATen/native/kleidiai/kai_ukernel_interface.h @@ -10,32 +10,21 @@ #include #include #include -#include -#include -#include #include -#include #include #include namespace at::native::kleidiai { enum class kai_kernel_id { - // FP32 inputs, 4-bit weights, FP32 output matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod = - 0, // Groupwise 4-bit GEMV (per-group scales, NEON DOTPROD) + 0, // Groupwise 4 bit GEMV matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_4x8x32_neon_i8mm = - 1, // Groupwise 4-bit GEMM (per-group scales, NEON I8MM) + 1, // Groupwise 4 bit GEMM matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod = - 2, // Channelwise 4-bit GEMV (per-channel scales, NEON DOTPROD) + 2, // Channelwise 4 bit GEMV matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm = - 3, // Channelwise 4-bit GEMM (per-channel scales, NEON I8MM) - - // BF16 inputs, 4-bit weights, BF16 output - matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod = - 4, // Channelwise 4-bit GEMV with BF16 input/output - matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm = - 5 // Channelwise 4-bit GEMM with BF16 input/output + 3 // Channelwise 4 bit GEMM }; // Channelwise Kernel mapping @@ -77,9 +66,6 @@ struct kai_matmul_ukernel_f32_qa8dxp_qs4cxp { void* rhs_packed, size_t extra_bytes, const struct kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0_params* params); - size_t(*kai_get_lhs_quant_pack_offset)( - size_t m_idx, size_t k, size_t mr, size_t kr, size_t sr - ); kai_matmul_ukernel_f32_qa8dxp_qs4cxp( const kai_matmul_clamp_f32_qai8dxp_qsi4cxp_ukernel& kernel) @@ -89,71 +75,12 @@ struct kai_matmul_ukernel_f32_qa8dxp_qs4cxp { kai_get_rhs_packed_size( &kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxp_qs4cxs1s0), kai_run_lhs_quant_pack(&kai_run_lhs_quant_pack_qai8dxp_f32), - kai_run_rhs_pack(&kai_run_rhs_pack_nxk_qsi4cxp_qs4cxs1s0), - kai_get_lhs_quant_pack_offset(&kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32){} + kai_run_rhs_pack(&kai_run_rhs_pack_nxk_qsi4cxp_qs4cxs1s0) {} }; struct kai_matmul_ukernel_f32_qa8dxp_qs4cxp kai_select_channelwise_matmul_ukernel(const kai_kernel_id id); -// bf16 Channelwise Kernel mapping -struct kai_matmul_ukernel_bf16_qa8dxp_qs4cxp { - struct kai_matmul_clamp_bf16_qai8dxp_qsi4cxp_ukernel ukernel; - struct kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0_params rhs_pack_params; - size_t (*kai_get_lhs_packed_size)( - size_t m, - size_t k, - size_t mr, - size_t kr, - size_t sr); - size_t (*kai_get_rhs_packed_size)( - size_t n, - size_t k, - size_t nr, - size_t kr, - size_t sr); - void (*kai_run_lhs_quant_pack)( - size_t m, - size_t k, - size_t mr, - size_t kr, - size_t sr, - size_t m_idx_start, - const void* lhs, - size_t lhs_stride, - void* lhs_packed); - void (*kai_run_rhs_pack)( - size_t num_groups, - size_t n, - size_t k, - size_t nr, - size_t kr, - size_t sr, - const uint8_t* rhs, - const float* bias, - const float* scale, - void* rhs_packed, - size_t extra_bytes, - const struct kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0_params* params); - size_t(*kai_get_lhs_quant_pack_offset)( - size_t m_idx, size_t k, size_t mr, size_t kr, size_t sr - ); - - kai_matmul_ukernel_bf16_qa8dxp_qs4cxp( - const kai_matmul_clamp_bf16_qai8dxp_qsi4cxp_ukernel& kernel) - : ukernel(kernel), - kai_get_lhs_packed_size( - &kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_bf16_neon), - kai_get_rhs_packed_size( - &kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxp_qs4cxs1s0), - kai_run_lhs_quant_pack(&kai_run_lhs_quant_pack_qai8dxp_bf16_neon), - 
kai_run_rhs_pack(&kai_run_rhs_pack_nxk_qsi4cxp_qs4cxs1s0), - kai_get_lhs_quant_pack_offset(&kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_bf16_neon){} - }; - -struct kai_matmul_ukernel_bf16_qa8dxp_qs4cxp -kai_select_bf16_channelwise_matmul_ukernel(const kai_kernel_id id); - // Groupwise Kernel mapping struct kai_matmul_ukernel_f32_qa8dxp_qs4c32p { struct kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel ukernel; @@ -198,9 +125,6 @@ struct kai_matmul_ukernel_f32_qa8dxp_qs4c32p { void* rhs_packed, size_t extra_bytes, const struct kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_params* params); - size_t(*kai_get_lhs_quant_pack_offset)( - size_t m_idx, size_t k, size_t mr, size_t kr, size_t sr - ); kai_matmul_ukernel_f32_qa8dxp_qs4c32p( const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& kernel) @@ -210,8 +134,7 @@ struct kai_matmul_ukernel_f32_qa8dxp_qs4c32p { kai_get_rhs_packed_size( &kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0), kai_run_lhs_quant_pack(&kai_run_lhs_quant_pack_qai8dxp_f32), - kai_run_rhs_pack(&kai_run_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0), - kai_get_lhs_quant_pack_offset(&kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32) {} + kai_run_rhs_pack(&kai_run_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0) {} }; struct kai_matmul_ukernel_f32_qa8dxp_qs4c32p kai_select_groupwise_matmul_ukernel( diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index 9d687a8828a8c..fc0bdd2c0be03 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -2482,7 +2482,7 @@ def fn(a, b_int8pack, b_scales, c): @skipCUDAIf(True, "No _dyn_quant_pack_4bit_weight implementation on CUDA") @skipIfRocm @skipIfXpu(msg="No _dyn_quant_pack_4bit_weight implementation on XPU") - def test__dyn_quant_pack_4bit_weight_fp32(self): + def test__dyn_quant_pack_4bit_weight(self): q_group = 32 k = 128 n = 128 @@ -2513,53 +2513,12 @@ def fn(b, in_features, out_features): self.common(fn, (b, in_features, out_features)) - @xfail_if_mps_unimplemented - @xfail_if_triton_cpu - @skipCUDAIf(True, "No _dyn_quant_pack_4bit_weight implementation on CUDA") - @skipIfRocm - @skipIfXpu(msg="No _dyn_quant_pack_4bit_weight implementation on XPU") - def test__dyn_quant_pack_4bit_weight_bf16(self): - k = 128 - n = 128 - q_group = 32 - - if not self.is_dtype_supported(torch.bfloat16): - raise unittest.SkipTest( - f"torch.bfloat16 not supported for device {self.device}" - ) - - torch.manual_seed(1) - b = torch.rand((k, n), dtype=torch.bfloat16) - in_features = b.size(0) - out_features = b.size(1) - - def dyn_quant_pack_4bit_weight(b, in_features, out_features): - b_uint8, b_scales_and_zeros = _group_quantize_tensor_symmetric( - b, n_bit=4, groupsize=q_group - ) - - if q_group == in_features: - b_scales_and_zeros = b_scales_and_zeros.to(torch.float) - else: - b_scales_and_zeros = b_scales_and_zeros.to(torch.bfloat16) - b_int4pack = torch._dyn_quant_pack_4bit_weight( - b_uint8, b_scales_and_zeros, None, q_group, in_features, out_features - ) - - return b_int4pack, b_scales_and_zeros - - def fn(b, in_features, out_features): - b_int4pack, _ = dyn_quant_pack_4bit_weight(b, in_features, out_features) - return b_int4pack - - self.common(fn, (b, in_features, out_features)) - @xfail_if_mps_unimplemented @xfail_if_triton_cpu @skipCUDAIf(True, "No _dyn_quant_matmul_4bit implementation on CUDA") @skipIfRocm @skipIfXpu(msg="No _dyn_quant_matmul_4bit implementation on XPU") - def test__dyn_quant_matmul_4bit_fp32_input(self): + def test__dyn_quant_matmul_4bit(self): q_group = 32 m = 32 k = 128 @@ 
-2599,67 +2558,6 @@ def fn(a, q_group, in_features, out_features): self.common(fn, (a, q_group, in_features, out_features)) - @skipCPUIf(IS_MACOS, "fails on M1, mismatch in bf16 support reporting") - @xfail_if_mps_unimplemented - @xfail_if_triton_cpu - @skipCUDAIf(True, "No _dyn_quant_matmul_4bit implementation on CUDA") - @skipIfRocm - @skipIfXpu(msg="No _dyn_quant_matmul_4bit implementation on XPU") - def test__dyn_quant_matmul_4bit_bf16_input(self): - m = 32 - k = 128 - n = 128 - q_group = k - - if not self.is_dtype_supported(torch.bfloat16): - raise unittest.SkipTest( - f"torch.bfloat16 not supported for device {self.device}" - ) - - torch.manual_seed(1) - a = torch.rand((m, k), dtype=torch.bfloat16) - b = torch.rand((k, n), dtype=torch.bfloat16) - - # codegen_dynamic_shape test fails without explicitly marking these dynamic - torch._dynamo.mark_dynamic(a, 0) - torch._dynamo.mark_dynamic(b, 1) - - in_features = b.size(0) - out_features = b.size(1) - - if not self.is_dtype_supported(torch.bfloat16): - raise unittest.SkipTest( - f"torch.bfloat16 not supported for device {self.device}" - ) - - def dyn_quant_pack_4bit_weight(b, in_features, out_features): - b_uint8, b_scales_and_zeros = _group_quantize_tensor_symmetric( - b, n_bit=4, groupsize=q_group - ) - - if q_group == in_features: - b_scales_and_zeros = b_scales_and_zeros.to(torch.float) - else: - b_scales_and_zeros = b_scales_and_zeros.to(torch.bfloat16) - b_int4pack = torch._dyn_quant_pack_4bit_weight( - b_uint8, b_scales_and_zeros, None, q_group, in_features, out_features - ) - - return b_int4pack, b_scales_and_zeros - - def fn(a, q_group, in_features, out_features): - b_int4pack, _ = dyn_quant_pack_4bit_weight(b, in_features, out_features) - res = torch.ops.aten._dyn_quant_matmul_4bit( - a, - b_int4pack, - q_group, - in_features, - out_features, - ) - return res - - self.common(fn, (a, q_group, in_features, out_features), atol=1, rtol=0.5) - def test_expanded_reduction(self): def fn(x, y): z = x * y diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py index 2ed88a4ec2344..5a629b371c766 100644 --- a/torch/_meta_registrations.py +++ b/torch/_meta_registrations.py @@ -3741,7 +3741,6 @@ def kai_roundup(a: int, b: int) -> int: def get_kai_packed_weight_size(n_bits, N, K, groupsize): if n_bits == 4: - # Works for both fp32 and bf16 Kernels if groupsize == K: # channelwise # dotprod params only [1x8x32_neon_dotprod] kai_nr = 8 @@ -3871,8 +3870,6 @@ def meta__dyn_quant_pack_4bit_weight( ) return weights.new_empty(int(packed_weight_size), dtype=torch.uint8) packed_weight_size = weights.numel() + scales_zeros.numel() - if bias is not None: - packed_weight_size += bias.numel() return weights.new_empty(packed_weight_size, dtype=torch.float) @@ -3886,12 +3883,8 @@ def meta__dyn_quant_matmul_4bit( ): torch._check(inp.dim() == 2, lambda: "input must be a 2D tensor") torch._check( - (inp.dtype == torch.float32) - or (inp.dtype == torch.bfloat16 and block_size == in_features), - lambda: ( - f"expected input to be f32 or bf16 (bf16 requires block_size == in_features), " - f"got {inp.dtype} with block_size={block_size} and in_features={in_features}" - ), + inp.dtype == torch.float32, + lambda: f"expected input to be f32, got {inp.dtype}", ) M = inp.size(0) return inp.new_empty(M, out_features, dtype=inp.dtype) From bdd3c3a29cae1a1be7075c670c8f811bb1255c8a Mon Sep 17 00:00:00 2001 From: Nan Zhang Date: Mon, 17 Nov 2025 21:10:52 +0000 Subject: [PATCH 43/47] Support SymInt placeholder in wrapper fxir (#167757) Summary: add support 
for symint placeholders added two test cases with dynamic reshape - dynamic info coming from tmd on placeholders - dynamic info coming from placeholders (symints) Test Plan: test_reshape_dynamic_ph test_reshape_dynamic_tmd Differential Revision: D86984100 Pull Request resolved: https://github.com/pytorch/pytorch/pull/167757 Approved by: https://github.com/blaine-rister --- test/inductor/test_fxir_backend.py | 36 +++++++++++++++++++++++++++++- torch/_inductor/compile_fx.py | 15 ++++++++----- 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/test/inductor/test_fxir_backend.py b/test/inductor/test_fxir_backend.py index f9e84284f0d8d..2c232594f3329 100644 --- a/test/inductor/test_fxir_backend.py +++ b/test/inductor/test_fxir_backend.py @@ -831,7 +831,9 @@ def check( gm = torch._inductor.aot_compile( ep.module(), inp, options={"fx_wrapper": True, **test_config} ) - self.assertTrue(same(model(*inp), gm(*inp))) + # Flatten args for fx_wrapper gm + flat_args, _ = pytree.tree_flatten(inp) + self.assertTrue(same(model(*inp), gm(*flat_args))) for node in gm.graph.nodes: if ( @@ -1182,6 +1184,38 @@ def mock_set_hook(gm: torch.fx.GraphModule, fn): compiled_out = compiled(*args) self.assertEqual(compiled_out.shape, shape) + def test_reshape_dynamic_ph(self): + """ + Test dynamic scalars using SymInts placeholder + """ + + class TestModule(torch.nn.Module): + def forward(self, x, shape): + return torch.reshape(x, shape) + 2 + + ds = { + "x": (torch.export.Dim.AUTO, torch.export.Dim.AUTO), + "shape": [torch.export.Dim.AUTO, torch.export.Dim.AUTO], + } + args = (torch.randn((12, 14), device=self.device), [6, 28]) + self.check(TestModule(), args, ds) + + def test_reshape_dynamic_tmd(self): + """ + Test dynamic reshape using shape dependent information + """ + + class TestModule(torch.nn.Module): + def forward(self, x): + new_shape = [x.shape[0] // 2, x.shape[1] * 2] + return torch.reshape(x, new_shape) + 2 + + ds = { + "x": (torch.export.Dim.AUTO, torch.export.Dim.AUTO), + } + args = (torch.randn((12, 14), device=self.device),) + self.check(TestModule(), args, ds) + class TestReplaceFloorDiv(InductorTestCase): """ diff --git a/torch/_inductor/compile_fx.py b/torch/_inductor/compile_fx.py index b6796d2b7ce38..46ca60483828d 100644 --- a/torch/_inductor/compile_fx.py +++ b/torch/_inductor/compile_fx.py @@ -2537,16 +2537,19 @@ def _extract_inputs_from_exported_gm( fake_inputs = [ node.meta.get("val") for node in gm.graph.nodes if node.op == "placeholder" ] - # Replace non-tensor (constant) inputs with Nones, since these are not being - # used anyways by the graph - fake_inputs = [ - inp if isinstance(inp, torch.Tensor) else None for inp in fake_inputs - ] + + if not config.fx_wrapper: + # Replace non-tensor inputs with Nones + # constant scalars embedded in the graph + # symbolic scalars (symint) are not supported in non-fx_wrapper mode + fake_inputs = [ + inp if isinstance(inp, torch.Tensor) else None for inp in fake_inputs + ] if any(v is not None for v in fake_inputs): # Validate devices before switching to fake tensors. 
for idx, fi, i in zip(count(), fake_inputs, example_inputs_): - if fi is not None: + if fi is not None and isinstance(fi, torch.Tensor): assert isinstance(i, torch.Tensor) if fi.device != i.device: raise ValueError( From 4e1b772103786e914abe91a0048bc2e98df5a7e1 Mon Sep 17 00:00:00 2001 From: Abhishek Nandy Date: Mon, 17 Nov 2025 21:14:34 +0000 Subject: [PATCH 44/47] Fix: Improve fallback behavior in `deserialize_torch_artifact` and relocate test into `TestSaveLoad` (#158247) This is a follow-up to [#154333](https://github.com/pytorch/pytorch/pull/154333), where I initially introduced a fallback mechanism in deserialize_torch_artifact. In this revised PR: Cleaned up commit history for clarity and reproducibility. Relocated the test into the TestSaveLoad class in test_serialize.py. There were some issues with last PR so opened this PR The previous PR had inconsistencies due to local branch issues and was closed in favor of this cleaner submission. Feedback is very welcome Pull Request resolved: https://github.com/pytorch/pytorch/pull/158247 Approved by: https://github.com/angelayi --- test/export/test_serialize.py | 12 +++++++++++- torch/_export/serde/serialize.py | 12 +++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/test/export/test_serialize.py b/test/export/test_serialize.py index 472ddcf556f83..1280ab45f2a82 100644 --- a/test/export/test_serialize.py +++ b/test/export/test_serialize.py @@ -38,6 +38,7 @@ _to_json_bytes, canonicalize, deserialize, + deserialize_torch_artifact, ExportedProgramDeserializer, ExportedProgramSerializer, GraphModuleSerializer, @@ -1904,6 +1905,16 @@ def forward(self, x): self.assertTrue(torch.allclose(ep.module()(*inp), loaded_ep.module()(*inp))) + def test_deserialize_torch_artifact_dict(self): + data = {"key": torch.tensor([1, 2, 3])} + buf = io.BytesIO() + torch.save(data, buf) + serialized = buf.getvalue() + result = deserialize_torch_artifact(serialized) + + self.assertIsInstance(result, dict) + self.assertTrue(torch.equal(result["key"], torch.tensor([1, 2, 3]))) + @unittest.skipIf(IS_WINDOWS, "Cannot modify file in windows") def test_save_file(self): class Foo(torch.nn.Module): @@ -2010,7 +2021,6 @@ def forward(self, x): save(ep, buffer) buffer.seek(0) loaded_ep = load(buffer) - inp = (torch.tensor(1),) self.assertTrue(torch.allclose(ep.module()(*inp), loaded_ep.module()(*inp))) diff --git a/torch/_export/serde/serialize.py b/torch/_export/serde/serialize.py index e328422ec5e66..84978f0066712 100644 --- a/torch/_export/serde/serialize.py +++ b/torch/_export/serde/serialize.py @@ -422,7 +422,17 @@ def deserialize_torch_artifact( buffer = io.BytesIO(serialized) buffer.seek(0) # weights_only=False as we want to load custom objects here (e.g. ScriptObject) - artifact = torch.load(buffer, weights_only=False) + try: + artifact = torch.load(buffer, weights_only=True) + except Exception as e: + buffer.seek(0) + artifact = torch.load(buffer, weights_only=False) + log.warning( + "Fallback to weights_only=False succeeded. " + "Loaded object of type %s after initial failure: %s", + type(artifact), + exc_info=e, + ) assert isinstance(artifact, (tuple, dict)) return artifact From 661fb534494e88da84d025ee3da2ca362b81bfcd Mon Sep 17 00:00:00 2001 From: PyTorch MergeBot Date: Mon, 17 Nov 2025 21:51:04 +0000 Subject: [PATCH 45/47] Revert "Remove old NVTX interface (#167637)" This reverts commit 99117c1238c9adcd3fb2621e36c91f9d20ed2ff7. 
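For reference, the profiler-facing range API backed by these bindings is unchanged by the revert; a minimal usage sketch (not part of the patch, assumes a CUDA build):

```
import torch

# Annotate a region so it shows up as a named range in Nsight/nvprof timelines.
# torch.cuda.nvtx dispatches to the torch._C._nvtx bindings initialized in
# nvtx.cpp below, whether built against nvtx3 or the legacy libNvToolsExt.
x = torch.randn(1024, 1024, device="cuda")
torch.cuda.nvtx.range_push("matmul_block")
y = x @ x
torch.cuda.nvtx.range_pop()
torch.cuda.nvtx.mark("matmul_done")
```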
Reverted https://github.com/pytorch/pytorch/pull/167637 on behalf of https://github.com/yangw-dev due to breaks internal build with torch/csrc/profiler/stubs/cuda.cpp:4:10: fatal error: 'nvtx3/nvtx3.hpp' file not found 4 | #include , please find a meta fella to resolve this issue and try again, diff:[D87229660] ([comment](https://github.com/pytorch/pytorch/pull/167637#issuecomment-3543984021)) --- caffe2/CMakeLists.txt | 5 +++++ cmake/Dependencies.cmake | 5 ++++- cmake/TorchConfig.cmake.in | 3 +++ torch/CMakeLists.txt | 4 ++++ torch/csrc/cuda/shared/nvtx.cpp | 11 ++++++++++- torch/csrc/profiler/stubs/cuda.cpp | 4 ++++ 6 files changed, 30 insertions(+), 2 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 9af0305778d38..d5c585c1e1f0b 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -1643,6 +1643,8 @@ if(USE_CUDA) target_link_libraries(torch_cuda PUBLIC c10_cuda) if(TARGET torch::nvtx3) target_link_libraries(torch_cuda PRIVATE torch::nvtx3) + else() + target_link_libraries(torch_cuda PUBLIC torch::nvtoolsext) endif() target_include_directories( @@ -1739,6 +1741,9 @@ if(BUILD_SHARED_LIBS) if(USE_CUDA) target_link_libraries(torch_global_deps ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS}) target_link_libraries(torch_global_deps torch::cudart) + if(TARGET torch::nvtoolsext) + target_link_libraries(torch_global_deps torch::nvtoolsext) + endif() endif() install(TARGETS torch_global_deps DESTINATION "${TORCH_INSTALL_LIB_DIR}") endif() diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 444a7590a8a07..733183ef50bd5 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -968,8 +968,11 @@ find_package_handle_standard_args(nvtx3 DEFAULT_MSG nvtx3_dir) if(nvtx3_FOUND) add_library(torch::nvtx3 INTERFACE IMPORTED) target_include_directories(torch::nvtx3 INTERFACE "${nvtx3_dir}") + target_compile_definitions(torch::nvtx3 INTERFACE TORCH_CUDA_USE_NVTX3) else() - message(FATAL_ERROR "Cannot find NVTX3!") + message(WARNING "Cannot find NVTX3, find old NVTX instead") + add_library(torch::nvtoolsext INTERFACE IMPORTED) + set_property(TARGET torch::nvtoolsext PROPERTY INTERFACE_LINK_LIBRARIES CUDA::nvToolsExt) endif() diff --git a/cmake/TorchConfig.cmake.in b/cmake/TorchConfig.cmake.in index 8a5587cad272a..0b32ffa99ceb5 100644 --- a/cmake/TorchConfig.cmake.in +++ b/cmake/TorchConfig.cmake.in @@ -132,6 +132,9 @@ if(@USE_CUDA@) else() set(TORCH_CUDA_LIBRARIES ${CUDA_NVRTC_LIB}) endif() + if(TARGET torch::nvtoolsext) + list(APPEND TORCH_CUDA_LIBRARIES torch::nvtoolsext) + endif() if(@BUILD_SHARED_LIBS@) find_library(C10_CUDA_LIBRARY c10_cuda PATHS "${TORCH_INSTALL_PREFIX}/lib") diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 4e65720180617..d92b9e19a76c5 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -150,6 +150,10 @@ if(USE_CUDA) if(TARGET torch::nvtx3) list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::nvtx3) + else() + if(TARGET torch::nvtoolsext) + list(APPEND TORCH_PYTHON_LINK_LIBRARIES torch::nvtoolsext) + endif() endif() endif() diff --git a/torch/csrc/cuda/shared/nvtx.cpp b/torch/csrc/cuda/shared/nvtx.cpp index 8faf319071c37..f4b3c8824b85c 100644 --- a/torch/csrc/cuda/shared/nvtx.cpp +++ b/torch/csrc/cuda/shared/nvtx.cpp @@ -2,13 +2,18 @@ #include // _wgetenv for nvtx #endif +#include + #ifndef ROCM_ON_WINDOWS +#if CUDART_VERSION >= 13000 || defined(TORCH_CUDA_USE_NVTX3) #include +#else // CUDART_VERSION >= 13000 || defined(TORCH_CUDA_USE_NVTX3) +#include +#endif // CUDART_VERSION >= 13000 || 
 #else // ROCM_ON_WINDOWS
 #include
 #endif // ROCM_ON_WINDOWS
 #include
-#include
 #include

 namespace torch::cuda::shared {
@@ -50,7 +55,11 @@ static void* device_nvtxRangeStart(const char* msg, std::intptr_t stream) {
 void initNvtxBindings(PyObject* module) {
   auto m = py::handle(module).cast();

+#ifdef TORCH_CUDA_USE_NVTX3
   auto nvtx = m.def_submodule("_nvtx", "nvtx3 bindings");
+#else
+  auto nvtx = m.def_submodule("_nvtx", "libNvToolsExt.so bindings");
+#endif
   nvtx.def("rangePushA", nvtxRangePushA);
   nvtx.def("rangePop", nvtxRangePop);
   nvtx.def("rangeStartA", nvtxRangeStartA);
diff --git a/torch/csrc/profiler/stubs/cuda.cpp b/torch/csrc/profiler/stubs/cuda.cpp
index b590b2d985d02..2b634b0303c26 100644
--- a/torch/csrc/profiler/stubs/cuda.cpp
+++ b/torch/csrc/profiler/stubs/cuda.cpp
@@ -1,7 +1,11 @@
 #include

 #ifndef ROCM_ON_WINDOWS
+#if CUDART_VERSION >= 13000 || defined(TORCH_CUDA_USE_NVTX3)
 #include
+#else
+#include
+#endif
 #else // ROCM_ON_WINDOWS
 #include
 #endif // ROCM_ON_WINDOWS

From 1c04a4395955590b211d090d394e5b1d98139151 Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot
Date: Mon, 17 Nov 2025 21:54:56 +0000
Subject: [PATCH 46/47] Revert "Tiling bug fix (#167771)"

This reverts commit 7ede33b8e3cd5f068c6e88d678ed3f67f5249c64.

Reverted https://github.com/pytorch/pytorch/pull/167771 on behalf of https://github.com/eellison due to needs one fix ([comment](https://github.com/pytorch/pytorch/pull/167771#issuecomment-3543999822))
---
 test/inductor/test_loop_ordering.py | 14 ----
 torch/_inductor/codegen/simd.py     | 15 +----
 torch/_inductor/tiling_utils.py     | 87 +++++------------------------
 3 files changed, 16 insertions(+), 100 deletions(-)

diff --git a/test/inductor/test_loop_ordering.py b/test/inductor/test_loop_ordering.py
index 8356ecd0b6998..051a5f5905997 100644
--- a/test/inductor/test_loop_ordering.py
+++ b/test/inductor/test_loop_ordering.py
@@ -1188,20 +1188,6 @@ def fn(nodes):
         with torch._inductor.config.patch(_post_fusion_custom_pass=fn), torch.no_grad():
             torch.compile(f)(x)

-    def test_find_broadcast_var(self):
-        """Test broadcast variable detection for tiling improvements."""
-        from torch._inductor import tiling_utils
-
-        i, j = sympy.symbols("i j", integer=True)
-
-        # Test broadcast pattern detection: FloorDiv creates broadcast
-        result = tiling_utils.find_broadcast_var(FloorDiv(i, 10), {i: 100, j: 50})
-        self.assertEqual(result, i)
-
-        # Test non-broadcast: linear access pattern
-        result = tiling_utils.find_broadcast_var(i + j * 10, {i: 10, j: 8})
-        self.assertEqual(result, None)
-

 class TestIndexInversion(TestCase):
     @classmethod
diff --git a/torch/_inductor/codegen/simd.py b/torch/_inductor/codegen/simd.py
index 65e8f88b1c425..2ad02ca97a54b 100644
--- a/torch/_inductor/codegen/simd.py
+++ b/torch/_inductor/codegen/simd.py
@@ -2819,8 +2819,6 @@ def process_node_vars(
         bad_size_additional_tiling_penalty = 1.025
         good_size_tiling_penalty = 1.005

-        total_uncoalesced = sum(coalesce_analysis.uncoalesced_addrs.values())
-
         def score_mod(t):
             score_factor = 1.0
             for tile_size in t[0].tiling.values():
@@ -2829,19 +2827,12 @@ def score_mod(t):
             else:
                 score_factor = score_factor / good_size_tiling_penalty

-            # Add uncoalesced memory score to prevent small coalesced benefits
-            # from dominating large amounts of uncoalesced memory
-            uncoalesced_penalty = total_uncoalesced * 0.05
-
-            return -(t[0].score + uncoalesced_penalty) * score_factor
+            return -t[0].score * score_factor

         # apply penalty for longer tilings that dont increase score much
         for cand, tiling_score in sorted(tilings, key=score_mod):
-            if (
-                cls.tiling_is_compatible(
-                    node_schedule, pointwise_numel, reduction_numel, cand.tiling
-                )
-                or cand.tiling == default_tiling
+            if cls.tiling_is_compatible(
+                node_schedule, pointwise_numel, reduction_numel, cand.tiling
             ):
                 # we always include default reduction numel == 1, dont include
                 tiling_len = len(cand.tiling) - (1 if reduction_numel == 1 else 0)
diff --git a/torch/_inductor/tiling_utils.py b/torch/_inductor/tiling_utils.py
index 4a4efdccf4b38..5b394b9ea9914 100644
--- a/torch/_inductor/tiling_utils.py
+++ b/torch/_inductor/tiling_utils.py
@@ -145,38 +145,6 @@ def indexing_div_rep(
     return None


-def find_broadcast_var(
-    index: sympy.Expr, var_ranges: dict[sympy.Expr, int]
-) -> Optional[sympy.Expr]:
-    """
-    Try to find the variable that this index is broadcast over.
-    A broadcast pattern is one where consecutive values of a variable
-    access the same memory location (e.g., x // 10).
-    """
-    # Approximate analysis by evaluating at 1 and 0
-    variables: dict[sympy.Symbol, int] = {}
-    for v in index.free_symbols:
-        if v in var_ranges:
-            variables[v] = 0
-        else:
-            variables[v] = get_hint(v)
-
-    zero_index = sympy_subs(index, variables)
-    for v in var_ranges.keys():
-        variables[v] = 1
-        try:
-            new_val = sympy_subs(index, variables)
-        except ZeroDivisionError:
-            loop_tiling_log.info("zero division error %s %s", index, variables)
-            continue
-        # Broadcast means the value doesn't change when the variable increments
-        if new_val == zero_index:
-            return v
-        variables[v] = 0
-
-    return None
-
-
 def find_coalesced_var(
     index: sympy.Expr, var_ranges: dict[sympy.Expr, int]
 ) -> Optional[sympy.Expr]:
@@ -600,12 +568,11 @@ def remove_identity(expr: sympy.Expr) -> sympy.Expr:
     return fused_out

-def get_score(
-    addr: sympy.Expr, var_ranges: dict[sympy.Symbol, int], buf_names: OrderedSet[str]
-) -> int:
+def get_score(addr: sympy.Expr, var_ranges: dict[sympy.Symbol, int]) -> int:
     """
-    Score addr according to its approximate size.
+    Score addr according to its approximate size
     """
+    # TODO - deduplicate with candidate_tilings

     var_sizes = []
     for v in addr.free_symbols:
@@ -620,15 +587,6 @@ def get_score(
     )

-def try_get_buf_size(buf_name: str) -> Optional[int]:
-    buf = V.graph.try_get_buffer(buf_name)
-    if not buf:
-        return None
-    return V.graph.sizevars.atomically_apply_size_hint(
-        sympy_product(buf.get_size()), fallback=config.unbacked_symint_fallback
-    )
-
-
 def get_hint(v: Union[sympy.Expr, int]) -> int:
     if isinstance(v, int):
         return v
@@ -654,8 +612,6 @@ class CoalesceVarAnalysis:
     # TODO: separate into dataclass that olds mem, dtype, is_write
     coalesced_by_var: dict[sympy.Expr, int]

-    uncoalesced_addrs: dict[sympy.Expr, int]
-
     norm_read_writes: FusedNormalizedReadsWrites
     suggested_split: Optional[VarTiling] = None
@@ -701,40 +657,28 @@ def analyze_memory_coalescing(
         if indirect_expr:
             continue

-        size = get_score(memory_expr, var_ranges, buf_names)
-
+        size = get_score(memory_expr, var_ranges)
         if size == 0:
             continue

         maybe_coalesced_var = find_coalesced_var(memory_expr, var_ranges)
-        # while broadcasting vars are not technically coalesced,
-        # accesses at least stay in cache, so they provide most of the benefit.
-        # treat the same for now.
-        if maybe_coalesced_var is None:
-            maybe_coalesced_var = find_broadcast_var(memory_expr, var_ranges)

-        total_score = 0
+        byte_multipler = 0
         for buf_name in buf_names:
-            if (buf := V.graph.try_get_buffer(buf_name)) and (
-                buf_size := try_get_buf_size(buf_name)
-            ):
-                # constrain by buf size since we'll read at most that many elements
-                # score could be more through either masking or by broadcasting (e.g. x // 16)
-                total_score += min(buf_size, size) * buf.dtype.itemsize
+            if buf := V.graph.try_get_buffer(buf_name):
+                byte_multipler += buf.dtype.itemsize

         # coalesced writes more important
-        total_score *= 1 if is_read else 2
+        byte_multipler *= 1 if is_read else 2

         if maybe_coalesced_var:
-            coalesced_by_var[maybe_coalesced_var] += total_score
+            coalesced_by_var[maybe_coalesced_var] += size * byte_multipler
         else:
-            uncoalesced_addrs[memory_expr] += total_score
+            uncoalesced_addrs[memory_expr] += size * byte_multipler

     if not uncoalesced_addrs:
         return CoalesceVarAnalysis(
-            coalesced_by_var=coalesced_by_var,
-            uncoalesced_addrs=uncoalesced_addrs,
-            norm_read_writes=norm_read_writes,
+            coalesced_by_var=coalesced_by_var, norm_read_writes=norm_read_writes
         )

     # map from var -> tiling -> total_score
@@ -778,9 +722,7 @@ def analyze_memory_coalescing(
     if len(tiling_scores) == 0:
         return CoalesceVarAnalysis(
-            coalesced_by_var=coalesced_by_var,
-            uncoalesced_addrs=uncoalesced_addrs,
-            norm_read_writes=norm_read_writes,
+            coalesced_by_var=coalesced_by_var, norm_read_writes=norm_read_writes
         )

     best_tiling: Optional[tuple[sympy.Expr, int]] = None
@@ -794,9 +736,7 @@ def analyze_memory_coalescing(
     if best_tiling is None:
         return CoalesceVarAnalysis(
-            coalesced_by_var=coalesced_by_var,
-            uncoalesced_addrs=uncoalesced_addrs,
-            norm_read_writes=norm_read_writes,
+            coalesced_by_var=coalesced_by_var, norm_read_writes=norm_read_writes
        )

     # TODO - for strictly pointwise fusions,
@@ -805,7 +745,6 @@ def analyze_memory_coalescing(
     # TODO - could also prefer index var splits to reduction, better tested
     return CoalesceVarAnalysis(
         coalesced_by_var=coalesced_by_var,
-        uncoalesced_addrs=uncoalesced_addrs,
         norm_read_writes=norm_read_writes,
         suggested_split=VarTiling(best_tiling[0], best_tiling[1], best_tiling_score),
     )

From 4d1947b0411ad098a30190b6b315871eb52bdbf6 Mon Sep 17 00:00:00 2001
From: Mikayla Gawarecki
Date: Mon, 17 Nov 2025 21:19:52 -0800
Subject: [PATCH 47/47] Update base for Update on "Test that TORCH_FEATURE_VERSION guards are used where needed"

Splits each torch library registration in the 2.10 folder into its own file --
I had a script that parsed kernel.cpp to do this, but I felt that forcing this
responsibility on the user might be less error prone.

Compiles each file targeting 2.9 and asserts that compilation fails. (There are
two 2.9 kernels we use as negative tests where compilation is expected to
succeed.)

[ghstack-poisoned]
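
As a rough illustration of the check described above, a minimal harness might look like the sketch below. The directory layout, kernel file names, the TORCH_TARGET_VERSION-style define, and the compiler invocation are all assumptions for illustration only, not the actual test added in this stack:

```python
# Hypothetical sketch: compile every per-kernel file while "targeting 2.9" and
# assert that compilation fails, except for the known negative-test kernels.
# All paths, names, and the version define below are assumed, not taken from the PR.
import subprocess
from pathlib import Path

EXPECTED_TO_COMPILE = {"kernel_neg_test_1.cpp", "kernel_neg_test_2.cpp"}  # assumed names
TARGET_2_9 = "0x0209000000000000"  # assumed encoding of "version 2.9"


def compiles(src: Path) -> bool:
    # Compile a single translation unit; success means the guard did not trip.
    cmd = [
        "g++", "-std=c++17", "-c", str(src), "-o", "/dev/null",
        "-Iinclude",
        f"-DTORCH_TARGET_VERSION={TARGET_2_9}",
    ]
    return subprocess.run(cmd, capture_output=True).returncode == 0


def main() -> None:
    for src in sorted(Path("version_guard_kernels").glob("*.cpp")):
        ok = compiles(src)
        expected = src.name in EXPECTED_TO_COMPILE
        assert ok == expected, f"{src.name}: compiled={ok}, expected={expected}"


if __name__ == "__main__":
    main()
```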