4 changes: 2 additions & 2 deletions .clang-tidy
@@ -59,9 +59,9 @@ performance-*,
 -performance-enum-size,
 readability-container-size-empty,
 readability-delete-null-pointer,
-readability-duplicate-include
+readability-duplicate-include,
 readability-misplaced-array-index,
-readability-redundant*
+readability-redundant*,
 readability-simplify-subscript-expr,
 readability-string-compare,
 -readability-redundant-access-specifiers,
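The two edits here only restore the trailing commas the original entries were missing; since Checks is a single comma-separated glob list, a missing comma likely fused each entry with the following one into a glob that matches nothing, silently disabling readability-duplicate-include and the readability-redundant* family. A minimal sketch (identifiers are illustrative, not from this PR) of the code those re-enabled checks report:

```cpp
// Illustrative only: patterns flagged by readability-duplicate-include and
// readability-redundant* (e.g. redundant string initialization).
#include <string>
#include <vector>
#include <string>  // readability-duplicate-include: already included above

std::string describe(const std::vector<int>& v) {
  std::string label = "";  // readability-redundant*: "" adds nothing over default construction
  if (!v.empty()) {
    label = "non-empty";
  }
  return label;
}
```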
2 changes: 1 addition & 1 deletion aten/src/ATen/native/cuda/Blas.cpp
@@ -1375,7 +1375,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
   if (scaling_choice_a == ScalingType::RowWise && scaling_choice_b == ScalingType::RowWise
       && ((dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900)
       // cuBLAS only supports tiled 1D factor layout for 1D block scaling, no 2D block scales
-      || (dprops->major >= 10 && (scale_a.sizes().size() || scale_b.sizes().size())))) {
+      || (dprops->major >= 10 && (!scale_a.sizes().empty() || !scale_b.sizes().empty())))) {
     TORCH_CHECK(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling.");
     at::cuda::detail::f8f8bf16_rowwise(
         mat1,
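A minimal sketch of the readability-container-size-empty rewrite applied above (names here are hypothetical): testing empty() states the intent directly instead of relying on a size count converting to bool.

```cpp
// Illustrative only; scale_shape stands in for scale_a.sizes() / scale_b.sizes().
#include <vector>

bool is_block_scaled(const std::vector<long>& scale_shape) {
  // Before: return scale_shape.size();   // implicit size_t -> bool conversion
  return !scale_shape.empty();            // says "has at least one dimension"
}
```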
1 change: 0 additions & 1 deletion aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp
@@ -8,7 +8,6 @@
 #include <ATen/NativeFunctions.h>
 #include <ATen/Dispatch.h>
 #include <ATen/DynamicLibrary.h>
-#include <ATen/NativeFunctions.h>
 #include <ATen/native/cuda/MiscUtils.h>
 #include <ATen/native/Resize.h>
 #include <ATen/native/LinearAlgebra.h>
8 changes: 4 additions & 4 deletions aten/src/ATen/native/cuda/jit_utils.cpp
@@ -1041,8 +1041,8 @@ std::string generate_code(
   // and `extra_args` for computation call if
   // extra arguments to capture runtime state are passed.
   // (look at polygamma for example).
-  std::string extra_params = "";
-  std::string extra_args = "";
+  std::string extra_params;
+  std::string extra_args;
   for (size_t i = 0; i < extra_args_typenames.size(); i++) {
     auto type = std::string(extra_args_typenames[i]);
     auto name = "extra_arg_" + std::to_string(i);
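A minimal sketch of the redundant string initialization fix above (part of the readability-redundant* group enabled in .clang-tidy): a default-constructed std::string is already empty, so initializing it from "" only adds an unnecessary construction from a literal.

```cpp
#include <string>

void build_args() {
  std::string extra_params = "";  // flagged: redundant initialization from ""
  std::string extra_args;         // preferred: default construction is already empty
  // Both are empty strings here; only the second avoids the extra literal construction.
  (void)extra_params;
  (void)extra_args;
}
```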
@@ -1352,7 +1352,7 @@ std::string generate_reduction_code(
     int vec_size,
     int max_threads_codegen) {
   TORCH_INTERNAL_ASSERT(desc.nInputs == 1);
-  TORCH_INTERNAL_ASSERT(desc.extra_args_types.size() == 0);
+  TORCH_INTERNAL_ASSERT(desc.extra_args_types.empty());

   return generate_reduction_code(
       desc.nOutputs,
@@ -1451,7 +1451,7 @@ std::optional<std::string> get_cache_dir() {
   std::string cache_dir;
   char* ptkcp = std::getenv("PYTORCH_KERNEL_CACHE_PATH");
   // Create kernel_cache_dir if needed as we do not want to create the base directory passed by the user
-  std::string kernels_cache_dir = "";
+  std::string kernels_cache_dir;
   if (ptkcp != nullptr) {
     cache_dir = std::string(ptkcp);
   } else {
1 change: 0 additions & 1 deletion aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp
@@ -14,7 +14,6 @@
 #include <ATen/native/LinearAlgebraUtils.h>
 #include <ATen/native/cuda/MiscUtils.h>
 #include <ATen/native/LinearAlgebra.h>
-#include <ATen/native/BatchLinearAlgebra.h>
 #include <ATen/native/cuda/linalg/BatchLinearAlgebraLib.h>
 #include <ATen/native/cuda/linalg/MagmaUtils.h>
 #include <ATen/native/cpu/zmath.h>
2 changes: 1 addition & 1 deletion torch/csrc/autograd/python_function.cpp
@@ -813,7 +813,7 @@ static void _save_variables(
     const std::vector<std::optional<at::Tensor>>& tensors_to_save,
     const std::shared_ptr<PyNode>& cdata_ptr,
     THPFunction* self) {
-  if (tensors_to_save.size() == 0)
+  if (tensors_to_save.empty())
     return;
   size_t num_saved = tensors_to_save.size();
   self->saved_variables.clear();
2 changes: 1 addition & 1 deletion torch/csrc/distributed/c10d/Functional.cpp
@@ -35,7 +35,7 @@ at::Tensor allocate_all_gather_output(
     int64_t group_size) {
   TORCH_CHECK(input.is_contiguous());
   auto output_size = input.sizes().vec();
-  if (output_size.size() == 0) {
+  if (output_size.empty()) {
     output_size.push_back(group_size);
   } else {
     output_size[0] *= group_size;
2 changes: 1 addition & 1 deletion torch/csrc/distributed/c10d/GlooDeviceFactory.cpp
@@ -196,7 +196,7 @@ std::shared_ptr<::gloo::transport::Device> makeGlooDevice(
   static auto transportName = c10::utils::get_env("GLOO_DEVICE_TRANSPORT");
   if (transportName.has_value()) {
     return GlooDeviceRegistry()->Create(
-        transportName.value().c_str(), interfaceName, hostName, lazyInit);
+        transportName.value(), interfaceName, hostName, lazyInit);
   }

 #ifdef __linux__
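A minimal sketch of the pattern behind this change (assuming the registry's Create accepts std::string-like arguments, as the new call implies): when the callee takes a std::string, passing .c_str() only forces an extra strlen and copy to rebuild the string, the kind of thing the readability-redundant* group reports.

```cpp
#include <string>

// Illustrative stand-in for GlooDeviceRegistry()->Create.
void create_device(const std::string& transport) {
  (void)transport;
}

void pick_transport(const std::string& transportName) {
  // Before: create_device(transportName.c_str());  // rebuilds a std::string from the C string
  create_device(transportName);  // pass the existing string straight through
}
```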
2 changes: 1 addition & 1 deletion torch/csrc/distributed/c10d/ProcessGroup.cpp
@@ -165,7 +165,7 @@ c10::intrusive_ptr<ProcessGroup> ProcessGroup::splitGroup(
     const std::optional<std::string>& name,
     const std::optional<std::string>& desc) {
   TORCH_CHECK(
-      ranks.size() > 0,
+      !ranks.empty(),
       "Split ranks cannot be empty. Please provide a non-empty list of ranks to split the group.");
   TORCH_CHECK(
       ranks.size() <= static_cast<size_t>(size_),
2 changes: 1 addition & 1 deletion torch/csrc/distributed/c10d/ProcessGroupGloo.cpp
@@ -559,7 +559,7 @@ c10::intrusive_ptr<ProcessGroupGloo::Options> ProcessGroupGloo::Options::
   // Use interfaces listed in "GLOO_SOCKET_IFNAME", if set.
   auto ifnameEnv = c10::utils::get_env("GLOO_SOCKET_IFNAME");
   if (ifnameEnv && ifnameEnv->size() > 1) {
-    for (const auto& iface : ::c10d::split(',', ifnameEnv->c_str())) {
+    for (const auto& iface : ::c10d::split(',', *ifnameEnv)) {
       options->devices.push_back(
           ::c10d::ProcessGroupGloo::createDeviceForInterface(iface, lazyInit));
     }
3 changes: 2 additions & 1 deletion torch/csrc/distributed/c10d/ProcessGroupGlooCuda.cpp
@@ -1,6 +1,7 @@
 #ifdef USE_C10D_GLOO
 #include <torch/csrc/distributed/c10d/ProcessGroupGloo.hpp>
 #include <torch/csrc/distributed/c10d/ProcessGroupGlooDetail.hpp>
+#include <utility>

 #include <gloo/cuda_allreduce_ring_chunked.h>
@@ -24,7 +25,7 @@ class AsyncAllreduceCUDADeviceWork : public ProcessGroupGloo::AsyncWork {
             "gloo:all_reduce",
             inputs),
         inputs_(inputs),
-        reduceOp_(reduceOp) {}
+        reduceOp_(std::move(reduceOp)) {}

   template <typename T>
   void createAlgorithm(std::unique_ptr<gloo::Algorithm>& algo) {
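A minimal sketch of the std::move change above (Work and its member types are illustrative, not the real AsyncWork types): a parameter taken by value is a sink, so moving it into the member avoids a second copy, which is also why the first hunk adds #include &lt;utility&gt;.

```cpp
#include <utility>
#include <vector>

struct Work {
  // inputs mirrors the by-value sink parameter in the diff.
  explicit Work(std::vector<int> inputs)
      : inputs_(std::move(inputs)) {}  // move the by-value parameter instead of copying it again

  std::vector<int> inputs_;
};
```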
6 changes: 3 additions & 3 deletions torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
@@ -1089,8 +1089,8 @@ class TORCH_API ProcessGroupNCCL : public Backend {
   bool useNonblocking();

 protected:
-  int globalRankStart_;
-  int globalRankStride_;
+  int globalRankStart_{};
+  int globalRankStride_{};

 private:
   bool eagerInit_{false};
@@ -1380,7 +1380,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
   std::shared_ptr<NCCLComm> coalescedComm_ = nullptr;

   // Whether the coalesced calls are sync or async.
-  bool coalescedAsync_;
+  bool coalescedAsync_{};

   // keeps track of input and output tensors when coalescing is in flight. Will
   // hand over these tensors to WorkNCCL's stash when coalescing is ended.
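A minimal sketch of the brace-initializer change in both hunks above (member names are illustrative): an in-class {} initializer value-initializes the member, 0 for int and false for bool, so a constructor path that forgets to assign it no longer leaves an indeterminate value.

```cpp
// Illustrative stand-in for the ProcessGroupNCCL members touched above.
struct Example {
  int globalRankStart{};    // value-initialized to 0
  int globalRankStride{};   // value-initialized to 0
  bool coalescedAsync{};    // value-initialized to false
};
```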
4 changes: 2 additions & 2 deletions torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp
@@ -163,8 +163,8 @@ struct CollectiveFingerPrint {
     backend->allgather(output_tensors, tensors_to_verify)->wait();
     // Verify equivalence
     for (const auto i : c10::irange(output_tensors.size())) {
-      const std::vector<at::Tensor> gathered_tensors = output_tensors[i];
-      const at::Tensor reference_tensor = tensors_to_verify[i];
+      const std::vector<at::Tensor>& gathered_tensors = output_tensors[i];
+      const at::Tensor& reference_tensor = tensors_to_verify[i];
       for (const auto rank : c10::irange(gathered_tensors.size())) {
         const auto& rank_tensor = gathered_tensors[rank];
         if (!rank_tensor.equal(reference_tensor)) {
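A minimal sketch of the reference-binding fix above, using plain std::vector instead of at::Tensor: binding a const reference to the indexed element avoids copying the whole container on every iteration, likely what a performance-* copy-initialization check reports.

```cpp
#include <vector>

void verify(const std::vector<std::vector<int>>& output_tensors) {
  for (size_t i = 0; i < output_tensors.size(); ++i) {
    // Before: const std::vector<int> gathered = output_tensors[i];  // deep copy per iteration
    const std::vector<int>& gathered = output_tensors[i];             // no copy
    for (int value : gathered) {
      (void)value;
    }
  }
}
```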
4 changes: 3 additions & 1 deletion torch/csrc/distributed/rpc/tensorpipe_agent.cpp
@@ -263,10 +263,12 @@ constexpr static int kNumUvThreads = 16;

 std::unique_ptr<ChannelRegistration> makeMultiplexedUvChannel() {
   std::vector<std::shared_ptr<tensorpipe::transport::Context>> contexts;
+  contexts.reserve(kNumUvThreads);
   std::vector<std::shared_ptr<tensorpipe::transport::Listener>> listeners;
+  listeners.reserve(kNumUvThreads);
   for ([[maybe_unused]] const auto laneIdx : c10::irange(kNumUvThreads)) {
     auto context = tensorpipe::transport::uv::create();
-    std::string address = TensorPipeAgent::guessAddress();
+    const std::string& address = TensorPipeAgent::guessAddress();
     contexts.push_back(std::move(context));
     listeners.push_back(contexts.back()->listen(address));
   }
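A minimal sketch of the reserve() additions above (the lane contents are placeholders): when the final element count is known up front, reserving capacity lets each push_back append without intermediate reallocations.

```cpp
#include <memory>
#include <vector>

std::vector<std::shared_ptr<int>> makeLanes(int numThreads) {
  std::vector<std::shared_ptr<int>> lanes;
  lanes.reserve(static_cast<size_t>(numThreads));  // one allocation for all lanes
  for (int i = 0; i < numThreads; ++i) {
    lanes.push_back(std::make_shared<int>(i));
  }
  return lanes;
}
```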