4 changes: 2 additions & 2 deletions .clang-tidy
@@ -59,9 +59,9 @@ performance-*,
 -performance-enum-size,
 readability-container-size-empty,
 readability-delete-null-pointer,
-readability-duplicate-include
+readability-duplicate-include,
 readability-misplaced-array-index,
-readability-redundant*
+readability-redundant*,
 readability-simplify-subscript-expr,
 readability-string-compare,
 -readability-redundant-access-specifiers,
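The two edits here only restore the trailing commas the original entries were missing; since Checks is a single comma-separated glob list, a missing comma likely fused each entry with the following one into a glob that matches nothing, silently disabling readability-duplicate-include and the readability-redundant* family. A minimal sketch (identifiers are illustrative, not from this PR) of the code those re-enabled checks report:

```cpp
// Illustrative only: patterns flagged by readability-duplicate-include and
// readability-redundant* (e.g. redundant string initialization).
#include <string>
#include <vector>
#include <string>  // readability-duplicate-include: already included above

std::string describe(const std::vector<int>& v) {
  std::string label = "";  // readability-redundant*: "" adds nothing over default construction
  if (!v.empty()) {
    label = "non-empty";
  }
  return label;
}
```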
2 changes: 1 addition & 1 deletion aten/src/ATen/native/cuda/Blas.cpp
@@ -1375,7 +1375,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
   if (scaling_choice_a == ScalingType::RowWise && scaling_choice_b == ScalingType::RowWise
       && ((dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900)
       // cuBLAS only supports tiled 1D factor layout for 1D block scaling, no 2D block scales
-      || (dprops->major >= 10 && (scale_a.sizes().size() || scale_b.sizes().size())))) {
+      || (dprops->major >= 10 && (!scale_a.sizes().empty() || !scale_b.sizes().empty())))) {
     TORCH_CHECK(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling.");
     at::cuda::detail::f8f8bf16_rowwise(
         mat1,
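A minimal sketch of the readability-container-size-empty rewrite applied above (names here are hypothetical): testing empty() states the intent directly instead of relying on a size count converting to bool.

```cpp
// Illustrative only; scale_shape stands in for scale_a.sizes() / scale_b.sizes().
#include <vector>

bool is_block_scaled(const std::vector<long>& scale_shape) {
  // Before: return scale_shape.size();   // implicit size_t -> bool conversion
  return !scale_shape.empty();            // says "has at least one dimension"
}
```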
1 change: 0 additions & 1 deletion aten/src/ATen/native/cuda/LinearAlgebraStubs.cpp
@@ -8,7 +8,6 @@
 #include <ATen/NativeFunctions.h>
 #include <ATen/Dispatch.h>
 #include <ATen/DynamicLibrary.h>
-#include <ATen/NativeFunctions.h>
 #include <ATen/native/cuda/MiscUtils.h>
 #include <ATen/native/Resize.h>
 #include <ATen/native/LinearAlgebra.h>
8 changes: 4 additions & 4 deletions aten/src/ATen/native/cuda/jit_utils.cpp
@@ -1041,8 +1041,8 @@ std::string generate_code(
   // and `extra_args` for computation call if
   // extra arguments to capture runtime state are passed.
   // (look at polygamma for example).
-  std::string extra_params = "";
-  std::string extra_args = "";
+  std::string extra_params;
+  std::string extra_args;
   for (size_t i = 0; i < extra_args_typenames.size(); i++) {
     auto type = std::string(extra_args_typenames[i]);
     auto name = "extra_arg_" + std::to_string(i);
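A minimal sketch of the redundant string initialization fix above (part of the readability-redundant* group enabled in .clang-tidy): a default-constructed std::string is already empty, so initializing it from "" only adds an unnecessary construction from a literal.

```cpp
#include <string>

void build_args() {
  std::string extra_params = "";  // flagged: redundant initialization from ""
  std::string extra_args;         // preferred: default construction is already empty
  // Both are empty strings here; only the second avoids the extra literal construction.
  (void)extra_params;
  (void)extra_args;
}
```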
@@ -1352,7 +1352,7 @@ std::string generate_reduction_code(
     int vec_size,
     int max_threads_codegen) {
   TORCH_INTERNAL_ASSERT(desc.nInputs == 1);
-  TORCH_INTERNAL_ASSERT(desc.extra_args_types.size() == 0);
+  TORCH_INTERNAL_ASSERT(desc.extra_args_types.empty());

   return generate_reduction_code(
       desc.nOutputs,
@@ -1451,7 +1451,7 @@ std::optional<std::string> get_cache_dir() {
   std::string cache_dir;
   char* ptkcp = std::getenv("PYTORCH_KERNEL_CACHE_PATH");
   // Create kernel_cache_dir if needed as we do not want to create the base directory passed by the user
-  std::string kernels_cache_dir = "";
+  std::string kernels_cache_dir;
   if (ptkcp != nullptr) {
     cache_dir = std::string(ptkcp);
   } else {
1 change: 0 additions & 1 deletion aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp
@@ -14,7 +14,6 @@
 #include <ATen/native/LinearAlgebraUtils.h>
 #include <ATen/native/cuda/MiscUtils.h>
 #include <ATen/native/LinearAlgebra.h>
-#include <ATen/native/BatchLinearAlgebra.h>
 #include <ATen/native/cuda/linalg/BatchLinearAlgebraLib.h>
 #include <ATen/native/cuda/linalg/MagmaUtils.h>
 #include <ATen/native/cpu/zmath.h>
2 changes: 1 addition & 1 deletion torch/csrc/autograd/python_function.cpp
@@ -813,7 +813,7 @@ static void _save_variables(
     const std::vector<std::optional<at::Tensor>>& tensors_to_save,
     const std::shared_ptr<PyNode>& cdata_ptr,
     THPFunction* self) {
-  if (tensors_to_save.size() == 0)
+  if (tensors_to_save.empty())
     return;
   size_t num_saved = tensors_to_save.size();
   self->saved_variables.clear();
2 changes: 1 addition & 1 deletion torch/csrc/distributed/c10d/Functional.cpp
@@ -35,7 +35,7 @@ at::Tensor allocate_all_gather_output(
     int64_t group_size) {
   TORCH_CHECK(input.is_contiguous());
   auto output_size = input.sizes().vec();
-  if (output_size.size() == 0) {
+  if (output_size.empty()) {
     output_size.push_back(group_size);
   } else {
     output_size[0] *= group_size;
2 changes: 1 addition & 1 deletion torch/csrc/distributed/c10d/GlooDeviceFactory.cpp
@@ -196,7 +196,7 @@ std::shared_ptr<::gloo::transport::Device> makeGlooDevice(
   static auto transportName = c10::utils::get_env("GLOO_DEVICE_TRANSPORT");
   if (transportName.has_value()) {
     return GlooDeviceRegistry()->Create(
-        transportName.value().c_str(), interfaceName, hostName, lazyInit);
+        transportName.value(), interfaceName, hostName, lazyInit);
   }

 #ifdef __linux__
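A minimal sketch of the pattern behind this change (assuming the registry's Create accepts std::string-like arguments, as the new call implies): when the callee takes a std::string, passing .c_str() only forces an extra strlen and copy to rebuild the string, the kind of thing the readability-redundant* group reports.

```cpp
#include <string>

// Illustrative stand-in for GlooDeviceRegistry()->Create.
void create_device(const std::string& transport) {
  (void)transport;
}

void pick_transport(const std::string& transportName) {
  // Before: create_device(transportName.c_str());  // rebuilds a std::string from the C string
  create_device(transportName);  // pass the existing string straight through
}
```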
2 changes: 1 addition & 1 deletion torch/csrc/distributed/c10d/ProcessGroup.cpp
@@ -165,7 +165,7 @@ c10::intrusive_ptr<ProcessGroup> ProcessGroup::splitGroup(
     const std::optional<std::string>& name,
     const std::optional<std::string>& desc) {
   TORCH_CHECK(
-      ranks.size() > 0,
+      !ranks.empty(),
       "Split ranks cannot be empty. Please provide a non-empty list of ranks to split the group.");
   TORCH_CHECK(
       ranks.size() <= static_cast<size_t>(size_),
2 changes: 1 addition & 1 deletion torch/csrc/distributed/c10d/ProcessGroupGloo.cpp
@@ -559,7 +559,7 @@ c10::intrusive_ptr<ProcessGroupGloo::Options> ProcessGroupGloo::Options::
   // Use interfaces listed in "GLOO_SOCKET_IFNAME", if set.
   auto ifnameEnv = c10::utils::get_env("GLOO_SOCKET_IFNAME");
   if (ifnameEnv && ifnameEnv->size() > 1) {
-    for (const auto& iface : ::c10d::split(',', ifnameEnv->c_str())) {
+    for (const auto& iface : ::c10d::split(',', *ifnameEnv)) {
       options->devices.push_back(
           ::c10d::ProcessGroupGloo::createDeviceForInterface(iface, lazyInit));
     }
3 changes: 2 additions & 1 deletion torch/csrc/distributed/c10d/ProcessGroupGlooCuda.cpp
@@ -1,6 +1,7 @@
 #ifdef USE_C10D_GLOO
 #include <torch/csrc/distributed/c10d/ProcessGroupGloo.hpp>
 #include <torch/csrc/distributed/c10d/ProcessGroupGlooDetail.hpp>
+#include <utility>

 #include <gloo/cuda_allreduce_ring_chunked.h>
@@ -24,7 +25,7 @@ class AsyncAllreduceCUDADeviceWork : public ProcessGroupGloo::AsyncWork {
             "gloo:all_reduce",
             inputs),
         inputs_(inputs),
-        reduceOp_(reduceOp) {}
+        reduceOp_(std::move(reduceOp)) {}

   template <typename T>
   void createAlgorithm(std::unique_ptr<gloo::Algorithm>& algo) {
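A minimal sketch of the std::move change above (Work and its member types are illustrative, not the real AsyncWork types): a parameter taken by value is a sink, so moving it into the member avoids a second copy, which is also why the first hunk adds #include &lt;utility&gt;.

```cpp
#include <utility>
#include <vector>

struct Work {
  // inputs mirrors the by-value sink parameter in the diff.
  explicit Work(std::vector<int> inputs)
      : inputs_(std::move(inputs)) {}  // move the by-value parameter instead of copying it again

  std::vector<int> inputs_;
};
```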
6 changes: 3 additions & 3 deletions torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
@@ -1089,8 +1089,8 @@ class TORCH_API ProcessGroupNCCL : public Backend {
   bool useNonblocking();

 protected:
-  int globalRankStart_;
-  int globalRankStride_;
+  int globalRankStart_{};
+  int globalRankStride_{};

 private:
   bool eagerInit_{false};
@@ -1380,7 +1380,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
   std::shared_ptr<NCCLComm> coalescedComm_ = nullptr;

   // Whether the coalesced calls are sync or async.
-  bool coalescedAsync_;
+  bool coalescedAsync_{};

   // keeps track of input and output tensors when coalescing is in flight. Will
   // hand over these tensors to WorkNCCL's stash when coalescing is ended.
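A minimal sketch of the brace-initializer change in both hunks above (member names are illustrative): an in-class {} initializer value-initializes the member, 0 for int and false for bool, so a constructor path that forgets to assign it no longer leaves an indeterminate value.

```cpp
// Illustrative stand-in for the ProcessGroupNCCL members touched above.
struct Example {
  int globalRankStart{};    // value-initialized to 0
  int globalRankStride{};   // value-initialized to 0
  bool coalescedAsync{};    // value-initialized to false
};
```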
4 changes: 2 additions & 2 deletions torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp
@@ -163,8 +163,8 @@ struct CollectiveFingerPrint {
     backend->allgather(output_tensors, tensors_to_verify)->wait();
     // Verify equivalence
     for (const auto i : c10::irange(output_tensors.size())) {
-      const std::vector<at::Tensor> gathered_tensors = output_tensors[i];
-      const at::Tensor reference_tensor = tensors_to_verify[i];
+      const std::vector<at::Tensor>& gathered_tensors = output_tensors[i];
+      const at::Tensor& reference_tensor = tensors_to_verify[i];
       for (const auto rank : c10::irange(gathered_tensors.size())) {
         const auto& rank_tensor = gathered_tensors[rank];
         if (!rank_tensor.equal(reference_tensor)) {
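A minimal sketch of the reference-binding fix above, using plain std::vector instead of at::Tensor: binding a const reference to the indexed element avoids copying the whole container on every iteration, likely what a performance-* copy-initialization check reports.

```cpp
#include <vector>

void verify(const std::vector<std::vector<int>>& output_tensors) {
  for (size_t i = 0; i < output_tensors.size(); ++i) {
    // Before: const std::vector<int> gathered = output_tensors[i];  // deep copy per iteration
    const std::vector<int>& gathered = output_tensors[i];             // no copy
    for (int value : gathered) {
      (void)value;
    }
  }
}
```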
4 changes: 3 additions & 1 deletion torch/csrc/distributed/rpc/tensorpipe_agent.cpp
@@ -263,10 +263,12 @@ constexpr static int kNumUvThreads = 16;

 std::unique_ptr<ChannelRegistration> makeMultiplexedUvChannel() {
   std::vector<std::shared_ptr<tensorpipe::transport::Context>> contexts;
+  contexts.reserve(kNumUvThreads);
   std::vector<std::shared_ptr<tensorpipe::transport::Listener>> listeners;
+  listeners.reserve(kNumUvThreads);
   for ([[maybe_unused]] const auto laneIdx : c10::irange(kNumUvThreads)) {
     auto context = tensorpipe::transport::uv::create();
-    std::string address = TensorPipeAgent::guessAddress();
+    const std::string& address = TensorPipeAgent::guessAddress();
     contexts.push_back(std::move(context));
     listeners.push_back(contexts.back()->listen(address));
   }
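A minimal sketch of the reserve() additions above (the lane contents are placeholders): when the final element count is known up front, reserving capacity lets each push_back append without intermediate reallocations.

```cpp
#include <memory>
#include <vector>

std::vector<std::shared_ptr<int>> makeLanes(int numThreads) {
  std::vector<std::shared_ptr<int>> lanes;
  lanes.reserve(static_cast<size_t>(numThreads));  // one allocation for all lanes
  for (int i = 0; i < numThreads; ++i) {
    lanes.push_back(std::make_shared<int>(i));
  }
  return lanes;
}
```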