Apply linter suggestions to #65137 (#65459)
Summary:
Pull Request resolved: #65459

Just run the linter on the change and apply all suggestions

Test Plan: N/A

Reviewed By: seemethere

Differential Revision: D31102960

fbshipit-source-id: 04e1d07935690f2ddbc64533661b3e55379d13b5
malfet authored and facebook-github-bot committed Sep 27, 2021
1 parent 811601e commit 82e0bf4
Showing 24 changed files with 29 additions and 41 deletions.
2 changes: 1 addition & 1 deletion torch/csrc/jit/codegen/cuda/arith.cpp
@@ -789,7 +789,7 @@ WelfordResult Welford(

TORCH_CHECK(
axis >= 0 && (unsigned int)axis < tv->nDims(),
- "Reduction on invalid axis, recieved: ",
+ "Reduction on invalid axis, received: ",
axis,
" however tensor view only has ",
tv->nDims(),
28 changes: 14 additions & 14 deletions torch/csrc/jit/codegen/cuda/dispatch.h
@@ -161,7 +161,7 @@ class TORCH_CUDA_CU_API OptInConstDispatch : public PolymorphicBase {
TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Bool.");
}
virtual void handle(const Double*) {
- TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Double.");
+ TORCH_INTERNAL_ASSERT(false, "Handle not overridden for Double.");
}
virtual void handle(const Int*) {
TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Int.");
@@ -184,7 +184,7 @@ class TORCH_CUDA_CU_API OptInConstDispatch : public PolymorphicBase {
TORCH_INTERNAL_ASSERT(false, "Handle not overriden for BinaryOp.");
}
virtual void handle(const WelfordOp*) {
- TORCH_INTERNAL_ASSERT(false, "Handle not overriden for WelfordOp.");
+ TORCH_INTERNAL_ASSERT(false, "Handle not overridden for WelfordOp.");
}
virtual void handle(const TernaryOp*) {
TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TernaryOp.");
@@ -196,13 +196,13 @@ class TORCH_CUDA_CU_API OptInConstDispatch : public PolymorphicBase {
TORCH_INTERNAL_ASSERT(false, "Handle not overriden for BroadcastOp.");
}
virtual void handle(const TransposeOp*) {
- TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TransposeOp.");
+ TORCH_INTERNAL_ASSERT(false, "Handle not overridden for TransposeOp.");
}
virtual void handle(const ShiftOp*) {
- TORCH_INTERNAL_ASSERT(false, "Handle not overriden for ShiftOp.");
+ TORCH_INTERNAL_ASSERT(false, "Handle not overridden for ShiftOp.");
}
virtual void handle(const GatherOp*) {
- TORCH_INTERNAL_ASSERT(false, "Handle not overriden for GatherOp.");
+ TORCH_INTERNAL_ASSERT(false, "Handle not overridden for GatherOp.");
}
};

@@ -227,7 +227,7 @@ class TORCH_CUDA_CU_API OptInDispatch : public PolymorphicBase {
TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Bool.");
}
virtual void handle(Double*) {
- TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Double.");
+ TORCH_INTERNAL_ASSERT(false, "Handle not overridden for Double.");
}
virtual void handle(Int*) {
TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Int.");
@@ -256,19 +256,19 @@ class TORCH_CUDA_CU_API OptInDispatch : public PolymorphicBase {
TORCH_INTERNAL_ASSERT(false, "Handle not overriden for ReductionOp.");
}
virtual void handle(WelfordOp*) {
- TORCH_INTERNAL_ASSERT(false, "Handle not overriden for WelfordOp.");
+ TORCH_INTERNAL_ASSERT(false, "Handle not overridden for WelfordOp.");
}
virtual void handle(BroadcastOp*) {
TORCH_INTERNAL_ASSERT(false, "Handle not overriden for BroadcastOp.");
}
virtual void handle(TransposeOp*) {
- TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TransposeOp.");
+ TORCH_INTERNAL_ASSERT(false, "Handle not overridden for TransposeOp.");
}
virtual void handle(ShiftOp*) {
- TORCH_INTERNAL_ASSERT(false, "Handle not overriden for ShiftOp.");
+ TORCH_INTERNAL_ASSERT(false, "Handle not overridden for ShiftOp.");
}
virtual void handle(GatherOp*) {
- TORCH_INTERNAL_ASSERT(false, "Handle not overriden for GatherOp.");
+ TORCH_INTERNAL_ASSERT(false, "Handle not overridden for GatherOp.");
}
};

@@ -383,19 +383,19 @@ class TORCH_CUDA_CU_API OptInMutator : public PolymorphicBase {
TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for ReductionOp.");
}
virtual Statement* mutate(WelfordOp*) {
- TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for WelfordOp.");
+ TORCH_INTERNAL_ASSERT(false, "Mutate not overridden for WelfordOp.");
}
virtual Statement* mutate(BroadcastOp*) {
TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for BroadcastOp.");
}
virtual Statement* mutate(TransposeOp*) {
- TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for TransposeOp.");
+ TORCH_INTERNAL_ASSERT(false, "Mutate not overridden for TransposeOp.");
}
virtual Statement* mutate(ShiftOp*) {
- TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for ShiftOp.");
+ TORCH_INTERNAL_ASSERT(false, "Mutate not overridden for ShiftOp.");
}
virtual Statement* mutate(GatherOp*) {
- TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for GatherOp.");
+ TORCH_INTERNAL_ASSERT(false, "Mutate not overridden for GatherOp.");
}
};

1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/executor.cpp
@@ -1,4 +1,3 @@
-
#include <torch/csrc/jit/codegen/cuda/executor.h>

#include <torch/csrc/jit/codegen/cuda/codegen.h>
2 changes: 1 addition & 1 deletion torch/csrc/jit/codegen/cuda/executor.h
@@ -184,7 +184,7 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable {
// launch kernels without re-inference parameters.
std::unordered_map<size_t, ExecutorEntry> executor_entry_lookup_;

- // Profiling support: knob to control wheter we actually execute the
+ // Profiling support: knob to control whether we actually execute the
// kernel on the GPU or not
bool execute_kernel_ = true;

1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/expr_evaluator.cpp
@@ -1,4 +1,3 @@
-
#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
2 changes: 1 addition & 1 deletion torch/csrc/jit/codegen/cuda/fusion_segmenter.h
@@ -149,7 +149,7 @@ class TORCH_CUDA_CU_API SegmentedGroup {
//! Utility to convert edge vector to value vector
std::vector<Val*> edgesToVals(const std::vector<SegmentedEdge*>& se_v);

- //! Reset method to call at begining of each
+ //! Reset method to call at beginning of each
//! merge node iteration
void clearTraversalInfo();

2 changes: 1 addition & 1 deletion torch/csrc/jit/codegen/cuda/index_reference_replay.cpp
@@ -309,7 +309,7 @@ IndexCompute getReferenceIndexing(

// I thought this might be necesasry, but turns out it's not. I think it's
// because of the root ordering above, however leaving it in case we find
- // out it is necessary in some cases. At the time of commiting, cuda-memcheck
+ // out it is necessary in some cases. At the time of committing, cuda-memcheck
// passed without this.
//
// std::unordered_map<kir::IterDomain*,
2 changes: 1 addition & 1 deletion torch/csrc/jit/codegen/cuda/kernel_cache.cpp
@@ -511,7 +511,7 @@ std::vector<at::Tensor> FusionKernelRuntime::runMultiKernelWithInput(

TORCH_INTERNAL_ASSERT(
inputs.size() == segmented_fusion_->inputs().size(),
- "Inputs were not set up correctly, recieved ",
+ "Inputs were not set up correctly, received ",
inputs.size(),
" inputs but expecting ",
segmented_fusion_->inputs().size());
1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.cpp
@@ -1,4 +1,3 @@
-
#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
#include <torch/csrc/jit/codegen/cuda/kernel_ir_printer.h>
1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h
@@ -1,4 +1,3 @@
-
#pragma once

#include <torch/csrc/WindowsTorchApiMacro.h>
2 changes: 1 addition & 1 deletion torch/csrc/jit/codegen/cuda/lower2device.cpp
@@ -341,7 +341,7 @@ void GpuLower::lower() {
processMisalignedVectorization(fusion_, unrolled_loops);

// Reuse memory locations
- // TODO: Reenable once fixed.
+ // TODO: Re-enable once fixed.
// const auto reuse_mem_exprs = reuseMemoryAllocations(unrolled_mv_loops);

// Insert SyncThreads at end of for-loop to avoid WAR race condition
6 changes: 3 additions & 3 deletions torch/csrc/jit/codegen/cuda/lower_allocation.cpp
@@ -488,7 +488,7 @@ class AllocationInserter : public kir::MutableIrVisitor {
"this pass should be run before any conditionals are placed in code.");
}

- AllocationInserter(std::vector<kir::Expr*> _loop_nests)
+ explicit AllocationInserter(std::vector<kir::Expr*> _loop_nests)
: loop_nests_(std::move(_loop_nests)),
gpu_lower(GpuLower::current()),
ir_builder(gpu_lower->kernel()) {
@@ -506,8 +506,8 @@ class AllocationInserter : public kir::MutableIrVisitor {
if (alloc.alloc_expr == nullptr) {
continue;
}
- // Dynamic smem exprs need to be at the begining of the kernel outside for
- // loops
+ // Dynamic smem exprs need to be at the beginning of the kernel outside
+ // for loops
if (alloc.buffer->memoryType() == MemoryType::Shared &&
!kir::ExpressionEvaluator::isConst(alloc.alloc_expr->size())) {
loop_nests_.insert(loop_nests_.begin(), alloc.alloc_expr);
1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/lower_loops.h
@@ -1,4 +1,3 @@
-
#pragma once

#include <torch/csrc/WindowsTorchApiMacro.h>
2 changes: 1 addition & 1 deletion torch/csrc/jit/codegen/cuda/lower_magic_zero.h
@@ -10,7 +10,7 @@ namespace jit {
namespace fuser {
namespace cuda {

- //! Insert magic zero definition at the begining of the kernel. Insert magic
+ //! Insert magic zero definition at the beginning of the kernel. Insert magic
//! zero update after every (outer most) loop nest with a compile time extent.
//!
//! This will make sure nvrtc does not aggressively save predicate and indices.
1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/lower_thread_predicate.h
@@ -1,4 +1,3 @@
-
#pragma once

#include <torch/csrc/WindowsTorchApiMacro.h>
1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/lower_utils.h
@@ -1,4 +1,3 @@
-
#pragma once

#include <torch/csrc/WindowsTorchApiMacro.h>
1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu
@@ -1,4 +1,3 @@
-
// Counter-based block synchronization. Only meant to be used for
// debugging and validating synchronization. This should be replaced
// with cuda::barrier::arrive_and_wait as that should be more robust.
1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/runtime/block_sync_default.cu
@@ -1,4 +1,3 @@
-
// Default block synchronization. Just use __barrier_sync
namespace block_sync {

1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/runtime/broadcast.cu
@@ -1,4 +1,3 @@
-
namespace broadcast {

template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD>
1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/runtime/fp16_support.cu
@@ -1,4 +1,3 @@
-
#define __NVFUSER_HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_HALF_TO_CUS(var) \
*(reinterpret_cast<const unsigned short*>(&(var)))
4 changes: 2 additions & 2 deletions torch/csrc/jit/codegen/cuda/scheduler/normalization.cpp
@@ -142,7 +142,7 @@ ReductionParams innerNormalizationHeuristic(
target_blocks = std::min(target_blocks, device_multiprocessor_count * 4);

if (target_blocks * target_unroll * target_iterations < n_elems) {
- // targetting 4 waves, so try to use a quarter of available threads
+ // targeting 4 waves, so try to use a quarter of available threads
max_threads_in_block = std::min(
ceilDiv(n_elems, target_blocks * target_unroll),
ceilDiv(device_max_threads_per_multiprocessor, (int64_t)4));
@@ -382,7 +382,7 @@ ReductionParams OuterNormalizationHeuristic(
target_blocks = std::min(target_blocks, device_multiprocessor_count * 4);

if (target_blocks * target_unroll * max_threads_in_block < n_elems) {
- // targetting 4 waves, so try to use a quarter of available threads
+ // targeting 4 waves, so try to use a quarter of available threads
max_threads_in_block = std::min(
ceilDiv(n_elems, target_blocks * target_unroll),
ceilDiv(device_max_threads_per_multiprocessor, (int64_t)4));
2 changes: 1 addition & 1 deletion torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp
@@ -655,7 +655,7 @@ void schedulePointwise(Fusion* fusion, const PointwiseParams& params) {
auto consumer_tvs = ir_utils::consumerTvsOf(cached_input);
TORCH_INTERNAL_ASSERT(
consumer_tvs.size(),
- "Input was not succesfully filtered out for scheduling but wasn't used.");
+ "Input was not successfully filtered out for scheduling but wasn't used.");

// Grab a consumer which will be used for computeAt structure of cached
// input into a consumer
4 changes: 2 additions & 2 deletions torch/csrc/jit/codegen/cuda/scheduler/reduction.cpp
@@ -139,7 +139,7 @@ ReductionParams innerReductionHeuristic(
target_blocks = std::min(target_blocks, device_multiprocessor_count * 4);

if (target_blocks * target_unroll * target_iterations < n_elems) {
- // targetting 4 waves, so try to use a quarter of available threads
+ // targeting 4 waves, so try to use a quarter of available threads
max_threads_in_block = std::min(
ceilDiv(n_elems, target_blocks * target_unroll),
ceilDiv(device_max_threads_per_multiprocessor, (int64_t)4));
@@ -385,7 +385,7 @@ ReductionParams OuterReductionHeuristic(
target_blocks = std::min(target_blocks, device_multiprocessor_count * 4);

if (target_blocks * target_unroll * max_threads_in_block < n_elems) {
- // targetting 4 waves, so try to use a quarter of available threads
+ // targeting 4 waves, so try to use a quarter of available threads
max_threads_in_block = std::min(
ceilDiv(n_elems, target_blocks * target_unroll),
ceilDiv(device_max_threads_per_multiprocessor, (int64_t)4));
1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/utils.cpp
@@ -1,4 +1,3 @@
-
#include <torch/csrc/jit/codegen/cuda/utils.h>

#include <c10/util/string_view.h>
