Apply linter suggestions to #65137 (#65459)
Summary:
Pull Request resolved: #65459

Just run the linter on the change and apply all suggestions

Test Plan: N/A

Reviewed By: seemethere

Differential Revision: D31102960

fbshipit-source-id: 04e1d07935690f2ddbc64533661b3e55379d13b5
malfet authored and facebook-github-bot committed Sep 27, 2021
1 parent 811601e commit 82e0bf4
Showing 24 changed files with 29 additions and 41 deletions.
2 changes: 1 addition & 1 deletion torch/csrc/jit/codegen/cuda/arith.cpp
@@ -789,7 +789,7 @@ WelfordResult Welford(

TORCH_CHECK(
axis >= 0 && (unsigned int)axis < tv->nDims(),
- "Reduction on invalid axis, recieved: ",
+ "Reduction on invalid axis, received: ",
axis,
" however tensor view only has ",
tv->nDims(),
28 changes: 14 additions & 14 deletions torch/csrc/jit/codegen/cuda/dispatch.h
@@ -161,7 +161,7 @@ class TORCH_CUDA_CU_API OptInConstDispatch : public PolymorphicBase {
TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Bool.");
}
virtual void handle(const Double*) {
- TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Double.");
+ TORCH_INTERNAL_ASSERT(false, "Handle not overridden for Double.");
}
virtual void handle(const Int*) {
TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Int.");
@@ -184,7 +184,7 @@ class TORCH_CUDA_CU_API OptInConstDispatch : public PolymorphicBase {
TORCH_INTERNAL_ASSERT(false, "Handle not overriden for BinaryOp.");
}
virtual void handle(const WelfordOp*) {
- TORCH_INTERNAL_ASSERT(false, "Handle not overriden for WelfordOp.");
+ TORCH_INTERNAL_ASSERT(false, "Handle not overridden for WelfordOp.");
}
virtual void handle(const TernaryOp*) {
TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TernaryOp.");
@@ -196,13 +196,13 @@ class TORCH_CUDA_CU_API OptInConstDispatch : public PolymorphicBase {
TORCH_INTERNAL_ASSERT(false, "Handle not overriden for BroadcastOp.");
}
virtual void handle(const TransposeOp*) {
- TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TransposeOp.");
+ TORCH_INTERNAL_ASSERT(false, "Handle not overridden for TransposeOp.");
}
virtual void handle(const ShiftOp*) {
- TORCH_INTERNAL_ASSERT(false, "Handle not overriden for ShiftOp.");
+ TORCH_INTERNAL_ASSERT(false, "Handle not overridden for ShiftOp.");
}
virtual void handle(const GatherOp*) {
- TORCH_INTERNAL_ASSERT(false, "Handle not overriden for GatherOp.");
+ TORCH_INTERNAL_ASSERT(false, "Handle not overridden for GatherOp.");
}
};

@@ -227,7 +227,7 @@ class TORCH_CUDA_CU_API OptInDispatch : public PolymorphicBase {
TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Bool.");
}
virtual void handle(Double*) {
- TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Double.");
+ TORCH_INTERNAL_ASSERT(false, "Handle not overridden for Double.");
}
virtual void handle(Int*) {
TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Int.");
@@ -256,19 +256,19 @@ class TORCH_CUDA_CU_API OptInDispatch : public PolymorphicBase {
TORCH_INTERNAL_ASSERT(false, "Handle not overriden for ReductionOp.");
}
virtual void handle(WelfordOp*) {
- TORCH_INTERNAL_ASSERT(false, "Handle not overriden for WelfordOp.");
+ TORCH_INTERNAL_ASSERT(false, "Handle not overridden for WelfordOp.");
}
virtual void handle(BroadcastOp*) {
TORCH_INTERNAL_ASSERT(false, "Handle not overriden for BroadcastOp.");
}
virtual void handle(TransposeOp*) {
- TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TransposeOp.");
+ TORCH_INTERNAL_ASSERT(false, "Handle not overridden for TransposeOp.");
}
virtual void handle(ShiftOp*) {
- TORCH_INTERNAL_ASSERT(false, "Handle not overriden for ShiftOp.");
+ TORCH_INTERNAL_ASSERT(false, "Handle not overridden for ShiftOp.");
}
virtual void handle(GatherOp*) {
- TORCH_INTERNAL_ASSERT(false, "Handle not overriden for GatherOp.");
+ TORCH_INTERNAL_ASSERT(false, "Handle not overridden for GatherOp.");
}
};

@@ -383,19 +383,19 @@ class TORCH_CUDA_CU_API OptInMutator : public PolymorphicBase {
TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for ReductionOp.");
}
virtual Statement* mutate(WelfordOp*) {
- TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for WelfordOp.");
+ TORCH_INTERNAL_ASSERT(false, "Mutate not overridden for WelfordOp.");
}
virtual Statement* mutate(BroadcastOp*) {
TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for BroadcastOp.");
}
virtual Statement* mutate(TransposeOp*) {
- TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for TransposeOp.");
+ TORCH_INTERNAL_ASSERT(false, "Mutate not overridden for TransposeOp.");
}
virtual Statement* mutate(ShiftOp*) {
- TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for ShiftOp.");
+ TORCH_INTERNAL_ASSERT(false, "Mutate not overridden for ShiftOp.");
}
virtual Statement* mutate(GatherOp*) {
- TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for GatherOp.");
+ TORCH_INTERNAL_ASSERT(false, "Mutate not overridden for GatherOp.");
}
};

1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/executor.cpp
@@ -1,4 +1,3 @@
-
#include <torch/csrc/jit/codegen/cuda/executor.h>

#include <torch/csrc/jit/codegen/cuda/codegen.h>
2 changes: 1 addition & 1 deletion torch/csrc/jit/codegen/cuda/executor.h
@@ -184,7 +184,7 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable {
// launch kernels without re-inference parameters.
std::unordered_map<size_t, ExecutorEntry> executor_entry_lookup_;

- // Profiling support: knob to control wheter we actually execute the
+ // Profiling support: knob to control whether we actually execute the
// kernel on the GPU or not
bool execute_kernel_ = true;

1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/expr_evaluator.cpp
@@ -1,4 +1,3 @@
-
#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
2 changes: 1 addition & 1 deletion torch/csrc/jit/codegen/cuda/fusion_segmenter.h
@@ -149,7 +149,7 @@ class TORCH_CUDA_CU_API SegmentedGroup {
//! Utility to convert edge vector to value vector
std::vector<Val*> edgesToVals(const std::vector<SegmentedEdge*>& se_v);

- //! Reset method to call at begining of each
+ //! Reset method to call at beginning of each
//! merge node iteration
void clearTraversalInfo();

2 changes: 1 addition & 1 deletion torch/csrc/jit/codegen/cuda/index_reference_replay.cpp
@@ -309,7 +309,7 @@ IndexCompute getReferenceIndexing(

// I thought this might be necesasry, but turns out it's not. I think it's
// because of the root ordering above, however leaving it in case we find
- // out it is necessary in some cases. At the time of commiting, cuda-memcheck
+ // out it is necessary in some cases. At the time of committing, cuda-memcheck
// passed without this.
//
// std::unordered_map<kir::IterDomain*,
2 changes: 1 addition & 1 deletion torch/csrc/jit/codegen/cuda/kernel_cache.cpp
@@ -511,7 +511,7 @@ std::vector<at::Tensor> FusionKernelRuntime::runMultiKernelWithInput(

TORCH_INTERNAL_ASSERT(
inputs.size() == segmented_fusion_->inputs().size(),
- "Inputs were not set up correctly, recieved ",
+ "Inputs were not set up correctly, received ",
inputs.size(),
" inputs but expecting ",
segmented_fusion_->inputs().size());
1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.cpp
@@ -1,4 +1,3 @@
-
#include <torch/csrc/jit/codegen/cuda/instrumentation.h>
#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
#include <torch/csrc/jit/codegen/cuda/kernel_ir_printer.h>
1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h
@@ -1,4 +1,3 @@
-
#pragma once

#include <torch/csrc/WindowsTorchApiMacro.h>
2 changes: 1 addition & 1 deletion torch/csrc/jit/codegen/cuda/lower2device.cpp
@@ -341,7 +341,7 @@ void GpuLower::lower() {
processMisalignedVectorization(fusion_, unrolled_loops);

// Reuse memory locations
- // TODO: Reenable once fixed.
+ // TODO: Re-enable once fixed.
// const auto reuse_mem_exprs = reuseMemoryAllocations(unrolled_mv_loops);

// Insert SyncThreads at end of for-loop to avoid WAR race condition
6 changes: 3 additions & 3 deletions torch/csrc/jit/codegen/cuda/lower_allocation.cpp
@@ -488,7 +488,7 @@ class AllocationInserter : public kir::MutableIrVisitor {
"this pass should be run before any conditionals are placed in code.");
}

- AllocationInserter(std::vector<kir::Expr*> _loop_nests)
+ explicit AllocationInserter(std::vector<kir::Expr*> _loop_nests)
: loop_nests_(std::move(_loop_nests)),
gpu_lower(GpuLower::current()),
ir_builder(gpu_lower->kernel()) {
@@ -506,8 +506,8 @@ class AllocationInserter : public kir::MutableIrVisitor {
if (alloc.alloc_expr == nullptr) {
continue;
}
- // Dynamic smem exprs need to be at the begining of the kernel outside for
- // loops
+ // Dynamic smem exprs need to be at the beginning of the kernel outside
+ // for loops
if (alloc.buffer->memoryType() == MemoryType::Shared &&
!kir::ExpressionEvaluator::isConst(alloc.alloc_expr->size())) {
loop_nests_.insert(loop_nests_.begin(), alloc.alloc_expr);
1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/lower_loops.h
@@ -1,4 +1,3 @@
-
#pragma once

#include <torch/csrc/WindowsTorchApiMacro.h>
2 changes: 1 addition & 1 deletion torch/csrc/jit/codegen/cuda/lower_magic_zero.h
@@ -10,7 +10,7 @@ namespace jit {
namespace fuser {
namespace cuda {

- //! Insert magic zero definition at the begining of the kernel. Insert magic
+ //! Insert magic zero definition at the beginning of the kernel. Insert magic
//! zero update after every (outer most) loop nest with a compile time extent.
//!
//! This will make sure nvrtc does not aggressively save predicate and indices.
1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/lower_thread_predicate.h
@@ -1,4 +1,3 @@
-
#pragma once

#include <torch/csrc/WindowsTorchApiMacro.h>
1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/lower_utils.h
@@ -1,4 +1,3 @@
-
#pragma once

#include <torch/csrc/WindowsTorchApiMacro.h>
1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu
@@ -1,4 +1,3 @@
-
// Counter-based block synchronization. Only meant to be used for
// debugging and validating synchronization. This should be replaced
// with cuda::barrier::arrive_and_wait as that should be more robust.
1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/runtime/block_sync_default.cu
@@ -1,4 +1,3 @@
-
// Default block synchronization. Just use __barrier_sync
namespace block_sync {

1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/runtime/broadcast.cu
@@ -1,4 +1,3 @@
-
namespace broadcast {

template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD>
1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/runtime/fp16_support.cu
@@ -1,4 +1,3 @@
-
#define __NVFUSER_HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_HALF_TO_CUS(var) \
*(reinterpret_cast<const unsigned short*>(&(var)))
4 changes: 2 additions & 2 deletions torch/csrc/jit/codegen/cuda/scheduler/normalization.cpp
@@ -142,7 +142,7 @@ ReductionParams innerNormalizationHeuristic(
target_blocks = std::min(target_blocks, device_multiprocessor_count * 4);

if (target_blocks * target_unroll * target_iterations < n_elems) {
- // targetting 4 waves, so try to use a quarter of available threads
+ // targeting 4 waves, so try to use a quarter of available threads
max_threads_in_block = std::min(
ceilDiv(n_elems, target_blocks * target_unroll),
ceilDiv(device_max_threads_per_multiprocessor, (int64_t)4));
@@ -382,7 +382,7 @@ ReductionParams OuterNormalizationHeuristic(
target_blocks = std::min(target_blocks, device_multiprocessor_count * 4);

if (target_blocks * target_unroll * max_threads_in_block < n_elems) {
- // targetting 4 waves, so try to use a quarter of available threads
+ // targeting 4 waves, so try to use a quarter of available threads
max_threads_in_block = std::min(
ceilDiv(n_elems, target_blocks * target_unroll),
ceilDiv(device_max_threads_per_multiprocessor, (int64_t)4));
2 changes: 1 addition & 1 deletion torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp
@@ -655,7 +655,7 @@ void schedulePointwise(Fusion* fusion, const PointwiseParams& params) {
auto consumer_tvs = ir_utils::consumerTvsOf(cached_input);
TORCH_INTERNAL_ASSERT(
consumer_tvs.size(),
- "Input was not succesfully filtered out for scheduling but wasn't used.");
+ "Input was not successfully filtered out for scheduling but wasn't used.");

// Grab a consumer which will be used for computeAt structure of cached
// input into a consumer
4 changes: 2 additions & 2 deletions torch/csrc/jit/codegen/cuda/scheduler/reduction.cpp
@@ -139,7 +139,7 @@ ReductionParams innerReductionHeuristic(
target_blocks = std::min(target_blocks, device_multiprocessor_count * 4);

if (target_blocks * target_unroll * target_iterations < n_elems) {
- // targetting 4 waves, so try to use a quarter of available threads
+ // targeting 4 waves, so try to use a quarter of available threads
max_threads_in_block = std::min(
ceilDiv(n_elems, target_blocks * target_unroll),
ceilDiv(device_max_threads_per_multiprocessor, (int64_t)4));
@@ -385,7 +385,7 @@ ReductionParams OuterReductionHeuristic(
target_blocks = std::min(target_blocks, device_multiprocessor_count * 4);

if (target_blocks * target_unroll * max_threads_in_block < n_elems) {
- // targetting 4 waves, so try to use a quarter of available threads
+ // targeting 4 waves, so try to use a quarter of available threads
max_threads_in_block = std::min(
ceilDiv(n_elems, target_blocks * target_unroll),
ceilDiv(device_max_threads_per_multiprocessor, (int64_t)4));
1 change: 0 additions & 1 deletion torch/csrc/jit/codegen/cuda/utils.cpp
@@ -1,4 +1,3 @@
-
#include <torch/csrc/jit/codegen/cuda/utils.h>

#include <c10/util/string_view.h>
