Skip to content

Commit

Permalink
Fine-grained kernel profiling (#1720)
Browse files Browse the repository at this point in the history
* Fine-grained kernel profiling

With `PYTORCH_NVFUSER_ENABLE=kernel_profile`, elapsed times of grid reductions are printed at the end of each kernel call.
  • Loading branch information
naoyam committed May 23, 2022
1 parent 77c1b4f commit 8fbd0b1
Show file tree
Hide file tree
Showing 18 changed files with 565 additions and 4 deletions.
1 change: 1 addition & 0 deletions tools/build_variables.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -695,6 +695,7 @@ libtorch_cuda_core_sources = [
"torch/csrc/jit/codegen/cuda/lower_index.cpp",
"torch/csrc/jit/codegen/cuda/lower_index_hoist.cpp",
"torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp",
"torch/csrc/jit/codegen/cuda/lower_instrument.cpp",
"torch/csrc/jit/codegen/cuda/lower_loops.cpp",
"torch/csrc/jit/codegen/cuda/lower_magic_zero.cpp",
"torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.cpp",
Expand Down
21 changes: 21 additions & 0 deletions torch/csrc/jit/codegen/cuda/codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1288,6 +1288,19 @@ class CudaKernelGenerator : private OptOutConstDispatch {
return flags.str();
}

void addProfileArguments(ArgumentBuilder& func_args, const Expr* expr) {
if (isEnabled(EnableOption::KernelProfile) &&
kernel_->profile().isProfiled(expr)) {
const auto& buffer_indices =
kernel_->profile().getIndicesInProfileBuffer(expr);
auto buffer = kernel_->profile().getBuffer();
TORCH_INTERNAL_ASSERT(buffer != nullptr);
for (const auto& index : buffer_indices) {
func_args.arg(varName(buffer)).append("[").append(index).append("]");
}
}
}

void handle(const kir::GridReduction* grop) final {
TORCH_INTERNAL_ASSERT(grop->out()->isA<kir::TensorIndex>());

Expand Down Expand Up @@ -1345,6 +1358,8 @@ class CudaKernelGenerator : private OptOutConstDispatch {
func_args.arg(genInline(grop->entrance_index()));
func_args.arg(genInline(grop->entrances()));

addProfileArguments(func_args, grop);

indent() << "reduction::gridReduce<" << template_args << ">(\n";
indent() << kTab << func_args << ");\n";
}
Expand Down Expand Up @@ -1412,6 +1427,8 @@ class CudaKernelGenerator : private OptOutConstDispatch {
// reduction_op
func_args.arg(genReductionOp(op_type, out->dtype()));

addProfileArguments(func_args, grop);

indent() << kTab << func_args << ");\n";
}

Expand Down Expand Up @@ -1483,6 +1500,8 @@ class CudaKernelGenerator : private OptOutConstDispatch {
func_args.arg(read_pred);
}

addProfileArguments(func_args, grouped_grop);

indent() << "reduction::gridReduceGroup<" << template_args << ">(\n";
indent() << kTab << func_args << ");\n";
}
Expand Down Expand Up @@ -1543,6 +1562,8 @@ class CudaKernelGenerator : private OptOutConstDispatch {
func_args.arg(read_pred);
}

addProfileArguments(func_args, grouped_grop);

indent() << genFusedReductionName(ir_utils::getTvOutput(grouped_grop))
<< ".reduceGroup(\n";
indent() << kTab << func_args << ");\n";
Expand Down
9 changes: 9 additions & 0 deletions torch/csrc/jit/codegen/cuda/executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,11 @@ FusionExecutor::GlobalBuffers FusionExecutor::allocGlobalVals(
inferAndAlloc(tv, alloc->shape(), expr_eval, options_, false));
global_buffers.zero_init.push_back(false);
}
// Remember the tensor buffer used for storing kernel profile
if (isEnabled(EnableOption::KernelProfile) &&
tv == kernel->profile().getBuffer()) {
global_buffers.profile_buffer = global_buffers.buffers.back();
}
}

return global_buffers;
Expand Down Expand Up @@ -1035,6 +1040,10 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
}
}

if (isEnabled(EnableOption::KernelProfile)) {
std::cout << kernel()->profile().toString(global_buffers.profile_buffer);
}

return allocated_outputs;
}

Expand Down
1 change: 1 addition & 0 deletions torch/csrc/jit/codegen/cuda/executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable {
  struct GlobalBuffers {
    // Intermediate global-memory tensors allocated for the kernel
    std::vector<at::Tensor> buffers;
    // Parallel to buffers: whether each entry was zero-initialized
    std::vector<bool> zero_init;
    // The buffer (also present in buffers) that stores kernel profiling
    // results; only defined when EnableOption::KernelProfile is enabled
    at::Tensor profile_buffer;
  };

static std::string kernelNamespace() {
Expand Down
4 changes: 4 additions & 0 deletions torch/csrc/jit/codegen/cuda/executor_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -964,6 +964,10 @@ std::pair<NvrtcFunction, std::string> nvrtcCompile(
args.push_back("-DNDEBUG");
#endif

if (isEnabled(EnableOption::KernelProfile)) {
args.push_back("-DPYTORCH_NVFUSER_PROFILE_KERNEL");
}

const char* ptxas_opt_level = getenv("PYTORCH_NVFUSER_JIT_OPT_LEVEL");
std::string jit_opt_level = "-O";

Expand Down
67 changes: 67 additions & 0 deletions torch/csrc/jit/codegen/cuda/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>


Expand Down Expand Up @@ -277,6 +279,7 @@ void Kernel::finalize(std::vector<Expr*> top_level_exprs) {
TORCH_INTERNAL_ASSERT(top_level_exprs_.empty());
top_level_exprs_ = std::move(top_level_exprs);
warp_padded_parallel_info_ = GpuLower::current()->getWarpPaddedParallelInfo();
profile_ = GpuLower::current()->profile();
ValidateAllocation::validate(this);
analyze();
// Make sure this is after analyze as it sets summary_
Expand Down Expand Up @@ -358,6 +361,70 @@ std::vector<Expr*>& KernelInternalProxy::topLevelExprs() {
return kernel_->top_level_exprs_;
}

void KernelPerformanceProfile::registerExpr(const Expr* expr) {
if (expr_entry_map_.find(expr) != expr_entry_map_.end()) {
return;
}

auto slot = getNewIndex();
expr_entry_map_.emplace(expr, slot);
}

// Hand out profile-entry indices sequentially, starting from zero.
int KernelPerformanceProfile::getNewIndex() {
  const int new_index = num_profile_entries_;
  num_profile_entries_ = new_index + 1;
  return new_index;
}

// An expression is profiled iff it has been assigned a profile entry.
bool KernelPerformanceProfile::isProfiled(const Expr* expr) const {
  return expr_entry_map_.count(expr) != 0;
}

// Look up the profile entry index of expr; nullopt if not registered.
c10::optional<int> KernelPerformanceProfile::getIndex(const Expr* expr) const {
  const auto it = expr_entry_map_.find(expr);
  if (it != expr_entry_map_.end()) {
    return it->second;
  }
  return c10::nullopt;
}

// Map a profiled expression to its two consecutive slots in the backing
// buffer: {cycle_index, count_index}. It is an error to call this with an
// unregistered expression.
std::array<int, 2> KernelPerformanceProfile::getIndicesInProfileBuffer(
    const Expr* expr) const {
  TORCH_INTERNAL_ASSERT(
      isProfiled(expr), "Not a profiled expression: ", expr->toString());

  const int cycle_index = getIndex(expr).value() * 2;
  return {cycle_index, cycle_index + 1};
}

//! Render a human-readable summary of the profile results in buffer (the
//! profile tensor read back from the device). Each line reports the
//! expression type, its output tensor, average microseconds per call, and
//! the call count.
std::string KernelPerformanceProfile::toString(const at::Tensor& buffer) const {
  std::stringstream ss;
  ss << "Kernel performance profile:\n";
  if (!buffer.defined()) {
    ss << "No profile found\n";
    return ss.str();
  }

  // clockRate is in kHz, so (cycles / clockRate) * 1000 gives microseconds
  double kilo_freq = at::cuda::getCurrentDeviceProperties()->clockRate;

  ss << std::setprecision(3) << std::fixed;

  // Report entries in profile-index (i.e., registration) order. Iterating
  // expr_entry_map_ directly would make the report's line order
  // nondeterministic, as unordered_map iteration order is unspecified.
  std::vector<std::pair<const Expr*, int>> entries(
      expr_entry_map_.begin(), expr_entry_map_.end());
  std::sort(
      entries.begin(),
      entries.end(),
      [](const std::pair<const Expr*, int>& lhs,
         const std::pair<const Expr*, int>& rhs) {
        return lhs.second < rhs.second;
      });

  for (const auto& kv : entries) {
    auto expr = kv.first;
    auto index = kv.second;
    auto out_tv = ir_utils::getTvOutput(expr);
    // buffer[index] holds {accumulated cycles, call count}
    double cycles = static_cast<double>(buffer[index][0].item<int64_t>());
    auto count = buffer[index][1].item<int64_t>();
    // Guard against division by zero when an entry was never executed
    auto cycles_per_call = count == 0 ? 0.0 : cycles / count;
    auto us_per_call = cycles_per_call / kilo_freq * 1000.0;
    ss << expr->getExprType().value() << ", T" << out_tv->name() << ", "
       << us_per_call << " us, " << count << "\n";
  }

  return ss.str();
}

} // namespace kir
} // namespace cuda
} // namespace fuser
Expand Down
57 changes: 57 additions & 0 deletions torch/csrc/jit/codegen/cuda/kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,57 @@ struct KernelSummary {
std::vector<VectorizedSetInfo> vectorized_set_info;
};

//! Bookkeeping for fine-grained kernel profiling: maps profiled expressions
//! to slots in a device-side integer buffer where cycle and call counts are
//! recorded, and renders the results read back from that buffer.
class TORCH_CUDA_CU_API KernelPerformanceProfile {
 public:
  //! Register an expression to profile. Registering the same expression
  //! more than once is a no-op.
  void registerExpr(const Expr* expr);

  //! Query if an expression is profiled
  bool isProfiled(const Expr* expr) const;

  //! Get the number of profiled expressions
  int getNumberOfProfileEntries() const {
    return num_profile_entries_;
  }

  //! Set the backing buffer of profile.
  void setBuffer(TensorView* buffer) {
    buffer_ = buffer;
  }

  //! Get the backing buffer (nullptr if not set)
  TensorView* getBuffer() const {
    return buffer_;
  }

  //! Get the indices of the profile of an expression in the backing buffer.
  //! Returns {cycle_index, count_index}; asserts that expr is profiled.
  std::array<int, 2> getIndicesInProfileBuffer(const Expr* expr) const;

  //! Render a human-readable summary of the profile results stored in
  //! buffer, the profile tensor read back from the device.
  std::string toString(const at::Tensor& buffer) const;

 private:
  //! Get the new profile index
  int getNewIndex();

  //! Get the profile index of expr, or nullopt if not profiled
  c10::optional<int> getIndex(const Expr* expr) const;

 private:
  //! Number of profile entries registered so far
  int num_profile_entries_ = 0;

  //! Backing buffer of Nx2 integer tensor, where N is the number of profiled
  //! regions. Each region has two integer values, one representing
  //! the cycles spent, and another the count.
  TensorView* buffer_ = nullptr;

  //! Map profiled expressions to profile entry offsets
  std::unordered_map<const Expr*, int> expr_entry_map_;

  // TODO: Allow profiling of ForLoops
  //! Map profiled ForLoop to profile entry offsets
  // std::unordered_map<const kir::ForLoop*, int> loop_entry_map_;
};

class KernelInternalProxy;

//! Container for a lowered Kernel IR
Expand Down Expand Up @@ -151,6 +202,10 @@ class TORCH_CUDA_CU_API Kernel final : public Fusion {
return warp_padded_parallel_info_;
}

  //! Performance profile of this kernel, copied from GpuLower at
  //! finalization
  const KernelPerformanceProfile& profile() const {
    return profile_;
  }

//! Debug dump of the Kernel IR
void print() const;

Expand Down Expand Up @@ -178,6 +233,8 @@ class TORCH_CUDA_CU_API Kernel final : public Fusion {
DataType index_type_ = DataType::Int;

WarpPaddedParallelInfo warp_padded_parallel_info_;

KernelPerformanceProfile profile_;
};

//! A special debugging proxy for Kernel.
Expand Down
5 changes: 4 additions & 1 deletion torch/csrc/jit/codegen/cuda/lower2device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.h>
#include <torch/csrc/jit/codegen/cuda/lower_index.h>
#include <torch/csrc/jit/codegen/cuda/lower_insert_syncs.h>
#include <torch/csrc/jit/codegen/cuda/lower_instrument.h>
#include <torch/csrc/jit/codegen/cuda/lower_loops.h>
#include <torch/csrc/jit/codegen/cuda/lower_magic_zero.h>
#include <torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.h>
Expand Down Expand Up @@ -349,10 +350,12 @@ void GpuLower::lower(Fusion* fusion, DataType index_type) {
const auto exprs_cleaned_up_loops =
KIRCleaner::cleanUp(exprs_register_adjusted);

const auto exprs_instrumented = instrumentKernel(exprs_cleaned_up_loops);

// We now have the lowered expressions, finalize the kernel IR. This function
// will also copy over some relevant information for code generation from
// GpuLower.
kernel_->finalize(exprs_cleaned_up_loops);
kernel_->finalize(exprs_instrumented);
}

kir::Kernel* GpuLower::kernel() const {
Expand Down
5 changes: 5 additions & 0 deletions torch/csrc/jit/codegen/cuda/lower2device.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,10 @@ class TORCH_CUDA_CU_API GpuLower : public NonCopyable {
return sync_map_;
}

  //! Profiling information collected during lowering; copied into the
  //! kir::Kernel when it is finalized
  kir::KernelPerformanceProfile& profile() {
    return profile_;
  }

// This is an interface to propagate information after expression
// replacement on the kernel IR. E.g.:
// for ...
Expand Down Expand Up @@ -209,6 +213,7 @@ class TORCH_CUDA_CU_API GpuLower : public NonCopyable {
CommonIndexMap common_index_map_;
FusedReductionInfo fused_reduction_info_;
SyncMap sync_map_;
kir::KernelPerformanceProfile profile_;

// Track which tensor views are inputs or outputs of a vectorized operation
// and their maximum vectorized access size
Expand Down
Loading

0 comments on commit 8fbd0b1

Please sign in to comment.