Skip to content

Commit

Permalink
Fine-grained kernel profiling (#1720)
Browse files Browse the repository at this point in the history
* Fine-grained kernel profiling

With `PYTORCH_NVFUSER_ENABLE=kernel_profile`, elapsed times of grid reductions are printed at the end of each kernel call.
  • Loading branch information
naoyam committed May 23, 2022
1 parent 77c1b4f commit 8fbd0b1
Show file tree
Hide file tree
Showing 18 changed files with 565 additions and 4 deletions.
1 change: 1 addition & 0 deletions tools/build_variables.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -695,6 +695,7 @@ libtorch_cuda_core_sources = [
"torch/csrc/jit/codegen/cuda/lower_index.cpp",
"torch/csrc/jit/codegen/cuda/lower_index_hoist.cpp",
"torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp",
"torch/csrc/jit/codegen/cuda/lower_instrument.cpp",
"torch/csrc/jit/codegen/cuda/lower_loops.cpp",
"torch/csrc/jit/codegen/cuda/lower_magic_zero.cpp",
"torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.cpp",
Expand Down
21 changes: 21 additions & 0 deletions torch/csrc/jit/codegen/cuda/codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1288,6 +1288,19 @@ class CudaKernelGenerator : private OptOutConstDispatch {
return flags.str();
}

void addProfileArguments(ArgumentBuilder& func_args, const Expr* expr) {
if (isEnabled(EnableOption::KernelProfile) &&
kernel_->profile().isProfiled(expr)) {
const auto& buffer_indices =
kernel_->profile().getIndicesInProfileBuffer(expr);
auto buffer = kernel_->profile().getBuffer();
TORCH_INTERNAL_ASSERT(buffer != nullptr);
for (const auto& index : buffer_indices) {
func_args.arg(varName(buffer)).append("[").append(index).append("]");
}
}
}

void handle(const kir::GridReduction* grop) final {
TORCH_INTERNAL_ASSERT(grop->out()->isA<kir::TensorIndex>());

Expand Down Expand Up @@ -1345,6 +1358,8 @@ class CudaKernelGenerator : private OptOutConstDispatch {
func_args.arg(genInline(grop->entrance_index()));
func_args.arg(genInline(grop->entrances()));

addProfileArguments(func_args, grop);

indent() << "reduction::gridReduce<" << template_args << ">(\n";
indent() << kTab << func_args << ");\n";
}
Expand Down Expand Up @@ -1412,6 +1427,8 @@ class CudaKernelGenerator : private OptOutConstDispatch {
// reduction_op
func_args.arg(genReductionOp(op_type, out->dtype()));

addProfileArguments(func_args, grop);

indent() << kTab << func_args << ");\n";
}

Expand Down Expand Up @@ -1483,6 +1500,8 @@ class CudaKernelGenerator : private OptOutConstDispatch {
func_args.arg(read_pred);
}

addProfileArguments(func_args, grouped_grop);

indent() << "reduction::gridReduceGroup<" << template_args << ">(\n";
indent() << kTab << func_args << ");\n";
}
Expand Down Expand Up @@ -1543,6 +1562,8 @@ class CudaKernelGenerator : private OptOutConstDispatch {
func_args.arg(read_pred);
}

addProfileArguments(func_args, grouped_grop);

indent() << genFusedReductionName(ir_utils::getTvOutput(grouped_grop))
<< ".reduceGroup(\n";
indent() << kTab << func_args << ");\n";
Expand Down
9 changes: 9 additions & 0 deletions torch/csrc/jit/codegen/cuda/executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,11 @@ FusionExecutor::GlobalBuffers FusionExecutor::allocGlobalVals(
inferAndAlloc(tv, alloc->shape(), expr_eval, options_, false));
global_buffers.zero_init.push_back(false);
}
// Remember the tensor buffer used for storing kernel profile
if (isEnabled(EnableOption::KernelProfile) &&
tv == kernel->profile().getBuffer()) {
global_buffers.profile_buffer = global_buffers.buffers.back();
}
}

return global_buffers;
Expand Down Expand Up @@ -1035,6 +1040,10 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
}
}

if (isEnabled(EnableOption::KernelProfile)) {
std::cout << kernel()->profile().toString(global_buffers.profile_buffer);
}

return allocated_outputs;
}

Expand Down
1 change: 1 addition & 0 deletions torch/csrc/jit/codegen/cuda/executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable {
  struct GlobalBuffers {
    // Intermediate global-memory tensors allocated for the kernel
    std::vector<at::Tensor> buffers;
    // Parallel to buffers: whether each entry was zero-initialized
    std::vector<bool> zero_init;
    // The buffer (also present in buffers) that stores kernel profiling
    // results; only defined when EnableOption::KernelProfile is enabled
    at::Tensor profile_buffer;
  };

static std::string kernelNamespace() {
Expand Down
4 changes: 4 additions & 0 deletions torch/csrc/jit/codegen/cuda/executor_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -964,6 +964,10 @@ std::pair<NvrtcFunction, std::string> nvrtcCompile(
args.push_back("-DNDEBUG");
#endif

if (isEnabled(EnableOption::KernelProfile)) {
args.push_back("-DPYTORCH_NVFUSER_PROFILE_KERNEL");
}

const char* ptxas_opt_level = getenv("PYTORCH_NVFUSER_JIT_OPT_LEVEL");
std::string jit_opt_level = "-O";

Expand Down
67 changes: 67 additions & 0 deletions torch/csrc/jit/codegen/cuda/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#include <torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>


Expand Down Expand Up @@ -277,6 +279,7 @@ void Kernel::finalize(std::vector<Expr*> top_level_exprs) {
TORCH_INTERNAL_ASSERT(top_level_exprs_.empty());
top_level_exprs_ = std::move(top_level_exprs);
warp_padded_parallel_info_ = GpuLower::current()->getWarpPaddedParallelInfo();
profile_ = GpuLower::current()->profile();
ValidateAllocation::validate(this);
analyze();
// Make sure this is after analyze as it sets summary_
Expand Down Expand Up @@ -358,6 +361,70 @@ std::vector<Expr*>& KernelInternalProxy::topLevelExprs() {
return kernel_->top_level_exprs_;
}

void KernelPerformanceProfile::registerExpr(const Expr* expr) {
if (expr_entry_map_.find(expr) != expr_entry_map_.end()) {
return;
}

auto slot = getNewIndex();
expr_entry_map_.emplace(expr, slot);
}

// Hand out profile-entry indices sequentially, starting from zero.
int KernelPerformanceProfile::getNewIndex() {
  const int new_index = num_profile_entries_;
  num_profile_entries_ = new_index + 1;
  return new_index;
}

// An expression is profiled iff it has been assigned a profile entry.
bool KernelPerformanceProfile::isProfiled(const Expr* expr) const {
  return expr_entry_map_.count(expr) != 0;
}

// Look up the profile entry index of expr; nullopt if not registered.
c10::optional<int> KernelPerformanceProfile::getIndex(const Expr* expr) const {
  const auto it = expr_entry_map_.find(expr);
  if (it != expr_entry_map_.end()) {
    return it->second;
  }
  return c10::nullopt;
}

// Map a profiled expression to its two consecutive slots in the backing
// buffer: {cycle_index, count_index}. It is an error to call this with an
// unregistered expression.
std::array<int, 2> KernelPerformanceProfile::getIndicesInProfileBuffer(
    const Expr* expr) const {
  TORCH_INTERNAL_ASSERT(
      isProfiled(expr), "Not a profiled expression: ", expr->toString());

  const int cycle_index = getIndex(expr).value() * 2;
  return {cycle_index, cycle_index + 1};
}

//! Render a human-readable summary of the profile results in buffer (the
//! profile tensor read back from the device). Each line reports the
//! expression type, its output tensor, average microseconds per call, and
//! the call count.
std::string KernelPerformanceProfile::toString(const at::Tensor& buffer) const {
  std::stringstream ss;
  ss << "Kernel performance profile:\n";
  if (!buffer.defined()) {
    ss << "No profile found\n";
    return ss.str();
  }

  // clockRate is in kHz, so (cycles / clockRate) * 1000 gives microseconds
  double kilo_freq = at::cuda::getCurrentDeviceProperties()->clockRate;

  ss << std::setprecision(3) << std::fixed;

  // Report entries in profile-index (i.e., registration) order. Iterating
  // expr_entry_map_ directly would make the report's line order
  // nondeterministic, as unordered_map iteration order is unspecified.
  std::vector<std::pair<const Expr*, int>> entries(
      expr_entry_map_.begin(), expr_entry_map_.end());
  std::sort(
      entries.begin(),
      entries.end(),
      [](const std::pair<const Expr*, int>& lhs,
         const std::pair<const Expr*, int>& rhs) {
        return lhs.second < rhs.second;
      });

  for (const auto& kv : entries) {
    auto expr = kv.first;
    auto index = kv.second;
    auto out_tv = ir_utils::getTvOutput(expr);
    // buffer[index] holds {accumulated cycles, call count}
    double cycles = static_cast<double>(buffer[index][0].item<int64_t>());
    auto count = buffer[index][1].item<int64_t>();
    // Guard against division by zero when an entry was never executed
    auto cycles_per_call = count == 0 ? 0.0 : cycles / count;
    auto us_per_call = cycles_per_call / kilo_freq * 1000.0;
    ss << expr->getExprType().value() << ", T" << out_tv->name() << ", "
       << us_per_call << " us, " << count << "\n";
  }

  return ss.str();
}

} // namespace kir
} // namespace cuda
} // namespace fuser
Expand Down
57 changes: 57 additions & 0 deletions torch/csrc/jit/codegen/cuda/kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,57 @@ struct KernelSummary {
std::vector<VectorizedSetInfo> vectorized_set_info;
};

//! Bookkeeping for fine-grained kernel profiling: maps profiled expressions
//! to slots in a device-side integer buffer where cycle and call counts are
//! recorded, and renders the results read back from that buffer.
class TORCH_CUDA_CU_API KernelPerformanceProfile {
 public:
  //! Register an expression to profile. Registering the same expression
  //! more than once is a no-op.
  void registerExpr(const Expr* expr);

  //! Query if an expression is profiled
  bool isProfiled(const Expr* expr) const;

  //! Get the number of profiled expressions
  int getNumberOfProfileEntries() const {
    return num_profile_entries_;
  }

  //! Set the backing buffer of profile.
  void setBuffer(TensorView* buffer) {
    buffer_ = buffer;
  }

  //! Get the backing buffer (nullptr if not set)
  TensorView* getBuffer() const {
    return buffer_;
  }

  //! Get the indices of the profile of an expression in the backing buffer.
  //! Returns {cycle_index, count_index}; asserts that expr is profiled.
  std::array<int, 2> getIndicesInProfileBuffer(const Expr* expr) const;

  //! Render a human-readable summary of the profile results stored in
  //! buffer, the profile tensor read back from the device.
  std::string toString(const at::Tensor& buffer) const;

 private:
  //! Get the new profile index
  int getNewIndex();

  //! Get the profile index of expr, or nullopt if not profiled
  c10::optional<int> getIndex(const Expr* expr) const;

 private:
  //! Number of profile entries registered so far
  int num_profile_entries_ = 0;

  //! Backing buffer of Nx2 integer tensor, where N is the number of profiled
  //! regions. Each region has two integer values, one representing
  //! the cycles spent, and another the count.
  TensorView* buffer_ = nullptr;

  //! Map profiled expressions to profile entry offsets
  std::unordered_map<const Expr*, int> expr_entry_map_;

  // TODO: Allow profiling of ForLoops
  //! Map profiled ForLoop to profile entry offsets
  // std::unordered_map<const kir::ForLoop*, int> loop_entry_map_;
};

class KernelInternalProxy;

//! Container for a lowered Kernel IR
Expand Down Expand Up @@ -151,6 +202,10 @@ class TORCH_CUDA_CU_API Kernel final : public Fusion {
return warp_padded_parallel_info_;
}

  //! Performance profile of this kernel, copied from GpuLower at
  //! finalization
  const KernelPerformanceProfile& profile() const {
    return profile_;
  }

//! Debug dump of the Kernel IR
void print() const;

Expand Down Expand Up @@ -178,6 +233,8 @@ class TORCH_CUDA_CU_API Kernel final : public Fusion {
DataType index_type_ = DataType::Int;

WarpPaddedParallelInfo warp_padded_parallel_info_;

KernelPerformanceProfile profile_;
};

//! A special debugging proxy for Kernel.
Expand Down
5 changes: 4 additions & 1 deletion torch/csrc/jit/codegen/cuda/lower2device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.h>
#include <torch/csrc/jit/codegen/cuda/lower_index.h>
#include <torch/csrc/jit/codegen/cuda/lower_insert_syncs.h>
#include <torch/csrc/jit/codegen/cuda/lower_instrument.h>
#include <torch/csrc/jit/codegen/cuda/lower_loops.h>
#include <torch/csrc/jit/codegen/cuda/lower_magic_zero.h>
#include <torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.h>
Expand Down Expand Up @@ -349,10 +350,12 @@ void GpuLower::lower(Fusion* fusion, DataType index_type) {
const auto exprs_cleaned_up_loops =
KIRCleaner::cleanUp(exprs_register_adjusted);

const auto exprs_instrumented = instrumentKernel(exprs_cleaned_up_loops);

// We now have the lowered expressions, finalize the kernel IR. This function
// will also copy over some relevant information for code generation from
// GpuLower.
kernel_->finalize(exprs_cleaned_up_loops);
kernel_->finalize(exprs_instrumented);
}

kir::Kernel* GpuLower::kernel() const {
Expand Down
5 changes: 5 additions & 0 deletions torch/csrc/jit/codegen/cuda/lower2device.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,10 @@ class TORCH_CUDA_CU_API GpuLower : public NonCopyable {
return sync_map_;
}

  //! Profiling information collected during lowering; copied into the
  //! kir::Kernel when it is finalized
  kir::KernelPerformanceProfile& profile() {
    return profile_;
  }

// This is an interface to propagate information after expression
// replacement on the kernel IR. E.g.:
// for ...
Expand Down Expand Up @@ -209,6 +213,7 @@ class TORCH_CUDA_CU_API GpuLower : public NonCopyable {
CommonIndexMap common_index_map_;
FusedReductionInfo fused_reduction_info_;
SyncMap sync_map_;
kir::KernelPerformanceProfile profile_;

// Track which tensor views are inputs or outputs of a vectorized operation
// and their maximum vectorized access size
Expand Down
Loading

0 comments on commit 8fbd0b1

Please sign in to comment.