Adding dry-run mode to skip arch-dependent checks (#1702)
shmsong committed May 23, 2022
1 parent 151d95b commit 77c1b4f
Showing 5 changed files with 41 additions and 19 deletions.
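The dry-run switch is exposed as a new disable option named arch_check (see utils.cpp and utils.h below); per the header comment in utils.h, disable options are read from the PYTORCH_NVFUSER_DISABLE environment variable. A minimal sketch of turning it on from C++, assuming the variable has to be set before NVFuser parses its options for the first time:

    // Hedged sketch: enable dry-run mode so arch-dependent checks are skipped
    // and compilation is expected to stop before returning a compiled kernel
    // (see the executor_utils.cpp hunks below). setenv() is POSIX.
    #include <cstdlib>

    int main() {
      setenv("PYTORCH_NVFUSER_DISABLE", "arch_check", /*overwrite=*/1);
      // ... build and lower fusions as usual; the hardware-specific validation
      // in lower_validation.cpp is skipped.
      return 0;
    }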
9 changes: 9 additions & 0 deletions torch/csrc/jit/codegen/cuda/executor_utils.cpp
@@ -888,6 +888,11 @@ std::pair<NvrtcFunction, std::string> nvrtcCompile(
int id,
c10::optional<int> opt_block_size) {
FUSER_PERF_SCOPE("executor_utils::NVRTC");
+ if (isDisabled(DisableOption::ArchCheck)) {
+ TORCH_WARN(
+ "NVFuser Compile: arch check disabled, should not compile any kernel");
+ }
+
initializeCudaContext();

std::stringstream ptxas_log;
@@ -1213,6 +1218,10 @@ std::pair<NvrtcFunction, std::string> nvrtcCompile(
compiled_kernel_.module,
lowered_kernel_name));

+ TORCH_CHECK(
+ !isDisabled(DisableOption::ArchCheck),
+ "NVFuser Compile: arch check disabled, should not return any compiled kernel");
+
return {compiled_kernel_, ptxas_log.str()};
}
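Taken together, the two hunks above bracket nvrtcCompile: a warning on entry and a hard check right before a compiled kernel would be returned. A minimal test-style fragment of the resulting contract, not a complete test: fusion, t0 and t1 stand for a fusion and inputs built as in the tensor-core tests further below, and it assumes arch_check has been disabled in the environment:

    // Hedged sketch: in dry-run mode the TORCH_CHECK above is expected to
    // fire, so compileFusion() throws instead of handing back device code.
    FusionExecutor fe;
    ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0, t1}));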

5 changes: 5 additions & 0 deletions torch/csrc/jit/codegen/cuda/lower_validation.cpp
@@ -868,6 +868,11 @@ namespace {
//! Utility to make sure targeted gpu capability is
//! higher than provided major.minor.
void validateMinimumArch(int major, int minor) {
+ // Skip checking arch if disabled.
+ if (isDisabled(DisableOption::ArchCheck)) {
+ return;
+ }
+
auto prop = at::cuda::getCurrentDeviceProperties();
TORCH_INTERNAL_ASSERT(prop->major >= major);
if (prop->major == major) {
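The hunk is truncated at the minor-version branch. A hedged sketch of how the whole function reads with the early return, assuming the elided lines assert the minor capability the same way the major capability is asserted above:

    void validateMinimumArch(int major, int minor) {
      // Skip checking arch if disabled (dry-run mode).
      if (isDisabled(DisableOption::ArchCheck)) {
        return;
      }
      auto prop = at::cuda::getCurrentDeviceProperties();
      TORCH_INTERNAL_ASSERT(prop->major >= major);
      if (prop->major == major) {
        // Assumed: the truncated remainder checks the minor version.
        TORCH_INTERNAL_ASSERT(prop->minor >= minor);
      }
    }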
38 changes: 21 additions & 17 deletions torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp
@@ -108,12 +108,19 @@ bool cudaArchGuardShouldSkip(int required_major, int required_minor) {
<< REQUIRED_MINOR << " to run.\n"; \
}

+ #define NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( \
+ REQUIRED_MAJOR, REQUIRED_MINOR, COMPILE_FUSION) \
+ if (cudaArchGuardShouldSkip(REQUIRED_MAJOR, REQUIRED_MINOR)) { \
+ ASSERT_ANY_THROW(COMPILE_FUSION); \
+ return; \
+ } else { \
+ COMPILE_FUSION; \
+ }
+
} // namespace

// MMA unit test for a single instruction tile. VoltaTT
TEST_F(NVFuserTest, FusionVoltaMMATT_CUDA) {
- NVFUSER_TEST_CUDA_ARCH_GUARD(7, 0);
-
Fusion fusion;
FusionGuard fg(&fusion);

@@ -195,7 +202,8 @@ TEST_F(NVFuserTest, FusionVoltaMMATT_CUDA) {
auto t1 = at::randn({4, 16}, options);

FusionExecutor fe;
- fe.compileFusion(&fusion, {t0, t1});
+ NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
+ 7, 0, fe.compileFusion(&fusion, {t0, t1}));
auto cg_outputs = fe.runFusion({t0, t1});

auto tref = t0.to(at::kFloat).matmul(t1.to(at::kFloat));
@@ -205,8 +213,6 @@ TEST_F(NVFuserTest, FusionVoltaMMATT_CUDA) {

// MMA unit test for a single instruction tile. VoltaTN
TEST_F(NVFuserTest, FusionVoltaMMATN_CUDA) {
- NVFUSER_TEST_CUDA_ARCH_GUARD(7, 0);
-
Fusion fusion;
FusionGuard fg(&fusion);

@@ -261,15 +267,15 @@ TEST_F(NVFuserTest, FusionVoltaMMATN_CUDA) {
auto t1 = at::randn({16, 4}, options);

FusionExecutor fe;
- fe.compileFusion(&fusion, {t0, t1});
+ NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
+ 7, 0, fe.compileFusion(&fusion, {t0, t1}));
auto cg_outputs = fe.runFusion({t0, t1});
auto tref = t0.to(at::kFloat).matmul(t1.t().to(at::kFloat));
testValidate(&fusion, cg_outputs, {t0, t1}, {tref}, __LINE__, __FILE__);
}

// MMA unit test for a single instruction tile. VoltaNT
TEST_F(NVFuserTest, FusionVoltaMMANT_CUDA) {
- NVFUSER_TEST_CUDA_ARCH_GUARD(7, 0);
Fusion fusion;
FusionGuard fg(&fusion);

@@ -328,7 +334,8 @@ TEST_F(NVFuserTest, FusionVoltaMMANT_CUDA) {
auto t1 = at::randn({4, 16}, options);

FusionExecutor fe;
- fe.compileFusion(&fusion, {t0, t1});
+ NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
+ 7, 0, fe.compileFusion(&fusion, {t0, t1}));
auto cg_outputs = fe.runFusion({t0, t1});
auto tref = t0.t().to(at::kFloat).matmul(t1.to(at::kFloat));
testValidate(&fusion, cg_outputs, {t0, t1}, {tref}, __LINE__, __FILE__);
@@ -338,8 +345,6 @@ TEST_F(NVFuserTest, FusionVoltaMMANT_CUDA) {
// This is the only example that is fully manual,
// the rest of them are facilitated by gemm utils.
TEST_F(NVFuserTest, FusionVoltaMatMulTT_CUDA) {
- NVFUSER_TEST_CUDA_ARCH_GUARD(7, 0);
-
Fusion fusion;
FusionGuard fg(&fusion);

@@ -579,7 +584,8 @@ TEST_F(NVFuserTest, FusionVoltaMatMulTT_CUDA) {
auto t1 = at::randn({K, N}, options);

FusionExecutor fe;
- fe.compileFusion(&fusion, {t0, t1});
+ NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
+ 7, 0, fe.compileFusion(&fusion, {t0, t1}));
auto cg_outputs = fe.runFusion({t0, t1});
auto tref = t0.to(at::kFloat).matmul(t1.to(at::kFloat));

@@ -588,8 +594,6 @@ TEST_F(NVFuserTest, FusionVoltaMatMulTT_CUDA) {

// Gemm test for Volta MMA: TN
TEST_F(NVFuserTest, FusionVoltaMatMulTN_CUDA) {
- NVFUSER_TEST_CUDA_ARCH_GUARD(7, 0);
-
Fusion fusion;
FusionGuard fg(&fusion);
int M = 120, N = 264, K = 56;
@@ -729,16 +733,15 @@ TEST_F(NVFuserTest, FusionVoltaMatMulTN_CUDA) {
auto t1 = at::randn({N, K}, options);

FusionExecutor fe;
- fe.compileFusion(&fusion, {t0, t1});
+ NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
+ 7, 0, fe.compileFusion(&fusion, {t0, t1}));
auto cg_outputs = fe.runFusion({t0, t1});
auto tref = t0.to(at::kFloat).matmul(t1.to(at::kFloat).t());
TORCH_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
}

// Gemm test for Volta MMA: NT
TEST_F(NVFuserTest, FusionVoltaMatMulNT_CUDA) {
- NVFUSER_TEST_CUDA_ARCH_GUARD(7, 0);
-
Fusion fusion;
FusionGuard fg(&fusion);
int M = 240, N = 320, K = 136;
@@ -883,7 +886,8 @@ TEST_F(NVFuserTest, FusionVoltaMatMulNT_CUDA) {
auto t1 = at::randn({K, N}, options);

FusionExecutor fe;
- fe.compileFusion(&fusion, {t0, t1});
+ NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
+ 7, 0, fe.compileFusion(&fusion, {t0, t1}));
auto cg_outputs = fe.runFusion({t0, t1});
auto tref = t0.to(at::kFloat).t().matmul(t1.to(at::kFloat));

7 changes: 5 additions & 2 deletions torch/csrc/jit/codegen/cuda/utils.cpp
@@ -105,6 +105,7 @@ auto parseDebugDumpOptions() {

auto parseDisableOptions() {
std::unordered_map<DisableOption, bool> options_map = {
+ {DisableOption::ArchCheck, false},
{DisableOption::Fallback, false},
{DisableOption::Fma, false},
{DisableOption::IndexHoist, false},
Expand All @@ -117,7 +118,9 @@ auto parseDisableOptions() {
while (!options_view.empty()) {
const auto end_pos = options_view.find_first_of(',');
const auto token = options_view.substr(0, end_pos);
- if (token == "fallback") {
+ if (token == "arch_check") {
+ options_map[DisableOption::ArchCheck] = true;
+ } else if (token == "fallback") {
options_map[DisableOption::Fallback] = true;
} else if (token == "fma") {
options_map[DisableOption::Fma] = true;
Expand All @@ -135,7 +138,7 @@ auto parseDisableOptions() {
"Invalid disable option: '",
token,
"'\nAvailable options:\n",
"\tfallback, fma, index_hoist, nvtx, predicate_elimination\n",
"\tarch_check, fallback, fma, index_hoist, nvtx, predicate_elimination\n",
"unroll_with_rng");
}
options_view = (end_pos != c10::string_view::npos)
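Because the loop above splits PYTORCH_NVFUSER_DISABLE on commas, several options can be combined in one value. A small sketch using the same setenv() call as in the example near the top; arch_check and fma are both tokens accepted by this parser:

    // Hedged sketch: disable the arch check and FMA instructions together.
    setenv("PYTORCH_NVFUSER_DISABLE", "arch_check,fma", /*overwrite=*/1);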
1 change: 1 addition & 0 deletions torch/csrc/jit/codegen/cuda/utils.h
@@ -53,6 +53,7 @@ TORCH_CUDA_CU_API bool isDebugDumpEnabled(DebugDumpOption option);
//! These can be set through the `PYTORCH_NVFUSER_DISABLE` environment variable
//!
enum class DisableOption {
+ ArchCheck, //! Disable hardware-specific checks to enable cross arch debug
Fallback, //! Disable fallback
Fma, //! Disable FMA instructions
IndexHoist, //! Disable index hoisting
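Downstream code consults the new enum value through isDisabled(), mirroring the call sites added in executor_utils.cpp and lower_validation.cpp above. A minimal sketch of the query pattern inside such a validation routine:

    // Hedged sketch: guard hardware-specific validation behind the option.
    if (isDisabled(DisableOption::ArchCheck)) {
      // Dry-run mode: skip checks that depend on the current GPU architecture.
      return;
    }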
