Adding dry-run mode to skip arch-dependent checks (#1702)
shmsong committed May 23, 2022
1 parent 151d95b commit 77c1b4f
Showing 5 changed files with 41 additions and 19 deletions.
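The dry-run switch is exposed as a new disable option named arch_check (see utils.cpp and utils.h below); per the header comment in utils.h, disable options are read from the PYTORCH_NVFUSER_DISABLE environment variable. A minimal sketch of turning it on from C++, assuming the variable has to be set before NVFuser parses its options for the first time:

    // Hedged sketch: enable dry-run mode so arch-dependent checks are skipped
    // and compilation is expected to stop before returning a compiled kernel
    // (see the executor_utils.cpp hunks below). setenv() is POSIX.
    #include <cstdlib>

    int main() {
      setenv("PYTORCH_NVFUSER_DISABLE", "arch_check", /*overwrite=*/1);
      // ... build and lower fusions as usual; the hardware-specific validation
      // in lower_validation.cpp is skipped.
      return 0;
    }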
9 changes: 9 additions & 0 deletions torch/csrc/jit/codegen/cuda/executor_utils.cpp
@@ -888,6 +888,11 @@ std::pair<NvrtcFunction, std::string> nvrtcCompile(
int id,
c10::optional<int> opt_block_size) {
FUSER_PERF_SCOPE("executor_utils::NVRTC");
+ if (isDisabled(DisableOption::ArchCheck)) {
+ TORCH_WARN(
+ "NVFuser Compile: arch check disabled, should not compile any kernel");
+ }
+
initializeCudaContext();

std::stringstream ptxas_log;
@@ -1213,6 +1218,10 @@ std::pair<NvrtcFunction, std::string> nvrtcCompile(
compiled_kernel_.module,
lowered_kernel_name));

+ TORCH_CHECK(
+ !isDisabled(DisableOption::ArchCheck),
+ "NVFuser Compile: arch check disabled, should not return any compiled kernel");
+
return {compiled_kernel_, ptxas_log.str()};
}
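Taken together, the two hunks above bracket nvrtcCompile: a warning on entry and a hard check right before a compiled kernel would be returned. A minimal test-style fragment of the resulting contract, not a complete test: fusion, t0 and t1 stand for a fusion and inputs built as in the tensor-core tests further below, and it assumes arch_check has been disabled in the environment:

    // Hedged sketch: in dry-run mode the TORCH_CHECK above is expected to
    // fire, so compileFusion() throws instead of handing back device code.
    FusionExecutor fe;
    ASSERT_ANY_THROW(fe.compileFusion(&fusion, {t0, t1}));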

5 changes: 5 additions & 0 deletions torch/csrc/jit/codegen/cuda/lower_validation.cpp
@@ -868,6 +868,11 @@ namespace {
//! Utility to make sure targeted gpu capability is
//! higher than provided major.minor.
void validateMinimumArch(int major, int minor) {
+ // Skip checking arch if disabled.
+ if (isDisabled(DisableOption::ArchCheck)) {
+ return;
+ }
+
auto prop = at::cuda::getCurrentDeviceProperties();
TORCH_INTERNAL_ASSERT(prop->major >= major);
if (prop->major == major) {
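The hunk is truncated at the minor-version branch. A hedged sketch of how the whole function reads with the early return, assuming the elided lines assert the minor capability the same way the major capability is asserted above:

    void validateMinimumArch(int major, int minor) {
      // Skip checking arch if disabled (dry-run mode).
      if (isDisabled(DisableOption::ArchCheck)) {
        return;
      }
      auto prop = at::cuda::getCurrentDeviceProperties();
      TORCH_INTERNAL_ASSERT(prop->major >= major);
      if (prop->major == major) {
        // Assumed: the truncated remainder checks the minor version.
        TORCH_INTERNAL_ASSERT(prop->minor >= minor);
      }
    }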
38 changes: 21 additions & 17 deletions torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp
@@ -108,12 +108,19 @@ bool cudaArchGuardShouldSkip(int required_major, int required_minor) {
<< REQUIRED_MINOR << " to run.\n"; \
}

+ #define NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK( \
+ REQUIRED_MAJOR, REQUIRED_MINOR, COMPILE_FUSION) \
+ if (cudaArchGuardShouldSkip(REQUIRED_MAJOR, REQUIRED_MINOR)) { \
+ ASSERT_ANY_THROW(COMPILE_FUSION); \
+ return; \
+ } else { \
+ COMPILE_FUSION; \
+ }
+
} // namespace

// MMA unit test for a single instruction tile. VoltaTT
TEST_F(NVFuserTest, FusionVoltaMMATT_CUDA) {
- NVFUSER_TEST_CUDA_ARCH_GUARD(7, 0);
-
Fusion fusion;
FusionGuard fg(&fusion);

@@ -195,7 +202,8 @@ TEST_F(NVFuserTest, FusionVoltaMMATT_CUDA) {
auto t1 = at::randn({4, 16}, options);

FusionExecutor fe;
- fe.compileFusion(&fusion, {t0, t1});
+ NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
+ 7, 0, fe.compileFusion(&fusion, {t0, t1}));
auto cg_outputs = fe.runFusion({t0, t1});

auto tref = t0.to(at::kFloat).matmul(t1.to(at::kFloat));
@@ -205,8 +213,6 @@ TEST_F(NVFuserTest, FusionVoltaMMATT_CUDA) {

// MMA unit test for a single instruction tile. VoltaTN
TEST_F(NVFuserTest, FusionVoltaMMATN_CUDA) {
- NVFUSER_TEST_CUDA_ARCH_GUARD(7, 0);
-
Fusion fusion;
FusionGuard fg(&fusion);

@@ -261,15 +267,15 @@ TEST_F(NVFuserTest, FusionVoltaMMATN_CUDA) {
auto t1 = at::randn({16, 4}, options);

FusionExecutor fe;
- fe.compileFusion(&fusion, {t0, t1});
+ NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
+ 7, 0, fe.compileFusion(&fusion, {t0, t1}));
auto cg_outputs = fe.runFusion({t0, t1});
auto tref = t0.to(at::kFloat).matmul(t1.t().to(at::kFloat));
testValidate(&fusion, cg_outputs, {t0, t1}, {tref}, __LINE__, __FILE__);
}

// MMA unit test for a single instruction tile. VoltaNT
TEST_F(NVFuserTest, FusionVoltaMMANT_CUDA) {
- NVFUSER_TEST_CUDA_ARCH_GUARD(7, 0);
Fusion fusion;
FusionGuard fg(&fusion);

@@ -328,7 +334,8 @@ TEST_F(NVFuserTest, FusionVoltaMMANT_CUDA) {
auto t1 = at::randn({4, 16}, options);

FusionExecutor fe;
- fe.compileFusion(&fusion, {t0, t1});
+ NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
+ 7, 0, fe.compileFusion(&fusion, {t0, t1}));
auto cg_outputs = fe.runFusion({t0, t1});
auto tref = t0.t().to(at::kFloat).matmul(t1.to(at::kFloat));
testValidate(&fusion, cg_outputs, {t0, t1}, {tref}, __LINE__, __FILE__);
@@ -338,8 +345,6 @@ TEST_F(NVFuserTest, FusionVoltaMMANT_CUDA) {
// This is the only example that is fully manual,
// the rest of them are facilitated by gemm utils.
TEST_F(NVFuserTest, FusionVoltaMatMulTT_CUDA) {
- NVFUSER_TEST_CUDA_ARCH_GUARD(7, 0);
-
Fusion fusion;
FusionGuard fg(&fusion);

@@ -579,7 +584,8 @@ TEST_F(NVFuserTest, FusionVoltaMatMulTT_CUDA) {
auto t1 = at::randn({K, N}, options);

FusionExecutor fe;
- fe.compileFusion(&fusion, {t0, t1});
+ NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
+ 7, 0, fe.compileFusion(&fusion, {t0, t1}));
auto cg_outputs = fe.runFusion({t0, t1});
auto tref = t0.to(at::kFloat).matmul(t1.to(at::kFloat));

@@ -588,8 +594,6 @@ TEST_F(NVFuserTest, FusionVoltaMatMulTT_CUDA) {

// Gemm test for Volta MMA: TN
TEST_F(NVFuserTest, FusionVoltaMatMulTN_CUDA) {
- NVFUSER_TEST_CUDA_ARCH_GUARD(7, 0);
-
Fusion fusion;
FusionGuard fg(&fusion);
int M = 120, N = 264, K = 56;
@@ -729,16 +733,15 @@ TEST_F(NVFuserTest, FusionVoltaMatMulTN_CUDA) {
auto t1 = at::randn({N, K}, options);

FusionExecutor fe;
- fe.compileFusion(&fusion, {t0, t1});
+ NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
+ 7, 0, fe.compileFusion(&fusion, {t0, t1}));
auto cg_outputs = fe.runFusion({t0, t1});
auto tref = t0.to(at::kFloat).matmul(t1.to(at::kFloat).t());
TORCH_CHECK(cg_outputs[0].allclose(tref, 0.0001, 0.0001));
}

// Gemm test for Volta MMA: NT
TEST_F(NVFuserTest, FusionVoltaMatMulNT_CUDA) {
- NVFUSER_TEST_CUDA_ARCH_GUARD(7, 0);
-
Fusion fusion;
FusionGuard fg(&fusion);
int M = 240, N = 320, K = 136;
@@ -883,7 +886,8 @@ TEST_F(NVFuserTest, FusionVoltaMatMulNT_CUDA) {
auto t1 = at::randn({K, N}, options);

FusionExecutor fe;
- fe.compileFusion(&fusion, {t0, t1});
+ NVFUSER_TEST_CUDA_ARCH_COMPILE_CHECK(
+ 7, 0, fe.compileFusion(&fusion, {t0, t1}));
auto cg_outputs = fe.runFusion({t0, t1});
auto tref = t0.to(at::kFloat).t().matmul(t1.to(at::kFloat));

7 changes: 5 additions & 2 deletions torch/csrc/jit/codegen/cuda/utils.cpp
@@ -105,6 +105,7 @@ auto parseDebugDumpOptions() {

auto parseDisableOptions() {
std::unordered_map<DisableOption, bool> options_map = {
+ {DisableOption::ArchCheck, false},
{DisableOption::Fallback, false},
{DisableOption::Fma, false},
{DisableOption::IndexHoist, false},
Expand All @@ -117,7 +118,9 @@ auto parseDisableOptions() {
while (!options_view.empty()) {
const auto end_pos = options_view.find_first_of(',');
const auto token = options_view.substr(0, end_pos);
- if (token == "fallback") {
+ if (token == "arch_check") {
+ options_map[DisableOption::ArchCheck] = true;
+ } else if (token == "fallback") {
options_map[DisableOption::Fallback] = true;
} else if (token == "fma") {
options_map[DisableOption::Fma] = true;
Expand All @@ -135,7 +138,7 @@ auto parseDisableOptions() {
"Invalid disable option: '",
token,
"'\nAvailable options:\n",
"\tfallback, fma, index_hoist, nvtx, predicate_elimination\n",
"\tarch_check, fallback, fma, index_hoist, nvtx, predicate_elimination\n",
"unroll_with_rng");
}
options_view = (end_pos != c10::string_view::npos)
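Because the loop above splits PYTORCH_NVFUSER_DISABLE on commas, several options can be combined in one value. A small sketch using the same setenv() call as in the example near the top; arch_check and fma are both tokens accepted by this parser:

    // Hedged sketch: disable the arch check and FMA instructions together.
    setenv("PYTORCH_NVFUSER_DISABLE", "arch_check,fma", /*overwrite=*/1);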
1 change: 1 addition & 0 deletions torch/csrc/jit/codegen/cuda/utils.h
@@ -53,6 +53,7 @@ TORCH_CUDA_CU_API bool isDebugDumpEnabled(DebugDumpOption option);
//! These can be set through the `PYTORCH_NVFUSER_DISABLE` environment variable
//!
enum class DisableOption {
+ ArchCheck, //! Disable hardware-specific checks to enable cross arch debug
Fallback, //! Disable fallback
Fma, //! Disable FMA instructions
IndexHoist, //! Disable index hoisting
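Downstream code consults the new enum value through isDisabled(), mirroring the call sites added in executor_utils.cpp and lower_validation.cpp above. A minimal sketch of the query pattern inside such a validation routine:

    // Hedged sketch: guard hardware-specific validation behind the option.
    if (isDisabled(DisableOption::ArchCheck)) {
      // Dry-run mode: skip checks that depend on the current GPU architecture.
      return;
    }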
