From bfa95f90a039a125c55305c7ac49e76620fc9983 Mon Sep 17 00:00:00 2001 From: Supriya Rao Date: Wed, 9 Dec 2020 14:05:11 -0800 Subject: [PATCH] Revert D25325039: Check CUDA kernel launches (/fbcode/caffe2/) Test Plan: revert-hammer Differential Revision: D25325039 (https://github.com/pytorch/pytorch/commit/f5e9ffbc279626ad8cabda49eed91dbe6399d3c4) Original commit changeset: 2043d6e63c7d fbshipit-source-id: 5377dd2aa7c6f58c8641c956b7642c7c559bbc40 --- modules/detectron/group_spatial_softmax_op.cu | 3 --- modules/detectron/ps_roi_pool_op.cu | 2 -- modules/detectron/roi_pool_f_op.cu | 2 -- modules/detectron/select_smooth_l1_loss_op.cu | 2 -- modules/detectron/sigmoid_cross_entropy_loss_op.cu | 5 ----- modules/detectron/sigmoid_focal_loss_op.cu | 2 -- modules/detectron/smooth_l1_loss_op.cu | 3 --- modules/detectron/softmax_focal_loss_op.cu | 5 ----- modules/detectron/spatial_narrow_as_op.cu | 2 -- modules/detectron/upsample_nearest_op.cu | 3 --- test/cpp_extensions/cuda_extension.cu | 1 - test/cpp_extensions/cuda_extension_kernel.cu | 1 - test/cpp_extensions/cuda_extension_kernel2.cu | 1 - torch/lib/c10d/test/CUDATest.cu | 1 - 14 files changed, 33 deletions(-) diff --git a/modules/detectron/group_spatial_softmax_op.cu b/modules/detectron/group_spatial_softmax_op.cu index a37a3fba55a7..92e89ae5acc2 100644 --- a/modules/detectron/group_spatial_softmax_op.cu +++ b/modules/detectron/group_spatial_softmax_op.cu @@ -112,7 +112,6 @@ bool GroupSpatialSoftmaxOp::RunOnDevice() { GroupSpatialSoftmaxKernel<<>>( N, A, W, H, Xdata, Pdata, num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } @@ -159,13 +158,11 @@ bool GroupSpatialSoftmaxGradientOp::RunOnDevice() { SumProbsKernel<<>>( N, A, W, H, Ydata, dYdata, sum_probs_data, num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); // Step 2: dX[i] = dX[i] - s SubSumKernel<<>>( N, A, W, H, sum_probs_.data(), dXdata, num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); // Step 3: dX[i] = Y[i] * dX[i] math::Mul(Y.size(), dXdata, Ydata, dXdata, &context_); diff --git a/modules/detectron/ps_roi_pool_op.cu b/modules/detectron/ps_roi_pool_op.cu index 68e4ec377d62..1ba418be5c99 100644 --- a/modules/detectron/ps_roi_pool_op.cu +++ b/modules/detectron/ps_roi_pool_op.cu @@ -253,7 +253,6 @@ bool PSRoIPoolOp::RunOnDevice() { output_size, X.data(), spatial_scale_, X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_, R.data(), output_dim_, group_size_, Y->mutable_data(), A->mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } @@ -277,7 +276,6 @@ bool PSRoIPoolGradientOp::RunOnDevice() { dY.size(), dY.data(), A.data(), R.dim32(0), spatial_scale_, X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_, output_dim_, dX->mutable_data(), R.data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } diff --git a/modules/detectron/roi_pool_f_op.cu b/modules/detectron/roi_pool_f_op.cu index b261911b95a1..62948f7eacbe 100644 --- a/modules/detectron/roi_pool_f_op.cu +++ b/modules/detectron/roi_pool_f_op.cu @@ -149,7 +149,6 @@ bool RoIPoolFOp::RunOnDevice() { output_size, X.data(), spatial_scale_, X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_, R.data(), Y->mutable_data(), A->mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } @@ -174,7 +173,6 @@ bool RoIPoolFGradientOp::RunOnDevice() { dY.size(), dY.data(), A.data(), R.dim32(0), spatial_scale_, X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_, dX->mutable_data(), R.data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); } return true; } diff --git a/modules/detectron/select_smooth_l1_loss_op.cu b/modules/detectron/select_smooth_l1_loss_op.cu index ce68fcff634d..9065bfc7afbe 100644 --- a/modules/detectron/select_smooth_l1_loss_op.cu +++ b/modules/detectron/select_smooth_l1_loss_op.cu @@ -129,7 +129,6 @@ bool SelectSmoothL1LossOp::RunOnDevice() { M, Y_hat.data(), Y.data(), L.data(), buff_.mutable_data(), S.data(), beta_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); // Sum of all losses // al := sum_i l_i @@ -176,7 +175,6 @@ bool SelectSmoothL1LossGradientOp::RunOnDevice() { D, H, W, M, Y_hat.data(), Y.data(), L.data(), d_Y_hat->mutable_data(), d_avg_loss.data(), scale_, S.data(), beta_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } diff --git a/modules/detectron/sigmoid_cross_entropy_loss_op.cu b/modules/detectron/sigmoid_cross_entropy_loss_op.cu index bb86560fcb01..d69a7b41dc33 100644 --- a/modules/detectron/sigmoid_cross_entropy_loss_op.cu +++ b/modules/detectron/sigmoid_cross_entropy_loss_op.cu @@ -93,8 +93,6 @@ bool SigmoidCrossEntropyLossOp::RunOnDevice() { T.data(), losses_.mutable_data(), counts_.mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - float* avg_loss_data = avg_loss->mutable_data(); math::Sum( losses_.size(), losses_.data(), avg_loss_data, &context_); @@ -108,7 +106,6 @@ bool SigmoidCrossEntropyLossOp::RunOnDevice() { CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(normalizer_.size(), normalizer_data, 1e-5); - C10_CUDA_KERNEL_LAUNCH_CHECK(); math::Div( 1, avg_loss_data, normalizer_data, avg_loss_data, &context_); } @@ -138,7 +135,6 @@ bool SigmoidCrossEntropyLossGradientOp::RunOnDevice() { T.data(), dX->mutable_data(), counts_.mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); if (normalize_) { float* normalizer_data = normalizer_.mutable_data(); math::Sum( @@ -149,7 +145,6 @@ bool SigmoidCrossEntropyLossGradientOp::RunOnDevice() { CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(normalizer_.size(), normalizer_data, 1e-5); - C10_CUDA_KERNEL_LAUNCH_CHECK(); math::Div( 1, d_avg_loss.data(), diff --git a/modules/detectron/sigmoid_focal_loss_op.cu b/modules/detectron/sigmoid_focal_loss_op.cu index e6f2dea21b5d..5b130c8dfc1f 100644 --- a/modules/detectron/sigmoid_focal_loss_op.cu +++ b/modules/detectron/sigmoid_focal_loss_op.cu @@ -134,7 +134,6 @@ bool SigmoidFocalLossOp::RunOnDevice() { N, D, H, W, X.data(), T.data(), wp.data(), gamma_, alpha_, num_classes_, losses_.mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); math::Sum( losses_.size(), losses_.data(), avg_loss_data, &context_); @@ -166,7 +165,6 @@ bool SigmoidFocalLossGradientOp::RunOnDevice() { N, D, H, W, X.data(), T.data(), dX->mutable_data(), wp.data(), gamma_, alpha_, num_classes_, d_avg_loss.data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); math::Scale( dX->size(), scale_, diff --git a/modules/detectron/smooth_l1_loss_op.cu b/modules/detectron/smooth_l1_loss_op.cu index ea835a4bc2b9..1a3e8b78b53f 100644 --- a/modules/detectron/smooth_l1_loss_op.cu +++ b/modules/detectron/smooth_l1_loss_op.cu @@ -102,7 +102,6 @@ bool SmoothL1LossOp::RunOnDevice() { context_.cuda_stream()>>>( buff_.size(), buff_.data(), buff_.mutable_data(), beta_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); // Element-wise weighted smooth l1 loss (can be used to specify a per-element // loss weight) @@ -165,8 +164,6 @@ bool SmoothL1LossGradientOp::RunOnDevice() { context_.cuda_stream()>>>( buff_.size(), buff_.data(), d_Y_hat->mutable_data(), d_avg_loss.data(), scale_ / N, beta_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - // Element-wise scale by alpha_in and alpha_out math::Mul( d_Y_hat->size(), d_Y_hat->data(), alpha_in.data(), diff --git a/modules/detectron/softmax_focal_loss_op.cu b/modules/detectron/softmax_focal_loss_op.cu index b7f8d2423ebc..93635269f176 100644 --- a/modules/detectron/softmax_focal_loss_op.cu +++ b/modules/detectron/softmax_focal_loss_op.cu @@ -176,7 +176,6 @@ bool SoftmaxFocalLossOp::RunOnDevice() { <<>>( N, A, H, W, Xdata, P->mutable_data(), num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); // Compute loss for each x,y location const int* Tdata = T.data(); @@ -185,7 +184,6 @@ bool SoftmaxFocalLossOp::RunOnDevice() { 0, context_.cuda_stream()>>>( N, A, H, W, P->data(), Tdata, losses_.mutable_data(), Wdata, gamma_, alpha_, num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); // sum the losses float* avg_loss_data = avg_loss->mutable_data(); @@ -229,8 +227,6 @@ bool SoftmaxFocalLossGradientOp::RunOnDevice() { 0, context_.cuda_stream()>>>( N, A, H, W, Pdata, Tdata, buff_.mutable_data(), Wdata, gamma_, alpha_, num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - // Compute the gradient with the weights const float* Bdata = buff_.data(); SoftmaxFocalLossGradientKernel @@ -238,7 +234,6 @@ bool SoftmaxFocalLossGradientOp::RunOnDevice() { 0, context_.cuda_stream()>>>( N, D, H, W, Pdata, Tdata, Bdata, d_avg_loss.data(), dX->mutable_data(), num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); math::Scale( dX->size(), scale_, diff --git a/modules/detectron/spatial_narrow_as_op.cu b/modules/detectron/spatial_narrow_as_op.cu index ff8b5632e80a..97ddc492eb07 100644 --- a/modules/detectron/spatial_narrow_as_op.cu +++ b/modules/detectron/spatial_narrow_as_op.cu @@ -115,7 +115,6 @@ bool SpatialNarrowAsOp::DoRunWithType() { out_width, A.template data(), C->template mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } @@ -153,7 +152,6 @@ bool SpatialNarrowAsGradientOp::DoRunWithType() { out_width, dC.template data(), dA->template mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } diff --git a/modules/detectron/upsample_nearest_op.cu b/modules/detectron/upsample_nearest_op.cu index 0ea32e348c0b..38af4254f922 100644 --- a/modules/detectron/upsample_nearest_op.cu +++ b/modules/detectron/upsample_nearest_op.cu @@ -164,8 +164,6 @@ bool UpsampleNearestOp::RunOnDevice() { upscale<<>>( input_data, output_data, no_elements, scale_, d1, d2, d3); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - return true; } @@ -211,7 +209,6 @@ bool UpsampleNearestGradientOp::RunOnDevice() { math::Set(no_elements, 0.f, gradInput_data, &context_); downscale<<>>( gradInput_data, gradOutput_data, no_elements, scale_, d1, d2, d3); - C10_CUDA_KERNEL_LAUNCH_CHECK(); return true; } diff --git a/test/cpp_extensions/cuda_extension.cu b/test/cpp_extensions/cuda_extension.cu index fb3bbd178c07..29511af8a0ed 100644 --- a/test/cpp_extensions/cuda_extension.cu +++ b/test/cpp_extensions/cuda_extension.cu @@ -26,5 +26,4 @@ void sigmoid_add_cuda(const float* x, const float* y, float* output, int size) { const int threads = 1024; const int blocks = (size + threads - 1) / threads; sigmoid_add_kernel<<>>(x, y, output, size); - C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/test/cpp_extensions/cuda_extension_kernel.cu b/test/cpp_extensions/cuda_extension_kernel.cu index c8dce124f9df..660219989863 100644 --- a/test/cpp_extensions/cuda_extension_kernel.cu +++ b/test/cpp_extensions/cuda_extension_kernel.cu @@ -20,5 +20,4 @@ void sigmoid_add_cuda(const float* x, const float* y, float* output, int size) { const int threads = 1024; const int blocks = (size + threads - 1) / threads; sigmoid_add_kernel<<>>(x, y, output, size); - C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/test/cpp_extensions/cuda_extension_kernel2.cu b/test/cpp_extensions/cuda_extension_kernel2.cu index 4cdc25cc0110..817bdf64ac8e 100644 --- a/test/cpp_extensions/cuda_extension_kernel2.cu +++ b/test/cpp_extensions/cuda_extension_kernel2.cu @@ -20,5 +20,4 @@ void tanh_add_cuda(const float* x, const float* y, float* output, int size) { const int threads = 1024; const int blocks = (size + threads - 1) / threads; tanh_add_kernel<<>>(x, y, output, size); - C10_CUDA_KERNEL_LAUNCH_CHECK(); } diff --git a/torch/lib/c10d/test/CUDATest.cu b/torch/lib/c10d/test/CUDATest.cu index 88f87492206c..c47b29ea536d 100644 --- a/torch/lib/c10d/test/CUDATest.cu +++ b/torch/lib/c10d/test/CUDATest.cu @@ -17,7 +17,6 @@ __global__ void waitClocks(const uint64_t count) { void cudaSleep(at::cuda::CUDAStream& stream, uint64_t clocks) { waitClocks<<<1, 1, 0, stream.stream()>>>(clocks); - C10_CUDA_KERNEL_LAUNCH_CHECK(); } int cudaNumDevices() {