Remove fgrad_input from slow_conv2d #64280

Closed · wants to merge 7 commits
6 changes: 3 additions & 3 deletions aten/src/ATen/core/aten_interned_strings.h
@@ -700,9 +700,9 @@ _(aten, th_pow) \
_(aten, th_resize_as) \
_(aten, th_tensor) \
_(aten, th_zero) \
_(aten, thnn_conv2d) \
_(aten, thnn_conv2d_backward) \
_(aten, thnn_conv2d_forward) \
_(aten, _slow_conv2d) \
_(aten, _slow_conv2d_backward) \
_(aten, _slow_conv2d_forward) \
_(aten, tile) \
_(aten, slow_conv3d) \
_(aten, slow_conv3d_backward) \
2 changes: 1 addition & 1 deletion aten/src/ATen/native/Convolution.cpp
@@ -1055,7 +1055,7 @@ at::Tensor _convolution_nogroup(
} else {
/* CPU implementation has specialized MM kernels
for non-dilated case here */
return at::thnn_conv2d(
return at::_slow_conv2d(
input, weight, kernel_size, bias,
stride, padding);
}
79 changes: 24 additions & 55 deletions aten/src/ATen/native/ConvolutionMM2d.cpp
@@ -210,7 +210,7 @@ void slow_conv2d_backward_update_grad_input_frame(
int64_t pad_width) {
auto grad_output_2d = grad_output.reshape(
{grad_output.size(0), grad_output.size(1) * grad_output.size(2)});
fgrad_input.addmm_(weight, grad_output_2d, 0, 1);
at::mm_out(fgrad_input, weight, grad_output_2d);

grad_input.zero_();
unfolded2d_acc_stub(
@@ -236,7 +236,6 @@ void slow_conv2d_backward_out_cpu_template(
const Tensor& input_,
const Tensor& weight_,
const Tensor& finput,
Tensor& fgrad_input,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding) {
@@ -264,22 +263,20 @@
const Tensor input = input_.contiguous();
const Tensor grad_output = grad_output_.contiguous();
grad_input.resize_as_(input);
fgrad_input.resize_as_(finput);
fgrad_input.zero_();
const Tensor tweight = weight.transpose(0, 1);
const int64_t batch_size = input.size(0);
at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) {
NoGradGuard no_grad;
AutoDispatchBelowADInplaceOrView non_variable_type_mode;
auto fgrad_input = at::empty(finput.sizes().slice(1), finput.options());
for (int64_t t = start; t < end; t++) {
Tensor grad_input_t = grad_input[t];
Tensor grad_output_t = grad_output[t];
Tensor fgrad_input_t = fgrad_input[t];
slow_conv2d_backward_update_grad_input_frame(
grad_input_t,
grad_output_t,
tweight,
fgrad_input_t,
fgrad_input,
kernel_height,
kernel_width,
stride_height,
@@ -290,9 +287,8 @@ void slow_conv2d_backward_out_cpu_template(
});
}

void slow_conv2d_backward_parameters_frame(
void slow_conv2d_backward_weight_frame(
Tensor& grad_weight,
Tensor& grad_bias,
Tensor& grad_output,
const Tensor& finput) {
auto grad_output_2d = grad_output.view(
@@ -301,25 +297,6 @@ void slow_conv2d_backward_parameters_frame(
const Tensor tfinput = finput.transpose(0, 1);
grad_weight.addmm_(grad_output_2d, tfinput);
}

if (grad_bias.defined()) {
AT_DISPATCH_FLOATING_TYPES_AND(
at::ScalarType::BFloat16,
grad_output.scalar_type(),
"slow_conv2d_backward_parameters",
[&] {
auto grad_output_2d_acc = grad_output_2d.accessor<scalar_t, 2>();
auto grad_bias_acc = grad_bias.accessor<scalar_t, 1>();
const auto sz = grad_output_2d.size(1);
for (int64_t i = 0; i < grad_bias.size(0); i++) {
scalar_t sum = 0;
for (int64_t k = 0; k < sz; k++) {
sum += grad_output_2d_acc[i][k];
}
grad_bias_acc[i] += sum;
}
});
}
}

static void slow_conv2d_backward_parameters_out_cpu_template(
@@ -328,7 +305,6 @@ static void slow_conv2d_backward_parameters_out_cpu_template(
const Tensor& input_,
const Tensor& grad_output_,
const Tensor& finput,
Tensor fgrad_input,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding) {
@@ -349,10 +325,6 @@ static void slow_conv2d_backward_parameters_out_cpu_template(
grad_weight_2d = view_weight_2d(grad_weight);
}

if (grad_bias.defined()) {
checkContiguous(c, grad_bias_arg);
}

slow_conv2d_shape_check(
input_,
grad_output_,
@@ -369,6 +341,11 @@ static void slow_conv2d_backward_parameters_out_cpu_template(
auto input = input_.contiguous();
auto grad_output = grad_output_.contiguous();

if (grad_bias.defined()) {
Tensor grad_bias_mut = grad_bias;
at::sum_out(grad_bias_mut, grad_output, IntArrayRef{0, 2, 3});
}

Contributor

This is a much nicer way to compute grad_bias and seems correct AFAICT.
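
For reference, a quick way to sanity-check the equivalence on random data (just a sketch, not part of the PR):

import torch

grad_output = torch.rand(4, 10, 6, 6)  # (N, C_out, H_out, W_out)

# New path: grad_bias is grad_output summed over the batch and spatial dims,
# matching the at::sum_out(..., {0, 2, 3}) call in the diff above.
new_grad_bias = grad_output.sum(dim=(0, 2, 3))

# Old path: per-frame accumulation over the flattened spatial dims, mirroring
# the removed scalar loop in slow_conv2d_backward_parameters_frame.
old_grad_bias = torch.zeros(grad_output.size(1))
for t in range(grad_output.size(0)):
    grad_output_2d = grad_output[t].reshape(grad_output.size(1), -1)
    old_grad_bias += grad_output_2d.sum(dim=1)

assert torch.allclose(new_grad_bias, old_grad_bias)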

A couple of questions regarding this:

  • Are there any tests verifying the grad_bias calculation? I briefly checked but nothing stood out
  • Do you by chance have any comparative benchmarking results for the change?

Collaborator Author

Are there any tests verifying the grad_bias calculation? I briefly checked but nothing stood out

The nn tests are a bit confusing, but I think the gradcheck is done here:

def _check_gradients(self, test_case, module, input_tuple):

That test includes inputs and parameters, so bias should be covered.
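
For a quick standalone sanity check, something along these lines exercises the bias gradient directly (a sketch only; whether it actually reaches the slow_conv2d kernel depends on the build and dispatch):

import torch
import torch.nn.functional as F

# gradcheck over input, weight and bias; double precision is needed for the
# finite-difference comparison to be meaningful.
x = torch.rand(2, 3, 8, 8, dtype=torch.double, requires_grad=True)
w = torch.rand(4, 3, 3, 3, dtype=torch.double, requires_grad=True)
b = torch.rand(4, dtype=torch.double, requires_grad=True)

torch.autograd.gradcheck(
    lambda x, w, b: F.conv2d(x, w, b, stride=1, padding=1), (x, w, b))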

Do you by chance have any comparative benchmarking results for the change?

I've done some simple benchmarking which showed positive results. LMK if you want more.

For CUDA, I've run a convolution in a loop under nvprof with these parameters:

input = torch.rand(1, 50, 260, 260, device=device, requires_grad=False)
weight = torch.rand(10, 50, 3, 3, device=device, requires_grad=False)
bias = torch.rand(10, device=device, requires_grad=True)
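
A driver along these lines reproduces the setup (a sketch rather than the exact script; cuDNN is disabled on the assumption that the dispatcher then falls through to the slow path, and the kernel times come from nvprof rather than Python timers):

import torch
import torch.nn.functional as F

torch.backends.cudnn.enabled = False  # assumption: force the non-cuDNN path
device = "cuda"

input = torch.rand(1, 50, 260, 260, device=device, requires_grad=False)
weight = torch.rand(10, 50, 3, 3, device=device, requires_grad=False)
bias = torch.rand(10, device=device, requires_grad=True)

for _ in range(1000):
    out = F.conv2d(input, weight, bias)
    out.sum().backward()  # only bias requires grad, so backward computes grad_bias
torch.cuda.synchronize()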

The result is that it calls one kernel instead of two (the cublas gemm here shows up as dot_kernel + reduce_1Block_kernel). The kernel execution time is also noticeably faster:

             Master (us)   This PR (us)
Forward      23.8          14.6
Backward     22.4          18.1

On CPU (timed without nvprof), the effect is dwarfed by the main convolution, and the same shapes showed no measurable difference. So I also tried a smaller input size with H=120, C_in=2, which showed a minor improvement in the forward case (presumably from vectorization).

                  Master (us)   This PR (us)
Forward (Big)     15,100        15,100
Backward (Big)    13,100        13,100
Forward (Small)   526           499
Backward (Small)  1,050         1,050
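
For the CPU side, torch.utils.benchmark is one way to reproduce this kind of measurement (a sketch; the "small" shapes below are my reading of H=120, C_in=2, with the other dimensions assumed to match the CUDA case, and whether conv2d dispatches to slow_conv2d depends on the build):

import torch
from torch.utils.benchmark import Timer

x = torch.rand(1, 2, 120, 120)   # assumed "small" input: N=1, C_in=2, H=W=120
w = torch.rand(10, 2, 3, 3)      # assumed to keep C_out=10 and a 3x3 kernel
b = torch.rand(10, requires_grad=True)

fwd = Timer(stmt="F.conv2d(x, w, b)",
            setup="import torch.nn.functional as F",
            globals={"x": x, "w": w, "b": b})
bwd = Timer(stmt="F.conv2d(x, w, b).sum().backward()",
            setup="import torch.nn.functional as F",
            globals={"x": x, "w": w, "b": b})
print(fwd.blocked_autorange())
print(bwd.blocked_autorange())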

const int64_t batch_size = input.size(0);
for (int64_t t = 0; t < batch_size; t++) {
Tensor grad_output_t = grad_output[t];
@@ -377,21 +354,21 @@ static void slow_conv2d_backward_parameters_out_cpu_template(
finput_t = finput[t];
}

slow_conv2d_backward_parameters_frame(
grad_weight_2d, grad_bias, grad_output_t, finput_t);
slow_conv2d_backward_weight_frame(
grad_weight_2d, grad_output_t, finput_t);
}
}

} // namespace

std::tuple<Tensor&, Tensor&, Tensor&> slow_conv2d_forward_out_cpu(const Tensor& self,
std::tuple<Tensor&, Tensor&> slow_conv2d_forward_out_cpu(
const Tensor& self,
const Tensor& weight_,
IntArrayRef kernel_size, const c10::optional<Tensor>& bias_opt,
IntArrayRef stride,
IntArrayRef padding,
Tensor& output,
Tensor& finput,
Tensor& fgrad_input) {
Tensor& finput) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
const Tensor& bias = *bias_maybe_owned;
@@ -474,10 +451,10 @@ std::tuple<Tensor&, Tensor&, Tensor&> slow_conv2d_forward_out_cpu(const Tensor&
}
});

return std::tuple<Tensor&, Tensor&, Tensor&>(output, finput, fgrad_input);
return std::tuple<Tensor&, Tensor&>(output, finput);
}

std::tuple<Tensor, Tensor, Tensor> slow_conv2d_forward_cpu(
std::tuple<Tensor, Tensor> slow_conv2d_forward_cpu(
const Tensor& self,
const Tensor& weight,
IntArrayRef kernel_size, const c10::optional<Tensor>& bias_opt,
@@ -489,7 +466,6 @@ std::tuple<Tensor, Tensor, Tensor> slow_conv2d_forward_cpu(

auto output = at::empty({0}, self.options());
auto finput = at::empty({0}, self.options());
auto fgrad_input = at::empty({0}, self.options());
at::native::slow_conv2d_forward_out_cpu(
self,
weight,
@@ -498,19 +474,18 @@ std::tuple<Tensor, Tensor, Tensor> slow_conv2d_forward_cpu(
stride,
padding,
output,
finput,
fgrad_input);
return std::make_tuple(output, finput, fgrad_input);
finput);
return std::make_tuple(output, finput);
}

std::tuple<Tensor&, Tensor&, Tensor&> slow_conv2d_backward_out_cpu(const Tensor& grad_output,
std::tuple<Tensor&, Tensor&, Tensor&> slow_conv2d_backward_out_cpu(
const Tensor& grad_output,
const Tensor& self,
const Tensor& weight,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding,
const Tensor& finput,
const Tensor& fgrad_input,
Tensor& grad_input,
Tensor& grad_weight,
Tensor& grad_bias) {
@@ -521,8 +496,6 @@ std::tuple<Tensor&, Tensor&, Tensor&> slow_conv2d_backward_out_cpu(const Tensor&
self,
weight,
finput,
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
const_cast<Tensor&>(fgrad_input), // cast away auto-generated const of buffer
kernel_size,
stride,
padding);
@@ -545,7 +518,6 @@ std::tuple<Tensor&, Tensor&, Tensor&> slow_conv2d_backward_out_cpu(const Tensor&
self,
grad_output,
finput,
fgrad_input,
kernel_size,
stride,
padding);
@@ -563,7 +535,6 @@ std::tuple<Tensor, Tensor, Tensor> slow_conv2d_backward_cpu(
IntArrayRef stride,
IntArrayRef padding,
const Tensor& finput,
const Tensor& fgrad_input,
std::array<bool, 3> output_mask) {
Tensor grad_input;
Tensor grad_weight;
@@ -589,30 +560,28 @@ std::tuple<Tensor, Tensor, Tensor> slow_conv2d_backward_cpu(
stride,
padding,
finput,
fgrad_input,
grad_input,
grad_weight,
grad_bias);

return std::make_tuple(grad_input, grad_weight, grad_bias);
}

Tensor & thnn_conv2d_out(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const c10::optional<Tensor>& bias_opt, IntArrayRef stride, IntArrayRef padding, Tensor & output) {
Tensor & _slow_conv2d_out(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const c10::optional<Tensor>& bias_opt, IntArrayRef stride, IntArrayRef padding, Tensor & output) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
const Tensor& bias = *bias_maybe_owned;

Tensor finput = at::empty({0}, self.options());
Tensor fgrad_input = at::empty({0}, self.options());
return std::get<0>(at::thnn_conv2d_forward_out(output, finput, fgrad_input, self, weight, kernel_size, bias, stride, padding));
return std::get<0>(at::_slow_conv2d_forward_out(output, finput, self, weight, kernel_size, bias, stride, padding));
}

Tensor thnn_conv2d(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const c10::optional<Tensor>& bias_opt, IntArrayRef stride, IntArrayRef padding) {
Tensor _slow_conv2d(const Tensor & self, const Tensor & weight, IntArrayRef kernel_size, const c10::optional<Tensor>& bias_opt, IntArrayRef stride, IntArrayRef padding) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt);
const Tensor& bias = *bias_maybe_owned;

return std::get<0>(at::thnn_conv2d_forward(self, weight, kernel_size, bias, stride, padding));
return std::get<0>(at::_slow_conv2d_forward(self, weight, kernel_size, bias, stride, padding));
}

} // namespace native