add channels last for AdaptiveAvgPool2d (#48916)
Summary:
Pull Request resolved: #48916

optimize adaptive average pool2d forward path

optimize adaptive average pool2d backward path

remove unused headers

minor change

minor change

rename the header so that adaptive max pooling can be added in the future

minor change

loosen the adaptive_pool2d nhwc test to cover both cuda and cpu devices

minor change
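
The headline change is a channels-last (NHWC) code path for AdaptiveAvgPool2d on CPU. Below is a minimal usage sketch (not part of the commit message), assuming the public ATen C++ API (at::adaptive_avg_pool2d, at::MemoryFormat::ChannelsLast):

// Sketch: run adaptive average pooling on a channels-last input and check
// that the output keeps the suggested (channels-last) memory format.
#include <ATen/ATen.h>
#include <iostream>

int main() {
  // 4D NCHW tensor stored in channels-last (NHWC) memory layout.
  at::Tensor input = at::randn({2, 16, 32, 32})
                         .contiguous(at::MemoryFormat::ChannelsLast);

  // Adaptive average pooling to a 7x7 spatial output.
  at::Tensor output = at::adaptive_avg_pool2d(input, {7, 7});

  // With this change, the CPU output should preserve the channels-last layout.
  std::cout << "output sizes: " << output.sizes()
            << ", channels_last: "
            << output.is_contiguous(at::MemoryFormat::ChannelsLast) << "\n";
  return 0;
}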

Test Plan: Imported from OSS

Reviewed By: ngimel

Differential Revision: D25399469

Pulled By: VitalyFedyunin

fbshipit-source-id: 86f9fda35194f21144bd4667b778c861c05a5bac
mingfeima authored and facebook-github-bot committed Dec 14, 2020
1 parent 8397a62 commit 690eaf9
Showing 6 changed files with 452 additions and 334 deletions.
322 changes: 47 additions & 275 deletions aten/src/ATen/native/AdaptiveAveragePooling.cpp
@@ -1,303 +1,73 @@
#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
-#include <ATen/Parallel.h>
-#include <tuple>
+#include <ATen/native/AdaptivePooling.h>


namespace at {
namespace native {

namespace {

inline int start_index(int a, int b, int c) {
return (int)std::floor((float)(a * c) / b);
}

inline int end_index(int a, int b, int c) {
return (int)std::ceil((float)((a + 1) * c) / b);
}

template <typename scalar_t>
static void adaptive_avg_pool2d_single_out_frame(
scalar_t *input_p,
scalar_t *output_p,
int64_t sizeD,
int64_t isizeH,
int64_t isizeW,
int64_t osizeH,
int64_t osizeW,
int64_t istrideD,
int64_t istrideH,
int64_t istrideW)
{
at::parallel_for(0, sizeD, 0, [&](int64_t start, int64_t end) {
for (auto d = start; d < end; d++)
{
/* loop over output */
int64_t oh, ow;
for(oh = 0; oh < osizeH; oh++)
{
int istartH = start_index(oh, osizeH, isizeH);
int iendH = end_index(oh, osizeH, isizeH);
int kH = iendH - istartH;

for(ow = 0; ow < osizeW; ow++)
{
int istartW = start_index(ow, osizeW, isizeW);
int iendW = end_index(ow, osizeW, isizeW);
int kW = iendW - istartW;

/* local pointers */
scalar_t *ip = input_p + d*istrideD + istartH*istrideH + istartW*istrideW;
scalar_t *op = output_p + d*osizeH*osizeW + oh*osizeW + ow;

/* compute local average: */
scalar_t sum = 0;
int ih, iw;
for(ih = 0; ih < kH; ih++)
{
for(iw = 0; iw < kW; iw++)
{
scalar_t val = *(ip + ih*istrideH + iw*istrideW);
sum += val;
}
}

/* set output to local average */
*op = sum / kW / kH;
}
}
}
});
}

template <typename scalar_t>
void adaptive_avg_pool2d_out_frame(
scalar_t *input_p,
scalar_t *output_p,
int64_t sizeB,
int64_t sizeD,
int64_t isizeH,
int64_t isizeW,
int64_t osizeH,
int64_t osizeW,
int64_t istrideB,
int64_t istrideD,
int64_t istrideH,
int64_t istrideW)
{
at::parallel_for(0, sizeB, 0, [&](int64_t start, int64_t end) {
for (auto b = start; b < end; b++)
{
adaptive_avg_pool2d_single_out_frame<scalar_t>(
input_p + b * istrideB,
output_p + b * sizeD * osizeH * osizeW,
sizeD,
isizeH, isizeW,
osizeH, osizeW,
istrideD,
istrideH, istrideW);
}
});
}

void adaptive_avg_pool2d_out_cpu_template(
at::Tensor& output,
at::Tensor const& input,
IntArrayRef output_size)
{
TORCH_CHECK(output_size.size() == 2, "adaptive_avg_pool2d: output_size must be 2");
-for (int64_t i = 0; i < input.ndimension(); i++) {
+int64_t ndim = input.ndimension();
+for (int64_t i = 0; i < ndim; i++) {
TORCH_CHECK(input.size(i) > 0,
"adaptive_avg_pooling2d(): expected input to have non-empty spatial dimensions, "
"but input has sizes ", input.sizes(), " with dimension ", i, " being "
"empty");
}

-TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4),
+TORCH_CHECK((ndim == 3 || ndim == 4),
"non-empty 3D or 4D (batch mode) tensor expected for input");
TORCH_CHECK(input.dtype() == output.dtype(),
"expected dtype ", input.dtype(), " for `output` but got dtype ", output.dtype());

-/* sizes */
-int64_t sizeD = input.size(-3);
-int64_t isizeH = input.size(-2);
-int64_t isizeW = input.size(-1);
-/* strides */
-int64_t istrideD = input.stride(-3);
-int64_t istrideH = input.stride(-2);
-int64_t istrideW = input.stride(-1);

-auto osizeH = output_size[0];
-auto osizeW = output_size[1];

-/* resize output */
-if (input.ndimension() == 3 || input.size(-4) == 1)
-{
-if (input.ndimension() == 3) {
-output.resize_({sizeD, osizeH, osizeW});
-} else {
-output.resize_({1, sizeD, osizeH, osizeW});
-}
-AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "adaptive_avg_pool2d_cpu", [&] {
-auto input_data = input.data_ptr<scalar_t>();
-auto output_data = output.data_ptr<scalar_t>();
-adaptive_avg_pool2d_single_out_frame<scalar_t>(
-input_data,
-output_data,
-sizeD,
-isizeH, isizeW,
-osizeH, osizeW,
-istrideD,
-istrideH, istrideW);
-}
-);
-}
-else
-{
-int64_t sizeB = input.size(-4);
-output.resize_({sizeB, sizeD, osizeH, osizeW});
-int64_t istrideB = input.stride(-4);
+int64_t channels = input.size(-3);
+int64_t input_height = input.size(-2);
+int64_t input_width = input.size(-1);
+int64_t output_height = output_size[0];
+int64_t output_width = output_size[1];

-AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "adaptive_avg_pool2d_cpu", [&] {
-auto input_data = input.data_ptr<scalar_t>();
-auto output_data = output.data_ptr<scalar_t>();
-adaptive_avg_pool2d_out_frame<scalar_t>(
-input_data,
-output_data,
-sizeB,
-sizeD,
-isizeH, isizeW,
-osizeH, osizeW,
-istrideB,
-istrideD,
-istrideH, istrideW);
-});
+if (ndim == 3) {
+output.resize_({channels, output_height, output_width});
+} else {
+int64_t nbatch = input.size(0);
+output.resize_({nbatch, channels, output_height, output_width}, input.suggest_memory_format());
+}
}

template <typename scalar_t>
static void adaptive_avg_pool2d_backward_single_out_frame(
scalar_t *gradInput_p,
scalar_t *gradOutput_p,
int64_t sizeD,
int64_t isizeH,
int64_t isizeW,
int64_t osizeH,
int64_t osizeW)
{
at::parallel_for(0, sizeD, 0, [&](int64_t start, int64_t end) {
for (auto d = start; d < end; d++)
{
scalar_t *gradInput_p_d = gradInput_p + d*isizeW*isizeH;
scalar_t *gradOutput_p_d = gradOutput_p + d*osizeW*osizeH;

/* calculate average */
int64_t oh, ow;
for(oh = 0; oh < osizeH; oh++)
{
int istartH = start_index(oh, osizeH, isizeH);
int iendH = end_index(oh, osizeH, isizeH);
int kH = iendH - istartH;

for(ow = 0; ow < osizeW; ow++)
{

int istartW = start_index(ow, osizeW, isizeW);
int iendW = end_index(ow, osizeW, isizeW);
int kW = iendW - istartW;

scalar_t grad_delta = gradOutput_p_d[oh*osizeW +ow] / kH / kW;

int ih, iw;
for(ih = istartH; ih < iendH; ih++)
{
for(iw = istartW; iw < iendW; iw++)
{
/* update gradient */
gradInput_p_d[ih*isizeW + iw] += grad_delta;
}
}
}
}
}
});
}

template <typename scalar_t>
void adaptive_avg_pool2d_backward_out_frame(
scalar_t *gradInput_p,
scalar_t *gradOutput_p,
int64_t sizeB,
int64_t sizeD,
int64_t isizeH,
int64_t isizeW,
int64_t osizeH,
int64_t osizeW)
{
at::parallel_for(0, sizeB, 0, [&](int64_t start, int64_t end) {
for (auto b = start; b < end; b++)
{
scalar_t *gradInput_p_d = gradInput_p + b * sizeD * isizeW * isizeH;
scalar_t *gradOutput_p_d = gradOutput_p + b * sizeD * osizeW * osizeH;
adaptive_avg_pool2d_backward_single_out_frame<scalar_t>(
gradInput_p_d,
gradOutput_p_d,
sizeD,
isizeH, isizeW,
osizeH, osizeW);
}
});
+adaptive_avg_pool2d_kernel(kCPU, output, input, output_size);
}

Tensor& adaptive_avg_pool2d_backward_out_cpu_template(
-Tensor& gradInput,
-const Tensor& gradOutput_,
+Tensor& grad_input,
+const Tensor& grad_output,
const Tensor& input)
{
-/* sizes */
-int sizeD = input.size(-3);
-int isizeH = input.size(-2);
-int isizeW = input.size(-1);
-int osizeH = gradOutput_.size(-2);
-int osizeW = gradOutput_.size(-1);

-/* get contiguous gradOutput */
-auto gradOutput = gradOutput_.contiguous();
+int64_t ndim = grad_output.ndimension();
+for (int64_t i = 0; i < ndim; i++) {
+TORCH_CHECK(grad_output.size(i) > 0,
+"adaptive_avg_pooling2d_backward(): expected grad_output to have non-empty spatial dimensions, "
+"but grad_output has sizes ", grad_output.sizes(), " with dimension ", i, " being "
+"empty");
+}

-/* backprop */
-if (input.ndimension() == 3 || input.size(-4) == 1)
-{
-AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-input.scalar_type(), "adaptive_avg_pool2d_backward_cpu", [&] {
-/* get raw pointers */
-scalar_t *gradInput_data = gradInput.data_ptr<scalar_t>();
-scalar_t *gradOutput_data = gradOutput.data_ptr<scalar_t>();
+TORCH_CHECK((ndim == 3 || ndim == 4),
+"non-empty 3D or 4D (batch mode) tensor expected for grad_output");
+TORCH_CHECK(input.dtype() == grad_output.dtype(),
+"expected dtype ", input.dtype(), " for `grad_output` but got dtype ", grad_output.dtype());
+TORCH_CHECK(input.dtype() == grad_input.dtype(),
+"expected dtype ", input.dtype(), " for `grad_input` but got dtype ", grad_input.dtype());

-adaptive_avg_pool2d_backward_single_out_frame<scalar_t>(
-gradInput_data, gradOutput_data,
-sizeD,
-isizeH, isizeW,
-osizeH, osizeW);
-}
-);
-}
-else
-{
-AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-input.scalar_type(), "adaptive_avg_pool2d_backward_cpu", [&] {
-/* get raw pointers */
-scalar_t *gradInput_data = gradInput.data_ptr<scalar_t>();
-scalar_t *gradOutput_data = gradOutput.data_ptr<scalar_t>();
-int64_t sizeB = input.size(-4);
+grad_input.resize_(input.sizes(), input.suggest_memory_format());
+grad_input.zero_();

-adaptive_avg_pool2d_backward_out_frame<scalar_t>(
-gradInput_data, gradOutput_data,
-sizeB, sizeD,
-isizeH, isizeW,
-osizeH, osizeW);
-}
-);
-}
-return gradInput;
+adaptive_avg_pool2d_backward_kernel(kCPU, grad_input, grad_output);
+return grad_input;
}

} // namespace
@@ -346,25 +116,27 @@ namespace {
}

Tensor& adaptive_avg_pool2d_backward_out_cpu(
-Tensor& gradInput,
-const Tensor& gradOutput,
+Tensor& grad_input,
+const Tensor& grad_output,
const Tensor& input)
{
-gradInput.resize_as_(input);
adaptive_avg_pool2d_backward_out_cpu_template(
-gradInput, gradOutput, input);
-return gradInput;
+grad_input, grad_output, input);
+return grad_input;
}

Tensor adaptive_avg_pool2d_backward_cpu(
-const Tensor& gradOutput,
+const Tensor& grad_output,
const Tensor& input)
{
-auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+auto grad_input = at::empty({0}, input.options());
adaptive_avg_pool2d_backward_out_cpu_template(
-gradInput, gradOutput, input);
-return gradInput;
+grad_input, grad_output, input);
+return grad_input;
}

+DEFINE_DISPATCH(adaptive_avg_pool2d_kernel);
+DEFINE_DISPATCH(adaptive_avg_pool2d_backward_kernel);

} // at::native
} // at
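
The per-dtype loops deleted above do not disappear; they move into a CPU kernel translation unit (one of the six changed files not shown in this excerpt) that registers itself against the stubs declared in AdaptivePooling.h below. A minimal sketch of that DispatchStub registration pattern follows; the implementation name and the elided loop body are assumptions, not quotes from the commit:

// Illustrative sketch of binding a CPU implementation to the new dispatch stubs.
#include <ATen/ATen.h>
#include <ATen/Dispatch.h>
#include <ATen/native/AdaptivePooling.h>

namespace at { namespace native {
namespace {

// Hypothetical implementation; the real kernel provides contiguous (NCHW)
// and channels-last (NHWC) loops over scalar_t here.
void adaptive_avg_pool2d_kernel_impl(
    Tensor& output, const Tensor& input, IntArrayRef output_size) {
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "adaptive_avg_pool2d", [&] {
        // ... per-dtype pooling loops ...
      });
}

} // anonymous namespace

// Makes adaptive_avg_pool2d_kernel(kCPU, ...) dispatch to the function above.
REGISTER_DISPATCH(adaptive_avg_pool2d_kernel, &adaptive_avg_pool2d_kernel_impl);

}} // namespace at::native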
21 changes: 21 additions & 0 deletions aten/src/ATen/native/AdaptivePooling.h
@@ -0,0 +1,21 @@
#pragma once

#include <ATen/ATen.h>
#include <ATen/native/DispatchStub.h>

namespace at { namespace native {

using adaptive_avg_pooling_fn = void(*)(Tensor& output, const Tensor& input, IntArrayRef output_size);
using adaptive_avg_pooling_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output);
DECLARE_DISPATCH(adaptive_avg_pooling_fn, adaptive_avg_pool2d_kernel);
DECLARE_DISPATCH(adaptive_avg_pooling_backward_fn, adaptive_avg_pool2d_backward_kernel);

static inline int64_t start_index(int64_t a, int64_t b, int64_t c) {
return (int64_t)std::floor((float)(a * c) / b);
}

static inline int64_t end_index(int64_t a, int64_t b, int64_t c) {
return (int64_t)std::ceil((float)((a + 1) * c) / b);
}

}} // namespace at::native
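The two helpers moved into this header define how each output bin maps back onto the input. A small standalone sketch (plain C++, mirroring the formulas above) prints the input range each output bin averages over for a 10-element input pooled to 4 outputs:

// Prints the [start, end) input range each adaptive-pool output bin covers.
#include <cmath>
#include <cstdint>
#include <cstdio>

static inline int64_t start_index(int64_t a, int64_t b, int64_t c) {
  return (int64_t)std::floor((float)(a * c) / b);
}

static inline int64_t end_index(int64_t a, int64_t b, int64_t c) {
  return (int64_t)std::ceil((float)((a + 1) * c) / b);
}

int main() {
  const int64_t isize = 10;  // input extent (e.g. width)
  const int64_t osize = 4;   // output extent
  for (int64_t o = 0; o < osize; o++) {
    int64_t s = start_index(o, osize, isize);
    int64_t e = end_index(o, osize, isize);
    std::printf("output %lld <- input [%lld, %lld), k = %lld\n",
                (long long)o, (long long)s, (long long)e, (long long)(e - s));
  }
  return 0;
}

Because the bin edges round outward, adjacent bins can overlap when the input size is not a multiple of the output size (here [0, 3), [2, 5), [5, 8), [7, 10)), which is why the backward path accumulates the averaged gradient into every input element a bin touches.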
