add channels last for AdaptiveAvgPool2d
optimize adaptive average pool2d forward path

optimize adaptive average pool2d backward path

remove unused headers

minor change

minor change

rename the header; adaptive max pooling to be added in the future.

minor change

loosen adaptive_pool2d nhwc test to cover both cuda and cpu devices

minor change
mingfeima committed Nov 5, 2020
1 parent 996f444 commit 271c085
Showing 6 changed files with 452 additions and 334 deletions.
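Before the per-file diffs, here is what the change amounts to in practice: the CPU forward path now honors the input's suggested memory format, so a channels-last (NHWC) input produces a channels-last output instead of always falling back to NCHW. A minimal sketch of the intended behavior, assuming a libtorch build (the tensor sizes are illustrative, not from the commit):

```cpp
#include <ATen/ATen.h>
#include <iostream>

int main() {
  // NCHW-shaped tensor reordered to the channels-last (NHWC) memory layout.
  at::Tensor input = at::randn({2, 3, 8, 8})
                         .contiguous(at::MemoryFormat::ChannelsLast);

  at::Tensor output = at::adaptive_avg_pool2d(input, {4, 4});

  // With this commit, the CPU forward path keeps the suggested memory
  // format rather than producing a default-contiguous (NCHW) result.
  std::cout << output.is_contiguous(at::MemoryFormat::ChannelsLast)
            << std::endl;  // expected: 1
}
```

The backward path gets the same treatment: grad_input is now resized with input.suggest_memory_format(), as the diff below shows.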
322 changes: 47 additions & 275 deletions aten/src/ATen/native/AdaptiveAveragePooling.cpp
@@ -1,303 +1,73 @@
#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
-#include <ATen/Parallel.h>
-#include <tuple>
+#include <ATen/native/AdaptivePooling.h>


namespace at {
namespace native {

namespace {

-inline int start_index(int a, int b, int c) {
-  return (int)std::floor((float)(a * c) / b);
-}
-
-inline int end_index(int a, int b, int c) {
-  return (int)std::ceil((float)((a + 1) * c) / b);
-}

-template <typename scalar_t>
-static void adaptive_avg_pool2d_single_out_frame(
-          scalar_t *input_p,
-          scalar_t *output_p,
-          int64_t sizeD,
-          int64_t isizeH,
-          int64_t isizeW,
-          int64_t osizeH,
-          int64_t osizeW,
-          int64_t istrideD,
-          int64_t istrideH,
-          int64_t istrideW)
-{
-  at::parallel_for(0, sizeD, 0, [&](int64_t start, int64_t end) {
-    for (auto d = start; d < end; d++)
-    {
-      /* loop over output */
-      int64_t oh, ow;
-      for(oh = 0; oh < osizeH; oh++)
-      {
-        int istartH = start_index(oh, osizeH, isizeH);
-        int iendH = end_index(oh, osizeH, isizeH);
-        int kH = iendH - istartH;
-
-        for(ow = 0; ow < osizeW; ow++)
-        {
-          int istartW = start_index(ow, osizeW, isizeW);
-          int iendW = end_index(ow, osizeW, isizeW);
-          int kW = iendW - istartW;
-
-          /* local pointers */
-          scalar_t *ip = input_p + d*istrideD + istartH*istrideH + istartW*istrideW;
-          scalar_t *op = output_p + d*osizeH*osizeW + oh*osizeW + ow;
-
-          /* compute local average: */
-          scalar_t sum = 0;
-          int ih, iw;
-          for(ih = 0; ih < kH; ih++)
-          {
-            for(iw = 0; iw < kW; iw++)
-            {
-              scalar_t val = *(ip + ih*istrideH + iw*istrideW);
-              sum += val;
-            }
-          }
-
-          /* set output to local average */
-          *op = sum / kW / kH;
-        }
-      }
-    }
-  });
-}
-
-template <typename scalar_t>
-void adaptive_avg_pool2d_out_frame(
-          scalar_t *input_p,
-          scalar_t *output_p,
-          int64_t sizeB,
-          int64_t sizeD,
-          int64_t isizeH,
-          int64_t isizeW,
-          int64_t osizeH,
-          int64_t osizeW,
-          int64_t istrideB,
-          int64_t istrideD,
-          int64_t istrideH,
-          int64_t istrideW)
-{
-  at::parallel_for(0, sizeB, 0, [&](int64_t start, int64_t end) {
-    for (auto b = start; b < end; b++)
-    {
-      adaptive_avg_pool2d_single_out_frame<scalar_t>(
-        input_p + b * istrideB,
-        output_p + b * sizeD * osizeH * osizeW,
-        sizeD,
-        isizeH, isizeW,
-        osizeH, osizeW,
-        istrideD,
-        istrideH, istrideW);
-    }
-  });
-}

void adaptive_avg_pool2d_out_cpu_template(
at::Tensor& output,
at::Tensor const& input,
IntArrayRef output_size)
{
TORCH_CHECK(output_size.size() == 2, "adaptive_avg_pool2d: output_size must be 2");
-  for (int64_t i = 0; i < input.ndimension(); i++) {
+  int64_t ndim = input.ndimension();
+  for (int64_t i = 0; i < ndim; i++) {
TORCH_CHECK(input.size(i) > 0,
"adaptive_avg_pooling2d(): expected input to have non-empty spatial dimensions, "
"but input has sizes ", input.sizes(), " with dimension ", i, " being "
"empty");
}

-  TORCH_CHECK((input.ndimension() == 3 || input.ndimension() == 4),
+  TORCH_CHECK((ndim == 3 || ndim == 4),
     "non-empty 3D or 4D (batch mode) tensor expected for input");
+  TORCH_CHECK(input.dtype() == output.dtype(),
+    "expected dtype ", input.dtype(), " for `output` but got dtype ", output.dtype());

-  /* sizes */
-  int64_t sizeD = input.size(-3);
-  int64_t isizeH = input.size(-2);
-  int64_t isizeW = input.size(-1);
-  /* strides */
-  int64_t istrideD = input.stride(-3);
-  int64_t istrideH = input.stride(-2);
-  int64_t istrideW = input.stride(-1);
-
-  auto osizeH = output_size[0];
-  auto osizeW = output_size[1];
-
-  /* resize output */
-  if (input.ndimension() == 3 || input.size(-4) == 1)
-  {
-    if (input.ndimension() == 3) {
-      output.resize_({sizeD, osizeH, osizeW});
-    } else {
-      output.resize_({1, sizeD, osizeH, osizeW});
-    }
-    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "adaptive_avg_pool2d_cpu", [&] {
-      auto input_data = input.data_ptr<scalar_t>();
-      auto output_data = output.data_ptr<scalar_t>();
-      adaptive_avg_pool2d_single_out_frame<scalar_t>(
-        input_data,
-        output_data,
-        sizeD,
-        isizeH, isizeW,
-        osizeH, osizeW,
-        istrideD,
-        istrideH, istrideW);
-      }
-    );
-  }
-  else
-  {
-    int64_t sizeB = input.size(-4);
-    output.resize_({sizeB, sizeD, osizeH, osizeW});
-    int64_t istrideB = input.stride(-4);
+  int64_t channels = input.size(-3);
+  int64_t input_height = input.size(-2);
+  int64_t input_width = input.size(-1);
+  int64_t output_height = output_size[0];
+  int64_t output_width = output_size[1];

-    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "adaptive_avg_pool2d_cpu", [&] {
-      auto input_data = input.data_ptr<scalar_t>();
-      auto output_data = output.data_ptr<scalar_t>();
-      adaptive_avg_pool2d_out_frame<scalar_t>(
-        input_data,
-        output_data,
-        sizeB,
-        sizeD,
-        isizeH, isizeW,
-        osizeH, osizeW,
-        istrideB,
-        istrideD,
-        istrideH, istrideW);
-    });
+  if (ndim == 3) {
+    output.resize_({channels, output_height, output_width});
+  } else {
+    int64_t nbatch = input.size(0);
+    output.resize_({nbatch, channels, output_height, output_width}, input.suggest_memory_format());
+  }
-  }

-template <typename scalar_t>
-static void adaptive_avg_pool2d_backward_single_out_frame(
-  scalar_t *gradInput_p,
-  scalar_t *gradOutput_p,
-  int64_t sizeD,
-  int64_t isizeH,
-  int64_t isizeW,
-  int64_t osizeH,
-  int64_t osizeW)
-{
-  at::parallel_for(0, sizeD, 0, [&](int64_t start, int64_t end) {
-    for (auto d = start; d < end; d++)
-    {
-      scalar_t *gradInput_p_d = gradInput_p + d*isizeW*isizeH;
-      scalar_t *gradOutput_p_d = gradOutput_p + d*osizeW*osizeH;
-
-      /* calculate average */
-      int64_t oh, ow;
-      for(oh = 0; oh < osizeH; oh++)
-      {
-        int istartH = start_index(oh, osizeH, isizeH);
-        int iendH = end_index(oh, osizeH, isizeH);
-        int kH = iendH - istartH;
-
-        for(ow = 0; ow < osizeW; ow++)
-        {
-          int istartW = start_index(ow, osizeW, isizeW);
-          int iendW = end_index(ow, osizeW, isizeW);
-          int kW = iendW - istartW;
-
-          scalar_t grad_delta = gradOutput_p_d[oh*osizeW + ow] / kH / kW;
-
-          int ih, iw;
-          for(ih = istartH; ih < iendH; ih++)
-          {
-            for(iw = istartW; iw < iendW; iw++)
-            {
-              /* update gradient */
-              gradInput_p_d[ih*isizeW + iw] += grad_delta;
-            }
-          }
-        }
-      }
-    }
-  });
-}
-
-template <typename scalar_t>
-void adaptive_avg_pool2d_backward_out_frame(
-  scalar_t *gradInput_p,
-  scalar_t *gradOutput_p,
-  int64_t sizeB,
-  int64_t sizeD,
-  int64_t isizeH,
-  int64_t isizeW,
-  int64_t osizeH,
-  int64_t osizeW)
-{
-  at::parallel_for(0, sizeB, 0, [&](int64_t start, int64_t end) {
-    for (auto b = start; b < end; b++)
-    {
-      scalar_t *gradInput_p_d = gradInput_p + b * sizeD * isizeW * isizeH;
-      scalar_t *gradOutput_p_d = gradOutput_p + b * sizeD * osizeW * osizeH;
-      adaptive_avg_pool2d_backward_single_out_frame<scalar_t>(
-        gradInput_p_d,
-        gradOutput_p_d,
-        sizeD,
-        isizeH, isizeW,
-        osizeH, osizeW);
-    }
-  });
+  adaptive_avg_pool2d_kernel(kCPU, output, input, output_size);
}

Tensor& adaptive_avg_pool2d_backward_out_cpu_template(
-  Tensor& gradInput,
-  const Tensor& gradOutput_,
+  Tensor& grad_input,
+  const Tensor& grad_output,
const Tensor& input)
{
-  /* sizes */
-  int sizeD = input.size(-3);
-  int isizeH = input.size(-2);
-  int isizeW = input.size(-1);
-  int osizeH = gradOutput_.size(-2);
-  int osizeW = gradOutput_.size(-1);
-
-  /* get contiguous gradOutput */
-  auto gradOutput = gradOutput_.contiguous();
+  int64_t ndim = grad_output.ndimension();
+  for (int64_t i = 0; i < ndim; i++) {
+    TORCH_CHECK(grad_output.size(i) > 0,
+      "adaptive_avg_pooling2d_backward(): expected grad_output to have non-empty spatial dimensions, "
+      "but grad_output has sizes ", grad_output.sizes(), " with dimension ", i, " being "
+      "empty");
+  }

-  /* backprop */
-  if (input.ndimension() == 3 || input.size(-4) == 1)
-  {
-    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      input.scalar_type(), "adaptive_avg_pool2d_backward_cpu", [&] {
-        /* get raw pointers */
-        scalar_t *gradInput_data = gradInput.data_ptr<scalar_t>();
-        scalar_t *gradOutput_data = gradOutput.data_ptr<scalar_t>();
+  TORCH_CHECK((ndim == 3 || ndim == 4),
+    "non-empty 3D or 4D (batch mode) tensor expected for grad_output");
+  TORCH_CHECK(input.dtype() == grad_output.dtype(),
+    "expected dtype ", input.dtype(), " for `grad_output` but got dtype ", grad_output.dtype());
+  TORCH_CHECK(input.dtype() == grad_input.dtype(),
+    "expected dtype ", input.dtype(), " for `grad_input` but got dtype ", grad_input.dtype());

-        adaptive_avg_pool2d_backward_single_out_frame<scalar_t>(
-          gradInput_data, gradOutput_data,
-          sizeD,
-          isizeH, isizeW,
-          osizeH, osizeW);
-      }
-    );
-  }
-  else
-  {
-    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      input.scalar_type(), "adaptive_avg_pool2d_backward_cpu", [&] {
-        /* get raw pointers */
-        scalar_t *gradInput_data = gradInput.data_ptr<scalar_t>();
-        scalar_t *gradOutput_data = gradOutput.data_ptr<scalar_t>();
-        int64_t sizeB = input.size(-4);
+  grad_input.resize_(input.sizes(), input.suggest_memory_format());
+  grad_input.zero_();

-        adaptive_avg_pool2d_backward_out_frame<scalar_t>(
-          gradInput_data, gradOutput_data,
-          sizeB, sizeD,
-          isizeH, isizeW,
-          osizeH, osizeW);
-      }
-    );
-  }
-  return gradInput;
+  adaptive_avg_pool2d_backward_kernel(kCPU, grad_input, grad_output);
+  return grad_input;
}

} // namespace
@@ -346,25 +116,27 @@ namespace {
}

Tensor& adaptive_avg_pool2d_backward_out_cpu(
-  Tensor& gradInput,
-  const Tensor& gradOutput,
+  Tensor& grad_input,
+  const Tensor& grad_output,
const Tensor& input)
{
-  gradInput.resize_as_(input);
adaptive_avg_pool2d_backward_out_cpu_template(
-    gradInput, gradOutput, input);
-  return gradInput;
+    grad_input, grad_output, input);
+  return grad_input;
}

Tensor adaptive_avg_pool2d_backward_cpu(
-  const Tensor& gradOutput,
+  const Tensor& grad_output,
const Tensor& input)
{
-  auto gradInput = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
+  auto grad_input = at::empty({0}, input.options());
adaptive_avg_pool2d_backward_out_cpu_template(
-    gradInput, gradOutput, input);
-  return gradInput;
+    grad_input, grad_output, input);
+  return grad_input;
}

+DEFINE_DISPATCH(adaptive_avg_pool2d_kernel);
+DEFINE_DISPATCH(adaptive_avg_pool2d_backward_kernel);

} // at::native
} // at
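The loops removed above do not disappear; they move behind a DispatchStub. After this change the .cpp file only validates arguments, resizes the output, and calls adaptive_avg_pool2d_kernel / adaptive_avg_pool2d_backward_kernel, whose CPU implementations live in a separate kernel translation unit that is part of this commit but not shown here. A sketch of how such a stub is typically wired up in ATen; `adaptive_avg_pool2d_kernel_impl` is a hypothetical name, not taken from the diff:

```cpp
// In a cpu/ kernel translation unit (not part of the excerpt above).
#include <ATen/native/AdaptivePooling.h>

namespace at { namespace native {
namespace {

void adaptive_avg_pool2d_kernel_impl(
    Tensor& output, const Tensor& input, IntArrayRef output_size) {
  // Branch on input.suggest_memory_format(): a contiguous (NCHW) loop,
  // or a channels-last (NHWC) loop that can vectorize across channels.
}

} // anonymous namespace

// Binds the implementation to the stub declared in AdaptivePooling.h.
REGISTER_DISPATCH(adaptive_avg_pool2d_kernel, &adaptive_avg_pool2d_kernel_impl);

}} // namespace at::native
```

Keeping the kernel in a cpu/ file lets the build compile it once per SIMD capability level and select the best variant at runtime, which is the usual reason ATen code is refactored onto DispatchStub.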
21 changes: 21 additions & 0 deletions aten/src/ATen/native/AdaptivePooling.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <ATen/ATen.h>
+#include <ATen/native/DispatchStub.h>
+
+namespace at { namespace native {
+
+using adaptive_avg_pooling_fn = void(*)(Tensor& output, const Tensor& input, IntArrayRef output_size);
+using adaptive_avg_pooling_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output);
+DECLARE_DISPATCH(adaptive_avg_pooling_fn, adaptive_avg_pool2d_kernel);
+DECLARE_DISPATCH(adaptive_avg_pooling_backward_fn, adaptive_avg_pool2d_backward_kernel);
+
+static inline int64_t start_index(int64_t a, int64_t b, int64_t c) {
+  return (int64_t)std::floor((float)(a * c) / b);
+}
+
+static inline int64_t end_index(int64_t a, int64_t b, int64_t c) {
+  return (int64_t)std::ceil((float)((a + 1) * c) / b);
+}
+
+}} // namespace at::native
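For reference, these two helpers split an input extent of c elements into b nearly equal output bins: output bin a covers [floor(a*c/b), ceil((a+1)*c/b)), so adjacent windows may overlap by an element when c is not divisible by b, and every input element is covered. A standalone check, with the values worked out by hand:

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>

// Same arithmetic as the helpers above: output bin `a` out of `b` bins
// over an input extent of `c` elements.
static int64_t start_index(int64_t a, int64_t b, int64_t c) {
  return (int64_t)std::floor((float)(a * c) / b);
}
static int64_t end_index(int64_t a, int64_t b, int64_t c) {
  return (int64_t)std::ceil((float)((a + 1) * c) / b);
}

int main() {
  // Pooling a width-5 input down to 3 outputs yields overlapping windows.
  for (int64_t ow = 0; ow < 3; ow++) {
    std::printf("output %lld <- input [%lld, %lld)\n",
                (long long)ow,
                (long long)start_index(ow, 3, 5),
                (long long)end_index(ow, 3, 5));
  }
  // Prints: [0, 2), [1, 4), [3, 5)
}
```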
