[numpy] torch.{all/any} : output dtype is always bool (#47878)
Summary:
BC-breaking note:

This PR changes the behavior of `torch.any` and `torch.all` to always return a `bool` tensor. Previously these functions were only defined on `bool` and `uint8` tensors: when called on a `uint8` tensor they returned a `uint8` tensor, and when called on a `bool` tensor they returned a `bool` tensor. (Note that, as landed, `uint8` inputs still return a `uint8` result for backward compatibility; see the `[all, any : uint8 compatibility]` note in the diff below.)
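A minimal sketch of the user-visible behavior after this change (assuming a build that includes this PR; tensor values are arbitrary):

```python
import torch

# all/any are now defined for every dtype and report a bool result ...
x = torch.rand(3)                 # float input, previously unsupported
print(torch.all(x).dtype)         # torch.bool
print(torch.any(x).dtype)         # torch.bool

# ... except uint8, which keeps a uint8 result for backward compatibility
# (see the [all, any : uint8 compatibility] note in the diff below)
u = torch.tensor([0, 1], dtype=torch.uint8)
print(torch.any(u).dtype)         # torch.uint8
```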

PR summary:

Fixes points 2 and 3 from #44790 (comment)

Also Fixes #48352

Changes
* Output dtype is always `bool` (consistent with NumPy). **BC-breaking** (previously the output dtype matched the input dtype)
* Uses the vectorized reduction path for all dtypes on CPU
* Enables tests for complex inputs (see the sketch after this list)
* Updates the docs for `torch.all` and `torch.any`
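A hedged sketch of the complex-input coverage mentioned above (a complex element counts as True unless both its real and imaginary parts are zero, matching NumPy truthiness):

```python
import torch

z = torch.tensor([1 + 1j, 0 + 0j])
print(torch.all(z))        # tensor(False) -- the second element is zero
print(torch.any(z))        # tensor(True)
print(torch.any(z).dtype)  # torch.bool
```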

TODO
* [x] Update docs
* [x] Benchmark
* [x] Raise issue on XLA

Pull Request resolved: #47878

Reviewed By: albanD

Differential Revision: D25714324

Pulled By: mruberry

fbshipit-source-id: a87345f725297524242d69402dfe53060521ea5d
kshitij12345 authored and facebook-github-bot committed Jan 8, 2021
1 parent a4f30d4 commit 5d45140
Showing 9 changed files with 361 additions and 271 deletions.
108 changes: 96 additions & 12 deletions aten/src/ATen/native/ReduceOps.cpp
@@ -740,6 +740,12 @@ Tensor norm(const Tensor& self, Scalar p) {
return at::native::_norm(self, p);
}

// Note [all, any : uint8 compatibility]:
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// For NumPy compatibility, `all` and `any` return
// a Tensor of dtype `bool`. However, for backward compatibility,
// `uint8` inputs return a Tensor of the same dtype `uint8`.
// Reference: https://github.com/pytorch/pytorch/pull/47878#issuecomment-747108561
inline Tensor & _all(Tensor & result, TensorIterator & iter) {
if (iter.numel() == 0) {
result.fill_(1);
@@ -756,14 +762,40 @@ Tensor all(const Tensor& self) {
TORCH_CHECK(self.layout() == Layout::Strided,
"all only supports strided layout, got: ", self.layout());

Tensor result = at::empty({0}, self.options());
auto iter = make_reduction(
"all", result, self, {}, false, self.scalar_type());
// Refer [all, any : uint8 compatibility]
Tensor result;
ScalarType out_dtype;
if (self.scalar_type() == ScalarType::Byte){
result = at::empty({0}, self.options());
out_dtype = self.scalar_type();
} else {
result = at::empty({0}, self.options().dtype(kBool));
out_dtype = ScalarType::Bool;
}

if (self.is_cuda()) {
// As CUDA supports dynamic type casting, we use this overload of
// `make_reduction`, which doesn't cast the input to the result type (kBool);
// otherwise we use the overload below, which casts the input to kBool (an
// extra operation).
auto iter = make_reduction(
"all", result, self, {}, false, self.scalar_type(), out_dtype);
return _all(result, iter);
}
auto iter =
make_reduction("all", result, self, {}, false, /*out_dtype=*/out_dtype);
return _all(result, iter);
}

Tensor all(const Tensor& self, int64_t dim, bool keepdim) {
Tensor result = at::empty({0}, self.options());
// Refer [all, any : uint8 compatibility]
Tensor result;
if (self.scalar_type() == ScalarType::Byte){
result = at::empty({0}, self.options());
} else {
result = at::empty({0}, self.options().dtype(kBool));
}

return at::native::all_out(result, self, dim, keepdim);
}

@@ -772,13 +804,26 @@ Tensor &all_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) {
"all only supports CPU AND CUDA device type, got: ", self.device().type());
TORCH_CHECK(self.layout() == Layout::Strided,
"all only supports strided layout, got: ", self.layout());
// Refer [all, any : uint8 compatibility]
TORCH_CHECK(result.scalar_type() == ScalarType::Bool || result.scalar_type() == ScalarType::Byte,
"all only supports bool tensor for result, got: ", result.scalar_type());

auto out_dtype = result.scalar_type();
dim = maybe_wrap_dim(dim, self.dim());
if (_dimreduce_return_trivial(result, self, 1, dim, keepdim)) {
return result;
} else {
auto iter = make_reduction(
"all", result, self, dim, keepdim, self.scalar_type());
if (self.is_cuda()) {
// As CUDA supports dynamic type casting, we use this overload of
// `make_reduction`, which doesn't cast the input to the result type (kBool);
// otherwise we use the overload below, which casts the input to kBool (an
// extra operation).
auto iter = make_reduction(
"all", result, self, dim, keepdim, self.scalar_type(), out_dtype);
return _all(result, iter);
}
auto iter =
make_reduction("all", result, self, dim, keepdim, /*out_dtype=*/out_dtype);
return _all(result, iter);
}
}
@@ -798,15 +843,41 @@ Tensor any(const Tensor& self) {
"any only supports CPU AND CUDA device type, got: ", self.device().type());
TORCH_CHECK(self.layout() == Layout::Strided || self.layout() == Layout::Sparse,
"any only supports strided AND sparse layout, got: ", self.layout());

// Refer [all, any : uint8 compatibility]
Tensor result;
ScalarType out_dtype;
if (self.scalar_type() == ScalarType::Byte){
result = at::empty({0}, self.options());
out_dtype = self.scalar_type();
} else {
result = at::empty({0}, self.options().dtype(kBool));
out_dtype = ScalarType::Bool;
}

Tensor result = at::empty({0}, self.options());
auto iter = make_reduction(
"any", result, self, {}, false, self.scalar_type());
if (self.is_cuda()) {
// As CUDA supports dynamic type casting, we use this overload of
// `make_reduction`, which doesn't cast the input to the result type (kBool);
// otherwise we use the overload below, which casts the input to kBool (an
// extra operation).
auto iter = make_reduction(
"any", result, self, {}, false, self.scalar_type(), out_dtype);
return _any(result, iter);
}
auto iter =
make_reduction("any", result, self, {}, false, /*out_dtype=*/out_dtype);
return _any(result, iter);
}

Tensor any(const Tensor& self, int64_t dim, bool keepdim) {
Tensor result = at::empty({0}, self.options());
// Refer [all, any : uint8 compatibility]
Tensor result;
if (self.scalar_type() == ScalarType::Byte){
result = at::empty({0}, self.options());
} else {
result = at::empty({0}, self.options().dtype(kBool));
}

return at::native::any_out(result, self, dim, keepdim);
}

@@ -815,13 +886,26 @@ Tensor &any_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) {
"any only supports CPU AND CUDA device type, got: ", self.device().type());
TORCH_CHECK(self.layout() == Layout::Strided,
"any only supports strided layout, got: ", self.layout());
// Refer [all, any : uint8 compatibility]
TORCH_CHECK(result.scalar_type() == ScalarType::Bool || result.scalar_type() == ScalarType::Byte,
"any only supports bool tensor for result, got: ", result.scalar_type());

auto out_dtype = result.scalar_type();
dim = maybe_wrap_dim(dim, self.dim());
if (_dimreduce_return_trivial(result, self, 0, dim, keepdim)) {
return result;
} else {
auto iter = make_reduction(
"any", result, self, dim, keepdim, self.scalar_type());
if (self.is_cuda()) {
// As CUDA supports dynamic type casting, we use this overload of
// `make_reduction`, which doesn't cast the input to the result type (kBool);
// otherwise we use the overload below, which casts the input to kBool (an
// extra operation).
auto iter = make_reduction(
"any", result, self, dim, keepdim, self.scalar_type(), out_dtype);
return _any(result, iter);
}
auto iter =
make_reduction("any", result, self, dim, keepdim, /*out_dtype=*/out_dtype);
return _any(result, iter);
}
}
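For reference, a minimal Python sketch (not part of this diff) of how the `out=` dtype check in `all_out`/`any_out` above surfaces to callers; the printed error text is approximate:

```python
import torch

x = torch.rand(2, 3)

# bool `out` tensors are accepted
out_bool = torch.empty(0, dtype=torch.bool)
torch.any(x, dim=0, out=out_bool)

# uint8 `out` tensors are still accepted for backward compatibility
u = torch.randint(0, 2, (2, 3), dtype=torch.uint8)
out_u8 = torch.empty(0, dtype=torch.uint8)
torch.any(u, dim=0, out=out_u8)

# any other result dtype trips the TORCH_CHECK above
out_i64 = torch.empty(0, dtype=torch.int64)
try:
    torch.any(x, dim=0, out=out_i64)
except RuntimeError as err:
    print(err)  # "any only supports bool or uint8 tensor for result, got: Long" (approximate)
```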
50 changes: 0 additions & 50 deletions aten/src/ATen/native/SharedReduceOps.h
@@ -386,56 +386,6 @@ struct NanSumOps {
#endif
};

template <typename acc_t>
struct AndOps {
inline C10_DEVICE acc_t reduce(acc_t a, acc_t b, int64_t /*idx*/) const {
return static_cast<bool>(a) && static_cast<bool>(b);
}

inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
return static_cast<bool>(a) && static_cast<bool>(b);
}

inline C10_DEVICE acc_t project(acc_t a) const {
return a;
}

static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
return acc;
}

#if defined(__CUDACC__) || defined(__HIPCC__)
inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const {
return WARP_SHFL_DOWN(data, offset);
}
#endif
};

template <typename acc_t>
struct OrOps {
inline C10_DEVICE acc_t reduce(acc_t a, acc_t b, int64_t /*idx*/) const {
return static_cast<bool>(a) || static_cast<bool>(b);
}

inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
return static_cast<bool>(a) || static_cast<bool>(b);
}

inline C10_DEVICE acc_t project(acc_t a) const {
return a;
}

static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) {
return acc;
}

#if defined(__CUDACC__) || defined(__HIPCC__)
inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const {
return WARP_SHFL_DOWN(data, offset);
}
#endif
};

namespace detail {

template <typename scalar_t>
50 changes: 35 additions & 15 deletions aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
@@ -256,11 +256,25 @@ static void norm_kernel_tensor_iterator_impl(
}

static void and_kernel_impl(TensorIterator& iter) {
if (c10::isIntegralType(iter.dtype(), /*includeBool=*/true)) {
if (iter.dtype() == ScalarType::Byte) {
// Refer [all, any : uint8 compatibility]
binary_kernel_reduce_vec(
iter,
[=](uint8_t a, uint8_t b) -> uint8_t { return (a && b) ? 1 : 0; },
[=](Vec256<uint8_t> a, Vec256<uint8_t> b) {
Vec256<uint8_t> c = Vec256<uint8_t>();

for (decltype(c.size()) i = 0; i != Vec256<uint8_t>::size(); i++) {
c[i] = (a[i] && b[i]) ? 1 : 0;
}
return c;
},
/*ident=*/true);
} else {
binary_kernel_reduce_vec(
iter,
[=](bool a, bool b) -> bool { return a && b; },
[=](Vec256<bool> a, Vec256<bool> b) {
// Adding the implementation here instead of in vec256_base to avoid
// return value inconsistency. Other comparison operators in
// vec256_base return -1/0 (all bit 1 / all bit 0) as true/false to
@@ -271,39 +285,45 @@ static void and_kernel_impl(TensorIterator& iter) {
//
// In this method, users would expect, e.g., all(), to return 1/0 as
// true/false.
Vec256<uint8_t> c = Vec256<uint8_t>();
for (int i = 0; i != Vec256<uint8_t>::size(); i++) {
c[i] = (a[i] && b[i]) ? 1 : 0;
Vec256<bool> c = Vec256<bool>();

for (decltype(c.size()) i = 0; i != Vec256<bool>::size(); i++) {
c[i] = a[i] && b[i];
}
return c;
},
/*ident=*/true);
} else {
AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.dtype(), "and_kernel", [&]() {
binary_kernel_reduce(
iter, AndOps<scalar_t>(), static_cast<scalar_t>(true));
});
}
}

static void or_kernel_impl(TensorIterator& iter) {
if (c10::isIntegralType(iter.dtype(), /*includeBool=*/true)) {
if (iter.dtype() == ScalarType::Byte) {
// Refer [all, any : uint8 compatibility]
binary_kernel_reduce_vec(
iter,
[=](uint8_t a, uint8_t b) -> uint8_t { return (a || b) ? 1 : 0; },
[=](Vec256<uint8_t> a, Vec256<uint8_t> b) {
Vec256<uint8_t> c = Vec256<uint8_t>();
for (int i = 0; i != Vec256<uint8_t>::size(); i++) {

for (decltype(c.size()) i = 0; i != Vec256<uint8_t>::size(); i++) {
c[i] = (a[i] || b[i]) ? 1 : 0;
}
return c;
},
/*ident=*/false);
} else {
AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.dtype(), "or_kernel", [&]() {
binary_kernel_reduce(
iter, OrOps<scalar_t>(), static_cast<scalar_t>(false));
});
binary_kernel_reduce_vec(
iter,
[=](bool a, bool b) -> bool { return a || b; },
[=](Vec256<bool> a, Vec256<bool> b) {
Vec256<bool> c = Vec256<bool>();

for (decltype(c.size()) i = 0; i != Vec256<bool>::size(); i++) {
c[i] = a[i] || b[i];
}
return c;
},
/*ident=*/false);
}
}
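As a quick illustration of the 1/0 convention described in the comments above (a sketch, not part of the diff): for `uint8` inputs the reduction reports 1/0 rather than the all-bits-set mask that `Vec256` comparison operators use internally.

```python
import torch

u = torch.tensor([5, 3, 200], dtype=torch.uint8)   # all elements nonzero
print(torch.all(u))       # tensor(1, dtype=torch.uint8), not 255
print(torch.any(u))       # tensor(1, dtype=torch.uint8)
print(torch.all(u ^ u))   # tensor(0, dtype=torch.uint8) -- every element is zero
```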

35 changes: 19 additions & 16 deletions aten/src/ATen/native/cuda/ReduceLogicKernel.cu
@@ -3,30 +3,33 @@
#include <ATen/native/DispatchStub.h>
#include <ATen/native/SharedReduceOps.h>
#include <ATen/native/ReduceOps.h>
#include <ATen/Dispatch.h>


namespace at { namespace native {

void and_kernel_cuda(TensorIterator& iter) {
AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.dtype(), "and_kernel", [&]() {
gpu_reduce_kernel<scalar_t, scalar_t>(
iter,
func_wrapper<scalar_t>([] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
return static_cast<scalar_t>(static_cast<bool>(a) && static_cast<bool>(b));
}),
static_cast<scalar_t>(true));
});
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
kHalf, kBFloat16, kBool, iter.common_dtype(), "and_cuda", [&]() {
gpu_reduce_kernel<scalar_t, bool>(
iter,
func_wrapper<bool>([] GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
return (static_cast<bool>(a) && static_cast<bool>(b));
}),
true);
});
}

void or_kernel_cuda(TensorIterator& iter) {
AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.dtype(), "or_kernel", [&]() {
gpu_reduce_kernel<scalar_t, scalar_t>(
iter,
func_wrapper<scalar_t>([] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
return static_cast<scalar_t>(static_cast<bool>(a) || static_cast<bool>(b));
}),
static_cast<scalar_t>(false));
});
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
kHalf, kBFloat16, kBool, iter.common_dtype(), "or_cuda", [&]() {
gpu_reduce_kernel<scalar_t, bool>(
iter,
func_wrapper<bool>([] GPU_LAMBDA(scalar_t a, scalar_t b) -> bool {
return (static_cast<bool>(a) || static_cast<bool>(b));
}),
false);
});
}

REGISTER_DISPATCH(and_stub, &and_kernel_cuda);
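A hedged usage sketch of the broader CUDA dtype coverage enabled by the dispatch change above (assumes a CUDA device is available; not part of the diff):

```python
import torch

if torch.cuda.is_available():
    h = torch.rand(4, device="cuda", dtype=torch.float16)
    print(torch.all(h).dtype)   # torch.bool -- half inputs now reduce directly to bool
    z = torch.zeros(4, device="cuda", dtype=torch.complex64)
    print(torch.any(z))         # tensor(False, device='cuda:0') -- complex is supported too
```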
9 changes: 2 additions & 7 deletions docs/source/tensors.rst
@@ -214,6 +214,8 @@ view of a storage and defines numeric operations on it.
.. automethod:: arctan_
.. automethod:: atan2
.. automethod:: atan2_
.. automethod:: all
.. automethod:: any
.. automethod:: backward
:noindex:
.. automethod:: baddbmm
@@ -648,10 +650,3 @@ view of a storage and defines numeric operations on it.
.. automethod:: xlogy
.. automethod:: xlogy_
.. automethod:: zero_

.. class:: BoolTensor()

The following methods are unique to :class:`torch.BoolTensor`.

.. automethod:: all
.. automethod:: any
2 changes: 2 additions & 0 deletions docs/source/torch.rst
@@ -364,6 +364,8 @@ Reduction Ops
argmin
amax
amin
all
any
max
min
dist
