CUDA BFloat16 support of clamp, remainder, lshift, rshift #45247

Closed · wants to merge 8 commits
aten/src/ATen/native/cuda/BinaryRemainderKernel.cu (2 changes: 1 addition & 1 deletion)
@@ -21,7 +21,7 @@ void remainder_kernel_cuda(TensorIterator& iter) {
});
});
} else {
- AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "remainder_cuda", [&]() {
+ AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "remainder_cuda", [&]() {
gpu_kernel_with_scalars(iter,
[]GPU_LAMBDA(scalar_t a, scalar_t b) __ubsan_ignore_float_divide_by_zero__ -> scalar_t {
auto mod = ::fmod(a, b);
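For context, a minimal usage sketch of what this dispatch change enables. Illustrative only; it assumes a CUDA device and a build that contains this PR.

    import torch

    # Sketch only (assumes a CUDA device and a build containing this change):
    # torch.remainder on bfloat16 CUDA tensors now dispatches through the
    # FLOATING_TYPES_AND2(kHalf, kBFloat16) branch above.
    a = torch.tensor([5.0, -5.0, 7.5], dtype=torch.bfloat16, device="cuda")
    b = torch.tensor([3.0, 3.0, -2.0], dtype=torch.bfloat16, device="cuda")
    print(torch.remainder(a, b))  # takes the sign of the divisor, like Python's %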
aten/src/ATen/native/cuda/BinaryShiftOpsKernels.cu (10 changes: 6 additions & 4 deletions)
@@ -13,8 +13,9 @@ namespace at { namespace native {
void lshift_kernel_cuda(TensorIterator& iter) {
if (iter.dtype() == ScalarType::Float ||
iter.dtype() == ScalarType::Double ||
- iter.dtype() == ScalarType::Half) {
- AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::Half, iter.dtype(), "lshift_cuda", [&]() {
+ iter.dtype() == ScalarType::Half ||
+ iter.dtype() == ScalarType::BFloat16) {
+ AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "lshift_cuda", [&]() {
Contributor:
how is this being tested?

@zasdfgbnm (Collaborator, Author), Sep 25, 2020:
It is tested by

    ('__lshift__', '',
        lambda t, d: torch.pow(2, torch.arange(1, 5).to(dtype=_convert_t(t, d), device=d)),
        lambda t, d: [2],
        1e-3, 1e-5, 1e-3, _signed_types, _cpu_types, False),
    ('__rshift__', '',
        lambda t, d: torch.pow(2, torch.arange(3, 7).to(dtype=_convert_t(t, d), device=d)),
        lambda t, d: [2],
        1e-3, 1e-5, 1e-3, _signed_types, _cpu_types, False),

where _signed_types is modified in https://github.com/pytorch/pytorch/pull/45247/files#diff-9996665f82f52030836eb8657057cfadR19601-R19604 to add bfloat16.
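A rough standalone equivalent of what that entry exercises, with the harness helpers (_convert_t, _signed_types) replaced by an explicit bfloat16 CUDA tensor. Sketch only, assuming a CUDA device and a build from this PR's era, where __lshift__ on floating tensors multiplies by 2**shift:

    import torch

    # Illustrative re-creation of the __lshift__ entry above, outside the test harness.
    # Assumes a CUDA device; at the time of this PR, __lshift__ on floating tensors
    # multiplies by 2**shift rather than requiring an integral dtype.
    x = torch.pow(2, torch.arange(1, 5)).to(dtype=torch.bfloat16, device="cuda")
    shifted = x << 2  # dispatches to lshift_kernel_cuda
    assert torch.allclose(shifted.float(), x.float() * 4, rtol=1e-2, atol=1e-3)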

gpu_kernel_with_scalars(
iter,
[]GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
@@ -34,8 +35,9 @@ void lshift_kernel_cuda(TensorIterator& iter) {
void rshift_kernel_cuda(TensorIterator& iter) {
if (iter.dtype() == ScalarType::Float ||
iter.dtype() == ScalarType::Double ||
- iter.dtype() == ScalarType::Half) {
- AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::Half, iter.dtype(), "rshift_cuda", [&]() {
+ iter.dtype() == ScalarType::Half ||
+ iter.dtype() == ScalarType::BFloat16) {
+ AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, iter.dtype(), "rshift_cuda", [&]() {
gpu_kernel_with_scalars(
iter,
[]GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t {
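The rshift path mirrors this; for floating dtypes the kernel is expected to behave like division by 2**shift (the lambda body is collapsed in this view, so treat that as an assumption). A quick sanity-check sketch under the same assumptions as above:

    import torch

    # Sketch only (assumes a CUDA device and a build containing this change);
    # for floating dtypes, __rshift__ should act like division by 2**shift.
    y = torch.pow(2, torch.arange(3, 7)).to(dtype=torch.bfloat16, device="cuda")
    assert torch.allclose((y >> 2).float(), y.float() / 4, rtol=1e-2, atol=1e-3)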
aten/src/ATen/native/cuda/UnaryOpsKernel.cu (6 changes: 3 additions & 3 deletions)
@@ -155,7 +155,7 @@ void erfinv_kernel_cuda(TensorIterator& iter) {
}

void clamp_kernel_cuda(TensorIterator& iter, Scalar min_value, Scalar max_value) {
- AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.dtype(), "clamp_cuda", [&]() {
+ AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "clamp_cuda", [&]() {
auto lower = min_value.to<scalar_t>();
auto upper = max_value.to<scalar_t>();
gpu_kernel(iter, [=]GPU_LAMBDA(scalar_t v) -> scalar_t {
@@ -170,7 +170,7 @@ void clamp_kernel_cuda(TensorIterator& iter, Scalar min_value, Scalar max_value)
}

void clamp_min_kernel_cuda(TensorIterator& iter, Scalar min_value) {
- AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.dtype(), "clamp_min_cuda", [&]() {
+ AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "clamp_min_cuda", [&]() {
auto lower = min_value.to<scalar_t>();
gpu_kernel(iter, [=]GPU_LAMBDA(scalar_t v) -> scalar_t {
// Propagate nan, which doesn't propagate automatically for ROCm
@@ -184,7 +184,7 @@ void clamp_min_kernel_cuda(TensorIterator& iter, Scalar min_value) {
}

void clamp_max_kernel_cuda(TensorIterator& iter, Scalar max_value) {
- AT_DISPATCH_ALL_TYPES_AND(kHalf, iter.dtype(), "clamp_max_cuda", [&]() {
+ AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, iter.dtype(), "clamp_max_cuda", [&]() {
auto upper = max_value.to<scalar_t>();
gpu_kernel(iter, [=]GPU_LAMBDA(scalar_t v) -> scalar_t {
// Propagate nan, which doesn't propagate automatically for ROCm
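A minimal usage sketch of the clamp family on bfloat16 CUDA tensors that these three dispatch changes enable; illustrative only, assuming a CUDA device and a build containing this PR:

    import torch

    # Sketch only (assumes a CUDA device and a build containing this change):
    # clamp, clamp_min, and clamp_max now accept bfloat16 on CUDA.
    v = torch.tensor([-3.0, 0.5, 4.0, float("nan")], dtype=torch.bfloat16, device="cuda")
    print(torch.clamp(v, min=-1, max=1))  # [-1.0, 0.5, 1.0, nan]
    print(torch.clamp_min(v, 0))          # [0.0, 0.5, 4.0, nan]
    print(torch.clamp_max(v, 2))          # [-3.0, 0.5, 2.0, nan]; NaN propagates, per the kernel comments above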
test/test_torch.py (16 changes: 9 additions & 7 deletions)
@@ -19947,7 +19947,7 @@ def test_movedim_view(self, device):
_float_types2 = _float_types + [torch.bfloat16] if TEST_WITH_ROCM else _float_types

_signed_types = [
- torch.half, torch.float, torch.double,
+ torch.half, torch.bfloat16, torch.float, torch.double,
torch.int8, torch.short, torch.int, torch.long
]
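Adding torch.bfloat16 to _signed_types is what pulls the new dtype into the clamp, __lshift__, __rshift__, and remainder entries below; it is also why several of those entries loosen their relative tolerance to 1e-2, since bfloat16 keeps only about 8 bits of significand precision (roughly two to three significant decimal digits). A small illustrative check, not part of the PR:

    import torch

    # Illustration only: round-tripping a float32 value through bfloat16 can
    # change it by up to ~0.4% relative error (2**-8), which tolerances like
    # 1e-5 would reject.
    x = torch.tensor(3.1415926, dtype=torch.float32)
    x_bf16 = x.to(torch.bfloat16).to(torch.float32)
    print(((x - x_bf16).abs() / x.abs()).item())  # on the order of 1e-4 to 1e-3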

@@ -20189,8 +20189,10 @@ def inner(self, device, dtype):
('chunk', 'neg_dim', _medium_2d, lambda t, d: [4, -2], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False),
('clamp', 'neg', _medium_2d, lambda t, d: [-1, 5], 1e-5, 1e-2, 1e-5, _signed_types, [torch.bfloat16]),
('clamp', 'pos', _medium_2d, lambda t, d: [1, 5], 1e-5, 1e-2, 1e-5, _unsigned_types, [torch.bfloat16]),
- ('clamp_min', '', _medium_2d, lambda t, d: [1], 1e-2, 1e-2, 1e-5, _types, [torch.bfloat16]),
- ('clamp_max', '', _medium_2d, lambda t, d: [1], 1e-2, 1e-2, 1e-5, _types, [torch.bfloat16]),
+ ('clamp_min', '', _medium_2d, lambda t, d: [1], 1e-2, 1e-2, 1e-5,
+     torch.testing.get_all_dtypes(include_complex=False, include_bool=False, include_bfloat16=True), [torch.bfloat16]),
+ ('clamp_max', '', _medium_2d, lambda t, d: [1], 1e-2, 1e-2, 1e-5,
+     torch.testing.get_all_dtypes(include_complex=False, include_bool=False, include_bfloat16=True), [torch.bfloat16]),
('clone', '', _medium_2d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False),
('contiguous', '', _medium_2d, lambda t, d: [], 1e-5, 1e-5, 1e-5, _types, _cpu_types, False),
('conj', '', _small_3d, lambda t, d: [], 1e-5, 0, 1e-5, _types_no_half, [torch.bfloat16], False),
@@ -20275,14 +20277,14 @@ def inner(self, device, dtype):
1e-5, 1e-5, 1e-5, _float_types_no_half),
('mvlgamma', '2d_p=2', lambda t, d: _small_2d(t, d).clamp(0.6, 10), lambda t, d: [2],
1e-5, 1e-5, 1e-5, _float_types_no_half),
- ('remainder', 'value', _small_3d, lambda t, d: [3], 1e-1, 1e-5, 1e-5, _signed_types),
- ('remainder', 'negative_value', _small_3d, lambda t, d: [-3], 1e-1, 1e-5, 1e-5, _signed_types),
+ ('remainder', 'value', _small_3d, lambda t, d: [3], 1e-1, 1e-2, 1e-5, _signed_types),
+ ('remainder', 'negative_value', _small_3d, lambda t, d: [-3], 1e-1, 1e-2, 1e-5, _signed_types),
('remainder', 'tensor', _small_3d,
lambda t, d: [_small_3d(t, d, has_zeros=False)],
- 1e-1, 1e-5, 1e-5, _signed_types),
+ 1e-1, 1e-2, 1e-5, _signed_types),
('remainder', 'negative_tensor', _small_3d,
lambda t, d: [0 - _small_3d(t, d, has_zeros=False)],
- 1e-1, 1e-5, 1e-5, _signed_types),
+ 1e-1, 1e-2, 1e-5, _signed_types),
Comment on lines +20280 to +20287
Contributor:
None of these is testing BFloat16 because

_signed_types = [
    torch.half, torch.float, torch.double,
    torch.int8, torch.short, torch.int, torch.long
]

('std', '', _small_3d, lambda t, d: [], 1e-3, 1e-5, 1e-5, _float_types, _cpu_types, False),
('std', 'dim', _small_3d, lambda t, d: [1], 1e-3, 1e-5, 1e-5, _float_types, _cpu_types, False),
('std', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-3, 1e-5, 1e-5, _float_types, _cpu_types, False),