
Conversation

@xcnick (Contributor) commented Feb 13, 2020

#24697
@VitalyFedyunin
@glaringlee

Test script:

import timeit

setup_ones = """
import torch
a = torch.ones(({n}, {n}), dtype={dtype})
b = torch.ones(({n}, {n}), dtype={dtype})
"""

for n, t in [(1000, 10000), (2000, 10000)]:
  for dtype in ('torch.bool', 'torch.int', 'torch.long', 'torch.bfloat16', 'torch.float', 'torch.double'):
  #for dtype in ('torch.bool', 'torch.int', 'torch.long', 'torch.float', 'torch.double'):
    print('torch.ones(({n}, {n})) equal for {t} times {dtype}'.format(n=n, t=t, dtype=dtype))
    print(timeit.timeit(stmt='torch.equal(a, b)', setup=setup_ones.format(n=n, dtype=dtype), number=t))

setup_rand = """
import torch
a = torch.rand(({n}, {n}), dtype={dtype})
b = a.clone()
"""
for n, t in [(1000, 10000), (2000, 10000)]:
  for dtype in ('torch.float', 'torch.double'):
    print('torch.rand(({n}, {n})) for {t} times {dtype}'.format(n=n, t=t, dtype=dtype))
    print(timeit.timeit(stmt='torch.equal(a, b)', setup=setup_rand.format(n=n, dtype=dtype), number=t))

setup_non_contiguous = """
import torch
a = torch.rand(({n}, {n}), dtype={dtype})
a2 = a[:, 500:]
a3 = a2.clone()
torch.equal(a2, a3)
"""
for n, t in [(1000, 10000), (2000, 10000)]:
  for dtype in ('torch.float', 'torch.double'):
    print('non_contiguous torch.rand(({n}, {n})) for {t} times {dtype}'.format(n=n, t=t, dtype=dtype))
    print(timeit.timeit(stmt='torch.equal(a2, a3)', setup=setup_non_contiguous.format(n=n, dtype=dtype), number=t))

setup_not_equal = """
import torch
a = torch.rand(({n}, {n}), dtype={dtype})
b = torch.rand(({n}, {n}), dtype={dtype})
torch.equal(a, b)
"""
for n, t in [(1000, 10000), (2000, 10000)]:
  for dtype in ('torch.float', 'torch.double'):
    print('not equal torch.rand(({n}, {n})) for {t} times {dtype}'.format(n=n, t=t, dtype=dtype))
    print(timeit.timeit(stmt='torch.equal(a, b)', setup=setup_not_equal.format(n=n, dtype=dtype), number=t))

TH (before):

torch.ones((1000, 1000)) equal for 10000 times torch.bool
1.8391206220258027
torch.ones((1000, 1000)) equal for 10000 times torch.int
1.8877864250680432
torch.ones((1000, 1000)) equal for 10000 times torch.long
1.938108820002526
torch.ones((1000, 1000)) equal for 10000 times torch.bfloat16
3.184849138953723
torch.ones((1000, 1000)) equal for 10000 times torch.float
1.8825413499725983
torch.ones((1000, 1000)) equal for 10000 times torch.double
2.7266416549682617
torch.ones((2000, 2000)) equal for 10000 times torch.bool
7.227149627986364
torch.ones((2000, 2000)) equal for 10000 times torch.int
7.76215292501729
torch.ones((2000, 2000)) equal for 10000 times torch.long
9.631909006042406
torch.ones((2000, 2000)) equal for 10000 times torch.bfloat16
8.097328286035918
torch.ones((2000, 2000)) equal for 10000 times torch.float
5.5739822529722005
torch.ones((2000, 2000)) equal for 10000 times torch.double
8.444009944912978
torch.rand((1000, 1000)) for 10000 times torch.float
1.168096570065245
torch.rand((1000, 1000)) for 10000 times torch.double
1.6577326939441264
torch.rand((2000, 2000)) for 10000 times torch.float
5.49395391496364
torch.rand((2000, 2000)) for 10000 times torch.double
8.507486199960113
non_contiguous torch.rand((1000, 1000)) for 10000 times torch.float
6.074504268006422
non_contiguous torch.rand((1000, 1000)) for 10000 times torch.double
6.1426916810451075
non_contiguous torch.rand((2000, 2000)) for 10000 times torch.float
37.501055537955835
non_contiguous torch.rand((2000, 2000)) for 10000 times torch.double
44.6880351039581
not equal torch.rand((1000, 1000)) for 10000 times torch.float
0.029356416082009673
not equal torch.rand((1000, 1000)) for 10000 times torch.double
0.025421109050512314
not equal torch.rand((2000, 2000)) for 10000 times torch.float
0.026333761983551085
not equal torch.rand((2000, 2000)) for 10000 times torch.double
0.02748022007290274

ATen (after):

torch.ones((1000, 1000)) equal for 10000 times torch.bool
0.7961567062884569
torch.ones((1000, 1000)) equal for 10000 times torch.int
0.49172434909269214
torch.ones((1000, 1000)) equal for 10000 times torch.long
0.9459248608909547
torch.ones((1000, 1000)) equal for 10000 times torch.bfloat16
2.0877483217045665
torch.ones((1000, 1000)) equal for 10000 times torch.float
0.606857153121382
torch.ones((1000, 1000)) equal for 10000 times torch.double
1.1388208279386163
torch.ones((2000, 2000)) equal for 10000 times torch.bool
2.0329296849668026
torch.ones((2000, 2000)) equal for 10000 times torch.int
3.534358019940555
torch.ones((2000, 2000)) equal for 10000 times torch.long
8.19841272290796
torch.ones((2000, 2000)) equal for 10000 times torch.bfloat16
6.595649406313896
torch.ones((2000, 2000)) equal for 10000 times torch.float
4.193911510054022
torch.ones((2000, 2000)) equal for 10000 times torch.double
7.931309659034014
torch.rand((1000, 1000)) for 10000 times torch.float
0.8877940969541669
torch.rand((1000, 1000)) for 10000 times torch.double
1.4142901846207678
torch.rand((2000, 2000)) for 10000 times torch.float
4.010025603231043
torch.rand((2000, 2000)) for 10000 times torch.double
8.126411964651197
non_contiguous torch.rand((1000, 1000)) for 10000 times torch.float
0.602473056409508
non_contiguous torch.rand((1000, 1000)) for 10000 times torch.double
0.6784545010887086
non_contiguous torch.rand((2000, 2000)) for 10000 times torch.float
3.0991827426478267
non_contiguous torch.rand((2000, 2000)) for 10000 times torch.double
5.719010795000941
not equal torch.rand((1000, 1000)) for 10000 times torch.float
0.046060710679739714
not equal torch.rand((1000, 1000)) for 10000 times torch.double
0.036034489050507545
not equal torch.rand((2000, 2000)) for 10000 times torch.float
0.03686975734308362
not equal torch.rand((2000, 2000)) for 10000 times torch.double
0.04189508780837059

@xcnick removed the request for review from VitalyFedyunin February 13, 2020 09:17
@xcnick closed this Feb 13, 2020
dr-ci bot commented Feb 13, 2020

💊 CI failures summary and remediations

As of commit f72b7f4 (more details on the Dr. CI page):


💚 💚 Looks good so far! There are no failures yet. 💚 💚



@xcnick reopened this Feb 13, 2020
Collaborator

Should we also support Half here?

Contributor

No

Contributor

Please use TensorIterator instead. This is a perfect case of reduction.

@VitalyFedyunin (Contributor) left a comment

CPU_tensor_apply2 is going away eventually; please use TensorIterator instead.

@xcnick (Contributor, Author) commented Feb 14, 2020

Thanks for the quick review.
Since TensorIterator can handle both contiguous and non-contiguous tensors, the code can be simplified.

Collaborator

Is it possible to move this part to aten/src/ATen/native/cpu/BinaryOpsKernel.cpp? There you can take advantage of SIMD instructions, and the code structure is cleaner (especially after the CUDA version is also migrated). You can look at the other binary operators as examples.
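
For reference, the rough shape of such a kernel is sketched below. This is only an illustration, not this PR's code: the name eq_like_kernel is hypothetical, and the real kernels in BinaryOpsKernel.cpp dispatch over more dtypes and also pass a vectorized lambda (which is where the SIMD benefit comes from).

// Hedged sketch only: roughly how a CPU comparison kernel in
// aten/src/ATen/native/cpu/BinaryOpsKernel.cpp is structured. That file
// already includes the TensorIterator / Dispatch / cpu_kernel helpers used here.
static void eq_like_kernel(TensorIterator& iter) {
  AT_DISPATCH_ALL_TYPES_AND2(kBool, kBFloat16, iter.input_dtype(), "eq_like_cpu", [&]() {
    // The iterator is built with a bool output; cpu_kernel applies the scalar
    // lambda to each pair of input elements, handling strides and threading.
    cpu_kernel(iter, [](scalar_t a, scalar_t b) -> bool { return a == b; });
  });
}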

Contributor Author

Thanks for the quick review.
I moved this to BinaryOpsKernel.

@xcnick requested a review from xuhdev February 19, 2020 09:47
Collaborator

You shouldn't specify a return type here (it's causing build errors). The capture can be more specific: [&equal].

Contributor Author

It's odd that the unit test fails when no return type is specified here, but passes when one is given.

Collaborator

bfloat16?

@VitalyFedyunin (Contributor) left a comment

Please add bfloat16 support and add tests to cover it. Everything else looks good.

@yf225 added the "module: porting" (Issues related to porting TH/THNN legacy to ATen native) and "triaged" (This issue has been looked at by a team member, and triaged and prioritized into an appropriate module) labels Feb 19, 2020
@VitalyFedyunin (Contributor)

Opting in @glaringlee for review and help with TensorIterator

@glaringlee (Contributor) commented Apr 23, 2020

@xcnick
Create a result tensor that is reduced over all dimensions; make_reduction() can be used to do this.
Then there are two ways to implement this feature.
One way is to do something similar to what we did in:
https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp#L112
But since you have two inputs, you need to make some modifications to handle them. Because your output tensor is reduced to a single item (while still having the same number of dimensions as the inputs), the output pointer will always point to the same item throughout the iteration over the inputs.

The other way is to reuse foreach_reduced_elt(func(subiter)) in TensorIterator.
You can take a look at how it is used in binary_kernel_reduce() here:
https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cpu/Reduce.h#L181
https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp#L112

@glaringlee (Contributor) left a comment

@xcnick
I commented on this PR above; let me know if you need more info on how to do tensor reduction.

@xcnick (Contributor, Author) commented May 7, 2020

@glaringlee

Sorry for the late response.

I would like to use the first approach, so I added TensorIterator TensorIterator::reduce_op_2(Tensor& out, const Tensor& a, const Tensor& b) in TensorIterator,

added struct EqualOps in SharedReduceOps,

then added bool equal(const Tensor & self, const Tensor & other) and equal_stub in ReduceOps,

and added static void equal_kernel_impl(TensorIterator& iter) in ReduceOpsKernel, which calls binary_kernel_reduce, just like:
https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp#L112

Am I doing it right?

@glaringlee (Contributor)

@xcnick
Yes, you can create an EqualOps and call binary_kernel_reduce.
But as I said, we currently support only one input, and that is not because of the operator: binary_kernel_reduce() calls foreach_reduced_elt, which supports only one input;
see here:

AT_ASSERT(ninputs() == 1);

You would need some tweaks in that function to support more inputs.

@glaringlee (Contributor) commented May 20, 2020

Hey @xcnick, if you think changing the reduction code is too much, you can do the same thing as this PR:
#36483

at::eq supports both CPU and CUDA,
so you can do the same in your CPU kernel; see this function:
https://github.com/pytorch/pytorch/pull/36483/files#diff-94c02a8d9bc1dd0d489763ef096853cbR27

Then rename your equal() function to cpu_equal (change it in all related places), and you are done :)
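
Roughly, that suggestion amounts to the sketch below. This is illustrative only (the name cpu_equal_sketch is hypothetical, and this is not the code that ended up in this PR or in #36483); it assumes the standard at::eq / Tensor::all() / item<bool>() behavior.

#include <ATen/ATen.h>

// Hedged sketch of the "reuse at::eq" idea: materialize the element-wise
// comparison as a bool tensor, then reduce it to a single bool.
bool cpu_equal_sketch(const at::Tensor& self, const at::Tensor& other) {
  if (!self.is_same_size(other)) {
    return false;  // tensors with different shapes can never be equal
  }
  // Note: no early exit here; every element is compared even if the first
  // pair already differs, which is the drawback discussed later in this thread.
  return at::eq(self, other).all().item<bool>();
}

Since at::eq covers both CPU and CUDA, this keeps the two backends symmetric, at the cost of always scanning every element.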

@xcnick (Contributor, Author) commented May 21, 2020

@glaringlee

Hi, I modified the code as you suggested. The benchmarking results have been updated.

@VitalyFedyunin (Contributor)

ping @glaringlee

@glaringlee (Contributor)

@xcnick @VitalyFedyunin
Ah, very sorry, I thought I had already updated this PR.
I will do a final experiment and update this PR tomorrow.

@glaringlee (Contributor) commented Jun 24, 2020

@xcnick
Please see my comments. I thought I had commented on this a long time ago, but the comments were not actually there. Very sorry about that.

@glaringlee (Contributor) Jun 24, 2020

@xcnick
Based on your benchmark, I found one problem with calling at::native::eq directly:
for the contiguous not-equal test, the performance of at::native::eq is very bad. The reason is that at::native::eq compares every element of the two tensors; even once an unequal pair has been found, it won't stop. This also slows down the non-contiguous not-equal case, but not as badly as the contiguous one.

Let's do something else here:
Instead of calling eq directly, let's implement our own loop function. You can put the following code here and remove the 'return at::native::eq' line.

  std::atomic<bool> result{true};
  auto iter = TensorIteratorConfig()
    .add_input(self)
    .add_input(other)
    .allow_cpu_scalars(true)
    .promote_inputs_to_common_dtype(true)
    .build();

  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "equal_cpu", [&]{
    iter.for_each([&](char** data, const int64_t *strides, int64_t dim_size){
      // A mismatch was already found elsewhere (for_each may run in parallel): skip this row.
      if (!result) {
        return;
      }
      char* self_data = data[0];
      char* other_data = data[1];
      for (int64_t i = 0; i < dim_size; ++i) {
        // Stop at the first mismatching element pair.
        if (*((scalar_t*)self_data) != *((scalar_t*)other_data)) {
          result = false;
          return;
        }
        self_data += strides[0];
        other_data += strides[1];
      }
    });
  });
  return result.load();

Let me explain a little bit. Since we just need to return a bool, there is no need to even have an output tensor, so we initialize a tensor iterator with two inputs only and keep track of a global boolean 'result'.

The tensor iterator has a for_each function which takes a function reference.
The referenced function has a signature like this: loop(char** data, const int64_t* strides, int size);
Internally, the tensor iterator chunks the whole tensor into many 2D planes and processes them row by row; this loop function is used to iterate over each row. The 'data' array contains the starting point of the current row for each input tensor, the 'strides' array contains the stride of the current row (dimension) for each input tensor, and the tensor order in both 'data' and 'strides' is the same as the order in which the tensor iterator was initialized. The 'size' is the row length.

This code dispatches iter.for_each() over all the supported data types, and as soon as any non-equal element is found between the two tensors, it stops. For contiguous tensors, this means quitting the whole tensor iteration, since in that case there is a single continuous data array. If the tensor is not contiguous, there is still overhead: the program jumps to the next row and immediately breaks, then jumps to the next row and breaks again, until it has looped over the entire tensors. But this already saves a lot of time compared to calling at::native::eq directly, and in addition there is no tensor write.

Please make this change, update the benchmark, and rebase. I think we are good to go then.

if (!self.is_same_size(other)) {
  return false;
}
bool result = true;
Contributor

@xcnick
Should this be atomic? I updated my comments around an hour ago; I think your GitHub page is cached.

@xcnick (Contributor, Author) commented Jun 24, 2020

@glaringlee Thanks for your help!
The benchmarking numbers have been updated.

@facebook-github-bot (Contributor) left a comment

@glaringlee has imported this pull request. If you are a Facebook employee, you can view this diff on Phabricator.

@glaringlee (Contributor) left a comment

LGTM now.
@xcnick Thanks a lot for your contribution!!

@facebook-github-bot (Contributor)

@glaringlee merged this pull request in 72f2c47.


Labels

Merged
module: porting (Issues related to porting TH/THNN legacy to ATen native)
open source
triaged (This issue has been looked at by a team member, and triaged and prioritized into an appropriate module)


9 participants