[torch][segment_reduce] Support for multi dimension (cpu only) #59951

Closed
wants to merge 2 commits into from

Conversation

serhaty

@serhaty serhaty commented Jun 14, 2021

Summary:
Add support for multi-dimensional input in the CPU forward/backward implementation.

Next step: add CUDA support for multi-dimensional input.

Test Plan: Added unit tests.

Differential Revision: D29105457
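
For context on what "multi-dimensional input" means here, below is a minimal reference sketch of the intended semantics (illustrative libtorch code, not this PR's implementation; the function name segment_max_ref and the choice of the "max" reduction are assumptions): segments are taken along axis 0 and every trailing dimension is carried through to the output.

#include <torch/torch.h>
#include <iostream>
#include <vector>

// Reduce variable-length segments of `data` along `axis` with max,
// keeping all other dimensions intact.
torch::Tensor segment_max_ref(const torch::Tensor& data,
                              const torch::Tensor& lengths,
                              int64_t axis = 0) {
  auto lens = lengths.to(torch::kLong).contiguous();
  const int64_t* lp = lens.data_ptr<int64_t>();
  std::vector<torch::Tensor> out;
  int64_t start = 0;
  for (int64_t i = 0; i < lens.numel(); ++i) {
    auto seg = data.narrow(axis, start, lp[i]);   // one segment, trailing dims untouched
    out.push_back(std::get<0>(seg.max(axis)));    // reduce only along `axis`
    start += lp[i];
  }
  return torch::stack(out, axis);  // size along `axis` becomes lengths.numel()
}

int main() {
  auto data = torch::arange(12, torch::kFloat).reshape({6, 2});  // 2-d input
  auto lengths = torch::tensor({2, 3, 1});                       // sums to data.size(0)
  std::cout << segment_max_ref(data, lengths) << "\n";           // 3 x 2 output
}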

@facebook-github-bot
Contributor

facebook-github-bot commented Jun 14, 2021

💊 CI failures summary and remediations

As of commit 6dc7c90 (more details on the Dr. CI page):


  • 1/1 failures introduced in this PR

1 failure not recognized by patterns:

Job: GitHub Actions Linux CI (pytorch-linux-xenial-py3.6-gcc5.4) / render_test_results
Step: Checkout PyTorch

This comment was automatically generated by Dr. CI.

@facebook-github-bot
Contributor

This pull request was exported from Phabricator. Differential Revision: D29105457

@serhaty serhaty requested a review from ngimel June 14, 2021 18:47
Collaborator

@ngimel ngimel left a comment

This code is becoming quite complicated, and yet it can only handle reduction over the 0-th axis of a contiguous tensor and cannot easily be extended to handle other axes. At this point, the implementation should be switched to TensorIterator. Please see, e.g., how TensorIterator handles max along a dimension:

template <typename scalar_t, typename scalar_t_2 = int64_t, typename loop1d_t>
static inline void compare_base_kernel_core(
    Tensor& result1,
    Tensor& result2,
    const Tensor& self,
    int64_t dim,
    bool keepdim,
    const loop1d_t& loop) {
  auto self_sizes = ensure_nonempty_vec(self.sizes().vec());
  self_sizes[dim] = 1;
  // result1 and result2 may be a empty tensor, if not,
  // reshape them as self dims
  if (!keepdim) {
    if (result1.ndimension() >= dim) {
      result1.unsqueeze_(dim);
    }
    if (result2.ndimension() >= dim) {
      result2.unsqueeze_(dim);
    }
  }
  at::native::resize_output(result1, self_sizes);
  at::native::resize_output(result2, self_sizes);
  auto iter = TensorIteratorConfig()
      .check_all_same_dtype(false)
      .resize_outputs(false)
      .declare_static_shape(self.sizes(), /*squash_dims=*/dim)
      .add_output(result1)
      .add_output(result2)
      .add_input(self)
      .build();
  iter.for_each(loop, /* grain_size */ 1);
  if (!keepdim) {
    result1.squeeze_(dim);
    result2.squeeze_(dim);
  }
}

template <typename scalar_t, typename scalar_t_2=int64_t, typename func_t>
static inline void compare_base_kernel(Tensor& result1, Tensor& result2,
    const Tensor& self,
    int64_t dim,
    bool keepdim,
    const func_t& f) {
  auto self_dim_stride = ensure_nonempty_stride(self, dim);

  auto loop = [&](char** data, const int64_t* strides, int64_t n) {
    auto* result1_data_bytes = data[0];
    auto* result2_data_bytes = data[1];
    const auto* self_data_bytes = data[2];
    for (int64_t i = 0; i < n; ++i) {
      f((scalar_t*)result1_data_bytes,
        (scalar_t_2*)result2_data_bytes,
        (scalar_t*)self_data_bytes,
        self_dim_stride);
      result1_data_bytes += strides[0];
      result2_data_bytes += strides[1];
      self_data_bytes += strides[2];
    }
  };

  compare_base_kernel_core<scalar_t, scalar_t_2>(
      result1, result2, self, dim, keepdim, loop);
}

static void min_kernel_impl(
    Tensor& result,
    Tensor& indice,
    const Tensor& self,
    int64_t dim,
    bool keepdim) {
  auto wrap_dim = maybe_wrap_dim(dim, self.dim());
  int64_t self_dim_size = ensure_nonempty_size(self, wrap_dim);

  TORCH_CHECK(result.scalar_type() == self.scalar_type() && indice.scalar_type() == kLong,
    "Expect dtype ", self.scalar_type(), "and torch.long, but got ", result.scalar_type(), "and", indice.scalar_type());

  AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool, self.scalar_type(), "min_cpu", [&] {
    compare_base_kernel<scalar_t>(result, indice, self, wrap_dim, keepdim, [&] (
        scalar_t* result_data, int64_t* indice_data,
        const scalar_t* self_data, auto self_dim_stride) {
      using value_t = typename c10::scalar_value_type<scalar_t>::type;
      value_t (*zabs_)(scalar_t) = zabs<scalar_t, value_t>;
      scalar_t min_number = self_data[0];
      int64_t index = 0;
      for (int64_t i = 0; i < self_dim_size; ++i) {
        scalar_t value = self_data[i * self_dim_stride];
        if (!(zabs_(value) >= zabs_(min_number))) {
          min_number = value;
          index = i;
          if (_isnan<scalar_t>(value)) {
            break;
          }
        }
      }
      *result_data = min_number;
      *indice_data = index;
    });
  });
}
The idea behind this is that you create a tensor iterator that squashes the dimension of interest and let TensorIterator iterate over all the other dimensions, while you provide a loop function that handles whatever you want to do along the dimension of interest. It will be a bit more complicated for a segmented reduction than for a full reduction, but the advantage is that there is already a good structure that allows templated operations across arbitrary dimensions, and you are using TensorIterator for efficient iteration over the tensor rather than requiring the tensor to be contiguous.
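
To make the suggested pattern concrete, here is a minimal sketch of the squash-dims approach applied to a plain (non-segmented) sum along dim. It is illustrative only: the function name is made up and the header location of TensorIterator differs between PyTorch versions. The point is that TensorIterator walks every dimension except dim, while the inner loop walks dim using the input's own element stride, so no contiguity is assumed.

#include <ATen/ATen.h>
#include <ATen/native/TensorIterator.h>  // <ATen/TensorIterator.h> on newer versions

// Illustrative sketch: sum `self` along `dim` with the squash-dims pattern.
at::Tensor sum_along_dim_sketch(const at::Tensor& self, int64_t dim) {
  auto out_sizes = self.sizes().vec();
  out_sizes[dim] = 1;
  auto result = at::empty(out_sizes, self.options());

  auto iter = at::TensorIteratorConfig()
                  .check_all_same_dtype(false)
                  .resize_outputs(false)
                  .declare_static_shape(self.sizes(), /*squash_dims=*/dim)
                  .add_output(result)
                  .add_input(self)
                  .build();

  const int64_t dim_size = self.size(dim);
  const int64_t dim_stride = self.stride(dim);  // element stride along `dim`

  AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "sum_along_dim_sketch", [&] {
    iter.for_each([&](char** data, const int64_t* strides, int64_t n) {
      char* out_bytes = data[0];
      const char* in_bytes = data[1];
      // TensorIterator iterates over every dimension except `dim`;
      // the squashed dimension is walked manually below.
      for (int64_t i = 0; i < n; ++i) {
        const auto* in = reinterpret_cast<const scalar_t*>(in_bytes);
        scalar_t acc = 0;
        for (int64_t j = 0; j < dim_size; ++j) {
          acc += in[j * dim_stride];
        }
        *reinterpret_cast<scalar_t*>(out_bytes) = acc;
        out_bytes += strides[0];
        in_bytes += strides[1];
      }
    });
  });
  return result.squeeze(dim);
}

For a segmented reduction the same skeleton would presumably keep the lengths tensor on the side, size the output as segment_count along dim instead of 1, and run one inner loop per segment.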

auto output = at::empty({batch_size}, data.options());
int64_t segment_count = lengths.numel();
auto output_shape = data.sizes().vec();
output_shape[0] = segment_count;

Collaborator

this should be axis, not 0? You can assert that axis==0 if you don't support anything else.

Author

Good catch, I am actually checking this on the caller side.
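
For illustration, a caller-side guard of the kind discussed here could look like the following (hypothetical snippet, not this PR's exact code):

TORCH_CHECK(
    axis == 0,
    "segment_reduce: only axis == 0 is currently supported, got ", axis);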

@@ -169,7 +211,7 @@ Tensor segment_reduce_kernel(
  auto min_length = lengths_value.min().item<int64_t>();
  TORCH_CHECK((min_length >= 0), "lengths contains negative value!");
  TORCH_CHECK(min_length != 0 || initial.has_value());
- TORCH_CHECK(lengths_value.sum().item<int64_t>() == data.numel());
+ TORCH_CHECK(lengths_value.sum().item<int64_t>() == data.size(0));
Collaborator

data.sizes()[0] is slightly faster; also, it should really be axis, not 0?
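
A hedged sketch of the generalized check being suggested (illustrative only; lengths_value, data, and axis are the names used in the diff and discussion above):

TORCH_CHECK(
    lengths_value.sum().item<int64_t>() == data.sizes()[axis],
    "segment lengths must sum to the size of the reduced axis");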

}
int64_t lengths_cum_sum = 0;
for (int64_t i = 0; i < segment_count; ++i) {
  for (int64_t l = 0; l < stride_count; ++l) {
Collaborator

Why are you splitting stride_count and seg_element_count into 2 loops? They are conceptually the same: the dimensions that are not being reduced.
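
As a concrete illustration of treating all non-reduced dimensions uniformly, the two loops can collapse into a single inner extent. This is a hedged sketch for a contiguous float input reduced along axis 0 with "max", not this PR's code; inner stands for the product of all non-reduced dimension sizes.

#include <algorithm>
#include <cstdint>
#include <limits>

// `out` must hold segment_count * inner floats; empty segments are left at -inf
// (the real op requires `initial` in that case, per the checks above).
void segment_max_contig_sketch(const float* data, const int64_t* lengths,
                               int64_t segment_count, int64_t inner, float* out) {
  int64_t start = 0;
  for (int64_t seg = 0; seg < segment_count; ++seg) {
    for (int64_t j = 0; j < inner; ++j) {
      float best = -std::numeric_limits<float>::infinity();
      for (int64_t k = 0; k < lengths[seg]; ++k) {
        best = std::max(best, data[(start + k) * inner + j]);
      }
      out[seg * inner + j] = best;
    }
    start += lengths[seg];
  }
}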

Serhat Yilmaz added 2 commits June 14, 2021 15:35
Differential Revision: D28922838

fbshipit-source-id: 6544b91df1ed2bc4ef50191c9016395023f3e2ea
…ch#59951)

Summary:
Pull Request resolved: pytorch#59951

Add support for multi-d input for cpu forward/backward implementation.

Next step: Adding cuda support for multi-d input.

Test Plan: Added unit tests.

Differential Revision: D29105457

fbshipit-source-id: d61fe767a80410501272231219d751cf29225b0b
@facebook-github-bot
Contributor

This pull request was exported from Phabricator. Differential Revision: D29105457

@facebook-github-bot
Contributor

This pull request has been merged in a727f65.

@serhaty serhaty linked an issue Jul 14, 2021 that may be closed by this pull request

Successfully merging this pull request may close these issues.

Implement Segment Reduction in pytorch
3 participants