Implement kthvalue in ATen #17544

Closed · wants to merge 4 commits
21 changes: 0 additions & 21 deletions aten/src/ATen/Declarations.cwrap
@@ -800,27 +800,6 @@
- arg: bool keepdim
default: "false"
]]
[[
name: _th_kthvalue
backends:
- CPU
variants: function
cname: kthvalue
return: argument 0,1
scalar_check: self_->dim() == 0 || (keepdim == false && self_->dim() == 1)
arguments:
- arg: THTensor* values
output: True
- arg: THIndexTensor* indices
output: True
- THTensor* self
- long k
- arg: long dim
wrap_dim: self
default: __last_dim
- arg: bool keepdim
default: "false"
]]
[[
name: _th_mode
variants: function
195 changes: 195 additions & 0 deletions aten/src/ATen/native/Sorting.cpp
@@ -0,0 +1,195 @@
#include <ATen/ATen.h>
#include <ATen/Parallel.h>
#include <ATen/WrapDimUtils.h>
#include <ATen/native/SortingUtils.h>

namespace at {
namespace native {

namespace {

// maybe these days, one should define a random access iterator and use
// std::sort...
/* Note from TH:

I cut and pasted (slightly adapted) the quicksort code from
Sedgewick's 1978 "Implementing Quicksort Programs" article
http://www.csie.ntu.edu.tw/~b93076/p847-sedgewick.pdf

It is the state of the art existing implementation. The macros
are here to make as close a match as possible to the pseudocode of
Program 2 p.851

Note that other partition schemes exist, and are typically presented
in textbooks, but those are less efficient. See e.g.
http://cs.stackexchange.com/questions/11458/quicksort-partitioning-hoare-vs-lomuto

Julien, November 12th 2013
*/

constexpr int64_t MAX_LEVELS = 300;
constexpr int64_t M_SMALL = 10; // Limit for small subfiles

template <typename Fn>
void dim_apply(TensorList tensors, int64_t dim, Fn f) {
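// Iterate (in parallel) over every 1-d slice of `tensors` taken along `dim`.
// `f` is called with the flat slice index and the narrowed views, so
// companion tensors (values, indices, ...) stay aligned slice by slice.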
AT_ASSERT(tensors.size() > 0);
auto t = tensors[0];
auto sizes = t.sizes();
int64_t ndim = t.dim();
int64_t itersize = 1;
for (int64_t i = 0; i < ndim; i++) {
if (i != dim) {
itersize *= t.size(i);
}
}
parallel_for(0, itersize, 1, [&](int64_t i_begin, int64_t i_end) {
std::vector<Tensor> narrowed_tensors;
narrowed_tensors.reserve(tensors.size());
for (int64_t it = i_begin; it < i_end; it++) {
narrowed_tensors.clear();
for (auto ti : tensors) {
int64_t i = it;
Tensor nt = ti;
for (int64_t d = 0; d < ndim; d++) {
if (d != dim) {
// this could be avoided for slower-changing dimensions if done
// better
nt = nt.select((d > dim ? 1 : 0), i % sizes[d]);
i = i / sizes[d];
}
}
narrowed_tensors.emplace_back(nt);
}
f(it, narrowed_tensors);
}
});
}

template <typename scalar_t, typename Fn>
void quick_select_template(
TensorAccessor<scalar_t, 1> arr,
int64_t k,
Fn swap_fn) {
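// Classic quickselect with a median-of-three pivot (see the note above):
// on return, arr[k] holds the k-th smallest element (k is 0-based), with
// no larger element to its left and no smaller element to its right.
// Every swap goes through swap_fn so the caller can permute a companion
// index array in lockstep with the values.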
int64_t P, L, R, i, j;
scalar_t piv;
L = 0;
R = arr.size(0) - 1;

do {
if (R <= L) // One element only
return;

if (R == L + 1) { // Two elements only
if (arr[L] > arr[R]) {
swap_fn(L, R);
}
return;
}

// Use median of three for pivot choice
P = (L + R) >> 1;
swap_fn(P, L + 1);
if (arr[L + 1] > arr[R]) {
swap_fn(L + 1, R);
}
if (arr[L] > arr[R]) {
swap_fn(L, R);
}
if (arr[L + 1] > arr[L]) {
swap_fn(L + 1, L);
}

i = L + 1;
j = R;
piv = arr[L];
do {
do
i++;
while (arr[i] < piv);
do
j--;
while (arr[j] > piv);
if (j < i)
break;
swap_fn(i, j);
} while (1);
swap_fn(L, j);

// Re-set active partition
if (j <= k)
L = i;
if (j >= k)
R = j - 1;
} while (1);
}

} // namespace

std::tuple<Tensor&, Tensor&> kthvalue_out_cpu(
Tensor& values,
Tensor& indices,
const Tensor& self,
int64_t k,
int64_t dim_,
bool keepdim) {
int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true);
// FIXME: This seems bogus; I only do this because it was the old behaviour.
// The reductions are fine, as long as the axis being reduced along
// isn't of 0 elements (and the output has elements).
AT_CHECK(
self.numel() > 0,
"cannot perform reduction function kthvalue",
" on tensor with no elements because the operation does not have an identity");
AT_CHECK(
k > 0 && k <= (self.dim() > 0 ? self.size(dim) : 1),
"selected index k out of range");

_reduction_with_indices_allocate_or_resize_output(
values, indices, self, dim_, keepdim);
if (self.dim() == 0 && self.numel() == 1) {
values.copy_(self);
indices.zero_();
return std::forward_as_tuple(values, indices);
}
auto tmp_values = self.clone();
auto tmp_indices = at::empty(self.sizes(), self.options().dtype(kLong));
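// For every 1-d slice along `dim`: fill tmp_indices with 0..n-1, run
// quickselect over the tmp_values slice (swapping the index slice in
// lockstep), then copy element k-1 of both into the output slice.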
AT_DISPATCH_ALL_TYPES(self.type(), "kthvalue", [&] {
dim_apply(
{tmp_values, tmp_indices, values, indices},
dim,
[&](int64_t i, TensorList tl) {
auto tmp_values = tl[0].accessor<scalar_t, 1>();
auto tmp_indices = tl[1].accessor<int64_t, 1>();
scalar_t* mode_value = tl[2].data<scalar_t>();
int64_t* mode_index = tl[3].data<int64_t>();
for (int64_t j = 0; j < tmp_indices.size(0); j++) {
tmp_indices[j] = j;
}
quick_select_template(tmp_values, k - 1, [&](int64_t i, int64_t j) {
std::swap(tmp_values[i], tmp_values[j]);
std::swap(tmp_indices[i], tmp_indices[j]);
});
*mode_value = tmp_values[k - 1];
*mode_index = tmp_indices[k - 1];
});
});
if (!keepdim) {
values.squeeze_(dim);
indices.squeeze_(dim);
}
return std::forward_as_tuple(values, indices);
}

std::tuple<Tensor, Tensor> kthvalue(
const Tensor& self,
int64_t k,
int64_t dim,
bool keepdim) {
Tensor values = at::empty({0}, self.options());
Tensor indices = at::empty({0}, self.options().dtype(kLong));
at::kthvalue_out(values, indices, self, k, dim, keepdim);
return std::make_tuple(values, indices);
}

} // namespace native
} // namespace at
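
For orientation, a minimal usage sketch of the new native kthvalue from the C++ side (illustrative only, not part of this diff; it assumes an ATen build that includes this change and uses only existing factory functions):

#include <ATen/ATen.h>
#include <iostream>
#include <tuple>

int main() {
  // 0..7 reshaped to 2 x 4; each row is already sorted.
  at::Tensor x = at::arange(8, at::kFloat).reshape({2, 4});
  at::Tensor values, indices;
  std::tie(values, indices) =
      at::kthvalue(x, /*k=*/2, /*dim=*/1, /*keepdim=*/false);
  // values  -> [1., 5.]  (the 2nd-smallest entry of each row)
  // indices -> [1, 1]    (their positions along dim 1)
  std::cout << values << "\n" << indices << std::endl;
  return 0;
}
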
48 changes: 48 additions & 0 deletions aten/src/ATen/native/SortingUtils.h
@@ -0,0 +1,48 @@
#pragma once

#include <ATen/ATen.h>
#include <ATen/WrapDimUtils.h>

namespace at {
namespace native {

// ensure we get good values and indices for kthvalue, mode, median
// this will always be with the reducing dim as 1-d
static void _reduction_with_indices_allocate_or_resize_output(
Tensor& values,
Tensor& indices,
const Tensor& self,
int64_t dim_,
bool keepdim) {
int64_t dim = maybe_wrap_dim(dim_, self.dim(), /*wrap_scalar=*/true);
auto result_sizes = self.sizes().vec();
if (result_sizes.size() > 0) {
result_sizes[dim] = 1;
}
if (values.defined()) {
AT_CHECK(
self.type() == values.type(),
"output values must be of same type as input");
if (!keepdim && values.dim() == self.dim() - 1) {
// unsqueeze to preserve passed in noncontiguous tensor in resize
values.unsqueeze_(dim);
}
values.resize_(result_sizes);
} else {
values = at::empty(result_sizes, self.options());
}
if (indices.defined()) {
AT_CHECK(
indices.dtype() == kLong, "output indices must be of scalar type Long");
AT_CHECK(
indices.device() == self.device(),
"output indices must be on same device as input");
if (!keepdim && indices.dim() == self.dim() - 1) {
// unsqueeze to preserve passed in noncontiguous tensor in resize
indices.unsqueeze_(dim);
}
indices.resize_(result_sizes);
} else {
indices = at::empty(result_sizes, self.options().dtype(kLong));
}
}

} // namespace native
} // namespace at
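
To make the keepdim handling in the helper above concrete, a small illustrative snippet (again not part of the diff; it reuses the out-variant call pattern already used for kthvalue in this PR):

at::Tensor self = at::randn({2, 3, 4});
at::Tensor values = at::empty({0}, self.options());
at::Tensor indices = at::empty({0}, self.options().dtype(at::kLong));
at::kthvalue_out(values, indices, self, /*k=*/1, /*dim=*/1, /*keepdim=*/true);
// values.sizes() -> [2, 1, 4], indices.sizes() -> [2, 1, 4]
at::kthvalue_out(values, indices, self, /*k=*/1, /*dim=*/1, /*keepdim=*/false);
// The reduced dimension is squeezed away: values.sizes() -> [2, 4]
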
20 changes: 0 additions & 20 deletions aten/src/ATen/native/TensorCompare.cpp
@@ -97,26 +97,6 @@ Tensor _s_where_cpu(const Tensor& condition, const Tensor& self, const Tensor& o
return ret;
}

std::tuple<Tensor, Tensor> kthvalue(const Tensor& self, int64_t k, int64_t dim, bool keepdim) {
Tensor values = at::empty({0}, self.options());
Tensor indices = at::empty({0}, self.options().dtype(kLong));
return at::native::kthvalue_out(values, indices, self, k, dim, keepdim);
}

std::tuple<Tensor &,Tensor &> kthvalue_out(Tensor& values, Tensor& indices,
const Tensor& self, int64_t k, int64_t dim, bool keepdim) {
AT_CHECK(self.type().backend() == Backend::CPU || self.type().backend() == Backend::CUDA,
"kthvalue only supports CPU AND CUDA backend, got: ", toString(self.type().backend()));
dim = maybe_wrap_dim(dim, self.dim());
if (_dimreduce_return_trivial_no_ident(values, self, dim, keepdim, "kthvalue")) {
AT_ASSERT(values.dim() == 0);
indices.resize_({}).fill_(0);
return std::forward_as_tuple(values, indices);
} else {
return at::legacy::th::_th_kthvalue_out(values, indices, self, k, dim, keepdim);
}
}

std::tuple<Tensor, Tensor> median(const Tensor& self, int64_t dim, bool keepdim) {
Tensor values = at::empty({0}, self.options());
Tensor indices = at::empty({0}, self.options().dtype(kLong));