Skip to content

Commit

Permalink
Merge pull request #185 from sony/feature/20190807_use_thrust_async_sort
Browse files Browse the repository at this point in the history
Use thrust async sort and custom make_sequence for faster operation.
  • Loading branch information
TakuyaYashima committed Oct 1, 2019
2 parents c0d12db + cb428b9 commit a7f06b7
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 11 deletions.
1 change: 0 additions & 1 deletion include/nbla/cuda/function/sort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ template <typename T> class SortCuda : public Sort<T> {

protected:
int device_;
virtual void setup_impl(const Variables &inputs, const Variables &outputs);
virtual void forward_impl(const Variables &inputs, const Variables &outputs);
virtual void backward_impl(const Variables &inputs, const Variables &outputs,
const vector<bool> &propagate_down,
Expand Down
30 changes: 20 additions & 10 deletions src/nbla/cuda/function/generic/sort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,12 @@
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#if THRUST_VERSION < 100904 || !defined(THRUST_CPP11) || \
!defined(THRUST_MODERN_GCC)
#include <thrust/sort.h>
#else
#include <thrust/async/sort.h>
#endif

namespace nbla {

Expand All @@ -40,6 +45,10 @@ template <typename T> struct Compare {
}
};

// Device-side iota: fills dst[0..size) with the identity permutation
// (dst[k] == k). Replaces thrust::sequence so the fill runs on the
// current stream without a host-side Thrust dispatch.
__global__ void make_sequence(const size_t size, size_t *dst) {
  NBLA_CUDA_KERNEL_LOOP(idx, size) { dst[idx] = static_cast<size_t>(idx); }
}

// Scatters the sorted index array into a strided destination:
// dst[k * stride] = src[k] for each k in [0, size).
__global__ void copy_index(const size_t size, const size_t stride,
                           const size_t *src, size_t *dst) {
  NBLA_CUDA_KERNEL_LOOP(k, size) { dst[k * stride] = src[k]; }
}
Expand Down Expand Up @@ -71,16 +80,16 @@ __global__ void set_grad(const size_t size, const size_t stride, const T *src,

} // namespace sort_impl

// Setup: delegates shape/output inference to the base (CPU) Sort
// implementation, then selects this function's CUDA device so later
// allocations and kernel launches target it.
// NOTE(review): this definition appears on the deletion side of the
// diff — the commit removes setup_impl from the header; confirm
// against the full file before relying on it.
template <typename T>
void SortCuda<T>::setup_impl(const Variables &inputs,
const Variables &outputs) {
Sort<T>::setup_impl(inputs, outputs);
cuda_set_device(this->device_);
}

template <typename T>
void SortCuda<T>::forward_impl(const Variables &inputs,
const Variables &outputs) {
#if THRUST_VERSION < 100904 || !defined(THRUST_CPP11) || \
!defined(THRUST_MODERN_GCC)
using thrust::sort;
#else
using thrust::async::sort;
#endif

using namespace sort_impl;
cuda_set_device(this->device_);

Expand All @@ -103,9 +112,10 @@ void SortCuda<T>::forward_impl(const Variables &inputs,
auto inner_i_raw = outer_i_raw;

while (inner_x_raw < outer_x_raw + this->inner_size) {
thrust::sequence(temp_index_ptr, temp_index_ptr + temp_index_var.size());
auto cmp = Compare<Tcu>(inner_x_raw, stride, this->reverse);
thrust::sort(temp_index_ptr, temp_index_ptr + temp_index_var.size(), cmp);
const auto size = temp_index_var.size();
NBLA_CUDA_LAUNCH_KERNEL_SIMPLE(make_sequence, size, temp_index_raw);
auto compare = Compare<Tcu>(inner_x_raw, stride, this->reverse);
static_cast<void>(sort(temp_index_ptr, temp_index_ptr + size, compare));
NBLA_CUDA_LAUNCH_KERNEL_SIMPLE(copy_index, shape[this->axis], stride,
temp_index_raw, inner_i_raw);
inner_x_raw++;
Expand Down

0 comments on commit a7f06b7

Please sign in to comment.