Skip to content

Commit

Permalink
Merge pull request #185 from sony/feature/20190807_use_thrust_async_sort
Browse files Browse the repository at this point in the history
Use thrust async sort and custom make_sequence for faster operation.
  • Loading branch information
TakuyaYashima committed Oct 1, 2019
2 parents c0d12db + cb428b9 commit a7f06b7
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 11 deletions.
1 change: 0 additions & 1 deletion include/nbla/cuda/function/sort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ template <typename T> class SortCuda : public Sort<T> {

protected:
int device_;
virtual void setup_impl(const Variables &inputs, const Variables &outputs);
virtual void forward_impl(const Variables &inputs, const Variables &outputs);
virtual void backward_impl(const Variables &inputs, const Variables &outputs,
const vector<bool> &propagate_down,
Expand Down
30 changes: 20 additions & 10 deletions src/nbla/cuda/function/generic/sort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,12 @@
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#if THRUST_VERSION < 100904 || !defined(THRUST_CPP11) || \
!defined(THRUST_MODERN_GCC)
#include <thrust/sort.h>
#else
#include <thrust/async/sort.h>
#endif

namespace nbla {

Expand All @@ -40,6 +45,10 @@ template <typename T> struct Compare {
}
};

// Device-side iota: fills dst[0..size) with the identity permutation
// (dst[k] == k). Replaces thrust::sequence so the fill runs on the
// current stream without a host-side Thrust dispatch.
__global__ void make_sequence(const size_t size, size_t *dst) {
  NBLA_CUDA_KERNEL_LOOP(idx, size) { dst[idx] = static_cast<size_t>(idx); }
}

// Scatters the sorted index array into a strided destination:
// dst[k * stride] = src[k] for each k in [0, size).
__global__ void copy_index(const size_t size, const size_t stride,
                           const size_t *src, size_t *dst) {
  NBLA_CUDA_KERNEL_LOOP(k, size) { dst[k * stride] = src[k]; }
}
Expand Down Expand Up @@ -71,16 +80,16 @@ __global__ void set_grad(const size_t size, const size_t stride, const T *src,

} // namespace sort_impl

// Setup: delegates shape/output inference to the base (CPU) Sort
// implementation, then selects this function's CUDA device so later
// allocations and kernel launches target it.
// NOTE(review): this definition appears on the deletion side of the
// diff — the commit removes setup_impl from the header; confirm
// against the full file before relying on it.
template <typename T>
void SortCuda<T>::setup_impl(const Variables &inputs,
const Variables &outputs) {
Sort<T>::setup_impl(inputs, outputs);
cuda_set_device(this->device_);
}

template <typename T>
void SortCuda<T>::forward_impl(const Variables &inputs,
const Variables &outputs) {
#if THRUST_VERSION < 100904 || !defined(THRUST_CPP11) || \
!defined(THRUST_MODERN_GCC)
using thrust::sort;
#else
using thrust::async::sort;
#endif

using namespace sort_impl;
cuda_set_device(this->device_);

Expand All @@ -103,9 +112,10 @@ void SortCuda<T>::forward_impl(const Variables &inputs,
auto inner_i_raw = outer_i_raw;

while (inner_x_raw < outer_x_raw + this->inner_size) {
thrust::sequence(temp_index_ptr, temp_index_ptr + temp_index_var.size());
auto cmp = Compare<Tcu>(inner_x_raw, stride, this->reverse);
thrust::sort(temp_index_ptr, temp_index_ptr + temp_index_var.size(), cmp);
const auto size = temp_index_var.size();
NBLA_CUDA_LAUNCH_KERNEL_SIMPLE(make_sequence, size, temp_index_raw);
auto compare = Compare<Tcu>(inner_x_raw, stride, this->reverse);
static_cast<void>(sort(temp_index_ptr, temp_index_ptr + size, compare));
NBLA_CUDA_LAUNCH_KERNEL_SIMPLE(copy_index, shape[this->axis], stride,
temp_index_raw, inner_i_raw);
inner_x_raw++;
Expand Down

0 comments on commit a7f06b7

Please sign in to comment.