Merge pull request #190 from sony/feature/20190918_faster_transpose
Improve the speed of transpose function
KazukiYoshiyama-sony committed Oct 22, 2019
2 parents bed050b + 254e5da commit 77bc900
Showing 2 changed files with 212 additions and 68 deletions.
7 changes: 5 additions & 2 deletions include/nbla/cuda/function/transpose.hpp
@@ -25,17 +25,20 @@ namespace nbla {
template <typename T> class TransposeCuda : public Transpose<T> {
protected:
public:
typedef typename CudaType<T>::type Tc;
typedef typename CudaType<T>::type Tcu;

explicit TransposeCuda(const Context &ctx, const vector<int> &axes)
: Transpose<T>(ctx, axes) {}
: Transpose<T>(ctx, axes), device_(std::stoi(ctx.device_id)) {}
virtual ~TransposeCuda() {}
virtual string name() { return "TransposeCuda"; }
virtual vector<string> allowed_array_classes() {
return SingletonManager::get<Cuda>()->array_classes();
}

protected:
int device_;
std::shared_ptr<Variable> var_strides_;

virtual void setup_impl(const Variables &inputs, const Variables &outputs);
virtual void forward_impl(const Variables &inputs, const Variables &outputs);
virtual void backward_impl(const Variables &inputs, const Variables &outputs,
273 changes: 207 additions & 66 deletions src/nbla/cuda/function/generic/transpose.cu
@@ -12,75 +12,190 @@
// See the License for the specific language governing permissions and
// limitations under the License.

// transpose.cpp

#include <limits>
#include <nbla/array.hpp>
#include <nbla/variable.hpp>

#include <nbla/cuda/common.hpp>
#include <nbla/cuda/function/transpose.hpp>
#include <nbla/variable.hpp>

namespace nbla {

template <typename T>
__global__ void
kernel_transpose_forward(const int num, const int ndim, const int64_t *axes,
const int64_t *x_strides, const int64_t *y_strides,
const int64_t *y_shape, const T *x, T *y) {

NBLA_CUDA_KERNEL_LOOP(o, num) {
int i = 0;
for (int d = 0; d < ndim; ++d) {
const int k = int(o / y_strides[d]) % y_shape[d];
i += k * x_strides[axes[d]];
namespace { // local namespace

template <typename T, bool accum = false>
__global__ void transpose_1d(const int size, const T *src, T *dst) {
NBLA_CUDA_KERNEL_LOOP(idx, size) {
dst[idx] = accum ? dst[idx] + src[idx] : src[idx];
}
}

template <typename T, bool accum = false>
__global__ void transpose_2d(const int2 shape, const T *src, T *dst) {
// One extra column to avoid memory bank conflicts.
__shared__ T tile[CUDA_WARP_SIZE][CUDA_WARP_SIZE + 1];

int x = blockIdx.x * CUDA_WARP_SIZE + threadIdx.x;
int y = blockIdx.y * CUDA_WARP_SIZE + threadIdx.y;

#pragma unroll
for (int j = 0; j < CUDA_WARP_SIZE; j += 8) {
if ((x < shape.x) && (y + j < shape.y)) {
tile[threadIdx.y + j][threadIdx.x] = src[(y + j) * shape.x + x];
}
}

__syncthreads();

x = blockIdx.y * CUDA_WARP_SIZE + threadIdx.x; // transpose block offset
y = blockIdx.x * CUDA_WARP_SIZE + threadIdx.y;

#pragma unroll
for (int j = 0; j < CUDA_WARP_SIZE; j += 8) {
if ((x < shape.y) && (y + j < shape.x)) {
auto val = tile[threadIdx.x][threadIdx.y + j];
auto idx = (y + j) * shape.y + x;
dst[idx] = accum ? dst[idx] + val : val;
}
y[o] = x[i];
}
}
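
For reference, the 2-D path above is the classic tiled shared-memory transpose: a 32x32 tile is staged in shared memory with one padding column to avoid bank conflicts, then written back with the block offsets swapped. The following standalone sketch is illustrative only and not part of the patch; the names tiled_transpose, TILE and ROWS are made up here, and a warp size of 32 is assumed.

// nvcc -o tiled_transpose tiled_transpose.cu
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

constexpr int TILE = 32; // tile edge, assumed equal to the warp size
constexpr int ROWS = 8;  // thread-block rows; each thread copies TILE / ROWS rows

__global__ void tiled_transpose(const float *src, float *dst, int width, int height) {
  // Extra column avoids shared-memory bank conflicts on the transposed read.
  __shared__ float tile[TILE][TILE + 1];
  int x = blockIdx.x * TILE + threadIdx.x;
  int y = blockIdx.y * TILE + threadIdx.y;
  for (int j = 0; j < TILE; j += ROWS)
    if (x < width && y + j < height)
      tile[threadIdx.y + j][threadIdx.x] = src[(y + j) * width + x];
  __syncthreads();
  x = blockIdx.y * TILE + threadIdx.x; // swap the block offsets for the write
  y = blockIdx.x * TILE + threadIdx.y;
  for (int j = 0; j < TILE; j += ROWS)
    if (x < height && y + j < width)
      dst[(y + j) * height + x] = tile[threadIdx.x][threadIdx.y + j];
}

int main() {
  const int W = 100, H = 57; // source is H rows by W columns
  std::vector<float> h_src(W * H), h_dst(W * H);
  for (int i = 0; i < W * H; ++i) h_src[i] = float(i);
  float *d_src, *d_dst;
  cudaMalloc(&d_src, W * H * sizeof(float));
  cudaMalloc(&d_dst, W * H * sizeof(float));
  cudaMemcpy(d_src, h_src.data(), W * H * sizeof(float), cudaMemcpyHostToDevice);
  const dim3 grid((W + TILE - 1) / TILE, (H + TILE - 1) / TILE);
  const dim3 block(TILE, ROWS);
  tiled_transpose<<<grid, block>>>(d_src, d_dst, W, H);
  cudaMemcpy(h_dst.data(), d_dst, W * H * sizeof(float), cudaMemcpyDeviceToHost);
  // dst(x, y) == src(y, x); both prints show 305.
  printf("%g %g\n", h_src[3 * W + 5], h_dst[5 * H + 3]);
  cudaFree(d_src);
  cudaFree(d_dst);
  return 0;
}

The dim3(CUDA_WARP_SIZE, 8) block used by the patch matches this layout: each 32x8 thread block covers a full 32x32 tile by letting every thread handle four rows.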

template <typename T, bool accum>
__global__ void
kernel_transpose_backward(const int num, const int ndim, const int64_t *axes,
const int64_t *x_strides, const int64_t *y_strides,
const int64_t *y_shape, const T *dy, T *dx) {
NBLA_CUDA_KERNEL_LOOP(o, num) {
int i = 0;
for (int d = 0; d < ndim; ++d) {
const int k = int(o / y_strides[d]) % y_shape[d];
i += k * x_strides[axes[d]];
template <typename T, bool accum = false>
__global__ void transpose_3d(const int size, const int3 ostride,
const int3 tstride, const T *src, T *dst) {
// ostride - strides of the transposed input shape
// tstride - transpose of the input shape strides
// Ex. transpose(ishape=(2, 3, 4), axes=(2, 1, 0)) => oshape (4, 3, 2)
// then ostride is (6, 2, 1) and tstride is (1, 4, 12)
NBLA_CUDA_KERNEL_LOOP(idx, size) {
auto z = (idx / ostride.z);
auto y = (idx - z * ostride.z) / ostride.y;
auto x = (idx - z * ostride.z - y * ostride.y);
T val = src[z * tstride.z + y * tstride.y + x * tstride.x];
dst[idx] = accum ? dst[idx] + val : val;
}
}

template <typename T, bool accum = false>
__global__ void transpose_4d(const int size, const int4 ostride,
const int4 tstride, const T *src, T *dst) {
// ostride - strides of the transposed input shape
// tstride - transpose of the input shape strides
// Ex. transpose(ishape=(2,3,4,5), axes=(3,2,1,0)) => oshape (5,4,3,2)
// then ostride is (24, 6, 2, 1) and tstride is (1, 5, 20, 60)
NBLA_CUDA_KERNEL_LOOP(idx, size) {
auto w = (idx / ostride.w);
auto z = (idx - w * ostride.w) / ostride.z;
auto y = (idx - w * ostride.w - z * ostride.z) / ostride.y;
auto x = (idx - w * ostride.w - z * ostride.z - y * ostride.y);
T val = src[w * tstride.w + z * tstride.z + y * tstride.y + x * tstride.x];
dst[idx] = accum ? dst[idx] + val : val;
}
}
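
The ostride/tstride bookkeeping described in the comments above can be reproduced on the host in a few lines. This sketch is illustrative only (strides_of is a hypothetical helper, not an nnabla API); for the quoted transpose(ishape=(2, 3, 4), axes=(2, 1, 0)) case it prints ostride = (6, 2, 1) and tstride = (1, 4, 12).

#include <cstdio>
#include <vector>

// Row-major strides of a shape.
std::vector<int> strides_of(const std::vector<int> &shape) {
  std::vector<int> s(shape.size(), 1);
  for (int i = (int)shape.size() - 2; i >= 0; --i)
    s[i] = s[i + 1] * shape[i + 1];
  return s;
}

int main() {
  const std::vector<int> ishape = {2, 3, 4};
  const std::vector<int> axes = {2, 1, 0};
  // oshape: ishape permuted by axes -> (4, 3, 2)
  std::vector<int> oshape(ishape.size());
  for (size_t d = 0; d < axes.size(); ++d)
    oshape[d] = ishape[axes[d]];
  // ostride: strides of the transposed (output) shape -> (6, 2, 1)
  const std::vector<int> ostride = strides_of(oshape);
  // tstride: input strides permuted by axes -> (1, 4, 12)
  const std::vector<int> istride = strides_of(ishape);
  for (size_t d = 0; d < axes.size(); ++d)
    printf("axis %zu: ostride %d tstride %d\n", d, ostride[d], istride[axes[d]]);
  return 0;
}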

template <typename T> struct TransposeStrides {
T ostride; // strides of transposed input shape
T tstride; // transposed strides of input shape
};

template <typename T, bool accum = false>
__global__ void transpose_nd(const int size, const T *src, T *dst,
const TransposeStrides<int> *strides,
const int ndim) {
NBLA_CUDA_KERNEL_LOOP(idx, size) {
int src_index = 0, dst_index = idx;
for (int axis = 0; axis < ndim; axis++) {
const auto k = idx / strides[axis].ostride;
src_index += k * strides[axis].tstride;
idx -= k * strides[axis].ostride;
}
dx[i] = (accum ? dx[i] : (T)0) + dy[o];
dst[dst_index] = accum ? dst[dst_index] + src[src_index] : src[src_index];
}
}

} // end local namespace
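
transpose_nd peels the flat output index one axis at a time: an integer division by ostride gives the coordinate along that output axis, and that coordinate times tstride accumulates the matching source offset. Below is a CPU reference of the same decomposition, illustrative only, with strides hard-coded for transpose(ishape=(2, 3, 4), axes=(1, 2, 0)).

#include <cassert>
#include <numeric>
#include <vector>

int main() {
  // transpose(ishape = (2, 3, 4), axes = (1, 2, 0)) => oshape = (3, 4, 2)
  const std::vector<int> ostride = {8, 2, 1};  // row-major strides of oshape
  const std::vector<int> tstride = {4, 1, 12}; // input strides (12, 4, 1) permuted by axes
  std::vector<int> src(24), dst(24);
  std::iota(src.begin(), src.end(), 0);
  for (int idx = 0; idx < 24; ++idx) {
    int rem = idx, src_index = 0;
    for (size_t axis = 0; axis < ostride.size(); ++axis) {
      const int k = rem / ostride[axis]; // coordinate along this output axis
      src_index += k * tstride[axis];    // same coordinate in the input layout
      rem -= k * ostride[axis];
    }
    dst[idx] = src[src_index];
  }
  // dst(j, k, i) == src(i, j, k), e.g. dst(2, 3, 1) == src(1, 2, 3).
  assert(dst[2 * 8 + 3 * 2 + 1] == src[1 * 12 + 2 * 4 + 3]);
  return 0;
}

The setup code below packs exactly these two stride vectors per axis into var_strides_ (forward strides in the first ndim entries, backward strides in the next ndim), so a single table lookup drives the kernel for any rank above four.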

template <typename T>
void TransposeCuda<T>::setup_impl(const Variables &inputs,
const Variables &outputs) {
Transpose<T>::setup_impl(inputs, outputs);

const int ndim = this->x_shape_.size();
const int size = outputs[0]->size();

NBLA_CHECK(size <= std::numeric_limits<int>::max(), error_code::value,
"Maximum supported array size is %d elements",
std::numeric_limits<int>::max());

if (ndim > 4) {
Shape_t shape = {2, ndim * (int)sizeof(TransposeStrides<int>)};
this->var_strides_ = std::make_shared<Variable>();
this->var_strides_->reshape(shape, true);
auto var = static_cast<VariablePtr>(this->var_strides_);
auto ptr = var->cast_data_and_get_pointer<char>(Context());
auto strides = reinterpret_cast<TransposeStrides<int> *>(ptr);
for (int i = 0; i < ndim; i++) {
// strides for forward where we iterate the output shape
strides[i].ostride = this->y_strides_[i];
strides[i].tstride = this->x_strides_transposed_[i];
// strides for backward where we iterate the input shape
strides[i + ndim].ostride = this->x_strides_[i];
strides[i + ndim].tstride = this->y_strides_transposed_[i];
}
}
}

template <typename T> inline int2 make_int2_from(vector<T> vec, int skip = 0) {
return make_int2(vec[1 + skip], vec[0 + skip]);
}

template <typename T> inline int3 make_int3_from(vector<T> vec, int skip = 0) {
return make_int3(vec[2 + skip], vec[1 + skip], vec[0 + skip]);
}

template <typename T> inline int4 make_int4_from(vector<T> vec, int skip = 0) {
return make_int4(vec[3 + skip], vec[2 + skip], vec[1 + skip], vec[0 + skip]);
}

#define WARPS_FOR(N) NBLA_CEIL_INT_DIV(N, CUDA_WARP_SIZE)

template <class T>
void TransposeCuda<T>::forward_impl(const Variables &inputs,
const Variables &outputs) {
cuda_set_device(std::stoi(this->ctx_.device_id));
const Tc *x = inputs[0]->get_data_pointer<Tc>(this->ctx_);
Tc *y = outputs[0]->cast_data_and_get_pointer<Tc>(this->ctx_, true);

// To avoid compiler error : type name is not allowed.
// The following statement causes a compiler error.
// this->v_axes_.get_data_pointer<int64_t>(this->ctx_)
auto get_ = [this](Variable &var) {
return var.get_data_pointer<int64_t>(this->ctx_);
};
const int64_t *axes = get_(this->v_axes_);
const int64_t *x_strides = get_(this->v_x_strides_);
const int64_t *y_strides = get_(this->v_y_strides_);
const int64_t *y_shape = get_(this->v_y_shape_);
const int ndim = inputs[0]->ndim();
cuda_set_device(this->device_);
auto x = inputs[0]->get_data_pointer<Tcu>(this->ctx_);
auto y = outputs[0]->cast_data_and_get_pointer<Tcu>(this->ctx_, true);
const int ndim = this->x_shape_.size();
const int size = outputs[0]->size();

NBLA_CUDA_LAUNCH_KERNEL_SIMPLE(kernel_transpose_forward, size, ndim, axes,
x_strides, y_strides, y_shape, x, y);
if (ndim == 1) {
NBLA_CUDA_LAUNCH_KERNEL_SIMPLE(transpose_1d, size, x, y);
} else if (ndim == 2) {
const auto shape = make_int2_from(this->x_shape_);
const dim3 grid_dim(WARPS_FOR(shape.x), WARPS_FOR(shape.y));
const dim3 block_dim(CUDA_WARP_SIZE, 8);
transpose_2d<<<grid_dim, block_dim>>>(shape, x, y);
NBLA_CUDA_KERNEL_CHECK();
} else if (ndim == 3 && this->axes_[0] == 0) {
const auto shape = make_int2_from(this->x_shape_, 1);
const dim3 grid_dim(WARPS_FOR(shape.x), WARPS_FOR(shape.y));
const dim3 block_dim(CUDA_WARP_SIZE, 8);
for (int i = 0, w = shape.x * shape.y; i < this->x_shape_[0]; i++)
transpose_2d<<<grid_dim, block_dim>>>(shape, x + i * w, y + i * w);
NBLA_CUDA_KERNEL_CHECK();
} else if (ndim == 3) {
const auto ostride = make_int3_from(this->y_strides_);
const auto tstride = make_int3_from(this->x_strides_transposed_);
NBLA_CUDA_LAUNCH_KERNEL_SIMPLE(transpose_3d, size, ostride, tstride, x, y);
} else if (ndim == 4) {
const auto ostride = make_int4_from(this->y_strides_);
const auto tstride = make_int4_from(this->x_strides_transposed_);
NBLA_CUDA_LAUNCH_KERNEL_SIMPLE(transpose_4d, size, ostride, tstride, x, y);
} else {
auto var = static_cast<VariablePtr>(this->var_strides_);
auto ptr = var->get_data_pointer<char>(this->ctx_);
auto strides = reinterpret_cast<const TransposeStrides<int> *>(ptr);
NBLA_CUDA_LAUNCH_KERNEL_SIMPLE(transpose_nd, size, x, y, strides, ndim);
}
}
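
The ndim == 3 && axes_[0] == 0 branch above amounts to x_shape_[0] independent 2-D transposes, which is why it reuses transpose_2d with a per-slice offset of i * w. A CPU sketch of that equivalence, illustrative only, for a hypothetical ishape (N, H, W) with axes (0, 2, 1):

#include <cassert>
#include <vector>

int main() {
  const int N = 2, H = 3, W = 4; // ishape (N, H, W), axes (0, 2, 1) => oshape (N, W, H)
  std::vector<int> x(N * H * W), y(N * W * H);
  for (int i = 0; i < N * H * W; ++i)
    x[i] = i;
  for (int n = 0; n < N; ++n)     // each slice is transposed on its own
    for (int h = 0; h < H; ++h)
      for (int w = 0; w < W; ++w)
        y[n * W * H + w * H + h] = x[n * H * W + h * W + w];
  // y(1, 2, 1) == x(1, 1, 2)
  assert(y[1 * W * H + 2 * H + 1] == x[1 * H * W + 1 * W + 2]);
  return 0;
}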

template <class T>
@@ -91,30 +91,56 @@ void TransposeCuda<T>::backward_impl(const Variables &inputs,
if (!propagate_down[0])
return;

cuda_set_device(std::stoi(this->ctx_.device_id));
const Tc *dy = outputs[0]->get_grad_pointer<Tc>(this->ctx_);
Tc *dx = inputs[0]->cast_grad_and_get_pointer<Tc>(this->ctx_, !accum[0]);

// To avoid compiler error : type name is not allowed.
// The following statement causes a compiler error.
// this->v_axes_.get_data_pointer<int64_t>(this->ctx_)
auto get_ = [this](Variable &var) {
return var.get_data_pointer<int64_t>(this->ctx_);
};
const int64_t *axes = get_(this->v_axes_);
const int64_t *x_strides = get_(this->v_x_strides_);
const int64_t *y_strides = get_(this->v_y_strides_);
const int64_t *y_shape = get_(this->v_y_shape_);
const int ndim = inputs[0]->ndim();
cuda_set_device(this->device_);
auto dy = outputs[0]->get_grad_pointer<Tcu>(this->ctx_);
auto dx = inputs[0]->cast_grad_and_get_pointer<Tcu>(this->ctx_, !accum[0]);
const int ndim = this->x_shape_.size();
const int size = outputs[0]->size();
if (accum[0]) {
NBLA_CUDA_LAUNCH_KERNEL_SIMPLE((kernel_transpose_backward<Tc, true>), size,
ndim, axes, x_strides, y_strides, y_shape,
dy, dx);

if (ndim == 1) {
auto kernel = transpose_1d<Tcu>;
if (accum[0])
kernel = transpose_1d<Tcu, true>;
NBLA_CUDA_LAUNCH_KERNEL_SIMPLE(kernel, size, dy, dx);
} else if (ndim == 2) {
const auto shape = make_int2_from(this->y_shape_);
const dim3 grid(WARPS_FOR(shape.x), WARPS_FOR(shape.y));
auto kernel = transpose_2d<Tcu>;
if (accum[0])
kernel = transpose_2d<Tcu, true>;
kernel<<<grid, dim3(CUDA_WARP_SIZE, 8)>>>(shape, dy, dx);
NBLA_CUDA_KERNEL_CHECK();
} else if (ndim == 3 && this->axes_[0] == 0) {
const auto shape = make_int2_from(this->y_shape_, 1);
const dim3 grid(WARPS_FOR(shape.x), WARPS_FOR(shape.y));
auto kernel = transpose_2d<Tcu>;
if (accum[0])
kernel = transpose_2d<Tcu, true>;
for (int i = 0, w = shape.x * shape.y; i < this->x_shape_[0]; i++)
kernel<<<grid, dim3(CUDA_WARP_SIZE, 8)>>>(shape, dy + i * w, dx + i * w);
NBLA_CUDA_KERNEL_CHECK();
} else if (ndim == 3) {
const auto ostride = make_int3_from(this->x_strides_);
const auto tstride = make_int3_from(this->y_strides_transposed_);
auto kernel = transpose_3d<Tcu, false>;
if (accum[0])
kernel = transpose_3d<Tcu, true>;
NBLA_CUDA_LAUNCH_KERNEL_SIMPLE(kernel, size, ostride, tstride, dy, dx);
} else if (ndim == 4) {
const auto ostride = make_int4_from(this->x_strides_);
const auto tstride = make_int4_from(this->y_strides_transposed_);
auto kernel = transpose_4d<Tcu, false>;
if (accum[0])
kernel = transpose_4d<Tcu, true>;
NBLA_CUDA_LAUNCH_KERNEL_SIMPLE(kernel, size, ostride, tstride, dy, dx);
} else {
NBLA_CUDA_LAUNCH_KERNEL_SIMPLE((kernel_transpose_backward<Tc, false>), size,
ndim, axes, x_strides, y_strides, y_shape,
dy, dx);
auto var = static_cast<VariablePtr>(this->var_strides_);
auto ptr = var->get_data_pointer<char>(this->ctx_);
auto strides = reinterpret_cast<const TransposeStrides<int> *>(ptr);
auto kernel = transpose_nd<Tcu, false>;
if (accum[0])
kernel = transpose_nd<Tcu, true>;
NBLA_CUDA_LAUNCH_KERNEL_SIMPLE(kernel, size, dy, dx, strides + ndim, ndim);
}
}
}
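
Throughout, the accum template flag selects between overwriting the destination and adding into it, so the backward pass can accumulate into an existing gradient when accum[0] is set. A minimal reduction of that pattern, illustrative only:

#include <cassert>

template <bool accum>
void store(float *dst, int idx, float val) {
  // Mirrors `dst[idx] = accum ? dst[idx] + val : val;` in the kernels above.
  dst[idx] = accum ? dst[idx] + val : val;
}

int main() {
  float g[1] = {2.0f};
  store<false>(g, 0, 3.0f); // overwrite: g[0] == 3
  assert(g[0] == 3.0f);
  store<true>(g, 0, 3.0f);  // accumulate: g[0] == 6
  assert(g[0] == 6.0f);
  return 0;
}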
