|
| 1 | +#include "caffe2/operators/batch_permutation_op.h" |
| 2 | + |
| 3 | +#include <cstring> |
| 4 | +#include <vector> |
| 5 | + |
| 6 | +#ifdef CAFFE2_USE_MKLDNN |
| 7 | +#include <caffe2/ideep/operators/operator_fallback_ideep.h> |
| 8 | +#include <caffe2/ideep/utils/ideep_operator.h> |
| 9 | +#endif |
| 10 | + |
| 11 | +namespace caffe2 { |
| 12 | + |
// Copies the rows of a row-major [N x K] float buffer from `src` into `dst`,
// reordered according to `indices`.
//
//   forwards == true : row n of dst is row indices[n] of src.
//   forwards == false: an inverse table `inv` with inv[indices[i]] = i is
//                      built first, then row n of dst is row inv[n] of src.
//                      For a valid permutation this undoes the forward pass.
//
// Parameters:
//   N       - number of rows (batch size); nothing is copied when N <= 0.
//   K       - number of floats per row; nothing is copied when K <= 0.
//   src     - source buffer holding N * K floats.
//   indices - N row indices. They are NOT validated here: every entry must
//             lie in [0, N), otherwise reads/writes go out of bounds.
//   dst     - destination buffer holding N * K floats; must not overlap
//             `src` because rows are moved with std::memcpy.
template <bool forwards>
void batch_permutation_loop(
    const int N,
    const int K,
    const float* src,
    const int* indices,
    float* dst) {
  // Guard against empty/negative extents: a negative N would otherwise be
  // converted to an enormous std::vector size in the backward branch.
  if (N <= 0 || K <= 0) {
    return;
  }

  // Row offsets are computed in size_t so that N * K larger than INT_MAX
  // does not overflow a 32-bit int.
  const std::size_t rowLen = static_cast<std::size_t>(K);
  const std::size_t rowBytes = rowLen * sizeof(float);

  if (forwards) {
#ifdef _OPENMP
#if (_OPENMP >= 201307)
#pragma omp parallel for simd
#else
#pragma omp parallel for
#endif
#endif
    for (int n = 0; n < N; n++) {
      const std::size_t dstOffset = static_cast<std::size_t>(n) * rowLen;
      const std::size_t srcOffset =
          static_cast<std::size_t>(indices[n]) * rowLen;
      std::memcpy(dst + dstOffset, src + srcOffset, rowBytes);
    }
  } else {
    // Invert the index table: backward_indices[p] is the row that the
    // forward pass read from position p.
    std::vector<int> backward_indices(N);
    for (int i = 0; i < N; ++i) {
      backward_indices[indices[i]] = i;
    }
    for (int n = 0; n < N; n++) {
      const std::size_t dstOffset = static_cast<std::size_t>(n) * rowLen;
      const std::size_t srcOffset =
          static_cast<std::size_t>(backward_indices[n]) * rowLen;
      std::memcpy(dst + dstOffset, src + srcOffset, rowBytes);
    }
  }
}
| 46 | + |
| 47 | +template <> |
| 48 | +bool BatchPermutationOp<float, CPUContext>::RunOnDevice() { |
| 49 | + auto& X = Input(0); |
| 50 | + auto& indices = Input(1); |
| 51 | + |
| 52 | + CAFFE_ENFORCE(indices.dim() == 1, "indices must be 1-d"); |
| 53 | + CAFFE_ENFORCE( |
| 54 | + X.dim32(0) == indices.dim32(0), |
| 55 | + "X.dim32(0) must be equal to indices.dim32(0)", |
| 56 | + "(", |
| 57 | + X.dim32(0), |
| 58 | + " vs. ", |
| 59 | + indices.dim32(0), |
| 60 | + ")"); |
| 61 | + |
| 62 | + auto* Y = Output(0, X.sizes(), at::dtype<float>()); |
| 63 | + |
| 64 | + CAFFE_ENFORCE_GT(X.dim32(0), 0); |
| 65 | + batch_permutation_loop<true>( |
| 66 | + X.dim32(0), |
| 67 | + X.numel() / X.dim32(0), |
| 68 | + X.data<float>(), |
| 69 | + indices.data<int>(), |
| 70 | + Y->mutable_data<float>()); |
| 71 | + return true; |
| 72 | +} |
| 73 | + |
| 74 | +template <> |
| 75 | +bool BatchPermutationGradientOp<float, CPUContext>::RunOnDevice() { |
| 76 | + auto& indices = Input(0); |
| 77 | + auto& dY = Input(1); |
| 78 | + |
| 79 | + auto* dX = Output(0, dY.sizes(), at::dtype<float>()); |
| 80 | + |
| 81 | + CAFFE_ENFORCE_GT(dY.dim32(0), 0); |
| 82 | + batch_permutation_loop<false>( |
| 83 | + dY.dim32(0), |
| 84 | + dY.numel() / dY.dim32(0), |
| 85 | + dY.data<float>(), |
| 86 | + indices.data<int>(), |
| 87 | + dX->mutable_data<float>()); |
| 88 | + return true; |
| 89 | +} |
| 90 | + |
| 91 | +#ifdef CAFFE2_USE_MKLDNN |
| 92 | +REGISTER_IDEEP_OPERATOR( |
| 93 | + BatchPermutation, |
| 94 | + IDEEPFallbackOp<BatchPermutationOp<float, CPUContext>>); |
| 95 | +#endif |
| 96 | + |
| 97 | +REGISTER_CPU_OPERATOR(BatchPermutation, BatchPermutationOp<float, CPUContext>); |
| 98 | +REGISTER_CPU_OPERATOR( |
| 99 | + BatchPermutationGradient, |
| 100 | + BatchPermutationGradientOp<float, CPUContext>); |
| 101 | + |
| 102 | +// Input: X, indices; Output: Y |
| 103 | +OPERATOR_SCHEMA(BatchPermutation) |
| 104 | + .NumInputs(2) |
| 105 | + .NumOutputs(1) |
| 106 | + .SetDoc(R"DOC( |
| 107 | +Batch permutation of an input tensor X given input indices. First dimension of |
| 108 | +X equals batch size N. The indices stores a be permutation of N. |
| 109 | +The output Y is a tensor of same shape as X, with data re-ordered according to |
| 110 | +the indices within the batch size. |
| 111 | +
|
| 112 | +Example of batch permutation on a 2-D tensor with batch size 4: |
| 113 | + X = [ |
| 114 | + [1, 5, 2, 3, 4, 6, 0], |
| 115 | + [4, 3, 3, 5, 2, 3, 1], |
| 116 | + [2, 2, 3, 6, 0, 0, 1], |
| 117 | + [0, 0, 1, 1, 2, 2, 3] |
| 118 | + ] |
| 119 | + indices = [2, 0, 1, 3] |
| 120 | + Y = [ |
| 121 | + [2, 2, 3, 6, 0, 0, 1], |
| 122 | + [1, 5, 2, 3, 4, 6, 0], |
| 123 | + [4, 3, 3, 5, 2, 3, 1], |
| 124 | + [0, 0, 1, 1, 2, 2, 3] |
| 125 | + ] |
| 126 | +
|
| 127 | +Example of batch permutation on a 3-D tensor with batch size 4: |
| 128 | + X = [ |
| 129 | + [[1, 5, 2], [3, 4, 6, 0]], |
| 130 | + [[4, 3, 3], [5, 2, 3, 1]], |
| 131 | + [[2, 2, 3], [6, 0, 0, 1]], |
| 132 | + [[0, 0, 1], [1, 2, 2, 3]] |
| 133 | + ] |
| 134 | + indices = [2, 0, 1, 3] |
| 135 | + Y = [ |
| 136 | + [[2, 2, 3], [6, 0, 0, 1]], |
| 137 | + [[1, 5, 2], [3, 4, 6, 0]], |
| 138 | + [[4, 3, 3], [5, 2, 3, 1]], |
| 139 | + [[0, 0, 1], [1, 2, 2, 3]] |
| 140 | + ] |
| 141 | +)DOC") |
| 142 | + .Input(0, "X", "Input tensor, where 1st dimension equals batch size") |
| 143 | + .Input(1, "indices", "Input indices of batch to permute") |
| 144 | + .Output(0, "Y", "Output permuted tensor"); |
| 145 | +// Input: indices, dY (aka "gradOutput"); Output: dX (aka "gradInput") |
| 146 | +OPERATOR_SCHEMA(BatchPermutationGradient).NumInputs(2).NumOutputs(1); |
| 147 | + |
| 148 | +class GetBatchPermutationGradient : public GradientMakerBase { |
| 149 | + using GradientMakerBase::GradientMakerBase; |
| 150 | + vector<OperatorDef> GetGradientDefs() override { |
| 151 | + return SingleGradientDef( |
| 152 | + "BatchPermutationGradient", |
| 153 | + "", |
| 154 | + vector<string>{I(1), GO(0)}, |
| 155 | + vector<string>{GI(0)}); |
| 156 | + } |
| 157 | +}; |
| 158 | + |
| 159 | +REGISTER_GRADIENT(BatchPermutation, GetBatchPermutationGradient); |
| 160 | + |
| 161 | +} // namespace caffe2 |
| 162 | + |
| 163 | +using BatchPermutationOpFloatCPU = |
| 164 | + caffe2::BatchPermutationOp<float, caffe2::CPUContext>; |
| 165 | + |
| 166 | +C10_EXPORT_CAFFE2_OP_TO_C10_CPU( |
| 167 | + BatchPermutation, |
| 168 | + "_caffe2::BatchPermutation(Tensor X, Tensor indices) -> Tensor", |
| 169 | + BatchPermutationOpFloatCPU); |
0 commit comments