Commit 398e8ba

ppwwyyxx authored and wat3rBro committed
Include two caffe2 ops in v1.4.0 (#31716)
* move AliasWithNameOp to caffe2/operators

  Summary: Pull Request resolved: #31281
  Reviewed By: houseroad
  Differential Revision: D19053453
  fbshipit-source-id: 350bfd5c001db9c17916dcae7ade8f56db1e9841

* move BatchPermutationOp to caffe2/operators

  Summary: Pull Request resolved: #31350
  Reviewed By: houseroad
  Differential Revision: D19053527
  fbshipit-source-id: 50d11f137d0f5c07e8ad899a3a84d56a042bbc32

Co-authored-by: wat3rBro <wangyanghan6@gmail.com>
1 parent 074b30c commit 398e8ba

12 files changed: 711 additions, 296 deletions
caffe2/operators/alias_with_name.cc

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
#include "caffe2/operators/alias_with_name.h"

namespace caffe2 {

REGISTER_CPU_OPERATOR(AliasWithName, AliasWithNameOp<CPUContext>);

OPERATOR_SCHEMA(AliasWithName)
    .NumInputs(1)
    .NumOutputs(1)
    .AllowInplace({{0, 0}})
    .IdenticalTypeAndShape()
    .SetDoc(R"DOC(
Similar to AliasOp, but stores the alias name as an operator argument.
)DOC")
    .Arg("name", "name of the alias")
    .Arg("is_backward", "whether to alias the forward or the backward tensor")
    .Input(0, "input", "Input tensor whose storage will be shared.")
    .Output(0, "output", "Tensor of same shape as input, sharing its storage.");

} // namespace caffe2

C10_EXPORT_CAFFE2_OP_TO_C10_CPU(
    AliasWithName,
    "_caffe2::AliasWithName(Tensor input, str name, bool is_backward = False) -> (Tensor output)",
    caffe2::AliasWithNameOp<caffe2::CPUContext>);
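For illustration, a minimal sketch of driving the newly registered CPU op from C++. This is not part of the commit: the blob names "X"/"Y" and the alias string "my_alias" are made up, and error handling is omitted.

#include <string>
#include <vector>

#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"
#include "caffe2/utils/proto_utils.h"

int main() {
  caffe2::Workspace ws;

  // Create and allocate an input blob (names are hypothetical).
  auto* t = BlobGetMutableTensor(ws.CreateBlob("X"), caffe2::CPU);
  t->Resize(2, 3);
  t->mutable_data<float>();

  // "name" is required by the schema above; "is_backward" defaults to false.
  auto def = caffe2::CreateOperatorDef(
      "AliasWithName",
      "",
      std::vector<std::string>{"X"},
      std::vector<std::string>{"Y"},
      std::vector<caffe2::Argument>{
          caffe2::MakeArgument<std::string>("name", "my_alias")});
  auto op = caffe2::CreateOperator(def, &ws);
  op->Run();

  // "Y" now shares storage with "X" (see AliasWithNameOp::RunOnDevice).
  return 0;
}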
caffe2/operators/alias_with_name.cu

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/alias_with_name.h"

namespace caffe2 {

REGISTER_CUDA_OPERATOR(AliasWithName, AliasWithNameOp<CUDAContext>);

} // namespace caffe2

C10_EXPORT_CAFFE2_OP_TO_C10_CUDA(
    AliasWithName,
    caffe2::AliasWithNameOp<caffe2::CUDAContext>);

caffe2/operators/alias_with_name.h

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
#ifndef ALIAS_WITH_NAME_OP_H_
#define ALIAS_WITH_NAME_OP_H_

#include "caffe2/core/context.h"
#include "caffe2/core/export_caffe2_op_to_c10.h"
#include "caffe2/core/operator.h"

C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(AliasWithName)

namespace caffe2 {

template <class Context>
class AliasWithNameOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  template <class... Args>
  explicit AliasWithNameOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...),
        name_(this->template GetSingleArgument<std::string>(
            "name",
            "invalid_name")),
        is_backward_(
            this->template GetSingleArgument<bool>("is_backward", false)) {
    CAFFE_ENFORCE(
        OperatorBase::HasArgument("name"), "You have to specify argument name");
  }

  bool RunOnDevice() override {
    auto& input = Input(0);
    CAFFE_ENFORCE_GE(input.numel(), 0, "Tensor is not initialized");

    // This doesn't work anymore as this is a "newstyle" operator:
    // OutputTensorAlias(0, input);

    OperatorBase::SetOutputTensor(0, input.Alias());
    return true;
  }

 protected:
  std::string name_;
  bool is_backward_;
};

} // namespace caffe2

#endif // ALIAS_WITH_NAME_OP_H_
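The interesting part of RunOnDevice is `OperatorBase::SetOutputTensor(0, input.Alias())`: `Tensor::Alias()` hands back a tensor that shares the input's storage instead of copying it, which is what makes this op a zero-copy alias. A standalone sketch of that contract (illustration only, assuming the caffe2 C++ tensor API):

#include <cassert>
#include <vector>

#include "caffe2/core/tensor.h"

void alias_shares_storage() {
  caffe2::Tensor t(std::vector<int64_t>{2, 3}, caffe2::CPU);
  float* data = t.mutable_data<float>();

  caffe2::Tensor view = t.Alias();
  // Same underlying buffer: writes through `t` are visible through `view`.
  assert(view.data<float>() == data);
}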
caffe2/operators/batch_permutation_op.cc

Lines changed: 169 additions & 0 deletions
@@ -0,0 +1,169 @@
#include "caffe2/operators/batch_permutation_op.h"

#include <cstring>
#include <vector>

#ifdef CAFFE2_USE_MKLDNN
#include <caffe2/ideep/operators/operator_fallback_ideep.h>
#include <caffe2/ideep/utils/ideep_operator.h>
#endif

namespace caffe2 {

template <bool forwards>
void batch_permutation_loop(
    const int N,
    const int K,
    const float* src,
    const int* indices,
    float* dst) {
  long numBytes = K * sizeof(float);
  if (forwards) {
#ifdef _OPENMP
#if (_OPENMP >= 201307)
#pragma omp parallel for simd
#else
#pragma omp parallel for
#endif
#endif
    for (int n = 0; n < N; n++) {
      int origIdx = n * K;
      int permuteIdx = indices[n] * K;
      std::memcpy(dst + origIdx, src + permuteIdx, numBytes);
    }
  } else {
    std::vector<int> backward_indices(N);
    for (int i = 0; i < N; ++i) {
      backward_indices[indices[i]] = i;
    }
    for (int n = 0; n < N; n++) {
      int permuteIdx = n * K;
      int origIdx = backward_indices[n] * K;
      std::memcpy(dst + permuteIdx, src + origIdx, numBytes);
    }
  }
}

template <>
bool BatchPermutationOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& indices = Input(1);

  CAFFE_ENFORCE(indices.dim() == 1, "indices must be 1-d");
  CAFFE_ENFORCE(
      X.dim32(0) == indices.dim32(0),
      "X.dim32(0) must be equal to indices.dim32(0)",
      "(",
      X.dim32(0),
      " vs. ",
      indices.dim32(0),
      ")");

  auto* Y = Output(0, X.sizes(), at::dtype<float>());

  CAFFE_ENFORCE_GT(X.dim32(0), 0);
  batch_permutation_loop<true>(
      X.dim32(0),
      X.numel() / X.dim32(0),
      X.data<float>(),
      indices.data<int>(),
      Y->mutable_data<float>());
  return true;
}

template <>
bool BatchPermutationGradientOp<float, CPUContext>::RunOnDevice() {
  auto& indices = Input(0);
  auto& dY = Input(1);

  auto* dX = Output(0, dY.sizes(), at::dtype<float>());

  CAFFE_ENFORCE_GT(dY.dim32(0), 0);
  batch_permutation_loop<false>(
      dY.dim32(0),
      dY.numel() / dY.dim32(0),
      dY.data<float>(),
      indices.data<int>(),
      dX->mutable_data<float>());
  return true;
}

#ifdef CAFFE2_USE_MKLDNN
REGISTER_IDEEP_OPERATOR(
    BatchPermutation,
    IDEEPFallbackOp<BatchPermutationOp<float, CPUContext>>);
#endif

REGISTER_CPU_OPERATOR(BatchPermutation, BatchPermutationOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    BatchPermutationGradient,
    BatchPermutationGradientOp<float, CPUContext>);

// Input: X, indices; Output: Y
OPERATOR_SCHEMA(BatchPermutation)
    .NumInputs(2)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Batch permutation of an input tensor X given input indices. First dimension of
X equals batch size N. The indices input stores a permutation of the batch
indices 0, ..., N-1. The output Y is a tensor of the same shape as X, with data
re-ordered within the batch according to the indices.

Example of batch permutation on a 2-D tensor with batch size 4:
X = [
  [1, 5, 2, 3, 4, 6, 0],
  [4, 3, 3, 5, 2, 3, 1],
  [2, 2, 3, 6, 0, 0, 1],
  [0, 0, 1, 1, 2, 2, 3]
]
indices = [2, 0, 1, 3]
Y = [
  [2, 2, 3, 6, 0, 0, 1],
  [1, 5, 2, 3, 4, 6, 0],
  [4, 3, 3, 5, 2, 3, 1],
  [0, 0, 1, 1, 2, 2, 3]
]

Example of batch permutation on a 3-D tensor with batch size 4:
X = [
  [[1, 5, 2], [3, 4, 6, 0]],
  [[4, 3, 3], [5, 2, 3, 1]],
  [[2, 2, 3], [6, 0, 0, 1]],
  [[0, 0, 1], [1, 2, 2, 3]]
]
indices = [2, 0, 1, 3]
Y = [
  [[2, 2, 3], [6, 0, 0, 1]],
  [[1, 5, 2], [3, 4, 6, 0]],
  [[4, 3, 3], [5, 2, 3, 1]],
  [[0, 0, 1], [1, 2, 2, 3]]
]
)DOC")
    .Input(0, "X", "Input tensor, where the 1st dimension equals the batch size")
    .Input(1, "indices", "Input indices of the batch to permute")
    .Output(0, "Y", "Output permuted tensor");
// Input: indices, dY (aka "gradOutput"); Output: dX (aka "gradInput")
OPERATOR_SCHEMA(BatchPermutationGradient).NumInputs(2).NumOutputs(1);

class GetBatchPermutationGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "BatchPermutationGradient",
        "",
        vector<string>{I(1), GO(0)},
        vector<string>{GI(0)});
  }
};

REGISTER_GRADIENT(BatchPermutation, GetBatchPermutationGradient);

} // namespace caffe2

using BatchPermutationOpFloatCPU =
    caffe2::BatchPermutationOp<float, caffe2::CPUContext>;

C10_EXPORT_CAFFE2_OP_TO_C10_CPU(
    BatchPermutation,
    "_caffe2::BatchPermutation(Tensor X, Tensor indices) -> Tensor",
    BatchPermutationOpFloatCPU);
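To make the loop above concrete: in the forward direction, row n of the output is row indices[n] of the input. A host-side sketch (the helper name forward_permute is made up) that reproduces the schema's 2-D example when K is the row width:

#include <vector>

std::vector<float> forward_permute(
    const std::vector<float>& X,
    const std::vector<int>& indices,
    int K) {
  const int N = static_cast<int>(indices.size());
  std::vector<float> Y(N * K);
  for (int n = 0; n < N; ++n) {
    for (int k = 0; k < K; ++k) {
      Y[n * K + k] = X[indices[n] * K + k]; // gather row indices[n]
    }
  }
  return Y;
}

// With K = 7 and indices = {2, 0, 1, 3}, the rows of Y come out as
// X[2], X[0], X[1], X[3], matching the first example in the schema doc.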
caffe2/operators/batch_permutation_op.cu

Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/batch_permutation_op.h"

namespace caffe2 {

namespace {
template <bool forward>
__global__ void BatchPermutationKernel(
    int N,
    int K,
    const float* src,
    const int* indices,
    float* dst) {
  if (forward) {
    CUDA_1D_KERNEL_LOOP(index, N * K) {
      int k = index % K;
      int n = index / K;
      int idx = indices[n];
      CUDA_KERNEL_ASSERT(idx >= 0);
      CUDA_KERNEL_ASSERT(idx < N);
      dst[index] = src[idx * K + k];
    }
  } else {
    CUDA_1D_KERNEL_LOOP(index, N * K) {
      int k = index % K;
      int n = index / K;

      // NOTE: an alternative implementation if we want to align the index with
      // the output tensor (rather than the input tensor):
      // int idx = -1;
      // for (size_t i = 0; i < N; ++i) {
      //   if (indices[i] == n) {
      //     idx = i;
      //   }
      // }
      // CUDA_KERNEL_ASSERT(idx >= 0);
      // CUDA_KERNEL_ASSERT(idx < N);
      // dst[index] = src[idx * K + k];

      int idx = indices[n];
      CUDA_KERNEL_ASSERT(idx >= 0);
      CUDA_KERNEL_ASSERT(idx < N);
      dst[idx * K + k] = src[index];
    }
  }
}
} // namespace

template <>
bool BatchPermutationOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& indices = Input(1);

  CAFFE_ENFORCE(indices.dim() == 1, "indices must be 1-d");
  CAFFE_ENFORCE(
      X.dim32(0) == indices.dim32(0),
      "X.dim32(0) must be equal to indices.dim32(0)",
      "(",
      X.dim32(0),
      " vs. ",
      indices.dim32(0),
      ")");

  auto* Y = Output(0, X.sizes(), at::dtype<float>());

  CAFFE_ENFORCE_GT(X.dim32(0), 0);
  BatchPermutationKernel<true>
      <<<CAFFE_GET_BLOCKS(X.numel()),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context_.cuda_stream()>>>(
          X.dim32(0),
          X.numel() / X.dim32(0),
          X.data<float>(),
          indices.data<int>(),
          Y->mutable_data<float>());

  return true;
}

template <>
bool BatchPermutationGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& indices = Input(0);
  auto& dY = Input(1);
  auto* dX = Output(0, dY.sizes(), at::dtype<float>());

  CAFFE_ENFORCE_GT(dY.dim32(0), 0);
  BatchPermutationKernel<false>
      <<<CAFFE_GET_BLOCKS(dY.numel()),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context_.cuda_stream()>>>(
          dY.dim32(0),
          dY.numel() / dY.dim32(0),
          dY.data<float>(),
          indices.data<int>(),
          dX->mutable_data<float>());

  return true;
}

REGISTER_CUDA_OPERATOR(
    BatchPermutation,
    BatchPermutationOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(
    BatchPermutationGradient,
    BatchPermutationGradientOp<float, CUDAContext>);
} // namespace caffe2

using BatchPermutationOpFloatCUDA =
    caffe2::BatchPermutationOp<float, caffe2::CUDAContext>;

C10_EXPORT_CAFFE2_OP_TO_C10_CUDA(BatchPermutation, BatchPermutationOpFloatCUDA);
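One design note: the CPU gradient path first materializes the inverse permutation (`backward_indices`), while the CUDA backward kernel avoids that by scattering each gradient row directly to position `indices[n]` (`dst[idx * K + k] = src[index]`). A host-side sketch of the equivalent scatter formulation (the helper name backward_scatter is made up):

#include <vector>

std::vector<float> backward_scatter(
    const std::vector<float>& dY,
    const std::vector<int>& indices,
    int K) {
  const int N = static_cast<int>(indices.size());
  std::vector<float> dX(N * K);
  for (int n = 0; n < N; ++n) {
    for (int k = 0; k < K; ++k) {
      dX[indices[n] * K + k] = dY[n * K + k]; // scatter: no inverse table
    }
  }
  return dX;
}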
