Mobile Backend: NHWC memory layout + XNNPACK integration.
Ashkan Aliabadi committed Feb 18, 2020
1 parent 4468a7b commit 4b95293
Showing 22 changed files with 1,081 additions and 170 deletions.
2 changes: 1 addition & 1 deletion .circleci/scripts/binary_ios_upload.sh
@@ -14,7 +14,7 @@ mkdir -p ${ZIP_DIR}/src
cp -R ${ARTIFACTS_DIR}/arm64/include ${ZIP_DIR}/install/
# build a FAT binary
cd ${ZIP_DIR}/install/lib
-target_libs=(libc10.a libclog.a libcpuinfo.a libeigen_blas.a libpytorch_qnnpack.a libtorch_cpu.a libtorch.a)
+target_libs=(libc10.a libclog.a libcpuinfo.a libeigen_blas.a libpytorch_qnnpack.a libtorch_cpu.a libtorch.a libXNNPACK.a)
for lib in ${target_libs[*]}
do
if [ -f "${ARTIFACTS_DIR}/x86_64/lib/${lib}" ] && [ -f "${ARTIFACTS_DIR}/arm64/lib/${lib}" ]; then
10 changes: 7 additions & 3 deletions .gitmodules
@@ -111,10 +111,14 @@
path = third_party/foxi
url = https://github.com/houseroad/foxi.git
[submodule "third_party/tbb"]
-	path = third_party/tbb
-	url = https://github.com/01org/tbb
-	branch = tbb_2018
+	path = third_party/tbb
+	url = https://github.com/01org/tbb
+	branch = tbb_2018
[submodule "android/libs/fbjni"]
ignore = dirty
path = android/libs/fbjni
url = https://github.com/facebookincubator/fbjni.git
[submodule "third_party/XNNPACK"]
path = third_party/XNNPACK
url = https://github.com/AshkanAliabadi/XNNPACK.git
branch = xnnpack_pytorch_merge_temp
5 changes: 5 additions & 0 deletions CMakeLists.txt
@@ -185,6 +185,7 @@ option(USE_SNPE "Use Qualcomm's SNPE library" OFF)
option(USE_SYSTEM_EIGEN_INSTALL
"Use system Eigen instead of the one under third_party" OFF)
option(USE_TENSORRT "Using Nvidia TensorRT library" OFF)
option(USE_XNNPACK "Use XNNPACK" ON)
option(USE_ZMQ "Use ZMQ" OFF)
option(USE_ZSTD "Use ZSTD" OFF)
cmake_dependent_option(
@@ -415,6 +416,10 @@ if(USE_PYTORCH_QNNPACK)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_PYTORCH_QNNPACK")
endif()

if(USE_XNNPACK)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_XNNPACK")
endif()

# ---[ Whitelist file if whitelist is specified
include(cmake/Whitelist.cmake)

3 changes: 3 additions & 0 deletions android/pytorch_android/CMakeLists.txt
@@ -83,6 +83,7 @@ if (ANDROID_ABI)
import_static_lib(libtorch_cpu)
import_static_lib(libc10)
import_static_lib(libnnpack)
import_static_lib(libXNNPACK)
import_static_lib(libpytorch_qnnpack)
import_static_lib(libeigen_blas)
import_static_lib(libcpuinfo)
@@ -98,6 +99,7 @@ if (ANDROID_ABI)
-Wl,--no-whole-archive
libc10
libnnpack
libXNNPACK
libpytorch_qnnpack
libeigen_blas
libcpuinfo
@@ -113,6 +115,7 @@ else()
torch_cpu
c10
nnpack
XNNPACK
pytorch_qnnpack
cpuinfo
clog
5 changes: 4 additions & 1 deletion aten/src/ATen/CMakeLists.txt
@@ -84,8 +84,11 @@ FILE(GLOB native_sparse_hip_cpp "native/sparse/hip/*.cpp")
FILE(GLOB native_quantized_hip_hip "native/quantized/hip/*.hip")
FILE(GLOB native_quantized_hip_cpp "native/quantized/hip/*.cpp")

# XNNPACK
FILE(GLOB native_xnnpack "native/xnnpack/*.cpp")

add_subdirectory(quantized)
-set(all_cpu_cpp ${base_cpp} ${ATen_CORE_SRCS} ${native_cpp} ${native_sparse_cpp} ${native_quantized_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp} ${generated_cpp} ${core_generated_cpp} ${ATen_CPU_SRCS} ${ATen_QUANTIZED_SRCS} ${cpu_kernel_cpp})
+set(all_cpu_cpp ${base_cpp} ${ATen_CORE_SRCS} ${native_cpp} ${native_sparse_cpp} ${native_quantized_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp} ${native_xnnpack} ${generated_cpp} ${core_generated_cpp} ${ATen_CPU_SRCS} ${ATen_QUANTIZED_SRCS} ${cpu_kernel_cpp})
if(AT_MKL_ENABLED)
set(all_cpu_cpp ${all_cpu_cpp} ${mkl_cpp})
endif()
8 changes: 8 additions & 0 deletions aten/src/ATen/native/native_functions.yaml
@@ -775,6 +775,10 @@

- func: conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor

- func: _conv2d_prepack(Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1, float? output_min=None, float? output_max=None) -> Tensor

- func: _conv2d_packed(Tensor packed_weight, Tensor input) -> Tensor

- func: conv3d(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1, int groups=1) -> Tensor

- func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int pad=0) -> Tensor
@@ -1563,6 +1567,10 @@
- func: linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
python_module: nn

- func: _linear_prepack(Tensor weight, Tensor? bias=None, float? output_min=None, float? output_max=None) -> Tensor

- func: _linear_packed(Tensor packed_weight, Tensor input) -> Tensor

- func: mkldnn_linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
python_module: nn
dispatch:
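These schemas split inference into a one-time weight-packing step (_conv2d_prepack, _linear_prepack) and a lightweight step that consumes the packed weights (_conv2d_packed, _linear_packed). A hypothetical C++ call sequence for the conv2d pair, sketched from the schema above — the generated at:: wrappers and exact argument bindings are assumptions, not part of this diff:

#include <ATen/ATen.h>

at::Tensor conv2d_with_prepacked_weight(
    const at::Tensor& input,
    const at::Tensor& weight,
    const at::Tensor& bias) {
  // Pack (and potentially reorder) the weight once, ahead of time...
  const at::Tensor packed = at::_conv2d_prepack(
      weight, bias, /*stride=*/{1, 1}, /*padding=*/{0, 0},
      /*dilation=*/{1, 1}, /*groups=*/1);
  // ...then reuse the packed representation across inference calls.
  return at::_conv2d_packed(packed, input);
}
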
56 changes: 56 additions & 0 deletions aten/src/ATen/native/utils/Allocator.h
@@ -0,0 +1,56 @@
#pragma once

#include <c10/core/CPUAllocator.h>

namespace at {
namespace native {

// QNNPACK and XNNPACK may access the input and / or output tensors out of
// bounds. This behavior will trigger ASAN, and may result in a segfault if
// the accessed memory just so happens to fall on a page the current process
// has no read access to. Here we define a custom allocator that allocates
// the extra storage required to keep this behavior safe.
//
// PreGuardBytes: Number of guard bytes to allocate before the allocation.
// PostGuardBytes: Number of guard bytes to allocate after the allocation.

template <uint32_t PreGuardBytes, uint32_t PostGuardBytes>
class GuardingAllocator final : public at::Allocator {
public:
GuardingAllocator() = default;
virtual ~GuardingAllocator() override = default;

static void deleter(void* pointer) {
const Cast memory{pointer};
c10::free_cpu(memory.as_byte_ptr - kPreGuardBytes);
}

virtual DataPtr allocate(size_t nbytes) const override {
Cast memory{c10::alloc_cpu(kPreGuardBytes + nbytes + kPostGuardBytes)};
memory.as_byte_ptr += kPreGuardBytes;

return {
memory.as_void_ptr,
memory.as_void_ptr,
&deleter,
at::Device(DeviceType::CPU),
};
}

virtual DeleterFnPtr raw_deleter() const override {
return deleter;
}

private:
static constexpr uint32_t kPreGuardBytes = PreGuardBytes;
static constexpr uint32_t kPostGuardBytes = PostGuardBytes;

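// Views one pointer both as void* (the type the Allocator interface
// traffics in) and as uint8_t* (for guard-byte pointer arithmetic).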
union Cast final {
void * const as_void_ptr;
uint8_t * as_byte_ptr;
};
};

} // namespace native
} // namespace at

82 changes: 82 additions & 0 deletions aten/src/ATen/native/xnnpack/Common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#pragma once

#include <ATen/ATen.h>

#ifdef USE_XNNPACK

#include <xnnpack.h>

namespace at {
namespace native {
namespace xnnpack {
namespace internal {

struct Layout final {
// 4D Activation Maps
struct Activation4D final {
static constexpr size_t batch = 0u;
static constexpr size_t channels = 1u;
static constexpr size_t height = 2u;
static constexpr size_t width = 3u;
};

// ND Activation Maps
struct ActivationND final {
// Some operators are not limited to 4-dimensional tensors. In that scenario,
// XNNPACK denotes such an operator with an _nc suffix and expects all
// dimensions, except channels, to be flattened into one argument: batch_size.
static int64_t batch(const IntArrayRef tensor) {
if (C10_UNLIKELY(tensor.empty())) {
return -1;
}

// Handle the case where batch size is zero.
int64_t batch = std::max<int64_t>(1, tensor[0]);

for (size_t index = 1u; index < (tensor.size() - 1u); ++index) {
batch *= tensor[index];
}

return batch;
};

static int64_t channel(const IntArrayRef tensor) {
if (C10_UNLIKELY(tensor.empty())) {
return -1;
}

return tensor.back();
};
};

// Convolution Filters
struct Filter final {
static constexpr size_t output = 0u;
static constexpr size_t input = 1u;
static constexpr size_t height = 2u;
static constexpr size_t width = 3u;
};

// Parameters (Pooling Kernels, Dilation, Padding, Stride, etc.)
struct Parameter final {
static constexpr size_t height = 0u;
static constexpr size_t width = 1u;
};
};

struct Deleter final {
void operator()(const xnn_operator_t op) const {
xnn_delete_operator(op);
}
};

using Operator = std::unique_ptr<xnn_operator, Deleter>;

bool available();

} // namespace internal
} // namespace xnnpack
} // namespace native
} // namespace at

#endif /* USE_XNNPACK */
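
To make the flattening concrete, an illustrative example (not part of this commit; assumes USE_XNNPACK is enabled):

#include <ATen/native/xnnpack/Common.h>
#include <vector>

void layout_example() {
  using Layout = at::native::xnnpack::internal::Layout;
  // Every dimension except the last folds into XNNPACK's batch_size;
  // the last dimension is the channel count.
  const std::vector<int64_t> shape{2, 3, 4, 5};
  const int64_t batch = Layout::ActivationND::batch(shape);      // 2 * 3 * 4 = 24
  const int64_t channels = Layout::ActivationND::channel(shape); // 5
}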