-
Notifications
You must be signed in to change notification settings - Fork 21.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add benchmark for per channel tensor quantization
Summary: Currently on mobile only per-tensor quantization is optimized using ARM intrinsics. This benchmark is added to help gauge the performance improvement on mobile after performing the same optimizations for per-channel quantization.

Test Plan: Build and push to a mobile device:

```
BUILD_MOBILE_BENCHMARK=1 BUILD_MOBILE_TEST=1 ANDROID_DEBUG_SYMBOLS=1 BUILD_PYTORCH_MOBILE=1 ANDROID_ABI=arm64-v8a ./scripts/build_android.sh -DANDROID_CCACHE=$(which ccache) -DBUILD_BINARY=ON
adb push build_android/bin/quantize_per_channel /data/local/tmp
```

then run the benchmark binary over `adb shell`.

Reviewers: kimishpatel

Subscribers:

Tasks:

Tags:

ghstack-source-id: 76349f41ac7e6b2444d3883bf8f7c5ac3d8d43c9
Pull Request resolved: #46017
- Loading branch information
Showing
2 changed files
with
87 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
#include <ATen/ATen.h>

#include <benchmark/benchmark.h>

#include <cstdint>
#include <iostream>
|
||
// Benchmarks per-channel quantization of a contiguous (NCHW) 4-D float
// tensor to QUInt8, quantizing along the channel axis (dim 1).
// Sizes N/C/H/W come from the benchmark arguments (see GenerateSizes4d).
static void quantize_per_channel_4d_contiguous(benchmark::State& state) {
  // state.range() already returns int64_t; keeping that type avoids
  // narrowing a size_t back into the int64_t braced size lists below.
  const int64_t batches = state.range(0);
  const int64_t channels = state.range(1);
  const int64_t height = state.range(2);
  const int64_t width = state.range(3);

  at::Tensor a = at::rand({batches, channels, height, width});
  at::Tensor scales = at::rand({channels});
  // One integer zero point per channel, drawn from [0, 10).
  at::Tensor zero_points = at::randint(
      0, 10, {channels}, at::TensorOptions().dtype(at::ScalarType::Int));

  at::Tensor qa;
  for (auto _ : state) {
    qa = at::native::quantize_per_channel_cpu(
        a, scales, zero_points, 1 /*axis*/, at::ScalarType::QUInt8);
    // Keep the result observable so the timed work cannot be
    // dead-code-eliminated by the optimizer.
    benchmark::DoNotOptimize(qa);
  }
}
|
||
// Benchmarks per-channel quantization of a ChannelsLast (NHWC) 4-D float
// tensor to QUInt8, quantizing along the channel axis (dim 1).
// Sizes N/C/H/W come from the benchmark arguments (see GenerateSizes4d).
static void quantize_per_channel_4d_channels_last(benchmark::State& state) {
  // state.range() already returns int64_t; keeping that type avoids
  // narrowing a size_t back into the int64_t braced size lists below.
  const int64_t batches = state.range(0);
  const int64_t channels = state.range(1);
  const int64_t height = state.range(2);
  const int64_t width = state.range(3);

  at::Tensor a = at::rand(
      {batches, channels, height, width},
      at::TensorOptions().memory_format(at::MemoryFormat::ChannelsLast));
  at::Tensor scales = at::rand({channels});
  // One integer zero point per channel, drawn from [0, 10).
  at::Tensor zero_points = at::randint(
      0, 10, {channels}, at::TensorOptions().dtype(at::ScalarType::Int));

  at::Tensor qa;
  for (auto _ : state) {
    qa = at::native::quantize_per_channel_cpu(
        a, scales, zero_points, 1 /*axis*/, at::ScalarType::QUInt8);
    // Keep the result observable so the timed work cannot be
    // dead-code-eliminated by the optimizer.
    benchmark::DoNotOptimize(qa);
  }
}
|
||
// Benchmarks per-channel quantization of a 2-D float tensor to QUInt8,
// quantizing along dim 0 (each row shares one scale/zero-point pair).
// Sizes C/N come from the benchmark arguments (see GenerateSizes2d).
static void quantize_per_channel_2d(benchmark::State& state) {
  // state.range() already returns int64_t; keeping that type avoids
  // narrowing a size_t back into the int64_t braced size lists below.
  const int64_t channels = state.range(0);
  const int64_t nelem = state.range(1);

  at::Tensor a = at::rand({channels, nelem});
  at::Tensor scales = at::rand({channels});
  // One integer zero point per channel, drawn from [0, 10).
  at::Tensor zero_points = at::randint(
      0, 10, {channels}, at::TensorOptions().dtype(at::ScalarType::Int));

  at::Tensor qa;
  for (auto _ : state) {
    qa = at::native::quantize_per_channel_cpu(
        a, scales, zero_points, 0 /*axis*/, at::ScalarType::QUInt8);
    // Keep the result observable so the timed work cannot be
    // dead-code-eliminated by the optimizer.
    benchmark::DoNotOptimize(qa);
  }
}
|
||
// Registers 4-D problem sizes as {N, C, H, W}, sweeping powers of two
// (H and W are kept equal).
static void GenerateSizes4d(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "C", "H", "W"});

  // int64_t matches Benchmark::Args' element type; size_t loop variables
  // would be a signed/unsigned narrowing conversion inside the braced
  // initializer list passed to Args().
  for (int64_t n = 16; n < 256; n *= 2) {
    for (int64_t c = 4; c < 256; c *= 2) {
      for (int64_t hw = 4; hw < 256; hw *= 2) {
        b->Args({n, c, hw, hw});
      }
    }
  }
}
|
||
// Registers 2-D problem sizes as {C, N} (channels x elements-per-channel),
// sweeping powers of two.
static void GenerateSizes2d(benchmark::internal::Benchmark* b) {
  b->ArgNames({"C", "N"});

  // int64_t matches Benchmark::Args' element type; size_t loop variables
  // would be a signed/unsigned narrowing conversion inside the braced
  // initializer list passed to Args().
  for (int64_t c = 4; c < 512; c *= 2) {
    for (int64_t n = 4; n < 512; n *= 2) {
      b->Args({c, n});
    }
  }
}
|
||
// Register each benchmark with its size generator; google-benchmark
// supplies the program entry point via BENCHMARK_MAIN().
BENCHMARK(quantize_per_channel_2d)->Apply(GenerateSizes2d);
BENCHMARK(quantize_per_channel_4d_contiguous)->Apply(GenerateSizes4d);
BENCHMARK(quantize_per_channel_4d_channels_last)->Apply(GenerateSizes4d);
BENCHMARK_MAIN();