Skip to content

Commit

Permalink
Add benchmark for per channel tensor quantization
Browse files Browse the repository at this point in the history
Summary: Currently, only per-tensor quantization is optimized on mobile using ARM
intrinsics. This benchmark is added to help gauge the performance improvement after
performing the same optimizations for per-channel quantization.

Test Plan:
Build and push to mobile device
```
BUILD_MOBILE_BENCHMARK=1 BUILD_MOBILE_TEST=1 ANDROID_DEBUG_SYMBOLS=1 BUILD_PYTORCH_MOBILE=1 ANDROID_ABI=arm64-v8a ./scripts/build_android.sh  -DANDROID_CCACHE=$(which ccache) -DBUILD_BINARY=ON
adb push build_android/bin/quantize_per_channel /data/local/tmp
```
and then run the benchmark binary over adb shell

Reviewers: kimishpatel

Subscribers:

Tasks:

Tags:

ghstack-source-id: 76349f41ac7e6b2444d3883bf8f7c5ac3d8d43c9
Pull Request resolved: #46017
  • Loading branch information
ajliu committed Oct 13, 2020
1 parent 95ccf34 commit d071ec1
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 0 deletions.
2 changes: 2 additions & 0 deletions aten/src/ATen/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,8 @@ endif()

# Mobile micro-benchmark sources (compiled when BUILD_MOBILE_BENCHMARK is set).
# One list(APPEND) call keeps all benchmark sources grouped and easy to extend.
list(APPEND ATen_MOBILE_BENCHMARK_SRCS
  ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/tensor_add.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/quantize_per_channel.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/stateful_conv1d.cpp)

Expand Down
85 changes: 85 additions & 0 deletions aten/src/ATen/benchmarks/quantize_per_channel.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#include <ATen/ATen.h>
#include <iostream>

#include <benchmark/benchmark.h>

// Benchmarks per-channel quantization of a contiguous (NCHW) float tensor.
// Args: N (batch), C (channels), H, W — supplied by GenerateSizes4d.
static void quantize_per_channel_4d_contiguous(benchmark::State& state) {
  const size_t batches = static_cast<size_t>(state.range(0));
  const size_t channels = static_cast<size_t>(state.range(1));
  const size_t height = static_cast<size_t>(state.range(2));
  const size_t width = static_cast<size_t>(state.range(3));

  // Per-channel quantization needs one scale and one zero point per channel
  // along the quantization axis (axis 1 below).
  at::Tensor a = at::rand({batches, channels, height, width});
  at::Tensor scales = at::rand({channels});
  at::Tensor zero_points = at::randint(
      0, 10, {channels}, at::TensorOptions().dtype(at::ScalarType::Int));

  at::Tensor qa;
  for (auto _ : state) {
    qa = at::native::quantize_per_channel_cpu(
        a, scales, zero_points, 1, at::ScalarType::QUInt8);
    // Keep the result observable so the compiler cannot dead-code-eliminate
    // the quantization being measured.
    benchmark::DoNotOptimize(qa);
  }
}

// Benchmarks per-channel quantization of a channels-last (NHWC) float tensor,
// exercising the non-contiguous-along-channel memory layout path.
// Args: N (batch), C (channels), H, W — supplied by GenerateSizes4d.
static void quantize_per_channel_4d_channels_last(benchmark::State& state) {
  const size_t batches = static_cast<size_t>(state.range(0));
  const size_t channels = static_cast<size_t>(state.range(1));
  const size_t height = static_cast<size_t>(state.range(2));
  const size_t width = static_cast<size_t>(state.range(3));

  at::Tensor a = at::rand(
      {batches, channels, height, width},
      at::TensorOptions().memory_format(at::MemoryFormat::ChannelsLast));
  at::Tensor scales = at::rand({channels});
  at::Tensor zero_points = at::randint(
      0, 10, {channels}, at::TensorOptions().dtype(at::ScalarType::Int));

  at::Tensor qa;
  for (auto _ : state) {
    qa = at::native::quantize_per_channel_cpu(
        a, scales, zero_points, 1, at::ScalarType::QUInt8);
    // Keep the result observable so the compiler cannot dead-code-eliminate
    // the quantization being measured.
    benchmark::DoNotOptimize(qa);
  }
}

// Benchmarks per-channel quantization of a 2-D float tensor with the
// quantization axis 0 (one scale/zero point per row).
// Args: C (channels/rows), N (elements per channel) — supplied by GenerateSizes2d.
static void quantize_per_channel_2d(benchmark::State& state) {
  const size_t channels = static_cast<size_t>(state.range(0));
  const size_t nelem = static_cast<size_t>(state.range(1));

  at::Tensor a = at::rand({channels, nelem});
  at::Tensor scales = at::rand({channels});
  at::Tensor zero_points = at::randint(
      0, 10, {channels}, at::TensorOptions().dtype(at::ScalarType::Int));

  at::Tensor qa;
  for (auto _ : state) {
    qa = at::native::quantize_per_channel_cpu(
        a, scales, zero_points, 0, at::ScalarType::QUInt8);
    // Keep the result observable so the compiler cannot dead-code-eliminate
    // the quantization being measured.
    benchmark::DoNotOptimize(qa);
  }
}

// Registers the 4-D argument grid: powers of two for batch in [16, 128],
// channels in [4, 128], and square spatial extents in [4, 128].
static void GenerateSizes4d(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "C", "H", "W"});

  for (size_t batch = 16; batch < 256; batch *= 2) {
    for (size_t chan = 4; chan < 256; chan *= 2) {
      for (size_t side = 4; side < 256; side *= 2) {
        // Height and width are always equal (square feature maps).
        b->Args({batch, chan, side, side});
      }
    }
  }
}

// Registers the 2-D argument grid: powers of two for channels in [4, 256]
// and per-channel element counts in [4, 256].
static void GenerateSizes2d(benchmark::internal::Benchmark* b) {
  b->ArgNames({"C", "N"});

  for (size_t chan = 4; chan < 512; chan *= 2) {
    for (size_t elems = 4; elems < 512; elems *= 2) {
      b->Args({chan, elems});
    }
  }
}

// Register each benchmark with its size sweep; BENCHMARK_MAIN() supplies main().
BENCHMARK(quantize_per_channel_2d)->Apply(GenerateSizes2d);
BENCHMARK(quantize_per_channel_4d_contiguous)->Apply(GenerateSizes4d);
BENCHMARK(quantize_per_channel_4d_channels_last)->Apply(GenerateSizes4d);
BENCHMARK_MAIN();

0 comments on commit d071ec1

Please sign in to comment.