-
Notifications
You must be signed in to change notification settings - Fork 21.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add benchmark for per channel tensor quantization
Summary: Currently on mobile only per-tensor quantization is optimized using ARM intrinsics. This benchmark is added to help gauge the performance improvement on mobile after performing the same optimizations for per-channel quantization.

Test Plan: Build and push to a mobile device:

```
BUILD_MOBILE_BENCHMARK=1 BUILD_MOBILE_TEST=1 ANDROID_DEBUG_SYMBOLS=1 BUILD_PYTORCH_MOBILE=1 ANDROID_ABI=arm64-v8a ./scripts/build_android.sh -DANDROID_CCACHE=$(which ccache) -DBUILD_BINARY=ON
adb push build_android/bin/quantize_per_channel /data/local/tmp
```

then run the benchmark binary over `adb shell`.

Reviewers: kimishpatel

Subscribers:

Tasks:

Tags:

ghstack-source-id: 76349f41ac7e6b2444d3883bf8f7c5ac3d8d43c9
Pull Request resolved: #46017
- Loading branch information
Showing
2 changed files
with
87 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
#include <ATen/ATen.h>

#include <benchmark/benchmark.h>

#include <cstdint>
#include <iostream>
|
||
// Benchmarks per-channel quantization of a contiguous (NCHW) 4-D float
// tensor to QUInt8, quantizing along the channel axis (dim 1).
// Sizes N/C/H/W come from the benchmark arguments (see GenerateSizes4d).
static void quantize_per_channel_4d_contiguous(benchmark::State& state) {
  // state.range() already returns int64_t; keeping that type avoids
  // narrowing a size_t back into the int64_t braced size lists below.
  const int64_t batches = state.range(0);
  const int64_t channels = state.range(1);
  const int64_t height = state.range(2);
  const int64_t width = state.range(3);

  at::Tensor a = at::rand({batches, channels, height, width});
  at::Tensor scales = at::rand({channels});
  // One integer zero point per channel, drawn from [0, 10).
  at::Tensor zero_points = at::randint(
      0, 10, {channels}, at::TensorOptions().dtype(at::ScalarType::Int));

  at::Tensor qa;
  for (auto _ : state) {
    qa = at::native::quantize_per_channel_cpu(
        a, scales, zero_points, 1 /*axis*/, at::ScalarType::QUInt8);
    // Keep the result observable so the timed work cannot be
    // dead-code-eliminated by the optimizer.
    benchmark::DoNotOptimize(qa);
  }
}
|
||
// Benchmarks per-channel quantization of a ChannelsLast (NHWC) 4-D float
// tensor to QUInt8, quantizing along the channel axis (dim 1).
// Sizes N/C/H/W come from the benchmark arguments (see GenerateSizes4d).
static void quantize_per_channel_4d_channels_last(benchmark::State& state) {
  // state.range() already returns int64_t; keeping that type avoids
  // narrowing a size_t back into the int64_t braced size lists below.
  const int64_t batches = state.range(0);
  const int64_t channels = state.range(1);
  const int64_t height = state.range(2);
  const int64_t width = state.range(3);

  at::Tensor a = at::rand(
      {batches, channels, height, width},
      at::TensorOptions().memory_format(at::MemoryFormat::ChannelsLast));
  at::Tensor scales = at::rand({channels});
  // One integer zero point per channel, drawn from [0, 10).
  at::Tensor zero_points = at::randint(
      0, 10, {channels}, at::TensorOptions().dtype(at::ScalarType::Int));

  at::Tensor qa;
  for (auto _ : state) {
    qa = at::native::quantize_per_channel_cpu(
        a, scales, zero_points, 1 /*axis*/, at::ScalarType::QUInt8);
    // Keep the result observable so the timed work cannot be
    // dead-code-eliminated by the optimizer.
    benchmark::DoNotOptimize(qa);
  }
}
|
||
// Benchmarks per-channel quantization of a 2-D float tensor to QUInt8,
// quantizing along dim 0 (each row shares one scale/zero-point pair).
// Sizes C/N come from the benchmark arguments (see GenerateSizes2d).
static void quantize_per_channel_2d(benchmark::State& state) {
  // state.range() already returns int64_t; keeping that type avoids
  // narrowing a size_t back into the int64_t braced size lists below.
  const int64_t channels = state.range(0);
  const int64_t nelem = state.range(1);

  at::Tensor a = at::rand({channels, nelem});
  at::Tensor scales = at::rand({channels});
  // One integer zero point per channel, drawn from [0, 10).
  at::Tensor zero_points = at::randint(
      0, 10, {channels}, at::TensorOptions().dtype(at::ScalarType::Int));

  at::Tensor qa;
  for (auto _ : state) {
    qa = at::native::quantize_per_channel_cpu(
        a, scales, zero_points, 0 /*axis*/, at::ScalarType::QUInt8);
    // Keep the result observable so the timed work cannot be
    // dead-code-eliminated by the optimizer.
    benchmark::DoNotOptimize(qa);
  }
}
|
||
// Registers 4-D problem sizes as {N, C, H, W}, sweeping powers of two
// (H and W are kept equal).
static void GenerateSizes4d(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "C", "H", "W"});

  // int64_t matches Benchmark::Args' element type; size_t loop variables
  // would be a signed/unsigned narrowing conversion inside the braced
  // initializer list passed to Args().
  for (int64_t n = 16; n < 256; n *= 2) {
    for (int64_t c = 4; c < 256; c *= 2) {
      for (int64_t hw = 4; hw < 256; hw *= 2) {
        b->Args({n, c, hw, hw});
      }
    }
  }
}
|
||
// Registers 2-D problem sizes as {C, N} (channels x elements-per-channel),
// sweeping powers of two.
static void GenerateSizes2d(benchmark::internal::Benchmark* b) {
  b->ArgNames({"C", "N"});

  // int64_t matches Benchmark::Args' element type; size_t loop variables
  // would be a signed/unsigned narrowing conversion inside the braced
  // initializer list passed to Args().
  for (int64_t c = 4; c < 512; c *= 2) {
    for (int64_t n = 4; n < 512; n *= 2) {
      b->Args({c, n});
    }
  }
}
|
||
// Register each benchmark with its size generator; google-benchmark
// supplies the program entry point via BENCHMARK_MAIN().
BENCHMARK(quantize_per_channel_2d)->Apply(GenerateSizes2d);
BENCHMARK(quantize_per_channel_4d_contiguous)->Apply(GenerateSizes4d);
BENCHMARK(quantize_per_channel_4d_channels_last)->Apply(GenerateSizes4d);
BENCHMARK_MAIN();