Skip to content

Commit 360a003

Browse files
authored
Move lowbit universal kernels from torchaccel to torchao
Differential Revision: D60292095 Pull Request resolved: #582
1 parent c2f5399 commit 360a003

28 files changed

+4596
-0
lines changed
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
2+
3+
cmake_minimum_required(VERSION 3.19)
4+
project(benchmarks)
5+
set(CMAKE_CXX_STANDARD 17)
6+
set(CMAKE_BUILD_TYPE Release)
7+
8+
include(FetchContent)
9+
FetchContent_Declare(googlebenchmark
10+
GIT_REPOSITORY https://github.com/google/benchmark.git
11+
GIT_TAG main) # need main for benchmark::benchmark
12+
13+
set(BENCHMARK_ENABLE_TESTING OFF)
14+
FetchContent_MakeAvailable(
15+
googlebenchmark)
16+
17+
add_compile_options("-Wall" "-Werror")
18+
19+
include(CMakePrintHelpers)
20+
message("TORCHAO_LIBRARIES: ${TORCHAO_LIBRARIES}")
21+
include_directories(${TORCHAO_LIBRARIES})
22+
23+
add_library(
24+
dep
25+
${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp
26+
${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp
27+
${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.cpp
28+
${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/aarch64/valpacking/interleave.cpp
29+
)
30+
31+
add_executable(benchmark_quantization benchmark_quantization.cpp)
32+
target_link_libraries(
33+
benchmark_quantization
34+
PRIVATE
35+
benchmark::benchmark
36+
dep
37+
)
38+
39+
add_executable(benchmark_bitpacking benchmark_bitpacking.cpp)
40+
target_link_libraries(
41+
benchmark_bitpacking
42+
PRIVATE
43+
benchmark::benchmark
44+
dep
45+
)
46+
47+
add_executable(benchmark_linear benchmark_linear.cpp)
48+
target_link_libraries(
49+
benchmark_linear
50+
PRIVATE
51+
benchmark::benchmark
52+
dep
53+
)
Lines changed: 301 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,301 @@
1+
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
2+
3+
#include <arm_neon.h>
4+
#include <benchmark/benchmark.h>
5+
#include <iostream>
6+
7+
#include <torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h>
8+
#include <torchao/experimental/kernels/cpu/aarch64/bitpacking/uint3.h>
9+
#include <torchao/experimental/kernels/cpu/aarch64/bitpacking/uint4.h>
10+
#include <torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h>
11+
#include <cassert>
12+
13+
namespace {
14+
15+
// Benchmark utility to compare variants of uint3 packing
16+
void pack_uint3_values(
17+
uint8_t* packed,
18+
uint8_t* unpacked,
19+
int packed_size,
20+
int unpacked_size,
21+
int variant) {
22+
constexpr int nbit = 3;
23+
constexpr int bitsPerByte = 8;
24+
assert(unpacked_size * nbit / bitsPerByte == packed_size);
25+
assert(packed_size % variant == 0);
26+
27+
uint8x16_t unpacked0;
28+
uint8x16_t unpacked1;
29+
uint8x16_t unpacked2;
30+
uint8x16_t unpacked3;
31+
uint8x16_t unpacked4;
32+
uint8x16_t unpacked5;
33+
uint8x16_t unpacked6;
34+
uint8x16_t unpacked7;
35+
36+
switch (variant) {
37+
case 8:
38+
for (int i = 0; i < unpacked_size; i += 8) {
39+
torchao::bitpacking::internal::pack_8_uint3_values(
40+
packed + ((i * nbit) / bitsPerByte), unpacked + i);
41+
}
42+
break;
43+
case 64:
44+
for (int i = 0; i < unpacked_size; i += 64) {
45+
torchao::bitpacking::internal::vec_load_64_uint8_values(
46+
unpacked0, unpacked1, unpacked2, unpacked3, unpacked + i);
47+
torchao::bitpacking::internal::vec_pack_64_uint3_values(
48+
packed + ((i * nbit) / bitsPerByte),
49+
unpacked0,
50+
unpacked1,
51+
unpacked2,
52+
unpacked3);
53+
}
54+
break;
55+
case 128:
56+
for (int i = 0; i < unpacked_size; i += 128) {
57+
torchao::bitpacking::internal::vec_load_64_uint8_values(
58+
unpacked0, unpacked1, unpacked2, unpacked3, unpacked + i);
59+
torchao::bitpacking::internal::vec_load_64_uint8_values(
60+
unpacked4, unpacked5, unpacked6, unpacked7, unpacked + i + 64);
61+
torchao::bitpacking::internal::vec_pack_128_uint3_values(
62+
packed + ((i * nbit) / bitsPerByte),
63+
unpacked0,
64+
unpacked1,
65+
unpacked2,
66+
unpacked3,
67+
unpacked4,
68+
unpacked5,
69+
unpacked6,
70+
unpacked7);
71+
}
72+
break;
73+
}
74+
}
75+
76+
// Benchmark utility to compare variants of uint3 unpacking
77+
void unpack_uint3_values(
78+
uint8_t* unpacked,
79+
uint8_t* packed,
80+
int unpacked_size,
81+
int packed_size,
82+
int variant) {
83+
constexpr int nbit = 3;
84+
constexpr int bitsPerByte = 8;
85+
assert(unpacked_size * nbit / bitsPerByte == packed_size);
86+
assert(packed_size % variant == 0);
87+
88+
uint8x16_t unpacked0;
89+
uint8x16_t unpacked1;
90+
uint8x16_t unpacked2;
91+
uint8x16_t unpacked3;
92+
uint8x16_t unpacked4;
93+
uint8x16_t unpacked5;
94+
uint8x16_t unpacked6;
95+
uint8x16_t unpacked7;
96+
97+
switch (variant) {
98+
case 8:
99+
for (int i = 0; i < unpacked_size; i += 8) {
100+
torchao::bitpacking::internal::unpack_8_uint3_values(
101+
unpacked + i, packed + ((i * nbit) / bitsPerByte));
102+
}
103+
break;
104+
case 64:
105+
for (int i = 0; i < unpacked_size; i += 64) {
106+
torchao::bitpacking::internal::vec_unpack_64_uint3_values(
107+
unpacked0,
108+
unpacked1,
109+
unpacked2,
110+
unpacked3,
111+
packed + ((i * nbit) / bitsPerByte));
112+
torchao::bitpacking::internal::vec_store_64_uint8_values(
113+
unpacked + i, unpacked0, unpacked1, unpacked2, unpacked3);
114+
}
115+
break;
116+
case 128:
117+
for (int i = 0; i < unpacked_size; i += 128) {
118+
torchao::bitpacking::internal::vec_unpack_128_uint3_values(
119+
unpacked0,
120+
unpacked1,
121+
unpacked2,
122+
unpacked3,
123+
unpacked4,
124+
unpacked5,
125+
unpacked6,
126+
unpacked7,
127+
packed + ((i * nbit) / bitsPerByte));
128+
torchao::bitpacking::internal::vec_store_64_uint8_values(
129+
unpacked + i, unpacked0, unpacked1, unpacked2, unpacked3);
130+
torchao::bitpacking::internal::vec_store_64_uint8_values(
131+
unpacked + i + 64, unpacked4, unpacked5, unpacked6, unpacked7);
132+
}
133+
break;
134+
}
135+
}
136+
137+
// Benchmark utility to compare variants of uint4 packing
138+
void pack_uint4_values(
139+
uint8_t* packed,
140+
uint8_t* unpacked,
141+
int packed_size,
142+
int unpacked_size,
143+
int variant) {
144+
constexpr int nbit = 4;
145+
constexpr int bitsPerByte = 8;
146+
assert(unpacked_size * nbit / bitsPerByte == packed_size);
147+
assert(packed_size % variant == 0);
148+
149+
uint8x16_t unpacked0;
150+
uint8x16_t unpacked1;
151+
152+
switch (variant) {
153+
case 2:
154+
for (int i = 0; i < unpacked_size; i += 2) {
155+
torchao::bitpacking::internal::pack_2_uint4_values(
156+
packed + ((i * nbit) / bitsPerByte), unpacked + i);
157+
}
158+
break;
159+
case 16:
160+
for (int i = 0; i < unpacked_size; i += 16) {
161+
unpacked0 = vld1q_u8(unpacked + i);
162+
torchao::bitpacking::internal::vec_pack_16_uint4_values(
163+
packed + ((i * nbit) / bitsPerByte), unpacked0);
164+
}
165+
break;
166+
case 32:
167+
for (int i = 0; i < unpacked_size; i += 32) {
168+
unpacked0 = vld1q_u8(unpacked + i);
169+
unpacked1 = vld1q_u8(unpacked + 16 + i);
170+
torchao::bitpacking::internal::vec_pack_32_uint4_values(
171+
packed + ((i * nbit) / bitsPerByte), unpacked0, unpacked1);
172+
}
173+
break;
174+
}
175+
}
176+
177+
// Benchmark utility to compare variants of uint4 unpacking
178+
void unpack_uint4_values(
179+
uint8_t* unpacked,
180+
uint8_t* packed,
181+
int unpacked_size,
182+
int packed_size,
183+
int variant) {
184+
constexpr int nbit = 4;
185+
constexpr int bitsPerByte = 8;
186+
assert(unpacked_size * nbit / bitsPerByte == packed_size);
187+
assert(packed_size % variant == 0);
188+
189+
uint8x16_t unpacked0;
190+
uint8x16_t unpacked1;
191+
192+
switch (variant) {
193+
case 2:
194+
for (int i = 0; i < unpacked_size; i += 2) {
195+
torchao::bitpacking::internal::unpack_2_uint4_values(
196+
unpacked + i, packed + ((i * nbit) / bitsPerByte));
197+
}
198+
break;
199+
case 16:
200+
for (int i = 0; i < unpacked_size; i += 16) {
201+
torchao::bitpacking::internal::vec_unpack_16_uint4_values(
202+
unpacked0, packed + ((i * nbit) / bitsPerByte));
203+
vst1q_u8(unpacked + i, unpacked0);
204+
}
205+
break;
206+
case 32:
207+
for (int i = 0; i < unpacked_size; i += 32) {
208+
torchao::bitpacking::internal::vec_unpack_32_uint4_values(
209+
unpacked0, unpacked1, packed + ((i * nbit) / bitsPerByte));
210+
vst1q_u8(unpacked + i, unpacked0);
211+
vst1q_u8(unpacked + 16 + i, unpacked1);
212+
}
213+
break;
214+
}
215+
}
216+
217+
} // namespace
218+
219+
static void benchmark_pack_uint3_values(benchmark::State& state) {
220+
int unpacked_size = state.range(0);
221+
int variant = state.range(1);
222+
int nbit = 3;
223+
224+
assert(unpacked_size % 8 == 0);
225+
int packed_size = (unpacked_size / 8) * nbit;
226+
227+
auto packed = std::vector<uint8_t>(unpacked_size, 0);
228+
auto unpacked = torchao::get_random_lowbit_vector(packed_size, 8);
229+
230+
for (auto _ : state) {
231+
pack_uint3_values(
232+
packed.data(), unpacked.data(), packed_size, unpacked_size, variant);
233+
}
234+
}
235+
236+
static void benchmark_unpack_uint3_values(benchmark::State& state) {
237+
int unpacked_size = state.range(0);
238+
int variant = state.range(1);
239+
int nbit = 3;
240+
241+
assert(unpacked_size % 8 == 0);
242+
int packed_size = (unpacked_size / 8) * nbit;
243+
244+
auto packed = torchao::get_random_lowbit_vector(packed_size, 8);
245+
auto unpacked = std::vector<uint8_t>(unpacked_size, 0);
246+
247+
for (auto _ : state) {
248+
unpack_uint3_values(
249+
unpacked.data(),
250+
packed.data(),
251+
unpacked.size(),
252+
packed.size(),
253+
variant);
254+
}
255+
}
256+
257+
static void benchmark_pack_uint4_values(benchmark::State& state) {
258+
int unpacked_size = state.range(0);
259+
int variant = state.range(1);
260+
int nbit = 4;
261+
262+
assert(unpacked_size % 8 == 0);
263+
int packed_size = (unpacked_size / 8) * nbit;
264+
265+
auto packed = std::vector<uint8_t>(unpacked_size, 0);
266+
auto unpacked = torchao::get_random_lowbit_vector(packed_size, 8);
267+
268+
for (auto _ : state) {
269+
pack_uint4_values(
270+
packed.data(), unpacked.data(), packed_size, unpacked_size, variant);
271+
}
272+
}
273+
274+
static void benchmark_unpack_uint4_values(benchmark::State& state) {
275+
int unpacked_size = state.range(0);
276+
int variant = state.range(1);
277+
int nbit = 4;
278+
279+
assert(unpacked_size % 8 == 0);
280+
int packed_size = (unpacked_size / 8) * nbit;
281+
282+
auto packed = torchao::get_random_lowbit_vector(packed_size, 8);
283+
auto unpacked = std::vector<uint8_t>(unpacked_size, 0);
284+
285+
for (auto _ : state) {
286+
unpack_uint4_values(
287+
unpacked.data(),
288+
packed.data(),
289+
unpacked.size(),
290+
packed.size(),
291+
variant);
292+
}
293+
}
294+
295+
BENCHMARK(benchmark_pack_uint3_values)->ArgsProduct({{128}, {8, 64, 128}});
296+
BENCHMARK(benchmark_unpack_uint3_values)->ArgsProduct({{128}, {8, 64, 128}});
297+
BENCHMARK(benchmark_pack_uint4_values)->ArgsProduct({{128}, {2, 16, 32}});
298+
BENCHMARK(benchmark_unpack_uint4_values)->ArgsProduct({{128}, {2, 16, 32}});
299+
300+
// Run the benchmark
301+
BENCHMARK_MAIN();

0 commit comments

Comments
 (0)