|
| 1 | +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. |
| 2 | + |
| 3 | +#include <arm_neon.h> |
| 4 | +#include <benchmark/benchmark.h> |
| 5 | +#include <iostream> |
| 6 | + |
| 7 | +#include <torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h> |
| 8 | +#include <torchao/experimental/kernels/cpu/aarch64/bitpacking/uint3.h> |
| 9 | +#include <torchao/experimental/kernels/cpu/aarch64/bitpacking/uint4.h> |
| 10 | +#include <torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h> |
| 11 | +#include <cassert> |
| 12 | + |
| 13 | +namespace { |
| 14 | + |
| 15 | +// Benchmark utility to compare variants of uint3 packing |
| 16 | +void pack_uint3_values( |
| 17 | + uint8_t* packed, |
| 18 | + uint8_t* unpacked, |
| 19 | + int packed_size, |
| 20 | + int unpacked_size, |
| 21 | + int variant) { |
| 22 | + constexpr int nbit = 3; |
| 23 | + constexpr int bitsPerByte = 8; |
| 24 | + assert(unpacked_size * nbit / bitsPerByte == packed_size); |
| 25 | + assert(packed_size % variant == 0); |
| 26 | + |
| 27 | + uint8x16_t unpacked0; |
| 28 | + uint8x16_t unpacked1; |
| 29 | + uint8x16_t unpacked2; |
| 30 | + uint8x16_t unpacked3; |
| 31 | + uint8x16_t unpacked4; |
| 32 | + uint8x16_t unpacked5; |
| 33 | + uint8x16_t unpacked6; |
| 34 | + uint8x16_t unpacked7; |
| 35 | + |
| 36 | + switch (variant) { |
| 37 | + case 8: |
| 38 | + for (int i = 0; i < unpacked_size; i += 8) { |
| 39 | + torchao::bitpacking::internal::pack_8_uint3_values( |
| 40 | + packed + ((i * nbit) / bitsPerByte), unpacked + i); |
| 41 | + } |
| 42 | + break; |
| 43 | + case 64: |
| 44 | + for (int i = 0; i < unpacked_size; i += 64) { |
| 45 | + torchao::bitpacking::internal::vec_load_64_uint8_values( |
| 46 | + unpacked0, unpacked1, unpacked2, unpacked3, unpacked + i); |
| 47 | + torchao::bitpacking::internal::vec_pack_64_uint3_values( |
| 48 | + packed + ((i * nbit) / bitsPerByte), |
| 49 | + unpacked0, |
| 50 | + unpacked1, |
| 51 | + unpacked2, |
| 52 | + unpacked3); |
| 53 | + } |
| 54 | + break; |
| 55 | + case 128: |
| 56 | + for (int i = 0; i < unpacked_size; i += 128) { |
| 57 | + torchao::bitpacking::internal::vec_load_64_uint8_values( |
| 58 | + unpacked0, unpacked1, unpacked2, unpacked3, unpacked + i); |
| 59 | + torchao::bitpacking::internal::vec_load_64_uint8_values( |
| 60 | + unpacked4, unpacked5, unpacked6, unpacked7, unpacked + i + 64); |
| 61 | + torchao::bitpacking::internal::vec_pack_128_uint3_values( |
| 62 | + packed + ((i * nbit) / bitsPerByte), |
| 63 | + unpacked0, |
| 64 | + unpacked1, |
| 65 | + unpacked2, |
| 66 | + unpacked3, |
| 67 | + unpacked4, |
| 68 | + unpacked5, |
| 69 | + unpacked6, |
| 70 | + unpacked7); |
| 71 | + } |
| 72 | + break; |
| 73 | + } |
| 74 | +} |
| 75 | + |
| 76 | +// Benchmark utility to compare variants of uint3 unpacking |
| 77 | +void unpack_uint3_values( |
| 78 | + uint8_t* unpacked, |
| 79 | + uint8_t* packed, |
| 80 | + int unpacked_size, |
| 81 | + int packed_size, |
| 82 | + int variant) { |
| 83 | + constexpr int nbit = 3; |
| 84 | + constexpr int bitsPerByte = 8; |
| 85 | + assert(unpacked_size * nbit / bitsPerByte == packed_size); |
| 86 | + assert(packed_size % variant == 0); |
| 87 | + |
| 88 | + uint8x16_t unpacked0; |
| 89 | + uint8x16_t unpacked1; |
| 90 | + uint8x16_t unpacked2; |
| 91 | + uint8x16_t unpacked3; |
| 92 | + uint8x16_t unpacked4; |
| 93 | + uint8x16_t unpacked5; |
| 94 | + uint8x16_t unpacked6; |
| 95 | + uint8x16_t unpacked7; |
| 96 | + |
| 97 | + switch (variant) { |
| 98 | + case 8: |
| 99 | + for (int i = 0; i < unpacked_size; i += 8) { |
| 100 | + torchao::bitpacking::internal::unpack_8_uint3_values( |
| 101 | + unpacked + i, packed + ((i * nbit) / bitsPerByte)); |
| 102 | + } |
| 103 | + break; |
| 104 | + case 64: |
| 105 | + for (int i = 0; i < unpacked_size; i += 64) { |
| 106 | + torchao::bitpacking::internal::vec_unpack_64_uint3_values( |
| 107 | + unpacked0, |
| 108 | + unpacked1, |
| 109 | + unpacked2, |
| 110 | + unpacked3, |
| 111 | + packed + ((i * nbit) / bitsPerByte)); |
| 112 | + torchao::bitpacking::internal::vec_store_64_uint8_values( |
| 113 | + unpacked + i, unpacked0, unpacked1, unpacked2, unpacked3); |
| 114 | + } |
| 115 | + break; |
| 116 | + case 128: |
| 117 | + for (int i = 0; i < unpacked_size; i += 128) { |
| 118 | + torchao::bitpacking::internal::vec_unpack_128_uint3_values( |
| 119 | + unpacked0, |
| 120 | + unpacked1, |
| 121 | + unpacked2, |
| 122 | + unpacked3, |
| 123 | + unpacked4, |
| 124 | + unpacked5, |
| 125 | + unpacked6, |
| 126 | + unpacked7, |
| 127 | + packed + ((i * nbit) / bitsPerByte)); |
| 128 | + torchao::bitpacking::internal::vec_store_64_uint8_values( |
| 129 | + unpacked + i, unpacked0, unpacked1, unpacked2, unpacked3); |
| 130 | + torchao::bitpacking::internal::vec_store_64_uint8_values( |
| 131 | + unpacked + i + 64, unpacked4, unpacked5, unpacked6, unpacked7); |
| 132 | + } |
| 133 | + break; |
| 134 | + } |
| 135 | +} |
| 136 | + |
| 137 | +// Benchmark utility to compare variants of uint4 packing |
| 138 | +void pack_uint4_values( |
| 139 | + uint8_t* packed, |
| 140 | + uint8_t* unpacked, |
| 141 | + int packed_size, |
| 142 | + int unpacked_size, |
| 143 | + int variant) { |
| 144 | + constexpr int nbit = 4; |
| 145 | + constexpr int bitsPerByte = 8; |
| 146 | + assert(unpacked_size * nbit / bitsPerByte == packed_size); |
| 147 | + assert(packed_size % variant == 0); |
| 148 | + |
| 149 | + uint8x16_t unpacked0; |
| 150 | + uint8x16_t unpacked1; |
| 151 | + |
| 152 | + switch (variant) { |
| 153 | + case 2: |
| 154 | + for (int i = 0; i < unpacked_size; i += 2) { |
| 155 | + torchao::bitpacking::internal::pack_2_uint4_values( |
| 156 | + packed + ((i * nbit) / bitsPerByte), unpacked + i); |
| 157 | + } |
| 158 | + break; |
| 159 | + case 16: |
| 160 | + for (int i = 0; i < unpacked_size; i += 16) { |
| 161 | + unpacked0 = vld1q_u8(unpacked + i); |
| 162 | + torchao::bitpacking::internal::vec_pack_16_uint4_values( |
| 163 | + packed + ((i * nbit) / bitsPerByte), unpacked0); |
| 164 | + } |
| 165 | + break; |
| 166 | + case 32: |
| 167 | + for (int i = 0; i < unpacked_size; i += 32) { |
| 168 | + unpacked0 = vld1q_u8(unpacked + i); |
| 169 | + unpacked1 = vld1q_u8(unpacked + 16 + i); |
| 170 | + torchao::bitpacking::internal::vec_pack_32_uint4_values( |
| 171 | + packed + ((i * nbit) / bitsPerByte), unpacked0, unpacked1); |
| 172 | + } |
| 173 | + break; |
| 174 | + } |
| 175 | +} |
| 176 | + |
| 177 | +// Benchmark utility to compare variants of uint4 unpacking |
| 178 | +void unpack_uint4_values( |
| 179 | + uint8_t* unpacked, |
| 180 | + uint8_t* packed, |
| 181 | + int unpacked_size, |
| 182 | + int packed_size, |
| 183 | + int variant) { |
| 184 | + constexpr int nbit = 4; |
| 185 | + constexpr int bitsPerByte = 8; |
| 186 | + assert(unpacked_size * nbit / bitsPerByte == packed_size); |
| 187 | + assert(packed_size % variant == 0); |
| 188 | + |
| 189 | + uint8x16_t unpacked0; |
| 190 | + uint8x16_t unpacked1; |
| 191 | + |
| 192 | + switch (variant) { |
| 193 | + case 2: |
| 194 | + for (int i = 0; i < unpacked_size; i += 2) { |
| 195 | + torchao::bitpacking::internal::unpack_2_uint4_values( |
| 196 | + unpacked + i, packed + ((i * nbit) / bitsPerByte)); |
| 197 | + } |
| 198 | + break; |
| 199 | + case 16: |
| 200 | + for (int i = 0; i < unpacked_size; i += 16) { |
| 201 | + torchao::bitpacking::internal::vec_unpack_16_uint4_values( |
| 202 | + unpacked0, packed + ((i * nbit) / bitsPerByte)); |
| 203 | + vst1q_u8(unpacked + i, unpacked0); |
| 204 | + } |
| 205 | + break; |
| 206 | + case 32: |
| 207 | + for (int i = 0; i < unpacked_size; i += 32) { |
| 208 | + torchao::bitpacking::internal::vec_unpack_32_uint4_values( |
| 209 | + unpacked0, unpacked1, packed + ((i * nbit) / bitsPerByte)); |
| 210 | + vst1q_u8(unpacked + i, unpacked0); |
| 211 | + vst1q_u8(unpacked + 16 + i, unpacked1); |
| 212 | + } |
| 213 | + break; |
| 214 | + } |
| 215 | +} |
| 216 | + |
| 217 | +} // namespace |
| 218 | + |
| 219 | +static void benchmark_pack_uint3_values(benchmark::State& state) { |
| 220 | + int unpacked_size = state.range(0); |
| 221 | + int variant = state.range(1); |
| 222 | + int nbit = 3; |
| 223 | + |
| 224 | + assert(unpacked_size % 8 == 0); |
| 225 | + int packed_size = (unpacked_size / 8) * nbit; |
| 226 | + |
| 227 | + auto packed = std::vector<uint8_t>(unpacked_size, 0); |
| 228 | + auto unpacked = torchao::get_random_lowbit_vector(packed_size, 8); |
| 229 | + |
| 230 | + for (auto _ : state) { |
| 231 | + pack_uint3_values( |
| 232 | + packed.data(), unpacked.data(), packed_size, unpacked_size, variant); |
| 233 | + } |
| 234 | +} |
| 235 | + |
| 236 | +static void benchmark_unpack_uint3_values(benchmark::State& state) { |
| 237 | + int unpacked_size = state.range(0); |
| 238 | + int variant = state.range(1); |
| 239 | + int nbit = 3; |
| 240 | + |
| 241 | + assert(unpacked_size % 8 == 0); |
| 242 | + int packed_size = (unpacked_size / 8) * nbit; |
| 243 | + |
| 244 | + auto packed = torchao::get_random_lowbit_vector(packed_size, 8); |
| 245 | + auto unpacked = std::vector<uint8_t>(unpacked_size, 0); |
| 246 | + |
| 247 | + for (auto _ : state) { |
| 248 | + unpack_uint3_values( |
| 249 | + unpacked.data(), |
| 250 | + packed.data(), |
| 251 | + unpacked.size(), |
| 252 | + packed.size(), |
| 253 | + variant); |
| 254 | + } |
| 255 | +} |
| 256 | + |
| 257 | +static void benchmark_pack_uint4_values(benchmark::State& state) { |
| 258 | + int unpacked_size = state.range(0); |
| 259 | + int variant = state.range(1); |
| 260 | + int nbit = 4; |
| 261 | + |
| 262 | + assert(unpacked_size % 8 == 0); |
| 263 | + int packed_size = (unpacked_size / 8) * nbit; |
| 264 | + |
| 265 | + auto packed = std::vector<uint8_t>(unpacked_size, 0); |
| 266 | + auto unpacked = torchao::get_random_lowbit_vector(packed_size, 8); |
| 267 | + |
| 268 | + for (auto _ : state) { |
| 269 | + pack_uint4_values( |
| 270 | + packed.data(), unpacked.data(), packed_size, unpacked_size, variant); |
| 271 | + } |
| 272 | +} |
| 273 | + |
| 274 | +static void benchmark_unpack_uint4_values(benchmark::State& state) { |
| 275 | + int unpacked_size = state.range(0); |
| 276 | + int variant = state.range(1); |
| 277 | + int nbit = 4; |
| 278 | + |
| 279 | + assert(unpacked_size % 8 == 0); |
| 280 | + int packed_size = (unpacked_size / 8) * nbit; |
| 281 | + |
| 282 | + auto packed = torchao::get_random_lowbit_vector(packed_size, 8); |
| 283 | + auto unpacked = std::vector<uint8_t>(unpacked_size, 0); |
| 284 | + |
| 285 | + for (auto _ : state) { |
| 286 | + unpack_uint4_values( |
| 287 | + unpacked.data(), |
| 288 | + packed.data(), |
| 289 | + unpacked.size(), |
| 290 | + packed.size(), |
| 291 | + variant); |
| 292 | + } |
| 293 | +} |
| 294 | + |
| 295 | +BENCHMARK(benchmark_pack_uint3_values)->ArgsProduct({{128}, {8, 64, 128}}); |
| 296 | +BENCHMARK(benchmark_unpack_uint3_values)->ArgsProduct({{128}, {8, 64, 128}}); |
| 297 | +BENCHMARK(benchmark_pack_uint4_values)->ArgsProduct({{128}, {2, 16, 32}}); |
| 298 | +BENCHMARK(benchmark_unpack_uint4_values)->ArgsProduct({{128}, {2, 16, 32}}); |
| 299 | + |
| 300 | +// Run the benchmark |
| 301 | +BENCHMARK_MAIN(); |
0 commit comments