Skip to content

Commit 79cf6e7

Browse files
committed
Optimize lexing on ARM with NEON
Use ARM NEON SIMD intrinsics to speed up lexing. NEON doesn't have an equivalent to SSE's _mm_movemask_epi8, so we implemented a different algorithm for NEON's find_first_false based on ARM's reference memchr implementation. Lexing comments with SIMD is still not implemented for NEON. Before: Benchmark Time CPU Iterations UserCounters... benchmark_lex/long_identifier_1_mean 76.9 ns 76.9 ns 3 byte=1.47969ns bytes=675.818M/s After: Benchmark Time CPU Iterations UserCounters... benchmark_lex/long_identifier_1_mean 53.2 ns 53.2 ns 3 byte=1023ps bytes=977.682M/s
1 parent 5fa5003 commit 79cf6e7

6 files changed

Lines changed: 295 additions & 2 deletions

File tree

src/lex.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1359,7 +1359,9 @@ lexer::parsed_identifier lexer::parse_identifier(const char8* input) {
13591359
// TODO(strager): Is the check for '\\' correct?
13601360
QLJS_SLOW_ASSERT(this->is_identifier_byte(*input) || *input == u8'\\');
13611361

1362-
#if QLJS_HAVE_X86_SSE2
1362+
#if QLJS_HAVE_ARM_NEON
1363+
using char_vector = char_vector_16_neon;
1364+
#elif QLJS_HAVE_X86_SSE2
13631365
using char_vector = char_vector_16_sse2;
13641366
#else
13651367
using char_vector = char_vector_1;
@@ -1382,7 +1384,9 @@ lexer::parsed_identifier lexer::parse_identifier(const char8* input) {
13821384
_SIDD_CMP_RANGES | _SIDD_LEAST_SIGNIFICANT |
13831385
_SIDD_NEGATIVE_POLARITY | _SIDD_UBYTE_OPS);
13841386
#else
1385-
#if QLJS_HAVE_X86_SSE2
1387+
#if QLJS_HAVE_ARM_NEON
1388+
using bool_vector = bool_vector_16_neon;
1389+
#elif QLJS_HAVE_X86_SSE2
13861390
using bool_vector = bool_vector_16_sse2;
13871391
#else
13881392
using bool_vector = bool_vector_1;

src/quick-lint-js/have.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,14 @@
228228
#define QLJS_HAVE_CHARCONV_HEADER 0
229229
#endif
230230

231+
#if !defined(QLJS_HAVE_ARM_NEON)
232+
#if defined(__ARM_NEON)
233+
#define QLJS_HAVE_ARM_NEON 1
234+
#else
235+
#define QLJS_HAVE_ARM_NEON 0
236+
#endif
237+
#endif
238+
231239
#if !defined(QLJS_HAVE_X86_SSE2)
232240
#if defined(_M_AMD64) || defined(_M_X64) || \
233241
(defined(_M_IX86_FP) && _M_IX86_FP == 2) || defined(__SSE2__)

src/quick-lint-js/simd-neon-arm.h

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
// Copyright (C) 2020 Matthew "strager" Glazar
2+
// Copyright (c) 2014-2020, Arm Limited.
3+
// See end of file for extended copyright information.
4+
5+
#ifndef QUICK_LINT_JS_SIMD_NEON_ARM_H
6+
#define QUICK_LINT_JS_SIMD_NEON_ARM_H
7+
8+
#include <cstdint>
9+
#include <quick-lint-js/bit.h>
10+
#include <quick-lint-js/force-inline.h>
11+
#include <quick-lint-js/simd.h>
12+
13+
#if QLJS_HAVE_ARM_NEON
14+
#include <arm_neon.h>
15+
#endif
16+
17+
// Some routines have a different copyright than the rest of quick-lint-js, thus
18+
// are in this separate file.
19+
20+
namespace quick_lint_js {
21+
#if QLJS_HAVE_ARM_NEON
22+
QLJS_FORCE_INLINE inline int bool_vector_16_neon::find_first_false() const
23+
noexcept {
24+
// You might expect a magic pattern to look like the following:
25+
//
26+
// { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x08, [repeat] }
27+
//
28+
// However, the above magic pattern requires mixing cells 3 times
29+
// (16x8 -> 8x16 -> 4x32 -> 2x64). Our magic pattern requires mixing cells
30+
// only 2 times, but creates an unusual mask (see
31+
// NOTE[find_first_false NEON mask]).
32+
::uint8x16_t magic = {
33+
0x01, 0x04, 0x10, 0x40, 0x01, 0x04, 0x10, 0x40, //
34+
0x01, 0x04, 0x10, 0x40, 0x01, 0x04, 0x10, 0x40, //
35+
};
36+
37+
// It doesn't matter what 'garbage' is. Could be zeros or ones or anything. If
38+
// we ever extend this algorithm to uint8x32_t inputs, garbage would be the
39+
// upper 128 bits.
40+
::uint8x16_t garbage = this->data_;
41+
42+
// We invert the input so that we can use countr_zero instead of countr_one.
43+
// countr_one can't be used because of the zero bits in our mask (see
44+
// NOTE[find_first_false NEON mask]).
45+
::uint8x16_t mixed_0 = ::vbicq_u8(magic, this->data_);
46+
47+
// Mix bits to create a mask. Note that arithmetic ADD is effectively
48+
// bitwise OR.
49+
//
50+
// mixed_0: { a b c d e f g h i j k l m n o p }
51+
// mixed_1: { a+b c+d e+f g+h i+j k+l m+n o+p (64 bits unused...) }
52+
// mixed_2: { a+b+c+d e+f+g+h i+j+k+l m+n+o+p (96 bits unused...) }
53+
::uint8x16_t mixed_1 = ::vpaddq_u8(mixed_0, garbage);
54+
::uint8x16_t mixed_2 = ::vpaddq_u8(mixed_1, mixed_1);
55+
std::uint32_t mask = vgetq_lane_u32(::vreinterpretq_u32_u8(mixed_2), 0);
56+
57+
// NOTE[find_first_false NEON mask]: After mixing bits, an ideal mask looks
58+
// like this:
59+
//
60+
// 0b0000000000000000ABCDEFGHIJKLMNOP
61+
//
62+
// But our mask looks like this:
63+
//
64+
// 0b0A0B0C0D0E0F0G0H0I0J0K0L0M0N0O0P
65+
//
66+
// To deal with the extra zeros, we to divide our countr_zero result by 2.
67+
return countr_zero(mask) / 2;
68+
}
69+
#endif
70+
}
71+
72+
#endif
73+
74+
// quick-lint-js finds bugs in JavaScript programs.
75+
// Copyright (C) 2020 Matthew "strager" Glazar
76+
//
77+
// This file is part of quick-lint-js.
78+
//
79+
// quick-lint-js is free software: you can redistribute it and/or modify
80+
// it under the terms of the GNU General Public License as published by
81+
// the Free Software Foundation, either version 3 of the License, or
82+
// (at your option) any later version.
83+
//
84+
// quick-lint-js is distributed in the hope that it will be useful,
85+
// but WITHOUT ANY WARRANTY; without even the implied warranty of
86+
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
87+
// GNU General Public License for more details.
88+
//
89+
// You should have received a copy of the GNU General Public License
90+
// along with quick-lint-js. If not, see <https://www.gnu.org/licenses/>.
91+
//
92+
// ---
93+
//
94+
// Portions are this file are
95+
// Copyright (c) 2014-2020, Arm Limited.
96+
// Source:
97+
// https://github.com/ARM-software/optimized-routines/blob/7a9fd1603e1179b044406fb9b6cc5770d736cde7/string/aarch64/memchr.S
98+
//
99+
// Permission is hereby granted, free of charge, to any person obtaining a copy
100+
// of this software and associated documentation files (the "Software"), to deal
101+
// in the Software without restriction, including without limitation the rights
102+
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
103+
// copies of the Software, and to permit persons to whom the Software is
104+
// furnished to do so, subject to the following conditions:
105+
//
106+
// The above copyright notice and this permission notice shall be included in
107+
// all copies or substantial portions of the Software.
108+
//
109+
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
110+
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
111+
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
112+
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
113+
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
114+
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
115+
// SOFTWARE.

src/quick-lint-js/simd.h

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,18 @@
44
#ifndef QUICK_LINT_JS_SIMD_H
55
#define QUICK_LINT_JS_SIMD_H
66

7+
#include <cstdint>
78
#include <quick-lint-js/bit.h>
89
#include <quick-lint-js/char8.h>
910
#include <quick-lint-js/force-inline.h>
1011
#include <quick-lint-js/have.h>
1112
#include <quick-lint-js/narrow-cast.h>
1213
#include <quick-lint-js/unreachable.h>
1314

15+
#if QLJS_HAVE_ARM_NEON
16+
#include <arm_neon.h>
17+
#endif
18+
1419
#if QLJS_HAVE_X86_SSE2
1520
#include <emmintrin.h>
1621
#endif
@@ -96,6 +101,76 @@ class alignas(__m128i) char_vector_16_sse2 {
96101
};
97102
#endif
98103

104+
#if QLJS_HAVE_ARM_NEON
105+
class alignas(::uint8x16_t) bool_vector_16_neon {
106+
public:
107+
static constexpr int size = 16;
108+
109+
QLJS_FORCE_INLINE explicit bool_vector_16_neon(::uint8x16_t data) noexcept
110+
: data_(data) {}
111+
112+
QLJS_FORCE_INLINE friend bool_vector_16_neon operator|(
113+
bool_vector_16_neon x, bool_vector_16_neon y) noexcept {
114+
return bool_vector_16_neon(::vorrq_u8(x.data_, y.data_));
115+
}
116+
117+
QLJS_FORCE_INLINE friend bool_vector_16_neon operator&(
118+
bool_vector_16_neon x, bool_vector_16_neon y) noexcept {
119+
return bool_vector_16_neon(::vandq_u8(x.data_, y.data_));
120+
}
121+
122+
QLJS_FORCE_INLINE int find_first_false() const noexcept;
123+
124+
private:
125+
::uint8x16_t data_;
126+
};
127+
128+
class alignas(::uint8x16_t) char_vector_16_neon {
129+
public:
130+
static constexpr int size = 16;
131+
132+
QLJS_FORCE_INLINE explicit char_vector_16_neon(::uint8x16_t data) noexcept
133+
: data_(data) {}
134+
135+
QLJS_FORCE_INLINE static char_vector_16_neon load(const char8* data) {
136+
::uint8x16_t vector;
137+
std::memcpy(&vector, data, sizeof(vector));
138+
return char_vector_16_neon(vector);
139+
}
140+
141+
QLJS_FORCE_INLINE static char_vector_16_neon repeated(std::uint8_t c) {
142+
return char_vector_16_neon(::vdupq_n_u8(c));
143+
}
144+
145+
QLJS_FORCE_INLINE friend char_vector_16_neon operator|(
146+
char_vector_16_neon x, char_vector_16_neon y) noexcept {
147+
return char_vector_16_neon(::vorrq_u8(x.data_, y.data_));
148+
}
149+
150+
QLJS_FORCE_INLINE friend bool_vector_16_neon operator==(
151+
char_vector_16_neon x, char_vector_16_neon y) noexcept {
152+
return bool_vector_16_neon(::vceqq_u8(x.data_, y.data_));
153+
}
154+
155+
QLJS_FORCE_INLINE friend bool_vector_16_neon operator<(
156+
char_vector_16_neon x, char_vector_16_neon y) noexcept {
157+
return bool_vector_16_neon(::vcltq_u8(x.data_, y.data_));
158+
}
159+
160+
QLJS_FORCE_INLINE friend bool_vector_16_neon operator>(
161+
char_vector_16_neon x, char_vector_16_neon y) noexcept {
162+
return bool_vector_16_neon(::vcgtq_u8(x.data_, y.data_));
163+
}
164+
165+
QLJS_FORCE_INLINE ::uint8x16_t uint8x16() const noexcept {
166+
return this->data_;
167+
}
168+
169+
private:
170+
::uint8x16_t data_;
171+
};
172+
#endif
173+
99174
class bool_vector_1 {
100175
public:
101176
static constexpr int size = 1;
@@ -164,6 +239,9 @@ class char_vector_1 {
164239
};
165240
}
166241

242+
// Some routines have a different copyright, thus are in a separate file.
243+
#include <quick-lint-js/simd-neon-arm.h>
244+
167245
#endif
168246

169247
// quick-lint-js finds bugs in JavaScript programs.

test/test-monotonic-allocator.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@
1212

1313
QLJS_WARNING_IGNORE_GCC("-Wsuggest-override")
1414

15+
#if QLJS_HAVE_ARM_NEON
16+
#include <arm_neon.h>
17+
#endif
18+
1519
#if QLJS_HAVE_X86_SSE2
1620
#include <emmintrin.h>
1721
#endif
@@ -23,6 +27,10 @@ QLJS_WARNING_IGNORE_GCC("-Wsuggest-override")
2327
#define QLJS_HAVE_OVERALIGNED 0
2428
#endif
2529

30+
// TODO(strager): For some reason, allocating a ::uint8x16_t crashes with
31+
// std::bad_alloc.
32+
#define QLJS_CAN_ALIGN_NEON 0
33+
2634
#if defined(_MSC_VER)
2735
// TODO(strager): For some reason, allocating a __m128 crashes with
2836
// std::bad_alloc.
@@ -101,6 +109,13 @@ TEST(test_monotonic_allocator, filling_first_chunk_allocates_second_chunk) {
101109
struct alignas(std::max_align_t) overaligned_64 {};
102110
#endif
103111

112+
#if QLJS_CAN_ALIGN_NEON
113+
struct neon_aligned {
114+
::uint8x16_t uint8x16;
115+
::uint64x2_t uint64x2;
116+
};
117+
#endif
118+
104119
#if QLJS_CAN_ALIGN_SSE
105120
struct sse_aligned {
106121
__m128 m128;
@@ -112,6 +127,9 @@ using test_monotonic_allocator_typed_types = ::testing::Types<
112127
#if QLJS_HAVE_OVERALIGNED
113128
overaligned_64,
114129
#endif
130+
#if QLJS_CAN_ALIGN_NEON
131+
neon_aligned,
132+
#endif
115133
#if QLJS_CAN_ALIGN_SSE
116134
sse_aligned,
117135
#endif

test/test-simd.cpp

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@
99
#include <quick-lint-js/have.h>
1010
#include <quick-lint-js/simd.h>
1111

12+
#if QLJS_HAVE_ARM_NEON
13+
#include <arm_neon.h>
14+
#endif
15+
1216
#if QLJS_HAVE_X86_SSE2
1317
#include <emmintrin.h>
1418
#endif
@@ -43,6 +47,72 @@ TEST(test_char_vector_16_sse2, bitwise_or) {
4347
EXPECT_EQ(std::memcmp(&actual, expected, sizeof(actual)), 0);
4448
}
4549
#endif
50+
51+
#if QLJS_HAVE_ARM_NEON
52+
TEST(test_char_vector_16_neon, repeated) {
53+
char8 expected[16];
54+
std::fill(std::begin(expected), std::end(expected), u8'x');
55+
::uint8x16_t actual = char_vector_16_neon::repeated('x').uint8x16();
56+
EXPECT_EQ(std::memcmp(&actual, expected, sizeof(actual)), 0);
57+
}
58+
59+
TEST(test_char_vector_16_neon, bitwise_or) {
60+
constexpr std::uint8_t lhs[16] = {
61+
0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, //
62+
0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, //
63+
};
64+
constexpr std::uint8_t rhs[16] = {
65+
0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, //
66+
0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80, //
67+
};
68+
::uint8x16_t actual =
69+
(char_vector_16_neon::load(reinterpret_cast<const char8*>(lhs)) |
70+
char_vector_16_neon::load(reinterpret_cast<const char8*>(rhs)))
71+
.uint8x16();
72+
constexpr std::uint8_t expected[16] = {
73+
0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, //
74+
0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, //
75+
};
76+
EXPECT_EQ(std::memcmp(&actual, expected, sizeof(actual)), 0);
77+
}
78+
79+
TEST(test_bool_vector_16_neon, first_false_of_all_false) {
80+
::uint8x16_t bools_data = {
81+
0, 0, 0, 0, 0, 0, 0, 0, //
82+
0, 0, 0, 0, 0, 0, 0, 0, //
83+
};
84+
bool_vector_16_neon bools(bools_data);
85+
EXPECT_EQ(bools.find_first_false(), 0);
86+
}
87+
88+
TEST(test_bool_vector_16_neon, first_false_of_all_true) {
89+
constexpr std::uint8_t t = 0xff;
90+
::uint8x16_t bools_data = {
91+
t, t, t, t, t, t, t, t, //
92+
t, t, t, t, t, t, t, t, //
93+
};
94+
bool_vector_16_neon bools(bools_data);
95+
EXPECT_EQ(bools.find_first_false(), 16);
96+
}
97+
98+
TEST(test_bool_vector_16_neon, find_first_false_exhaustive) {
99+
for (std::uint32_t i = 0; i <= 0xffff; ++i) {
100+
SCOPED_TRACE(i);
101+
::uint8x16_t bools_data;
102+
int first_false = 16;
103+
for (int bit = 0; bit < 16; ++bit) {
104+
bool bit_on = (i >> bit) & 1;
105+
bools_data[bit] = bit_on ? 0xff : 0x00;
106+
if (!bit_on) {
107+
first_false = std::min(first_false, bit);
108+
}
109+
}
110+
111+
bool_vector_16_neon bools(bools_data);
112+
EXPECT_EQ(bools.find_first_false(), first_false);
113+
}
114+
}
115+
#endif
46116
}
47117
}
48118

0 commit comments

Comments
 (0)