-
Notifications
You must be signed in to change notification settings - Fork 985
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Don't send commas to stage 2, avoid clmul in most cases #2049
base: master
Are you sure you want to change the base?
Conversation
include/simdjson/ppc64/bitmask.h
Outdated
borrow_out = result >= value1; | ||
return result; | ||
#else | ||
return __builtin_subcll(value1, value2, borrow, &borrow); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
At a glance, it looks like __builtin_subcll is LLVM specific?
It might be worth guarding its usage:
https://gcc.gnu.org/onlinedocs/cpp/_005f_005fhas_005fbuiltin.html
It might worth examining alternatives:
https://godbolt.org/z/1WT9nPv6M
#include <cstdint>
using borrow_t = unsigned long long;
uint64_t subtract_borrow(const uint64_t value1, const uint64_t value2, borrow_t& borrow) noexcept {
return __builtin_subcll(value1, value2, borrow, &borrow);
}
uint64_t subtract_borrow_manual(const uint64_t value1, const uint64_t value2, borrow_t& borrow) noexcept {
uint64_t result = value1 - value2 - borrow;
borrow = result >= value1;
return result;
}
#if defined(_M_X64) || defined(__amd64__)
#include <x86intrin.h>
// visual studio has _subborrow_u64 in <intrin.h>
// https://learn.microsoft.com/en-us/cpp/intrinsics/x64-amd64-intrinsics-list?view=msvc-170
//
uint64_t subtract_borrow_intel(const uint64_t value1, const uint64_t value2, uint8_t& borrow) {
uint64_t result;
borrow = _subborrow_u64(borrow, value2, value1, (unsigned long long *)&result);
return result;
}
#endif
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's entirely possible; I did construct these to be analogues of the add_overflow()
implementation for the given architecture (i.e. pulling from the same libraries and using the same #ifdefs)
@lemire I made some more variants in this Godbolt. Just based on manual inspection of the assembly, if I had to choose a
uint64_t subtract_borrow_using_overflow_bool(const uint64_t value1, const uint64_t value2, bool& borrow) {
unsigned long long result;
borrow = __builtin_usubll_overflow(value1, value2 + borrow, &result);
return result;
}
uint64_t subtract_borrow_intel_bool(const uint64_t value1, const uint64_t value2, bool& borrow) {
unsigned long long result;
borrow = _subborrow_u64(borrow, value1, value2, (unsigned long long *)&result);
return result;
}
uint64_t subtract_borrow_manual_bool(const uint64_t value1, const uint64_t value2, bool& borrow) noexcept {
uint64_t result = value1 - value2 - borrow;
borrow = result >= value1;
return result;
} subtract_borrow_using_overflow_bool(unsigned long, unsigned long, bool&):
# result = value1 - value2 - overflow
mov rax, rdi # value1
movzx ecx, byte ptr [rdx] # overflow
add rcx, rsi # overflow + value2
sub rax, rcx # value1 - (overflow + value2)
# overflow = result >= value1
setb byte ptr [rdx]
subtract_borrow_intel_bool(unsigned long, unsigned long, bool&):
# result = value1 - value2 - overflow
mov rax, rdi # value1
movzx ecx, byte ptr [rdx] # overflow
add cl, -1 # cl = overflow - 1 ???
sbb rax, rsi # value1 - value2 - overflow
# overflow = result >= value1
setb byte ptr [rdx]
subtract_borrow_manual_bool(unsigned long, unsigned long, bool&):
# result = value1 - (value2 + overflow)
movzx ecx, byte ptr [rdx] # overflow
add rcx, rsi # overflow + value2
sub rax, rcx # value1 - (value2 + overflow)
# overflow = (result >= value1)
cmp rax, rdi
setae byte ptr [rdx] |
Added some comments in the assembly for easier following. Bottom line on Ice Lake, once you use
Of course, running these in a performance test is the only way to know for sure, since the processor does some minor JIT-ish activities :) |
On ARM, it looks like |
And on GCC 13.2 Intel, everything pretty much looks the same as each other.
|
Changing |
The algorithm detects all missing/extra separator errors in stage 1, and then doesn't send commas.