Skip to content

Commit

Permalink
Revert string parsing to old algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
jkeiser committed Aug 29, 2023
1 parent eb32a76 commit 3f1b0c3
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 44 deletions.
21 changes: 11 additions & 10 deletions src/generic/stage1/json_scanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ class json_scanner {
simdjson_inline uint64_t json_scanner::next(
const simd::simd8x64<uint8_t>& in
) noexcept {
uint64_t backslash = in.eq('\\'); // 3+LN (LN+simd:N)
uint64_t raw_quote = in.eq('"'); // 3+LN (LN+simd:N)

//printf("\n");
//printf("%30.30s: %s\n", "next", format_input_text(in));
simd8x64<uint8_t> curlified = in | ('{' - '['); // 3 (+simd:N)
Expand All @@ -98,17 +101,15 @@ simdjson_inline uint64_t json_scanner::next(
uint64_t scalar_close = ~sep & ~open & ~ws_ctrl; // 7+LN (+1) (ternary)
// total 7+LN (+3+5LN+simd:6N)

uint64_t in_string = string_scanner.next(backslash, raw_quote); // 10+LN (+9) ... 18+LN (+11+simd:3)
//printf("%30.30s: %s\n", "in_string", format_input_text(in, in_string));
// total: 11+LN (+10+2LN+simd:2N) ... 19+LN (+18+2LN+simd:2N+3)

//printf("%30.30s: %s\n", "sep_open", format_input_text(in, sep_open));
//printf("%30.30s: %s\n", "scalar_close", format_input_text(in, scalar_close));
uint64_t separated_values = next_separated_values(sep_open, scalar_close); // 8+LN (+2)
//printf("%30.30s: %s\n", "separated_values", format_input_text(in, separated_values));

uint64_t backslash = in.eq('\\'); // 3+LN (LN+simd:N)
uint64_t raw_quote = in.eq('"'); // 3+LN (LN+simd:N)
uint64_t in_string = string_scanner.next(backslash, raw_quote, separated_values); // 10+LN (+9) ... 18+LN (+11+simd:3)
//printf("%30.30s: %s\n", "in_string", format_input_text(in, in_string));
// total: 11+LN (+10+2LN+simd:2N) ... 19+LN (+18+2LN+simd:2N+3)

uint64_t lead_value = scalar_close & separated_values; // 8+LN (+1)
uint64_t op_without_comma = colon | open | close; // 8+LN (+1) (ternary)
uint64_t all_structurals = op_without_comma | lead_value; // 20+LN ... 20+LN (+1)
Expand Down Expand Up @@ -215,11 +216,11 @@ simdjson_inline uint64_t json_scanner::next_whitespace(

uint64_t backslash = in.eq('\\'); // 3+LN (LN+simd:N)
uint64_t raw_quote = in.eq('"'); // 3+LN (LN+simd:N)
uint64_t in_string = string_scanner.next(backslash, raw_quote, separated_values); // 12+LN (+6) ... 20+LN (+8+simd:3))
// total 12+LN (+6+2LN+simd:2N) ... 20+LN (+8+2LN+simd:2N+3)
uint64_t in_string = string_scanner.next(backslash, raw_quote); // 14+LN (+5+simd:3) ... 18+LN (+11+simd:3)
// total 14+LN (+5+2LN+simd:2N+3) ... 18+LN (+11+2LN+simd:2N+3)

return ws & ~in_string;
// critical path = 12+LN (+11+7LN+simd:9N) ... 20+LN (+13+7LN+simd:9N+3)
return ws & ~in_string; // 15+LN ... 19+LN (+1)
// critical path = 15+LN (+13+7LN+simd:9N+3) ... 19+LN (+19+7LN+simd:9N+3)
}


Expand Down
48 changes: 14 additions & 34 deletions src/generic/stage1/json_string_scanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ namespace stage1 {
// Scans blocks for string characters, storing the state necessary to do so
class json_string_scanner {
public:
simdjson_inline uint64_t next(uint64_t backslash, uint64_t raw_quote, uint64_t separated_values) noexcept;
simdjson_inline uint64_t next(uint64_t backslash, uint64_t raw_quote) noexcept;
simdjson_inline uint64_t next_unescaped_quotes(uint64_t backslash, uint64_t raw_quote) noexcept;
simdjson_inline uint64_t next_in_string(uint64_t in_string, uint64_t separated_values) noexcept;
simdjson_inline uint64_t next_in_string(uint64_t in_string) noexcept;
// Returns either UNCLOSED_STRING or SUCCESS
simdjson_inline error_code finish() const noexcept;

Expand All @@ -38,12 +38,11 @@ class json_string_scanner {
//
simdjson_inline uint64_t json_string_scanner::next(
uint64_t backslash, // 3+LN
uint64_t raw_quote, // 3+LN
uint64_t separated_values // 8+LN
uint64_t raw_quote // 3+LN
) noexcept {
uint64_t quote = next_unescaped_quotes(backslash, raw_quote); // 4+LN (+3) or 8+LN (+9)
return next_in_string(quote, separated_values); // 10+LN (+6) or (14+LN or 18+LN (+8+simd:3)).
// critical path = 10+LN (+9) or (14+LN or 18+LN (+17+simd:3))
uint64_t quote = next_unescaped_quotes(backslash, raw_quote); // 4+LN (+3) ... 8+LN (+9)
return next_in_string(quote); // 14+LN ... 18+LN (+2+simd:3)
// critical path = 14+LN (+5+simd:3) ... 18+LN (+11+simd:3)
}

simdjson_inline uint64_t json_string_scanner::next_unescaped_quotes(
Expand All @@ -56,34 +55,15 @@ simdjson_inline uint64_t json_string_scanner::next_unescaped_quotes(
}

simdjson_inline uint64_t json_string_scanner::next_in_string(
uint64_t quote, // 4+LN or 8+LN
uint64_t separated_values // 8+LN
uint64_t quote // 4+LN ... 8+LN
) noexcept {
// Find values that are in the string. ASSUME that strings do not have separators/openers just
// before the end of the string (i.e. "blah," or "blah,["). These are pretty rare.
// TODO: we can also assume the carry in is 1 if the first quote is a trailing quote.
uint64_t lead_quote = quote & separated_values; // 9+LN (+1)
uint64_t trailing_quote = quote & ~separated_values; // 9+LN (+1)
// If we were correct, the subtraction will leave us with:
// LEAD-QUOTE=1 NON-QUOTE=1* TRAIL-QUOTE=0 NON-QUOTE=0* ...
// The general form is this:
// LEAD-QUOTE=1 NON-QUOTE=1|LEAD-QUOTE=0* TRAIL-QUOTE=0 NON-QUOTE=0|TRAIL-QUOTE=1* ...
// // 10+LN (+2)
auto was_still_in_string = this->still_in_string;
uint64_t in_string = bitmask::subtract_borrow(trailing_quote, lead_quote, this->still_in_string);
// Assumption check! LEAD-QUOTE=0 means a lead quote was inside a string--meaning the second
// quote was preceded by a separator/open.
uint64_t lead_quote_in_string = lead_quote & ~in_string; // 11+LN (+2)
if (!lead_quote_in_string) {
// This shouldn't happen often, so we take the heavy branch penalty for it and use the
// high-latency prefix_xor.
this->still_in_string = was_still_in_string;
in_string = bitmask::prefix_xor(quote ^ this->still_in_string); // 14+LN (+1+simd:3)
this->still_in_string = in_string >> 63; // 15+LN (+1)
}
return in_string ^ quote; // flip start and end quotes
// critical path = 10+LN (+6) or (14+LN or 18+LN (+8+simd:3)).
// would be 14+LN or 18+LN (+2+simd:3) by itself
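// Compute the in-string mask unconditionally with the high-latency prefix_xor: XOR the carry
// saved in still_in_string into the quote mask first, so a string left open by the previous
// call continues into this block. (The old fast path and its fallback branch are gone.)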
uint64_t in_string = bitmask::prefix_xor(quote ^ this->still_in_string); // 14+LN (+1+simd:3)
this->still_in_string = in_string >> 63; // 15+LN (+1)
return in_string ^ quote;
// critical path 14+LN ... 18+LN (+2+simd:3)
}


Expand Down

0 comments on commit 3f1b0c3

Please sign in to comment.