Skip to content

Commit

Permalink
Revert string parsing to old algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
jkeiser committed Aug 31, 2023
1 parent 0e28a83 commit 8149284
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 40 deletions.
15 changes: 7 additions & 8 deletions src/generic/stage1/json_scanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,18 +126,18 @@ simdjson_inline uint64_t json_scanner::next(const simd::simd8x64<uint8_t>& in, c
// printf("\n");
// printf("%30.30s: %s\n", "next", format_input_text(in));

// Figure out what's in a string
uint64_t quote = string_scanner.next_unescaped_quotes(block.backslash, block.raw_quote);
uint64_t in_string = string_scanner.next_in_string(quote);

// Get structurals
uint64_t scalar_close = block.scalar_close();
uint64_t separated_values = next_separated_values(block.sep_open(), scalar_close);
uint64_t scalar = scalar_close & ~block.close;

// Figure out what's in a string
uint64_t quote = string_scanner.next_unescaped_quotes(block.backslash, block.raw_quote);
uint64_t in_string = string_scanner.next_in_string(quote, separated_values);

// Join up structurals and strings
uint64_t lead_value = scalar & separated_values;
uint64_t all_structurals = block.op_without_comma() | lead_value;

// Join up structurals and strings
uint64_t structurals = all_structurals & ~in_string;

// Check for errors
Expand Down Expand Up @@ -209,8 +209,7 @@ simdjson_inline uint64_t json_scanner::next_whitespace(
const simd::simd8x64<uint8_t>& in,
const basic_block_classification& block
) noexcept {
uint64_t separated_values = next_separated_values(block.sep_open(), block.scalar_close());
uint64_t in_string = string_scanner.next(block.backslash, block.raw_quote, separated_values);
uint64_t in_string = string_scanner.next(block.backslash, block.raw_quote);
return block.ws & ~in_string;
}

Expand Down
46 changes: 14 additions & 32 deletions src/generic/stage1/json_string_scanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ namespace stage1 {
// Scans blocks for string characters, storing the state necessary to do so
class json_string_scanner {
public:
simdjson_inline uint64_t next(uint64_t backslash, uint64_t raw_quote, uint64_t separated_values) noexcept;
simdjson_inline uint64_t next(uint64_t backslash, uint64_t raw_quote) noexcept;
simdjson_inline uint64_t next_unescaped_quotes(uint64_t backslash, uint64_t raw_quote) noexcept;
simdjson_inline uint64_t next_in_string(uint64_t in_string, uint64_t separated_values) noexcept;
simdjson_inline uint64_t next_in_string(uint64_t in_string) noexcept;
// Returns either UNCLOSED_STRING or SUCCESS
simdjson_inline error_code finish() const noexcept;

Expand All @@ -39,12 +39,11 @@ class json_string_scanner {
//
simdjson_inline uint64_t json_string_scanner::next(
uint64_t backslash, // 3+LN
uint64_t raw_quote, // 3+LN
uint64_t separated_values // 8+LN
uint64_t raw_quote // 3+LN
) noexcept {
uint64_t quote = next_unescaped_quotes(backslash, raw_quote); // 4+LN (+3) or 8+LN (+9)
return next_in_string(quote, separated_values); // 10+LN (+6) or (14+LN or 18+LN (+8+simd:3)).
// critical path = 10+LN (+9) or (14+LN or 18+LN (+17+simd:3))
uint64_t quote = next_unescaped_quotes(backslash, raw_quote); // 4+LN (+3) ... 8+LN (+9)
return next_in_string(quote); // 14+LN ... 18+LN (+2+simd:3)
// critical path = 14+LN (+5+simd:3) ... 18+LN (+11+simd:3)
}

simdjson_inline uint64_t json_string_scanner::next_unescaped_quotes(
Expand All @@ -57,32 +56,15 @@ simdjson_inline uint64_t json_string_scanner::next_unescaped_quotes(
}

simdjson_inline uint64_t json_string_scanner::next_in_string(
uint64_t quote, // 4+LN or 8+LN
uint64_t separated_values // 8+LN
uint64_t quote // 4+LN ... 8+LN
) noexcept {
// Find values that are in the string. ASSUME that strings do not have separators/openers just
// before the end of the string (i.e. "blah," or "blah,["). These are pretty rare.
// TODO: we can also assume the carry in is 1 if the first quote is a trailing quote.
uint64_t lead_quote = quote & separated_values; // 9+LN (+1)
uint64_t trailing_quote = quote & ~separated_values; // 9+LN (+1)
// If we were correct, the subtraction will leave us with:
// LEAD-QUOTE=1 NON-QUOTE=1* TRAIL-QUOTE=0 NON-QUOTE=0* ...
// The general form is this:
// LEAD-QUOTE=1 NON-QUOTE=1|LEAD-QUOTE=0* TRAIL-QUOTE=0 NON-QUOTE=0|TRAIL-QUOTE=1* ...
// // 10+LN (+2)
auto was_still_in_string = this->still_in_string;
uint64_t in_string = bitmask::subtract_borrow(trailing_quote, lead_quote, this->still_in_string);
// Assumption check! LEAD-QUOTE=0 means a lead quote was inside a string--meaning the second
// quote was preceded by a separator/open.
if (simdjson_unlikely(lead_quote & ~in_string)) {
// This shouldn't happen often, so we take the heavy branch penalty for it and use the
// high-latency prefix_xor.
in_string = bitmask::prefix_xor(quote ^ was_still_in_string); // 14+LN (+1+simd:3)
this->still_in_string = in_string >> 63; // 15+LN (+1)
}
return in_string ^ quote; // flip start and end quotes
// critical path = 10+LN (+6) or (14+LN or 18+LN (+8+simd:3)).
// would be 14+LN or 18+LN (+2+simd:3) by itself
// Always use the high-latency prefix_xor: there is no fast path or branch here anymore.
// quote ^ still_in_string folds in the state saved by the previous call.
// this->still_in_string = was_still_in_string;
uint64_t in_string = bitmask::prefix_xor(quote ^ this->still_in_string); // 14+LN (+1+simd:3)
this->still_in_string = in_string >> 63; // 15+LN (+1)
return in_string ^ quote;
// critical path 14+LN ... 18+LN (+2+simd:3)
}


Expand Down

0 comments on commit 8149284

Please sign in to comment.