Skip to content

Commit

Permalink
Adding path for parsing incomplete json. (#2189)
Browse files Browse the repository at this point in the history
1. Allows processing incomplete, damaged, or corrupted JSON to some extent.
2. Parity with the Presto Java functionality.
3. Protected with SIMDJSON_EXPERIMENTAL_ALLOW_INCOMPLETE_JSON define.
4. Does not interfere with the normal path (can co-exist).
5. Tested in a production workflow.
  • Loading branch information
spershin committed May 30, 2024
1 parent c80dda7 commit deefc88
Show file tree
Hide file tree
Showing 6 changed files with 640 additions and 23 deletions.
17 changes: 17 additions & 0 deletions include/simdjson/generic/ondemand/json_iterator-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,23 @@ simdjson_inline json_iterator::json_iterator(const uint8_t *buf, ondemand::parse
#endif
}

#ifdef SIMDJSON_EXPERIMENTAL_ALLOW_INCOMPLETE_JSON
// Constructor overload that lets the caller choose the value of `_streaming`.
// Only compiled when SIMDJSON_EXPERIMENTAL_ALLOW_INCOMPLETE_JSON is defined.
//
// buf        Start of the JSON bytes; handed to `token` together with the
//            first entry of the parser's structural index array.
// _parser    Parser whose `implementation->structural_indexes` and
//            `string_buf` this iterator points into.
// streaming  Stored verbatim in `_streaming`.
simdjson_inline json_iterator::json_iterator(const uint8_t *buf, ondemand::parser *_parser, bool streaming) noexcept
: token(buf, &_parser->implementation->structural_indexes[0]),
parser{_parser},
// NOTE(review): this reads the `parser` member, not the `_parser` argument, so it
// relies on `parser` being declared (and therefore initialized) before
// `_string_buf_loc` in the class -- confirm against json_iterator.h.
_string_buf_loc{parser->string_buf.get()},
// The iterator starts at depth 1.
_depth{1},
// `_root` points at the beginning of the structural index array.
_root{parser->implementation->structural_indexes.get()},
_streaming{streaming}

{
logger::log_headers();
// The token check below is only compiled in when SIMDJSON_CHECK_EOF is non-zero.
#if SIMDJSON_CHECK_EOF
assert_more_tokens();
#endif
}
#endif // SIMDJSON_EXPERIMENTAL_ALLOW_INCOMPLETE_JSON

inline void json_iterator::rewind() noexcept {
token.set_position( root_position() );
logger::log_headers(); // We start again
Expand Down
3 changes: 3 additions & 0 deletions include/simdjson/generic/ondemand/json_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,9 @@ class json_iterator {
inline bool balanced() const noexcept;
protected:
simdjson_inline json_iterator(const uint8_t *buf, ondemand::parser *parser) noexcept;
#ifdef SIMDJSON_EXPERIMENTAL_ALLOW_INCOMPLETE_JSON
/// Overload of the constructor above that additionally takes the value to store
/// in the iterator's `_streaming` flag. Only declared when
/// SIMDJSON_EXPERIMENTAL_ALLOW_INCOMPLETE_JSON is defined.
simdjson_inline json_iterator(const uint8_t *buf, ondemand::parser *parser, bool streaming) noexcept;
#endif // SIMDJSON_EXPERIMENTAL_ALLOW_INCOMPLETE_JSON
/// The last token before the end
simdjson_inline token_position last_position() const noexcept;
/// The token *at* the end. This points at gibberish and should only be used for comparison.
Expand Down
21 changes: 21 additions & 0 deletions include/simdjson/generic/ondemand/parser-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,27 @@ simdjson_warn_unused simdjson_inline simdjson_result<document> parser::iterate(p
return document::start({ reinterpret_cast<const uint8_t *>(json.data()), this });
}

#ifdef SIMDJSON_EXPERIMENTAL_ALLOW_INCOMPLETE_JSON
/**
 * Experimental entry point that starts a document even when stage 1 reports
 * UNCLOSED_STRING. Only compiled when SIMDJSON_EXPERIMENTAL_ALLOW_INCOMPLETE_JSON
 * is defined.
 *
 * @param json The JSON input; its padding must be at least SIMDJSON_PADDING.
 * @return The started document, or an error:
 *         - INSUFFICIENT_PADDING if the input has too little padding,
 *         - whatever allocate() reports if (re)allocation fails,
 *         - any stage 1 error other than UNCLOSED_STRING.
 */
simdjson_warn_unused simdjson_inline simdjson_result<document> parser::iterate_allow_incomplete_json(padded_string_view json) & noexcept {
  // Refuse input that does not carry enough padding.
  if (json.padding() < SIMDJSON_PADDING) {
    return INSUFFICIENT_PADDING;
  }

  // Strip the UTF-8 BOM first: the data pointer and length are taken afterwards.
  json.remove_utf8_bom();
  const auto byte_count = json.length();
  const uint8_t *const bytes = reinterpret_cast<const uint8_t *>(json.data());

  // Grow the parser when it is too small for this input or has no string buffer yet.
  if (capacity() < byte_count || !string_buf) {
    SIMDJSON_TRY( allocate(byte_count, max_depth()) );
  }

  // Stage 1: an unclosed string is deliberately tolerated; every other error is returned.
  const simdjson::error_code stage1_status = implementation->stage1(bytes, byte_count, stage1_mode::regular);
  const bool is_tolerated = (stage1_status == UNCLOSED_STRING);
  if (stage1_status && !is_tolerated) {
    return stage1_status;
  }

  // The trailing `true` is forwarded as the iterator's `streaming` argument.
  return document::start({ bytes, this, true });
}
#endif // SIMDJSON_EXPERIMENTAL_ALLOW_INCOMPLETE_JSON

simdjson_warn_unused simdjson_inline simdjson_result<document> parser::iterate(const char *json, size_t len, size_t allocated) & noexcept {
return iterate(padded_string_view(json, len, allocated));
}
Expand Down
3 changes: 3 additions & 0 deletions include/simdjson/generic/ondemand/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ class parser {
* - UNCLOSED_STRING if there is an unclosed string in the document.
*/
simdjson_warn_unused simdjson_result<document> iterate(padded_string_view json) & noexcept;
#ifdef SIMDJSON_EXPERIMENTAL_ALLOW_INCOMPLETE_JSON
/**
 * Experimental: start iterating over a JSON document while tolerating an
 * UNCLOSED_STRING error from stage 1. Only declared when
 * SIMDJSON_EXPERIMENTAL_ALLOW_INCOMPLETE_JSON is defined.
 *
 * @param json The JSON input; its padding must be at least SIMDJSON_PADDING.
 * @return The document, or an error:
 *         - INSUFFICIENT_PADDING if the input has less than SIMDJSON_PADDING bytes of padding.
 *         - An allocation error if the parser could not be (re)allocated for this input.
 *         - Any stage 1 error other than UNCLOSED_STRING.
 */
simdjson_warn_unused simdjson_result<document> iterate_allow_incomplete_json(padded_string_view json) & noexcept;
#endif // SIMDJSON_EXPERIMENTAL_ALLOW_INCOMPLETE_JSON
/** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */
simdjson_warn_unused simdjson_result<document> iterate(const char *json, size_t len, size_t capacity) & noexcept;
/** @overload simdjson_result<document> iterate(padded_string_view json) & noexcept */
Expand Down
5 changes: 3 additions & 2 deletions singleheader/simdjson.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* auto-generated on 2024-05-07 18:04:59 -0400. Do not edit! */
/* auto-generated on 2024-05-23 16:54:23 -0400. Do not edit! */
/* including simdjson.cpp: */
/* begin file simdjson.cpp */
#define SIMDJSON_SRC_SIMDJSON_CPP
Expand Down Expand Up @@ -2359,7 +2359,7 @@ enum error_code {
INDEX_OUT_OF_BOUNDS, ///< JSON array index too large
NO_SUCH_FIELD, ///< JSON field not found in object
IO_ERROR, ///< Error reading a file
INVALID_JSON_POINTER, ///< Invalid JSON pointer reference
INVALID_JSON_POINTER, ///< Invalid JSON pointer syntax
INVALID_URI_FRAGMENT, ///< Invalid URI fragment
UNEXPECTED_ERROR, ///< indicative of a bug in simdjson
PARSER_IN_USE, ///< parser is already in use.
Expand Down Expand Up @@ -6896,6 +6896,7 @@ static inline uint32_t detect_supported_architectures() {
/* end file internal/isadetection.h */

#include <initializer_list>
#include <type_traits>

namespace simdjson {

Expand Down
Loading

0 comments on commit deefc88

Please sign in to comment.