diff --git a/README.md b/README.md index b0c1d20a6e..8d3d1da53c 100644 --- a/README.md +++ b/README.md @@ -168,7 +168,7 @@ We distinguish between "bindings" (which just wrap the C++ code) and a port to a - [simdjzon](https://github.com/travisstaloch/simdjzon): zig port. - [JSON-Simd](https://github.com/rawleyfowler/JSON-simd): Raku bindings. - [JSON::SIMD](https://metacpan.org/pod/JSON::SIMD): Perl bindings; fully-featured JSON module that uses simdjson for decoding. -- [gemmaJSON](https://github.com/sainttttt/gemmaJSON): Nim json parser based on simdjson bindings. +- [gemmaJSON](https://github.com/sainttttt/gemmaJSON): Nim JSON parser based on simdjson bindings. About simdjson -------------- diff --git a/benchmark/json_benchmark/file_runner.h b/benchmark/json_benchmark/file_runner.h index 1314729350..0dddc067c4 100644 --- a/benchmark/json_benchmark/file_runner.h +++ b/benchmark/json_benchmark/file_runner.h @@ -24,7 +24,7 @@ struct file_runner : public runner_base { simdjson_warn_unused bool before_run(benchmark::State &state) { if (!runner_base::after_run(state)) { return false; }; - // Copy the original json in case we did *in situ* last time + // Copy the original JSON in case we did *in situ* last time std::memcpy(json.data(), original_json.data(), original_json.size()); return true; } diff --git a/benchmark/json_benchmark/string_runner.h b/benchmark/json_benchmark/string_runner.h index ffde93818a..f9b4603869 100644 --- a/benchmark/json_benchmark/string_runner.h +++ b/benchmark/json_benchmark/string_runner.h @@ -13,7 +13,7 @@ struct string_runner : public runner_base { simdjson_warn_unused bool before_run(benchmark::State &state) { if (!runner_base::after_run(state)) { return false; }; - // Copy the original json in case we did *in situ* + // Copy the original JSON in case we did *in situ* std::memcpy(json.data(), original_json.data(), original_json.size()); return true; } diff --git a/doc/basics.md b/doc/basics.md index bc3905d2fe..73a4249fee 100644 --- 
a/doc/basics.md +++ b/doc/basics.md @@ -19,7 +19,7 @@ An overview of what you need to know to use simdjson, with examples. - [Minifying JSON strings without parsing](#minifying-json-strings-without-parsing) - [UTF-8 validation (alone)](#utf-8-validation-alone) - [JSON Pointer](#json-pointer) - - [JSON Path (subset)](#json-path) + - [JSONPath](#json-path) - [Error Handling](#error-handling) - [Error Handling Examples without Exceptions](#error-handling-examples-without-exceptions) - [Disabling Exceptions](#disabling-exceptions) @@ -1105,6 +1105,17 @@ for (size_t i = 0; i < size; i++) { } ``` +In most instances, a JSON Pointer is an ASCII string and the keys in a JSON document +are ASCII strings. We support UTF-8 in JSON Pointer, but key values are matched exactly, without unescaping or Unicode normalization. We do a byte-by-byte comparison. The e acute character is +considered distinct from its escaped version `\u00E9`. E.g., + +```c++ +const padded_string json = "{\"\\u00E9\":123}"_padded; +auto doc = parser.iterate(json); +doc.at_pointer("/\\u00E9") == 123; // true +doc.at_pointer((const char*)u8"/\u00E9") // returns an error (NO_SUCH_FIELD) +``` + Note that `at_pointer` calls [`rewind`](#rewind) to reset the parser at the beginning of the document. Hence, it invalidates all previously parsed values, objects and arrays: make sure to consume the values between each call to `at_pointer`. Consider the following example where one wants to store each object from the JSON into a vector of `struct car_type`: ```c++ @@ -1167,10 +1178,10 @@ std::cout << doc.find_field("k0") << std::endl; // Prints 27 When the JSON Pointer Path is the empty string (`""`) applied to a scalar document (lone string, number, Boolean or null), a SCALAR_DOCUMENT_AS_VALUE error is returned because scalar document cannot be represented as `value` instances. You can check that a document is a scalar with the method `scalar()`. 
-JSON Path +JSONPath ------------ -The simdjson library now supports a subset of [JSON Path](https://goessner.net/articles/JsonPath/) through the `at_path()` method, allowing you to reach further into the document in a single call. The subset of JSON path that is implemented is the subset that is trivially convertible into the JSON Pointer format, using `.` to access a field and `[]` to access a specific index. +The simdjson library now supports a subset of [JSONPath](https://datatracker.ietf.org/doc/html/draft-normington-jsonpath-00) through the `at_path()` method, allowing you to reach further into the document in a single call. The subset of JSONPath that is implemented is the subset that is trivially convertible into the JSON Pointer format, using `.` to access a field and `[]` to access a specific index. This implementation relies on `at_path()` converting its argument to JSON Pointer and then calling `at_pointer`, which makes use of [`rewind`](#rewind) to reset the parser at the beginning of the document. Hence, it invalidates all previously parsed values, objects and arrays: make sure to consume the values between each call to `at_path`. @@ -1187,7 +1198,7 @@ auto cars = parser.iterate(cars_json); cout << cars.at_path("[0].tire_pressure[1]") << endl; // Prints 39.9 ``` -A call to `at_path(json_path)` can result in any of the errors that are returned by the `at_pointer` method and if the conversion of `json_path` to json pointer fails, it will lead to an `simdjson::INVALID_JSON_POINTER`error. +A call to `at_path(json_path)` can result in any of the errors that are returned by the `at_pointer` method and if the conversion of `json_path` to JSON Pointer fails, it will lead to a `simdjson::INVALID_JSON_POINTER` error. 
```c++ auto cars_json = R"( [ @@ -1197,8 +1208,20 @@ auto cars_json = R"( [ ] )"_padded; ondemand::parser parser; auto cars = parser.iterate(cars_json); -ASSERT_ERROR(cars.at_path("[0].tire_presure[1").get(x), INVALID_JSON_POINTER); // Fails on conversion to json pointer, since last square bracket was not properly closed. -ASSERT_ERROR(cars.at_path("[0].incorrect_field[1]").get(x), NO_SUCH_FIELD); // Conversion to json pointer succeeds, but fails on at_pointer() since the path is invalid. +ASSERT_ERROR(cars.at_path("[0].tire_presure[1").get(x), INVALID_JSON_POINTER); // Fails on conversion to JSON Pointer, since last square bracket was not properly closed. +ASSERT_ERROR(cars.at_path("[0].incorrect_field[1]").get(x), NO_SUCH_FIELD); // Conversion to JSON Pointer succeeds, but fails on at_pointer() since the path is invalid. +``` + +In most instances, a JSONPath is an ASCII string and the keys in a JSON document +are ASCII strings. We support UTF-8 within a JSONPath expression, but key values are +matched exactly, without unescaping or Unicode normalization. We do a byte-by-byte comparison. +The e acute character is considered distinct from its escaped version `\u00E9`. E.g., + +```c++ +const padded_string json = "{\"\\u00E9\":123}"_padded; +auto doc = parser.iterate(json); +doc.at_path(".\\u00E9") == 123; // true +doc.at_path((const char*)u8".\u00E9") // returns an error (NO_SUCH_FIELD) ``` Error Handling diff --git a/doc/dom.md b/doc/dom.md index feec474016..b7b133b2a0 100644 --- a/doc/dom.md +++ b/doc/dom.md @@ -223,13 +223,13 @@ dom::element cars = parser.parse(cars_json); cout << cars.at_pointer("/0/tire_pressure/1") << endl; // Prints 39.9 ``` -A JSON Path is a sequence of segments each starting with the '/' character. Within arrays, an integer +A JSON Pointer expression is a sequence of segments each starting with the '/' character. Within arrays, an integer index allows you to select the indexed node. 
Within objects, the string value of the key allows you to select the value. If your keys contain the characters '/' or '~', they must be escaped as '~1' and -'~0' respectively. An empty JSON Path refers to the whole document. +'~0' respectively. An empty JSON Pointer expression refers to the whole document. We also extend the JSON Pointer support to include *relative* paths. -You can apply a JSON path to any node and the path gets interpreted relatively, as if the current node were a whole JSON document. +You can apply a JSON Pointer expression to any node and the path gets interpreted relatively, as if the current node were a whole JSON document. Consider the following example: diff --git a/fuzz/fuzz_atpointer.cpp b/fuzz/fuzz_atpointer.cpp index f45aa5deb3..915da0d732 100644 --- a/fuzz/fuzz_atpointer.cpp +++ b/fuzz/fuzz_atpointer.cpp @@ -7,11 +7,11 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { - // Split data into two strings, json pointer and the document string. + // Split data into two strings, JSON Pointer and the document string. // Might end up with none, either or both being empty, important for // covering edge cases such as // https://github.com/simdjson/simdjson/issues/1142 Inputs missing the - // separator line will get an empty json pointer but the all the input put in + // separator line will get an empty JSON Pointer but all the input put in // the document string. This means test data from other fuzzers that take json // input works for this fuzzer as well. FuzzData fd(Data, Size); diff --git a/include/simdjson/generic/ondemand/array.h b/include/simdjson/generic/ondemand/array.h index f9083204dd..e6095d27c2 100644 --- a/include/simdjson/generic/ondemand/array.h +++ b/include/simdjson/generic/ondemand/array.h @@ -102,11 +102,14 @@ class array { inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; /** - * Get the value associated with the given JSON path. 
Right now we are only supporting a subset of - * json path that is easily convertible to json pointer. + * Get the value associated with the given JSONPath expression. We only support + * JSONPath queries that are trivially convertible to JSON Pointer queries: key + * names and array indices. * - * @return The value associated with the given JSON path, or: - * - INVALID_JSON_POINTER if the json path to json pointer conversion fails + * https://datatracker.ietf.org/doc/html/draft-normington-jsonpath-00 + * + * @return The value associated with the given JSONPath expression, or: + * - INVALID_JSON_POINTER if the JSONPath to JSON Pointer conversion fails * - NO_SUCH_FIELD if a field does not exist in an object * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length * - INCORRECT_TYPE if a non-integer is used to access an array diff --git a/include/simdjson/generic/ondemand/document.h b/include/simdjson/generic/ondemand/document.h index 1aec08c12c..060f2ec5fe 100644 --- a/include/simdjson/generic/ondemand/document.h +++ b/include/simdjson/generic/ondemand/document.h @@ -587,6 +587,14 @@ class document { * auto doc = parser.iterate(json); * doc.at_pointer("//a/1") == 20 * + * Key values are matched exactly, without unescaping or Unicode normalization. + * We do a byte-by-byte comparison. E.g. + * + * const padded_string json = "{\"\\u00E9\":123}"_padded; + * auto doc = parser.iterate(json); + * doc.at_pointer("/\\u00E9") == 123 + * doc.at_pointer((const char*)u8"/\u00E9") returns an error (NO_SUCH_FIELD) + * * Note that at_pointer() automatically calls rewind between each call. Thus * all values, objects and arrays that you have created so far (including unescaped strings) * are invalidated. After calling at_pointer, you need to consume the result: string values @@ -605,12 +613,22 @@ class document { simdjson_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; /** - * Get the value associated with the given JSON path. 
Right now we are only - * supporting a subset of json path that is easily convertible to json - * pointer. + * Get the value associated with the given JSONPath expression. We only support + * JSONPath queries that are trivially convertible to JSON Pointer queries: key + * names and array indices. + * + * https://datatracker.ietf.org/doc/html/draft-normington-jsonpath-00 + * + * Key values are matched exactly, without unescaping or Unicode normalization. + * We do a byte-by-byte comparison. E.g. + * + * const padded_string json = "{\"\\u00E9\":123}"_padded; + * auto doc = parser.iterate(json); + * doc.at_path(".\\u00E9") == 123 + * doc.at_path((const char*)u8".\u00E9") returns an error (NO_SUCH_FIELD) * - * @return The value associated with the given JSON path, or: - * - INVALID_JSON_POINTER if the json path to json pointer conversion fails + * @return The value associated with the given JSONPath expression, or: + * - INVALID_JSON_POINTER if the JSONPath to JSON Pointer conversion fails * - NO_SUCH_FIELD if a field does not exist in an object * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length * - INCORRECT_TYPE if a non-integer is used to access an array diff --git a/include/simdjson/generic/ondemand/object.h b/include/simdjson/generic/ondemand/object.h index 46680c3be1..b21c10339c 100644 --- a/include/simdjson/generic/ondemand/object.h +++ b/include/simdjson/generic/ondemand/object.h @@ -141,12 +141,12 @@ class object { inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; /** - * Get the value associated with the given JSON path. Right now we are only - * supporting a subset of json path that is easily convertible to json - * pointer. + * Get the value associated with the given JSONPath expression. We only support + * JSONPath queries that are trivially convertible to JSON Pointer queries: key + * names and array indices. 
* - * @return The value associated with the given JSON path, or: - * - INVALID_JSON_POINTER if the json path to json pointer conversion fails + * @return The value associated with the given JSONPath expression, or: + * - INVALID_JSON_POINTER if the JSONPath to JSON Pointer conversion fails * - NO_SUCH_FIELD if a field does not exist in an object * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length * - INCORRECT_TYPE if a non-integer is used to access an array diff --git a/include/simdjson/generic/ondemand/value.h b/include/simdjson/generic/ondemand/value.h index 51ceee037a..4e4f035312 100644 --- a/include/simdjson/generic/ondemand/value.h +++ b/include/simdjson/generic/ondemand/value.h @@ -584,12 +584,12 @@ class value { simdjson_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; /** - * Get the value associated with the given JSON path. Right now we are only - * supporting a subset of json path that is easily convertible to json - * pointer. + * Get the value associated with the given JSONPath expression. We only support + * JSONPath queries that are trivially convertible to JSON Pointer queries: key + * names and array indices. 
* - * @return The value associated with the given JSON path, or: - * - INVALID_JSON_POINTER if the json path to json pointer conversion fails + * @return The value associated with the given JSONPath expression, or: + * - INVALID_JSON_POINTER if the JSONPath to JSON Pointer conversion fails * - NO_SUCH_FIELD if a field does not exist in an object * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length * - INCORRECT_TYPE if a non-integer is used to access an array diff --git a/include/simdjson/padded_string-inl.h b/include/simdjson/padded_string-inl.h index ebb056bde1..e31fe77cf4 100644 --- a/include/simdjson/padded_string-inl.h +++ b/include/simdjson/padded_string-inl.h @@ -54,6 +54,14 @@ inline padded_string::padded_string(const char *data, size_t length) noexcept std::memcpy(data_ptr, data, length); } } +#ifdef __cpp_char8_t +inline padded_string::padded_string(const char8_t *data, size_t length) noexcept + : viable_size(length), data_ptr(internal::allocate_padded_buffer(length)) { + if ((data != nullptr) && (data_ptr != nullptr)) { + std::memcpy(data_ptr, reinterpret_cast(data), length); + } +} +#endif // note: do not pass std::string arguments by value inline padded_string::padded_string(const std::string & str_ ) noexcept : viable_size(str_.size()), data_ptr(internal::allocate_padded_buffer(str_.size())) { @@ -173,5 +181,9 @@ inline simdjson_result padded_string::load(std::string_view filen inline simdjson::padded_string operator "" _padded(const char *str, size_t len) { return simdjson::padded_string(str, len); } - +#ifdef __cpp_char8_t +inline simdjson::padded_string operator "" _padded(const char8_t *str, size_t len) { + return simdjson::padded_string(reinterpret_cast(str), len); +} +#endif #endif // SIMDJSON_PADDED_STRING_INL_H diff --git a/include/simdjson/padded_string.h b/include/simdjson/padded_string.h index ed157367d9..ddc3cbba87 100644 --- a/include/simdjson/padded_string.h +++ b/include/simdjson/padded_string.h @@ -39,6 +39,9 @@ 
struct padded_string final { * @param length the number of bytes to copy */ explicit inline padded_string(const char *data, size_t length) noexcept; +#ifdef __cpp_char8_t + explicit inline padded_string(const char8_t *data, size_t length) noexcept; +#endif /** * Create a new padded string by copying the given input. * @@ -150,6 +153,9 @@ inline std::ostream& operator<<(std::ostream& out, simdjson_result(s), len, capacity) { } - +#ifdef __cpp_char8_t +inline padded_string_view::padded_string_view(const char8_t* s, size_t len, size_t capacity) noexcept + : padded_string_view(reinterpret_cast(s), len, capacity) +{ +} +#endif inline padded_string_view::padded_string_view(const std::string &s) noexcept : std::string_view(s), _capacity(s.capacity()) { diff --git a/include/simdjson/padded_string_view.h b/include/simdjson/padded_string_view.h index 6ca7ff3e0d..f8dc59a79c 100644 --- a/include/simdjson/padded_string_view.h +++ b/include/simdjson/padded_string_view.h @@ -34,7 +34,9 @@ class padded_string_view : public std::string_view { explicit inline padded_string_view(const char* s, size_t len, size_t capacity) noexcept; /** overload explicit inline padded_string_view(const char* s, size_t len) noexcept */ explicit inline padded_string_view(const uint8_t* s, size_t len, size_t capacity) noexcept; - +#ifdef __cpp_char8_t + explicit inline padded_string_view(const char8_t* s, size_t len, size_t capacity) noexcept; +#endif /** * Promise the given string has at least SIMDJSON_PADDING extra bytes allocated to it. 
* diff --git a/tests/ondemand/ondemand_readme_examples.cpp b/tests/ondemand/ondemand_readme_examples.cpp index a7873a9058..dcdf23c661 100644 --- a/tests/ondemand/ondemand_readme_examples.cpp +++ b/tests/ondemand/ondemand_readme_examples.cpp @@ -1068,6 +1068,24 @@ bool json_pointer_simple() { TEST_SUCCEED(); } +bool json_pointer_unicode() { + TEST_START(); + const padded_string json = u8"{\"\u00E9\":123}"_padded; + ondemand::parser parser; + ondemand::document doc; + int64_t x; + ASSERT_SUCCESS(parser.iterate(json).get(doc)); + ASSERT_SUCCESS(doc.at_pointer((const char*)u8"/\u00E9").get(x)); + ASSERT_EQUAL(x,123); + + const padded_string json2 = "{\"\\u00E9\":123}"_padded; + ASSERT_SUCCESS(parser.iterate(json2).get(doc)); + ASSERT_SUCCESS(doc.at_pointer("/\\u00E9").get(x)); + ASSERT_ERROR(doc.at_pointer((const char*)u8"/\u00E9"), NO_SUCH_FIELD); + ASSERT_EQUAL(x,123); + TEST_SUCCEED(); +} + bool json_path_simple() { TEST_START(); ondemand::parser parser; @@ -1079,13 +1097,31 @@ bool json_path_simple() { TEST_SUCCEED(); } +bool json_path_unicode() { + TEST_START(); + ondemand::parser parser; + ondemand::document doc; + const padded_string json = u8"{\"\u00E9\":123}"_padded; + int64_t x; + ASSERT_SUCCESS(parser.iterate(json).get(doc)); + ASSERT_SUCCESS(doc.at_path((const char*)u8".\u00E9").get(x)); + ASSERT_EQUAL(x,123); + + const padded_string json2 = "{\"\\u00E9\":123}"_padded; + ASSERT_SUCCESS(parser.iterate(json2).get(doc)); + ASSERT_SUCCESS(doc.at_path(".\\u00E9").get(x)); + ASSERT_ERROR(doc.at_path((const char*)u8".\u00E9"), NO_SUCH_FIELD); + ASSERT_EQUAL(x,123); + TEST_SUCCEED(); +} + bool invalid_json_path() { TEST_START(); ondemand::parser parser; ondemand::document cars; double x; ASSERT_SUCCESS(parser.iterate(cars_json).get(cars)); - ASSERT_ERROR(cars.at_path("[0].tire_presure[1").get(x), INVALID_JSON_POINTER); // Fails on conversion to json pointer + ASSERT_ERROR(cars.at_path("[0].tire_presure[1").get(x), INVALID_JSON_POINTER); // Fails on conversion to 
JSON Pointer ASSERT_ERROR(cars.at_path("[0].incorrect_field[1]").get(x), NO_SUCH_FIELD); // Fails on at_pointer() TEST_SUCCEED(); } @@ -1771,7 +1807,9 @@ bool run() { #endif && using_the_parsed_json_6() && json_pointer_simple() + && json_pointer_unicode() && json_path_simple() + && json_path_unicode() && invalid_json_path() && json_pointer_multiple() && json_path_multiple()