From f7b31b0ac3e8bbaebc5fc8b433da682bd761682b Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Thu, 15 Feb 2024 11:23:38 -0500 Subject: [PATCH 1/6] Documenting our support of JSON Path and JSON Pointer vs. Unicode characters --- doc/basics.md | 22 ++++++++++++ include/simdjson/generic/ondemand/array.h | 5 +-- include/simdjson/generic/ondemand/document.h | 22 ++++++++++-- include/simdjson/generic/ondemand/object.h | 6 ++-- include/simdjson/generic/ondemand/value.h | 6 ++-- tests/ondemand/ondemand_readme_examples.cpp | 38 ++++++++++++++++++++ 6 files changed, 88 insertions(+), 11 deletions(-) diff --git a/doc/basics.md b/doc/basics.md index bc3905d2fe..9a0383f7d4 100644 --- a/doc/basics.md +++ b/doc/basics.md @@ -1105,6 +1105,17 @@ for (size_t i = 0; i < size; i++) { } ``` +In most instances, a JSON Path is an ASCII string and the keys in a JSON document +are ASCII strings. We support UTF-8 in JSON Pointer, but key values are matched exactly, without unescaping or Unicode normalization. We do a byte-by-byte comparison. The e acute character is +considered distinct from its escaped version `\u00E9`. E.g., + +```c++ +const padded_string json = "{\"\\u00E9\":123}"_padded; +auto doc = parser.iterate(json); +doc.at_pointer("/\\u00E9") == 123; // true +doc.at_pointer("/\u00E9") // returns an error (NO_SUCH_FIELD) +``` + Note that `at_pointer` calls [`rewind`](#rewind) to reset the parser at the beginning of the document. Hence, it invalidates all previously parsed values, objects and arrays: make sure to consume the values between each call to `at_pointer`. Consider the following example where one wants to store each object from the JSON into a vector of `struct car_type`: ```c++ @@ -1201,6 +1212,17 @@ ASSERT_ERROR(cars.at_path("[0].tire_presure[1").get(x), INVALID_JSON_POINTER); / ASSERT_ERROR(cars.at_path("[0].incorrect_field[1]").get(x), NO_SUCH_FIELD); // Conversion to json pointer succeeds, but fails on at_pointer() since the path is invalid. ``` +In most instances, a JSON Path is an ASCII string and the keys in a JSON document +are ASCII strings. We support UTF-8, but key values are matched exactly, without unescaping or Unicode normalization. We do a byte-by-byte comparison. The e acute character is +considered distinct from its escaped version `\u00E9`. E.g., + +```c++ +const padded_string json = "{\"\\u00E9\":123}"_padded; +auto doc = parser.iterate(json); +doc.at_pointer(".\\u00E9") == 123; // true +doc.at_pointer(".\u00E9") // returns an error (NO_SUCH_FIELD) +``` + Error Handling -------------- diff --git a/include/simdjson/generic/ondemand/array.h b/include/simdjson/generic/ondemand/array.h index f9083204dd..541901a37d 100644 --- a/include/simdjson/generic/ondemand/array.h +++ b/include/simdjson/generic/ondemand/array.h @@ -102,8 +102,9 @@ class array { inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; /** - * Get the value associated with the given JSON path. Right now we are only supporting a subset of - * json path that is easily convertible to json pointer. + * Get the value associated with the given JSON path. We only support + * JSON Path queries that trivially convertible to JSON Pointer queries: key + * names and array indices. * * @return The value associated with the given JSON path, or: * - INVALID_JSON_POINTER if the json path to json pointer conversion fails diff --git a/include/simdjson/generic/ondemand/document.h b/include/simdjson/generic/ondemand/document.h index 1aec08c12c..02244dccb6 100644 --- a/include/simdjson/generic/ondemand/document.h +++ b/include/simdjson/generic/ondemand/document.h @@ -587,6 +587,14 @@ class document { * auto doc = parser.iterate(json); * doc.at_pointer("//a/1") == 20 * + * Key values are matched exactly, without unescaping or Unicode normalization. + * We do a byte-by-byte comparison. E.g. + * + * const padded_string json = "{\"\\u00E9\":123}"_padded; + * auto doc = parser.iterate(json); + * doc.at_pointer("/\\u00E9") == 123 + * doc.at_pointer("/\u00E9") returns an error (NO_SUCH_FIELD) + * * Note that at_pointer() automatically calls rewind between each call. Thus * all values, objects and arrays that you have created so far (including unescaped strings) * are invalidated. After calling at_pointer, you need to consume the result: string values @@ -605,9 +613,17 @@ class document { simdjson_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; /** - * Get the value associated with the given JSON path. Right now we are only - * supporting a subset of json path that is easily convertible to json - * pointer. + * Get the value associated with the given JSON path. We only support + * JSON Path queries that trivially convertible to JSON Pointer queries: key + * names and array indices. + * + * Key values are matched exactly, without unescaping or Unicode normalization. + * We do a byte-by-byte comparison. E.g. + * + * const padded_string json = "{\"\\u00E9\":123}"_padded; + * auto doc = parser.iterate(json); + * doc.at_path(".\\u00E9") == 123 + * doc.at_path(".\u00E9") returns an error (NO_SUCH_FIELD) * * @return The value associated with the given JSON path, or: * - INVALID_JSON_POINTER if the json path to json pointer conversion fails diff --git a/include/simdjson/generic/ondemand/object.h b/include/simdjson/generic/ondemand/object.h index 46680c3be1..c6d1dda1d5 100644 --- a/include/simdjson/generic/ondemand/object.h +++ b/include/simdjson/generic/ondemand/object.h @@ -141,9 +141,9 @@ class object { inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; /** - * Get the value associated with the given JSON path. Right now we are only - * supporting a subset of json path that is easily convertible to json - * pointer. + * Get the value associated with the given JSON path. We only support + * JSON Path queries that trivially convertible to JSON Pointer queries: key + * names and array indices. * * @return The value associated with the given JSON path, or: * - INVALID_JSON_POINTER if the json path to json pointer conversion fails diff --git a/include/simdjson/generic/ondemand/value.h b/include/simdjson/generic/ondemand/value.h index 51ceee037a..4531c62365 100644 --- a/include/simdjson/generic/ondemand/value.h +++ b/include/simdjson/generic/ondemand/value.h @@ -584,9 +584,9 @@ class value { simdjson_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; /** - * Get the value associated with the given JSON path. Right now we are only - * supporting a subset of json path that is easily convertible to json - * pointer. + * Get the value associated with the given JSON path. We only support + * JSON Path queries that trivially convertible to JSON Pointer queries: key + * names and array indices. * * @return The value associated with the given JSON path, or: * - INVALID_JSON_POINTER if the json path to json pointer conversion fails diff --git a/tests/ondemand/ondemand_readme_examples.cpp b/tests/ondemand/ondemand_readme_examples.cpp index a7873a9058..056b7c91fb 100644 --- a/tests/ondemand/ondemand_readme_examples.cpp +++ b/tests/ondemand/ondemand_readme_examples.cpp @@ -1068,6 +1068,24 @@ bool json_pointer_simple() { TEST_SUCCEED(); } +bool json_pointer_unicode() { + TEST_START(); + const padded_string json = "{\"\u00E9\":123}"_padded; + ondemand::parser parser; + ondemand::document doc; + int64_t x; + ASSERT_SUCCESS(parser.iterate(json).get(doc)); + ASSERT_SUCCESS(doc.at_pointer("/\u00E9").get(x)); + ASSERT_EQUAL(x,123); + + const padded_string json2 = "{\"\\u00E9\":123}"_padded; + ASSERT_SUCCESS(parser.iterate(json2).get(doc)); + ASSERT_SUCCESS(doc.at_pointer("/\\u00E9").get(x)); + ASSERT_ERROR(doc.at_pointer("/\u00E9"), NO_SUCH_FIELD); + ASSERT_EQUAL(x,123); + TEST_SUCCEED(); +} + bool json_path_simple() { TEST_START(); ondemand::parser parser; @@ -1079,6 +1097,24 @@ bool json_path_simple() { TEST_SUCCEED(); } +bool json_path_unicode() { + TEST_START(); + ondemand::parser parser; + ondemand::document doc; + const padded_string json = "{\"\u00E9\":123}"_padded; + int64_t x; + ASSERT_SUCCESS(parser.iterate(json).get(doc)); + ASSERT_SUCCESS(doc.at_path(".\u00E9").get(x)); + ASSERT_EQUAL(x,123); + + const padded_string json2 = "{\"\\u00E9\":123}"_padded; + ASSERT_SUCCESS(parser.iterate(json2).get(doc)); + ASSERT_SUCCESS(doc.at_path(".\\u00E9").get(x)); + ASSERT_ERROR(doc.at_path(".\u00E9"), NO_SUCH_FIELD); + ASSERT_EQUAL(x,123); + TEST_SUCCEED(); +} + bool invalid_json_path() { TEST_START(); ondemand::parser parser; @@ -1771,7 +1807,9 @@ bool run() { #endif && using_the_parsed_json_6() && json_pointer_simple() + && json_pointer_unicode() && json_path_simple() + && json_path_unicode() && invalid_json_path() && json_pointer_multiple() && json_path_multiple() From a26893b0fe3be3f1420a38adc50802c46c54e256 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Thu, 15 Feb 2024 11:33:39 -0500 Subject: [PATCH 2/6] using more standard terminology (nitpicking) --- benchmark/json_benchmark/file_runner.h | 2 +- benchmark/json_benchmark/string_runner.h | 2 +- doc/basics.md | 25 ++++++++++---------- fuzz/fuzz_atpointer.cpp | 4 ++-- include/simdjson/generic/ondemand/array.h | 10 ++++---- include/simdjson/generic/ondemand/document.h | 10 ++++---- include/simdjson/generic/ondemand/object.h | 8 +++---- include/simdjson/generic/ondemand/value.h | 8 +++---- tests/ondemand/ondemand_readme_examples.cpp | 2 +- 9 files changed, 38 insertions(+), 33 deletions(-) diff --git a/benchmark/json_benchmark/file_runner.h b/benchmark/json_benchmark/file_runner.h index 1314729350..0dddc067c4 100644 --- a/benchmark/json_benchmark/file_runner.h +++ b/benchmark/json_benchmark/file_runner.h @@ -24,7 +24,7 @@ struct file_runner : public runner_base { simdjson_warn_unused bool before_run(benchmark::State &state) { if (!runner_base::after_run(state)) { return false; }; - // Copy the original json in case we did *in situ* last time + // Copy the original JSON in case we did *in situ* last time std::memcpy(json.data(), original_json.data(), original_json.size()); return true; } diff --git a/benchmark/json_benchmark/string_runner.h b/benchmark/json_benchmark/string_runner.h index ffde93818a..f9b4603869 100644 --- a/benchmark/json_benchmark/string_runner.h +++ b/benchmark/json_benchmark/string_runner.h @@ -13,7 +13,7 @@ struct string_runner : public runner_base { simdjson_warn_unused bool before_run(benchmark::State &state) { if (!runner_base::after_run(state)) { return false; }; - // Copy the original json in case we did *in situ* + // Copy the original JSON in case we did *in situ* std::memcpy(json.data(), original_json.data(), original_json.size()); return true; } diff --git a/doc/basics.md b/doc/basics.md index 9a0383f7d4..b89e307016 100644 --- a/doc/basics.md +++ b/doc/basics.md @@ -19,7 +19,7 @@ An overview of what you need to know to use simdjson, with examples. - [Minifying JSON strings without parsing](#minifying-json-strings-without-parsing) - [UTF-8 validation (alone)](#utf-8-validation-alone) - [JSON Pointer](#json-pointer) - - [JSON Path (subset)](#json-path) + - [JSONPath](#json-path) - [Error Handling](#error-handling) - [Error Handling Examples without Exceptions](#error-handling-examples-without-exceptions) - [Disabling Exceptions](#disabling-exceptions) @@ -1105,7 +1105,7 @@ for (size_t i = 0; i < size; i++) { } ``` -In most instances, a JSON Path is an ASCII string and the keys in a JSON document +In most instances, a JSON Pointer is an ASCII string and the keys in a JSON document are ASCII strings. We support UTF-8 in JSON Pointer, but key values are matched exactly, without unescaping or Unicode normalization. We do a byte-by-byte comparison. The e acute character is considered distinct from its escaped version `\u00E9`. E.g., @@ -1178,10 +1178,10 @@ std::cout << doc.find_field("k0") << std::endl; // Prints 27 When the JSON Pointer Path is the empty string (`""`) applied to a scalar document (lone string, number, Boolean or null), a SCALAR_DOCUMENT_AS_VALUE error is returned because scalar document cannot be represented as `value` instances. You can check that a document is a scalar with the method `scalar()`. -JSON Path +JSONPath ------------ -The simdjson library now supports a subset of [JSON Path](https://goessner.net/articles/JsonPath/) through the `at_path()` method, allowing you to reach further into the document in a single call. The subset of JSON path that is implemented is the subset that is trivially convertible into the JSON Pointer format, using `.` to access a field and `[]` to access a specific index. +The simdjson library now supports a subset of [JSONPath](https://datatracker.ietf.org/doc/html/draft-normington-jsonpath-00) through the `at_path()` method, allowing you to reach further into the document in a single call. The subset of JSONPath that is implemented is the subset that is trivially convertible into the JSON Pointer format, using `.` to access a field and `[]` to access a specific index. This implementation relies on `at_path()` converting its argument to JSON Pointer and then calling `at_pointer`, which makes use of [`rewind`](#rewind) to reset the parser at the beginning of the document. Hence, it invalidates all previously parsed values, objects and arrays: make sure to consume the values between each call to `at_path`. @@ -1198,7 +1198,7 @@ auto cars = parser.iterate(cars_json); cout << cars.at_path("[0].tire_pressure[1]") << endl; // Prints 39.9 ``` -A call to `at_path(json_path)` can result in any of the errors that are returned by the `at_pointer` method and if the conversion of `json_path` to json pointer fails, it will lead to an `simdjson::INVALID_JSON_POINTER`error. +A call to `at_path(json_path)` can result in any of the errors that are returned by the `at_pointer` method and if the conversion of `json_path` to JSON Pointer fails, it will lead to an `simdjson::INVALID_JSON_POINTER`error. ```c++ auto cars_json = R"( [ @@ -1208,19 +1208,20 @@ auto cars_json = R"( [ ] )"_padded; ondemand::parser parser; auto cars = parser.iterate(cars_json); -ASSERT_ERROR(cars.at_path("[0].tire_presure[1").get(x), INVALID_JSON_POINTER); // Fails on conversion to json pointer, since last square bracket was not properly closed. -ASSERT_ERROR(cars.at_path("[0].incorrect_field[1]").get(x), NO_SUCH_FIELD); // Conversion to json pointer succeeds, but fails on at_pointer() since the path is invalid. +ASSERT_ERROR(cars.at_path("[0].tire_presure[1").get(x), INVALID_JSON_POINTER); // Fails on conversion to JSON Pointer, since last square bracket was not properly closed. +ASSERT_ERROR(cars.at_path("[0].incorrect_field[1]").get(x), NO_SUCH_FIELD); // Conversion to JSON Pointer succeeds, but fails on at_pointer() since the path is invalid. ``` -In most instances, a JSON Path is an ASCII string and the keys in a JSON document -are ASCII strings. We support UTF-8, but key values are matched exactly, without unescaping or Unicode normalization. We do a byte-by-byte comparison. The e acute character is -considered distinct from its escaped version `\u00E9`. E.g., +In most instances, a JSONPath is an ASCII string and the keys in a JSON document +are ASCII strings. We support UTF-8 within a JSONPath expression, but key values are +matched exactly, without unescaping or Unicode normalization. We do a byte-by-byte comparison. +The e acute character is considered distinct from its escaped version `\u00E9`. E.g., ```c++ const padded_string json = "{\"\\u00E9\":123}"_padded; auto doc = parser.iterate(json); -doc.at_pointer(".\\u00E9") == 123; // true -doc.at_pointer(".\u00E9") // returns an error (NO_SUCH_FIELD) +doc.at_path(".\\u00E9") == 123; // true +doc.at_path(".\u00E9") // returns an error (NO_SUCH_FIELD) ``` Error Handling diff --git a/fuzz/fuzz_atpointer.cpp b/fuzz/fuzz_atpointer.cpp index f45aa5deb3..915da0d732 100644 --- a/fuzz/fuzz_atpointer.cpp +++ b/fuzz/fuzz_atpointer.cpp @@ -7,11 +7,11 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { - // Split data into two strings, json pointer and the document string. + // Split data into two strings, JSON Pointer and the document string. // Might end up with none, either or both being empty, important for // covering edge cases such as // https://github.com/simdjson/simdjson/issues/1142 Inputs missing the - // separator line will get an empty json pointer but the all the input put in + // separator line will get an empty JSON Pointer but the all the input put in // the document string. This means test data from other fuzzers that take json // input works for this fuzzer as well. FuzzData fd(Data, Size); diff --git a/include/simdjson/generic/ondemand/array.h b/include/simdjson/generic/ondemand/array.h index 541901a37d..e6095d27c2 100644 --- a/include/simdjson/generic/ondemand/array.h +++ b/include/simdjson/generic/ondemand/array.h @@ -102,12 +102,14 @@ class array { inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; /** - * Get the value associated with the given JSON path. We only support - * JSON Path queries that trivially convertible to JSON Pointer queries: key + * Get the value associated with the given JSONPath expression. We only support + * JSONPath queries that trivially convertible to JSON Pointer queries: key * names and array indices. * - * @return The value associated with the given JSON path, or: - * - INVALID_JSON_POINTER if the json path to json pointer conversion fails + * https://datatracker.ietf.org/doc/html/draft-normington-jsonpath-00 + * + * @return The value associated with the given JSONPath expression, or: + * - INVALID_JSON_POINTER if the JSONPath to JSON Pointer conversion fails * - NO_SUCH_FIELD if a field does not exist in an object * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length * - INCORRECT_TYPE if a non-integer is used to access an array diff --git a/include/simdjson/generic/ondemand/document.h b/include/simdjson/generic/ondemand/document.h index 02244dccb6..29741cb0ba 100644 --- a/include/simdjson/generic/ondemand/document.h +++ b/include/simdjson/generic/ondemand/document.h @@ -613,10 +613,12 @@ class document { simdjson_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; /** - * Get the value associated with the given JSON path. We only support - * JSON Path queries that trivially convertible to JSON Pointer queries: key + * Get the value associated with the given JSONPath expression. We only support + * JSONPath queries that trivially convertible to JSON Pointer queries: key * names and array indices. * + * https://datatracker.ietf.org/doc/html/draft-normington-jsonpath-00 + * * Key values are matched exactly, without unescaping or Unicode normalization. * We do a byte-by-byte comparison. E.g. * @@ -625,8 +627,8 @@ class document { * doc.at_path(".\\u00E9") == 123 * doc.at_path(".\u00E9") returns an error (NO_SUCH_FIELD) * - * @return The value associated with the given JSON path, or: - * - INVALID_JSON_POINTER if the json path to json pointer conversion fails + * @return The value associated with the given JSONPath expression, or: + * - INVALID_JSON_POINTER if the JSONPath to JSON Pointer conversion fails * - NO_SUCH_FIELD if a field does not exist in an object * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length * - INCORRECT_TYPE if a non-integer is used to access an array diff --git a/include/simdjson/generic/ondemand/object.h b/include/simdjson/generic/ondemand/object.h index c6d1dda1d5..b21c10339c 100644 --- a/include/simdjson/generic/ondemand/object.h +++ b/include/simdjson/generic/ondemand/object.h @@ -141,12 +141,12 @@ class object { inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; /** - * Get the value associated with the given JSON path. We only support - * JSON Path queries that trivially convertible to JSON Pointer queries: key + * Get the value associated with the given JSONPath expression. We only support + * JSONPath queries that trivially convertible to JSON Pointer queries: key * names and array indices. * - * @return The value associated with the given JSON path, or: - * - INVALID_JSON_POINTER if the json path to json pointer conversion fails + * @return The value associated with the given JSONPath expression, or: + * - INVALID_JSON_POINTER if the JSONPath to JSON Pointer conversion fails * - NO_SUCH_FIELD if a field does not exist in an object * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length * - INCORRECT_TYPE if a non-integer is used to access an array diff --git a/include/simdjson/generic/ondemand/value.h b/include/simdjson/generic/ondemand/value.h index 4531c62365..4e4f035312 100644 --- a/include/simdjson/generic/ondemand/value.h +++ b/include/simdjson/generic/ondemand/value.h @@ -584,12 +584,12 @@ class value { simdjson_inline simdjson_result at_pointer(std::string_view json_pointer) noexcept; /** - * Get the value associated with the given JSON path. We only support - * JSON Path queries that trivially convertible to JSON Pointer queries: key + * Get the value associated with the given JSONPath expression. We only support + * JSONPath queries that trivially convertible to JSON Pointer queries: key * names and array indices. * - * @return The value associated with the given JSON path, or: - * - INVALID_JSON_POINTER if the json path to json pointer conversion fails + * @return The value associated with the given JSONPath expression, or: + * - INVALID_JSON_POINTER if the JSONPath to JSON Pointer conversion fails * - NO_SUCH_FIELD if a field does not exist in an object * - INDEX_OUT_OF_BOUNDS if an array index is larger than an array length * - INCORRECT_TYPE if a non-integer is used to access an array diff --git a/tests/ondemand/ondemand_readme_examples.cpp b/tests/ondemand/ondemand_readme_examples.cpp index 056b7c91fb..2b51b6f028 100644 --- a/tests/ondemand/ondemand_readme_examples.cpp +++ b/tests/ondemand/ondemand_readme_examples.cpp @@ -1121,7 +1121,7 @@ bool invalid_json_path() { ondemand::document cars; double x; ASSERT_SUCCESS(parser.iterate(cars_json).get(cars)); - ASSERT_ERROR(cars.at_path("[0].tire_presure[1").get(x), INVALID_JSON_POINTER); // Fails on conversion to json pointer + ASSERT_ERROR(cars.at_path("[0].tire_presure[1").get(x), INVALID_JSON_POINTER); // Fails on conversion to JSON Pointer ASSERT_ERROR(cars.at_path("[0].incorrect_field[1]").get(x), NO_SUCH_FIELD); // Fails on at_pointer() TEST_SUCCEED(); } From cab7658f8474e6b20223748e630656c31cb51465 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Thu, 15 Feb 2024 15:56:18 -0500 Subject: [PATCH 3/6] specifying UTF-8 encoding --- doc/basics.md | 4 ++-- include/simdjson/generic/ondemand/document.h | 4 ++-- tests/ondemand/ondemand_readme_examples.cpp | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/doc/basics.md b/doc/basics.md index b89e307016..a000a01088 100644 --- a/doc/basics.md +++ b/doc/basics.md @@ -1113,7 +1113,7 @@ considered distinct from its escaped version `\u00E9`. E.g., const padded_string json = "{\"\\u00E9\":123}"_padded; auto doc = parser.iterate(json); doc.at_pointer("/\\u00E9") == 123; // true -doc.at_pointer("/\u00E9") // returns an error (NO_SUCH_FIELD) +doc.at_pointer(u8"/\u00E9") // returns an error (NO_SUCH_FIELD) ``` Note that `at_pointer` calls [`rewind`](#rewind) to reset the parser at the beginning of the document. Hence, it invalidates all previously parsed values, objects and arrays: make sure to consume the values between each call to `at_pointer`. Consider the following example where one wants to store each object from the JSON into a vector of `struct car_type`: @@ -1221,7 +1221,7 @@ The e acute character is considered distinct from its escaped version `\u00E9`. const padded_string json = "{\"\\u00E9\":123}"_padded; auto doc = parser.iterate(json); doc.at_path(".\\u00E9") == 123; // true -doc.at_path(".\u00E9") // returns an error (NO_SUCH_FIELD) +doc.at_path(u8".\u00E9") // returns an error (NO_SUCH_FIELD) ``` Error Handling diff --git a/include/simdjson/generic/ondemand/document.h b/include/simdjson/generic/ondemand/document.h index 29741cb0ba..2d78a6eea3 100644 --- a/include/simdjson/generic/ondemand/document.h +++ b/include/simdjson/generic/ondemand/document.h @@ -593,7 +593,7 @@ class document { * const padded_string json = "{\"\\u00E9\":123}"_padded; * auto doc = parser.iterate(json); * doc.at_pointer("/\\u00E9") == 123 - * doc.at_pointer("/\u00E9") returns an error (NO_SUCH_FIELD) + * doc.at_pointer(u8"/\u00E9") returns an error (NO_SUCH_FIELD) * * Note that at_pointer() automatically calls rewind between each call. Thus * all values, objects and arrays that you have created so far (including unescaped strings) @@ -625,7 +625,7 @@ class document { * const padded_string json = "{\"\\u00E9\":123}"_padded; * auto doc = parser.iterate(json); * doc.at_path(".\\u00E9") == 123 - * doc.at_path(".\u00E9") returns an error (NO_SUCH_FIELD) + * doc.at_path(u8".\u00E9") returns an error (NO_SUCH_FIELD) * * @return The value associated with the given JSONPath expression, or: * - INVALID_JSON_POINTER if the JSONPath to JSON Pointer conversion fails diff --git a/tests/ondemand/ondemand_readme_examples.cpp b/tests/ondemand/ondemand_readme_examples.cpp index 2b51b6f028..2f899bdee4 100644 --- a/tests/ondemand/ondemand_readme_examples.cpp +++ b/tests/ondemand/ondemand_readme_examples.cpp @@ -1070,18 +1070,18 @@ bool json_pointer_simple() { bool json_pointer_unicode() { TEST_START(); - const padded_string json = "{\"\u00E9\":123}"_padded; + const padded_string json = u8"{\"\u00E9\":123}"_padded; ondemand::parser parser; ondemand::document doc; int64_t x; ASSERT_SUCCESS(parser.iterate(json).get(doc)); - ASSERT_SUCCESS(doc.at_pointer("/\u00E9").get(x)); + ASSERT_SUCCESS(doc.at_pointer(u8"/\u00E9").get(x)); ASSERT_EQUAL(x,123); const padded_string json2 = "{\"\\u00E9\":123}"_padded; ASSERT_SUCCESS(parser.iterate(json2).get(doc)); ASSERT_SUCCESS(doc.at_pointer("/\\u00E9").get(x)); - ASSERT_ERROR(doc.at_pointer("/\u00E9"), NO_SUCH_FIELD); + ASSERT_ERROR(doc.at_pointer(u8"/\u00E9"), NO_SUCH_FIELD); ASSERT_EQUAL(x,123); TEST_SUCCEED(); } @@ -1101,16 +1101,16 @@ bool json_path_unicode() { TEST_START(); ondemand::parser parser; ondemand::document doc; - const padded_string json = "{\"\u00E9\":123}"_padded; + const padded_string json = u8"{\"\u00E9\":123}"_padded; int64_t x; ASSERT_SUCCESS(parser.iterate(json).get(doc)); - ASSERT_SUCCESS(doc.at_path(".\u00E9").get(x)); + ASSERT_SUCCESS(doc.at_path(u8".\u00E9").get(x)); ASSERT_EQUAL(x,123); const padded_string json2 = "{\"\\u00E9\":123}"_padded; ASSERT_SUCCESS(parser.iterate(json2).get(doc)); ASSERT_SUCCESS(doc.at_path(".\\u00E9").get(x)); - ASSERT_ERROR(doc.at_path(".\u00E9"), NO_SUCH_FIELD); + ASSERT_ERROR(doc.at_path(u8".\u00E9"), NO_SUCH_FIELD); ASSERT_EQUAL(x,123); TEST_SUCCEED(); } From 8c6bbb0c393586bf5110bfeaea12467bf385d2da Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Thu, 15 Feb 2024 16:18:28 -0500 Subject: [PATCH 4/6] allow char8_t when compiling as C++20 --- include/simdjson/padded_string-inl.h | 14 +++++++++++++- include/simdjson/padded_string.h | 6 ++++++ include/simdjson/padded_string_view-inl.h | 7 ++++++- include/simdjson/padded_string_view.h | 4 +++- 4 files changed, 28 insertions(+), 3 deletions(-) diff --git a/include/simdjson/padded_string-inl.h b/include/simdjson/padded_string-inl.h index ebb056bde1..e31fe77cf4 100644 --- a/include/simdjson/padded_string-inl.h +++ b/include/simdjson/padded_string-inl.h @@ -54,6 +54,14 @@ inline padded_string::padded_string(const char *data, size_t length) noexcept std::memcpy(data_ptr, data, length); } } +#ifdef __cpp_char8_t +inline padded_string::padded_string(const char8_t *data, size_t length) noexcept + : viable_size(length), data_ptr(internal::allocate_padded_buffer(length)) { + if ((data != nullptr) && (data_ptr != nullptr)) { + std::memcpy(data_ptr, reinterpret_cast(data), length); + } +} +#endif // note: do not pass std::string arguments by value inline padded_string::padded_string(const std::string & str_ ) noexcept : viable_size(str_.size()), data_ptr(internal::allocate_padded_buffer(str_.size())) { @@ -173,5 +181,9 @@ inline simdjson_result padded_string::load(std::string_view filen inline simdjson::padded_string operator "" _padded(const char *str, size_t len) { return simdjson::padded_string(str, len); } - +#ifdef __cpp_char8_t +inline simdjson::padded_string operator "" _padded(const char8_t *str, size_t len) { + return simdjson::padded_string(reinterpret_cast(str), len); +} +#endif #endif // SIMDJSON_PADDED_STRING_INL_H diff --git a/include/simdjson/padded_string.h b/include/simdjson/padded_string.h index ed157367d9..ddc3cbba87 100644 --- a/include/simdjson/padded_string.h +++ b/include/simdjson/padded_string.h @@ -39,6 +39,9 @@ struct padded_string final { * @param length the number of bytes to copy */ explicit inline padded_string(const char *data, size_t length) noexcept; +#ifdef __cpp_char8_t + explicit inline padded_string(const char8_t *data, size_t length) noexcept; +#endif /** * Create a new padded string by copying the given input. * @@ -150,6 +153,9 @@ inline std::ostream& operator<<(std::ostream& out, simdjson_result(s), len, capacity) { } - +#ifdef __cpp_char8_t +inline padded_string_view::padded_string_view(const char8_t* s, size_t len, size_t capacity) noexcept + : padded_string_view(reinterpret_cast(s), len, capacity) +{ +} +#endif inline padded_string_view::padded_string_view(const std::string &s) noexcept : std::string_view(s), _capacity(s.capacity()) { diff --git a/include/simdjson/padded_string_view.h b/include/simdjson/padded_string_view.h index 6ca7ff3e0d..f8dc59a79c 100644 --- a/include/simdjson/padded_string_view.h +++ b/include/simdjson/padded_string_view.h @@ -34,7 +34,9 @@ class padded_string_view : public std::string_view { explicit inline padded_string_view(const char* s, size_t len, size_t capacity) noexcept; /** overload explicit inline padded_string_view(const char* s, size_t len) noexcept */ explicit inline padded_string_view(const uint8_t* s, size_t len, size_t capacity) noexcept; - +#ifdef __cpp_char8_t + explicit inline padded_string_view(const char8_t* s, size_t len, size_t capacity) noexcept; +#endif /** * Promise the given string has at least SIMDJSON_PADDING extra bytes allocated to it. * From e4a285f43caf228d6aa74f907848ea8db2b1f69f Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Thu, 15 Feb 2024 16:42:29 -0500 Subject: [PATCH 5/6] casting --- doc/basics.md | 4 ++-- include/simdjson/generic/ondemand/document.h | 4 ++-- tests/ondemand/ondemand_readme_examples.cpp | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/basics.md b/doc/basics.md index a000a01088..73a4249fee 100644 --- a/doc/basics.md +++ b/doc/basics.md @@ -1113,7 +1113,7 @@ considered distinct from its escaped version `\u00E9`. E.g., const padded_string json = "{\"\\u00E9\":123}"_padded; auto doc = parser.iterate(json); doc.at_pointer("/\\u00E9") == 123; // true -doc.at_pointer(u8"/\u00E9") // returns an error (NO_SUCH_FIELD) +doc.at_pointer((const char*)u8"/\u00E9") // returns an error (NO_SUCH_FIELD) ``` Note that `at_pointer` calls [`rewind`](#rewind) to reset the parser at the beginning of the document. Hence, it invalidates all previously parsed values, objects and arrays: make sure to consume the values between each call to `at_pointer`. Consider the following example where one wants to store each object from the JSON into a vector of `struct car_type`: @@ -1221,7 +1221,7 @@ The e acute character is considered distinct from its escaped version `\u00E9`. const padded_string json = "{\"\\u00E9\":123}"_padded; auto doc = parser.iterate(json); doc.at_path(".\\u00E9") == 123; // true -doc.at_path(u8".\u00E9") // returns an error (NO_SUCH_FIELD) +doc.at_path((const char*)u8".\u00E9") // returns an error (NO_SUCH_FIELD) ``` Error Handling diff --git a/include/simdjson/generic/ondemand/document.h b/include/simdjson/generic/ondemand/document.h index 2d78a6eea3..060f2ec5fe 100644 --- a/include/simdjson/generic/ondemand/document.h +++ b/include/simdjson/generic/ondemand/document.h @@ -593,7 +593,7 @@ class document { * const padded_string json = "{\"\\u00E9\":123}"_padded; * auto doc = parser.iterate(json); * doc.at_pointer("/\\u00E9") == 123 - * doc.at_pointer(u8"/\u00E9") returns an error (NO_SUCH_FIELD) + * doc.at_pointer((const char*)u8"/\u00E9") returns an error (NO_SUCH_FIELD) * * Note that at_pointer() automatically calls rewind between each call. Thus * all values, objects and arrays that you have created so far (including unescaped strings) @@ -625,7 +625,7 @@ class document { * const padded_string json = "{\"\\u00E9\":123}"_padded; * auto doc = parser.iterate(json); * doc.at_path(".\\u00E9") == 123 - * doc.at_path(u8".\u00E9") returns an error (NO_SUCH_FIELD) + * doc.at_path((const char*)u8".\u00E9") returns an error (NO_SUCH_FIELD) * * @return The value associated with the given JSONPath expression, or: * - INVALID_JSON_POINTER if the JSONPath to JSON Pointer conversion fails diff --git a/tests/ondemand/ondemand_readme_examples.cpp b/tests/ondemand/ondemand_readme_examples.cpp index 2f899bdee4..dcdf23c661 100644 --- a/tests/ondemand/ondemand_readme_examples.cpp +++ b/tests/ondemand/ondemand_readme_examples.cpp @@ -1075,13 +1075,13 @@ bool json_pointer_unicode() { ondemand::document doc; int64_t x; ASSERT_SUCCESS(parser.iterate(json).get(doc)); - ASSERT_SUCCESS(doc.at_pointer(u8"/\u00E9").get(x)); + ASSERT_SUCCESS(doc.at_pointer((const char*)u8"/\u00E9").get(x)); ASSERT_EQUAL(x,123); const padded_string json2 = "{\"\\u00E9\":123}"_padded; ASSERT_SUCCESS(parser.iterate(json2).get(doc)); ASSERT_SUCCESS(doc.at_pointer("/\\u00E9").get(x)); - ASSERT_ERROR(doc.at_pointer(u8"/\u00E9"), NO_SUCH_FIELD); + ASSERT_ERROR(doc.at_pointer((const char*)u8"/\u00E9"), NO_SUCH_FIELD); ASSERT_EQUAL(x,123); TEST_SUCCEED(); } @@ -1104,13 +1104,13 @@ bool json_path_unicode() { const padded_string json = u8"{\"\u00E9\":123}"_padded; int64_t x; ASSERT_SUCCESS(parser.iterate(json).get(doc)); - ASSERT_SUCCESS(doc.at_path(u8".\u00E9").get(x)); + ASSERT_SUCCESS(doc.at_path((const char*)u8".\u00E9").get(x)); ASSERT_EQUAL(x,123); const padded_string json2 = "{\"\\u00E9\":123}"_padded; ASSERT_SUCCESS(parser.iterate(json2).get(doc)); ASSERT_SUCCESS(doc.at_path(".\\u00E9").get(x)); - ASSERT_ERROR(doc.at_path(u8".\u00E9"), NO_SUCH_FIELD); + ASSERT_ERROR(doc.at_path((const char*)u8".\u00E9"), NO_SUCH_FIELD); ASSERT_EQUAL(x,123); TEST_SUCCEED(); } From bc3a9478352d842bbeceb153558432992b327a85 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Thu, 15 Feb 2024 17:45:45 -0500 Subject: [PATCH 6/6] minor doc corrections --- README.md | 2 +- doc/dom.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b0c1d20a6e..8d3d1da53c 100644 --- a/README.md +++ b/README.md @@ -168,7 +168,7 @@ We distinguish between "bindings" (which just wrap the C++ code) and a port to a - [simdjzon](https://github.com/travisstaloch/simdjzon): zig port. - [JSON-Simd](https://github.com/rawleyfowler/JSON-simd): Raku bindings. - [JSON::SIMD](https://metacpan.org/pod/JSON::SIMD): Perl bindings; fully-featured JSON module that uses simdjson for decoding. -- [gemmaJSON](https://github.com/sainttttt/gemmaJSON): Nim json parser based on simdjson bindings. +- [gemmaJSON](https://github.com/sainttttt/gemmaJSON): Nim JSON parser based on simdjson bindings. About simdjson -------------- diff --git a/doc/dom.md b/doc/dom.md index feec474016..b7b133b2a0 100644 --- a/doc/dom.md +++ b/doc/dom.md @@ -223,13 +223,13 @@ dom::element cars = parser.parse(cars_json); cout << cars.at_pointer("/0/tire_pressure/1") << endl; // Prints 39.9 ``` -A JSON Path is a sequence of segments each starting with the '/' character. Within arrays, an integer +A JSON Pointer expression is a sequence of segments each starting with the '/' character. Within arrays, an integer index allows you to select the indexed node. Within objects, the string value of the key allows you to select the value. If your keys contain the characters '/' or '~', they must be escaped as '~1' and -'~0' respectively. An empty JSON Path refers to the whole document. +'~0' respectively. An empty JSON Pointer expression refers to the whole document. We also extend the JSON Pointer support to include *relative* paths. -You can apply a JSON path to any node and the path gets interpreted relatively, as if the current node were a whole JSON document. +You can apply a JSON Pointer expression to any node and the path gets interpreted relatively, as if the current node were a whole JSON document. Consider the following example: