Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

skip UTF-8 BOM if present #2079

Merged
merged 1 commit into from Oct 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 1 addition & 2 deletions include/simdjson/dom/document_stream.h
Expand Up @@ -224,8 +224,7 @@ class document_stream {
* Parse the next document found in the buffer previously given to document_stream.
*
* The content should be a valid JSON document encoded as UTF-8. If there is a
* UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are
* discouraged.
* UTF-8 BOM, the parser skips it.
*
* You do NOT need to pre-allocate a parser. This function takes care of
* pre-allocating a capacity defined by the batch_size defined when creating the
Expand Down
13 changes: 12 additions & 1 deletion include/simdjson/dom/parser-inl.h
Expand Up @@ -12,6 +12,7 @@
#include "simdjson/dom/element-inl.h"

#include <climits>
#include <cstring> /* memcmp */

namespace simdjson {
namespace dom {
Expand Down Expand Up @@ -120,8 +121,14 @@ inline simdjson_result<element> parser::parse_into_document(document& provided_d
_loaded_bytes_capacity = len;
}
std::memcpy(static_cast<void *>(loaded_bytes.get()), buf, len);
buf = reinterpret_cast<const uint8_t*>(loaded_bytes.get());
}
_error = implementation->parse(realloc_if_needed ? reinterpret_cast<const uint8_t*>(loaded_bytes.get()): buf, len, provided_doc);

if((len >= 3) && (std::memcmp(buf, "\xEF\xBB\xBF", 3) == 0)) {
buf += 3;
len -= 3;
}
_error = implementation->parse(buf, len, provided_doc);

if (_error) { return _error; }

Expand Down Expand Up @@ -158,6 +165,10 @@ simdjson_inline simdjson_result<element> parser::parse(const padded_string_view

/**
 * Build a document_stream over the bytes [buf, buf + len).
 *
 * The batch size is raised to MINIMAL_BATCH_SIZE when the caller asks for less,
 * and a leading UTF-8 byte order mark (EF BB BF) is stepped over so that the
 * stream starts at the first byte following it.
 */
inline simdjson_result<document_stream> parser::parse_many(const uint8_t *buf, size_t len, size_t batch_size) noexcept {
  // Clamp the batch size from below.
  if(batch_size < MINIMAL_BATCH_SIZE) {
    batch_size = MINIMAL_BATCH_SIZE;
  }
  // Only inspect the first three bytes when there are at least three of them.
  const bool starts_with_bom = (len >= 3) && (std::memcmp(buf, "\xEF\xBB\xBF", 3) == 0);
  if(starts_with_bom) {
    buf += 3;
    len -= 3;
  }
  return document_stream(*this, buf, len, batch_size);
}
inline simdjson_result<document_stream> parser::parse_many(const char *buf, size_t len, size_t batch_size) noexcept {
Expand Down
4 changes: 4 additions & 0 deletions include/simdjson/dom/parser.h
Expand Up @@ -254,6 +254,8 @@ class parser {
* And, possibly, no document may have been parsed when the `parser.load_many(path)` function
* returned.
*
* If there is a UTF-8 BOM, the parser skips it.
*
* ### Format
*
* The file must contain a series of one or more JSON documents, concatenated into a single
Expand Down Expand Up @@ -346,6 +348,8 @@ class parser {
* cout << std::string(doc["title"]) << endl;
* }
*
* If there is a UTF-8 BOM, the parser skips it.
*
* ### Format
*
* The buffer must contain a series of one or more JSON documents, concatenated into a single
Expand Down
3 changes: 1 addition & 2 deletions include/simdjson/generic/ondemand/document_stream.h
Expand Up @@ -241,8 +241,7 @@ class document_stream {
* Parse the next document found in the buffer previously given to document_stream.
*
* The content should be a valid JSON document encoded as UTF-8. If there is a
* UTF-8 BOM, the caller is responsible for omitting it, UTF-8 BOM are
* discouraged.
* UTF-8 BOM, the parser skips it.
*
* You do NOT need to pre-allocate a parser. This function takes care of
* pre-allocating a capacity defined by the batch_size defined when creating the
Expand Down
8 changes: 8 additions & 0 deletions include/simdjson/generic/ondemand/parser-inl.h
Expand Up @@ -46,6 +46,8 @@ simdjson_warn_unused simdjson_inline error_code parser::allocate(size_t new_capa
simdjson_warn_unused simdjson_inline simdjson_result<document> parser::iterate(padded_string_view json) & noexcept {
if (json.padding() < SIMDJSON_PADDING) { return INSUFFICIENT_PADDING; }

json.remove_utf8_bom();

// Allocate if needed
if (capacity() < json.length() || !string_buf) {
SIMDJSON_TRY( allocate(json.length(), max_depth()) );
Expand Down Expand Up @@ -96,6 +98,8 @@ simdjson_warn_unused simdjson_inline simdjson_result<document> parser::iterate(c
simdjson_warn_unused simdjson_inline simdjson_result<json_iterator> parser::iterate_raw(padded_string_view json) & noexcept {
if (json.padding() < SIMDJSON_PADDING) { return INSUFFICIENT_PADDING; }

json.remove_utf8_bom();

// Allocate if needed
if (capacity() < json.length()) {
SIMDJSON_TRY( allocate(json.length(), max_depth()) );
Expand All @@ -108,6 +112,10 @@ simdjson_warn_unused simdjson_inline simdjson_result<json_iterator> parser::iter

/**
 * Build an On Demand document_stream over the bytes [buf, buf + len).
 *
 * The batch size is raised to MINIMAL_BATCH_SIZE when the caller asks for less.
 * A leading UTF-8 byte order mark (EF BB BF) is stepped over before the stream
 * is created. When allow_comma_separated is set, the batch size is additionally
 * raised to the (BOM-adjusted) input length if it was smaller.
 */
inline simdjson_result<document_stream> parser::iterate_many(const uint8_t *buf, size_t len, size_t batch_size, bool allow_comma_separated) noexcept {
  // Clamp the batch size from below.
  if(batch_size < MINIMAL_BATCH_SIZE) {
    batch_size = MINIMAL_BATCH_SIZE;
  }
  // Only inspect the first three bytes when there are at least three of them.
  const bool starts_with_bom = (len >= 3) && (std::memcmp(buf, "\xEF\xBB\xBF", 3) == 0);
  if(starts_with_bom) {
    buf += 3;
    len -= 3;
  }
  // This comparison deliberately uses the length remaining after the BOM was skipped.
  if(allow_comma_separated && batch_size < len) {
    batch_size = len;
  }
  return document_stream(*this, buf, len, batch_size, allow_comma_separated);
}
Expand Down
3 changes: 2 additions & 1 deletion include/simdjson/generic/ondemand/parser.h
Expand Up @@ -59,7 +59,7 @@ class parser {
* It is expected that the content is a valid UTF-8 file, containing a valid JSON document.
* Otherwise the iterate method may return an error. In particular, the whole input should be
* valid: we do not attempt to tolerate incorrect content either before or after a JSON
* document.
* document. If there is a UTF-8 BOM, the parser skips it.
*
* ### IMPORTANT: Validate what you use
*
Expand Down Expand Up @@ -188,6 +188,7 @@ class parser {
* arrays or objects) MUST be separated with ASCII whitespace.
*
* The characters inside a JSON document, and between JSON documents, must be valid Unicode (UTF-8).
* If there is a UTF-8 BOM, the parser skips it.
*
* The documents must not exceed batch_size bytes (by default 1MB) or they will fail to parse.
* Setting batch_size to excessively large or excessively small values may negatively impact the
Expand Down
13 changes: 12 additions & 1 deletion include/simdjson/padded_string_view-inl.h
Expand Up @@ -2,9 +2,10 @@
#define SIMDJSON_PADDED_STRING_VIEW_INL_H

#include "simdjson/padded_string_view.h"

#include "simdjson/error-inl.h"

#include <cstring> /* memcmp */

namespace simdjson {

inline padded_string_view::padded_string_view(const char* s, size_t len, size_t capacity) noexcept
Expand All @@ -31,6 +32,16 @@ inline size_t padded_string_view::capacity() const noexcept { return _capacity;

/** Number of bytes available past the end of the string: capacity() minus length(). */
inline size_t padded_string_view::padding() const noexcept { return capacity() - length(); }

/**
 * Strip a leading UTF-8 byte order mark (EF BB BF) from this view, if present.
 *
 * @return true when a BOM was found and removed, false otherwise (including
 *         when the view is shorter than three bytes).
 */
inline bool padded_string_view::remove_utf8_bom() noexcept {
  // The length check comes first so memcmp never reads a view shorter than the BOM.
  const bool has_bom = (length() >= 3) && (std::memcmp(data(), "\xEF\xBB\xBF", 3) == 0);
  if (has_bom) {
    remove_prefix(3);
    // Shrink the capacity by the same amount so capacity() - length() is unchanged.
    _capacity -= 3;
  }
  return has_bom;
}

#if SIMDJSON_EXCEPTIONS
/**
 * Stream the padded_string_view held by a simdjson_result to `out`.
 *
 * Declared noexcept(false); the held value is obtained through s.value().
 * NOTE(review): presumably value() throws when the result carries an error - confirm.
 */
inline std::ostream& operator<<(std::ostream& out, simdjson_result<padded_string_view> &s) noexcept(false) { return out << s.value(); }
#endif
Expand Down
7 changes: 7 additions & 0 deletions include/simdjson/padded_string_view.h
Expand Up @@ -54,6 +54,13 @@ class padded_string_view : public std::string_view {
/** The number of allocated bytes. */
inline size_t capacity() const noexcept;

/**
* Remove the UTF-8 Byte Order Mark (BOM) if it exists.
*
* @return whether a BOM was found and removed
*/
inline bool remove_utf8_bom() noexcept;

/** The amount of padding on the string (capacity() - length()) */
inline size_t padding() const noexcept;

Expand Down
30 changes: 29 additions & 1 deletion tests/dom/basictests.cpp
Expand Up @@ -66,6 +66,16 @@ namespace number_tests {
return true;
}

// Checks that the DOM parser can parse a document that is preceded by a UTF-8
// byte order mark and still reads the "score" field as the expected double.
bool bomskip() {
TEST_START();
simdjson::dom::parser parser;
// The input starts with the three BOM bytes EF BB BF, followed by the JSON object.
simdjson::padded_string docdata = "\xEF\xBB\xBF{\"score\":0.8825149536132812}"_padded;
double score;
// Parse, look up "score", and extract it as a double into `score`.
ASSERT_SUCCESS(parser.parse(docdata)["score"].get_double().get(score));
ASSERT_EQUAL(score, 0.8825149536132812);
TEST_SUCCEED();
}

bool issue2017() {
TEST_START();
simdjson::dom::parser parser;
Expand Down Expand Up @@ -386,7 +396,8 @@ namespace number_tests {
}

bool run() {
return issue2017() &&
return bomskip() &&
issue2017() &&
truncated_borderline() &&
specific_tests() &&
ground_truth() &&
Expand Down Expand Up @@ -1658,6 +1669,7 @@ namespace validate_tests {
}
return true;
}

bool test_validate() {
std::cout << "Running " << __func__ << std::endl;
const std::string test = R"({ "foo" : 1, "bar" : [ 1, 2, 3 ], "baz": { "a": 1, "b": 2, "c": 3 } })";
Expand All @@ -1666,6 +1678,7 @@ namespace validate_tests {
}
return true;
}

bool test_range() {
std::cout << "Running " << __func__ << std::endl;
for(size_t len = 0; len <= 128; len++) {
Expand All @@ -1683,6 +1696,7 @@ namespace validate_tests {
}
return true;
}

bool test_issue1169() {
std::cout << "Running " << __func__ << std::endl;
std::vector<uint8_t> source(64,' ');
Expand All @@ -1693,6 +1707,7 @@ namespace validate_tests {
}
return true;
}

bool test_issue1169_long() {
std::cout << "Running " << __func__ << std::endl;
for(size_t len = 1; len <= 128; len++) {
Expand All @@ -1702,6 +1717,7 @@ namespace validate_tests {
}
return true;
}

bool test_random() {
std::cout << "Running " << __func__ << std::endl;
std::vector<uint8_t> source(64,' ');
Expand Down Expand Up @@ -1763,6 +1779,7 @@ namespace minify_tests {
}
return true;
}

// this is meant to test buffer overflows.
bool test_various_lengths2() {
std::cout << "Running " << __func__ << std::endl;
Expand All @@ -1781,6 +1798,7 @@ namespace minify_tests {
}
return true;
}

bool test_single_quote() {
std::cout << "Running " << __func__ << std::endl;
const std::string test = "\"";
Expand All @@ -1801,12 +1819,14 @@ namespace minify_tests {
const std::string minified(R"({"foo":1,"bar":[1,2,0.11111111111111113],"baz":{"a":3.1415926535897936,"b":2,"c":3.141592653589794}})");
return check_minification(test.c_str(), test.size(), minified.c_str(), minified.size());
}

/** Checks that a small array with interior spaces is minified to "[1,2,3]". */
bool test_minify_array() {
  std::cout << "Running " << __func__ << std::endl;
  const std::string input("[ 1, 2, 3]");
  const std::string expected_output("[1,2,3]");
  return check_minification(input.c_str(), input.size(),
                            expected_output.c_str(), expected_output.size());
}

bool test_minify_object() {
std::cout << "Running " << __func__ << std::endl;
std::string test(R"({ "foo " : 1, "b ar" : [ 1, 2, 3 ], "baz": { "a": 1, "b": 2, "c": 3 } })");
Expand Down Expand Up @@ -1849,6 +1869,7 @@ namespace format_tests {
s << doc;
return assert_minified(s);
}

bool print_minify_parser_parse() {
std::cout << "Running " << __func__ << std::endl;
dom::parser parser;
Expand All @@ -1868,6 +1889,7 @@ namespace format_tests {
s << value;
return assert_minified(s, "1");
}

bool print_minify_element() {
std::cout << "Running " << __func__ << std::endl;
dom::parser parser;
Expand All @@ -1887,6 +1909,7 @@ namespace format_tests {
s << array;
return assert_minified(s, "[1,2,0.11111111111111113]");
}

bool print_minify_array() {
std::cout << "Running " << __func__ << std::endl;
dom::parser parser;
Expand All @@ -1906,6 +1929,7 @@ namespace format_tests {
s << object;
return assert_minified(s, R"({"a":3.1415926535897936,"b":2,"c":3.141592653589794})");
}

bool print_minify_object() {
std::cout << "Running " << __func__ << std::endl;
dom::parser parser;
Expand All @@ -1925,6 +1949,7 @@ namespace format_tests {
s << parser.parse(DOCUMENT);
return assert_minified(s);
}

bool print_minify_parser_parse_exception() {
std::cout << "Running " << __func__ << std::endl;
dom::parser parser;
Expand All @@ -1940,6 +1965,7 @@ namespace format_tests {
s << parser.parse(DOCUMENT)["foo"];
return assert_minified(s, "1");
}

bool print_minify_element_result_exception() {
std::cout << "Running " << __func__ << std::endl;
dom::parser parser;
Expand All @@ -1956,6 +1982,7 @@ namespace format_tests {
s << value;
return assert_minified(s, "1");
}

bool print_minify_element_exception() {
std::cout << "Running " << __func__ << std::endl;
dom::parser parser;
Expand All @@ -1972,6 +1999,7 @@ namespace format_tests {
s << parser.parse(DOCUMENT)["bar"].get_array();
return assert_minified(s, "[1,2,0.11111111111111113]");
}

bool print_minify_array_result_exception() {
std::cout << "Running " << __func__ << std::endl;
dom::parser parser;
Expand Down
30 changes: 29 additions & 1 deletion tests/dom/document_stream_tests.cpp
Expand Up @@ -397,6 +397,33 @@ namespace document_stream_tests {
std::cout << "number of documents " << count << std::endl;
return count == 1;
}

/**
 * Checks that parse_many() accepts a buffer that begins with a UTF-8 byte order
 * mark: the stream must yield exactly one document, and that document must
 * minify to {"hello":"world"}.
 */
bool skipbom() {
  std::cout << "Running " << __func__ << std::endl;
  simdjson::dom::parser parser;
  // EF BB BF (the UTF-8 BOM) precedes the single JSON document.
  auto json = "\xEF\xBB\xBF{\"hello\": \"world\"}"_padded;
  simdjson::dom::document_stream stream;
  ASSERT_SUCCESS(parser.parse_many(json).get(stream));

  const std::string expected = R"({"hello":"world"})";
  size_t documents_seen = 0;
  for (auto doc : stream) {
    if(doc.error()) {
      std::cerr << "Unexpected error: " << doc.error() << std::endl;
      return false;
    }
    simdjson::dom::element this_document;
    ASSERT_SUCCESS(doc.get(this_document));

    const std::string minified = simdjson::minify(this_document);
    if(minified != expected) {
      // Print the offending document to help diagnose the mismatch.
      std::cout << this_document << std::endl;
      return false;
    }
    ++documents_seen;
  }
  std::cout << "number of documents " << documents_seen << std::endl;
  return documents_seen == 1;
}
#if SIMDJSON_EXCEPTIONS
bool single_document_exceptions() {
std::cout << "Running " << __func__ << std::endl;
Expand Down Expand Up @@ -913,7 +940,8 @@ namespace document_stream_tests {
}

bool run() {
return fuzzaccess() &&
return skipbom() &&
fuzzaccess() &&
baby_fuzzer() &&
issue1649() &&
adversarial_single_document_array() &&
Expand Down