Skip to content

Commit

Permalink
Merge pull request #51 from inrustwetrust/sse-null-byte
Browse files Browse the repository at this point in the history
Fix issues encoding/decoding strings with null bytes
  • Loading branch information
punchfox committed May 2, 2018
2 parents 320dc08 + 93e9a08 commit abb2350
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 9 deletions.
4 changes: 2 additions & 2 deletions src/detail/escape_sse42.cpp
Expand Up @@ -53,7 +53,7 @@ void write_escaped_sse42(
auto out = buf;

const __m128i ranges = _mm_setr_epi8(
0x01, 0x1F, // control characters
0x00, 0x1F, // null byte & control characters
0x22, 0x22, // double quotation mark
0x5C, 0x5C, // reverse solidus (backslash)
0, 0, 0, 0, 0, 0, 0, 0, 0, 0
Expand All @@ -66,7 +66,7 @@ void write_escaped_sse42(

for (; end - begin >= 16; begin += 16) {
const __m128i chunk = _mm_load_si128(reinterpret_cast<const __m128i *>(begin));
const unsigned has_character_in_ranges = _mm_cmpistrc(ranges, chunk, _SIDD_CMP_RANGES);
const unsigned has_character_in_ranges = _mm_cmpestrc(ranges, 6, chunk, 16, _SIDD_CMP_RANGES);
if (json_likely(!has_character_in_ranges)) {
_mm_storeu_si128(reinterpret_cast<__m128i *>(out), chunk);
out += 16;
Expand Down
4 changes: 2 additions & 2 deletions src/detail/skip_chars_sse42.cpp
Expand Up @@ -42,7 +42,7 @@ void skip_any_simple_characters_sse42(decode_context &context) {
for (; end - pos >= 16; pos += 16) {
const auto chunk = _mm_load_si128(reinterpret_cast<const __m128i *>(pos));
constexpr auto flags = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_POSITIVE_POLARITY | _SIDD_LEAST_SIGNIFICANT;
const auto index = _mm_cmpistri(chars, chunk, flags);
const auto index = _mm_cmpestri(chars, 2, chunk, 16, flags);
if (index != 16) {
context.position = pos + index;
return;
Expand Down Expand Up @@ -74,7 +74,7 @@ void skip_any_whitespace_sse42(decode_context &context) {
for (; end - pos >= 16; pos += 16) {
const auto chunk = _mm_load_si128(reinterpret_cast<const __m128i *>(pos));
constexpr auto flags = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_NEGATIVE_POLARITY | _SIDD_LEAST_SIGNIFICANT;
const auto index = _mm_cmpistri(chars, chunk, flags);
const auto index = _mm_cmpestri(chars, 4, chunk, 16, flags);
if (index != 16) {
context.position = pos + index;
return;
Expand Down
21 changes: 21 additions & 0 deletions test/src/test_decode_helpers.cpp
Expand Up @@ -20,6 +20,7 @@

#include <spotify/json/codec/boolean.hpp>
#include <spotify/json/codec/omit.hpp>
#include <spotify/json/codec/string.hpp>
#include <spotify/json/detail/decode_helpers.hpp>

BOOST_AUTO_TEST_SUITE(spotify)
Expand Down Expand Up @@ -474,6 +475,26 @@ BOOST_AUTO_TEST_CASE(json_decode_helpers_decode_object_without_ending_brace) {
}), decode_exception);
}

BOOST_AUTO_TEST_CASE(json_decode_helpers_decode_object_with_null_bytes) {
  // The input is written as three pieces of 16 + 16 + 10 characters. Combined
  // with the 16-byte alignment of the buffer, this puts the null byte embedded
  // in the first key inside the second full 16-byte block (presumably so that
  // it is seen by the aligned 16-byte SSE 4.2 scanning loops).
  //
  // The whitespace padding is significant: the literal must be exactly 42
  // characters long (43 with the terminator), otherwise the closing brace is
  // not where the BOOST_REQUIRE below expects it and the layout is lost.
  alignas(16) char input_data[43] =
      "{           \"AAA"    // '{', 11 spaces, opening quote, "AAA"
      "BBB\0\":true,    \""  // "BBB", null byte, end of key, value, 4 spaces, quote
      "CCC\":true}";         // second key, value, closing brace

  // Guard against accidental edits to the padding above.
  BOOST_REQUIRE(input_data[sizeof(input_data) - 2] == '}');

  // Note: the context spans the whole array, including the terminating null.
  auto ctx = decode_context(input_data, sizeof(input_data));

  // Counts how many keys the callback has been handed so far.
  int num = 0;
  decode_object<codec::string_t>(ctx, [&](std::string &&key) {
    BOOST_CHECK(num < 2);
    if (num == 0) {
      // The first key must keep its embedded null byte, hence the explicit
      // length when constructing the expected string.
      BOOST_CHECK_EQUAL(key, std::string("AAABBB\0", 7));
    } else {
      BOOST_CHECK_EQUAL(key, "CCC");
    }
    num++;
    // Consume the `true` value that follows each key.
    decode_boolean(ctx);
  });
}

BOOST_AUTO_TEST_SUITE_END() // detail
BOOST_AUTO_TEST_SUITE_END() // json
BOOST_AUTO_TEST_SUITE_END() // spotify
10 changes: 10 additions & 0 deletions test/src/test_skip_chars.cpp
Expand Up @@ -82,6 +82,16 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(json_skip_any_simple_characters, use_sse, true_fal
}
}

BOOST_AUTO_TEST_CASE_TEMPLATE(json_skip_any_simple_characters_null_byte_in_string,
                              use_sse,
                              true_false) {
  // A 16-byte aligned buffer holding 'a', a null byte and then a run of double
  // quotes. Only the first 16 bytes are handed to the decoder; the terminator
  // is excluded.
  alignas(16) char input_data[17] = "a\0\"\"\"\"\"\"\"\"\"\"\"\"\"\"";
  decode_context ctx(input_data, input_data + 16);

  // When use_sse::value is false this clears the SSE 4.2 flag; when it is true
  // the flag keeps whatever value the context was created with.
  *const_cast<bool *>(&ctx.has_sse42) &= use_sse::value;

  skip_any_simple_characters(ctx);

  // Both 'a' and the null byte must be skipped, leaving the position on the
  // first double quote at offset 2.
  const auto consumed = ctx.position - input_data;
  BOOST_CHECK_EQUAL(consumed, 2);
}

BOOST_AUTO_TEST_CASE_TEMPLATE(json_skip_any_simple_characters_with_empty_string,
use_sse,
true_false) {
Expand Down
36 changes: 31 additions & 5 deletions test/src/test_string.cpp
Expand Up @@ -53,23 +53,29 @@ std::string random_simple_character(size_t i) {
return std::string(&c, 1);
}

// Returns one element for building an escaped JSON test string, selected by
// `i % 37`: remainders 27..36 yield fixed escape sequences and every other
// remainder is delegated to random_simple_character(i).
//
// In the variant taking `minimal_escaping`, passing true makes the solidus and
// the euro sign come back as raw characters instead of as escape sequences;
// all other cases are unaffected by the flag.
std::string random_simple_character_or_escape_sequence(size_t i) {
std::string random_simple_character_or_escape_sequence(size_t i, bool minimal_escaping) {
switch (i % 37) {
case 27: return "\\u0000";
case 28: return "\\\"";
case 29: return "\\/";
// JSON permits '/' unescaped, so the minimal form is the bare character.
case 29: return minimal_escaping ? "/" : "\\/";
case 30: return "\\b";
case 31: return "\\f";
case 32: return "\\n";
case 33: return "\\r";
case 34: return "\\t";
case 35: return "\\\\";
case 36: return "\\u20AC";
// "\xE2\x82\xAC" is the UTF-8 encoding of U+20AC (euro sign).
case 36: return minimal_escaping ? "\xE2\x82\xAC" : "\\u20AC";
default: return random_simple_character(i);
}
}

std::string random_simple_character_or_unescaped_character(size_t i) {
// Note: since one case below returns 3 bytes and all the others return 1, the
// total number of bytes in one rotation is 39. This number is coprime with
// 16, which is the number of bytes in an SSE block, so this will ensure that
// each byte will end up first in a 16-byte SSE block eventually.
switch (i % 37) {
case 27: return std::string(1, '\0');
case 28: return "\"";
case 29: return "/";
case 30: return "\b";
Expand Down Expand Up @@ -100,10 +106,10 @@ std::string generate_simple_string_answer(size_t size) {
return string;
}

// Builds a double-quoted string by appending, for every i in
// [0, approximate_size), the element returned by
// random_simple_character_or_escape_sequence. The size is only approximate
// because a single element can be several bytes long (e.g. "\\u0000").
//
// In the variant taking `minimal_escaping` (default false), the flag is
// forwarded unchanged to the element generator.
std::string generate_escaped_string(size_t approximate_size) {
std::string generate_escaped_string(size_t approximate_size, bool minimal_escaping = false) {
// Opening quote of the JSON string.
std::string string("\"");
for (size_t i = 0; i < approximate_size; i++) {
string.append(random_simple_character_or_escape_sequence(i));
string.append(random_simple_character_or_escape_sequence(i, minimal_escaping));
}
// Closing quote of the JSON string.
string.append("\"");
return string;
Expand Down Expand Up @@ -323,6 +329,26 @@ BOOST_AUTO_TEST_CASE(json_codec_string_should_encode_escaped_control_characters)
BOOST_CHECK_EQUAL(encode(std::string("\x01\x02")), "\"\\u0001\\u0002\"");
}

BOOST_AUTO_TEST_CASE(json_codec_string_should_encode_null_char) {
  // 31 zero bytes are enough to cover one complete 16-byte SSE block no matter
  // what memory alignment the string data happens to get.
  const std::string input_data(31, '\0');

  // Each null byte is expected to be written as the escape sequence \u0000,
  // with the whole result wrapped in double quotes.
  std::string expected_result(1, '"');
  for (auto remaining = input_data.size(); remaining != 0; --remaining) {
    expected_result.append("\\u0000");
  }
  expected_result.push_back('"');

  BOOST_CHECK_EQUAL(encode(input_data), expected_result);
}

BOOST_AUTO_TEST_CASE(json_codec_string_should_encode_long_string_with_special_chars) {
  // Encode a long generated string (presumably the decoded counterpart of
  // generate_escaped_string) and compare it with the same sequence generated
  // in minimally escaped form.
  const auto unescaped_input = generate_escaped_string_answer(10000);
  const auto minimally_escaped = generate_escaped_string(10000, true);

  // NOTE(review): plain BOOST_CHECK is presumably used so that a failure does
  // not print two ~10000-character strings; confirm before changing.
  BOOST_CHECK(encode(unescaped_input) == minimally_escaped);
}

BOOST_AUTO_TEST_SUITE_END() // codec
BOOST_AUTO_TEST_SUITE_END() // json
BOOST_AUTO_TEST_SUITE_END() // spotify

0 comments on commit abb2350

Please sign in to comment.