Skip to content

Commit

Permalink
Support for escaped chars in U+1xxxx range. (#60)
Browse files Browse the repository at this point in the history
* Don't over-allocate strings with \u00xx escape sequences.

 * data_size: off-by-one, one caracter is always allocated at the begin of the loop.
 * offset: not advanced enough, counting characters at the end of the \u00xx sequence twice.

v2: clang-format

* Support for escaped chars in U+1xxxx range.

 U+1F603 '😃' can be escaped to "\uD83D\uDE03"

see https://tools.ietf.org/id/draft-ietf-json-rfc4627bis-09.html#rfc.section.7
    To escape an extended character that is not in the Basic Multilingual Plane, the character is represented as a twelve-character sequence, encoding the UTF-16 surrogate pair. So, for example, a string containing only the G clef character (U+1D11E) may be represented as "\uD834\uDD1E".

v2: add comments
  • Loading branch information
xxxbxxx authored and sheredom committed Oct 17, 2018
1 parent 7dc2565 commit 337ae57
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 39 deletions.
102 changes: 65 additions & 37 deletions json.c
Expand Up @@ -268,6 +268,7 @@ static int json_get_string_size(struct json_parse_state_s *state,
const char quote_to_use = is_single_quote ? '\'' : '"';
const size_t flags_bitset = state->flags_bitset;
unsigned long codepoint;
unsigned long high_surrogate = 0;

if ((json_parse_flags_allow_location_information & flags_bitset) != 0 &&
is_key != 0) {
Expand Down Expand Up @@ -326,7 +327,7 @@ static int json_get_string_size(struct json_parse_state_s *state,
state->offset = offset;
return 1;
}

codepoint = 0;
if (!json_hexadecimal_value(&src[offset + 1], 4, &codepoint)) {
// escaped unicode sequences must contain 4 hexadecimal digits!
Expand All @@ -343,25 +344,42 @@ static int json_get_string_size(struct json_parse_state_s *state,
// 4 21 U + 10000 U + 10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// Note: the high and low surrogate halves used by UTF-16 (U+D800 through U+DFFF) and code points not encodable by UTF-16 (those after U+10FFFF) are not legal Unicode values, and their UTF-8 encoding must be treated as an invalid byte sequence.

if (codepoint <= 0x7f) {
offset += 1;
if (high_surrogate != 0) {
// we previously read the high half of the \uxxxx\uxxxx pair,
// so now we expect the low half.
if (codepoint >= 0xdc00 && codepoint <= 0xdfff) { // low surrogate range
data_size += 3;
high_surrogate = 0;
} else {
state->error = json_parse_error_invalid_string_escape_sequence;
state->offset = offset;
return 1;
}
} else if (codepoint <= 0x7f) {
data_size += 0;
} else if (codepoint <= 0x7ff) {
data_size += 1;
}
else if (codepoint <= 0x7ff) {
offset += 2;
data_size += 2;
}
else if (codepoint >= 0xd800 && codepoint <= 0xdfff) {
} else if (codepoint >= 0xd800 && codepoint <= 0xdbff) { // high surrogate range
// the codepoint is the first half of a "utf-16 surrogate pair"
// so we need the other half for it to be valid: \uHHHH\uLLLL
if (offset + 11 > size || '\\' != src[offset + 5] ||
'u' != src[offset + 6]) {
state->error = json_parse_error_invalid_string_escape_sequence;
state->offset = offset;
return 1;
}
high_surrogate = codepoint;
} else if (codepoint >= 0xd800 && codepoint <= 0xdfff) { // low surrogate range
// we did not read the other half before.
state->error = json_parse_error_invalid_string_escape_sequence;
state->offset = offset;
return 1;
} else {
data_size += 2;
}
else {
offset += 3;
data_size += 3;
}
// codepoints after 0xffff are not supported in json
// escaped codepoints after 0xffff are supported in json through utf-16 surrogate pairs: \uD83D\uDD25 for U+1F525

offset += 5;
break;
}
} else if (('\r' == src[offset]) || ('\n' == src[offset])) {
Expand Down Expand Up @@ -804,6 +822,7 @@ static int json_get_number_size(struct json_parse_state_s *state) {
if (json_parse_flags_allow_equals_in_object & flags_bitset) {
break;
}
// FALLTHROUGH
default:
state->error = json_parse_error_invalid_number_format;
state->offset = offset;
Expand Down Expand Up @@ -937,6 +956,7 @@ static void json_parse_string(struct json_parse_state_s *state,
const char *const src = state->src;
const char quote_to_use = '\'' == src[offset] ? '\'' : '"';
char *data = state->data;
unsigned long high_surrogate = 0;
unsigned long codepoint;

string->string = data;
Expand All @@ -952,30 +972,38 @@ static void json_parse_string(struct json_parse_state_s *state,
switch (src[offset++]) {
default:
return; // we cannot ever reach here
case 'u':
{
codepoint = 0;
if (!json_hexadecimal_value(&src[offset], 4, &codepoint)) {
return; // this shouldn't happen as the value was already validated
}

offset += 4;

if (codepoint <= 0x7fu) {
data[bytes_written++] = (char)codepoint; // 0xxxxxxx
}
else if (codepoint <= 0x7ffu) {
data[bytes_written++] = (char)(0xc0u | (codepoint >> 6)); // 110xxxxx
data[bytes_written++] = (char)(0x80u | (codepoint & 0x3fu)); // 10xxxxxx
}
else {
// we assume the value was validated and thus is within the valid range
data[bytes_written++] = (char)(0xe0u | (codepoint >> 12)); // 1110xxxx
data[bytes_written++] = (char)(0x80u | ((codepoint >> 6) & 0x3fu)); // 10xxxxxx
data[bytes_written++] = (char)(0x80u | (codepoint & 0x3fu)); // 10xxxxxx
}
case 'u': {
codepoint = 0;
if (!json_hexadecimal_value(&src[offset], 4, &codepoint)) {
return; // this shouldn't happen as the value was already validated
}
break;

offset += 4;

if (codepoint <= 0x7fu) {
data[bytes_written++] = (char)codepoint; // 0xxxxxxx
} else if (codepoint <= 0x7ffu) {
data[bytes_written++] = (char)(0xc0u | (codepoint >> 6)); // 110xxxxx
data[bytes_written++] = (char)(0x80u | (codepoint & 0x3fu)); // 10xxxxxx
} else if (codepoint >= 0xd800 && codepoint <= 0xdbff) { // high surrogate
high_surrogate = codepoint;
continue; // we need the low half to form a complete codepoint.
} else if (codepoint >= 0xdc00 && codepoint <= 0xdfff) { // low surrogate
// combine with the previously read half to obtain the complete codepoint.
const unsigned long surrogate_offset = 0x10000u - (0xD800u << 10) - 0xDC00u;
codepoint = (high_surrogate << 10) + codepoint + surrogate_offset;
high_surrogate = 0;
data[bytes_written++] = (char)(0xF0u | (codepoint >> 18)); // 11110xxx
data[bytes_written++] = (char)(0x80u | ((codepoint >> 12) & 0x3fu)); // 10xxxxxx
data[bytes_written++] = (char)(0x80u | ((codepoint >> 6) & 0x3fu)); // 10xxxxxx
data[bytes_written++] = (char)(0x80u | (codepoint & 0x3fu)); // 10xxxxxx
} else {
// we assume the value was validated and thus is within the valid range
data[bytes_written++] = (char)(0xe0u | (codepoint >> 12)); // 1110xxxx
data[bytes_written++] = (char)(0x80u | ((codepoint >> 6) & 0x3fu)); // 10xxxxxx
data[bytes_written++] = (char)(0x80u | (codepoint & 0x3fu)); // 10xxxxxx
}
} break;
case '"':
data[bytes_written++] = '"';
break;
Expand Down
6 changes: 4 additions & 2 deletions test/main.c
Expand Up @@ -808,8 +808,10 @@ UTEST(object, empty_strings) {


UTEST(string, unicode_escape) {
const char expected_str[] = "\xEA\x83\x8A" "ABC" "\xC3\x8A" "DEF" "\n";
const char payload[] = "[\"\\ua0caABC\\u00caDEF\\u000a\"]";
const char expected_str[] = "\xEA\x83\x8A" "ABC" "\xC3\x8A" "DEF" "\n"
" ,\xC5\xBD,\xE0\xA0\x80,\xE0\xA6\xA8,\xE2\x99\x9E,\xEF\xBF\xBD,\xD0\xA8,\xE4\x93\x8D,\xF0\x90\x80\x80,\xF0\x9F\x98\x83.";
const char payload[] = "[\"\\ua0caABC\\u00caDEF\\u000a"
"\\u0020,\\u017D,\\u0800,\\u09A8,\\u265E,\\uFFFD,\\u0428,\\u44CD,\\uD800\\uDC00,\\uD83D\\uDE03.\"]";
struct json_value_s *value = json_parse(payload, strlen(payload));
struct json_array_s *array = 0;
struct json_string_s *str = 0;
Expand Down

0 comments on commit 337ae57

Please sign in to comment.