From 337ae576698a25cc844d966765c80b405c4fd907 Mon Sep 17 00:00:00 2001 From: xxxbxxx Date: Wed, 17 Oct 2018 23:54:26 +0200 Subject: [PATCH] Support for escaped chars in U+1xxxx range. (#60) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Don't over-allocate strings with \u00xx escape sequences. * data_size: off-by-one, one character is always allocated at the beginning of the loop. * offset: not advanced enough, counting characters at the end of the \u00xx sequence twice. v2: clang-format * Support for escaped chars in U+1xxxx range. U+1F603 '😃' can be escaped to "\uD83D\uDE03" see https://tools.ietf.org/id/draft-ietf-json-rfc4627bis-09.html#rfc.section.7 To escape an extended character that is not in the Basic Multilingual Plane, the character is represented as a twelve-character sequence, encoding the UTF-16 surrogate pair. So, for example, a string containing only the G clef character (U+1D11E) may be represented as "\uD834\uDD1E". v2: add comments --- json.c | 102 +++++++++++++++++++++++++++++++++------------------- test/main.c | 6 ++-- 2 files changed, 69 insertions(+), 39 deletions(-) diff --git a/json.c b/json.c index 60655bb..077c6ad 100644 --- a/json.c +++ b/json.c @@ -268,6 +268,7 @@ static int json_get_string_size(struct json_parse_state_s *state, const char quote_to_use = is_single_quote ? '\'' : '"'; const size_t flags_bitset = state->flags_bitset; unsigned long codepoint; + unsigned long high_surrogate = 0; if ((json_parse_flags_allow_location_information & flags_bitset) != 0 && is_key != 0) { @@ -326,7 +327,7 @@ static int json_get_string_size(struct json_parse_state_s *state, state->offset = offset; return 1; } - + codepoint = 0; if (!json_hexadecimal_value(&src[offset + 1], 4, &codepoint)) { // escaped unicode sequences must contain 4 hexadecimal digits! 
@@ -343,25 +344,42 @@ static int json_get_string_size(struct json_parse_state_s *state, // 4 21 U + 10000 U + 10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx // Note: the high and low surrogate halves used by UTF-16 (U+D800 through U+DFFF) and code points not encodable by UTF-16 (those after U+10FFFF) are not legal Unicode values, and their UTF-8 encoding must be treated as an invalid byte sequence. - if (codepoint <= 0x7f) { - offset += 1; + if (high_surrogate != 0) { + // we previously read the high half of the \uxxxx\uxxxx pair, + // so now we expect the low half. + if (codepoint >= 0xdc00 && codepoint <= 0xdfff) { // low surrogate range + data_size += 3; + high_surrogate = 0; + } else { + state->error = json_parse_error_invalid_string_escape_sequence; + state->offset = offset; + return 1; + } + } else if (codepoint <= 0x7f) { + data_size += 0; + } else if (codepoint <= 0x7ff) { data_size += 1; - } - else if (codepoint <= 0x7ff) { - offset += 2; - data_size += 2; - } - else if (codepoint >= 0xd800 && codepoint <= 0xdfff) { + } else if (codepoint >= 0xd800 && codepoint <= 0xdbff) { // high surrogate range + // the codepoint is the first half of a "utf-16 surrogate pair" + // so we need the other half for it to be valid: \uHHHH\uLLLL + if (offset + 11 > size || '\\' != src[offset + 5] || + 'u' != src[offset + 6]) { + state->error = json_parse_error_invalid_string_escape_sequence; + state->offset = offset; + return 1; + } + high_surrogate = codepoint; + } else if (codepoint >= 0xd800 && codepoint <= 0xdfff) { // low surrogate range + // we did not read the other half before. 
state->error = json_parse_error_invalid_string_escape_sequence; state->offset = offset; return 1; + } else { + data_size += 2; } - else { - offset += 3; - data_size += 3; - } - // codepoints after 0xffff are not supported in json + // escaped codepoints after 0xffff are supported in json through utf-16 surrogate pairs: \uD83D\uDD25 for U+1F525 + offset += 5; break; } } else if (('\r' == src[offset]) || ('\n' == src[offset])) { @@ -804,6 +822,7 @@ static int json_get_number_size(struct json_parse_state_s *state) { if (json_parse_flags_allow_equals_in_object & flags_bitset) { break; } + // FALLTHROUGH default: state->error = json_parse_error_invalid_number_format; state->offset = offset; @@ -937,6 +956,7 @@ static void json_parse_string(struct json_parse_state_s *state, const char *const src = state->src; const char quote_to_use = '\'' == src[offset] ? '\'' : '"'; char *data = state->data; + unsigned long high_surrogate = 0; unsigned long codepoint; string->string = data; @@ -952,30 +972,38 @@ static void json_parse_string(struct json_parse_state_s *state, switch (src[offset++]) { default: return; // we cannot ever reach here - case 'u': - { - codepoint = 0; - if (!json_hexadecimal_value(&src[offset], 4, &codepoint)) { - return; // this shouldn't happen as the value was already validated - } - - offset += 4; - - if (codepoint <= 0x7fu) { - data[bytes_written++] = (char)codepoint; // 0xxxxxxx - } - else if (codepoint <= 0x7ffu) { - data[bytes_written++] = (char)(0xc0u | (codepoint >> 6)); // 110xxxxx - data[bytes_written++] = (char)(0x80u | (codepoint & 0x3fu)); // 10xxxxxx - } - else { - // we assume the value was validated and thus is within the valid range - data[bytes_written++] = (char)(0xe0u | (codepoint >> 12)); // 1110xxxx - data[bytes_written++] = (char)(0x80u | ((codepoint >> 6) & 0x3fu)); // 10xxxxxx - data[bytes_written++] = (char)(0x80u | (codepoint & 0x3fu)); // 10xxxxxx - } + case 'u': { + codepoint = 0; + if (!json_hexadecimal_value(&src[offset], 4, 
&codepoint)) { + return; // this shouldn't happen as the value was already validated } - break; + + offset += 4; + + if (codepoint <= 0x7fu) { + data[bytes_written++] = (char)codepoint; // 0xxxxxxx + } else if (codepoint <= 0x7ffu) { + data[bytes_written++] = (char)(0xc0u | (codepoint >> 6)); // 110xxxxx + data[bytes_written++] = (char)(0x80u | (codepoint & 0x3fu)); // 10xxxxxx + } else if (codepoint >= 0xd800 && codepoint <= 0xdbff) { // high surrogate + high_surrogate = codepoint; + continue; // we need the low half to form a complete codepoint. + } else if (codepoint >= 0xdc00 && codepoint <= 0xdfff) { // low surrogate + // combine with the previously read half to obtain the complete codepoint. + const unsigned long surrogate_offset = 0x10000u - (0xD800u << 10) - 0xDC00u; + codepoint = (high_surrogate << 10) + codepoint + surrogate_offset; + high_surrogate = 0; + data[bytes_written++] = (char)(0xF0u | (codepoint >> 18)); // 11110xxx + data[bytes_written++] = (char)(0x80u | ((codepoint >> 12) & 0x3fu)); // 10xxxxxx + data[bytes_written++] = (char)(0x80u | ((codepoint >> 6) & 0x3fu)); // 10xxxxxx + data[bytes_written++] = (char)(0x80u | (codepoint & 0x3fu)); // 10xxxxxx + } else { + // we assume the value was validated and thus is within the valid range + data[bytes_written++] = (char)(0xe0u | (codepoint >> 12)); // 1110xxxx + data[bytes_written++] = (char)(0x80u | ((codepoint >> 6) & 0x3fu)); // 10xxxxxx + data[bytes_written++] = (char)(0x80u | (codepoint & 0x3fu)); // 10xxxxxx + } + } break; case '"': data[bytes_written++] = '"'; break; diff --git a/test/main.c b/test/main.c index 277d549..d549bd5 100644 --- a/test/main.c +++ b/test/main.c @@ -808,8 +808,10 @@ UTEST(object, empty_strings) { UTEST(string, unicode_escape) { - const char expected_str[] = "\xEA\x83\x8A" "ABC" "\xC3\x8A" "DEF" "\n"; - const char payload[] = "[\"\\ua0caABC\\u00caDEF\\u000a\"]"; + const char expected_str[] = "\xEA\x83\x8A" "ABC" "\xC3\x8A" "DEF" "\n" + " 
,\xC5\xBD,\xE0\xA0\x80,\xE0\xA6\xA8,\xE2\x99\x9E,\xEF\xBF\xBD,\xD0\xA8,\xE4\x93\x8D,\xF0\x90\x80\x80,\xF0\x9F\x98\x83."; + const char payload[] = "[\"\\ua0caABC\\u00caDEF\\u000a" + "\\u0020,\\u017D,\\u0800,\\u09A8,\\u265E,\\uFFFD,\\u0428,\\u44CD,\\uD800\\uDC00,\\uD83D\\uDE03.\"]"; struct json_value_s *value = json_parse(payload, strlen(payload)); struct json_array_s *array = 0; struct json_string_s *str = 0;