Skip to content

Commit f8166c2

Browse files
committed
Optimize convert_UTF8_to_JSON for mostly ASCII strings
If we assume that even UTF-8 strings are mostly ASCII, we can implement a fast path for the ASCII parts.

Before:

```
== Encoding mixed utf8 (20012001 bytes)
ruby 3.4.0dev (2024-10-18T15:12:54Z master d1b5c10957) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
                json     5.000 i/100ms
                  oj     9.000 i/100ms
           rapidjson     2.000 i/100ms
Calculating -------------------------------------
                json     49.403 (± 2.0%) i/s (20.24 ms/i) - 250.000 in 5.062647s
                  oj    100.120 (± 2.0%) i/s (9.99 ms/i) - 504.000 in 5.035349s
           rapidjson     26.404 (± 0.0%) i/s (37.87 ms/i) - 132.000 in 5.001025s

Comparison:
                json:       49.4 i/s
                  oj:      100.1 i/s - 2.03x faster
           rapidjson:       26.4 i/s - 1.87x slower
```

After:

```
== Encoding mixed utf8 (20012001 bytes)
ruby 3.4.0dev (2024-10-18T15:12:54Z master d1b5c10957) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
                json    10.000 i/100ms
                  oj     9.000 i/100ms
           rapidjson     2.000 i/100ms
Calculating -------------------------------------
                json     95.686 (± 2.1%) i/s (10.45 ms/i) - 480.000 in 5.018575s
                  oj     96.875 (± 2.1%) i/s (10.32 ms/i) - 486.000 in 5.019097s
           rapidjson     26.260 (± 3.8%) i/s (38.08 ms/i) - 132.000 in 5.033151s

Comparison:
                json:       95.7 i/s
                  oj:       96.9 i/s - same-ish: difference falls within error
           rapidjson:       26.3 i/s - 3.64x slower
```
1 parent 9ce215d commit f8166c2

File tree

2 files changed

+53
-71
lines changed

2 files changed

+53
-71
lines changed

benchmark/encoder.rb

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@ def implementations(ruby_obj)
1717
state = JSON::State.new(JSON.dump_default_options)
1818

1919
{
20-
json: ["json", proc { JSON.dump(ruby_obj) }],
2120
json_state: ["json (reuse)", proc { state.generate(ruby_obj) }],
21+
json: ["json", proc { JSON.dump(ruby_obj) }],
2222
oj: ["oj", proc { Oj.dump(ruby_obj) }],
2323
rapidjson: ["rapidjson", proc { RapidJSON.dump(ruby_obj) }],
2424
}
@@ -59,7 +59,8 @@ def benchmark_encoding(benchmark_name, ruby_obj, check_expected: true, except: [
5959
benchmark_encoding "small nested array", [[1,2,3,4,5]]*10
6060
benchmark_encoding "small hash", { "username" => "jhawthorn", "id" => 123, "event" => "wrote json serializer" }
6161

62-
# On these two benchmark we perform well.
62+
# On these three benchmarks we perform well. Either on par or very closely faster/slower
63+
benchmark_encoding "mixed utf8", ([("a" * 5000) + "€" + ("a" * 5000)] * 2000), except: %i(json_state)
6364
benchmark_encoding "twitter.json", JSON.load_file("#{__dir__}/data/twitter.json"), except: %i(json_state)
6465
benchmark_encoding "citm_catalog.json", JSON.load_file("#{__dir__}/data/citm_catalog.json"), except: %i(json_state)
6566

ext/json/ext/generator/generator.c

Lines changed: 50 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -25,96 +25,77 @@ static ID i_to_s, i_to_json, i_new, i_pack, i_unpack, i_create_id, i_extend;
2525
* Everything else (should be UTF-8) is just passed through and
2626
* appended to the result.
2727
*/
28-
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_script_safe)
28+
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const bool escape_table[256], bool out_script_safe)
2929
{
3030
const char *hexdig = "0123456789abcdef";
3131
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
3232

33-
const char *in_utf8_str = RSTRING_PTR(in_string);
34-
unsigned long in_utf8_len = RSTRING_LEN(in_string);
33+
const char *ptr = RSTRING_PTR(str);
34+
unsigned long len = RSTRING_LEN(str);
3535

3636
unsigned long beg = 0, pos;
3737

38-
for (pos = 0; pos < in_utf8_len;) {
39-
uint32_t ch;
40-
short ch_len;
41-
bool should_escape;
42-
43-
/* UTF-8 decoding */
44-
short i;
45-
if ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos]; } /* leading 1 bit is 0b0 */
46-
else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110 */
47-
else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110 */
48-
else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */
49-
else {
50-
rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8");
51-
}
52-
53-
for (i = 1; i < ch_len; i++) {
54-
ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F);
55-
}
56-
57-
/* JSON policy */
58-
should_escape =
59-
(ch < 0x20) ||
60-
(ch == '"') ||
61-
(ch == '\\') ||
62-
(out_script_safe && (ch == '/')) ||
63-
(out_script_safe && (ch == 0x2028)) ||
64-
(out_script_safe && (ch == 0x2029));
65-
38+
for (pos = 0; pos < len;) {
39+
unsigned char ch = ptr[pos];
6640
/* JSON encoding */
67-
if (should_escape) {
68-
if (pos > beg) {
69-
fbuffer_append(out_buffer, &in_utf8_str[beg], pos - beg);
70-
}
71-
72-
beg = pos + ch_len;
41+
if (escape_table[ch]) {
42+
#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos;
7343
switch (ch) {
74-
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
75-
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
76-
case '/': fbuffer_append(out_buffer, "\\/", 2); break;
77-
case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
78-
case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
79-
case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
80-
case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
81-
case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
82-
default:
83-
if (ch <= 0xFFFF) {
44+
case '"': FLUSH_POS(1); fbuffer_append(out_buffer, "\\\"", 2); break;
45+
case '\\': FLUSH_POS(1); fbuffer_append(out_buffer, "\\\\", 2); break;
46+
case '/': FLUSH_POS(1); fbuffer_append(out_buffer, "\\/", 2); break;
47+
case '\b': FLUSH_POS(1); fbuffer_append(out_buffer, "\\b", 2); break;
48+
case '\f': FLUSH_POS(1); fbuffer_append(out_buffer, "\\f", 2); break;
49+
case '\n': FLUSH_POS(1); fbuffer_append(out_buffer, "\\n", 2); break;
50+
case '\r': FLUSH_POS(1); fbuffer_append(out_buffer, "\\r", 2); break;
51+
case '\t': FLUSH_POS(1); fbuffer_append(out_buffer, "\\t", 2); break;
52+
default: {
53+
if ((ch & 0x80) == 0x00) { /* leading 1 bit is 0b0 */
54+
FLUSH_POS(1);
8455
scratch[2] = hexdig[ch >> 12];
8556
scratch[3] = hexdig[(ch >> 8) & 0xf];
8657
scratch[4] = hexdig[(ch >> 4) & 0xf];
8758
scratch[5] = hexdig[ch & 0xf];
8859
fbuffer_append(out_buffer, scratch, 6);
60+
} else if ((ch & 0xE0) == 0xC0) { /* leading 3 bits are 0b110 */
61+
pos += 2;
62+
} else if ((ch & 0xF0) == 0xE0) { /* leading 4 bits are 0b1110 */
63+
unsigned char b2 = ptr[pos + 1];
64+
unsigned char b3 = ptr[pos + 2];
65+
if (out_script_safe && (b2 == 0x80)) {
66+
if (b3 == 0xA8) {
67+
FLUSH_POS(3);
68+
fprintf(stderr, "escape: \\u2028 pos = %ld\n", pos);
69+
fbuffer_append(out_buffer, "\\u2028", 6);
70+
} else if (b3 == 0xA9) {
71+
FLUSH_POS(3);
72+
fprintf(stderr, "escape: \\u2029 pos = %ld\n", pos);
73+
fbuffer_append(out_buffer, "\\u2029", 6);
74+
} else {
75+
pos += 3;
76+
}
77+
} else {
78+
pos += 3;
79+
}
80+
} else if ((ch & 0xF8) == 0xF0) { /* leading 5 bits are 0b11110 */
81+
pos += 4;
8982
} else {
90-
uint16_t hi, lo;
91-
ch -= 0x10000;
92-
hi = 0xD800 + (uint16_t)(ch >> 10);
93-
lo = 0xDC00 + (uint16_t)(ch & 0x3FF);
94-
95-
scratch[2] = hexdig[hi >> 12];
96-
scratch[3] = hexdig[(hi >> 8) & 0xf];
97-
scratch[4] = hexdig[(hi >> 4) & 0xf];
98-
scratch[5] = hexdig[hi & 0xf];
99-
100-
scratch[8] = hexdig[lo >> 12];
101-
scratch[9] = hexdig[(lo >> 8) & 0xf];
102-
scratch[10] = hexdig[(lo >> 4) & 0xf];
103-
scratch[11] = hexdig[lo & 0xf];
104-
105-
fbuffer_append(out_buffer, scratch, 12);
83+
// This should be unreachable
84+
rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8");
10685
}
86+
}
10787
}
88+
} else {
89+
pos++;
10890
}
109-
110-
pos += ch_len;
11191
}
92+
#undef FLUSH_POS
11293

113-
if (beg < in_utf8_len) {
114-
fbuffer_append(out_buffer, &in_utf8_str[beg], in_utf8_len - beg);
94+
if (beg < len) {
95+
fbuffer_append(out_buffer, &ptr[beg], len - beg);
11596
}
11697

117-
RB_GC_GUARD(in_string);
98+
RB_GC_GUARD(str);
11899
}
119100

120101
static const bool escape_table[256] = {
@@ -736,7 +717,7 @@ static void generate_json_string(FBuffer *buffer, VALUE Vstate, JSON_Generator_S
736717
if (RB_UNLIKELY(state->ascii_only)) {
737718
convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe);
738719
} else {
739-
convert_UTF8_to_JSON(buffer, obj, state->script_safe);
720+
convert_UTF8_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table, state->script_safe);
740721
}
741722
break;
742723
default:

0 commit comments

Comments
 (0)