Optimize convert_UTF8_to_JSON for mostly ASCII strings

byroot · byroot · commit f8166c2d7f7d · 2024-10-18T19:50:43.000+02:00
If we assume that even UTF-8 strings are mostly ASCII, we can implement a
fast path for the ASCII parts.

Before:

```
== Encoding mixed utf8 (20012001 bytes)
ruby 3.4.0dev (2024-10-18T15:12:54Z master d1b5c10957) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
                json     5.000 i/100ms
                  oj     9.000 i/100ms
           rapidjson     2.000 i/100ms
Calculating -------------------------------------
                json     49.403 (± 2.0%) i/s   (20.24 ms/i) -    250.000 in   5.062647s
                  oj    100.120 (± 2.0%) i/s    (9.99 ms/i) -    504.000 in   5.035349s
           rapidjson     26.404 (± 0.0%) i/s   (37.87 ms/i) -    132.000 in   5.001025s

Comparison:
                json:       49.4 i/s
                  oj:      100.1 i/s - 2.03x  faster
           rapidjson:       26.4 i/s - 1.87x  slower
```

After:

```
== Encoding mixed utf8 (20012001 bytes)
ruby 3.4.0dev (2024-10-18T15:12:54Z master d1b5c10957) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
                json    10.000 i/100ms
                  oj     9.000 i/100ms
           rapidjson     2.000 i/100ms
Calculating -------------------------------------
                json     95.686 (± 2.1%) i/s   (10.45 ms/i) -    480.000 in   5.018575s
                  oj     96.875 (± 2.1%) i/s   (10.32 ms/i) -    486.000 in   5.019097s
           rapidjson     26.260 (± 3.8%) i/s   (38.08 ms/i) -    132.000 in   5.033151s

Comparison:
                json:       95.7 i/s
                  oj:       96.9 i/s - same-ish: difference falls within error
           rapidjson:       26.3 i/s - 3.64x  slower
```
diff --git a/benchmark/encoder.rb b/benchmark/encoder.rb
@@ -17,8 +17,8 @@ def implementations(ruby_obj)
   state = JSON::State.new(JSON.dump_default_options)
 
   {
-    json: ["json", proc { JSON.dump(ruby_obj) }],
     json_state: ["json (reuse)", proc { state.generate(ruby_obj) }],
+    json: ["json", proc { JSON.dump(ruby_obj) }],
     oj: ["oj", proc { Oj.dump(ruby_obj) }],
     rapidjson: ["rapidjson", proc { RapidJSON.dump(ruby_obj) }],
   }
@@ -59,7 +59,8 @@ def benchmark_encoding(benchmark_name, ruby_obj, check_expected: true, except: [
 benchmark_encoding "small nested array", [[1,2,3,4,5]]*10
 benchmark_encoding "small hash", { "username" => "jhawthorn", "id" => 123, "event" => "wrote json serializer" }
 
-# On these two benchmark we perform well.
+# On these three benchmarks we perform well: either on par or within a small margin (faster or slower).
+benchmark_encoding "mixed utf8", ([("a" * 5000) + "€" + ("a" * 5000)] * 2000), except: %i(json_state)
 benchmark_encoding "twitter.json", JSON.load_file("#{__dir__}/data/twitter.json"), except: %i(json_state)
 benchmark_encoding "citm_catalog.json", JSON.load_file("#{__dir__}/data/citm_catalog.json"), except: %i(json_state)
 
diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c
@@ -25,96 +25,77 @@ static ID i_to_s, i_to_json, i_new, i_pack, i_unpack, i_create_id, i_extend;
  * Everything else (should be UTF-8) is just passed through and
  * appended to the result.
  */
-static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_script_safe)
+static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const bool escape_table[256], bool out_script_safe)
 {
     const char *hexdig = "0123456789abcdef";
     char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
 
-    const char *in_utf8_str = RSTRING_PTR(in_string);
-    unsigned long in_utf8_len = RSTRING_LEN(in_string);
+    const char *ptr = RSTRING_PTR(str);
+    unsigned long len = RSTRING_LEN(str);
 
     unsigned long beg = 0, pos;
 
-    for (pos =  0; pos < in_utf8_len;) {
-        uint32_t ch;
-        short ch_len;
-        bool should_escape;
-
-        /* UTF-8 decoding */
-        short i;
-        if      ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos];        } /* leading 1 bit is   0b0     */
-        else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110   */
-        else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110  */
-        else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */
-        else {
-            rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8");
-        }
-
-        for (i = 1; i < ch_len; i++) {
-            ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F);
-        }
-
-        /* JSON policy */
-        should_escape =
-            (ch < 0x20) ||
-            (ch == '"') ||
-            (ch == '\\') ||
-            (out_script_safe && (ch == '/')) ||
-            (out_script_safe && (ch == 0x2028)) ||
-            (out_script_safe && (ch == 0x2029));
-
+    for (pos = 0; pos < len;) {
+        unsigned char ch = ptr[pos];
         /* JSON encoding */
-        if (should_escape) {
-            if (pos > beg) {
-                fbuffer_append(out_buffer, &in_utf8_str[beg], pos - beg);
-            }
-
-            beg = pos + ch_len;
+        if (escape_table[ch]) {
+#define FLUSH_POS(bytes) do { if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += (bytes); beg = pos; } while (0)
             switch (ch) {
-                case '"':  fbuffer_append(out_buffer, "\\\"", 2); break;
-                case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
-                case '/':  fbuffer_append(out_buffer, "\\/", 2); break;
-                case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
-                case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
-                case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
-                case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
-                case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
-                default:
-                    if (ch <= 0xFFFF) {
+                case '"':  FLUSH_POS(1); fbuffer_append(out_buffer, "\\\"", 2); break;
+                case '\\': FLUSH_POS(1); fbuffer_append(out_buffer, "\\\\", 2); break;
+                case '/':  FLUSH_POS(1); fbuffer_append(out_buffer, "\\/", 2); break;
+                case '\b': FLUSH_POS(1); fbuffer_append(out_buffer, "\\b", 2); break;
+                case '\f': FLUSH_POS(1); fbuffer_append(out_buffer, "\\f", 2); break;
+                case '\n': FLUSH_POS(1); fbuffer_append(out_buffer, "\\n", 2); break;
+                case '\r': FLUSH_POS(1); fbuffer_append(out_buffer, "\\r", 2); break;
+                case '\t': FLUSH_POS(1); fbuffer_append(out_buffer, "\\t", 2); break;
+                default: {
+                    if ((ch & 0x80) == 0x00) { /* leading 1 bit is   0b0     */
+                        FLUSH_POS(1);
                         scratch[2] = hexdig[ch >> 12];
                         scratch[3] = hexdig[(ch >> 8) & 0xf];
                         scratch[4] = hexdig[(ch >> 4) & 0xf];
                         scratch[5] = hexdig[ch & 0xf];
                         fbuffer_append(out_buffer, scratch, 6);
+                    } else if ((ch & 0xE0) == 0xC0) { /* leading 3 bits are 0b110   */
+                        pos += 2;
+                    } else if ((ch & 0xF0) == 0xE0) { /* leading 4 bits are 0b1110  */
+                        unsigned char b2 = (pos + 1 < len) ? ptr[pos + 1] : 0; /* never read past the end of the string */
+                        unsigned char b3 = (pos + 2 < len) ? ptr[pos + 2] : 0;
+                        if (out_script_safe && (ch == 0xE2) && (b2 == 0x80)) { /* only E2 80 A8 / E2 80 A9 encode U+2028 / U+2029 */
+                            if (b3 == 0xA8) {
+                                FLUSH_POS(3);
+                                /* E2 80 A8 is U+2028 LINE SEPARATOR: emit it escaped in script-safe mode. */
+                                fbuffer_append(out_buffer, "\\u2028", 6);
+                            } else if (b3 == 0xA9) {
+                                FLUSH_POS(3);
+                                /* E2 80 A9 is U+2029 PARAGRAPH SEPARATOR: emit it escaped in script-safe mode. */
+                                fbuffer_append(out_buffer, "\\u2029", 6);
+                            } else {
+                                pos += 3;
+                            }
+                        } else {
+                            pos += 3;
+                        }
+                    } else if ((ch & 0xF8) == 0xF0) { /* leading 5 bits are 0b11110 */
+                        pos += 4;
                     } else {
-                        uint16_t hi, lo;
-                        ch -= 0x10000;
-                        hi = 0xD800 + (uint16_t)(ch >> 10);
-                        lo = 0xDC00 + (uint16_t)(ch & 0x3FF);
-
-                        scratch[2] = hexdig[hi >> 12];
-                        scratch[3] = hexdig[(hi >> 8) & 0xf];
-                        scratch[4] = hexdig[(hi >> 4) & 0xf];
-                        scratch[5] = hexdig[hi & 0xf];
-
-                        scratch[8] = hexdig[lo >> 12];
-                        scratch[9] = hexdig[(lo >> 8) & 0xf];
-                        scratch[10] = hexdig[(lo >> 4) & 0xf];
-                        scratch[11] = hexdig[lo & 0xf];
-
-                        fbuffer_append(out_buffer, scratch, 12);
+                        // This should be unreachable
+                        rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8");
                     }
+                }
             }
+        } else {
+            pos++;
         }
-
-        pos += ch_len;
     }
+#undef FLUSH_POS
 
-    if (beg < in_utf8_len) {
-        fbuffer_append(out_buffer, &in_utf8_str[beg], in_utf8_len - beg);
+    if (beg < len) {
+        fbuffer_append(out_buffer, &ptr[beg], len - beg);
     }
 
-    RB_GC_GUARD(in_string);
+    RB_GC_GUARD(str);
 }
 
 static const bool escape_table[256] = {
@@ -736,7 +717,7 @@ static void generate_json_string(FBuffer *buffer, VALUE Vstate, JSON_Generator_S
             if (RB_UNLIKELY(state->ascii_only)) {
                 convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe);
             } else {
-                convert_UTF8_to_JSON(buffer, obj, state->script_safe);
+                convert_UTF8_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table, state->script_safe);
             }
             break;
         default: