diff --git a/CHANGES.md b/CHANGES.md
index d8915b13..6ad2da7c 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -2,6 +2,7 @@
 
 ### Unreleased
 
+* `JSON::Coder` now also yields to the block when encountering strings with invalid encoding.
 * Fix GeneratorError messages to be UTF-8 encoded.
 * Fix memory leak when `Exception` is raised, or `throw` is used during JSON generation.
 * Optimized floating point number parsing by integrating the ryu algorithm (thanks to Josef Šimánek).
diff --git a/README.md b/README.md
index eed71ba4..fee99305 100644
--- a/README.md
+++ b/README.md
@@ -113,7 +113,23 @@ puts MyApp::API_JSON_CODER.dump(Time.now.utc) # => "2025-01-21T08:41:44.286Z"
 The provided block is called for all objects that don't have a native JSON equivalent, and
 must return a Ruby object that has a native JSON equivalent.
 
-It is also called for objects that do have a JSON equivalent, but are used as Hash keys, for instance `{ 1 => 2}`.
+It is also called for objects that do have a JSON equivalent, but are used as Hash keys, for instance `{ 1 => 2}`,
+as well as for strings that aren't valid UTF-8:
+
+```ruby
+coder = JSON::Coder.new do |object, is_object_key|
+  case object
+  when String
+    if !object.valid_encoding? || object.encoding != Encoding::UTF_8
+      Base64.encode64(object)
+    else
+      object
+    end
+  else
+    object
+  end
+end
+```
 
 ## Combining JSON fragments
 
diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c
index 72155abe..024a8572 100644
--- a/ext/json/ext/generator/generator.c
+++ b/ext/json/ext/generator/generator.c
@@ -996,13 +996,12 @@ static inline VALUE vstate_get(struct generate_json_data *data)
     return data->vstate;
 }
 
-struct hash_foreach_arg {
-    VALUE hash;
-    struct generate_json_data *data;
-    int first_key_type;
-    bool first;
-    bool mixed_keys_encountered;
-};
+static VALUE
+json_call_as_json(JSON_Generator_State *state, VALUE object, VALUE is_key)
+{
+    VALUE proc_args[2] = {object, is_key};
+    return rb_proc_call_with_block(state->as_json, 2, proc_args, Qnil);
+}
 
 static VALUE
 convert_string_subclass(VALUE key)
@@ -1019,6 +1018,139 @@ convert_string_subclass(VALUE key)
     return key_to_s;
 }
 
+static bool enc_utf8_compatible_p(int enc_idx)
+{
+    if (enc_idx == usascii_encindex) return true;
+    if (enc_idx == utf8_encindex) return true;
+    return false;
+}
+
+static VALUE encode_json_string_try(VALUE str)
+{
+    return rb_funcall(str, i_encode, 1, Encoding_UTF_8);
+}
+
+static VALUE encode_json_string_rescue(VALUE str, VALUE exception)
+{
+    raise_generator_error_str(str, rb_funcall(exception, rb_intern("message"), 0));
+    return Qundef;
+}
+
+// A string can be emitted without conversion when it is 7-bit clean, or when
+// it is a valid byte sequence tagged as UTF-8 or US-ASCII.
+static inline bool valid_json_string_p(VALUE str)
+{
+    int coderange = rb_enc_str_coderange(str);
+
+    if (RB_LIKELY(coderange == ENC_CODERANGE_7BIT)) {
+        return true;
+    }
+
+    if (RB_LIKELY(coderange == ENC_CODERANGE_VALID)) {
+        return enc_utf8_compatible_p(RB_ENCODING_GET_INLINED(str));
+    }
+
+    return false;
+}
+
+// Returns the value to serialize in place of `str`. Valid strings are returned as-is.
+// Otherwise, in strict mode with an `as_json` proc that was not already called,
+// the proc may substitute the string; a non-String result is rejected for keys.
+// If the proc returns `str` itself (or doesn't apply), fall back to reinterpreting
+// BINARY as UTF-8 when possible, else to `String#encode`.
+static inline VALUE ensure_valid_encoding(struct generate_json_data *data, VALUE str, bool as_json_called, bool is_key)
+{
+    if (RB_LIKELY(valid_json_string_p(str))) {
+        return str;
+    }
+
+    if (!as_json_called && data->state->strict && RTEST(data->state->as_json)) {
+        VALUE coerced_str = json_call_as_json(data->state, str, is_key ? Qtrue : Qfalse);
+        if (coerced_str != str) {
+            if (RB_TYPE_P(coerced_str, T_STRING)) {
+                if (!valid_json_string_p(coerced_str)) {
+                    raise_generator_error(str, "source sequence is illegal/malformed utf-8");
+                }
+            } else {
+                // as_json could return another type than T_STRING
+                if (is_key) {
+                    raise_generator_error(coerced_str, "%"PRIsVALUE" not allowed as object key in JSON", CLASS_OF(coerced_str));
+                }
+            }
+
+            return coerced_str;
+        }
+    }
+
+    if (RB_ENCODING_GET_INLINED(str) == binary_encindex) {
+        VALUE utf8_string = rb_enc_associate_index(rb_str_dup(str), utf8_encindex);
+        switch (rb_enc_str_coderange(utf8_string)) {
+            case ENC_CODERANGE_7BIT:
+                return utf8_string;
+            case ENC_CODERANGE_VALID:
+                // For historical reason, we silently reinterpret binary strings as UTF-8 if it would work.
+                // TODO: Raise in 3.0.0
+                rb_warn("JSON.generate: UTF-8 string passed as BINARY, this will raise an encoding error in json 3.0");
+                return utf8_string;
+                break;
+        }
+    }
+
+    return rb_rescue(encode_json_string_try, str, encode_json_string_rescue, str);
+}
+
+// Appends `obj` as a quoted JSON string without any encoding conversion;
+// raises a GeneratorError if its code range is neither 7BIT nor VALID.
+static void raw_generate_json_string(FBuffer *buffer, struct generate_json_data *data, VALUE obj)
+{
+    fbuffer_append_char(buffer, '"');
+
+    long len;
+    search_state search;
+    search.buffer = buffer;
+    RSTRING_GETMEM(obj, search.ptr, len);
+    search.cursor = search.ptr;
+    search.end = search.ptr + len;
+
+#ifdef HAVE_SIMD
+    search.matches_mask = 0;
+    search.has_matches = false;
+    search.chunk_base = NULL;
+#endif /* HAVE_SIMD */
+
+    switch (rb_enc_str_coderange(obj)) {
+        case ENC_CODERANGE_7BIT:
+        case ENC_CODERANGE_VALID:
+            if (RB_UNLIKELY(data->state->ascii_only)) {
+                convert_UTF8_to_ASCII_only_JSON(&search, data->state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
+            } else if (RB_UNLIKELY(data->state->script_safe)) {
+                convert_UTF8_to_script_safe_JSON(&search);
+            } else {
+                convert_UTF8_to_JSON(&search);
+            }
+            break;
+        default:
+            raise_generator_error(obj, "source sequence is illegal/malformed utf-8");
+            break;
+    }
+    fbuffer_append_char(buffer, '"');
+}
+
+// Runs `obj` through ensure_valid_encoding, then appends it as a JSON string.
+static void generate_json_string(FBuffer *buffer, struct generate_json_data *data, VALUE obj)
+{
+    obj = ensure_valid_encoding(data, obj, false, false);
+    raw_generate_json_string(buffer, data, obj);
+}
+
+struct hash_foreach_arg {
+    VALUE hash;
+    struct generate_json_data *data;
+    int first_key_type;
+    bool first;
+    bool mixed_keys_encountered;
+};
+
 NOINLINE()
 static void
 json_inspect_hash_with_mixed_keys(struct hash_foreach_arg *arg)
@@ -1035,13 +1167,6 @@ json_inspect_hash_with_mixed_keys(struct hash_foreach_arg *arg)
     }
 }
 
-static VALUE
-json_call_as_json(JSON_Generator_State *state, VALUE object, VALUE is_key)
-{
-    VALUE proc_args[2] = {object, is_key};
-    return rb_proc_call_with_block(state->as_json, 2, proc_args, Qnil);
-}
-
 static int
 json_object_i(VALUE key, VALUE val, VALUE _arg)
 {
@@ -1107,8 +1232,10 @@ json_object_i(VALUE key, VALUE val, VALUE _arg)
             break;
     }
 
+    key_to_s = ensure_valid_encoding(data, key_to_s, as_json_called, true);
+
     if (RB_LIKELY(RBASIC_CLASS(key_to_s) == rb_cString)) {
-        generate_json_string(buffer, data, key_to_s);
+        raw_generate_json_string(buffer, data, key_to_s);
     } else {
         generate_json(buffer, data, key_to_s);
     }
@@ -1191,85 +1318,6 @@ static void generate_json_array(FBuffer *buffer, struct generate_json_data *data
     fbuffer_append_char(buffer, ']');
 }
 
-static inline int enc_utf8_compatible_p(int enc_idx)
-{
-    if (enc_idx == usascii_encindex) return 1;
-    if (enc_idx == utf8_encindex) return 1;
-    return 0;
-}
-
-static VALUE encode_json_string_try(VALUE str)
-{
-    return rb_funcall(str, i_encode, 1, Encoding_UTF_8);
-}
-
-static VALUE encode_json_string_rescue(VALUE str, VALUE exception)
-{
-    raise_generator_error_str(str, rb_funcall(exception, rb_intern("message"), 0));
-    return Qundef;
-}
-
-static inline VALUE ensure_valid_encoding(VALUE str)
-{
-    int encindex = RB_ENCODING_GET(str);
-    VALUE utf8_string;
-    if (RB_UNLIKELY(!enc_utf8_compatible_p(encindex))) {
-        if (encindex == binary_encindex) {
-            utf8_string = rb_enc_associate_index(rb_str_dup(str), utf8_encindex);
-            switch (rb_enc_str_coderange(utf8_string)) {
-                case ENC_CODERANGE_7BIT:
-                    return utf8_string;
-                case ENC_CODERANGE_VALID:
-                    // For historical reason, we silently reinterpret binary strings as UTF-8 if it would work.
-                    // TODO: Raise in 3.0.0
-                    rb_warn("JSON.generate: UTF-8 string passed as BINARY, this will raise an encoding error in json 3.0");
-                    return utf8_string;
-                    break;
-            }
-        }
-
-        str = rb_rescue(encode_json_string_try, str, encode_json_string_rescue, str);
-    }
-    return str;
-}
-
-static void generate_json_string(FBuffer *buffer, struct generate_json_data *data, VALUE obj)
-{
-    obj = ensure_valid_encoding(obj);
-
-    fbuffer_append_char(buffer, '"');
-
-    long len;
-    search_state search;
-    search.buffer = buffer;
-    RSTRING_GETMEM(obj, search.ptr, len);
-    search.cursor = search.ptr;
-    search.end = search.ptr + len;
-
-#ifdef HAVE_SIMD
-    search.matches_mask = 0;
-    search.has_matches = false;
-    search.chunk_base = NULL;
-#endif /* HAVE_SIMD */
-
-    switch (rb_enc_str_coderange(obj)) {
-        case ENC_CODERANGE_7BIT:
-        case ENC_CODERANGE_VALID:
-            if (RB_UNLIKELY(data->state->ascii_only)) {
-                convert_UTF8_to_ASCII_only_JSON(&search, data->state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
-            } else if (RB_UNLIKELY(data->state->script_safe)) {
-                convert_UTF8_to_script_safe_JSON(&search);
-            } else {
-                convert_UTF8_to_JSON(&search);
-            }
-            break;
-        default:
-            raise_generator_error(obj, "source sequence is illegal/malformed utf-8");
-            break;
-    }
-    fbuffer_append_char(buffer, '"');
-}
-
 static void generate_json_fallback(FBuffer *buffer, struct generate_json_data *data, VALUE obj)
 {
     VALUE tmp;
@@ -1408,7 +1456,16 @@ static void generate_json(FBuffer *buffer, struct generate_json_data *data, VALU
             break;
         case T_STRING:
             if (klass != rb_cString) goto general;
-            generate_json_string(buffer, data, obj);
+
+            if (RB_LIKELY(valid_json_string_p(obj))) {
+                raw_generate_json_string(buffer, data, obj);
+            } else if (as_json_called) {
+                raise_generator_error(obj, "source sequence is illegal/malformed utf-8");
+            } else {
+                obj = ensure_valid_encoding(data, obj, false, false);
+                as_json_called = true;
+                goto start;
+            }
             break;
         case T_SYMBOL:
             generate_json_symbol(buffer, data, obj);
diff --git a/java/src/json/ext/Generator.java b/java/src/json/ext/Generator.java
index be0e8257..78281dbd 100644
--- a/java/src/json/ext/Generator.java
+++ b/java/src/json/ext/Generator.java
@@ -606,6 +606,8 @@ private static IRubyObject castKey(ThreadContext context, IRubyObject key) {
         }
 
         private static void processEntry(ThreadContext context, Session session, OutputStream buffer, RubyHash.RubyHashEntry entry, boolean firstPair, ByteList objectNl, byte[] indent, ByteList spaceBefore, ByteList space) {
+            StringEncoder encoder = session.getStringEncoder(context);
+
             IRubyObject key = (IRubyObject) entry.getKey();
             IRubyObject value = (IRubyObject) entry.getValue();
 
@@ -619,7 +621,7 @@ private static void processEntry(ThreadContext context, Session session, OutputS
             Ruby runtime = context.runtime;
 
             IRubyObject keyStr = castKey(context, key);
-            if (keyStr == null || !(keyStr instanceof RubyString)) {
+            if (keyStr == null || !(keyStr instanceof RubyString) || !encoder.hasValidEncoding((RubyString)keyStr)) {
                 GeneratorState state = session.getState(context);
                 if (state.strict()) {
                     if (state.getAsJSON() != null) {
@@ -664,6 +666,19 @@ int guessSize(ThreadContext context, Session session, RubyString object) {
 
         @Override
         void generate(ThreadContext context, Session session, RubyString object, OutputStream buffer) throws IOException {
+            GeneratorState state = session.getState(context);
+            StringEncoder encoder = session.getStringEncoder(context);
+
+            if (state.strict() && !encoder.hasValidEncoding(object) && state.getAsJSON() != null) {
+                IRubyObject value = state.getAsJSON().call(context, object, context.getRuntime().getFalse());
+                if (value instanceof RubyString) {
+                    object = (RubyString)value;
+                } else {
+                    Handler handler = getHandlerFor(context.runtime, value);
+                    handler.generate(context, session, value, buffer);
+                    return;
+                }
+            }
             generateString(context, session, object, buffer);
         }
     }
diff --git a/java/src/json/ext/StringEncoder.java b/java/src/json/ext/StringEncoder.java
index b874ad78..4eea11c6 100644
--- a/java/src/json/ext/StringEncoder.java
+++ b/java/src/json/ext/StringEncoder.java
@@ -208,6 +208,17 @@ void generate(ThreadContext context, RubyString object, OutputStream buffer) thr
         append('"');
     }
 
+    static boolean hasValidEncoding(RubyString str) {
+        switch (str.scanForCodeRange()) {
+            case StringSupport.CR_7BIT:
+                return true;
+            case StringSupport.CR_VALID:
+                return str.getEncoding() == UTF8Encoding.INSTANCE || str.getEncoding() == USASCIIEncoding.INSTANCE;
+            default:
+                return false;
+        }
+    }
+
     static RubyString ensureValidEncoding(ThreadContext context, RubyString str) {
         Encoding encoding = str.getEncoding();
 
diff --git a/lib/json/truffle_ruby/generator.rb b/lib/json/truffle_ruby/generator.rb
index 703211cb..feed4c79 100644
--- a/lib/json/truffle_ruby/generator.rb
+++ b/lib/json/truffle_ruby/generator.rb
@@ -55,6 +55,11 @@ def self.native_key?(key) # :nodoc:
         (Symbol === key || String === key)
       end
 
+      def self.valid_encoding?(string) # :nodoc:
+        return false unless string.encoding == ::Encoding::UTF_8 || string.encoding == ::Encoding::US_ASCII
+        string.is_a?(Symbol) || string.valid_encoding?
+      end
+
       # Convert a UTF8 encoded Ruby string _string_ to a JSON string, encoded with
       # UTF16 big endian characters as \u????, and return it.
       def self.utf8_to_json(string, script_safe = false) # :nodoc:
@@ -521,13 +526,17 @@ def json_transform(state)
               end
               result << state.indent * depth if indent
 
-              if state.strict? && !Generator.native_key?(key)
-                if state.as_json
+              if state.strict?
+                if state.as_json && (!Generator.native_key?(key) || !Generator.valid_encoding?(key))
                   key = state.as_json.call(key, true)
                 end
 
                 unless Generator.native_key?(key)
-                  raise GeneratorError.new("#{key.class} not allowed as object key in JSON", value)
+                  raise GeneratorError.new("#{key.class} not allowed as object key in JSON", key)
+                end
+
+                unless Generator.valid_encoding?(key)
+                  raise GeneratorError.new("source sequence is illegal/malformed utf-8", key)
                 end
               end
 
@@ -674,14 +683,25 @@ module String
           # \u????.
           def to_json(state = nil, *args)
             state = State.from_state(state)
-            if encoding == ::Encoding::UTF_8
-              unless valid_encoding?
+            string = self
+
+            if state.strict? && state.as_json
+              unless Generator.valid_encoding?(string)
+                string = state.as_json.call(string, false)
+                unless string.is_a?(::String)
+                  return string.to_json(state, *args)
+                end
+              end
+            end
+
+            if string.encoding == ::Encoding::UTF_8
+              unless string.valid_encoding?
                 raise GeneratorError.new("source sequence is illegal/malformed utf-8", self)
               end
-              string = self
             else
-              string = encode(::Encoding::UTF_8)
+              string = string.encode(::Encoding::UTF_8)
             end
+
             if state.ascii_only?
               %("#{JSON::TruffleRuby::Generator.utf8_to_json_ascii(string, state.script_safe)}")
             else
diff --git a/test/json/json_coder_test.rb b/test/json/json_coder_test.rb
index fb9d7b30..83b89a3b 100755
--- a/test/json/json_coder_test.rb
+++ b/test/json/json_coder_test.rb
@@ -67,6 +67,71 @@ def test_json_coder_dump_NaN_or_Infinity_loop
     assert_include error.message, "NaN not allowed in JSON"
   end
 
+  def test_json_coder_string_invalid_encoding
+    calls = 0
+    coder = JSON::Coder.new do |object, is_key|
+      calls += 1
+      object
+    end
+
+    error = assert_raise JSON::GeneratorError do
+      coder.dump("\xFF")
+    end
+    assert_equal "source sequence is illegal/malformed utf-8", error.message
+    assert_equal 1, calls
+
+    error = assert_raise JSON::GeneratorError do
+      coder.dump({ "\xFF" => 1 })
+    end
+    assert_equal "source sequence is illegal/malformed utf-8", error.message
+    assert_equal 2, calls
+
+    calls = 0
+    coder = JSON::Coder.new do |object, is_key|
+      calls += 1
+      object.dup
+    end
+
+    error = assert_raise JSON::GeneratorError do
+      coder.dump("\xFF")
+    end
+    assert_equal "source sequence is illegal/malformed utf-8", error.message
+    assert_equal 1, calls
+
+    error = assert_raise JSON::GeneratorError do
+      coder.dump({ "\xFF" => 1 })
+    end
+    assert_equal "source sequence is illegal/malformed utf-8", error.message
+    assert_equal 2, calls
+
+    calls = 0
+    coder = JSON::Coder.new do |object, is_key|
+      calls += 1
+      object.bytes
+    end
+
+    assert_equal "[255]", coder.dump("\xFF")
+    assert_equal 1, calls
+
+    error = assert_raise JSON::GeneratorError do
+      coder.dump({ "\xFF" => 1 })
+    end
+    assert_equal "Array not allowed as object key in JSON", error.message
+    assert_equal 2, calls
+
+    calls = 0
+    coder = JSON::Coder.new do |object, is_key|
+      calls += 1
+      [object].pack("m")
+    end
+
+    assert_equal '"/w==\\n"', coder.dump("\xFF")
+    assert_equal 1, calls
+
+    assert_equal '{"/w==\\n":1}', coder.dump({ "\xFF" => 1 })
+    assert_equal 2, calls
+  end
+
   def test_nesting_recovery
     coder = JSON::Coder.new
     ary = []