Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

### Unreleased

* `JSON::Coder` now also yields to the block when encountering strings with invalid encoding.
* Fix GeneratorError messages to be UTF-8 encoded.
* Fix memory leak when `Exception` is raised, or `throw` is used during JSON generation.
* Optimized floating point number parsing by integrating the ryu algorithm (thanks to Josef Šimánek).
Expand Down
18 changes: 17 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,23 @@ puts MyApp::API_JSON_CODER.dump(Time.now.utc) # => "2025-01-21T08:41:44.286Z"
The provided block is called for all objects that don't have a native JSON equivalent, and
must return a Ruby object that has a native JSON equivalent.

It is also called for objects that do have a JSON equivalent, but are used as Hash keys, for instance `{ 1 => 2}`.
It is also called for objects that do have a JSON equivalent, but are used as Hash keys, for instance `{ 1 => 2}`,
as well as for strings that aren't valid UTF-8:

```ruby
coder = JSON::Combining.new do |object, is_object_key|
case object
when String
if !string.valid_encoding? || string.encoding != Encoding::UTF_8
Base64.encode64(string)
else
string
end
else
object
end
end
```

## Combining JSON fragments

Expand Down
237 changes: 142 additions & 95 deletions ext/json/ext/generator/generator.c
Original file line number Diff line number Diff line change
Expand Up @@ -996,13 +996,12 @@ static inline VALUE vstate_get(struct generate_json_data *data)
return data->vstate;
}

struct hash_foreach_arg {
VALUE hash;
struct generate_json_data *data;
int first_key_type;
bool first;
bool mixed_keys_encountered;
};
static VALUE
json_call_as_json(JSON_Generator_State *state, VALUE object, VALUE is_key)
{
VALUE proc_args[2] = {object, is_key};
return rb_proc_call_with_block(state->as_json, 2, proc_args, Qnil);
}

static VALUE
convert_string_subclass(VALUE key)
Expand All @@ -1019,6 +1018,129 @@ convert_string_subclass(VALUE key)
return key_to_s;
}

static bool enc_utf8_compatible_p(int enc_idx)
{
if (enc_idx == usascii_encindex) return true;
if (enc_idx == utf8_encindex) return true;
return false;
}

static VALUE encode_json_string_try(VALUE str)
{
return rb_funcall(str, i_encode, 1, Encoding_UTF_8);
}

static VALUE encode_json_string_rescue(VALUE str, VALUE exception)
{
raise_generator_error_str(str, rb_funcall(exception, rb_intern("message"), 0));
return Qundef;
}

static inline bool valid_json_string_p(VALUE str)
{
int coderange = rb_enc_str_coderange(str);

if (RB_LIKELY(coderange == ENC_CODERANGE_7BIT)) {
return true;
}

if (RB_LIKELY(coderange == ENC_CODERANGE_VALID)) {
return enc_utf8_compatible_p(RB_ENCODING_GET_INLINED(str));
}

return false;
}

static inline VALUE ensure_valid_encoding(struct generate_json_data *data, VALUE str, bool as_json_called, bool is_key)
{
if (RB_LIKELY(valid_json_string_p(str))) {
return str;
}

if (!as_json_called && data->state->strict && RTEST(data->state->as_json)) {
VALUE coerced_str = json_call_as_json(data->state, str, Qfalse);
if (coerced_str != str) {
if (RB_TYPE_P(coerced_str, T_STRING)) {
if (!valid_json_string_p(coerced_str)) {
raise_generator_error(str, "source sequence is illegal/malformed utf-8");
}
} else {
// as_json could return another type than T_STRING
if (is_key) {
raise_generator_error(coerced_str, "%"PRIsVALUE" not allowed as object key in JSON", CLASS_OF(coerced_str));
}
}

return coerced_str;
}
}

if (RB_ENCODING_GET_INLINED(str) == binary_encindex) {
VALUE utf8_string = rb_enc_associate_index(rb_str_dup(str), utf8_encindex);
switch (rb_enc_str_coderange(utf8_string)) {
case ENC_CODERANGE_7BIT:
return utf8_string;
case ENC_CODERANGE_VALID:
// For historical reason, we silently reinterpret binary strings as UTF-8 if it would work.
// TODO: Raise in 3.0.0
rb_warn("JSON.generate: UTF-8 string passed as BINARY, this will raise an encoding error in json 3.0");
return utf8_string;
break;
}
}

return rb_rescue(encode_json_string_try, str, encode_json_string_rescue, str);
}

static void raw_generate_json_string(FBuffer *buffer, struct generate_json_data *data, VALUE obj)
{
fbuffer_append_char(buffer, '"');

long len;
search_state search;
search.buffer = buffer;
RSTRING_GETMEM(obj, search.ptr, len);
search.cursor = search.ptr;
search.end = search.ptr + len;

#ifdef HAVE_SIMD
search.matches_mask = 0;
search.has_matches = false;
search.chunk_base = NULL;
#endif /* HAVE_SIMD */

switch (rb_enc_str_coderange(obj)) {
case ENC_CODERANGE_7BIT:
case ENC_CODERANGE_VALID:
if (RB_UNLIKELY(data->state->ascii_only)) {
convert_UTF8_to_ASCII_only_JSON(&search, data->state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
} else if (RB_UNLIKELY(data->state->script_safe)) {
convert_UTF8_to_script_safe_JSON(&search);
} else {
convert_UTF8_to_JSON(&search);
}
break;
default:
raise_generator_error(obj, "source sequence is illegal/malformed utf-8");
break;
}
fbuffer_append_char(buffer, '"');
}

static void generate_json_string(FBuffer *buffer, struct generate_json_data *data, VALUE obj)
{
obj = ensure_valid_encoding(data, obj, false, false);
raw_generate_json_string(buffer, data, obj);
}

struct hash_foreach_arg {
VALUE hash;
struct generate_json_data *data;
int first_key_type;
bool first;
bool mixed_keys_encountered;
};

NOINLINE()
static void
json_inspect_hash_with_mixed_keys(struct hash_foreach_arg *arg)
Expand All @@ -1035,13 +1157,6 @@ json_inspect_hash_with_mixed_keys(struct hash_foreach_arg *arg)
}
}

static VALUE
json_call_as_json(JSON_Generator_State *state, VALUE object, VALUE is_key)
{
VALUE proc_args[2] = {object, is_key};
return rb_proc_call_with_block(state->as_json, 2, proc_args, Qnil);
}

static int
json_object_i(VALUE key, VALUE val, VALUE _arg)
{
Expand Down Expand Up @@ -1107,8 +1222,10 @@ json_object_i(VALUE key, VALUE val, VALUE _arg)
break;
}

key_to_s = ensure_valid_encoding(data, key_to_s, as_json_called, true);

if (RB_LIKELY(RBASIC_CLASS(key_to_s) == rb_cString)) {
generate_json_string(buffer, data, key_to_s);
raw_generate_json_string(buffer, data, key_to_s);
} else {
generate_json(buffer, data, key_to_s);
}
Expand Down Expand Up @@ -1191,85 +1308,6 @@ static void generate_json_array(FBuffer *buffer, struct generate_json_data *data
fbuffer_append_char(buffer, ']');
}

static inline int enc_utf8_compatible_p(int enc_idx)
{
if (enc_idx == usascii_encindex) return 1;
if (enc_idx == utf8_encindex) return 1;
return 0;
}

static VALUE encode_json_string_try(VALUE str)
{
return rb_funcall(str, i_encode, 1, Encoding_UTF_8);
}

static VALUE encode_json_string_rescue(VALUE str, VALUE exception)
{
raise_generator_error_str(str, rb_funcall(exception, rb_intern("message"), 0));
return Qundef;
}

static inline VALUE ensure_valid_encoding(VALUE str)
{
int encindex = RB_ENCODING_GET(str);
VALUE utf8_string;
if (RB_UNLIKELY(!enc_utf8_compatible_p(encindex))) {
if (encindex == binary_encindex) {
utf8_string = rb_enc_associate_index(rb_str_dup(str), utf8_encindex);
switch (rb_enc_str_coderange(utf8_string)) {
case ENC_CODERANGE_7BIT:
return utf8_string;
case ENC_CODERANGE_VALID:
// For historical reason, we silently reinterpret binary strings as UTF-8 if it would work.
// TODO: Raise in 3.0.0
rb_warn("JSON.generate: UTF-8 string passed as BINARY, this will raise an encoding error in json 3.0");
return utf8_string;
break;
}
}

str = rb_rescue(encode_json_string_try, str, encode_json_string_rescue, str);
}
return str;
}

static void generate_json_string(FBuffer *buffer, struct generate_json_data *data, VALUE obj)
{
obj = ensure_valid_encoding(obj);

fbuffer_append_char(buffer, '"');

long len;
search_state search;
search.buffer = buffer;
RSTRING_GETMEM(obj, search.ptr, len);
search.cursor = search.ptr;
search.end = search.ptr + len;

#ifdef HAVE_SIMD
search.matches_mask = 0;
search.has_matches = false;
search.chunk_base = NULL;
#endif /* HAVE_SIMD */

switch (rb_enc_str_coderange(obj)) {
case ENC_CODERANGE_7BIT:
case ENC_CODERANGE_VALID:
if (RB_UNLIKELY(data->state->ascii_only)) {
convert_UTF8_to_ASCII_only_JSON(&search, data->state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
} else if (RB_UNLIKELY(data->state->script_safe)) {
convert_UTF8_to_script_safe_JSON(&search);
} else {
convert_UTF8_to_JSON(&search);
}
break;
default:
raise_generator_error(obj, "source sequence is illegal/malformed utf-8");
break;
}
fbuffer_append_char(buffer, '"');
}

static void generate_json_fallback(FBuffer *buffer, struct generate_json_data *data, VALUE obj)
{
VALUE tmp;
Expand Down Expand Up @@ -1408,7 +1446,16 @@ static void generate_json(FBuffer *buffer, struct generate_json_data *data, VALU
break;
case T_STRING:
if (klass != rb_cString) goto general;
generate_json_string(buffer, data, obj);

if (RB_LIKELY(valid_json_string_p(obj))) {
raw_generate_json_string(buffer, data, obj);
} else if (as_json_called) {
raise_generator_error(obj, "source sequence is illegal/malformed utf-8");
} else {
obj = ensure_valid_encoding(data, obj, false, false);
as_json_called = true;
goto start;
}
break;
case T_SYMBOL:
generate_json_symbol(buffer, data, obj);
Expand Down
17 changes: 16 additions & 1 deletion java/src/json/ext/Generator.java
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,8 @@ private static IRubyObject castKey(ThreadContext context, IRubyObject key) {
}

private static void processEntry(ThreadContext context, Session session, OutputStream buffer, RubyHash.RubyHashEntry entry, boolean firstPair, ByteList objectNl, byte[] indent, ByteList spaceBefore, ByteList space) {
StringEncoder encoder = session.getStringEncoder(context);

IRubyObject key = (IRubyObject) entry.getKey();
IRubyObject value = (IRubyObject) entry.getValue();

Expand All @@ -619,7 +621,7 @@ private static void processEntry(ThreadContext context, Session session, OutputS
Ruby runtime = context.runtime;

IRubyObject keyStr = castKey(context, key);
if (keyStr == null || !(keyStr instanceof RubyString)) {
if (keyStr == null || !(keyStr instanceof RubyString) || !encoder.hasValidEncoding((RubyString)keyStr)) {
GeneratorState state = session.getState(context);
if (state.strict()) {
if (state.getAsJSON() != null) {
Expand Down Expand Up @@ -664,6 +666,19 @@ int guessSize(ThreadContext context, Session session, RubyString object) {

@Override
void generate(ThreadContext context, Session session, RubyString object, OutputStream buffer) throws IOException {
GeneratorState state = session.getState(context);
StringEncoder encoder = session.getStringEncoder(context);

if (state.strict() && !encoder.hasValidEncoding(object) && state.getAsJSON() != null) {
IRubyObject value = state.getAsJSON().call(context, object, context.getRuntime().getFalse());
if (value instanceof RubyString) {
object = (RubyString)value;
} else {
Handler handler = getHandlerFor(context.runtime, value);
handler.generate(context, session, value, buffer);
return;
}
}
generateString(context, session, object, buffer);
}
}
Expand Down
11 changes: 11 additions & 0 deletions java/src/json/ext/StringEncoder.java
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,17 @@ void generate(ThreadContext context, RubyString object, OutputStream buffer) thr
append('"');
}

static boolean hasValidEncoding(RubyString str) {
switch (str.scanForCodeRange()) {
case StringSupport.CR_7BIT:
return true;
case StringSupport.CR_VALID:
return str.getEncoding() == UTF8Encoding.INSTANCE || str.getEncoding() == USASCIIEncoding.INSTANCE;
default:
return false;
}
}

static RubyString ensureValidEncoding(ThreadContext context, RubyString str) {
Encoding encoding = str.getEncoding();

Expand Down
Loading
Loading