Skip to content

Commit 6201ae1

Browse files
hsbt and claude committed
Round the io_reader clamp down to a character boundary
Clamping the over-read at exactly the requested size could split a multibyte character, since the string an IO returns may carry a non-binary encoding. Round the cut down to the last character boundary at or before the size so the bytes handed to libyaml are always whole characters.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 99ecd94 commit 6201ae1

2 files changed

Lines changed: 26 additions & 3 deletions

File tree

ext/psych/psych_parser.c

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -32,13 +32,17 @@ static int io_reader(void * data, unsigned char *buf, size_t size, size_t *read)
3232
*read = 0;
3333

3434
if(! NIL_P(string)) {
35-
void * str = (void *)StringValuePtr(string);
35+
char * str = StringValuePtr(string);
3636
size_t len = (size_t)RSTRING_LEN(string);
3737

3838
/* IO#read(size) is documented to return at most `size` bytes, but a
3939
* misbehaving IO-like object may return more. Clamp the copy to the
40-
* buffer libyaml gave us to avoid writing past its end. */
41-
if(len > size) len = size;
40+
* buffer libyaml gave us to avoid writing past its end, rounding down
41+
* to a character boundary so a multibyte character is never split. */
42+
if(len > size) {
43+
rb_encoding * enc = rb_enc_get(string);
44+
len = (size_t)(rb_enc_left_char_head(str, str + size, str + len, enc) - str);
45+
}
4246

4347
*read = len;
4448
memcpy(buf, str, len);

test/psych/test_parser.rb

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -221,6 +221,25 @@ def io.read len
221221
assert_called :end_stream
222222
end
223223

224+
def test_parse_io_returns_more_bytes_than_requested_multibyte
225+
# The over-read is rounded down to a character boundary so a multibyte
226+
# character is never split when the copy is clamped.
227+
io = Object.new
228+
def io.external_encoding; Encoding::UTF_8 end
229+
def io.read len
230+
return nil if @done
231+
@done = true
232+
"--- a\n#" + ("あ" * (len + (1 << 20)))
233+
end
234+
235+
begin
236+
@parser.parse io
237+
rescue IOError
238+
return
239+
end
240+
assert_called :scalar
241+
end
242+
224243
def test_syntax_error
225244
assert_raise(Psych::SyntaxError) do
226245
@parser.parse("---\n\"foo\"\n\"bar\"\n")

0 commit comments

Comments
 (0)