Skip to content

Commit b249631

Browse files
committed
Supported BOM
1 parent 53def32 commit b249631

File tree

2 files changed

+95
-0
lines changed

2 files changed

+95
-0
lines changed

ext/stringio/stringio.c

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,73 @@ strio_initialize(int argc, VALUE *argv, VALUE self)
262262
return strio_init(argc, argv, ptr, self);
263263
}
264264

265+
static int
266+
detect_bom(VALUE str, int *bomlen)
267+
{
268+
const char *p;
269+
long len;
270+
271+
RSTRING_GETMEM(str, p, len);
272+
if (len < 1) return 0;
273+
switch ((unsigned char)p[0]) {
274+
case 0xEF:
275+
if (len < 2) break;
276+
if ((unsigned char)p[1] == 0xBB && len > 2) {
277+
if ((unsigned char)p[2] == 0xBF) {
278+
*bomlen = 3;
279+
return rb_utf8_encindex();
280+
}
281+
}
282+
break;
283+
284+
case 0xFE:
285+
if (len < 2) break;
286+
if ((unsigned char)p[1] == 0xFF) {
287+
*bomlen = 2;
288+
return rb_enc_find_index("UTF-16BE");
289+
}
290+
break;
291+
292+
case 0xFF:
293+
if (len < 2) break;
294+
if ((unsigned char)p[1] == 0xFE) {
295+
if (len >= 4 && (unsigned char)p[2] == 0 && (unsigned char)p[3] == 0) {
296+
*bomlen = 4;
297+
return rb_enc_find_index("UTF-32LE");
298+
}
299+
*bomlen = 2;
300+
return rb_enc_find_index("UTF-16LE");
301+
}
302+
break;
303+
304+
case 0:
305+
if (len < 4) break;
306+
if ((unsigned char)p[1] == 0 && (unsigned char)p[2] == 0xFE & (unsigned char)p[3] == 0xFF) {
307+
*bomlen = 4;
308+
return rb_enc_find_index("UTF-32BE");
309+
}
310+
break;
311+
}
312+
return 0;
313+
}
314+
315+
static rb_encoding *
316+
set_encoding_by_bom(struct StringIO *ptr)
317+
{
318+
int bomlen, idx = detect_bom(ptr->string, &bomlen);
319+
rb_encoding *extenc = NULL;
320+
321+
if (idx) {
322+
extenc = rb_enc_from_index(idx);
323+
ptr->pos = bomlen;
324+
if (ptr->flags & FMODE_WRITABLE) {
325+
rb_enc_associate_index(ptr->string, idx);
326+
}
327+
}
328+
ptr->enc = extenc;
329+
return extenc;
330+
}
331+
265332
static VALUE
266333
strio_init(int argc, VALUE *argv, struct StringIO *ptr, VALUE self)
267334
{
@@ -294,6 +361,7 @@ strio_init(int argc, VALUE *argv, struct StringIO *ptr, VALUE self)
294361
ptr->enc = convconfig.enc;
295362
ptr->pos = 0;
296363
ptr->lineno = 0;
364+
if (ptr->flags & FMODE_SETENC_BY_BOM) set_encoding_by_bom(ptr);
297365
RBASIC(self)->flags |= (ptr->flags & FMODE_READWRITE) * (STRIO_READABLE / FMODE_READABLE);
298366
return self;
299367
}
@@ -1677,6 +1745,18 @@ strio_set_encoding(int argc, VALUE *argv, VALUE self)
16771745
return self;
16781746
}
16791747

1748+
static VALUE
1749+
strio_set_encoding_by_bom(VALUE self)
1750+
{
1751+
struct StringIO *ptr = StringIO(self);
1752+
1753+
if (ptr->enc) {
1754+
rb_raise(rb_eArgError, "encoding conversion is set");
1755+
}
1756+
if (!set_encoding_by_bom(ptr)) return Qnil;
1757+
return rb_enc_from_encoding(ptr->enc);
1758+
}
1759+
16801760
/*
16811761
* Pseudo I/O on String object.
16821762
*
@@ -1778,6 +1858,7 @@ Init_stringio(void)
17781858
rb_define_method(StringIO, "external_encoding", strio_external_encoding, 0);
17791859
rb_define_method(StringIO, "internal_encoding", strio_internal_encoding, 0);
17801860
rb_define_method(StringIO, "set_encoding", strio_set_encoding, -1);
1861+
rb_define_method(StringIO, "set_encoding_by_bom", strio_set_encoding_by_bom, 0);
17811862

17821863
{
17831864
VALUE mReadable = rb_define_module_under(rb_cIO, "generic_readable");

test/stringio/test_stringio.rb

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -795,6 +795,20 @@ def test_encoding_read
795795
assert_equal("\0\0\0a\0\0\0b\0\0\0c", s.read)
796796
end
797797

798+
# Define one BOM-stripping test per Unicode encoding form.
%w[UTF-8 UTF-16BE UTF-16LE UTF-32BE UTF-32LE].each do |name|
  define_method("test_strip_bom:#{name}") do
    expected_encoding = Encoding.find(name)
    # U+FEFF (the BOM) followed by two ordinary characters.
    encoded = "\uFEFF\u0100a".encode(name)

    # Opening with the BOM| mode prefix should consume the BOM and tag the
    # data that is read with the detected encoding.
    body = StringIO.new(encoded, mode: 'rb:BOM|UTF-8').read
    assert_equal(expected_encoding, body.encoding, name)
    assert_equal(encoded[1..-1].b, body.b, name)

    # The explicit method should report the same encoding.
    StringIO.open(encoded) do |io|
      assert_equal(expected_encoding, io.set_encoding_by_bom)
    end
  end
end
811+
798812
# Assert that +str+ has both the expected content and the expected encoding.
def assert_string(content, encoding, str, mesg = nil)
  expected = [content, encoding]
  actual = [str, str.encoding]
  assert_equal(expected, actual, mesg)
end

0 commit comments

Comments
 (0)