Skip to content

Commit 119fc2d

Browse files
committed
Fix lex compat with BOM
* BOM should not impact looking for the encoding string
* We should re-encode tokens when the encoding changes
* BOM should change the column of comments only
1 parent 22ff2e6 commit 119fc2d

File tree

4 files changed

+116
-15
lines changed

4 files changed

+116
-15
lines changed

ext/yarp/extension.c

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,20 @@ static void
221221
lex_encoding_changed_callback(yp_parser_t *parser) {
    lex_data_t *lex_data = (lex_data_t *) parser->lex_callback->data;
    lex_data->encoding = rb_enc_find(parser->encoding.name);

    // The instance-variable name is loop-invariant; resolve the ID once
    // instead of re-interning it for every token.
    ID value_id = rb_intern("@value");

    // Since we got a new encoding, we need to go back and change the encoding
    // of the tokens that we've already lexed. This should be a tiny amount
    // since encoding magic comments need to be the first or second line of the
    // file.
    VALUE tokens = lex_data->tokens;
    for (long index = 0; index < RARRAY_LEN(tokens); index++) {
        // Each entry holds the values yielded for one token; the token object
        // itself is the first element.
        VALUE yields = rb_ary_entry(tokens, index);
        VALUE token = rb_ary_entry(yields, 0);

        // Re-associate the token's string value with the new encoding and
        // clear the cached coderange so it gets recomputed lazily.
        VALUE value = rb_ivar_get(token, value_id);
        rb_enc_associate(value, lex_data->encoding);
        ENC_CODERANGE_CLEAR(value);
    }
}
225239

226240
// Return an array of tokens corresponding to the given source.

lib/yarp/lex_compat.rb

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -574,19 +574,41 @@ def result
574574
result = YARP.lex(source, @filepath)
575575
result_value = result.value
576576
previous_state = nil
577-
578-
# If there's a UTF-8 byte-order mark as the start of the file, then ripper
579-
# sets every token's on the first line back by 6 bytes. It also keeps the
580-
# byte order mark in the first token's value. This is weird, and I don't
581-
# want to mirror that in our parser. So instead, we'll match up the values
582-
# here, and then match up the locations as we process the tokens.
583-
bom = source.bytes[0..2] == [0xEF, 0xBB, 0xBF]
584-
result_value[0][0].value.prepend("\xEF\xBB\xBF") if bom
577+
bom = source.byteslice(0..2) == "\xEF\xBB\xBF"
585578

586579
result_value.each_with_index do |(token, lex_state), index|
587580
lineno = token.location.start_line
588581
column = token.location.start_column
589-
column -= index == 0 ? 6 : 3 if bom && lineno == 1
582+
583+
# If there's a UTF-8 byte-order mark as the start of the file, then for
584+
# certain tokens ripper sets the first token back by 3 bytes. It also
585+
# keeps the byte order mark in the first token's value. This is weird,
586+
# and I don't want to mirror that in our parser. So instead, we'll match
587+
# up the columns and values here.
588+
if bom && lineno == 1
589+
column -= 3
590+
591+
if index == 0 && column == 0
592+
flushed =
593+
case token.type
594+
when :BACK_REFERENCE, :INSTANCE_VARIABLE, :CLASS_VARIABLE,
595+
:GLOBAL_VARIABLE, :NUMBERED_REFERENCE, :PERCENT_LOWER_I,
596+
:PERCENT_LOWER_X, :PERCENT_LOWER_W, :PERCENT_UPPER_I,
597+
:PERCENT_UPPER_W, :STRING_BEGIN
598+
true
599+
when :REGEXP_BEGIN, :SYMBOL_BEGIN
600+
token.value.start_with?("%")
601+
else
602+
false
603+
end
604+
605+
unless flushed
606+
column -= 3
607+
value = token.value
608+
value.prepend(String.new("\xEF\xBB\xBF", encoding: value.encoding))
609+
end
610+
end
611+
end
590612

591613
event = RIPPER.fetch(token.type)
592614
value = token.value
@@ -668,6 +690,11 @@ def result
668690
end_offset = token.location.start_offset
669691

670692
if previous_token.type == :COMMENT && start_offset < end_offset
693+
if bom
694+
start_offset += 3
695+
end_offset += 3
696+
end
697+
671698
tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
672699
end
673700

src/yarp.c

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12878,6 +12878,8 @@ yp_parser_metadata(yp_parser_t *parser, const char *metadata) {
1287812878
// Initialize a parser with the given start and end pointers.
1287912879
YP_EXPORTED_FUNCTION void
1288012880
yp_parser_init(yp_parser_t *parser, const char *source, size_t size, const char *filepath) {
12881+
assert(source != NULL);
12882+
1288112883
// Set filepath to the file that was passed
1288212884
if (!filepath) filepath = "";
1288312885
yp_string_t filepath_string;
@@ -12946,14 +12948,15 @@ yp_parser_init(yp_parser_t *parser, const char *source, size_t size, const char
1294612948
size_t newline_size = size / 22;
1294712949
yp_newline_list_init(&parser->newline_list, source, newline_size < 4 ? 4 : newline_size);
1294812950

12949-
assert(source != NULL);
12951+
// Skip past the UTF-8 BOM if it exists.
1295012952
if (size >= 3 && (unsigned char) source[0] == 0xef && (unsigned char) source[1] == 0xbb && (unsigned char) source[2] == 0xbf) {
12951-
// If the first three bytes of the source are the UTF-8 BOM, then we'll skip
12952-
// over them.
1295312953
parser->current.end += 3;
12954-
} else if (size >= 2 && source[0] == '#' && source[1] == '!') {
12955-
// If the first two bytes of the source are a shebang, then we'll indicate
12956-
// that the encoding comment is at the end of the shebang.
12954+
parser->encoding_comment_start += 3;
12955+
}
12956+
12957+
// If the first two bytes of the source are a shebang, then we'll indicate
12958+
// that the encoding comment is at the end of the shebang.
12959+
if (peek(parser) == '#' && peek_offset(parser, 1) == '!') {
1295712960
const char *encoding_comment_start = next_newline(source, (ptrdiff_t) size);
1295812961
if (encoding_comment_start) {
1295912962
parser->encoding_comment_start = encoding_comment_start + 1;

test/bom_test.rb

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# frozen_string_literal: true
2+
3+
# Don't bother checking this on these engines, this is such a specific Ripper
4+
# test.
5+
return if RUBY_ENGINE == "jruby" || RUBY_ENGINE == "truffleruby"
6+
7+
require "yarp_test_helper"
8+
9+
# Verifies that lexing a source string prefixed with a UTF-8 byte-order mark
# produces the same tokens from YARP's lex_compat as from Ripper itself.
class BOMTest < Test::Unit::TestCase
  def test_ident
    assert_bom("foo")
  end

  def test_back_reference
    assert_bom("$+")
  end

  def test_instance_variable
    assert_bom("@foo")
  end

  def test_class_variable
    assert_bom("@@foo")
  end

  def test_global_variable
    assert_bom("$foo")
  end

  def test_numbered_reference
    assert_bom("$1")
  end

  def test_percents
    # Every flavor of percent literal should round-trip identically.
    percents = ["%i[]", "%r[]", "%s[]", "%q{}", "%w[]", "%x[]", "%I[]", "%W[]", "%Q{}"]
    percents.each { |percent| assert_bom(percent) }
  end

  def test_string
    assert_bom("\"\"")
    assert_bom("''")
  end

  private

  # Prepends a UTF-8 BOM to the given source and asserts that Ripper and
  # YARP's Ripper-compatible lexer agree on the resulting token stream.
  def assert_bom(source)
    with_bom = "\xEF\xBB\xBF#{source}"
    assert_equal YARP.lex_ripper(with_bom), YARP.lex_compat(with_bom).value
  end
end

0 commit comments

Comments
 (0)