Skip to content

Commit 836a35f

Browse files
committed
Fix up multibyte escapes
1 parent 6e432f3 commit 836a35f

File tree

2 files changed

+95
-34
lines changed

2 files changed

+95
-34
lines changed

src/prism.c

Lines changed: 82 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -7674,6 +7674,28 @@ escape_write_byte_encoded(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t byte
76747674
pm_buffer_append_byte(buffer, byte);
76757675
}
76767676

7677+
/**
7678+
* Write each byte of the given escaped character into the buffer.
7679+
*/
7680+
static inline void
7681+
escape_write_escape_encoded(pm_parser_t *parser, pm_buffer_t *buffer) {
7682+
size_t width;
7683+
if (parser->encoding_changed) {
7684+
width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end);
7685+
} else {
7686+
width = pm_encoding_utf_8_char_width(parser->current.end, parser->end - parser->current.end);
7687+
}
7688+
7689+
// TODO: If the character is invalid in the given encoding, then we'll just
7690+
// push one byte into the buffer. This should actually be an error.
7691+
width = (width == 0) ? 1 : width;
7692+
7693+
for (size_t index = 0; index < width; index++) {
7694+
escape_write_byte_encoded(parser, buffer, *parser->current.end);
7695+
parser->current.end++;
7696+
}
7697+
}
7698+
76777699
/**
76787700
* The regular expression engine doesn't support the same escape sequences as
76797701
* Ruby does. So first we have to read the escape sequence, and then we have to
@@ -8012,7 +8034,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
80128034
/* fallthrough */
80138035
default: {
80148036
if (parser->current.end < parser->end) {
8015-
escape_write_byte_encoded(parser, buffer, *parser->current.end++);
8037+
escape_write_escape_encoded(parser, buffer);
80168038
}
80178039
return;
80188040
}
@@ -8289,10 +8311,40 @@ typedef struct {
82898311
* Push the given byte into the token buffer.
82908312
*/
82918313
static inline void
8292-
pm_token_buffer_push(pm_token_buffer_t *token_buffer, uint8_t byte) {
8314+
pm_token_buffer_push_byte(pm_token_buffer_t *token_buffer, uint8_t byte) {
82938315
pm_buffer_append_byte(&token_buffer->buffer, byte);
82948316
}
82958317

8318+
/**
8319+
* Append the given bytes into the token buffer.
8320+
*/
8321+
static inline void
8322+
pm_token_buffer_push_bytes(pm_token_buffer_t *token_buffer, const uint8_t *bytes, size_t length) {
8323+
pm_buffer_append_bytes(&token_buffer->buffer, bytes, length);
8324+
}
8325+
8326+
/**
8327+
* Push an escaped character into the token buffer.
8328+
*/
8329+
static inline void
8330+
pm_token_buffer_push_escaped(pm_token_buffer_t *token_buffer, pm_parser_t *parser) {
8331+
// First, determine the width of the character to be escaped.
8332+
size_t width;
8333+
if (parser->encoding_changed) {
8334+
width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end);
8335+
} else {
8336+
width = pm_encoding_utf_8_char_width(parser->current.end, parser->end - parser->current.end);
8337+
}
8338+
8339+
// TODO: If the character is invalid in the given encoding, then we'll just
8340+
// push one byte into the buffer. This should actually be an error.
8341+
width = (width == 0 ? 1 : width);
8342+
8343+
// Now, push the bytes into the buffer.
8344+
pm_token_buffer_push_bytes(token_buffer, parser->current.end, width);
8345+
parser->current.end += width;
8346+
}
8347+
82968348
/**
82978349
* When we're about to return from lexing the current token and we know for sure
82988350
* that we have found an escape sequence, this function is called to copy the
@@ -9705,18 +9757,18 @@ parser_lex(pm_parser_t *parser) {
97059757
case '\t':
97069758
case '\v':
97079759
case '\\':
9708-
pm_token_buffer_push(&token_buffer, peeked);
9760+
pm_token_buffer_push_byte(&token_buffer, peeked);
97099761
parser->current.end++;
97109762
break;
97119763
case '\r':
97129764
parser->current.end++;
97139765
if (peek(parser) != '\n') {
9714-
pm_token_buffer_push(&token_buffer, '\r');
9766+
pm_token_buffer_push_byte(&token_buffer, '\r');
97159767
break;
97169768
}
97179769
/* fallthrough */
97189770
case '\n':
9719-
pm_token_buffer_push(&token_buffer, '\n');
9771+
pm_token_buffer_push_byte(&token_buffer, '\n');
97209772

97219773
if (parser->heredoc_end) {
97229774
// ... if we are on the same line as a heredoc,
@@ -9734,14 +9786,13 @@ parser_lex(pm_parser_t *parser) {
97349786
break;
97359787
default:
97369788
if (peeked == lex_mode->as.list.incrementor || peeked == lex_mode->as.list.terminator) {
9737-
pm_token_buffer_push(&token_buffer, peeked);
9789+
pm_token_buffer_push_byte(&token_buffer, peeked);
97389790
parser->current.end++;
97399791
} else if (lex_mode->as.list.interpolation) {
97409792
escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_NONE);
97419793
} else {
9742-
pm_token_buffer_push(&token_buffer, '\\');
9743-
pm_token_buffer_push(&token_buffer, peeked);
9744-
parser->current.end++;
9794+
pm_token_buffer_push_byte(&token_buffer, '\\');
9795+
pm_token_buffer_push_escaped(&token_buffer, parser);
97459796
}
97469797

97479798
break;
@@ -9899,9 +9950,9 @@ parser_lex(pm_parser_t *parser) {
98999950
parser->current.end++;
99009951
if (peek(parser) != '\n') {
99019952
if (lex_mode->as.regexp.terminator != '\r') {
9902-
pm_token_buffer_push(&token_buffer, '\\');
9953+
pm_token_buffer_push_byte(&token_buffer, '\\');
99039954
}
9904-
pm_token_buffer_push(&token_buffer, '\r');
9955+
pm_token_buffer_push_byte(&token_buffer, '\r');
99059956
break;
99069957
}
99079958
/* fallthrough */
@@ -9936,20 +9987,19 @@ parser_lex(pm_parser_t *parser) {
99369987
case '$': case ')': case '*': case '+':
99379988
case '.': case '>': case '?': case ']':
99389989
case '^': case '|': case '}':
9939-
pm_token_buffer_push(&token_buffer, '\\');
9990+
pm_token_buffer_push_byte(&token_buffer, '\\');
99409991
break;
99419992
default:
99429993
break;
99439994
}
99449995

9945-
pm_token_buffer_push(&token_buffer, peeked);
9996+
pm_token_buffer_push_byte(&token_buffer, peeked);
99469997
parser->current.end++;
99479998
break;
99489999
}
994910000

9950-
if (peeked < 0x80) pm_token_buffer_push(&token_buffer, '\\');
9951-
pm_token_buffer_push(&token_buffer, peeked);
9952-
parser->current.end++;
10001+
if (peeked < 0x80) pm_token_buffer_push_byte(&token_buffer, '\\');
10002+
pm_token_buffer_push_escaped(&token_buffer, parser);
995310003
break;
995410004
}
995510005

@@ -10116,23 +10166,23 @@ parser_lex(pm_parser_t *parser) {
1011610166

1011710167
switch (peeked) {
1011810168
case '\\':
10119-
pm_token_buffer_push(&token_buffer, '\\');
10169+
pm_token_buffer_push_byte(&token_buffer, '\\');
1012010170
parser->current.end++;
1012110171
break;
1012210172
case '\r':
1012310173
parser->current.end++;
1012410174
if (peek(parser) != '\n') {
1012510175
if (!lex_mode->as.string.interpolation) {
10126-
pm_token_buffer_push(&token_buffer, '\\');
10176+
pm_token_buffer_push_byte(&token_buffer, '\\');
1012710177
}
10128-
pm_token_buffer_push(&token_buffer, '\r');
10178+
pm_token_buffer_push_byte(&token_buffer, '\r');
1012910179
break;
1013010180
}
1013110181
/* fallthrough */
1013210182
case '\n':
1013310183
if (!lex_mode->as.string.interpolation) {
10134-
pm_token_buffer_push(&token_buffer, '\\');
10135-
pm_token_buffer_push(&token_buffer, '\n');
10184+
pm_token_buffer_push_byte(&token_buffer, '\\');
10185+
pm_token_buffer_push_byte(&token_buffer, '\n');
1013610186
}
1013710187

1013810188
if (parser->heredoc_end) {
@@ -10151,17 +10201,16 @@ parser_lex(pm_parser_t *parser) {
1015110201
break;
1015210202
default:
1015310203
if (lex_mode->as.string.incrementor != '\0' && peeked == lex_mode->as.string.incrementor) {
10154-
pm_token_buffer_push(&token_buffer, peeked);
10204+
pm_token_buffer_push_byte(&token_buffer, peeked);
1015510205
parser->current.end++;
1015610206
} else if (lex_mode->as.string.terminator != '\0' && peeked == lex_mode->as.string.terminator) {
10157-
pm_token_buffer_push(&token_buffer, peeked);
10207+
pm_token_buffer_push_byte(&token_buffer, peeked);
1015810208
parser->current.end++;
1015910209
} else if (lex_mode->as.string.interpolation) {
1016010210
escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_NONE);
1016110211
} else {
10162-
pm_token_buffer_push(&token_buffer, '\\');
10163-
pm_token_buffer_push(&token_buffer, peeked);
10164-
parser->current.end++;
10212+
pm_token_buffer_push_byte(&token_buffer, '\\');
10213+
pm_token_buffer_push_escaped(&token_buffer, parser);
1016510214
}
1016610215

1016710216
break;
@@ -10418,29 +10467,28 @@ parser_lex(pm_parser_t *parser) {
1041810467
case '\r':
1041910468
parser->current.end++;
1042010469
if (peek(parser) != '\n') {
10421-
pm_token_buffer_push(&token_buffer, '\\');
10422-
pm_token_buffer_push(&token_buffer, '\r');
10470+
pm_token_buffer_push_byte(&token_buffer, '\\');
10471+
pm_token_buffer_push_byte(&token_buffer, '\r');
1042310472
break;
1042410473
}
1042510474
/* fallthrough */
1042610475
case '\n':
10427-
pm_token_buffer_push(&token_buffer, '\\');
10428-
pm_token_buffer_push(&token_buffer, '\n');
10476+
pm_token_buffer_push_byte(&token_buffer, '\\');
10477+
pm_token_buffer_push_byte(&token_buffer, '\n');
1042910478
token_buffer.cursor = parser->current.end + 1;
1043010479
breakpoint = parser->current.end;
1043110480
continue;
1043210481
default:
10433-
parser->current.end++;
10434-
pm_token_buffer_push(&token_buffer, '\\');
10435-
pm_token_buffer_push(&token_buffer, peeked);
10482+
pm_token_buffer_push_byte(&token_buffer, '\\');
10483+
pm_token_buffer_push_escaped(&token_buffer, parser);
1043610484
break;
1043710485
}
1043810486
} else {
1043910487
switch (peeked) {
1044010488
case '\r':
1044110489
parser->current.end++;
1044210490
if (peek(parser) != '\n') {
10443-
pm_token_buffer_push(&token_buffer, '\r');
10491+
pm_token_buffer_push_byte(&token_buffer, '\r');
1044410492
break;
1044510493
}
1044610494
/* fallthrough */

test/prism/encoding_test.rb

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,19 @@ def test_slice_encoding
225225
assert_equal Encoding::SHIFT_JIS, slice.encoding
226226
end
227227

228+
def test_multibyte_escapes
  # Each pair is the opening and closing delimiter of a literal. A backslash
  # followed by the two bytes \x82\xA0 (a single Shift_JIS character) is
  # placed between them and the resulting source must parse successfully.
  [
    ["'", "'"],
    ["\"", "\""],
    ["`", "`"],
    ["/", "/"],
    ["<<'HERE'\n", "\nHERE"],
    ["<<-HERE\n", "\nHERE"]
  ].each do |opening, closing|
    # Build the source from the delimiters under test so that every literal
    # kind in the table is exercised, not only the single-quoted string.
    source = "# encoding: shift_jis\n#{opening}\\\x82\xA0#{closing}\n"

    assert Prism.parse_success?(source), "expected a multibyte escape inside #{opening.inspect} to parse"
  end
end
240+
228241
private
229242

230243
class ConstantContext < BasicObject

0 commit comments

Comments
 (0)