@@ -7674,6 +7674,28 @@ escape_write_byte_encoded(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t byte
7674
7674
pm_buffer_append_byte(buffer, byte);
7675
7675
}
7676
7676
7677
+ /**
7678
+ * Write each byte of the given escaped character into the buffer.
7679
+ */
7680
+ static inline void
7681
+ escape_write_escape_encoded(pm_parser_t *parser, pm_buffer_t *buffer) {
7682
+ size_t width;
7683
+ if (parser->encoding_changed) {
7684
+ width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end);
7685
+ } else {
7686
+ width = pm_encoding_utf_8_char_width(parser->current.end, parser->end - parser->current.end);
7687
+ }
7688
+
7689
+ // TODO: If the character is invalid in the given encoding, then we'll just
7690
+ // push one byte into the buffer. This should actually be an error.
7691
+ width = (width == 0) ? 1 : width;
7692
+
7693
+ for (size_t index = 0; index < width; index++) {
7694
+ escape_write_byte_encoded(parser, buffer, *parser->current.end);
7695
+ parser->current.end++;
7696
+ }
7697
+ }
7698
+
7677
7699
/**
7678
7700
* The regular expression engine doesn't support the same escape sequences as
7679
7701
* Ruby does. So first we have to read the escape sequence, and then we have to
@@ -8012,7 +8034,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
8012
8034
/* fallthrough */
8013
8035
default: {
8014
8036
if (parser->current.end < parser->end) {
8015
- escape_write_byte_encoded (parser, buffer, *parser->current.end++ );
8037
+ escape_write_escape_encoded (parser, buffer);
8016
8038
}
8017
8039
return;
8018
8040
}
@@ -8289,10 +8311,40 @@ typedef struct {
8289
8311
* Push the given byte into the token buffer.
8290
8312
*/
8291
8313
static inline void
8292
- pm_token_buffer_push (pm_token_buffer_t *token_buffer, uint8_t byte) {
8314
+ pm_token_buffer_push_byte (pm_token_buffer_t *token_buffer, uint8_t byte) {
8293
8315
pm_buffer_append_byte(&token_buffer->buffer, byte);
8294
8316
}
8295
8317
8318
+ /**
8319
+ * Append the given bytes into the token buffer.
8320
+ */
8321
+ static inline void
8322
+ pm_token_buffer_push_bytes(pm_token_buffer_t *token_buffer, const uint8_t *bytes, size_t length) {
8323
+ pm_buffer_append_bytes(&token_buffer->buffer, bytes, length);
8324
+ }
8325
+
8326
+ /**
8327
+ * Push an escaped character into the token buffer.
8328
+ */
8329
+ static inline void
8330
+ pm_token_buffer_push_escaped(pm_token_buffer_t *token_buffer, pm_parser_t *parser) {
8331
+ // First, determine the width of the character to be escaped.
8332
+ size_t width;
8333
+ if (parser->encoding_changed) {
8334
+ width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end);
8335
+ } else {
8336
+ width = pm_encoding_utf_8_char_width(parser->current.end, parser->end - parser->current.end);
8337
+ }
8338
+
8339
+ // TODO: If the character is invalid in the given encoding, then we'll just
8340
+ // push one byte into the buffer. This should actually be an error.
8341
+ width = (width == 0 ? 1 : width);
8342
+
8343
+ // Now, push the bytes into the buffer.
8344
+ pm_token_buffer_push_bytes(token_buffer, parser->current.end, width);
8345
+ parser->current.end += width;
8346
+ }
8347
+
8296
8348
/**
8297
8349
* When we're about to return from lexing the current token and we know for sure
8298
8350
* that we have found an escape sequence, this function is called to copy the
@@ -9705,18 +9757,18 @@ parser_lex(pm_parser_t *parser) {
9705
9757
case '\t':
9706
9758
case '\v':
9707
9759
case '\\':
9708
- pm_token_buffer_push (&token_buffer, peeked);
9760
+ pm_token_buffer_push_byte (&token_buffer, peeked);
9709
9761
parser->current.end++;
9710
9762
break;
9711
9763
case '\r':
9712
9764
parser->current.end++;
9713
9765
if (peek(parser) != '\n') {
9714
- pm_token_buffer_push (&token_buffer, '\r');
9766
+ pm_token_buffer_push_byte (&token_buffer, '\r');
9715
9767
break;
9716
9768
}
9717
9769
/* fallthrough */
9718
9770
case '\n':
9719
- pm_token_buffer_push (&token_buffer, '\n');
9771
+ pm_token_buffer_push_byte (&token_buffer, '\n');
9720
9772
9721
9773
if (parser->heredoc_end) {
9722
9774
// ... if we are on the same line as a heredoc,
@@ -9734,14 +9786,13 @@ parser_lex(pm_parser_t *parser) {
9734
9786
break;
9735
9787
default:
9736
9788
if (peeked == lex_mode->as.list.incrementor || peeked == lex_mode->as.list.terminator) {
9737
- pm_token_buffer_push (&token_buffer, peeked);
9789
+ pm_token_buffer_push_byte (&token_buffer, peeked);
9738
9790
parser->current.end++;
9739
9791
} else if (lex_mode->as.list.interpolation) {
9740
9792
escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_NONE);
9741
9793
} else {
9742
- pm_token_buffer_push(&token_buffer, '\\');
9743
- pm_token_buffer_push(&token_buffer, peeked);
9744
- parser->current.end++;
9794
+ pm_token_buffer_push_byte(&token_buffer, '\\');
9795
+ pm_token_buffer_push_escaped(&token_buffer, parser);
9745
9796
}
9746
9797
9747
9798
break;
@@ -9899,9 +9950,9 @@ parser_lex(pm_parser_t *parser) {
9899
9950
parser->current.end++;
9900
9951
if (peek(parser) != '\n') {
9901
9952
if (lex_mode->as.regexp.terminator != '\r') {
9902
- pm_token_buffer_push (&token_buffer, '\\');
9953
+ pm_token_buffer_push_byte (&token_buffer, '\\');
9903
9954
}
9904
- pm_token_buffer_push (&token_buffer, '\r');
9955
+ pm_token_buffer_push_byte (&token_buffer, '\r');
9905
9956
break;
9906
9957
}
9907
9958
/* fallthrough */
@@ -9936,20 +9987,19 @@ parser_lex(pm_parser_t *parser) {
9936
9987
case '$': case ')': case '*': case '+':
9937
9988
case '.': case '>': case '?': case ']':
9938
9989
case '^': case '|': case '}':
9939
- pm_token_buffer_push (&token_buffer, '\\');
9990
+ pm_token_buffer_push_byte (&token_buffer, '\\');
9940
9991
break;
9941
9992
default:
9942
9993
break;
9943
9994
}
9944
9995
9945
- pm_token_buffer_push (&token_buffer, peeked);
9996
+ pm_token_buffer_push_byte (&token_buffer, peeked);
9946
9997
parser->current.end++;
9947
9998
break;
9948
9999
}
9949
10000
9950
- if (peeked < 0x80) pm_token_buffer_push(&token_buffer, '\\');
9951
- pm_token_buffer_push(&token_buffer, peeked);
9952
- parser->current.end++;
10001
+ if (peeked < 0x80) pm_token_buffer_push_byte(&token_buffer, '\\');
10002
+ pm_token_buffer_push_escaped(&token_buffer, parser);
9953
10003
break;
9954
10004
}
9955
10005
@@ -10116,23 +10166,23 @@ parser_lex(pm_parser_t *parser) {
10116
10166
10117
10167
switch (peeked) {
10118
10168
case '\\':
10119
- pm_token_buffer_push (&token_buffer, '\\');
10169
+ pm_token_buffer_push_byte (&token_buffer, '\\');
10120
10170
parser->current.end++;
10121
10171
break;
10122
10172
case '\r':
10123
10173
parser->current.end++;
10124
10174
if (peek(parser) != '\n') {
10125
10175
if (!lex_mode->as.string.interpolation) {
10126
- pm_token_buffer_push (&token_buffer, '\\');
10176
+ pm_token_buffer_push_byte (&token_buffer, '\\');
10127
10177
}
10128
- pm_token_buffer_push (&token_buffer, '\r');
10178
+ pm_token_buffer_push_byte (&token_buffer, '\r');
10129
10179
break;
10130
10180
}
10131
10181
/* fallthrough */
10132
10182
case '\n':
10133
10183
if (!lex_mode->as.string.interpolation) {
10134
- pm_token_buffer_push (&token_buffer, '\\');
10135
- pm_token_buffer_push (&token_buffer, '\n');
10184
+ pm_token_buffer_push_byte (&token_buffer, '\\');
10185
+ pm_token_buffer_push_byte (&token_buffer, '\n');
10136
10186
}
10137
10187
10138
10188
if (parser->heredoc_end) {
@@ -10151,17 +10201,16 @@ parser_lex(pm_parser_t *parser) {
10151
10201
break;
10152
10202
default:
10153
10203
if (lex_mode->as.string.incrementor != '\0' && peeked == lex_mode->as.string.incrementor) {
10154
- pm_token_buffer_push (&token_buffer, peeked);
10204
+ pm_token_buffer_push_byte (&token_buffer, peeked);
10155
10205
parser->current.end++;
10156
10206
} else if (lex_mode->as.string.terminator != '\0' && peeked == lex_mode->as.string.terminator) {
10157
- pm_token_buffer_push (&token_buffer, peeked);
10207
+ pm_token_buffer_push_byte (&token_buffer, peeked);
10158
10208
parser->current.end++;
10159
10209
} else if (lex_mode->as.string.interpolation) {
10160
10210
escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_NONE);
10161
10211
} else {
10162
- pm_token_buffer_push(&token_buffer, '\\');
10163
- pm_token_buffer_push(&token_buffer, peeked);
10164
- parser->current.end++;
10212
+ pm_token_buffer_push_byte(&token_buffer, '\\');
10213
+ pm_token_buffer_push_escaped(&token_buffer, parser);
10165
10214
}
10166
10215
10167
10216
break;
@@ -10418,29 +10467,28 @@ parser_lex(pm_parser_t *parser) {
10418
10467
case '\r':
10419
10468
parser->current.end++;
10420
10469
if (peek(parser) != '\n') {
10421
- pm_token_buffer_push (&token_buffer, '\\');
10422
- pm_token_buffer_push (&token_buffer, '\r');
10470
+ pm_token_buffer_push_byte (&token_buffer, '\\');
10471
+ pm_token_buffer_push_byte (&token_buffer, '\r');
10423
10472
break;
10424
10473
}
10425
10474
/* fallthrough */
10426
10475
case '\n':
10427
- pm_token_buffer_push (&token_buffer, '\\');
10428
- pm_token_buffer_push (&token_buffer, '\n');
10476
+ pm_token_buffer_push_byte (&token_buffer, '\\');
10477
+ pm_token_buffer_push_byte (&token_buffer, '\n');
10429
10478
token_buffer.cursor = parser->current.end + 1;
10430
10479
breakpoint = parser->current.end;
10431
10480
continue;
10432
10481
default:
10433
- parser->current.end++;
10434
- pm_token_buffer_push(&token_buffer, '\\');
10435
- pm_token_buffer_push(&token_buffer, peeked);
10482
+ pm_token_buffer_push_byte(&token_buffer, '\\');
10483
+ pm_token_buffer_push_escaped(&token_buffer, parser);
10436
10484
break;
10437
10485
}
10438
10486
} else {
10439
10487
switch (peeked) {
10440
10488
case '\r':
10441
10489
parser->current.end++;
10442
10490
if (peek(parser) != '\n') {
10443
- pm_token_buffer_push (&token_buffer, '\r');
10491
+ pm_token_buffer_push_byte (&token_buffer, '\r');
10444
10492
break;
10445
10493
}
10446
10494
/* fallthrough */
0 commit comments