Skip to content

Commit f6a6584

Browse files
committed
fix: double-counting of errors in parsing escaped strings
Essentially, this change updates `yp_unescape_calculate_difference` to not create syntax errors, and we rely entirely on `yp_unescape_manipulate_string` to report syntax errors. To do that, this PR adds another (!) parameter to `unescape`: `yp_list_t *error_list`. When present, `unescape` reports syntax errors (and otherwise does not). However, an edge case that needed to be addressed is reporting syntax errors in this case: ?\u{1234 2345} In a string context, it's possible to have multiple codepoints by doing something like `"\u{1234 2345}"`; however, in the character literal context, this is a syntax error -- only a single codepoint is allowed. Unfortunately, when `yp_unescape_manipulate_string` is called, there's nothing to indicate that we are in a "character literal" context and that only a single codepoint is valid. To make this work, this PR: - introduces a new static utility function in yarp.c, `yp_char_literal_node_create_and_unescape`, which is called when we're parsing `YP_TOKEN_CHARACTER_LITERAL` - introduces a new (unexported) function, `yp_unescape_manipulate_char_literal` which does the same thing as `yp_unescape_manipulate_string` but tells `unescape` that only a single codepoint is expected
1 parent fc88558 commit f6a6584

File tree

4 files changed

+73
-41
lines changed

4 files changed

+73
-41
lines changed

include/yarp/unescape.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,9 @@ typedef enum {
2929
YP_UNESCAPE_ALL
3030
} yp_unescape_type_t;
3131

32-
// Unescape the contents of the given token into the given string using the
33-
// given unescape mode.
32+
// Unescape the contents of the given token into the given string using the given unescape mode.
3433
YP_EXPORTED_FUNCTION void yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type);
34+
void yp_unescape_manipulate_char_literal(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type);
3535

3636
// Accepts a source string and a type of unescaping and returns the unescaped version.
3737
// The caller must yp_string_free(result); after calling this function.

src/unescape.c

Lines changed: 59 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ unescape_unicode_write(uint8_t *dest, uint32_t value, const uint8_t *start, cons
156156
// If we get here, then the value is too big. This is an error, but we don't
157157
// want to just crash, so instead we'll add an error to the error list and put
158158
// in a replacement character instead.
159-
yp_diagnostic_list_append(error_list, start, end, "Invalid Unicode escape sequence.");
159+
if (error_list) yp_diagnostic_list_append(error_list, start, end, "Invalid Unicode escape sequence.");
160160
dest[0] = 0xEF;
161161
dest[1] = 0xBF;
162162
dest[2] = 0xBD;
@@ -186,7 +186,15 @@ unescape_char(uint8_t value, const uint8_t flags) {
186186

187187
// Read a specific escape sequence into the given destination.
188188
static const uint8_t *
189-
unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t *backslash, const uint8_t *end, const uint8_t flags) {
189+
unescape(
190+
yp_parser_t *parser,
191+
uint8_t *dest,
192+
size_t *dest_length,
193+
const uint8_t *backslash,
194+
const uint8_t *end,
195+
const uint8_t flags,
196+
yp_list_t *error_list
197+
) {
190198
switch (backslash[1]) {
191199
case 'a':
192200
case 'b':
@@ -226,7 +234,7 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t
226234
// \unnnn Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F])
227235
case 'u': {
228236
if ((flags & YP_UNESCAPE_FLAG_CONTROL) | (flags & YP_UNESCAPE_FLAG_META)) {
229-
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Unicode escape sequence cannot be used with control or meta flags.");
237+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Unicode escape sequence cannot be used with control or meta flags.");
230238
return backslash + 2;
231239
}
232240

@@ -243,12 +251,11 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t
243251

244252
// \u{nnnn} character literal allows only 1-6 hexadecimal digits
245253
if (hexadecimal_length > 6) {
246-
yp_diagnostic_list_append(&parser->error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape.");
254+
if (error_list) yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape.");
247255
}
248-
249256
// there are not hexadecimal characters
250-
if (hexadecimal_length == 0) {
251-
yp_diagnostic_list_append(&parser->error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "unterminated Unicode escape");
257+
else if (hexadecimal_length == 0) {
258+
if (error_list) yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "unterminated Unicode escape");
252259
return unicode_cursor;
253260
}
254261

@@ -261,63 +268,63 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t
261268
uint32_t value;
262269
unescape_unicode(unicode_start, (size_t) (unicode_cursor - unicode_start), &value);
263270
if (dest) {
264-
*dest_length += unescape_unicode_write(dest + *dest_length, value, unicode_start, unicode_cursor, &parser->error_list);
271+
*dest_length += unescape_unicode_write(dest + *dest_length, value, unicode_start, unicode_cursor, error_list);
265272
}
266273

267274
unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);
268275
}
269276

270277
// ?\u{nnnn} character literal should contain only one codepoint and cannot be like ?\u{nnnn mmmm}
271-
if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1)
272-
yp_diagnostic_list_append(&parser->error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal");
273-
278+
if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1) {
279+
if (error_list) yp_diagnostic_list_append(error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal");
280+
}
274281

275282
if (unicode_cursor < end && *unicode_cursor == '}') {
276283
unicode_cursor++;
277284
} else {
278-
yp_diagnostic_list_append(&parser->error_list, backslash, unicode_cursor, "invalid Unicode escape.");
285+
if (error_list) yp_diagnostic_list_append(error_list, backslash, unicode_cursor, "invalid Unicode escape.");
279286
}
287+
280288
return unicode_cursor;
281289
}
282-
283-
if ((backslash + 5) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) {
290+
else if ((backslash + 5) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) {
284291
uint32_t value;
285292
unescape_unicode(backslash + 2, 4, &value);
286293

287294
if (dest) {
288-
*dest_length += unescape_unicode_write(dest + *dest_length, value, backslash + 2, backslash + 6, &parser->error_list);
295+
*dest_length += unescape_unicode_write(dest + *dest_length, value, backslash + 2, backslash + 6, error_list);
289296
}
290297
return backslash + 6;
291298
}
292299

293-
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid Unicode escape sequence");
300+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid Unicode escape sequence");
294301
return backslash + 2;
295302
}
296303
// \c\M-x meta control character, where x is an ASCII printable character
297304
// \c? delete, ASCII 7Fh (DEL)
298305
// \cx control character, where x is an ASCII printable character
299306
case 'c':
300307
if (backslash + 2 >= end) {
301-
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence");
308+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
302309
return end;
303310
}
304311

305312
if (flags & YP_UNESCAPE_FLAG_CONTROL) {
306-
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
313+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
307314
return backslash + 2;
308315
}
309316

310317
switch (backslash[2]) {
311318
case '\\':
312-
return unescape(parser, dest, dest_length, backslash + 2, end, flags | YP_UNESCAPE_FLAG_CONTROL);
319+
return unescape(parser, dest, dest_length, backslash + 2, end, flags | YP_UNESCAPE_FLAG_CONTROL, error_list);
313320
case '?':
314321
if (dest) {
315322
dest[(*dest_length)++] = unescape_char(0x7f, flags);
316323
}
317324
return backslash + 3;
318325
default: {
319326
if (!char_is_ascii_printable(backslash[2])) {
320-
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence");
327+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
321328
return backslash + 2;
322329
}
323330

@@ -331,31 +338,31 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t
331338
// \C-? delete, ASCII 7Fh (DEL)
332339
case 'C':
333340
if (backslash + 3 >= end) {
334-
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence");
341+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
335342
return end;
336343
}
337344

338345
if (flags & YP_UNESCAPE_FLAG_CONTROL) {
339-
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
346+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
340347
return backslash + 2;
341348
}
342349

343350
if (backslash[2] != '-') {
344-
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence");
351+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
345352
return backslash + 2;
346353
}
347354

348355
switch (backslash[3]) {
349356
case '\\':
350-
return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_CONTROL);
357+
return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_CONTROL, error_list);
351358
case '?':
352359
if (dest) {
353360
dest[(*dest_length)++] = unescape_char(0x7f, flags);
354361
}
355362
return backslash + 4;
356363
default:
357364
if (!char_is_ascii_printable(backslash[3])) {
358-
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid control escape sequence");
365+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid control escape sequence");
359366
return backslash + 2;
360367
}
361368

@@ -369,22 +376,22 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t
369376
// \M-x meta character, where x is an ASCII printable character
370377
case 'M': {
371378
if (backslash + 3 >= end) {
372-
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence");
379+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
373380
return end;
374381
}
375382

376383
if (flags & YP_UNESCAPE_FLAG_META) {
377-
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Meta escape sequence cannot be doubled.");
384+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Meta escape sequence cannot be doubled.");
378385
return backslash + 2;
379386
}
380387

381388
if (backslash[2] != '-') {
382-
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid meta escape sequence");
389+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
383390
return backslash + 2;
384391
}
385392

386393
if (backslash[3] == '\\') {
387-
return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_META);
394+
return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_META, error_list);
388395
}
389396

390397
if (char_is_ascii_printable(backslash[3])) {
@@ -394,7 +401,7 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t
394401
return backslash + 4;
395402
}
396403

397-
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid meta escape sequence");
404+
if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
398405
return backslash + 3;
399406
}
400407
// \n
@@ -448,8 +455,8 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t
448455
// \c\M-x same as above
449456
// \c? or \C-? delete, ASCII 7Fh (DEL)
450457
//
451-
YP_EXPORTED_FUNCTION void
452-
yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) {
458+
static void
459+
yp_unescape_manipulate_string_or_char_literal(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
453460
if (unescape_type == YP_UNESCAPE_NONE) {
454461
// If we're not unescaping then we can reference the source directly.
455462
return;
@@ -511,7 +518,13 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
511518
// This is the only type of unescaping left. In this case we need to
512519
// handle all of the different unescapes.
513520
assert(unescape_type == YP_UNESCAPE_ALL);
514-
cursor = unescape(parser, dest, &dest_length, backslash, end, YP_UNESCAPE_FLAG_NONE);
521+
522+
uint8_t flags = YP_UNESCAPE_FLAG_NONE;
523+
if (expect_single_codepoint) {
524+
flags |= YP_UNESCAPE_FLAG_EXPECT_SINGLE;
525+
}
526+
527+
cursor = unescape(parser, dest, &dest_length, backslash, end, flags, &parser->error_list);
515528
break;
516529
}
517530

@@ -539,6 +552,16 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
539552
yp_string_owned_init(string, allocated, dest_length + ((size_t) (end - cursor)));
540553
}
541554

555+
YP_EXPORTED_FUNCTION void
556+
yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) {
557+
yp_unescape_manipulate_string_or_char_literal(parser, string, unescape_type, false);
558+
}
559+
560+
void
561+
yp_unescape_manipulate_char_literal(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) {
562+
yp_unescape_manipulate_string_or_char_literal(parser, string, unescape_type, true);
563+
}
564+
542565
// This function is similar to yp_unescape_manipulate_string, except it doesn't
543566
// actually perform any string manipulations. Instead, it calculates how long
544567
// the unescaped character is, and returns that value
@@ -564,10 +587,11 @@ yp_unescape_calculate_difference(yp_parser_t *parser, const uint8_t *backslash,
564587
assert(unescape_type == YP_UNESCAPE_ALL);
565588

566589
uint8_t flags = YP_UNESCAPE_FLAG_NONE;
567-
if (expect_single_codepoint)
590+
if (expect_single_codepoint) {
568591
flags |= YP_UNESCAPE_FLAG_EXPECT_SINGLE;
592+
}
569593

570-
const uint8_t *cursor = unescape(parser, NULL, 0, backslash, parser->end, flags);
594+
const uint8_t *cursor = unescape(parser, NULL, 0, backslash, parser->end, flags, NULL);
571595
assert(cursor > backslash);
572596

573597
return (size_t) (cursor - backslash);

src/yarp.c

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7547,6 +7547,17 @@ yp_symbol_node_create_and_unescape(yp_parser_t *parser, const yp_token_t *openin
75477547
return node;
75487548
}
75497549

7550+
static yp_string_node_t *
7551+
yp_char_literal_node_create_and_unescape(yp_parser_t *parser, const yp_token_t *opening, const yp_token_t *content, const yp_token_t *closing, yp_unescape_type_t unescape_type) {
7552+
yp_string_node_t *node = yp_string_node_create(parser, opening, content, closing);
7553+
7554+
assert((content->end - content->start) >= 0);
7555+
yp_string_shared_init(&node->unescaped, content->start, content->end);
7556+
7557+
yp_unescape_manipulate_char_literal(parser, &node->unescaped, unescape_type);
7558+
return node;
7559+
}
7560+
75507561
static yp_string_node_t *
75517562
yp_string_node_create_and_unescape(yp_parser_t *parser, const yp_token_t *opening, const yp_token_t *content, const yp_token_t *closing, yp_unescape_type_t unescape_type) {
75527563
yp_string_node_t *node = yp_string_node_create(parser, opening, content, closing);
@@ -10921,7 +10932,7 @@ parse_expression_prefix(yp_parser_t *parser, yp_binding_power_t binding_power) {
1092110932

1092210933
yp_token_t closing = not_provided(parser);
1092310934

10924-
return (yp_node_t *) yp_string_node_create_and_unescape(parser, &opening, &content, &closing, YP_UNESCAPE_ALL);
10935+
return (yp_node_t *) yp_char_literal_node_create_and_unescape(parser, &opening, &content, &closing, YP_UNESCAPE_ALL);
1092510936
}
1092610937
case YP_TOKEN_CLASS_VARIABLE: {
1092710938
parser_lex(parser);

test/yarp/errors_test.rb

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -608,7 +608,6 @@ def test_do_not_allow_more_than_6_hexadecimal_digits_in_u_Unicode_character_nota
608608

609609
assert_errors expected, '"\u{0000001}"', [
610610
["invalid Unicode escape.", 4..11],
611-
["invalid Unicode escape.", 4..11]
612611
]
613612
end
614613

@@ -617,14 +616,12 @@ def test_do_not_allow_characters_other_than_0_9_a_f_and_A_F_in_u_Unicode_charact
617616

618617
assert_errors expected, '"\u{000z}"', [
619618
["unterminated Unicode escape", 7..7],
620-
["unterminated Unicode escape", 7..7]
621619
]
622620
end
623621

624622
def test_unterminated_unicode_brackets_should_be_a_syntax_error
625623
assert_errors expression('?\\u{3'), '?\\u{3', [
626624
["invalid Unicode escape.", 1..5],
627-
["invalid Unicode escape.", 1..5],
628625
]
629626
end
630627

0 commit comments

Comments
 (0)