Skip to content

Commit 4b157a8

Browse files
committed
Also rework regexp lexer to check terminators properly
1 parent 60315d0 commit 4b157a8

File tree

1 file changed

+71
-78
lines changed

1 file changed

+71
-78
lines changed

src/yarp.c

Lines changed: 71 additions & 78 deletions
Original file line number | Diff line number | Diff line change
@@ -6756,105 +6756,98 @@ parser_lex(yp_parser_t *parser) {
67566756
}
67576757

67586758
// Get a reference to the current mode.
6759-
yp_lex_mode_t *mode = parser->lex_modes.current;
6759+
yp_lex_mode_t *lex_mode = parser->lex_modes.current;
67606760

67616761
// These are the places where we need to split up the content of the
67626762
// regular expression. We'll use strpbrk to find the first of these
67636763
// characters.
6764-
const char *breakpoints = mode->as.regexp.breakpoints;
6764+
const char *breakpoints = lex_mode->as.regexp.breakpoints;
67656765
const char *breakpoint = yp_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
67666766

67676767
while (breakpoint != NULL) {
6768-
switch (*breakpoint) {
6769-
case '\0':
6770-
// If we hit a null byte, skip directly past it.
6771-
breakpoint = yp_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
6772-
break;
6773-
case '\\': {
6774-
// If we hit escapes, then we need to treat the next token
6775-
// literally. In this case we'll skip past the next character and
6776-
// find the next breakpoint.
6777-
size_t difference = yp_unescape_calculate_difference(breakpoint, parser->end, YP_UNESCAPE_ALL, false, &parser->error_list);
6768+
// If we hit a null byte, skip directly past it.
6769+
if (*breakpoint == '\0') {
6770+
breakpoint = yp_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
6771+
continue;
6772+
}
67786773

6779-
// If the result is an escaped newline, then we need to
6780-
// track that newline.
6781-
if (breakpoint[difference - 1] == '\n') {
6782-
yp_newline_list_append(&parser->newline_list, breakpoint + difference - 1);
6783-
}
6774+
// If we've hit a newline, then we need to track that in the
6775+
// list of newlines.
6776+
if (*breakpoint == '\n') {
6777+
yp_newline_list_append(&parser->newline_list, breakpoint);
67846778

6785-
breakpoint = yp_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference));
6786-
break;
6779+
if (lex_mode->as.regexp.terminator != '\n') {
6780+
// If the terminator is not a newline, then we can set
6781+
// the next breakpoint and continue.
6782+
breakpoint = yp_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
6783+
continue;
67876784
}
6788-
case '#': {
6789-
// If the terminator is #, then we need to fall into the
6790-
// default case. Otherwise we'll attempt to lex
6791-
// interpolation.
6792-
if (mode->as.regexp.terminator != '#') {
6793-
yp_token_type_t type = lex_interpolation(parser, breakpoint);
6794-
if (type != YP_TOKEN_NOT_PROVIDED) {
6795-
LEX(type);
6796-
}
6785+
}
67976786

6798-
// If we haven't returned at this point then we had something
6799-
// that looked like an interpolated class or instance variable
6800-
// like "#@" but wasn't actually. In this case we'll just skip
6801-
// to the next breakpoint.
6802-
breakpoint = yp_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
6803-
break;
6804-
}
6787+
// If we hit the terminator, we need to determine what kind of
6788+
// token to return.
6789+
if (*breakpoint == lex_mode->as.regexp.terminator) {
6790+
if (lex_mode->as.regexp.nesting > 0) {
6791+
breakpoint = yp_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
6792+
lex_mode->as.regexp.nesting--;
6793+
continue;
68056794
}
6806-
/* fallthrough */
6807-
default: {
6808-
if (*breakpoint == mode->as.regexp.incrementor) {
6809-
// If we've hit the incrementor, then we need to skip past it and
6810-
// find the next breakpoint.
6811-
breakpoint = yp_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
6812-
mode->as.regexp.nesting++;
6813-
break;
6814-
}
6815-
6816-
if (*breakpoint == '\n') {
6817-
// If we've hit a newline, then we need to track
6818-
// that in the list of newlines.
6819-
yp_newline_list_append(&parser->newline_list, breakpoint);
68206795

6821-
if (mode->as.regexp.terminator != '\n') {
6822-
// If the terminator is not a newline, then we
6823-
// can set the next breakpoint and continue.
6824-
breakpoint = yp_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
6825-
break;
6826-
}
6796+
// Here we've hit the terminator. If we have already consumed
6797+
// content then we need to return that content as string content
6798+
// first.
6799+
if (breakpoint > parser->current.start) {
6800+
parser->current.end = breakpoint;
6801+
LEX(YP_TOKEN_STRING_CONTENT);
6802+
}
68276803

6828-
// Otherwise, the newline character is the
6829-
// terminator so we need to continue on.
6830-
}
6804+
// Since we've hit the terminator of the regular expression, we now
6805+
// need to parse the options.
6806+
parser->current.end = breakpoint + 1;
6807+
parser->current.end += yp_strspn_regexp_option(parser->current.end, parser->end - parser->current.end);
68316808

6832-
assert(*breakpoint == mode->as.regexp.terminator);
6809+
lex_mode_pop(parser);
6810+
lex_state_set(parser, YP_LEX_STATE_END);
6811+
LEX(YP_TOKEN_REGEXP_END);
6812+
}
68336813

6834-
if (mode->as.regexp.nesting > 0) {
6835-
breakpoint = yp_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
6836-
mode->as.regexp.nesting--;
6837-
break;
6838-
}
6814+
// If we hit escapes, then we need to treat the next token
6815+
// literally. In this case we'll skip past the next character
6816+
// and find the next breakpoint.
6817+
if (*breakpoint == '\\') {
6818+
size_t difference = yp_unescape_calculate_difference(breakpoint, parser->end, YP_UNESCAPE_ALL, false, &parser->error_list);
68396819

6840-
// Here we've hit the terminator. If we have already consumed
6841-
// content then we need to return that content as string content
6842-
// first.
6843-
if (breakpoint > parser->current.start) {
6844-
parser->current.end = breakpoint;
6845-
LEX(YP_TOKEN_STRING_CONTENT);
6846-
}
6820+
// If the result is an escaped newline, then we need to
6821+
// track that newline.
6822+
if (breakpoint[difference - 1] == '\n') {
6823+
yp_newline_list_append(&parser->newline_list, breakpoint + difference - 1);
6824+
}
68476825

6848-
// Since we've hit the terminator of the regular expression, we now
6849-
// need to parse the options.
6850-
parser->current.end = breakpoint + 1;
6851-
parser->current.end += yp_strspn_regexp_option(parser->current.end, parser->end - parser->current.end);
6826+
breakpoint = yp_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference));
6827+
continue;
6828+
}
68526829

6853-
lex_mode_pop(parser);
6854-
lex_state_set(parser, YP_LEX_STATE_END);
6855-
LEX(YP_TOKEN_REGEXP_END);
6830+
// If we hit a #, then we will attempt to lex interpolation.
6831+
if (*breakpoint == '#') {
6832+
yp_token_type_t type = lex_interpolation(parser, breakpoint);
6833+
if (type != YP_TOKEN_NOT_PROVIDED) {
6834+
LEX(type);
68566835
}
6836+
6837+
// If we haven't returned at this point then we had
6838+
// something that looked like an interpolated class or
6839+
// instance variable like "#@" but wasn't actually. In this
6840+
// case we'll just skip to the next breakpoint.
6841+
breakpoint = yp_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
6842+
continue;
68576843
}
6844+
6845+
// If we've hit the incrementor, then we need to skip past it
6846+
// and find the next breakpoint.
6847+
assert(*breakpoint == lex_mode->as.regexp.incrementor);
6848+
breakpoint = yp_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
6849+
lex_mode->as.regexp.nesting++;
6850+
continue;
68586851
}
68596852

68606853
// At this point, the breakpoint is NULL which means we were unable to

0 commit comments

Comments (0)