diff --git a/src/lexer/Lexer.spec.ts b/src/lexer/Lexer.spec.ts
index c12a6ba30..c5ac046fc 100644
--- a/src/lexer/Lexer.spec.ts
+++ b/src/lexer/Lexer.spec.ts
@@ -1222,4 +1222,28 @@ describe('lexer', () => {
             TokenKind.Eof
         ]);
     });
+
+    describe('regular expression literals', () => {
+        /**
+         * Scans each source string and asserts that its first token is a regex literal spanning the entire string
+         */
+        function testRegex(...regexps: string[]) {
+            for (const regexp of regexps) {
+                const { tokens } = Lexer.scan(regexp);
+                expect(tokens[0].kind).to.eql(TokenKind.RegexLiteral);
+                expect(tokens[0].text).to.eql(regexp);
+            }
+        }
+
+        it('recognizes regex literals', () => {
+            testRegex(
+                '/simple/',
+                '/SimpleWithValidFlags/imsx',
+                '/UnknownFlags/VUI',
+                '/with spaces/andflags',
+                '/with(parens)and[squarebraces]/',
+                '/*()^$@/'
+            );
+        });
+    });
 });
diff --git a/src/lexer/Lexer.ts b/src/lexer/Lexer.ts
index 962f4ab1d..d10c2c95f 100644
--- a/src/lexer/Lexer.ts
+++ b/src/lexer/Lexer.ts
@@ -199,14 +199,17 @@ export class Lexer {
             }
         },
         '/': function (this: Lexer) {
-            switch (this.peek()) {
-                case '=':
-                    this.advance();
-                    this.addToken(TokenKind.ForwardslashEqual);
-                    break;
-                default:
-                    this.addToken(TokenKind.Forwardslash);
-                    break;
+            //try capturing a regex literal; otherwise fall back to `/=` or `/`. NOTE(review): this runs for every `/`, so division such as `a / b / c` may lex as a regex - confirm
+            if (!this.regexLiteral()) {
+                switch (this.peek()) {
+                    case '=':
+                        this.advance();
+                        this.addToken(TokenKind.ForwardslashEqual);
+                        break;
+                    default:
+                        this.addToken(TokenKind.Forwardslash);
+                        break;
+                }
             }
         },
         '\\': function (this: Lexer) {
@@ -384,6 +387,19 @@ export class Lexer {
         this.columnEnd++;
     }
 
+    //saved scan positions, so a speculative (lookahead) scan can be rewound
+    private lookaheadStack = [] as Array<{ current: number; columnEnd: number }>;
+    //save the current scan position
+    private pushLookahead() {
+        this.lookaheadStack.push({ current: this.current, columnEnd: this.columnEnd });
+    }
+    //rewind to the most recently saved position (a no-op when nothing was saved, instead of throwing on `undefined`)
+    private popLookahead() {
+        const { current, columnEnd } = this.lookaheadStack.pop() || this;
+        this.current = current;
+        this.columnEnd = columnEnd;
+    }
+
     /**
      * Returns the character at position `current` or a null character if we've reached the end of
      * input.
@@ -835,6 +851,17 @@ export class Lexer {
         return candidates.includes(this.source.charAt(this.current));
     }
 
+    /**
+     * Advance if the current character matches one of the candidates. Returns true if it advanced.
+     */
+    private advanceIf(...candidates: string[]) {
+        if (this.check(...candidates)) {
+            this.advance();
+            return true;
+        }
+        return false;
+    }
+
     /**
      * Check the previous character
      */
@@ -927,6 +954,46 @@ export class Lexer {
         }
     }
 
+    /**
+     * Capture a regex literal token. Returns false if not found.
+     * Expects the opening `/` to have been consumed already; scans up to the closing unescaped `/`
+     * (plus any trailing flag letters) on the same line. When no literal is found, the scan position
+     * is rewound so the caller can fall back to normal `/` handling.
+     * This is lookahead lexing which might technically belong in the parser,
+     * but it's easy enough to do here in the lexer
+     * @returns true if a `RegexLiteral` token was added, false otherwise
+     */
+    private regexLiteral() {
+        this.pushLookahead();
+
+        //every iteration consumes at least one character, so this loop always terminates
+        while (this.current < this.source.length) {
+            //an unescaped `/` ends the pattern
+            if (this.advanceIf('/')) {
+                //consume all flag-like chars (let the parser validate the actual values)
+                while (/[a-z]/i.test(this.peek())) {
+                    this.advance();
+                }
+                //the lookahead succeeded: drop the saved position so the stack doesn't grow with every literal
+                this.lookaheadStack.pop();
+                //finalize the regex literal and EXIT
+                this.addToken(TokenKind.RegexLiteral);
+                return true;
+            }
+            //a regex literal can't span lines, so a newline means this isn't one (or it's malformed)
+            if (this.check('\n')) {
+                break;
+            }
+            //a backslash escapes the next character (so `\/` doesn't end the pattern), but it can't escape a newline or the end of input
+            if (this.advanceIf('\\') && (this.check('\n') || this.current >= this.source.length)) {
+                break;
+            }
+            this.advance();
+        }
+        this.popLookahead();
+        return false;
+    }
+
     /**
      * Creates a `Token` and adds it to the `tokens` array.
      * @param kind the type of token to produce.
diff --git a/src/lexer/TokenKind.ts b/src/lexer/TokenKind.ts
index f01bb4b0c..70ed8d7df 100644
--- a/src/lexer/TokenKind.ts
+++ b/src/lexer/TokenKind.ts
@@ -52,6 +52,7 @@ export enum TokenKind {
     DoubleLiteral = 'DoubleLiteral',
     LongIntegerLiteral = 'LongIntegerLiteral',
     EscapedCharCodeLiteral = 'EscapedCharCodeLiteral', //this is used to capture things like `\n`, `\r\n` in template strings
+    RegexLiteral = 'RegexLiteral',
 
     //types
     Void = 'Void',